def _attach_taxonomy(genome, genome_data):
    """Copy the genus, species and family labels from genome_data onto genome."""
    genome.genus = genome_data['genus']
    genome.species = genome_data['species']
    genome.family = genome_data['family']
    return genome


def read_FASTA_files_no_groups(meta_genomes, dir_path, dir_structure='tree'):
    """Load the genomes described by meta_genomes as a flat list of DNA objects.

    Two on-disk layouts are supported:

    * ``'tree'``: ``dir_path`` is a directory holding one sub-directory per
      genome, named by the entry's ``'file_name'``. Every file in that
      sub-directory whose first line does not contain ``'plasmid'`` and does
      contain ``'complete genome'`` or ``'chromosome 1'`` yields one genome,
      built from the first FASTA record of the file.
    * ``'single_fasta_file'``: ``dir_path`` is one FASTA file and each entry's
      ``'file_name'`` is the id of a record in that file.

    Any other ``dir_structure`` value yields an empty list.

    The working directory is switched to the data directory while reading and
    is always restored before returning, also when an error is raised.

    Args:
        meta_genomes: Iterable of dicts with the keys ``'file_name'``,
            ``'genus'``, ``'species'`` and ``'family'``.
        dir_path: Data directory, or the path of a FASTA file.
        dir_structure: ``'tree'`` (default) or ``'single_fasta_file'``.

    Returns:
        List of DNA objects with ``genus``, ``species`` and ``family`` set.

    Raises:
        ValueError: If ``dir_structure`` is ``'single_fasta_file'`` but
            ``dir_path`` is not a file.
        KeyError: If a requested record id is missing from the single file.
    """
    cur_dir = os.getcwd()
    if os.path.isfile(dir_path):
        # A bare file name has an empty dirname and os.chdir('') raises,
        # so fall back to the current directory.
        work_dir = os.path.dirname(dir_path) or os.curdir
        seq_file = os.path.basename(dir_path)
    else:
        work_dir = dir_path
        seq_file = None

    os.chdir(work_dir)
    try:
        seq_dic = {}
        if dir_structure == 'single_fasta_file':
            if seq_file is None:
                raise ValueError(
                    "dir_structure 'single_fasta_file' requires dir_path "
                    "to be a FASTA file, got: " + str(dir_path))
            # Later records overwrite earlier ones that share an id.
            seq_dic = {record.id: record
                       for record in SeqIO.parse(seq_file, "fasta")}

        genomes = []
        for genome_data in meta_genomes:
            dir_name = genome_data['file_name']
            if dir_structure == 'tree':
                for fasta_file in os.listdir(dir_name):
                    with open(os.path.join(dir_name, fasta_file)) as genome_file:
                        identifier = genome_file.readline()
                        # Only use non-plasmid genomes.
                        # Some bacterial genomes contain more than 1
                        # chromosome, but assumed not more than 2.
                        # NOTE: these are substring tests, so 'chromosome 1'
                        # also matches e.g. 'chromosome 10'.
                        is_plasmid = 'plasmid' in identifier
                        is_primary = ('complete genome' in identifier or
                                      'chromosome 1' in identifier)
                        if is_plasmid or not is_primary:
                            continue
                        # Rewind so the parser sees the header line again.
                        genome_file.seek(0)
                        genome_seq = list(SeqIO.parse(genome_file, "fasta"))
                    if len(genome_seq) > 1:
                        # '\n' rather than os.linesep: the stream is in text
                        # mode and already translates line endings.
                        sys.stderr.write(
                            "Warning! The file " + fasta_file +
                            " in directory " + dir_name +
                            " contained more than one sequence, ignoring all but the first!" +
                            '\n')
                    genome = DNA(id=dir_name, seq=str(genome_seq[0].seq))
                    genomes.append(_attach_taxonomy(genome, genome_data))
            elif dir_structure == 'single_fasta_file':
                seq = seq_dic[genome_data['file_name']]
                genome = DNA(id=seq.id, seq=str(seq.seq))
                genomes.append(_attach_taxonomy(genome, genome_data))
    finally:
        # Restore the caller's working directory even when reading fails.
        os.chdir(cur_dir)
    return genomes
def read_contigs_file(open_contigs_file, start_position=False, taxonomy_info=True):
    """Build a list of DNA objects from a contigs FASTA file.

    The input is expected to be the file written by the generate_contigs
    script.

    Args:
        open_contigs_file: FASTA input, handed unchanged to ``SeqIO.parse``.
        start_position: Forwarded to ``parse_contig_description``; when true,
            the parsed ``"start_position"`` is also stored on each contig.
            Only used when ``taxonomy_info`` is true.
        taxonomy_info: When true, each record description is parsed and the
            contig gets the parsed genome as id plus ``family``, ``genus``,
            ``species`` and ``contig_id``. When false, the record id is used
            both as the contig id and as its ``contig_id``.

    Returns:
        List of DNA objects, one per FASTA record, in file order.
    """
    parsed_fields = ("family", "genus", "species", "contig_id")
    records = list(SeqIO.parse(open_contigs_file, "fasta"))
    contigs = []
    for record in records:
        if not taxonomy_info:
            contig = DNA(id=record.id, seq=str(record.seq))
            contig.contig_id = record.id
        else:
            described = parse_contig_description(
                record.description, start_position=start_position)
            contig = DNA(id=described["genome"], seq=str(record.seq))
            if start_position:
                contig.start_position = described["start_position"]
            for field in parsed_fields:
                setattr(contig, field, described[field])
        contigs.append(contig)
    return contigs
def main(open_name_file, dir_path, kmer_length, x_set):
    """Run the scoring experiment for every genus group and print the scores.

    Steps:

    1. Initialise the k-mer hash via ``DNA.generate_kmer_hash(kmer_length)``.
    2. Parse ``open_name_file`` into one GenomeGroup per ``genus_name:`` line.
       Lines are tab separated: ``family_name:`` and ``genus_name:`` lines
       carry the name in the second field, ``entry:`` lines carry the species
       in the second field and the genome name in the third. A family line
       must precede the first genus line, and a genus line must precede the
       first entry line.
    3. Change the working directory to ``dir_path`` (it is not restored) and,
       for every genome directory, load each file whose first line contains
       neither ``'plasmid'`` nor ``'chromosome 2'``. The first FASTA record
       becomes a DNA object whose signature is calculated.
    4. Run one ``Experiment`` per group against all other groups and write a
       tab-separated header followed by one line per score to stdout.

    Args:
        open_name_file: Iterable of lines describing families, genera and
            genome entries.
        dir_path: Directory holding one sub-directory per genome.
        kmer_length: Passed to ``DNA.generate_kmer_hash``.
        x_set: Passed unchanged to every ``Experiment``.
    """
    groups = []
    DNA.generate_kmer_hash(kmer_length)

    # Read the file with all names, divide them into groups
    for line in open_name_file:
        if line.startswith('family_name:'):
            family = line.split('\t')[1].strip()
        elif line.startswith('genus_name:'):
            genus = line.split('\t')[1].strip()
            new_group = GenomeGroup(genus)
            new_group.family = family
            groups.append(new_group)
        elif line.startswith('entry:'):
            fields = line.split('\t')
            genome_name = fields[2].strip()
            genome_species = fields[1].strip()
            meta_genome = {'id': genome_name,
                           'species': genome_species,
                           'genus': genus,
                           'family': family,
                           'file_name': genome_name,
                           }
            groups[-1].genome_data.append(meta_genome)

    # Each genome in a group is a bin, fit parameters to all bins
    os.chdir(dir_path)
    for group in groups:
        for genome_data in group.genome_data:
            dir_name = genome_data['file_name']
            for fasta_file in os.listdir(dir_name):
                with open(os.path.join(dir_name, fasta_file)) as genome_file:
                    identifier = genome_file.readline()
                    # Only use non-plasmid genomes.
                    # Some bacterial genomes contain more than 1 chromosome,
                    # but assumed not more than 2.
                    if 'plasmid' in identifier or 'chromosome 2' in identifier:
                        continue
                    # Rewind so the parser sees the header line again.
                    genome_file.seek(0)
                    genome_seq = list(SeqIO.parse(genome_file, "fasta"))
                if len(genome_seq) > 1:
                    # '\n' rather than os.linesep: the stream is in text mode
                    # and already translates line endings.
                    sys.stderr.write(
                        "Warning! The file " + fasta_file +
                        " in directory " + dir_name +
                        " contained more than one sequence, ignoring all but the first!" +
                        '\n')
                genome = DNA(id=dir_name, seq=str(genome_seq[0].seq))
                genome.calculate_signature()
                genome.genus = genome_data['genus']
                genome.species = genome_data['species']
                genome.family = genome_data['family']
                group.genomes.append(genome)

    # For each bin, generate a number of contigs,
    # re-calculate parameters for that bin without contig-section.
    # Further score this contig against all bins, keep within-group
    # scores separate from outside-group scores.
    all_scores = []
    # NOTE(review): the meaning of the 1000 passed to Uniq_id is not visible
    # here - confirm against its definition.
    id_generator = Uniq_id(1000)
    for group_index, group in enumerate(groups):
        # Presumably every group except the one at group_index - verify
        # against all_but_index.
        rest_groups = all_but_index(groups, group_index)
        test = Experiment(x_set, group, rest_groups, id_generator)
        all_scores.append(test.execute())

    sys.stdout.write("p_value\tcontig_family\tcontig_genus\tcontig_species\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id" + '\n')
    for group_scores in all_scores:
        for genome_scores in group_scores:
            for score in genome_scores:
                sys.stdout.write(str(score) + '\n')