def create_crosstable(sico_files, target_crosstable):
    """Write a tab separated crosstable to target_crosstable.

    Rows are the orthologs (one per sico file), columns are the genomes, and each cell holds the gene ID found for
    that genome in that ortholog (empty when absent). Every row ends with a COGs column and a Product column.
    """
    with open(target_crosstable, mode='w') as write_handle:
        #Per sico file: map the first '|' separated header field (genome) to the third one (gene ID)
        genome_and_gene = itemgetter(0, 2)
        row_data = []
        for sico_file in sico_files:
            genes_by_genome = dict(genome_and_gene(fasta_record.id.split('|'))
                                   for fasta_record in SeqIO.parse(sico_file, 'fasta'))
            row_data.append((sico_file, genes_by_genome))

        #Column order is the sorted union of genomes over all sico files, just to be safe
        genomes = sorted(set(genome for _, genes_by_genome in row_data for genome in genes_by_genome))
        genome_dicts = select_genomes_by_ids(genomes).values()

        #Header: empty corner cell, genomes, then the two annotation columns
        write_handle.write('\t' + '\t'.join(genomes))
        write_handle.write('\tCOGs\tProduct\n')

        for sico_file, genes_by_genome in row_data:
            ortholog = os.path.basename(sico_file).split('.')[0]
            gene_cells = '\t'.join(genes_by_genome.get(genome, '') for genome in genomes)

            #Parse sequence records again, but now to retrieve cogs and products
            seq_records = list(SeqIO.parse(sico_file, 'fasta'))
            cogs = find_cogs_in_sequence_records(seq_records)
            product = get_most_recent_gene_name(genome_dicts, seq_records)

            #Joining the pre-joined gene cells as a single field keeps the column count stable without genomes
            write_handle.write('\t'.join([ortholog, gene_cells, ','.join(cogs), product]) + '\n')
def _extract_cog_digits_and_letters(clade_calcs):
    '''Store the COG digits and COG letters found in clade_calcs.alignment in the clade_calcs.values dictionary.

    Each COG that matches the pattern contributes its 'COG<digits>' part to the COG_DIGITS entry and its trailing
    upper case letters (possibly empty) to the COG_LETTERS entry; both entries are comma separated strings.
    '''
    # Match digits and letters separately; COGs that do not match the pattern are skipped
    matches = (re.match('(COG[0-9]+)([A-Z]*)', cog)
               for cog in find_cogs_in_sequence_records(clade_calcs.alignment))
    digit_letter_pairs = [matchobj.groups() for matchobj in matches if matchobj]

    # Join the found digits and letters using a comma
    clade_calcs.values[COG_DIGITS] = ','.join(digits for digits, _ in digit_letter_pairs)
    clade_calcs.values[COG_LETTERS] = ','.join(letters for _, letters in digit_letter_pairs)
def _occurences_and_cogs(genome_ids, ortholog_files):
    """Generator yielding, per ortholog file, a tuple of (count_per_id, ortholog_nr, cogs, product).

    count_per_id lists how many sequences each genome in genome_ids has in the file, in genome_ids order;
    ortholog_nr is the file name without directory and extension; cogs is the sorted collection of COGs.
    """
    genomes = select_genomes_by_ids(genome_ids).values()
    for fasta_file in ortholog_files:
        records = tuple(SeqIO.parse(fasta_file, 'fasta'))

        #Tally sequences per genome, where the genome is the first '|' separated field of the record id
        sequences_per_genome = {}
        for record in records:
            genome = record.id.split('|')[0]
            sequences_per_genome[genome] = sequences_per_genome.get(genome, 0) + 1
        count_per_id = [sequences_per_genome.get(genome_id, 0) for genome_id in genome_ids]

        #COGs are collected before the record ids are overwritten below
        cogs = sorted(find_cogs_in_sequence_records(records))
        ortholog_nr = os.path.splitext(os.path.basename(fasta_file))[0]

        #SeqIO mucks up ids containing spaces, so we have to assign description as value for id
        for record in records:
            record.id = record.description
        product = get_most_recent_gene_name(genomes, records)

        yield count_per_id, ortholog_nr, cogs, product
def _group_cog_issues(sico_files):
    """Sort SICO files with COG assignment issues into conflicts, transferable and missing COGs.

    Returns a tuple (cog_conflicts, cog_transferable, cog_missing):
    cog_conflicts maps a sico file to its collection of differing COGs, cog_transferable maps a sico file to the
    single COG left once None is removed, and cog_missing lists the sico files for which no COGs were returned.
    """
    cog_conflicts = {}
    cog_transferable = {}
    cog_missing = []
    for sico_file in sico_files:
        #NOTE(review): assumes a mutable set-like collection is returned, as remove() and pop() are used below
        cogs = find_cogs_in_sequence_records(SeqIO.parse(sico_file, 'fasta'), include_none=True)

        if len(cogs) == 0:
            cog_missing.append(sico_file)
        elif len(cogs) > 1:
            #More than one distinct value: either some sequences lack a COG, or COGs genuinely disagree
            some_without_cog = None in cogs
            if some_without_cog:
                cogs.remove(None)
            if some_without_cog and len(cogs) == 1:
                #Exactly one real COG remains, which could be transferred to the sequences without one
                cog_transferable[sico_file] = cogs.pop()
            else:
                cog_conflicts[sico_file] = cogs
    return cog_conflicts, cog_transferable, cog_missing
def _phipack_for_all_orthologs(run_dir, aligned_files, stats_file):
    """Run PhiPack on every aligned ortholog file and write the resulting values to stats_file.

    A 'phipack' directory is created inside run_dir and handed to run_phipack for each file. stats_file receives a
    tab separated header followed by one line per ortholog containing the ortholog name, the PhiPack values
    ('PhiPack sites', 'Phi', 'Max Chi^2', 'NSS'), the comma separated COGs and the product.

    When aligned_files is empty only the header is written. Nothing is returned.
    """
    log.info('Running PhiPack for %i orthologs to find recombination', len(aligned_files))

    #Create separate directory for phipack related values
    phipack_dir = create_directory('phipack', inside_dir=run_dir)

    with open(stats_file, mode='w') as write_handle:
        write_handle.write('\t'.join(['Ortholog',
                                      'Informative sites',
                                      'Phi',
                                      'Max Chi^2',
                                      'NSS',
                                      'COGs',
                                      'Product']) + '\n')

        #Without any ortholog there is no first file to read genomes from, so stop after the header
        if not aligned_files:
            return

        #Retrieve unique genomes from first ortholog file: the first '|' separated field of each record id
        genome_ids = set(fasta_record.id.split('|')[0] for fasta_record in SeqIO.parse(aligned_files[0], 'fasta'))
        genome_dicts = select_genomes_by_ids(genome_ids).values()

        for ortholog_file in aligned_files:
            orth_name = os.path.split(ortholog_file)[1].split('.')[0]

            #Run PhiPack and write its values to the line
            phipack_values = run_phipack(phipack_dir, ortholog_file)
            write_handle.write('{0}\t{1[PhiPack sites]}\t{1[Phi]}\t{1[Max Chi^2]}\t{1[NSS]}'.format(orth_name,
                                                                                                 phipack_values))

            #Parse sequence records again, but now to retrieve cogs and products
            seq_records = list(SeqIO.parse(ortholog_file, 'fasta'))

            #COGs
            cogs = find_cogs_in_sequence_records(seq_records)
            write_handle.write('\t' + ','.join(cogs))

            #Product
            product = get_most_recent_gene_name(genome_dicts, seq_records)
            write_handle.write('\t' + product)

            #End line
            write_handle.write('\n')
def create_crosstable(sico_files, target_crosstable):
    """Write a tab separated crosstable to target_crosstable.

    Rows are the orthologs (one per sico file), columns are the genomes, and each cell holds the gene ID found for
    that genome in that ortholog (empty when absent). Every row ends with a COGs column and a Product column.
    """
    with open(target_crosstable, mode='w') as write_handle:
        #Per sico file: map the first '|' separated header field (genome) to the third one (gene ID)
        genome_and_gene = itemgetter(0, 2)
        row_data = []
        for sico_file in sico_files:
            genes_by_genome = dict(genome_and_gene(fasta_record.id.split('|'))
                                   for fasta_record in SeqIO.parse(sico_file, 'fasta'))
            row_data.append((sico_file, genes_by_genome))

        #Column order is the sorted union of genomes over all sico files, just to be safe
        genomes = sorted(set(genome for _, genes_by_genome in row_data for genome in genes_by_genome))
        genome_dicts = select_genomes_by_ids(genomes).values()

        #Header: empty corner cell, genomes, then the two annotation columns
        write_handle.write('\t' + '\t'.join(genomes))
        write_handle.write('\tCOGs\tProduct\n')

        for sico_file, genes_by_genome in row_data:
            ortholog = os.path.basename(sico_file).split('.')[0]
            gene_cells = '\t'.join(genes_by_genome.get(genome, '') for genome in genomes)

            #Parse sequence records again, but now to retrieve cogs and products
            seq_records = list(SeqIO.parse(sico_file, 'fasta'))
            cogs = find_cogs_in_sequence_records(seq_records)
            product = get_most_recent_gene_name(genome_dicts, seq_records)

            #Joining the pre-joined gene cells as a single field keeps the column count stable without genomes
            write_handle.write('\t'.join([ortholog, gene_cells, ','.join(cogs), product]) + '\n')
def _perform_calculations(alignment, codeml_values):
    """Perform actual calculations on the alignment to determine pN, pS, SFS & the number of ignored cases per SICO.

    The alignment is walked codon by codon. Codons containing anything but resolved nucleotides, or a stop codon in
    any sequence, are skipped. Codons with a single polymorphic site add to the synonymous, four fold degenerate or
    non synonymous site frequency spectrum; codons with several polymorphic sites, or a mix of synonymous and
    non synonymous changes, are only counted.

    alignment is indexed as alignment[0] and alignment[:, start:stop], and its rows expose a .seq attribute.
    codeml_values is passed on untouched to _compute_values_from_statistics.

    Returns the dictionary produced by _compute_values_from_statistics, extended with the keys 'codons',
    'multiple site polymorphisms', the complex codons count, 'cog digits' and 'cog letters'.
    """
    synonymous_sfs = {}
    four_fold_syn_sfs = {}
    non_synonymous_sfs = {}
    four_fold_synonymous_sites = 0
    mixed_synonymous_polymorphisms = 0
    multiple_site_polymorphisms = 0

    #Helpers are defined once here, rather than being recreated for every codon inside the loop
    def _update_sfs_with_local_sfs(sfs, local_sfs):
        """Add values from local_sfs to gene-wide sfs"""
        for maf, count in local_sfs.items():
            sfs[maf] = sfs.get(maf, 0) + count

    def _count_occurrences(items):
        """Return a dictionary mapping each distinct item to the number of times it occurs in items"""
        occurrences = {}
        for item in items:
            occurrences[item] = occurrences.get(item, 0) + 1
        return occurrences

    #As per AEW: ignore codons with gaps, and codons with unresolved bases: Basically anything but ACGT
    #NOTE(review): lower case nucleotides pass this filter; confirm the codon table lookups below expect that
    resolved_nucleotides = frozenset('ACGTactg')

    #Calculate sequence_lengths here so we can handle alignments that are not multiples of three
    sequence_lengths = len(alignment[0]) - len(alignment[0]) % 3

    for index in range(0, sequence_lengths, 3):
        #Slice out the three columns making up this codon across all sequences
        codon_alignment = alignment[:, index:index + 3]

        #Get string representations of codons for simplicity
        codons = [str(seqr.seq) for seqr in codon_alignment]

        #Skip the codon when any sequence has a character outside the resolved nucleotides
        if not resolved_nucleotides.issuperset(''.join(codons)):
            continue

        #Skip codons where any of the alignment codons is a stopcodon, same as in codeml
        if any(codon in BACTERIAL_CODON_TABLE.stop_codons for codon in codons):
            continue

        #Retrieve translations of codons now that inconclusive & stop-codons have been removed
        translations = [BACTERIAL_CODON_TABLE.forward_table.get(codon) for codon in codons]

        #Count unique translations across strains
        translation_usage = _count_occurrences(translations)

        #Mutations are synonymous when all codons encode the same AA, and there are no skipped codons
        synonymous = len(translation_usage) == 1 and len(translations) == len(codon_alignment)

        #Count occurrences of distinct nucleotides across strains, for each of the three sites in the codon
        site_usages = [_count_occurrences(codon_alignment[:, position]) for position in range(3)]

        #Sites are polymorphic if they contain more than one nucleotide
        polymorphisms = [1 < len(site_usage) for site_usage in site_usages]

        #Continue with next codon if none of the sites is polymorphic
        if not any(polymorphisms):
            #But do increase the number of 4-fold synonymous sites if the pattern matches
            if re.match(FOUR_FOLD_DEGENERATE_PATTERN, codons[0]):
                #Increase by one, as this site is four fold degenerate, even if it is not polymorphic
                four_fold_synonymous_sites += 1
            continue

        #Skip multiple site polymorphisms, but do keep a count of how many we encounter
        if polymorphisms.count(True) != 1:
            multiple_site_polymorphisms += 1
            continue

        #Determine which site_usage is the single site polymorphism
        polymorph_site_usage = site_usages[polymorphisms.index(True)]
        site3_polymorphic = polymorphisms[2]

        #Calculate the local site frequency spectrum, to be added to the gene-wide SFS later
        #We'll be using Site Frequency Spectrum to calculate the number of synonymous and non synonymous polymorphisms
        #Note: this requires a complete SFS across synonymous & non_synonymous polymorphisms, be careful when updating
        local_sfs = _count_occurrences(polymorph_site_usage.values())

        #Find the 'reference' nucleotide as (one of) the most occurring occupations in this site
        reference_allele_count = max(local_sfs)

        #Deduct one for the reference_allele_count, which should not count towards the SFS
        local_sfs[reference_allele_count] -= 1

        #Remove empty value as possible result of the above decrement operation
        if local_sfs[reference_allele_count] == 0:
            del local_sfs[reference_allele_count]

        if synonymous:
            #If all polymorphisms encode for the same AA, we have multiple synonymous polymorphisms, where:
            #2 nucleotides = 1 polymorphism, 3 nucleotides = 2 polymorphisms, 4 nucleotides = 3 polymorphisms
            #Update synonymous SFS by adding values from local SFS
            _update_sfs_with_local_sfs(synonymous_sfs, local_sfs)

            #Codon is four fold degenerate if it matches FOUR_FOLD_DEGENERATE_PATTERN
            if site3_polymorphic and re.match(FOUR_FOLD_DEGENERATE_PATTERN, codons[0]):
                #Update four fold degenerate SFS by adding values from local SFS
                _update_sfs_with_local_sfs(four_fold_syn_sfs, local_sfs)
                #Increase the number of four_fold synonymous sites here as well
                four_fold_synonymous_sites += 1
        elif len(polymorph_site_usage) == len(translation_usage):
            #If all polymorphisms encode for different AA, we have multiple non-synonymous polymorphisms, where:
            #2 nucleotides = 1 polymorphism, 3 nucleotides = 2 polymorphisms, 4 nucleotides = 3 polymorphisms
            #Update non synonymous SFS by adding values from local SFS
            _update_sfs_with_local_sfs(non_synonymous_sfs, local_sfs)
        else:
            #Some, but not all polymorphisms encode for different AA, making it unclear how this should be scored
            mixed_synonymous_polymorphisms += 1

    #Compute combined values from the above counted statistics
    computed_values = _compute_values_from_statistics(len(alignment),
                                                      sequence_lengths,
                                                      codeml_values,
                                                      synonymous_sfs,
                                                      non_synonymous_sfs,
                                                      four_fold_syn_sfs,
                                                      four_fold_synonymous_sites)

    #Miscellaneous additional values
    computed_values['codons'] = sequence_lengths // 3
    computed_values['multiple site polymorphisms'] = multiple_site_polymorphisms
    computed_values['complex codons (with both synonymous and non-synonymous polymorphisms segregating)'] = \
        mixed_synonymous_polymorphisms

    #Add COGs to output file in split columns
    cog_digits = []
    cog_letters = []
    for cog in find_cogs_in_sequence_records(alignment):
        matchobj = re.match('(COG[0-9]+)([A-Z]*)', cog)
        if matchobj:
            digits, letters = matchobj.groups()
            cog_digits.append(digits)
            cog_letters.append(letters)
    computed_values['cog digits'] = ','.join(cog_digits)
    computed_values['cog letters'] = ','.join(cog_letters)

    return computed_values
def _perform_calculations(alignment, codeml_values):
    """Perform actual calculations on the alignment to determine pN, pS, SFS & the number of ignored cases per SICO.

    The alignment is walked codon by codon. Codons containing anything but resolved nucleotides, or a stop codon in
    any sequence, are skipped. Codons with a single polymorphic site add to the synonymous, four fold degenerate or
    non synonymous site frequency spectrum; codons with several polymorphic sites, or a mix of synonymous and
    non synonymous changes, are only counted.

    alignment is indexed as alignment[0] and alignment[:, start:stop], and its rows expose a .seq attribute.
    codeml_values is passed on untouched to _compute_values_from_statistics.

    Returns the dictionary produced by _compute_values_from_statistics, extended with the keys 'codons',
    'multiple site polymorphisms', the complex codons count, 'cog digits' and 'cog letters'.
    """
    synonymous_sfs = {}
    four_fold_syn_sfs = {}
    non_synonymous_sfs = {}
    four_fold_synonymous_sites = 0
    mixed_synonymous_polymorphisms = 0
    multiple_site_polymorphisms = 0

    #Helpers are defined once here, rather than being recreated for every codon inside the loop
    def _update_sfs_with_local_sfs(sfs, local_sfs):
        """Add values from local_sfs to gene-wide sfs"""
        for maf, count in local_sfs.items():
            sfs[maf] = sfs.get(maf, 0) + count

    def _count_occurrences(items):
        """Return a dictionary mapping each distinct item to the number of times it occurs in items"""
        occurrences = {}
        for item in items:
            occurrences[item] = occurrences.get(item, 0) + 1
        return occurrences

    #As per AEW: ignore codons with gaps, and codons with unresolved bases: Basically anything but ACGT
    #NOTE(review): lower case nucleotides pass this filter; confirm the codon table lookups below expect that
    resolved_nucleotides = frozenset('ACGTactg')

    #Calculate sequence_lengths here so we can handle alignments that are not multiples of three
    sequence_lengths = len(alignment[0]) - len(alignment[0]) % 3

    for index in range(0, sequence_lengths, 3):
        #Slice out the three columns making up this codon across all sequences
        codon_alignment = alignment[:, index:index + 3]

        #Get string representations of codons for simplicity
        codons = [str(seqr.seq) for seqr in codon_alignment]

        #Skip the codon when any sequence has a character outside the resolved nucleotides
        if not resolved_nucleotides.issuperset(''.join(codons)):
            continue

        #Skip codons where any of the alignment codons is a stopcodon, same as in codeml
        if any(codon in BACTERIAL_CODON_TABLE.stop_codons for codon in codons):
            continue

        #Retrieve translations of codons now that inconclusive & stop-codons have been removed
        translations = [BACTERIAL_CODON_TABLE.forward_table.get(codon) for codon in codons]

        #Count unique translations across strains
        translation_usage = _count_occurrences(translations)

        #Mutations are synonymous when all codons encode the same AA, and there are no skipped codons
        synonymous = len(translation_usage) == 1 and len(translations) == len(codon_alignment)

        #Count occurrences of distinct nucleotides across strains, for each of the three sites in the codon
        site_usages = [_count_occurrences(codon_alignment[:, position]) for position in range(3)]

        #Sites are polymorphic if they contain more than one nucleotide
        polymorphisms = [1 < len(site_usage) for site_usage in site_usages]

        #Continue with next codon if none of the sites is polymorphic
        if not any(polymorphisms):
            #But do increase the number of 4-fold synonymous sites if the pattern matches
            if re.match(FOUR_FOLD_DEGENERATE_PATTERN, codons[0]):
                #Increase by one, as this site is four fold degenerate, even if it is not polymorphic
                four_fold_synonymous_sites += 1
            continue

        #Skip multiple site polymorphisms, but do keep a count of how many we encounter
        if polymorphisms.count(True) != 1:
            multiple_site_polymorphisms += 1
            continue

        #Determine which site_usage is the single site polymorphism
        polymorph_site_usage = site_usages[polymorphisms.index(True)]
        site3_polymorphic = polymorphisms[2]

        #Calculate the local site frequency spectrum, to be added to the gene-wide SFS later
        #We'll be using Site Frequency Spectrum to calculate the number of synonymous and non synonymous polymorphisms
        #Note: this requires a complete SFS across synonymous & non_synonymous polymorphisms, be careful when updating
        local_sfs = _count_occurrences(polymorph_site_usage.values())

        #Find the 'reference' nucleotide as (one of) the most occurring occupations in this site
        reference_allele_count = max(local_sfs)

        #Deduct one for the reference_allele_count, which should not count towards the SFS
        local_sfs[reference_allele_count] -= 1

        #Remove empty value as possible result of the above decrement operation
        if local_sfs[reference_allele_count] == 0:
            del local_sfs[reference_allele_count]

        if synonymous:
            #If all polymorphisms encode for the same AA, we have multiple synonymous polymorphisms, where:
            #2 nucleotides = 1 polymorphism, 3 nucleotides = 2 polymorphisms, 4 nucleotides = 3 polymorphisms
            #Update synonymous SFS by adding values from local SFS
            _update_sfs_with_local_sfs(synonymous_sfs, local_sfs)

            #Codon is four fold degenerate if it matches FOUR_FOLD_DEGENERATE_PATTERN
            if site3_polymorphic and re.match(FOUR_FOLD_DEGENERATE_PATTERN, codons[0]):
                #Update four fold degenerate SFS by adding values from local SFS
                _update_sfs_with_local_sfs(four_fold_syn_sfs, local_sfs)
                #Increase the number of four_fold synonymous sites here as well
                four_fold_synonymous_sites += 1
        elif len(polymorph_site_usage) == len(translation_usage):
            #If all polymorphisms encode for different AA, we have multiple non-synonymous polymorphisms, where:
            #2 nucleotides = 1 polymorphism, 3 nucleotides = 2 polymorphisms, 4 nucleotides = 3 polymorphisms
            #Update non synonymous SFS by adding values from local SFS
            _update_sfs_with_local_sfs(non_synonymous_sfs, local_sfs)
        else:
            #Some, but not all polymorphisms encode for different AA, making it unclear how this should be scored
            mixed_synonymous_polymorphisms += 1

    #Compute combined values from the above counted statistics
    computed_values = _compute_values_from_statistics(len(alignment),
                                                      sequence_lengths,
                                                      codeml_values,
                                                      synonymous_sfs,
                                                      non_synonymous_sfs,
                                                      four_fold_syn_sfs,
                                                      four_fold_synonymous_sites)

    #Miscellaneous additional values
    computed_values['codons'] = sequence_lengths // 3
    computed_values['multiple site polymorphisms'] = multiple_site_polymorphisms
    computed_values['complex codons (with both synonymous and non-synonymous polymorphisms segregating)'] = \
        mixed_synonymous_polymorphisms

    #Add COGs to output file in split columns
    cog_digits = []
    cog_letters = []
    for cog in find_cogs_in_sequence_records(alignment):
        matchobj = re.match('(COG[0-9]+)([A-Z]*)', cog)
        if matchobj:
            digits, letters = matchobj.groups()
            cog_digits.append(digits)
            cog_letters.append(letters)
    computed_values['cog digits'] = ','.join(cog_digits)
    computed_values['cog letters'] = ','.join(cog_letters)

    return computed_values