Пример #1
0
def create_crosstable(sico_files, target_crosstable):
    """Write a tab separated crosstable: one row per ortholog, one column per genome, gene IDs in the cells.

    Every row is closed by two extra columns holding the COGs and the product of the ortholog.

    sico_files -- fasta files, one per ortholog, whose record ids have at least three '|' separated fields
    target_crosstable -- path of the file to write, which is overwritten
    """
    with open(target_crosstable, mode='w') as write_handle:
        #Per sico file, map the first '|' separated field of each record id (genome) to the third (gene ID)
        row_data = []
        for sico_file in sico_files:
            gene_per_genome = {}
            for fasta_record in SeqIO.parse(sico_file, 'fasta'):
                id_fields = fasta_record.id.split('|')
                gene_per_genome[id_fields[0]] = id_fields[2]
            row_data.append((sico_file, gene_per_genome))

        #Gather every genome seen in any of the files; sorted, they become the column headers
        unique_genomes = set()
        for _, gene_per_genome in row_data:
            unique_genomes.update(gene_per_genome)
        genomes = sorted(unique_genomes)
        genome_dicts = select_genomes_by_ids(genomes).values()

        #Header row: empty corner cell, the genomes, then the two trailing columns
        write_handle.write('\t' + '\t'.join(genomes) + '\tCOGs\tProduct\n')

        for sico_file, gene_per_genome in row_data:
            #Ortholog name is the file name up to its first dot
            ortholog = os.path.basename(sico_file).split('.')[0]
            #Genomes absent from this ortholog get an empty cell
            cells = [gene_per_genome.get(genome, '') for genome in genomes]
            write_handle.write(ortholog + '\t' + '\t'.join(cells))

            #Second pass over the file, this time keeping the records for the COG and product lookups
            seq_records = list(SeqIO.parse(sico_file, 'fasta'))

            #COGs column
            cogs = find_cogs_in_sequence_records(seq_records)
            write_handle.write('\t' + ','.join(cogs))

            #Product column, which also ends the row
            product = get_most_recent_gene_name(genome_dicts, seq_records)
            write_handle.write('\t' + product + '\n')
Пример #2
0
    def __init__(self, alignment, genomes):
        """Keep the alignment, derive its dimensions and start the values mapping with codon count and product.

        alignment -- indexable collection of aligned sequences; its first entry determines the sequence length
        genomes -- genomes handed to get_most_recent_gene_name to look up the product for this alignment
        """
        self.alignment = alignment
        self.nr_of_strains = len(alignment)
        #Only the first sequence is measured; presumably all sequences in an alignment are equally long
        first_sequence = alignment[0]
        self.sequence_lengths = len(first_sequence)

        #Any key that was never assigned reads as 0
        self.values = defaultdict(int)
        self.values.update({
            #The most basic calculation added to the output file
            CODONS: self.sequence_lengths // 3,
            #Most recent gene name for the strains in this instance
            PRODUCT: get_most_recent_gene_name(genomes, self.alignment),
        })
Пример #3
0
 def _occurences_and_cogs(genome_ids, ortholog_files):
     """Generator yielding, per ortholog file, the sequence count per genome, ortholog name, sorted COGs and product.

     The counts are listed in the order of genome_ids.
     """
     genomes = select_genomes_by_ids(genome_ids).values()
     for fasta_file in ortholog_files:
         records = tuple(SeqIO.parse(fasta_file, 'fasta'))

         #The genome of a record is the part of its id in front of the first '|'
         record_genomes = [record.id.partition('|')[0] for record in records]
         count_per_id = [record_genomes.count(genome_id) for genome_id in genome_ids]

         #COGs are looked up before the record ids are overwritten below
         cogs = sorted(find_cogs_in_sequence_records(records))

         #Ortholog name is the file name stripped of directory and extension
         file_name = os.path.basename(fasta_file)
         ortholog_nr = os.path.splitext(file_name)[0]

         #SeqIO mucks up ids containing spaces, so the full description takes the place of the id
         for record in records:
             record.id = record.description
         product = get_most_recent_gene_name(genomes, records)

         yield count_per_id, ortholog_nr, cogs, product
Пример #4
0
def _phipack_for_all_orthologs(run_dir, aligned_files, stats_file):
    """Run PhiPack on each aligned ortholog file and record its values, COGs and product per ortholog in stats_file.

    run_dir -- directory inside which a 'phipack' directory is created for PhiPack related values
    aligned_files -- sequence of aligned fasta files, one per ortholog; when empty only the header row is written
    stats_file -- path of the tab separated output file, which is overwritten

    Nothing is returned: the stats file is the only result of this function.
    """

    log.info('Running PhiPack for %i orthologs to find recombination', len(aligned_files))

    #Create separate directory for phipack related values
    phipack_dir = create_directory('phipack', inside_dir=run_dir)

    with open(stats_file, mode='w') as write_handle:
        write_handle.write('\t'.join(['Ortholog',
                                      'Informative sites',
                                      'Phi',
                                      'Max Chi^2',
                                      'NSS',
                                      'COGs',
                                      'Product']) + '\n')

        #Without orthologs there is no first file to read the genomes from: leave a header-only stats file
        if not aligned_files:
            return

        #Retrieve unique genomes from first ortholog file; the genome is the id part before the first '|'
        genome_ids = set(fasta_record.id.split('|')[0] for fasta_record in SeqIO.parse(aligned_files[0], 'fasta'))
        genome_dicts = select_genomes_by_ids(genome_ids).values()

        #Write one row of PhiPack values, COGs and product for every ortholog file
        for ortholog_file in aligned_files:
            #Ortholog name is the file name up to its first dot
            orth_name = os.path.split(ortholog_file)[1].split('.')[0]

            #Run PhiPack for this ortholog file
            phipack_values = run_phipack(phipack_dir, ortholog_file)

            #Write PhiPack values to line
            write_handle.write('{0}\t{1[PhiPack sites]}\t{1[Phi]}\t{1[Max Chi^2]}\t{1[NSS]}'.format(orth_name,
                                                                                                    phipack_values))

            #Parse sequence records again, but now to retrieve cogs and products
            seq_records = list(SeqIO.parse(ortholog_file, 'fasta'))
            #COGs
            cogs = find_cogs_in_sequence_records(seq_records)
            write_handle.write('\t' + ','.join(cogs))
            #Product
            product = get_most_recent_gene_name(genome_dicts, seq_records)
            write_handle.write('\t' + product)

            #End line
            write_handle.write('\n')
Пример #5
0
def create_crosstable(sico_files, target_crosstable):
    """Create a tab separated crosstable with the orthologs as rows, the genomes as columns and gene IDs as cells.

    Two trailing columns per row hold the COGs and the product of the ortholog.
    """
    def _genome_gene_pairs(sico_file):
        """Yield the first and third '|' separated field of every record id in the fasta file."""
        for fasta_record in SeqIO.parse(sico_file, 'fasta'):
            parts = fasta_record.id.split('|')
            yield parts[0], parts[2]

    with open(target_crosstable, mode='w') as write_handle:
        emit = write_handle.write

        #One genome to gene ID dictionary per sico file, kept alongside the file it came from
        row_data = [(sico_file, dict(_genome_gene_pairs(sico_file)))
                    for sico_file in sico_files]

        #Column headers are the sorted unique genomes found over all sico files
        genomes = sorted(set(genome
                             for _, genes in row_data
                             for genome in genes))
        genome_dicts = select_genomes_by_ids(genomes).values()

        #Header row starts with an empty cell above the ortholog names
        emit('\t' + '\t'.join(genomes) + '\tCOGs\tProduct\n')

        for sico_file, genes in row_data:
            #File name up to the first dot names the ortholog
            file_name = os.path.basename(sico_file)
            ortholog = file_name.split('.')[0]
            emit(ortholog + '\t' + '\t'.join(genes.get(genome, '')
                                             for genome in genomes))

            #Read the records a second time, as COGs and product need the complete records
            seq_records = list(SeqIO.parse(sico_file, 'fasta'))

            #COGs
            emit('\t' + ','.join(find_cogs_in_sequence_records(seq_records)))

            #Product, followed by the end of the row
            product = get_most_recent_gene_name(genome_dicts, seq_records)
            emit('\t' + product + '\n')
Пример #6
0
def calculate_tables(genome_ids_a, genome_ids_b, sico_files, oddeven=False):
    """Compute a spreadsheet of data points each for A and B based the SICO files, without duplicating computations.

    genome_ids_a -- genome IDs, matched against the first '|' separated field of the sequence ids, forming clade A
    genome_ids_b -- same, for clade B
    sico_files -- aligned fasta files, one per ortholog; the file name up to its first dot is the ortholog name
    oddeven -- when true, additionally calculate tables for the odd and for the even codons of every alignment

    Returns the pair of tables from _tables_for_split_alignments for the full alignments. When oddeven is true,
    returns instead the paths of two new temporary .tsv files that are passed to concatenate() together with the
    full, odd and even tables of A and of B respectively; removing those two files is left to the caller.
    """
    #Convert file names into identifiers while preserving filenames, as filenames are used both for BioPython & PhiPack
    orth_files = [(os.path.split(sico_file)[1].split('.')[0], sico_file)
                  for sico_file in sico_files]

    #Find PhiPack values for each sico file
    orth_phipack_values = _phipack_values_for_sicos(orth_files)

    #Convert list of sico files into ortholog name mapped to BioPython Alignment object
    sico_alignments = [(ortholog, AlignIO.read(sico_file, 'fasta'))
                       for ortholog, sico_file in orth_files]

    #Only retrieve genomes once which we'll use to link gene names to orthologs
    all_genome_ids = list(genome_ids_a)
    all_genome_ids.extend(genome_ids_b)
    genomes = select_genomes_by_ids(all_genome_ids).values()

    #For each ortholog, determine the newest gene name across taxa so unannotated taxa also get gene names
    ortholog_gene_names = dict(
        (ortholog, get_most_recent_gene_name(genomes, alignmnt))
        for ortholog, alignmnt in sico_alignments)

    #Split individual sico alignments into separate alignments for each of the clades per ortholog
    #These split alignments can later be reversed and/or subselections can be made to calculate for alternate alignments
    split_alignments = [
        (ortholog,
         MultipleSeqAlignment(seqr for seqr in alignmnt
                              if seqr.id.split('|')[0] in genome_ids_a),
         MultipleSeqAlignment(seqr for seqr in alignmnt
                              if seqr.id.split('|')[0] in genome_ids_b))
        for ortholog, alignmnt in sico_alignments
    ]

    #Calculate tables for normal sico alignments
    log.info('Starting calculations for full alignments')
    table_a, table_b = _tables_for_split_alignments(split_alignments,
                                                    ortholog_gene_names,
                                                    orth_phipack_values)

    if not oddeven:
        return table_a, table_b

    #As an alternate method of calculating number of substitutions for independent X-axis of eventual graph:
    #split each alignment for a and b into two further alignments of odd and even codons
    odd_even_split_orth_alignments = [
        (orthologname, _every_other_codon_alignments(alignment_x),
         _every_other_codon_alignments(alignment_y))
        for orthologname, alignment_x, alignment_y in split_alignments
    ]

    #Recover odd alignments as first from each pair of alignments
    odd_split_alignments = [(orthologname, odd_even_x[0], odd_even_y[0])
                            for orthologname, odd_even_x, odd_even_y in
                            odd_even_split_orth_alignments]

    #Create files for all the odd codon alignments, so we can run PhiPack for them
    odd_alignments_dir = tempfile.mkdtemp(prefix='odd_codon_alignments_')
    try:
        odd_files = dict(
            (ortholog, os.path.join(odd_alignments_dir, ortholog + '.ffn'))
            for ortholog, odd_x, odd_y in odd_split_alignments)
        for ortholog, odd_x, odd_y in odd_split_alignments:
            AlignIO.write([odd_x, odd_y], odd_files[ortholog], 'fasta')
        odd_phipack_vals = _phipack_values_for_sicos(odd_files.items())
    finally:
        #Remove the temporary directory even when writing or PhiPack failed
        shutil.rmtree(odd_alignments_dir)

    #Calculate tables for odd codon sico alignments
    log.info('Starting calculations for odd alignments')
    table_a_odd, table_b_odd = _tables_for_split_alignments(
        odd_split_alignments, ortholog_gene_names, odd_phipack_vals)

    #Recover even alignments as second from each pair of alignments
    even_split_alignments = [(orthologname, odd_even_x[1], odd_even_y[1])
                             for orthologname, odd_even_x, odd_even_y in
                             odd_even_split_orth_alignments]

    #Create files for all the even codon alignments, so we can run PhiPack for them
    even_alignments_dir = tempfile.mkdtemp(prefix='even_codon_alignments_')
    try:
        even_files = dict(
            (ortholog, os.path.join(even_alignments_dir, ortholog + '.ffn'))
            for ortholog, even_x, even_y in even_split_alignments)
        for ortholog, even_x, even_y in even_split_alignments:
            AlignIO.write([even_x, even_y], even_files[ortholog], 'fasta')
        even_phipack_vals = _phipack_values_for_sicos(even_files.items())
    finally:
        #Remove the temporary directory even when writing or PhiPack failed
        shutil.rmtree(even_alignments_dir)

    #Calculate tables for even codon sico alignments
    log.info('Starting calculations for even alignments')
    table_a_even, table_b_even = _tables_for_split_alignments(
        even_split_alignments, ortholog_gene_names, even_phipack_vals)

    #Concatenate tables and return their values
    #mkstemp hands back an open OS-level file descriptor next to the path: close it, as only the path is used
    table_a_fd, table_a_full = tempfile.mkstemp(suffix='.tsv',
                                                prefix='table_a_full_')
    os.close(table_a_fd)
    table_b_fd, table_b_full = tempfile.mkstemp(suffix='.tsv',
                                                prefix='table_b_full_')
    os.close(table_b_fd)
    concatenate(table_a_full, [table_a, table_a_odd, table_a_even])
    concatenate(table_b_full, [table_b, table_b_odd, table_b_even])
    return table_a_full, table_b_full
Пример #7
0
def calculate_tables(genome_ids_a, genome_ids_b, sico_files, oddeven=False):
    """Compute a spreadsheet of data points each for A and B based the SICO files, without duplicating computations.

    genome_ids_a -- genome IDs, matched against the first '|' separated field of the sequence ids, forming clade A
    genome_ids_b -- same, for clade B
    sico_files -- aligned fasta files, one per ortholog; the file name up to its first dot is the ortholog name
    oddeven -- when true, additionally calculate tables for the odd and for the even codons of every alignment

    Returns the pair of tables from _tables_for_split_alignments for the full alignments. When oddeven is true,
    returns instead the paths of two new temporary .tsv files that are passed to concatenate() together with the
    full, odd and even tables of A and of B respectively; removing those two files is left to the caller.
    """
    #Convert file names into identifiers while preserving filenames, as filenames are used both for BioPython & PhiPack
    orth_files = [(os.path.split(sico_file)[1].split('.')[0], sico_file) for sico_file in sico_files]

    #Find PhiPack values for each sico file
    orth_phipack_values = _phipack_values_for_sicos(orth_files)

    #Convert list of sico files into ortholog name mapped to BioPython Alignment object
    sico_alignments = [(ortholog, AlignIO.read(sico_file, 'fasta'))
                       for ortholog, sico_file in orth_files]

    #Only retrieve genomes once which we'll use to link gene names to orthologs
    all_genome_ids = list(genome_ids_a)
    all_genome_ids.extend(genome_ids_b)
    genomes = select_genomes_by_ids(all_genome_ids).values()

    #For each ortholog, determine the newest gene name across taxa so unannotated taxa also get gene names
    ortholog_gene_names = dict((ortholog, get_most_recent_gene_name(genomes, alignmnt))
                               for ortholog, alignmnt in sico_alignments)

    #Split individual sico alignments into separate alignments for each of the clades per ortholog
    #These split alignments can later be reversed and/or subselections can be made to calculate for alternate alignments
    split_alignments = [(ortholog,
                         MultipleSeqAlignment(seqr for seqr in alignmnt if seqr.id.split('|')[0] in genome_ids_a),
                         MultipleSeqAlignment(seqr for seqr in alignmnt if seqr.id.split('|')[0] in genome_ids_b))
                        for ortholog, alignmnt in sico_alignments]

    #Calculate tables for normal sico alignments
    log.info('Starting calculations for full alignments')
    table_a, table_b = _tables_for_split_alignments(split_alignments, ortholog_gene_names, orth_phipack_values)

    if not oddeven:
        return table_a, table_b

    #As an alternate method of calculating number of substitutions for independent X-axis of eventual graph:
    #split each alignment for a and b into two further alignments of odd and even codons
    odd_even_split_orth_alignments = [(orthologname,
                                      _every_other_codon_alignments(alignment_x),
                                      _every_other_codon_alignments(alignment_y))
                                      for orthologname, alignment_x, alignment_y in split_alignments]

    #Recover odd alignments as first from each pair of alignments
    odd_split_alignments = [(orthologname,
                            odd_even_x[0],
                            odd_even_y[0])
                            for orthologname, odd_even_x, odd_even_y in odd_even_split_orth_alignments]

    #Create files for all the odd codon alignments, so we can run PhiPack for them
    odd_alignments_dir = tempfile.mkdtemp(prefix='odd_codon_alignments_')
    try:
        odd_files = dict((ortholog, os.path.join(odd_alignments_dir, ortholog + '.ffn'))
                         for ortholog, odd_x, odd_y in odd_split_alignments)
        for ortholog, odd_x, odd_y in odd_split_alignments:
            AlignIO.write([odd_x, odd_y], odd_files[ortholog], 'fasta')
        odd_phipack_vals = _phipack_values_for_sicos(odd_files.items())
    finally:
        #Remove the temporary directory even when writing or PhiPack failed
        shutil.rmtree(odd_alignments_dir)

    #Calculate tables for odd codon sico alignments
    log.info('Starting calculations for odd alignments')
    table_a_odd, table_b_odd = _tables_for_split_alignments(odd_split_alignments, ortholog_gene_names, odd_phipack_vals)

    #Recover even alignments as second from each pair of alignments
    even_split_alignments = [(orthologname,
                            odd_even_x[1],
                            odd_even_y[1])
                            for orthologname, odd_even_x, odd_even_y in odd_even_split_orth_alignments]

    #Create files for all the even codon alignments, so we can run PhiPack for them
    even_alignments_dir = tempfile.mkdtemp(prefix='even_codon_alignments_')
    try:
        even_files = dict((ortholog, os.path.join(even_alignments_dir, ortholog + '.ffn'))
                          for ortholog, even_x, even_y in even_split_alignments)
        for ortholog, even_x, even_y in even_split_alignments:
            AlignIO.write([even_x, even_y], even_files[ortholog], 'fasta')
        even_phipack_vals = _phipack_values_for_sicos(even_files.items())
    finally:
        #Remove the temporary directory even when writing or PhiPack failed
        shutil.rmtree(even_alignments_dir)

    #Calculate tables for even codon sico alignments
    log.info('Starting calculations for even alignments')
    table_a_even, table_b_even = _tables_for_split_alignments(even_split_alignments,
                                                              ortholog_gene_names,
                                                              even_phipack_vals)

    #Concatenate tables and return their values
    #mkstemp hands back an open OS-level file descriptor next to the path: close it, as only the path is used
    table_a_fd, table_a_full = tempfile.mkstemp(suffix='.tsv', prefix='table_a_full_')
    os.close(table_a_fd)
    table_b_fd, table_b_full = tempfile.mkstemp(suffix='.tsv', prefix='table_b_full_')
    os.close(table_b_fd)
    concatenate(table_a_full, [table_a, table_a_odd, table_a_even])
    concatenate(table_b_full, [table_b, table_b_odd, table_b_even])
    return table_a_full, table_b_full