Example #1
def create_crosstable(sico_files, target_crosstable):
    """Create crosstable with vertically the orthologs, horizontally the genomes, and gene IDs at intersections."""
    with open(target_crosstable, mode='w') as write_handle:
        # Create dictionaries mapping genomes to gene IDs per sico file
        row_data = [(sico_file, dict(itemgetter(0, 2)(fasta_record.id.split('|'))
                         for fasta_record in SeqIO.parse(sico_file, 'fasta')))
                    for sico_file in sico_files]

        # Retrieve unique genomes across all sico files, just to be safe
        genomes = sorted(set(key for row in row_data for key in row[1].keys()))
        genome_dicts = select_genomes_by_ids(genomes).values()

        # Write out values to file
        write_handle.write('\t' + '\t'.join(genomes))
        write_handle.write('\tCOGs\tProduct\n')
        for sico_file, row in row_data:
            ortholog = os.path.split(sico_file)[1].split('.')[0]
            write_handle.write(ortholog + '\t')
            write_handle.write('\t'.join(row.get(genome, '') for genome in genomes))

            # Parse sequence records again, but now to retrieve cogs and products
            seq_records = list(SeqIO.parse(sico_file, 'fasta'))

            # COGs
            cogs = find_cogs_in_sequence_records(seq_records)
            write_handle.write('\t' + ','.join(cogs))

            # Product
            product = get_most_recent_gene_name(genome_dicts, seq_records)
            write_handle.write('\t' + product)

            # New line
            write_handle.write('\n')
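
A minimal invocation sketch: file names are hypothetical, and the module-level imports (os, operator.itemgetter, Bio.SeqIO) plus the project helpers (select_genomes_by_ids, find_cogs_in_sequence_records, get_most_recent_gene_name) are assumed to be in scope:

from glob import glob

sico_files = sorted(glob('sico_orthologs/*.ffn'))  # hypothetical per-ortholog FASTA files
create_crosstable(sico_files, 'crosstable.tsv')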
Example #2
def main(args):
    """Main function called when run from command line or as part of pipeline."""
    usage = """
Usage: concatenate_orthologs.py
--orthologs-zip=FILE     archive of orthologous genes in FASTA format
--coding-regions=FILE    destination file path for archive of trimmed orthologous coding regions per genome
--concatemer=FILE        destination file path for super-concatemer of all genomes
--taxon-a=FILE           destination file path for genome IDs for taxon A
--taxon-b=FILE           destination file path for genome IDs for taxon B
--tree=FILE              destination file path for tree visualization
"""
    options = ['orthologs-zip', 'coding-regions', 'concatemer', 'taxon-a', 'taxon-b', 'tree']
    orthologs_zip, target_coding_regions, target_concat_file, target_taxon_a, target_taxon_b, target_tree = \
        parse_options(usage, options, args)

    # Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='concatemer_tree_')

    # Extract files from zip archive
    temp_dir = create_directory('orthologs', inside_dir=run_dir)
    ortholog_files = extract_archive_of_files(orthologs_zip, temp_dir)

    # Separate out orthologs per genome to create trimmed coding region files per genome
    genome_coding_regions_files = coding_regions_per_genome(run_dir, ortholog_files)
    create_archive_of_files(target_coding_regions, genome_coding_regions_files)

    # Concatenate coding region files per genome
    concatemer_files = concatemer_per_genome(run_dir, genome_coding_regions_files)
    # Create super concatemer
    create_super_concatemer(concatemer_files, target_concat_file)

    # Determine the taxa present in the super concatemer by building a phylogenetic tree from the genome
    # concatemers and reading the genome IDs in the two largest clades.
    super_distance_file = _run_dna_dist(run_dir, target_concat_file)
    super_tree_file = _run_neighbor(run_dir, super_distance_file)
    genome_ids_a, genome_ids_b = _read_taxa_from_tree(super_tree_file)

    # Map Project IDs to Organism names
    id_to_name_map = dict((gid, genome['Organism/Name'])
                          for gid, genome in select_genomes_by_ids(genome_ids_a + genome_ids_b).iteritems())

    # Write Project IDs and Organism Names to files, with a fallback to genome_id for external genome
    with open(target_taxon_a, mode='w') as write_handle:
        for genome_id in genome_ids_a:
            write_handle.write('{id}\t{name}\n'.format(id=genome_id, name=id_to_name_map.get(genome_id, genome_id)))
    with open(target_taxon_b, mode='w') as write_handle:
        for genome_id in genome_ids_b:
            write_handle.write('{id}\t{name}\n'.format(id=genome_id, name=id_to_name_map.get(genome_id, genome_id)))

    # Visualize tree
    visualize_tree(super_tree_file, id_to_name_map, target_tree)

    # Remove unused files to free disk space
    shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info('Produced: \n%s\n%s\n%s\n%s\n%s', target_coding_regions, target_concat_file,
             target_taxon_a, target_taxon_b, target_tree)
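
A hedged sketch of an equivalent direct call, assuming parse_options accepts the long options listed in the usage string; all paths are hypothetical:

main(['--orthologs-zip=orthologs.zip',  # hypothetical input archive
      '--coding-regions=coding_regions.zip',
      '--concatemer=concatemer.fasta',
      '--taxon-a=taxon_a.tsv',
      '--taxon-b=taxon_b.tsv',
      '--tree=tree.pdf'])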
Example #3
def main(args):
    """Main function called when run from command line or as part of pipeline."""
    usage = """
Usage: translate.py
--genomes=FILE         file with selected genome IDs followed by Organism Name on each line
--external-zip=FILE    optional archive of user-provided external genomes containing formatted nucleotide FASTA files
--dna-zip=FILE         destination file path for zip archive of extracted DNA files
--protein-zip=FILE     destination file path for zip archive of translated protein files
"""
    options = ['genomes', 'external-zip=?', 'dna-zip', 'protein-zip']
    genome_ids_file, external_zip, dna_zipfile, protein_zipfile = parse_options(usage, options, args)

    dna_files = []
    protein_files = []

    # Read GenBank Project IDs from genomes_file, each on their own line
    with open(genome_ids_file) as read_handle:
        genome_ids = [line.split()[0] for line in read_handle
                      if not line.startswith('#') and 'external genome' not in line]

        if genome_ids:
            # Retrieve associated genome dictionaries from complete genomes table
            genomes = select_genomes_by_ids(genome_ids).values()
            genomes = sorted(genomes, key=itemgetter('Organism/Name'))

            # Actually translate the genomes to produce a set of DNA and protein files
            dna_files, protein_files = translate_genomes(genomes)

    # Also translate the external genomes
    if external_zip:
        # Extract external genomes archive
        external_dir = tempfile.mkdtemp(prefix='external_genomes_')
        external_dna_files = extract_archive_of_files(external_zip, external_dir)

        # Append IDs of external fasta files to genome IDs file
        _append_external_genomes(external_dna_files, genome_ids_file)

        # Translate individual files
        external_protein_files = [translate_fasta_coding_regions(dna_file) for dna_file in external_dna_files]

        # Add the files to the appropriate collections
        dna_files.extend(external_dna_files)
        protein_files.extend(external_protein_files)

    # Write the produced files to command line argument filenames
    create_archive_of_files(dna_zipfile, dna_files)
    create_archive_of_files(protein_zipfile, protein_files)

    # Do not clean up extracted DNA files or Protein translations: Keep them as cache

    # But do clean up external_dir now that the compressed archives are created
    if external_zip:
        shutil.rmtree(external_dir)

    # Exit after a comforting log message
    log.info("Produced: \n%s &\n%s", dna_zipfile, protein_zipfile)
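
Per the options list, 'external-zip=?' marks --external-zip as optional, so a minimal call can omit it; a hedged sketch with hypothetical paths:

main(['--genomes=genomes.tsv',
      '--dna-zip=dna_files.zip',
      '--protein-zip=protein_files.zip'])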
Example #4
    def test_translate_93125_2(self):
        # Select genomes
        genomes = select_genomes_by_ids(['93125.2']).values()

        # Call translate
        aafiles = translate.translate_genomes(genomes)[1]

        # Verify no header appears twice
        headers = [record.id for record in SeqIO.parse(aafiles[0], 'fasta')]
        self.assertEqual(len(headers), len(set(headers)))
Example #5
def _table_calculations(genome_ids_a, genome_ids_b, sico_files, phipack_values):
    '''Perform calculations for comparison of genome_ids_a with genome_ids_b.'''
    # retrieve genomes once for both
    genomes_a = select_genomes_by_ids(genome_ids_a).values()

    # list to hold the values calculated per file
    calculations = []
    # loop over orthologs
    for sico_file in sico_files:
        # parse alignment
        alignment = AlignIO.read(sico_file, 'fasta')

        # split alignments
        alignment_a = MultipleSeqAlignment(seqr for seqr in alignment if seqr.id.split('|')[0] in genome_ids_a)
        alignment_b = MultipleSeqAlignment(seqr for seqr in alignment if seqr.id.split('|')[0] in genome_ids_b)

        # calculate codeml values
        codeml_values = _get_codeml_values(alignment_a, alignment_b)

        # create a clade_calcs instance to gather per-ortholog values
        instance = clade_calcs(alignment_a, genomes_a)

        # store ortholog name retrieved from filename
        ortholog = os.path.basename(sico_file).split('.')[0]
        instance.values[ORTHOLOG] = ortholog

        # add codeml_values to clade_calcs instance values
        instance.values.update(codeml_values)

        # add phipack values for this file
        instance.values.update(phipack_values[sico_file])

        # add COG digits and letters
        _extract_cog_digits_and_letters(instance)

        # add SFS related values
        _codon_site_freq_spec(instance)

        # add additional derived calculations
        _add_combined_calculations(instance)

        # store the clade_calc values
        calculations.append(instance)

    # calculate sum and mean statistics
    max_nton = len(genome_ids_a) // 2
    sum_stats, mean_stats = _calculcate_mean_and_averages(calculations, max_nton)

    # neutrality index calculation and bootstrapping
    ni_stats, ni_lower_stats, ni_upper_stats = _neutrality_indices(calculations)

    # finally append statistics to calculations so they show up in the file
    calculations.extend((sum_stats, mean_stats, ni_stats, ni_lower_stats, ni_upper_stats))

    return calculations
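
A usage sketch under stated assumptions: genome IDs and file paths are hypothetical, and phipack_values is assumed to be a dict keyed by SICO file path, since the loop above looks up phipack_values[sico_file] (run_phipack as used in Example #8):

sico_files = ['orth_001.ffn', 'orth_002.ffn']  # hypothetical alignments
phipack_values = dict((path, run_phipack('phipack_dir', path)) for path in sico_files)
calculations = _table_calculations(['13305.1', '93125.2'],  # hypothetical taxon A
                                   ['58531.1', '59245.1'],  # hypothetical taxon B
                                   sico_files, phipack_values)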
Example #6
def _occurences_and_cogs(genome_ids, ortholog_files):
    """Generator that yields, per ortholog, the sequence count for each genome (in genome_ids order), the ortholog number, the COGs that occur and the gene product."""
    genomes = select_genomes_by_ids(genome_ids).values()
    for fasta_file in ortholog_files:
        records = tuple(SeqIO.parse(fasta_file, 'fasta'))
        ids = [record.id.split('|')[0] for record in records]
        count_per_id = [ids.count(genome_id) for genome_id in genome_ids]
        cogs = sorted(find_cogs_in_sequence_records(records))
        ortholog_nr = os.path.splitext(os.path.split(fasta_file)[1])[0]
        for record in records:
            # SeqIO truncates ids containing spaces, so assign the full description as the id
            record.id = record.description
        product = get_most_recent_gene_name(genomes, records)
        yield count_per_id, ortholog_nr, cogs, product
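
_occurences_and_cogs is a generator, so results stream one ortholog at a time; a minimal consumption sketch with hypothetical inputs:

genome_ids = ['13305.1', '93125.2']  # hypothetical genome IDs
for counts, ortholog_nr, cogs, product in _occurences_and_cogs(genome_ids, ['orth_001.ffn']):
    print('%s\t%s\t%s\t%s' % (ortholog_nr, counts, ','.join(cogs), product))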
Example #7
    def test_translate_genomes(self):
        # Select genomes
        genomes = select_genomes_by_ids(['13305.1']).values()

        # Call translate
        dnafiles, aafiles = translate.translate_genomes(genomes)

        # Verify expected output
        first_header = '13305.1|NC_008253.1|YP_667942.1|None|thr'
        first = next(SeqIO.parse(dnafiles[0], 'fasta'))
        self.assertEqual(first_header, first.id)
        first = next(SeqIO.parse(aafiles[0], 'fasta'))
        self.assertEqual(first_header, first.id)

        # Verify no header appears twice
        headers = [record.id for record in SeqIO.parse(aafiles[0], 'fasta')]
        self.assertEqual(len(headers), len(set(headers)))
Example #8
def _phipack_for_all_orthologs(run_dir, aligned_files, stats_file):
    """Filter aligned fasta files where there is evidence of recombination when inspecting PhiPack values.
    Return two collections of aligned files, the first without recombination, the second with recombination."""

    log.info('Running PhiPack for %i orthologs to find recombination', len(aligned_files))

    # Create separate directory for phipack related values
    phipack_dir = create_directory('phipack', inside_dir=run_dir)

    with open(stats_file, mode='w') as write_handle:
        write_handle.write('\t'.join(['Ortholog',
                                      'Informative sites',
                                      'Phi',
                                      'Max Chi^2',
                                      'NSS',
                                      'COGs',
                                      'Product']) + '\n')

        # Retrieve unique genomes from first ortholog file
        genome_ids = set(fasta_record.id.split('|')[0] for fasta_record in SeqIO.parse(aligned_files[0], 'fasta'))
        genome_dicts = select_genomes_by_ids(genome_ids).values()

        # Assign ortholog files to the correct collection based on whether they show recombination
        for ortholog_file in aligned_files:
            orth_name = os.path.split(ortholog_file)[1].split('.')[0]

            # Run PhiPack to compute recombination statistics for this ortholog alignment
            phipack_values = run_phipack(phipack_dir, ortholog_file)

            # Write PhiPack values to line
            write_handle.write('{0}\t{1[PhiPack sites]}\t{1[Phi]}\t{1[Max Chi^2]}\t{1[NSS]}'.format(orth_name,
                                                                                                    phipack_values))

            # Parse sequence records again, but now to retrieve cogs and products
            seq_records = list(SeqIO.parse(ortholog_file, 'fasta'))
            # COGs
            cogs = find_cogs_in_sequence_records(seq_records)
            write_handle.write('\t' + ','.join(cogs))
            # Product
            product = get_most_recent_gene_name(genome_dicts, seq_records)
            write_handle.write('\t' + product)

            # End line
            write_handle.write('\n')
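
A short driver sketch (hypothetical paths; the temporary run_dir mirrors the mains above):

import tempfile

run_dir = tempfile.mkdtemp(prefix='recombination_')
_phipack_for_all_orthologs(run_dir,
                           aligned_files=['orth_001.ffn'],  # hypothetical alignments
                           stats_file='phipack_stats.tsv')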