예제 #1
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Reformats the headers of every user supplied `label:file` genome and stores the formatted fasta files in a
    single archive at the --external-zip path. The temporary formatted files are always removed afterwards.

    args -- command line arguments, handed to parse_options together with the usage text below
    """
    usage = """
Usage: select_taxa.py
--external-genomes=    comma-separated list of label:nucleotide fasta file pairs of externally supplied genomes.
    label:FILE,...     labels should be unique as genomes will be identified by this label in further output files
--external-zip=FILE    destination path for archive of user provided external genomes containing formatted nucleotide fasta files
"""
    options = ['external-genomes', 'external-zip']
    external_genomes, external_zip = parse_options(usage, options, args)

    #External genomes are nucleotide fasta files uploaded by the user of which we will reformat the header
    external_fasta_files = {}

    try:
        #Handle externally uploaded genomes
        #Sample line: label1:file1,label2:file2, #Note the trailing , that's a Galaxy artifact we'll ignore
        for label_file in external_genomes.split(','):
            if not label_file:
                continue

            #Split on the first colon only, so a colon inside the file path does not break the unpacking
            label, filename = label_file.split(':', 1)
            if not label:
                log.error('Empty label provided for upload genome %s. Please provide a label and try again.', filename)
                #Stop handling further genomes; genomes formatted so far are still archived below
                break
            log.info('Formatting external genome labeled %s at %s', label, filename)
            external_fasta_files[label] = format_fasta_genome_headers(label, filename)

        #Copy formatted external genome files to archive that will be output as well
        create_archive_of_files(external_zip, external_fasta_files.values())
    finally:
        #Remove temporary formatted files, also when formatting or archiving failed halfway
        for formatted_file in external_fasta_files.values():
            os.remove(formatted_file)

    #Exit after a comforting log message
    log.info("Produced: \n%s", external_zip)
예제 #2
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Extracts the orthologs archive, writes coding regions per genome and a super concatemer, builds a tree from
    that concatemer, and writes the genome IDs found for taxon A & B along with a tree visualization. All
    intermediate files live in a temporary run directory that is removed afterwards, even on failure.

    args -- command line arguments, handed to parse_options together with the usage text below
    """
    usage = """
Usage: concatenate_orthologs.py
--orthologs-zip=FILE     archive of orthologous genes in FASTA format
--coding-regions=FILE    destination file path archive of trimmed orthologous coding regions per genomes
--concatemer=FILE        destination file path for super-concatemer of all genomes
--taxon-a=FILE           destination file path for genome IDs for taxon A
--taxon-b=FILE           destination file path for genome IDs for taxon B
--tree=FILE              destination file path for tree visualization
"""
    options = ['orthologs-zip', 'coding-regions', 'concatemer', 'taxon-a', 'taxon-b', 'tree']
    orthologs_zip, target_coding_regions, target_concat_file, target_taxon_a, target_taxon_b, target_tree = \
        parse_options(usage, options, args)

    #Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='concatemer_tree_')
    try:
        #Extract files from zip archive
        temp_dir = create_directory('orthologs', inside_dir=run_dir)
        ortholog_files = extract_archive_of_files(orthologs_zip, temp_dir)

        #Separate out orthologs per genome to create trimmed coding region files per genome
        genome_coding_regions_files = coding_regions_per_genome(run_dir, ortholog_files)
        create_archive_of_files(target_coding_regions, genome_coding_regions_files)

        #Concatenate coding region files per genome
        concatemer_files = concatemer_per_genome(run_dir, genome_coding_regions_files)
        #Create super concatemer
        create_super_concatemer(concatemer_files, target_concat_file)

        #Determine the taxa present in the super concatemer tree by building a phylogenetic tree from genome
        #concatemer and reading genome ids in the two largest clades.
        super_distance_file = _run_dna_dist(run_dir, target_concat_file)
        super_tree_file = _run_neighbor(run_dir, super_distance_file)
        genome_ids_a, genome_ids_b = _read_taxa_from_tree(super_tree_file)

        #Map Project IDs to Organism names; items() rather than iteritems() works on Python 2 and 3 alike
        selected_genomes = select_genomes_by_ids(genome_ids_a + genome_ids_b)
        id_to_name_map = dict((gid, genome['Organism/Name']) for gid, genome in selected_genomes.items())

        #Write Project IDs and Organism Names to files, with a fallback to genome_id for external genome
        for target_file, genome_ids in ((target_taxon_a, genome_ids_a), (target_taxon_b, genome_ids_b)):
            with open(target_file, mode='w') as write_handle:
                for genome_id in genome_ids:
                    write_handle.write('{id}\t{name}\n'.format(id=genome_id,
                                                               name=id_to_name_map.get(genome_id, genome_id)))

        #Visualize tree
        visualize_tree(super_tree_file, id_to_name_map, target_tree)
    finally:
        #Remove unused files to free disk space, also when one of the steps above failed
        shutil.rmtree(run_dir)

    #Exit after a comforting log message
    log.info('Produced: \n%s\n%s\n%s\n%s\n%s', target_coding_regions, target_concat_file,
             target_taxon_a, target_taxon_b, target_tree)
예제 #3
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Translates the genomes listed in the --genomes file, optionally adds the genomes from the --external-zip
    archive, and writes archives of the resulting DNA and protein files.

    args -- command line arguments, handed to parse_options together with the usage text below
    """
    usage = """
Usage: translate.py
--genomes=FILE         file with selected genome IDs followed by Organism Name on each line
--external-zip=FILE    optional archive of user provided external genomes containing formatted nucleotide fasta files
--dna-zip=FILE         destination file path for zip archive of extracted DNA files
--protein-zip=FILE     destination file path for zip archive of translated protein files
"""
    options = ['genomes', 'external-zip=?', 'dna-zip', 'protein-zip']
    genome_ids_file, external_zip, dna_zipfile, protein_zipfile = parse_options(usage, options, args)

    dna_files = []
    protein_files = []

    #Read GenBank Project IDs from genomes_file, each on their own line
    #Blank lines are skipped, as line.split()[0] would otherwise raise an IndexError for them
    with open(genome_ids_file) as read_handle:
        genome_ids = [line.split()[0] for line in read_handle
                      if line.strip() and not line.startswith('#') and 'external genome' not in line]

    #The genome IDs file is closed by now: there is no need to keep it open while translating
    if genome_ids:
        #Retrieve associated genome dictionaries from complete genomes table
        genomes = select_genomes_by_ids(genome_ids).values()
        genomes = sorted(genomes, key=itemgetter('Organism/Name'))

        #Actually translate the genomes to produce a set of files for both dna files & protein files
        dna_files, protein_files = translate_genomes(genomes)

    external_dir = None
    try:
        #Also translate the external genomes
        if external_zip:
            #Extract external genomes archive
            external_dir = tempfile.mkdtemp(prefix='external_genomes_')
            external_dna_files = extract_archive_of_files(external_zip, external_dir)

            #Append IDs of external fasta files to genome IDs file
            _append_external_genomes(external_dna_files, genome_ids_file)

            #Translate individual files
            external_protein_files = [translate_fasta_coding_regions(dna_file) for dna_file in external_dna_files]

            #Add the files to the appropriate collections
            dna_files.extend(external_dna_files)
            protein_files.extend(external_protein_files)

        #Write the produced files to command line argument filenames
        create_archive_of_files(dna_zipfile, dna_files)
        create_archive_of_files(protein_zipfile, protein_files)
    finally:
        #Do not clean up extracted DNA files or Protein translations: Keep them as cache

        #But do clean up external_dir once we are done with it, also when a step above failed
        if external_dir:
            shutil.rmtree(external_dir)

    #Exit after a comforting log message
    log.info("Produced: \n%s &\n%s", dna_zipfile, protein_zipfile)
예제 #4
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Runs codeml for the SICO files in the --sico-zip archive using the genome IDs of both taxa, then writes the
    dN/dS values to --dnds-stats and archives the codeml output at --codeml-zip. The temporary run directory is
    removed afterwards, even on failure.

    args -- command line arguments, handed to parse_options together with the usage text below
    """
    usage = """
Usage: run_codeml.py
--genomes-a=FILE     file with GenBank Project IDs from complete genomes table on each line for taxon A
--genomes-b=FILE     file with GenBank Project IDs from complete genomes table on each line for taxon B
--sico-zip=FILE      archive of aligned & trimmed single copy orthologous (SICO) genes
--codeml-zip=FILE     destination file path for archive of codeml output per SICO gene
--dnds-stats=FILE     destination file path for file with dN, dS & dN/dS values per SICO gene
"""
    options = [
        'genomes-a', 'genomes-b', 'sico-zip', 'codeml-zip', 'dnds-stats'
    ]
    genome_a_ids_file, genome_b_ids_file, sico_zip, codeml_zip, dnds_file = parse_options(
        usage, options, args)

    # Parse file to extract GenBank Project IDs: the first whitespace separated field of each line
    # Blank lines are skipped, as line.split()[0] would otherwise raise an IndexError for them
    with open(genome_a_ids_file) as read_handle:
        genome_ids_a = [line.split()[0] for line in read_handle if line.strip()]
    with open(genome_b_ids_file) as read_handle:
        genome_ids_b = [line.split()[0] for line in read_handle if line.strip()]

    # Create run_dir to hold files relating to this run
    run_dir = tempfile.mkdtemp(prefix='run_codeml_')
    try:
        # Extract files from zip archive
        sico_files = extract_archive_of_files(
            sico_zip, create_directory('sicos', inside_dir=run_dir))

        # Actually run codeml
        codeml_files = run_codeml_for_sicos(run_dir, genome_ids_a, genome_ids_b,
                                            sico_files)

        # Write dnds values to single output file
        _write_dnds_per_ortholog(dnds_file, codeml_files)

        # Write the produced files to command line argument filenames
        create_archive_of_files(codeml_zip, codeml_files)
    finally:
        # Remove unused files to free disk space, also when one of the steps above failed
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info("Produced: \n%s\n%s", codeml_zip, dnds_file)
예제 #5
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Splits the alignments in the --orthologs-zip archive by taxon, using the genome IDs and the common organism
    name prefix read from both genome files, and archives the resulting files per taxon. The temporary run
    directory is removed afterwards, even on failure.

    args -- command line arguments, handed to parse_options together with the usage text below

    Returns the tuple (taxon_a_zip, taxon_b_zip) of archive paths that were written.
    """
    usage = """
Usage: split_by_taxa.py
--genomes-a=FILE        file with genome GenBank Project ID and Organism name on each line for taxon A
--genomes-b=FILE        file with genome GenBank Project ID and Organism name on each line for taxon B
--orthologs-zip=FILE    archive of aligned & trimmed single copy orthologous (SICO) genes
--taxon-a-zip=FILE      destination file path for archive of SICO genes belonging to taxon A
--taxon-b-zip=FILE      destination file path for archive of SICO genes belonging to taxon B
"""
    options = ['genomes-a', 'genomes-b', 'orthologs-zip', 'taxon-a-zip', 'taxon-b-zip']
    genome_a_ids_file, genome_b_ids_file, orthologs_zip, taxon_a_zip, taxon_b_zip = parse_options(usage, options, args)

    #Parse file containing RefSeq project IDs to extract RefSeq project IDs & organism names
    #The line terminator is dropped first, so the organism name in the last column carries no trailing newline
    #Blank lines are skipped, as they hold neither an ID nor a name
    with open(genome_a_ids_file) as read_handle:
        lines = [line.rstrip('\r\n').split('\t') for line in read_handle if line.strip()]
        genome_ids_a = [line[0] for line in lines]
        common_prefix_a = _common_prefix([line[1] for line in lines], 'taxon_a')
    with open(genome_b_ids_file) as read_handle:
        lines = [line.rstrip('\r\n').split('\t') for line in read_handle if line.strip()]
        genome_ids_b = [line[0] for line in lines]
        common_prefix_b = _common_prefix([line[1] for line in lines], 'taxon_b')

    #Create run_dir to hold files related to this run
    run_dir = tempfile.mkdtemp(prefix='split_by_taxa_')
    try:
        #Extract files from zip archive
        ortholog_files = extract_archive_of_files(orthologs_zip, create_directory('alignments', inside_dir=run_dir))

        #Actually split alignments per taxon
        taxon_a_files, taxon_b_files = split_alignment_by_taxa(run_dir, ortholog_files,
                                                               (genome_ids_a, common_prefix_a),
                                                               (genome_ids_b, common_prefix_b))

        #Write the produced files to command line argument filenames
        create_archive_of_files(taxon_a_zip, taxon_a_files)
        create_archive_of_files(taxon_b_zip, taxon_b_files)
    finally:
        #Remove unused files to free disk space, also when one of the steps above failed
        shutil.rmtree(run_dir)

    #Exit after a comforting log message
    log.info("Produced: \n%s\n%s", taxon_a_zip, taxon_b_zip)
    return taxon_a_zip, taxon_b_zip
예제 #6
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Runs codeml for the SICO files in the --sico-zip archive using the genome IDs of both taxa, then writes the
    dN/dS values to --dnds-stats and archives the codeml output at --codeml-zip. The temporary run directory is
    removed afterwards, even on failure.

    args -- command line arguments, handed to parse_options together with the usage text below
    """
    usage = """
Usage: run_codeml.py
--genomes-a=FILE     file with GenBank Project IDs from complete genomes table on each line for taxon A
--genomes-b=FILE     file with GenBank Project IDs from complete genomes table on each line for taxon B
--sico-zip=FILE      archive of aligned & trimmed single copy orthologous (SICO) genes
--codeml-zip=FILE     destination file path for archive of codeml output per SICO gene
--dnds-stats=FILE     destination file path for file with dN, dS & dN/dS values per SICO gene
"""
    options = ['genomes-a', 'genomes-b', 'sico-zip', 'codeml-zip', 'dnds-stats']
    genome_a_ids_file, genome_b_ids_file, sico_zip, codeml_zip, dnds_file = parse_options(usage, options, args)

    # Parse file to extract GenBank Project IDs: the first whitespace separated field of each line
    # Blank lines are skipped, as line.split()[0] would otherwise raise an IndexError for them
    with open(genome_a_ids_file) as read_handle:
        genome_ids_a = [line.split()[0] for line in read_handle if line.strip()]
    with open(genome_b_ids_file) as read_handle:
        genome_ids_b = [line.split()[0] for line in read_handle if line.strip()]

    # Create run_dir to hold files relating to this run
    run_dir = tempfile.mkdtemp(prefix='run_codeml_')
    try:
        # Extract files from zip archive
        sico_files = extract_archive_of_files(sico_zip, create_directory('sicos', inside_dir=run_dir))

        # Actually run codeml
        codeml_files = run_codeml_for_sicos(run_dir, genome_ids_a, genome_ids_b, sico_files)

        # Write dnds values to single output file
        _write_dnds_per_ortholog(dnds_file, codeml_files)

        # Write the produced files to command line argument filenames
        create_archive_of_files(codeml_zip, codeml_files)
    finally:
        # Remove unused files to free disk space, also when one of the steps above failed
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info("Produced: \n%s\n%s", codeml_zip, dnds_file)
예제 #7
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Aligns the orthologs from the --orthologs-zip archive, trims the alignments using the retained threshold and
    maximum indel length, and archives the aligned, misaligned and trimmed files. The temporary run directory is
    removed afterwards, even on failure.

    args -- command line arguments, handed to parse_options together with the usage text below

    Raises ValueError when --retained-threshold or --max-indel-length is not an integer.
    """
    usage = """
Usage: filter_orthologs.py
--orthologs-zip=FILE           archive of orthologous genes in FASTA format
--retained-threshold=PERC      filter orthologs that retain less than PERC % of sequence after trimming alignment
--max-indel-length=NUMBER      filter orthologs that contain insertions / deletions longer than N in middle of alignment
--aligned-zip=FILE             destination file path for archive of aligned orthologous genes
--misaligned-zip=FILE          destination file path for archive of misaligned orthologous genes
--trimmed-zip=FILE             destination file path for archive of aligned & trimmed orthologous genes
--stats=FILE                   destination file path for ortholog trimming statistics file
--scatterplot=FILE             destination file path for scatterplot of retained and filtered sequences by length
"""
    options = [
        'orthologs-zip', 'retained-threshold', 'max-indel-length',
        'aligned-zip', 'misaligned-zip', 'trimmed-zip', 'stats', 'scatterplot'
    ]
    orthologs_zip, retained_threshold, max_indel_length, \
    aligned_zip, misaligned_zip, trimmed_zip, target_stats_path, target_scatterplot = \
        parse_options(usage, options, args)

    #Convert numeric arguments to integers up front, so we can fail fast if argument value format was wrong
    retained_threshold = int(retained_threshold)
    max_indel_length = int(max_indel_length)

    #Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='align_trim_')
    try:
        #Extract files from zip archive
        temp_dir = create_directory('orthologs', inside_dir=run_dir)
        sico_files = extract_archive_of_files(orthologs_zip, temp_dir)

        #Align SICOs so all sequences become equal length sequences
        aligned_files = _align_sicos(run_dir, sico_files)

        #Filter orthologs that retain less than PERC % of sequence after trimming alignment
        trimmed_files, misaligned_files = _trim_alignments(run_dir, aligned_files,
                                                           retained_threshold,
                                                           max_indel_length,
                                                           target_stats_path,
                                                           target_scatterplot)

        #Create archives of files on command line specified output paths
        create_archive_of_files(aligned_zip, aligned_files)
        create_archive_of_files(misaligned_zip, misaligned_files)
        create_archive_of_files(trimmed_zip, trimmed_files)
    finally:
        #Remove unused files to free disk space, also when one of the steps above failed
        shutil.rmtree(run_dir)

    #Exit after a comforting log message
    log.info(
        'Produced: \n%s', '\n'.join((aligned_zip, misaligned_zip, trimmed_zip,
                                     target_stats_path, target_scatterplot)))
예제 #8
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Aligns the orthologs from the --orthologs-zip archive, trims the alignments using the retained threshold and
    maximum indel length, and archives the aligned, misaligned and trimmed files. The temporary run directory is
    removed afterwards, even on failure.

    args -- command line arguments, handed to parse_options together with the usage text below

    Raises ValueError when --retained-threshold or --max-indel-length is not an integer.
    """
    usage = """
Usage: filter_orthologs.py
--orthologs-zip=FILE           archive of orthologous genes in FASTA format
--retained-threshold=PERC      filter orthologs that retain less than PERC % of sequence after trimming alignment
--max-indel-length=NUMBER      filter orthologs that contain insertions / deletions longer than N in middle of alignment
--aligned-zip=FILE             destination file path for archive of aligned orthologous genes
--misaligned-zip=FILE          destination file path for archive of misaligned orthologous genes
--trimmed-zip=FILE             destination file path for archive of aligned & trimmed orthologous genes
--stats=FILE                   destination file path for ortholog trimming statistics file
--scatterplot=FILE             destination file path for scatterplot of retained and filtered sequences by length
"""
    options = ['orthologs-zip', 'retained-threshold', 'max-indel-length',
               'aligned-zip', 'misaligned-zip', 'trimmed-zip', 'stats', 'scatterplot']
    orthologs_zip, retained_threshold, max_indel_length, \
    aligned_zip, misaligned_zip, trimmed_zip, target_stats_path, target_scatterplot = \
        parse_options(usage, options, args)

    #Convert numeric arguments to integers up front, so we can fail fast if argument value format was wrong
    retained_threshold = int(retained_threshold)
    max_indel_length = int(max_indel_length)

    #Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='align_trim_')
    try:
        #Extract files from zip archive
        temp_dir = create_directory('orthologs', inside_dir=run_dir)
        sico_files = extract_archive_of_files(orthologs_zip, temp_dir)

        #Align SICOs so all sequences become equal length sequences
        aligned_files = _align_sicos(run_dir, sico_files)

        #Filter orthologs that retain less than PERC % of sequence after trimming alignment
        trimmed_files, misaligned_files = _trim_alignments(run_dir, aligned_files, retained_threshold,
                                                           max_indel_length, target_stats_path,
                                                           target_scatterplot)

        #Create archives of files on command line specified output paths
        create_archive_of_files(aligned_zip, aligned_files)
        create_archive_of_files(misaligned_zip, misaligned_files)
        create_archive_of_files(trimmed_zip, trimmed_files)
    finally:
        #Remove unused files to free disk space, also when one of the steps above failed
        shutil.rmtree(run_dir)

    #Exit after a comforting log message
    log.info('Produced: \n%s', '\n'.join((aligned_zip, misaligned_zip, trimmed_zip,
                                         target_stats_path, target_scatterplot)))
예제 #9
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Extracts orthologs for the listed genomes from the --dna-zip archive using the --groups file, then writes the
    SICO archive, the optional MUCO & subset archives, and the statistics, heatmap and ORFans files to their
    destinations. The temporary run directory is removed afterwards, even on failure.

    args -- command line arguments, handed to parse_options together with the usage text below
    """
    usage = """
Usage: extract_orthologs.py
--genomes=FILE       file with GenBank Project IDs from complete genomes table on each line
--dna-zip=FILE       zip archive of extracted DNA files
--groups=FILE        file listing groups of orthologous proteins
--require-limiter    flag whether extracted core set of genomes should contain the limiter added in OrthoMCL [OPTIONAL]

--sico-zip=FILE      destination file path for archive of shared single copy orthologous (SICO) genes
--muco-zip=FILE      destination file path for archive of shared multiple copy orthologous genes
--subset-zip=FILE    destination file path for archive of variable copy orthologous genes shared for a subset only
--stats=FILE         destination file path for ortholog statistics file
--heatmap=FILE       destination file path heatmap of orthologs and occurrences of ortholog per genome
--orfans=FILE        destination file path ORFans
"""
    options = ['genomes', 'dna-zip', 'groups', 'require-limiter?',
               'sico-zip', 'muco-zip=?', 'subset-zip=?', 'stats', 'heatmap', 'orfans']
    genome_ids_file, dna_zip, groups_file, require_limiter, \
    target_sico, target_muco, target_subset, target_stats_path, target_heat, target_orfans = \
    parse_options(usage, options, args)

    #Parse file to extract GenBank Project IDs: the first whitespace separated field of each line
    #Blank lines are skipped, as line.split()[0] would otherwise raise an IndexError for them
    with open(genome_ids_file) as read_handle:
        genomes = [line.split()[0] for line in read_handle if line.strip() and not line.startswith('#')]

    #Create temporary directory within which to extract orthologs
    run_dir = tempfile.mkdtemp(prefix='extract_orthologs_run_')
    try:
        #Extract files from zip archive
        temp_dir = create_directory('dna_files', inside_dir=run_dir)
        dna_files = extract_archive_of_files(dna_zip, temp_dir)

        #Actually run ortholog extraction
        sico_files, muco_files, subset_files, stats_file, heatmap_file, orfans_file = \
            extract_orthologs(run_dir, genomes, dna_files, groups_file, require_limiter)

        #Append the orfans to the heatmap file
        _append_orfans_to_heatmap(orfans_file, genomes, heatmap_file)

        #Move produced files to command line specified output paths
        create_archive_of_files(target_sico, sico_files)
        if target_muco:
            create_archive_of_files(target_muco, muco_files)
        if target_subset:
            create_archive_of_files(target_subset, subset_files)
        shutil.move(stats_file, target_stats_path)
        shutil.move(heatmap_file, target_heat)
        shutil.move(orfans_file, target_orfans)
    finally:
        #Remove unused files to free disk space, also when one of the steps above failed
        shutil.rmtree(run_dir)

    #Exit after a comforting log message that lists every file that was produced
    log.info("Produced:")
    log.info("%s", target_sico)
    if target_muco:
        log.info("%s", target_muco)
    if target_subset:
        log.info("%s", target_subset)
    log.info("%s", target_stats_path)
    log.info("%s", target_heat)
    #The orfans file is written above as well, so report it alongside the other outputs
    log.info("%s", target_orfans)
예제 #10
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Extracts the orthologs archive, writes coding regions per genome and a super concatemer, builds a tree from
    that concatemer, and writes the genome IDs found for taxon A & B along with a tree visualization. All
    intermediate files live in a temporary run directory that is removed afterwards, even on failure.

    args -- command line arguments, handed to parse_options together with the usage text below
    """
    usage = """
Usage: concatenate_orthologs.py
--orthologs-zip=FILE     archive of orthologous genes in FASTA format
--coding-regions=FILE    destination file path archive of trimmed orthologous coding regions per genomes
--concatemer=FILE        destination file path for super-concatemer of all genomes
--taxon-a=FILE           destination file path for genome IDs for taxon A
--taxon-b=FILE           destination file path for genome IDs for taxon B
--tree=FILE              destination file path for tree visualization
"""
    options = [
        'orthologs-zip', 'coding-regions', 'concatemer', 'taxon-a', 'taxon-b',
        'tree'
    ]
    orthologs_zip, target_coding_regions, target_concat_file, target_taxon_a, target_taxon_b, target_tree = \
        parse_options(usage, options, args)

    #Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='concatemer_tree_')
    try:
        #Extract files from zip archive
        temp_dir = create_directory('orthologs', inside_dir=run_dir)
        ortholog_files = extract_archive_of_files(orthologs_zip, temp_dir)

        #Separate out orthologs per genome to create trimmed coding region files per genome
        genome_coding_regions_files = coding_regions_per_genome(
            run_dir, ortholog_files)
        create_archive_of_files(target_coding_regions,
                                genome_coding_regions_files)

        #Concatenate coding region files per genome
        concatemer_files = concatemer_per_genome(run_dir,
                                                 genome_coding_regions_files)
        #Create super concatemer
        create_super_concatemer(concatemer_files, target_concat_file)

        #Determine the taxa present in the super concatemer tree by building a phylogenetic tree from genome
        #concatemer and reading genome ids in the two largest clades.
        super_distance_file = _run_dna_dist(run_dir, target_concat_file)
        super_tree_file = _run_neighbor(run_dir, super_distance_file)
        genome_ids_a, genome_ids_b = _read_taxa_from_tree(super_tree_file)

        #Map Project IDs to Organism names; items() rather than iteritems() works on Python 2 and 3 alike
        selected_genomes = select_genomes_by_ids(genome_ids_a + genome_ids_b)
        id_to_name_map = dict(
            (gid, genome['Organism/Name'])
            for gid, genome in selected_genomes.items())

        #Write Project IDs and Organism Names to files, with a fallback to genome_id for external genome
        taxon_outputs = (
            (target_taxon_a, genome_ids_a),
            (target_taxon_b, genome_ids_b),
        )
        for target_file, genome_ids in taxon_outputs:
            with open(target_file, mode='w') as write_handle:
                for genome_id in genome_ids:
                    name = id_to_name_map.get(genome_id, genome_id)
                    write_handle.write('{id}\t{name}\n'.format(id=genome_id,
                                                               name=name))

        #Visualize tree
        visualize_tree(super_tree_file, id_to_name_map, target_tree)
    finally:
        #Remove unused files to free disk space, also when one of the steps above failed
        shutil.rmtree(run_dir)

    #Exit after a comforting log message
    log.info('Produced: \n%s\n%s\n%s\n%s\n%s', target_coding_regions,
             target_concat_file, target_taxon_a, target_taxon_b, target_tree)
예제 #11
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Translates the genomes listed in the --genomes file, optionally adds the genomes from the --external-zip
    archive, and writes archives of the resulting DNA and protein files.

    args -- command line arguments, handed to parse_options together with the usage text below
    """
    usage = """
Usage: translate.py
--genomes=FILE         file with selected genome IDs followed by Organism Name on each line
--external-zip=FILE    optional archive of user provided external genomes containing formatted nucleotide fasta files
--dna-zip=FILE         destination file path for zip archive of extracted DNA files
--protein-zip=FILE     destination file path for zip archive of translated protein files
"""
    options = ['genomes', 'external-zip=?', 'dna-zip', 'protein-zip']
    genome_ids_file, external_zip, dna_zipfile, protein_zipfile = parse_options(
        usage, options, args)

    dna_files = []
    protein_files = []

    #Read GenBank Project IDs from genomes_file, each on their own line
    #Blank lines are skipped, as line.split()[0] would otherwise raise an IndexError for them
    with open(genome_ids_file) as read_handle:
        genome_ids = [
            line.split()[0] for line in read_handle
            if line.strip() and not line.startswith('#')
            and 'external genome' not in line
        ]

    #The genome IDs file is closed by now: there is no need to keep it open while translating
    if genome_ids:
        #Retrieve associated genome dictionaries from complete genomes table
        genomes = select_genomes_by_ids(genome_ids).values()
        genomes = sorted(genomes, key=itemgetter('Organism/Name'))

        #Actually translate the genomes to produce a set of files for both dna files & protein files
        dna_files, protein_files = translate_genomes(genomes)

    external_dir = None
    try:
        #Also translate the external genomes
        if external_zip:
            #Extract external genomes archive
            external_dir = tempfile.mkdtemp(prefix='external_genomes_')
            external_dna_files = extract_archive_of_files(external_zip,
                                                          external_dir)

            #Append IDs of external fasta files to genome IDs file
            _append_external_genomes(external_dna_files, genome_ids_file)

            #Translate individual files
            external_protein_files = [
                translate_fasta_coding_regions(dna_file)
                for dna_file in external_dna_files
            ]

            #Add the files to the appropriate collections
            dna_files.extend(external_dna_files)
            protein_files.extend(external_protein_files)

        #Write the produced files to command line argument filenames
        create_archive_of_files(dna_zipfile, dna_files)
        create_archive_of_files(protein_zipfile, protein_files)
    finally:
        #Do not clean up extracted DNA files or Protein translations: Keep them as cache

        #But do clean up external_dir once we are done with it, also when a step above failed
        if external_dir:
            shutil.rmtree(external_dir)

    #Exit after a comforting log message
    log.info("Produced: \n%s &\n%s", dna_zipfile, protein_zipfile)
예제 #12
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Extracts the orthologs archive, optionally filters orthologs with multiple COG annotations and orthologs
    showing recombination, archives the retained orthologs, and runs the post filtering steps. The temporary run
    directory is removed afterwards, even on failure.

    args -- command line arguments, handed to parse_options together with the usage text below
    """
    usage = """
Usage: filter_orthologs.py
--orthologs-zip=FILE            archive of orthologous genes in FASTA format
--filter-multiple-cogs          filter orthologs with multiple COG annotations among genes [OPTIONAL]

--filter-recombination=FILE     filter orthologs that show recombination when comparing phylogenetic trees [OPTIONAL]
                                destination file path for archive of recombination orthologs
--recombined-crosstable=FILE    destination file path for recombined crosstable of GeneIDs, COGs and Products [OPTIONAL]
--taxon-a=FILE                  file with genome IDs for taxon A to use in recombination filtering
--taxon-b=FILE                  file with genome IDs for taxon B to use in recombination filtering
--retained-zip=FILE             destination file path for archive of retained orthologs after filtering

--orthologs-per-genome=FILE      destination file path for orthologs split out per genome, based on the retained.zip
--concatemer=FILE                destination file path for super-concatemer of all genomes
"""
    options = ('orthologs-zip', 'filter-multiple-cogs=?', 'filter-recombination=?', 'recombined-crosstable=?',
               'taxon-a=?', 'taxon-b=?', 'retained-zip', 'orthologs-per-genome', 'concatemer')
    orthologs_zip, filter_cogs, filter_recombination, recombined_crosstable, \
    taxona, taxonb, retained_zip, target_orth_per_genome, target_concat_file = parse_options(usage, options, args)

    #Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='filter_orthologs_')
    try:
        #Extract files from zip archive
        temp_dir = create_directory('orthologs', inside_dir=run_dir)
        ortholog_files = extract_archive_of_files(orthologs_zip, temp_dir)

        #Filter orthologs with multiple COG annotations among genes if flag was set
        if filter_cogs:
            ortholog_files, transfered_cogs = _filter_multiple_cog_orthologs(run_dir, ortholog_files)

        #Possible extension: filter ortholog when any strain has been flagged as 'mobile element', 'phage' or 'IS element'

        #Filter orthologs that show recombination when comparing phylogenetic trees if flag was set
        if filter_recombination:
            #Parse file to extract GenBank Project IDs: the first whitespace separated field of each line
            #Blank lines are skipped, as line.split()[0] would otherwise raise an IndexError for them
            with open(taxona) as read_handle:
                genome_ids_a = [line.split()[0] for line in read_handle if line.strip()]
            with open(taxonb) as read_handle:
                genome_ids_b = [line.split()[0] for line in read_handle if line.strip()]
            ortholog_files, recombined_files = _phipack_for_all_orthologs(run_dir, ortholog_files,
                                                                           genome_ids_a, genome_ids_b)
            #Create crosstable
            #NOTE(review): --recombined-crosstable is optional, yet its value is passed on unchecked here;
            #confirm create_crosstable copes with a missing destination
            create_crosstable(recombined_files, recombined_crosstable)

        #Create archives of files on command line specified output paths
        if filter_cogs:
            shutil.move(transfered_cogs, filter_cogs)
        if filter_recombination:
            create_archive_of_files(filter_recombination, recombined_files)
        create_archive_of_files(retained_zip, ortholog_files)

        #Run the steps required after filtering orthologs
        post_recombination_filter(taxona, taxonb, retained_zip,
                                  target_orth_per_genome, target_concat_file, run_dir)
    finally:
        #Remove unused files to free disk space, also when one of the steps above failed
        shutil.rmtree(run_dir)

    #Exit after a comforting log message
    log.info('Produced:')
    if filter_cogs:
        log.info(filter_cogs)
    if filter_recombination:
        log.info(filter_recombination)
    log.info(retained_zip)
    log.info(target_orth_per_genome)
    log.info(target_concat_file)