Exemplo n.º 1
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Extracts the orthologs archive into a temporary run directory, hands the extracted files to
    _phipack_for_all_orthologs together with the stats file path, and logs the produced stats file.
    The temporary run directory is removed even when one of the steps raises.

    args -- command line arguments, parsed by parse_options according to the usage text below
    """
    usage = """
Usage: run_phipack.py
--orthologs-zip=FILE     archive of orthologous genes in FASTA format
--stats-file=FILE        destination file path for values found through PhiPack for each ortholog
"""
    options = ('orthologs-zip', 'stats-file')
    orthologs_zip, stats_file = parse_options(usage, options, args)

    #Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='run_phipack_')
    try:
        #Extract files from zip archive
        extraction_dir = create_directory('extracted_orthologs', inside_dir=run_dir)
        ortholog_files = extract_archive_of_files(orthologs_zip, extraction_dir)

        #Find recombination in all ortholog_files
        _phipack_for_all_orthologs(run_dir, ortholog_files, stats_file)
    finally:
        #Always remove the run directory to free disk space, also when a step above failed
        shutil.rmtree(run_dir)

    #Exit after a comforting log message
    log.info('Produced:\n%s', stats_file)
Exemplo n.º 2
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Reads the genome ID files for taxon A & B, prepends headers to both output tables, runs calculate_tables on
    the extracted SICO files and appends the two temporary tables it returns to the requested output tables.
    All file handles are closed explicitly, and the temporary directory & tables are removed even on failure.

    args -- command line arguments, parsed by parse_options according to the usage text below
    """
    usage = """
Usage: calculations.py
--genomes-a=FILE     file with GenBank Project IDs from complete genomes table on each line for taxon A
--genomes-b=FILE     file with GenBank Project IDs from complete genomes table on each line for taxon B
--sico-zip=FILE      archive of aligned & trimmed single copy orthologous (SICO) genes
--table-a=FILE       destination file path for summary statistics table based on orthologs in taxon A
--table-b=FILE       destination file path for summary statistics table based on orthologs in taxon B
--append-odd-even    append separate tables calculated for odd and even codons of ortholog alignments [OPTIONAL]
"""
    options = [
        'genomes-a', 'genomes-b', 'sico-zip', 'table-a', 'table-b',
        'append-odd-even?'
    ]
    genome_a_ids_file, genome_b_ids_file, sico_zip, table_a, table_b, oddeven = parse_options(
        usage, options, args)

    #Parse ID files: first whitespace-separated field is the ID, second tab-separated column feeds the common prefix
    with open(genome_a_ids_file) as read_handle:
        lines = [line.strip() for line in read_handle]
        genome_ids_a = [line.split()[0] for line in lines]
        common_prefix_a = os.path.commonprefix(
            [line.split('\t')[1] for line in lines]).strip()
    with open(genome_b_ids_file) as read_handle:
        lines = [line.strip() for line in read_handle]
        genome_ids_b = [line.split()[0] for line in lines]
        common_prefix_b = os.path.commonprefix(
            [line.split('\t')[1] for line in lines]).strip()

    #Prepend headers to each of the output tables
    _prepend_table_header(table_a, genome_ids_a, common_prefix_a, genome_ids_b,
                          common_prefix_b, oddeven)
    _prepend_table_header(table_b, genome_ids_b, common_prefix_b, genome_ids_a,
                          common_prefix_a, oddeven)

    #Create run_dir to hold files relating to this run
    run_dir = tempfile.mkdtemp(prefix='calculations_')
    try:
        #Extract files from zip archive
        sico_files = extract_archive_of_files(
            sico_zip, create_directory('sicos', inside_dir=run_dir))

        #Actually do calculations
        tmp_table_tuple = calculate_tables(genome_ids_a, genome_ids_b, sico_files,
                                           oddeven)
        tmp_table_a, tmp_table_b = tmp_table_tuple[0], tmp_table_tuple[1]

        try:
            #Append the produced tables below the headers, closing both the source and the target handles
            for tmp_table, target_table in ((tmp_table_a, table_a), (tmp_table_b, table_b)):
                with open(target_table, mode='ab') as append_handle:
                    with open(tmp_table, mode='rb') as tmp_handle:
                        shutil.copyfileobj(tmp_handle, append_handle)
        finally:
            #Temporary tables are no longer needed, whether or not copying succeeded
            os.remove(tmp_table_a)
            os.remove(tmp_table_b)
    finally:
        #Remove now unused files to free disk space
        shutil.rmtree(run_dir)

    #Exit after a comforting log message
    log.info("Produced: \n%s\n%s", table_a, table_b)
Exemplo n.º 3
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Reads the four taxon files into ID -> name dictionaries and verifies that the unfiltered and filtered taxa
    contain the same sets of IDs. Calls fail with the four dictionaries when they differ; logs success otherwise.

    args -- command line arguments, parsed by parse_options according to the usage text below
    """
    usage = """
Usage: compare_taxa.py
--unfiltered-taxon-a=FILE    genome IDs for taxon A as deduced from phylogenetic tree of unfiltered concatemers
--unfiltered-taxon-b=FILE    genome IDs for taxon B as deduced from phylogenetic tree of unfiltered concatemers
--filtered-taxon-a=FILE      genome IDs for taxon A as deduced from phylogenetic tree of filtered concatemers
--filtered-taxon-b=FILE      genome IDs for taxon B as deduced from phylogenetic tree of filtered concatemers
"""
    options = ['unfiltered-taxon-a', 'unfiltered-taxon-b', 'filtered-taxon-a', 'filtered-taxon-b']
    unfiltered_a_file, unfiltered_b_file, filtered_a_file, filtered_b_file = parse_options(usage, options, args)

    def _read_id_to_name(path):
        """Return dict mapping the first tab-separated column to the second for every line of the file at path."""
        with open(path) as read_handle:
            return dict((line.split('\t')[0], line.strip().split('\t')[1]) for line in read_handle)

    #Parse ID files to extract GenBank Project IDs & Organism Name
    unfiltered_a = _read_id_to_name(unfiltered_a_file)
    unfiltered_b = _read_id_to_name(unfiltered_b_file)
    filtered_a = _read_id_to_name(filtered_a_file)
    filtered_b = _read_id_to_name(filtered_b_file)

    unfiltered_a_ids, unfiltered_b_ids = set(unfiltered_a), set(unfiltered_b)
    filtered_a_ids, filtered_b_ids = set(filtered_a), set(filtered_b)

    #Pick the first ID of unfiltered taxon A to decide which filtered taxon it should be compared with.
    #next(iter()) works on Python 2 & 3 alike, and yields None rather than raising for an empty taxon file,
    #in which case the swapped comparison below decides whether to fail.
    first_unfiltered_a_id = next(iter(unfiltered_a), None)

    if first_unfiltered_a_id in filtered_a:
        #A & B line up between unfiltered and filtered: compare them directly
        if not (unfiltered_a_ids == filtered_a_ids and unfiltered_b_ids == filtered_b_ids):
            fail(unfiltered_a, unfiltered_b, filtered_a, filtered_b)
    else:
        #Otherwise compare unfiltered A with filtered B and vice versa
        if not (unfiltered_a_ids == filtered_b_ids and unfiltered_b_ids == filtered_a_ids):
            fail(unfiltered_a, unfiltered_b, filtered_b, filtered_a)

    #Else: no problems were found
    log.info('Succes: Unfiltered & filtered tree clustering did not result in different taxa.')
Exemplo n.º 4
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Splits the --external-genomes value into label:file pairs, passes each pair to format_fasta_genome_headers,
    archives the formatted files to --external-zip and removes the formatted files afterwards.

    args -- command line arguments, parsed by parse_options according to the usage text below
    """
    usage = """
Usage: select_taxa.py
--external-genomes=    comma-separated list of label:nucleotide fasta file pairs of externally supplied genomes.
    label:FILE,...     labels should be unique as genomes will be identified by this label in further output files
--external-zip=FILE    destination path for archive of user provided external genomes containing formatted nucleotide fasta files
"""
    options = ['external-genomes', 'external-zip']
    external_genomes, external_zip = parse_options(usage, options, args)

    #External genomes are nucleotide fasta files uploaded by the user of which we will reformat the header
    external_fasta_files = {}

    #Handle externally uploaded genomes
    #Sample line: label1:file1,label2:file2, #Note the trailing , that's a Galaxy artifact we'll ignore
    for label_file in external_genomes.split(','):
        if not label_file:
            continue

        #Split on the first colon only, so file paths that themselves contain a colon remain intact
        label, filename = label_file.split(':', 1)
        if not label:
            log.error('Empty label provided for upload genome %s. Please provide a label and try again.', filename)
            break
        log.info('Formatting external genome labeled %s at %s', label, filename)
        formatted_file = format_fasta_genome_headers(label, filename)
        external_fasta_files[label] = formatted_file

    #Copy formatted external genome files to archive that will be output as well
    create_archive_of_files(external_zip, external_fasta_files.values())

    #Remove temporary formatted files
    for formatted_file in external_fasta_files.values():
        os.remove(formatted_file)

    #Exit after a comforting log message
    log.info("Produced: \n%s", external_zip)
Exemplo n.º 5
0
def main(args):
    """Entry point for command line or pipeline use of concatenate_orthologs.

    Builds per genome coding region files and concatemers from the extracted orthologs, creates the super
    concatemer, derives two taxa from the tree built on it, writes both taxon files and visualizes the tree.

    args -- command line arguments, parsed by parse_options according to the usage text below
    """
    usage = """
Usage: concatenate_orthologs.py
--orthologs-zip=FILE     archive of orthologous genes in FASTA format
--coding-regions=FILE    destination file path archive of trimmed orthologous coding regions per genomes
--concatemer=FILE        destination file path for super-concatemer of all genomes
--taxon-a=FILE           destination file path for genome IDs for taxon A
--taxon-b=FILE           destination file path for genome IDs for taxon B
--tree=FILE              destination file path for tree visualization
"""
    options = ['orthologs-zip', 'coding-regions', 'concatemer', 'taxon-a', 'taxon-b', 'tree']
    (orthologs_zip, target_coding_regions, target_concat_file,
     target_taxon_a, target_taxon_b, target_tree) = parse_options(usage, options, args)

    #Keep all intermediate files in a run specific temporary folder, so simultaneous runs do not interfere
    run_dir = tempfile.mkdtemp(prefix='concatemer_tree_')

    #Unpack the orthologs archive into its own sub directory
    orthologs_dir = create_directory('orthologs', inside_dir=run_dir)
    ortholog_files = extract_archive_of_files(orthologs_zip, orthologs_dir)

    #Create coding region files per genome, and archive those as first output
    coding_region_files = coding_regions_per_genome(run_dir, ortholog_files)
    create_archive_of_files(target_coding_regions, coding_region_files)

    #Concatemer per genome first, then the super concatemer out of those
    create_super_concatemer(concatemer_per_genome(run_dir, coding_region_files), target_concat_file)

    #Distance matrix -> neighbor tree -> genome IDs of both taxa read from that tree
    distance_file = _run_dna_dist(run_dir, target_concat_file)
    super_tree_file = _run_neighbor(run_dir, distance_file)
    genome_ids_a, genome_ids_b = _read_taxa_from_tree(super_tree_file)

    #Look up Organism names for the Project IDs of both taxa
    selected_genomes = select_genomes_by_ids(genome_ids_a + genome_ids_b)
    id_to_name_map = dict((gid, genome['Organism/Name']) for gid, genome in selected_genomes.iteritems())

    #One ID & name line per genome in each taxon file; IDs without a known name fall back to the ID itself
    for taxon_path, taxon_ids in ((target_taxon_a, genome_ids_a), (target_taxon_b, genome_ids_b)):
        with open(taxon_path, mode='w') as write_handle:
            for genome_id in taxon_ids:
                organism_name = id_to_name_map.get(genome_id, genome_id)
                write_handle.write('{id}\t{name}\n'.format(id=genome_id, name=organism_name))

    #Draw the tree to the requested output path
    visualize_tree(super_tree_file, id_to_name_map, target_tree)

    #Intermediate files are no longer needed
    shutil.rmtree(run_dir)

    #Report all produced files
    log.info('Produced: \n%s\n%s\n%s\n%s\n%s', target_coding_regions, target_concat_file,
             target_taxon_a, target_taxon_b, target_tree)
Exemplo n.º 6
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Collects genome IDs from --genomes and/or --previous-file, exits with status 1 when more than the maximum
    number of genomes was selected, calls download_genome_files for every selected genome and finally verifies
    that at least two non-skipped genomes are listed in the genomes file whenever some genomes were skipped.

    args -- command line arguments, parsed by parse_options according to the usage text below

    Raises AssertionError when genomes were skipped and fewer than two genomes remain.
    """
    usage = """
Usage: select_taxa.py
--genomes=ID,...           optional comma-separated list of selected GenBank Project IDs from complete genomes table
--previous-file=FILE       optional previously or externally created GenBank Project IDs file whose genomes should be reselected
--require-protein-table    require protein table files to be present for all downloaded genomes
--genomes-file=FILE        destination path for file with selected genome IDs followed by Organism Name on each line
"""
    options = ['genomes=?', 'previous-file=?', 'require-protein-table?', 'genomes-file']
    genomes_line, previous_file, require_ptt, genomes_file = parse_options(usage, options, args)

    #Genome IDs selected by the user that refer to GenBank or RefSeq entries
    genome_ids = []

    #Split ids on comma
    if genomes_line:
        genome_ids.extend(val for val in genomes_line.split(',') if val)

    #Allow for input of previous or externally created genomes-file to rerun an analysis
    if previous_file:
        #Read previous GenBank Project IDs from previous_file, each on their own line
        with open(previous_file) as read_handle:
            genome_ids.extend(line.split()[0] for line in read_handle
                              #But skip external genomes as their IDs will fail to download
                              if 'external genome' not in line)

    #Assert each clade contains enough IDs; only the upper bound is enforced here
    maximum = 100
    #TODO Move this test to translate, where we can see how many translations succeeded + how many externals there are
    if maximum < len(genome_ids):
        log.error('Expected between two and {0} selected genomes, but was {1}'.format(maximum, len(genome_ids)))
        sys.exit(1)

    #Retrieve genome dictionaries to get to Organism Name
    genomes = select_genomes_by_ids(genome_ids).values()
    genomes = sorted(genomes, key=itemgetter('Organism/Name'))

    #Semi-touch genomes file in case no genomes were selected, for instance when uploading external genomes
    open(genomes_file, mode='a').close()

    #download_genome_files is handed genomes_file for each genome; presumably it records the ID & organism name there
    for genome in genomes:
        #Download files here, but ignore returned files: These can be retrieved from cache during extraction/translation
        download_genome_files(genome, genomes_file, require_ptt=require_ptt)

    #Post check to see if more than one genome actually had some genomic contents
    with open(genomes_file) as read_handle:
        listed_ids = [line.split()[0] for line in read_handle]

    #Lines starting with # mark skipped genomes
    skipped_count = sum(1 for gid in listed_ids if gid.startswith('#'))
    retained_count = len(listed_ids) - skipped_count

    #If some genomes were skipped, ensure at least two genomes remain.
    #Raised explicitly rather than via an assert statement, so the check is not stripped when running with -O
    if skipped_count and retained_count < 2:
        raise AssertionError("Some genomes were skipped, leaving us with less than two genomes to operate on; "
                             "Inspect messages in Project ID list and reevaluate genome selection")

    #Exit after a comforting log message
    log.info("Produced: \n%s", genomes_file)
Exemplo n.º 7
0
def main(args):
    """Entry point for command line or pipeline use of translate.

    Translates the genomes listed in the genome IDs file, optionally adds the translated external genomes
    from --external-zip, and archives all DNA files and all protein files to their respective destinations.

    args -- command line arguments, parsed by parse_options according to the usage text below
    """
    usage = """
Usage: translate.py
--genomes=FILE         file with selected genome IDs followed by Organism Name on each line
--external-zip=FILE    optional archive of user provided external genomes containing formatted nucleotide fasta files
--dna-zip=FILE         destination file path for zip archive of extracted DNA files
--protein-zip=FILE     destination file path for zip archive of translated protein files
"""
    options = ['genomes', 'external-zip=?', 'dna-zip', 'protein-zip']
    genome_ids_file, external_zip, dna_zipfile, protein_zipfile = parse_options(usage, options, args)

    #Start out empty: there might be no genome IDs to translate at all
    dna_files, protein_files = [], []

    with open(genome_ids_file) as read_handle:
        #First column of each line is the ID; leave out commented lines & external genomes
        genome_ids = []
        for line in read_handle:
            if line.startswith('#') or 'external genome' in line:
                continue
            genome_ids.append(line.split()[0])

        if genome_ids:
            #Look up the genome dictionaries, and order them by organism name
            selected = select_genomes_by_ids(genome_ids).values()
            genomes = sorted(selected, key=itemgetter('Organism/Name'))

            #Translation gives us both the DNA files & the protein files
            dna_files, protein_files = translate_genomes(genomes)

    if external_zip:
        #Unpack the external genomes into their own temporary directory
        external_dir = tempfile.mkdtemp(prefix='external_genomes_')
        external_dna_files = extract_archive_of_files(external_zip, external_dir)

        #Record the external fasta files in the genome IDs file as well
        _append_external_genomes(external_dna_files, genome_ids_file)

        #Translate each external file on its own
        external_protein_files = []
        for dna_file in external_dna_files:
            external_protein_files.append(translate_fasta_coding_regions(dna_file))

        #External files are archived alongside the others
        dna_files.extend(external_dna_files)
        protein_files.extend(external_protein_files)

    #Archive DNA & protein files to the requested output paths
    create_archive_of_files(dna_zipfile, dna_files)
    create_archive_of_files(protein_zipfile, protein_files)

    #Extracted DNA files & protein translations are deliberately kept as cache;
    #only the external directory is removed now that the archives exist
    if external_zip:
        shutil.rmtree(external_dir)

    #Report the produced archives
    log.info("Produced: \n%s &\n%s", dna_zipfile, protein_zipfile)
Exemplo n.º 8
0
def main(args):
    """Entry point for command line or pipeline use of filter_orthologs.

    Aligns the extracted orthologs, trims the alignments using the given thresholds, and archives the aligned,
    misaligned and trimmed files to their respective destinations.

    args -- command line arguments, parsed by parse_options according to the usage text below
    """
    usage = """
Usage: filter_orthologs.py
--orthologs-zip=FILE           archive of orthologous genes in FASTA format
--retained-threshold=PERC      filter orthologs that retain less than PERC % of sequence after trimming alignment
--max-indel-length=NUMBER      filter orthologs that contain insertions / deletions longer than N in middle of alignment
--aligned-zip=FILE             destination file path for archive of aligned orthologous genes
--misaligned-zip=FILE          destination file path for archive of misaligned orthologous genes
--trimmed-zip=FILE             destination file path for archive of aligned & trimmed orthologous genes
--stats=FILE                   destination file path for ortholog trimming statistics file
--scatterplot=FILE             destination file path for scatterplot of retained and filtered sequences by length
"""
    options = ['orthologs-zip', 'retained-threshold', 'max-indel-length',
               'aligned-zip', 'misaligned-zip', 'trimmed-zip', 'stats', 'scatterplot']
    (orthologs_zip, retained_threshold, max_indel_length, aligned_zip, misaligned_zip, trimmed_zip,
     target_stats_path, target_scatterplot) = parse_options(usage, options, args)

    #Fail fast on malformed numeric arguments, before any real work is done
    retained_threshold, max_indel_length = int(retained_threshold), int(max_indel_length)

    #A run specific temporary folder prevents interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='align_trim_')

    #Unpack the orthologs archive into its own sub directory
    sico_files = extract_archive_of_files(orthologs_zip, create_directory('orthologs', inside_dir=run_dir))

    #Alignment makes all sequences equal length
    aligned_files = _align_sicos(run_dir, sico_files)

    #Trimming separates retained alignments from misaligned ones, and is handed the stats & scatterplot paths
    trimmed_files, misaligned_files = _trim_alignments(
        run_dir, aligned_files, retained_threshold, max_indel_length, target_stats_path, target_scatterplot)

    #Archive each collection of files on its command line specified output path
    for target_zip, files in ((aligned_zip, aligned_files),
                              (misaligned_zip, misaligned_files),
                              (trimmed_zip, trimmed_files)):
        create_archive_of_files(target_zip, files)

    #Intermediate files are no longer needed
    shutil.rmtree(run_dir)

    #Report all produced files, one per line
    produced = '\n'.join((aligned_zip, misaligned_zip, trimmed_zip, target_stats_path, target_scatterplot))
    log.info('Produced: \n%s', produced)
Exemplo n.º 9
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Reads the genome ID files for taxon A & B, prepends headers to both output tables, runs calculate_tables on
    the extracted SICO files and appends the two temporary tables it returns to the requested output tables.
    All file handles are closed explicitly, and the temporary directory & tables are removed even on failure.

    args -- command line arguments, parsed by parse_options according to the usage text below
    """
    usage = """
Usage: calculations.py
--genomes-a=FILE     file with GenBank Project IDs from complete genomes table on each line for taxon A
--genomes-b=FILE     file with GenBank Project IDs from complete genomes table on each line for taxon B
--sico-zip=FILE      archive of aligned & trimmed single copy orthologous (SICO) genes
--table-a=FILE       destination file path for summary statistics table based on orthologs in taxon A
--table-b=FILE       destination file path for summary statistics table based on orthologs in taxon B
--append-odd-even    append separate tables calculated for odd and even codons of ortholog alignments [OPTIONAL]
"""
    options = ['genomes-a', 'genomes-b', 'sico-zip', 'table-a', 'table-b', 'append-odd-even?']
    genome_a_ids_file, genome_b_ids_file, sico_zip, table_a, table_b, oddeven = parse_options(usage, options, args)

    def _read_ids_and_common_prefix(ids_file):
        """Return list of first fields, and stripped common prefix of second tab-separated columns, in ids_file."""
        with open(ids_file) as read_handle:
            lines = [line.strip() for line in read_handle]
        genome_ids = [line.split()[0] for line in lines]
        common_prefix = os.path.commonprefix([line.split('\t')[1] for line in lines]).strip()
        return genome_ids, common_prefix

    #Parse files containing GenBank Project IDs to extract GenBank Project IDs
    genome_ids_a, common_prefix_a = _read_ids_and_common_prefix(genome_a_ids_file)
    genome_ids_b, common_prefix_b = _read_ids_and_common_prefix(genome_b_ids_file)

    #Prepend headers to each of the output tables
    _prepend_table_header(table_a, genome_ids_a, common_prefix_a, genome_ids_b, common_prefix_b, oddeven)
    _prepend_table_header(table_b, genome_ids_b, common_prefix_b, genome_ids_a, common_prefix_a, oddeven)

    #Create run_dir to hold files relating to this run
    run_dir = tempfile.mkdtemp(prefix='calculations_')
    try:
        #Extract files from zip archive
        sico_files = extract_archive_of_files(sico_zip, create_directory('sicos', inside_dir=run_dir))

        #Actually do calculations
        tmp_table_tuple = calculate_tables(genome_ids_a, genome_ids_b, sico_files, oddeven)
        tmp_table_a, tmp_table_b = tmp_table_tuple[0], tmp_table_tuple[1]

        try:
            #Append the produced tables below the headers, closing both the source and the target handles
            for tmp_table, target_table in ((tmp_table_a, table_a), (tmp_table_b, table_b)):
                with open(target_table, mode='ab') as append_handle:
                    with open(tmp_table, mode='rb') as tmp_handle:
                        shutil.copyfileobj(tmp_handle, append_handle)
        finally:
            #Temporary tables are no longer needed, whether or not copying succeeded
            os.remove(tmp_table_a)
            os.remove(tmp_table_b)
    finally:
        #Remove now unused files to free disk space
        shutil.rmtree(run_dir)

    #Exit after a comforting log message
    log.info("Produced: \n%s\n%s", table_a, table_b)
Exemplo n.º 10
0
def main(args):
    """Entry point for command line or pipeline use of filter_orthologs.

    Extracts the orthologs archive, aligns the files, trims the alignments with the given thresholds and
    writes archives of the aligned, misaligned and trimmed files.

    args -- command line arguments, parsed by parse_options according to the usage text below
    """
    usage = """
Usage: filter_orthologs.py
--orthologs-zip=FILE           archive of orthologous genes in FASTA format
--retained-threshold=PERC      filter orthologs that retain less than PERC % of sequence after trimming alignment
--max-indel-length=NUMBER      filter orthologs that contain insertions / deletions longer than N in middle of alignment
--aligned-zip=FILE             destination file path for archive of aligned orthologous genes
--misaligned-zip=FILE          destination file path for archive of misaligned orthologous genes
--trimmed-zip=FILE             destination file path for archive of aligned & trimmed orthologous genes
--stats=FILE                   destination file path for ortholog trimming statistics file
--scatterplot=FILE             destination file path for scatterplot of retained and filtered sequences by length
"""
    options = [
        'orthologs-zip',
        'retained-threshold',
        'max-indel-length',
        'aligned-zip',
        'misaligned-zip',
        'trimmed-zip',
        'stats',
        'scatterplot',
    ]
    parsed = parse_options(usage, options, args)
    orthologs_zip, retained_threshold, max_indel_length = parsed[0], parsed[1], parsed[2]
    aligned_zip, misaligned_zip, trimmed_zip = parsed[3], parsed[4], parsed[5]
    target_stats_path, target_scatterplot = parsed[6], parsed[7]

    #Integer conversion up front lets us fail fast on wrongly formatted argument values
    retained_threshold = int(retained_threshold)
    max_indel_length = int(max_indel_length)

    #Work inside a temporary folder, so simultaneous runs do not interfere
    run_dir = tempfile.mkdtemp(prefix='align_trim_')

    #Extracted orthologs go into their own sub directory
    orthologs_dir = create_directory('orthologs', inside_dir=run_dir)
    sico_files = extract_archive_of_files(orthologs_zip, orthologs_dir)

    #Align first, so all sequences become equal length sequences
    aligned_files = _align_sicos(run_dir, sico_files)

    #Then trim; this returns the trimmed files and the misaligned files
    trim_result = _trim_alignments(run_dir, aligned_files, retained_threshold, max_indel_length,
                                   target_stats_path, target_scatterplot)
    trimmed_files, misaligned_files = trim_result

    #Write the three archives to the output paths given on the command line
    create_archive_of_files(aligned_zip, aligned_files)
    create_archive_of_files(misaligned_zip, misaligned_files)
    create_archive_of_files(trimmed_zip, trimmed_files)

    #Free disk space taken up by intermediate files
    shutil.rmtree(run_dir)

    #List everything that was produced
    outputs = (aligned_zip, misaligned_zip, trimmed_zip, target_stats_path, target_scatterplot)
    log.info('Produced: \n%s', '\n'.join(outputs))
Exemplo n.º 11
0
def main(args):
    """Entry point for command line or pipeline use of run_orthomcl.

    Extracts the proteome files, optionally adds the formatted & translated ortholog limiter file to them,
    and calls run_orthomcl with the filter settings and both destination paths.

    args -- command line arguments, parsed by parse_options according to the usage text below
    """
    usage = """
Usage: run_orthomcl.py
--protein-zip=FILE           zip archive of translated protein files
--ortholog-limiter=FILE      nucleotide fasta file containing coding regions in individual records. this file will be
                             translated to protein and fed into orthomcl along with files in protein - zip to influence
                             the clustering of orthologs, and (optionally) later the extraction of orthologs [OPTIONAL]
--poor-protein-length=INT    filter poor proteins when smaller than poor-protein-length
--evalue-exponent=INT        filter OrthoMCL BLAST similarities with Expect value exponents greater than this value
--poor-proteins=FILE         destination file path for filtered poor proteins
--groups=FILE                destination file path for file listing groups of orthologous proteins
"""
    options = ['protein-zip', 'ortholog-limiter=?', 'poor-protein-length', 'evalue-exponent', 'poor-proteins',
               'groups']
    (protein_zipfile, limiter_file, poor_protein_length, evalue_exponent,
     target_poor_proteins, target_groups_path) = parse_options(usage, options, args)

    #Unpack the proteome files into a temporary directory
    temp_dir = tempfile.mkdtemp(prefix='orthomcl_proteins_')
    proteome_files = extract_archive_of_files(protein_zipfile, temp_dir)

    if limiter_file:
        #The limiter is a nucleotide fasta file: give it the correct fasta headers first,
        #translate it to protein next, and only then add it to the proteome files
        formatted_limiter = format_fasta_genome_headers('limiter', limiter_file)
        proteome_files.append(translate_fasta_coding_regions(formatted_limiter))

    #Hand everything over to orthomcl
    run_orthomcl(proteome_files, poor_protein_length, evalue_exponent, target_poor_proteins, target_groups_path)

    #Extracted files are no longer needed
    shutil.rmtree(temp_dir)

    #Report the produced files
    log.info("Produced: \n%s\n%s", target_poor_proteins, target_groups_path)
Exemplo n.º 12
0
def main(args):
    """Entry point for command line or pipeline use of run_codeml.

    Reads the genome IDs of both taxa, passes them with the extracted SICO files to run_codeml_for_sicos,
    writes the dN/dS statistics file and archives the codeml output files.

    args -- command line arguments, parsed by parse_options according to the usage text below
    """
    usage = """
Usage: run_codeml.py
--genomes-a=FILE     file with GenBank Project IDs from complete genomes table on each line for taxon A
--genomes-b=FILE     file with GenBank Project IDs from complete genomes table on each line for taxon B
--sico-zip=FILE      archive of aligned & trimmed single copy orthologous (SICO) genes
--codeml-zip=FILE     destination file path for archive of codeml output per SICO gene
--dnds-stats=FILE     destination file path for file with dN, dS & dN/dS values per SICO gene
"""
    options = ['genomes-a', 'genomes-b', 'sico-zip', 'codeml-zip', 'dnds-stats']
    genome_a_ids_file, genome_b_ids_file, sico_zip, codeml_zip, dnds_file = parse_options(usage, options, args)

    def first_fields(ids_file):
        """Return the first whitespace-separated field of every line in ids_file."""
        with open(ids_file) as read_handle:
            return [line.split()[0] for line in read_handle]

    # GenBank Project IDs are the first field on each line
    genome_ids_a = first_fields(genome_a_ids_file)
    genome_ids_b = first_fields(genome_b_ids_file)

    # Everything relating to this run lives in run_dir
    run_dir = tempfile.mkdtemp(prefix='run_codeml_')

    # SICO files are extracted to their own sub directory
    sicos_dir = create_directory('sicos', inside_dir=run_dir)
    sico_files = extract_archive_of_files(sico_zip, sicos_dir)

    # The actual codeml run
    codeml_files = run_codeml_for_sicos(run_dir, genome_ids_a, genome_ids_b, sico_files)

    # Gather dnds values in a single output file
    _write_dnds_per_ortholog(dnds_file, codeml_files)

    # Archive the codeml files at the command line specified path
    create_archive_of_files(codeml_zip, codeml_files)

    # Free disk space now that the outputs are written
    shutil.rmtree(run_dir)

    # Report the produced files
    log.info("Produced: \n%s\n%s", codeml_zip, dnds_file)
Exemplo n.º 13
0
def main(args):
    """Entry point for command line or pipeline use of split_by_taxa.

    Reads IDs and a common prefix for both taxa, splits the extracted ortholog alignments by taxon and
    archives the resulting files per taxon.

    args -- command line arguments, parsed by parse_options according to the usage text below

    Returns the tuple of destination archive paths: (taxon_a_zip, taxon_b_zip).
    """
    usage = """
Usage: split_by_taxa.py
--genomes-a=FILE        file with genome GenBank Project ID and Organism name on each line for taxon A
--genomes-b=FILE        file with genome GenBank Project ID and Organism name on each line for taxon B
--orthologs-zip=FILE    archive of aligned & trimmed single copy orthologous (SICO) genes
--taxon-a-zip=FILE      destination file path for archive of SICO genes belonging to taxon A
--taxon-b-zip=FILE      destination file path for archive of SICO genes belonging to taxon B
"""
    options = ['genomes-a', 'genomes-b', 'orthologs-zip', 'taxon-a-zip', 'taxon-b-zip']
    (genome_a_ids_file, genome_b_ids_file, orthologs_zip,
     taxon_a_zip, taxon_b_zip) = parse_options(usage, options, args)

    def read_taxon(ids_file, label):
        """Return (IDs, common prefix) tuple: first & second tab-separated columns of ids_file respectively."""
        with open(ids_file) as read_handle:
            rows = [line.split('\t') for line in read_handle]
            ids = [row[0] for row in rows]
            prefix = _common_prefix([row[1] for row in rows], label)
        return ids, prefix

    #First column holds the project IDs, second column is passed to _common_prefix along with a taxon label
    taxon_a = read_taxon(genome_a_ids_file, 'taxon_a')
    taxon_b = read_taxon(genome_b_ids_file, 'taxon_b')

    #All files related to this run live in run_dir
    run_dir = tempfile.mkdtemp(prefix='split_by_taxa_')

    #Alignments are extracted to their own sub directory
    alignments_dir = create_directory('alignments', inside_dir=run_dir)
    ortholog_files = extract_archive_of_files(orthologs_zip, alignments_dir)

    #The actual split of alignments per taxon
    taxon_a_files, taxon_b_files = split_alignment_by_taxa(run_dir, ortholog_files, taxon_a, taxon_b)

    #Archive the files per taxon at the command line specified paths
    for target_zip, taxon_files in ((taxon_a_zip, taxon_a_files), (taxon_b_zip, taxon_b_files)):
        create_archive_of_files(target_zip, taxon_files)

    #Free disk space now that the archives are written
    shutil.rmtree(run_dir)

    #Report & return the produced archives
    log.info("Produced: \n%s\n%s", taxon_a_zip, taxon_b_zip)
    return taxon_a_zip, taxon_b_zip
Exemplo n.º 14
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Extracts the archive given by --sico-zip into a temporary directory and lets create_crosstable write a crosstable
    for the extracted files to the path given by --crosstable. The temporary directory is always removed afterwards.

    args -- command line arguments, parsed by parse_options according to the usage text below
    """
    usage = """
Usage: crosstable_gene_ids.py
--sico-zip=FILE      archive of single copy orthologous (SICO) genes in separate files per ortholog
--crosstable=FILE    destination file path for crosstable between orthologs & genomes with gene IDs at intersections
"""
    options = ['sico-zip', 'crosstable']
    sico_zip, target_crosstable = parse_options(usage, options, args)

    #Create tempdir
    run_dir = tempfile.mkdtemp(prefix='crosstable_')
    try:
        sico_files = extract_archive_of_files(sico_zip, run_dir)

        #Create crosstable
        create_crosstable(sico_files, target_crosstable)
    finally:
        #Remove extracted files to free disk space, also when extraction or crosstable creation failed
        shutil.rmtree(run_dir)

    #Exit after a comforting log message
    logging.info("Produced: \n%s", target_crosstable)
Exemplo n.º 15
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Extracts the archive given by --sico-zip into a temporary directory and lets create_crosstable write a crosstable
    for the extracted files to the path given by --crosstable. The temporary directory is always removed afterwards.

    args -- command line arguments, parsed by parse_options according to the usage text below
    """
    usage = """
Usage: crosstable_gene_ids.py
--sico-zip=FILE      archive of single copy orthologous (SICO) genes in separate files per ortholog
--crosstable=FILE    destination file path for crosstable between orthologs & genomes with gene IDs at intersections
"""
    options = ['sico-zip', 'crosstable']
    sico_zip, target_crosstable = parse_options(usage, options, args)

    #Create tempdir
    run_dir = tempfile.mkdtemp(prefix='crosstable_')
    try:
        sico_files = extract_archive_of_files(sico_zip, run_dir)

        #Create crosstable
        create_crosstable(sico_files, target_crosstable)
    finally:
        #Remove extracted files to free disk space, also when extraction or crosstable creation failed
        shutil.rmtree(run_dir)

    #Exit after a comforting log message
    logging.info("Produced: \n%s", target_crosstable)
Exemplo n.º 16
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Extracts the SICO archive into a temporary run directory, runs codeml for the extracted files through
    run_codeml_for_sicos, writes the per ortholog values to --dnds-stats and archives the codeml output files to
    --codeml-zip. The run directory is removed afterwards, also when one of these steps fails.

    args -- command line arguments, parsed by parse_options according to the usage text below
    """
    usage = """
Usage: run_codeml.py
--genomes-a=FILE     file with GenBank Project IDs from complete genomes table on each line for taxon A
--genomes-b=FILE     file with GenBank Project IDs from complete genomes table on each line for taxon B
--sico-zip=FILE      archive of aligned & trimmed single copy orthologous (SICO) genes
--codeml-zip=FILE     destination file path for archive of codeml output per SICO gene
--dnds-stats=FILE     destination file path for file with dN, dS & dN/dS values per SICO gene
"""
    options = ['genomes-a', 'genomes-b', 'sico-zip', 'codeml-zip', 'dnds-stats']
    genome_a_ids_file, genome_b_ids_file, sico_zip, codeml_zip, dnds_file = parse_options(usage, options, args)

    def _read_genome_ids(ids_file):
        """Return the first whitespace separated field of every non-blank line in ids_file."""
        with open(ids_file) as read_handle:
            # Blank lines have no first field: skip them instead of failing with an IndexError
            return [line.split()[0] for line in read_handle if line.strip()]

    # Parse files to extract GenBank Project IDs
    genome_ids_a = _read_genome_ids(genome_a_ids_file)
    genome_ids_b = _read_genome_ids(genome_b_ids_file)

    # Create run_dir to hold files relating to this run
    run_dir = tempfile.mkdtemp(prefix='run_codeml_')
    try:
        # Extract files from zip archive
        sico_files = extract_archive_of_files(sico_zip, create_directory('sicos', inside_dir=run_dir))

        # Actually run codeml
        codeml_files = run_codeml_for_sicos(run_dir, genome_ids_a, genome_ids_b, sico_files)

        # Write dnds values to single output file
        _write_dnds_per_ortholog(dnds_file, codeml_files)

        # Write the produced files to command line argument filenames
        create_archive_of_files(codeml_zip, codeml_files)
    finally:
        # Remove unused files to free disk space, even when one of the steps above raised
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info("Produced: \n%s\n%s", codeml_zip, dnds_file)
Exemplo n.º 17
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Extracts the protein archive into a temporary directory, optionally adds a formatted & translated limiter file to
    the extracted proteome files, and hands everything to run_orthomcl together with the filter settings and the
    destination paths. The temporary directory is removed afterwards, also when one of these steps fails.

    args -- command line arguments, parsed by parse_options according to the usage text below
    """
    usage = """
Usage: run_orthomcl.py
--protein-zip=FILE           zip archive of translated protein files
--ortholog-limiter=FILE      nucleotide fasta file containing coding regions in individual records. this file will be
                             translated to protein and fed into orthomcl along with files in protein - zip to influence
                             the clustering of orthologs, and (optionally) later the extraction of orthologs [OPTIONAL]
--poor-protein-length=INT    filter poor proteins when smaller than poor-protein-length
--evalue-exponent=INT        filter OrthoMCL BLAST similarities with Expect value exponents greater than this value
--poor-proteins=FILE         destination file path for filtered poor proteins
--groups=FILE                destination file path for file listing groups of orthologous proteins
"""
    options = ['protein-zip', 'ortholog-limiter=?', 'poor-protein-length', 'evalue-exponent', 'poor-proteins', 'groups']
    protein_zipfile, limiter_file, poor_protein_length, evalue_exponent, target_poor_proteins, target_groups_path = \
        parse_options(usage, options, args)

    #Extract files from zip archive
    temp_dir = tempfile.mkdtemp(prefix='orthomcl_proteins_')
    try:
        proteome_files = extract_archive_of_files(protein_zipfile, temp_dir)

        #If limiter file is defined, add it to the set of protein files
        if limiter_file:
            #First format nucleotide fasta file to contain the correct fasta headers
            formatted_fasta_file = format_fasta_genome_headers('limiter', limiter_file)
            #Then translate it from nucleotide to protein
            translated_limiter = translate_fasta_coding_regions(formatted_fasta_file)
            #Then append it to the list of proteome files
            proteome_files.append(translated_limiter)

        #Actually run orthomcl
        run_orthomcl(proteome_files, poor_protein_length, evalue_exponent, target_poor_proteins, target_groups_path)
    finally:
        #Remove unused files to free disk space, even when one of the steps above raised
        shutil.rmtree(temp_dir)

    #Exit after a comforting log message
    log.info("Produced: \n%s\n%s", target_poor_proteins, target_groups_path)
Exemplo n.º 18
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Extracts the orthologs archive into a temporary run directory, applies the COG and/or recombination filters when
    the corresponding options were given, writes the filtered out & retained orthologs to their destination paths and
    finally runs post_recombination_filter on the retained archive. The run directory is removed afterwards, also when
    one of these steps fails.

    args -- command line arguments, parsed by parse_options according to the usage text below
    """
    usage = """
Usage: filter_orthologs.py
--orthologs-zip=FILE            archive of orthologous genes in FASTA format
--filter-multiple-cogs          filter orthologs with multiple COG annotations among genes [OPTIONAL]

--filter-recombination=FILE     filter orthologs that show recombination when comparing phylogenetic trees [OPTIONAL]
                                destination file path for archive of recombination orthologs
--recombined-crosstable=FILE    destination file path for recombined crosstable of GeneIDs, COGs and Products [OPTIONAL]
--taxon-a=FILE                  file with genome IDs for taxon A to use in recombination filtering
--taxon-b=FILE                  file with genome IDs for taxon B to use in recombination filtering
--retained-zip=FILE             destination file path for archive of retained orthologs after filtering

--orthologs-per-genome=FILE      destination file path for orthologs split out per genome, based on the retained.zip
--concatemer=FILE                destination file path for super-concatemer of all genomes
"""
    options = ('orthologs-zip', 'filter-multiple-cogs=?', 'filter-recombination=?', 'recombined-crosstable=?',
               'taxon-a=?', 'taxon-b=?', 'retained-zip', 'orthologs-per-genome', 'concatemer')
    orthologs_zip, filter_cogs, filter_recombination, recombined_crosstable, \
    taxona, taxonb, retained_zip, target_orth_per_genome, target_concat_file = parse_options(usage, options, args)

    def _read_genome_ids(ids_file):
        """Return the first whitespace separated field of every non-blank line in ids_file."""
        with open(ids_file) as read_handle:
            #Blank lines have no first field: skip them instead of failing with an IndexError
            return [line.split()[0] for line in read_handle if line.strip()]

    #Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='filter_orthologs_')
    try:
        #Extract files from zip archive
        temp_dir = create_directory('orthologs', inside_dir=run_dir)
        ortholog_files = extract_archive_of_files(orthologs_zip, temp_dir)

        #Filter orthologs with multiple COG annotations among genes if flag was set
        if filter_cogs:
            ortholog_files, transfered_cogs = _filter_multiple_cog_orthologs(run_dir, ortholog_files)

        #Possible extension: filter ortholog when any strain has been flagged as 'mobile element', 'phage' or 'IS element'

        #Filter orthologs that show recombination when comparing phylogenetic trees if flag was set
        if filter_recombination:
            #Parse files to extract GenBank Project IDs
            genome_ids_a = _read_genome_ids(taxona)
            genome_ids_b = _read_genome_ids(taxonb)
            ortholog_files, recombined_files = _phipack_for_all_orthologs(run_dir, ortholog_files,
                                                                           genome_ids_a, genome_ids_b)
            #Create crosstable
            #NOTE(review): recombined-crosstable appears to be optional, yet it is used unconditionally here -- confirm
            #that create_crosstable copes with the option being left out
            create_crosstable(recombined_files, recombined_crosstable)

        #Create archives of files on command line specified output paths
        if filter_cogs:
            shutil.move(transfered_cogs, filter_cogs)
        if filter_recombination:
            create_archive_of_files(filter_recombination, recombined_files)
        create_archive_of_files(retained_zip, ortholog_files)

        #Run the steps required after filtering orthologs
        #NOTE(review): taxon-a & taxon-b appear to be optional, yet they are passed on unconditionally -- confirm
        post_recombination_filter(taxona, taxonb, retained_zip,
                                  target_orth_per_genome, target_concat_file, run_dir)
    finally:
        #Remove unused files to free disk space, even when one of the steps above raised
        shutil.rmtree(run_dir)

    #Exit after a comforting log message
    log.info('Produced:')
    if filter_cogs:
        log.info(filter_cogs)
    if filter_recombination:
        log.info(filter_recombination)
    log.info(retained_zip)
    log.info(target_orth_per_genome)
    log.info(target_concat_file)
Exemplo n.º 19
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Reads the selected genome IDs, translates the associated genomes through translate_genomes, optionally extracts
    and translates user provided external genomes as well, and archives the resulting DNA & protein files to
    --dna-zip and --protein-zip. Only the directory holding the extracted external genomes is cleaned up.

    args -- command line arguments, parsed by parse_options according to the usage text below
    """
    usage = """
Usage: translate.py
--genomes=FILE         file with selected genome IDs followed by Organism Name on each line
--external-zip=FILE    optional archive of user provided external genomes containing formatted nucleotide fasta files
--dna-zip=FILE         destination file path for zip archive of extracted DNA files
--protein-zip=FILE     destination file path for zip archive of translated protein files
"""
    options = ['genomes', 'external-zip=?', 'dna-zip', 'protein-zip']
    genome_ids_file, external_zip, dna_zipfile, protein_zipfile = parse_options(
        usage, options, args)

    #Read GenBank Project IDs from genomes_file, each on their own line; the file is closed again before the actual
    #translation starts, as the external genomes step below appends to this very same file
    with open(genome_ids_file) as read_handle:
        genome_ids = [
            line.split()[0] for line in read_handle
            #Blank lines have no first field: skip them instead of failing with an IndexError
            if line.strip() and not line.startswith('#') and 'external genome' not in line
        ]

    dna_files = []
    protein_files = []
    if genome_ids:
        #Retrieve associated genome dictionaries from complete genomes table
        genomes = sorted(select_genomes_by_ids(genome_ids).values(), key=itemgetter('Organism/Name'))

        #Actually translate the genomes to produce a set of files for both dna files & protein files
        dna_files, protein_files = translate_genomes(genomes)

    external_dir = None
    try:
        #Also translate the external genomes
        if external_zip:
            #Extract external genomes archive
            external_dir = tempfile.mkdtemp(prefix='external_genomes_')
            external_dna_files = extract_archive_of_files(external_zip, external_dir)

            #Append IDs of external fasta files to genome IDs file
            _append_external_genomes(external_dna_files, genome_ids_file)

            #Translate individual files
            external_protein_files = [translate_fasta_coding_regions(dna_file) for dna_file in external_dna_files]

            #Add the files to the appropriate collections
            dna_files.extend(external_dna_files)
            protein_files.extend(external_protein_files)

        #Write the produced files to command line argument filenames
        create_archive_of_files(dna_zipfile, dna_files)
        create_archive_of_files(protein_zipfile, protein_files)
    finally:
        #Do not clean up extracted DNA files or Protein translations: Keep them as cache

        #But do clean up external_dir once the archives are created, or when creating them failed
        if external_dir is not None:
            shutil.rmtree(external_dir)

    #Exit after a comforting log message
    log.info("Produced: \n%s &\n%s", dna_zipfile, protein_zipfile)
Exemplo n.º 20
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Extracts the DNA archive into a temporary run directory, runs extract_orthologs on the extracted files, appends the
    orfans to the heatmap file and writes all produced archives & files to their command line specified destination
    paths. The run directory is removed afterwards, also when one of these steps fails.

    args -- command line arguments, parsed by parse_options according to the usage text below
    """
    usage = """
Usage: extract_orthologs.py
--genomes=FILE       file with GenBank Project IDs from complete genomes table on each line
--dna-zip=FILE       zip archive of extracted DNA files
--groups=FILE        file listing groups of orthologous proteins
--require-limiter    flag whether extracted core set of genomes should contain the limiter added in OrthoMCL [OPTIONAL]

--sico-zip=FILE      destination file path for archive of shared single copy orthologous (SICO) genes
--muco-zip=FILE      destination file path for archive of shared multiple copy orthologous genes
--subset-zip=FILE    destination file path for archive of variable copy orthologous genes shared for a subset only
--stats=FILE         destination file path for ortholog statistics file
--heatmap=FILE       destination file path heatmap of orthologs and occurrences of ortholog per genome
--orfans=FILE        destination file path ORFans
"""
    options = ['genomes', 'dna-zip', 'groups', 'require-limiter?',
               'sico-zip', 'muco-zip=?', 'subset-zip=?', 'stats', 'heatmap', 'orfans']
    genome_ids_file, dna_zip, groups_file, require_limiter, \
    target_sico, target_muco, target_subset, target_stats_path, target_heat, target_orfans = \
    parse_options(usage, options, args)

    #Parse file to extract GenBank Project IDs
    with open(genome_ids_file) as read_handle:
        #Blank lines have no first field: skip them instead of failing with an IndexError
        genomes = [line.split()[0] for line in read_handle if line.strip() and not line.startswith('#')]

    #Create temporary directory within which to extract orthologs
    run_dir = tempfile.mkdtemp(prefix='extract_orthologs_run_')
    try:
        #Extract files from zip archive
        temp_dir = create_directory('dna_files', inside_dir=run_dir)
        dna_files = extract_archive_of_files(dna_zip, temp_dir)

        #Actually run ortholog extraction
        sico_files, muco_files, subset_files, stats_file, heatmap_file, orfans_file = \
            extract_orthologs(run_dir, genomes, dna_files, groups_file, require_limiter)

        #Append the orfans to the heatmap file
        _append_orfans_to_heatmap(orfans_file, genomes, heatmap_file)

        #Move produced files to command line specified output paths
        create_archive_of_files(target_sico, sico_files)
        if target_muco:
            create_archive_of_files(target_muco, muco_files)
        if target_subset:
            create_archive_of_files(target_subset, subset_files)
        shutil.move(stats_file, target_stats_path)
        shutil.move(heatmap_file, target_heat)
        shutil.move(orfans_file, target_orfans)
    finally:
        #Remove unused files to free disk space, even when one of the steps above raised
        shutil.rmtree(run_dir)

    #Exit after a comforting log message
    log.info("Produced:")
    log.info("%s", target_sico)
    if target_muco:
        log.info("%s", target_muco)
    if target_subset:
        log.info("%s", target_subset)
    log.info("%s", target_stats_path)
    log.info("%s", target_heat)
    #The orfans file is produced as well, so list it along with the other outputs
    log.info("%s", target_orfans)
Exemplo n.º 21
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Extracts the orthologs archive into a temporary run directory, archives the coding regions per genome, builds the
    super-concatemer, derives two groups of genome IDs from a tree built on that concatemer, writes both groups to the
    taxon files and visualizes the tree. The run directory is removed afterwards, also when one of these steps fails.

    args -- command line arguments, parsed by parse_options according to the usage text below
    """
    usage = """
Usage: concatenate_orthologs.py
--orthologs-zip=FILE     archive of orthologous genes in FASTA format
--coding-regions=FILE    destination file path archive of trimmed orthologous coding regions per genomes
--concatemer=FILE        destination file path for super-concatemer of all genomes
--taxon-a=FILE           destination file path for genome IDs for taxon A
--taxon-b=FILE           destination file path for genome IDs for taxon B
--tree=FILE              destination file path for tree visualization
"""
    options = [
        'orthologs-zip', 'coding-regions', 'concatemer', 'taxon-a', 'taxon-b',
        'tree'
    ]
    orthologs_zip, target_coding_regions, target_concat_file, target_taxon_a, target_taxon_b, target_tree = \
        parse_options(usage, options, args)

    #Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='concatemer_tree_')
    try:
        #Extract files from zip archive
        temp_dir = create_directory('orthologs', inside_dir=run_dir)
        ortholog_files = extract_archive_of_files(orthologs_zip, temp_dir)

        #Separate out orthologs per genome to create trimmed coding region files per genome
        genome_coding_regions_files = coding_regions_per_genome(run_dir, ortholog_files)
        create_archive_of_files(target_coding_regions, genome_coding_regions_files)

        #Concatenate coding region files per genome
        concatemer_files = concatemer_per_genome(run_dir, genome_coding_regions_files)
        #Create super concatemer
        create_super_concatemer(concatemer_files, target_concat_file)

        #Determine the taxa present in the super concatemer tree by building a phylogenetic tree from genome concatemer
        #and reading genome ids in the two largest clades.
        super_distance_file = _run_dna_dist(run_dir, target_concat_file)
        super_tree_file = _run_neighbor(run_dir, super_distance_file)
        genome_ids_a, genome_ids_b = _read_taxa_from_tree(super_tree_file)

        #Map Project IDs to Organism names; items() rather than iteritems() so this runs on Python 2 and 3 alike
        selected_genomes = select_genomes_by_ids(genome_ids_a + genome_ids_b)
        id_to_name_map = dict((gid, genome['Organism/Name']) for gid, genome in selected_genomes.items())

        #Write Project IDs and Organism Names to files, with a fallback to genome_id for external genome
        for target_path, genome_ids in ((target_taxon_a, genome_ids_a), (target_taxon_b, genome_ids_b)):
            with open(target_path, mode='w') as write_handle:
                for genome_id in genome_ids:
                    name = id_to_name_map.get(genome_id, genome_id)
                    write_handle.write('{id}\t{name}\n'.format(id=genome_id, name=name))

        #Visualize tree
        visualize_tree(super_tree_file, id_to_name_map, target_tree)
    finally:
        #Remove unused files to free disk space, even when one of the steps above raised
        shutil.rmtree(run_dir)

    #Exit after a comforting log message
    log.info('Produced: \n%s\n%s\n%s\n%s\n%s', target_coding_regions,
             target_concat_file, target_taxon_a, target_taxon_b, target_tree)