Example #1
def _dna_file_per_sico(run_dir, dna_files, shared_single_copy, shared_multi_copy, non_shared):
    """Create fasta files with all sequences per ortholog."""
    #Delete & create directory to remove any previously existing SICO files
    sico_dir = create_directory('sico', inside_dir=run_dir)
    muco_dir = create_directory('muco', inside_dir=run_dir)
    subset_dir = create_directory('subset', inside_dir=run_dir)
    orfans_file = os.path.join(run_dir, 'ORFans.ffn')

    #Loop over DNA files to extract SICO genes from each genome to file per SICO
    sico_files = set()
    muco_files = set()
    subset_files = set()
    number_of_sequences = 0

    for dna_file in dna_files:
        log.info('Extracting orthologous genes from %s', dna_file)
        for record in SeqIO.parse(dna_file, 'fasta'):
            number_of_sequences += 1

            #Find record in each list of dictionaries, to append it to the corresponding ortholog files
            aff_sico_files = _write_record_to_ortholog_file(sico_dir, shared_single_copy, record)
            sico_files.update(aff_sico_files)
            aff_muco_files = _write_record_to_ortholog_file(muco_dir, shared_multi_copy, record)
            muco_files.update(aff_muco_files)
            aff_nonsha_files = _write_record_to_ortholog_file(subset_dir, non_shared, record)
            subset_files.update(aff_nonsha_files)

            #ORFans do not fall into any of the above three categories: Add them to a separate file
            if not aff_sico_files and not aff_muco_files and not aff_nonsha_files:
                with open(orfans_file, mode='a') as write_handle:
                    SeqIO.write(record, write_handle, 'fasta')

    return sorted(sico_files), sorted(muco_files), sorted(subset_files), number_of_sequences, orfans_file
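The helper _write_record_to_ortholog_file is not shown in this example. Below is a minimal sketch of what it plausibly does, assuming each ortholog is a dictionary mapping genome IDs to lists of gene IDs, that record IDs start with 'genome|gene', and an ortholog_NNNNNN.ffn naming scheme; all three are assumptions inferred from the calling code, not confirmed by it:

import os
from Bio import SeqIO

def _write_record_to_ortholog_file(ortholog_dir, orthologs, record):
    """Append record to the file of every ortholog that lists it, and return the affected files."""
    #Assumed record.id layout: 'genome|gene|...', matching the sample headers quoted in later examples
    genome_id, gene_id = record.id.split('|')[:2]
    affected_files = set()
    for number, ortholog in enumerate(orthologs):
        #Assumed ortholog layout: {genome_id: [gene_ids]}
        if gene_id in ortholog.get(genome_id, []):
            ortholog_file = os.path.join(ortholog_dir, 'ortholog_{0:06}.ffn'.format(number))
            with open(ortholog_file, mode='a') as write_handle:
                SeqIO.write(record, write_handle, 'fasta')
            affected_files.add(ortholog_file)
    return affected_files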
Example #2
def _step11_orthomcl_dump_pairs(run_dir, config_file):
    """Dump files from the database produced by the orthomclPairs program.

    usage: orthomclDumpPairsFiles config_file

    where:
        config_file : see below (you can use the same file given to orthomclPairs)

    Database Input:
        InParalog, Ortholog, CoOrtholog tables - populated by orthomclPairs

    Output files:
        orthomclMclInput - file required by the mcl program
        pairs/ -dir holding relationship files
            potentialOrthologs.txt - ortholog relationships
            potentialInparalogs.txt - inparalog relationships
            potentialCoorthologs.txt - coortholog relationships

    The pairs/ files contain the pairs found by the orthomclPairs tables, and their
    average normalized scores.  This is the same information as in the
    orthomclMclInput file, but segregated by relationship type.  These are
    candidate relationships (edges) that will subsequently be grouped (clustered)
    by the mcl program to form the OrthoMCL ortholog groups.  These files contain
    more sensitive and less selective relationships than the final ortholog groups.

    Standard Error:
        logging info

    EXAMPLE: orthomclSoftware/bin/orthomclDumpPairsFiles out_dir/orthomcl.config
    """
    #Run orthomclDumpPairsFile
    out_dir = create_directory('orthologs', inside_dir=run_dir)
    command = [ORTHOMCL_DUMP_PAIRS_FILES, config_file]
    log.info('Executing: %s', ' '.join(command))
    check_call(command, cwd=out_dir)

    #Desired destination output file paths
    mcl_dir = create_directory('mcl', inside_dir=run_dir)
    mclinput = os.path.join(mcl_dir, 'mclInput.tsv')
    orthologs = os.path.join(out_dir, 'potentialOrthologs.tsv')
    inparalogs = os.path.join(out_dir, 'potentialInparalogs.tsv')
    coorthologs = os.path.join(out_dir, 'potentialCoorthologs.tsv')

    #Move output files to desired destinations
    shutil.move(os.path.join(out_dir, 'mclInput'), mclinput)
    shutil.move(os.path.join(out_dir, 'pairs/orthologs.txt'), orthologs)
    shutil.move(os.path.join(out_dir, 'pairs/inparalogs.txt'), inparalogs)
    shutil.move(os.path.join(out_dir, 'pairs/coorthologs.txt'), coorthologs)

    #Assert mcl input file exists and has some content
    assert os.path.isfile(mclinput) and 0 < os.path.getsize(mclinput), mclinput + ' should exist and have some content'

    return mclinput, orthologs, inparalogs, coorthologs
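As a usage sketch, the returned mclinput path can be inspected directly: in --abc mode mcl expects one weighted edge per line, 'node_a<TAB>node_b<TAB>score', with node names keeping the taxon|protein_id form produced earlier in the pipeline. The call below is hypothetical:

mclinput, orthologs, inparalogs, coorthologs = _step11_orthomcl_dump_pairs(run_dir, config_file)
with open(mclinput) as read_handle:
    for line in list(read_handle)[:3]:  #peek at the first three edges
        node_a, node_b, score = line.rstrip('\n').split('\t')
        print('{0} <-> {1} scores {2}'.format(node_a, node_b, float(score)))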
Example #3
def _prepare_calculations(genomes_a_file,
                          genomes_b_file,
                          sicozip_file,
                          table_a_dest,
                          table_b_dest,
                          append_odd_even=False):
    '''Unzip the SICO files and, if needed, create temporary files holding only the odd or even codons.'''
    if append_odd_even:
        # prepend the file intro when the odd/even tables are also appended
        _write_intro_to_file(table_a_dest)
        _write_intro_to_file(table_b_dest)

    # extract ortholog files from sicozip
    rundir = tempfile.mkdtemp(prefix='calculations_')
    sico_files = extract_archive_of_files(sicozip_file, create_directory('sicos', inside_dir=rundir))

    # perform normal calculation
    run_calculations(genomes_a_file, genomes_b_file, sico_files, table_a_dest, table_b_dest)

    # separate calculations for odd and even tables
    if append_odd_even:
        odd_sico_files, even_sico_files = _split_by_odd_even_codons(sico_files)
        run_calculations(genomes_a_file, genomes_b_file, odd_sico_files, table_a_dest, table_b_dest)
        run_calculations(genomes_a_file, genomes_b_file, even_sico_files, table_a_dest, table_b_dest)

    # clean up
    shutil.rmtree(rundir)
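_split_by_odd_even_codons is not included here. A rough per-file sketch of such a split, assuming codon-aligned input; the real helper presumably writes temporary files rather than returning alignments, so names and return types below are illustrative only:

from Bio import AlignIO

def _split_codons(aligned_file):
    """Return (odd_codon_alignment, even_codon_alignment) for one codon-aligned fasta file."""
    alignment = AlignIO.read(aligned_file, 'fasta')
    length = alignment.get_alignment_length() - alignment.get_alignment_length() % 3
    odd, even = alignment[:, 0:3], alignment[:, 3:6]
    for start in range(6, length, 3):
        #Codon number start // 3 is even for the 1st, 3rd, 5th... codon
        if (start // 3) % 2 == 0:
            odd += alignment[:, start:start + 3]
        else:
            even += alignment[:, start:start + 3]
    return odd, even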
Example #4
def run_phipack(phipack_dir, dna_file):
    """Run PhiPack and return the number of informative sites, PHI, Max Chi^2 and NSS."""
    #Create directory for PhiPack to run in, so files get created there
    orth_name = os.path.split(dna_file)[1].split('.')[0]
    rundir = create_directory(orth_name, inside_dir=phipack_dir)

    #Build up the command line
    command = PHIPACK, '-f', dna_file, '-o'  # Output NSS & Max Chi^2
    try:
        check_call(command, cwd=rundir, stdout=open(os.devnull, mode='w'))
    except CalledProcessError as err:
        log.warn('Error running PhiPack for %s:\n%s', dna_file, err)
        return {'PhiPack sites': None, 'Phi': None, 'Max Chi^2': None, 'NSS': None}

    #Retrieve output log file contents
    logfile = os.path.join(rundir, 'Phi.log')
    with open(logfile) as read_handle:
        contents = ''.join(read_handle)

    #Parse standard output to retrieve values for # sites, Phi, Chi^2 max & NSS
    #Found 103 informative sites.
    #PHI (Normal):        9.04e-01
    #Max Chi^2:           6.60e-01  (1000 permutations)
    #NSS:                 6.31e-01  (1000 permutations)
    sites = int(re.search('Found ([0-9]+) informative sites.', contents).group(1))
    raw_phi = re.search('PHI \(Normal\):\s+(.*)', contents).group(1)
    phi = float(raw_phi) if raw_phi != '--' else None
    chi = float(re.search('Max Chi\^2:\s+(.*)\s+\(1000 permutations\)', contents).group(1))
    nss = float(re.search('NSS:\s+(.*)\s+\(1000 permutations\)', contents).group(1))
    return {'PhiPack sites': sites, 'Phi': phi, 'Max Chi^2': chi, 'NSS': nss}
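A standalone self-check of the parsing above against the sample lines quoted in the comments; this only exercises the same regular expressions and is not part of the pipeline:

import re

sample = ('Found 103 informative sites.\n'
          'PHI (Normal):        9.04e-01\n'
          'Max Chi^2:           6.60e-01  (1000 permutations)\n'
          'NSS:                 6.31e-01  (1000 permutations)\n')
assert int(re.search('Found ([0-9]+) informative sites.', sample).group(1)) == 103
assert float(re.search('PHI \(Normal\):\s+(.*)', sample).group(1)) == 0.904
assert float(re.search('Max Chi\^2:\s+(.*)\s+\(1000 permutations\)', sample).group(1)) == 0.66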
Example #5
def _run_dna_dist(run_dir, aligned_file):
    """Run dnadist to calculate distances between individual strains in a distance matrix, as input for neighbor."""
    #Run calculations inside a directory
    dnadist_dir = create_directory('dnadist/', inside_dir=run_dir)

    #Read alignment file
    alignment = AlignIO.read(aligned_file, 'fasta')

    #Convert the alignment into a proper input file for dnadist according to its specification
    nr_of_species = len(alignment)
    nr_of_sites = len(alignment[0])
    infile = os.path.join(dnadist_dir, 'infile')
    with open(infile, mode='w') as write_handle:
        write_handle.write('   {0}   {1}\n'.format(nr_of_species, nr_of_sites))

        for seq_record in alignment:
            name = seq_record.id.split('|')[0]
            name = name if len(name) < 10 else name[:10]
            write_handle.write('{0:10}{1}\n'.format(name, seq_record.seq))

    #Actually run the dnadist program in the correct directory, and send input to it for the first prompt
    process = Popen(DNADIST, cwd=dnadist_dir, stdin=PIPE, stdout=PIPE, stderr=STDOUT)
    process.communicate(input='Y\n')

    #Retrieve outputfile
    outfile = os.path.join(dnadist_dir, 'outfile')
    assert os.path.exists(outfile) and 0 < os.path.getsize(outfile), outfile + ' should exist with some content now'
    return outfile
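For reference, the writes above produce a PHYLIP-style input: a header with species and site counts, then one row per sequence whose name occupies exactly ten columns. A tiny check of that padding and truncation:

#'{0:10}' left-aligns and pads to ten columns, so sequence data always starts at column 11
assert '{0:10}{1}'.format('NC_010067', 'ATG') == 'NC_010067 ATG'
assert '{0:10}{1}'.format('NC_010067.1'[:10], 'ATG') == 'NC_010067.ATG'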
Example #6
def main(args):
    """Main function called when run from command line or as part of pipeline."""
    usage = """
Usage: calculations.py
--genomes-a=FILE     file with GenBank Project IDs from complete genomes table on each line for taxon A
--genomes-b=FILE     file with GenBank Project IDs from complete genomes table on each line for taxon B
--sico-zip=FILE      archive of aligned & trimmed single copy orthologous (SICO) genes
--table-a=FILE       destination file path for summary statistics table based on orthologs in taxon A
--table-b=FILE       destination file path for summary statistics table based on orthologs in taxon B
--append-odd-even    append separate tables calculated for odd and even codons of ortholog alignments [OPTIONAL]
"""
    options = ['genomes-a', 'genomes-b', 'sico-zip', 'table-a', 'table-b', 'append-odd-even?']
    genome_a_ids_file, genome_b_ids_file, sico_zip, table_a, table_b, oddeven = parse_options(usage, options, args)

    #Parse the files containing GenBank Project IDs to extract the IDs and the common organism name prefix
    with open(genome_a_ids_file) as read_handle:
        lines = [line.strip() for line in read_handle]
        genome_ids_a = [line.split()[0] for line in lines]
        common_prefix_a = os.path.commonprefix([line.split('\t')[1] for line in lines]).strip()
    with open(genome_b_ids_file) as read_handle:
        lines = [line.strip() for line in read_handle]
        genome_ids_b = [line.split()[0] for line in lines]
        common_prefix_b = os.path.commonprefix([line.split('\t')[1] for line in lines]).strip()

    #Prepend headers to each of the output tables
    _prepend_table_header(table_a, genome_ids_a, common_prefix_a, genome_ids_b, common_prefix_b, oddeven)
    _prepend_table_header(table_b, genome_ids_b, common_prefix_b, genome_ids_a, common_prefix_a, oddeven)

    #Create run_dir to hold files relating to this run
    run_dir = tempfile.mkdtemp(prefix='calculations_')

    #Extract files from zip archive
    sico_files = extract_archive_of_files(sico_zip, create_directory('sicos', inside_dir=run_dir))

    #Actually do calculations
    tmp_table_tuple = calculate_tables(genome_ids_a, genome_ids_b, sico_files, oddeven)

    #Write the produced files to command line argument filenames
    with open(table_a, mode='ab') as append_handle:
        shutil.copyfileobj(open(tmp_table_tuple[0], mode='rb'), append_handle)
    with open(table_b, mode='ab') as append_handle:
        shutil.copyfileobj(open(tmp_table_tuple[1], mode='rb'), append_handle)

    #Remove now unused files to free disk space
    shutil.rmtree(run_dir)
    os.remove(tmp_table_tuple[0])
    os.remove(tmp_table_tuple[1])

    #Exit after a comforting log message
    log.info("Produced: \n%s\n%s", table_a, table_b)
Example #7
def main(args):
    """Main function called when run from command line or as part of pipeline."""
    usage = """
Usage: run_phipack.py
--orthologs-zip=FILE     archive of orthologous genes in FASTA format
--stats-file=FILE        destination file path for values found through PhiPack for each ortholog
"""
    options = ('orthologs-zip', 'stats-file')
    orthologs_zip, stats_file = parse_options(usage, options, args)

    #Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='run_phipack_')

    #Extract files from zip archive
    extraction_dir = create_directory('extracted_orthologs', inside_dir=run_dir)
    ortholog_files = extract_archive_of_files(orthologs_zip, extraction_dir)

    #Find recombination in all ortholog_files
    _phipack_for_all_orthologs(run_dir, ortholog_files, stats_file)

    #Remove unused files to free disk space
    shutil.rmtree(run_dir)

    #Exit after a comforting log message
    log.info('Produced:\n%s', stats_file)
Example #8
def _step6_orthomcl_filter_fasta(run_dir, input_dir, min_length=10, max_percent_stop=20):
    """Create goodProteins.fasta containing all good proteins and rejectProteins.fasta containing all rejects. Input is
    a directory containing a set of compliant input .fasta files (as produced by orthomclAdjustFasta).

    Usage:
      orthomclFilterFasta input_dir min_length max_percent_stops

    where:
      input_dir:           a directory containing a set of .fasta files
      min_length:          minimum allowed length of proteins.  (suggested: 10)
      max_percent_stop:    maximum percent stop codons.  (suggested 20)

    The input requirements are:
      1) a compliantFasta/ directory which contains all and only the proteome .fasta files, one file per proteome.
      2) each .fasta file must have a name in the form 'xxxx.fasta' where xxxx is a three or four letter unique taxon
         code.  For example: hsa.fasta or eco.fasta
      3) each protein in those files must have a definition line in the following format:
         >xxxx|yyyyyy
         where xxxx is the three or four letter taxon code and yyyyyy is a sequence identifier unique within that taxon.

    Output:
        my_orthomcl_dir/goodProteins.fasta
        my_orthomcl_dir/poorProteins.fasta
        report of suspicious proteomes (> 10% poor proteins)

    EXAMPLE: orthomclSoftware/bin/orthomclFilterFasta my_orthomcl_dir/compliantFasta 10 20
    """
    #Run orthomclFilterFasta
    out_dir = create_directory('filtered_fasta', inside_dir=run_dir)
    report = os.path.join(out_dir, 'filter_report.log')
    with open(report, mode='w') as report_file:
        command = [ORTHOMCL_FILTER_FASTA, input_dir, str(min_length), str(max_percent_stop)]
        log.info('Executing: %s', ' '.join(command))
        check_call(command, stdout=report_file, stderr=STDOUT)

    #Move output files to out directory
    good = os.path.join(out_dir, 'good_proteins.fasta')
    poor = os.path.join(out_dir, 'poor_proteins.fasta')
    shutil.move('goodProteins.fasta', good)
    shutil.move('poorProteins.fasta', poor)

    #Ensure neither of the proteomes is suspicious according to min_length & max_percent_stop
    with open(report) as report_file:
        if 'Proteomes with > 10% poor proteins:' in report_file.read():  # OrthoMCL does NOT add actual min_length value
            msg = 'OrthomclFilterFasta found suspicious proteomes based on values for length'
            log.error(msg)
            assert False, msg

    #Warn the user about the poor proteins found here, if they were found at all
    poor_records = list(SeqIO.parse(poor, 'fasta'))
    if poor_records:
        log.warn('%i poor sequence records identified by orthomclFilterFasta:', len(poor_records))
        for seqr in poor_records:
            log.warn('>%s: %s', seqr.id, seqr.seq)

    #Assert good exists and has some content
    assert os.path.isfile(good) and 0 < os.path.getsize(good), good + ' should exist and have some content'

    #Only good and poor proteins
    return good, poor
Example #9
def run_codeml_for_sicos(codeml_dir, genome_ids_a, genome_ids_b, sico_files):
    """Run codeml for representatives of clades A and B in each of the SICO files, to calculate dN/dS."""
    log.info('Running codeml for {0} aligned and trimmed SICOs'.format(len(sico_files)))

    codeml_files = []
    for sico_file in sico_files:
        # Separate alignments for clade A & clade B genomes
        ali = AlignIO.read(sico_file, 'fasta')
        alignment_a = MultipleSeqAlignment(seqr for seqr in ali if seqr.id.split('|')[0] in genome_ids_a)
        alignment_b = MultipleSeqAlignment(seqr for seqr in ali if seqr.id.split('|')[0] in genome_ids_b)

        # Create sub directory for this run based on sico_file name
        filename = os.path.split(sico_file)[1]
        # Split off everything starting from the first dot
        base_name = filename[:filename.find('.')]
        sub_dir = create_directory(base_name, inside_dir=codeml_dir)

        # Submit for asynchronous calculation
        codeml_file = run_codeml(sub_dir, alignment_a, alignment_b)
        codeml_files.append(codeml_file)

    return codeml_files
Example #10
def _phipack_for_all_orthologs(run_dir, aligned_files, genome_ids_a, genome_ids_b):
    """Filter aligned fasta files where there is evidence of recombination when inspecting phylogenetic trees.
    Return two collections of aligned files, the first without recombination, the second with recombination."""

    log.info('Filtering orthologs where phylogenetic trees show evidence of inter-taxon recombination')

    #Collections to hold both non recombination files & files showing recombination
    non_recomb = []
    recombined = []

    #Assign ortholog files to the correct collection based on whether they show recombination
    for ortholog_file in aligned_files:
        #Determine input file base name to create an ortholog run specific directory
        base_name = os.path.split(os.path.splitext(ortholog_file)[0])[1]
        ortholog_dir = create_directory(base_name, inside_dir=run_dir)

        #Create distance file
        distance_file = _run_dna_dist(ortholog_dir, ortholog_file)

        #Create tree file
        tree_file = _run_neighbor(ortholog_dir, distance_file)

        #Parse tree file to ensure all genome_ids_a & genome_ids_b group together in the tree
        if _tree_shows_recombination(genome_ids_a, genome_ids_b, tree_file):
            recombined.append(ortholog_file)
        else:
            non_recomb.append(ortholog_file)

    log.info('%i Orthologs out of %i were filtered out due to recombination, leaving %i non recombined orthologs',
             len(recombined), len(aligned_files), len(non_recomb))

    return non_recomb, recombined
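_tree_shows_recombination is not shown on this page. A rough sketch of such a monophyly test using Bio.Phylo, assuming leaf names match the (possibly ten-character truncated) genome IDs written by _run_dna_dist; this is a guess at the approach, not the original implementation:

from Bio import Phylo

def _tree_shows_recombination(genome_ids_a, genome_ids_b, tree_file):
    """Report recombination when neither taxon forms a clade of its own in the tree."""
    tree = Phylo.read(tree_file, 'newick')
    taxon_a, taxon_b = set(genome_ids_a), set(genome_ids_b)
    for clade in tree.find_clades():
        leaf_names = set(leaf.name for leaf in clade.get_terminals())
        if leaf_names == taxon_a or leaf_names == taxon_b:
            #One monophyletic taxon implies the other is too in an unrooted bifurcating tree
            return False
    return True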
Example #11
def concatemer_per_genome(run_dir, genome_coding_regions_files):
    """Create a concatemer DNA file per genome containing all aligned & trimmed SICO genes."""
    concatemer_dir = create_directory('concatemers', inside_dir=run_dir)

    #Collection of output filenames
    concatemer_files = []

    #Loop over genome coding regions files to create concatemer of each
    for coding_region_file in genome_coding_regions_files:
        #Determine output file name
        filename = os.path.split(coding_region_file)[1]
        basename = filename[:filename.find('.')]
        concatemer_file = os.path.join(concatemer_dir, basename + '.concatemer.fna')
        concatemer_files.append(concatemer_file)

        #Copy ACTG content from coding regions file to concatemer
        with open(coding_region_file) as read_handle:
            with open(concatemer_file, mode='w') as write_handle:
                #Write out single concatemer header
                write_handle.write('> {0}|trimmed concatemer\n'.format(basename))

                #Copy over all lines that are not header lines (do not start with '>')
                for line in read_handle:
                    #Skip header lines
                    if not line.startswith('>'):
                        write_handle.write(line)

    log.info('Created %i genome concatemers', len(concatemer_files))
    return sorted(concatemer_files)
Example #12
def main(args):
    """Main function called when run from command line or as part of pipeline."""
    usage = """
Usage: concatenate_orthologs.py
--orthologs-zip=FILE     archive of orthologous genes in FASTA format
--coding-regions=FILE    destination file path for archive of trimmed orthologous coding regions per genome
--concatemer=FILE        destination file path for super-concatemer of all genomes
--taxon-a=FILE           destination file path for genome IDs for taxon A
--taxon-b=FILE           destination file path for genome IDs for taxon B
--tree=FILE              destination file path for tree visualization
"""
    options = ['orthologs-zip', 'coding-regions', 'concatemer', 'taxon-a', 'taxon-b', 'tree']
    orthologs_zip, target_coding_regions, target_concat_file, target_taxon_a, target_taxon_b, target_tree = \
        parse_options(usage, options, args)

    #Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='concatemer_tree_')

    #Extract files from zip archive
    temp_dir = create_directory('orthologs', inside_dir=run_dir)
    ortholog_files = extract_archive_of_files(orthologs_zip, temp_dir)

    #Separate out orthologs per genome to create trimmed coding region files per genome
    genome_coding_regions_files = coding_regions_per_genome(run_dir, ortholog_files)
    create_archive_of_files(target_coding_regions, genome_coding_regions_files)

    #Concatenate coding region files per genome
    concatemer_files = concatemer_per_genome(run_dir, genome_coding_regions_files)
    #Create super concatemer
    create_super_concatemer(concatemer_files, target_concat_file)

    #Determine the taxa present in the super concatemer tree by building a phylogenetic tree from genome concatemer and
    #reading genome ids in the two largest clades.
    super_distance_file = _run_dna_dist(run_dir, target_concat_file)
    super_tree_file = _run_neighbor(run_dir, super_distance_file)
    genome_ids_a, genome_ids_b = _read_taxa_from_tree(super_tree_file)

    #Map Project IDs to Organism names
    id_to_name_map = dict((gid, genome['Organism/Name'])
                          for gid, genome in select_genomes_by_ids(genome_ids_a + genome_ids_b).iteritems())

    #Write Project IDs and Organism Names to files, with a fallback to genome_id for external genome
    with open(target_taxon_a, mode='w') as write_handle:
        for genome_id in genome_ids_a:
            write_handle.write('{id}\t{name}\n'.format(id=genome_id, name=id_to_name_map.get(genome_id, genome_id)))
    with open(target_taxon_b, mode='w') as write_handle:
        for genome_id in genome_ids_b:
            write_handle.write('{id}\t{name}\n'.format(id=genome_id, name=id_to_name_map.get(genome_id, genome_id)))

    #Visualize tree
    visualize_tree(super_tree_file, id_to_name_map, target_tree)

    #Remove unused files to free disk space
    shutil.rmtree(run_dir)

    #Exit after a comforting log message
    log.info('Produced: \n%s\n%s\n%s\n%s\n%s', target_coding_regions, target_concat_file,
             target_taxon_a, target_taxon_b, target_tree)
Example #13
def _step5_orthomcl_adjust_fasta(run_dir, proteome_files, id_field=3):
    """Create an OrthoMCL compliant .fasta file, by adjusting definition lines.

    Usage:
      orthomclAdjustFasta taxon_code fasta_file id_field

    where:
      taxon_code:  a three or four letter unique abbreviation for the taxon
      fasta_file:  the input fasta file per proteome
      id_field:    a number indicating what field in the definition line contains
                   the protein ID.  Fields are separated by either ' ' or '|'. Any
                   spaces immediately following the '>' are ignored.  The first
                   field is 1. For example, in the following definition line, the
                   ID (AP_000668.1) is in field 4:  >gi|89106888|ref|AP_000668.1|

    Input file requirements:
      (1) .fasta format
      (2) a unique id is provided for each sequence, and is in the field specified
          by id_field

    Output file format:
      (1) .fasta format
      (2) definition line is of the form:
             >taxoncode|unique_protein_id

    The output file is named taxoncode.fasta

    Note: if your input files do not meet the requirements, you can do some simple perl or awk processing of them to
    create the required input files to this program, or the required output files.  This program is provided as a
    convenience, but OrthoMCL users are expected to have the scripting skills to provide compliant .fasta files.

    EXAMPLE: orthomclSoftware/bin/orthomclAdjustFasta hsa Homo_sapiens.NCBI36.53.pep.all.fa 1
    """
    #Create directory to hold compliant fasta
    adjusted_fasta_dir = create_directory('compliant_fasta', inside_dir=run_dir)
    adjusted_fasta_files = []
    for proteome_file in proteome_files:
        taxon_code = None
        #Use first part of header of first entry as taxon code
        for record in SeqIO.parse(proteome_file, 'fasta'):
            taxon_code = record.id.split('|')[0]
            break

        # If we failed to extract a taxon_code, proteome file must have been empty
        assert taxon_code, 'Proteome file appears empty: ' + proteome_file

        #Call orthomclAdjustFasta
        command = [ORTHOMCL_ADJUST_FASTA, taxon_code, proteome_file, str(id_field)]
        log.info('Executing: %s', ' '.join(command))
        check_call(command)
        #Move resulting fasta file to compliantFasta directory
        adjusted_fasta_file = taxon_code + '.fasta'
        fasta_file_destination = os.path.join(adjusted_fasta_dir, adjusted_fasta_file)
        shutil.move(adjusted_fasta_file, fasta_file_destination)
        adjusted_fasta_files.append(fasta_file_destination)
    #Return path to directory containing compliantFasta
    return adjusted_fasta_dir, adjusted_fasta_files
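The field numbering from the docstring can be reproduced directly: splitting the quoted definition line on ' ' and '|' puts the protein ID in field 4. A standalone illustration, not part of the snippet:

import re

definition_line = '>gi|89106888|ref|AP_000668.1|'
fields = [field for field in re.split('[ |]', definition_line.lstrip('> ')) if field]
assert fields[4 - 1] == 'AP_000668.1'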
Example #14
def main(args):
    """Main function called when run from command line or as part of pipeline."""
    usage = """
Usage: filter_orthologs.py
--orthologs-zip=FILE           archive of orthologous genes in FASTA format
--retained-threshold=PERC      filter orthologs that retain less than PERC % of sequence after trimming alignment
--max-indel-length=NUMBER      filter orthologs that contain insertions / deletions longer than N in middle of alignment
--aligned-zip=FILE             destination file path for archive of aligned orthologous genes
--misaligned-zip=FILE          destination file path for archive of misaligned orthologous genes
--trimmed-zip=FILE             destination file path for archive of aligned & trimmed orthologous genes
--stats=FILE                   destination file path for ortholog trimming statistics file
--scatterplot=FILE             destination file path for scatterplot of retained and filtered sequences by length
"""
    options = ['orthologs-zip', 'retained-threshold', 'max-indel-length',
               'aligned-zip', 'misaligned-zip', 'trimmed-zip', 'stats', 'scatterplot']
    orthologs_zip, retained_threshold, max_indel_length, \
    aligned_zip, misaligned_zip, trimmed_zip, target_stats_path, target_scatterplot = \
        parse_options(usage, options, args)

    #Convert retained threshold to integer, so we can fail fast if argument value format was wrong
    retained_threshold = int(retained_threshold)
    max_indel_length = int(max_indel_length)

    #Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='align_trim_')

    #Extract files from zip archive
    temp_dir = create_directory('orthologs', inside_dir=run_dir)
    sico_files = extract_archive_of_files(orthologs_zip, temp_dir)

    #Align SICOs so all sequences become equal length sequences
    aligned_files = _align_sicos(run_dir, sico_files)

    #Filter orthologs that retain less than PERC % of sequence after trimming alignment
    trimmed_files, misaligned_files = _trim_alignments(run_dir, aligned_files, retained_threshold, max_indel_length,
                                                       target_stats_path, target_scatterplot)

    #Create archives of files on command line specified output paths
    create_archive_of_files(aligned_zip, aligned_files)
    create_archive_of_files(misaligned_zip, misaligned_files)
    create_archive_of_files(trimmed_zip, trimmed_files)

    #Remove unused files to free disk space
    shutil.rmtree(run_dir)

    #Exit after a comforting log message
    log.info('Produced: \n%s', '\n'.join((aligned_zip, misaligned_zip, trimmed_zip,
                                         target_stats_path, target_scatterplot)))
Example #15
def _trim_alignments(run_dir, dna_alignments, retained_threshold, max_indel_length, stats_file, scatterplot_file):
    """Trim all DNA alignments using _trim_alignment (singular), and calculate some statistics about the trimming."""
    log.info('Trimming {0} DNA alignments from first non-gap codon to last non-gap codon'.format(len(dna_alignments)))

    #Create directory here, to prevent race-condition when folder does not exist, but is then created by another process
    trimmed_dir = create_directory('trimmed', inside_dir=run_dir)

    # Trim all the alignments
    trim_tpls = [_trim_alignment((trimmed_dir, dna_alignment, max_indel_length)) for dna_alignment in dna_alignments]

    remaining_percts = [tpl[3] for tpl in trim_tpls]
    trimmed_alignments = [tpl[0] for tpl in trim_tpls if retained_threshold <= tpl[3]]
    misaligned = [tpl[0] for tpl in trim_tpls if retained_threshold > tpl[3]]

    #Write trim statistics to file in such a way that they're easily converted to a graph in Galaxy
    with open(stats_file, mode='w') as append_handle:
        msg = '{0:6} sequence alignments trimmed'.format(len(trim_tpls))
        log.info(msg)
        append_handle.write('#' + msg + '\n')

        average_retained = sum(remaining_percts) / len(remaining_percts)
        msg = '{0:5.1f}% sequence retained on average overall'.format(average_retained)
        log.info(msg)
        append_handle.write('#' + msg + '\n')

        filtered = len(misaligned)
        msg = '{0:6} orthologs filtered because less than {1}% sequence retained or because of indel longer than {2} '\
            .format(filtered, str(retained_threshold), max_indel_length)
        log.info(msg)
        append_handle.write('#' + msg + '\n')

        append_handle.write('# Trimmed file\tOriginal length\tTrimmed length\tPercentage retained\n')
        for tpl in sorted(trim_tpls, key=itemgetter(3)):
            append_handle.write(os.path.split(tpl[0])[1] + '\t')
            append_handle.write(str(tpl[1]) + '\t')
            append_handle.write(str(tpl[2]) + '\t')
            append_handle.write('{0:.2f}\n'.format(tpl[3]))

    #Create scatterplot using trim_tuples
    scatterplot(retained_threshold, trim_tpls, scatterplot_file)

    return sorted(trimmed_alignments), sorted(misaligned)
Example #16
def _run_neighbor(run_dir, distance_file):
    """Run neighbor to generate a tree of the distances in the distance file, and return the generated tree file."""
    neighbor_dir = create_directory('neighbor', inside_dir=run_dir)

    #Copy outfile from dnadist to infile inside neighbor_dir
    shutil.copy(distance_file, os.path.join(neighbor_dir, 'infile'))

    #Actually run neighbor
    process = Popen(NEIGHBOR, cwd=neighbor_dir, stdin=PIPE, stdout=PIPE, stderr=STDOUT)
    process.communicate(input='N\nY\n')

    #Retrieve newick tree file
    treefile = os.path.join(neighbor_dir, 'outtree')
    assert os.path.exists(treefile) and 0 < os.path.getsize(treefile), treefile + ' should exist with some content now'
    return treefile
Example #17
def _create_blast_database(run_dir, fasta_file, nucleotide=False):
    """Create blast database"""
    assert os.path.exists(MAKEBLASTDB) and os.access(MAKEBLASTDB, os.X_OK), 'Could not find or run ' + MAKEBLASTDB

    dbtype = 'nucl' if nucleotide else 'prot'
    db_dir = create_directory('blast', inside_dir=run_dir)
    db_name = 'my_{0}_blast_db'.format(dbtype)
    log_file = os.path.join(db_dir, 'makeblastdb.log')
    with open(log_file, mode='w') as open_file:
        command = [MAKEBLASTDB,
                   '-in', fasta_file,
                   '-dbtype', dbtype,
                   '-out', os.path.join(db_dir, db_name)]
        log.info('Executing: %s', ' '.join(command))
        check_call(command, stdout=open_file)
    return db_dir, db_name
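A usage sketch: for a protein database, makeblastdb writes its index files (.phr, .pin and .psq; .nhr/.nin/.nsq for nucleotide) next to the chosen name, so a caller can sanity-check the result. Paths below are hypothetical:

db_dir, db_name = _create_blast_database('/tmp/example_run', 'proteins.fasta', nucleotide=False)
for extension in ('.phr', '.pin', '.psq'):
    assert os.path.isfile(os.path.join(db_dir, db_name + extension)), 'makeblastdb should have written ' + extension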
Example #18
def main(args):
    """Main function called when run from command line or as part of pipeline."""
    usage = """
Usage: split_by_taxa.py
--genomes-a=FILE        file with genome GenBank Project ID and Organism name on each line for taxon A
--genomes-b=FILE        file with genome GenBank Project ID and Organism name on each line for taxon B
--orthologs-zip=FILE    archive of aligned & trimmed single copy orthologous (SICO) genes
--taxon-a-zip=FILE      destination file path for archive of SICO genes belonging to taxon A
--taxon-b-zip=FILE      destination file path for archive of SICO genes belonging to taxon B
"""
    options = ['genomes-a', 'genomes-b', 'orthologs-zip', 'taxon-a-zip', 'taxon-b-zip']
    genome_a_ids_file, genome_b_ids_file, orthologs_zip, taxon_a_zip, taxon_b_zip = parse_options(usage, options, args)

    #Parse the files containing RefSeq project IDs to extract the IDs and a common name prefix
    with open(genome_a_ids_file) as read_handle:
        lines = [line.split('\t') for line in read_handle]
        genome_ids_a = [line[0] for line in lines]
        common_prefix_a = _common_prefix([line[1] for line in lines], 'taxon_a')
    with open(genome_b_ids_file) as read_handle:
        lines = [line.split('\t') for line in read_handle]
        genome_ids_b = [line[0] for line in lines]
        common_prefix_b = _common_prefix([line[1] for line in lines], 'taxon_b')

    #Create run_dir to hold files related to this run
    run_dir = tempfile.mkdtemp(prefix='split_by_taxa_')

    #Extract files from zip archive
    ortholog_files = extract_archive_of_files(orthologs_zip, create_directory('alignments', inside_dir=run_dir))

    #Actually split alignments per taxon
    taxon_a_files, taxon_b_files = split_alignment_by_taxa(run_dir, ortholog_files,
                                                           (genome_ids_a, common_prefix_a),
                                                           (genome_ids_b, common_prefix_b))

    #Write the produced files to command line argument filenames
    create_archive_of_files(taxon_a_zip, taxon_a_files)
    create_archive_of_files(taxon_b_zip, taxon_b_files)

    #Remove unused files to free disk space
    shutil.rmtree(run_dir)

    #Exit after a comforting log message
    log.info("Produced: \n%s\n%s", taxon_a_zip, taxon_b_zip)
    return taxon_a_zip, taxon_b_zip
Example #19
def main(args):
    """Main function called when run from command line or as part of pipeline."""
    usage = """
Usage: run_codeml.py
--genomes-a=FILE     file with GenBank Project IDs from complete genomes table on each line for taxon A
--genomes-b=FILE     file with GenBank Project IDs from complete genomes table on each line for taxon B
--sico-zip=FILE      archive of aligned & trimmed single copy orthologous (SICO) genes
--codeml-zip=FILE     destination file path for archive of codeml output per SICO gene
--dnds-stats=FILE     destination file path for file with dN, dS & dN/dS values per SICO gene
"""
    options = ['genomes-a', 'genomes-b', 'sico-zip', 'codeml-zip', 'dnds-stats']
    genome_a_ids_file, genome_b_ids_file, sico_zip, codeml_zip, dnds_file = parse_options(usage, options, args)

    # Parse file to extract GenBank Project IDs
    with open(genome_a_ids_file) as read_handle:
        genome_ids_a = [line.split()[0] for line in read_handle]
    with open(genome_b_ids_file) as read_handle:
        genome_ids_b = [line.split()[0] for line in read_handle]

    # Create run_dir to hold files relating to this run
    run_dir = tempfile.mkdtemp(prefix='run_codeml_')

    # Extract files from zip archive
    sico_files = extract_archive_of_files(sico_zip, create_directory('sicos', inside_dir=run_dir))

    # Actually run codeml
    codeml_files = run_codeml_for_sicos(run_dir, genome_ids_a, genome_ids_b, sico_files)

    # Write dnds values to single output file
    _write_dnds_per_ortholog(dnds_file, codeml_files)

    # Write the produced files to command line argument filenames
    create_archive_of_files(codeml_zip, codeml_files)

    # Remove unused files to free disk space
    shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info("Produced: \n%s\n%s", codeml_zip, dnds_file)
Example #20
def _phipack_for_all_orthologs(run_dir, aligned_files, stats_file):
    """Filter aligned fasta files where there is evidence of recombination when inspecting PhiPack values.
    Return two collections of aligned files, the first without recombination, the second with recombination."""

    log.info('Running PhiPack for %i orthologs to find recombination', len(aligned_files))

    #Create separate directory for phipack related values
    phipack_dir = create_directory('phipack', inside_dir=run_dir)

    with open(stats_file, mode='w') as write_handle:
        write_handle.write('\t'.join(['Ortholog',
                                      'Informative sites',
                                      'Phi',
                                      'Max Chi^2',
                                      'NSS',
                                      'COGs',
                                      'Product']) + '\n')

        #Retrieve unique genomes from first ortholog file
        genome_ids = set(fasta_record.id.split('|')[0] for fasta_record in SeqIO.parse(aligned_files[0], 'fasta'))
        genome_dicts = select_genomes_by_ids(genome_ids).values()

        #Assign ortholog files to the correct collection based on whether they show recombination
        for ortholog_file in aligned_files:
            orth_name = os.path.split(ortholog_file)[1].split('.')[0]

            #Run PhiPack on this ortholog to detect signs of recombination
            phipack_values = run_phipack(phipack_dir, ortholog_file)

            #Write PhiPack values to line
            write_handle.write('{0}\t{1[PhiPack sites]}\t{1[Phi]}\t{1[Max Chi^2]}\t{1[NSS]}'.format(orth_name,
                                                                                                    phipack_values))

            #Parse sequence records again, but now to retrieve cogs and products
            seq_records = list(SeqIO.parse(ortholog_file, 'fasta'))
            #COGs
            cogs = find_cogs_in_sequence_records(seq_records)
            write_handle.write('\t' + ','.join(cogs))
            #Product
            product = get_most_recent_gene_name(genome_dicts, seq_records)
            write_handle.write('\t' + product)

            #End line
            write_handle.write('\n')
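Worth noting above: str.format can index into a dictionary argument, e.g. '{1[Phi]}', which turns the run_phipack result dictionary straight into a table row. A toy check:

values = {'PhiPack sites': 103, 'Phi': 0.904, 'Max Chi^2': 0.66, 'NSS': 0.631}
row = '{0}\t{1[PhiPack sites]}\t{1[Phi]}\t{1[Max Chi^2]}\t{1[NSS]}'.format('ortholog_000001', values)
assert row == 'ortholog_000001\t103\t0.904\t0.66\t0.631'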
Example #21
def _translate_genome(tuples_of_gbk_and_ptt_files):
    """Translate all files for genome and concatenate them into single DNA and Protein fasta files."""
    assert tuples_of_gbk_and_ptt_files is not None, 'No genbank files were provided'

    project_id = tuples_of_gbk_and_ptt_files[0][0]
    out_dir = create_directory('translations/' + project_id)
    dna_files = []
    protein_files = []
    for project_id, gbk_file, ptt_file in tuples_of_gbk_and_ptt_files:
        dna_file, protein_file = _extract_gene_and_protein(out_dir, project_id, gbk_file, ptt_file)
        dna_files.append(dna_file)
        protein_files.append(protein_file)

    #Concatenate files into one
    dna_concatemer = os.path.join(out_dir, '{pid}.ffn'.format(pid=project_id))
    protein_concatemer = os.path.join(out_dir, '{pid}.faa'.format(pid=project_id))
    concatenate(dna_concatemer, dna_files)
    concatenate(protein_concatemer, protein_files)
    return dna_concatemer, protein_concatemer
Example #22
def _step12_mcl(run_dir, mcl_input_file):
    """Markov Cluster Algorithm: http://www.micans.org/mcl/

    Input:
        mclInput file
    Output:
        mclOutput file

    mcl my_orthomcl_dir/mclInput --abc -I 1.5 -o my_orthomcl_dir/mclOutput
    """
    #Run mcl
    mcl_dir = create_directory('mcl', inside_dir=run_dir)
    mcl_output_file = os.path.join(mcl_dir, 'mclOutput.tsv')
    mcl_log = os.path.join(mcl_dir, 'mcl.log')
    with open(mcl_log, mode='w') as open_file:
        threads = str(multiprocessing.cpu_count())
        command = [MCL, mcl_input_file, '--abc', '-I', '1.5', '-o', mcl_output_file, '-te', threads]
        log.info('Executing: %s', ' '.join(command))
        check_call(command, stdout=open_file, stderr=STDOUT)
    return mcl_output_file
Example #23
def _download_genomes_table():
    '''Download the prokaryotes.txt genome table file from the NCBI FTP site, save a local copy and return contents.'''
    cache_dir = create_directory('')
    prokaryotes = 'prokaryotes.txt'
    output_file = os.path.join(cache_dir, prokaryotes)

    #Only download when existing file is older than a day
    time_between_downloads = 24 * 60 * 60
    if not os.path.isfile(output_file) or os.path.getmtime(output_file) < time.time() - time_between_downloads:
        #Login to FTP site
        ftp = FTP('ftp.ncbi.nlm.nih.gov')
        ftp.login(passwd='*****@*****.**')

        #Download ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/prokaryotes.txt
        from download_taxa_ncbi import _download_genome_file
        _download_genome_file(ftp, '/genomes/GENOME_REPORTS', prokaryotes, cache_dir, datetime.now())

    #Read file and return content
    with open(output_file) as read_handle:
        return read_handle.read()
Example #24
def coding_regions_per_genome(run_dir, trimmed_sicos):
    """Create a DNA file per genome containing all aligned & trimmed SICO genes als individual genes."""
    concatemer_dir = create_directory('coding_regions_per_genome',
                                      inside_dir=run_dir)
    log.info('Creating concatemers from {0} SICOs'.format(len(trimmed_sicos)))

    #Collections both for output files and their write handles, which will be reused for each SICO
    coding_region_files = []
    write_handles = {}

    #Loop over trimmed sico files to append each sequence to the right concatemer
    for trimmed_sico in trimmed_sicos:
        for seqr in SeqIO.parse(trimmed_sico, 'fasta'):
            #Sample header line: >58191|NC_010067.1|YP_001569097.1|COG4948MR|core
            project_id = seqr.id.split('|')[0]

            #Try to retrieve write handle from dictionary of cached write handles per genome
            write_handle = write_handles.get(project_id)

            #If not found, create & store write handle on demand
            if not write_handle:
                #Build up output file path for trimmed SICO genes per genome
                coding_region_file = os.path.join(
                    concatemer_dir, project_id + '.coding-regions.ffn')
                coding_region_files.append(coding_region_file)

                #Open write handle
                write_handle = open(coding_region_file, mode='w')
                write_handles[project_id] = write_handle

            #Write sequence record to coding-regions file
            SeqIO.write(seqr, write_handle, 'fasta')

    #Close genomes trimmed concatemer write handles
    for write_handle in write_handles.values():
        write_handle.close()

    log.info('Created %i genome coding regions files',
             len(coding_region_files))

    return sorted(coding_region_files)
Example #33
0
def _translate_genome(tuples_of_gbk_and_ptt_files):
    """Translate all files for genome and concatenate them into single DNA and Protein fasta files."""
    assert tuples_of_gbk_and_ptt_files is not None, 'No genbank files were provided'

    project_id = tuples_of_gbk_and_ptt_files[0][0]
    out_dir = create_directory('translations/' + project_id)
    dna_files = []
    protein_files = []
    for project_id, gbk_file, ptt_file in tuples_of_gbk_and_ptt_files:
        dna_file, protein_file = _extract_gene_and_protein(
            out_dir, project_id, gbk_file, ptt_file)
        dna_files.append(dna_file)
        protein_files.append(protein_file)

    #Concatenate files into one
    dna_concatemer = os.path.join(out_dir, '{pid}.ffn'.format(pid=project_id))
    protein_concatemer = os.path.join(out_dir,
                                      '{pid}.faa'.format(pid=project_id))
    concatenate(dna_concatemer, dna_files)
    concatenate(protein_concatemer, protein_files)
    return dna_concatemer, protein_concatemer
Example #34
0
def main(args):
    """Main function called when run from command line or as part of pipeline."""
    usage = """
Usage: run_codeml.py
--genomes-a=FILE     file with GenBank Project IDs from complete genomes table on each line for taxon A
--genomes-b=FILE     file with GenBank Project IDs from complete genomes table on each line for taxon B
--sico-zip=FILE      archive of aligned & trimmed single copy orthologous (SICO) genes
--codeml-zip=FILE     destination file path for archive of codeml output per SICO gene
--dnds-stats=FILE     destination file path for file with dN, dS & dN/dS values per SICO gene
"""
    options = ['genomes-a', 'genomes-b', 'sico-zip', 'codeml-zip', 'dnds-stats']
    genome_a_ids_file, genome_b_ids_file, sico_zip, codeml_zip, dnds_file = parse_options(usage, options, args)

    # Parse file to extract GenBank Project IDs
    with open(genome_a_ids_file) as read_handle:
        genome_ids_a = [line.split()[0] for line in read_handle]
    with open(genome_b_ids_file) as read_handle:
        genome_ids_b = [line.split()[0] for line in read_handle]

    # Create run_dir to hold files relating to this run
    run_dir = tempfile.mkdtemp(prefix='run_codeml_')

    # Extract files from zip archive
    sico_files = extract_archive_of_files(sico_zip, create_directory('sicos', inside_dir=run_dir))

    # Actually run codeml
    codeml_files = run_codeml_for_sicos(run_dir, genome_ids_a, genome_ids_b, sico_files)

    # Write dnds values to single output file
    _write_dnds_per_ortholog(dnds_file, codeml_files)

    # Write the produced files to command line argument filenames
    create_archive_of_files(codeml_zip, codeml_files)

    # Remove unused files to free disk space
    shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info("Produced: \n%s\n%s", codeml_zip, dnds_file)
Example #35
0
def run_codeml_for_sicos(codeml_dir, genome_ids_a, genome_ids_b, sico_files):
    """Run codeml for representatives of clades A and B in each of the SICO files, to calculate dN/dS."""
    log.info('Running codeml for {0} aligned and trimmed SICOs'.format(len(sico_files)))

    codeml_files = []
    for sico_file in sico_files:
        # Separate alignments for clade A & clade B genomes
        ali = AlignIO.read(sico_file, 'fasta')
        alignment_a = MultipleSeqAlignment(seqr for seqr in ali if seqr.id.split('|')[0] in genome_ids_a)
        alignment_b = MultipleSeqAlignment(seqr for seqr in ali if seqr.id.split('|')[0] in genome_ids_b)

        # Create sub directory for this run based on sico_file name
        filename = os.path.split(sico_file)[1]
        # Split off everything starting from the first dot
        base_name = filename[:filename.find('.')]
        sub_dir = create_directory(base_name, inside_dir=codeml_dir)

        # Run codeml for this ortholog and collect the resulting output file
        codeml_file = run_codeml(sub_dir, alignment_a, alignment_b)
        codeml_files.append(codeml_file)

    return codeml_files
Example #36
0
def coding_regions_per_genome(run_dir, trimmed_sicos):
    """Create a DNA file per genome containing all aligned & trimmed SICO genes als individual genes."""
    concatemer_dir = create_directory('coding_regions_per_genome', inside_dir=run_dir)
    log.info('Creating concatemers from {0} SICOs'.format(len(trimmed_sicos)))

    #Collections both for output files and their write handles, which will be reused for each SICO
    coding_region_files = []
    write_handles = {}

    #Loop over trimmed sico files to append each sequence to the right concatemer
    for trimmed_sico in trimmed_sicos:
        for seqr in SeqIO.parse(trimmed_sico, 'fasta'):
            #Sample header line: >58191|NC_010067.1|YP_001569097.1|COG4948MR|core
            project_id = seqr.id.split('|')[0]

            #Try to retrieve write handle from dictionary of cached write handles per genome
            write_handle = write_handles.get(project_id)

            #If not found, create & store write handle on demand
            if not write_handle:
                #Build up output file path for trimmed SICO genes per genome
                coding_region_file = os.path.join(concatemer_dir, project_id + '.coding-regions.ffn')
                coding_region_files.append(coding_region_file)

                #Open write handle
                write_handle = open(coding_region_file, mode='w')
                write_handles[project_id] = write_handle

            #Write sequence record to coding-regions file
            SeqIO.write(seqr, write_handle, 'fasta')

    #Close genomes trimmed concatemer write handles
    for write_handle in write_handles.values():
        write_handle.close()

    log.info('Created %i genome coding regions files', len(coding_region_files))

    return sorted(coding_region_files)
Example #37
0
def _step12_mcl(run_dir, mcl_input_file):
    """Markov Cluster Algorithm: http://www.micans.org/mcl/

    Input:
        mclInput file
    Output:
        mclOutput file

    mcl my_orthomcl_dir/mclInput --abc -I 1.5 -o my_orthomcl_dir/mclOutput
    """
    #Run mcl
    mcl_dir = create_directory('mcl', inside_dir=run_dir)
    mcl_output_file = os.path.join(mcl_dir, 'mclOutput.tsv')
    mcl_log = os.path.join(mcl_dir, 'mcl.log')
    with open(mcl_log, mode='w') as open_file:
        threads = str(multiprocessing.cpu_count())
        command = [
            MCL, mcl_input_file, '--abc', '-I', '1.5', '-o', mcl_output_file,
            '-te', threads
        ]
        log.info('Executing: %s', ' '.join(command))
        check_call(command, stdout=open_file, stderr=STDOUT)
    return mcl_output_file
Example #38
0
def _step6_orthomcl_filter_fasta(run_dir,
                                 input_dir,
                                 min_length=10,
                                 max_percent_stop=20):
    """Create goodProteins.fasta containing all good proteins and rejectProteins.fasta containing all rejects. Input is
    a directory containing a set of compliant input .fasta files (as produced by orthomclAdjustFasta).

    Usage:
      orthomclFilterFasta input_dir min_length max_percent_stops

    where:
      input_dir:           a directory containing a set of .fasta files
      min_length:          minimum allowed length of proteins.  (suggested: 10)
      max_percent_stop:    maximum percent stop codons.  (suggested 20)

    The input requirements are:
      1) a compliantFasta/ directory which contains all and only the proteome .fasta files, one file per proteome.
      2) each .fasta file must have a name in the form 'xxxx.fasta' where xxxx is a three or four letter unique taxon
         code.  For example: hsa.fasta or eco.fasta
      3) each protein in those files must have a definition line in the following format:
         >xxxx|yyyyyy
         where xxxx is the three or four letter taxon code and yyyyyy is a sequence identifier unique within that taxon.

    Output:
        my_orthomcl_dir/goodProteins.fasta
        my_orthomcl_dir/poorProteins.fasta
        report of suspicious proteomes (> 10% poor proteins)

    EXAMPLE: orthomclSoftware/bin/orthomclFilterFasta my_orthomcl_dir/compliantFasta 10 20
    """
    #Run orthomclFilterFasta
    out_dir = create_directory('filtered_fasta', inside_dir=run_dir)
    report = os.path.join(out_dir, 'filter_report.log')
    with open(report, mode='w') as report_file:
        command = [
            ORTHOMCL_FILTER_FASTA, input_dir,
            str(min_length),
            str(max_percent_stop)
        ]
        log.info('Executing: %s', ' '.join(command))
        check_call(command, stdout=report_file, stderr=STDOUT)

    #Move output files to out directory
    good = os.path.join(out_dir, 'good_proteins.fasta')
    poor = os.path.join(out_dir, 'poor_proteins.fasta')
    shutil.move('goodProteins.fasta', good)
    shutil.move('poorProteins.fasta', poor)

    #Ensure neither of the proteomes is suspicious according to min_length & max_percent_stop
    with open(report) as report_file:
        #Note: OrthoMCL hard-codes the 10% in this message rather than the configured thresholds
        if 'Proteomes with > 10% poor proteins:' in report_file.read():
            msg = 'OrthomclFilterFasta found suspicious proteomes based on the length and stop codon thresholds'
            log.error(msg)
            assert False, msg

    #Warn the user about the poor proteins found here, if they were found at all
    poor_records = list(SeqIO.parse(poor, 'fasta'))
    if poor_records:
        log.warn('%i poor sequence records identified by orthomclFilterFasta:',
                 len(poor_records))
        for seqr in poor_records:
            log.warn('>%s: %s', seqr.id, seqr.seq)

    #Assert good exists and has some content
    assert os.path.isfile(good) and 0 < os.path.getsize(good), \
        good + ' should exist and have some content'

    #Only good and poor proteins
    return good, poor
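The filter criteria applied by orthomclFilterFasta can be approximated in Python for a single record. A sketch using Biopython, assuming stop codons appear as '*' in the protein sequences:

from Bio import SeqIO

def _is_good_protein(record, min_length=10, max_percent_stop=20):
    """Approximate the orthomclFilterFasta criteria for one protein record."""
    sequence = str(record.seq)
    if len(sequence) < min_length:
        return False
    percent_stop = 100.0 * sequence.count('*') / len(sequence)
    return percent_stop <= max_percent_stop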
Example #39
0
def main(args):
    """Main function called when run from command line or as part of pipeline."""
    usage = """
Usage: concatenate_orthologs.py
--orthologs-zip=FILE     archive of orthologous genes in FASTA format
--coding-regions=FILE    destination file path for archive of trimmed orthologous coding regions per genome
--concatemer=FILE        destination file path for super-concatemer of all genomes
--taxon-a=FILE           destination file path for genome IDs for taxon A
--taxon-b=FILE           destination file path for genome IDs for taxon B
--tree=FILE              destination file path for tree visualization
"""
    options = [
        'orthologs-zip', 'coding-regions', 'concatemer', 'taxon-a', 'taxon-b',
        'tree'
    ]
    orthologs_zip, target_coding_regions, target_concat_file, target_taxon_a, target_taxon_b, target_tree = \
        parse_options(usage, options, args)

    #Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='concatemer_tree_')

    #Extract files from zip archive
    temp_dir = create_directory('orthologs', inside_dir=run_dir)
    ortholog_files = extract_archive_of_files(orthologs_zip, temp_dir)

    #Separate out orthologs per genome to create trimmed coding region files per genome
    genome_coding_regions_files = coding_regions_per_genome(
        run_dir, ortholog_files)
    create_archive_of_files(target_coding_regions, genome_coding_regions_files)

    #Concatenate coding region files per genome
    concatemer_files = concatemer_per_genome(run_dir,
                                             genome_coding_regions_files)
    #Create super concatemer
    create_super_concatemer(concatemer_files, target_concat_file)

    #Determine the taxa present in the super concatemer tree by building a phylogenetic tree from genome concatemer and
    #reading genome ids in the two largest clades.
    super_distance_file = _run_dna_dist(run_dir, target_concat_file)
    super_tree_file = _run_neighbor(run_dir, super_distance_file)
    genome_ids_a, genome_ids_b = _read_taxa_from_tree(super_tree_file)

    #Map Project IDs to Organism names
    id_to_name_map = dict((gid, genome['Organism/Name']) for gid, genome in
                          select_genomes_by_ids(genome_ids_a + genome_ids_b).iteritems())

    #Write Project IDs and Organism Names to files, with a fallback to genome_id for external genome
    with open(target_taxon_a, mode='w') as write_handle:
        for genome_id in genome_ids_a:
            write_handle.write('{id}\t{name}\n'.format(
                id=genome_id, name=id_to_name_map.get(genome_id, genome_id)))
    with open(target_taxon_b, mode='w') as write_handle:
        for genome_id in genome_ids_b:
            write_handle.write('{id}\t{name}\n'.format(
                id=genome_id, name=id_to_name_map.get(genome_id, genome_id)))

    #Visualize tree
    visualize_tree(super_tree_file, id_to_name_map, target_tree)

    #Remove unused files to free disk space
    shutil.rmtree(run_dir)

    #Exit after a comforting log message
    log.info('Produced: \n%s\n%s\n%s\n%s\n%s', target_coding_regions,
             target_concat_file, target_taxon_a, target_taxon_b, target_tree)
Example #40
0
def _step5_orthomcl_adjust_fasta(run_dir, proteome_files, id_field=3):
    """Create an OrthoMCL compliant .fasta file, by adjusting definition lines.

    Usage:
      orthomclAdjustFasta taxon_code fasta_file id_field

    where:
      taxon_code:  a three or four letter unique abbreviation for the taxon
      fasta_file:  the input fasta file per proteome
      id_field:    a number indicating what field in the definition line contains
                   the protein ID.  Fields are separated by either ' ' or '|'. Any
                   spaces immediately following the '>' are ignored.  The first
                   field is 1. For example, in the following definition line, the
                   ID (AP_000668.1) is in field 4:  >gi|89106888|ref|AP_000668.1|

    Input file requirements:
      (1) .fasta format
      (2) a unique id is provided for each sequence, and is in the field specified
          by id_field

    Output file format:
      (1) .fasta format
      (2) definition line is of the form:
             >taxoncode|unique_protein_id

    The output file is named taxoncode.fasta

    Note: if your input files do not meet the requirements, you can do some simple perl or awk processing of them to
    create the required input files to this program, or the required output files.  This program is provided as a
    convenience, but OrthoMCL users are expected to have the scripting skills to provide compliant .fasta files.

    EXAMPLE: orthomclSoftware/bin/orthomclAdjustFasta hsa Homo_sapiens.NCBI36.53.pep.all.fa 1
    """
    #Create directory to hold compliant fasta
    adjusted_fasta_dir = create_directory('compliant_fasta',
                                          inside_dir=run_dir)
    adjusted_fasta_files = []
    for proteome_file in proteome_files:
        taxon_code = None
        #Use first part of header of first entry as taxon code
        for record in SeqIO.parse(proteome_file, 'fasta'):
            taxon_code = record.id.split('|')[0]
            break

        # If we failed to extract a taxon_code, proteome file must have been empty
        assert taxon_code, 'Proteome file appears empty: ' + proteome_file

        #Call orthomclAdjustFasta
        command = [
            ORTHOMCL_ADJUST_FASTA, taxon_code, proteome_file,
            str(id_field)
        ]
        log.info('Executing: %s', ' '.join(command))
        check_call(command)
        #Move resulting fasta file to compliantFasta directory
        adjusted_fasta_file = taxon_code + '.fasta'
        fasta_file_destination = os.path.join(adjusted_fasta_dir,
                                              adjusted_fasta_file)
        shutil.move(adjusted_fasta_file, fasta_file_destination)
        adjusted_fasta_files.append(fasta_file_destination)
    #Return path to directory containing compliantFasta
    return adjusted_fasta_dir, adjusted_fasta_files
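For input files that are not yet compliant, the same header adjustment can be sketched directly in Biopython rather than shelling out to orthomclAdjustFasta. This sketch assumes '|'-separated definition lines as used throughout this pipeline; the real tool also accepts space-separated fields:

from Bio import SeqIO

def adjust_fasta_headers(proteome_file, taxon_code, output_file, id_field=3):
    """Rewrite definition lines to the >taxoncode|unique_protein_id form OrthoMCL expects."""
    with open(output_file, mode='w') as write_handle:
        for record in SeqIO.parse(proteome_file, 'fasta'):
            protein_id = record.id.split('|')[id_field - 1]
            record.id = '{0}|{1}'.format(taxon_code, protein_id)
            record.description = ''  #Prevent the old header from being appended after the new id
            SeqIO.write(record, write_handle, 'fasta')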
Example #41
0
def _align_sicos(run_dir, sico_files):
    """Align all SICO files given as argument in parallel and return the resulting alignment files."""
    log.info('Aligning {0} SICO genes using TranslatorX & muscle.'.format(
        len(sico_files)))
    # Embarrassingly parallel in principle, though run sequentially here; see the pool-based sketch below
    return [_run_translatorx((run_dir, sico_file)) for sico_file in sico_files]


def _run_translatorx((run_dir, sico_file), translation_table=CODON_TABLE_ID):
    """Run TranslatorX to create DNA level alignment file of protein level aligned DNA sequences within sico_file."""
    assert os.path.exists(TRANSLATORX) and os.access(
        TRANSLATORX, os.X_OK), 'Could not find or run ' + TRANSLATORX

    #Determine output file name
    sico_base = os.path.splitext(os.path.split(sico_file)[1])[0]
    alignment_dir = create_directory('alignments/' + sico_base,
                                     inside_dir=run_dir)

    #Created output file
    file_base = os.path.join(alignment_dir, sico_base)
    dna_alignment = file_base + '.nt_ali.fasta'

    #Actually run the TranslatorX program
    command = [
        TRANSLATORX, '-i', sico_file, '-c',
        str(translation_table), '-o', file_base
    ]
    check_call(command, stdout=open('/dev/null', 'w'), stderr=STDOUT)

    assert os.path.isfile(dna_alignment) and 0 < os.path.getsize(dna_alignment), \
        'Alignment file should exist and have some content now: {0}'.format(dna_alignment)
    return dna_alignment
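Since _run_translatorx already takes a single (run_dir, sico_file) tuple, the sequential loop in _align_sicos could be swapped for a worker pool with minimal changes. A sketch, assuming _run_translatorx is importable at module level (a requirement for multiprocessing pickling):

import multiprocessing

def _align_sicos_parallel(run_dir, sico_files):
    """Variant of _align_sicos that distributes the alignments over a pool of workers."""
    pool = multiprocessing.Pool()
    try:
        #Each task is a single (run_dir, sico_file) tuple, matching _run_translatorx's signature
        return pool.map(_run_translatorx, [(run_dir, sico_file) for sico_file in sico_files])
    finally:
        pool.close()
        pool.join()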
Example #42
0
def download_genome_files(genome, download_log=None, require_ptt=False):
    """Download genome .gbk & .ptt files from ncbi ftp and return pairs per accessioncode in tuples of three."""
    #ftp://ftp.ncbi.nih.gov/genbank/genomes/Bacteria/Sulfolobus_islandicus_M_14_25_uid18871/CP001400.ffn
    #Download using FTP
    ftp = FTP('ftp.ncbi.nlm.nih.gov')
    ftp.login(passwd='*****@*****.**')

    #Try to find project directory in RefSeq curated listing
    projectid = genome['BioProject ID']
    base_dir = '/genomes/Bacteria'
    project_dir = _find_project_dir(ftp, base_dir, projectid)
    if project_dir:
        accessioncodes = genome['Chromosomes/RefSeq']
        target_dir = create_directory('refseq/' + projectid)
    else:
        if projectid:
            log.warn('Genome directory not found under %s%s for %s', ftp.host,
                     base_dir, projectid)

        #Try instead to find project directory in GenBank originals listing
        base_dir = '/genbank/genomes/Bacteria'
        project_dir = _find_project_dir(ftp, base_dir, projectid)
        if project_dir:
            accessioncodes = genome['Chromosomes/INSDC']
            target_dir = create_directory('genbank/' + projectid)
        else:
            log.warn('Genome directory not found under %s%s for %s', ftp.host,
                     base_dir, projectid)

    #Determine last modified date to see if we should redownload the file following changes
    last_change_date = genome['Modify Date'] if genome[
        'Modify Date'] else genome['Release Date']

    #Download .gbk & .ptt files for all genome accessioncodes and append them to this list as tuples of gbk + ptt
    genome_files = []

    #Occasionally we can not find a folder, meaning we will have to skip this genome as well
    if project_dir:
        for acc in accessioncodes:
            #Try genbank file, which is always required
            try:
                gbk_file = _download_genome_file(ftp, project_dir,
                                                 acc + '.gbk', target_dir,
                                                 last_change_date)

                #Parse the GenBank record to check that it contains at least one coding sequence (CDS) feature
                features = SeqIO.read(gbk_file, 'genbank').features
                if not any(feature.type == 'CDS' for feature in features):
                    #Skip when genbank file does not contain any coding sequence features
                    log.warn(
                        'GenBank file %s did not contain any coding sequence features',
                        acc)
                    continue
            except error_perm as err:
                if 'No such file or directory' not in str(err):
                    raise err
                log.warn(err)
                log.warn('GenBank file %s missing for %s', acc, projectid)
                continue
            except IOError as err:
                if 'Target file was empty after download' not in str(err):
                    raise err
                log.warn(err)
                continue

            #Try protein table file, which could be optional
            try:
                ptt_file = _download_genome_file(ftp, project_dir,
                                                 acc + '.ptt', target_dir,
                                                 last_change_date)
            except error_perm as err:
                if 'No such file or directory' not in str(err):
                    raise err
                log.warn(err)
                if require_ptt:
                    log.warn(
                        'Protein table file %s missing for %s: Probably no coding sequences',
                        acc, projectid)
                    continue
                else:
                    ptt_file = None
            except IOError as err:
                if 'Target file was empty after download' not in str(err):
                    raise err
                log.warn(err)
                continue
            genome_files.append((projectid, gbk_file, ptt_file))

    #Be nice and close the connection
    ftp.close()

    if len(genome_files) == 0:
        #Write out commented out line to the logfile detailing this error
        if download_log:
            with open(download_log, mode='a') as append_handle:
                append_handle.write('#{0}\t{1}\t'.format(
                    projectid, genome['Organism/Name']))
                append_handle.write(
                    '#Genome skipped because of missing files\n')

        #Return nothing when:
        #- none of the accessioncodes resulted in files
        #- there were no protein table files when they were required
        #- no folder could be found for projectid
        return None

    #Write out provenance logfile with sources of retrieved files
    #This file could coincidentally also serve as genome ID file for extract taxa
    if download_log:
        with open(download_log, mode='a') as append_handle:
            append_handle.write('{0}\t{1}\t{2}{3}\n'.format(
                projectid, genome['Organism/Name'], ftp.host, project_dir))

    #Return genome files
    return genome_files
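The _find_project_dir helper is not shown here. Judging by the example URL in the comment above ('Sulfolobus_islandicus_M_14_25_uid18871'), NCBI genome folders end in '_uid' plus the BioProject ID, so a hypothetical sketch could scan the FTP listing for that suffix:

def _find_project_dir(ftp, base_dir, projectid):
    """Hypothetical sketch: return the project directory under base_dir, or None when absent."""
    if not projectid:
        return None
    suffix = '_uid' + projectid
    for name in ftp.nlst(base_dir):  #Some servers return full paths, others bare names
        if name.endswith(suffix):
            return name if name.startswith('/') else base_dir + '/' + name
    return None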
Example #43
0
def main(args):
    """Main function called when run from command line or as part of pipeline."""
    usage = """
Usage: filter_orthologs.py
--orthologs-zip=FILE            archive of orthologous genes in FASTA format
--filter-multiple-cogs          filter orthologs with multiple COG annotations among genes [OPTIONAL]

--filter-recombination=FILE     filter orthologs that show recombination when comparing phylogenetic trees [OPTIONAL]
                                destination file path for archive of recombination orthologs
--recombined-crosstable=FILE    destination file path for recombined crosstable of GeneIDs, COGs and Products [OPTIONAL]
--taxon-a=FILE                  file with genome IDs for taxon A to use in recombination filtering
--taxon-b=FILE                  file with genome IDs for taxon B to use in recombination filtering
--retained-zip=FILE             destination file path for archive of retained orthologs after filtering

--orthologs-per-genome=FILE      destination file path for orthologs split out per genome, based on the retained.zip
--concatemer=FILE                destination file path for super-concatemer of all genomes
"""
    options = ('orthologs-zip', 'filter-multiple-cogs=?', 'filter-recombination=?', 'recombined-crosstable=?',
               'taxon-a=?', 'taxon-b=?', 'retained-zip', 'orthologs-per-genome', 'concatemer')
    orthologs_zip, filter_cogs, filter_recombination, recombined_crosstable, \
    taxona, taxonb, retained_zip, target_orth_per_genome, target_concat_file = parse_options(usage, options, args)

    #Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='filter_orthologs_')

    #Extract files from zip archive
    temp_dir = create_directory('orthologs', inside_dir=run_dir)
    ortholog_files = extract_archive_of_files(orthologs_zip, temp_dir)

    #Filter orthologs with multiple COG annotations among genes if flag was set
    if filter_cogs:
        ortholog_files, transfered_cogs = _filter_multiple_cog_orthologs(run_dir, ortholog_files)

    #Possible extension: filter ortholog when any strain has been flagged as 'mobile element', 'phage' or 'IS element'

    #Filter orthologs that show recombination when comparing phylogenetic trees if flag was set
    if filter_recombination:
        #Parse file to extract GenBank Project IDs
        with open(taxona) as read_handle:
            genome_ids_a = [line.split()[0] for line in read_handle]
        with open(taxonb) as read_handle:
            genome_ids_b = [line.split()[0] for line in read_handle]
        ortholog_files, recombined_files = _phipack_for_all_orthologs(run_dir, ortholog_files,
                                                                       genome_ids_a, genome_ids_b)
        #Create crosstable
        create_crosstable(recombined_files, recombined_crosstable)

    #Create archives of files on command line specified output paths
    if filter_cogs:
        shutil.move(transfered_cogs, filter_cogs)
    if filter_recombination:
        create_archive_of_files(filter_recombination, recombined_files)
    create_archive_of_files(retained_zip, ortholog_files)

    #Run the steps required after filtering orthologs
    post_recombination_filter(taxona, taxonb, retained_zip,
                              target_orth_per_genome, target_concat_file, run_dir)

    #Remove unused files to free disk space
    shutil.rmtree(run_dir)

    #Exit after a comforting log message
    log.info('Produced:')
    if filter_cogs:
        log.info(filter_cogs)
    if filter_recombination:
        log.info(filter_recombination)
    log.info(retained_zip)
    log.info(target_orth_per_genome)
    log.info(target_concat_file)
Example #44
0
def download_genome_files(genome, download_log=None, require_ptt=False):
    """Download genome .gbk & .ptt files from ncbi ftp and return pairs per accessioncode in tuples of three."""
    #ftp://ftp.ncbi.nih.gov/genbank/genomes/Bacteria/Sulfolobus_islandicus_M_14_25_uid18871/CP001400.ffn
    #Download using FTP
    ftp = FTP('ftp.ncbi.nlm.nih.gov')
    ftp.login(passwd='*****@*****.**')

    #Try to find project directory in RefSeq curated listing
    projectid = genome['BioProject ID']
    base_dir = '/genomes/Bacteria'
    project_dir = _find_project_dir(ftp, base_dir, projectid)
    if project_dir:
        accessioncodes = genome['Chromosomes/RefSeq']
        target_dir = create_directory('refseq/' + projectid)
    else:
        if projectid:
            log.warn('Genome directory not found under %s%s for %s', ftp.host, base_dir, projectid)

        #Try instead to find project directory in GenBank originals listing
        base_dir = '/genbank/genomes/Bacteria'
        project_dir = _find_project_dir(ftp, base_dir, projectid)
        if project_dir:
            accessioncodes = genome['Chromosomes/INSDC']
            target_dir = create_directory('genbank/' + projectid)
        else:
            log.warn('Genome directory not found under %s%s for %s', ftp.host, base_dir, projectid)

    #Determine last modified date to see if we should redownload the file following changes
    last_change_date = genome['Modify Date'] if genome['Modify Date'] else genome['Release Date']

    #Download .gbk & .ptt files for all genome accessioncodes and append them to this list as tuples of gbk + ptt
    genome_files = []

    #Occasionally we can not find a folder, meaning we will have to skip this genome as well
    if project_dir:
        for acc in accessioncodes:
            #Try genbank file, which is always required
            try:
                gbk_file = _download_genome_file(ftp, project_dir, acc + '.gbk', target_dir, last_change_date)

                #Parse the GenBank record to check that it contains at least one coding sequence (CDS) feature
                features = SeqIO.read(gbk_file, 'genbank').features
                if not any(feature.type == 'CDS' for feature in features):
                    #Skip when genbank file does not contain any coding sequence features
                    log.warn('GenBank file %s did not contain any coding sequence features', acc)
                    continue
            except error_perm as err:
                if 'No such file or directory' not in str(err):
                    raise err
                log.warn(err)
                log.warn('GenBank file %s missing for %s', acc, projectid)
                continue
            except IOError as err:
                if 'Target file was empty after download' not in str(err):
                    raise err
                log.warn(err)
                continue

            #Try protein table file, which could be optional
            try:
                ptt_file = _download_genome_file(ftp, project_dir, acc + '.ptt', target_dir, last_change_date)
            except error_perm as err:
                if 'No such file or directory' not in str(err):
                    raise err
                log.warn(err)
                if require_ptt:
                    log.warn('Protein table file %s missing for %s: Probably no coding sequences', acc, projectid)
                    continue
                else:
                    ptt_file = None
            except IOError as err:
                if 'Target file was empty after download' not in str(err):
                    raise err
                log.warn(err)
                continue
            genome_files.append((projectid, gbk_file, ptt_file))

    #Be nice and close the connection
    ftp.close()

    if len(genome_files) == 0:
        #Write out commented out line to the logfile detailing this error
        if download_log:
            with open(download_log, mode='a') as append_handle:
                append_handle.write('#{0}\t{1}\t'.format(projectid, genome['Organism/Name']))
                append_handle.write('#Genome skipped because of missing files\n')

        #Return nothing when:
        #- none of the accessioncodes resulted in files
        #- there were no protein table files when they were required
        #- no folder could be found for projectid
        return None

    #Write out provenance logfile with sources of retrieved files
    #This file could coincidentally also serve as genome ID file for extract taxa
    if download_log:
        with open(download_log, mode='a') as append_handle:
            append_handle.write('{0}\t{1}\t{2}{3}\n'.format(projectid, genome['Organism/Name'], ftp.host, project_dir))

    #Return genome files
    return genome_files
Example #45
0
def main(args):
    """Main function called when run from command line or as part of pipeline."""
    usage = """
Usage: extract_orthologs.py
--genomes=FILE       file with GenBank Project IDs from complete genomes table on each line
--dna-zip=FILE       zip archive of extracted DNA files
--groups=FILE        file listing groups of orthologous proteins
--require-limiter    flag whether extracted core set of genomes should contain the limiter added in OrthoMCL [OPTIONAL]

--sico-zip=FILE      destination file path for archive of shared single copy orthologous (SICO) genes
--muco-zip=FILE      destination file path for archive of shared multiple copy orthologous genes
--subset-zip=FILE    destination file path for archive of variable copy orthologous genes shared for a subset only
--stats=FILE         destination file path for ortholog statistics file
--heatmap=FILE       destination file path heatmap of orthologs and occurrences of ortholog per genome
--orfans=FILE        destination file path ORFans
"""
    options = ['genomes', 'dna-zip', 'groups', 'require-limiter?',
               'sico-zip', 'muco-zip=?', 'subset-zip=?', 'stats', 'heatmap', 'orfans']
    genome_ids_file, dna_zip, groups_file, require_limiter, \
    target_sico, target_muco, target_subset, target_stats_path, target_heat, target_orfans = \
    parse_options(usage, options, args)

    #Parse file to extract GenBank Project IDs
    with open(genome_ids_file) as read_handle:
        genomes = [line.split()[0] for line in read_handle if not line.startswith('#')]

    #Create temporary directory within which to extract orthologs
    run_dir = tempfile.mkdtemp(prefix='extract_orthologs_run_')

    #Extract files from zip archive
    temp_dir = create_directory('dna_files', inside_dir=run_dir)
    dna_files = extract_archive_of_files(dna_zip, temp_dir)

    #Actually run ortholog extraction
    sico_files, muco_files, subset_files, stats_file, heatmap_file, orfans_file = \
        extract_orthologs(run_dir, genomes, dna_files, groups_file, require_limiter)

    #Append the orfans to the heatmap file
    _append_orfans_to_heatmap(orfans_file, genomes, heatmap_file)

    #Move produced files to command line specified output paths
    create_archive_of_files(target_sico, sico_files)
    if target_muco:
        create_archive_of_files(target_muco, muco_files)
    if target_subset:
        create_archive_of_files(target_subset, subset_files)
    shutil.move(stats_file, target_stats_path)
    shutil.move(heatmap_file, target_heat)
    shutil.move(orfans_file, target_orfans)

    #Remove unused files to free disk space
    shutil.rmtree(run_dir)

    #Exit after a comforting log message
    log.info("Produced:")
    log.info("%s", target_sico)
    if target_muco:
        log.info("%s", target_muco)
    if target_subset:
        log.info("%s", target_subset)
    log.info("%s", target_stats_path)
    log.info("%s", target_heat)
Example #46
0
def download_genome_files(genome, download_log=None, require_ptt=False, refseq_column='Chromosomes/RefSeq', embl_column='Chromosomes/INSDC'):
    """
    Download genome .gbk & .ptt files from MRS and return tuples containing project, genbank file & ptt file per
    accessioncode.

    @param genome: dictionary with genome values as parsed from lproks.cgi
    @param download_log: download log to append a line to for this genome
    @param require_ptt: boolean to indicate if individual accessioncodes should be skipped when ptt file is missing
    """
    project = genome['BioProject ID']
    #Try RefSeq accessions
    if genome[refseq_column]:
        accessioncodes = genome[refseq_column]
        databank = 'refseq'
        ptt_available = True
    else:
        #Use embl accessions
        accessioncodes = genome[embl_column]
        databank = 'embl'
        ptt_available = False

        #MRS only has ptt files for Refseq: Fail early
        if require_ptt:
            return None

    #The MRS embl & refseq databases receive updates daily, so there should be no need to fall back from one to the other

    #Determine output directory
    output_dir = create_directory(os.path.join(databank, project))

    #Determine last modified date to see if we should redownload the file following changes
    last_change_date = genome['Modify Date'] if genome['Modify Date'] else genome['Release Date']

    #Download .gbk & .ptt files for all accessioncodes and append them to this list as tuples of project, gbk & ptt
    genome_files = []

    #Download all gbk & ptt files
    for acc in accessioncodes:
        #Version numbers such as in "NC_009801.1" are sometimes appended to accession numbers: This is problematic
        acc = acc.split('.')[0]

        try:
            genbank_file = _download_file(output_dir, databank, acc, last_change_date)
        except IOError as ioerr:
            logging.warn('{0} file {1} missing for {2} because of: {3}'.format(databank, acc, project, str(ioerr)))
            continue

        #Try to parse Bio.GenBank.Record to see if it contains any CDS feature records and to ensure it has DNA seqs
        filetype = os.path.splitext(genbank_file)[1][1:]
        gb_record = SeqIO.read(genbank_file, filetype)
        str_seq = str(gb_record.seq)
        if re.match('^N+$', str_seq) or re.match('^X+$', str_seq):
            #Skip when the genbank file does not contain an actual DNA sequence (only Ns or Xs)
            logging.warn('GenBank file %s did not contain any DNA sequence', acc)
            continue
        if not any(gb_featr for gb_featr in gb_record.features
                   #Skip any non coding sequence features or pseudo (non-functional version) CDS
                   if gb_featr.type == 'CDS' and 'pseudo' not in gb_featr.qualifiers):
            #Skip when genbank file does not contain any coding sequence features
            logging.warn('GenBank file %s did not contain any coding sequence features', acc)
            continue

        if not ptt_available:
            ptt_file = None
        else:
            try:
                ptt_file = _download_file(output_dir, 'ptt', acc, last_change_date)
            except IOError as ioerr:
                logging.warn('{0} file {1} missing for {2} because of: {3}'.format(databank, acc, project, str(ioerr)))
                ptt_file = None

        #Skip this accession when required ptt file is missing, but do allow for other accessions to pass
        if require_ptt and ptt_file is None:
            logging.warn('Protein table file %s missing for %s: Probably no coding sequences', acc, project)
            continue

        #Append tuples to genome_files
        genome_files.append((project, genbank_file, ptt_file))

    if len(genome_files) == 0:
        #Write out commented out line to the logfile detailing this error
        if download_log:
            with open(download_log, mode='a') as append_handle:
                append_handle.write('#{0}\t{1}\t'.format(project, genome['Organism/Name']))
                append_handle.write('#Genome skipped because of missing files\n')

        #Return nothing when:
        #- none of the accessioncodes resulted in files
        #- there were no protein table files when they were required
        return None

    #Write out provenance logfile with sources of retrieved files
    #This file could coincidentally also serve as genome ID file for extract taxa
    if download_log:
        with open(download_log, mode='a') as append_handle:
            append_handle.write('{0}\t{1}\t{2}\n'.format(project, genome['Organism/Name'],
                                                         'http://mrs.cmbi.ru.nl/mrs-5/info?db=' + databank))

    return genome_files
Example #47
0
def download_genome_files(genome,
                          download_log=None,
                          require_ptt=False,
                          refseq_column='Chromosomes/RefSeq',
                          embl_column='Chromosomes/INSDC'):
    """
    Download genome .gbk & .ptt files from MRS and return tuples containing project, genbank file & ptt file per
    accessioncode.

    @param genome: dictionary with genome values as parsed from lproks.cgi
    @param download_log: download log to append a line to for this genome
    @param require_ptt: boolean to indicate if individual accessioncodes should be skipped when ptt file is missing
    """
    project = genome['BioProject ID']
    #Try RefSeq accessions
    if genome[refseq_column]:
        accessioncodes = genome[refseq_column]
        databank = 'refseq'
        ptt_available = True
    else:
        #Use embl accessions
        accessioncodes = genome[embl_column]
        databank = 'embl'
        ptt_available = False

        #MRS only has ptt files for Refseq: Fail early
        if require_ptt:
            return None

    #The MRS embl & refseq databases receive updates daily, so there should be no need to fall back from one to the other

    #Determine output directory
    output_dir = create_directory(os.path.join(databank, project))

    #Determine last modified date to see if we should redownload the file following changes
    last_change_date = genome['Modify Date'] if genome[
        'Modify Date'] else genome['Release Date']

    #Download .gbk & .ptt files for all accessioncodes and append them to this list as tuples of project, gbk & ptt
    genome_files = []

    #Download all gbk & ptt files
    for acc in accessioncodes:
        #Version numbers such as in "NC_009801.1" are sometimes appended to accession numbers: This is problematic
        acc = acc.split('.')[0]

        try:
            genbank_file = _download_file(output_dir, databank, acc,
                                          last_change_date)
        except IOError as ioerr:
            logging.warn('{0} file {1} missing for {2} because of: {3}'.format(
                databank, acc, project, str(ioerr)))
            continue

        #Try to parse Bio.GenBank.Record to see if it contains any CDS feature records and to ensure it has DNA seqs
        filetype = os.path.splitext(genbank_file)[1][1:]
        gb_record = SeqIO.read(genbank_file, filetype)
        str_seq = str(gb_record.seq)
        if re.match('^N+$', str_seq) or re.match('^X+$', str_seq):
            #Skip when the genbank file does not contain an actual DNA sequence (only Ns or Xs)
            logging.warn('GenBank file %s did not contain any DNA sequence',
                         acc)
            continue
        if not any(
                gb_featr for gb_featr in gb_record.features
                #Skip any non coding sequence features or pseudo (non-functional version) CDS
                if gb_featr.type == 'CDS' and 'pseudo' not in gb_featr.qualifiers):
            #Skip when genbank file does not contain any coding sequence features
            logging.warn(
                'GenBank file %s did not contain any coding sequence features',
                acc)
            continue

        if not ptt_available:
            ptt_file = None
        else:
            try:
                ptt_file = _download_file(output_dir, 'ptt', acc,
                                          last_change_date)
            except IOError as ioerr:
                logging.warn(
                    '{0} file {1} missing for {2} because of: {3}'.format(
                        databank, acc, project, str(ioerr)))
                ptt_file = None

        #Skip this accession when required ptt file is missing, but do allow for other accessions to pass
        if require_ptt and ptt_file is None:
            logging.warn(
                'Protein table file %s missing for %s: Probably no coding sequences',
                acc, project)
            continue

        #Append tuples to genome_files
        genome_files.append((project, genbank_file, ptt_file))

    if len(genome_files) == 0:
        #Write out commented out line to the logfile detailing this error
        if download_log:
            with open(download_log, mode='a') as append_handle:
                append_handle.write('#{0}\t{1}\t'.format(
                    project, genome['Organism/Name']))
                append_handle.write(
                    '#Genome skipped because of missing files\n')

        #Return nothing when:
        #- none of the accessioncodes resulted in files
        #- there were no protein table files when they were required
        return None

    #Write out provenance logfile with sources of retrieved files
    #This file could coincidentally also serve as genome ID file for extract taxa
    if download_log:
        with open(download_log, mode='a') as append_handle:
            append_handle.write('{0}\t{1}\t{2}\n'.format(
                project, genome['Organism/Name'],
                'http://mrs.cmbi.ru.nl/mrs-5/info?db=' + databank))

    return genome_files
Example #48
0

def _align_sicos(run_dir, sico_files):
    """Align all SICO files given as argument in parallel and return the resulting alignment files."""
    log.info('Aligning {0} SICO genes using TranslatorX & muscle.'.format(len(sico_files)))
    # Embarrassingly parallel in principle, though run sequentially here
    return [_run_translatorx((run_dir, sico_file)) for sico_file in sico_files]


def _run_translatorx((run_dir, sico_file), translation_table=CODON_TABLE_ID):
    """Run TranslatorX to create DNA level alignment file of protein level aligned DNA sequences within sico_file."""
    assert os.path.exists(TRANSLATORX) and os.access(TRANSLATORX, os.X_OK), 'Could not find or run ' + TRANSLATORX

    #Determine output file name
    sico_base = os.path.splitext(os.path.split(sico_file)[1])[0]
    alignment_dir = create_directory('alignments/' + sico_base, inside_dir=run_dir)

    #Created output file
    file_base = os.path.join(alignment_dir, sico_base)
    dna_alignment = file_base + '.nt_ali.fasta'

    #Actually run the TranslatorX program
    command = [TRANSLATORX,
               '-i', sico_file,
               '-c', str(translation_table),
               '-o', file_base]
    check_call(command, stdout=open('/dev/null', 'w'), stderr=STDOUT)

    assert os.path.isfile(dna_alignment) and 0 < os.path.getsize(dna_alignment), \
        'Alignment file should exist and have some content now: {0}'.format(dna_alignment)
    return dna_alignment