def _dna_file_per_sico(run_dir, dna_files, shared_single_copy, shared_multi_copy, non_shared):
    """Distribute the DNA records of all genomes over one fasta file per ortholog group.

    Every record of every file in dna_files is offered to _write_record_to_ortholog_file once per category
    (shared single copy, shared multi copy and non shared). Records that are written to none of the three
    categories are appended to ORFans.ffn inside run_dir.

    Returns a five-tuple: sorted SICO files, sorted MUCO files, sorted subset files, the number of records read,
    and the path of the ORFans file (this function only creates that file when the first ORFan is written).
    """
    # One (output directory, ortholog groups, files written so far) triple per category.
    # NOTE(review): the original comment states create_directory deletes & recreates these directories - confirm.
    categories = [
        (create_directory('sico', inside_dir=run_dir), shared_single_copy, set()),
        (create_directory('muco', inside_dir=run_dir), shared_multi_copy, set()),
        (create_directory('subset', inside_dir=run_dir), non_shared, set()),
    ]
    orfans_file = os.path.join(run_dir, 'ORFans.ffn')

    number_of_sequences = 0
    for dna_file in dna_files:
        log.info('Extracting orthologous genes from %s', dna_file)
        for record in SeqIO.parse(dna_file, 'fasta'):
            number_of_sequences += 1

            # Offer the record to each category in turn, remembering whether any of them accepted it
            is_orfan = True
            for target_dir, ortholog_groups, written_files in categories:
                affected_files = _write_record_to_ortholog_file(target_dir, ortholog_groups, record)
                written_files.update(affected_files)
                if affected_files:
                    is_orfan = False

            # ORFans do not fall into any of the above three categories: Add them to a separate file
            if is_orfan:
                with open(orfans_file, mode='a') as write_handle:
                    SeqIO.write(record, write_handle, 'fasta')

    sico_files, muco_files, subset_files = [sorted(category[2]) for category in categories]
    return sico_files, muco_files, subset_files, number_of_sequences, orfans_file
def _step11_orthomcl_dump_pairs(run_dir, config_file):
    """Dump files from the database produced by the orthomclPairs program.

    usage: orthomclDumpPairsFiles config_file

    where:
      config_file : see below (you can use the same file given to orthomclPairs)

    Database Input:
      InParalog, Ortholog, CoOrtholog tables - populated by orthomclPairs

    Output files:
      orthomclMclInput - file required by the mcl program
      pairs/ -dir holding relationship files
        potentialOrthologs.txt - ortholog relationships
        potentialInparalogs.txt - inparalog relationships
        potentialCoorthologs.txt - coortholog relationships

    The pairs/ files contain the pairs found by the orthomclPairs tables, and their average normalized scores.
    This is the same information as in the orthomclMclInput file, but segregated by relationship type. These are
    candidate relationships (edges) that will subsequently be grouped (clustered) by the mcl program to form the
    OrthoMCL ortholog groups. These files contain more sensitive and less selective relationships than the final
    ortholog groups.

    Standard Error:
      logging info

    EXAMPLE: orthomclSoftware/bin/orthomclDumpPairsFile out_dir/orthomcl.config

    Returns a four-tuple of paths: the mcl input file (moved to <run_dir>/mcl/mclInput.tsv) followed by the
    potential orthologs, inparalogs and coorthologs .tsv files in <run_dir>/orthologs.

    Raises CalledProcessError when the tool exits non-zero, and AssertionError when the mcl input file is missing
    or empty afterwards.
    """
    # Run orthomclDumpPairsFile inside the output directory, so its files are created there
    out_dir = create_directory('orthologs', inside_dir=run_dir)
    command = [ORTHOMCL_DUMP_PAIRS_FILES, config_file]
    log.info('Executing: %s', ' '.join(command))
    check_call(command, cwd=out_dir)

    # Desired destination output file paths
    mcl_dir = create_directory('mcl', inside_dir=run_dir)
    mclinput = os.path.join(mcl_dir, 'mclInput.tsv')
    orthologs = os.path.join(out_dir, 'potentialOrthologs.tsv')
    inparalogs = os.path.join(out_dir, 'potentialInparalogs.tsv')
    coorthologs = os.path.join(out_dir, 'potentialCoorthologs.tsv')

    # Move output files to desired destinations
    shutil.move(os.path.join(out_dir, 'mclInput'), mclinput)
    shutil.move(os.path.join(out_dir, 'pairs/orthologs.txt'), orthologs)
    shutil.move(os.path.join(out_dir, 'pairs/inparalogs.txt'), inparalogs)
    shutil.move(os.path.join(out_dir, 'pairs/coorthologs.txt'), coorthologs)

    # Verify mcl input file exists and has some content. Raised explicitly rather than via an assert statement,
    # so the check is not stripped when Python runs with -O; the exception type is unchanged for callers.
    if not (os.path.isfile(mclinput) and 0 < os.path.getsize(mclinput)):
        raise AssertionError(mclinput + ' should exist and have some content')
    return mclinput, orthologs, inparalogs, coorthologs
def _step11_orthomcl_dump_pairs(run_dir, config_file):
    """Dump files from the database produced by the orthomclPairs program.

    usage: orthomclDumpPairsFiles config_file

    where:
      config_file : see below (you can use the same file given to orthomclPairs)

    Database Input:
      InParalog, Ortholog, CoOrtholog tables - populated by orthomclPairs

    Output files:
      orthomclMclInput - file required by the mcl program
      pairs/ -dir holding relationship files
        potentialOrthologs.txt - ortholog relationships
        potentialInparalogs.txt - inparalog relationships
        potentialCoorthologs.txt - coortholog relationships

    The pairs/ files contain the pairs found by the orthomclPairs tables, and their average normalized scores.
    This is the same information as in the orthomclMclInput file, but segregated by relationship type. These are
    candidate relationships (edges) that will subsequently be grouped (clustered) by the mcl program to form the
    OrthoMCL ortholog groups. These files contain more sensitive and less selective relationships than the final
    ortholog groups.

    Standard Error:
      logging info

    EXAMPLE: orthomclSoftware/bin/orthomclDumpPairsFile out_dir/orthomcl.config

    Returns the tuple (mclinput, orthologs, inparalogs, coorthologs) of destination file paths.

    Raises CalledProcessError when the tool exits non-zero, and AssertionError when the mcl input file is missing
    or empty afterwards.
    """
    # Run orthomclDumpPairsFile with the output directory as working directory
    out_dir = create_directory('orthologs', inside_dir=run_dir)
    command = [ORTHOMCL_DUMP_PAIRS_FILES, config_file]
    log.info('Executing: %s', ' '.join(command))
    check_call(command, cwd=out_dir)

    # Desired destination output file paths
    mcl_dir = create_directory('mcl', inside_dir=run_dir)
    mclinput = os.path.join(mcl_dir, 'mclInput.tsv')
    orthologs = os.path.join(out_dir, 'potentialOrthologs.tsv')
    inparalogs = os.path.join(out_dir, 'potentialInparalogs.tsv')
    coorthologs = os.path.join(out_dir, 'potentialCoorthologs.tsv')

    # Move output files to desired destinations
    shutil.move(os.path.join(out_dir, 'mclInput'), mclinput)
    shutil.move(os.path.join(out_dir, 'pairs/orthologs.txt'), orthologs)
    shutil.move(os.path.join(out_dir, 'pairs/inparalogs.txt'), inparalogs)
    shutil.move(os.path.join(out_dir, 'pairs/coorthologs.txt'), coorthologs)

    # An assert statement would be removed under `python -O`; raise explicitly so the output check always runs.
    # AssertionError is kept as the exception type for backward compatibility.
    if not (os.path.isfile(mclinput) and 0 < os.path.getsize(mclinput)):
        raise AssertionError(mclinput + ' should exist and have some content')
    return mclinput, orthologs, inparalogs, coorthologs
def run_phipack(phipack_dir, dna_file):
    """Run PhiPack and return the number of informative sites, PHI, Max Chi^2 and NSS.

    PhiPack is executed inside a sub directory of phipack_dir named after the part of the dna_file name before
    its first dot, so the files it produces end up there.

    Returns a dictionary with the keys 'PhiPack sites', 'Phi', 'Max Chi^2' and 'NSS'. All four values are None
    when PhiPack exits with a non-zero status; 'Phi' alone is None when Phi.log reports '--' for PHI (Normal).

    Raises AttributeError when Phi.log does not contain one of the expected lines.
    """
    # Create directory for PhiPack to run in, so files get created there
    orth_name = os.path.split(dna_file)[1].split('.')[0]
    rundir = create_directory(orth_name, inside_dir=phipack_dir)

    # Build up list of commands; -o makes PhiPack output NSS & Max Chi^2
    command = (PHIPACK, '-f', dna_file, '-o')
    try:
        # Discard standard output; the context manager closes the handle the original left open
        with open(os.devnull, mode='w') as devnull:
            check_call(command, cwd=rundir, stdout=devnull)
    except CalledProcessError as err:
        log.warning('Error running PhiPack for %s:\n%s', dna_file, err)
        return {'PhiPack sites': None, 'Phi': None, 'Max Chi^2': None, 'NSS': None}

    # Retrieve output log file contents
    logfile = os.path.join(rundir, 'Phi.log')
    with open(logfile) as read_handle:
        contents = read_handle.read()

    # Parse log contents to retrieve values for # sites, Phi, Chi^2 max & NSS from lines such as:
    #   Found 103 informative sites.
    #   PHI (Normal):        9.04e-01
    #   Max Chi^2:           6.60e-01  (1000 permutations)
    #   NSS:                 6.31e-01  (1000 permutations)
    # Patterns are raw strings, so the backslashes are not treated as (invalid) string escapes.
    sites = int(re.search(r'Found ([0-9]+) informative sites.', contents).group(1))
    raw_phi = re.search(r'PHI \(Normal\):\s+(.*)', contents).group(1)
    phi = float(raw_phi) if raw_phi != '--' else None
    chi = float(re.search(r'Max Chi\^2:\s+(.*)\s+\(1000 permutations\)', contents).group(1))
    nss = float(re.search(r'NSS:\s+(.*)\s+\(1000 permutations\)', contents).group(1))
    return {'PhiPack sites': sites, 'Phi': phi, 'Max Chi^2': chi, 'NSS': nss}
def _prepare_calculations(genomes_a_file, genomes_b_file, sicozip_file, table_a_dest, table_b_dest,
                          append_odd_even=False):
    """Unzip sico_files and run the calculations, optionally repeated for the odd and even only codons.

    The SICO archive is extracted into a temporary directory and run_calculations is called for the extracted
    files. When append_odd_even is true an intro is first written to both table destinations, and
    run_calculations is called two more times: for the odd and for the even files returned by
    _split_by_odd_even_codons.

    The temporary directory is removed afterwards, also when one of the steps raises.
    """
    if append_odd_even:
        # prepend file makeup when odd/even table are also added
        _write_intro_to_file(table_a_dest)
        _write_intro_to_file(table_b_dest)

    rundir = tempfile.mkdtemp(prefix='calculations_')
    try:
        # extract ortholog files from sicozip
        sico_files = extract_archive_of_files(sicozip_file, create_directory('sicos', inside_dir=rundir))

        # perform normal calculation
        run_calculations(genomes_a_file, genomes_b_file, sico_files, table_a_dest, table_b_dest)

        # separate calculations for odd and even tables
        if append_odd_even:
            odd_sico_files, even_sico_files = _split_by_odd_even_codons(sico_files)
            run_calculations(genomes_a_file, genomes_b_file, odd_sico_files, table_a_dest, table_b_dest)
            run_calculations(genomes_a_file, genomes_b_file, even_sico_files, table_a_dest, table_b_dest)
    finally:
        # clean up, so a failed run does not leave the extracted files behind
        shutil.rmtree(rundir)
def _run_dna_dist(run_dir, aligned_file):
    """Run dnadist to calculate distances between individual strains in a distance matrix, as input for neighbor.

    The fasta alignment in aligned_file is rewritten as 'infile' inside a 'dnadist' directory within run_dir,
    after which the DNADIST program is started in that directory.

    Returns the path to the 'outfile' in that directory.
    Raises AssertionError when the outfile does not exist or is empty after the run.
    """
    # Run calculations inside a directory
    dnadist_dir = create_directory('dnadist/', inside_dir=run_dir)

    # Read alignment file
    alignment = AlignIO.read(aligned_file, 'fasta')

    # Convert alignment in to proper input file for dnadist according to specification
    nr_of_species = len(alignment)
    nr_of_sites = len(alignment[0])
    infile = os.path.join(dnadist_dir, 'infile')
    with open(infile, mode='w') as write_handle:
        write_handle.write(' {0} {1}\n'.format(nr_of_species, nr_of_sites))
        for seq_record in alignment:
            # Names are cut off at ten characters and padded to ten by the format below;
            # slicing is a no-op for shorter names, so no length check is needed
            name = seq_record.id.split('|')[0][:10]
            write_handle.write('{0:10}{1}\n'.format(name, seq_record.seq))

    # Actually run the dnadist program in the correct directory, and send input to it for the first prompt.
    # The pipes are opened in binary mode, so the input has to be bytes to also work on Python 3.
    process = Popen(DNADIST, cwd=dnadist_dir, stdin=PIPE, stdout=PIPE, stderr=STDOUT)
    process.communicate(input=b'Y\n')

    # Retrieve outputfile; raised explicitly so the check survives `python -O`, keeping the exception type
    outfile = os.path.join(dnadist_dir, 'outfile')
    if not (os.path.exists(outfile) and 0 < os.path.getsize(outfile)):
        raise AssertionError(outfile + ' should exist with some content now')
    return outfile
def concatemer_per_genome(run_dir, genome_coding_regions_files):
    """Create a concatemer DNA file per genome containing all aligned & trimmed SICO genes.

    For every coding regions file a '<basename>.concatemer.fna' file is written to a 'concatemers' directory
    inside run_dir, where basename is the input file name up to '.coding-regions'. The output holds a single
    header line followed by all non-header lines of the input.

    Returns the sorted list of created concatemer file paths.
    """
    concatemer_dir = create_directory('concatemers', inside_dir=run_dir)

    # Collection of output filenames
    concatemer_files = []

    # Loop over genome coding regions files to create concatemer of each
    for coding_region_file in genome_coding_regions_files:
        # Determine output file name. partition returns the whole file name when the marker is missing, whereas
        # slicing on str.find()'s -1 would silently drop the last character of the name.
        filename = os.path.split(coding_region_file)[1]
        basename = filename.partition('.coding-regions')[0]
        concatemer_file = os.path.join(concatemer_dir, basename + '.concatemer.fna')
        concatemer_files.append(concatemer_file)

        # Copy ACTG content from coding regions file to concatemer
        with open(coding_region_file) as read_handle:
            with open(concatemer_file, mode='w') as write_handle:
                # Write out single concatemer header
                write_handle.write('> {0}|trimmed concatemer\n'.format(basename))

                # Copy over all lines that are not header lines (do not start with '>')
                write_handle.writelines(line for line in read_handle if not line.startswith('>'))

    log.info('Created %i genome concatemers', len(concatemer_files))
    return sorted(concatemer_files)
def _phipack_for_all_orthologs(run_dir, aligned_files, genome_ids_a, genome_ids_b):
    """Split aligned fasta files by whether their phylogenetic tree shows evidence of recombination.

    For each aligned file a distance matrix (_run_dna_dist) and a tree (_run_neighbor) are built in a directory
    named after the file, and _tree_shows_recombination decides to which collection the file belongs.

    Returns two lists of aligned files: the first without recombination, the second with recombination.
    """
    log.info('Filtering orthologs where phylogenetic trees show evidence of inter-taxon recombination')

    clean_files = []
    recombined_files = []
    for aligned_file in aligned_files:
        # Each ortholog gets its own run directory, named after the input file without directory & extension
        stem = os.path.basename(os.path.splitext(aligned_file)[0])
        work_dir = create_directory(stem, inside_dir=run_dir)

        # Distances first, then the tree built from those distances
        tree_file = _run_neighbor(work_dir, _run_dna_dist(work_dir, aligned_file))

        # The tree shows recombination when genome_ids_a & genome_ids_b do not group together in it
        shows_recombination = _tree_shows_recombination(genome_ids_a, genome_ids_b, tree_file)
        destination = recombined_files if shows_recombination else clean_files
        destination.append(aligned_file)

    log.info('%i Orthologs out of %i were filtered out due to recombination, leaving %i non recombined orthologs',
             len(recombined_files), len(aligned_files), len(clean_files))
    return clean_files, recombined_files
def _run_dna_dist(run_dir, aligned_file):
    """Run dnadist to calculate distances between individual strains in a distance matrix, as input for neighbor.

    The fasta alignment in aligned_file is rewritten as 'infile' inside a 'dnadist' directory within run_dir,
    after which the DNADIST program is started in that directory.

    Returns the path to the 'outfile' in that directory.
    Raises AssertionError when the outfile does not exist or is empty after the run.
    """
    # Run calculations inside a directory
    dnadist_dir = create_directory('dnadist/', inside_dir=run_dir)

    # Read alignment file
    alignment = AlignIO.read(aligned_file, 'fasta')

    # Convert alignment in to proper input file for dnadist according to specification
    nr_of_species = len(alignment)
    nr_of_sites = len(alignment[0])
    infile = os.path.join(dnadist_dir, 'infile')
    with open(infile, mode='w') as write_handle:
        write_handle.write(' {0} {1}\n'.format(nr_of_species, nr_of_sites))
        for seq_record in alignment:
            # Names are limited to ten characters; slicing leaves shorter names untouched
            name = seq_record.id.split('|')[0][:10]
            write_handle.write('{0:10}{1}\n'.format(name, seq_record.seq))

    # Actually run the dnadist program in the correct directory, and send input to it for the first prompt.
    # Bytes are passed because the pipes are binary: a text string raises TypeError on Python 3.
    process = Popen(DNADIST, cwd=dnadist_dir, stdin=PIPE, stdout=PIPE, stderr=STDOUT)
    process.communicate(input=b'Y\n')

    # Retrieve outputfile; an explicit raise keeps this check alive under `python -O`
    outfile = os.path.join(dnadist_dir, 'outfile')
    if not (os.path.exists(outfile) and 0 < os.path.getsize(outfile)):
        raise AssertionError(outfile + ' should exist with some content now')
    return outfile
def _step6_orthomcl_filter_fasta(run_dir, input_dir, min_length=10, max_percent_stop=20):
    """Create goodProteins.fasta containing all good proteins and poorProteins.fasta containing all rejects.

    Input is a directory containing a set of compliant input .fasta files (as produced by orthomclAdjustFasta).

    Usage:
      orthomclFilterFasta input_dir min_length max_percent_stops

    where:
      input_dir:        a directory containing a set of .fasta files
      min_length:       minimum allowed length of proteins. (suggested: 10)
      max_percent_stop: maximum percent stop codons. (suggested 20)

    The input requirements are:
      1) a compliantFasta/ directory which contains all and only the proteome .fasta files, one file per proteome.
      2) each .fasta file must have a name in the form 'xxxx.fasta' where xxxx is a three or four letter unique
         taxon code. For example: hsa.fasta or eco.fasta
      3) each protein in those files must have a definition line in the following format:
         >xxxx|yyyyyy
         where xxxx is the three or four letter taxon code and yyyyyy is a sequence identifier unique within
         that taxon.

    Output:
      my_orthomcl_dir/goodProteins.fasta
      my_orthomcl_dir/poorProteins.fasta
      report of suspicious proteomes (> 10% poor proteins)

    EXAMPLE: orthomclSoftware/bin/orthomclFilterFasta my_orthomcl_dir/compliantFasta 10 20

    Returns the paths of the good and poor proteins files, after moving them into <run_dir>/filtered_fasta.

    Raises CalledProcessError when the tool exits non-zero, and AssertionError when the report lists suspicious
    proteomes or when the good proteins file is missing or empty.
    """
    # Run orthomclFilterFasta, capturing its standard output & error in a report file
    out_dir = create_directory('filtered_fasta', inside_dir=run_dir)
    report = os.path.join(out_dir, 'filter_report.log')
    with open(report, mode='w') as report_file:
        command = [ORTHOMCL_FILTER_FASTA, input_dir, str(min_length), str(max_percent_stop)]
        log.info('Executing: %s', ' '.join(command))
        check_call(command, stdout=report_file, stderr=STDOUT)

    # Move output files to out directory
    # NOTE(review): the source paths are relative, so the files are looked up in the current working directory
    # of this process; concurrent runs sharing a working directory could interfere - confirm with callers.
    good = os.path.join(out_dir, 'good_proteins.fasta')
    poor = os.path.join(out_dir, 'poor_proteins.fasta')
    shutil.move('goodProteins.fasta', good)
    shutil.move('poorProteins.fasta', poor)

    # Ensure neither of the proteomes is suspicious according to min_length & max_percent_stop
    with open(report) as report_file:
        suspicious = 'Proteomes with > 10% poor proteins:' in report_file.read()
    if suspicious:
        # OrthoMCL does NOT add actual min_length value
        msg = 'OrthomclFilterFasta found suspicious proteomes based on values for length'
        log.error(msg)
        # Raised explicitly: `assert False` is stripped under `python -O`, which silently disabled this check
        raise AssertionError(msg)

    # Warn the user about the poor proteins found here, if they were found at all
    poor_records = list(SeqIO.parse(poor, 'fasta'))
    if poor_records:
        log.warning('%i poor sequence records identified by orthomclFilterFasta:', len(poor_records))
        for seqr in poor_records:
            log.warning('>%s: %s', seqr.id, seqr.seq)

    # Verify good exists and has some content
    if not (os.path.isfile(good) and 0 < os.path.getsize(good)):
        raise AssertionError(good + ' should exist and have some content')

    # Only good and poor proteins
    return good, poor
def _prepare_calculations(genomes_a_file, genomes_b_file, sicozip_file, table_a_dest, table_b_dest,
                          append_odd_even=False):
    """Unzip sico_files and run the calculations, optionally repeated for the odd and even only codons.

    The SICO archive is extracted into a temporary directory and run_calculations is called for the extracted
    files. When append_odd_even is true an intro is first written to both table destinations, and
    run_calculations is called two more times: for the odd and for the even files returned by
    _split_by_odd_even_codons.

    The temporary directory is always removed before returning or propagating an exception.
    """
    if append_odd_even:
        # prepend file makeup when odd/even table are also added
        _write_intro_to_file(table_a_dest)
        _write_intro_to_file(table_b_dest)

    rundir = tempfile.mkdtemp(prefix='calculations_')
    try:
        # extract ortholog files from sicozip
        sico_dir = create_directory('sicos', inside_dir=rundir)
        sico_files = extract_archive_of_files(sicozip_file, sico_dir)

        # perform normal calculation
        run_calculations(genomes_a_file, genomes_b_file, sico_files, table_a_dest, table_b_dest)

        # separate calculations for odd and even tables
        if append_odd_even:
            odd_sico_files, even_sico_files = _split_by_odd_even_codons(sico_files)
            run_calculations(genomes_a_file, genomes_b_file, odd_sico_files, table_a_dest, table_b_dest)
            run_calculations(genomes_a_file, genomes_b_file, even_sico_files, table_a_dest, table_b_dest)
    finally:
        # clean up in a finally clause: previously a failing calculation leaked the temporary directory
        shutil.rmtree(rundir)
def concatemer_per_genome(run_dir, genome_coding_regions_files):
    """Create a concatemer DNA file per genome containing all aligned & trimmed SICO genes.

    For every coding regions file a '<basename>.concatemer.fna' file is written to a 'concatemers' directory
    inside run_dir, where basename is the input file name up to '.coding-regions'. The output holds a single
    header line followed by all non-header lines of the input.

    Returns the sorted list of created concatemer file paths.
    """
    concatemer_dir = create_directory('concatemers', inside_dir=run_dir)

    # Collection of output filenames
    concatemer_files = []

    # Loop over genome coding regions files to create concatemer of each
    for coding_region_file in genome_coding_regions_files:
        # Determine output file name. str.find() returns -1 when '.coding-regions' is absent, which made the
        # original slice chop off the last character; partition keeps the full name in that case.
        filename = os.path.split(coding_region_file)[1]
        basename = filename.partition('.coding-regions')[0]
        concatemer_file = os.path.join(concatemer_dir, basename + '.concatemer.fna')
        concatemer_files.append(concatemer_file)

        # Copy ACTG content from coding regions file to concatemer
        with open(coding_region_file) as read_handle:
            with open(concatemer_file, mode='w') as write_handle:
                # Write out single concatemer header
                write_handle.write('> {0}|trimmed concatemer\n'.format(basename))

                # Copy over all lines that are not header lines (do not start with '>')
                for line in read_handle:
                    if line.startswith('>'):
                        continue
                    write_handle.write(line)

    log.info('Created %i genome concatemers', len(concatemer_files))
    return sorted(concatemer_files)
def run_codeml_for_sicos(codeml_dir, genome_ids_a, genome_ids_b, sico_files):
    """Run codeml for representatives of clades A and B in each of the SICO files, to calculate dN/dS.

    Each SICO alignment is split into the records whose ID prefix (the part before the first '|') is in
    genome_ids_a and those whose prefix is in genome_ids_b. run_codeml is then called for both alignments in a
    sub directory of codeml_dir named after the SICO file.

    Returns the list of values returned by run_codeml, in the order of sico_files.
    """
    logging.info('Running codeml for %s aligned and trimmed SICOs', len(sico_files))

    codeml_files = []
    for sico_file in sico_files:
        # Separate alignments for clade A & clade B genomes
        ali = AlignIO.read(sico_file, 'fasta')
        alignment_a = MultipleSeqAlignment(seqr for seqr in ali if seqr.id.split('|')[0] in genome_ids_a)
        alignment_b = MultipleSeqAlignment(seqr for seqr in ali if seqr.id.split('|')[0] in genome_ids_b)

        # Create sub directory for this run based on sico_file name
        filename = os.path.split(sico_file)[1]
        # Split off everything starting from the first dot. split() returns the whole name when there is no dot,
        # whereas slicing on str.find()'s -1 would silently drop the last character of the name.
        base_name = filename.split('.', 1)[0]
        sub_dir = create_directory(base_name, inside_dir=codeml_dir)

        # Run codeml for this pair of alignments
        # NOTE(review): the original comment says "Submit for asynchronous calculation", but the call below
        # shows no asynchronous submission - confirm against run_codeml.
        codeml_file = run_codeml(sub_dir, alignment_a, alignment_b)
        codeml_files.append(codeml_file)

    return codeml_files
def main(args):
    """Entry point for run_phipack.py: run the PhiPack step for all orthologs in a zip archive.

    Parses the --orthologs-zip and --stats-file options from args, extracts the archive into a temporary
    directory, hands the extracted files to _phipack_for_all_orthologs and removes the temporary directory again.
    """
    # NOTE(review): the usage text seems to have lost its line breaks; it is reproduced verbatim here.
    usage = """ Usage: run_phipack.py --orthologs-zip=FILE archive of orthologous genes in FASTA format --stats-file=FILE destination file path for values found through PhiPack for each ortholog """
    orthologs_zip, stats_file = parse_options(usage, ('orthologs-zip', 'stats-file'), args)

    # A fresh temporary folder per run prevents interference from simultaneous runs
    work_dir = tempfile.mkdtemp(prefix='run_phipack_')

    # Unpack the archive of orthologs inside the work directory
    ortholog_files = extract_archive_of_files(
        orthologs_zip,
        create_directory('extracted_orthologs', inside_dir=work_dir))

    # Find recombination in all ortholog_files
    _phipack_for_all_orthologs(work_dir, ortholog_files, stats_file)

    # Free disk space by removing everything created during this run
    shutil.rmtree(work_dir)

    # Exit after a comforting log message
    log.info('Produced:\n%s', stats_file)
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Builds trimmed coding region files and a concatemer per genome from the orthologs archive, combines those
    into a super concatemer, derives a tree from it, reads two groups of genome IDs from that tree and writes
    those IDs with their organism names to the taxon A and taxon B files before visualizing the tree.

    All intermediate files live in a temporary directory that is removed afterwards, also when a step fails.
    """
    # NOTE(review): the usage text seems to have lost its line breaks; it is kept verbatim here.
    usage = """ Usage: concatenate_orthologs.py --orthologs-zip=FILE archive of orthologous genes in FASTA format --coding-regions=FILE destination file path archive of trimmed orthologous coding regions per genomes --concatemer=FILE destination file path for super-concatemer of all genomes --taxon-a=FILE destination file path for genome IDs for taxon A --taxon-b=FILE destination file path for genome IDs for taxon B --tree=FILE destination file path for tree visualization """
    options = ['orthologs-zip', 'coding-regions', 'concatemer', 'taxon-a', 'taxon-b', 'tree']
    orthologs_zip, target_coding_regions, target_concat_file, target_taxon_a, target_taxon_b, target_tree = \
        parse_options(usage, options, args)

    # Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='concatemer_tree_')
    try:
        # Extract files from zip archive
        temp_dir = create_directory('orthologs', inside_dir=run_dir)
        ortholog_files = extract_archive_of_files(orthologs_zip, temp_dir)

        # Separate out orthologs per genome to create trimmed coding region files per genome
        genome_coding_regions_files = coding_regions_per_genome(run_dir, ortholog_files)
        create_archive_of_files(target_coding_regions, genome_coding_regions_files)

        # Concatenate coding region files per genome
        concatemer_files = concatemer_per_genome(run_dir, genome_coding_regions_files)

        # Create super concatemer
        create_super_concatemer(concatemer_files, target_concat_file)

        # Determine the taxa present in the super concatemer tree by building a phylogenetic tree from genome
        # concatemer and reading genome ids in the two largest clades.
        super_distance_file = _run_dna_dist(run_dir, target_concat_file)
        super_tree_file = _run_neighbor(run_dir, super_distance_file)
        genome_ids_a, genome_ids_b = _read_taxa_from_tree(super_tree_file)

        # Map Project IDs to Organism names; items() exists on both Python 2 & 3, unlike iteritems()
        genomes = select_genomes_by_ids(genome_ids_a + genome_ids_b)
        id_to_name_map = dict((gid, genome['Organism/Name']) for gid, genome in genomes.items())

        # Write Project IDs and Organism Names to files, with a fallback to genome_id for external genome
        for target_taxon, genome_ids in ((target_taxon_a, genome_ids_a), (target_taxon_b, genome_ids_b)):
            with open(target_taxon, mode='w') as write_handle:
                for genome_id in genome_ids:
                    write_handle.write('{id}\t{name}\n'.format(id=genome_id,
                                                               name=id_to_name_map.get(genome_id, genome_id)))

        # Visualize tree
        visualize_tree(super_tree_file, id_to_name_map, target_tree)
    finally:
        # Remove unused files to free disk space, even when one of the steps above failed
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info('Produced: \n%s\n%s\n%s\n%s\n%s', target_coding_regions, target_concat_file, target_taxon_a,
             target_taxon_b, target_tree)
def _step5_orthomcl_adjust_fasta(run_dir, proteome_files, id_field=3):
    """Create an OrthoMCL compliant .fasta file, by adjusting definition lines.

    Usage:
      orthomclAdjustFasta taxon_code fasta_file id_field

    where:
      taxon_code: a three or four letter unique abbreviation for the taxon
      fasta_file: the input fasta file per proteome
      id_field:   a number indicating what field in the definition line contains the protein ID. Fields are
                  separated by either ' ' or '|'. Any spaces immediately following the '>' are ignored. The
                  first field is 1. For example, in the following definition line, the ID (AP_000668.1) is in
                  field 4: >gi|89106888|ref|AP_000668.1|

    Input file requirements:
      (1) .fasta format
      (2) a unique id is provided for each sequence, and is in the field specified by id_field

    Output file format:
      (1) .fasta format
      (2) definition line is of the form: >taxoncode|unique_protein_id

    The output file is named taxoncode.fasta

    Note: if your input files do not meet the requirements, you can do some simple perl or awk processing of
    them to create the required input files to this program, or the required output files. This program is
    provided as a convenience, but OrthoMCL users are expected to have the scripting skills to provide compliant
    .fasta files.

    EXAMPLE: orthomclSoftware/bin/orthomclAdjustFasta hsa Homo_sapiens.NCBI36.53.pep.all.fa 1

    Returns a tuple of the directory holding the compliant fasta files and the list of files moved into it.
    """
    # Create directory to hold compliant fasta
    compliant_dir = create_directory('compliant_fasta', inside_dir=run_dir)
    compliant_files = []
    for proteome_file in proteome_files:
        # Use first part of header of first entry as taxon code, with dots replaced by underscores
        first_record = next(iter(SeqIO.parse(proteome_file, 'fasta')), None)
        if first_record is None:
            taxon_code = None
        else:
            taxon_code = first_record.id.split('|')[0].replace('.', '_')

        # If we failed to extract a taxon_code, proteome file must have been empty
        assert taxon_code, 'Proteome file appears empty: ' + proteome_file

        # Call orthomclAdjustFasta
        command = [ORTHOMCL_ADJUST_FASTA, taxon_code, proteome_file, str(id_field)]
        log.info('Executing: %s', ' '.join(command))
        check_call(command)

        # Move resulting fasta file (looked up relative to the working directory) to compliantFasta directory
        produced_name = taxon_code + '.fasta'
        destination = os.path.join(compliant_dir, produced_name)
        shutil.move(produced_name, destination)
        compliant_files.append(destination)

    # Return path to directory containing compliantFasta
    return compliant_dir, compliant_files
def main(args):
    """Entry point for filter_orthologs.py: align and trim orthologs, then archive the results.

    Parses the command line options from args, aligns the orthologs from the input archive, trims the
    alignments (which also writes the statistics file and scatterplot) and stores the aligned, misaligned and
    trimmed files in their respective destination archives.
    """
    # NOTE(review): the usage text seems to have lost its line breaks; it is reproduced verbatim here.
    usage = """ Usage: filter_orthologs.py --orthologs-zip=FILE archive of orthologous genes in FASTA format --retained-threshold=PERC filter orthologs that retain less than PERC % of sequence after trimming alignment --max-indel-length=NUMBER filter orthologs that contain insertions / deletions longer than N in middle of alignment --aligned-zip=FILE destination file path for archive of aligned orthologous genes --misaligned-zip=FILE destination file path for archive of misaligned orthologous genes --trimmed-zip=FILE destination file path for archive of aligned & trimmed orthologous genes --stats=FILE destination file path for ortholog trimming statistics file --scatterplot=FILE destination file path for scatterplot of retained and filtered sequences by length """
    option_names = ['orthologs-zip', 'retained-threshold', 'max-indel-length', 'aligned-zip', 'misaligned-zip',
                    'trimmed-zip', 'stats', 'scatterplot']
    (orthologs_zip, raw_threshold, raw_indel_length, aligned_zip, misaligned_zip, trimmed_zip,
     target_stats_path, target_scatterplot) = parse_options(usage, option_names, args)

    # Convert numeric arguments right away, so we can fail fast if argument value format was wrong
    retained_threshold = int(raw_threshold)
    max_indel_length = int(raw_indel_length)

    # A fresh temporary folder per run prevents interference from simultaneous runs
    work_dir = tempfile.mkdtemp(prefix='align_trim_')

    # Unpack the archive of orthologs inside the work directory
    sico_files = extract_archive_of_files(orthologs_zip, create_directory('orthologs', inside_dir=work_dir))

    # Align SICOs so all sequences become equal length sequences
    aligned_files = _align_sicos(work_dir, sico_files)

    # Filter orthologs that retain less than PERC % of sequence after trimming alignment
    trimmed_files, misaligned_files = _trim_alignments(work_dir, aligned_files, retained_threshold,
                                                       max_indel_length, target_stats_path, target_scatterplot)

    # Create archives of files on command line specified output paths
    create_archive_of_files(aligned_zip, aligned_files)
    create_archive_of_files(misaligned_zip, misaligned_files)
    create_archive_of_files(trimmed_zip, trimmed_files)

    # Free disk space by removing everything created during this run
    shutil.rmtree(work_dir)

    # Exit after a comforting log message
    produced = (aligned_zip, misaligned_zip, trimmed_zip, target_stats_path, target_scatterplot)
    log.info('Produced: \n%s', '\n'.join(produced))
def _trim_alignments(run_dir, dna_alignments, retained_threshold, max_indel_length, stats_file, scatterplot_file):
    """Trim all DNA alignments using _trim_alignment (singular), and calculate some statistics about the trimming.

    Each _trim_alignment result is used as a tuple of (trimmed file, original length, trimmed length, percentage
    retained). Alignments retaining at least retained_threshold percent count as trimmed; all others count as
    misaligned. Statistics are written to stats_file and passed on to scatterplot together with
    scatterplot_file.

    Returns two sorted lists of trimmed file paths: the retained alignments and the misaligned ones.
    """
    log.info('Trimming {0} DNA alignments from first non-gap codon to last non-gap codon'
             .format(len(dna_alignments)))

    # Create directory here, to prevent race-condition when folder does not exist, but is then created by
    # another process
    trimmed_dir = create_directory('trimmed', inside_dir=run_dir)

    # Trim all the alignments
    trim_tpls = [_trim_alignment((trimmed_dir, dna_alignment, max_indel_length))
                 for dna_alignment in dna_alignments]

    remaining_percts = [tpl[3] for tpl in trim_tpls]
    trimmed_alignments = [tpl[0] for tpl in trim_tpls if retained_threshold <= tpl[3]]
    misaligned = [tpl[0] for tpl in trim_tpls if retained_threshold > tpl[3]]

    # Guard against an empty input, which previously raised ZeroDivisionError; float() additionally prevents
    # integer division from truncating the average on Python 2
    if remaining_percts:
        average_retained = float(sum(remaining_percts)) / len(remaining_percts)
    else:
        average_retained = 0.0

    # Write trim statistics to file in such a way that they're easily converted to a graph in Galaxy
    with open(stats_file, mode='w') as write_handle:
        msg = '{0:6} sequence alignments trimmed'.format(len(trim_tpls))
        log.info(msg)
        write_handle.write('#' + msg + '\n')

        msg = '{0:5.1f}% sequence retained on average overall'.format(average_retained)
        log.info(msg)
        write_handle.write('#' + msg + '\n')

        filtered = len(misaligned)
        msg = '{0:6} orthologs filtered because less than {1}% sequence retained or because of indel longer than {2} '\
            .format(filtered, str(retained_threshold), max_indel_length)
        log.info(msg)
        write_handle.write('#' + msg + '\n')

        # One tab separated row per alignment, ordered by increasing percentage retained
        write_handle.write('# Trimmed file\tOriginal length\tTrimmed length\tPercentage retained\n')
        for tpl in sorted(trim_tpls, key=itemgetter(3)):
            write_handle.write(os.path.split(tpl[0])[1] + '\t')
            write_handle.write(str(tpl[1]) + '\t')
            write_handle.write(str(tpl[2]) + '\t')
            write_handle.write('{0:.2f}\n'.format(tpl[3]))

    # Create scatterplot using trim_tuples
    # NOTE(review): with no alignments this now receives an empty list - confirm scatterplot copes with that.
    scatterplot(retained_threshold, trim_tpls, scatterplot_file)

    return sorted(trimmed_alignments), sorted(misaligned)
def _dna_file_per_sico(run_dir, dna_files, shared_single_copy, shared_multi_copy, non_shared):
    """Write each DNA record to the fasta file(s) of the ortholog groups it is part of.

    Records are offered to _write_record_to_ortholog_file for the shared single copy, shared multi copy and non
    shared groups in that order. A record that none of these calls wrote anywhere is an ORFan and is appended to
    ORFans.ffn in run_dir.

    Returns (sorted sico files, sorted muco files, sorted subset files, number of sequences, ORFans file path).
    """
    # NOTE(review): per the original comment create_directory deletes & recreates these directories - confirm.
    single_copy_dir = create_directory('sico', inside_dir=run_dir)
    multi_copy_dir = create_directory('muco', inside_dir=run_dir)
    non_shared_dir = create_directory('subset', inside_dir=run_dir)
    orfans_file = os.path.join(run_dir, 'ORFans.ffn')

    # Sets, as the same ortholog file is touched once for every genome contributing to it
    single_copy_paths, multi_copy_paths, non_shared_paths = set(), set(), set()
    number_of_sequences = 0
    for dna_file in dna_files:
        log.info('Extracting orthologous genes from %s', dna_file)
        for record in SeqIO.parse(dna_file, 'fasta'):
            number_of_sequences += 1

            # Append the record to the corresponding ortholog files of each category
            in_single_copy = _write_record_to_ortholog_file(single_copy_dir, shared_single_copy, record)
            single_copy_paths.update(in_single_copy)
            in_multi_copy = _write_record_to_ortholog_file(multi_copy_dir, shared_multi_copy, record)
            multi_copy_paths.update(in_multi_copy)
            in_non_shared = _write_record_to_ortholog_file(non_shared_dir, non_shared, record)
            non_shared_paths.update(in_non_shared)

            # Records that landed in at least one category are done
            if in_single_copy or in_multi_copy or in_non_shared:
                continue

            # What remains are ORFans, which are collected in a separate file
            with open(orfans_file, mode='a') as orfans_handle:
                SeqIO.write(record, orfans_handle, 'fasta')

    return (sorted(single_copy_paths), sorted(multi_copy_paths), sorted(non_shared_paths),
            number_of_sequences, orfans_file)
def main(args):
    """Split an archive of SICO alignments into one archive per taxon; entry point for command line & pipeline.

    Returns the tuple (taxon_a_zip, taxon_b_zip) of archive paths that were written.
    """
    # Usage text is kept exactly as found; it is handed to parse_options together with the option names
    usage = """ Usage: split_by_taxa.py --genomes-a=FILE file with genome GenBank Project ID and Organism name on each line for taxon A --genomes-b=FILE file with genome GenBank Project ID and Organism name on each line for taxon B --orthologs-zip=FILE archive of aligned & trimmed single copy orthologous (SICO) genes --taxon-a-zip=FILE destination file path for archive of SICO genes belonging to taxon A --taxon-b-zip=FILE destination file path for archive of SICO genes belonging to taxon B """
    option_names = ['genomes-a', 'genomes-b', 'orthologs-zip', 'taxon-a-zip', 'taxon-b-zip']
    parsed = parse_options(usage, option_names, args)
    genome_a_ids_file, genome_b_ids_file, orthologs_zip, taxon_a_zip, taxon_b_zip = parsed

    def _ids_and_prefix(ids_file, fallback):
        """Read a tab separated genome file; return its first column and the common prefix of its second."""
        with open(ids_file) as read_handle:
            rows = [line.split('\t') for line in read_handle]
        project_ids = [row[0] for row in rows]
        organism_names = [row[1] for row in rows]
        return project_ids, _common_prefix(organism_names, fallback)

    # Each taxon is described by a (genome ids, common name prefix) pair
    taxon_a = _ids_and_prefix(genome_a_ids_file, 'taxon_a')
    taxon_b = _ids_and_prefix(genome_b_ids_file, 'taxon_b')

    # All intermediate files for this run live in a fresh temporary directory
    run_dir = tempfile.mkdtemp(prefix='split_by_taxa_')

    # Unpack the alignments from the archive
    alignments_dir = create_directory('alignments', inside_dir=run_dir)
    ortholog_files = extract_archive_of_files(orthologs_zip, alignments_dir)

    # Do the actual splitting per taxon
    taxon_a_files, taxon_b_files = split_alignment_by_taxa(run_dir, ortholog_files, taxon_a, taxon_b)

    # Archive the results at the paths given on the command line
    create_archive_of_files(taxon_a_zip, taxon_a_files)
    create_archive_of_files(taxon_b_zip, taxon_b_files)

    # Free disk space taken up by intermediate files
    shutil.rmtree(run_dir)

    log.info("Produced: \n%s\n%s", taxon_a_zip, taxon_b_zip)
    return taxon_a_zip, taxon_b_zip
def _run_neighbor(run_dir, distance_file):
    """Run neighbor on the distances in distance_file and return the path of the tree file it generated."""
    neighbor_dir = create_directory('neighbor', inside_dir=run_dir)

    # The dnadist output is made available to neighbor under the name 'infile' in its working directory
    infile = os.path.join(neighbor_dir, 'infile')
    shutil.copy(distance_file, infile)

    # Start neighbor inside its own directory and feed it its answers on standard input
    neighbor = Popen(NEIGHBOR, cwd=neighbor_dir, stdin=PIPE, stdout=PIPE, stderr=STDOUT)
    neighbor.communicate(input='N\nY\n')

    # The newick tree should have been written to 'outtree' by now
    treefile = os.path.join(neighbor_dir, 'outtree')
    tree_has_content = os.path.exists(treefile) and 0 < os.path.getsize(treefile)
    assert tree_has_content, treefile + ' should exist with some content now'
    return treefile
def main(args):
    """Align, trim and filter orthologs from an archive; entry point for command line & pipeline.

    Archives of aligned, misaligned and trimmed orthologs are written to the paths given in args, as are the
    trimming statistics and scatterplot. Nothing is returned.
    """
    # Usage text is kept exactly as found; it is handed to parse_options together with the option names
    usage = """ Usage: filter_orthologs.py --orthologs-zip=FILE archive of orthologous genes in FASTA format --retained-threshold=PERC filter orthologs that retain less than PERC % of sequence after trimming alignment --max-indel-length=NUMBER filter orthologs that contain insertions / deletions longer than N in middle of alignment --aligned-zip=FILE destination file path for archive of aligned orthologous genes --misaligned-zip=FILE destination file path for archive of misaligned orthologous genes --trimmed-zip=FILE destination file path for archive of aligned & trimmed orthologous genes --stats=FILE destination file path for ortholog trimming statistics file --scatterplot=FILE destination file path for scatterplot of retained and filtered sequences by length """
    option_names = ['orthologs-zip', 'retained-threshold', 'max-indel-length', 'aligned-zip', 'misaligned-zip',
                    'trimmed-zip', 'stats', 'scatterplot']
    (orthologs_zip, retained_threshold, max_indel_length, aligned_zip, misaligned_zip, trimmed_zip,
     target_stats_path, target_scatterplot) = parse_options(usage, option_names, args)

    # Convert numeric arguments right away, so a malformed value fails fast
    retained_threshold = int(retained_threshold)
    max_indel_length = int(max_indel_length)

    # A private temporary folder keeps simultaneous runs from interfering with one another
    run_dir = tempfile.mkdtemp(prefix='align_trim_')

    # Unpack the orthologs from the archive
    orthologs_dir = create_directory('orthologs', inside_dir=run_dir)
    sico_files = extract_archive_of_files(orthologs_zip, orthologs_dir)

    # Aligning makes all sequences within a SICO equally long
    aligned_files = _align_sicos(run_dir, sico_files)

    # Trim alignments, separating those that retain too little sequence
    trimmed_files, misaligned_files = _trim_alignments(run_dir, aligned_files, retained_threshold,
                                                       max_indel_length, target_stats_path, target_scatterplot)

    # Archive each collection at its command line specified path
    outputs = ((aligned_zip, aligned_files), (misaligned_zip, misaligned_files), (trimmed_zip, trimmed_files))
    for archive_path, files in outputs:
        create_archive_of_files(archive_path, files)

    # Free disk space taken up by intermediate files
    shutil.rmtree(run_dir)

    produced = (aligned_zip, misaligned_zip, trimmed_zip, target_stats_path, target_scatterplot)
    log.info('Produced: \n%s', '\n'.join(produced))
def _create_blast_database(run_dir, fasta_file, nucleotide=False):
    """Build a blast database from fasta_file inside run_dir and return (database directory, database name).

    A nucleotide database is built when nucleotide is true, a protein database otherwise. Standard output of
    makeblastdb is captured in 'makeblastdb.log' inside the database directory.
    """
    can_execute = os.path.exists(MAKEBLASTDB) and os.access(MAKEBLASTDB, os.X_OK)
    assert can_execute, 'Could not find or run ' + MAKEBLASTDB

    if nucleotide:
        dbtype = 'nucl'
    else:
        dbtype = 'prot'

    db_dir = create_directory('blast', inside_dir=run_dir)
    db_name = 'my_{0}_blast_db'.format(dbtype)
    db_path = os.path.join(db_dir, db_name)

    with open(os.path.join(db_dir, 'makeblastdb.log'), mode='w') as log_handle:
        command = [MAKEBLASTDB, '-in', fasta_file, '-dbtype', dbtype, '-out', db_path]
        log.info('Executing: %s', ' '.join(command))
        check_call(command, stdout=log_handle)

    return db_dir, db_name
def _phipack_for_all_orthologs(run_dir, aligned_files, stats_file):
    """Run PhiPack for every aligned ortholog file and write the resulting values to stats_file.

    The statistics file starts with a header line, followed by one tab separated line per ortholog holding the
    ortholog name, the PhiPack values (informative sites, Phi, Max Chi^2, NSS), its COGs and its product.

    Arguments:
    run_dir -- directory inside which the 'phipack' working directory is created
    aligned_files -- aligned fasta files, one per ortholog
    stats_file -- path of the statistics file to write (overwritten)

    Nothing is returned: the statistics file is the only result.
    """
    log.info('Running PhiPack for %i orthologs to find recombination', len(aligned_files))

    # Create separate directory for phipack related values
    phipack_dir = create_directory('phipack', inside_dir=run_dir)

    with open(stats_file, mode='w') as write_handle:
        header = ['Ortholog', 'Informative sites', 'Phi', 'Max Chi^2', 'NSS', 'COGs', 'Product']
        write_handle.write('\t'.join(header) + '\n')

        # Without orthologs there is no first file to read genomes from: leave a header-only statistics file
        if not aligned_files:
            return

        # Retrieve unique genomes from first ortholog file
        genome_ids = set(fasta_record.id.split('|')[0] for fasta_record in SeqIO.parse(aligned_files[0], 'fasta'))
        genome_dicts = select_genomes_by_ids(genome_ids).values()

        for ortholog_file in aligned_files:
            orth_name = os.path.split(ortholog_file)[1].split('.')[0]

            # Calculate the PhiPack values for this ortholog
            phipack_values = run_phipack(phipack_dir, ortholog_file)

            # Write PhiPack values to line
            write_handle.write('{0}\t{1[PhiPack sites]}\t{1[Phi]}\t{1[Max Chi^2]}\t{1[NSS]}'
                               .format(orth_name, phipack_values))

            # Parse sequence records again, but now to retrieve cogs and products
            seq_records = list(SeqIO.parse(ortholog_file, 'fasta'))

            # COGs
            cogs = find_cogs_in_sequence_records(seq_records)
            write_handle.write('\t' + ','.join(cogs))

            # Product
            product = get_most_recent_gene_name(genome_dicts, seq_records)
            write_handle.write('\t' + product)

            # End line
            write_handle.write('\n')
def main(args):
    """Run codeml for an archive of SICO genes; entry point for command line & pipeline.

    Writes the archive of codeml output and the dN/dS statistics file to the paths given in args.
    Nothing is returned.
    """
    # Usage text is kept exactly as found; it is handed to parse_options together with the option names
    usage = """ Usage: run_codeml.py --genomes-a=FILE file with GenBank Project IDs from complete genomes table on each line for taxon A --genomes-b=FILE file with GenBank Project IDs from complete genomes table on each line for taxon B --sico-zip=FILE archive of aligned & trimmed single copy orthologous (SICO) genes --codeml-zip=FILE destination file path for archive of codeml output per SICO gene --dnds-stats=FILE destination file path for file with dN, dS & dN/dS values per SICO gene """
    option_names = ['genomes-a', 'genomes-b', 'sico-zip', 'codeml-zip', 'dnds-stats']
    parsed = parse_options(usage, option_names, args)
    genome_a_ids_file, genome_b_ids_file, sico_zip, codeml_zip, dnds_file = parsed

    def _first_fields(ids_file):
        """Return the first whitespace separated field of every line in ids_file."""
        with open(ids_file) as read_handle:
            return [line.split()[0] for line in read_handle]

    # The GenBank Project ID is the first field on each line
    genome_ids_a = _first_fields(genome_a_ids_file)
    genome_ids_b = _first_fields(genome_b_ids_file)

    # All intermediate files for this run live in a fresh temporary directory
    run_dir = tempfile.mkdtemp(prefix='run_codeml_')

    # Unpack the SICO files from the archive
    sicos_dir = create_directory('sicos', inside_dir=run_dir)
    sico_files = extract_archive_of_files(sico_zip, sicos_dir)

    # Do the actual codeml runs
    codeml_files = run_codeml_for_sicos(run_dir, genome_ids_a, genome_ids_b, sico_files)

    # Gather the dN/dS values of all orthologs in a single output file
    _write_dnds_per_ortholog(dnds_file, codeml_files)

    # Archive the codeml output at the path given on the command line
    create_archive_of_files(codeml_zip, codeml_files)

    # Free disk space taken up by intermediate files
    shutil.rmtree(run_dir)

    logging.info("Produced: \n%s\n%s", codeml_zip, dnds_file)
def _phipack_for_all_orthologs(run_dir, aligned_files, stats_file):
    """Run PhiPack for every aligned ortholog file and write the resulting values to stats_file.

    The statistics file starts with a header line, followed by one tab separated line per ortholog holding the
    ortholog name, the PhiPack values (informative sites, Phi, Max Chi^2, NSS), its COGs and its product.

    Arguments:
    run_dir -- directory inside which the 'phipack' working directory is created
    aligned_files -- aligned fasta files, one per ortholog
    stats_file -- path of the statistics file to write (overwritten)

    Nothing is returned: the statistics file is the only result.
    """
    log.info('Running PhiPack for %i orthologs to find recombination', len(aligned_files))

    # Create separate directory for phipack related values
    phipack_dir = create_directory('phipack', inside_dir=run_dir)

    with open(stats_file, mode='w') as write_handle:
        header = ['Ortholog', 'Informative sites', 'Phi', 'Max Chi^2', 'NSS', 'COGs', 'Product']
        write_handle.write('\t'.join(header) + '\n')

        # Without orthologs there is no first file to read genomes from: leave a header-only statistics file
        if not aligned_files:
            return

        # Retrieve unique genomes from first ortholog file
        genome_ids = set(fasta_record.id.split('|')[0] for fasta_record in SeqIO.parse(aligned_files[0], 'fasta'))
        genome_dicts = select_genomes_by_ids(genome_ids).values()

        for ortholog_file in aligned_files:
            orth_name = os.path.split(ortholog_file)[1].split('.')[0]

            # Calculate the PhiPack values for this ortholog
            phipack_values = run_phipack(phipack_dir, ortholog_file)

            # Write PhiPack values to line
            write_handle.write('{0}\t{1[PhiPack sites]}\t{1[Phi]}\t{1[Max Chi^2]}\t{1[NSS]}'
                               .format(orth_name, phipack_values))

            # Parse sequence records again, but now to retrieve cogs and products
            seq_records = list(SeqIO.parse(ortholog_file, 'fasta'))

            # COGs
            cogs = find_cogs_in_sequence_records(seq_records)
            write_handle.write('\t' + ','.join(cogs))

            # Product
            product = get_most_recent_gene_name(genome_dicts, seq_records)
            write_handle.write('\t' + product)

            # End line
            write_handle.write('\n')
def main(args):
    """Split an archive of SICO alignments into one archive per taxon; entry point for command line & pipeline.

    Returns the tuple (taxon_a_zip, taxon_b_zip) of archive paths that were written.
    """
    # Usage text is kept exactly as found; it is handed to parse_options together with the option names
    usage = """ Usage: split_by_taxa.py --genomes-a=FILE file with genome GenBank Project ID and Organism name on each line for taxon A --genomes-b=FILE file with genome GenBank Project ID and Organism name on each line for taxon B --orthologs-zip=FILE archive of aligned & trimmed single copy orthologous (SICO) genes --taxon-a-zip=FILE destination file path for archive of SICO genes belonging to taxon A --taxon-b-zip=FILE destination file path for archive of SICO genes belonging to taxon B """
    option_names = ['genomes-a', 'genomes-b', 'orthologs-zip', 'taxon-a-zip', 'taxon-b-zip']
    parsed = parse_options(usage, option_names, args)
    genome_a_ids_file, genome_b_ids_file, orthologs_zip, taxon_a_zip, taxon_b_zip = parsed

    def _ids_and_prefix(ids_file, fallback):
        """Read a tab separated genome file; return its first column and the common prefix of its second."""
        with open(ids_file) as read_handle:
            rows = [line.split('\t') for line in read_handle]
        project_ids = [row[0] for row in rows]
        organism_names = [row[1] for row in rows]
        return project_ids, _common_prefix(organism_names, fallback)

    # Each taxon is described by a (genome ids, common name prefix) pair
    taxon_a = _ids_and_prefix(genome_a_ids_file, 'taxon_a')
    taxon_b = _ids_and_prefix(genome_b_ids_file, 'taxon_b')

    # All intermediate files for this run live in a fresh temporary directory
    run_dir = tempfile.mkdtemp(prefix='split_by_taxa_')

    # Unpack the alignments from the archive
    alignments_dir = create_directory('alignments', inside_dir=run_dir)
    ortholog_files = extract_archive_of_files(orthologs_zip, alignments_dir)

    # Do the actual splitting per taxon
    taxon_a_files, taxon_b_files = split_alignment_by_taxa(run_dir, ortholog_files, taxon_a, taxon_b)

    # Archive the results at the paths given on the command line
    create_archive_of_files(taxon_a_zip, taxon_a_files)
    create_archive_of_files(taxon_b_zip, taxon_b_files)

    # Free disk space taken up by intermediate files
    shutil.rmtree(run_dir)

    log.info("Produced: \n%s\n%s", taxon_a_zip, taxon_b_zip)
    return taxon_a_zip, taxon_b_zip
def _translate_genome(tuples_of_gbk_and_ptt_files):
    """Extract genes & proteins from every file of a genome and concatenate them per kind.

    Each entry of tuples_of_gbk_and_ptt_files is a (project id, GenBank file, ptt file) triple. Returns the paths
    of the concatenated DNA (.ffn) and protein (.faa) fasta files.
    """
    assert tuples_of_gbk_and_ptt_files is not None, 'No genbank files were provided'

    # The output directory is named after the project id of the first entry
    first_project_id = tuples_of_gbk_and_ptt_files[0][0]
    out_dir = create_directory('translations/' + first_project_id)

    dna_files, protein_files = [], []
    for entry in tuples_of_gbk_and_ptt_files:
        current_id, gbk_file, ptt_file = entry
        dna_file, protein_file = _extract_gene_and_protein(out_dir, current_id, gbk_file, ptt_file)
        dna_files.append(dna_file)
        protein_files.append(protein_file)

    # The concatemers are named after the project id of the last entry processed
    dna_concatemer = os.path.join(out_dir, '{pid}.ffn'.format(pid=current_id))
    protein_concatemer = os.path.join(out_dir, '{pid}.faa'.format(pid=current_id))

    # Merge the per-file results into one file per kind
    concatenate(dna_concatemer, dna_files)
    concatenate(protein_concatemer, protein_files)
    return dna_concatemer, protein_concatemer
def _trim_alignments(run_dir, dna_alignments, retained_threshold, max_indel_length, stats_file, scatterplot_file):
    """Trim all DNA alignments using _trim_alignment (singular), and report statistics about the trimming.

    Arguments:
    run_dir -- directory inside which the 'trimmed' directory is created
    dna_alignments -- DNA alignment files to trim
    retained_threshold -- minimum percentage of sequence that must be retained for an alignment to be kept
    max_indel_length -- passed on to _trim_alignment and mentioned in the statistics header
    stats_file -- path the tab separated trim statistics are written to (overwritten)
    scatterplot_file -- path handed to scatterplot() together with the trim tuples

    Returns a tuple of two sorted lists: (trimmed alignment files, misaligned files).
    """
    log.info('Trimming {0} DNA alignments from first non-gap codon to last non-gap codon'.format(len(dna_alignments)))

    # Create directory here, to prevent race-condition when folder does not exist, but is then created by another process
    trimmed_dir = create_directory('trimmed', inside_dir=run_dir)

    # Trim all the alignments. As used below, each tuple holds:
    # (trimmed file, original length, trimmed length, percentage retained)
    trim_tpls = [_trim_alignment((trimmed_dir, dna_alignment, max_indel_length)) for dna_alignment in dna_alignments]
    remaining_percts = [tpl[3] for tpl in trim_tpls]
    trimmed_alignments = [tpl[0] for tpl in trim_tpls if retained_threshold <= tpl[3]]
    misaligned = [tpl[0] for tpl in trim_tpls if retained_threshold > tpl[3]]

    # Write trim statistics to file in such a way that they're easily converted to a graph in Galaxy
    with open(stats_file, mode='w') as write_handle:
        msg = '{0:6} sequence alignments trimmed'.format(len(trim_tpls))
        log.info(msg)
        write_handle.write('#' + msg + '\n')

        # Guard against ZeroDivisionError when no alignments were given, and force true division so integer
        # percentages are not truncated under Python 2
        if remaining_percts:
            average_retained = sum(remaining_percts) / float(len(remaining_percts))
        else:
            average_retained = 0.0
        msg = '{0:5.1f}% sequence retained on average overall'.format(average_retained)
        log.info(msg)
        write_handle.write('#' + msg + '\n')

        filtered = len(misaligned)
        msg = '{0:6} orthologs filtered because less than {1}% sequence retained or because of indel longer than {2} '\
            .format(filtered, str(retained_threshold), max_indel_length)
        log.info(msg)
        write_handle.write('#' + msg + '\n')

        # One line per alignment, ordered by ascending percentage retained
        write_handle.write('# Trimmed file\tOriginal length\tTrimmed length\tPercentage retained\n')
        for tpl in sorted(trim_tpls, key=itemgetter(3)):
            write_handle.write(os.path.split(tpl[0])[1] + '\t')
            write_handle.write(str(tpl[1]) + '\t')
            write_handle.write(str(tpl[2]) + '\t')
            write_handle.write('{0:.2f}\n'.format(tpl[3]))

    # Create scatterplot using trim_tuples
    scatterplot(retained_threshold, trim_tpls, scatterplot_file)

    return sorted(trimmed_alignments), sorted(misaligned)
def _run_neighbor(run_dir, distance_file):
    """Run neighbor on the distances in distance_file and return the path of the tree file it generated."""
    neighbor_dir = create_directory('neighbor', inside_dir=run_dir)

    # The dnadist output is made available to neighbor under the name 'infile' in its working directory
    infile = os.path.join(neighbor_dir, 'infile')
    shutil.copy(distance_file, infile)

    # Start neighbor inside its own directory and feed it its answers on standard input
    neighbor = Popen(NEIGHBOR, cwd=neighbor_dir, stdin=PIPE, stdout=PIPE, stderr=STDOUT)
    neighbor.communicate(input='N\nY\n')

    # The newick tree should have been written to 'outtree' by now
    treefile = os.path.join(neighbor_dir, 'outtree')
    tree_has_content = os.path.exists(treefile) and 0 < os.path.getsize(treefile)
    assert tree_has_content, treefile + ' should exist with some content now'
    return treefile
def _download_genomes_table():
    '''Download the prokaryotes.txt genome table file from the NCBI FTP site, save a local copy and return contents.

    The local copy is only refreshed when it is missing or older than a day; otherwise the cached file is read.
    Returns the full text of the genome table.
    '''
    cache_dir = create_directory('')
    prokaryotes = 'prokaryotes.txt'
    output_file = os.path.join(cache_dir, prokaryotes)

    # Only download when existing file is older than a day
    time_between_downloads = 24 * 60 * 60
    if not os.path.isfile(output_file) or os.path.getmtime(output_file) < time.time() - time_between_downloads:
        ftp = FTP('ftp.ncbi.nlm.nih.gov')
        try:
            # Login to FTP site
            ftp.login(passwd='*****@*****.**')

            # Download ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/prokaryotes.txt
            # NOTE(review): imported locally, presumably to avoid a circular import -- confirm
            from download_taxa_ncbi import _download_genome_file
            _download_genome_file(ftp, '/genomes/GENOME_REPORTS', prokaryotes, cache_dir, datetime.now())
        finally:
            # Always release the connection, also when login or download failed
            ftp.close()

    # Read file and return content
    with open(output_file) as read_handle:
        return read_handle.read()
def _step12_mcl(run_dir, mcl_input_file):
    """Cluster the pairs in mcl_input_file with the Markov Cluster Algorithm: http://www.micans.org/mcl/

    Input: mclInput file
    Output: mclOutput file

    Equivalent command line:
    mcl my_orthomcl_dir/mclInput --abc -I 1.5 -o my_orthomcl_dir/mclOutput

    Returns the path of the mcl output file, written inside an 'mcl' directory within run_dir.
    """
    mcl_dir = create_directory('mcl', inside_dir=run_dir)
    mcl_output_file = os.path.join(mcl_dir, 'mclOutput.tsv')

    # Both standard output and standard error of mcl end up in mcl.log
    with open(os.path.join(mcl_dir, 'mcl.log'), mode='w') as log_handle:
        # Let mcl use as many threads as there are CPUs
        thread_count = str(multiprocessing.cpu_count())
        command = [MCL, mcl_input_file, '--abc', '-I', '1.5', '-o', mcl_output_file]
        command.extend(['-te', thread_count])
        log.info('Executing: %s', ' '.join(command))
        check_call(command, stdout=log_handle, stderr=STDOUT)

    return mcl_output_file
def coding_regions_per_genome(run_dir, trimmed_sicos):
    """Create a DNA file per genome containing all aligned & trimmed SICO genes as individual genes.

    Arguments:
    run_dir -- directory inside which the 'coding_regions_per_genome' directory is created
    trimmed_sicos -- fasta files of aligned & trimmed SICO genes, whose record ids start with the project id

    Returns the sorted list of coding region files that were written, one per genome.
    """
    concatemer_dir = create_directory('coding_regions_per_genome', inside_dir=run_dir)
    log.info('Creating concatemers from {0} SICOs'.format(len(trimmed_sicos)))

    # Collections both for output files and their write handles, which will be reused for each SICO
    coding_region_files = []
    write_handles = {}

    try:
        # Loop over trimmed sico files to append each sequence to the right concatemer
        for trimmed_sico in trimmed_sicos:
            for seqr in SeqIO.parse(trimmed_sico, 'fasta'):
                # Sample header line: >58191|NC_010067.1|YP_001569097.1|COG4948MR|core
                project_id = seqr.id.split('|')[0]

                # Try to retrieve write handle from dictionary of cached write handles per genome
                write_handle = write_handles.get(project_id)

                # If not found, create & store write handle on demand
                if write_handle is None:
                    # Build up output file path for trimmed SICO genes per genome
                    coding_region_file = os.path.join(concatemer_dir, project_id + '.coding-regions.ffn')
                    coding_region_files.append(coding_region_file)

                    # Open write handle
                    write_handle = open(coding_region_file, mode='w')
                    write_handles[project_id] = write_handle

                # Write sequence record to coding-regions file
                SeqIO.write(seqr, write_handle, 'fasta')
    finally:
        # Close every cached handle, also when parsing or writing failed half way
        for write_handle in write_handles.values():
            write_handle.close()

    log.info('Created %i genome coding regions files', len(coding_region_files))

    return sorted(coding_region_files)
def run_phipack(phipack_dir, dna_file):
    """Run PhiPack and return the number of informative sites, PHI, Max Chi^2 and NSS.

    PhiPack is run inside a directory named after the ortholog, within phipack_dir. The values are parsed from
    the 'Phi.log' file found in that directory afterwards.

    Returns a dictionary with the keys 'PhiPack sites', 'Phi', 'Max Chi^2' and 'NSS'. All values are None when
    PhiPack exited with an error; 'Phi' alone is None when PhiPack reported '--' for it.
    """
    # Create directory for PhiPack to run in, so files get created there
    orth_name = os.path.split(dna_file)[1].split('.')[0]
    rundir = create_directory(orth_name, inside_dir=phipack_dir)

    # Build up list of commands
    command = PHIPACK, '-f', dna_file, '-o'  # Output NSS & Max Chi^2
    try:
        # Discard standard output, closing the null device handle again once PhiPack has finished
        with open(os.devnull, mode='w') as dev_null:
            check_call(command, cwd=rundir, stdout=dev_null)
    except CalledProcessError as err:
        log.warning('Error running PhiPack for %s:\n%s', dna_file, err)
        return {'PhiPack sites': None, 'Phi': None, 'Max Chi^2': None, 'NSS': None}

    # Retrieve output log file contents
    logfile = os.path.join(rundir, 'Phi.log')
    with open(logfile) as read_handle:
        contents = read_handle.read()

    # Parse log contents to retrieve values for # sites, Phi, Chi^2 max & NSS
    #   Found 103 informative sites.
    #   PHI (Normal):        9.04e-01
    #   Max Chi^2:           6.60e-01  (1000 permutations)
    #   NSS:                 6.31e-01  (1000 permutations)
    # Raw strings keep the regular expression backslashes out of reach of string escape processing
    sites = int(re.search(r'Found ([0-9]+) informative sites.', contents).group(1))
    raw_phi = re.search(r'PHI \(Normal\):\s+(.*)', contents).group(1)
    phi = float(raw_phi) if raw_phi != '--' else None
    chi = float(re.search(r'Max Chi\^2:\s+(.*)\s+\(1000 permutations\)', contents).group(1))
    nss = float(re.search(r'NSS:\s+(.*)\s+\(1000 permutations\)', contents).group(1))
    return {'PhiPack sites': sites, 'Phi': phi, 'Max Chi^2': chi, 'NSS': nss}
def main(args):
    """Run codeml for an archive of SICO genes; entry point for command line & pipeline.

    Writes the archive of codeml output and the dN/dS statistics file to the paths given in args.
    Nothing is returned.
    """
    # Usage text is kept exactly as found; it is handed to parse_options together with the option names
    usage = """ Usage: run_codeml.py --genomes-a=FILE file with GenBank Project IDs from complete genomes table on each line for taxon A --genomes-b=FILE file with GenBank Project IDs from complete genomes table on each line for taxon B --sico-zip=FILE archive of aligned & trimmed single copy orthologous (SICO) genes --codeml-zip=FILE destination file path for archive of codeml output per SICO gene --dnds-stats=FILE destination file path for file with dN, dS & dN/dS values per SICO gene """
    option_names = ['genomes-a', 'genomes-b', 'sico-zip', 'codeml-zip', 'dnds-stats']
    parsed = parse_options(usage, option_names, args)
    genome_a_ids_file, genome_b_ids_file, sico_zip, codeml_zip, dnds_file = parsed

    def _first_fields(ids_file):
        """Return the first whitespace separated field of every line in ids_file."""
        with open(ids_file) as read_handle:
            return [line.split()[0] for line in read_handle]

    # The GenBank Project ID is the first field on each line
    genome_ids_a = _first_fields(genome_a_ids_file)
    genome_ids_b = _first_fields(genome_b_ids_file)

    # All intermediate files for this run live in a fresh temporary directory
    run_dir = tempfile.mkdtemp(prefix='run_codeml_')

    # Unpack the SICO files from the archive
    sicos_dir = create_directory('sicos', inside_dir=run_dir)
    sico_files = extract_archive_of_files(sico_zip, sicos_dir)

    # Do the actual codeml runs
    codeml_files = run_codeml_for_sicos(run_dir, genome_ids_a, genome_ids_b, sico_files)

    # Gather the dN/dS values of all orthologs in a single output file
    _write_dnds_per_ortholog(dnds_file, codeml_files)

    # Archive the codeml output at the path given on the command line
    create_archive_of_files(codeml_zip, codeml_files)

    # Free disk space taken up by intermediate files
    shutil.rmtree(run_dir)

    logging.info("Produced: \n%s\n%s", codeml_zip, dnds_file)
def run_codeml_for_sicos(codeml_dir, genome_ids_a, genome_ids_b, sico_files):
    """Run codeml for representatives of clades A and B in each of the SICO files, to calculate dN/dS.

    Arguments:
    codeml_dir -- directory inside which one sub directory per SICO file is created
    genome_ids_a -- genome ids belonging to clade A, matched against the first '|' separated field of record ids
    genome_ids_b -- genome ids belonging to clade B, matched the same way
    sico_files -- aligned & trimmed SICO fasta files

    Returns the list of codeml result files, in the order of sico_files.
    """
    logging.info('Running codeml for %s aligned and trimmed SICOs', len(sico_files))

    # Sets make the per record membership tests below constant time
    ids_a = set(genome_ids_a)
    ids_b = set(genome_ids_b)

    codeml_files = []
    for sico_file in sico_files:
        # Separate alignments for clade A & clade B genomes
        ali = AlignIO.read(sico_file, 'fasta')
        alignment_a = MultipleSeqAlignment(seqr for seqr in ali if seqr.id.split('|')[0] in ids_a)
        alignment_b = MultipleSeqAlignment(seqr for seqr in ali if seqr.id.split('|')[0] in ids_b)

        # Create sub directory for this run based on sico_file name
        filename = os.path.split(sico_file)[1]
        # Split off everything starting from the first dot. Unlike slicing up to find('.'), this keeps the
        # whole name when it does not contain a dot at all, rather than dropping its last character.
        base_name = filename.split('.', 1)[0]
        sub_dir = create_directory(base_name, inside_dir=codeml_dir)

        # Run codeml for this SICO and remember its output file
        codeml_file = run_codeml(sub_dir, alignment_a, alignment_b)
        codeml_files.append(codeml_file)

    return codeml_files
def _download_genomes_table():
    '''Download the prokaryotes.txt genome table file from the NCBI FTP site, save a local copy and return contents.

    The local copy is only refreshed when it is missing or older than a day; otherwise the cached file is read.
    Returns the full text of the genome table.
    '''
    cache_dir = create_directory('')
    prokaryotes = 'prokaryotes.txt'
    output_file = os.path.join(cache_dir, prokaryotes)

    # Only download when existing file is older than a day
    time_between_downloads = 24 * 60 * 60
    if not os.path.isfile(output_file) or os.path.getmtime(output_file) < time.time() - time_between_downloads:
        ftp = FTP('ftp.ncbi.nlm.nih.gov')
        try:
            # Login to FTP site
            ftp.login(passwd='*****@*****.**')

            # Download ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/prokaryotes.txt
            # NOTE(review): imported locally, presumably to avoid a circular import -- confirm
            from download_taxa_ncbi import _download_genome_file
            _download_genome_file(ftp, '/genomes/GENOME_REPORTS', prokaryotes, cache_dir, datetime.now())
        finally:
            # Always release the connection, also when login or download failed
            ftp.close()

    # Read file and return content
    with open(output_file) as read_handle:
        return read_handle.read()
def coding_regions_per_genome(run_dir, trimmed_sicos):
    """Create a DNA file per genome containing all aligned & trimmed SICO genes as individual genes.

    Arguments:
    run_dir -- directory inside which the 'coding_regions_per_genome' directory is created
    trimmed_sicos -- fasta files of aligned & trimmed SICO genes, whose record ids start with the project id

    Returns the sorted list of coding region files that were written, one per genome.
    """
    concatemer_dir = create_directory('coding_regions_per_genome', inside_dir=run_dir)
    log.info('Creating concatemers from {0} SICOs'.format(len(trimmed_sicos)))

    # Collections both for output files and their write handles, which will be reused for each SICO
    coding_region_files = []
    write_handles = {}

    try:
        # Loop over trimmed sico files to append each sequence to the right concatemer
        for trimmed_sico in trimmed_sicos:
            for seqr in SeqIO.parse(trimmed_sico, 'fasta'):
                # Sample header line: >58191|NC_010067.1|YP_001569097.1|COG4948MR|core
                project_id = seqr.id.split('|')[0]

                # Try to retrieve write handle from dictionary of cached write handles per genome
                write_handle = write_handles.get(project_id)

                # If not found, create & store write handle on demand
                if write_handle is None:
                    # Build up output file path for trimmed SICO genes per genome
                    coding_region_file = os.path.join(concatemer_dir, project_id + '.coding-regions.ffn')
                    coding_region_files.append(coding_region_file)

                    # Open write handle
                    write_handle = open(coding_region_file, mode='w')
                    write_handles[project_id] = write_handle

                # Write sequence record to coding-regions file
                SeqIO.write(seqr, write_handle, 'fasta')
    finally:
        # Close every cached handle, also when parsing or writing failed half way
        for write_handle in write_handles.values():
            write_handle.close()

    log.info('Created %i genome coding regions files', len(coding_region_files))

    return sorted(coding_region_files)
def _step12_mcl(run_dir, mcl_input_file):
    """Cluster the pairs in mcl_input_file with the Markov Cluster Algorithm: http://www.micans.org/mcl/

    Input: mclInput file
    Output: mclOutput file

    Equivalent command line:
    mcl my_orthomcl_dir/mclInput --abc -I 1.5 -o my_orthomcl_dir/mclOutput

    Returns the path of the mcl output file, written inside an 'mcl' directory within run_dir.
    """
    mcl_dir = create_directory('mcl', inside_dir=run_dir)
    mcl_output_file = os.path.join(mcl_dir, 'mclOutput.tsv')

    # Both standard output and standard error of mcl end up in mcl.log
    with open(os.path.join(mcl_dir, 'mcl.log'), mode='w') as log_handle:
        # Let mcl use as many threads as there are CPUs
        thread_count = str(multiprocessing.cpu_count())
        command = [MCL, mcl_input_file, '--abc', '-I', '1.5', '-o', mcl_output_file]
        command.extend(['-te', thread_count])
        log.info('Executing: %s', ' '.join(command))
        check_call(command, stdout=log_handle, stderr=STDOUT)

    return mcl_output_file
def _phipack_for_all_orthologs(run_dir, aligned_files, genome_ids_a, genome_ids_b):
    """Separate aligned fasta files by whether their phylogenetic tree shows evidence of recombination.

    For every ortholog a distance file and a tree file are created in a directory of its own inside run_dir.
    Returns two lists of aligned files: the first without recombination, the second with recombination.
    """
    log.info('Filtering orthologs where phylogenetic trees show evidence of inter-taxon recombination')

    non_recomb = []
    recombined = []
    for ortholog_file in aligned_files:
        # The ortholog specific directory is named after the file, minus its directory and extension
        path_without_extension = os.path.splitext(ortholog_file)[0]
        base_name = os.path.basename(path_without_extension)
        ortholog_dir = create_directory(base_name, inside_dir=run_dir)

        # Distances first, then the tree built from those distances
        distance_file = _run_dna_dist(ortholog_dir, ortholog_file)
        tree_file = _run_neighbor(ortholog_dir, distance_file)

        # Sort the ortholog into the collection matching what its tree shows
        shows_recombination = _tree_shows_recombination(genome_ids_a, genome_ids_b, tree_file)
        destination = recombined if shows_recombination else non_recomb
        destination.append(ortholog_file)

    log.info('%i Orthologs out of %i were filtered out due to recombination, leaving %i non recombined orthologs',
             len(recombined), len(aligned_files), len(non_recomb))
    return non_recomb, recombined
def _align_sicos(run_dir, sico_files):
    """Align all SICO files given as argument, one after the other, and return the resulting alignment files."""
    log.info('Aligning {0} SICO genes using TranslatorX & muscle.'.format(len(sico_files)))

    # _run_translatorx takes its first two arguments as a single tuple
    # NOTE(review): presumably so it can be mapped over a pool of workers; here it is called sequentially
    return [_run_translatorx((run_dir, sico_file)) for sico_file in sico_files]


def _run_translatorx(run_dir_and_sico_file, translation_table=CODON_TABLE_ID):
    """Run TranslatorX to create DNA level alignment file of protein level aligned DNA sequences within sico_file.

    Arguments:
    run_dir_and_sico_file -- tuple of (run_dir, sico_file), passed as a single positional argument
    translation_table -- codon table id handed to TranslatorX through its -c option

    Returns the path of the DNA alignment file created inside 'alignments/<sico name>' within run_dir.
    """
    # Explicit unpacking replaces the Python 2 only tuple parameter; callers still pass one tuple
    run_dir, sico_file = run_dir_and_sico_file

    assert os.path.exists(TRANSLATORX) and os.access(TRANSLATORX, os.X_OK), 'Could not find or run ' + TRANSLATORX

    # Determine output file name
    sico_base = os.path.splitext(os.path.split(sico_file)[1])[0]
    alignment_dir = create_directory('alignments/' + sico_base, inside_dir=run_dir)

    # Created output file
    file_base = os.path.join(alignment_dir, sico_base)
    dna_alignment = file_base + '.nt_ali.fasta'

    # Actually run the TranslatorX program, discarding its output and closing the null device handle afterwards
    command = [TRANSLATORX, '-i', sico_file, '-c', str(translation_table), '-o', file_base]
    with open(os.devnull, 'w') as dev_null:
        check_call(command, stdout=dev_null, stderr=STDOUT)

    assert os.path.isfile(dna_alignment) and 0 < os.path.getsize(dna_alignment), \
        'Alignment file should exist and have some content now: {0}'.format(dna_alignment)
    return dna_alignment
def _step5_orthomcl_adjust_fasta(run_dir, proteome_files, id_field=3):
    """Create OrthoMCL compliant .fasta files, by adjusting the definition lines of each proteome file.

    Usage of the wrapped program:
        orthomclAdjustFasta taxon_code fasta_file id_field

    where:
        taxon_code: a three or four letter unique abbreviation for the taxon
        fasta_file: the input fasta file per proteome
        id_field:   a number indicating what field in the definition line contains the protein ID. Fields are
                    separated by either ' ' or '|'. Any spaces immediately following the '>' are ignored. The
                    first field is 1. For example, in the following definition line, the ID (AP_000668.1) is in
                    field 4: >gi|89106888|ref|AP_000668.1|

    Input file requirements:
        (1) .fasta format
        (2) a unique id is provided for each sequence, and is in the field specified by id_field

    Output file format:
        (1) .fasta format
        (2) definition line is of the form: >taxoncode|unique_protein_id

    The output file is named taxoncode.fasta

    Note: if your input files do not meet the requirements, you can do some simple perl or awk processing of them
    to create the required input files to this program, or the required output files. This program is provided
    as a convenience, but OrthoMCL users are expected to have the scripting skills to provide compliant .fasta
    files.

    EXAMPLE: orthomclSoftware/bin/orthomclAdjustFasta hsa Homo_sapiens.NCBI36.53.pep.all.fa 1

    Returns a tuple of the 'compliant_fasta' directory inside run_dir and the list of adjusted files moved there.
    """
    # Directory that will hold the compliant fasta files
    adjusted_fasta_dir = create_directory('compliant_fasta', inside_dir=run_dir)

    adjusted_fasta_files = []
    for proteome_file in proteome_files:
        # The taxon code is the first part of the header of the first entry, with dots replaced by underscores
        first_record = next(iter(SeqIO.parse(proteome_file, 'fasta')), None)
        if first_record is None:
            taxon_code = None
        else:
            taxon_code = first_record.id.split('|')[0].replace('.', '_')

        # Without a taxon code the proteome file must have been empty
        assert taxon_code, 'Proteome file appears empty: ' + proteome_file

        # Call orthomclAdjustFasta
        command = [ORTHOMCL_ADJUST_FASTA, taxon_code, proteome_file, str(id_field)]
        log.info('Executing: %s', ' '.join(command))
        check_call(command)

        # The result is looked up by its relative name taxon_code.fasta and moved to the compliant directory
        produced_name = taxon_code + '.fasta'
        destination = os.path.join(adjusted_fasta_dir, produced_name)
        shutil.move(produced_name, destination)
        adjusted_fasta_files.append(destination)

    return adjusted_fasta_dir, adjusted_fasta_files
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Parses the options described in the usage text from args, extracts the DNA files from the zip archive into a
    temporary run directory, runs extract_orthologs, appends the ORFans to the heatmap and finally archives or moves
    the produced files to the destination paths given on the command line.

    The temporary run directory is always removed again, also when one of the steps raises an exception.
    """
    usage = """ Usage: extract_orthologs.py --genomes=FILE file with GenBank Project IDs from complete genomes table on each line --dna-zip=FILE zip archive of extracted DNA files --groups=FILE file listing groups of orthologous proteins --require-limiter flag whether extracted core set of genomes should contain the limiter added in OrthoMCL [OPTIONAL] --sico-zip=FILE destination file path for archive of shared single copy orthologous (SICO) genes --muco-zip=FILE destination file path for archive of shared multiple copy orthologous genes --subset-zip=FILE destination file path for archive of variable copy orthologous genes shared for a subset only --stats=FILE destination file path for ortholog statistics file --heatmap=FILE destination file path heatmap of orthologs and occurrences of ortholog per genome --orfans=FILE destination file path ORFans """
    options = ['genomes', 'dna-zip', 'groups', 'require-limiter?', 'sico-zip', 'muco-zip=?', 'subset-zip=?',
               'stats', 'heatmap', 'orfans']
    genome_ids_file, dna_zip, groups_file, require_limiter, \
        target_sico, target_muco, target_subset, target_stats_path, target_heat, target_orfans = \
        parse_options(usage, options, args)

    # Parse file to extract GenBank Project IDs, ignoring comment lines. Blank lines are skipped as well, because
    # line.split()[0] would otherwise raise an IndexError for them.
    with open(genome_ids_file) as read_handle:
        genomes = [line.split()[0] for line in read_handle
                   if line.strip() and not line.startswith('#')]

    # Create temporary directory within which to extract orthologs
    run_dir = tempfile.mkdtemp(prefix='extract_orthologs_run_')
    try:
        # Extract files from zip archive
        temp_dir = create_directory('dna_files', inside_dir=run_dir)
        dna_files = extract_archive_of_files(dna_zip, temp_dir)

        # Actually run ortholog extraction
        sico_files, muco_files, subset_files, stats_file, heatmap_file, orfans_file = \
            extract_orthologs(run_dir, genomes, dna_files, groups_file, require_limiter)

        # Append the orfans to the heatmap file
        _append_orfans_to_heatmap(orfans_file, genomes, heatmap_file)

        # Move produced files to command line specified output paths; muco & subset archives are optional
        create_archive_of_files(target_sico, sico_files)
        if target_muco:
            create_archive_of_files(target_muco, muco_files)
        if target_subset:
            create_archive_of_files(target_subset, subset_files)
        shutil.move(stats_file, target_stats_path)
        shutil.move(heatmap_file, target_heat)
        shutil.move(orfans_file, target_orfans)
    finally:
        # Remove unused files to free disk space, even when one of the steps above failed
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info("Produced:")
    log.info("%s", target_sico)
    if target_muco:
        log.info("%s", target_muco)
    if target_subset:
        log.info("%s", target_subset)
    log.info("%s", target_stats_path)
    log.info("%s", target_heat)
    log.info("%s", target_orfans)
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Parses the options described in the usage text from args, extracts the orthologs from the zip archive into a
    temporary run directory and applies the requested filters: orthologs with multiple COG annotations are filtered
    when a filter-multiple-cogs value was given, and orthologs showing recombination are filtered when a
    filter-recombination value was given. The retained orthologs are archived, after which the post recombination
    filter steps are run.

    The temporary run directory is always removed again, also when one of the steps raises an exception.
    """
    usage = """ Usage: filter_orthologs.py --orthologs-zip=FILE archive of orthologous genes in FASTA format --filter-multiple-cogs filter orthologs with multiple COG annotations among genes [OPTIONAL] --filter-recombination=FILE filter orthologs that show recombination when comparing phylogenetic trees [OPTIONAL] destination file path for archive of recombination orthologs --recombined-crosstable=FILE destination file path for recombined crosstable of GeneIDs, COGs and Products [OPTIONAL] --taxon-a=FILE file with genome IDs for taxon A to use in recombination filtering --taxon-b=FILE file with genome IDs for taxon B to use in recombination filtering --retained-zip=FILE destination file path for archive of retained orthologs after filtering --orthologs-per-genome=FILE destination file path for orthologs split out per genome, based on the retained.zip --concatemer=FILE destination file path for super-concatemer of all genomes """
    options = ('orthologs-zip', 'filter-multiple-cogs=?', 'filter-recombination=?', 'recombined-crosstable=?',
               'taxon-a=?', 'taxon-b=?', 'retained-zip', 'orthologs-per-genome', 'concatemer')
    orthologs_zip, filter_cogs, filter_recombination, recombined_crosstable, \
        taxona, taxonb, retained_zip, target_orth_per_genome, target_concat_file = \
        parse_options(usage, options, args)

    # Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='filter_orthologs_')
    try:
        # Extract files from zip archive
        temp_dir = create_directory('orthologs', inside_dir=run_dir)
        ortholog_files = extract_archive_of_files(orthologs_zip, temp_dir)

        # Filter orthologs with multiple COG annotations among genes if flag was set
        if filter_cogs:
            ortholog_files, transfered_cogs = _filter_multiple_cog_orthologs(run_dir, ortholog_files)

        # Possible extension: filter ortholog when any strain has been flagged as 'mobile element', 'phage' or
        # 'IS element'

        # Filter orthologs that show recombination when comparing phylogenetic trees if flag was set
        if filter_recombination:
            # Parse files to extract GenBank Project IDs. Blank lines are skipped, because line.split()[0] would
            # otherwise raise an IndexError for them.
            with open(taxona) as read_handle:
                genome_ids_a = [line.split()[0] for line in read_handle if line.strip()]
            with open(taxonb) as read_handle:
                genome_ids_b = [line.split()[0] for line in read_handle if line.strip()]
            ortholog_files, recombined_files = _phipack_for_all_orthologs(run_dir, ortholog_files,
                                                                          genome_ids_a, genome_ids_b)
            # Create crosstable
            create_crosstable(recombined_files, recombined_crosstable)

        # Create archives of files on command line specified output paths
        if filter_cogs:
            shutil.move(transfered_cogs, filter_cogs)
        if filter_recombination:
            create_archive_of_files(filter_recombination, recombined_files)
        create_archive_of_files(retained_zip, ortholog_files)

        # Run the steps required after filtering orthologs
        post_recombination_filter(taxona, taxonb, retained_zip, target_orth_per_genome, target_concat_file,
                                  run_dir)
    finally:
        # Remove unused files to free disk space, even when one of the steps above failed
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info('Produced:')
    if filter_cogs:
        log.info(filter_cogs)
    if filter_recombination:
        log.info(filter_recombination)
    log.info(retained_zip)
    log.info(target_orth_per_genome)
    log.info(target_concat_file)
def _step6_orthomcl_filter_fasta(run_dir, input_dir, min_length=10, max_percent_stop=20):
    """Create good_proteins.fasta containing all good proteins and poor_proteins.fasta containing all rejects.

    orthomclFilterFasta is run on input_dir with its combined output captured in 'filter_report.log' inside a
    'filtered_fasta' directory created within run_dir. The 'goodProteins.fasta' and 'poorProteins.fasta' files are
    then moved from the current working directory into that same directory. Any poor records are logged as warnings.

    Returns a tuple of the paths to the good and poor proteins fasta files.
    Raises an AssertionError when the report lists proteomes with more than 10% poor proteins, or when the good
    proteins file is missing or empty.

    OrthoMCL documentation, for reference:

    Input is a directory containing a set of compliant input .fasta files (as produced by orthomclAdjustFasta).

    Usage:
      orthomclFilterFasta input_dir min_length max_percent_stops

    where:
      input_dir:        a directory containing a set of .fasta files
      min_length:       minimum allowed length of proteins. (suggested: 10)
      max_percent_stop: maximum percent stop codons. (suggested 20)

    The input requirements are:
      1) a compliantFasta/ directory which contains all and only the proteome .fasta files, one file per proteome.
      2) each .fasta file must have a name in the form 'xxxx.fasta' where xxxx is a three or four letter unique
         taxon code. For example: hsa.fasta or eco.fasta
      3) each protein in those files must have a definition line in the following format: >xxxx|yyyyyy
         where xxxx is the three or four letter taxon code and yyyyyy is a sequence identifier unique within that
         taxon.

    Output:
      my_orthomcl_dir/goodProteins.fasta
      my_orthomcl_dir/poorProteins.fasta
      report of suspicious proteomes (> 10% poor proteins)

    EXAMPLE: orthomclSoftware/bin/orthomclFilterFasta my_orthomcl_dir/compliantFasta 10 20
    """
    # Run orthomclFilterFasta, capturing both stdout and stderr in the report file
    out_dir = create_directory('filtered_fasta', inside_dir=run_dir)
    report = os.path.join(out_dir, 'filter_report.log')
    with open(report, mode='w') as report_file:
        command = [ORTHOMCL_FILTER_FASTA, input_dir, str(min_length), str(max_percent_stop)]
        log.info('Executing: %s', ' '.join(command))
        check_call(command, stdout=report_file, stderr=STDOUT)

    # Move output files, which are picked up from the current working directory, to the out directory
    good = os.path.join(out_dir, 'good_proteins.fasta')
    poor = os.path.join(out_dir, 'poor_proteins.fasta')
    shutil.move('goodProteins.fasta', good)
    shutil.move('poorProteins.fasta', poor)

    # Ensure neither of the proteomes is suspicious according to min_length & max_percent_stop
    with open(report) as report_file:
        # OrthoMCL does NOT add actual min_length value
        if 'Proteomes with > 10% poor proteins:' in report_file.read():
            msg = 'OrthomclFilterFasta found suspicious proteomes based on values for length'
            log.error(msg)
            # Raise explicitly rather than `assert False`, which would be stripped when running under python -O
            raise AssertionError(msg)

    # Warn the user about the poor proteins found here, if they were found at all
    poor_records = list(SeqIO.parse(poor, 'fasta'))
    if poor_records:
        log.warning('%i poor sequence records identified by orthomclFilterFasta:', len(poor_records))
        for seqr in poor_records:
            log.warning('>%s: %s', seqr.id, seqr.seq)

    # Assert good exists and has some content
    assert os.path.isfile(good) and 0 < os.path.getsize(good), good + ' should exist and have some content'

    # Only good and poor proteins
    return good, poor
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Parses the options described in the usage text from args, extracts the DNA files from the zip archive into a
    temporary run directory, runs extract_orthologs, appends the ORFans to the heatmap and finally archives or moves
    the produced files to the destination paths given on the command line.

    The temporary run directory is always removed again, also when one of the steps raises an exception.
    """
    usage = """ Usage: extract_orthologs.py --genomes=FILE file with GenBank Project IDs from complete genomes table on each line --dna-zip=FILE zip archive of extracted DNA files --groups=FILE file listing groups of orthologous proteins --require-limiter flag whether extracted core set of genomes should contain the limiter added in OrthoMCL [OPTIONAL] --sico-zip=FILE destination file path for archive of shared single copy orthologous (SICO) genes --muco-zip=FILE destination file path for archive of shared multiple copy orthologous genes --subset-zip=FILE destination file path for archive of variable copy orthologous genes shared for a subset only --stats=FILE destination file path for ortholog statistics file --heatmap=FILE destination file path heatmap of orthologs and occurrences of ortholog per genome --orfans=FILE destination file path ORFans """
    options = [
        'genomes', 'dna-zip', 'groups', 'require-limiter?', 'sico-zip',
        'muco-zip=?', 'subset-zip=?', 'stats', 'heatmap', 'orfans'
    ]
    genome_ids_file, dna_zip, groups_file, require_limiter, \
        target_sico, target_muco, target_subset, target_stats_path, target_heat, target_orfans = \
        parse_options(usage, options, args)

    # Parse file to extract GenBank Project IDs, ignoring comment lines. Blank lines are skipped as well, because
    # line.split()[0] would otherwise raise an IndexError for them.
    with open(genome_ids_file) as read_handle:
        genomes = [
            line.split()[0] for line in read_handle
            if line.strip() and not line.startswith('#')
        ]

    # Create temporary directory within which to extract orthologs
    run_dir = tempfile.mkdtemp(prefix='extract_orthologs_run_')
    try:
        # Extract files from zip archive
        temp_dir = create_directory('dna_files', inside_dir=run_dir)
        dna_files = extract_archive_of_files(dna_zip, temp_dir)

        # Actually run ortholog extraction
        sico_files, muco_files, subset_files, stats_file, heatmap_file, orfans_file = \
            extract_orthologs(run_dir, genomes, dna_files, groups_file, require_limiter)

        # Append the orfans to the heatmap file
        _append_orfans_to_heatmap(orfans_file, genomes, heatmap_file)

        # Move produced files to command line specified output paths; muco & subset archives are optional
        create_archive_of_files(target_sico, sico_files)
        if target_muco:
            create_archive_of_files(target_muco, muco_files)
        if target_subset:
            create_archive_of_files(target_subset, subset_files)
        shutil.move(stats_file, target_stats_path)
        shutil.move(heatmap_file, target_heat)
        shutil.move(orfans_file, target_orfans)
    finally:
        # Remove unused files to free disk space, even when one of the steps above failed
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info("Produced:")
    log.info("%s", target_sico)
    if target_muco:
        log.info("%s", target_muco)
    if target_subset:
        log.info("%s", target_subset)
    log.info("%s", target_stats_path)
    log.info("%s", target_heat)
    log.info("%s", target_orfans)
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Parses the options described in the usage text from args, extracts the orthologs from the zip archive into a
    temporary run directory, creates trimmed coding region files and concatemers per genome, and combines those into
    a super concatemer. A tree is then built from the super concatemer, from which the genome IDs of two taxa are
    read. These IDs are written to the taxon A & B files together with their organism names, and the tree is
    visualized.

    The temporary run directory is always removed again, also when one of the steps raises an exception.
    """
    usage = """ Usage: concatenate_orthologs.py --orthologs-zip=FILE archive of orthologous genes in FASTA format --coding-regions=FILE destination file path archive of trimmed orthologous coding regions per genomes --concatemer=FILE destination file path for super-concatemer of all genomes --taxon-a=FILE destination file path for genome IDs for taxon A --taxon-b=FILE destination file path for genome IDs for taxon B --tree=FILE destination file path for tree visualization """
    options = ['orthologs-zip', 'coding-regions', 'concatemer', 'taxon-a', 'taxon-b', 'tree']
    orthologs_zip, target_coding_regions, target_concat_file, target_taxon_a, target_taxon_b, target_tree = \
        parse_options(usage, options, args)

    # Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='concatemer_tree_')
    try:
        # Extract files from zip archive
        temp_dir = create_directory('orthologs', inside_dir=run_dir)
        ortholog_files = extract_archive_of_files(orthologs_zip, temp_dir)

        # Separate out orthologs per genome to create trimmed coding region files per genome
        genome_coding_regions_files = coding_regions_per_genome(run_dir, ortholog_files)
        create_archive_of_files(target_coding_regions, genome_coding_regions_files)

        # Concatenate coding region files per genome
        concatemer_files = concatemer_per_genome(run_dir, genome_coding_regions_files)
        # Create super concatemer
        create_super_concatemer(concatemer_files, target_concat_file)

        # Determine the taxa present in the super concatemer tree by building a phylogenetic tree from genome
        # concatemer and reading genome ids in the two largest clades.
        super_distance_file = _run_dna_dist(run_dir, target_concat_file)
        super_tree_file = _run_neighbor(run_dir, super_distance_file)
        genome_ids_a, genome_ids_b = _read_taxa_from_tree(super_tree_file)

        # Map Project IDs to Organism names
        id_to_name_map = dict((gid, genome['Organism/Name'])
                              for gid, genome in select_genomes_by_ids(genome_ids_a + genome_ids_b).iteritems())

        # Write Project IDs and Organism Names to files, with a fallback to genome_id for external genome
        for target_path, genome_ids in ((target_taxon_a, genome_ids_a), (target_taxon_b, genome_ids_b)):
            with open(target_path, mode='w') as write_handle:
                for genome_id in genome_ids:
                    write_handle.write('{id}\t{name}\n'.format(id=genome_id,
                                                               name=id_to_name_map.get(genome_id, genome_id)))

        # Visualize tree
        visualize_tree(super_tree_file, id_to_name_map, target_tree)
    finally:
        # Remove unused files to free disk space, even when one of the steps above failed
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info('Produced: \n%s\n%s\n%s\n%s\n%s', target_coding_regions, target_concat_file, target_taxon_a,
             target_taxon_b, target_tree)
def download_genome_files(genome, download_log=None, require_ptt=False, refseq_column='Chromosomes/RefSeq'):
    """Download genome .gbk & .ptt files from ncbi ftp and return them per accessioncode in tuples of three.

    genome is a mapping from which the keys 'Assembly Accession', 'FTP Path', 'Modify Date', 'Release Date',
    'Organism/Name' and refseq_column are read; the value under refseq_column is iterated as accessioncodes.
    download_log is an optional path of a file to which a provenance line is appended for this genome, or a
    commented out line when the genome was skipped.
    require_ptt determines whether an accessioncode is skipped when its protein table file is missing.

    Returns a list of (projectid, gbk_file, ptt_file) tuples, in which ptt_file is None when the protein table was
    missing but not required. When refseq_column is 'Chromosomes/RefSeq' this list is extended with whatever
    download_plasmid_files returns for the genome. Returns None when no files were retrieved for any accessioncode.

    FTP permission errors and IOErrors other than the expected 'missing file' and 'empty file' errors are re-raised.
    The FTP connection is closed in all cases.
    """
    logging.debug('Downloading: %s', genome)
    # ftp://ftp.ncbi.nih.gov/genbank/genomes/Bacteria/Sulfolobus_islandicus_M_14_25_uid18871/CP001400.ffn

    # Download using FTP
    ftp = FTP('ftp.ncbi.nlm.nih.gov')
    try:
        ftp.login(passwd='*****@*****.**')

        # Try to find project directory in RefSeq curated listing
        projectid = genome['Assembly Accession']
        folder = '/genomes/ASSEMBLY_BACTERIA/{}'.format(genome['FTP Path'])
        accessioncodes = genome[refseq_column]

        target_dir = create_directory('genomes/' + projectid)

        # Determine last modified date to see if we should redownload the file following changes
        last_change_date = genome['Modify Date'] if genome['Modify Date'] else genome['Release Date']

        # Download .gbk & .ptt files for all genome accessioncodes and append them to this list as tuples
        genome_files = []
        for acc in accessioncodes:
            # Remove version suffixes to accessioncodes, such as NC_0012345.2
            acc = acc.split('.')[0]

            # Try genbank file, which is always required
            try:
                gbk_file = _download_genome_file(ftp, folder, acc + '.gbk', target_dir, last_change_date)

                # Parse the GenBank record to see if it contains any coding sequence features
                features = SeqIO.read(gbk_file, 'genbank').features
                if not any(feature.type == 'CDS' for feature in features):
                    # Skip when genbank file does not contain any coding sequence features
                    logging.warning('GenBank file %s did not contain any coding sequence features', acc)
                    continue
            except error_perm as err:
                if 'No such file or directory' not in str(err):
                    # Unexpected error: re-raise with the original traceback intact
                    raise
                logging.warning(err)
                logging.warning('GenBank file %s missing for %s', acc, projectid)
                continue
            except IOError as err:
                if 'Target file was empty after download' not in str(err):
                    raise
                logging.warning(err)
                continue

            # Try protein table file, which could be optional
            ptt_file = None
            try:
                ptt_file = _download_genome_file(ftp, folder, acc + '.ptt', target_dir, last_change_date)
            except error_perm as err:
                if 'No such file or directory' not in str(err):
                    raise
                logging.warning(err)
                if require_ptt:
                    logging.warning('Protein table file %s missing for %s: Probably no coding sequences',
                                    acc, projectid)
                    continue
                # Otherwise fall through and record the genbank file with ptt_file left at None
            except IOError as err:
                if 'Target file was empty after download' not in str(err):
                    raise
                logging.warning(err)
                continue

            genome_files.append((projectid, gbk_file, ptt_file))
    finally:
        # Be nice and close the connection, also when an unexpected error is propagated to the caller
        ftp.close()

    if not genome_files:
        # Write out commented out line to the logfile detailing this error
        if download_log:
            with open(download_log, mode='a') as append_handle:
                append_handle.write('#{0}\t{1}\t'.format(projectid, genome['Organism/Name']))
                append_handle.write('# Genome skipped because of missing files\n')

        # Return nothing when:
        # - none of the accessioncodes resulted in files
        # - there were no protein table files when they were required
        # - no folder could be found for projectid
        return None

    # Write out provenance logfile with sources of retrieved files
    # This file could coincidentally also serve as genome ID file for extract taxa
    if download_log:
        with open(download_log, mode='a') as append_handle:
            append_handle.write('{0}\t{1}\t{2}{3}\n'.format(projectid, genome['Organism/Name'], ftp.host, folder))

    # Extend with plasmid files, but only when the chromosomes were downloaded
    if refseq_column == 'Chromosomes/RefSeq':
        plasmid_files = download_plasmid_files(genome)
        if plasmid_files:
            genome_files.extend(plasmid_files)

    # Return genome files
    return genome_files
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Parses the options described in the usage text from args, extracts the orthologs from the zip archive into a
    temporary run directory and applies the requested filters: orthologs with multiple COG annotations are filtered
    when a filter-multiple-cogs value was given, and orthologs showing recombination are filtered when a
    filter-recombination value was given. The retained orthologs are archived, after which the post recombination
    filter steps are run.

    The temporary run directory is always removed again, also when one of the steps raises an exception.
    """
    usage = """ Usage: filter_orthologs.py --orthologs-zip=FILE archive of orthologous genes in FASTA format --filter-multiple-cogs filter orthologs with multiple COG annotations among genes [OPTIONAL] --filter-recombination=FILE filter orthologs that show recombination when comparing phylogenetic trees [OPTIONAL] destination file path for archive of recombination orthologs --recombined-crosstable=FILE destination file path for recombined crosstable of GeneIDs, COGs and Products [OPTIONAL] --taxon-a=FILE file with genome IDs for taxon A to use in recombination filtering --taxon-b=FILE file with genome IDs for taxon B to use in recombination filtering --retained-zip=FILE destination file path for archive of retained orthologs after filtering --orthologs-per-genome=FILE destination file path for orthologs split out per genome, based on the retained.zip --concatemer=FILE destination file path for super-concatemer of all genomes """
    options = ('orthologs-zip', 'filter-multiple-cogs=?',
               'filter-recombination=?', 'recombined-crosstable=?',
               'taxon-a=?', 'taxon-b=?', 'retained-zip',
               'orthologs-per-genome', 'concatemer')
    orthologs_zip, filter_cogs, filter_recombination, recombined_crosstable, \
        taxona, taxonb, retained_zip, target_orth_per_genome, target_concat_file = \
        parse_options(usage, options, args)

    # Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='filter_orthologs_')
    try:
        # Extract files from zip archive
        temp_dir = create_directory('orthologs', inside_dir=run_dir)
        ortholog_files = extract_archive_of_files(orthologs_zip, temp_dir)

        # Filter orthologs with multiple COG annotations among genes if flag was set
        if filter_cogs:
            ortholog_files, transfered_cogs = _filter_multiple_cog_orthologs(
                run_dir, ortholog_files)

        # Possible extension: filter ortholog when any strain has been flagged as 'mobile element', 'phage' or
        # 'IS element'

        # Filter orthologs that show recombination when comparing phylogenetic trees if flag was set
        if filter_recombination:
            # Parse files to extract GenBank Project IDs. Blank lines are skipped, because line.split()[0] would
            # otherwise raise an IndexError for them.
            with open(taxona) as read_handle:
                genome_ids_a = [line.split()[0] for line in read_handle if line.strip()]
            with open(taxonb) as read_handle:
                genome_ids_b = [line.split()[0] for line in read_handle if line.strip()]
            ortholog_files, recombined_files = _phipack_for_all_orthologs(
                run_dir, ortholog_files, genome_ids_a, genome_ids_b)
            # Create crosstable
            create_crosstable(recombined_files, recombined_crosstable)

        # Create archives of files on command line specified output paths
        if filter_cogs:
            shutil.move(transfered_cogs, filter_cogs)
        if filter_recombination:
            create_archive_of_files(filter_recombination, recombined_files)
        create_archive_of_files(retained_zip, ortholog_files)

        # Run the steps required after filtering orthologs
        post_recombination_filter(taxona, taxonb, retained_zip,
                                  target_orth_per_genome, target_concat_file,
                                  run_dir)
    finally:
        # Remove unused files to free disk space, even when one of the steps above failed
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info('Produced:')
    if filter_cogs:
        log.info(filter_cogs)
    if filter_recombination:
        log.info(filter_recombination)
    log.info(retained_zip)
    log.info(target_orth_per_genome)
    log.info(target_concat_file)
def _align_sicos(run_dir, sico_files):
    """Align all SICO files given as argument and return the resulting alignment files.

    The files are aligned one after another in the calling process, by handing a (run_dir, sico_file) tuple to
    _run_translatorx for each of them. The returned list of alignment files is in the same order as sico_files.
    """
    log.info('Aligning {0} SICO genes using TranslatorX & muscle.'.format(len(sico_files)))
    # _run_translatorx takes a single tuple argument, which is how it is called here for every SICO file in turn
    return [_run_translatorx((run_dir, sico_file)) for sico_file in sico_files]


def _run_translatorx(run_dir_and_sico_file, translation_table=CODON_TABLE_ID):
    """Run TranslatorX to create DNA level alignment file of protein level aligned DNA sequences within sico_file.

    run_dir_and_sico_file is a (run_dir, sico_file) tuple. It is unpacked inside the function body rather than in
    the signature, as tuple parameters are a syntax error on Python 3; callers pass the same single tuple as before.
    translation_table is passed to TranslatorX through its -c option.

    The output is written to an 'alignments/<sico file base name>' directory created inside run_dir.
    Returns the path of the '.nt_ali.fasta' DNA alignment file.
    Raises an AssertionError when TranslatorX can not be found or executed, or when the alignment file is missing
    or empty after the run.
    """
    run_dir, sico_file = run_dir_and_sico_file

    assert os.path.exists(TRANSLATORX) and os.access(TRANSLATORX, os.X_OK), 'Could not find or run ' + TRANSLATORX

    # Determine output file name
    sico_base = os.path.splitext(os.path.split(sico_file)[1])[0]
    alignment_dir = create_directory('alignments/' + sico_base, inside_dir=run_dir)

    # Created output file
    file_base = os.path.join(alignment_dir, sico_base)
    dna_alignment = file_base + '.nt_ali.fasta'

    # Actually run the TranslatorX program, discarding its output; the handle is closed again once the run is done
    command = [TRANSLATORX, '-i', sico_file, '-c', str(translation_table), '-o', file_base]
    with open(os.devnull, 'w') as devnull:
        check_call(command, stdout=devnull, stderr=STDOUT)

    assert os.path.isfile(dna_alignment) and 0 < os.path.getsize(dna_alignment), \
        'Alignment file should exist and have some content now: {0}'.format(dna_alignment)
    return dna_alignment