def _parse_fastani_results(self, fastout_file, dict_results):
    """Parse a FastANI output file into a nested result dictionary.

    Every non-blank line is split on whitespace. Column 1 is taken as the
    user genome path, column 2 as the reference genome path, column 3 as
    the ANI, and the alignment fraction (AF) is column 4 divided by
    column 5, rounded to two decimals.

    Parameters
    ----------
    fastout_file : str
        fastani output file.
    dict_results : dict
        Dictionary to update in place; existing entries for the same
        user/reference pair are overwritten.

    Returns
    -------
    dictionary
        dict_results[user_g]={ref_genome1:{"af":af,"ani":ani},ref_genome2:{"af":af,"ani":ani}}
    """
    with open(fastout_file, 'r') as fastfile:
        for line in fastfile:
            info = line.strip().split()
            if not info:
                # Blank line (e.g. trailing newline): nothing to parse.
                continue
            ref_genome = os.path.basename(info[1]).replace(
                Config.FASTANI_GENOMES_EXT, "")
            user_g = remove_extension(os.path.basename(info[0]))
            ani = float(info[2])
            af = round(float(info[3]) / float(info[4]), 2)
            dict_results.setdefault(user_g, {})[ref_genome] = {
                "ani": ani, "af": af}
    return dict_results
def _parse_fastani_results_reverse(self, fastout_file, dict_parser_distance):
    # TODO: Merge _parse_fastani_results and _parse_fastani_results_reverse
    """Parse the fastani output file for the reverse comparison and keep
    the best ANI and AF.

    Here column 1 of each line is the reference genome path and column 2
    the user genome path. If the user/reference pair is already present in
    ``dict_parser_distance``, its ANI and AF are each raised to the new
    value when the new value is larger; otherwise the pair is added.

    Parameters
    ----------
    fastout_file : str
        fastani output file.
    dict_parser_distance : dict
        Dictionary of user genomes vs reference genomes with ANI and AF.
        Updated in place.

    Returns
    -------
    dictionary
        dict_parser_distance[user_g]={ref_genome1:{"af":af,"ani":ani},ref_genome2:{"af":af,"ani":ani}}
    """
    with open(fastout_file) as fastfile:
        for line in fastfile:
            info = line.strip().split()
            if not info:
                # Blank line (e.g. trailing newline): nothing to parse.
                continue
            ref_genome = os.path.basename(info[0]).replace(
                Config.FASTANI_GENOMES_EXT, "")
            user_g = remove_extension(os.path.basename(info[1]))
            ani = float(info[2])
            af = round(float(info[3]) / float(info[4]), 2)

            hits = dict_parser_distance.setdefault(user_g, {})
            if ref_genome not in hits:
                hits[ref_genome] = {"ani": ani, "af": af}
                continue

            # Pair already seen: ANI and AF are maximised independently.
            best = hits[ref_genome]
            if best.get('ani') < ani:
                best["ani"] = ani
            if best.get('af') < af:
                best["af"] = af
    return dict_parser_distance
def _genomes_to_process(self, genome_dir, batchfile, extension):
    """Collect the genomes to process from a directory or a batchfile.

    Parameters
    ----------
    genome_dir : str
        Directory containing genomes. Takes precedence over `batchfile`.
    batchfile : str
        File describing genomes; parsed by `Batchfile`.
    extension : str
        Extension of files to process (directory mode only).

    Returns
    -------
    genomic_files : d[genome_id] -> FASTA file
        Map of genomes to their genomic FASTA files.
    tln_tables : dict
        Empty in directory mode; `Batchfile.genome_tln` in batchfile mode.

    Raises
    ------
    GTDBTkExit
        If a path is not a file, no genomes were found, or a genome ID
        matches an ID returned by `get_reference_ids`.
    """
    genomic_files = dict()
    tln_tables = dict()

    if genome_dir:
        for file_name in os.listdir(genome_dir):
            if not file_name.endswith(extension):
                continue
            gid = remove_extension(file_name, extension)
            genomic_files[gid] = os.path.join(genome_dir, file_name)
    elif batchfile:
        parsed = Batchfile(batchfile)
        genomic_files, tln_tables = parsed.genome_path, parsed.genome_tln

    # Check that all of the genome IDs are valid.
    for gid in genomic_files:
        self._verify_genome_id(gid)

    # Gather every (genome ID, path) pair whose path is not a file.
    missing = [(gid, path) for gid, path in genomic_files.items()
               if not os.path.isfile(path)]

    # Report on any invalid paths (written as "<path>\t<genome ID>").
    if missing:
        n_missing = len(missing)
        self.warnings.info(f'Reading from batchfile: {batchfile}')
        self.warnings.error(f'The following {n_missing} genomes '
                            f'have invalid paths specified in the batchfile:')
        for gid, path in missing:
            self.warnings.info(f'{path}\t{gid}')
        raise GTDBTkExit(f'There are {n_missing} paths in the '
                         f'batchfile which do not exist, see gtdb.warnings.log')

    if not genomic_files:
        if genome_dir:
            self.logger.error('No genomes found in directory: %s. Check '
                              'the --extension flag used to identify '
                              'genomes.' % genome_dir)
        else:
            self.logger.error('No genomes found in batch file: %s. Please '
                              'check the format of this file.' % batchfile)
        raise GTDBTkExit

    # User genome IDs must not collide with reference genome IDs.
    clashing = set(genomic_files).intersection(get_reference_ids())
    if clashing:
        n_clashing = len(clashing)
        self.warnings.info(f'The following {n_clashing} have the '
                           f'same ID as GTDB-Tk reference genomes:')
        for gid in sorted(clashing):
            self.warnings.info(gid)
        raise GTDBTkExit(f'You have {n_clashing} genomes with the '
                         f'same id as GTDB-Tk reference genomes, please '
                         f'rename them. See gtdb.warnings.log.')

    return genomic_files, tln_tables
def _genomes_to_process(self, genome_dir, batchfile, extension):
    """Get genomes to process.

    Parameters
    ----------
    genome_dir : str
        Directory containing genomes. Takes precedence over `batchfile`.
    batchfile : str
        Tab-separated file with exactly two columns per non-blank line:
        genome file path, then genome ID.
    extension : str
        Extension of files to process (directory mode only).

    Returns
    -------
    genomic_files : d[genome_id] -> FASTA file
        Map of genomes to their genomic FASTA files.

    Raises
    ------
    GenomeBatchfileMalformed
        If a batchfile line does not have exactly two columns.
    GTDBTkExit
        If a genome ID is duplicated or starts with a reserved prefix, a
        path is not a file, or no genomes were found.
    """
    genomic_files = {}
    if genome_dir:
        for f in os.listdir(genome_dir):
            if f.endswith(extension):
                genome_id = remove_extension(f)
                genomic_files[genome_id] = os.path.join(genome_dir, f)

    elif batchfile:
        # Paths seen so far; a set keeps the duplicate-file check O(1) per
        # line instead of scanning every stored value.
        seen_files = set()
        with open(batchfile, "r") as fh:
            for line_no, line in enumerate(fh, start=1):
                line_split = line.strip().split("\t")
                if line_split[0] == '':
                    continue  # blank line

                if len(line_split) != 2:
                    self.logger.error(
                        'Batch file must contain exactly 2 columns.')
                    raise GenomeBatchfileMalformed

                genome_file, genome_id = line_split
                self._verify_genome_id(genome_id)

                if not genome_file:
                    raise GTDBTkExit('Missing genome file on line %d.'
                                     % line_no)
                elif not genome_id:
                    raise GTDBTkExit('Missing genome ID on line %d.'
                                     % line_no)
                elif genome_id in genomic_files:
                    raise GTDBTkExit(
                        'Genome ID %s appears multiple times.' % genome_id)

                if genome_file in seen_files:
                    self.logger.warning(
                        'Genome file appears multiple times: %s'
                        % genome_file)
                seen_files.add(genome_file)

                genomic_files[genome_id] = genome_file

    # Check that the prefix is valid and the path exists
    reserved_prefixes = ("RS_", "GB_", "UBA")
    invalid_paths = list()
    for genome_key, genome_path in genomic_files.items():
        if genome_key.startswith(reserved_prefixes):
            self.logger.error(
                "Submitted genomes start with the same prefix"
                " (RS_,GB_,UBA) as reference genomes in"
                " GTDB-Tk. This will cause issues for"
                " downstream analysis.")
            raise GTDBTkExit

        if not os.path.isfile(genome_path):
            invalid_paths.append((genome_key, genome_path))

    # Report on any invalid paths (written as "<path>\t<genome ID>").
    if invalid_paths:
        self.warnings.info(f'Reading from batchfile: {batchfile}')
        self.warnings.error(
            f'The following {len(invalid_paths)} genomes '
            f'have invalid paths specified in the batchfile:')
        for genome_key, genome_path in invalid_paths:
            self.warnings.info(f'{genome_path}\t{genome_key}')
        raise GTDBTkExit(
            f'There are {len(invalid_paths)} paths in the '
            f'batchfile which do not exist, see gtdb.warnings.log')

    if not genomic_files:
        if genome_dir:
            self.logger.error('No genomes found in directory: %s. Check '
                              'the --extension flag used to identify '
                              'genomes.' % genome_dir)
        else:
            self.logger.error('No genomes found in batch file: %s. Please '
                              'check the format of this file.' % batchfile)
        raise GTDBTkExit

    return genomic_files
def _producer(self, genome_file):
    """Apply prodigal to genome with most suitable translation table.

    When ``self.called_genes`` is set, the input is copied as-is to the
    amino-acid gene file and prodigal is not run. Otherwise prodigal is run
    once per candidate translation table (``self.translation_table`` if
    set, else 4 and 11) in a temporary directory, the best table is
    selected, and that table's outputs are copied to ``self.output_dir``.

    Parameters
    ----------
    genome_file : str
        Fasta file for genome.

    Returns
    -------
    tuple or None
        ``(genome_id, aa_gene_file, nt_gene_file, gff_file,
        best_translation_table, coding_density_4, coding_density_11,
        prob_4, prob_11)``. Densities/probabilities that were not computed
        are -1. ``None`` is returned if the genome is empty, prodigal exits
        non-zero, or no genes are called.
    """
    genome_id = remove_extension(genome_file)

    aa_gene_file = os.path.join(self.output_dir, genome_id + '_genes.faa')
    nt_gene_file = os.path.join(self.output_dir, genome_id + '_genes.fna')
    gff_file = os.path.join(self.output_dir, genome_id + '.gff')

    best_translation_table = -1
    table_coding_density = {4: -1, 11: -1}
    table_prob = {4: -1, 11: -1}

    if self.called_genes:
        # Copy without a shell so paths containing spaces or shell
        # metacharacters are handled correctly.
        shutil.copyfile(os.path.abspath(genome_file), aa_gene_file)
    else:
        seqs = read_fasta(genome_file)

        if len(seqs) == 0:
            self.logger.warning(
                'Cannot call Prodigal on an empty genome. Skipped: {}'.
                format(genome_file))
            return None

        # determine number of bases
        total_bases = sum(len(seq) for seq in seqs.values())

        # call genes under different translation tables
        if self.translation_table:
            translation_tables = [self.translation_table]
        else:
            translation_tables = [4, 11]

        # check if there are sufficient bases to calculate prodigal parameters
        if total_bases < 100000 or self.meta:
            proc_str = 'meta'  # use best precalculated parameters
        else:
            proc_str = 'single'  # estimate parameters from data

        tmp_dir = tempfile.mkdtemp()
        try:
            # If this is a gzipped genome, re-write the uncompressed genome
            # file to disk (once; it is the same for every table).
            prodigal_input = genome_file
            if genome_file.endswith('.gz'):
                prodigal_input = os.path.join(
                    tmp_dir, os.path.basename(genome_file[0:-3]) + '.fna')
                write_fasta(seqs, prodigal_input)

            translation_table_gffs = dict()
            tln_table_stats = dict()
            for translation_table in translation_tables:
                table_dir = os.path.join(tmp_dir, str(translation_table))
                os.makedirs(table_dir)
                aa_gene_file_tmp = os.path.join(table_dir,
                                                genome_id + '_genes.faa')
                nt_gene_file_tmp = os.path.join(table_dir,
                                                genome_id + '_genes.fna')

                args = [
                    'prodigal', '-m', '-p', proc_str, '-q', '-f', 'gff',
                    '-g', str(translation_table), '-a', aa_gene_file_tmp,
                    '-d', nt_gene_file_tmp, '-i', prodigal_input
                ]
                if self.closed_ends:
                    args.append('-c')

                self.logger.debug('{}: {}'.format(genome_id,
                                                  ' '.join(args)))

                # stderr is merged into stdout; the GFF is read from stdout.
                proc = subprocess.Popen(args,
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.STDOUT)
                gff_stdout, _ = proc.communicate()
                translation_table_gffs[translation_table] = gff_stdout

                if proc.returncode != 0:
                    self.logger.warning(
                        'Prodigal returned a non-zero exit code while processing: {}'
                        .format(genome_file))
                    return None

                # determine coding density
                prodigal_parser = ProdigalGeneFeatureParser(gff_stdout)

                # Skip if no genes were called.
                if prodigal_parser.n_sequences_processed() == 0:
                    self.logger.warning(
                        'No genes were called! Check the quality of your genome. Skipped: {}'
                        .format(genome_file))
                    return None

                # Save the statistics for this translation table
                prodigal_stats = prodigal_parser.generate_statistics()
                tln_table_stats[translation_table] = prodigal_stats
                table_coding_density[
                    translation_table] = prodigal_stats.coding_density

            # determine best translation table
            if not self.translation_table:

                # Logistic classifier coefficients
                b0 = 12.363017423768538
                bi = np.array([
                    0.01212327382066545, -0.9250857181041326,
                    -0.10176647009345675, 0.7733711446656522,
                    0.6355731038236031, -0.1631355971443377,
                    -0.14713264317198863, -0.10320909026025472,
                    0.09621494439016824, 0.4992209080695785,
                    1.159933669041023, -0.0507139271834123,
                    1.2619603455217179, 0.24392226222721214,
                    -0.08567859197118802, -0.18759562346413916,
                    0.13136209122186523, -0.1399459561138417,
                    2.08086235029142, 0.6917662070950119
                ])

                # Scale x
                scaler_mean = np.array([
                    0.0027036907781622732, -1.8082140490218692,
                    -8.511942254988097e-08, 19.413811775420918,
                    12.08719100126732, 249.89521467118365,
                    0.0011868456444391487, -0.0007358432829349235,
                    0.004750880986023392, -0.04096159411654551,
                    -0.12505492579693805, -0.03749033894554058,
                    0.13053986993752234, -0.15914556336256136,
                    -0.6075506034967058, 0.06704648371665446,
                    0.04316693333324335, 0.26905236546875266,
                    0.010326462563249823, 333.3320678912514
                ])
                scaler_scale = np.array([
                    0.08442772272873166, 2.043313786484819,
                    2.917510891467501e-05, 22.577812640992242,
                    12.246767248868036, 368.87834547339907,
                    0.0014166252200216657, 0.0014582164250905056,
                    0.025127203671053467, 0.5095427815162036,
                    0.2813128128116135, 0.2559877920464989,
                    1.274371529860827, 0.7314782174742842,
                    1.6885750374356985, 0.17019369029012987,
                    0.15376309021975043, 0.583965556283342,
                    0.025076680822882474, 544.3648797867784
                ])
                # Features are the table-11 statistics minus the table-4
                # statistics, standardised with the constants above.
                xi = np.array(tln_table_stats[11]) - np.array(
                    tln_table_stats[4])
                xi -= scaler_mean
                xi /= scaler_scale

                # If xi are all 0, then P(11) = 1.
                prob_tbl_11 = 1 / (1 + np.exp(-1 * (b0 + (bi * xi).sum())))
                best_translation_table = 11 if prob_tbl_11 >= 0.5 else 4
                table_prob[4] = 1.0 - prob_tbl_11
                table_prob[11] = prob_tbl_11

            else:
                best_translation_table = self.translation_table

            best_dir = os.path.join(tmp_dir, str(best_translation_table))
            shutil.copyfile(
                os.path.join(best_dir, genome_id + '_genes.faa'),
                aa_gene_file)
            shutil.copyfile(
                os.path.join(best_dir, genome_id + '_genes.fna'),
                nt_gene_file)

            # Popen was not opened in text mode, so the captured output is
            # bytes; pick the file mode that matches the data.
            gff_data = translation_table_gffs[best_translation_table]
            gff_mode = 'wb' if isinstance(gff_data, bytes) else 'w'
            with open(gff_file, gff_mode) as f:
                f.write(gff_data)
        finally:
            # clean up temporary files on every exit path
            shutil.rmtree(tmp_dir)

    return (genome_id, aa_gene_file, nt_gene_file, gff_file,
            best_translation_table, table_coding_density[4],
            table_coding_density[11], table_prob[4], table_prob[11])
def _genomes_to_process(self, genome_dir, batchfile, extension):
    """Get genomes to process.

    Parameters
    ----------
    genome_dir : str
        Directory containing genomes. Takes precedence over `batchfile`.
    batchfile : str
        Tab-separated file with exactly two columns per non-blank line:
        genome file path, then genome ID.
    extension : str
        Extension of files to process (directory mode only).

    Returns
    -------
    genomic_files : d[genome_id] -> FASTA file
        Map of genomes to their genomic FASTA files, in discovery order.

    Raises
    ------
    GenomeBatchfileMalformed
        If a batchfile line is malformed or a genome ID is duplicated.
    GenomeNameInvalid
        If a genome ID starts with RS_, GB_ or UBA.
    NoGenomesFound
        If no genomes were found.
    """
    genomic_files = OrderedDict()

    if genome_dir:
        self.logger.debug(
            'Looking for genomes with extension *.{} in: {}'.format(
                extension, genome_dir))
        for f in os.listdir(genome_dir):
            if f.endswith(extension):
                genome_id = remove_extension(f)
                genomic_files[genome_id] = os.path.join(genome_dir, f)
                self.logger.debug('Found genome: {}'.format(genome_id))

    elif batchfile:
        self.logger.debug(
            'Using genomes specified in: {}'.format(batchfile))
        # Paths seen so far; a set keeps the duplicate-file check O(1) per
        # line instead of scanning every stored value.
        seen_files = set()
        with open(batchfile, 'r') as f:
            for line_no, line in enumerate(f, start=1):
                line_split = line.strip().split('\t')
                if line_split[0] == '':
                    continue  # blank line

                if len(line_split) != 2:
                    self.logger.error(
                        'Batch file must contain exactly 2 columns.')
                    raise GenomeBatchfileMalformed

                genome_file, genome_id = line_split
                self._assert_genome_id_valid(genome_id)

                if not genome_file:
                    self.logger.error('Missing genome file on line %d.'
                                      % line_no)
                    raise GenomeBatchfileMalformed
                elif not genome_id:
                    self.logger.error('Missing genome ID on line %d.'
                                      % line_no)
                    raise GenomeBatchfileMalformed
                elif genome_id in genomic_files:
                    self.logger.error(
                        'Genome ID %s appear multiple times.' % genome_id)
                    raise GenomeBatchfileMalformed

                if genome_file in seen_files:
                    self.logger.warning(
                        'Genome file appears multiple times: %s'
                        % genome_file)
                seen_files.add(genome_file)

                genomic_files[genome_id] = genome_file
                self.logger.debug('Found genome {} at: {}'.format(
                    genome_id, genome_file))

    # Iterate the dict directly: dict.iterkeys() does not exist on Python 3.
    reserved_prefixes = ("RS_", "GB_", "UBA")
    for genome_key in genomic_files:
        if genome_key.startswith(reserved_prefixes):
            self.logger.error(
                "Submitted genomes start with the same prefix (RS_,GB_,UBA) as "
                "reference genomes in GTDB-Tk. This will cause issues for "
                "downstream analysis.")
            raise GenomeNameInvalid

    if len(genomic_files) == 0:
        if genome_dir:
            self.logger.error(
                'No genomes found in directory: %s. Check the --extension flag used to identify '
                'genomes.' % genome_dir)
        else:
            self.logger.error(
                'No genomes found in batch file: %s. Please check the format of this file.'
                % batchfile)
        raise NoGenomesFound

    return genomic_files
def _genomes_to_process(self, genome_dir, batchfile, extension):
    """Get genomes to process.

    Parameters
    ----------
    genome_dir : str
        Directory containing genomes. Takes precedence over `batchfile`.
    batchfile : str
        Tab-separated file with two or three columns per non-blank line:
        genome file path, genome ID and, optionally, the translation
        table to use (4 or 11).
    extension : str
        Extension of files to process (directory mode only).

    Returns
    -------
    genomic_files : d[genome_id] -> FASTA file
        Map of genomes to their genomic FASTA files.
    tln_tables : d[genome_id] -> int
        Translation table for each genome that specified one in the
        batchfile; empty in directory mode.

    Raises
    ------
    GTDBTkExit
        If the batchfile is malformed, a genome ID is duplicated, a path
        is not a file, no genomes were found, or a genome ID matches an ID
        returned by `get_reference_ids`.
    """
    genomic_files, tln_tables = dict(), dict()
    if genome_dir:
        for f in os.listdir(genome_dir):
            if f.endswith(extension):
                genome_id = remove_extension(f, extension)
                genomic_files[genome_id] = os.path.join(genome_dir, f)

    elif batchfile:
        # Paths seen so far; a set keeps the duplicate-file check O(1) per
        # line instead of scanning every stored value.
        seen_files = set()
        with open(batchfile, "r") as fh:
            for line_no, line in enumerate(fh, start=1):
                line_split = line.strip().split("\t")
                if line_split[0] == '':
                    continue  # blank line

                if len(line_split) not in {2, 3}:
                    raise GTDBTkExit('Batch file must contain either 2 '
                                     'columns (detect translation table), '
                                     'or 3 (specify translation table).')

                if len(line_split) == 2:
                    genome_file, genome_id = line_split
                else:
                    genome_file, genome_id, tln_table = line_split
                    if tln_table not in {'4', '11'}:
                        raise GTDBTkExit(
                            'Specified translation table must '
                            'be either 4, or 11.')
                    tln_tables[genome_id] = int(tln_table)

                self._verify_genome_id(genome_id)

                if not genome_file:
                    raise GTDBTkExit('Missing genome file on line %d.'
                                     % line_no)
                elif not genome_id:
                    raise GTDBTkExit('Missing genome ID on line %d.'
                                     % line_no)
                elif genome_id in genomic_files:
                    raise GTDBTkExit(
                        'Genome ID %s appears multiple times.' % genome_id)

                if genome_file in seen_files:
                    self.logger.warning(
                        'Genome file appears multiple times: %s'
                        % genome_file)
                seen_files.add(genome_file)

                genomic_files[genome_id] = genome_file

    # Check that the path exists
    invalid_paths = [(genome_key, genome_path)
                     for genome_key, genome_path in genomic_files.items()
                     if not os.path.isfile(genome_path)]

    # Report on any invalid paths (written as "<path>\t<genome ID>").
    if invalid_paths:
        self.warnings.info(f'Reading from batchfile: {batchfile}')
        self.warnings.error(
            f'The following {len(invalid_paths)} genomes '
            f'have invalid paths specified in the batchfile:')
        for genome_key, genome_path in invalid_paths:
            self.warnings.info(f'{genome_path}\t{genome_key}')
        raise GTDBTkExit(
            f'There are {len(invalid_paths)} paths in the '
            f'batchfile which do not exist, see gtdb.warnings.log')

    if not genomic_files:
        if genome_dir:
            self.logger.error('No genomes found in directory: %s. Check '
                              'the --extension flag used to identify '
                              'genomes.' % genome_dir)
        else:
            self.logger.error('No genomes found in batch file: %s. Please '
                              'check the format of this file.' % batchfile)
        raise GTDBTkExit

    # User genome IDs must not collide with reference genome IDs.
    invalid_genomes = set(genomic_files.keys()) & set(get_reference_ids())
    if invalid_genomes:
        self.warnings.info(
            f'The following {len(invalid_genomes)} have the '
            f'same ID as GTDB-Tk reference genomes:')
        for invalid_genome in sorted(invalid_genomes):
            self.warnings.info(invalid_genome)
        raise GTDBTkExit(
            f'You have {len(invalid_genomes)} genomes with the '
            f'same id as GTDB-Tk reference genomes, please '
            f'rename them. See gtdb.warnings.log.')

    return genomic_files, tln_tables