def fastani(self, qid, rid, q_gf, r_gf):
    """Calculate ANI between a pair of genomes."""

    # check cache
    if qid in self.ani_cache:
        if rid in self.ani_cache[qid]:
            ani, af = self.ani_cache[qid][rid]
            ani_af = (qid, rid, ani, af)
            return ani_af

    # run fastANI, writing results to a temporary file
    tmp_fastani_file = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    cmd = 'fastANI -q %s -r %s -o %s 2> /dev/null' % (
        q_gf, r_gf, tmp_fastani_file)
    run(cmd)

    if os.path.exists(tmp_fastani_file) and os.stat(tmp_fastani_file).st_size > 0:
        with open(tmp_fastani_file) as f:
            for line in f:
                line_split = line.strip().split()

                ani = float(line_split[2])
                af = float(line_split[3]) / int(line_split[4])
                ani_af = (qid, rid, ani, af)
    else:
        ani_af = (qid, rid, 0.0, 0.0)

    if os.path.exists(tmp_fastani_file):
        os.remove(tmp_fastani_file)

    return ani_af
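# Usage sketch (hypothetical): the method relies on `os`, `tempfile`, `uuid`,
# and the project's `run` helper being available at module level, and on the
# class holding an `ani_cache` dict of the form d[qid][rid] -> (ani, af).
# Genome IDs, file paths, and thresholds below are illustrative only.
#
#   qid, rid, ani, af = ani_calc.fastani(
#       'G000001', 'G000002', '/path/G000001.fna', '/path/G000002.fna')
#   if ani >= 95.0 and af >= 0.65:
#       print(f'{qid} and {rid} may belong to the same species')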
def dist_pairwise(self, min_dist, sketch_file, dist_file):
    """Calculate pairwise Mash distance between genomes."""

    if not os.path.exists(dist_file):
        self.logger.info(
            'Calculating pairwise Mash distances between genomes (d = %.2f).' % min_dist)
        cmd = 'mash dist -p %d -d %f -v %f %s %s > %s 2> /dev/null' % (
            self.cpus, min_dist, 1e-5, sketch_file, sketch_file, dist_file)
        run(cmd)
    else:
        self.logger.warning(
            'Using previously generated pairwise distance file.')
def dist(self, min_dist, ref_sketch_file, query_sketch_file, dist_file):
    """Calculate Mash distance between reference and query genomes."""

    if not os.path.exists(dist_file):
        self.logger.info(
            'Calculating Mash distances between reference and query genomes (d = %.2f).' % min_dist)
        cmd = 'mash dist -p %d -d %f -v %f %s %s > %s 2> /dev/null' % (
            self.cpus, min_dist, 1e-5, ref_sketch_file, query_sketch_file, dist_file)
        run(cmd)
    else:
        self.logger.warning(
            'Using previously generated pairwise distance file.')
def dist_pairwise(self, min_dist, sketch_file, dist_file, silence=False):
    """Calculate pairwise Mash distance between genomes."""

    if not os.path.exists(dist_file):
        if not silence:
            self.logger.info(
                f'Calculating pairwise Mash distances between genomes (d = {min_dist:.2f}).')
        cmd = 'mash dist -p {} -d {} -v {} {} {} > {} 2> /dev/null'.format(
            self.cpus, min_dist, 1e-5, sketch_file, sketch_file, dist_file)
        run(cmd)
    else:
        if not silence:
            self.logger.warning(
                'Using previously generated pairwise distance file.')
def sketch(self, gids, genome_files, genome_list_file, sketch_file):
    """Create Mash sketch for genomes."""

    # create Mash sketch for potential representative genomes
    if not os.path.exists(sketch_file):
        fout = open(genome_list_file, 'w')
        for gid in gids:
            fout.write(genome_files[gid] + '\n')
        fout.close()

        self.logger.info('Creating Mash sketch for %d genomes.' % len(gids))
        cmd = 'mash sketch -l -p %d -k 16 -s 5000 -o %s %s 2> /dev/null' % (
            self.cpus, sketch_file, genome_list_file)
        run(cmd)
    else:
        self.logger.warning('Using previously generated sketch file.')
def dist(self, min_dist, ref_sketch_file, query_sketch_file, dist_file, silence=False):
    """Calculate Mash distance between reference and query genomes."""

    if not os.path.exists(dist_file):
        if not silence:
            self.logger.info(
                'Calculating Mash distances between reference and query genomes (d = %.2f).' % min_dist)
        cmd = 'mash dist -p %d -d %f -v %f %s %s > %s 2> /dev/null' % (
            self.cpus, min_dist, 1e-5, ref_sketch_file, query_sketch_file, dist_file)
        run(cmd)
    else:
        if not silence:
            self.logger.warning(
                'Using previously generated pairwise distance file.')
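# Typical chaining of the Mash helpers (hypothetical instance `mash` and file
# names). `mash dist` writes tab-separated lines of reference-ID, query-ID,
# distance, p-value, and shared-hashes, which downstream code can parse to
# identify candidate genome pairs worth verifying with fastANI:
#
#   mash.sketch(gids, genome_files, 'genomes.lst', 'genomes.msh')
#   mash.dist_pairwise(0.1, 'genomes.msh', 'mash_dist.tsv')
#   with open('mash_dist.tsv') as f:
#       for line in f:
#           rid, qid, d, pvalue, shared_hashes = line.strip().split('\t')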
def create(self, profiles, output_file):
    """Create Krona plot.

    Profiles for multiple items (e.g., genome, metagenome) can be
    specified. The complete hierarchy for each unique element should be
    specified as a semicolon-separated string, e.g.,
        k__Bacteria;c__Firmicutes;...;s__

    The number of hits to each unique element is specified in the
    profiles dictionary, e.g.,
        d[unique_id][element_str] = 10

    Parameters
    ----------
    profiles : d[unique_id][element_str] -> count
        Number of hits to specific elements for each item.
    output_file : str
        Name of output file.
    """

    # create temporary input file for each item
    cmd = 'ktImportText -o %s' % output_file
    tmp_dir = tempfile.mkdtemp()
    for unique_id in alphanumeric_sort(list(profiles.keys())):
        tmp_file = os.path.join(tmp_dir, unique_id)
        fout = open(tmp_file, 'w')
        for element_str, num_hits in profiles[unique_id].items():
            elements = [x.strip() for x in element_str.split(';')]
            fout.write(str(num_hits) + '\t' + '\t'.join(elements) + '\n')
        fout.close()

        cmd += ' %s,%s' % (tmp_file, unique_id)

    # create Krona plot
    execute.run(cmd)

    # clean up temporary files
    shutil.rmtree(tmp_dir)
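# Usage sketch based on the docstring above (hypothetical instance `krona`;
# identifiers, lineages, and counts are illustrative only):
#
#   profiles = {
#       'genome_A': {
#           'd__Bacteria;p__Firmicutes;c__Bacilli': 10,
#           'd__Bacteria;p__Proteobacteria;c__Gammaproteobacteria': 3,
#       },
#       'genome_B': {
#           'd__Archaea;p__Halobacteriota;c__Halobacteria': 7,
#       },
#   }
#   krona.create(profiles, 'profiles.krona.html')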
def run(self, genomes_new_updated_file, qc_passed_file, batch_size):
    """Perform initial classification of new and updated genomes using GTDB-Tk."""

    # get list of genomes passing QC
    self.logger.info('Reading genomes passing QC.')
    gids_pass_qc = read_qc_file(qc_passed_file)
    self.logger.info(f' - identified {len(gids_pass_qc):,} genomes.')

    # get path to genomes passing QC
    self.logger.info(
        'Reading path to genomic file for new/updated genomes passing QC.')
    genomic_files = []
    new_updated_gids = set()
    total_count = 0
    with open(genomes_new_updated_file, encoding='utf-8') as f:
        header = f.readline().strip().split('\t')
        genomic_file_index = header.index('Genomic file')

        for line in f:
            tokens = line.strip().split('\t')

            gid = tokens[0]
            total_count += 1
            if gid in gids_pass_qc:
                gf = tokens[genomic_file_index]
                genomic_files.append((gid, gf))
                new_updated_gids.add(gid)

    self.logger.info(
        f' - identified {len(genomic_files):,} of {total_count:,} genomes as passing QC.')

    # create batch files
    genome_batch_files = []
    batch_dir = os.path.join(self.output_dir, 'genome_batch_files')
    if os.path.exists(batch_dir):
        self.logger.warning(
            f'Using existing genome batch files in {batch_dir}.')
        for f in os.listdir(batch_dir):
            genome_batch_files.append(os.path.join(batch_dir, f))

        # check if there are genomes not already in a batch file. Ideally,
        # this would never happen, but sometimes we process past this step
        # and then identify genomes missing in the database. These need to
        # be put into a batch file for processing.
        missing_gids = set(new_updated_gids)
        last_batch_idx = 0
        for batch_file in os.listdir(batch_dir):
            idx = int(batch_file.split('_')[1].replace('.lst', ''))
            if idx > last_batch_idx:
                last_batch_idx = idx

            with open(os.path.join(batch_dir, batch_file)) as f:
                for line in f:
                    tokens = line.strip().split('\t')
                    missing_gids.discard(tokens[1])

        if len(missing_gids) > 0:
            genome_batch_file = os.path.join(
                batch_dir, f'genomes_{last_batch_idx+1}.lst')
            genome_batch_files.append(genome_batch_file)
            self.logger.info('Added the batch file {} with {:,} genomes.'.format(
                genome_batch_file, len(missing_gids)))

            fout = open(genome_batch_file, 'w')
            for gid, gf in genomic_files:
                if gid in missing_gids:
                    fout.write('{}\t{}\n'.format(gf, gid))
            fout.close()
    else:
        os.makedirs(batch_dir)
        for batch_idx, start in enumerate(range(0, len(genomic_files), batch_size)):
            genome_batch_file = os.path.join(
                batch_dir, f'genomes_{batch_idx}.lst')
            genome_batch_files.append(genome_batch_file)

            fout = open(genome_batch_file, 'w')
            for i in range(start, min(start+batch_size, len(genomic_files))):
                gid, gf = genomic_files[i]
                fout.write('{}\t{}\n'.format(gf, gid))
            fout.close()

    # process genomes with GTDB-Tk in batches
    for genome_batch_file in genome_batch_files:
        batch_idx = ntpath.basename(genome_batch_file).split('_')[1].replace('.lst', '')
        out_dir = os.path.join(self.output_dir, f'gtdbtk_batch{batch_idx}')
        if os.path.exists(out_dir):
            self.logger.warning(
                f'Skipping genome batch {batch_idx} as output directory already exists.')
            continue

        os.makedirs(out_dir)
        cmd = 'gtdbtk classify_wf --cpus {} --force --batchfile {} --out_dir {}'.format(
            self.cpus, genome_batch_file, out_dir)
        print(cmd)
        run(cmd)

    # combine summary files
    fout = open(os.path.join(self.output_dir, 'gtdbtk_classify.tsv'), 'w')
    bHeader = True
    gtdbtk_processed = set()
    for batch_dir in os.listdir(self.output_dir):
        if not batch_dir.startswith('gtdbtk_batch'):
            continue

        batch_dir = os.path.join(self.output_dir, batch_dir)
        ar_summary = os.path.join(batch_dir, 'gtdbtk.ar122.summary.tsv')
        bac_summary = os.path.join(batch_dir, 'gtdbtk.bac120.summary.tsv')
        for summary_file in [ar_summary, bac_summary]:
            with open(summary_file, encoding='utf-8') as f:
                header = f.readline()

                if bHeader:
                    fout.write(header)
                    bHeader = False

                for line in f:
                    tokens = line.strip().split('\t')

                    gid = tokens[0]
                    if gid in new_updated_gids:
                        # Ideally, this shouldn't be necessary, but sometimes
                        # we process past this step and then identify genomes
                        # missing in the database. This can result in GTDB-Tk
                        # having been applied to genomes that looked like they
                        # were "new", but really were just erroneously missing
                        # from the database.
                        fout.write(line)
                        gtdbtk_processed.add(gid)
    fout.close()

    self.logger.info(
        'Identified {:,} genomes as being processed by GTDB-Tk.'.format(len(gtdbtk_processed)))

    skipped_gids = new_updated_gids - gtdbtk_processed
    if len(skipped_gids) > 0:
        self.logger.warning('Identified {:,} genomes as being skipped by GTDB-Tk.'.format(
            len(skipped_gids)))
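# For reference, each batch file written above (and consumed by
# `gtdbtk classify_wf --batchfile`) is a two-column TSV giving the path to a
# genomic FASTA file and the genome ID, e.g. (paths and IDs illustrative only):
#
#   /data/genomes/GCA_000000001.1.fna.gz	GCA_000000001.1
#   /data/genomes/GCA_000000002.1.fna.gz	GCA_000000002.1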