def calculateAlignedMarkerSets(self, db_genome_ids, marker_ids):
    '''Run hmmalign for PFAM and TIGRFAM markers missing from the database.

    :param db_genome_ids: list of genome ids that are used for the tree step
    :param marker_ids: list of marker ids used for the tree building step

    :return: True on completion.
    '''
    self.logger.info('Aligning marker genes not already in the database.')

    # Rebuild the on-disk path to each genome's genes file from the database.
    genome_dirs_query = ("SELECT g.id, g.genes_file_location,gs.external_id_prefix "
                         "FROM genomes g "
                         "LEFT JOIN genome_sources gs ON gs.id = g.genome_source_id "
                         "WHERE g.id in %s")
    self.cur.execute(genome_dirs_query, (tuple(db_genome_ids),))
    genome_dirs = {genome_id: fastaPathGenerator(path, prefix)
                   for genome_id, path, prefix in self.cur.fetchall()}

    # Fan the genomes out over worker processes.
    manager = multiprocessing.Manager()
    out_q = manager.Queue()
    procs = []
    nprocs = self.threads
    for item in splitchunks(genome_dirs, nprocs):
        p = multiprocessing.Process(target=self._hmmWorker,
                                    args=(item, marker_ids, out_q))
        procs.append(p)
        p.start()

    # join() alone is sufficient to wait for all workers.  The previous
    # busy-wait on out_q.empty() was redundant and could hang forever if
    # every worker died before putting anything on the queue.
    for p in procs:
        p.join()

    return True
def calculateAlignedMarkerSets(self, db_genome_ids, marker_ids):
    '''Run hmmalign for PFAM and TIGRFAM markers missing from the database.

    NOTE(review): this duplicates an earlier definition of the same method
    in this file; only the later definition takes effect — consolidate.

    :param db_genome_ids: list of genome ids that are used for the tree step
    :param marker_ids: list of marker ids used for the tree building step

    :return: True on completion.
    '''
    self.logger.info('Aligning marker genes not already in the database.')

    # Rebuild the on-disk path to each genome's genes file from the database.
    genome_dirs_query = ("SELECT g.id, g.genes_file_location,gs.external_id_prefix "
                         "FROM genomes g "
                         "LEFT JOIN genome_sources gs ON gs.id = g.genome_source_id "
                         "WHERE g.id in %s")
    self.cur.execute(genome_dirs_query, (tuple(db_genome_ids),))
    genome_dirs = {genome_id: fastaPathGenerator(path, prefix)
                   for genome_id, path, prefix in self.cur.fetchall()}

    # Fan the genomes out over worker processes.
    manager = multiprocessing.Manager()
    out_q = manager.Queue()
    procs = []
    nprocs = self.threads
    for item in splitchunks(genome_dirs, nprocs):
        p = multiprocessing.Process(target=self._hmmWorker,
                                    args=(item, marker_ids, out_q))
        procs.append(p)
        p.start()

    # join() alone is sufficient to wait for all workers.  The previous
    # busy-wait on out_q.empty() was redundant and could hang forever if
    # every worker died before putting anything on the queue.
    for p in procs:
        p.join()

    return True
def addGenomes(self, checkm_file, batchfile, study_file):
    """Add new genomes to DB.

    Parameters
    ----------
    checkm_file : str
        Name of file containing CheckM results.
    batchfile : str
        Name of file describing genomes to add.
    study_file : str
        Name of file describing study from which genomes were recovered.

    Returns
    -------
    list
        List of database genome identifiers of added genomes.
    """
    try:
        self.tmp_output_dir = tempfile.mkdtemp()

        self.logger.info("Parsing Study file.")
        study_id = self._processStudy(study_file)

        self.logger.info("Reading CheckM file.")
        checkm_results_dict = self._processCheckM(checkm_file)

        genomic_files = self._addGenomeBatch(batchfile, self.tmp_output_dir)

        self.logger.info("Running Prodigal to identify genes.")
        prodigal = Prodigal(self.threads)
        file_paths = prodigal.run(genomic_files)

        self.logger.info("Calculating and storing metadata for each genome.")

        # Separate process reporting progress while the workers run.
        progress_queue = multiprocessing.Queue()
        progress_proc = multiprocessing.Process(
            target=self._progress,
            args=(len(genomic_files), progress_queue))
        progress_proc.start()

        # Reuse a single Manager; the original code created two managers,
        # one of which was bound to an unused variable.
        manager = multiprocessing.Manager()
        out_q = manager.Queue()
        procs = []
        # NOTE(review): splitchunks(..., 1) yields one chunk, so only a
        # single worker is started even though self.threads is available;
        # confirm whether this serialization is intentional.
        for item in splitchunks(genomic_files, 1):
            p = multiprocessing.Process(
                target=self._addGenomesWorker,
                args=(item, file_paths, checkm_results_dict,
                      study_id, out_q, progress_queue))
            procs.append(p)
            p.start()

        # join() alone waits for all workers.  The previous busy-wait on
        # out_q.empty() was redundant and could hang forever if every
        # worker died before putting anything on the queue.
        for p in procs:
            p.join()

        self.logger.info("Waiting for progress process.")
        progress_queue.put(None)  # sentinel telling _progress to stop
        progress_proc.join()

        # annotate genes against TIGRfam and Pfam databases
        self.logger.info("Identifying TIGRfam protein families.")
        gene_files = [file_paths[db_genome_id]["aa_gene_path"]
                      for db_genome_id in genomic_files]
        tigr_search = TigrfamSearch(self.cur, self.currentUser, self.threads)
        tigr_search.run(gene_files)

        self.logger.info("Identifying Pfam protein families.")
        pfam_search = PfamSearch(self.cur, self.currentUser, self.threads)
        pfam_search.run(gene_files)
    except:
        # Bare except is deliberate: remove the temp directory on any
        # failure (including KeyboardInterrupt) and re-raise unchanged.
        if os.path.exists(self.tmp_output_dir):
            shutil.rmtree(self.tmp_output_dir)
        raise

    return genomic_files.keys()
def addGenomes(self, checkm_file, batchfile, study_file):
    """Add new genomes to DB.

    NOTE(review): this duplicates an earlier definition of the same method
    in this file; only the later definition takes effect — consolidate.

    Parameters
    ----------
    checkm_file : str
        Name of file containing CheckM results.
    batchfile : str
        Name of file describing genomes to add.
    study_file : str
        Name of file describing study from which genomes were recovered.

    Returns
    -------
    list
        List of database genome identifiers of added genomes.
    """
    try:
        self.tmp_output_dir = tempfile.mkdtemp()

        self.logger.info("Parsing Study file.")
        study_id = self._processStudy(study_file)

        self.logger.info("Reading CheckM file.")
        checkm_results_dict = self._processCheckM(checkm_file)

        genomic_files = self._addGenomeBatch(batchfile, self.tmp_output_dir)

        self.logger.info("Running Prodigal to identify genes.")
        prodigal = Prodigal(self.threads)
        file_paths = prodigal.run(genomic_files)

        self.logger.info("Calculating and storing metadata for each genome.")

        # Separate process reporting progress while the workers run.
        progress_queue = multiprocessing.Queue()
        progress_proc = multiprocessing.Process(
            target=self._progress,
            args=(len(genomic_files), progress_queue))
        progress_proc.start()

        # Reuse a single Manager; the original code created two managers,
        # one of which was bound to an unused variable.
        manager = multiprocessing.Manager()
        out_q = manager.Queue()
        procs = []
        # NOTE(review): splitchunks(..., 1) yields one chunk, so only a
        # single worker is started even though self.threads is available;
        # confirm whether this serialization is intentional.
        for item in splitchunks(genomic_files, 1):
            p = multiprocessing.Process(
                target=self._addGenomesWorker,
                args=(item, file_paths, checkm_results_dict,
                      study_id, out_q, progress_queue))
            procs.append(p)
            p.start()

        # join() alone waits for all workers.  The previous busy-wait on
        # out_q.empty() was redundant and could hang forever if every
        # worker died before putting anything on the queue.
        for p in procs:
            p.join()

        self.logger.info("Waiting for progress process.")
        progress_queue.put(None)  # sentinel telling _progress to stop
        progress_proc.join()

        # annotate genes against TIGRfam and Pfam databases
        self.logger.info("Identifying TIGRfam protein families.")
        gene_files = [file_paths[db_genome_id]['aa_gene_path']
                      for db_genome_id in genomic_files]
        tigr_search = TigrfamSearch(self.cur, self.currentUser, self.threads)
        tigr_search.run(gene_files)

        self.logger.info("Identifying Pfam protein families.")
        pfam_search = PfamSearch(self.cur, self.currentUser, self.threads)
        pfam_search.run(gene_files)
    except:
        # Bare except is deliberate: remove the temp directory on any
        # failure (including KeyboardInterrupt) and re-raise unchanged.
        if os.path.exists(self.tmp_output_dir):
            shutil.rmtree(self.tmp_output_dir)
        raise

    return genomic_files.keys()