def profiling_api(batch_id, database, occr_level):
    """Run cgMLST profiling for one batch, render its dendrogram in several
    formats, and record both artifacts in the ``profiling`` database.

    batch_id   -- job identifier; also names the input/output sub-directories
    database   -- pan-genome database to profile against
    occr_level -- locus-occurrence threshold forwarded to the profiler
    """
    input_dir = os.path.join(INDIR, batch_id)
    files.create_if_not_exist(OUTDIR)
    output_dir = os.path.join(OUTDIR, batch_id)
    files.create_if_not_exist(output_dir)

    # Profile first; timestamp is taken right after so it reflects completion.
    wgmlst.profiling(output_dir, input_dir, database, occr_level=occr_level, threads=2)
    profile_created = datetime.datetime.now()

    # Map the renamed query contigs back to their user-facing names.
    with open(os.path.join(output_dir, "namemap.json"), "r") as fh:
        names = json.load(fh)

    # Give the raw profiler output a descriptive, batch-specific filename.
    profile_filename = os.path.join(output_dir, "cgMLST_{}_{}_{}.tsv".format(database, occr_level, batch_id[0:8]))
    os.rename(os.path.join(output_dir, "wgmlst.tsv"), profile_filename)

    tree = phylotree.Dendrogram()
    tree.make_tree(profile_filename, names)
    dendro_created = datetime.datetime.now()

    # Export the dendrogram as newick plus three rendered image formats.
    newick_filename = os.path.join(output_dir, "dendrogram_{}.newick".format(batch_id[0:8]))
    tree.to_newick(newick_filename)
    pdf_filename = os.path.join(output_dir, "dendrogram_{}.pdf".format(batch_id[0:8]))
    tree.scipy_tree(pdf_filename)
    svg_filename = os.path.join(output_dir, "dendrogram_{}.svg".format(batch_id[0:8]))
    tree.scipy_tree(svg_filename)
    png_filename = os.path.join(output_dir, "dendrogram_{}.png".format(batch_id[0:8]))
    tree.scipy_tree(png_filename)

    # Persist metadata for the profile row, then the dendrogram row.
    sql = "INSERT INTO profile (id,created,file,occurrence,database) VALUES(%s,%s,%s,%s,%s);"
    data = (batch_id, profile_created, profile_filename, occr_level, database)
    db.to_sql(sql, data, database="profiling")

    sql = "INSERT INTO dendrogram (id,created,png_file,pdf_file,svg_file,newick_file) VALUES(%s,%s,%s,%s,%s,%s);"
    data = (batch_id, dendro_created, png_filename, pdf_filename, svg_filename, newick_filename)
    db.to_sql(sql, data, database="profiling")
def make_profile(self):
    """Take the directory from profileSelectText and run wgMLST profiling
    against the pan-genome database selected in joblist_2.

    Disables the relevant widgets for the duration of the run and re-enables
    them afterwards. Logs go both to the GUI log box and to a per-job file.
    """
    switch_widgets = [self.profileSelector, self.profileSelectText, self.joblist_2, self.runButton]
    self.disable(switch_widgets)

    # setup paths
    option = str(self.joblist_2.currentItem().text())
    database_dir = files.joinpath(self.data_dir, JobType.PGDB.to_str(), option, "DB")
    profile_dir = str(self.profileSelectText.toPlainText())
    query_dir = files.joinpath(self.data_dir, JobType.WGMLST.to_str())
    files.create_if_not_exist(query_dir)

    # create new job
    self.jobmgr = JobManager(self.data_dir)
    jobid, job_dir = self.jobmgr.start_job(JobType.WGMLST)
    self.jobmgr.close()

    # setup logger
    factory = LoggerFactory()
    factory.addLogBoxHandler(self.logbox_2)  # TODO: bug -- crash after Worker is done.
    factory.addFileHandler(files.joinpath(query_dir, "log_" + jobid + ".txt"))
    logger = factory.create()

    # process algorithms
    # BUG FIX: profiling()'s 4th positional parameter is `threads`, not
    # `logger` -- passing the logger positionally handed a Logger object to
    # the thread-count argument (and left logger=None). Pass both by keyword;
    # threads=2 matches the value used by the batch API caller.
    wgmlst.profiling(job_dir, profile_dir, database_dir, threads=2, logger=logger)
    self.enable(switch_widgets)
def make_database(self):
    """Take files from fileSelectText and build a pan-genome database.

    Widgets involved in the run are disabled while the job executes and
    re-enabled once both pipeline stages finish.
    """
    switch_widgets = [self.fileSubmitter, self.fileSelectText, self.fileSelector]
    self.disable(switch_widgets)

    # Resolve the input directory chosen in the GUI and the PGDB root.
    source_dir = str(self.fileSelectText.toPlainText())
    database_dir = files.joinpath(self.data_dir, JobType.PGDB.to_str())
    files.create_if_not_exist(database_dir)

    # Register a new PGDB job; the manager hands back its id and workspace.
    self.jobmgr = JobManager(self.data_dir)
    jobid, job_dir = self.jobmgr.start_job(JobType.PGDB)
    self.jobmgr.close()

    # Log to both the GUI box and a per-job file.
    factory = LoggerFactory()
    factory.addLogBoxHandler(self.logbox_1)  # TODO: bug -- crash after Worker is done.
    factory.addFileHandler(files.joinpath(database_dir, "log_" + jobid + ".txt"))
    logger = factory.create()

    # Two-stage pipeline: annotate the contigs, then build the database.
    pgdb.annotate_configs(source_dir, job_dir, logger=logger)
    pgdb.make_database(job_dir, logger=logger)
    self.enable(switch_widgets)
def annotate_configs(input_dir, output_dir, logger=None, threads=8, use_docker=True):
    """Format input contigs, annotate them with prokka, and lay out the
    per-stage directories (Genomes/Annotated/FFN/GFF) under ``output_dir``.

    input_dir  -- directory of raw contig files
    output_dir -- job workspace; sub-directories are created as needed
    logger     -- optional logger; a console logger is created when omitted
    threads    -- worker budget; prokka jobs each get roughly two threads
    use_docker -- run prokka via docker instead of local shell commands
    """
    if not logger:
        logger = logs.console_logger(__name__)

    logger.info("Formating contigs...")
    filenames = parse_filenames(input_dir)

    genome_dir = files.joinpath(output_dir, "Genomes")
    files.create_if_not_exist(genome_dir)
    namemap = format_contigs(filenames, input_dir, genome_dir)
    # Persist the original-name -> sanitized-name mapping for later stages.
    with open(files.joinpath(output_dir, "namemap.json"), "w") as f:
        f.write(json.dumps(namemap))

    logger.info("Annotating...")
    annotate_dir = files.joinpath(output_dir, "Annotated")
    files.create_if_not_exist(annotate_dir)
    if use_docker:
        docker.prokka(genome_dir, annotate_dir)
    else:
        c = [cmds.form_prokka_cmd(x, genome_dir, annotate_dir) for x in namemap.values()]
        # BUG FIX: int(threads / 2) is 0 when threads == 1, and
        # ProcessPoolExecutor(0) raises ValueError. Use integer division and
        # clamp to at least one worker.
        with ProcessPoolExecutor(max(1, threads // 2)) as executor:
            executor.map(os.system, c)

    logger.info("Moving protein CDS (.ffn) files...")
    ffn_dir = files.joinpath(output_dir, "FFN")
    files.create_if_not_exist(ffn_dir)
    move_file(annotate_dir, ffn_dir, ".ffn")

    logger.info("Moving annotation (.gff) files...")
    gff_dir = files.joinpath(output_dir, "GFF")
    files.create_if_not_exist(gff_dir)
    move_file(annotate_dir, gff_dir, ".gff")

    logger.info("Creating nonCDS.json...")
    create_noncds(output_dir, gff_dir)
def profiling(output_dir, input_dir, database, threads, occr_level=None, selected_loci=None, logger=None, aligcov_cut=0.5, identity=90):
    """Profile wgMLST alleles for every genome in ``input_dir``.

    Two database layouts are supported: if ``database`` is a directory, loci
    are profiled against its ``panRefSeq.fa`` reference; otherwise ``database``
    is treated as a named SQL database and loci are selected per scheme.
    Writes ``namemap.json`` and ``wgmlst.tsv`` into ``output_dir``.

    output_dir    -- job workspace for intermediate and final outputs
    input_dir     -- directory with the query genome FASTA files
    database      -- database directory path OR SQL database name (see above)
    threads       -- worker count for BLAST/identification steps
    occr_level    -- minimum locus occurrence; used to pick scheme loci when
                     ``selected_loci`` is not given (SQL branch only)
    selected_loci -- explicit locus subset; overrides the occurrence query
    logger        -- optional logger; console logger created when omitted
    aligcov_cut   -- alignment-coverage cutoff for locus profiling
    identity      -- percent-identity cutoff for locus profiling
    """
    load_database_config()
    if not logger:
        logger = logs.console_logger(__name__)

    logger.info("Renaming contigs...")
    # Copies of the queries get sanitized names; the mapping is saved so
    # downstream consumers can translate back.
    query_dir = files.joinpath(output_dir, "query")
    files.create_if_not_exist(query_dir)
    namemap = rename(query_dir, input_dir)
    with open(files.joinpath(output_dir, "namemap.json"), "w") as f:
        f.write(json.dumps(namemap))

    if os.path.isdir(database):
        # Directory layout: reference FASTA + per-locus allele files on disk.
        logger.info("Profiling loci...")
        refseq_fna = files.joinpath(database, "panRefSeq.fa")
        profile_loci(refseq_fna, query_dir, output_dir, aligcov_cut, identity, threads)

        logger.info("Allocating alleles...")
        profile_alleles(query_dir, database, output_dir, threads, occr_level)
    else:
        logger.info("Identifying loci and allocating alleles...")
        # select loci by scheme
        if selected_loci:
            selected_loci = set(selected_loci)
        else:
            # NOTE(review): occr_level is interpolated straight into the SQL
            # text -- assumes it is a trusted numeric value; "occurence" is
            # the column's actual (misspelled) name in the schema.
            query = "select locus_id from scheme where occurence>={};".format(occr_level)
            selected_loci = set(sql_query(query, database=database).iloc[:, 0])

        temp_dir = os.path.join(query_dir, "temp")
        files.create_if_not_exist(temp_dir)
        collect = []
        # identify_loci receives a single (fasta_path, temp_dir) tuple per call.
        args = [(os.path.join(query_dir, filename), temp_dir)
                for filename in os.listdir(query_dir) if filename.endswith(".fa")]
        with ProcessPoolExecutor(threads) as executor:
            for filename in executor.map(identify_loci, args):
                genome_id = files.fasta_filename(filename)
                target_file = os.path.join(temp_dir, genome_id + ".locus.fna")
                profile = profile_by_query(target_file, genome_id, selected_loci, database)
                collect.append(profile)
        # One column per genome; index is the locus id.
        result = pd.concat(collect, axis=1)
        result.to_csv(files.joinpath(output_dir, "wgmlst.tsv"), sep="\t")

    # Drop the sanitized query copies (and temp dir) once profiling is done.
    shutil.rmtree(query_dir)
def __init__(self, mainTab):
    """Set up the UI on ``mainTab``, create the data directory, and wire
    every widget signal to its handler (long-running handlers run on the
    worker thread pool)."""
    super(Window, self).__init__()
    self.setupUi(mainTab)

    # Resolve <project root>/data relative to this module's location.
    here = os.path.dirname(__file__)
    self.ROOT_DIR = os.path.abspath(os.path.join(here, os.pardir))
    self.data_dir = os.path.join(self.ROOT_DIR, "data")
    files.create_if_not_exist(self.data_dir)

    self.pool = worker.ThreadPool()

    # setting behaviers
    # Database-building tab: pick a source dir, then submit the build job.
    self.fileSelector.clicked.connect(
        lambda: self.select_dir(self.fileSelectText))
    self.fileSubmitter.clicked.connect(
        lambda: self.pool.start(self.make_database, ()))
    # Profiling tab: pick profiles, then run profiling on the pool.
    self.profileSelector.clicked.connect(
        lambda: self.select_profiles(self.profileSelectText, self.joblist_2))
    self.runButton.clicked.connect(
        lambda: self.pool.start(self.make_profile, ()))
    # Plotting tab: re-plot whenever the selected directory text changes.
    self.plottingSelector.clicked.connect(
        lambda: self.select_dir(self.plottingSelectText))
    self.plottingSelectText.textChanged.connect(
        lambda: self.plotDendrogram())
def make_database(output_dir, logger=None, threads=2, use_docker=True, min_identity=95):
    """Build the pan-genome allele database from annotated genomes.

    Runs roary over the GFF files in ``output_dir``, extracts locus/paralog
    metadata and allele profiles, and writes the reference sequence, locus
    files, allele frequencies, and dynamic schemes under
    ``output_dir``/database.

    output_dir   -- job workspace containing GFF/ and FFN/ sub-directories
    logger       -- optional logger; a console logger is created when omitted
    threads      -- thread count passed to roary
    use_docker   -- run roary via docker instead of a local shell command
    min_identity -- minimum percent identity for roary's pan-genome
                    clustering (generalized from the previous hard-coded 95)
    """
    if not logger:
        logger = logs.console_logger(__name__)

    database_dir = files.joinpath(output_dir, "database")
    files.create_if_not_exist(database_dir)

    logger.info("Calculating the pan genome...")
    if use_docker:
        docker.roary(files.joinpath(output_dir, "GFF"), output_dir, min_identity, threads)
    else:
        c = cmds.form_roary_cmd(files.joinpath(output_dir, "GFF"), output_dir, min_identity, threads)
        os.system(c)

    logger.info("Extract profiles from roary result matrix...")
    matrix_file = files.joinpath(output_dir, "roary", "gene_presence_absence.csv")
    locusmeta_file = files.joinpath(database_dir, "locus_metadata.tsv")
    paralogmeta_file = files.joinpath(database_dir, "paralog_metadata.tsv")
    profiles, total_isolates = extract_profiles(matrix_file, locusmeta_file, paralogmeta_file)

    logger.info("Collecting allele profiles and making allele frequencies and reference sequence...")
    ffn_dir = files.joinpath(output_dir, "FFN")
    profile_file = files.joinpath(database_dir, "allele_profiles.tsv")
    profiles, freq = collect_allele_infos(profiles, ffn_dir)
    profiles.to_csv(profile_file, sep="\t")

    refseq_file = files.joinpath(database_dir, "panRefSeq.fa")
    refseqs = save_refseq(freq, refseq_file)

    locus_dir = files.joinpath(database_dir, "locusfiles")
    files.create_if_not_exist(locus_dir)
    save_locusfiles(freq, locus_dir)

    allele_freq_file = files.joinpath(database_dir, "allele_frequency.json")
    save_allele_freq(freq, allele_freq_file)

    logger.info("Making dynamic schemes...")
    scheme_file = files.joinpath(database_dir, "scheme.tsv")
    make_schemes(locusmeta_file, scheme_file, refseqs, total_isolates)
    logger.info("Done!!")