def eval_ada(VERBOSE=True):
    acr = np.zeros(10)
    for _ in range(100):
        _, _, correct = pipeline.pipe(feature='lbph', clf='adaboost',
                                      _mode='test', _verbose=VERBOSE)
        acr += correct
    est_list = np.linspace(2, 3.7, num=10)
    plt.bar(est_list, acr)
    # Save before show: plt.show() clears the current figure, so calling
    # savefig afterwards would write out a blank image.
    plt.savefig('graphs/ada_n_estimators.png')
    plt.show()
def eval_pca(VERBOSE=True):
    trials = 0
    total_correct = 0
    n_comp = 2
    for _ in range(2000):
        correct = pipeline.pipe(feature='cslbcop', clf='svm', _mode='test',
                                _verbose=VERBOSE, pca_scatter=True,
                                n_components=n_comp)
        total_correct += correct
        trials += 1
        print(f'Trial: {trials}\tCorrect: {total_correct}\tOverall: {total_correct/trials}')
def final_eval(_mode='test', _verbose=True):
    itr = 0
    failed = 0
    data_path = 'data_tune/'
    authors = os.listdir(data_path)
    for set_id in range(159):
        auth = authors[set_id]
        forms = os.listdir(data_path + auth)
        for form_id in range(len(forms)):
            itr += 1
            res = pipeline.pipe(feature='cslbcop', clf='svm',
                                _verbose=_verbose, _mode=_mode,
                                set_id=set_id, form_id=form_id)
            if not res:
                failed += 1
            print(f'Trial: {itr}\tFailed: {failed}\tauthor: {set_id}\tform: {form_id}\n\n')
def eval_performance_lbph_svm(MODE, VERBOSE=False):
    # Histogram correct/incorrect predictions by classifier confidence (0-100%).
    tr = 0
    trueacc = np.zeros(101)
    falseacc = np.zeros(101)
    allacc = np.zeros(101)
    avgrtconv = 0
    cnt = 0
    for i in range(2000):
        print("Iteration: ", i + 1)
        predicted, conf, pre = pipeline.pipe(feature='lbph', clf='svm',
                                             _mode=MODE, _verbose=VERBOSE)
        # Round the winning class's confidence half-up into an integer
        # percentage bucket, e.g. 0.567 -> 57.
        conf = int(100 * conf[predicted] + 0.5)
        allacc[conf] += 1
        if pre == 1:
            trueacc[conf] += 1
        else:
            falseacc[conf] += 1
        if pre == 1 and conf > 50:
            tr += 1
            avgrtconv += conf
        if pre == 0 and conf > 50:
            cnt += 1
    print("correct: ", tr)
    print(tr / 20)           # confidently-correct rate as a percentage of the 2000 trials
    print(avgrtconv / 2000)  # summed confidences of confident correct predictions, averaged over all trials
    print(cnt)               # confident (conf > 50) but incorrect predictions
    plt.plot(allacc)
    plt.show()
    plt.plot(trueacc)
    plt.show()
    plt.plot(falseacc)
    plt.show()
import os

import evaluations
import pipeline

if __name__ == "__main__":
    # prepare_data.print_data_stat()
    # evaluations.eval_performance_lbph_svm(MODE, VERBOSE)
    # evaluations.eval_ada()
    # evaluations.eval_pca(VERBOSE)
    # evaluations.final_eval()
    VERBOSE = False
    MODE = 'deliver'
    data_path = 'data/'
    # Sort folder names numerically, not lexicographically (illustrated below).
    test_folders = sorted(os.listdir(data_path), key=int)
    with open('results.txt', "w") as results_file, open('time.txt', "w") as time_file:
        for test_folder in test_folders:
            res, time = pipeline.pipe(feature='cslbcop', clf='svm',
                                      _verbose=VERBOSE, _mode=MODE,
                                      test_folder=test_folder)
            results_file.write(f'{res}\n')
            results_file.flush()
            time_file.write(f'{time:.2f}\n')
            time_file.flush()
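# Why key=int in the sort above: the test folder names are numeric strings,
# and a plain lexicographic sort would order '10' before '2'. A minimal,
# self-contained illustration (hypothetical folder names, not repo data):
if __name__ == "__main__":
    example = ['1', '10', '2']
    assert sorted(example) == ['1', '10', '2']            # lexicographic
    assert sorted(example, key=int) == ['1', '2', '10']   # numeric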
def annotate_panel(vcf, vep, reference=None, threads=None, output="",
                   panel="", buffer_size=None):
    if threads is None:
        threads = run(["getconf", "_NPROCESSORS_ONLN"]).stdout.strip()

    if not output:
        output = "."
    if os.path.isdir(output):
        output = os.path.join(output, "{}.annotation.tsv".format(
            vcf[:-4] if vcf.endswith(".vcf") else vcf))
    vepjson = "{}.vep.json".format(output[:-4])

    vep_options = ["--no_stats",
                   "--dir", vep,
                   "--format", "vcf",
                   "--json",
                   "--offline",
                   "--everything",
                   "--warning_file", "STDERR",
                   "--force_overwrite"]
    if reference is not None:
        reference = (glob.glob(f"{reference}/*.fna") +
                     glob.glob(f"{reference}/*.fa") +
                     glob.glob(f"{reference}/*.fasta") +
                     [reference])[0]
        vep_options += ["--fasta", reference]
    if int(threads) > 1:
        vep_options += ["--fork", threads]
    if "refseq" in vep:
        vep_options += ["--refseq"]
    if buffer_size is not None:
        vep_options += ["--buffer_size", buffer_size]

    pipe(["vep", "-i", vcf, "-o", vepjson] + vep_options)

    # Work out which variant caller produced the vcf and grab the header row.
    source = None
    get_read_data = None
    with open(vcf, "rt") as f:
        for row in f:
            if not row.startswith("#"):
                break
            if row.startswith("##source="):
                source = row[9:].strip()
                # if source == "strelka":
                if source.startswith("VarDict"):
                    get_read_data = vardict_read_data
                elif source == "VarScan2":
                    get_read_data = varscan2_read_data
                elif source == "Mutect2":
                    get_read_data = mutect2_read_data
            headings = row
    if get_read_data is None:
        sys.exit(f"Unsupported variant caller {source}")
    if len(headings.split("\t")) > 10:
        sys.exit("Multi-sample vcfs not supported")

    targets = None
    principal = {}
    needed_genes = set()
    needed_transcripts = set()
    if panel:
        panel = Panel(panel)
        if "targets" in panel:
            targets = panel.targets
        if "names" in panel:
            for name in panel.names:
                name = name.split()
                needed_genes.add(name[0])
                if len(name) > 1:
                    needed_transcripts.add(name[1])
        if "principal" in panel.paths:
            principal = appris(panel.paths["principal"])

    if "refseq" in vep:
        def consequence_sort(cons):
            transcript, minor = cons["transcript_id"].split(".")
            prefix = transcript[:2]
            major = transcript[3:]
            return [transcript in needed_transcripts,
                    cons["gene_symbol"] in needed_genes,
                    BIOTYPE[cons["biotype"]],
                    REFSEQ[prefix],
                    -int(cons["gene_id"]),
                    principal.get(transcript, 0),
                    "canonical" in cons,
                    -int(major),
                    int(minor)]

    else:  # ensembl transcripts
        def consequence_sort(cons):
            # Version numbers not in vep as of version 101, but who knows the future ...
            transcript = cons["transcript_id"]
            return [transcript in needed_transcripts,
                    cons["gene_symbol"] in needed_genes,
                    BIOTYPE[cons["biotype"]],
                    -int(cons["gene_id"].translate(DELETE_NON_DIGIT)),
                    principal.get(transcript, 0),
                    "canonical" in cons,
                    -int(transcript.translate(DELETE_NON_DIGIT))]

    annotations = []
    with open(vepjson) as f:
        for line in f:
            vep_output = json.loads(line)

            consequences = vep_output.get("transcript_consequences")
            if consequences:
                # Highest-ranked consequence wins (see the toy illustration below).
                cons = sorted(consequences, key=consequence_sort)[-1]
                other_genes = set(c["gene_symbol"] for c in consequences) - \
                    set([cons["gene_symbol"]])
            else:
                most_severe_consequence = vep_output["most_severe_consequence"]
                # We only use biotype and impact from here, so it probably does
                # not matter which consequence we choose so long as we are
                # consistent.
                for cons in sorted(chain(*[v for k, v in vep_output.items()
                                           if k.endswith("_consequences")]),
                                   key=lambda x: x.get("biotype", ""),
                                   reverse=True):
                    if most_severe_consequence in cons["consequence_terms"]:
                        break
                other_genes = ()

            row = vep_output["input"].rstrip().split("\t")
            read_data = get_read_data(row)
            if read_data["alt_depth"] == "0":
                continue

            # Phred-scaled strand-bias p-value:
            # https://gatk.broadinstitute.org/hc/en-us/articles/360035532152-Fisher-s-Exact-Test
            fisher_strand = -10 * math.log10(
                fisher_exact([read_data["ref_fr"], read_data["alt_fr"]])[1])

            demographics = parse_colocated(vep_output)

            annotations.append(
                [cons.get("gene_symbol", ""),
                 cons.get("transcript_id", ""),
                 row[CHROM],
                 vep_output["start"],
                 vep_output["allele_string"],
                 row[QUAL],
                 row[FILTERS],
                 read_data["vaf"],
                 read_data["depth"],
                 read_data["alt_depth"],
                 "{}:{}".format(*read_data["alt_fr"]),
                 "{}:{}".format(*read_data["ref_fr"]),
                 "{:.1f}".format(fisher_strand),
                 cons.get("hgvsc", ""),
                 cons.get("hgvsp", ""),
                 cons.get("biotype", ""),
                 cons.get("impact", ""),
                 ", ".join(demographics.get("clin_sig", ())),
                 ", ".join(cons.get("consequence_terms", ())),
                 cons.get("sift_prediction", ""),
                 cons.get("polyphen_prediction", ""),
                 "{:.10f}".format(demographics["maf"]) if "maf" in demographics else "",
                 ", ".join(sorted(other_genes)),
                 ", ".join(demographics.get("dbsnp", ())),
                 ", ".join(demographics.get("hgmd", ())),
                 ", ".join(demographics.get("cosmic", ())),
                 ", ".join(demographics.get("pubmed", ()))])

    os.unlink(vepjson)
    annotations.sort(key=lambda r: (chrom2int(r[2]), r[2], int(r[3]), r[4]))

    with open(output, "wt") as f:
        writer = csv.writer(f, delimiter="\t")
        writer.writerow(["Gene", "Transcript", "Chrom", "Pos", "Change",
                         "Quality", "Filters", "VAF", "Depth", "Alt Depth",
                         "Alt Depth F:R", "Ref Depth F:R", "FisherStrand",
                         "HGVSc", "HGVSp", "Biotype", "Impact",
                         "Clinical Significance (Pubmed)", "Consequences",
                         "Sift", "Polyphen", "MAF", "Other Genes", "dbSNP",
                         "HGMD", "COSMIC", "Pubmed"])
        writer.writerows(annotations)
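# How the transcript selection above works, in miniature: consequence_sort
# returns a list, Python compares lists lexicographically, so sorting and
# taking [-1] picks the consequence that wins on the earliest differing
# criterion. A toy illustration (hypothetical values, not real VEP output):
if __name__ == "__main__":
    toy = [
        {"id": "A", "key": [False, True, 2]},
        {"id": "B", "key": [True, False, 1]},
    ]
    best = sorted(toy, key=lambda c: c["key"])[-1]
    assert best["id"] == "B"  # True in the first slot outranks everything after it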
from flask import Flask, render_template, request, jsonify
import os
import sys
reload(sys)  # Python 2: reload() re-exposes sys.setdefaultencoding,
             # which is deleted from the sys module during interpreter startup.
sys.setdefaultencoding('UTF8')
sys.path.append('..')
import pipeline as pp
import nltk
from utils.entity import EntityType

# Initialize the Flask application
app = Flask(__name__)
app.config['JSONIFY_PRETTYPRINT_REGULAR'] = False
pipe = pp.pipe()


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/_tag_sentence')
def tag_sentence():
    a = request.args.get('input', '')
    if len(a.split(" ")) < 2:
        return jsonify({})
    a = " ".join(nltk.word_tokenize(a))
    bilstm_red = pipe.tag(a)
    # Return the tagged sentence as JSON (assumed response shape).
    return jsonify(result=bilstm_red)
def analyze(self):
    # Update application copy of input/output dir (in case user didn't use buttons)
    self.inputDir = self.lineEdit_inputDir.text()
    self.outputDir = self.lineEdit_outputDir.text()

    # Get all user options
    opts = {
        'input_dir': self.inputDir,
        'output_dir': self.outputDir,
        'bool_analyze_full': self.checkBox_setTime.checkState(),
        'start_time': utils.toInt(self.lineEdit_startTime.text()),
        'end_time': utils.toInt(self.lineEdit_endTime.text()),
        'bool_removeClipped': self.checkBox_removeClipped.checkState(),
        'bool_deleteClips': self.checkBox_deleteClips.checkState(),
        'bool_makeSpectrograms': self.checkBox_spectrograms.checkState(),
        'bool_parseFilename': self.checkBox_parseFilename.checkState(),
        'filename_delimiter': self.lineEdit_filenameDelimiter.text(),
        'filename_categories': self.lineEdit_filenameCategories.text().split(','),
        'bool_useAva': self.checkBox_useAva.checkState(),
        'avaFoldername': self.lineEdit_avaFolder.text(),
        'bool_inferSilence': self.checkBox_useSilenceFile.checkState(),
        'silenceFilename': self.lineEdit_silenceFile.text(),
        'silenceThreshold': self.spinBox_silenceThresh.value(),
        'silenceBuffer': self.spinBox_bufferLen.value(),
        'silenceMinLen': self.spinBox_minSilenceLen.value()
    }

    # TODO: Generalize error catching
    try:
        progress = QtWidgets.QProgressDialog(
            "Processing files...", "Abort", 0,
            len(os.listdir(opts['input_dir'])))
        progress.setWindowModality(QtCore.Qt.WindowModal)
        self.vocBatch = pipeline.pipe(opts)
    except (exceptions.TimeBoundError, exceptions.FilenameParseError,
            exceptions.NoInputError) as error:
        error_dialog = QtWidgets.QErrorMessage()
        error_dialog.setWindowModality(QtCore.Qt.WindowModal)
        error_dialog.showMessage(error.error_string())
        error_dialog.exec_()
        return

    self.loadOutputTables(self.lineEdit_outputDir.text())
    if opts['bool_makeSpectrograms']:
        self.loadSpectrograms(opts['output_dir'])