def save_test_performance(data, tfids, fold, args):
    print "Predicting...", tfids

    # Generate predictions for each model
    predictions = util.predict(data, args.finaldir, args.reportdir)
    print "done"

    for tfname in predictions:
        print tfname
        Z = predictions[tfname]

        # Restore the original (unpreprocessed) scale and normalization of the predictions
        # Then add the opposite microarray's estimated probe value to each probe
        with open(args.finaldir + "/" + tfname + "/preprocessors.pkl") as f:
            normalizer = cPickle.load(f)['targets'][0]
        Z *= normalizer.scales[0]
        Z += normalizer.biases[0]

        # Add per-probe multiplicative bias to predictions
        probe_biases = load_probe_biases()[fold].reshape((-1, 1))
        if args.quick:
            probe_biases = probe_biases[:len(Z), :]
        M = ~np.isnan(probe_biases)
        Z[M] *= probe_biases[M]

        #with open("predictions/%s_%s.tsv" % (tfname, {"A":"B","B":"A"}[foldid]), 'w') as f:
        #    f.writelines([str(x)+"\n" for x in Z.ravel()])

        z = Z.ravel()
        y = data.Y[:, data.targetnames.index(tfname)].ravel()
        rowidx = data.rowidx.ravel()
        mask = ~np.isnan(y)
        zscore4 = np.std(y[mask]) * 4 + np.mean(y[mask])
        util._update_metrics(args.finaldir, tfname, "test", rowidx, z, y,
                             aucthresh=(zscore4, zscore4))
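
# Illustrative sketch (not part of the original pipeline): the aucthresh=(zscore4, zscore4)
# argument above presumably asks util._update_metrics to binarize the measured intensities
# at mean + 4*std before computing an ROC AUC. Assuming that interpretation, an equivalent
# stand-alone computation would look roughly like the helper below. The helper name is
# hypothetical and scikit-learn is not otherwise a dependency of this script.
def _sketch_auc_at_zscore4(z, y):
    import numpy as np                                 # local imports keep the sketch self-contained
    from sklearn.metrics import roc_auc_score
    mask = ~np.isnan(y)
    thresh = np.mean(y[mask]) + 4 * np.std(y[mask])    # same threshold as zscore4 above
    labels = (y[mask] >= thresh).astype(int)           # probes treated as "bound"
    if 0 < labels.sum() < labels.size:                 # both classes needed for an AUC
        return roc_auc_score(labels, z[mask])
    return float("nan")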
def save_chip_performance_table(args, tfids):
    with open(args.reportdir + "/performance_chipseq.txt", "w") as f:
        f.write("Background\tFile\tauc.mean\tauc.std\n")
        chipseqids = ["TF_%d" % i for i in [23, 25, 31, 40, 44]]
        for tfid in chipseqids:
            if tfid not in tfids:
                continue
            for windowsize in [51, 100]:
                for background in ["dinuc", "full_genomic", "genomic"]:
                    seqfile = "../data/dream5/chipseq/%s_CHIP_%d_%s.seq" % (tfid, windowsize, background)
                    invivodata = util.datasource.fromtxt(seqfile + "[0]", None, seqfile + "[1]",
                                                         sequencenames=["seq"], targetnames=[tfid])
                    invivodata.targetnames = [tfid]
                    predictions = util.predict(invivodata, args.finaldir, args.reportdir, include=[tfid])
                    z = predictions[tfid].ravel()
                    y = invivodata.Y.ravel()
                    metrics = util.calc_metrics(z, y)
                    s = "%s\t%s\t%.3f\t%.3f\n" % (background, seqfile, metrics['auc.mean'], metrics['auc.std'])
                    print s,
                    f.write(s)
def save_test_predictions(args):
    # In quick mode, only load a subset of the data
    maxrows = 10000 if args.quick else None
    chunktargets, chunkcols = get_chunktargets(args)
    tedata = {}
    abs_auc = lambda x: max(x, 1 - x)

    if args.mode == "AB":
        invivo_data_dir = "../data/rnac/invivo"
        util.makepath(args.testdir + "/invivo")
        aucdump = open(args.testdir + "/invivo/deepbind_all.txt", "w")
        aucdump.write("Protein\tFile\tModel\tauc\tauc.mean\tauc.std\n")
        for invivo_id in invivo_ids.keys():
            rnac_ids, invivo_files = invivo_ids[invivo_id]
            rnac_names = ["RNCMPT%05d" % id for id in rnac_ids]
            rnac_names = [name for name in rnac_names if name in chunktargets]
            if not rnac_names:
                continue
            for invivo_file in invivo_files:
                if not os.path.exists(invivo_data_dir + "/" + invivo_file + ".txt"):
                    continue
                print "File %s using models %s..." % (invivo_file, ",".join(rnac_names))

                # Convert the invivo sequence file into a format that predict.py expects,
                # i.e. with a Fold ID, Event ID, and Sequence column.
                data = util.datasource.fromtxt(invivo_data_dir + "/" + invivo_file + ".txt[1]", None,
                                               invivo_data_dir + "/" + invivo_file + ".txt[0]",
                                               sequencenames=["bound", "seq"], targetnames=["bound"])

                # First generate predictions based on "trivial" features for this particular invivo file
                sequences = [row[0] for row in data.sequences]
                labels = data.targets
                predictions = {}
                predictions["len"] = np.asarray([len(s) for s in sequences], np.float32).reshape((-1, 1))
                predictions["A"] = np.asarray([s.upper().count("A") / float(len(s)) for s in sequences],
                                              np.float32).reshape((-1, 1))
                predictions["C"] = np.asarray([s.upper().count("C") / float(len(s)) for s in sequences],
                                              np.float32).reshape((-1, 1))
                predictions["G"] = np.asarray([s.upper().count("G") / float(len(s)) for s in sequences],
                                              np.float32).reshape((-1, 1))
                predictions["T"] = np.asarray([(s.upper().count("U") + s.upper().count("T")) / float(len(s))
                                               for s in sequences], np.float32).reshape((-1, 1))
                predictions["GC"] = predictions["G"] + predictions["C"]

                # Next, generate predictions for each model on this same data file
                data.targetnames = rnac_names
                data.Y = np.repeat(data.Y, len(rnac_names), 1)
                data.Ymask = np.repeat(data.Ymask, len(rnac_names), 1)
                data.targets = data.Y.copy()
                #pred = util.predict(data, "../data/rnac/pfms", args.reportdir, scan=20)
                pred = util.predict(data, args.finaldir, args.reportdir, scan=20)
                predictions.update(pred)

                # Dump all performance stats to the file
                # Make sure RNCMPT items go first in each group, for readability of all_aucs.txt
                for pname in sorted(predictions.keys(), key=lambda x: x if "RNCMPT" not in x else " " + x):
                    # Calculate the AUC of this particular prediction, of this particular model,
                    # on this particular invivo file.
                    z = predictions[pname].ravel()
                    y = np.array(labels).ravel()
                    metrics = util.calc_metrics(z, y)

                    # Write out a row indicating performance of each model on this file
                    aucdump.write("%s\t%s\t%s\t%.4f\t%.4f\t%.6f\n" % (invivo_id, invivo_file, pname,
                                                                      metrics["auc"], metrics["auc.mean"],
                                                                      metrics["auc.std"]))

        aucdump.close()

        # Re-open the AUC dump file, pull out all experiments associated with a single protein,
        # and collect only the best of each type
        all_aucs = {}
        with open(args.testdir + "/invivo/deepbind_all.txt") as f:
            f.readline()  # discard header line
            for line in f:
                protein_name, invivo_file, model_name, auc, auc_mean, auc_std = line.rstrip().split("\t")
                model_suffix = ".deepbind" if "RNCMPT" in model_name else ""
                if model_name == "GC":
                    continue
                all_aucs.setdefault(protein_name, []).append({"file": invivo_file,
                                                              "model": model_name + model_suffix,
                                                              "auc": float(auc),
                                                              "auc.mean": float(auc_mean),
                                                              "auc.std": float(auc_std)})

        # Open the rnac_all.txt and pull out the AUCs for the PFMs from the RNAcompete paper.
        # The rnac_all.txt file is in a different format, for legacy reasons.
        head = rnac_all_aucs[0].rstrip().split("\t")
        lines = [line.rstrip().split("\t") for line in rnac_all_aucs[1:]]
        cols = {item: head.index(item) for item in head}
        for line in lines:
            protein_name = line[0]
            for scantype in ("max", "sum", "avg", "direct"):
                invivo_file = line[1].rsplit(".", 1)[0]
                model_name = line[2] + "." + scantype
                auc = "nan"
                auc_mean = line[cols[scantype]]
                auc_std = line[cols[scantype + ".std"]]
                if protein_name in all_aucs:
                    all_aucs[protein_name].append({"file": invivo_file,
                                                   "model": model_name + ".rnac",
                                                   "auc": float(auc),
                                                   "auc.mean": float(auc_mean),
                                                   "auc.std": float(auc_std)})

        for scantype in ("direct", "max", "sum", "avg"):
            for modeltype in ("deepbind", "rnac"):
                with open(args.testdir + "/invivo/%s_best_%s.txt" % (modeltype, scantype), "w") as f:
                    f.write("Protein\tFile")
                    f.write("\t" + "\t".join(["deeputil.model", "deeputil.auc", "deeputil.auc.mean", "deeputil.auc.std"]))
                    f.write("\t" + "\t".join(["rnac.model", "rnac.auc", "rnac.auc.mean", "rnac.auc.std"]))
                    f.write("\t" + "\t".join(["trivial.model", "trivial.auc", "trivial.auc.mean", "trivial.auc.std"]))
                    f.write("\n")
                    for protein in sorted(all_aucs.keys()):
                        trials = all_aucs[protein]

                        # For this particular protein, first find the best row based on non-trivial models,
                        # e.g. among RNCMPTXXXXX.direct and RNCMPTXXXXX.max, while ignoring "A" and "len"
                        best = None
                        for trial in trials:
                            if trial["model"].endswith(scantype + "." + modeltype):
                                if best is None or trial["auc.mean"] > best["auc.mean"]:
                                    best = trial

                        # Also find the best trivial feature associated with this file
                        best_trivial = None
                        for trial in trials:
                            if trial["file"] == best["file"] and "RNCMPT" not in trial["model"]:
                                if best_trivial is None or abs_auc(trial["auc.mean"]) > abs_auc(best_trivial["auc.mean"]):
                                    best_trivial = trial

                        # Also find the best competing feature associated with this file
                        best_other = None
                        if modeltype == "rnac":
                            other_scantype = "max"
                        elif modeltype == "deepbind":
                            other_scantype = "avg"
                        else:
                            other_scantype = scantype
                        for trial in trials:
                            if (trial["file"] == best["file"]
                                    and ("." + other_scantype + ".") in trial["model"]
                                    and not trial["model"].endswith(other_scantype + "." + modeltype)):
                                if best_other is None or trial["auc.mean"] > best_other["auc.mean"]:
                                    best_other = trial

                        best_deepbind = best if modeltype == "deepbind" else best_other
                        best_rnac = best if modeltype == "rnac" else best_other

                        f.write("%s\t%s\t" % (protein, best["file"]))
                        f.write("%s\t%.4f\t%.4f\t%.6f\t" % (best_deepbind["model"], best_deepbind["auc"],
                                                            best_deepbind["auc.mean"], best_deepbind["auc.std"]))
                        f.write("%s\t%.4f\t%.4f\t%.6f\t" % (best_rnac["model"], best_rnac["auc"],
                                                            best_rnac["auc.mean"], best_rnac["auc.std"]))
                        f.write("%s\t%.4f\t%.4f\t%.6f\n" % (best_trivial["model"], abs_auc(best_trivial["auc"]),
                                                            abs_auc(best_trivial["auc.mean"]), best_trivial["auc.std"]))

    elif args.mode in ("A", "B"):
        # Generate predictions on PBM probes from the test set
        # (i.e. if mode="A", the training set was "A" so the test set is "B")
        print "Loading PBM data...",
        testfold = "A" if args.mode == "B" else "B"
        pbmdata = util.datasource.fromtxt("../data/rnac/sequences.tsv.gz", None, "../data/rnac/targets.tsv.gz",
                                          targetcols=[0], foldfilter=testfold, maxrows=maxrows)
        print "done"
        util.makepath(args.testdir + "/pbm")
        for targetname in chunktargets:
            print targetname
            pbmdata.targetnames = [targetname]
            predictions = util.predict(pbmdata, args.finaldir, args.reportdir, include=[targetname])
            Z = predictions[targetname].ravel()
            with gzip.open(args.testdir + "/pbm/%s-DB-%s.txt.gz" % (targetname, testfold), "w") as f:
                for z in Z:
                    f.write("%.4f\n" % z)
    else:
        quit("Unrecognized mode")
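
# Illustrative sketch (not part of the original pipeline): each file written to
# args.testdir + "/pbm" above holds one "%.4f"-formatted prediction per line.
# Assuming that format, the predictions can be read back for downstream analysis
# as follows (the helper name and example file name are hypothetical).
def _sketch_read_predictions(path="RNCMPT00001-DB-A.txt.gz"):
    import gzip
    import numpy as np
    with gzip.open(path) as f:
        return np.array([float(line) for line in f], np.float32)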