예제 #1
0
def save_test_performance(data, tfids, fold, args):

    print "Predicting...", tfids
    # Generate predictions for each model
    predictions = util.predict(data, args.finaldir, args.reportdir)
    print "done"

    for tfname in predictions:
        print tfname
        Z = predictions[tfname]


        # Restore the original (unpreprocessed) scale and normalization of the predictions
        # Then add the opposite microarray's estimated probe value to each probe
        with open(args.finaldir + "/" + tfname +"/preprocessors.pkl") as f:
            normalizer = cPickle.load(f)['targets'][0]
        Z *= normalizer.scales[0]
        Z += normalizer.biases[0]

        # Add per-probe multiplicative bias to predictions
        probe_biases = load_probe_biases()[fold].reshape((-1,1))
        if args.quick:
            probe_biases = probe_biases[:len(Z),:]
        M = ~np.isnan(probe_biases)
        Z[M] *= probe_biases[M]

        #with open("predictions/%s_%s.tsv" % (tfname, {"A":"B","B":"A"}[foldid]), 'w') as f:
        #    f.writelines([str(x)+"\n" for x in Z.ravel()])
        z = Z.ravel()
        y = data.Y[:,data.targetnames.index(tfname)].ravel()
        rowidx = data.rowidx.ravel()
        mask = ~np.isnan(y)
        zscore4 = np.std(y[mask])*4+np.mean(y[mask])
        util._update_metrics(args.finaldir, tfname, "test", rowidx, z, y, aucthresh=(zscore4,zscore4))
예제 #2
0
def save_chip_performance_table(args, tfids):
    with open(args.reportdir + "/performance_chipseq.txt", "w") as f:
        f.write("Background\tFile\tauc.mean\tauc.std\n")
        chipseqids = ["TF_%d" % i for i in [23, 25, 31, 40, 44]]
        for tfid in chipseqids:
            if tfid not in tfids:
                continue
            for windowsize in [51, 100]:
                for background in ["dinuc", "full_genomic", "genomic"]:
                    seqfile = "../data/dream5/chipseq/%s_CHIP_%d_%s.seq" % (
                        tfid, windowsize, background)
                    invivodata = util.datasource.fromtxt(seqfile + "[0]",
                                                         None,
                                                         seqfile + "[1]",
                                                         sequencenames=["seq"],
                                                         targetnames=[tfid])
                    invivodata.targetnames = [tfid]

                    predictions = util.predict(invivodata,
                                               args.finaldir,
                                               args.reportdir,
                                               include=[tfid])
                    z = predictions[tfid].ravel()
                    y = invivodata.Y.ravel()
                    metrics = util.calc_metrics(z, y)
                    s = "%s\t%s\t%.3f\t%.3f\n" % (background, seqfile,
                                                  metrics['auc.mean'],
                                                  metrics['auc.std'])
                    print s,
                    f.write(s)
예제 #3
0
def save_chip_performance_table(args, tfids):
    with open(args.reportdir+"/performance_chipseq.txt", "w") as f:
        f.write("Background\tFile\tauc.mean\tauc.std\n")
        chipseqids = ["TF_%d"%i for i in [23,25,31,40,44]]
        for tfid in chipseqids:
            if tfid not in tfids:
                continue
            for windowsize in [51,100]:
                for background in ["dinuc", "full_genomic", "genomic"]:
                    seqfile = "../data/dream5/chipseq/%s_CHIP_%d_%s.seq" % (tfid, windowsize, background)
                    invivodata = util.datasource.fromtxt(seqfile+"[0]", None, seqfile+"[1]", sequencenames=["seq"], targetnames=[tfid])
                    invivodata.targetnames = [tfid]

                    predictions = util.predict(invivodata, args.finaldir, args.reportdir, include=[tfid])
                    z = predictions[tfid].ravel()
                    y = invivodata.Y.ravel()
                    metrics = util.calc_metrics(z, y)
                    s = "%s\t%s\t%.3f\t%.3f\n" % (background, seqfile, metrics['auc.mean'], metrics['auc.std'])
                    print s,
                    f.write(s)
예제 #4
0
def save_test_performance(data, tfids, fold, args):

    print "Predicting...", tfids
    # Generate predictions for each model
    predictions = util.predict(data, args.finaldir, args.reportdir)
    print "done"

    for tfname in predictions:
        print tfname
        Z = predictions[tfname]

        # Restore the original (unpreprocessed) scale and normalization of the predictions
        # Then add the opposite microarray's estimated probe value to each probe
        with open(args.finaldir + "/" + tfname + "/preprocessors.pkl") as f:
            normalizer = cPickle.load(f)['targets'][0]
        Z *= normalizer.scales[0]
        Z += normalizer.biases[0]

        # Add per-probe multiplicative bias to predictions
        probe_biases = load_probe_biases()[fold].reshape((-1, 1))
        if args.quick:
            probe_biases = probe_biases[:len(Z), :]
        M = ~np.isnan(probe_biases)
        Z[M] *= probe_biases[M]

        #with open("predictions/%s_%s.tsv" % (tfname, {"A":"B","B":"A"}[foldid]), 'w') as f:
        #    f.writelines([str(x)+"\n" for x in Z.ravel()])
        z = Z.ravel()
        y = data.Y[:, data.targetnames.index(tfname)].ravel()
        rowidx = data.rowidx.ravel()
        mask = ~np.isnan(y)
        zscore4 = np.std(y[mask]) * 4 + np.mean(y[mask])
        util._update_metrics(args.finaldir,
                             tfname,
                             "test",
                             rowidx,
                             z,
                             y,
                             aucthresh=(zscore4, zscore4))
예제 #5
0
def save_test_predictions(args):
    # In quick mode, only load a subset of the data
    maxrows = 10000 if args.quick else None
    chunktargets, chunkcols = get_chunktargets(args)
    tedata = {}

    abs_auc = lambda x: max(x, 1 - x)

    if args.mode == "AB":

        invivo_data_dir = "../data/rnac/invivo"
        util.makepath(args.testdir + "/invivo")
        aucdump = open(args.testdir + "/invivo/deepbind_all.txt", "w")
        aucdump.write("Protein\tFile\tModel\tauc\tauc.mean\tauc.std\n")

        for invivo_id in invivo_ids.keys():
            rnac_ids, invivo_files = invivo_ids[invivo_id]
            rnac_names = ["RNCMPT%05d" % id for id in rnac_ids]
            rnac_names = [name for name in rnac_names if name in chunktargets]
            if not rnac_names:
                continue

            for invivo_file in invivo_files:
                if not os.path.exists(invivo_data_dir + "/" + invivo_file +
                                      ".txt"):
                    continue
                print "File %s using models %s..." % (invivo_file,
                                                      ",".join(rnac_names))

                # Convert the invivo sequence file into a format that predict.py expects,
                # i.e. with a Fold ID, Event ID, and Sequence column.
                data = util.datasource.fromtxt(
                    invivo_data_dir + "/" + invivo_file + ".txt[1]",
                    None,
                    invivo_data_dir + "/" + invivo_file + ".txt[0]",
                    sequencenames=["bound", "seq"],
                    targetnames=["bound"])

                # First generate predictions based on "trivial" features for this particular invivo file
                sequences = [row[0] for row in data.sequences]
                labels = data.targets
                predictions = {}
                predictions["len"] = np.asarray([len(s) for s in sequences],
                                                np.float32).reshape((-1, 1))
                predictions["A"] = np.asarray(
                    [s.upper().count("A") / float(len(s)) for s in sequences],
                    np.float32).reshape((-1, 1))
                predictions["C"] = np.asarray(
                    [s.upper().count("C") / float(len(s)) for s in sequences],
                    np.float32).reshape((-1, 1))
                predictions["G"] = np.asarray(
                    [s.upper().count("G") / float(len(s)) for s in sequences],
                    np.float32).reshape((-1, 1))
                predictions["T"] = np.asarray(
                    [(s.upper().count("U") + s.upper().count("T")) /
                     float(len(s))
                     for s in sequences], np.float32).reshape((-1, 1))
                predictions["GC"] = predictions["G"] + predictions["C"]

                # Next, generate predictions for each model on this same data file
                data.targetnames = rnac_names
                data.Y = np.repeat(data.Y, len(rnac_names), 1)
                data.Ymask = np.repeat(data.Ymask, len(rnac_names), 1)
                data.targets = data.Y.copy()
                #pred = util.predict(data, "../data/rnac/pfms", args.reportdir, scan=20)
                pred = util.predict(data,
                                    args.finaldir,
                                    args.reportdir,
                                    scan=20)
                predictions.update(pred)

                # Dump all performance stats to the file
                for pname in sorted(
                        predictions.keys(),
                        key=lambda x: x if "RNCMPT" not in x else " " + x
                ):  # Make sure RNCMPT items go first in each group, for readability of all_aucs.txt
                    # Calculate the AUC of this particular prediction, of this particular model,
                    # on this particular invivo file.
                    z = predictions[pname].ravel()
                    y = np.array(labels).ravel()
                    metrics = util.calc_metrics(z, y)

                    # Write out a row indicating performance of each model on this file
                    aucdump.write(
                        "%s\t%s\t%s\t%.4f\t%.4f\t%.6f\n" %
                        (invivo_id, invivo_file, pname, metrics["auc"],
                         metrics["auc.mean"], metrics["auc.std"]))

        aucdump.close()

        # Re-open the AUC dump file, pull out all experiments associated with a single protein,
        # and collect only the best of each type
        all_aucs = {}
        with open(args.testdir + "/invivo/deepbind_all.txt") as f:
            f.readline()  # discard header line
            for line in f:
                protein_name, invivo_file, model_name, auc, auc_mean, auc_std = line.rstrip(
                ).split("\t")
                model_suffix = ".deepbind" if "RNCMPT" in model_name else ""
                if model_name == "GC":
                    continue
                all_aucs.setdefault(protein_name, []).append({
                    "file":
                    invivo_file,
                    "model":
                    model_name + model_suffix,
                    "auc":
                    float(auc),
                    "auc.mean":
                    float(auc_mean),
                    "auc.std":
                    float(auc_std)
                })

        # Open the rnac_all.txt and pull out the AUCs for the PFMs from the RNAcompete paper
        # The rnac_all.txt file is in a different format, for legacy reasons
        head = rnac_all_aucs[0].rstrip().split("\t")
        lines = [line.rstrip().split("\t") for line in rnac_all_aucs[1:]]
        cols = {item: head.index(item) for item in head}
        for line in lines:
            protein_name = line[0]
            for scantype in ("max", "sum", "avg", "direct"):
                invivo_file = line[1].rsplit(".", 1)[0]
                model_name = line[2] + "." + scantype
                auc = "nan"
                auc_mean = line[cols[scantype]]
                auc_std = line[cols[scantype + ".std"]]
                if protein_name in all_aucs:
                    all_aucs[protein_name].append({
                        "file": invivo_file,
                        "model": model_name + ".rnac",
                        "auc": float(auc),
                        "auc.mean": float(auc_mean),
                        "auc.std": float(auc_std)
                    })

        for scantype in ("direct", "max", "sum", "avg"):
            for modeltype in ("deepbind", "rnac"):
                with open(
                        args.testdir + "/invivo/%s_best_%s.txt" %
                    (modeltype, scantype), "w") as f:
                    f.write("Protein\tFile")
                    f.write("\t" + "\t".join([
                        "deeputil.model", "deeputil.auc", "deeputil.auc.mean",
                        "deeputil.auc.std"
                    ]))
                    f.write("\t" + "\t".join([
                        "rnac.model", "rnac.auc", "rnac.auc.mean",
                        "rnac.auc.std"
                    ]))
                    f.write("\t" + "\t".join([
                        "trivial.model", "trivial.auc", "trivial.auc.mean",
                        "trivial.auc.std"
                    ]))
                    f.write("\n")
                    for protein in sorted(all_aucs.keys()):
                        trials = all_aucs[protein]
                        # For this particular protein, first find the best row based on non-trivial models,
                        # e.g. among RNCMPTXXXXX.direct and RNCMPTXXXXX.max, while ignoring "A" and "len"
                        best = None
                        for trial in trials:
                            if trial["model"].endswith(scantype + "." +
                                                       modeltype):
                                if best is None or trial["auc.mean"] > best[
                                        "auc.mean"]:
                                    best = trial

                        # Also find the best trivial feature associated with this file
                        best_trivial = None
                        for trial in trials:
                            if trial["file"] == best[
                                    "file"] and "RNCMPT" not in trial["model"]:
                                if best_trivial is None or abs_auc(
                                        trial["auc.mean"]) > abs_auc(
                                            best_trivial["auc.mean"]):
                                    best_trivial = trial

                        # Also find the best competing feature associated with this file
                        best_other = None
                        if modeltype == "rnac":
                            other_scantype = "max"
                        elif modeltype == "deepbind":
                            other_scantype = "avg"
                        else:
                            other_scantype = scantype

                        for trial in trials:
                            if trial["file"] == best["file"] and (
                                    "." + other_scantype +
                                    ".") in trial["model"] and not trial[
                                        "model"].endswith(other_scantype +
                                                          "." + modeltype):
                                if best_other is None or trial[
                                        "auc.mean"] > best_other["auc.mean"]:
                                    best_other = trial

                        best_deepbind = best if modeltype == "deepbind" else best_other
                        best_rnac = best if modeltype == "rnac" else best_other

                        f.write("%s\t%s\t" % (protein, best["file"]))
                        f.write("%s\t%.4f\t%.4f\t%.6f\t" %
                                (best_deepbind["model"], best_deepbind["auc"],
                                 best_deepbind["auc.mean"],
                                 best_deepbind["auc.std"]))
                        f.write("%s\t%.4f\t%.4f\t%.6f\t" %
                                (best_rnac["model"], best_rnac["auc"],
                                 best_rnac["auc.mean"], best_rnac["auc.std"]))
                        f.write("%s\t%.4f\t%.4f\t%.6f\n" %
                                (best_trivial["model"],
                                 abs_auc(best_trivial["auc"]),
                                 abs_auc(best_trivial["auc.mean"]),
                                 best_trivial["auc.std"]))

    elif args.mode in ("A", "B"):
        # Generate predictions on PBM probes from test set (i.e. if mode="A", the training set was "A" so the test set is "B")
        print "Loading PBM data...",
        testfold = "A" if args.mode == "B" else "B"
        pbmdata = util.datasource.fromtxt("../data/rnac/sequences.tsv.gz",
                                          None,
                                          "../data/rnac/targets.tsv.gz",
                                          targetcols=[0],
                                          foldfilter=testfold,
                                          maxrows=maxrows)
        print "done"

        util.makepath(args.testdir + "/pbm")
        for targetname in chunktargets:
            print targetname
            pbmdata.targetnames = [targetname]
            predictions = util.predict(pbmdata,
                                       args.finaldir,
                                       args.reportdir,
                                       include=[targetname])
            Z = predictions[targetname].ravel()
            with gzip.open(
                    args.testdir + "/pbm/%s-DB-%s.txt.gz" %
                (targetname, testfold), "w") as f:
                for z in Z:
                    f.write("%.4f\n" % z)

    else:
        quit("Unrecognized mode")