def run():
    """Fit iONMF on the full dataset, report training accuracy and plot fits.

    The model is fitted on everything returned by ``load_data()``, then asked
    to predict the three class entries from the two expression matrices
    alone. The predicted class of a row is the class entry with the largest
    predicted value. Finally each expression matrix is shown next to its
    approximation ``model.coef_.dot(model.basis_[key])``.

    Takes no arguments and returns nothing; the accuracy is printed and the
    figure is displayed with ``plt.show()``.
    """
    class_keys = ("Class_0", "Class_1", "Class_2")

    datadict = load_data()
    model = iONMF(rank=5, max_iter=100, alpha=0.0)

    # Fit all training data
    model.fit(datadict)

    # Make predictions about class on all training data
    # using only expression data ...
    testdict = dict()
    testdict["Pos_diff_expr"] = datadict["Pos_diff_expr"]
    testdict["Neg_diff_expr"] = datadict["Neg_diff_expr"]
    rdict = model.predict(testdict)

    # ... and calculate training error.
    # Assumes each Class_* entry marks its member rows with non-zero
    # values -- TODO confirm against load_data().
    true_y = np.zeros((len(datadict["Class_0"]), 1))
    for label, key in enumerate(class_keys):
        true_y[np.where(datadict[key])] = label

    # Row-wise argmax over the three predicted class entries.
    predictions = np.array([
        np.argmax([rdict[key][i] for key in class_keys])
        for i in range(len(true_y))
    ])
    acc = np.sum(predictions == true_y.ravel()) / float(len(true_y))

    # One pre-formatted argument, so the output is the same under the
    # Python 2 print statement and the Python 3 print function.
    print("%s %s" % ("Training accuracy: ", acc))

    # Plot matrices: original on the left, approximation on the right
    n_rows = len(testdict)
    plt.figure(figsize=(12, 12))
    for ki, ky in enumerate(testdict):
        plt.subplot(n_rows, 2, 2 * ki + 1)
        plt.title(ky)
        plt.imshow(datadict[ky])

        plt.subplot(n_rows, 2, 2 * ki + 2)
        plt.title(ky + " (approx.)")
        plt.imshow(model.coef_.dot(model.basis_[ky]))
    plt.show()
def run():
    """Train iONMF on all data, print the training accuracy, plot the fits.

    Steps, in order:

    1. Load the data dictionary with ``load_data()`` and fit an
       ``iONMF(rank=5, max_iter=100, alpha=0.0)`` model on all of it.
    2. Predict the ``Class_0``/``Class_1``/``Class_2`` entries from the
       ``Pos_diff_expr`` and ``Neg_diff_expr`` matrices only.
    3. Take, per row, the class with the largest predicted value and compare
       it with the true label to obtain the training accuracy.
    4. Show each expression matrix beside its reconstruction
       ``model.coef_.dot(model.basis_[key])``.

    Returns nothing; results are printed and shown via ``plt.show()``.
    """
    datadict = load_data()
    model = iONMF(rank=5, max_iter=100, alpha=0.0)

    # Fit all training data
    model.fit(datadict)

    # Make predictions about class on all training data
    # using only expression data ...
    testdict = dict()
    testdict["Pos_diff_expr"] = datadict["Pos_diff_expr"]
    testdict["Neg_diff_expr"] = datadict["Neg_diff_expr"]
    rdict = model.predict(testdict)

    # ... and calculate training error.
    # Presumably every Class_* entry is an indicator whose non-zero rows
    # belong to that class -- verify against load_data().
    labels = ("Class_0", "Class_1", "Class_2")
    n_samples = len(datadict["Class_0"])
    true_y = np.zeros((n_samples, 1))
    for class_index, class_key in enumerate(labels):
        true_y[np.where(datadict[class_key])] = class_index

    # Predicted class = position of the largest predicted class value.
    predictions = np.array([
        np.argmax([rdict[class_key][i] for class_key in labels])
        for i in range(n_samples)
    ])
    acc = np.sum(predictions == true_y.ravel()) / float(n_samples)

    # A single formatted argument keeps the output identical whether print
    # is the Python 2 statement or the Python 3 function.
    print("%s %s" % ("Training accuracy: ", acc))

    # Plot matrices: left column is the data, right column the approximation
    plt.figure(figsize=(12, 12))
    for ki, ky in enumerate(testdict):
        left, right = 2 * ki + 1, 2 * ki + 2

        plt.subplot(len(testdict), 2, left)
        plt.title(ky)
        plt.imshow(datadict[ky])

        plt.subplot(len(testdict), 2, right)
        plt.title(ky + " (approx.)")
        plt.imshow(model.coef_.dot(model.basis_[ky]))
    plt.show()
def run():
    """Fit iONMF on one protein's CLIP sample, report test AUC, plot modules.

    The protein folder name is read from ``sys.argv[1]``. Training data and
    column labels are loaded from that protein's ``training_sample_0``
    directory, an ``iONMF(rank=5, max_iter=100, alpha=10.0)`` model is fitted
    on the training data, and the ``"Y"`` entry of ``test_sample_0`` is
    predicted from the remaining test entries. The AUC of that prediction is
    printed.

    The figure has one row per module (``model.rank`` rows) and three
    columns:

    * column 0 -- the ``X_RG`` basis row, one line per distinct region label;
    * column 1 -- the ``X_RNA`` basis row;
    * column 2 -- mean of the module's coefficient column over negative and
      positive training rows, with the standard error of the mean as the
      upper error bar.

    Returns nothing. Raises ``IndexError`` when no protein argument is given.
    """
    # Select example protein folder from the dataset
    protein = sys.argv[1]

    # Load training data and column labels
    training_dir = "../datasets/clip/%s/5000/training_sample_0" % protein
    training_data = load_data(training_dir, go=False, kmer=False)
    training_labels = load_labels(training_dir, go=False, kmer=False)

    model = iONMF(rank=5, max_iter=100, alpha=10.0)

    # Fit all training data
    model.fit(training_data)

    # Make predictions about class on the held-out data;
    # the class entry is removed from the dictionary before predicting.
    test_data = load_data("../datasets/clip/%s/5000/test_sample_0" % protein,
                          go=False, kmer=False)
    true_y = test_data["Y"].copy()
    del test_data["Y"]
    results = model.predict(test_data)

    # Evaluate prediction on holdout test set
    predictions = results["Y"]
    auc = roc_auc_score(true_y, predictions)
    # One pre-formatted argument, so the output is the same under the
    # Python 2 print statement and the Python 3 print function.
    print("%s %s" % ("Test AUC: ", auc))

    # Draw low-dimensional components for Region types (H_RG)
    # and RNA structure (H_RNA)
    # with mean values in coefficient matrix W for positive (+) and
    # negative (-) positions
    _, axes = plt.subplots(model.rank, 3, sharex='col', figsize=(15, 8))
    H_RNA = model.basis_["X_RNA"]
    H_RG = model.basis_["X_RG"]

    # Column indices of every region label. These do not depend on the
    # module, so they are computed once. An explicit list is used instead of
    # np.where(map(...)), which relies on map() returning a list.
    rg_labels = training_labels["X_RG"]
    labelset = sorted(set(rg_labels))
    label_columns = [
        (label, [i for i, name in enumerate(rg_labels) if name == label])
        for label in labelset
    ]

    positives = training_data["Y"].nonzero()[0]
    negatives = (training_data["Y"] == 0).nonzero()[0]
    coef_positives = model.coef_[positives, :]
    coef_negatives = model.coef_[negatives, :]

    for k in range(model.rank):
        # Values in the coefficient (W) matrix: mean and standard error
        w_positives = coef_positives[:, k].mean()
        w_negatives = coef_negatives[:, k].mean()
        e_positives = coef_positives[:, k].std() / np.sqrt(len(positives))
        e_negatives = coef_negatives[:, k].std() / np.sqrt(len(negatives))

        # Each bar carries the error of its own group (the two were
        # swapped before). yerr is [lower, upper]; only the upper is drawn.
        axes[k, 2].bar([0], [w_negatives], yerr=[(0,), (e_negatives,)],
                       color="blue", align="center")
        axes[k, 2].bar([1], [w_positives], yerr=[(0,), (e_positives,)],
                       color="green", align="center")

        # Plot RNA structure
        axes[k, 1].plot(H_RNA[k, :].ravel(),)

        # Plot region types
        for label, indices in label_columns:
            axes[k, 0].plot(H_RG[k, indices].ravel(), label=label)
        axes[k, 0].set_ylabel("Module %d" % k)

    # Titles and legend go on the top row, tick labels on the bottom row.
    j = model.rank - 1
    axes[0, 0].legend(bbox_to_anchor=(0., 1.04, 1., .102), loc=3,
                      ncol=3, mode="expand", borderaxespad=0.)
    axes[0, 1].set_title("Double-stranded RNA")
    axes[0, 2].set_title("Mean values in the coefficient matrix (W)")

    # Both position columns share the same five ticks, relabelled from
    # -50 to 50 around the cross-link site.
    for col in (0, 1):
        axes[j, col].set_xticks(np.linspace(0, H_RNA.shape[1], 5))
        axes[j, col].set_xticklabels([-50, -25, 0, 25, 50])
        axes[j, col].set_xlabel("Position relative to cross-link site")

    axes[j, 2].set_xticks([0, 1])
    axes[j, 2].set_xticklabels(["-", "+"])
    plt.show()