def plot_sheer_cabestro(sheer_data="sheer6016.npy"):
    """Plot one count-normalised profile line per class on a single axis.

    Loads the array stored at ``SHEER_PATH + sheer_data``. For every entry of
    ``CLASS_NAMES`` the class slice is summed over the feature columns from
    index 21 onwards, divided by the hard-coded count of that class and drawn
    as a line. The figure is shown but not saved.

    Args:
        sheer_data: File name, relative to ``SHEER_PATH``, of the ``.npy``
            array to load.
    """
    # Hard-coded per-class counts, indexed by position in ``ssConvertString``.
    # NOTE(review): one entry is 0, so the division below produces nan/inf
    # for that class (NumPy warns instead of raising) -- confirm intended.
    amounts = [164981, 1242, 236833, 17858, 0, 405754, 33990, 84479]
    eq_sheer = np.load(SHEER_PATH + sheer_data)
    fig, ax = plt.subplots(figsize=(7, 2.5))
    for target_class in CLASS_NAMES:
        class_idx = ssConvertString.find(target_class)
        tot_pssm = np.sum(eq_sheer[class_idx, :, 21:], axis=1)
        ax.plot(tot_pssm / amounts[class_idx],
                marker='.',
                color=CLASS_COLOURS[target_class],
                label=target_class)
    # One tick per window offset; derived from WINDOW so the tick count always
    # matches the number of labels (the former literal 19 assumed WINDOW == 9).
    ax.xaxis.set(ticks=range(2 * WINDOW + 1),
                 ticklabels=range(-WINDOW, WINDOW + 1))
    ax.margins(0)
    ax.legend()
    plt.tight_layout()
    fig.show()
def clustering(args):
    """Cluster the stored points of one class and save labels and mask.

    Loads ``SHEER_PATH + "points<num_seqs>.npy"``, keeps the first
    ``args.num_points`` rows for the class ``args.label``, drops rows that are
    numerically all zero, fits the requested model and writes two files under
    ``SHEER_PATH``: ``<clustering><label><num_points>.npy`` (cluster labels)
    and ``mask<label><num_points>.npy`` (boolean mask of the kept rows).

    Args:
        args: Object with the attributes ``num_seqs``, ``num_points``,
            ``label``, and ``clustering`` (``"agglomerative"`` or
            ``"DBSCAN"``).

    Raises:
        ValueError: If ``args.label`` is not found in ``ssConvertString`` or
            ``args.clustering`` is not a recognised algorithm name.
    """
    class_idx = ssConvertString.find(args.label)
    if class_idx < 0:
        # ``find`` returns -1 for an unknown label, which would silently
        # select the last class when used as an index.
        raise ValueError("Label '{}' not recognized".format(args.label))

    # Build the model first so an unknown algorithm name fails before any
    # data is loaded.
    if args.clustering == "agglomerative":
        # NOTE(review): newer scikit-learn releases renamed ``affinity`` to
        # ``metric`` -- verify against the installed version.
        model = AgglomerativeClustering(n_clusters=4,
                                        linkage="average",
                                        affinity="cosine")
    elif args.clustering == "DBSCAN":
        # The "DBSCAN" option is served by HDBSCAN.
        import hdbscan
        model = hdbscan.HDBSCAN(min_cluster_size=10)
    else:
        raise ValueError("Clustering algorithm '{:s}' not recognized".format(
            args.clustering))

    points = np.load(SHEER_PATH + "points" + str(args.num_seqs) + ".npy")
    points = points[:args.num_points, class_idx]
    print("Original points shape:", points.shape)

    # True for every row that is not (numerically) the zero vector; same
    # tolerance as ``np.allclose(row, 0)``.
    trailing_axes = tuple(range(1, points.ndim))
    mask = ~np.all(np.isclose(points, 0), axis=trailing_axes)
    points = points[mask]
    print("Filtered points shape (no zero vectors):", points.shape)

    model.fit(points)

    file_end = "{:s}{:d}.npy".format(args.label, args.num_points)
    np.save(SHEER_PATH + args.clustering + file_end, model.labels_)
    np.save(SHEER_PATH + 'mask' + file_end, mask)
    print("Clustering completed. Labels saved in " + SHEER_PATH +
          args.clustering + file_end)
    print("Labels:")
    print(pd.Series(model.labels_).value_counts())
def plot_sheer_class_aa():
    """Overlay one profile line per class and save ``sheer_class_aa.eps``.

    Loads ``SHEER_PATH + "sheer6016.npy"``, sums every class slice over the
    feature columns from index 21 onwards and draws each class on its own
    twin y-axis so every line is scaled independently. The legend shows the
    kurtosis of each profile.

    NOTE(review): the visible source contains a later definition with the
    same name taking a ``sheer_data`` argument; if both live in one module,
    the later one replaces this function.
    """
    abs_sheer = np.load(SHEER_PATH + "sheer6016.npy")
    fig2, ax21 = plt.subplots(figsize=(9, 3))
    legend_names = list(CLASS_NAMES)
    handles = []
    kurtvec = []
    for i, target_class in enumerate(legend_names):
        class_idx = ssConvertString.find(target_class)
        tot_pssm = np.sum(abs_sheer[class_idx, :, 21:], axis=1)
        kurtvec.append(kurtosis(tot_pssm))
        if i == 0:
            # The first class is drawn on the base axis ...
            ax22 = ax21
            ax22.yaxis.set(ticks=[0])
        else:
            # ... every other class gets its own y scale.
            ax22 = ax21.twinx()
            ax22.yaxis.set(ticks=[])
        ax22.plot(tot_pssm, marker='.', color=CLASS_COLOURS[target_class])
        ax22.set_ylim(bottom=0)
        # Proxy artist: the lines live on different axes, so the legend is
        # assembled by hand.
        handles.append(
            mlines.Line2D([], [],
                          color=CLASS_COLOURS[target_class],
                          marker='.',
                          label=target_class))
    # Tick count derived from WINDOW so it always matches the label count
    # (the former literal 19 assumed WINDOW == 9).
    ax21.xaxis.set(ticks=range(2 * WINDOW + 1),
                   ticklabels=range(-WINDOW, WINDOW + 1))
    ax21.margins(0)
    ax21.legend(handles, [
        name + ": {:.1f}".format(kurt)
        for name, kurt in zip(legend_names, kurtvec)
    ],
                loc='upper right')
    ax21.yaxis.set(ticks=[])
    plt.tight_layout()
    fig2.show()
    fig2.savefig(FIGURES_PATH + "sheer_class_aa.eps")
def plot_sheer_aa():
    """Plot per-class profiles for the G, K and M columns, one panel each.

    Takes the array returned by ``collect_sheer()`` scaled by 1/1000 and, for
    each of the letters ``G``, ``K`` and ``M`` found in ``pssmString_jurtz``
    (in that string's order), draws one line per class in ``CLASS_NAMES``.
    The figure is saved to ``FIGURES_PATH + "class_agg_aa.eps"`` and shown.
    """
    # Out-of-place division: leaves the array owned by ``collect_sheer``
    # untouched and also works if it has an integer dtype.
    totals = collect_sheer() / 1000
    fig, axes = plt.subplots(1, 3, figsize=(13, 2.8))
    selected = [aa for aa in pssmString_jurtz if aa in ("G", "K", "M")]
    for i, aa in enumerate(selected):
        ax1 = axes[i]
        # Columns from 21 onwards are indexed like ``pssmString_jurtz``.
        aa_column = pssmString_jurtz.find(aa) + 21
        for label in CLASS_NAMES:
            ax1.plot(totals[ssConvertString.find(label), :, aa_column],
                     label=label,
                     marker='.',
                     color=CLASS_COLOURS[label])
        ax1.legend(loc='right')
        ax1.set(title="Pssm-values for " + aa)
        # Tick count derived from WINDOW so it always matches the label
        # count (the former literal 19 assumed WINDOW == 9).
        ax1.xaxis.set(ticks=range(2 * WINDOW + 1),
                      ticklabels=range(-WINDOW, WINDOW + 1))
        ax1.margins(0)
    plt.tight_layout()
    fig.savefig(FIGURES_PATH + "class_agg_aa.eps")
    plt.show()
def plot_lines():
    """Draw one line per class for a sequence stretch and save it as EPS.

    Relies on names that are not parameters (``tot_sal``, ``preds``,
    ``labels``, ``aas``, ``ini_pos``, ``final_pos``) -- presumably defined at
    module level or in an enclosing scope; verify against the full file.
    The bottom axis is labelled with prediction / label / position, each tick
    coloured by its predicted class; a twin top axis carries ``aas``.
    The figure goes to ``FIGURES_PATH + "sample_8classes.eps"``.
    """
    fig, ax = plt.subplots(figsize=(10, 2.8))
    for class_name in CLASS_NAMES:
        row = ssConvertString.find(class_name)
        # Trim WINDOW positions from both ends of the row.
        ax.plot(tot_sal[row, WINDOW:-WINDOW].T,
                marker='.',
                label=class_name,
                color=CLASS_COLOURS[class_name])
    ax.legend(loc="upper left")

    # Three-line tick label: prediction, true label, absolute position.
    bottom_labels = []
    for offset, true_label in enumerate(labels):
        bottom_labels.append(preds[offset] + "\n" + true_label + "\n" +
                             str(ini_pos + offset))
    ax.xaxis.set(ticks=range(final_pos - ini_pos + 1),
                 ticklabels=bottom_labels)

    # Colour every bottom tick label by its predicted class.
    tick_colours = [CLASS_COLOURS[pred] for pred in preds]
    for tick_colour, major_tick in zip(tick_colours,
                                       ax.xaxis.get_major_ticks()):
        major_tick.label1.set_color(tick_colour)
    ax.margins(0)

    top_axis = ax.twiny()
    top_axis.xaxis.set(ticks=range(final_pos - ini_pos + 1), ticklabels=aas)

    plt.tight_layout()
    fig.savefig(FIGURES_PATH + "sample_8classes.eps", format='eps')
    plt.show()
def scrap():
    """Delete badly shaped saliency pickles and report what is left.

    For every (sequence, class) pair that ``probe('saliencies')`` reports as
    present, the pickle is loaded from ``SALIENCIES_PATH`` (first as
    ``saliencies<seq><label>.pkl``, then with the sequence number
    space-padded to width 4). A file is removed unless its array is
    3-dimensional with equal first and second dimensions. Progress and final
    counts are printed.

    NOTE(review): files are read with ``pickle.load``; only run this on
    trusted data.
    """
    exists = probe('saliencies')
    deleted = 0
    remaining = 0

    def load_saliency(seq, label):
        """Return ``(file name, array)`` trying both naming schemes."""
        fname = "saliencies" + str(seq) + label + ".pkl"
        try:
            with open(fname, "rb") as f:
                return fname, np.array(pickle.load(f))
        except OSError:
            fname = "saliencies{:4d}{:s}.pkl".format(seq, label)
            with open(fname, "rb") as f:
                return fname, np.array(pickle.load(f))

    origin = os.getcwd()
    os.chdir(SALIENCIES_PATH)
    try:
        for seq in range(len(exists)):
            for label in ssConvertString:
                if not exists[seq, ssConvertString.find(label)]:
                    continue
                try:
                    fname, saliency = load_saliency(seq, label)
                    well_formed = (saliency.ndim == 3 and
                                   saliency.shape[0] == saliency.shape[1])
                    if not well_formed:
                        os.remove(fname)
                        print("File " + fname + " deleted")
                        deleted += 1
                except OSError:
                    # Unreadable file, or the removal itself failed.
                    well_formed = False
                if well_formed:
                    remaining += 1
                else:
                    print(str(seq) + label + " Not found")
    finally:
        # Restore the working directory even if loading raises something
        # other than OSError (e.g. a truncated pickle).
        os.chdir(origin)
    print(str(deleted) + " saliencies deleted")
    # Count the files that actually passed the check; the former
    # ``NUM_SEQS * 8 - failures`` also counted files that never existed.
    print(str(remaining) + " saliencies remaining")
def compute_complex_saliency(X_batch, mask_batch, batch_seq, inference, sym_x,
                             batch, label):
    """Compute the gradients of one class for one sequence and pickle them.

    ``inference`` is sliced for sequence ``batch_seq`` and the class
    ``label`` and handed to ``compute_single_saliency``; the result is
    cropped to the unmasked length and dumped (pickle protocol 2) to
    ``PATH_SALIENCIES + "saliencies{:4d}{:s}.pkl"``, numbered
    ``BATCH_SIZE * batch + batch_seq``. If the full-length computation
    raises, it is retried in two halves along the first axis and the halves
    are concatenated.

    Args:
        X_batch: Batch input forwarded to ``compute_single_saliency``.
        mask_batch: Mask whose row ``batch_seq`` sums to the sequence length.
        batch_seq: Index of the sequence inside the batch.
        inference: Indexable model output, indexed as
            ``[batch_seq, positions, class]``.
        sym_x: Symbolic input forwarded to ``compute_single_saliency``.
        batch: Batch number, used only for the output file name.
        label: Class letter; its position in ``ssConvertString`` selects the
            output column.

    Raises:
        Exception: Whatever a half-size computation raised, after printing it.
        AssertionError: If the concatenated halves are not square.
    """
    seq_len = int(np.sum(mask_batch[batch_seq]))
    class_idx = ssConvertString.find(label)

    def saliency_rows(start, stop):
        """Gradients for output positions ``start:stop``, cropped to seq_len."""
        sym_y = inference[batch_seq, start:stop, class_idx]
        rows = compute_single_saliency(X_batch=X_batch,
                                       sym_x=sym_x,
                                       sym_y=sym_y)
        return rows[start:stop, batch_seq, :seq_len]

    try:
        grads = saliency_rows(None, seq_len)
    except Exception as err:  # presumably GPU out of memory
        print(err)
        half = seq_len // 2
        # A failing half used to be swallowed and surfaced later as an
        # unrelated UnboundLocalError; report the real error and re-raise.
        try:  # FIRST HALF
            grads1 = saliency_rows(None, half)
        except Exception as half_err:
            print(half_err)
            print("XXXXXXX Is it in the first part?")
            raise
        try:  # SECOND HALF
            grads2 = saliency_rows(half, seq_len)
        except Exception as half_err:
            print(half_err)
            print("XXXXXXXX Or in the second?")
            raise
        grads = np.concatenate((grads1, grads2), axis=0)
        assert grads.shape[0] == grads.shape[
            1], "{:d} != {:d} for sequence with length {:d}. Concatenation of grad1 with shape {:s} and grads2 with shape {:s}, gives grads with shape {:s}.".format(
                grads.shape[0], grads.shape[1], seq_len, str(grads1.shape),
                str(grads2.shape), str(grads.shape))

    fname = "saliencies{:4d}{:s}.pkl".format(BATCH_SIZE * batch + batch_seq,
                                             label)
    # NOTE(review): other functions in this file use ``SALIENCIES_PATH``;
    # confirm ``PATH_SALIENCIES`` is defined and points to the same folder.
    with open(PATH_SALIENCIES + fname, 'wb') as f:
        pickle.dump(grads, f, protocol=2)
def probe(folder='saliencies'):
    """Report which (sequence, class) files exist in ``SALIENCIES_PATH``.

    Every entry whose name starts with ``folder`` is matched against
    "<digits><non-digit>": the digits are the sequence number and the
    following character is looked up in ``ssConvertString`` to get the class
    column. Entries that do not match, whose character is not a known class,
    or whose number is not below ``NUM_SEQS`` are ignored.

    Args:
        folder: Name prefix of the entries to inspect.

    Returns:
        Float array of shape ``(NUM_SEQS, 8)`` holding 1 where a file was
        found and 0 elsewhere.

    Raises:
        AssertionError: If two entries map to the same (sequence, class).
    """
    name_pattern = re.compile(r'(\d+)(\D)')
    exists = np.zeros((NUM_SEQS, 8))

    origin = os.getcwd()
    os.chdir(SALIENCIES_PATH)
    try:
        files = glob.glob(folder + '*')
    finally:
        # Always return to the caller's working directory.
        os.chdir(origin)

    for el in files:
        match = name_pattern.search(el)
        if match is None:
            # No "<digits><char>" in the name: not a per-sequence file.
            continue
        num = int(match.group(1))
        label = ssConvertString.find(match.group(2))
        if label < 0 or num >= NUM_SEQS:
            # ``find`` returns -1 for an unknown letter; indexing with it
            # would wrongly mark the last class column.
            continue
        assert exists[num, label] == 0, (
            "duplicate entry for sequence {:d}: {}".format(num, el))
        exists[num, label] += 1
    return exists
def plot_sheer_class_all():
    """Render one heat map per class on a 3x3 grid and save it as EPS.

    For each character of ``ssConvertString`` the matching slice of
    ``collect_sheer()`` is cut to the feature columns from index 21 onwards,
    transposed and shown with a diverging colour map whose limits are
    symmetric around zero. The figure is written to
    ``FIGURES_PATH + "class_agg_class_all.eps"`` and shown.
    """
    totals = collect_sheer()
    fig, axes = plt.subplots(3, 3, figsize=(6 * 3 / 2, 6.65 * 3 / 2))
    window_ticks = range(2 * WINDOW + 1)
    window_labels = range(-WINDOW, WINDOW + 1)
    for panel, class_letter in enumerate(ssConvertString):
        grid_row, grid_col = divmod(panel, 3)
        ax = axes[grid_row][grid_col]
        class_block = totals[ssConvertString.find(class_letter)]
        pssm_part = class_block[..., 21:]
        # Symmetric limits keep zero at the centre of the colour map.
        limit = np.max(abs(pssm_part))
        ax.imshow(pssm_part.T, cmap='PiYG', vmin=-limit, vmax=limit)
        ax.set(title="Class " + class_letter)
        ax.yaxis.set(ticks=range(len(pssmString_jurtz)),
                     ticklabels=pssmString_jurtz)
        ax.xaxis.set(ticks=window_ticks, ticklabels=window_labels)
        ax.margins(0)
    plt.tight_layout()
    fig.savefig(FIGURES_PATH + "class_agg_class_all.eps", format='eps')
    plt.show()
def repair_saliencies(args):
    """Recompute every saliency file that ``probe`` reports as missing.

    Rebuilds the model named in the metadata dump, restores its parameters
    and walks over all batches, forwards (``args.dir == 'f'``) or backwards
    (``'b'``). For each sequence with fewer than 8 files, the missing classes
    are computed with ``compute_complex_saliency``.

    Args:
        args: Object with a ``dir`` attribute, ``'f'`` or ``'b'``.

    Raises:
        ValueError: If ``args.dir`` is neither ``'f'`` nor ``'b'``.
    """
    # Validate first so a bad direction fails before the model is built.
    if args.dir not in ('f', 'b'):
        raise ValueError("args.dir is " + str(args.dir))

    dater = Jurtz_Data()
    metadata_path = "dump_pureConv-20180804-010835-47.pkl"
    # NOTE(review): recent NumPy versions refuse pickled files unless
    # ``allow_pickle=True`` is passed -- verify against the installed version.
    metadata = np.load(metadata_path)
    config_name = metadata['config_name']
    config = importlib.import_module(str(config_name))
    print("Using configurations: '%s'" % config_name)
    _, l_out = config.build_model()
    sym_x = T.tensor3()
    inference = nn.layers.get_output(l_out, sym_x, deterministic=True)
    nn.layers.set_all_param_values(l_out, metadata['param_values'])

    batch_range = range(NUM_SEQS // BATCH_SIZE)
    if args.dir == 'b':
        batch_range = reversed(batch_range)

    for batch in batch_range:
        # Re-probed for every batch, so files that appeared in the meantime
        # are not recomputed.
        exists = probe('saliencies')
        for batch_seq in range(BATCH_SIZE):
            seq = batch * BATCH_SIZE + batch_seq
            if int(np.sum(exists[seq])) == 8:
                continue
            X_batch, mask_batch = dater.get_batch_from_seq(seq)
            for label in ssConvertString:
                if exists[seq, ssConvertString.find(label)]:
                    continue
                print("Repairing sequence {:d} and batch {:d} for label {:s}"
                      .format(seq, batch, label))
                compute_complex_saliency(X_batch=X_batch,
                                         mask_batch=mask_batch,
                                         batch=batch,
                                         label=label,
                                         batch_seq=batch_seq,
                                         inference=inference,
                                         sym_x=sym_x)
def plot_outliers():
    """Scatter per-sequence accuracy against length for train and test.

    Two figures are produced, each with a train and a test panel: in
    ``per_seq_acc.eps`` every sequence is coloured by its fractions of
    E/B (red), H/G/I (green) and L/S/T (blue) labels; in
    ``per_seq_acc_2.eps`` red is the min-max scaled fraction of H/E/L labels
    and blue its complement. Both are saved under ``FIGURES_PATH``.

    NOTE(review): the visible source contains a later definition with the
    same name; if both live in one module, the later one replaces this
    function.
    """
    dater = Jurtz_Data()
    _, labels, mask = dater.get_all_data()
    split_value = dater.split_value
    lengths_train = np.sum(mask[:split_value], axis=1)
    lengths_test = np.sum(mask[split_value:], axis=1)
    predictions = dater.get_all_predictions()

    def calculate_seq_accuracy(predictions, labels):
        """Fraction of correctly predicted positions for every sequence."""
        num_seq = len(mask)
        seq_len = predictions.shape[1]
        seq_acc = np.zeros(num_seq)
        for seq in range(num_seq):
            for pos in range(seq_len):
                if not mask[seq, pos]:
                    # Stop at the first masked-out position.
                    break
                if labels[seq, pos] == np.argmax(predictions[seq, pos]):
                    seq_acc[seq] += 1
            seq_acc[seq] /= np.sum(mask[seq])
        return seq_acc

    # The helper used to be declared with a single parameter but called with
    # two, which raised a TypeError.
    seq_acc = calculate_seq_accuracy(predictions, labels)
    seq_acc_train = seq_acc[:split_value]
    seq_acc_test = seq_acc[split_value:]

    def class_ids(letters):
        """Positions of the given class letters in ``ssConvertString``."""
        return [ssConvertString.find(letter) for letter in letters]

    def group_fractions(groups_and_columns):
        """Per-sequence fraction of labels of each group, in RGB columns.

        A label is counted for the first group that contains it.
        """
        fractions = np.zeros((len(labels), 3))
        for seq in range(len(labels)):
            for label in labels[seq, :int(np.sum(mask[seq]))]:
                for group, column in groups_and_columns:
                    if label in group:
                        fractions[seq, column] += 1
                        break
            fractions[seq] = fractions[seq] / np.sum(mask[seq])
        return fractions

    colors = group_fractions(((class_ids('EB'), 0), (class_ids('HGI'), 1),
                              (class_ids('LST'), 2)))

    # Only the red channel has to be counted: blue is defined below as its
    # complement and green stays zero.
    colors2 = group_fractions(((class_ids('HEL'), 0),))
    colors2[:, 0] -= np.min(colors2[:, 0])
    colors2[:, 0] /= np.max(colors2[:, 0])
    colors2[:, 2] = 1 - colors2[:, 0]

    def print_length_vs_acc(lengths, seqs, ax, label, colors):
        """Draw one coloured marker per sequence plus the mean-accuracy line."""
        seq_len = 700  # length of the horizontal mean line
        for seq in range(len(seqs)):
            ax.plot(lengths[seq],
                    seqs[seq],
                    marker="X",
                    linewidth=0,
                    label="sequences",
                    color=colors[seq])
        ax.plot(np.mean(seqs) * np.ones(seq_len), label="mean", color='orange')
        ax.set(title='Per-sequence accuracy (' + label +
               '), mean: {:.2f}'.format(np.mean(seqs)),
               ylabel="accuracy",
               xlabel="sequence length",
               ylim=[0, 1])
        mean_line = mlines.Line2D([], [],
                                  color='orange',
                                  label='mean accuracy')
        ax.legend(handles=[mean_line])

    fig, ax = plt.subplots(1, 2, figsize=(15, 4))
    print_length_vs_acc(lengths_train, seq_acc_train, ax[0], 'train',
                        colors[:split_value])
    print_length_vs_acc(lengths_test, seq_acc_test, ax[1], 'test',
                        colors[split_value:])
    plt.tight_layout()
    fig.savefig(FIGURES_PATH + 'per_seq_acc.eps', format='eps')
    fig.show()

    fig, ax = plt.subplots(1, 2, figsize=(15, 4))
    print_length_vs_acc(lengths_train, seq_acc_train, ax[0], 'train',
                        colors2[:split_value])
    print_length_vs_acc(lengths_test, seq_acc_test, ax[1], 'test',
                        colors2[split_value:])
    plt.tight_layout()
    fig.savefig(FIGURES_PATH + 'per_seq_acc_2.eps', format='eps')
    fig.show()
def plot_sheer_class_aa(sheer_data="sheer6016.npy"):
    """Plot the class profiles in three stacked panels and save a PDF.

    Loads ``SHEER_PATH + sheer_data`` and splits ``CLASS_NAMES`` into the
    index groups (0, 1, 2), (3, 4) and (5, 6, 7), one panel per group. Every
    class is summed over the feature columns from index 21 onwards and drawn
    on its own y scale; the legend shows the hard-coded count of each class.
    The figure is saved as ``FIGURES_PATH + <stem of sheer_data> +
    "_class_aa.pdf"``.

    Args:
        sheer_data: File name, relative to ``SHEER_PATH``, of the ``.npy``
            array to load.
    """
    # Hard-coded per-class counts, indexed by position in ``ssConvertString``.
    amounts = [164981, 1242, 236833, 17858, 0, 405754, 33990, 84479]
    eq_sheer = np.load(SHEER_PATH + sheer_data)
    fig, axes = plt.subplots(3, 1, figsize=(7, 7))
    for j, ies in enumerate(((0, 1, 2), (3, 4), (5, 6, 7))):
        ax = axes[j]
        handles = []
        legend_labels = []
        for i in ies:
            target_class = CLASS_NAMES[i]
            class_idx = ssConvertString.find(target_class)
            tot_pssm = np.sum(eq_sheer[class_idx, :, 21:], axis=1)
            if i == ies[0]:
                # The first class of a panel uses the panel's own axis ...
                ax2 = ax
                ax2.yaxis.set(ticks=[0])
            else:
                # ... the others get an independent y scale.
                ax2 = ax.twinx()
                ax2.yaxis.set(ticks=[])
            # Limits are proportional to the line's own peak, so zero sits at
            # the same height on every twin axis of a panel.
            peak = np.max(abs(tot_pssm))
            if j == 0:
                bottom = -1.1 * peak
            elif j == 1:
                bottom = -peak / 10
            else:
                bottom = -peak / 4.5
            ax2.set(ylim=[bottom, 1.1 * peak])
            ax2.plot(tot_pssm, marker='.', color=CLASS_COLOURS[target_class])
            # Proxy artist: the lines live on different axes, so the legend
            # is assembled by hand.
            handles.append(
                mlines.Line2D([], [],
                              color=CLASS_COLOURS[target_class],
                              marker='.',
                              label=target_class))
            legend_labels.append(target_class +
                                 ": {:d}".format(amounts[class_idx]))
        # Tick count derived from WINDOW so it always matches the label
        # count (the former literal 19 assumed WINDOW == 9).
        ax.xaxis.set(ticks=range(2 * WINDOW + 1),
                     ticklabels=range(-WINDOW, WINDOW + 1))
        ax.margins(0)
        ax.legend(handles, legend_labels, loc='upper left', fontsize=12)
        ax.axhline(0, color="black")
        ax.tick_params(labelsize=12)
    plt.tight_layout()
    fig.show()
    fname = os.path.splitext(sheer_data)[0]
    fig.savefig(FIGURES_PATH + fname + "_class_aa.pdf")
def plot_outliers():
    """Plot test-set accuracy against sequence length, split by main class.

    Each test sequence is assigned to the group with the most labels among
    H/G/I, E/B and L/S/T (ties fall to the later group) and drawn in that
    group's panel together with the overall mean accuracy. The overall
    position-wise accuracy is printed and the figure is saved to
    ``FIGURES_PATH + 'per_seq_acc.eps'``.
    """
    dater = Jurtz_Data()
    _, mask, labels, num_seq = dater.get_test(Jurtz_Data.TEST_PATH)
    lengths_test = np.sum(mask, axis=1)
    # Rows from this offset onwards are used as the test predictions.
    # NOTE(review): hard-coded split -- confirm it matches the data set.
    split_value = 64 * 86
    predictions = dater.get_all_predictions()[split_value:]

    def calculate_seq_accuracy():
        """Per-sequence accuracy; also prints the overall accuracy."""
        seq_len = predictions.shape[1]
        tot_acc = 0
        seq_acc = np.zeros(num_seq)
        for seq in range(num_seq):
            for pos in range(seq_len):
                if not mask[seq, pos]:
                    # Stop at the first masked-out position.
                    break
                if labels[seq, pos] == np.argmax(predictions[seq, pos]):
                    seq_acc[seq] += 1
                    tot_acc += 1
            seq_acc[seq] /= np.sum(mask[seq])
        print(tot_acc / np.sum(mask))
        return seq_acc

    seq_acc_test = calculate_seq_accuracy()

    # Class indices of the three groups, looked up once.
    group_ids = [[ssConvertString.find(letter) for letter in letters]
                 for letters in ('HGI', 'EB', 'LST')]

    classes = [[], [], []]
    # Iterate over the sequences reported by ``get_test`` instead of a
    # hard-coded 514.
    for seq in range(num_seq):
        counts = np.zeros(3)
        for label in labels[seq, :int(np.sum(mask[seq]))]:
            for group, ids in enumerate(group_ids):
                if label in ids:
                    counts[group] += 1
                    break
        if counts[0] > counts[1] and counts[0] > counts[2]:
            classes[0].append(seq)
        elif counts[1] > counts[2]:
            classes[1].append(seq)
        else:
            classes[2].append(seq)

    seq_len = 700  # x-axis extent and length of the mean line
    colours = ["green", "red", "blue"]
    fig, axes = plt.subplots(3, figsize=(7.2, 6))
    for i, classus in enumerate(classes):
        ax = axes[i]
        ax.plot(lengths_test[classus],
                seq_acc_test[classus],
                marker="X",
                linewidth=0,
                label="sequences",
                color=colours[i])
        ax.plot(np.mean(seq_acc_test) * np.ones(seq_len),
                label="mean",
                color='orange')
        ax.set(ylim=[0, 1], xlim=[0, seq_len])
        ax.tick_params(labelsize=12)
        if i in (0, 1):
            # Only the bottom panel keeps its x ticks.
            ax.xaxis.set(ticks=[])
        if i == 1:
            # Label the middle panel itself; ``plt.ylabel`` would act on the
            # current axes, which is the last panel created.
            ax.set_ylabel("accuracy", fontsize=15)
    plt.tight_layout()
    axes[-1].set_xlabel("sequence length", fontsize=15)
    fig.savefig(FIGURES_PATH + 'per_seq_acc.eps', format='eps')
    fig.show()
def process():
    """Turn raw saliency pickles into windowed per-position arrays.

    For every sequence that has all 8 saliency files but not yet 8 processed
    entries (both according to ``probe``), each class pickle is loaded from
    ``SALIENCIES_PATH``. For every position, the saliency rows within
    ``WINDOW`` positions are multiplied element-wise with the matching input
    rows and stored centred in a ``(2 * WINDOW + 1, 42)`` window; offsets
    that fall outside the sequence stay zero. The result of shape
    ``(length, 8, 2 * WINDOW + 1, 42)`` is saved to
    ``PROCESSED_PATH + "saliencies{:4d}.npy"``.

    NOTE(review): files are read with ``pickle.load``; only run this on
    trusted data.
    """
    exists_sal = probe('saliencies')
    exists_proc = probe('processed')
    dater = Jurtz_Data()
    fail_seqs = 0
    processed = 0

    def load_saliency(seq, label):
        """Load one saliency pickle, trying both file naming schemes."""
        try:
            fname = "saliencies" + str(seq) + label + ".pkl"
            with open(fname, "rb") as f:
                return np.array(pickle.load(f))
        except OSError:
            fname = "saliencies{:4d}{:s}.pkl".format(seq, label)
            with open(fname, "rb") as f:
                return np.array(pickle.load(f))

    origin = os.getcwd()
    os.chdir(SALIENCIES_PATH)
    try:
        for seq in range(len(exists_sal)):
            if int(np.sum(exists_sal[seq])) != 8:
                print("Saliencies for " + str(seq) + " incomplete, " +
                      str(np.sum(exists_sal[seq])) + " found")
                continue
            if int(np.sum(exists_proc[seq])) == 8:
                # Already processed.
                continue

            # Load the sequence only once it is known to need processing.
            X_seq, mask_seq = dater.get_sequence(seq)
            end_seq = int(sum(mask_seq))
            processed_seq = np.zeros((end_seq, 8, 2 * WINDOW + 1, 42))
            try:
                for label in ssConvertString:
                    saliency_seq = load_saliency(seq, label)
                    class_idx = ssConvertString.find(label)
                    for pos in range(end_seq):
                        # Window clipped to the sequence; window row WINDOW
                        # corresponds to ``pos`` itself.
                        lo = max(0, pos - WINDOW)
                        hi = min(end_seq, pos + WINDOW + 1)
                        processed_seq[pos, class_idx,
                                      lo - pos + WINDOW:hi - pos +
                                      WINDOW] += np.multiply(
                                          saliency_seq[pos, lo:hi, :],
                                          X_seq[lo:hi])
                # Counted once per fully processed sequence.
                processed += 1
            except OSError:
                fail_seqs += 1
                print(str(seq) + " Files not found")
            # NOTE(review): the array is written even after a failure, in
            # which case some classes are still all zero -- confirm intended.
            fname = "saliencies{:4d}.npy".format(seq)
            np.save(PROCESSED_PATH + fname, processed_seq)
    finally:
        # Restore the working directory even if processing raises.
        os.chdir(origin)
    print(str(processed) + " saliencies processed")
    print(str(fail_seqs) + " saliencies failed")