Пример #1
0
def plot_sheer_cabestro(sheer_data="sheer6016.npy"):
    """Draw one amount-normalised sheer curve per class on a single axes.

    The array stored at ``SHEER_PATH + sheer_data`` is loaded.  For every name
    in ``CLASS_NAMES`` the entries from index 21 onwards of the last axis of
    that class's slice are summed (``axis=1``) and the result is divided by
    the class's hard-coded amount before being plotted.  The figure is shown
    but not saved.

    Args:
        sheer_data: File name, relative to ``SHEER_PATH``, of the ``.npy``
            array to plot.
    """
    # One amount per class, looked up by the class's position in
    # ssConvertString.  The entry at position 4 is 0, so that class's curve
    # is divided by zero.
    class_amounts = [164981, 1242, 236833, 17858, 0, 405754, 33990, 84479]
    sheer = np.load(SHEER_PATH + sheer_data)

    fig, ax = plt.subplots(figsize=(7, 2.5))

    for name in CLASS_NAMES:
        class_idx = ssConvertString.find(name)
        summed = np.sum(sheer[class_idx, :, 21:], axis=1)
        normalised = summed / class_amounts[class_idx]
        ax.plot(normalised,
                marker='.',
                color=CLASS_COLOURS[name],
                label=name)

    ax.xaxis.set(ticks=range(19), ticklabels=range(-WINDOW, WINDOW + 1))
    ax.margins(0)
    ax.legend()

    plt.tight_layout()
    fig.show()
Пример #2
0
def clustering(args):
    """Cluster the stored points of one label and save the cluster labels.

    Loads ``SHEER_PATH + "points" + str(args.num_seqs) + ".npy"``, keeps the
    first ``args.num_points`` entries for the class at position
    ``ssConvertString.find(args.label)``, drops the entries that are
    numerically all zero, fits the requested model and saves both the
    resulting labels and the boolean keep-mask under ``SHEER_PATH``.

    Args:
        args: Object exposing ``num_seqs``, ``num_points``, ``label`` and
            ``clustering`` (``"agglomerative"`` or ``"DBSCAN"``).

    Raises:
        ValueError: If ``args.clustering`` is neither of the two names above.
    """
    points = np.load(SHEER_PATH + "points" + str(args.num_seqs) + ".npy")
    points = points[:args.num_points, ssConvertString.find(args.label)]
    print("Original points shape:", points.shape)

    # True for every entry that is not (numerically) an all-zero vector.
    mask = np.array(
        [not np.allclose(row, np.zeros_like(row)) for row in points],
        dtype='bool')
    points = points[mask]
    print("Filtered points shape (no zero vectors):", points.shape)

    algorithm = args.clustering
    if algorithm == "agglomerative":
        model = AgglomerativeClustering(n_clusters=4,
                                        linkage="average",
                                        affinity="cosine")
    elif algorithm == "DBSCAN":
        # The "DBSCAN" option is served by HDBSCAN, imported on demand.
        import hdbscan
        model = hdbscan.HDBSCAN(min_cluster_size=10)
    else:
        raise ValueError("Clustering algorithm '{:s}' not recognized".format(
            args.clustering))

    model.fit(points)

    file_end = "{:s}{:d}.npy".format(args.label, args.num_points)
    labels_path = SHEER_PATH + args.clustering + file_end
    np.save(labels_path, model.labels_)
    np.save(SHEER_PATH + 'mask' + file_end, mask)
    print("Clustering completed. Labels saved in " + labels_path)
    print("Labels:")
    print(pd.Series(model.labels_).value_counts())
Пример #3
0
def plot_sheer_class_aa():
    """Plot the summed sheer profile of every class, each on its own y scale.

    Loads ``SHEER_PATH + "sheer6016.npy"``.  For each name in ``CLASS_NAMES``
    the entries from index 21 onwards of the last axis are summed
    (``axis=1``).  The first class is drawn on the base axes, every later
    class on a fresh ``twinx`` axes, and the legend shows the kurtosis of each
    curve.  The figure is shown and then written to
    ``FIGURES_PATH + "sheer_class_aa.eps"``.
    """
    abs_sheer = np.load(SHEER_PATH + "sheer6016.npy")

    fig2, base_ax = plt.subplots(figsize=(9, 3))

    legend_names = list(CLASS_NAMES)
    handles = []
    kurtoses = []
    for i, target_class in enumerate(legend_names):
        profile = np.sum(
            abs_sheer[ssConvertString.find(target_class), :, 21:], axis=1)
        kurtoses.append(kurtosis(profile))

        # Only the first curve shares the base axes; the rest get a twin.
        is_first = i == 0
        current_ax = base_ax if is_first else base_ax.twinx()
        current_ax.yaxis.set(ticks=[0] if is_first else [])

        colour = CLASS_COLOURS[target_class]
        current_ax.plot(profile, marker='.', color=colour)
        current_ax.set_ylim(bottom=0)

        # Proxy artist, so a single legend can list curves from all axes.
        handles.append(
            mlines.Line2D([], [], color=colour, marker='.',
                          label=target_class))

    legend_texts = [
        name + ": {:.1f}".format(value)
        for name, value in zip(legend_names, kurtoses)
    ]

    base_ax.xaxis.set(ticks=range(19), ticklabels=range(-WINDOW, WINDOW + 1))
    base_ax.margins(0)
    base_ax.legend(handles, legend_texts, loc='upper right')
    base_ax.yaxis.set(ticks=[])

    plt.tight_layout()
    fig2.show()
    fig2.savefig(FIGURES_PATH + "sheer_class_aa.eps")
Пример #4
0
def plot_sheer_aa():
    """Plot, for the amino acids G, K and M, one curve per class.

    Takes the array returned by ``collect_sheer()`` (scaled in place by
    1/1000) and, for each of the three amino acids in the order they occur in
    ``pssmString_jurtz``, draws in its own panel the feature at index
    ``pssmString_jurtz.find(aa) + 21`` for every class in ``CLASS_NAMES``.
    The figure is saved to ``FIGURES_PATH + "class_agg_aa.eps"`` and shown.
    """
    totals = collect_sheer()
    totals /= 1000

    fig, axes = plt.subplots(1, 3, figsize=(13, 2.8))

    selected = [aa for aa in pssmString_jurtz if aa in ("G", "K", "M")]
    for panel, aa in enumerate(selected):
        ax1 = axes[panel]
        feature_idx = pssmString_jurtz.find(aa) + 21
        for label in CLASS_NAMES:
            curve = totals[ssConvertString.find(label), :, feature_idx]
            ax1.plot(curve, label=label, marker='.',
                     color=CLASS_COLOURS[label])

        ax1.legend(loc='right')
        ax1.set(title="Pssm-values for " + aa)
        ax1.xaxis.set(ticks=range(19), ticklabels=range(-WINDOW, WINDOW + 1))
        ax1.margins(0)

    plt.tight_layout()
    fig.savefig(FIGURES_PATH + "class_agg_aa.eps")
    plt.show()
Пример #5
0
    def plot_lines():
        """Plot one saliency line per class over the selected positions.

        Reads ``tot_sal``, ``preds``, ``labels``, ``aas``, ``ini_pos`` and
        ``final_pos`` from the enclosing scope.  The bottom x axis is labelled
        with prediction, label and position (coloured by predicted class); a
        twin top axis carries the entries of ``aas``.  The figure is saved to
        ``FIGURES_PATH + "sample_8classes.eps"`` and shown.
        """
        fig, ax = plt.subplots(figsize=(10, 2.8))
        for label in CLASS_NAMES:
            class_idx = ssConvertString.find(label)
            ax.plot(tot_sal[class_idx, WINDOW:-WINDOW].T,
                    marker='.',
                    label=label,
                    color=CLASS_COLOURS[label])

        ax.legend(loc="upper left")

        tick_positions = range(final_pos - ini_pos + 1)
        bottom_labels = [
            preds[i] + "\n" + el + "\n" + str(ini_pos + i)
            for i, el in enumerate(labels)
        ]
        ax.xaxis.set(ticks=tick_positions, ticklabels=bottom_labels)

        # Colour every bottom tick label after its predicted class.
        tick_colours = [CLASS_COLOURS[pred] for pred in preds]
        for colour, tick in zip(tick_colours, ax.xaxis.get_major_ticks()):
            tick.label1.set_color(colour)

        ax.margins(0)
        top_ax = ax.twiny()
        top_ax.xaxis.set(ticks=range(final_pos - ini_pos + 1), ticklabels=aas)

        plt.tight_layout()
        fig.savefig(FIGURES_PATH + "sample_8classes.eps", format='eps')
        plt.show()
Пример #6
0
def scrap():
    """Delete malformed saliency pickles and print a summary.

    For every (sequence, label) pair that ``probe('saliencies')`` reports as
    present, the pickle is loaded from ``SALIENCIES_PATH`` (trying the
    unpadded file name first, then the ``{:4d}``-padded one).  A saliency
    that is not 3-dimensional, or whose first two dimensions differ, is
    removed from disk.  Files that cannot be opened and files that were
    removed are both counted as failures.

    The working directory is switched to ``SALIENCIES_PATH`` for the scan and
    is always restored afterwards, even when an unexpected exception (for
    instance a corrupt pickle) aborts the scan.

    NOTE: ``pickle.load`` can execute arbitrary code; only run this on
    saliency files you produced yourself.
    """
    exists = probe('saliencies')

    fail_seqs = 0
    deleted = 0
    origin = os.getcwd()
    os.chdir(SALIENCIES_PATH)
    try:
        for seq in range(len(exists)):
            for label in ssConvertString:
                if not exists[seq, ssConvertString.find(label)]:
                    continue
                try:
                    try:
                        fname = "saliencies" + str(seq) + label + ".pkl"
                        with open(fname, "rb") as f:
                            saliency = np.array(pickle.load(f))
                    except OSError:
                        # Fall back to the space-padded file name variant.
                        fname = "saliencies{:4d}{:s}.pkl".format(seq, label)
                        with open(fname, "rb") as f:
                            saliency = np.array(pickle.load(f))

                    if (saliency.ndim != 3
                            or saliency.shape[0] != saliency.shape[1]):
                        os.remove(fname)
                        print("File " + fname + " deleted")
                        deleted += 1
                        # Routed through the handler below so that a deleted
                        # file is also counted as a failure.
                        raise OSError("saliency badly formatted")

                except OSError:
                    fail_seqs += 1
                    print(str(seq) + label + " Not found")
    finally:
        # Restore the caller's working directory no matter what happened.
        os.chdir(origin)

    print(str(deleted) + " saliencies deleted")
    print(str(NUM_SEQS * 8 - fail_seqs) + " saliencies remaining")
0
def compute_complex_saliency(X_batch, mask_batch, batch_seq, inference, sym_x,
                             batch, label):
    """Compute the saliency of one sequence for one label and pickle it.

    The saliency is first computed for the whole sequence at once.  If that
    raises (the original comment attributes this to the GPU running out of
    memory), it is recomputed in two halves that are concatenated along the
    first axis.  The result is written to ``PATH_SALIENCIES`` as
    ``saliencies{:4d}{:s}.pkl`` using the global sequence number
    ``BATCH_SIZE * batch + batch_seq`` and ``label``.

    Args:
        X_batch: Input batch handed to ``compute_single_saliency``.
        mask_batch: Mask whose row ``batch_seq`` sums to the sequence length.
        batch_seq: Index of the sequence inside the batch.
        inference: Symbolic network output, indexed as
            ``[batch_seq, positions, class]``.
        sym_x: Symbolic input handed to ``compute_single_saliency``.
        batch: Index of the batch; only used to build the file name.
        label: Class character; its position in ``ssConvertString`` selects
            the output unit.

    Raises:
        Exception: Whatever a half-computation raised, when the fallback path
            fails too (previously this surfaced as an unrelated ``NameError``
            at the concatenation).
        AssertionError: If the first two dimensions of the result differ.
    """
    seq_len = int(np.sum(mask_batch[batch_seq]))
    label_idx = ssConvertString.find(label)
    half = seq_len // 2
    # Stay None unless the two-halves fallback runs; the assertion message
    # below must be able to report on them in either case.
    grads1 = None
    grads2 = None
    try:
        sym_y = inference[batch_seq, :seq_len, label_idx]
        grads = compute_single_saliency(X_batch=X_batch,
                                        sym_x=sym_x,
                                        sym_y=sym_y)
        grads = grads[:seq_len, batch_seq, :seq_len]

    except Exception as err:
        # IF GPU OUT OF MEMORY
        print(err)
        half_error = None
        try:
            # FIRST HALF
            sym_y = inference[batch_seq, :half, label_idx]
            grads1 = compute_single_saliency(X_batch=X_batch,
                                             sym_x=sym_x,
                                             sym_y=sym_y)
            grads1 = grads1[:half, batch_seq, :seq_len]
        except Exception as first_err:
            half_error = first_err
            print(first_err)
            print("XXXXXXX Is it in the first part?")

        try:
            # SECOND HALF
            sym_y = inference[batch_seq, half:seq_len, label_idx]
            grads2 = compute_single_saliency(X_batch=X_batch,
                                             sym_x=sym_x,
                                             sym_y=sym_y)
            # NOTE(review): this slices with absolute positions although
            # sym_y only covers the second half -- confirm against the shape
            # compute_single_saliency actually returns.
            grads2 = grads2[half:seq_len, batch_seq, :seq_len]
        except Exception as second_err:
            if half_error is None:
                half_error = second_err
            print(second_err)
            print("XXXXXXXX Or in the second?")

        if half_error is not None:
            # Both halves were attempted for diagnostics; without both there
            # is nothing meaningful to concatenate.
            raise half_error

        grads = np.concatenate((grads1, grads2), axis=0)

    assert grads.shape[0] == grads.shape[
        1], "{:d} != {:d} for sequence with length {:d}. Concatenation of grad1 with shape {:s} and grads2 with shape {:s}, gives grads with shape {:s}.".format(
            grads.shape[0], grads.shape[1], seq_len,
            str(getattr(grads1, "shape", None)),
            str(getattr(grads2, "shape", None)), str(grads.shape))
    fname = "saliencies{:4d}{:s}.pkl".format(BATCH_SIZE * batch + batch_seq,
                                             label)
    with open(PATH_SALIENCIES + fname, 'wb') as f:
        pickle.dump(grads, f, protocol=2)
Пример #8
0
def probe(folder='saliencies'):
    """Return a table of which (sequence, label) files exist on disk.

    Globs ``folder + '*'`` inside ``SALIENCIES_PATH`` and parses, from each
    match, the first run of digits (sequence number) and the character that
    follows it (label).  Names without such a pattern, labels that do not
    occur in ``ssConvertString`` and sequence numbers ``>= NUM_SEQS`` are
    ignored.

    Args:
        folder: Prefix of the file names to look for.

    Returns:
        A ``(NUM_SEQS, 8)`` float array holding 1 where a file was found and
        0 elsewhere.

    Raises:
        AssertionError: If two files map to the same (sequence, label) cell.
    """
    origin = os.getcwd()
    os.chdir(SALIENCIES_PATH)
    try:
        files = glob.glob(folder + '*')
    finally:
        # Only the glob needs the other directory; always switch back.
        os.chdir(origin)

    exists = np.zeros((NUM_SEQS, 8))
    for el in files:
        match = re.search(r'(\d+)(\D)', el)
        if match is None:
            # Not a "<number><label>" file name; nothing to record.
            continue
        num = int(match.group(1))
        label = ssConvertString.find(match.group(2))
        if label < 0:
            # Unknown label character: find() returned -1, which would
            # otherwise silently index the last column.
            continue
        if num < NUM_SEQS:
            assert exists[num, label] == 0
            exists[num, label] += 1

    return exists
Пример #9
0
def plot_sheer_class_all():
    """Draw one heat map per class on a 3x3 grid.

    For every character of ``ssConvertString`` the matching slice of the
    array returned by ``collect_sheer()`` is taken, restricted to index 21
    onwards of its last axis, transposed and shown with a diverging colour
    map centred on zero.  The figure is saved to
    ``FIGURES_PATH + "class_agg_class_all.eps"`` and shown.
    """
    totals = collect_sheer()

    n_cols = 3
    fig, axes = plt.subplots(3, n_cols, figsize=(6 * 3 / 2, 6.65 * 3 / 2))
    for i, target_class in enumerate(ssConvertString):
        row, col = divmod(i, n_cols)
        ax = axes[row][col]

        heat = totals[ssConvertString.find(target_class)][..., 21:]
        # Symmetric limits keep zero in the middle of the colour map.
        vmax = np.max(abs(heat))
        ax.imshow(heat.T, cmap='PiYG', vmin=-vmax, vmax=vmax)

        ax.set(title="Class " + target_class)
        ax.yaxis.set(ticks=range(len(pssmString_jurtz)),
                     ticklabels=pssmString_jurtz)
        ax.xaxis.set(ticks=range(2 * WINDOW + 1),
                     ticklabels=range(-WINDOW, WINDOW + 1))
        ax.margins(0)

    plt.tight_layout()
    fig.savefig(FIGURES_PATH + "class_agg_class_all.eps", format='eps')
    plt.show()
Пример #10
0
def repair_saliencies(args):
    """Recompute every saliency file that ``probe`` reports as missing.

    Rebuilds the model named in the metadata dump, restores its parameters
    and walks over all batches (forwards for ``args.dir == 'f'``, backwards
    for ``'b'``).  For each sequence with fewer than 8 saliency files, the
    missing labels are recomputed with ``compute_complex_saliency``.

    Args:
        args: Object exposing ``dir``, either ``'f'`` or ``'b'``.

    Raises:
        ValueError: If ``args.dir`` is neither ``'f'`` nor ``'b'``.  This is
            checked before the model is built.
    """
    if args.dir not in ('f', 'b'):
        raise ValueError("args.dir is " + str(args.dir))

    dater = Jurtz_Data()

    metadata_path = "dump_pureConv-20180804-010835-47.pkl"
    # The dump is a pickle, which np.load refuses by default on
    # NumPy >= 1.16.3.  Unpickling can execute arbitrary code, so this must
    # only ever point at a trusted, locally produced file.
    metadata = np.load(metadata_path, allow_pickle=True)
    config_name = metadata['config_name']
    config = importlib.import_module("%s" % config_name)
    print("Using configurations: '%s'" % config_name)
    l_in, l_out = config.build_model()

    sym_x = T.tensor3()
    inference = nn.layers.get_output(l_out, sym_x, deterministic=True)
    nn.layers.set_all_param_values(l_out, metadata['param_values'])

    batch_range = range(NUM_SEQS // BATCH_SIZE)
    if args.dir == 'b':
        batch_range = reversed(batch_range)

    for batch in batch_range:
        # Re-probe for every batch so files written in the meantime count.
        exists = probe('saliencies')
        for batch_seq in range(BATCH_SIZE):
            seq = batch * BATCH_SIZE + batch_seq
            if int(np.sum(exists[seq])) == 8:
                continue
            X_batch, mask_batch = dater.get_batch_from_seq(seq)
            for label in ssConvertString:
                if exists[seq, ssConvertString.find(label)]:
                    continue
                print(
                    "Repairing sequence {:d} and batch {:d} for label {:s}"
                    .format(seq, batch, label))
                compute_complex_saliency(X_batch=X_batch,
                                         mask_batch=mask_batch,
                                         batch=batch,
                                         label=label,
                                         batch_seq=batch_seq,
                                         inference=inference,
                                         sym_x=sym_x)
Пример #11
0
def plot_outliers():
    """Plot per-sequence accuracy against sequence length, train and test.

    Two figures are produced, each with a train and a test panel, and saved
    to ``FIGURES_PATH`` as ``per_seq_acc.eps`` and ``per_seq_acc_2.eps``.
    They differ only in how each sequence's marker is coloured:

    * first figure: RGB = fraction of positions labelled E/B, H/G/I, L/S/T;
    * second figure: red = min-max rescaled fraction of H/E/L positions,
      blue = 1 - red.
    """
    dater = Jurtz_Data()
    X, labels, mask = dater.get_all_data()
    split_value = dater.split_value
    lengths_train = np.sum(mask[:split_value], axis=1)
    lengths_test = np.sum(mask[split_value:], axis=1)

    predictions = dater.get_all_predictions()

    def calculate_seq_accuracy(labels):
        """Return the fraction of correctly predicted positions per sequence.

        Positions are scanned until the first masked-out one; the number of
        hits is divided by the sum of the sequence's mask.
        """
        num_seq = len(mask)
        seq_len = predictions.shape[1]

        seq_acc = np.zeros(num_seq)
        for seq in range(num_seq):
            for pos in range(seq_len):
                if not mask[seq, pos]:
                    break
                if labels[seq, pos] == np.argmax(predictions[seq, pos]):
                    seq_acc[seq] += 1

            seq_acc[seq] /= np.sum(mask[seq])

        return seq_acc

    # The helper takes the labels only; predictions come from the closure.
    seq_acc = calculate_seq_accuracy(labels)
    seq_acc_train = seq_acc[:split_value]
    seq_acc_test = seq_acc[split_value:]

    # Class indices for the first colouring, looked up once.
    group_eb = [ssConvertString.find('E'), ssConvertString.find('B')]
    group_hgi = [ssConvertString.find('H'), ssConvertString.find('G'),
                 ssConvertString.find('I')]
    group_lst = [ssConvertString.find('L'), ssConvertString.find('S'),
                 ssConvertString.find('T')]

    colors = np.zeros((len(labels), 3))
    for seq in range(len(labels)):
        for label in labels[seq, :int(np.sum(mask[seq]))]:
            if label in group_eb:
                colors[seq, 0] += 1
            elif label in group_hgi:
                colors[seq, 1] += 1
            elif label in group_lst:
                colors[seq, 2] += 1
        colors[seq] = colors[seq] / np.sum(mask[seq])

    # Class indices for the second colouring.
    group_hel = [ssConvertString.find('H'), ssConvertString.find('E'),
                 ssConvertString.find('L')]
    group_rest = [ssConvertString.find('S'), ssConvertString.find('T'),
                  ssConvertString.find('B'), ssConvertString.find('I'),
                  ssConvertString.find('G')]

    colors2 = np.zeros((len(labels), 3))
    for seq in range(len(labels)):
        for label in labels[seq, :int(np.sum(mask[seq]))]:
            if label in group_hel:
                colors2[seq, 0] += 1
            elif label in group_rest:
                colors2[seq, 2] += 1
        colors2[seq] = colors2[seq] / np.sum(mask[seq])

    # Stretch the red channel to [0, 1]; blue becomes its complement and
    # overwrites the counts accumulated above.
    min_red = np.min(colors2[:, 0])
    colors2[:, 0] -= min_red
    max_red = np.max(colors2[:, 0])
    colors2[:, 0] /= max_red
    colors2[:, 2] = 1 - colors2[:, 0]

    def print_length_vs_acc(lengths, seqs, ax, label, colors):
        """Scatter accuracy against length on ``ax`` and add the mean line."""
        seq_len = 700

        # One plot call per sequence, so each marker gets its own colour.
        for seq in range(len(seqs)):
            ax.plot(lengths[seq], seqs[seq], marker="X", linewidth=0,
                    label="sequences", color=colors[seq])
        ax.plot(np.mean(seqs) * np.ones(seq_len), label="mean",
                color='orange')
        ax.set(title='Per-sequence accuracy (' + label +
               '), mean: {:.2f}'.format(np.mean(seqs)),
               ylabel="accuracy", xlabel="sequence length", ylim=[0, 1])
        mean_line = mlines.Line2D([], [], color='orange',
                                  label='mean accuracy')
        ax.legend(handles=[mean_line])

    fig, ax = plt.subplots(1, 2, figsize=(15, 4))
    print_length_vs_acc(lengths_train[:], seq_acc_train[:], ax[0], 'train',
                        colors[:split_value])
    print_length_vs_acc(lengths_test, seq_acc_test, ax[1], 'test',
                        colors[split_value:])
    plt.tight_layout()
    fig.savefig(FIGURES_PATH + 'per_seq_acc.eps', format='eps')
    fig.show()

    fig, ax = plt.subplots(1, 2, figsize=(15, 4))
    print_length_vs_acc(lengths_train[:], seq_acc_train[:], ax[0], 'train',
                        colors2[:split_value])
    print_length_vs_acc(lengths_test, seq_acc_test, ax[1], 'test',
                        colors2[split_value:])
    plt.tight_layout()
    fig.savefig(FIGURES_PATH + 'per_seq_acc_2.eps', format='eps')
    fig.show()
Пример #12
0
def plot_sheer_class_aa(sheer_data="sheer6016.npy"):
    """Plot the summed sheer profile of every class in three stacked panels.

    Classes are grouped by their position in ``CLASS_NAMES`` as (0, 1, 2),
    (3, 4) and (5, 6, 7).  In each panel the first class uses the panel's own
    axes and every other class gets a ``twinx`` axes; the y limits depend on
    the panel.  The legend lists each class together with its hard-coded
    amount.  The figure is shown and saved as
    ``FIGURES_PATH + <sheer_data without extension> + "_class_aa.pdf"``.

    Args:
        sheer_data: File name, relative to ``SHEER_PATH``, of the ``.npy``
            array to plot.
    """
    amounts = [164981, 1242, 236833, 17858, 0, 405754, 33990, 84479]
    eq_sheer = np.load(SHEER_PATH + sheer_data)

    fig, axes = plt.subplots(3, 1, figsize=(7, 7))

    panels = ((0, 1, 2), (3, 4), (5, 6, 7))
    for j, (ax, ies) in enumerate(zip(axes, panels)):
        handles = []
        legend_texts = []
        for i in ies:
            target_class = CLASS_NAMES[i]
            class_idx = ssConvertString.find(target_class)

            tot_pssm = np.sum(eq_sheer[class_idx, :, 21:], axis=1)
            legend_texts.append(
                target_class + ": {:d}".format(amounts[class_idx]))

            # The first class of each panel owns the panel's axes.
            if i in (0, 3, 5):
                ax2 = ax
                ax2.yaxis.set(ticks=[0])
            else:
                ax2 = ax.twinx()
                ax2.yaxis.set(ticks=[])

            # Same upper limit everywhere; the lower limit varies by panel.
            peak = np.max(abs(tot_pssm))
            if j == 0:
                lower = -1.1 * peak
            elif j == 1:
                lower = -peak / 10
            else:
                lower = -peak / 4.5
            ax2.set(ylim=[lower, 1.1 * peak])
            ax2.plot(tot_pssm, marker='.', color=CLASS_COLOURS[target_class])

            handles.append(
                mlines.Line2D([], [],
                              color=CLASS_COLOURS[target_class],
                              marker='.',
                              label=target_class))

        ax.xaxis.set(ticks=range(19), ticklabels=range(-WINDOW, WINDOW + 1))
        ax.margins(0)
        ax.legend(handles, legend_texts, loc='upper left', fontsize=12)
        ax.axhline(0, color="black")
        ax.tick_params(labelsize=12)

    plt.tight_layout()
    fig.show()
    stem = os.path.splitext(sheer_data)[0]
    fig.savefig(FIGURES_PATH + stem + "_class_aa.pdf")
Пример #13
0
def plot_outliers():
    """Plot test-set accuracy against sequence length, split by class group.

    Each test sequence is assigned to the group that labels most of its
    positions: H/G/I first, then E/B, otherwise the remaining group (which
    also receives ties).  One panel per group is drawn together with the
    overall mean accuracy, and the figure is saved to
    ``FIGURES_PATH + 'per_seq_acc.eps'``.
    """
    dater = Jurtz_Data()
    X, mask, labels, num_seq = dater.get_test(Jurtz_Data.TEST_PATH)
    lengths_test = np.sum(mask, axis=1)

    predictions = dater.get_all_predictions()
    # Predictions from this row onwards are paired with the test sequences.
    split_value = 64 * 86
    predictions = predictions[split_value:]

    def calculate_seq_accuracy():
        """Return per-sequence accuracy and print the overall accuracy."""
        seq_len = predictions.shape[1]

        tot_acc = 0
        seq_acc = np.zeros(num_seq)
        for seq in range(num_seq):
            for pos in range(seq_len):
                # Scan stops at the first masked-out position.
                if not mask[seq, pos]:
                    break
                if labels[seq, pos] == np.argmax(predictions[seq, pos]):
                    seq_acc[seq] += 1
                    tot_acc += 1

            seq_acc[seq] /= np.sum(mask[seq])

        print(tot_acc / np.sum(mask))
        return seq_acc

    seq_acc_test = calculate_seq_accuracy()

    # Class indices of the three groups, looked up once.
    group_hgi = [
        ssConvertString.find('H'),
        ssConvertString.find('G'),
        ssConvertString.find('I')
    ]
    group_eb = [ssConvertString.find('E'), ssConvertString.find('B')]
    group_lst = [
        ssConvertString.find('L'),
        ssConvertString.find('S'),
        ssConvertString.find('T')
    ]

    classes = [[], [], []]
    # Walk every test sequence reported by the loader rather than a
    # hard-coded count, so this matches the length of seq_acc_test.
    for seq in range(num_seq):
        colors = np.zeros(3)
        for label in labels[seq, :int(np.sum(mask[seq]))]:
            if label in group_hgi:
                colors[0] += 1
            elif label in group_eb:
                colors[1] += 1
            elif label in group_lst:
                colors[2] += 1
        if colors[0] > colors[1] and colors[0] > colors[2]:
            classes[0].append(seq)
        elif colors[1] > colors[2]:
            classes[1].append(seq)
        else:
            classes[2].append(seq)

    seq_len = 700
    colours = ["green", "red", "blue"]
    fig, axes = plt.subplots(3, figsize=(7.2, 6))

    mean_line = np.mean(seq_acc_test) * np.ones(seq_len)
    for i, classus in enumerate(classes):
        ax = axes[i]
        ax.plot(lengths_test[classus],
                seq_acc_test[classus],
                marker="X",
                linewidth=0,
                label="sequences",
                color=colours[i])

        ax.plot(mean_line, label="mean", color='orange')
        ax.set(ylim=[0, 1], xlim=[0, seq_len])
        ax.tick_params(labelsize=12)
        if i == 0 or i == 1:
            ax.xaxis.set(ticks=[])
        if i == 1:
            # NOTE(review): pyplot-level call; it targets pyplot's current
            # axes rather than `ax` -- confirm the label lands on the
            # intended panel.
            plt.ylabel("accuracy", fontsize=15)

    plt.tight_layout()
    plt.xlabel("sequence length", fontsize=15)
    fig.savefig(FIGURES_PATH + 'per_seq_acc.eps', format='eps')
    fig.show()
Пример #14
0
def process():
    """Turn raw per-label saliency pickles into windowed per-sequence arrays.

    For every sequence that has all 8 saliency files but not all 8 processed
    entries (both according to ``probe``), each label's saliency is loaded,
    multiplied element-wise with the input ``X_seq`` and cut into a window of
    ``2 * WINDOW + 1`` positions centred on every position of the sequence.
    Window slots that would fall outside the sequence stay zero.  The result
    is saved to ``PROCESSED_PATH`` as ``saliencies{:4d}.npy``.

    NOTE(review): the ``np.save`` at the bottom of the loop runs for every
    sequence, including those skipped by the ``if``/``elif`` and those that
    hit ``OSError`` part-way, so a zero-filled or partially filled array is
    written in those cases -- confirm this is intended.
    NOTE(review): the working directory is only restored on the normal path;
    an exception other than ``OSError`` leaves the process inside
    ``SALIENCIES_PATH``.
    NOTE(review): ``pickle.load`` can execute arbitrary code; the saliency
    files must be trusted.
    """
    # Existence tables indexed [sequence, label position].
    exists_sal = probe('saliencies')
    exists_proc = probe('processed')

    dater = Jurtz_Data()

    fail_seqs = 0
    # Counts (position, label) windows written, not files.
    processed = 0
    origin = os.getcwd()
    os.chdir(SALIENCIES_PATH)
    for seq in range(len(exists_sal)):
        X_seq, mask_seq = dater.get_sequence(seq)
        # Number of masked-in positions of this sequence.
        end_seq = int(sum(mask_seq))
        # One (window, feature) slab per position and label.
        processed_seq = np.zeros((end_seq, 8, 2 * WINDOW + 1, 42))

        if int(np.sum(exists_sal[seq])) != 8:
            print("Saliencies for " + str(seq) + " incomplete, " +
                  str(np.sum(exists_sal[seq])) + " found")
        elif int(np.sum(exists_proc[seq])) != 8:
            try:
                for label in ssConvertString:
                    # Try the unpadded file name first, then the
                    # space-padded ({:4d}) variant.
                    try:
                        fname = "saliencies" + str(seq) + label + ".pkl"
                        with open(fname, "rb") as f:
                            saliency_seq = np.array(pickle.load(f))
                    except OSError:
                        fname = "saliencies{:4d}{:s}.pkl".format(seq, label)
                        with open(fname, "rb") as f:
                            saliency_seq = np.array(pickle.load(f))

                    for pos in range(end_seq):
                        saliency_pos = np.zeros(
                            (2 * WINDOW + 1, 42))  # window-size, n aminoacids
                        # Pre-WINDOW
                        # Slots [0, WINDOW) hold the positions before `pos`.
                        if pos > WINDOW:
                            init = pos - WINDOW
                            saliency_pos[:WINDOW] += np.multiply(
                                saliency_seq[pos, init:pos, :],
                                X_seq[init:pos])
                        elif pos != 0:
                            # Fewer than WINDOW predecessors (or exactly
                            # WINDOW): right-align them, leading slots stay 0.
                            init = WINDOW - pos
                            saliency_pos[init:WINDOW] += np.multiply(
                                saliency_seq[pos, 0:pos, :], X_seq[0:pos])

                        # Window
                        # Centre slot: the position itself.
                        saliency_pos[WINDOW] += np.multiply(
                            saliency_seq[pos, pos, :], X_seq[pos])

                        # Post-WINDOW
                        # Slots (WINDOW, 2 * WINDOW] hold the positions
                        # after `pos`.
                        if pos + WINDOW + 1 <= end_seq:
                            end = pos + WINDOW + 1
                            saliency_pos[WINDOW + 1:] += np.multiply(
                                saliency_seq[pos, pos + 1:end, :],
                                X_seq[pos + 1:end])
                        elif pos != end_seq:
                            # Window runs past the sequence end: fill only
                            # the slots that exist, trailing slots stay 0.
                            end = end_seq
                            saliency_pos[WINDOW +
                                         1:-(pos + WINDOW + 1 -
                                             end)] += np.multiply(
                                                 saliency_seq[pos,
                                                              pos + 1:end, :],
                                                 X_seq[pos + 1:end])

                        processed_seq[
                            pos, ssConvertString.find(label)] = saliency_pos
                        processed += 1

            except OSError:
                fail_seqs += 1
                print(str(seq) + " Files not found")

        # Written for every sequence (see the NOTE in the docstring).
        fname = "saliencies{:4d}.npy".format(seq)
        np.save(PROCESSED_PATH + fname, processed_seq)

    os.chdir(origin)
    print(str(processed) + " saliencies processed")
    print(str(fail_seqs) + " saliencies failed")