Пример #1
0
 def setUp(self):
     """Create the fixtures shared by the grid-search tests.

     Loads the two DNA FASTA files stored in the ``data`` folder next to
     this test module with the alphabet "ACGT", and prepares a
     ``Grid_Search`` over a small hyper-parameter grid.
     """
     base_dir = dirname(__file__)
     fasta_paths = [
         base_dir + relative
         for relative in ("/data/dna_pos.fasta", "/data/dna_neg.fasta")
     ]
     self.data = Data(fasta_paths, "ACGT")
     # Each key maps to the list of candidate values handed to Grid_Search.
     self.params = {
         'conv_num': [1],
         'kernel_num': [2, 4],
         'epochs': [1, 2],
     }
     self.searcher = Grid_Search(self.params)
Пример #2
0
 def setUp(self):
     """Create one RNA data set and three models built from it.

     ``m1`` and ``m3`` are constructed with the same seed (2) while ``m2``
     uses a different one (13); all three share the same parameters and
     the same ``Data`` object.
     """
     rna_path = dirname(__file__) + "/data/rna.fasta"
     self.data = Data(rna_path, ("ACGU", "()."))
     self.params = {
         "conv_num": 1,
         "kernel_num": 3,
         "kernel_len": 5,
         "neuron_num": 2,
         "epochs": 3,
     }
     # Build the models in the original order: m1, m2, m3.
     for attribute, seed in (("m1", 2), ("m2", 13), ("m3", 2)):
         setattr(self, attribute, Model(self.params, self.data, seed=seed))
Пример #3
0
 def setUp(self):
     """Load the DNA, RNA and RNA-PWM fixtures used by the tests.

     All input files live in the ``data`` folder next to this module.
     The PWM data set additionally receives the same additional-data
     files twice (once numerical, once categorical with the categories
     "1".."16") and two position-wise features, "feat1" and "feat2",
     which use the same two files in opposite order.
     """
     here = dirname(__file__)

     def located(*relative):
         # Prefix every fixture path with the directory of this file.
         return [here + rel for rel in relative]

     self.data_dna = Data(
         located("/data/dna_pos.fasta", "/data/dna_neg.fasta"), "ACGT")
     self.data_rna_dot = Data(here + "/data/rna.fasta", ("ACGU", "()."))

     pwm = Data(
         located('/data/rna_pwm1.fasta', '/data/rna_pwm2.fasta'),
         ('ACGU', '().'),
         structure_pwm=True)
     self.data_pwm = pwm

     additional = located('/data/rna_pwm1_add.txt',
                          '/data/rna_pwm2_add.txt')
     pwm.load_additional_data(additional, is_categorical=False)
     pwm.load_additional_data(
         additional,
         is_categorical=True,
         categories=list(map(str, range(1, 17))))

     pwm.load_additional_positionwise_data(
         located('/data/rna_pwm1_pos.txt', '/data/rna_pwm2_pos.txt'),
         "feat1")
     pwm.load_additional_positionwise_data(
         located('/data/rna_pwm2_pos.txt', '/data/rna_pwm1_pos.txt'),
         "feat2")
def measure_rbp(entry):
    """Train, evaluate and save a pysster model for one data set.

    Args:
        entry: Indexable with five items. Items 0 and 1 are the FASTA
            files loaded for training, items 2 and 3 the FASTA files
            loaded for prediction, and item 4 is a name used for the
            output folder (``<name>_pysster/``) and in the timing message.

    Side effects:
        Writes a ``.struct`` file next to each of the four input files,
        creates the output folder and stores ROC / precision-recall
        plots, kernel visualizations and ``model.pkl`` in it. Prints the
        elapsed time (structure prediction, training and prediction).
    """
    import os
    from time import time
    from pysster import utils

    output_folder = entry[4] + "_pysster/"
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    start = time()

    # predict secondary structures for all four input files
    fasta_files = [entry[i] for i in range(4)]
    struct_files = [fasta + ".struct" for fasta in fasta_files]
    for fasta, struct in zip(fasta_files, struct_files):
        utils.predict_structures(fasta, struct, annotate=True)

    # Imported only after structure prediction, as in the original order.
    from pysster.Data import Data
    from pysster.Model import Model

    # load data
    data = Data(struct_files[:2], ("ACGU", "HIMS"))
    # we need to have at least one test sequence, even though we have a
    # separate test object
    data.train_val_test_split(0.8, 0.1999)

    # training
    params = {"kernel_len": 8}
    model = Model(params, data)
    model.train(data)

    # load and predict test data
    data_test = Data(struct_files[2:], ("ACGU", "HIMS"))
    predictions = model.predict(data_test, "all")

    stop = time()
    print("{}, time in seconds: {}".format(entry[4], stop - start))

    # performance evaluation
    labels = data_test.get_labels("all")
    utils.plot_roc(labels, predictions, output_folder + "roc.pdf")
    utils.plot_prec_recall(labels, predictions, output_folder + "prec.pdf")

    # get motifs (the return value is not needed here)
    activations = model.get_max_activations(data_test, "all")
    model.visualize_all_kernels(activations, data_test, output_folder)

    # save model to drive
    utils.save_model(model, "{}model.pkl".format(output_folder))
Пример #5
0
 def setUp(self):
     """Load the DNA, RNA and RNA-PWM fixtures used by the tests.

     All input files live in the ``data`` folder next to this module.
     The PWM data set gets the same additional-data files loaded twice,
     first as numerical and then as categorical data.
     """
     here = dirname(__file__)

     def located(*relative):
         # Prefix every fixture path with the directory of this file.
         return [here + rel for rel in relative]

     self.data_dna = Data(
         located("/data/dna_pos.fasta", "/data/dna_neg.fasta"), "ACGT")
     self.data_rna_dot = Data(here + "/data/rna.fasta", ("ACGU", "()."))

     pwm = Data(
         located('/data/rna_pwm1.fasta', '/data/rna_pwm2.fasta'),
         ('ACGU', '().'),
         structure_pwm=True)
     self.data_pwm = pwm

     additional = located('/data/rna_pwm1_add.txt',
                          '/data/rna_pwm2_add.txt')
     for categorical in (False, True):
         pwm.load_additional_data(additional, is_categorical=categorical)
Пример #6
0
import os
from time import time
from IPython.display import Image
from pysster.Data import Data
from pysster.Grid_Search import Grid_Search
from pysster import utils

# Root directory that every path below is built from.
DATA = '/mnt/isilon/dbhi_bfx/perry/brian/'

# Establish output directory.
# NOTE(review): this path uses "explore_cgi" (underscore) while the input
# paths below use "explore-cgi" (hyphen) - confirm which one is intended.
output_folder = DATA + "explore_cgi/data/interim/cgi_ind_exp/pysster_output/run_10_17_18_2_feats/"
# exist_ok avoids the check-then-create race of isdir() + makedirs().
os.makedirs(output_folder, exist_ok=True)

# Load the two FASTA files with the alphabets "ACGT" and "XDI".
data = Data([
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/pysster_fa/cgi.indel.sample.fa.gz",
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/pysster_fa/both.indel.sample.fa.gz"
], ("ACGT", "XDI"))

# Additional feature files for the "cgi" input.
add_cgi_features = [
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.sample__microsat.out"
]

# Matching files for the "both" input: same paths with every "cgi."
# replaced by "both.".
add_both_features = [x.replace('cgi.', 'both.') for x in add_cgi_features]

indel_len_feat = [
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.sample__indel_length.out",
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/add_feat/both.indel.sample__indel_length.out"
Пример #7
0
import os
from time import time
from IPython.display import Image
from pysster.Data import Data
from pysster.Grid_Search import Grid_Search
from pysster import utils

# Establish output directory (relative to the current working directory).
output_folder = "explore_cgi/data/interim/pysster_tutorial_test_10_3_18/"
# exist_ok avoids the check-then-create race of isdir() + makedirs().
os.makedirs(output_folder, exist_ok=True)


# Load the three tutorial FASTA files with the alphabets "ACGU" and "HIMS".
data = Data([
    "/home/ennisb/pysster/tutorials/data/alu.fa.gz",
    "/home/ennisb/pysster/tutorials/data/rep.fa.gz",
    "/home/ennisb/pysster/tutorials/data/nonrep.fa.gz",
], ("ACGU", "HIMS"))
print(data.get_summary())

# Re-split with fixed portions and a fixed seed, then print the summary
# again.
data.train_val_test_split(portion_train=0.6, portion_val=0.2, seed=3)
print(data.get_summary())

# Model training: grid search over the candidate values listed per key.
params = {
    "conv_num": [2, 3],
    "kernel_num": [20],
    "kernel_len": [20],
    "dropout_input": [0.1, 0.4],
}
searcher = Grid_Search(params)
start = time()
model, summary = searcher.train(data, verbose=False)
stop = time()
print("time in minutes: {}".format((stop-start)/60))

print(summary)

# Performance evaluation
predictions = model.predict(data, "test")
Пример #8
0
import os
from time import time
from IPython.display import Image
from pysster.Data import Data
from pysster.Grid_Search import Grid_Search
from pysster import utils
# Root directory that every path below is built from.
DATA = '/mnt/isilon/dbhi_bfx/perry/brian/'

# Establish output directory.
# NOTE(review): this path uses "explore_cgi" (underscore) while the input
# paths below use "explore-cgi" (hyphen) - confirm which one is intended.
output_folder = DATA + "explore_cgi/data/interim/cgi_ind_exp/pysster_output/train_model_no_add_feats_12_20_18/"
# exist_ok avoids the check-then-create race of isdir() + makedirs().
os.makedirs(output_folder, exist_ok=True)

# Load the two FASTA files with the alphabets "ACGT" and "XDI".
data = Data([
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/pysster_fa/cgi.indel.sample.fa.gz",
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/pysster_fa/both.indel.sample.fa.gz"
], ("ACGT", "XDI"))

print(data.get_summary())

# Re-split with fixed portions and a fixed seed, then print the summary
# again.
data.train_val_test_split(portion_train=0.6, portion_val=0.2, seed=3)
print(data.get_summary())

# Model training: candidate values per hyper-parameter.
params = {
    "conv_num": [2, 3],
    "kernel_num": [100],
    "kernel_len": [8],
    "dropout_input": [0.1, 0.4]
}
Пример #9
0
    "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.unsample__microsat.out"
]

# Feature files for the "both" input: same paths as add_cgi_features with
# every "cgi." replaced by "both.".
add_both_features = [x.replace('cgi.', 'both.') for x in add_cgi_features]

# Indel-length feature files, one per input file (cgi first, then both).
indel_len_feat = [
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.unsample__indel_length.out",
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/add_feat/both.indel.unsample__indel_length.out"
]

# Load the two FASTA files with the alphabets "ACGT" and "XDI".
data = Data([
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/pysster_fa/cgi.indel.unsample.fa.gz",
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/pysster_fa/both.indel.unsample.fa.gz"
], ("ACGT", "XDI"))

# Attach each cgi/both feature-file pair as categorical additional data.
for x, y in zip(add_cgi_features, add_both_features):
    features = [x, y]
    data.load_additional_data(features, is_categorical=True)

# The indel length is attached as non-categorical additional data.
data.load_additional_data(indel_len_feat, is_categorical=False)

# Run the model on the whole data set.
# NOTE(review): `model` is not defined in this fragment - presumably it is
# created or loaded earlier; verify.
predictions = model.predict(data, "all")
# Bare expression: has no effect in a script (presumably notebook output).
predictions

labels = data.get_labels("all")
# Bare expression: has no effect in a script (presumably notebook output).
labels
Пример #10
0
"""Run pysster"""
import os
from time import time
from IPython.display import Image
from pysster.Data import Data
from pysster.Grid_Search import Grid_Search
from pysster import utils

# Root directory that every path below is built from.
DATA = '/mnt/isilon/cbmi/variome/perry/brian/'

# Establish output directory.
# NOTE(review): this path uses "explore_cgi" (underscore) while the input
# paths below use "explore-cgi" (hyphen) - confirm which one is intended.
output_folder = DATA + "explore_cgi/data/interim/kaviar_pysster_additional_features/"
# exist_ok avoids the check-then-create race of isdir() + makedirs().
os.makedirs(output_folder, exist_ok=True)

# ("ACGT") is a plain string, not a one-element tuple.
data = Data([
    DATA + "explore-cgi/data/interim/kaviar_fa_gz/short.cgi.fa.gz",
    DATA + "explore-cgi/data/interim/kaviar_fa_gz/short.both.fa.gz"
], ("ACGT"))

# Additional feature files for the "cgi" input, one path per entry.
add_cgi_features = [
    DATA + "explore-cgi/data/interim/additional_features/1kg.cgi.out",
    DATA + "explore-cgi/data/interim/additional_features/20120824_combined_mask.cgi.out",
    DATA + "explore-cgi/data/interim/additional_features/blackTerry.cgi.out",
    DATA + "explore-cgi/data/interim/additional_features/dgv.cgi.out",
    DATA + "explore-cgi/data/interim/additional_features/dgv.short.cgi.out",
    DATA + "explore-cgi/data/interim/additional_features/GRCh37GenomicSuperDup.sorted.cgi.out",
    DATA + "explore-cgi/data/interim/additional_features/hg19.blacklist.cgi.out",
    DATA + "explore-cgi/data/interim/additional_features/rmsk.cgi.out",
    DATA + "explore-cgi/data/interim/additional_features/simpleRepeat.cgi.out"
]
Пример #11
0
def main():
    """Train, evaluate and save one pysster model per data set in ``RBPs``.

    Each entry holds four FASTA paths (train positive, train negative,
    test positive, test negative) followed by a name. For every entry the
    function writes ``.struct.gz`` files next to the inputs, trains a
    model on the first two, predicts on the last two, and stores plots,
    kernel visualizations, ``kernel_scores.txt`` and ``model.pkl`` in
    ``<name>_pysster/``.
    """

    RBPs = [("data/pum2.train.positive.fasta",
             "data/pum2.train.negative.fasta",
             "data/pum2.test.positive.fasta",
             "data/pum2.test.negative.fasta",
             "PUM2"),
            ("data/qki.train.positive.fasta",
             "data/qki.train.negative.fasta",
             "data/qki.test.positive.fasta",
             "data/qki.test.negative.fasta",
             "QKI"),
            ("data/igf2bp123.train.positive.fasta",
             "data/igf2bp123.train.negative.fasta",
             "data/igf2bp123.test.positive.fasta",
             "data/igf2bp123.test.negative.fasta",
             "IGF2BP123"),
            ("data/srsf1.train.positive.fasta",
             "data/srsf1.train.negative.fasta",
             "data/srsf1.test.positive.fasta",
             "data/srsf1.test.negative.fasta",
             "SRSF1"),
            ("data/taf2n.train.positive.fasta",
             "data/taf2n.train.negative.fasta",
             "data/taf2n.test.positive.fasta",
             "data/taf2n.test.negative.fasta",
             "TAF2N"),
            ("data/nova.train.positive.fasta",
             "data/nova.train.negative.fasta",
             "data/nova.test.positive.fasta",
             "data/nova.test.negative.fasta",
             "NOVA")]

    for entry in RBPs:
        output_folder = entry[4] + "_pysster/"
        # exist_ok avoids the check-then-create race of isdir() + makedirs().
        os.makedirs(output_folder, exist_ok=True)

        start = time()

        # predict secondary structures for all four input files
        fasta_files = entry[:4]
        struct_files = [fasta + ".struct.gz" for fasta in fasta_files]
        for fasta, struct in zip(fasta_files, struct_files):
            utils.predict_structures(fasta, struct, annotate=True)

        # load data
        data = Data(struct_files[:2], ("ACGU", "HIMS"))
        # we need to have at least one test sequence, even though we
        # don't need it
        data.train_val_test_split(0.8, 0.1999)
        print(data.get_summary())

        # training
        params = {"kernel_len": 8}
        model = Model(params, data)
        model.train(data)

        # load and predict test data
        data_test = Data(struct_files[2:], ("ACGU", "HIMS"))
        predictions = model.predict(data_test, "all")

        stop = time()
        print("{}, time in seconds: {}".format(entry[4], stop-start))

        # performance evaluation
        labels = data_test.get_labels("all")
        utils.plot_roc(labels, predictions, output_folder+"roc.pdf")
        utils.plot_prec_recall(labels, predictions, output_folder+"prec.pdf")
        print(utils.get_performance_report(labels, predictions))

        # get motifs; only the score of each kernel is used afterwards
        activations = model.get_max_activations(data_test, "all")
        scores = []
        for kernel in range(model.params["kernel_num"]):
            _logo, score = model.visualize_kernel(
                activations, data_test, kernel, output_folder)
            scores.append(score)

        # kernel indices in ascending order of score (stable for ties)
        sorted_idx = sorted(range(len(scores)), key=lambda idx: scores[idx])
        with open(output_folder+"kernel_scores.txt", "wt") as handle:
            for x in sorted_idx:
                line = "kernel {:>3}: {:.3f}".format(x, scores[x])
                print(line)
                handle.write(line + "\n")

        # save model to drive
        utils.save_model(model, "{}model.pkl".format(output_folder))
Пример #12
0
from IPython.display import Image
# Root directory that the output and input paths below are built from.
DATA = "/mnt/isilon/dbhi_bfx/perry/brian/"

# Establish output directory.
output_folder = DATA + "explore-cgi/data/interim/cgi_ind_exp/pysster_output/model_test_run_ref_fa_10_24_18/"
# exist_ok avoids the check-then-create race of isdir() + makedirs().
os.makedirs(output_folder, exist_ok=True)

# Load the pysster prediction model.
# NOTE(review): this path uses "explore_cgi" (underscore) while the other
# paths in this script use "explore-cgi" (hyphen) - confirm it is intended.
model = utils.load_model(
    "/mnt/isilon/dbhi_bfx/perry/brian/explore_cgi/data/interim/cgi_ind_exp/pysster_output/train_run_ref_seq_only_sampled_10_22_18/model.pkl"
)

# Load the data set; ("ACGT") is a plain string, not a one-element tuple.
data = Data([
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/pysster_ref_fa/cgi.indel.unsample.fa.gz",
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/pysster_ref_fa/both.indel.unsample.fa.gz"
], ("ACGT"))

# Run the model on the whole data set.
predictions = model.predict(data, "all")
# Bare expression: has no effect in a script (presumably notebook output).
predictions

labels = data.get_labels("all")
# Bare expression: has no effect in a script (presumably notebook output).
labels

# Write the evaluation plots into the output folder and print the report.
utils.plot_roc(labels, predictions, output_folder + "roc.png")
utils.plot_prec_recall(labels, predictions, output_folder + "prec.png")
print(utils.get_performance_report(labels, predictions))

# Bare expression: builds an Image for the ROC plot (presumably displayed
# by the notebook).
Image(output_folder + "roc.png")
Пример #13
0
class Test_Data(unittest.TestCase):
    """Unit tests for ``Data`` based on the fixture files in ``./data``.

    Three data sets are built in ``setUp``: a DNA set, an RNA set with
    dot-bracket structure alphabet and an RNA set loaded with
    ``structure_pwm=True`` that also carries additional and position-wise
    data.
    """

    def setUp(self):
        """Load the DNA, RNA and RNA-PWM fixtures used by every test."""
        folder = dirname(__file__)
        dna_files = [
            folder + "/data/dna_pos.fasta", folder + "/data/dna_neg.fasta"
        ]
        rna_files = folder + "/data/rna.fasta"
        rna_pwm = [
            folder + '/data/rna_pwm1.fasta', folder + '/data/rna_pwm2.fasta'
        ]
        rna_pwm_add = [
            folder + '/data/rna_pwm1_add.txt',
            folder + '/data/rna_pwm2_add.txt'
        ]
        rna_pwm_pos_feat1 = [
            folder + '/data/rna_pwm1_pos.txt',
            folder + '/data/rna_pwm2_pos.txt'
        ]
        # Same two files as feat1, in the opposite order.
        rna_pwm_pos_feat2 = [
            folder + '/data/rna_pwm2_pos.txt',
            folder + '/data/rna_pwm1_pos.txt'
        ]
        self.data_dna = Data(dna_files, "ACGT")
        self.data_rna_dot = Data(rna_files, ("ACGU", "()."))
        self.data_pwm = Data(rna_pwm, ('ACGU', '().'), structure_pwm=True)
        # The same files are loaded twice: numerical first, then categorical.
        self.data_pwm.load_additional_data(rna_pwm_add, is_categorical=False)
        self.data_pwm.load_additional_data(
            rna_pwm_add,
            is_categorical=True,
            categories=[str(x) for x in range(1, 17)])
        self.data_pwm.load_additional_positionwise_data(
            rna_pwm_pos_feat1, "feat1")
        self.data_pwm.load_additional_positionwise_data(
            rna_pwm_pos_feat2, "feat2")

    def _assert_splits_disjoint(self, splits):
        """Assert that no element is shared by any two of the three splits.

        Checking only the three-way intersection would still pass when two
        of the splits overlap, so every pair is compared separately.
        """
        train = set(splits["train"])
        val = set(splits["val"])
        test = set(splits["test"])
        self.assertTrue(train & val == set())
        self.assertTrue(train & test == set())
        self.assertTrue(val & test == set())

    def test_data_init_dna(self):
        """Check flags, encoded shape and labels of the DNA data set."""
        self.assertFalse(self.data_dna.is_rna_pwm)
        self.assertFalse(self.data_dna.is_rna)
        self.assertFalse(self.data_dna.multilabel)
        self.assertTrue(len(self.data_dna.data) == 100)
        self.assertTrue(self.data_dna.data[0].shape == (32, 4))
        self.assertTrue(self.data_dna.one_hot_encoder.alphabet == 'ACGT')

        self.assertTrue(len(self.data_dna.labels) == 100)
        self.assertTrue(self.data_dna.labels[0].shape == (2, ))
        # The first 40 entries are labelled [1, 0], the remaining 60 [0, 1].
        for x in range(40):
            self.assertTrue((self.data_dna.labels[x] == [1, 0]).all())
        for x in range(40, 100):
            self.assertTrue((self.data_dna.labels[x] == [0, 1]).all())

    def test_data_init_rna(self):
        """Check flags, encoded shape and labels of the RNA data set."""
        self.assertFalse(self.data_rna_dot.is_rna_pwm)
        self.assertTrue(len(self.data_rna_dot.data) == 20)
        self.assertTrue(self.data_rna_dot.data[0].shape == (40, 12))
        self.assertTrue(self.data_rna_dot.alpha_coder.alph1 == '().')

        # Indices of the entries expected to carry label 0, 1 and 2.
        idx_0 = [0, 2, 4, 10, 11, 14, 18, 19]
        idx_1 = [1, 2, 5, 6, 9, 10, 12, 15, 16, 17, 19]
        idx_2 = [0, 2, 3, 6, 7, 8, 9, 10, 13, 14, 15, 16]
        for obj in [self.data_rna_dot]:
            self.assertTrue(obj.is_rna)
            self.assertTrue(obj.multilabel)
            self.assertTrue(obj.alpha_coder.alph0 == 'ACGU')
            self.assertTrue(
                obj.one_hot_encoder.alphabet == obj.alpha_coder.alphabet)

            self.assertTrue(len(obj.labels) == 20)
            self.assertTrue(obj.labels[0].shape == (3, ))
            for x in idx_0:
                self.assertTrue(obj.labels[x][0] == 1)
            for x in idx_1:
                self.assertTrue(obj.labels[x][1] == 1)
            for x in idx_2:
                self.assertTrue(obj.labels[x][2] == 1)

    def test_data_train_val_test_split(self):
        """Check the sizes of the default splits and that they are disjoint."""
        for obj in [self.data_rna_dot]:
            self.assertTrue(len(obj.splits["train"]) == 14)
            self.assertTrue(len(obj.splits["val"]) == 3)
            self.assertTrue(len(obj.splits["test"]) == 3)
            self._assert_splits_disjoint(obj.splits)

        self.assertTrue(len(self.data_dna.splits["train"]) == 70)
        self.assertTrue(len(self.data_dna.splits["val"]) == 15)
        self.assertTrue(len(self.data_dna.splits["test"]) == 15)
        self._assert_splits_disjoint(self.data_dna.splits)

    def test_data_shape(self):
        """Check the value returned by ``_shape()`` for all three data sets."""
        self.assertTrue(self.data_dna._shape() == (32, 4))
        self.assertTrue(self.data_rna_dot._shape() == (40, 12))
        self.assertTrue(self.data_pwm._shape() == (10, 14))

    def test_data_get_sequences(self):
        """Check count and content of the DNA sequences for every group."""
        num_seqs = {'train': 70, 'val': 15, 'test': 15, 'all': 100}
        for group in ['train', 'val', 'test', 'all']:
            seqs = []
            for class_id in [0, 1]:
                seqs += self.data_dna._get_sequences(class_id, group)
            self.assertTrue(len(seqs) == num_seqs[group])
            for seq in seqs:
                self.assertTrue(seq == "ACGTACGTACGTACGTACGTACGTACGTACGT")

    def test_data_get_data(self):
        """Check the shapes returned by ``_get_data`` for every group."""
        num_seqs = {'train': 70, 'val': 15, 'test': 15, 'all': 100}
        for group in ['train', 'val', 'test', 'all']:
            one_hot = self.data_dna._get_data(group)
            self.assertTrue(one_hot[0].shape == (num_seqs[group], 32, 4))
            self.assertTrue(one_hot[1].shape == (num_seqs[group], 2))

        num_seqs = {'train': 14, 'val': 3, 'test': 3, 'all': 20}
        for group in ['train', 'val', 'test', 'all']:
            one_hot = self.data_rna_dot._get_data(group)
            self.assertTrue(one_hot[0].shape == (num_seqs[group], 40, 12))
            self.assertTrue(one_hot[1].shape == (num_seqs[group], 3))

    def test_data_get_class_weights(self):
        """Check the class weights of the DNA and RNA data sets."""
        weights = self.data_dna._get_class_weights()
        for key, val in {0: 1.5, 1: 1.0}.items():
            self.assertTrue(isclose(weights[key], val))
        weights = self.data_rna_dot._get_class_weights()
        for key, val in {0: 1.5, 1: 1.0909090909090908, 2: 1.0}.items():
            self.assertTrue(isclose(weights[key], val))

    def test_data_get_summary(self):
        """Check layout and consistency of the text from ``get_summary``."""
        # Only the first two entries are used (one per data set below).
        classes = [["class_0", "class_1"], ["class_0", "class_1", "class_2"],
                   ["class_0", "class_1", "class_2"]]
        for i, obj in enumerate([self.data_dna, self.data_rna_dot]):
            text = obj.get_summary()
            text = text.split("\n")
            self.assertTrue(text[0].split() == classes[i])
            self.assertTrue(text[1].split()[:2] == ["all", "data:"])
            self.assertTrue(text[2].split()[0] == "training:")
            self.assertTrue(text[3].split()[0] == "validation:")
            self.assertTrue(text[4].split()[0] == "test:")
            # Per class, training + validation + test must add up to the
            # "all data" count ("all data:" takes two tokens, hence x + 2).
            for x in range(len(classes[i])):
                self.assertTrue(
                    int(text[2].split()[x + 1]) + int(text[3].split()[x + 1]) +
                    int(text[4].split()[x + 1]) == int(text[1].split()[x + 2]))

    def test_data_get_labels(self):
        """Check shape and row/column sums of ``get_labels`` per group."""
        labels = self.data_dna.get_labels("test")
        self.assertTrue(labels.shape == (15, 2))
        self.assertTrue((labels.sum(axis=1) == 1).all())
        labels = self.data_dna.get_labels("train")
        self.assertTrue(labels.shape == (70, 2))
        self.assertTrue((labels.sum(axis=1) == 1).all())
        labels = self.data_dna.get_labels("val")
        self.assertTrue(labels.shape == (15, 2))
        self.assertTrue((labels.sum(axis=1) == 1).all())
        labels = self.data_dna.get_labels("all")
        self.assertTrue(labels.shape == (100, 2))
        self.assertTrue((labels.sum(axis=1) == 1).all())
        self.assertTrue((labels.sum(axis=0) == [40, 60]).all())

        labels = self.data_rna_dot.get_labels("test")
        self.assertTrue(labels.shape == (3, 3))
        self.assertTrue((labels.sum(axis=1) <= 3).all())
        labels = self.data_rna_dot.get_labels("all")
        self.assertTrue(labels.shape == (20, 3))
        self.assertTrue((labels.sum(axis=1) <= 3).all())
        self.assertTrue((labels.sum(axis=0) == [8, 11, 12]).all())

    def test_data_pwm_struct(self):
        """Check the encoded first entry of the PWM data set."""
        self.assertTrue(self.data_pwm.is_rna_pwm == True)
        self.assertTrue(len(self.data_pwm.data) == 32)
        self.assertTrue(self.data_pwm.data[0].shape == (10, 12))
        # Expected encoding of the first entry, reshaped to (10, 12) below.
        ref = np.array([
            0.9, 0, 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0.8, 0, 0.2, 0.7, 0, 0.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0.9, 0, 0.1, 0.0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0.0, 0, 1.0, 0.0, 0.2, 0.8, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0, 0.7, 0.3, 0.0, 0.8,
            0.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0,
            0.9, 0.1
        ])
        ref.shape = (10, 12)
        self.assertTrue(np.allclose(self.data_pwm.data[0], ref))
        seqs = self.data_pwm._get_sequences(0, 'all')
        self.assertTrue(len(seqs) == 16)
        self.assertTrue(np.allclose(ref, seqs[15]))

    def test_data_additional(self):
        """Check additional and position-wise data, then train on them.

        Also trains a small model on the PWM data set, visualizes kernel 0
        into the temporary directory and removes the four image files that
        are expected there afterwards.
        """
        self.assertTrue(len(self.data_pwm.meta) == 2)
        self.assertTrue(self.data_pwm.meta[0]['is_categorical'] == False)
        self.assertTrue(self.data_pwm.meta[1]['is_categorical'] == True)
        self.assertTrue(self.data_pwm.meta[0]['data'] == [
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 15, 14,
            13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
        ])
        self.assertTrue(len(self.data_pwm.meta[1]['data']) == 32)
        # Every categorical entry must sum to exactly one.
        for x in self.data_pwm.meta[1]['data']:
            self.assertTrue(sum(x) == 1)
        self.assertTrue((self.data_pwm.meta[1]['data'][0] ==
                         self.data_pwm.meta[1]['data'][31]).all())
        self.assertTrue((self.data_pwm.meta[1]['data'][13] ==
                         self.data_pwm.meta[1]['data'][18]).all())
        # Each row is expected to hold the numerical value followed by the
        # categorical encoding of the same entry.
        addi = self.data_pwm._get_additional_data([0, 1, 15, 16], 0, 4)
        self.assertTrue(len(addi) == 4)
        self.assertTrue(
            np.allclose(addi[0], [1, *self.data_pwm.meta[1]['data'][0]]))
        self.assertTrue(
            np.allclose(addi[1], [2, *self.data_pwm.meta[1]['data'][1]]))
        self.assertTrue(
            np.allclose(addi[2], [16, *self.data_pwm.meta[1]['data'][15]]))
        self.assertTrue(
            np.allclose(addi[3], [16, *self.data_pwm.meta[1]['data'][16]]))

        # check position-wise additional data
        self.assertTrue(len(self.data_pwm.positionwise) == 2)
        self.assertTrue(
            list(self.data_pwm.positionwise.keys()) == ["feat1", "feat2"])
        gen = self.data_pwm._data_generator("all", 32, False, False)
        dat = next(gen)
        self.assertTrue(dat[1].shape == (32, 17))
        self.assertTrue(dat[0].shape == (32, 10, 14))
        # Columns 12 and 13 are compared against the position-wise values.
        self.assertTrue(
            np.allclose(dat[0][0, :, 12],
                        [0.9, 0.8, 0.7, 0.9, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]))
        self.assertTrue(
            np.allclose(dat[0][31, :, 13],
                        [0.1, 0.2, 0.3, 0.1, 1.0, 1.0, 0.8, 0.3, 0.2, 0.1]))
        self.assertTrue(
            np.allclose(dat[0][31, :, 12],
                        [2.1, 2.2, 2.3, 2.1, 2.0, 2.0, 2.8, 2.3, 2.2, 2.1]))

        mod = Model(
            {
                "conv_num": 1,
                "kernel_num": 2,
                "kernel_len": 4,
                "neuron_num": 2,
                "epochs": 2
            }, self.data_pwm)
        mod.train(self.data_pwm, verbose=True)
        predictions = mod.predict(self.data_pwm, "all")
        self.assertTrue(predictions.shape == (32, 2))

        # check kernel output plot for position-wise data
        folder = gettempdir() + '/'
        acts = mod.get_max_activations(self.data_pwm, 'all')
        motif, score = mod.visualize_kernel(acts, self.data_pwm, 0, folder)
        with Image.open(folder + "additional_features_kernel_0.png") as img:
            self.assertTrue(img.size == (500, 1400))
        # Removing the files also fails the test if one of them is missing.
        remove(folder + "additional_features_kernel_0.png")
        remove(folder + "motif_kernel_0.png")
        remove(folder + "position_kernel_0.png")
        remove(folder + "activations_kernel_0.png")