Exemplo n.º 1
0
# Derive the "both" feature paths from the "cgi" ones by replacing every
# 'cgi.' occurrence in each path with 'both.'.
# NOTE: add_cgi_features is defined outside this excerpt.
add_both_features = [x.replace('cgi.', 'both.') for x in add_cgi_features]

# Disabled: paths of the indel-length feature files for the unsampled data.
#indel_len_feat = [DATA + "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.unsample__indel_length.out",
#                  DATA + "explore-cgi/data/interim/cgi_ind_exp/add_feat/both.indel.unsample__indel_length.out"]

# Build the Data object from the two unsampled FASTA files ("cgi" and "both"),
# passing the alphabet pair ("ACGT", "XDI").
# NOTE(review): presumably each file is treated as one class - confirm against
# the Data documentation.
data = Data([
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/pysster_fa/cgi.indel.unsample.fa.gz",
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/pysster_fa/both.indel.unsample.fa.gz"
], ("ACGT", "XDI"))

# Attach every additional feature as categorical data; each call receives the
# matching pair of files [cgi file, both file].
for x, y in zip(add_cgi_features, add_both_features):
    features = [x, y]
    data.load_additional_data(features, is_categorical=True)

# Disabled: loading of the (non-categorical) indel-length feature.
#data.load_additional_data(indel_len_feat, is_categorical=False)

# Run the model on the "all" group of the data set.
# NOTE: model is defined outside this excerpt.
predictions = model.predict(data, "all")
# Bare expression: has no effect when run as a script (it only echoes the
# value in an interactive session).
predictions

# Labels of the same "all" group, used below as the reference for the
# predictions.
labels = data.get_labels("all")
# Bare expression, see above.
labels

# Write the ROC and precision/recall plots into output_folder and print the
# performance report.
utils.plot_roc(labels, predictions, output_folder + "roc.png")
utils.plot_prec_recall(labels, predictions, output_folder + "prec.png")
print(utils.get_performance_report(labels, predictions))

# Bare expression again: builds an Image for the ROC plot written above; the
# object is discarded when this runs as a plain script.
Image(output_folder + "roc.png")
Exemplo n.º 2
0
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.sample__microsat.out"
]

# Derive the "both" feature paths from the "cgi" ones by replacing every
# 'cgi.' occurrence in each path with 'both.'.
add_both_features = [x.replace('cgi.', 'both.') for x in add_cgi_features]

# Indel-length feature files for the sampled data: [cgi file, both file].
indel_len_feat = [
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.sample__indel_length.out",
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/add_feat/both.indel.sample__indel_length.out"
]

# Attach every additional feature as categorical data; each call receives the
# matching pair of files [cgi file, both file].
# NOTE: data is created outside this excerpt.
for x, y in zip(add_cgi_features, add_both_features):
    features = [x, y]
    data.load_additional_data(features, is_categorical=True)

# The indel length is loaded as a non-categorical feature.
data.load_additional_data(indel_len_feat, is_categorical=False)

# Summary before the explicit split.
print(data.get_summary())

# Re-split with 60 % training and 20 % validation data and a fixed seed.
# NOTE(review): the remaining 20 % presumably becomes the test set - confirm.
data.train_val_test_split(portion_train=0.6, portion_val=0.2, seed=3)
# Summary after the split, for comparison with the one printed above.
print(data.get_summary())

# Model training
# Every value is a list of candidates.
# NOTE(review): presumably consumed by a hyper-parameter grid search outside
# this excerpt - confirm.
params = {
    "conv_num": [2, 3],
    "kernel_num": [100],
    "kernel_len": [8],
    "dropout_input": [0.1, 0.4]
}
Exemplo n.º 3
0
class Test_Data(unittest.TestCase):
    def setUp(self):
        """Build the DNA, RNA and PWM ``Data`` fixtures used by the tests.

        All input files are addressed relative to ``dirname(__file__)``.
        """
        here = dirname(__file__)

        def in_test_dir(suffix):
            # Plain concatenation: every suffix already starts with "/".
            return here + suffix

        # DNA data set built from two FASTA files.
        self.data_dna = Data(
            [in_test_dir("/data/dna_pos.fasta"),
             in_test_dir("/data/dna_neg.fasta")],
            "ACGT")

        # RNA data set built from a single file, with a second alphabet "().".
        self.data_rna_dot = Data(
            in_test_dir("/data/rna.fasta"), ("ACGU", "()."))

        # RNA data set created with structure_pwm=True.
        self.data_pwm = Data(
            [in_test_dir('/data/rna_pwm1.fasta'),
             in_test_dir('/data/rna_pwm2.fasta')],
            ('ACGU', '().'),
            structure_pwm=True)

        # The same per-sequence files are loaded twice: first as numerical
        # data, then as categorical data with the categories "1" .. "16".
        per_sequence_files = [
            in_test_dir('/data/rna_pwm1_add.txt'),
            in_test_dir('/data/rna_pwm2_add.txt'),
        ]
        self.data_pwm.load_additional_data(
            per_sequence_files, is_categorical=False)
        self.data_pwm.load_additional_data(
            per_sequence_files,
            is_categorical=True,
            categories=[str(number) for number in range(1, 17)])

        # Two position-wise features; "feat2" uses the files of "feat1" in
        # swapped order.
        positionwise_files = [
            in_test_dir('/data/rna_pwm1_pos.txt'),
            in_test_dir('/data/rna_pwm2_pos.txt'),
        ]
        self.data_pwm.load_additional_positionwise_data(
            positionwise_files, "feat1")
        self.data_pwm.load_additional_positionwise_data(
            positionwise_files[::-1], "feat2")

    def test_data_init_dna(self):
        """Check flags, encoded data and labels of the DNA data set.

        Scalar comparisons use ``assertEqual`` so that a failure reports the
        actual and the expected value instead of a bare "False is not true".
        """
        dna = self.data_dna
        self.assertFalse(dna.is_rna_pwm)
        self.assertFalse(dna.is_rna)
        self.assertFalse(dna.multilabel)
        self.assertEqual(len(dna.data), 100)
        self.assertEqual(dna.data[0].shape, (32, 4))
        self.assertEqual(dna.one_hot_encoder.alphabet, 'ACGT')

        self.assertEqual(len(dna.labels), 100)
        self.assertEqual(dna.labels[0].shape, (2, ))
        # The first 40 labels must be [1, 0], the remaining 60 must be [0, 1].
        # The element-wise comparison stays inside assertTrue; the message
        # names the offending index.
        for x in range(40):
            self.assertTrue((dna.labels[x] == [1, 0]).all(),
                            msg="unexpected label at index {}".format(x))
        for x in range(40, 100):
            self.assertTrue((dna.labels[x] == [0, 1]).all(),
                            msg="unexpected label at index {}".format(x))

    def test_data_init_rna(self):
        """Check flags, alphabets, encoded data and labels of the RNA data."""
        rna = self.data_rna_dot
        self.assertFalse(rna.is_rna_pwm)
        self.assertTrue(len(rna.data) == 20)
        self.assertTrue(rna.data[0].shape == (40, 12))
        self.assertTrue(rna.alpha_coder.alph1 == '().')

        self.assertTrue(rna.is_rna)
        self.assertTrue(rna.multilabel)
        self.assertTrue(rna.alpha_coder.alph0 == 'ACGU')
        self.assertTrue(
            rna.one_hot_encoder.alphabet == rna.alpha_coder.alphabet)

        self.assertTrue(len(rna.labels) == 20)
        self.assertTrue(rna.labels[0].shape == (3, ))

        # positives[c] lists the sequence indices whose label column c must
        # be set to 1.
        positives = (
            [0, 2, 4, 10, 11, 14, 18, 19],
            [1, 2, 5, 6, 9, 10, 12, 15, 16, 17, 19],
            [0, 2, 3, 6, 7, 8, 9, 10, 13, 14, 15, 16],
        )
        for column, rows in enumerate(positives):
            for row in rows:
                self.assertTrue(rna.labels[row][column] == 1)
    def test_data_train_val_test_split(self):
        """Check the sizes and the mutual disjointness of the data splits.

        The three splits are compared pairwise. Intersecting all three at
        once would only detect an entry present in every split and would
        miss an entry shared by just two of them.
        """
        expected_sizes = (
            (self.data_rna_dot, {"train": 14, "val": 3, "test": 3}),
            (self.data_dna, {"train": 70, "val": 15, "test": 15}),
        )
        for obj, sizes in expected_sizes:
            for group, size in sizes.items():
                self.assertEqual(len(obj.splits[group]), size)

            train = set(obj.splits["train"])
            val = set(obj.splits["val"])
            test = set(obj.splits["test"])
            self.assertTrue(train.isdisjoint(val),
                            msg="train and val splits overlap")
            self.assertTrue(train.isdisjoint(test),
                            msg="train and test splits overlap")
            self.assertTrue(val.isdisjoint(test),
                            msg="val and test splits overlap")

    def test_data_shape(self):
        """Check the value returned by ``_shape()`` for each fixture."""
        expected_shapes = (
            (self.data_dna, (32, 4)),
            (self.data_rna_dot, (40, 12)),
            (self.data_pwm, (10, 14)),
        )
        for dataset, shape in expected_shapes:
            self.assertTrue(dataset._shape() == shape)

    def test_data_get_sequences(self):
        """Check number and content of the DNA sequences of every group."""
        expected_counts = {'train': 70, 'val': 15, 'test': 15, 'all': 100}
        for group, count in expected_counts.items():
            # Collect the sequences of both classes of this group.
            collected = []
            for class_id in (0, 1):
                collected.extend(
                    self.data_dna._get_sequences(class_id, group))
            self.assertTrue(len(collected) == count)
            # Every sequence is expected to be the same 32-mer.
            for sequence in collected:
                self.assertTrue(
                    sequence == "ACGTACGTACGTACGTACGTACGTACGTACGT")

    def test_data_get_data(self):
        """Check the shapes of both elements returned by ``_get_data``."""
        # (data set, sequences per group, shape of one sample, label width)
        cases = (
            (self.data_dna,
             {'train': 70, 'val': 15, 'test': 15, 'all': 100}, (32, 4), 2),
            (self.data_rna_dot,
             {'train': 14, 'val': 3, 'test': 3, 'all': 20}, (40, 12), 3),
        )
        for dataset, counts, sample_shape, label_width in cases:
            for group, count in counts.items():
                result = dataset._get_data(group)
                self.assertTrue(result[0].shape == (count, ) + sample_shape)
                self.assertTrue(result[1].shape == (count, label_width))

    def test_data_get_class_weights(self):
        """Compare the class weights of both data sets with fixed values."""
        cases = (
            (self.data_dna, {0: 1.5, 1: 1.0}),
            (self.data_rna_dot, {0: 1.5, 1: 1.0909090909090908, 2: 1.0}),
        )
        for dataset, expected in cases:
            weights = dataset._get_class_weights()
            for class_id, weight in expected.items():
                self.assertTrue(isclose(weights[class_id], weight))

    def test_data_get_summary(self):
        """Check layout and internal consistency of the summary text."""
        cases = (
            (self.data_dna, ["class_0", "class_1"]),
            (self.data_rna_dot, ["class_0", "class_1", "class_2"]),
        )
        for dataset, header in cases:
            # One list of whitespace-separated fields per summary line.
            rows = [line.split() for line in dataset.get_summary().split("\n")]
            self.assertTrue(rows[0] == header)
            self.assertTrue(rows[1][:2] == ["all", "data:"])
            self.assertTrue(rows[2][0] == "training:")
            self.assertTrue(rows[3][0] == "validation:")
            self.assertTrue(rows[4][0] == "test:")
            # Per class: training + validation + test must equal the total.
            # The total row has a two-word title, hence the offset of one.
            for col in range(1, len(header) + 1):
                parts = (int(rows[2][col]) + int(rows[3][col]) +
                         int(rows[4][col]))
                self.assertTrue(parts == int(rows[1][col + 1]))

    def test_data_get_labels(self):
        """Check shape and row/column sums of the label matrices."""
        # DNA: every row must sum to exactly 1.
        for group, rows in (("test", 15), ("train", 70), ("val", 15)):
            labels = self.data_dna.get_labels(group)
            self.assertTrue(labels.shape == (rows, 2))
            self.assertTrue((labels.sum(axis=1) == 1).all())
        all_dna = self.data_dna.get_labels("all")
        self.assertTrue(all_dna.shape == (100, 2))
        self.assertTrue((all_dna.sum(axis=1) == 1).all())
        self.assertTrue((all_dna.sum(axis=0) == [40, 60]).all())

        # RNA: a row may sum to anything up to 3.
        test_rna = self.data_rna_dot.get_labels("test")
        self.assertTrue(test_rna.shape == (3, 3))
        self.assertTrue((test_rna.sum(axis=1) <= 3).all())
        all_rna = self.data_rna_dot.get_labels("all")
        self.assertTrue(all_rna.shape == (20, 3))
        self.assertTrue((all_rna.sum(axis=1) <= 3).all())
        self.assertTrue((all_rna.sum(axis=0) == [8, 11, 12]).all())

    def test_data_pwm_struct(self):
        """Check the encoded content of the data set built with a PWM."""
        pwm = self.data_pwm
        self.assertTrue(pwm.is_rna_pwm == True)
        self.assertTrue(len(pwm.data) == 32)
        self.assertTrue(pwm.data[0].shape == (10, 12))

        # Expected 10 x 12 matrix, written as a flat list of 120 values.
        expected = np.array([
            0.9, 0, 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0.8, 0, 0.2, 0.7, 0, 0.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0.9, 0, 0.1, 0.0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0.0, 0, 1.0, 0.0, 0.2, 0.8, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0, 0.7, 0.3, 0.0, 0.8,
            0.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0,
            0.9, 0.1
        ]).reshape(10, 12)

        # The first stored sample must match the expected matrix ...
        self.assertTrue(np.allclose(pwm.data[0], expected))
        # ... and so must the last of the 16 entries returned for class 0.
        class_0 = pwm._get_sequences(0, 'all')
        self.assertTrue(len(class_0) == 16)
        self.assertTrue(np.allclose(expected, class_0[15]))

    def test_data_additional(self):
        """Check per-sequence and position-wise additional data end to end.

        Covers, in order: the metadata stored by ``load_additional_data``,
        the rows returned by ``_get_additional_data``, the position-wise
        columns delivered by ``_data_generator``, and finally that a small
        ``Model`` trains on the data and writes its kernel plots.

        The plots go into a private temporary directory that is removed even
        when an assertion fails, so no files are left behind and concurrent
        runs cannot clash over fixed file names in the shared temp folder.
        """
        # Local imports: only this test needs them.
        from os.path import isfile
        from tempfile import TemporaryDirectory

        pwm = self.data_pwm
        meta = pwm.meta

        # setUp loads the same files twice: numerically, then categorically.
        self.assertEqual(len(meta), 2)
        self.assertEqual(meta[0]['is_categorical'], False)
        self.assertEqual(meta[1]['is_categorical'], True)
        self.assertEqual(meta[0]['data'], [
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 15, 14,
            13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
        ])
        self.assertEqual(len(meta[1]['data']), 32)
        # Every categorical entry must sum to 1.
        for x in meta[1]['data']:
            self.assertEqual(sum(x), 1)
        # Entries that share a numerical value (1 at indices 0/31, 14 at
        # indices 13/18) must share their categorical encoding.
        self.assertTrue((meta[1]['data'][0] == meta[1]['data'][31]).all())
        self.assertTrue((meta[1]['data'][13] == meta[1]['data'][18]).all())

        # Each returned row is expected to be the numerical value followed by
        # the categorical encoding of the same sequence.
        addi = pwm._get_additional_data([0, 1, 15, 16], 0, 4)
        self.assertEqual(len(addi), 4)
        # (row in addi, numerical value, index of the sequence)
        for row, value, idx in ((0, 1, 0), (1, 2, 1), (2, 16, 15),
                                (3, 16, 16)):
            self.assertTrue(
                np.allclose(addi[row], [value, *meta[1]['data'][idx]]))

        # check position-wise additional data
        self.assertEqual(len(pwm.positionwise), 2)
        self.assertEqual(list(pwm.positionwise.keys()), ["feat1", "feat2"])
        # NOTE(review): the meaning of the two boolean arguments is not
        # visible from this file - confirm against Data._data_generator.
        gen = pwm._data_generator("all", 32, False, False)
        dat = next(gen)
        self.assertEqual(dat[1].shape, (32, 17))
        self.assertEqual(dat[0].shape, (32, 10, 14))
        # Columns 12 and 13 of the last axis are compared with fixed values;
        # presumably they hold "feat1" and "feat2" - confirm.
        self.assertTrue(
            np.allclose(dat[0][0, :, 12],
                        [0.9, 0.8, 0.7, 0.9, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]))
        self.assertTrue(
            np.allclose(dat[0][31, :, 13],
                        [0.1, 0.2, 0.3, 0.1, 1.0, 1.0, 0.8, 0.3, 0.2, 0.1]))
        self.assertTrue(
            np.allclose(dat[0][31, :, 12],
                        [2.1, 2.2, 2.3, 2.1, 2.0, 2.0, 2.8, 2.3, 2.2, 2.1]))

        # Train a deliberately tiny model on the same data.
        mod = Model(
            {
                "conv_num": 1,
                "kernel_num": 2,
                "kernel_len": 4,
                "neuron_num": 2,
                "epochs": 2
            }, pwm)
        mod.train(pwm, verbose=True)
        predictions = mod.predict(pwm, "all")
        self.assertEqual(predictions.shape, (32, 2))

        # check kernel output plot for position-wise data
        with TemporaryDirectory() as tmp_dir:
            # The output folder is passed with a trailing separator.
            folder = tmp_dir + '/'
            acts = mod.get_max_activations(pwm, 'all')
            mod.visualize_kernel(acts, pwm, 0, folder)
            with Image.open(
                    folder + "additional_features_kernel_0.png") as img:
                self.assertEqual(img.size, (500, 1400))
            # The remaining plots must have been written as well.
            for name in ("motif_kernel_0.png", "position_kernel_0.png",
                         "activations_kernel_0.png"):
                self.assertTrue(
                    isfile(folder + name),
                    msg="missing output file {}".format(name))