def test_read_dna_from_biostring_order_1():
    """Build a Bioseq from pre-parsed sequence records and check its encoding.

    The records are read from the packaged ``sample.fa`` with
    ``sequences_from_fasta`` and handed to ``Bioseq.create_from_seq``
    directly.  The test asserts that:

    * requesting ``storage='sparse'`` raises ``ValueError``;
    * the default storage yields 3897 entries of shape ``(200, 1, 4)``;
    * the first ten positions of the first entry match a fixed one-hot
      pattern.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    fasta_path = os.path.join(resources, 'sample.fa')
    kmer_order = 1

    records = sequences_from_fasta(fasta_path)

    # Sparse storage is expected to be rejected for this input.
    with pytest.raises(ValueError):
        Bioseq.create_from_seq('train', fastafile=records,
                               storage='sparse', order=kmer_order,
                               cache=False)

    data = Bioseq.create_from_seq('train', fastafile=records,
                                  order=kmer_order, cache=False)

    np.testing.assert_equal(len(data), 3897)
    np.testing.assert_equal(data.shape, (3897, 200, 1, 4))

    # Column index of the single 1 in each of the first ten rows; indexing
    # the identity matrix with them reproduces the expected one-hot rows.
    hot_columns = [1, 0, 1, 0, 2, 1, 0, 2, 0, 2]
    expected_head = np.eye(4, dtype='int8')[hot_columns]

    np.testing.assert_equal(data[0][0, :10, 0, :], expected_head)
def test_read_protein_sequences():
    """Load protein sequences with two fixed lengths and check the encoding.

    The packaged ``sample_protein.fa`` is loaded twice with
    ``seqtype='protein'``:

    * ``fixedlen=1000``: expects 3 entries of shape ``(1000, 1, 20)``, a
      fixed one-hot pattern in the first four positions, and all-zero rows
      in the last two positions;
    * ``fixedlen=5``: expects 3 entries of shape ``(5, 1, 20)`` with the
      same pattern in the first four positions.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    protein_path = os.path.join(resources, 'sample_protein.fa')
    kmer_order = 1

    # Column index of the single 1 in each of the first four rows; both
    # fixed-length variants are checked against the same leading pattern.
    expected_head = np.eye(20, dtype='int8')[[10, 0, 4, 15]]

    # --- long fixed length -------------------------------------------------
    data = Bioseq.create_from_seq('train', fastafile=protein_path,
                                  order=kmer_order, seqtype='protein',
                                  fixedlen=1000)

    np.testing.assert_equal(len(data), 3)
    np.testing.assert_equal(data.shape, (3, 1000, 1, 20))
    np.testing.assert_equal(data[0][0, :4, 0, :], expected_head)
    # The tail of the first entry must contain no set bits.
    np.testing.assert_equal(data[0][0, -2:, 0, :],
                            np.zeros((2, 20), dtype='int8'))

    # --- short fixed length ------------------------------------------------
    data = Bioseq.create_from_seq('train', fastafile=protein_path,
                                  order=kmer_order, seqtype='protein',
                                  fixedlen=5)

    np.testing.assert_equal(len(data), 3)
    np.testing.assert_equal(data.shape, (3, 5, 1, 20))
    np.testing.assert_equal(data[0][0, :4, 0, :], expected_head)
def test_read_dna_from_fasta_order_2():
    """Load DNA from a FASTA file with order 2 and check the encoding.

    The packaged ``sample.fa`` is loaded with ``order=2``.  The test asserts
    that the dataset holds 3897 entries of shape ``(199, 1, 16)`` and that
    the first ten positions of the first entry match a fixed one-hot
    pattern over the 16 columns.
    """
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    filename = os.path.join(data_path, 'sample.fa')
    order = 2

    # A single load is sufficient: no argument of the call varies, so
    # repeating it would only re-run identical assertions.
    data = Bioseq.create_from_seq('train', fastafile=filename,
                                  order=order, cache=False)

    np.testing.assert_equal(len(data), 3897)
    np.testing.assert_equal(data.shape, (3897, 199, 1, 16))

    # Column index of the single 1 in each of the first ten rows; indexing
    # the identity matrix with them yields the expected one-hot rows.
    hot_columns = [4, 1, 4, 2, 9, 4, 2, 8, 2, 8]
    expected_head = np.eye(16, dtype='int8')[hot_columns]

    np.testing.assert_equal(data[0][0, :10, 0, :], expected_head)
def test_janggu_influence_fasta(tmpdir):
    """Check that attribution by genomic interval and by index agree.

    A small model (flatten followed by two dense layers) is created on top
    of a Bioseq dataset named ``'dna'`` loaded from the packaged
    ``sample.fa``.  ``input_attribution`` is then called twice for the first
    region of the dataset: once addressed by ``chrom``/``start``/``end`` and
    once by ``idx=0``.  Both results must be equal.

    Parameters
    ----------
    tmpdir
        pytest fixture; not used in the body of this test.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    fasta_path = os.path.join(resources, 'sample.fa')

    dna = Bioseq.create_from_seq('dna', fastafile=fasta_path,
                                 order=1, cache=False)

    @inputlayer
    def _cnn_model(inputs, inp, oup, params):
        # params[0] sets the width of the hidden dense layer.
        flat = Flatten()(inputs['dna'])
        hidden = Dense(params[0])(flat)
        prediction = Dense(1, activation='sigmoid')(hidden)
        return inputs, prediction

    model = Janggu.create(_cnn_model, modelparams=(2, ),
                          inputs=dna,
                          name='dna_ctcf_HepG2-cnn')

    # Address the first region of the dataset by its genomic coordinates ...
    first_region = dna.gindexer[0]
    by_region = input_attribution(model, dna,
                                  chrom=first_region.chrom,
                                  start=first_region.start,
                                  end=first_region.end)

    # ... and by its position in the dataset.
    by_index = input_attribution(model, dna, idx=0)

    np.testing.assert_equal(by_region[0][:], by_index[0][:])
Note however, that this is a simplification that might not always work.
    In general, one would need to parse for '>' occurrences.
    """
    # Counts the lines whose first character is '>'.
    # NOTE(review): the handle returned by open(filename) is never closed
    # explicitly here.
    return sum((1 for line in open(filename) if line[0] == '>'))


# load the dataset
# Both FASTA files are looked up in the 'resources/' directory of the
# installed janggu package.
DATA_PATH = pkg_resources.resource_filename('janggu', 'resources/')
SAMPLE_1 = os.path.join(DATA_PATH, 'sample.fa')
SAMPLE_2 = os.path.join(DATA_PATH, 'sample2.fa')

# DNA sequences in one-hot encoding will be used as input
# The two files are passed as one list, SAMPLE_1 first; the labels built
# below follow the same file order.
# NOTE(review): presumably the resulting dataset keeps the sequences in
# file-list order -- confirm against Bioseq.create_from_seq.
DNA = Bioseq.create_from_seq('dna', fastafile=[SAMPLE_1, SAMPLE_2],
                             order=args.order,
                             cache=True)

# An array of 1/0 will be used as labels for training
# One [1] row per '>' line counted in SAMPLE_1, followed by one [0] row per
# '>' line counted in SAMPLE_2, i.e. a column of labels with one entry per
# counted sequence.
Y = np.asarray([[1] for line in range(nseqs(SAMPLE_1))] +
               [[0] for line in range(nseqs(SAMPLE_2))])
LABELS = Array('y', Y, conditions=['TF-binding'])

# Translate the numeric labels to names (1 -> 'Oct4', anything else ->
# 'Mafk') and store them as a dict mapping each column name to a list.
annot = pd.DataFrame(Y[:], columns=LABELS.conditions).applymap(
    lambda x: 'Oct4' if x == 1 else 'Mafk').to_dict(orient='list')


# Define the model templates
@inputlayer
@outputdense('sigmoid')
def single_stranded_model(inputs, inp, oup, params):
Note however, that this is a simplification that might not always work.
    In general, one would need to parse for '>' occurrences.
    """
    # Counts the lines whose first character is '>'.
    # NOTE(review): the handle returned by open(filename) is never closed
    # explicitly here.
    return sum((1 for line in open(filename) if line[0] == '>'))


# load the dataset
# Both FASTA files are looked up in the 'resources/' directory of the
# installed janggu package.
DATA_PATH = pkg_resources.resource_filename('janggu', 'resources/')
SAMPLE_1 = os.path.join(DATA_PATH, 'sample.fa')
SAMPLE_2 = os.path.join(DATA_PATH, 'sample2.fa')

# DNA sequences in one-hot encoding will be used as input
# The two files are passed as one list, SAMPLE_1 first; the labels built
# below follow the same file order.
# NOTE(review): presumably the resulting dataset keeps the sequences in
# file-list order -- confirm against Bioseq.create_from_seq.
DNA = Bioseq.create_from_seq('dna', fastafile=[SAMPLE_1, SAMPLE_2],
                             order=args.order,
                             datatags=['train'],
                             cache=True)

# An array of 1/0 will be used as labels for training
# A flat (one-dimensional) array: one 1 per '>' line counted in SAMPLE_1,
# followed by one 0 per '>' line counted in SAMPLE_2.
Y = np.asarray([1 for line in range(nseqs(SAMPLE_1))] +
               [0 for line in range(nseqs(SAMPLE_2))])
LABELS = Array('y', Y, conditions=['TF-binding'])

# Translate the numeric labels to names (1 -> 'Oct4', anything else ->
# 'Mafk') and store them as a dict mapping each column name to a list.
annot = pd.DataFrame(Y[:], columns=LABELS.conditions).applymap(
    lambda x: 'Oct4' if x == 1 else 'Mafk').to_dict(orient='list')


# evaluation metrics from sklearn.metrics
def wrap_roc(y_true, y_pred):
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    aux = str('({:.2%})'.format(roc_auc_score(y_true, y_pred)))