def test_janggu_influence_genomic(tmpdir):
    """Check input attribution on a genomic window and on a widened window.

    A small dense model is created on a ``Bioseq`` dataset built from the
    sample reference genome (``binsize=50``, ``order=1``) with the sample
    CSV values as output. Attribution is requested for the interval of the
    first ``gindexer`` entry and again for that interval extended by one
    position on each side. The second result, trimmed by one position at
    both ends along its second axis, must equal the first.

    Parameters
    ----------
    tmpdir
        Object exposing ``strpath`` (pytest's ``tmpdir`` fixture); its path
        is exported through the ``JANGGU_OUTPUT`` environment variable.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, 'sample.bed')
    csvfile = os.path.join(data_path, 'sample.csv')
    refgenome = os.path.join(data_path, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=refgenome,
                                       storage='ndarray',
                                       binsize=50,
                                       roi=bed_file,
                                       order=1)

    df = pd.read_csv(csvfile, header=None)
    ctcf = Array('ctcf', df.values, conditions=['peaks'])

    @inputlayer
    @outputdense('sigmoid')
    def _cnn_model(inputs, inp, oup, params):
        # Flatten the 'dna' input and feed it through a single dense layer
        # whose width is taken from the first model parameter.
        layer = inputs['dna']
        layer = Flatten()(layer)
        output = Dense(params[0])(layer)
        return inputs, output

    model = Janggu.create(_cnn_model,
                          modelparams=(2, ),
                          inputs=dna,
                          outputs=ctcf,
                          name='dna_ctcf_HepG2-cnn')

    model.compile(optimizer='adadelta', loss='binary_crossentropy')

    # Attribution for exactly the first region of interest.
    iv = dna.gindexer[0]
    chrom, start, end = iv.chrom, iv.start, iv.end
    influence = input_attribution(model, dna,
                                  chrom=chrom, start=start, end=end)

    # Same region with an odd offset: one extra position on either side.
    influence2 = input_attribution(model, dna,
                                   chrom=chrom, start=start - 1, end=end + 1)

    # Dropping the two extra flanking positions must reproduce the first
    # result.
    np.testing.assert_equal(influence[0][:], influence2[0][:][:, 1:-1])
def test_janggu_influence_fasta(tmpdir):
    """Attribution addressed by coordinates and by index must agree.

    A ``Bioseq`` dataset is created from the sample FASTA file
    (``order=1``, ``cache=False``) and used as the input of a small dense
    model. Input attribution for the first ``gindexer`` interval is
    requested once via ``chrom``/``start``/``end`` and once via ``idx=0``;
    both results are expected to be equal.

    Parameters
    ----------
    tmpdir
        Accepted but not used by this test.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    fasta_path = os.path.join(resources, 'sample.fa')

    dna = Bioseq.create_from_seq('dna',
                                 fastafile=fasta_path,
                                 order=1,
                                 cache=False)

    @inputlayer
    def _cnn_model(inputs, inp, oup, params):
        # Flatten -> dense (width from params[0]) -> single sigmoid unit.
        flat = Flatten()(inputs['dna'])
        hidden = Dense(params[0])(flat)
        prediction = Dense(1, activation='sigmoid')(hidden)
        return inputs, prediction

    # NOTE: the model is not compiled before attribution is requested.
    model = Janggu.create(_cnn_model,
                          modelparams=(2, ),
                          inputs=dna,
                          name='dna_ctcf_HepG2-cnn')

    # Address the first interval by its genomic coordinates ...
    first = dna.gindexer[0]
    by_region = input_attribution(model, dna,
                                  chrom=first.chrom,
                                  start=first.start,
                                  end=first.end)

    # ... and by its position in the dataset.
    by_index = input_attribution(model, dna, idx=0)

    np.testing.assert_equal(by_region[0][:], by_index[0][:])
# Print a few prediction scores: the first four entries of cov_pred under
# the Oct4 heading, the last four (walking backwards) under the Mafk heading.
print('Oct4 predictions scores should be greater than Mafk scores:')

print('Prediction score examples for Oct4')
for i in range(4):
    print('{}.: {}'.format(i, cov_pred[i]))

print('Prediction score examples for Mafk')
for i in range(1, 5):
    # Negative index: i-th entry counted from the end of cov_pred.
    print('{}.: {}'.format(i, cov_pred[-i]))

# Input feature importance attribution for the 4th interval,
# which represents an Oct4 bound region.
gi = DNA.gindexer[3]
chrom, start, end = gi.chrom, gi.start, gi.end
attr_oct = input_attribution(model, DNA, chrom=chrom, start=start, end=end)

# Visualize the important sequence features and store the figure in the
# output directory given on the command line.
oct4_figure = plotGenomeTrack(SeqTrack(attr_oct[0]), chrom, start, end)
oct4_png = os.path.join(
    args.path,
    'influence_oct4_example_order{}.png'.format(args.order))
oct4_figure.savefig(oct4_png)

# For comparison, run the same attribution on an interval representing a
# Mafk bound region.
gi = DNA.gindexer[7796]
chrom, start, end = gi.chrom, gi.start, gi.end
attr_mafk = input_attribution(model, DNA, chrom=chrom, start=start, end=end)