예제 #1
0
def test_read_dna_from_biostring_order_1():
    """Check order-1 one-hot encoding of DNA supplied as in-memory sequences.

    The records are loaded from ``sample.fa`` with ``sequences_from_fasta``
    and handed to ``Bioseq.create_from_seq`` via ``fastafile``. Requesting
    ``storage='sparse'`` must raise ``ValueError``; the default storage must
    yield 3897 items of shape ``(200, 1, 4)`` each.
    """
    resource_dir = pkg_resources.resource_filename('janggu', 'resources/')
    fasta_path = os.path.join(resource_dir, 'sample.fa')
    records = sequences_from_fasta(fasta_path)

    # Arguments shared by the failing and the succeeding construction.
    shared_kwargs = dict(fastafile=records, order=1, cache=False)

    # Sparse storage is expected to be rejected for this input.
    with pytest.raises(ValueError):
        Bioseq.create_from_seq('train', storage='sparse', **shared_kwargs)

    data = Bioseq.create_from_seq('train', **shared_kwargs)

    np.testing.assert_equal(len(data), 3897)
    np.testing.assert_equal(data.shape, (3897, 200, 1, 4))

    # Index of the single hot channel at each of the first ten positions;
    # selecting rows of the identity matrix yields the one-hot rows.
    hot_channels = [1, 0, 1, 0, 2, 1, 0, 2, 0, 2]
    expected_head = np.eye(4, dtype='int8')[hot_channels]
    np.testing.assert_equal(data[0][0, :10, 0, :], expected_head)
예제 #2
0
def test_read_protein_sequences():
    """Check one-hot encoding of protein sequences at two fixed lengths.

    ``sample_protein.fa`` is loaded with ``seqtype='protein'`` and
    ``order=1``. With ``fixedlen=1000`` the last two positions of the first
    item must be all zeros; with ``fixedlen=5`` the dataset shape shrinks to
    ``(3, 5, 1, 20)``. In both cases the first four positions of the first
    item must match the same one-hot rows.
    """
    resource_dir = pkg_resources.resource_filename('janggu', 'resources/')
    fasta_path = os.path.join(resource_dir, 'sample_protein.fa')

    # Hot channel index (out of 20) for the first four positions; indexing
    # the identity matrix turns these into one-hot rows.
    expected_head = np.eye(20, dtype='int8')[[10, 0, 4, 15]]

    # Fixed length of 1000.
    data = Bioseq.create_from_seq('train',
                                  fastafile=fasta_path,
                                  order=1,
                                  seqtype='protein',
                                  fixedlen=1000)
    np.testing.assert_equal(len(data), 3)
    np.testing.assert_equal(data.shape, (3, 1000, 1, 20))
    np.testing.assert_equal(data[0][0, :4, 0, :], expected_head)
    # The final two positions are expected to carry no signal at all.
    np.testing.assert_equal(data[0][0, -2:, 0, :],
                            np.zeros((2, 20), dtype='int8'))

    # Fixed length of 5.
    data = Bioseq.create_from_seq('train',
                                  fastafile=fasta_path,
                                  order=1,
                                  seqtype='protein',
                                  fixedlen=5)
    np.testing.assert_equal(len(data), 3)
    np.testing.assert_equal(data.shape, (3, 5, 1, 20))
    np.testing.assert_equal(data[0][0, :4, 0, :], expected_head)
예제 #3
0
def test_read_dna_from_fasta_order_2():
    """Check order-2 one-hot encoding of DNA read directly from a FASTA file.

    ``sample.fa`` is loaded with ``order=2`` and caching disabled. The
    resulting dataset must contain 3897 items of shape ``(199, 1, 16)``
    each, and the first ten positions of the first item must match the
    expected one-hot rows.
    """
    data_path = pkg_resources.resource_filename('janggu', 'resources/')

    order = 2
    filename = os.path.join(data_path, 'sample.fa')

    data = Bioseq.create_from_seq('train',
                                  fastafile=filename,
                                  order=order,
                                  cache=False)

    np.testing.assert_equal(len(data), 3897)
    np.testing.assert_equal(data.shape, (3897, 199, 1, 16))
    np.testing.assert_equal(
        data[0][0, :10, 0, :],
        np.asarray([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]],
                   dtype='int8'))
예제 #4
0
def test_janggu_influence_fasta(tmpdir):
    """Check that input attribution agrees for region and index lookups.

    A small dense model is created on top of a ``Bioseq`` dataset named
    ``'dna'`` that is loaded from ``sample.fa``. ``input_attribution`` is
    then queried twice for the first region of the dataset: once through
    its ``chrom``/``start``/``end`` coordinates and once through ``idx=0``.
    Both results must be equal.
    """
    resource_dir = pkg_resources.resource_filename('janggu', 'resources/')
    fasta_path = os.path.join(resource_dir, 'sample.fa')

    dna = Bioseq.create_from_seq('dna',
                                 fastafile=fasta_path,
                                 order=1,
                                 cache=False)

    @inputlayer
    def _cnn_model(inputs, inp, oup, params):
        # Flatten the 'dna' input, then stack two dense layers on top.
        flattened = Flatten()(inputs['dna'])
        hidden = Dense(params[0])(flattened)
        prediction = Dense(1, activation='sigmoid')(hidden)
        return inputs, prediction

    # NOTE: the model is used without an explicit compile step.
    model = Janggu.create(_cnn_model,
                          modelparams=(2, ),
                          inputs=dna,
                          name='dna_ctcf_HepG2-cnn')

    # Attribution addressed by the genomic coordinates of the first region.
    first_region = dna.gindexer[0]
    by_coordinates = input_attribution(model,
                                       dna,
                                       chrom=first_region.chrom,
                                       start=first_region.start,
                                       end=first_region.end)

    # Attribution addressed by the positional index of the same region.
    by_index = input_attribution(model, dna, idx=0)

    np.testing.assert_equal(by_coordinates[0][:], by_index[0][:])
    np.testing.assert_equal(influence[0][:], influence2[0][:])
예제 #5
0
    Note however, that this is a simplification
    that might not always work. In general, one would
    need to parse for '>' occurrences.
    """
    return sum((1 for line in open(filename) if line[0] == '>'))


# Locate the two example FASTA files inside the janggu resources directory.
DATA_PATH = pkg_resources.resource_filename('janggu', 'resources/')
SAMPLE_1, SAMPLE_2 = (os.path.join(DATA_PATH, fasta_name)
                      for fasta_name in ('sample.fa', 'sample2.fa'))

# Model input: the sequences of both files, encoded at the requested order.
DNA = Bioseq.create_from_seq(
    'dna',
    fastafile=[SAMPLE_1, SAMPLE_2],
    order=args.order,
    cache=True)

# Training labels as a column of 1/0 values: one [1] row per sequence of
# the first file, followed by one [0] row per sequence of the second file.
Y = np.asarray([[1]] * nseqs(SAMPLE_1) + [[0]] * nseqs(SAMPLE_2))
LABELS = Array('y', Y, conditions=['TF-binding'])

# Per-condition list of names derived from the labels: 1 -> 'Oct4',
# anything else -> 'Mafk'.
annot = pd.DataFrame(
    Y[:], columns=LABELS.conditions
).applymap(
    lambda label: 'Mafk' if label != 1 else 'Oct4'
).to_dict(orient='list')

# Define the model templates


@inputlayer
@outputdense('sigmoid')
def single_stranded_model(inputs, inp, oup, params):
예제 #6
0
    Note however, that this is a simplification
    that might not always work. In general, one would
    need to parse for '>' occurrences.
    """
    return sum((1 for line in open(filename) if line[0] == '>'))


# Locate the two example FASTA files inside the janggu resources directory.
DATA_PATH = pkg_resources.resource_filename('janggu', 'resources/')
SAMPLE_1, SAMPLE_2 = (os.path.join(DATA_PATH, fasta_name)
                      for fasta_name in ('sample.fa', 'sample2.fa'))

# Model input: the sequences of both files, encoded at the requested order
# and tagged with 'train'.
DNA = Bioseq.create_from_seq(
    'dna',
    fastafile=[SAMPLE_1, SAMPLE_2],
    order=args.order,
    datatags=['train'],
    cache=True)

# Training labels as a flat vector of 1/0 values: one 1 per sequence of
# the first file, followed by one 0 per sequence of the second file.
Y = np.asarray([1] * nseqs(SAMPLE_1) + [0] * nseqs(SAMPLE_2))
LABELS = Array('y', Y, conditions=['TF-binding'])

# Per-condition list of names derived from the labels: 1 -> 'Oct4',
# anything else -> 'Mafk'.
annot = pd.DataFrame(
    Y[:], columns=LABELS.conditions
).applymap(
    lambda label: 'Mafk' if label != 1 else 'Oct4'
).to_dict(orient='list')


# evaluation metrics from sklearn.metrics
def wrap_roc(y_true, y_pred):
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    aux = str('({:.2%})'.format(roc_auc_score(y_true, y_pred)))