Example #1
def test_all_layers(seq, encodeSEQ, InputSEQ, ConvSEQ, tmpdir):
    seq_length = len(seq[0])

    # pre-process
    train_x = encodeSEQ(seq)
    train_y = np.array([[1]])
    print(train_x.shape)

    # build model
    inp = InputSEQ(seq_length=seq_length)
    if ConvSEQ == cl.ConvSplines:
        x = ConvSEQ(filters=1)(inp)
    else:
        x = ConvSEQ(filters=1, kernel_size=1)(inp)
    x = cl.GlobalSumPooling1D()(x)
    m = Model(inp, x)
    m.summary()
    m.compile("adam", loss="mse")

    m.fit(train_x, train_y)

    filepath = str(tmpdir.mkdir('data').join('test_keras.h5'))

    print(tmpdir)
    m.save(filepath)
    m = load_model(filepath)
    assert isinstance(m, Model)
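A minimal sketch of how the parametrized fixtures above might be supplied via pytest; the concrete encoder/layer pairs (encodeDNA/InputDNA/ConvDNA and encodeRNA/InputRNA/ConvRNA) are assumptions based on the concise API used throughout these examples, not part of the original test.

import pytest
import concise.layers as cl
from concise.preprocessing import encodeDNA, encodeRNA  # assumed encoders

@pytest.mark.parametrize("seq,encodeSEQ,InputSEQ,ConvSEQ", [
    (["ACGTTTATGA"], encodeDNA, cl.InputDNA, cl.ConvDNA),
    (["ACGUUUAUGA"], encodeRNA, cl.InputRNA, cl.ConvRNA),
])
def test_all_layers_param(seq, encodeSEQ, InputSEQ, ConvSEQ, tmpdir):
    # delegate to the test defined above
    test_all_layers(seq, encodeSEQ, InputSEQ, ConvSEQ, tmpdir)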
Example #2
def manual_test_layer_plots_RNA():
    motifs = ["TTAATGA"]
    pwm_list = [PWM.from_consensus(motif) for motif in motifs]
    seq_length = 100
    motif_width = 7
    # specify the input shape
    input_dna = cl.InputDNA(seq_length)

    # convolutional layer with filters initialized on a PWM
    x = ConvRNA(
        filters=1,
        kernel_size=motif_width,  # motif width
        activation="relu",
        kernel_initializer=ci.PSSMKernelInitializer(pwm_list),
        bias_initializer=ci.PSSMBiasInitializer(pwm_list,
                                                kernel_size=motif_width,
                                                mean_max_scale=1)
        # mean_max_scale of 1 means that only consensus sequence gets score larger than 0
    )(input_dna)

    # Smoothing layer - positional-dependent effect
    x = cl.GAMSmooth(n_bases=10, l2_smooth=1e-3, l2=0)(x)
    x = cl.GlobalSumPooling1D()(x)
    x = kl.Dense(units=1, activation="linear")(x)
    model = Model(inputs=input_dna, outputs=x)
    model.compile("adam", "mse")
    # TODO - test
    model.layers[1].plot_weights(plot_type="heatmap")

    model.layers[1].plot_weights(0, plot_type="motif_raw")
    model.layers[1].plot_weights(0, plot_type="motif_pwm_info")
Example #3
def test_convDNA(tmpdir):
    motifs = ["TTAATGA"]
    pwm_list = [PWM.from_consensus(motif) for motif in motifs]
    seq_length = 100
    motif_width = 7
    # specify the input shape
    input_dna = cl.InputDNA(seq_length)

    # convolutional layer with filters initialized on a PWM
    x = cl.ConvDNA(
        filters=1,
        kernel_size=motif_width,  # motif width
        activation="relu",
        kernel_initializer=ci.PWMKernelInitializer(pwm_list),
        bias_initializer=ci.PWMBiasInitializer(pwm_list,
                                               kernel_size=motif_width,
                                               mean_max_scale=1)
        # mean_max_scale of 1 means that only consensus sequence gets score larger than 0
    )(input_dna)

    # Smoothing layer - positional-dependent effect
    x = cl.GAMSmooth(n_bases=10, l2_smooth=1e-3, l2=0)(x)
    x = cl.GlobalSumPooling1D()(x)
    x = kl.Dense(units=1, activation="linear")(x)
    model = Model(inputs=input_dna, outputs=x)

    # compile the model
    model.compile(optimizer="adam", loss="mse", metrics=[cm.var_explained])

    # filepath = "/tmp/model.h5"
    filepath = str(tmpdir.mkdir('data').join('test_keras.h5'))

    model.save(filepath)
    m = load_model(filepath)
    assert isinstance(m, Model)
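For context, a self-contained sketch of fitting a model like the one tested above on simulated data; encodeDNA, the random sequences, and the continuous targets are assumptions, while the layer calls mirror the test.

import numpy as np
from keras.models import Model
import concise.layers as cl
from concise.preprocessing import encodeDNA  # assumed one-hot DNA encoder

seqs = ["".join(np.random.choice(list("ACGT"), 100)) for _ in range(32)]
train_x = encodeDNA(seqs)              # expected shape: (32, 100, 4)
train_y = np.random.rand(32, 1)        # simulated continuous target

inp = cl.InputDNA(100)
out = cl.GlobalSumPooling1D()(cl.ConvDNA(filters=1, kernel_size=7)(inp))
m = Model(inp, out)
m.compile("adam", "mse")
m.fit(train_x, train_y, epochs=1)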
Example #4
def manual_test_layer_plots_AA():
    motifs = ["ACDEFGGIKNY"]

    seq = encodeAA(motifs)

    seq_length = 100
    motif_width = 7

    seqlogo_fig(seq[0], vocab="AA")
    plt.show()

    # specify the input shape
    input_dna = cl.InputAA(seq_length)

    # convolutional layer (default kernel initialization, no PWM here)
    x = ConvAA(
        filters=1,
        kernel_size=motif_width,  # motif width
        activation="relu",
    )(input_dna)

    # Smoothing layer - positional-dependent effect
    x = cl.GAMSmooth(n_bases=10, l2_smooth=1e-3, l2=0)(x)
    x = cl.GlobalSumPooling1D()(x)
    x = kl.Dense(units=1, activation="linear")(x)
    model = Model(inputs=input_dna, outputs=x)
    model.compile("adam", "mse")
    # TODO - test
    model.layers[1].plot_weights(plot_type="heatmap")

    model.layers[1].plot_weights(0, plot_type="motif_raw")
    model.layers[1].plot_weights(0, plot_type="motif_pwm_info")
Example #5
def get_pool(pooling):
    pooling = deepcopy(pooling)
    pool_type = pooling["pool_type"]
    del pooling["pool_type"]
    if pool_type == "max":
        return [kl.MaxPooling1D(pool_size=pooling["pool_size"]), kl.Flatten()]
    elif pool_type == "mean":
        return [
            kl.AveragePooling1D(pool_size=pooling["pool_size"]),
            kl.Flatten()
        ]
    elif pool_type == "weight":
        return [cl.SmoothPositionWeight(**pooling), cl.GlobalSumPooling1D()]
    else:
        raise ValueError("pool_type can only be 'max', 'mean' or 'weight'")
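A hedged usage sketch for get_pool; the config dicts mirror the three branches above, and chaining the returned layers in order is an assumption about how the helper is meant to be used.

import keras.layers as kl

# "max" branch: strided max-pooling followed by Flatten
inp = kl.Input((100, 16))
h = inp
for layer in get_pool({"pool_type": "max", "pool_size": 4}):
    h = layer(h)

# "weight" branch: remaining keys are forwarded to SmoothPositionWeight
weight_layers = get_pool({"pool_type": "weight", "n_bases": 10})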
Example #6
def manual_test_layer_plots_Codon():
    motifs = ["TTAATGAAT"]
    seq_length = 102
    # specify the input shape
    input_dna = cl.InputCodon(seq_length)

    # convolutional layer over codons (default kernel initialization)
    x = cl.ConvCodon(
        filters=1,
        kernel_size=2,  # kernel width (in codons)
        activation="relu",
    )(input_dna)

    # Smoothing layer - positional-dependent effect
    x = cl.GAMSmooth(n_bases=10, l2_smooth=1e-3, l2=0)(x)
    x = cl.GlobalSumPooling1D()(x)
    x = kl.Dense(units=1, activation="linear")(x)
    model = Model(inputs=input_dna, outputs=x)
    model.compile("adam", "mse")
    model.layers[1].plot_weights(figsize=(3, 10))
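seq_length is 102 here, i.e. divisible by 3, which matches a codon (triplet) encoding of the input. A hedged sketch of preparing such input; encodeCodon and its exact output shape are assumptions based on the concise preprocessing API used in the other examples.

from concise.preprocessing import encodeCodon  # assumed codon one-hot encoder

seqs = ["ATG" * 34]   # 102 nt = 34 codons
x = encodeCodon(seqs)
print(x.shape)        # expected to be one-hot over the codon vocabulary, e.g. (1, 34, 64)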
Example #7
def test_ConvSplines(tmpdir):

    x_pos = np.vstack([np.arange(15), np.arange(15)])
    y = np.arange(2)

    x = encodeSplines(x_pos)

    inl = cl.InputSplines(15, 10)
    o = cl.ConvSplines(
        1,
        kernel_regularizer=cr.GAMRegularizer(l2_smooth=.5),
    )(inl)
    o = cl.GlobalSumPooling1D()(o)

    model = Model(inl, o)
    model.compile("Adam", "mse")
    model.fit(x, y)

    filepath = str(tmpdir.mkdir('data').join('test_keras.h5'))

    # save and re-load the model
    model.save(filepath)
    m = load_model(filepath)
    assert isinstance(m, Model)
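A short check of why InputSplines(15, 10) matches the encoded array above; that encodeSplines defaults to 10 spline bases per position is an assumption.

import numpy as np
from concise.preprocessing import encodeSplines

x_pos = np.vstack([np.arange(15), np.arange(15)])
x = encodeSplines(x_pos)   # assumed default: 10 B-spline bases per position
print(x.shape)             # expected: (2, 15, 10), matching InputSplines(15, 10)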
Example #8
def single_layer_pos_effect(
        pooling_layer="sum",  # 'sum', 'max' or 'mean'
        nonlinearity="relu",  # 'relu' or 'exp'
        motif_length=9,
        n_motifs=6,  # number of filters
        step_size=0.01,
        num_tasks=1,  # multi-task learning - 'trans'
        n_covariates=0,
        seq_length=100,  # pre-defined sequence length
        # splines
        n_splines=None,
        share_splines=False,  # should the positional bias be shared across motifs
        # regularization
        lamb=1e-5,  # overall motif coefficient regularization
        motif_lamb=1e-5,
        spline_lamb=1e-5,
        spline_param_lamb=1e-5,
        # initialization
        init_motifs=None,  # motifs to initialize
        init_motif_bias=0,
        init_sd_motif=1e-2,
        init_sd_w=1e-3,  # initial weight scale of feature w or motif w
        **kwargs):  # unused params

    # initialize conv kernels to known motif pwm's
    if init_motifs:
        # WARNING - initialization is not the same as for Concise class
        pwm_list = [PWM.from_consensus(motif) for motif in init_motifs]
        kernel_initializer = ci.PWMKernelInitializer(pwm_list,
                                                     stddev=init_sd_motif)
        bias_initializer = ci.PWMBiasInitializer(pwm_list,
                                                 kernel_size=motif_length)
    else:
        # kernel_initializer = "glorot_uniform"
        kernel_initializer = ki.RandomNormal(stddev=init_sd_motif)
        bias_initializer = ki.Constant(value=init_motif_bias)

    activation = nonlinearity  # supports 'relu' out-of-the-box

    # define the model
    # ----------------
    inputs = []
    seq_input = kl.Input((seq_length, 4))
    inputs.append(seq_input)
    # convolution
    xseq = kl.Conv1D(
        filters=n_motifs,
        kernel_size=motif_length,
        kernel_regularizer=kr.l1(l=motif_lamb),  # Regularization
        activation=activation,
        kernel_initializer=kernel_initializer,
        bias_initializer=bias_initializer)(seq_input)
    # optional positional effect
    if n_splines:
        xseq = cl.GAMSmooth(
            n_bases=n_splines,
            share_splines=share_splines,
            l2_smooth=spline_lamb,
            l2=spline_param_lamb,
        )(xseq)
    # pooling layer
    if pooling_layer == "max":
        xseq = kl.GlobalMaxPooling1D()(xseq)
    elif pooling_layer == "mean":
        xseq = kl.GlobalAveragePooling1D()(xseq)
    elif pooling_layer == "sum":
        xseq = cl.GlobalSumPooling1D()(xseq)
    else:
        raise ValueError("pooling_layer can only be 'sum', 'mean' or 'max'.")
    # -----
    # add covariates
    if n_covariates:
        cov_input = kl.Input((n_covariates, ))
        inputs.append(cov_input)
        x = kl.concatenate([xseq, cov_input])
    else:
        x = xseq
    # -----

    predictions = kl.Dense(
        units=num_tasks,
        kernel_regularizer=kr.l1(lamb),
        kernel_initializer=ki.RandomNormal(stddev=init_sd_w))(x)

    model = Model(inputs=inputs, outputs=predictions)

    model.compile(optimizer=Adam(lr=step_size), loss="mse", metrics=["mse"])

    return model
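A minimal sketch of calling the builder above on simulated data; the motif, batch size, and random targets are assumptions, chosen to be consistent with seq_length=100 and a single task.

import numpy as np

m = single_layer_pos_effect(init_motifs=["TTAATGA"], motif_length=7,
                            n_motifs=1, n_splines=10, seq_length=100)
x_seq = np.random.rand(32, 100, 4)   # stand-in for one-hot encoded DNA
y = np.random.rand(32, 1)
m.fit(x_seq, y, epochs=1)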
def model(
        train_data,
        activation="relu",
        kernel_size=10,
        filters=16,
        conv2_use_skip=False,
        internal_pos={"name": "global_maxpool"},
        # {"name": "strided_maxpool", "pool_size": 3}
        # {"name": "maxpool+rnn_sequence", "pool_size": 3, "dropout": 0.1}
        # {"name": "rnn", "pool_size": 3, "dropout": 0.1}
        # {"name": "maxpool+weight_sum", "pool_size": 3, "n_bases": 10, "share_splines": False,
        # "l2_smooth": 1e-5, "l2": 0}
        external_pos=None,  # None, {"type": "gam", "as_track": True, "units": 1}
        dropout_rate=0.5,
        n_hidden=100,
        use_batchnorm=False,
        use_weightnorm=False,
        lr=0.001):
    """Returns keras model for modelling arrays returned by data.data()
    """
    # config
    seq_length = train_data[0]["seq"].shape[1]
    n_tasks = train_data[1].shape[1]
    ext_n_bases = train_data[0]["dist_tss"].shape[2]
    activation = PReLU() if activation == "PReLU" else activation

    inputs = []
    # position module
    # ---------------
    if external_pos is not None:
        # conf
        external_pos["as_track"] = train_data[0]["dist_tss"].shape[1] != 1
        if external_pos["as_track"]:
            pos_length = seq_length
        else:
            pos_length = 1

        ext_filters = external_pos["units"]

        # NOTE: Concise now implements a layer SplineT, which simplifies
        # the following code significantly.
        pos_inputs, pos_outputs = tuple(
            zip(*[
                pos_module(pos_length=pos_length,
                           ext_n_bases=ext_n_bases,
                           ext_filters=ext_filters,
                           kernel_size=kernel_size,
                           feat_name=feat_name,
                           ext_pos_kwargs=external_pos) for feat_name in
                external_pos.get("feat_names", ["gene_start", "gene_end"])
            ]))
        inputs += list(pos_inputs)
        pos_outputs = list(pos_outputs)

    # sequence module
    # ----------------
    # initialize conv kernels to known motif pwm's
    seq_input = cl.InputDNA(seq_length, name="seq")
    inputs += [seq_input]
    x_pwm = cl.ConvDNA(filters=filters,
                       kernel_size=kernel_size,
                       activation=activation)(seq_input)
    if use_batchnorm:
        x_pwm = kl.BatchNormalization(axis=1)(x_pwm)

    # inject external_pos as a track
    if external_pos is not None and external_pos["as_track"]:
        x_pwm = kl.concatenate([x_pwm] + pos_outputs, axis=-1)

    x = kl.Conv1D(filters, kernel_size=1, activation=activation)(x_pwm)
    if conv2_use_skip:
        x = kl.concatenate([x_pwm, x])  # skip connection ?
    if use_batchnorm:
        x = kl.BatchNormalization(axis=1)(x)

    # summarize across sequence
    # -------------------------
    if internal_pos["name"] == "global_maxpool":
        x = kl.GlobalMaxPool1D()(x)
    elif internal_pos["name"] == "strided_maxpool":
        x = kl.MaxPool1D(pool_size=internal_pos["pool_size"])(x)
        x = kl.Flatten()(x)
    elif internal_pos["name"] == "maxpool+rnn_sequence":
        x = kl.MaxPool1D(pool_size=internal_pos["pool_size"])(x)
        x = kl.Bidirectional(
            kl.LSTM(filters,
                    dropout=internal_pos["dropout"],
                    recurrent_dropout=internal_pos["dropout"],
                    return_sequences=True))(x)
        x = kl.Flatten()(x)
    elif internal_pos["name"] == "rnn":
        x = kl.MaxPool1D(pool_size=internal_pos["pool_size"])(x)
        x = kl.Bidirectional(
            kl.LSTM(filters,
                    dropout=internal_pos["dropout"],
                    recurrent_dropout=internal_pos["dropout"]))(x)
    elif internal_pos["name"] == "maxpool+weight_sum":
        x = kl.MaxPool1D(pool_size=internal_pos["pool_size"])(x)
        x = cl.SmoothPositionWeight(n_bases=internal_pos.get("n_bases", 10),
                                    share_splines=internal_pos.get(
                                        "share_splines", False),
                                    l2_smooth=internal_pos.get("l2_smooth", 0),
                                    l2=internal_pos.get("l2", 0))(x)
        x = cl.GlobalSumPooling1D()(x)
    else:
        raise ValueError("invalid internal_pos")
    if use_batchnorm:
        x = kl.BatchNormalization()(x)
    x = kl.Dropout(dropout_rate)(x)

    # append external_pos as a scalar
    # -------------------------------
    if external_pos is not None and not external_pos["as_track"]:
        x = kl.concatenate([x] + pos_outputs, axis=-1)

    # FC layers
    # ---------
    x = kl.Dense(n_hidden, activation=activation)(x)
    if use_batchnorm:
        x = kl.BatchNormalization()(x)
    x = kl.Dropout(dropout_rate)(x)
    outputs = kl.Dense(n_tasks, activation="sigmoid")(x)

    # compile model
    # -------------
    m = Model(inputs, outputs)
    if use_weightnorm:
        optimizer = AdamWithWeightnorm(lr=lr)
    else:
        optimizer = Adam(lr=lr)

    m.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["acc"])

    if use_weightnorm:
        data_based_init(m, model_data.subset(train_data, np.arange(500))[0])
    return m
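A hedged sketch of the nested train_data structure the builder above reads; the key names ("seq", "dist_tss") come from the code, while the shapes and simulated values are assumptions.

import numpy as np

n, seq_len, n_tasks, n_bases = 64, 500, 2, 10
train_data = (
    {"seq": np.random.rand(n, seq_len, 4),              # one-hot DNA input
     "dist_tss": np.random.rand(n, seq_len, n_bases)},  # spline-encoded distances
    np.random.randint(0, 2, size=(n, n_tasks)).astype(float),  # binary targets
)
m = model(train_data)                  # defaults: no external positional features
m.fit({"seq": train_data[0]["seq"]}, train_data[1], epochs=1)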