Example #1
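These snippets omit their import preamble. Judging from the aliases they use, a plausible header is the sketch below; the exact module paths are assumptions inferred from the concise and Keras APIs, and pos_module, model_data, data_based_init and AdamWithWeightnorm are project-specific helpers that are not shown here.

# Assumed imports, inferred from the aliases used in the examples below;
# verify the module paths against your concise / Keras versions.
import numpy as np

import keras.layers as kl
import keras.regularizers as kr
from keras.layers import PReLU
from keras.models import Model, Sequential, load_model
from keras.optimizers import Adam

import concise.layers as cl        # InputDNA, ConvDNA, GAMSmooth, ...
import concise.initializers as ci  # PWM/PSSM kernel and bias initializers
import concise.metrics as cm       # var_explained, f1, precision, ...
import concise.losses as closs     # binary_crossentropy_masked
from concise.utils.pwm import PWM

# pos_module, model_data, data_based_init and AdamWithWeightnorm are
# project-specific helpers (positional sub-model, data loading,
# weight normalization) assumed to be importable from the project.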
def test_convDNA(tmpdir):
    motifs = ["TTAATGA"]
    pwm_list = [PWM.from_consensus(motif) for motif in motifs]
    seq_length = 100
    motif_width = 7
    # specify the input shape
    input_dna = cl.InputDNA(seq_length)

    # convolutional layer with filters initialized on a PWM
    x = cl.ConvDNA(
        filters=1,
        kernel_size=motif_width,  # motif width
        activation="relu",
        kernel_initializer=ci.PWMKernelInitializer(pwm_list),
        bias_initializer=ci.PWMBiasInitializer(pwm_list,
                                               kernel_size=motif_width,
                                               mean_max_scale=1)
        # a mean_max_scale of 1 means that only the consensus sequence gets a score larger than 0
    )(input_dna)

    # Smoothing layer - position-dependent effect
    x = cl.GAMSmooth(n_bases=10, l2_smooth=1e-3, l2=0)(x)
    x = cl.GlobalSumPooling1D()(x)
    x = kl.Dense(units=1, activation="linear")(x)
    model = Model(inputs=input_dna, outputs=x)

    # compile the model
    model.compile(optimizer="adam", loss="mse", metrics=[cm.var_explained])

    # filepath = "/tmp/model.h5"
    filepath = str(tmpdir.mkdir('data').join('test_keras.h5'))

    model.save(filepath)
    m = load_model(filepath)
    assert isinstance(m, Model)
Example #2
def model(train_data, filters=1, kernel_size=9, motif_init=None, lr=0.001):
    seq_length = train_data[0]["seq"].shape[1]
    pwm_list = train_data[2]

    if motif_init is not None:
        # motif_init is a dict with the field "stddev"
        kinit = ci.PSSMKernelInitializer(pwm_list,
                                         stddev=motif_init.get("stddev", 0.05),  # if not specified, use 0.05
                                         add_noise_before_Pwm2Pssm=True)
        binit = "zeros"
    else:
        kinit = "glorot_uniform"
        binit = "zeros"

    # sequence
    in_dna = cl.InputDNA(seq_length=seq_length, name="seq")
    x = cl.ConvDNA(filters=filters,
                   kernel_size=kernel_size,
                   activation="relu",
                   kernel_initializer=kinit,
                   bias_initializer=binit,
                   name="conv1")(in_dna)
    x = kl.AveragePooling1D(pool_size=4)(x)
    x = kl.Flatten()(x)

    x = kl.Dense(units=1)(x)
    m = Model(in_dna, x)
    m.compile(Adam(lr=lr), loss="binary_crossentropy", metrics=["acc"])
    return m
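A hypothetical call for reference; the dummy arrays, their shapes and the (inputs, labels, pwm_list) layout of train_data are assumptions based on how the function indexes it.

# Illustrative usage sketch only: random one-hot-like sequences and labels.
x_seq = np.random.randint(0, 2, size=(128, 500, 4)).astype("float32")
y = np.random.randint(0, 2, size=(128, 1)).astype("float32")
pwm_list = [PWM.from_consensus("TTAATGA")]  # 7 bp motif
train_data = ({"seq": x_seq}, y, pwm_list)

# kernel_size matches the motif width so the PWM fits the filter
m = model(train_data, filters=1, kernel_size=7, motif_init={"stddev": 0.1})
m.fit(train_data[0], train_data[1], batch_size=32, epochs=1)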
Example #3
def test_convDNA_sequential():
    m = Sequential([cl.ConvDNA(filters=1, kernel_size=10, seq_length=100)])
    m.compile("adam", loss="binary_crossentropy")
def model(
        train_data,
        activation="relu",
        kernel_size=10,
        filters=16,
        conv2_use_skip=False,
        internal_pos={"name": "global_maxpool"},
        # {"name": "strided_maxpool", "pool_size": 3}
        # {"name": "maxpool+rnn_sequence", "pool_size": 3, "dropout": 0.1}
        # {"name": "rnn", "pool_size": 3, "dropout": 0.1}
        # {"name": "maxpool+weight_sum", "pool_size": 3, "n_bases": 10, "share_splines": False,
        # "l2_smooth": 1e-5, "l2": 0}
        external_pos=None,  # None, {"type": "gam", "as_track": True, "units": 1}
        dropout_rate=0.5,
        n_hidden=100,
        use_batchnorm=False,
        use_weightnorm=False,
        lr=0.001):
    """Returns keras model for modelling arrays returned by data.data()
    """
    # config
    seq_length = train_data[0]["seq"].shape[1]
    n_tasks = train_data[1].shape[1]
    ext_n_bases = train_data[0]["dist_tss"].shape[2]
    activation = PReLU() if activation == "PReLU" else activation

    inputs = []
    # position module
    # ---------------
    if external_pos is not None:
        # conf
        external_pos["as_track"] = train_data[0]["dist_tss"].shape[1] != 1
        if external_pos["as_track"]:
            pos_length = seq_length
        else:
            pos_length = 1

        ext_filters = external_pos["units"]

        # NOTE: Concise now implements a layer SplineT, which simplifies
        # the following code significantly.
        pos_inputs, pos_outputs = tuple(
            zip(*[
                pos_module(pos_length=pos_length,
                           ext_n_bases=ext_n_bases,
                           ext_filters=ext_filters,
                           kernel_size=kernel_size,
                           feat_name=feat_name,
                           ext_pos_kwargs=external_pos) for feat_name in
                external_pos.get("feat_names", ["gene_start", "gene_end"])
            ]))
        inputs += list(pos_inputs)
        pos_outputs = list(pos_outputs)

    # sequence module
    # ----------------
    # initialize conv kernels to known motif pwm's
    seq_input = cl.InputDNA(seq_length, name="seq")
    inputs += [seq_input]
    x_pwm = cl.ConvDNA(filters=filters,
                       kernel_size=kernel_size,
                       activation=activation)(seq_input)
    if use_batchnorm:
        x_pwm = kl.BatchNormalization(axis=1)(x_pwm)

    # inject external_pos as a track
    if external_pos is not None and external_pos["as_track"]:
        x_pwm = kl.concatenate([x_pwm] + pos_outputs, axis=-1)

    x = kl.Conv1D(filters, kernel_size=1, activation=activation)(x_pwm)
    if conv2_use_skip:
        x = kl.concatenate([x_pwm, x])  # optional skip connection
    if use_batchnorm:
        x = kl.BatchNormalization(axis=1)(x)

    # summarize across sequence
    # -------------------------
    if internal_pos["name"] == "global_maxpool":
        x = kl.GlobalMaxPool1D()(x)
    elif internal_pos["name"] == "strided_maxpool":
        x = kl.MaxPool1D(pool_size=internal_pos["pool_size"])(x)
        x = kl.Flatten()(x)
    elif internal_pos["name"] == "maxpool+rnn_sequence":
        x = kl.MaxPool1D(pool_size=internal_pos["pool_size"])(x)
        x = kl.Bidirectional(
            kl.LSTM(filters,
                    dropout=internal_pos["dropout"],
                    recurrent_dropout=internal_pos["dropout"],
                    return_sequences=True))(x)
        x = kl.Flatten()(x)
    elif internal_pos["name"] == "rnn":
        x = kl.MaxPool1D(pool_size=internal_pos["pool_size"])(x)
        x = kl.Bidirectional(
            kl.LSTM(filters,
                    dropout=internal_pos["dropout"],
                    recurrent_dropout=internal_pos["dropout"]))(x)
    elif internal_pos["name"] == "maxpool+weight_sum":
        x = kl.MaxPool1D(pool_size=internal_pos["pool_size"])(x)
        x = cl.SmoothPositionWeight(n_bases=internal_pos.get("n_bases", 10),
                                    share_splines=internal_pos.get(
                                        "share_splines", False),
                                    l2_smooth=internal_pos.get("l2_smooth", 0),
                                    l2=internal_pos.get("l2", 0))(x)
        x = cl.GlobalSumPooling1D()(x)
    else:
        raise ValueError("invalid internal_pos['name']: {}".format(internal_pos["name"]))
    if use_batchnorm:
        x = kl.BatchNormalization()(x)
    x = kl.Dropout(dropout_rate)(x)

    # append external_pos as a scalar
    # -------------------------------
    if external_pos is not None and not external_pos["as_track"]:
        x = kl.concatenate([x] + pos_outputs, axis=-1)

    # FC layers
    # ---------
    x = kl.Dense(n_hidden, activation=activation)(x)
    if use_batchnorm:
        x = kl.BatchNormalization()(x)
    x = kl.Dropout(dropout_rate)(x)
    outputs = kl.Dense(n_tasks, activation="sigmoid")(x)

    # compile model
    # -------------
    m = Model(inputs, outputs)
    if use_weightnorm:
        optimizer = AdamWithWeightnorm(lr=lr)
    else:
        optimizer = Adam(lr=lr)

    m.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["acc"])

    if use_weightnorm:
        data_based_init(m, model_data.subset(train_data, np.arange(500))[0])
    return m
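For reference, a hypothetical call selecting one of the internal_pos summarization modes listed in the signature above; the values are illustrative and the train_data layout (inputs dict with "seq" and "dist_tss", label matrix) is assumed from the code.

# Illustrative configuration sketch only.
m = model(train_data,
          filters=16,
          kernel_size=10,
          internal_pos={"name": "maxpool+weight_sum", "pool_size": 3,
                        "n_bases": 10, "share_splines": False,
                        "l2_smooth": 1e-5, "l2": 0},
          external_pos=None,  # or e.g. {"type": "gam", "units": 1}
          dropout_rate=0.5,
          use_batchnorm=False,
          lr=0.001)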
def model(
        train_data,
        nonlinearity="relu",
        filters=1,
        init_motifs={
            "use_pssm": True,
            "stddev": 0.1,
            "mean_max_scale": 0.0,
        },
        # init_sd_w=1e-3,
        pos_effect={
            "type": "gam",
            "l2_smooth": 1e-5,
            "l2": 1e-5,
            "activation": None,
            "use_bias": False,
            "merge": {
                "type": "multiply"
            },
        },
        use_weightnorm=False,
        l1_weights=0,
        l1_motif=0,
        hidden_fc=None,
        # learning rate
        lr=0.002):

    seq_length = train_data[0]["seq"].shape[1]
    n_tasks = 1 if len(train_data[1].shape) == 1 else train_data[1].shape[1]
    assert seq_length - (11 - 1) == n_tasks
    pwm_list = train_data[4]
    pos_features = train_data[3]
    kernel_size = 11  # hard-coded for comparison with branchpointer

    # sequence module
    # ----------------
    # initialize conv kernels to known motif pwm's
    if init_motifs is not None:
        if init_motifs.get("n_pwm", None) is not None:
            pwm_list = pwm_list[:init_motifs["n_pwm"]]
        if init_motifs["use_pssm"]:
            kernel_initializer = ci.PSSMKernelInitializer(
                pwm_list, stddev=init_motifs["stddev"])
            bias_initializer = ci.PSSMBiasInitializer(
                pwm_list,
                kernel_size=kernel_size,
                mean_max_scale=init_motifs.get("mean_max_scale", 0.0))
        else:
            kernel_initializer = ci.PWMKernelInitializer(
                pwm_list, stddev=init_motifs["stddev"])
            bias_initializer = ci.PWMBiasInitializer(
                pwm_list,
                kernel_size=kernel_size,
                mean_max_scale=init_motifs.get("mean_max_scale", 0.0))
    else:
        kernel_initializer = "glorot_uniform"
        bias_initializer = "zeros"

    seq_input = cl.InputDNA(seq_length, name="seq")
    # convolution
    activation = PReLU() if nonlinearity == "PReLU" else nonlinearity

    x = cl.ConvDNA(
        filters=filters,
        kernel_size=kernel_size,
        kernel_regularizer=kr.l1(l1_motif),  # Regularization
        activation=activation,
        kernel_initializer=kernel_initializer,
        bias_initializer=bias_initializer)(seq_input)

    # positional module
    # -----------------

    # optional positional effect
    if pos_effect is not None:
        # config
        if pos_effect["merge"]["type"] == "multiply":
            pos_filters = filters
            merge_fun = kl.multiply
        elif pos_effect["merge"]["type"] == "concatenate":
            # if we concatenate, then the number of filters should be 1
            pos_filters = 1
            merge_fun = kl.concatenate
        elif pos_effect["merge"]["type"] == "add":
            pos_filters = filters
            merge_fun = kl.add
        else:
            raise ValueError(
                "pos_effect[\"merge\"][\"type\"] needs to be from {multiply, concatenate, add}"
            )

        pos_effect["n_bases"] = pos_effect.get(
            "n_bases") or train_data[0]["dist2"].shape[2]

        # NOTE: Concise now implements a layer SplineT, which simplifies
        # the following code significantly.
        pos_inputs, pos_outputs = tuple(
            zip(*[
                pos_module(pos_length=n_tasks,
                           ext_n_bases=pos_effect["n_bases"],
                           ext_filters=pos_filters,
                           feat_name=feat_name,
                           ext_pos_kwargs=pos_effect)
                for feat_name in pos_features
            ]))
        pos_in_layers = list(pos_inputs)
        pos_out_layers = list(pos_outputs)

        # merge the layers
        # ----------------
        x = merge_fun([x] + pos_out_layers)
        input_list = [seq_input] + pos_in_layers
        if pos_effect["merge"]["type"] == "concatenate" and \
           pos_effect["merge"].get("hidden_fc", None) is not None:
            hidden_fc = pos_effect["merge"]["hidden_fc"]
            act_string = hidden_fc.get("activation", "relu")
            activation = PReLU() if act_string == "PReLU" else act_string
            for i in range(hidden_fc["n_layers"]):
                x = kl.Conv1D(hidden_fc["n_hidden"],
                              kernel_size=1,
                              activation="relu",
                              kernel_regularizer=kr.L1L2(
                                  l1=hidden_fc.get("l1", 0),
                                  l2=hidden_fc.get("l2", 0)))(x)
                if hidden_fc["dropout_rate"]:  # non-zero
                    x = kl.Dropout(hidden_fc["dropout_rate"])(x)
    else:
        input_list = seq_input

    if pos_effect is None and filters == 1:
        predictions = kl.Activation("sigmoid")(x)
    else:
        predictions = kl.Conv1D(filters=1,
                                kernel_size=1,
                                kernel_regularizer=kr.l1(l1_weights),
                                activation="sigmoid",
                                name="Conv1D_final_layer")(x)

    # predictions = kl.Flatten()(predictions)  # remove the last dimension
    model = Model(input_list, predictions)

    if use_weightnorm:
        optimizer = AdamWithWeightnorm(lr=lr)
    else:
        optimizer = Adam(lr=lr)

    model.compile(optimizer=optimizer,
                  loss=closs.binary_crossentropy_masked,
                  metrics=[
                      cm.accuracy, cm.f1, cm.precision, cm.recall,
                      cm.sensitivity, cm.specificity, cm.fdr
                  ],
                  sample_weight_mode="temporal")

    if use_weightnorm:
        data_based_init(model,
                        model_data.subset(train_data, np.arange(500))[0])
    return model
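And a hypothetical call exercising the concatenate merge path with an extra 1x1-convolution block; all values are illustrative and the train_data layout (seq and dist2 inputs, labels, pos_features and pwm_list at the indices used above) is assumed.

# Illustrative configuration sketch only.
m = model(train_data,
          filters=4,
          init_motifs={"use_pssm": True, "stddev": 0.1, "mean_max_scale": 0.0},
          pos_effect={"type": "gam", "l2_smooth": 1e-5, "l2": 1e-5,
                      "activation": None, "use_bias": False,
                      "merge": {"type": "concatenate",
                                "hidden_fc": {"n_layers": 1, "n_hidden": 32,
                                              "dropout_rate": 0.1}}},
          lr=0.002)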