Example #1
def test_exon_model_masking():
    model = MMSplice()

    preds = [
        model.exonM.predict(encodeDNA(['AAA']))[0][0],
        model.exonM.predict(encodeDNA(['AAA', 'CATACA']))[0][0],
        model.exonM.predict(encodeDNA(['AAA', 'CATACAGGAA']))[0][0]
    ]

    for i in preds:
        assert abs(preds[0] - i) < 1e-6
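The invariance asserted above follows from encodeDNA padding the shorter sequences in a batch with all-zero rows, which the exon module then masks. A minimal sketch of the padding itself (assuming numpy is imported as np; the padding side depends on encodeDNA's seq_align default, so only side-agnostic properties are checked):

batch = encodeDNA(['AAA', 'CATACA'])   # batch is padded to the longest entry
assert batch.shape == (2, 6, 4)
assert batch[0].sum() == 3             # only 3 positions are hot; padding rows are all-zero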
Example #2
def create_patterns(motif_seqs):
    patterns = [
        Pattern(seq=encodeDNA([s])[0],
                contrib=dict(a=encodeDNA([s])[0]),
                hyp_contrib=dict(a=encodeDNA([s])[0]),
                name=str(i)) for i, s in enumerate(motif_seqs)
    ]

    aligned_patterns = [
        p.align(patterns[0], pad_value=np.array([0.25] * 4)) for p in patterns
    ]
    return patterns, aligned_patterns
Example #3
    def sim_pred(self, central_motif, side_motif=None, side_distances=[], repeat=128, importance=[]):
        """
        Args:
          importance: list of importance scores
        """
        from basepair.exp.chipnexus.simulate import generate_seq, average_profiles, flatten
        batch_size = repeat
        seqlen = self.seqmodel.seqlen
        tasks = self.seqmodel.tasks

        # simulate sequence
        seqs = encodeDNA([generate_seq(central_motif, side_motif=side_motif,
                                       side_distances=side_distances, seqlen=seqlen)
                          for i in range(repeat)])

        # get predictions
        scaled_preds = self.predict(seqs, batch_size=batch_size)

        if importance:
            # get the importance scores (compute only the profile and counts importance)
            imp_scores_all = self.seqmodel.imp_score_all(seqs, intp_pattern=['*/profile/wn', '*/counts/pre-act'])
            imp_scores = {t: {self._get_old_imp_score_name(imp_score_name): seqs * imp_scores_all[f'{t}/{imp_score_name}']
                              for imp_score_name in importance}
                          for t in tasks}

            # merge and aggregate the profiles
            out = {"imp": imp_scores, "profile": scaled_preds}
        else:
            out = {"profile": scaled_preds}
        return average_profiles(flatten(out, "/"))
Example #4
    def sim_pred(self, central_motif, side_motif=None, side_distances=[], repeat=128, importance=[]):
        """
        Args:
          importance: list of importance scores
        """
        # TODO - update?
        from basepair.exp.chipnexus.simulate import generate_seq, postproc, average_profiles, flatten
        batch_size = repeat
        seqlen = self.input_seqlen()
        tasks = self.tasks

        # simulate sequence
        seqs = encodeDNA([generate_seq(central_motif, side_motif=side_motif,
                                       side_distances=side_distances, seqlen=seqlen)
                          for i in range(repeat)])

        # get predictions
        preds = self.model.predict(seqs, batch_size=batch_size)
        # TODO - remove this and use model.predict instead
        scaled_preds = postproc(preds, tasks)

        if importance:
            # get the importance scores
            imp_scores = self.seq_importance(seqs, importance)

            # merge and aggregate the profiles
            out = {"imp": imp_scores, "profile": scaled_preds}
        else:
            out = scaled_preds
        return average_profiles(flatten(out, "/"))
Example #5
    def sim_pred(self, central_motif, side_motif=None, side_distances=[], repeat=128, contribution=[]):
        """Embed two motifs in random sequences and obtain their average predictions.

        Args:
          contribution: list of contribution scores
        """
        from bpnet.simulate import generate_seq, average_profiles, flatten
        batch_size = repeat
        seqlen = self.seqmodel.seqlen
        tasks = self.seqmodel.tasks

        # simulate sequence
        seqs = encodeDNA([generate_seq(central_motif, side_motif=side_motif,
                                       side_distances=side_distances, seqlen=seqlen)
                          for i in range(repeat)])

        # get predictions
        scaled_preds = self.predict(seqs, batch_size=batch_size)

        if contribution:
            # get the contribution scores (compute only the profile and counts contribution)
            contrib_scores_all = self.seqmodel.contrib_score_all(seqs, intp_pattern=['*/profile/wn', '*/counts/pre-act'])
            contrib_scores = {t: {self._get_old_contrib_score_name(contrib_score_name): seqs * contrib_scores_all[f'{t}/{contrib_score_name}']
                                  for contrib_score_name in contribution}
                              for t in tasks}

            # merge and aggregate the profiles
            out = {"contrib": contrib_scores, "profile": scaled_preds}
        else:
            out = {"profile": scaled_preds}
        return average_profiles(flatten(out, "/"))
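A hypothetical call of the method above (the `bp` model object, motif strings, and distances are illustrative, not from the source):

# embed a central motif plus a side motif at three distances and average
# the profile predictions over 128 random sequence backgrounds
avg = bp.sim_pred('TTTATCA',
                  side_motif='CAGATAAG',
                  side_distances=[30, 60, 90],
                  repeat=128,
                  contribution=['profile'])
# keys of the result are '/'-joined, per flatten(out, "/")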
Example #6
def random_seq_onehot(l):
    """Generate random sequence one-hot-encoded

    Args:
      l: sequence length
    """
    import random
    from concise.preprocessing import encodeDNA
    return encodeDNA([''.join(random.choices("ACGT", k=int(l)))])[0]
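For instance (assuming numpy is imported as np; the shape follows from encodeDNA returning an (N, maxlen, 4) array, with [0] dropping the batch axis):

x = random_seq_onehot(100)
assert x.shape == (100, 4)
assert np.all(x.sum(axis=-1) == 1)   # exactly one hot base per position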
Example #7
    def split(self, x, overhang):
        ''' x: a sequence to split
        '''
        intronl_len, intronr_len = overhang
        # pad with N if the left sequence is not long enough
        lackl = self.acceptor_intron_len - intronl_len
        if lackl >= 0:
            x = "N" * (lackl + 1) + x
            intronl_len += lackl + 1
        lackr = self.donor_intron_len - intronr_len
        if lackr >= 0:
            x = x + "N" * (lackr + 1)
            intronr_len += lackr + 1
        acceptor_intron = x[:intronl_len - self.acceptor_intron_cut]
        acceptor = x[(intronl_len -
                      self.acceptor_intron_len):(intronl_len +
                                                 self.acceptor_exon_len)]
        exon = x[(intronl_len + self.exon_cut_l):(-intronr_len -
                                                  self.exon_cut_r)]
        donor = x[(-intronr_len - self.donor_exon_len):(-intronr_len +
                                                        self.donor_intron_len)]
        donor_intron = x[-intronr_len + self.donor_intron_cut:]
        if donor[self.donor_exon_len:self.donor_exon_len + 2] != "GT":
            warnings.warn("None GT donor", UserWarning)
        if acceptor[self.acceptor_intron_len -
                    2:self.acceptor_intron_len] != "AG":
            warnings.warn("None AG donor", UserWarning)

        if self.encode:
            return {
                "acceptor_intron": encodeDNA([acceptor_intron]),
                "acceptor": encodeDNA([acceptor]),
                "exon": encodeDNA([exon]),
                "donor": encodeDNA([donor], maxlen=18),
                "donor_intron": encodeDNA([donor_intron])
            }
        else:
            return {
                "acceptor_intron": acceptor_intron,
                "acceptor": acceptor,
                "exon": exon[:self.maxExonLength],
                "donor": donor,
                "donor_intron": donor_intron
            }
Example #8
def prepare_data(dt,
                 features,
                 response,
                 sequence,
                 id_column=None,
                 seq_align="end",
                 trim_seq_len=None):
    """
    Prepare data for Concise.train or ConciseCV.train.

    Args:
        dt: A pandas DataFrame containing all the required data.
        features (List of strings): Column names of `dt` used to produce the features design matrix. These columns should be numeric.
        response (str or list of strings): Name(s) of column(s) used as the response variable.
        sequence (str): Name of the column storing the DNA/RNA sequences.
        id_column (str): Name of the column used as the row identifier.
        seq_align (str): one of ``{"start", "end"}``. To which end should we align sequences?
        trim_seq_len (int): Consider only the first `trim_seq_len` bases of each sequence when generating the sequence design matrix. If :python:`None`, :py:attr:`trim_seq_len` is set to the longest sequence length, so whole sequences are considered.


    Returns:
        tuple: Tuple with elements :code:`(X_feat, X_seq, y, id_vec)`, where:

               - :py:attr:`X_feat`: features design matrix of shape :code:`(N, D)`, where N is :code:`len(dt)` and :code:`D = len(features)`
               - :py:attr:`X_seq`: sequence matrix of shape :code:`(N, trim_seq_len, 4)`, the one-hot encoding of the DNA/RNA sequence
               - :py:attr:`y`: response variable matrix of shape :code:`(N, len(response))`
               - :py:attr:`id_vec`: 1D character array of shape :code:`(N,)` holding the IDs of individual rows

    Note:
        One-hot encoding  of the DNA/RNA sequence is the following:

        .. code:: python

               {
                 "A": np.array([1, 0, 0, 0]),
                 "C": np.array([0, 1, 0, 0]),
                 "G": np.array([0, 0, 1, 0]),
                 "T": np.array([0, 0, 0, 1]),
                 "U": np.array([0, 0, 0, 1]),
                 "N": np.array([0, 0, 0, 0]),
               }

    """
    if type(response) is str:
        response = [response]

    X_feat = np.array(dt[features], dtype="float32")
    y = np.array(dt[response], dtype="float32")
    X_seq = encodeDNA(seq_vec=dt[sequence],
                      maxlen=trim_seq_len,
                      seq_align=seq_align)
    X_seq = np.array(X_seq, dtype="float32")
    # guard the default id_column=None
    id_vec = np.array(dt[id_column]) if id_column is not None else None

    return X_feat, X_seq, y, id_vec
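A minimal sketch of calling prepare_data on a toy frame (column names are illustrative):

import pandas as pd

dt = pd.DataFrame({"id": ["r1", "r2"],
                   "gc": [0.4, 0.6],
                   "y": [1.0, 0.0],
                   "seq": ["ACGT", "ACGTAC"]})
X_feat, X_seq, y, id_vec = prepare_data(dt, features=["gc"], response="y",
                                        sequence="seq", id_column="id",
                                        seq_align="end", trim_seq_len=6)
# X_feat: (2, 1), X_seq: (2, 6, 4), y: (2, 1), id_vec: (2,)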
Example #9
def test_interpret_wo_bias():
    from bpnet.metrics import RegressionMetrics, ClassificationMetrics, PeakPredictionProfileMetric
    from concise.preprocessing import encodeDNA
    # test the model
    seqs = encodeDNA(['ACAGA'] * 100)

    inputs = {"seq": seqs, "bias/a/profile": np.random.randn(100, 5, 2)}

    # Let's use regression
    targets = {
        "a/class": np.random.randint(low=0, high=2,
                                     size=(100, 1)).astype(float),
        "a/counts": 1 + np.ceil(np.abs(np.random.randn(100))),
        "a/profile": 1 + np.ceil(np.abs(np.random.randn(100, 5, 2))),
    }

    import keras.backend as K
    # K.clear_session()
    # use bias
    m = SeqModel(
        body=BaseNet('relu'),
        heads=[
            BinaryClassificationHead('{task}/class',
                                     net=TopDense(pool_size=2),
                                     use_bias=False),
            ScalarHead('{task}/counts',
                       loss='mse',
                       metric=RegressionMetrics(),
                       net=TopDense(pool_size=2),
                       use_bias=False),
            ProfileHead(
                '{task}/profile',
                loss='mse',
                metric=PeakPredictionProfileMetric(neg_max_threshold=0.05,
                                                   required_min_pos_counts=0),
                net=TopConv(n_output=2),
                use_bias=True,
                bias_shape=(5, 2)
            ),  # NOTE: the shape currently has to be hard-coded to the sequence length
        ],
        tasks=['a'])
    m.model.fit(inputs, targets)

    o = m.contrib_score_all(seqs)
    assert 'a/profile/wn' in o
    assert o['a/profile/wn'].shape == seqs.shape

    # evaluate the dataset: set up an array dataset (NumpyDataset) first
    from bpnet.data import NumpyDataset
    ds = NumpyDataset({"inputs": inputs, "targets": targets})
    o = m.evaluate(ds)
    assert 'avg/counts/mad' in o
Example #10
    def split(self, x, intronl_len=100, intronr_len=80):
        ''' x: a sequence to split
        '''
        # pad with N if the left sequence is not long enough
        lackl = self.acceptor_intron_len - intronl_len
        if lackl >= 0:
            x = "N" * (lackl + 1) + x
            intronl_len += lackl + 1
        lackr = self.donor_intron_len - intronr_len
        if lackr >= 0:
            x = x + "N" * (lackr + 1)
            intronr_len += lackr + 1

        acceptor_intron = x[:intronl_len - self.acceptor_intron_cut]
        acceptor = x[(intronl_len -
                      self.acceptor_intron_len):(intronl_len +
                                                 self.acceptor_exon_len)]
        exon = x[(intronl_len + self.exon_cut_l):(-intronr_len -
                                                  self.exon_cut_r)]
        donor = x[(-intronr_len - self.donor_exon_len):(-intronr_len +
                                                        self.donor_intron_len)]
        donor_intron = x[-intronr_len + self.donor_intron_cut:]

        if self.pattern_warning:
            if donor[self.donor_exon_len:self.donor_exon_len + 2] != "GT":
                warnings.warn("None GT donor", UserWarning)
            if acceptor[self.acceptor_intron_len -
                        2:self.acceptor_intron_len] != "AG":
                warnings.warn("None AG donor", UserWarning)
        if len(exon) == 0:
            exon = 'N'

        return {
            "acceptor_intron": encodeDNA([acceptor_intron]),
            "acceptor": encodeDNA([acceptor]),
            "exon": encodeDNA([exon]),
            "donor": encodeDNA([donor]),
            "donor_intron": encodeDNA([donor_intron])
        }
Example #11
 def load(split="train"):
     dt = pd.read_csv(DATA_DIR + "/PUM2_{0}.csv".format(split))
     # DNA/RNA sequence
     xseq = encodeDNA(dt.seq, maxlen=seq_length, seq_align='center')
     # response variable
     y = dt.binding_site.as_matrix().reshape((-1, 1)).astype("float")
     if split == "train":
         from concise.data import attract
         # add also the pwm_list
         pwm_list = attract.get_pwm_list(["129"])
         return {"seq": xseq}, y, pwm_list
     else:
         return {"seq": xseq}, y
Example #12
    def predict_on_batch(self, x, **kwargs):
        ''' Use when loading a batch whose sequences are already split.
        Variable-length sequences are padded to a common length here.
        `x` is a batch of unencoded sequence strings; they are one-hot
        encoded here so that the collate function can stack them.
        '''
        fts = x['seq']
        acceptor_intron = encodeDNA(fts['acceptor_intron'].tolist(),
                                    seq_align="end")
        acceptor = encodeDNA(fts['acceptor'].tolist(), seq_align="end")
        exon = encodeDNA(fts['exon'].tolist(), seq_align="end")
        donor = encodeDNA(fts['donor'].tolist(), seq_align="end")
        donor_intron = encodeDNA(fts['donor_intron'].tolist(), seq_align="end")
        score = np.concatenate([
            self.acceptor_intronM.predict(acceptor_intron),
            logit(self.acceptorM.predict(acceptor)),
            self.exonM.predict(exon),
            logit(self.donorM.predict(donor)),
            self.donor_intronM.predict(donor_intron)
        ], axis=1)

        return score
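A hypothetical batch for the method above; the 'seq' entry must support column access and .tolist(), e.g. a pandas.DataFrame (sequences here are illustrative):

import pandas as pd

batch = {"seq": pd.DataFrame({
    "acceptor_intron": ["ACGTACGT", "ACGT"],
    "acceptor":        ["TTTTAGGT", "CCCCAGGA"],
    "exon":            ["GGAACC", "GGC"],
    "donor":           ["ACGGTAAGT", "ACGGTAAG"],
    "donor_intron":    ["TTTTT", "GGG"],
})}
score = model.predict_on_batch(batch)   # (2, 5): one column per module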
Example #13
    def predict(self, seq, overhang=(100, 100)):
        """
        Perform prediction on an exon sequence string with intronic overhangs.

        Args:
          seq (str): exon sequence with overhangs.
          overhang (Tuple[int, int]): lengths of the (left, right) intronic overhangs.

        Returns:
          np.array of modular predictions
          as [[acceptor_intronM, acceptor, exon, donor, donor_intron]].
        """
        batch = self.spliter.split(seq, overhang)
        batch = {k: encodeDNA([v]) for k, v in batch.items()}
        return self.predict_on_batch(batch)[0]
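Usage might look like this (hedged; `model` stands for whatever object carries `spliter` and the module models, and the sequence is illustrative only):

seq = "N" * 100 + "CAGGTAAGTACGGTAAGT" + "N" * 100   # illustrative only
modular_scores = model.predict(seq, overhang=(100, 100))
# -> [acceptor_intron, acceptor, exon, donor, donor_intron] scores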
Example #14
    def split(self, x, overhang):
        ''' x: a sequence to split
        '''
        intronl_len, intronr_len = overhang
        # pad with N if the left sequence is not long enough
        lackl = self.acceptor_intron_len - intronl_len
        if lackl >= 0:
            x = "N" * (lackl + 1) + x
            intronl_len += lackl + 1
        lackr = self.donor_intron_len - intronr_len
        if lackr >= 0:
            x = x + "N" * (lackr + 1)
            intronr_len += lackr + 1

        acceptor_intron = x[:intronl_len - self.acceptor_intron_cut]

        acceptor_start = intronl_len - self.acceptor_intron_len
        acceptor_end = intronl_len + self.acceptor_exon_len
        acceptor = x[acceptor_start:acceptor_end]

        exon_start = intronl_len + self.exon_cut_l
        exon_end = -intronr_len - self.exon_cut_r
        exon = x[exon_start:exon_end]

        donor_start = -intronr_len - self.donor_exon_len
        donor_end = -intronr_len + self.donor_intron_len
        donor = x[donor_start:donor_end]

        donor_intron = x[-intronr_len + self.donor_intron_cut:]

        if donor[self.donor_exon_len:self.donor_exon_len + 2] != "GT":
            warnings.warn("None GT donor", UserWarning)
        if acceptor[self.acceptor_intron_len -
                    2:self.acceptor_intron_len] != "AG":
            warnings.warn("None AG donor", UserWarning)

        splits = {
            "acceptor_intron": acceptor_intron,
            "acceptor": acceptor,
            "exon": exon,
            "donor": donor,
            "donor_intron": donor_intron
        }

        if self.encode:
            return {k: encodeDNA([v]) for k, v in splits.items()}

        return splits
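The N-padding arithmetic at the top of split is easiest to check in isolation (a self-contained sketch; the lengths below are made up):

# left overhang (20) shorter than the modeled acceptor intron (50):
# pad with lackl + 1 = 31 Ns, so the effective left intron grows to 51
acceptor_intron_len, intronl_len = 50, 20
x = "A" * 120

lackl = acceptor_intron_len - intronl_len   # 30
if lackl >= 0:
    x = "N" * (lackl + 1) + x
    intronl_len += lackl + 1

assert intronl_len == 51 and len(x) == 151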
Example #15
def extract_seq(interval, variant, fasta_file, one_hot=False):
    """
    Note: in case the variant is an indel, the anchor point at the beginning is used

    Args:
      interval: pybedtools.Interval where to extract the sequence from
      variant: Variant class with attributes: chr, pos, ref, alt
      fasta_file: file path or pysam.FastaFile instance
      one_hot: if True, one-hot-encode the output sequence

    Returns:
      sequence
    """
    if isinstance(fasta_file, str):
        from pysam import FastaFile
        fasta_file = FastaFile(fasta_file)
    if variant is not None and variant.pos - 1 >= interval.start and variant.pos <= interval.stop:
        inside = True
        lendiff = len(variant.alt) - len(variant.ref)
    else:
        inside = False
        lendiff = 0
    seq = fasta_file.fetch(str(interval.chrom), interval.start,
                           interval.stop - lendiff)

    if not inside:
        out = seq
    else:
        # now, mutate the sequence
        pos = variant.pos - interval.start - 1
        expect_ref = seq[pos:(pos + len(variant.ref))]
        if expect_ref != variant.ref:
            raise ValueError(
                f"Expected reference: {expect_ref}, observed reference: {variant.ref}"
            )
        # Anchor at the beginning
        out = seq[:pos] + variant.alt + seq[(pos + len(variant.ref)):]
    # the sequence length has to be correct at the end
    assert len(out) == interval.stop - interval.start
    if one_hot:
        out = encodeDNA([out.upper()])[0]
    return out
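The indel anchoring can be traced on plain strings (numbers and bases are illustrative, no FASTA needed):

# interval [100, 110), insertion A -> AT at 1-based position 105
interval_start, interval_stop = 100, 110
ref, alt, pos_1based = "A", "AT", 105
lendiff = len(alt) - len(ref)                 # +1

seq = "CCCCACCCC"                             # 9 bases fetched: stop - lendiff - start
pos = pos_1based - interval_start - 1         # 4 (0-based within seq)
assert seq[pos:pos + len(ref)] == ref
out = seq[:pos] + alt + seq[pos + len(ref):]  # anchored at the beginning
assert len(out) == interval_stop - interval_start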
Example #16
def test_output_files_model_w_bias(trained_model_w_bias):
    K.clear_session()
    output_files = os.listdir(str(trained_model_w_bias))
    expected_files = [
        'config.gin',
        'config.gin.json',
        'bpnet-train.kwargs.json',
        'dataspec.yml',
        'evaluate.ipynb',
        'evaluate.html',
        'evaluation.valid.json',
        'history.csv',
        'model.h5',
        'seq_model.pkl',
        'note_params.json',
    ]
    for f in expected_files:
        assert f in output_files

    m = SeqModel.load(trained_model_w_bias / 'seq_model.pkl')
    m.predict(encodeDNA(["A" * 200]))
Example #17
    def __getitem__(self, idx):
        if self.fasta_extractor is None:
            self.fasta_extractor = Fasta(self.fasta_file)
        interval = self.bt[idx]
        interval_fasta_id = self._interval_to_fasta_id(interval)

        if self.targets is not None:
            y = self.targets.iloc[idx].values
        else:
            y = {}

        # Run the fasta extractor
        start, end = self._compute_relative_coords(interval)
        record = self.fasta_extractor[interval_fasta_id]
        seq = record[start:end].seq

        return {
            "inputs": encodeDNA([seq]).squeeze(),
            "targets": y,
            "metadata": {
                "ranges": GenomicRanges.from_interval(interval)
            }
        }
Example #18
def test_tf_model():
    tf.reset_default_graph()
    input_nodes = "inputs"
    target_nodes = "preds"
    meta_graph = "model_files/model.tf.meta"
    # meta_graph = 'model_files/model.tf-modified.meta'
    checkpoint = "model_files/model.tf"
    index = "model_files/model.tf.index"
    pkl_file = "model_files/const_feed_dict.pkl"

    from kipoi.model import TensorFlowModel

    m = TensorFlowModel(input_nodes="inputs",
                        target_nodes="preds",
                        meta_graph=meta_graph,
                        checkpoint=checkpoint,
                        const_feed_dict_pkl=pkl_file)
    ops = tf.get_default_graph().get_operations()

    # TODO - modify the
    out = tf.train.export_meta_graph(
        filename='model_files/model.tf-modified.meta', as_text=True)
    ops[0].outputs[0].shape[0] = None

    pops = [
        op.outputs[0] for op in ops
        if op.type == "Placeholder" and op.name.startswith("Placeholder")
    ]

    m.input_ops  # view shapes of the data
    m.target_ops

    from concise.preprocessing import encodeDNA

    x = encodeDNA(["T" * m.input_ops.shape[1].value] * 2).astype("float32")
    out = m.predict_on_batch(x)
Example #19
    def _encode_seq(self, seq):
        return {k: encodeDNA([v]) for k, v in seq.items()}

def data_extended(rbp_name, n_bases=10,
                  pos_class_weight=1.0,
                  scale="sign_log",  # or "nat"
                  pos_as_track=False,
                  valid_chr=[1, 3],
                  test_chr=[2, 4, 6, 8, 10]):
    """
    pos_class_weight: positive class weight
    """
    dt_train, dt_valid, dt_test = data_split(rbp_name + "_extended", valid_chr, test_chr)

    seq_train = encodeDNA(dt_train.seq.tolist())
    seq_valid = encodeDNA(dt_valid.seq.tolist())
    seq_test = encodeDNA(dt_test.seq.tolist())

    seq_length = seq_train.shape[1]
    # impute missing values (not part of the pipeline as the Imputer lacks inverse_transform method)
    imp = Imputer(strategy="median")
    imp.fit(pd.concat([dt_train[POS_FEATURES], dt_valid[POS_FEATURES]]))
    dt_train[POS_FEATURES] = imp.transform(dt_train[POS_FEATURES])
    dt_valid[POS_FEATURES] = imp.transform(dt_valid[POS_FEATURES])
    dt_test[POS_FEATURES] = imp.transform(dt_test[POS_FEATURES])

    if scale == "sign_log":
        preproc_pipeline = make_pipeline(
            FunctionTransformer(func=sign_log_func,
                                inverse_func=sign_log_func_inverse),
            MinMaxScaler()
        )
    elif scale == "nat":
        preproc_pipeline = make_pipeline(
            MinMaxScaler()
        )
    else:
        raise ValueError("scale argument invalid")

    dtx_train = np.array(dt_train[POS_FEATURES])
    dtx_valid = np.array(dt_valid[POS_FEATURES])
    dtx_test = np.array(dt_test[POS_FEATURES])

    def melt_array(arr, seq_length):
        """ 3-dim -> 2-dim transform

        arr = np.arange(12).reshape((2,2,3))
        assert np.all(unmelt_array(melt_array(arr, 3), 3) == arr)
        """
        arr = np.transpose(arr, (0, 2, 1))
        assert arr.shape[2] == len(POS_FEATURES)
        assert arr.shape[1] == seq_length
        return arr.reshape((-1, len(POS_FEATURES)))

    def unmelt_array(arr, seq_length):
        arr = arr.reshape(((-1, seq_length, len(POS_FEATURES))))
        return np.transpose(arr, (0, 2, 1))

    if pos_as_track:
        dtx_train = melt_array(expand_positions(dtx_train, seq_length), seq_length)
        dtx_valid = melt_array(expand_positions(dtx_valid, seq_length), seq_length)
        dtx_test = melt_array(expand_positions(dtx_test, seq_length), seq_length)

    # transform pos features
    preproc_pipeline.fit(np.concatenate([dtx_train, dtx_valid]))

    train_pos = preproc_pipeline.transform(dtx_train)
    valid_pos = preproc_pipeline.transform(dtx_valid)
    test_pos = preproc_pipeline.transform(dtx_test)

    def create_feature_dict(arr, seq_length, pos_as_track):
        if pos_as_track:
            arr = unmelt_array(arr, seq_length)
        else:
            arr = arr[..., np.newaxis]

        raw_dist = {"raw_dist_" + k: arr[:, i][..., np.newaxis]
                    for i, k in enumerate(POS_FEATURES)}
        # (batch, seq_length / 1, 1)

        dist = {"dist_" + k: encodeSplines(arr[:, i], start=0, end=1)
                for i, k in enumerate(POS_FEATURES)}
        # (batch, seq_length / 1, default number of splines)

        # add also the merged version - last dimension = features
        raw_dist_all = np.concatenate([raw_dist["raw_dist_" + k] for k in POS_FEATURES], axis=-1)
        # (batch, seq_length / 1, n_features)

        return {**raw_dist, **dist, **{"raw_dist_all": raw_dist_all}}

    train_dist = create_feature_dict(train_pos, seq_length, pos_as_track)
    valid_dist = create_feature_dict(valid_pos, seq_length, pos_as_track)
    test_dist = create_feature_dict(test_pos, seq_length, pos_as_track)

    x_train = {"seq": seq_train, **train_dist}
    x_valid = {"seq": seq_valid, **valid_dist}
    x_test = {"seq": seq_test, **test_dist}

    # y
    y_train = dt_train.binding_site.as_matrix().reshape((-1, 1)).astype("float")
    y_valid = dt_valid.binding_site.as_matrix().reshape((-1, 1)).astype("float")
    y_test = dt_test.binding_site.as_matrix().reshape((-1, 1)).astype("float")
    sample_weight = np.squeeze(np.where(y_train == 1, pos_class_weight, 1), -1)

    return (x_train, y_train, sample_weight, POS_FEATURES, preproc_pipeline), \
        (x_valid, y_valid),\
        (x_test, y_test)
Example #21
import h5py
import pandas as pd
from concise.preprocessing import encodeDNA

df = pd.read_pickle("human_utrs_result.pkl")

top_n = 2000
inputs = encodeDNA(df.utr)[:top_n]
preds = df.retrained_pred.values.reshape((-1, 1))[:top_n]

fw = h5py.File("expect.human_utrs.h5", 'w')
fw.create_dataset('/inputs', data=inputs)
fw.create_dataset('/preds', data=preds)
fw.flush()
fw.close()
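Reading the expectation file back might look like this (a minimal sketch; the UTR length depends on the encoded data):

with h5py.File("expect.human_utrs.h5", "r") as f:
    inputs = f["/inputs"][:]   # (2000, utr_len, 4) one-hot sequences
    preds = f["/preds"][:]     # (2000, 1) reference predictions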
Example #22
def test_encodeDNA():

    seq = "ACGTTTATNT"
    assert len(seq) == 10

    with pytest.raises(ValueError):
        encodeDNA(seq)

    assert encodeDNA([seq]).shape == (1, 10, 4)

    assert encodeDNA([seq], maxlen=20).shape == (1, 20, 4)

    assert encodeDNA([seq], maxlen=5).shape == (1, 5, 4)
    assert np.all(encodeDNA([seq])[0, 0] == np.array([1, 0, 0, 0]))
    assert np.all(encodeDNA([seq])[0, 1] == np.array([0, 1, 0, 0]))
    assert np.all(encodeDNA([seq])[0, 2] == np.array([0, 0, 1, 0]))
    assert np.all(encodeDNA([seq])[0, 3] == np.array([0, 0, 0, 1]))
    assert np.all(encodeDNA([seq])[0, 4] == np.array([0, 0, 0, 1]))
    assert np.all(encodeDNA([seq])[0, -1] == np.array([0, 0, 0, 1]))
    assert np.all(encodeDNA([seq])[0, -2] == np.array([0, 0, 0, 0]))


# 'TTTACAATTT'  # seq1
# 'TTTACAATT'   # seq2
# '  AACAAA '  # m1
# ' AAACAA  '  # m1
# '   ACAAT '  # m2

seqs = ['TTTACAATTT', 'TTTACAATT']
seqs_one_hot = encodeDNA(seqs)

motif_seqs_1 = ['TTTGTT', 'AAACAA', 'TTGTTT', 'ACAATT', 'TATTGT']

motif_seqs_2 = ['AACAAA', 'AAACAA', 'TTGTTT', 'ACAATT', 'TATTGT']


def create_patterns(motif_seqs):
    patterns = [
        Pattern(seq=encodeDNA([s])[0],
                contrib=dict(a=encodeDNA([s])[0]),
                hyp_contrib=dict(a=encodeDNA([s])[0]),
                name=str(i)) for i, s in enumerate(motif_seqs)
    ]

    aligned_patterns = [
        p.align(patterns[0], pad_value=np.array([0.25] * 4)) for p in patterns
    ]
    return patterns, aligned_patterns
Example #24
"""Create one-hot encoded sequences from the input"""
import pandas as pd
from basepair.exp.chipnexus.data import (pool_bottleneck,
                                         gen_padded_sequence,
                                         syn_padded_sequence)

import numpy as np

input_gen = '../tidied_GEN_RPMsExpression_plusSeqs'
input_syn = '../tidied_SYN_RPMsExpression_plusSeqs'


from concise.preprocessing import encodeDNA
dfs_gen = pd.read_csv(input_gen)
dfs_syn = pd.read_csv(input_syn)

bpnet_seq_gen = encodeDNA([gen_padded_sequence(s, "AAAGACGCG")
                           for s in dfs_gen.Sequence.str.upper()])

bpnet_seq_syn = encodeDNA([gen_padded_sequence(s, "AAAGACGCG")
                           for s in dfs_syn.Sequence.str.upper()])


np.save("tidied_GEN_RPMsExpression_plusSeqs_one_hot",bpnet_seq_gen)
np.save("tidied_SYN_RPMsExpression_plusSeqs_one_hot",bpnet_seq_syn)
Example #25
    def _encode_batch_seq(self, batch):
        return {k: encodeDNA(v.tolist()) for k, v in batch.items()}

def data(rbp_name, n_bases=10,
         pos_class_weight=1.0,
         tss_trunc=2000, polya_trunc=2000,
         pos_as_track=False, kernel_size=10,
         scale_raw=False,
         valid_chr=[1, 3],
         test_chr=[2, 4, 6, 8, 10]):
    """
    pos_class_weight: positive class weight
    """
    dt_train, dt_valid, dt_test = data_split(rbp_name, valid_chr, test_chr)

    # TODO - not working just with dt_train.seq ?!?!?
    seq_train = encodeDNA(dt_train.seq.tolist())
    seq_valid = encodeDNA(dt_valid.seq.tolist())
    seq_test = encodeDNA(dt_test.seq.tolist())

    tss_dist = {"train": dt_train.TSS_distance.values,
                "valid": dt_valid.TSS_distance.values,
                "test": dt_test.TSS_distance.values}
    polya_dist = {"train": dt_train.polya_distance.values,
                  "valid": dt_valid.polya_distance.values,
                  "test": dt_test.polya_distance.values}

    seq_length = seq_train.shape[1]
    pos_length = seq_length - kernel_size + 1

    def expand_positions(x, pos_length):
        """Expand each scalar distance into a per-position track (pos_as_track=True)."""
        x = x.reshape((-1, 1))
        # 1. create a matrix with incrementing positions
        incr_array = np.arange(pos_length) - pos_length // 2
        # expand to have the same shape as x
        positions_offset = np.repeat(incr_array.reshape((1, -1)), x.shape[0], axis=0)
        return positions_offset + x

    if pos_as_track:
        tss_dist = {k: expand_positions(v, pos_length) for k, v in tss_dist.items()}
        polya_dist = {k: expand_positions(v, pos_length) for k, v in polya_dist.items()}
        shift = pos_length // 2 + 2
    else:
        tss_dist = {k: v[:, np.newaxis] for k, v in tss_dist.items()}
        polya_dist = {k: v[:, np.newaxis] for k, v in polya_dist.items()}
        shift = 1

    # transform polya_distance - change order
    tss_dist = {k: (v + shift) for k, v in tss_dist.items()}
    polya_dist = {k: -1 * (v - shift) for k, v in polya_dist.items()}

    tss_pos_ranges = get_pos_ranges(tss_dist)
    polya_pos_ranges = get_pos_ranges(polya_dist)

    def get_tss_nat_dist(x):
        return encodeSplines(x, n_bases=n_bases,
                             start=tss_pos_ranges["min"],
                             end=tss_trunc)

    def get_tss_log_dist(x):
        return encodeSplines(np.log10(x), n_bases=n_bases,
                             start=np.log10(tss_pos_ranges["min"]),
                             end=np.log10(tss_pos_ranges["max"]),
                             )

    def get_polya_nat_dist(x):
        return encodeSplines(x, n_bases=n_bases,
                             start=polya_pos_ranges["min"],
                             end=polya_trunc)

    def get_polya_log_dist(x):
        return encodeSplines(np.log10(x), n_bases=n_bases,
                             start=np.log10(polya_pos_ranges["min"]),
                             end=np.log10(polya_pos_ranges["max"]),
                             )

    # min-max scaler
    mms_tss = MinMaxScaler()
    mms_tss.fit(np.log10(tss_dist["train"]).reshape((-1, 1)))
    mms_polya = MinMaxScaler()
    mms_polya.fit(np.log10(polya_dist["train"]).reshape((-1, 1)))

    def get_raw_tss_log_dist(x):
        sh = x.shape
        if scale_raw:
            return mms_tss.transform(np.log10(x).reshape((-1, 1))).\
                reshape(sh)[:, :, np.newaxis]
        else:
            return np.log10(x)[:, :, np.newaxis]

    def get_raw_polya_log_dist(x):
        sh = x.shape
        if scale_raw:
            return mms_polya.transform(np.log10(x).reshape((-1, 1))).\
                reshape(sh)[:, :, np.newaxis]
        else:
            return np.log10(x)[:, :, np.newaxis]

    y_train = dt_train.binding_site.as_matrix().reshape((-1, 1)).astype("float")
    y_valid = dt_valid.binding_site.as_matrix().reshape((-1, 1)).astype("float")
    y_test = dt_test.binding_site.as_matrix().reshape((-1, 1)).astype("float")
    sample_weight = np.squeeze(np.where(y_train == 1, pos_class_weight, 1), -1)
    return ({"seq": seq_train,
             "dist_tss_nat": get_tss_nat_dist(tss_dist["train"]),
             "dist_tss_log": get_tss_log_dist(tss_dist["train"]),
             "dist_polya_nat": get_polya_nat_dist(polya_dist["train"]),
             "dist_polya_log": get_polya_log_dist(polya_dist["train"]),
             # "raw_dist_tss_nat": tss_dist["train"], # Not supported, not thresholding it
             "raw_dist_tss_log": get_raw_tss_log_dist(tss_dist["train"]),
             # "raw_dist_polya_nat": polya_dist["train"],
             "raw_dist_polya_log": get_raw_polya_log_dist(polya_dist["train"])},
            y_train, sample_weight, tss_pos_ranges, polya_pos_ranges, mms_tss, mms_polya),\
        ({"seq": seq_valid,
          "dist_tss_nat": get_tss_nat_dist(tss_dist["valid"]),
          "dist_tss_log": get_tss_log_dist(tss_dist["valid"]),
          "dist_polya_nat": get_polya_nat_dist(polya_dist["valid"]),
          "dist_polya_log": get_polya_log_dist(polya_dist["valid"]),
          # "raw_dist_tss_nat": tss_dist["valid"],
          "raw_dist_tss_log": get_raw_tss_log_dist(tss_dist["valid"]),
          # "raw_dist_polya_nat": polya_dist["valid"],
          "raw_dist_polya_log": get_raw_polya_log_dist(polya_dist["valid"])},
         y_valid),\
        ({"seq": seq_test,
          "dist_tss_nat": get_tss_nat_dist(tss_dist["test"]),
          "dist_tss_log": get_tss_log_dist(tss_dist["test"]),
          "dist_polya_nat": get_polya_nat_dist(polya_dist["test"]),
          "dist_polya_log": get_polya_log_dist(polya_dist["test"]),
          # "raw_dist_tss_nat": tss_dist["test"],
          "raw_dist_tss_log": get_raw_tss_log_dist(tss_dist["test"]),
          # "raw_dist_polya_nat": polya_dist["test"],
          "raw_dist_polya_log": get_raw_polya_log_dist(polya_dist["test"])},
         y_test)
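The inner expand_positions above is easiest to see on small numbers; a standalone re-implementation of the same logic:

import numpy as np

def expand_positions(x, pos_length):
    x = x.reshape((-1, 1))
    incr_array = np.arange(pos_length) - pos_length // 2
    offsets = np.repeat(incr_array.reshape((1, -1)), x.shape[0], axis=0)
    return offsets + x

# each scalar distance becomes a per-position track centered on itself
assert np.array_equal(expand_positions(np.array([5, 10]), 3),
                      np.array([[4, 5, 6], [9, 10, 11]]))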