Exemplo n.º 1
0
    def predict_sequences(self,
                          sequences,
                          const_intercept=False,
                          transform_scores=True,
                          key_colname="",
                          sequence_colname="sequence",
                          flank_colname="flank",
                          predict_flanks=False,
                          flank_len=0,
                          only_pred=False):
        """
        Run `predict_sequence` on every input sequence and collect the results
        in a dictionary keyed like the output of `bio.get_seqdict`.

        Do not make this a generator, because the result is used elsewhere.

        When `sequences` is a pandas DataFrame and `flank_len` is positive, up
        to `flank_len` characters from the "<flank_colname>_left" and
        "<flank_colname>_right" columns are attached around each sequence
        before predicting. Predicted coordinates are then shifted back so that
        they are relative to the unflanked sequence, and predictions whose core
        falls outside of it are dropped.

        Args:
            sequences: input accepted by `bio.get_seqdict`.
            const_intercept: forwarded to `predict_sequence`.
            transform_scores: forwarded to `predict_sequence`.
            key_colname: key column name, forwarded to `bio.get_seqdict`.
            sequence_colname: sequence column name, forwarded to
                `bio.get_seqdict`.
            flank_colname: prefix of the "_left" / "_right" flank columns.
            predict_flanks: currently unused by this method.
            flank_len: number of flank characters to attach on each side;
                values <= 0 attach no flank.
            only_pred: return only the raw output of `predict_sequence` per key;
                if False, wrap it in a `basepred.BasePrediction` together with
                the (flanked) sequence that was predicted.

        Returns:
            dict mapping each key to its prediction (see `only_pred`).
        """
        seqdict = bio.get_seqdict(sequences,
                                  sequence_col=sequence_colname,
                                  keycol=key_colname)
        # Flanks are only available for data frame input. A non-positive
        # flank_len must not reach the slices below: `s[-0:]` is the whole
        # string, which would attach the entire left flank.
        use_flanks = isinstance(sequences, pd.DataFrame) and flank_len > 0
        if use_flanks:
            # NOTE(review): behaviour of `ignore_missing_col` when the flank
            # columns are absent is defined in bio.get_seqdict -- confirm that
            # every key is still present in the returned mapping.
            flank_left = bio.get_seqdict(sequences,
                                         "%s_left" % flank_colname,
                                         keycol=key_colname,
                                         ignore_missing_col=True)
            flank_right = bio.get_seqdict(sequences,
                                          "%s_right" % flank_colname,
                                          keycol=key_colname,
                                          ignore_missing_col=True)
        predictions = {}
        for key in seqdict:
            target = seqdict[key]
            if use_flanks:
                left = flank_left[key][-flank_len:]
                right = flank_right[key][:flank_len]
            else:
                left = right = ""
            sequence = left + target + right
            prediction = self.predict_sequence(sequence, const_intercept,
                                               transform_scores)

            # Shift by what was actually prepended, so the coordinates stay
            # correct when no flank was attached or the flank was shorter than
            # flank_len.
            shift = len(left)
            # Collect survivors in a new list: removing from `prediction` while
            # iterating over it skips the element after each removed one.
            kept = []
            for result in prediction:
                result['site_start'] = result['site_start'] - shift
                result['core_start'] = result['core_start'] - shift
                result['core_mid'] = result['core_mid'] - shift
                core_end = result['core_start'] + result['core_width']
                # Drop predictions whose core lies in the flanks.
                # NOTE(review): if core_width is a base count, the `- 1` also
                # drops a core ending exactly on the last base -- confirm.
                if result['core_start'] < 0 or core_end > len(target) - 1:
                    continue
                kept.append(result)
            prediction = kept
            if only_pred:
                predictions[key] = prediction
            else:
                predictions[key] = basepred.BasePrediction(
                    sequence, prediction)
        return predictions
Exemplo n.º 2
0
    def predict_sequences(self, sequence_df, key_colname="",
                          sequence_colname="sequence",
                          flank_colname="flank", predict_flanks=False,
                          flank_len=10):
        '''Build a dictionary of predictions, one per input sequence.

        This is a temporary function that makes the predictions dict from the
        input accepted by `self.pred_input_todict`.

        Args:
            sequence_df: sequences to predict, converted with
                `self.pred_input_todict`.
            key_colname: key column name.
            sequence_colname: sequence column name.
            flank_colname: prefix of the "_left" / "_right" flank columns.
            predict_flanks: when True, attach up to `flank_len` characters of
                each flank around the sequence before predicting, then shift
                the predicted coordinates back onto the unflanked sequence and
                drop predictions whose core falls outside of it.
            flank_len: number of flank characters to attach on each side;
                values <= 0 attach nothing.

        Returns:
            dict mapping each key to a `basepred.BasePrediction` built from the
            predicted (possibly flanked) sequence and its predictions.
        '''

        seqdict = self.pred_input_todict(sequence_df,
                                         sequence_colname=sequence_colname,
                                         key_colname=key_colname)
        if predict_flanks:
            flank_left = bio.get_seqdict(sequence_df,"%s_left" % flank_colname,
                                         ignore_missing_colname=True,
                                         keycolname=key_colname)
            flank_right = bio.get_seqdict(sequence_df,"%s_right" % flank_colname,
                                          ignore_missing_colname=True,
                                          keycolname=key_colname)
        # Core span and center position per protein. Any other value of
        # self.protein leaves both names unset, so the first prediction below
        # fails with a NameError.
        if self.protein == 'ets1':
            core = (11,15)
            centerPos = 12
        if self.protein == 'runx1':
            core = (12, 17)
            centerPos = 14
        kmerFile = self.kmer_align_path
        predictions = {}
        # Clamp so that a zero/negative flank_len never reaches the slices:
        # `s[-0:]` is the whole string, not an empty one.
        n_flank = max(flank_len, 0)
        # for each sequence we want to predict
        for key in seqdict:
            target = seqdict[key]
            left = ""
            sequence = target
            if predict_flanks:
                if n_flank:
                    left = flank_left[key][-n_flank:]
                # honor flank_len instead of a hard-coded 10 so the slices
                # agree with the coordinate shift below
                sequence = left + target + flank_right[key][:n_flank]
            prediction = self.predict_sequence(sequence, kmerFile, core, centerPos, self.threshold, self.protein)
            if predict_flanks:
                # shift by what was actually prepended (a flank may be shorter
                # than flank_len)
                shift = len(left)
                # Build a new list: removing from `prediction` while iterating
                # over it skips the element after each removed one.
                kept = []
                for result in prediction:
                    result['site_start'] = result['site_start'] - shift
                    result['core_start'] = result['core_start'] - shift
                    core_end = result['core_start'] + result['core_width']
                    # drop the prediction if its core is in the flanks
                    if result['core_start'] < 0 or core_end > len(target) - 1:
                        continue
                    kept.append(result)
                prediction = kept
            predictions[key] = basepred.BasePrediction(sequence, prediction)
        return predictions
Exemplo n.º 3
0
    def predict_sequences(self,
                          sequence_df,
                          key_colname="",
                          sequence_colname="sequence",
                          flank_colname="flank",
                          predict_flanks=False,
                          flank_len=10):
        '''Build a dictionary of predictions, one per input sequence.

        This is a temporary function that makes the predictions dict from the
        input accepted by `bio.get_seqdict`.

        Args:
            sequence_df: sequences to predict.
            key_colname: key column name, forwarded to `bio.get_seqdict`.
            sequence_colname: sequence column name, forwarded to
                `bio.get_seqdict`.
            flank_colname: prefix of the "_left" / "_right" flank columns.
            predict_flanks: when True, attach up to `flank_len` characters of
                each flank around the sequence before predicting, then shift
                the predicted coordinates back onto the unflanked sequence and
                drop predictions whose core falls outside of it.
            flank_len: number of flank characters to attach on each side;
                values <= 0 attach nothing.

        Returns:
            dict mapping each key to a `basepred.BasePrediction` built from the
            predicted (possibly flanked) sequence and its predictions.
        '''

        seqdict = bio.get_seqdict(sequence_df,
                                  sequence_col=sequence_colname,
                                  keycol=key_colname)
        if predict_flanks:
            # NOTE(review): these calls use `keycolname` /
            # `ignore_missing_colname` while the call above uses `keycol` --
            # confirm the keyword names against bio.get_seqdict.
            flank_left = bio.get_seqdict(sequence_df,
                                         "%s_left" % flank_colname,
                                         ignore_missing_colname=True,
                                         keycolname=key_colname)
            flank_right = bio.get_seqdict(sequence_df,
                                          "%s_right" % flank_colname,
                                          ignore_missing_colname=True,
                                          keycolname=key_colname)
        predictions = {}
        # Clamp so that a zero/negative flank_len never reaches the slices:
        # `s[-0:]` is the whole string, not an empty one.
        n_flank = max(flank_len, 0)
        # for each sequence we want to predict
        for key in seqdict:
            target = seqdict[key]
            left = ""
            sequence = target
            if predict_flanks:
                if n_flank:
                    left = flank_left[key][-n_flank:]
                # honor flank_len instead of a hard-coded 10 so the slices
                # agree with the coordinate shift below
                sequence = left + target + flank_right[key][:n_flank]
            prediction = self.predict_sequence(sequence)
            if predict_flanks:
                # shift by what was actually prepended (a flank may be shorter
                # than flank_len)
                shift = len(left)
                # Build a new list: removing from `prediction` while iterating
                # over it skips the element after each removed one.
                kept = []
                for result in prediction:
                    result['site_start'] = result['site_start'] - shift
                    result['core_start'] = result['core_start'] - shift
                    core_end = result['core_start'] + result['core_width']
                    # drop the prediction if its core is in the flanks
                    if result['core_start'] < 0 or core_end > len(target) - 1:
                        continue
                    kept.append(result)
                prediction = kept
            predictions[key] = basepred.BasePrediction(sequence, prediction)
        return predictions
Exemplo n.º 4
0
 def predict_sequences(self,
                       sequences,
                       sequence_colname="sequence",
                       key_colname="",
                       only_pred=False):
     """
     Predict every input sequence and return the results keyed by sequence.

     Args:
         sequences: input accepted by `bio.get_seqdict`.
         sequence_colname: sequence column name, forwarded to
             `bio.get_seqdict`.
         key_colname: key column name, forwarded to `bio.get_seqdict`.
         only_pred: when True, store the raw output of `predict_sequence`;
             otherwise wrap it, together with its sequence, in a
             `basepred.BasePrediction`.

     Returns:
         dict mapping each key of the sequence dictionary to its prediction.
     """
     seq_by_key = bio.get_seqdict(sequences,
                                  sequence_col=sequence_colname,
                                  keycol=key_colname)
     results = {}
     for seq_key, seq in seq_by_key.items():
         raw = self.predict_sequence(seq)
         # either hand back the raw prediction or the wrapped one
         results[seq_key] = (raw if only_pred else
                             basepred.BasePrediction(seq, raw))
     return results
Exemplo n.º 5
0
    def pred_input_todict(self,
                          sequence_input,
                          sequence_colname="sequence",
                          key_colname="",
                          predict_flanks=True):
        """
        Return the sequences to predict as a dictionary.

        Allowed types for `sequence_input`: pandas DataFrame, dictionary.

        Args:
            sequence_input: a data frame, converted with `bio.get_seqdict`,
                or a dictionary, which is returned as is.
            sequence_colname: sequence column name, used for data frame input.
            key_colname: key column name, used for data frame input.
            predict_flanks: not used by this method.

        Raises:
            Exception: if the input is neither a data frame nor a dictionary.
        """
        # data frame: delegate the conversion to bio.get_seqdict
        if isinstance(sequence_input, pd.DataFrame):
            converted = bio.get_seqdict(sequence_input,
                                        sequence_colname=sequence_colname,
                                        keycolname=key_colname)
            return converted
        # anything that is not a dictionary at this point is rejected
        if not isinstance(sequence_input, dict):
            raise Exception(
                "input must be data frame or dictionary of sequences")
        # dictionaries are passed through untouched
        return sequence_input
Exemplo n.º 6
0
    def predict_sequences(self,
                          sequences,
                          sequence_colname="sequence",
                          key_colname="",
                          predict_flanks=False,
                          flank_colname="flank",
                          flank_len=10,
                          only_pred=False):
        """
        Get a dictionary of escore predictions for each sequence.

        Args:
            sequences: list / data frame / dictionary of sequences (see bio.get_seqdict)
            sequence_colname: when input is a data frame, this is the column name of
                the sequence (default: sequence)
            key_colname: when input is data frame, this is the column with the key
                that denotes distinct row (default: "")
            predict_flanks: default False, when True check flank column--input needs
                to be a data frame
            flank_colname: prefix of the flank columns; "<flank_colname>_left" and
                "<flank_colname>_right" are read
            flank_len: length of the flanking sequences attached on each side;
                values <= 0 attach nothing
            only_pred: by default we return result as `BasePrediction` object for
                plotting
        Return:
            dictionary mapping each sequence key to its prediction: a
            `BasePrediction` object if `only_pred` is False, else the raw output
            of `predict_sequence`
        Raises:
            Exception: if `predict_flanks` is True and a flank is shorter than
                `flank_len`
        """
        seqdict = bio.get_seqdict(sequences,
                                  sequence_col=sequence_colname,
                                  keycol=key_colname)
        # get the flanks if we are including flank predictions
        if predict_flanks:
            # read the flanks from the `sequences` argument (the previous code
            # referenced an undefined name here and raised NameError)
            # NOTE(review): `ignore_missing_colname` is passed next to `keycol`
            # -- confirm the keyword name against bio.get_seqdict.
            flank_left = bio.get_seqdict(sequences,
                                         "%s_left" % flank_colname,
                                         keycol=key_colname,
                                         ignore_missing_colname=True)
            flank_right = bio.get_seqdict(sequences,
                                          "%s_right" % flank_colname,
                                          keycol=key_colname,
                                          ignore_missing_colname=True)
        # get prediction of each sequence
        predictions = {}
        for key, sequence in seqdict.items():
            # if we are including flanks in the prediction
            if predict_flanks:
                # make sure there are enough flanks to take
                if len(flank_left[key]) < flank_len or len(
                        flank_right[key]) < flank_len:
                    raise Exception(
                        "flank_len is greater than the length of flanks available"
                    )
                # update the sequence to be predicted; skip the slicing for a
                # non-positive flank_len because `s[-0:]` is the whole string
                # and would attach the entire left flank
                if flank_len > 0:
                    sequence = flank_left[key][
                        -flank_len:] + sequence + flank_right[key][:flank_len]
            # get the prediction for this sequence
            prediction = self.predict_sequence(sequence)
            if only_pred:
                predictions[key] = prediction
            else:
                predictions[key] = basepred.BasePrediction(
                    sequence, prediction)
        # return the dictionary of predictions for each sequence
        return predictions