Exemplo n.º 1
0
def extract_features(seq, sec_str):
    """Compute sequence/structure features and return a one-row DataFrame.

    Args:
        seq: Nucleotide sequence as a string. Only upper-case ``'G'`` and
            ``'C'`` are counted for the GC fraction.
        sec_str: Secondary structure string; it is converted with
            ``rna_toolkit.dp_to_bp`` (presumably dot-parenthesis notation --
            TODO confirm against ``rna_toolkit``).

    Returns:
        A ``pandas.DataFrame`` with a single row and the columns, in order:
        ``ent_3``, ``gc_perentage``, ``ensemble_diversity``,
        ``expected_accuracy``, ``fe_per``.

        * ``ent_3``: ``rna_toolkit.entropy(seq, 3)`` divided by
          ``rna_toolkit.entropy_max(len(seq), 3)``; NaN when that maximum is
          not positive.
        * ``gc_perentage``: fraction of characters that are ``'G'`` or
          ``'C'``. (The key spelling is kept because callers read it.)
        * ``ensemble_diversity``: sum of ``2 * p * (1 - p)`` over the upper
          triangle (diagonal included) of the base-pair probability matrix,
          divided by the sequence length.
        * ``expected_accuracy``: per-position average of the unpaired
          probability (positions where ``bp`` is 0) and twice the pair
          probability (positions paired with a later index).
        * ``fe_per``: ``|mfe - E(structure)| / |mfe|``; NaN when the MFE is 0.

        Every column is NaN when ``rna_toolkit.is_pseudoknotted`` reports a
        positive value for the structure or when the pair table length
        differs from the sequence length.

    Side effects:
        Prints the resulting DataFrame to stdout.
    """
    nan = float('nan')
    seq_len = len(seq)
    bp = rna_toolkit.dp_to_bp(sec_str)

    if (rna_toolkit.is_pseudoknotted(bp) > 0) or (len(bp) != seq_len):
        # Structure cannot be scored by the nested-structure model below.
        ent_3 = nan
        gc_perentage = nan
        ensemble_diversity = nan
        expected_accuracy = nan
        fe_per = nan
    else:
        dp_temp = rna_toolkit.bp_to_dp(bp)

        fold = RNA.fold_compound(seq)
        fold.pf()  # must run before bpp() so pair probabilities exist
        bp_prob = np.array(fold.bpp())
        _, mfe = fold.mfe()

        # Probability that each index is unpaired: 1 minus everything in its
        # row and its column of the pair-probability matrix. Index 0 is
        # never read below; positions are addressed as ``i + 1``.
        prob_unpaired = 1.0 - bp_prob.sum(axis=1) - bp_prob.sum(axis=0)

        # ``bp[i] == 0`` is treated as "unpaired"; otherwise ``bp[i]`` is used
        # directly as the 1-based partner index into ``bp_prob``. Each pair
        # is counted once, from its lower-index end.
        gamma = 1  # weight applied to the paired-position term
        expected_accuracy = 0
        for idx, partner in enumerate(bp):
            if partner == 0:
                expected_accuracy += prob_unpaired[idx + 1]
            elif idx + 1 < partner:
                expected_accuracy += 2 * gamma * bp_prob[idx + 1, partner]
        expected_accuracy /= seq_len

        upper = bp_prob[np.triu_indices(bp_prob.shape[0])]
        ensemble_diversity = np.sum(2 * upper * (1 - upper)) / seq_len

        gc_perentage = (seq.count('G') + seq.count('C')) / float(seq_len)

        ent_3_max = rna_toolkit.entropy_max(seq_len, 3)
        if ent_3_max > 0:
            ent_3 = rna_toolkit.entropy(seq, 3) / ent_3_max
        else:
            ent_3 = nan

        # Relative gap between the given structure's energy and the MFE.
        fe_struct = fold.eval_structure(dp_temp)
        if mfe != 0:
            fe_per = abs(mfe - fe_struct) / abs(mfe)
        else:
            fe_per = nan

    # Insertion order fixes the DataFrame column order.
    feature_dict = {
        'ent_3': [ent_3],
        'gc_perentage': [gc_perentage],
        'ensemble_diversity': [ensemble_diversity],
        'expected_accuracy': [expected_accuracy],
        'fe_per': [fe_per],
    }
    df = pd.DataFrame(feature_dict)
    # NOTE(review): stdout echo retained from the original implementation;
    # consider routing it through ``logging`` instead.
    print(df)
    return df
Exemplo n.º 2
0
def extract_features_pseudoknotted(seq, bp):
    """Compute features for a pseudoknotted structure as a one-row DataFrame.

    Args:
        seq: Nucleotide sequence as a string. Only upper-case ``'G'`` and
            ``'C'`` are counted for the GC fraction.
        bp: Base-pair description of the structure; it is passed unchanged
            to ``Peudo_Decom`` (format defined there -- TODO confirm).

    Returns:
        A ``pandas.DataFrame`` with index ``[0]`` and the columns, in order:
        ``gc_perentage``, ``ent_3`` ... ``ent_8``, ``bfe_per``, ``kfe_per``.

        * ``ent_k``: ``rna_toolkit.entropy(seq, k)`` divided by
          ``rna_toolkit.entropy_max(len(seq), k)``; NaN when that maximum is
          not positive.
        * ``gc_perentage``: fraction of characters that are ``'G'`` or
          ``'C'``. (The key spelling is kept because callers read it.)
        * ``bfe_per``: ``|bfe - mfe| / |mfe|``; NaN when the MFE is 0.
        * ``kfe_per``: ``|bfe - kfe| / |bfe|``; NaN when ``bfe`` is 0.

        ``bfe`` and ``kfe`` are the two values returned by
        ``Peudo_Decom(bp, seq, 'a')``.
    """
    nan = float('nan')
    seq_len = len(seq)
    ent_orders = range(3, 9)

    # Seed every column with NaN up front: this fixes the column order and
    # provides the fallback for any feature whose guard below fails.
    feature_dict = {'gc_perentage': nan}
    for k in ent_orders:
        feature_dict['ent_{}'.format(k)] = nan
    feature_dict['bfe_per'] = nan
    feature_dict['kfe_per'] = nan

    # Write straight into the dict so an order whose maximum entropy is not
    # positive keeps its NaN default instead of leaving a local unbound.
    for k in ent_orders:
        ent_max = rna_toolkit.entropy_max(seq_len, k)
        if ent_max > 0:
            feature_dict['ent_{}'.format(k)] = (
                rna_toolkit.entropy(seq, k) / ent_max)

    feature_dict['gc_perentage'] = (
        (seq.count('G') + seq.count('C')) / float(seq_len))

    fold = RNA.fold_compound(seq)
    # NOTE(review): the partition-function result is not read in this
    # function; the call is retained from the original -- confirm whether
    # it can be dropped.
    fold.pf()
    _, mfe = fold.mfe()

    bfe, kfe = Peudo_Decom(bp, seq, 'a')
    if mfe != 0:
        feature_dict['bfe_per'] = abs(bfe - mfe) / abs(mfe)
    if bfe != 0:
        feature_dict['kfe_per'] = abs(bfe - kfe) / abs(bfe)

    return pd.DataFrame(feature_dict, index=[0])