Example #1
0
def whole_revc_kmer_psednc(pos_file, neg_file, k):
    """Combine RevcKmer and PseDNC features for the positive and negative
    files and write them, with +1/-1 labels, to one libsvm-format file.
    """
    lamada = 6
    w = 0.8
    kmer_extractor = RevcKmer(k=k, normalize=True, upto=True)
    psednc_extractor = PseDNC(lamada, w)

    def extract(path, maker):
        # Each extractor consumes the file from the beginning.
        with open(path) as handle:
            return np.array(maker(handle))

    kmer_pos = extract(pos_file, kmer_extractor.make_revckmer_vec)
    kmer_neg = extract(neg_file, kmer_extractor.make_revckmer_vec)
    psednc_pos = extract(pos_file, psednc_extractor.make_psednc_vec)
    psednc_neg = extract(neg_file, psednc_extractor.make_psednc_vec)

    # Only the last `lamada` PseDNC components are appended to the kmer vector.
    pos_combined = np.column_stack((kmer_pos, psednc_pos[:, -lamada:]))
    neg_combined = np.column_stack((kmer_neg, psednc_neg[:, -lamada:]))

    samples = pos_combined.tolist() + neg_combined.tolist()
    sample_labels = [1] * len(pos_combined) + [-1] * len(neg_combined)

    # Write file.
    write_libsvm(samples, sample_labels, "data/whole_revc_kmer_psednc.txt")
Example #2
0
def whole_revc_kmer_psednc_choose_args(pos_file, neg_file, k):
    """Generate revc_kmer and psednc feature files for a grid of PseDNC
    parameters, combining the positive and negative input files.

    One libsvm file is written per (lamada, w) pair under
    data/whole_revc_kmer_psednc_<lamada>_<w>.txt.
    """
    revc_kmer = RevcKmer(k=k, normalize=True, upto=True)
    with open(pos_file) as fp:
        revc_kmer_pos_vecs = np.array(revc_kmer.make_revckmer_vec(fp))
    with open(neg_file) as fp:
        revc_kmer_neg_vecs = np.array(revc_kmer.make_revckmer_vec(fp))

    for lamada in range(1, 2):
        # Iterate over integer tenths instead of repeatedly adding 0.1:
        # the floating-point accumulation drifted (0.30000000000000004, ...),
        # leaked the drift into the output filenames, and produced a spurious
        # tenth iteration because 0.1 summed nine times is
        # 0.9999999999999999 < 1.
        for w_tenths in range(1, 10):
            w = round(w_tenths / 10.0, 1)
            psednc = PseDNC(lamada, w)
            with open(pos_file) as fp:
                psednc_pos_vecs = np.array(psednc.make_psednc_vec(fp))
            with open(neg_file) as fp:
                psednc_neg_vecs = np.array(psednc.make_psednc_vec(fp))

            # Append only the last `lamada` PseDNC components to the kmer vector.
            pos_vecs = np.column_stack((revc_kmer_pos_vecs, psednc_pos_vecs[:, -lamada:]))
            neg_vecs = np.column_stack((revc_kmer_neg_vecs, psednc_neg_vecs[:, -lamada:]))
            vecs = pos_vecs.tolist() + neg_vecs.tolist()
            labels = [1] * len(pos_vecs) + [-1] * len(neg_vecs)

            # Write file, e.g. data/whole_revc_kmer_psednc_1_0.1.txt.
            lamada_w = str(lamada) + '_' + str(w)
            write_file = "data/whole_revc_kmer_psednc_" + lamada_w + ".txt"
            print(write_file)
            write_libsvm(vecs, labels, write_file)
Example #3
0
def cv5_psednc(fold_path, filename):
    """Contrast experiment by psednc in article
    Prediction of DNase I Hypersensitive Sites by Using Pseudo Nucleotide Compositions.
    """
    lamada = 6
    w = 0.2
    extractor = PseDNC(lamada, w)
    param_tag = "_".join([str(lamada), str(w)])

    def vecs_for(prefix, fold):
        # Read one fold file (e.g. <fold_path>test_neg_0) and vectorise it.
        with open(fold_path + prefix + "_" + str(fold)) as handle:
            return extractor.make_psednc_vec(handle)

    for fold in range(5):
        test_neg = vecs_for("test_neg", fold)
        test_pos = vecs_for("test_pos", fold)
        train_neg = vecs_for("train_neg", fold)
        train_pos = vecs_for("train_pos", fold)

        # Test split: positives first, labelled +1, then negatives at -1.
        out_test = fold_path + filename + "_" + param_tag + "_test_" + str(fold) + ".txt"
        write_libsvm(test_pos + test_neg,
                     [1] * len(test_pos) + [-1] * len(test_neg),
                     out_test)

        # Train split, same layout.
        out_train = fold_path + filename + "_" + param_tag + "_train_" + str(fold) + ".txt"
        write_libsvm(train_pos + train_neg,
                     [1] * len(train_pos) + [-1] * len(train_neg),
                     out_train)
Example #4
0
def borderline_smote_revc_psednc(fold_path):
    """Oversample the positive class with borderline-SMOTE on combined
    RevcKmer + PseDNC features and write the result in libsvm format.
    """
    lamada = 6
    w = 0.8
    revc_kmer = RevcKmer(k=2, normalize=True, upto=True)
    psednc = PseDNC(lamada, w)

    def extract(path, maker):
        with open(path) as handle:
            return np.array(maker(handle))

    pos_kmer = extract("data/hs.fasta", revc_kmer.make_revckmer_vec)
    neg_kmer = extract("data/non-hs.fasta", revc_kmer.make_revckmer_vec)
    pos_psednc = extract("data/hs.fasta", psednc.make_psednc_vec)
    neg_psednc = extract("data/non-hs.fasta", psednc.make_psednc_vec)

    # Keep only the last `lamada` PseDNC components next to the kmer vector.
    pos_vecs = np.column_stack((pos_kmer, pos_psednc[:, -lamada:]))
    neg_vecs = np.column_stack((neg_kmer, neg_psednc[:, -lamada:]))

    stacked = np.row_stack((pos_vecs, neg_vecs))
    stacked_labels = [1] * len(pos_vecs) + [-1] * len(neg_vecs)
    # Only the synthetic minority samples are kept from the SMOTE output.
    _, synthetic, _ = (smote.borderline_smote(stacked, stacked_labels, 1, N=300, k=5))

    augmented_pos = pos_vecs.tolist() + synthetic.tolist()
    all_vecs = augmented_pos + neg_vecs.tolist()
    all_labels = [1] * len(augmented_pos) + [-1] * len(neg_vecs)

    out_file = "/".join([fold_path, "_".join([str(lamada), str(w)])])
    print(out_file)
    write_libsvm(all_vecs, all_labels, out_file)
Example #5
0
File: main.py  Project: liufule12/hsDNA
def psednc_tool(lamada, w, pos_file, neg_file, write_file):
    """Compute PseDNC vectors for a positive and a negative file and dump
    them, with +1/-1 labels, into one libsvm-format file.
    """
    extractor = PseDNC(lamada=lamada, w=w)

    def vectors(path):
        with open(path) as handle:
            return extractor.make_psednc_vec(handle)

    positives = vectors(pos_file)
    negatives = vectors(neg_file)

    # Write file: positives first, then negatives.
    write_libsvm(positives + negatives,
                 [1] * len(positives) + [-1] * len(negatives),
                 write_file)
Example #6
0
def cv5_smote_revc_psednc(fold_path, filename, k):
    """Build 5-fold train/test libsvm files from combined RevcKmer + PseDNC
    features, oversampling the positive training class with SMOTE.

    The original repeated the identical read/extract/column-stack scaffold
    four times per fold; it is factored into a local helper here.
    """
    lamada = 6
    w = 0.8
    revc_kmer = RevcKmer(k=k, normalize=True, upto=True)
    psednc = PseDNC(lamada, w)

    def _combined_vecs(path):
        # The file is read twice: once per feature extractor.
        with open(path) as fp:
            kmer_vecs = np.array(revc_kmer.make_revckmer_vec(fp))
        with open(path) as fp:
            psednc_vecs = np.array(psednc.make_psednc_vec(fp))
        # Append only the last `lamada` PseDNC components to the kmer vector.
        return np.column_stack((kmer_vecs, psednc_vecs[:, -lamada:]))

    for i in range(5):
        # Generate RevcKmer_PseDNC vecs for each split of fold i.
        test_neg = _combined_vecs(fold_path + "test_neg_" + str(i))
        test_pos = _combined_vecs(fold_path + "test_pos_" + str(i))
        train_neg = _combined_vecs(fold_path + "train_neg_" + str(i))
        train_pos = _combined_vecs(fold_path + "train_pos_" + str(i))

        # Generate synthetic vecs from the positive training vectors
        # (two SMOTE passes at 100% and 50% oversampling, stacked).
        synthetic1 = (smote.smote(train_pos, N=100, k=5)).tolist()
        synthetic2 = (smote.smote(train_pos, N=50, k=5)).tolist()
        synthetic = np.row_stack((synthetic1, synthetic2))

        n_lamada = "_".join([str(lamada), str(w)])

        # Write test file.
        write_file = fold_path + filename + '_' + n_lamada + "_test_" + str(i) + ".txt"
        test_vecs = test_pos.tolist() + test_neg.tolist()
        test_vecs_labels = [1] * len(test_pos) + [-1] * len(test_neg)
        write_libsvm(test_vecs, test_vecs_labels, write_file)

        # Write train file (positives = real + synthetic).
        write_file = fold_path + filename + '_' + n_lamada + "_train_" + str(i) + ".txt"
        train_pos_vecs = train_pos.tolist() + synthetic.tolist()
        train_vecs = train_pos_vecs + train_neg.tolist()
        train_vecs_labels = [1] * len(train_pos_vecs) + [-1] * len(train_neg)
        write_libsvm(train_vecs, train_vecs_labels, write_file)
def PseudoDinucleotideComposition(gene2seq):
    """Compute the PseDNC feature (default parameters) for every gene
    sequence in ``gene2seq``, dump the result to ``PseDNC-features.json``
    and return the gene -> vector mapping.

    Sequences whose extraction raises are skipped (best-effort).
    """
    # let's use the default value
    psednc = PseDNC()
    features = dict()
    done = 0
    for gene, sequence in gene2seq.items():
        try:
            # make_psednc_vec expects an iterable of sequences.
            features[gene] = psednc.make_psednc_vec([sequence])
        except Exception:
            # Best-effort: undecodable sequences are silently dropped.
            continue
        done += 1
        if done % 100 == 0:
            print(done)
    print('PseDNC, succeed for %d gene' % (done))
    with open('PseDNC-features.json', 'w') as output_f:
        output_f.write(json.dumps(features))
    return features
Example #8
0
File: main.py  Project: liufule12/hsDNA
def cv5_psednc_tool(lamada, w, test_neg_file, test_pos_file, train_neg_file, train_pos_file, test_write_file,
                    train_write_file):
    """Vectorise one train/test split with PseDNC and write both splits as
    libsvm-format files (positives labelled +1, negatives -1).
    """
    extractor = PseDNC(lamada=lamada, w=w)

    def vectors(path):
        with open(path) as handle:
            return extractor.make_psednc_vec(handle)

    # Keep the original read order: test files first, then training files.
    test_neg = vectors(test_neg_file)
    test_pos = vectors(test_pos_file)
    train_pos = vectors(train_pos_file)
    train_neg = vectors(train_neg_file)

    # Write file.
    write_libsvm(train_pos + train_neg,
                 [1] * len(train_pos) + [-1] * len(train_neg),
                 train_write_file)
    write_libsvm(test_pos + test_neg,
                 [1] * len(test_pos) + [-1] * len(test_neg),
                 test_write_file)
Example #9
0
from sklearn import cross_validation
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from scipy import interp
import matplotlib.pyplot as plt


if __name__ == '__main__':
    begin_time = time.time()
    print('Example1 Start.(This process may use several minutes, please do not close the program.)')

    # ##############################################################################
    # Data IO and generation.

    # Generate the PseDNC feature vector.
    psednc = PseDNC(lamada=3, w=0.05)
    # Use context managers so the FASTA handles are closed deterministically
    # (the original passed bare open() calls and leaked the handles).
    with open('hotspots.fasta') as fp:
        pos_vec = psednc.make_psednc_vec(fp)
    with open('coldspots.fasta') as fp:
        neg_vec = psednc.make_psednc_vec(fp)

    print(len(pos_vec))
    print(len(neg_vec))

    # Merge positive and negative feature vectors and generate their corresponding labels.
    vec = np.array(pos_vec + neg_vec)
    vec_label = np.array([1] * len(pos_vec) + [0] * len(neg_vec))

    # ##############################################################################
    # Classification and accurate analysis.

    # evaluate performance of the predictor by 5-fold cross-validation and plot the mean ROC curve.
    clf = svm.SVC(C=32, gamma=0.5)
Example #10
0
def main():
    """Train and evaluate an SVM classifier on combined sequence features.

    Features per sequence: PseDNC + PseKNC (precomputed per line index),
    plus ssc, kmer, ksnpf and binary encodings. Training sequences come
    from ./data_new.txt (first 596 lines positive), test sequences from
    ./data_test.txt (first 149 lines positive). Prints ACC, AUC, Sn, Sp,
    F1 and MCC.

    NOTE(review): written for Python 2 (`print clf.score(...)` statement,
    `map()` results passed to sklearn metrics).
    NOTE(review): the final `return normData` references a name never
    defined in this function, so it raises NameError after printing the
    metrics — confirm the intended return value.
    """
    # PseKNC (k=2) vectors for train and test, positives then negatives.
    # File handles from open() are never closed; relies on GC.
    pseknc = PseKNC(k=2, lamada=1, w=0.05)
    pos_vec3train = pseknc.make_pseknc_vec(open('postrain.txt'))
    neg_vec3train = pseknc.make_pseknc_vec(open('negtrain.txt'))
    pos_vec3test = pseknc.make_pseknc_vec(open('postest.txt'))
    neg_vec3test = pseknc.make_pseknc_vec(open('negtest.txt'))
    fea_vec3train = []
    fea_vec3test = []
    fea_vec3train.extend(pos_vec3train + neg_vec3train)
    fea_vec3test.extend(pos_vec3test + neg_vec3test)
    # PseDNC vectors, same layout as the PseKNC ones.
    psednc = PseDNC(lamada=3, w=0.05)
    pos_vec1train = psednc.make_psednc_vec(open('postrain.txt'))
    neg_vec1train = psednc.make_psednc_vec(open('negtrain.txt'))
    pos_vec1test = psednc.make_psednc_vec(open('postest.txt'))
    neg_vec1test = psednc.make_psednc_vec(open('negtest.txt'))
    fea_vec1train = []
    fea_vec1test = []
    fea_vec1train.extend(pos_vec1train + neg_vec1train)
    fea_vec1test.extend(pos_vec1test + neg_vec1test)
    # Build the training feature matrix line by line; `i` indexes into the
    # precomputed fea_vec* lists, so data_new.txt must match their order.
    feature_matrix = []
    label_vector = []
    train_samples = open('./data_new.txt', 'r')
    i = 0
    for line in train_samples:
        feature_vector = []
        # Re-read on every iteration; could be hoisted out of the loop.
        with open('./feature_importance.txt', 'r') as f:
            feature_importance = f.read().splitlines()
        # First 596 training lines are positives.
        if i < 596:
            label_vector.append(1)
        else:
            label_vector.append(0)
        # ssc() gets the raw line (with trailing newline)…
        sequence = line
        feature_vector.extend(fea_vec1train[i] + fea_vec3train[i] +
                              ssc(sequence))
        # …while kmer/ksnpf/binary_code get the stripped sequence.
        sequence = line.replace('\n', '')
        feature_vector.extend(
            kmer(sequence) + ksnpf(sequence) + binary_code(sequence))
        # Select the 390 most important feature indices…
        feature = []
        for m in range(0, 390):
            t = feature_importance[m]
            feature.append(feature_vector[int(t)])
        # NOTE(review): …but `feature` is then discarded — the full
        # feature_vector is appended instead. Looks like a bug; confirm.
        feature_matrix.append(feature_vector)
        i = i + 1
    train_samples.close()
    # Scale training features to [-1, 1] and fit the SVM.
    feature_array = np.array(feature_matrix, dtype=np.float32)
    min_max_scaler = preprocessing.MinMaxScaler(copy=True,
                                                feature_range=(-1, 1))
    feature_scaled = min_max_scaler.fit_transform(feature_array)
    X = feature_scaled
    y = label_vector
    clf = SVC(C=1.11, gamma=0.003, probability=True)
    clf.fit(X, y)

    # Build the test matrix with the same per-line pipeline as above.
    feature_matrix = []
    test_label_vector = []
    test_samples = open('./data_test.txt', 'r')
    i = 0
    for line in test_samples:
        feature_vector = []
        with open('./feature_importance.txt', 'r') as f:
            feature_importance = f.read().splitlines()
        # First 149 test lines are positives.
        if i < 149:
            test_label_vector.append(1)
        else:
            test_label_vector.append(0)
        sequence = line
        feature_vector.extend(fea_vec1test[i] + fea_vec3test[i] +
                              ssc(sequence))
        sequence = line.replace('\n', '')
        feature_vector.extend(
            kmer(sequence) + ksnpf(sequence) + binary_code(sequence))
        feature = []
        for m in range(0, 390):
            t = feature_importance[m]
            feature.append(feature_vector[int(t)])
        # NOTE(review): same discarded-`feature` pattern as in training.
        feature_matrix.append(feature_vector)
        i = i + 1
    test_samples.close()
    test_feature_array = np.array(feature_matrix, dtype=np.float32)
    # Reuse the scaler fitted on the training data (no refit).
    X_test = min_max_scaler.transform(test_feature_array)
    y_test = test_label_vector

    # Python 2 print statement.
    print clf.score(X_test, y_test)
    predict_y_test = clf.predict(X_test)

    # Confusion-matrix counts from the hard predictions.
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for i in range(0, len(y_test)):
        if int(y_test[i]) == 1 and int(predict_y_test[i]) == 1:
            TP = TP + 1
        elif int(y_test[i]) == 1 and int(predict_y_test[i]) == 0:
            FN = FN + 1
        elif int(y_test[i]) == 0 and int(predict_y_test[i]) == 0:
            TN = TN + 1
        elif int(y_test[i]) == 0 and int(predict_y_test[i]) == 1:
            FP = FP + 1
    Sn = float(TP) / (TP + FN)
    Sp = float(TN) / (TN + FP)
    ACC = float((TP + TN)) / (TP + TN + FP + FN)
    # Probability of the positive class, used for the ROC curve.
    prob_predict_y_test = clf.predict_proba(X_test)
    predictions_test = prob_predict_y_test[:, 1]
    #######generate combined negative scores
    #combined_prob=predictions_test

    y_validation = np.array(y_test, dtype=int)
    fpr, tpr, thresholds = metrics.roc_curve(y_validation,
                                             predictions_test,
                                             pos_label=1)
    roc_auc = auc(fpr, tpr)
    #print('AdaBoostClassifier AUC:%s'%roc_auc)
    F1 = metrics.f1_score(y_validation, map(int, predict_y_test))
    MCC = metrics.matthews_corrcoef(y_validation, map(int, predict_y_test))
    print('SVM Accuracy:%s' % ACC)
    print('SVM AUC:%s' % roc_auc)
    print('SVM Sensitive:%s' % Sn)
    print('SVM Specificity:%s' % Sp)
    print('SVM F1:%s' % F1)
    print('SVM MCC:%s' % MCC)
    return normData


###########################################################################################

if __name__ == '__main__':
    featurename = 'Psednc'

    # getting psednc feature
    print(
        '...............................................................................'
    )
    print('Coding for ' + featurename + ' feature, beginning')
    # NOTE(review): time.clock() was removed in Python 3.8 — switch to
    # time.perf_counter() if this must run on a modern interpreter.
    tic = time.clock()

    psednc = PseDNC(lamada=1, w=0.05)
    # Close the FASTA handles deterministically (the original leaked them).
    with open('strong enhancers4.fasta') as fp:
        pos_vec = psednc.make_psednc_vec(fp)
    with open('weak enhancers4.fasta') as fp:
        neg_vec = psednc.make_psednc_vec(fp)
    Z = array(pos_vec + neg_vec)
    X = noramlization(Z)
    y = array([1] * len(pos_vec) + [0] * len(neg_vec))

    print('The number of positive and negative samples: %d,%d' %
          (len(pos_vec), len(neg_vec)))
    print('Dimension of ' + featurename + ' feature vectors: %d' % len(X[0]))

    toc = time.clock()
    print("Coding time: %.3f minutes" % ((toc - tic) / 60.0))
    print(
        '...............................................................................'
    )
Example #12
0
def prepare_data_with_repDNA(include_acceptor=False,
                             include_donor=False,
                             save_file_name="dataset",
                             samples_per_file=20000,
                             start=0,
                             pre_start=0,
                             pre_end=299,
                             post_start=302,
                             post_end=601,
                             include_kmer=False,
                             include_DAC=False,
                             include_DCC=False,
                             include_TAC=False,
                             include_TCC=False,
                             include_PseDNC=False,
                             include_PseKNC=False,
                             include_PC_PseDNC=False,
                             include_PC_PseTNC=False,
                             include_SC_PseDNC=False,
                             include_SC_PseTNC=False):
    """Compute repDNA feature matrices for the selected splice-site datasets.

    For every enabled feature family, the negative and positive FASTA files
    of each selected mode ("acceptor"/"donor") are read, the feature is
    extracted from a windowed view of each sequence, and the stacked matrix
    is saved as ``../data/x_<tag>_<save_file_name>..._samples.npy``.

    The original repeated the same read/extract/save scaffold eleven times;
    it is factored into local helpers here. Behavior, filenames, and console
    output are unchanged (an unused ``my_time`` local was dropped, and the
    deprecated ``np.float`` alias — removed in NumPy >= 1.24 — was replaced
    by the equivalent builtin ``float``).
    """
    print("Reading data ...")

    # Use two thirds of the available cores for the joblib workers.
    cpu_count = int(mp.cpu_count() * 2 / 3)

    # Prepare selected modes
    mode_list = []
    if include_acceptor:
        mode_list.append("acceptor")
    if include_donor:
        mode_list.append("donor")

    def _read_seqs(a, b):
        # Load at most `samples_per_file` sequences starting at `start`.
        file_name = "../data/{}_{}.fa".format(a, b)
        print("Processing", file_name)
        return util.get_data(open(file_name))[start:start + samples_per_file]

    def _window(seq):
        # Pre-window + the two bases at positions 300-301 + post-window.
        return (seq[pre_start:pre_end + 1] + seq[300:301 + 1] +
                seq[post_start:post_end + 1])

    def _save(tag, pretty_name, x_dataset):
        # Stack into a float matrix and persist as .npy.
        x_dataset = np.array(x_dataset, dtype=float)
        x_filename = ("../data/x_" + tag + "_" + save_file_name +
                      ("_" + str(start) + "_start" if start != 0 else "") +
                      "_" + str(samples_per_file) + "_samples.npy")
        # save dataset in numpy readable files
        np.save(file=x_filename, arr=x_dataset)
        print("Finished {} data.".format(pretty_name))
        print("Shape:", x_dataset.shape)
        print("Data saved in {}.".format(x_filename))

    def _windowed_vecs(b, make_vec, **kwargs):
        # Extract the feature of every windowed sequence in parallel,
        # negatives first, then positives (same order as the original).
        x_dataset = []
        for a in ["negative", "positive"]:
            seqs = _read_seqs(a, b)
            x_dataset.extend(
                Parallel(n_jobs=cpu_count)(
                    delayed(make_vec)([_window(seq)], **kwargs)
                    for seq in seqs))
        return x_dataset

    # Read data and perform transformation
    for b in mode_list:

        if include_kmer:
            # Kmer count occurrences: works on the raw, un-windowed
            # sequences and needs no parallelism.
            kmer = Kmer(k=2, upto=True, normalize=True)
            x_dataset = []
            for a in ["negative", "positive"]:
                x_dataset.extend(kmer.make_kmer_vec(_read_seqs(a, b)))
            _save("kmer", "Kmer", x_dataset)

        if include_DAC:
            # Dinucleotide-based auto covariance.
            _save("dac", "DAC",
                  _windowed_vecs(b, DAC(2).make_dac_vec, all_property=True))

        if include_DCC:
            # Dinucleotide-based cross covariance.
            _save("dcc", "DCC",
                  _windowed_vecs(b, DCC(1).make_dcc_vec, all_property=True))

        if include_TAC:
            # Trinucleotide-based auto covariance.
            _save("tac", "TAC",
                  _windowed_vecs(b, TAC(3).make_tac_vec, all_property=True))

        if include_TCC:
            # Trinucleotide-based cross covariance.
            _save("tcc", "TCC",
                  _windowed_vecs(b, TCC(2).make_tcc_vec, all_property=True))

        if include_PseDNC:
            # Pseudo dinucleotide composition.
            _save("pseDNC", "PseDNC",
                  _windowed_vecs(b, PseDNC(2).make_psednc_vec))

        if include_PseKNC:
            # Pseudo k-tuple nucleotide composition.
            _save("pseKNC", "pseKNC",
                  _windowed_vecs(b, PseKNC(k=2, lamada=1, w=0.05).make_pseknc_vec))

        if include_PC_PseDNC:
            # Parallel-correlation pseudo dinucleotide composition.
            _save("PC_PseDNC", "PC-PseDNC",
                  _windowed_vecs(b, PCPseDNC(lamada=2, w=0.05).make_pcpsednc_vec,
                                 all_property=True))

        if include_PC_PseTNC:
            # Parallel-correlation pseudo trinucleotide composition.
            _save("PC_PseTNC", "PC-PseTNC",
                  _windowed_vecs(b, PCPseTNC(lamada=2, w=0.05).make_pcpsetnc_vec,
                                 all_property=True))

        if include_SC_PseDNC:
            # Series-correlation pseudo dinucleotide composition.
            _save("SC_PseDNC", "SC-PseDNC",
                  _windowed_vecs(b, SCPseDNC(lamada=2, w=0.05).make_scpsednc_vec,
                                 all_property=True))

        if include_SC_PseTNC:
            # Series-correlation pseudo trinucleotide composition.
            _save("SC_PseTNC", "SC-PseTNC",
                  _windowed_vecs(b, SCPseTNC(lamada=2, w=0.05).make_scpsetnc_vec,
                                 all_property=True))
Example #13
0
    return normData


###########################################################################################

if __name__ == '__main__':
    featurename = 'Psednc'

    # getting psednc feature
    print(
        '...............................................................................'
    )
    print('Coding for ' + featurename + ' feature, beginning')
    # NOTE(review): time.clock() was removed in Python 3.8 — switch to
    # time.perf_counter() if this must run on a modern interpreter.
    tic = time.clock()

    psednc = PseDNC(lamada=1, w=0.05)
    # Close the FASTA handles deterministically (the original leaked them).
    with open('enhancers4.fasta') as fp:
        pos_vec = psednc.make_psednc_vec(fp)
    with open('non-enhancers4.fasta') as fp:
        neg_vec = psednc.make_psednc_vec(fp)
    Z = array(pos_vec + neg_vec)
    X = noramlization(Z)
    y = array([1] * len(pos_vec) + [0] * len(neg_vec))

    print('The number of positive and negative samples: %d,%d' %
          (len(pos_vec), len(neg_vec)))
    print('Dimension of ' + featurename + ' feature vectors: %d' % len(X[0]))

    toc = time.clock()
    print("Coding time: %.3f minutes" % ((toc - tic) / 60.0))
    print(
        '...............................................................................'
    )

###########################################################################################

if __name__ == '__main__':

    featurename = 'Psednc'

    #getting psednc feature
    print(
        '...............................................................................'
    )
    print('Coding for ' + featurename + ' feature, beginning')
    # NOTE(review): time.clock() was removed in Python 3.8 — switch to
    # time.perf_counter() if this must run on a modern interpreter.
    tic = time.clock()

    psednc = PseDNC(lamada=1, w=0.05)
    # Close the FASTA handles deterministically (the original leaked them).
    with open('posi_samples.fasta') as fp:
        pos_vec = psednc.make_psednc_vec(fp)
    with open('nega_samples.fasta') as fp:
        neg_vec = psednc.make_psednc_vec(fp)
    X = array(pos_vec + neg_vec)
    y = array([1] * len(pos_vec) + [0] * len(neg_vec))

    print('The number of positive and negative samples: %d,%d' %
          (len(pos_vec), len(neg_vec)))
    print('Dimension of ' + featurename + ' feature vectors: %d' % len(X[0]))

    toc = time.clock()
    print("Coding time: %.3f minutes" % ((toc - tic) / 60.0))
    print(
        '...............................................................................'
    )
Example #15
0
def main():
    """Train an SVM on combined PseDNC + SSC + binary-code features and report metrics.

    Reads positive/negative training files ('postrain.txt', 'negtrain.txt')
    for PseDNC features, builds per-sequence feature vectors from
    './data_new.txt', scales them to [-1, 1], trains an SVC on a 60/40
    train/test split, and prints ACC, AUC, Sn, Sp, F1 and MCC on the test set.
    """
    # PseDNC feature vectors for the positive and negative training sets.
    psednc = PseDNC(lamada=8, w=0.8)
    # Context managers close the handles deterministically (originals leaked them).
    with open('postrain.txt') as pos_fp:
        pos_vec = psednc.make_psednc_vec(pos_fp)
    with open('negtrain.txt') as neg_fp:
        neg_vec = psednc.make_psednc_vec(neg_fp)
    fea_vec = pos_vec + neg_vec

    feature_matrix = []
    label_vector = []
    # First 596 sequences in data_new.txt are positives, the rest negatives.
    # NOTE(review): assumes the 596 cutoff matches len(pos_vec) — confirm
    # against the data files.
    with open('./data_new.txt', 'r') as train_samples:
        for i, line in enumerate(train_samples):
            label_vector.append(1 if i < 596 else 0)
            feature_vector = []
            # ssc() is fed the raw line (newline included), matching the
            # original behaviour; binary_code() gets the stripped sequence.
            feature_vector.extend(ssc(line) + fea_vec[i])
            feature_vector.extend(binary_code(line.replace('\n', '')))
            feature_matrix.append(feature_vector)

    feature_array = np.array(feature_matrix, dtype=np.float32)
    min_max_scaler = preprocessing.MinMaxScaler(copy=True, feature_range=(-1, 1))
    X = min_max_scaler.fit_transform(feature_array)
    y = label_vector

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=0)
    clf = SVC(C=0.98, gamma=0.001, probability=True)
    clf.fit(X_train, y_train)
    predict_y_test = clf.predict(X_test)

    # Confusion-matrix counts over the test split.
    TP = TN = FP = FN = 0
    for truth, pred in zip(y_test, predict_y_test):
        truth, pred = int(truth), int(pred)
        if truth == 1 and pred == 1:
            TP += 1
        elif truth == 1 and pred == 0:
            FN += 1
        elif truth == 0 and pred == 0:
            TN += 1
        elif truth == 0 and pred == 1:
            FP += 1
    Sn = float(TP) / (TP + FN)           # sensitivity (recall on positives)
    Sp = float(TN) / (TN + FP)           # specificity (recall on negatives)
    ACC = float(TP + TN) / (TP + TN + FP + FN)

    # Probability of the positive class, used for the ROC curve.
    prob_predict_y_test = clf.predict_proba(X_test)
    predictions_test = prob_predict_y_test[:, 1]

    y_validation = np.array(y_test, dtype=int)
    fpr, tpr, thresholds = metrics.roc_curve(
        y_validation, predictions_test, pos_label=1)
    roc_auc = auc(fpr, tpr)
    # BUG FIX: on Python 3, map() returns a lazy iterator, which sklearn's
    # metric functions reject (and the second call would see it exhausted).
    # Materialize the predictions as a list of ints once.
    int_preds = list(map(int, predict_y_test))
    F1 = metrics.f1_score(y_validation, int_preds)
    MCC = metrics.matthews_corrcoef(y_validation, int_preds)
    print('SVM Accuracy:%s' % ACC)
    print('SVM AUC:%s' % roc_auc)
    print('SVM Sensitive:%s' % Sn)
    print('SVM Specificity:%s' % Sp)
    print('SVM F1:%s' % F1)
    print('SVM MCC:%s' % MCC)