示例#1
0
def GetRevcKmer(
    k,
    samples_file,
):
    """Compute RevcKmer vectors for a sample file and save them as text.

    Args:
        k: k-mer length passed to ``RevcKmer``.
        samples_file: Path of the sequence file handed to
            ``make_revckmer_vec``.

    Side effects:
        Writes the vectors with ``np.savetxt`` to ``DHSs_reckmer_<k>.txt``
        in the current working directory. Nothing is returned.
    """
    rev_kmer = RevcKmer(k=k, normalize=True)
    # Open through a context manager so the handle is released even when
    # vectorization raises; a bare open() call would leak it.
    with open(samples_file) as fp:
        vec = rev_kmer.make_revckmer_vec(fp)
    np.savetxt('DHSs_reckmer_' + str(k) + '.txt', vec)
示例#2
0
def whole_revc_kmer_psednc(pos_file, neg_file, k):
    """Write RevcKmer + PseDNC features of both classes into one libsvm file.

    Args:
        pos_file: Path of the positive-sample sequence file (labelled 1).
        neg_file: Path of the negative-sample sequence file (labelled -1).
        k: k-mer length passed to ``RevcKmer``.

    Side effects:
        Calls ``write_libsvm`` with the target
        ``data/whole_revc_kmer_psednc.txt``.
    """
    kmer_encoder = RevcKmer(k=k, normalize=True, upto=True)
    with open(pos_file) as handle:
        pos_kmer = np.array(kmer_encoder.make_revckmer_vec(handle))
    with open(neg_file) as handle:
        neg_kmer = np.array(kmer_encoder.make_revckmer_vec(handle))

    lamada, w = 6, 0.8
    pse_encoder = PseDNC(lamada, w)
    with open(pos_file) as handle:
        pos_pse = np.array(pse_encoder.make_psednc_vec(handle))
    with open(neg_file) as handle:
        neg_pse = np.array(pse_encoder.make_psednc_vec(handle))

    # Only the trailing `lamada` PseDNC columns are appended to the k-mer
    # features.
    pos_rows = np.column_stack((pos_kmer, pos_pse[:, -lamada:]))
    neg_rows = np.column_stack((neg_kmer, neg_pse[:, -lamada:]))

    # Positives first, then negatives; labels follow the same order.
    features = pos_rows.tolist()
    features.extend(neg_rows.tolist())
    labels = [1] * len(pos_rows) + [-1] * len(neg_rows)

    write_libsvm(features, labels, "data/whole_revc_kmer_psednc.txt")
示例#3
0
def whole_revc_kmer_psednc_choose_args(pos_file, neg_file, k):
    """Write one combined RevcKmer + PseDNC libsvm file per (lamada, w) pair.

    The RevcKmer features are computed once; the PseDNC features are
    recomputed for every parameter pair. ``lamada`` currently takes only the
    value 1 and ``w`` takes 0.1, 0.2, ..., 0.9.

    Args:
        pos_file: Path of the positive-sample sequence file (labelled 1).
        neg_file: Path of the negative-sample sequence file (labelled -1).
        k: k-mer length passed to ``RevcKmer``.

    Side effects:
        For each pair, prints the output path and calls ``write_libsvm`` with
        ``data/whole_revc_kmer_psednc_<lamada>_<w>.txt``.
    """
    revc_kmer = RevcKmer(k=k, normalize=True, upto=True)
    with open(pos_file) as fp:
        revc_kmer_pos_vecs = np.array(revc_kmer.make_revckmer_vec(fp))
    with open(neg_file) as fp:
        revc_kmer_neg_vecs = np.array(revc_kmer.make_revckmer_vec(fp))

    for lamada in range(1, 2):
        # Derive w from an integer counter. Repeatedly adding 0.1 to a float
        # accumulates rounding error, which yields values such as
        # 0.30000000000000004 in the file names and an unintended extra pass
        # at w = 0.9999999999999999.
        for step in range(1, 10):
            w = step / 10.0
            psednc = PseDNC(lamada, w)
            with open(pos_file) as fp:
                psednc_pos_vecs = np.array(psednc.make_psednc_vec(fp))
            with open(neg_file) as fp:
                psednc_neg_vecs = np.array(psednc.make_psednc_vec(fp))

            # Append only the trailing `lamada` PseDNC columns.
            pos_vecs = np.column_stack((revc_kmer_pos_vecs, psednc_pos_vecs[:, -lamada:]))
            neg_vecs = np.column_stack((revc_kmer_neg_vecs, psednc_neg_vecs[:, -lamada:]))
            vecs = pos_vecs.tolist() + neg_vecs.tolist()
            labels = [1] * len(pos_vecs) + [-1] * len(neg_vecs)

            # Write file.
            lamada_w = str(lamada) + '_' + str(w)
            write_file = "data/whole_revc_kmer_psednc_" + lamada_w + ".txt"
            print(write_file)
            write_libsvm(vecs, labels, write_file)
示例#4
0
def borderline_smote_revc_psednc(fold_path):
    """Write RevcKmer + PseDNC features with borderline-SMOTE positives added.

    Reads ``data/hs.fasta`` (positives) and ``data/non-hs.fasta``
    (negatives), builds RevcKmer (k=2) features joined with the trailing
    ``lamada`` PseDNC columns, and appends the synthetic vectors returned by
    ``smote.borderline_smote`` to the positive class.

    Args:
        fold_path: Directory part of the output path; the file name is
            ``<lamada>_<w>`` (``6_0.8``), joined with ``/``.

    Side effects:
        Prints the output path and calls ``write_libsvm`` with it.
    """
    revc_kmer = RevcKmer(k=2, normalize=True, upto=True)
    with open("data/hs.fasta") as f:
        pos_revc_kmer_vecs = np.array(revc_kmer.make_revckmer_vec(f))
    with open("data/non-hs.fasta") as f:
        neg_revc_kmer_vecs = np.array(revc_kmer.make_revckmer_vec(f))

    lamada = 6
    w = 0.8
    psednc = PseDNC(lamada, w)
    with open("data/hs.fasta") as f:
        pos_psednc_vecs = np.array(psednc.make_psednc_vec(f))
    with open("data/non-hs.fasta") as f:
        neg_psednc_vecs = np.array(psednc.make_psednc_vec(f))

    pos_vecs = np.column_stack((pos_revc_kmer_vecs, pos_psednc_vecs[:, -lamada:]))
    neg_vecs = np.column_stack((neg_revc_kmer_vecs, neg_psednc_vecs[:, -lamada:]))
    # np.vstack replaces the deprecated alias np.row_stack; same result.
    vecs = np.vstack((pos_vecs, neg_vecs))
    vecs_labels = [1] * len(pos_vecs) + [-1] * len(neg_vecs)
    # borderline_smote returns three values; only the second (the synthetic
    # samples) is used here.
    _, synthetic, _ = smote.borderline_smote(vecs, vecs_labels, 1, N=300, k=5)
    pos_vecs = pos_vecs.tolist() + synthetic.tolist()
    vecs = pos_vecs + neg_vecs.tolist()
    labels = [1] * len(pos_vecs) + [-1] * len(neg_vecs)

    lamada_n = "_".join([str(lamada), str(w)])
    write_file = "/".join([fold_path, lamada_n])
    print(write_file)
    write_libsvm(vecs, labels, write_file)
示例#5
0
def cv5_loop_borderline_smote_revc_kmer(fold_path, filename, k):
    """Write 5-fold libsvm train/test files with SMOTE-augmented train positives.

    For each fold ``i`` in 0..4, the files ``test_neg_<i>``, ``test_pos_<i>``,
    ``train_neg_<i>`` and ``train_pos_<i>`` are converted with RevcKmer. The
    vectors returned by ``smote.loop_borderline_smote`` are appended to the
    training positives only; the test set is written unchanged.

    Args:
        fold_path: Prefix concatenated directly with the file names. No path
            separator is inserted, so it must already end with one.
        filename: Base name of the output files
            (``<filename>_test_<i>.txt`` / ``<filename>_train_<i>.txt``).
        k: k-mer length passed to ``RevcKmer``.
    """
    revc_kmer = RevcKmer(k=k, normalize=True, upto=True)
    for i in range(5):
        # Generate RevcKmer vecs.
        with open(fold_path + "test_neg_" + str(i)) as fp:
            test_neg_revc_kmer_vecs = revc_kmer.make_revckmer_vec(fp)
        with open(fold_path + "test_pos_" + str(i)) as fp:
            test_pos_revc_kmer_vecs = revc_kmer.make_revckmer_vec(fp)
        with open(fold_path + "train_neg_" + str(i)) as fp:
            train_neg_revc_kmer_vecs = revc_kmer.make_revckmer_vec(fp)
        with open(fold_path + "train_pos_" + str(i)) as fp:
            train_pos_revc_kmer_vecs = revc_kmer.make_revckmer_vec(fp)

        # Generate borderline SMOTE synthetic vecs from train_vecs.
        # np.vstack replaces the deprecated alias np.row_stack; same result.
        train_vecs = np.vstack((train_pos_revc_kmer_vecs, train_neg_revc_kmer_vecs))
        train_vecs_labels = [1] * len(train_pos_revc_kmer_vecs) + [-1] * len(train_neg_revc_kmer_vecs)
        synthetic = smote.loop_borderline_smote(train_vecs, train_vecs_labels, 1, -1, N=200, k=5)

        # Write test file.
        write_file = fold_path + filename + "_test_" + str(i) + ".txt"
        test_vecs = test_pos_revc_kmer_vecs + test_neg_revc_kmer_vecs
        test_vecs_labels = [1] * len(test_pos_revc_kmer_vecs) + [-1] * len(test_neg_revc_kmer_vecs)
        write_libsvm(test_vecs, test_vecs_labels, write_file)

        # Write train file.
        write_file = fold_path + filename + "_train_" + str(i) + ".txt"
        # NOTE(review): `+` concatenates here only if both operands are
        # lists; confirm loop_borderline_smote returns a list, not an ndarray.
        train_pos_revc_kmer_vecs = train_pos_revc_kmer_vecs + synthetic
        train_vecs = train_pos_revc_kmer_vecs + train_neg_revc_kmer_vecs
        train_vecs_labels = [1] * len(train_pos_revc_kmer_vecs) + [-1] * len(train_neg_revc_kmer_vecs)
        write_libsvm(train_vecs, train_vecs_labels, write_file)
示例#6
0
文件: main.py 项目: liufule12/hsDNA
def revc_kmer_tool(k, pos_file, neg_file, write_file):
    """Write RevcKmer features of a positive and a negative file as libsvm data.

    Args:
        k: k-mer length passed to ``RevcKmer``.
        pos_file: Path of the positive-sample sequence file (labelled 1).
        neg_file: Path of the negative-sample sequence file (labelled -1).
        write_file: Output path handed to ``write_libsvm``.
    """
    encoder = RevcKmer(k=k, upto=True, normalize=True)

    with open(pos_file) as pos_handle:
        positives = encoder.make_revckmer_vec(pos_handle)
    with open(neg_file) as neg_handle:
        negatives = encoder.make_revckmer_vec(neg_handle)

    # Positives come first in the output, followed by negatives.
    label_column = [1] * len(positives) + [-1] * len(negatives)
    write_libsvm(positives + negatives, label_column, write_file)
示例#7
0
def whole_revc_kmer(pos_file, neg_file, k):
    """Write RevcKmer features of both classes to ``data/whole_revc_kmer.txt``.

    Args:
        pos_file: Path of the positive-sample sequence file (labelled 1).
        neg_file: Path of the negative-sample sequence file (labelled -1).
        k: k-mer length passed to ``RevcKmer``.
    """
    encoder = RevcKmer(k=k, normalize=True, upto=True)

    # Vectorize the positive file first, then the negative one.
    per_class = []
    for path in (pos_file, neg_file):
        with open(path) as handle:
            per_class.append(encoder.make_revckmer_vec(handle))
    positives, negatives = per_class

    labels = [1] * len(positives) + [-1] * len(negatives)
    write_libsvm(positives + negatives, labels, "data/whole_revc_kmer.txt")
示例#8
0
def _revc_psednc_vecs(path, revc_kmer, psednc, lamada):
    """Return RevcKmer features of ``path`` joined with its last ``lamada`` PseDNC columns."""
    with open(path) as fp:
        revc_kmer_vecs = np.array(revc_kmer.make_revckmer_vec(fp))
    with open(path) as fp:
        psednc_vecs = np.array(psednc.make_psednc_vec(fp))
    return np.column_stack((revc_kmer_vecs, psednc_vecs[:, -lamada:]))


def cv5_smote_revc_psednc(fold_path, filename, k):
    """Write 5-fold RevcKmer + PseDNC libsvm files with SMOTE-augmented train positives.

    For each fold ``i`` in 0..4, the files ``test_neg_<i>``, ``test_pos_<i>``,
    ``train_neg_<i>`` and ``train_pos_<i>`` are converted to combined
    features. Two batches of synthetic positives (``smote.smote`` with N=100
    and N=50) are appended to the training positives; the test set is written
    unchanged.

    Args:
        fold_path: Prefix concatenated directly with the file names. No path
            separator is inserted, so it must already end with one.
        filename: Base name of the output files
            (``<filename>_6_0.8_test_<i>.txt`` / ``..._train_<i>.txt``).
        k: k-mer length passed to ``RevcKmer``.
    """
    lamada = 6
    w = 0.8
    revc_kmer = RevcKmer(k=k, normalize=True, upto=True)
    psednc = PseDNC(lamada, w)
    n_lamada = "_".join([str(lamada), str(w)])
    for i in range(5):
        # Generate RevcKmer_PseDNC vecs for the four fold files.
        test_neg_vecs = _revc_psednc_vecs(fold_path + "test_neg_" + str(i), revc_kmer, psednc, lamada)
        test_pos_vecs = _revc_psednc_vecs(fold_path + "test_pos_" + str(i), revc_kmer, psednc, lamada)
        train_neg_vecs = _revc_psednc_vecs(fold_path + "train_neg_" + str(i), revc_kmer, psednc, lamada)
        train_pos_vecs = _revc_psednc_vecs(fold_path + "train_pos_" + str(i), revc_kmer, psednc, lamada)

        # Generate synthetic vecs from the training positives only.
        # np.vstack replaces the deprecated alias np.row_stack, and the SMOTE
        # outputs are stacked directly rather than via an intermediate list.
        synthetic = np.vstack((
            smote.smote(train_pos_vecs, N=100, k=5),
            smote.smote(train_pos_vecs, N=50, k=5),
        ))

        # Write test file.
        write_file = fold_path + filename + '_' + n_lamada + "_test_" + str(i) + ".txt"
        test_vecs = test_pos_vecs.tolist() + test_neg_vecs.tolist()
        test_vecs_labels = [1] * len(test_pos_vecs) + [-1] * len(test_neg_vecs)
        write_libsvm(test_vecs, test_vecs_labels, write_file)

        # Write train file.
        write_file = fold_path + filename + '_' + n_lamada + "_train_" + str(i) + ".txt"
        augmented_pos_vecs = train_pos_vecs.tolist() + synthetic.tolist()
        train_vecs = augmented_pos_vecs + train_neg_vecs.tolist()
        train_vecs_labels = [1] * len(augmented_pos_vecs) + [-1] * len(train_neg_vecs)
        write_libsvm(train_vecs, train_vecs_labels, write_file)
示例#9
0
def borderline_smote_revc_kmer():
    """Write RevcKmer (k=6) features with borderline-SMOTE positives added.

    Reads ``data/hs.fasta`` (positives, labelled 1) and ``data/non-hs.fasta``
    (negatives, labelled -1), appends the synthetic vectors returned by
    ``smote.borderline_smote`` to the positive class, and calls
    ``write_libsvm`` with ``borderline_smote_revc_kmer.txt``.
    """
    revc_kmer = RevcKmer(k=6, normalize=True, upto=True)
    with open("data/hs.fasta") as f:
        pos_vecs = np.array(revc_kmer.make_revckmer_vec(f))
    with open("data/non-hs.fasta") as f:
        neg_vecs = np.array(revc_kmer.make_revckmer_vec(f))

    # np.vstack replaces the deprecated alias np.row_stack; same result.
    vecs = np.vstack((pos_vecs, neg_vecs))
    vecs_labels = [1] * len(pos_vecs) + [-1] * len(neg_vecs)
    # borderline_smote returns three values; only the second (the synthetic
    # samples) is used here.
    _, synthetic1, _ = smote.borderline_smote(vecs, vecs_labels, 1, N=300, k=5)

    pos_vecs = pos_vecs.tolist() + synthetic1.tolist()
    vecs = pos_vecs + neg_vecs.tolist()
    labels = [1] * len(pos_vecs) + [-1] * len(neg_vecs)
    write_libsvm(vecs, labels, "borderline_smote_revc_kmer.txt")
def ReverseComplimentKmer(gene2seq, k):
    """Compute RevcKmer features per gene and dump them to a JSON file.

    Args:
        gene2seq: Mapping of gene identifier to its sequence. Each sequence
            is passed to ``make_revckmer_vec`` wrapped in a one-element list.
        k: k-mer length passed to ``RevcKmer``; also used in the output file
            name ``<k>-revcKmer-features.json``.

    Returns:
        dict mapping each successfully processed gene to its feature vector.
        Genes whose vectorization raised are omitted.
    """
    import sys  # local import: only needed for failure reporting

    X = {}
    succeed_cnt = 0
    rev_kmer = RevcKmer(k, normalize=True, upto=True)
    for gene, seq in gene2seq.items():
        # Keep the try body to the one call that is expected to fail.
        try:
            pos_vec = rev_kmer.make_revckmer_vec([seq])
        except Exception as e:
            # Still best-effort (the gene is skipped), but report the failure
            # so a systematic problem does not go unnoticed.
            sys.stderr.write('Reverse Compliment Kmer failed for gene %s: %s\n' % (gene, e))
            continue
        X[gene] = pos_vec
        succeed_cnt += 1
        if succeed_cnt % 100 == 0:
            print(succeed_cnt)
    print('Reverse Compliment Kmer, succeed for %d gene' % (succeed_cnt))
    with open('%d-revcKmer-features.json' % (k), 'w') as output_f:
        json.dump(X, output_f)
    return X
示例#11
0
文件: main.py 项目: liufule12/hsDNA
def cv5_upto_revckmer_tool(k, test_neg_file, test_pos_file, train_neg_file, train_pos_file, test_write_file,
                           train_write_file):
    """Write RevcKmer libsvm files for one train/test split.

    Args:
        k: k-mer length passed to ``RevcKmer``.
        test_neg_file: Path of the negative test sequences (labelled -1).
        test_pos_file: Path of the positive test sequences (labelled 1).
        train_neg_file: Path of the negative training sequences (labelled -1).
        train_pos_file: Path of the positive training sequences (labelled 1).
        test_write_file: Output path for the test set.
        train_write_file: Output path for the training set.
    """
    encoder = RevcKmer(k=k, upto=True, normalize=True)

    def vectorize(path):
        # Open, vectorize and close one sequence file.
        with open(path) as handle:
            return encoder.make_revckmer_vec(handle)

    # Files are read in this order: test neg, test pos, train pos, train neg.
    test_neg = vectorize(test_neg_file)
    test_pos = vectorize(test_pos_file)
    train_pos = vectorize(train_pos_file)
    train_neg = vectorize(train_neg_file)

    # Positives precede negatives in both outputs; train is written first.
    write_libsvm(
        train_pos + train_neg,
        [1] * len(train_pos) + [-1] * len(train_neg),
        train_write_file,
    )
    write_libsvm(
        test_pos + test_neg,
        [1] * len(test_pos) + [-1] * len(test_neg),
        test_write_file,
    )
示例#12
0
def convert(name):
    """Vectorize ``vectored_data/temp.fasta`` into a comma-separated text file.

    Args:
        name: Base name of the output; rows are written to
            ``vectored_data/<name>.txt``, one vector per line.

    NOTE(review): ``fastq2fasta()`` is presumably what produces
    ``vectored_data/temp.fasta`` -- confirm against its definition.
    """
    print("Now converting " + name)

    # Convert from fastq to fasta
    fastq2fasta()

    # 6-mer representation
    revckmer = RevcKmer(k=6)

    # Context managers close both files even if vectorization or writing
    # fails part-way through.
    with open("vectored_data/temp.fasta") as fasta, \
            open("vectored_data/" + name + ".txt", "w") as f:
        for val in revckmer.make_kmer_vec(fasta):
            # Join without a trailing separator. The earlier slice-based trim
            # removed two characters, cutting the last value's final character.
            f.write(",".join(str(num) for num in val) + "\n")
示例#13
0
def cv5_revc_kmer(fold_path, filename, k):
    """Write 5-fold RevcKmer libsvm train/test files.

    For each fold ``i`` in 0..4, the files ``test_neg_<i>``, ``test_pos_<i>``,
    ``train_neg_<i>`` and ``train_pos_<i>`` are converted with RevcKmer and
    written as ``<filename>_test_<i>.txt`` and ``<filename>_train_<i>.txt``.

    Args:
        fold_path: Prefix concatenated directly with the file names. No path
            separator is inserted, so it must already end with one.
        filename: Base name of the output files.
        k: k-mer length passed to ``RevcKmer``.
    """
    encoder = RevcKmer(k=k, normalize=True, upto=True)

    def load(prefix, fold):
        # Vectorize the fold file named <fold_path><prefix><fold>.
        with open(fold_path + prefix + str(fold)) as handle:
            return encoder.make_revckmer_vec(handle)

    for fold in range(5):
        test_neg = load("test_neg_", fold)
        test_pos = load("test_pos_", fold)
        train_neg = load("train_neg_", fold)
        train_pos = load("train_pos_", fold)

        suffix = str(fold) + ".txt"

        # Test set: positives (1) first, then negatives (-1).
        write_libsvm(
            test_pos + test_neg,
            [1] * len(test_pos) + [-1] * len(test_neg),
            fold_path + filename + "_test_" + suffix,
        )

        # Training set, same ordering and labels.
        write_libsvm(
            train_pos + train_neg,
            [1] * len(train_pos) + [-1] * len(train_neg),
            fold_path + filename + "_train_" + suffix,
        )
def GetRevcKmer(k):
    """Return RevcKmer vectors of the positive then negative samples as one array.

    The input paths come from the module-level names ``posi_samples_file``
    and ``nega_samples_file``.

    Args:
        k: k-mer length passed to ``RevcKmer``.

    Returns:
        The result of ``array(pos_vec + neg_vec)``, positives first.
        NOTE(review): ``array`` is presumably ``numpy.array`` -- confirm
        against the file's imports.
    """
    rev_kmer = RevcKmer(k=k)
    # Context managers release the file handles; bare open() calls would
    # leave them open.
    with open(posi_samples_file) as fp:
        pos_vec = rev_kmer.make_revckmer_vec(fp)
    with open(nega_samples_file) as fp:
        neg_vec = rev_kmer.make_revckmer_vec(fp)
    return array(pos_vec + neg_vec)
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from scipy import interp
import matplotlib.pyplot as plt

if __name__ == '__main__':
    begin_time = time.time()
    print(
        'Example1 Start.(This process may use several minutes, please do not close the program.)'
    )

    # ##############################################################################
    # Data IO and generation.

    # Generate the feature vectors based on reverse complement k-mers.
    rev_kmer = RevcKmer(k=6, normalize=True, upto=True)
    # Context managers release the FASTA handles as soon as each file has
    # been vectorized.
    with open('hs.fasta') as fp:
        pos_vec = rev_kmer.make_revckmer_vec(fp)
    with open('non-hs.fasta') as fp:
        neg_vec = rev_kmer.make_revckmer_vec(fp)

    print(len(pos_vec))
    print(len(neg_vec))

    # Merge positive and negative feature vectors and generate their corresponding labels
    # (1 for positives, 0 for negatives).
    vec = np.array(pos_vec + neg_vec)
    vec_label = np.array([1] * len(pos_vec) + [0] * len(neg_vec))

    # ##############################################################################
    # Classification and accuracy analysis.

    # Using 10-fold cross-validation to evaluate the performance of the predictor.
    clf = svm.LinearSVC()
示例#16
0
from sklearn import cross_validation
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from scipy import interp
import matplotlib.pyplot as plt


if __name__ == '__main__':
    begin_time = time.time()
    print('Example1 Start.(This process may use several minutes, please do not close the program.)')

    # ##############################################################################
    # Data IO and generation.

    # Generate the feature vectors based on reverse complement k-mers.
    rev_kmer = RevcKmer(k=6, normalize=True, upto=True)
    # Context managers release the FASTA handles as soon as each file has
    # been vectorized.
    with open('hs.fasta') as fp:
        pos_vec = rev_kmer.make_revckmer_vec(fp)
    with open('non-hs.fasta') as fp:
        neg_vec = rev_kmer.make_revckmer_vec(fp)

    print(len(pos_vec))
    print(len(neg_vec))

    # Merge positive and negative feature vectors and generate their corresponding labels
    # (1 for positives, 0 for negatives).
    vec = np.array(pos_vec + neg_vec)
    vec_label = np.array([1] * len(pos_vec) + [0] * len(neg_vec))

    # ##############################################################################
    # Classification and accuracy analysis.

    # Using 10-fold cross-validation to evaluate the performance of the predictor.
    clf = svm.LinearSVC()