def whole_revc_kmer_psednc(pos_file, neg_file, k):
    """Write combined RevcKmer + PseDNC features of both classes to one file.

    Each sample file is read twice: once by ``RevcKmer.make_revckmer_vec`` and
    once by ``PseDNC.make_psednc_vec``.  The last ``lamada`` columns of the
    PseDNC vectors are appended to the RevcKmer vectors, positive rows are
    placed before negative rows, and the result is handed to ``write_libsvm``
    with the fixed path ``data/whole_revc_kmer_psednc.txt``.

    Args:
        pos_file: Path of the positive-sample file (rows labelled 1).
        neg_file: Path of the negative-sample file (rows labelled -1).
        k: Forwarded to ``RevcKmer`` as its ``k`` argument.
    """
    sample_paths = (pos_file, neg_file)

    kmer_maker = RevcKmer(k=k, normalize=True, upto=True)
    kmer_blocks = []
    for path in sample_paths:
        with open(path) as handle:
            kmer_blocks.append(np.array(kmer_maker.make_revckmer_vec(handle)))

    lamada, w = 6, 0.8
    pse_maker = PseDNC(lamada, w)
    pse_blocks = []
    for path in sample_paths:
        with open(path) as handle:
            pse_blocks.append(np.array(pse_maker.make_psednc_vec(handle)))

    # Keep only the trailing `lamada` PseDNC columns next to the k-mer columns.
    pos_vecs, neg_vecs = [
        np.column_stack((kmer_block, pse_block[:, -lamada:]))
        for kmer_block, pse_block in zip(kmer_blocks, pse_blocks)
    ]

    vecs = pos_vecs.tolist() + neg_vecs.tolist()
    labels = [1] * len(pos_vecs) + [-1] * len(neg_vecs)
    write_libsvm(vecs, labels, "data/whole_revc_kmer_psednc.txt")
def whole_revc_kmer_psednc_choose_args(pos_file, neg_file, k):
    """Write one combined RevcKmer + PseDNC feature file per (lamada, w) pair.

    The RevcKmer vectors are computed once.  For every parameter pair a
    ``PseDNC(lamada, w)`` generator is built, the last ``lamada`` columns of
    its vectors are appended to the RevcKmer vectors, and the rows (positive
    first, then negative) are passed to ``write_libsvm`` under
    ``data/whole_revc_kmer_psednc_<lamada>_<w>.txt``.  Each output path is
    printed before it is written.

    ``lamada`` takes the values of ``range(1, 2)`` and ``w`` takes the values
    0.1, 0.2, ..., 0.9.

    Args:
        pos_file: Path of the positive-sample file (rows labelled 1).
        neg_file: Path of the negative-sample file (rows labelled -1).
        k: Forwarded to ``RevcKmer`` as its ``k`` argument.
    """
    revc_kmer = RevcKmer(k=k, normalize=True, upto=True)
    with open(pos_file) as fp:
        revc_kmer_pos_vecs = np.array(revc_kmer.make_revckmer_vec(fp))
    with open(neg_file) as fp:
        revc_kmer_neg_vecs = np.array(revc_kmer.make_revckmer_vec(fp))

    # Derive every weight from an integer step.  Repeatedly adding 0.1 to a
    # float drifts (0.30000000000000004, 0.7999999999999999, ...), which leaks
    # into the file names and lets an extra pass run at 0.9999999999999999.
    weights = [step / 10.0 for step in range(1, 10)]

    for lamada in range(1, 2):
        for w in weights:
            psednc = PseDNC(lamada, w)
            with open(pos_file) as fp:
                psednc_pos_vecs = np.array(psednc.make_psednc_vec(fp))
            with open(neg_file) as fp:
                psednc_neg_vecs = np.array(psednc.make_psednc_vec(fp))

            # Only the trailing `lamada` PseDNC columns are combined with the
            # RevcKmer columns.
            pos_vecs = np.column_stack((revc_kmer_pos_vecs, psednc_pos_vecs[:, -lamada:]))
            neg_vecs = np.column_stack((revc_kmer_neg_vecs, psednc_neg_vecs[:, -lamada:]))
            vecs = pos_vecs.tolist() + neg_vecs.tolist()
            labels = [1] * len(pos_vecs) + [-1] * len(neg_vecs)

            # Write file.
            lamada_w = str(lamada) + '_' + str(w)
            write_file = "data/whole_revc_kmer_psednc_" + lamada_w + ".txt"
            print(write_file)
            write_libsvm(vecs, labels, write_file)
def borderline_smote_revc_psednc(fold_path):
    """Write RevcKmer + PseDNC features augmented with borderline-SMOTE samples.

    Features are built from ``data/hs.fasta`` (positive) and
    ``data/non-hs.fasta`` (negative): RevcKmer vectors with ``k=2`` joined with
    the last ``lamada`` columns of the ``PseDNC(6, 0.8)`` vectors.  The middle
    element returned by ``smote.borderline_smote`` is appended to the positive
    rows, and everything is passed to ``write_libsvm`` under
    ``<fold_path>/<lamada>_<w>``.  The output path is printed first.

    Args:
        fold_path: Directory part of the output path.
    """
    pos_path = "data/hs.fasta"
    neg_path = "data/non-hs.fasta"

    kmer_maker = RevcKmer(k=2, normalize=True, upto=True)
    with open(pos_path) as handle:
        pos_kmer = np.array(kmer_maker.make_revckmer_vec(handle))
    with open(neg_path) as handle:
        neg_kmer = np.array(kmer_maker.make_revckmer_vec(handle))

    lamada, w = 6, 0.8
    pse_maker = PseDNC(lamada, w)
    with open(pos_path) as handle:
        pos_pse = np.array(pse_maker.make_psednc_vec(handle))
    with open(neg_path) as handle:
        neg_pse = np.array(pse_maker.make_psednc_vec(handle))

    pos_block = np.column_stack((pos_kmer, pos_pse[:, -lamada:]))
    neg_block = np.column_stack((neg_kmer, neg_pse[:, -lamada:]))

    # Borderline SMOTE is fed the full labelled set; only the middle element
    # of its three-part result is used here.
    all_rows = np.row_stack((pos_block, neg_block))
    all_labels = [1] * len(pos_block) + [-1] * len(neg_block)
    _, synthetic, _ = smote.borderline_smote(all_rows, all_labels, 1, N=300, k=5)

    pos_rows = pos_block.tolist() + synthetic.tolist()
    neg_rows = neg_block.tolist()
    out_rows = pos_rows + neg_rows
    out_labels = [1] * len(pos_rows) + [-1] * len(neg_block)

    write_file = "/".join([fold_path, "_".join([str(lamada), str(w)])])
    print(write_file)
    write_libsvm(out_rows, out_labels, write_file)
def cv5_loop_borderline_smote_revc_kmer(fold_path, filename, k):
    """Write RevcKmer train/test files for folds 0-4, oversampling the train set.

    For each fold ``i`` the files ``test_neg_i``, ``test_pos_i``,
    ``train_neg_i`` and ``train_pos_i`` (all prefixed with ``fold_path``) are
    converted to RevcKmer vectors.  The result of
    ``smote.loop_borderline_smote`` on the training data is appended to the
    positive training vectors.  Outputs go to
    ``<fold_path><filename>_test_<i>.txt`` and
    ``<fold_path><filename>_train_<i>.txt`` through ``write_libsvm``.

    Args:
        fold_path: Prefix prepended verbatim to every input and output name.
        filename: Stem of the output file names.
        k: Forwarded to ``RevcKmer`` as its ``k`` argument.
    """
    extractor = RevcKmer(k=k, normalize=True, upto=True)
    split_names = ("test_neg", "test_pos", "train_neg", "train_pos")

    for i in range(5):
        fold_tag = str(i)

        # Generate RevcKmer vecs for the four splits of this fold.
        feats = {}
        for split in split_names:
            with open(fold_path + split + "_" + fold_tag) as handle:
                feats[split] = extractor.make_revckmer_vec(handle)
        test_neg, test_pos = feats["test_neg"], feats["test_pos"]
        train_neg, train_pos = feats["train_neg"], feats["train_pos"]

        # Generate borderline SMOTE synthetic vecs from the training vectors.
        stacked_train = np.row_stack((train_pos, train_neg))
        stacked_labels = [1] * len(train_pos) + [-1] * len(train_neg)
        synthetic = smote.loop_borderline_smote(stacked_train, stacked_labels, 1, -1, N=200, k=5)

        # Write test file.
        test_out = fold_path + filename + "_test_" + fold_tag + ".txt"
        test_rows = test_pos + test_neg
        test_labels = [1] * len(test_pos) + [-1] * len(test_neg)
        write_libsvm(test_rows, test_labels, test_out)

        # Write train file.
        train_out = fold_path + filename + "_train_" + fold_tag + ".txt"
        # NOTE(review): `+` is a list concatenation only if `synthetic` is a
        # list - confirm against smote.loop_borderline_smote.
        boosted_pos = train_pos + synthetic
        train_rows = boosted_pos + train_neg
        train_labels = [1] * len(boosted_pos) + [-1] * len(train_neg)
        write_libsvm(train_rows, train_labels, train_out)
def revc_kmer_tool(k, pos_file, neg_file, write_file):
    """Convert a positive and a negative sample file to one labelled output.

    Both files are turned into RevcKmer vectors (``upto=True``,
    ``normalize=True``), concatenated positive-first, and passed to
    ``write_libsvm`` together with labels 1 (positive) and -1 (negative).

    Args:
        k: Forwarded to ``RevcKmer`` as its ``k`` argument.
        pos_file: Path of the positive-sample file.
        neg_file: Path of the negative-sample file.
        write_file: Output path handed to ``write_libsvm``.
    """
    extractor = RevcKmer(k=k, upto=True, normalize=True)

    per_class = []
    for path in (pos_file, neg_file):
        with open(path) as handle:
            per_class.append(extractor.make_revckmer_vec(handle))
    positives, negatives = per_class

    rows = positives + negatives
    row_labels = [1] * len(positives) + [-1] * len(negatives)
    write_libsvm(rows, row_labels, write_file)
def whole_revc_kmer(pos_file, neg_file, k):
    """Write RevcKmer features of the positive and negative files to one file.

    The vectors of ``pos_file`` (label 1) are followed by those of
    ``neg_file`` (label -1) and passed to ``write_libsvm`` with the fixed
    path ``data/whole_revc_kmer.txt``.

    Args:
        pos_file: Path of the positive-sample file.
        neg_file: Path of the negative-sample file.
        k: Forwarded to ``RevcKmer`` as its ``k`` argument.
    """
    extractor = RevcKmer(k=k, normalize=True, upto=True)

    def read_vectors(path):
        # One pass over a sample file, closed as soon as it is converted.
        with open(path) as handle:
            return extractor.make_revckmer_vec(handle)

    positives = read_vectors(pos_file)
    negatives = read_vectors(neg_file)

    combined = positives + negatives
    combined_labels = [1] * len(positives) + [-1] * len(negatives)

    # Write file.
    write_libsvm(combined, combined_labels, "data/whole_revc_kmer.txt")
def cv5_smote_revc_psednc(fold_path, filename, k):
    """Write RevcKmer + PseDNC train/test files for folds 0-4 with SMOTE.

    For each fold ``i`` the files ``test_neg_i``, ``test_pos_i``,
    ``train_neg_i`` and ``train_pos_i`` (all prefixed with ``fold_path``) are
    converted to RevcKmer vectors joined with the last ``lamada`` columns of
    the ``PseDNC(6, 0.8)`` vectors.  Two ``smote.smote`` runs on the positive
    training rows (``N=100`` then ``N=50``) are stacked and appended to the
    positive training rows.  Outputs go to
    ``<fold_path><filename>_<lamada>_<w>_test_<i>.txt`` and
    ``<fold_path><filename>_<lamada>_<w>_train_<i>.txt`` via ``write_libsvm``.

    Args:
        fold_path: Prefix prepended verbatim to every input and output name.
        filename: Stem of the output file names.
        k: Forwarded to ``RevcKmer`` as its ``k`` argument.
    """
    lamada = 6
    w = 0.8
    revc_kmer = RevcKmer(k=k, normalize=True, upto=True)
    psednc = PseDNC(lamada, w)

    def combined_features(path):
        # Read the file once per generator (RevcKmer first, then PseDNC) and
        # keep only the trailing `lamada` PseDNC columns.
        with open(path) as handle:
            kmer_part = np.array(revc_kmer.make_revckmer_vec(handle))
        with open(path) as handle:
            pse_part = np.array(psednc.make_psednc_vec(handle))
        return np.column_stack((kmer_part, pse_part[:, -lamada:]))

    for i in range(5):
        fold_tag = str(i)

        # Generate RevcKmer_PseDNC vecs.
        test_neg = combined_features(fold_path + "test_neg_" + fold_tag)
        test_pos = combined_features(fold_path + "test_pos_" + fold_tag)
        train_neg = combined_features(fold_path + "train_neg_" + fold_tag)
        train_pos = combined_features(fold_path + "train_pos_" + fold_tag)

        # Generate synthetic vecs from the positive training rows.
        synthetic = np.row_stack((
            smote.smote(train_pos, N=100, k=5).tolist(),
            smote.smote(train_pos, N=50, k=5).tolist(),
        ))

        out_prefix = fold_path + filename + '_' + "_".join([str(lamada), str(w)])

        # Write test file.
        test_out = out_prefix + "_test_" + fold_tag + ".txt"
        test_rows = test_pos.tolist() + test_neg.tolist()
        test_labels = [1] * len(test_pos) + [-1] * len(test_neg)
        write_libsvm(test_rows, test_labels, test_out)

        # Write train file.
        train_out = out_prefix + "_train_" + fold_tag + ".txt"
        boosted_pos = train_pos.tolist() + synthetic.tolist()
        train_rows = boosted_pos + train_neg.tolist()
        train_labels = [1] * len(boosted_pos) + [-1] * len(train_neg)
        write_libsvm(train_rows, train_labels, train_out)
def borderline_smote_revc_kmer():
    """Write RevcKmer (k=6) features augmented with borderline-SMOTE samples.

    Vectors are built from ``data/hs.fasta`` (positive, label 1) and
    ``data/non-hs.fasta`` (negative, label -1).  The middle element returned
    by ``smote.borderline_smote`` is appended to the positive rows and the
    result is passed to ``write_libsvm`` as ``borderline_smote_revc_kmer.txt``.
    """
    extractor = RevcKmer(k=6, normalize=True, upto=True)
    with open("data/hs.fasta") as handle:
        positives = np.array(extractor.make_revckmer_vec(handle))
    with open("data/non-hs.fasta") as handle:
        negatives = np.array(extractor.make_revckmer_vec(handle))

    # Borderline SMOTE is fed the full labelled set; only the middle element
    # of its three-part result is used here.
    stacked = np.row_stack((positives, negatives))
    stacked_labels = [1] * len(positives) + [-1] * len(negatives)
    _, generated, _ = smote.borderline_smote(stacked, stacked_labels, 1, N=300, k=5)

    boosted_pos = positives.tolist() + generated.tolist()
    out_rows = boosted_pos + negatives.tolist()
    out_labels = [1] * len(boosted_pos) + [-1] * len(negatives)
    write_libsvm(out_rows, out_labels, "borderline_smote_revc_kmer.txt")
def GetRevcKmer(
    k,
    samples_file,
):
    """Save the normalized RevcKmer vectors of ``samples_file`` as text.

    The vectors are written with ``np.savetxt`` to
    ``DHSs_reckmer_<k>.txt`` in the current working directory.

    Args:
        k: Forwarded to ``RevcKmer`` as its ``k`` argument; also used in the
            output file name.
        samples_file: Path of the sample file to convert.
    """
    rev_kmer = RevcKmer(k=k, normalize=True)
    # Use a context manager so the sample file is closed once it has been
    # converted instead of being left open until garbage collection.
    with open(samples_file) as fp:
        vec = rev_kmer.make_revckmer_vec(fp)
    np.savetxt('DHSs_reckmer_' + str(k) + '.txt', vec)
def cv5_upto_revckmer_tool(k, test_neg_file, test_pos_file, train_neg_file, train_pos_file, test_write_file, train_write_file):
    """Convert one fold's four sample files into a train and a test output.

    Every input file is turned into RevcKmer vectors (``upto=True``,
    ``normalize=True``).  Within each output the positive rows (label 1)
    precede the negative rows (label -1).  The train output is written
    before the test output, both through ``write_libsvm``.

    Args:
        k: Forwarded to ``RevcKmer`` as its ``k`` argument.
        test_neg_file: Path of the negative test samples.
        test_pos_file: Path of the positive test samples.
        train_neg_file: Path of the negative training samples.
        train_pos_file: Path of the positive training samples.
        test_write_file: Output path for the test set.
        train_write_file: Output path for the training set.
    """
    extractor = RevcKmer(k=k, upto=True, normalize=True)

    def read_vectors(path):
        with open(path) as handle:
            return extractor.make_revckmer_vec(handle)

    # Files are read in this order: test neg, test pos, train pos, train neg.
    neg_test = read_vectors(test_neg_file)
    pos_test = read_vectors(test_pos_file)
    pos_train = read_vectors(train_pos_file)
    neg_train = read_vectors(train_neg_file)

    train_rows = pos_train + neg_train
    test_rows = pos_test + neg_test
    train_row_labels = [1] * len(pos_train) + [-1] * len(neg_train)
    test_row_labels = [1] * len(pos_test) + [-1] * len(neg_test)

    # Write file.
    write_libsvm(train_rows, train_row_labels, train_write_file)
    write_libsvm(test_rows, test_row_labels, test_write_file)
def cv5_revc_kmer(fold_path, filename, k):
    """Write RevcKmer train/test files for folds 0-4.

    For each fold ``i`` the files ``test_neg_i``, ``test_pos_i``,
    ``train_neg_i`` and ``train_pos_i`` (all prefixed with ``fold_path``) are
    converted to RevcKmer vectors.  Positive rows (label 1) precede negative
    rows (label -1) in ``<fold_path><filename>_test_<i>.txt`` and
    ``<fold_path><filename>_train_<i>.txt``, both written via
    ``write_libsvm``.

    Args:
        fold_path: Prefix prepended verbatim to every input and output name.
        filename: Stem of the output file names.
        k: Forwarded to ``RevcKmer`` as its ``k`` argument.
    """
    extractor = RevcKmer(k=k, normalize=True, upto=True)
    split_names = ("test_neg", "test_pos", "train_neg", "train_pos")

    for i in range(5):
        fold_tag = str(i)

        # Generate RevcKmer vecs for the four splits of this fold.
        feats = {}
        for split in split_names:
            with open(fold_path + split + "_" + fold_tag) as handle:
                feats[split] = extractor.make_revckmer_vec(handle)

        # Test output first, then train output.
        for part in ("test", "train"):
            positives = feats[part + "_pos"]
            negatives = feats[part + "_neg"]
            out_path = fold_path + filename + "_" + part + "_" + fold_tag + ".txt"
            rows = positives + negatives
            row_labels = [1] * len(positives) + [-1] * len(negatives)
            write_libsvm(rows, row_labels, out_path)
def ReverseComplimentKmer(gene2seq, k):
    """Compute RevcKmer vectors per gene and dump them to a JSON file.

    Each sequence is wrapped in a one-element list and passed to
    ``RevcKmer.make_revckmer_vec``.  Genes whose conversion raises are
    skipped (best effort), but the failure is reported on stderr instead of
    being dropped silently.  Progress is printed every 100 successes, a
    summary is printed at the end, and the mapping is written to
    ``<k>-revcKmer-features.json`` in the current working directory.

    Args:
        gene2seq: Mapping from gene identifier to its sequence.
        k: Passed to ``RevcKmer`` as its first argument; also used (as an
            integer, via ``%d``) in the output file name.

    Returns:
        dict mapping each successfully converted gene to the value returned
        by ``make_revckmer_vec``.
    """
    # Local import so this block does not depend on the file's import header.
    import sys

    X = dict()
    succeed_cnt = 0
    rev_kmer = RevcKmer(k, normalize=True, upto=True)
    for gene, seq in gene2seq.items():
        # Only the conversion is guarded; bookkeeping below cannot hide bugs.
        try:
            pos_vec = rev_kmer.make_revckmer_vec([seq])
        except Exception as e:
            # Still best effort, but leave a trace of which gene was skipped
            # and why so missing entries in the output can be explained.
            sys.stderr.write(
                'Reverse Compliment Kmer, skipped gene %s: %r\n' % (gene, e))
            continue
        X[gene] = pos_vec
        succeed_cnt += 1
        if succeed_cnt % 100 == 0:
            print(succeed_cnt)
    print('Reverse Compliment Kmer, succeed for %d gene' % (succeed_cnt))
    with open('%d-revcKmer-features.json' % (k), 'w') as output_f:
        output_f.write(json.dumps(X))
    return X
def GetRevcKmer(k):
    """Return the RevcKmer vectors of the positive and negative sample files.

    Reads the files named by the module-level ``posi_samples_file`` and
    ``nega_samples_file``, concatenates the positive vectors followed by the
    negative vectors, and wraps the result with ``array``.

    Args:
        k: Forwarded to ``RevcKmer`` as its ``k`` argument.

    Returns:
        The value of ``array(pos_vec + neg_vec)``.
    """
    rev_kmer = RevcKmer(k=k)
    # Context managers close both sample files deterministically; the
    # original left the handles open until garbage collection.
    with open(posi_samples_file) as pos_fp:
        pos_vec = rev_kmer.make_revckmer_vec(pos_fp)
    with open(nega_samples_file) as neg_fp:
        neg_vec = rev_kmer.make_revckmer_vec(neg_fp)
    X = array(pos_vec + neg_vec)
    return X
from sklearn.metrics import roc_curve, auc
from scipy import interp
import matplotlib.pyplot as plt

if __name__ == '__main__':
    begin_time = time.time()
    print(
        'Example1 Start.(This process may use several minutes, please do not close the program.)'
    )

    # ##############################################################################
    # Data IO and generation.

    # Generate the feature vectors based on reverse complement k-mers.
    # The sample files are opened with context managers so their handles are
    # closed as soon as the vectors have been built.
    rev_kmer = RevcKmer(k=6, normalize=True, upto=True)
    with open('hs.fasta') as pos_fp:
        pos_vec = rev_kmer.make_revckmer_vec(pos_fp)
    with open('non-hs.fasta') as neg_fp:
        neg_vec = rev_kmer.make_revckmer_vec(neg_fp)

    print(len(pos_vec))
    print(len(neg_vec))

    # Merge positive and negative feature vectors and generate their
    # corresponding labels (1 for positive, 0 for negative).
    vec = np.array(pos_vec + neg_vec)
    vec_label = np.array([1] * len(pos_vec) + [0] * len(neg_vec))

    # ##############################################################################
    # Classification and accuracy analysis.

    # Use 10-fold cross-validation to evaluate the performance of the predictor.
    clf = svm.LinearSVC()
    scores = cross_validation.cross_val_score(clf, vec, y=vec_label, cv=10)
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from scipy import interp
import matplotlib.pyplot as plt

if __name__ == '__main__':
    begin_time = time.time()
    print('Example1 Start.(This process may use several minutes, please do not close the program.)')

    # ##############################################################################
    # Data IO and generation.

    # Generate the feature vectors based on reverse complement k-mers.
    # The sample files are opened with context managers so their handles are
    # closed as soon as the vectors have been built.
    rev_kmer = RevcKmer(k=6, normalize=True, upto=True)
    with open('hs.fasta') as pos_fp:
        pos_vec = rev_kmer.make_revckmer_vec(pos_fp)
    with open('non-hs.fasta') as neg_fp:
        neg_vec = rev_kmer.make_revckmer_vec(neg_fp)

    print(len(pos_vec))
    print(len(neg_vec))

    # Merge positive and negative feature vectors and generate their
    # corresponding labels (1 for positive, 0 for negative).
    vec = np.array(pos_vec + neg_vec)
    vec_label = np.array([1] * len(pos_vec) + [0] * len(neg_vec))

    # ##############################################################################
    # Classification and accuracy analysis.

    # Use 10-fold cross-validation to evaluate the performance of the predictor.
    clf = svm.LinearSVC()
    scores = cross_validation.cross_val_score(clf, vec, y=vec_label, cv=10)