model = 'tree'

# run boosting for each value of mismatch
for m in range(M):
    Xt = XT[m].astype(float)
    Yt = Yt.astype(float)
    Nt = Yt.shape[1]
    predicted_labels = np.zeros((Nt, T), dtype='int16')

    # split the data indices into `Nfold` random disjoint sets
    Fidx = splitdata.cv_multiclass_fold(Yt, Nfold)
    for fold in range(Nfold):
        # split the data and labels into train and test sets
        train_data, train_labels, test_data, test_labels \
            = splitdata.cv_split(Xt, Yt, Fidx[fold])

        # specify output file names
        filetag = model + '_%d_%d_%d' % (K, m, fold)
        output_file = '%s/output_%s.txt' % (data_path, filetag)
        handle = open(output_file, 'w')
        to_write = ['round', 'kmer', 'threshold', 'train_auc', 'train_acc',
                    'test_auc', 'test_acc', 'runtime']
        handle.write('\t'.join(to_write) + '\n')
        handle.close()

        # run Adaboost
        adt, adt_outputs, performance, predicted_labels = boost.adaboost(
            train_data, train_labels, test_data, test_labels, T,
            output_file=output_file, kmer_dict=kmer_dict, model=model,
            predicted_labels=predicted_labels, test_indices=Fidx[fold])
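# Note: given the header row written above, boost.adaboost is expected to
# append one tab-separated row per boosting round to output_file. The helper
# below is a minimal sketch of that logging step; log_round is hypothetical,
# not part of the actual boost module, and the field formats are guesses.
def log_round(output_file, t, kmer, threshold,
              train_auc, train_acc, test_auc, test_acc, runtime):
    # append one row matching the header columns written above
    row = [str(t), kmer, str(threshold),
           '%.4f' % train_auc, '%.4f' % train_acc,
           '%.4f' % test_auc, '%.4f' % test_acc,
           '%.2f' % runtime]
    handle = open(output_file, 'a')  # 'a' keeps the header row intact
    handle.write('\t'.join(row) + '\n')
    handle.close()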
def main():
    # default proj_path (parent of the current working directory)
    def_pp = os.sep.join(os.getcwd().split(os.sep)[:-1])

    # parse arguments
    parser = ArgumentParser()
    parser.add_argument("-J", "--jobid", dest="jobid", type=str,
                        help="dataset id")
    parser.add_argument("-K", "--kmerlength", dest="K", type=int,
                        help="kmer feature length")
    parser.add_argument("-M", "--mismatch", dest="M", type=int,
                        help="max allowed mismatch")
    parser.add_argument("-P", "--projpath", dest="proj_path", type=str,
                        default=def_pp,
                        help="project path (defaults to parent of cwd)")
    parser.add_argument("-T", "--rounds", dest="T", type=int,
                        help="number of boosting rounds")
    parser.add_argument("-N", "--folds", dest="Nfolds", type=int, default=5,
                        help="number of cv folds to execute")
    parser.add_argument("-R", "--runid", dest="runid", type=str,
                        help="run id")
    parser.add_argument("-O", "--outdir", dest="outdir", type=str,
                        help="output directory")
    parser.add_argument("-Z", "--model", dest="model", type=str,
                        default="tree", help="model type (tree or stump)")
    args = parser.parse_args()

    jobid = args.jobid
    runid = args.runid
    K = args.K
    M = args.M
    T = args.T
    Nfolds = args.Nfolds
    model = args.model
    outdir = args.outdir

    # set up paths
    proj_path = args.proj_path
    src_path = proj_path + '/src/psmkboost'
    C_path = '%s/get_new_function.c' % (src_path)
    feat_path = '%s/data' % (proj_path)
    # run_path = '%s/cache/runs/%s' % (proj_path, runid)
    run_path = outdir
    if not os.path.exists(run_path):
        os.makedirs(run_path)

    print 'Running adaboost on %s using position-specific mismatch kmer feature space' % (jobid)

    # load feature matrix
    XDF = pd.load('%s/data/%s.K%d.M%d.feature_matrix.pkl' % (proj_path, jobid, K, M))
    (N, P) = XDF.shape

    # load feature list
    features = XDF.columns
    feat_dict = dict(zip(range(P), features))

    # load label dict
    ldf = '%s/data/%s.labeldict.csv' % (proj_path, jobid)
    label_dict = {}
    for row in csv.reader(open(ldf, 'r'), delimiter=','):
        label_dict[int(row[0])] = row[1]
    C = len(label_dict)

    # load label matrix
    lf = '%s/data/%s.labels.csv' % (proj_path, jobid)
    Y = gen_label_matrix(lf, N, C)
    Yt = Y.T

    # in this case we will loop over binary thresholds
    threshold_list = range(2)

    # holds predicted label at each round
    predicted_labels = np.zeros((N, T), dtype='int')

    # split the data indices into `Nfolds` random disjoint sets
    Fidx = splitdata.cv_multiclass_fold(Yt, Nfolds)
    for fold in range(Nfolds):
        print 'executing fold %d' % (fold + 1)

        # split the data and labels into train and test sets
        # (TODO: skip this and return only indices?)
        print 'splitting data...'
        tt = time.time()
        train_data, train_labels, test_data, test_labels \
            = splitdata.cv_split(XDF.as_matrix().T, Yt, Fidx[fold])
        print 'split data time = %.2f seconds' % (time.time() - tt)

        # specify output file names
        filetag = '%s_K%d_M%d_fold%d' % (model, K, M, fold)
        output_file = '%s/%s.%s.outputsummary_%s.txt' % (run_path, jobid, runid, filetag)
        handle = open(output_file, 'w')
        to_write = ['round', 'kmer', 'threshold', 'train_auc', 'train_acc',
                    'test_auc', 'test_acc', 'runtime']
        handle.write('\t'.join(to_write) + '\n')
        handle.close()

        # parse the C code from get_new_function.c
        f = open(C_path, 'r')
        C_code = '\n'.join([line for line in f if '//' not in line])
        f.close()

        # run Adaboost
        print 'entering adaboost'
        adt, adt_outputs, performance, predicted_labels = boost.adaboost(
            C_code, train_data, train_labels, test_data, test_labels, T,
            output_file=output_file, kmer_dict=feat_dict, model=model,
            predicted_labels=predicted_labels, test_indices=Fidx[fold])

        # save the learned model
        model_file = '%s/%s.%s.adt_%s.pkl' % (run_path, jobid, runid, filetag)
        handle = open(model_file, 'w')
        cPickle.dump(adt, handle)
        handle.close()

        # save algorithm performance (errors, runtime, etc.)
        results_file = '%s/%s.%s.performance_%s.pkl' % (run_path, jobid, runid, filetag)
        handle = open(results_file, 'w')
        cPickle.Pickler(handle, protocol=2).dump(adt_outputs)
        cPickle.Pickler(handle, protocol=2).dump(performance)
        handle.close()

    # output predicted labels on test data, accumulated over all CV folds
    # (written once, after the fold loop completes)
    output_file = '%s/%s.%s.testsetpredictions_%d.pkl' % (run_path, jobid, runid, K)
    handle = open(output_file, 'w')
    cPickle.Pickler(handle, protocol=2).dump(Fidx)
    cPickle.Pickler(handle, protocol=2).dump(predicted_labels)
    handle.close()
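# gen_label_matrix is called above but not defined in this file. The sketch
# below is one plausible reconstruction, assuming (a) the labels CSV holds one
# sample_index,class_index pair per row and (b) the N x C matrix uses the
# +1/-1 encoding conventional for AdaBoost; neither assumption is confirmed
# by the surrounding code.
import csv
import numpy as np

def gen_label_matrix(label_file, N, C):
    # start with -1 everywhere, then set +1 at each sample's class
    Y = -1 * np.ones((N, C), dtype='int16')
    for row in csv.reader(open(label_file, 'r'), delimiter=','):
        Y[int(row[0]), int(row[1])] = 1
    return Y

# With the arguments defined in main(), a run would then look something like
# (the script name is hypothetical):
#   python run_psmkboost.py -J mydata -K 8 -M 2 -T 20 -N 5 -R run1 -O results/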
kmer_dict = cPickle.load(f)
f.close()

# make Xt, Yt memory-efficient
Xt = Xt.astype("int16")
Yt = Yt.astype("int16")
Nt = Yt.shape[1]
T = 20
predicted_labels = np.zeros((Nt, T), dtype="int16")

# number of folds of cross validation
Nfold = 10

# split the data indices into `Nfold` random disjoint sets
Fidx = splitdata.cv_multiclass_fold(Yt, Nfold)
for fold in range(Nfold):
    params = (fold, k, m, T)

    # using each set as the test set and the rest as train sets,
    # split the data and run boosting
    X, Y, x, y, Idx = splitdata.cv_split(Xt, Yt, Fidx[fold])
    predicted_labels = boost.adaboost(X, Y, x, y, predicted_labels,
                                      Fidx[fold], params, kmer_dict,
                                      model="tree", virus_family=virus_family)

# save the test-fold indices and the accumulated predicted labels
output_file = project_path + "cache/%s_temp/%s_virii_test_output_%d_%d.pkl" \
    % (virus_family, virus_family, k, m)
f = open(output_file, "w")
cPickle.Pickler(f, protocol=2).dump(Fidx)
cPickle.Pickler(f, protocol=2).dump(predicted_labels)
f.close()
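# splitdata.cv_multiclass_fold is used throughout but not shown here. The
# sketch below is one way it could work, consistent with its call signature:
# partition the sample indices into Nfold disjoint sets, stratified by class
# so every fold contains examples of each label. The stratification detail
# and the +1 label encoding are assumptions, not confirmed by the source.
import numpy as np

def cv_multiclass_fold(Yt, Nfold):
    # Yt: C x N label matrix (classes by samples); returns a list of
    # Nfold disjoint index arrays
    C, N = Yt.shape
    folds = [[] for _ in range(Nfold)]
    for c in range(C):
        # shuffle this class's members, then deal them out round-robin
        members = np.random.permutation(np.where(Yt[c] == 1)[0])
        for i, n in enumerate(members):
            folds[i % Nfold].append(n)
    return [np.array(sorted(fold)) for fold in folds]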