def train(self, sentences, labels, cross_validation = False):
    """Fit the sequence model on tokenised sentences and their labels.

    Every word is lower-cased and converted to a feature set by
    ``self._construct_featurespace``, which also receives the most recent
    ``(word, label)`` pairs of the same sentence (at most
    ``self.chain_len`` of them).

    :param sentences: indexable sequence of sentences, each an iterable of
        word strings.
    :param labels: ``labels[i][j]`` is the label of word ``j`` in
        sentence ``i``.
    :param cross_validation: when true, training runs with the ``-v 4``
        option and the result is discarded; otherwise the trained model is
        stored in ``self._svm_model``.
    """
    feature_sets = []
    targets = []
    for sent_idx, sentence in enumerate(sentences):
        history = []
        for word_idx, word in enumerate(sentence):
            token = word.lower()
            space = self._construct_featurespace(token, history)
            history.append((token, labels[sent_idx][word_idx]))
            # Slide the context window: drop the oldest pair once it grows
            # beyond chain_len entries.
            if len(history) > self.chain_len:
                history.pop(0)
            feature_sets.append(space.featureset)
        targets.extend(labels[sent_idx])

    prob = svm.problem(targets, feature_sets)
    if not cross_validation:
        self._svm_model = svm.train(prob, svm.parameter('-c 1 -s 4'))
        return
    # Cross-validation run: the return value is intentionally not kept.
    svm.train(prob, svm.parameter('-c 1 -v 4 -s 4'))
def train(self, sentences, labels, cross_validation=False):
    """Fit the sequence model on tokenised sentences and their labels.

    Every word is lower-cased and converted to a feature set by
    ``self._construct_featurespace``, which also receives the most recent
    ``(word, label)`` pairs of the same sentence (at most
    ``self.chain_len`` of them).

    :param sentences: indexable sequence of sentences, each an iterable of
        word strings.
    :param labels: ``labels[i][j]`` is the label of word ``j`` in
        sentence ``i``.
    :param cross_validation: when true, training runs with the ``-v 4``
        option and the result is discarded; otherwise the trained model is
        stored in ``self._svm_model``.
    """
    feature_sets = []
    targets = []
    for sent_idx, sentence in enumerate(sentences):
        history = []
        for word_idx, word in enumerate(sentence):
            token = word.lower()
            space = self._construct_featurespace(token, history)
            history.append((token, labels[sent_idx][word_idx]))
            # Slide the context window: drop the oldest pair once it grows
            # beyond chain_len entries.
            if len(history) > self.chain_len:
                history.pop(0)
            feature_sets.append(space.featureset)
        targets.extend(labels[sent_idx])

    prob = svm.problem(targets, feature_sets)
    if not cross_validation:
        self._svm_model = svm.train(prob, svm.parameter('-c 1 -s 4'))
        return
    # Cross-validation run: the return value is intentionally not kept.
    svm.train(prob, svm.parameter('-c 1 -v 4 -s 4'))
def train(self, x, y, biased=False):
    """Train a liblinear classifier from feature mappings and class names.

    Feature names are translated to ids with ``self._features.setId`` and
    class names with ``self._labels.setId``.  The trained model is stored
    in ``self._model``.

    :param x: iterable of samples, each a mapping ``feature name -> value``.
    :param y: iterable of class names, parallel to ``x``.
    :param biased: when truthy, its value is appended to the option string
        as ``-B <biased>``.
    """
    data = [
        dict((self._features.setId(name), sample[name]) for name in sample)
        for sample in x
    ]
    labels = [self._labels.setId(cls) for cls in y]

    bias_option = " -B {0}".format(biased) if biased else ""
    if self._labels.count() == 2:
        # Two-label case: recode label ids to +1 / -1 and use solver 2.
        labels = [1 if label == 1 else -1 for label in labels]
        solver_options = "-c 1 -s 2 -q"
    else:
        solver_options = "-c 1 -s 4 -q"

    param = liblinear.parameter(solver_options + bias_option)
    self._model = liblinear.train(liblinear.problem(labels, data), param)
def train(self, x, y, biased=False):
    """Train a liblinear classifier from feature mappings and class names.

    Feature names are translated to ids with ``self._features.setId`` and
    class names with ``self._labels.setId``.  The trained model is stored
    in ``self._model``.

    :param x: iterable of samples, each a mapping ``feature name -> value``.
    :param y: iterable of class names, parallel to ``x``.
    :param biased: when truthy, its value is appended to the option string
        as ``-B <biased>``.
    """
    data = [
        dict((self._features.setId(name), sample[name]) for name in sample)
        for sample in x
    ]
    labels = [self._labels.setId(cls) for cls in y]

    bias_option = ' -B {0}'.format(biased) if biased else ''
    if self._labels.count() == 2:
        # Two-label case: recode label ids to +1 / -1 and use solver 2.
        labels = [1 if label == 1 else -1 for label in labels]
        solver_options = '-c 1 -s 2 -q'
    else:
        solver_options = '-c 1 -s 4 -q'

    param = liblinear.parameter(solver_options + bias_option)
    self._model = liblinear.train(liblinear.problem(labels, data), param)
def _complete_training(self, debug=False):
    """ Forward data to external training and extract classifier information

    Builds the liblinear option string from ``self.complexity``,
    ``self.tolerance``, ``self.alg_num``, ``self.offset`` and the per-class
    ``self.weight`` values, trains on ``self.labels`` / ``self.samples`` and
    hands the model to ``self.calculate_classification_vector``.

    NOTE(review): the ``debug`` argument is not consulted; only
    ``self.debug`` is read.
    """
    if self.str_label_function is not None:
        # NOTE(review): eval() of a stored string - make sure it can never
        # come from untrusted input.
        self.label_function = eval(self.str_label_function)
        self.labels = self.label_function()

    option_parts = ["-c %.42f -e %.42f -s %d -B %d" % (
        self.complexity, self.tolerance, self.alg_num, self.offset)]
    option_parts.extend(
        " -w%d %.42f" % (index, weight)
        for index, weight in enumerate(self.weight))
    if not self.debug:
        option_parts.append(" -q")
        self._log("Liblinear is now quiet!")

    # Imported lazily so the dependency is only needed when training.
    import liblinearutil
    param = liblinearutil.parameter("".join(option_parts))
    training_problem = liblinearutil.problem(self.labels, self.samples)
    self.calculate_classification_vector(
        liblinearutil.train(training_problem, param))

    if self.debug:
        print(self.print_w)
        print(self.b)
def train(self):
    """Build the training problem and fit ``self.model``.

    Reads ``self.labels``, ``self.contexts`` and the option string
    ``self.parameters``; progress messages go to standard error.

    NOTE(review): the inner ``train(...)`` call presumably resolves to the
    module-level liblinear ``train`` function, not to this method - confirm.
    """
    log = sys.stderr.write

    log('creating training problem...')
    prob = problem(self.labels, self.contexts)

    log('done\ntraining with option(s) "' + self.parameters + '"...')
    options = parameter(self.parameters)
    self.model = train(prob, options)

    log('done\n')
def _lib_train_liblinear(user_tfidf, num_pos, num_neg, ignore):
    """Train a liblinear model (options ``-s 0``) on a user's tf-idf data.

    The input is converted by ``_convert_to_sparse_matrix``, which also
    returns the positive/negative counts actually used.  Labels are ``1``
    for the first ``num_pos`` rows and ``-1`` for the following ``num_neg``.

    :returns: the model returned by ``train``.
    """
    options = parameter("-s 0")
    features, pos_count, neg_count = _convert_to_sparse_matrix(
        user_tfidf, num_pos, num_neg, ignore)
    targets = [1] * pos_count + [-1] * neg_count
    return train(problem(targets, features), options)
def train(self, data_train, data_dev):
    """Trains Minitagger on the given data.

    Extracts features from ``data_train``, trains a liblinear model with the
    ``-q`` option, stores it in ``self.__liblinear_model`` and clears the
    feature extractor's ``is_training`` flag.  Unless ``self.quiet`` is set,
    data statistics and the training time are printed.

    :param data_train: training data; must expose ``num_instances``,
        ``label_count`` and ``observation_count``.
    :param data_dev: optional development data; when not ``None`` its
        accuracy (second value returned by ``self.predict``) is printed.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source; the dev-accuracy block is assumed to sit inside the second
    ``if not self.quiet`` - confirm against the real file.
    """
    start_time = time.time()
    assert self.__feature_extractor.is_training  # Assert untrained

    # Extract features (only labeled instances) and pass them to liblinear.
    [label_list, features_list, _] = \
        self.__feature_extractor.extract_features(data_train, False, [])
    if not self.quiet:
        print("{0} labeled instances (out of {1})".format(
            len(label_list), data_train.num_instances))
        print("{0} label types".format(len(data_train.label_count)))
        print("{0} observation types".format(
            len(data_train.observation_count)))
        print("\"{0}\" feature template".format(
            self.__feature_extractor.feature_template))
        print("{0} feature types".format(
            self.__feature_extractor.num_feature_types()))
    problem = liblinearutil.problem(label_list, features_list)
    self.__liblinear_model = \
        liblinearutil.train(problem, liblinearutil.parameter("-q"))
    # From here on the extractor is no longer in training mode.
    self.__feature_extractor.is_training = False
    if not self.quiet:
        num_seconds = int(math.ceil(time.time() - start_time))
        print("Training time: {0}".format(
            str(datetime.timedelta(seconds=num_seconds))))
        if data_dev is not None:
            # Silence predict() temporarily, then restore the old setting.
            quiet_value = self.quiet
            self.quiet = True
            _, acc = self.predict(data_dev)
            self.quiet = quiet_value
            print("Dev accuracy: {0:.3f}%".format(acc))
def train_liblinear(args):
    """Train a liblinear model (options ``-s 0``) and save it.

    :param args: sequence ``[model_name, gold_dir, dir_1, dir_2, ...]``.
        ``gold_dir`` and the remaining directories are passed to
        ``get_data``; the trained model is written via
        ``save_model(model_name, model)``.
    """
    model_name, gold_dir = args[0], args[1]
    dirs = args[2:]
    vectors, predicates = get_data(gold_dir, dirs)
    # Build a real list: on Python 3 map() returns a lazy iterator, which
    # has no len() and is not accepted by liblinear's problem() as labels.
    labels = [num_to_class(predicate) for predicate in predicates]
    prob = problem(labels, vectors)
    param = parameter('-s 0')
    model = train(prob, param)
    save_model(model_name, model)
def _complete_training(self, debug=False):
    """ Forward data to external training and extract classifier information

    Builds the liblinear option string from ``self.complexity``,
    ``self.tolerance``, ``self.alg_num``, ``self.offset`` and the per-class
    ``self.weight`` values, trains on ``self.labels`` / ``self.samples`` and
    hands the model to ``self.calculate_classification_vector``.

    NOTE(review): the ``debug`` argument is not consulted; only
    ``self.debug`` is read.
    """
    if self.str_label_function is not None:
        # NOTE(review): eval() of a stored string - make sure it can never
        # come from untrusted input.
        self.label_function = eval(self.str_label_function)
        self.labels = self.label_function()

    option_parts = ["-c %.42f -e %.42f -s %d -B %d" % (
        self.complexity, self.tolerance, self.alg_num, self.offset)]
    option_parts.extend(
        " -w%d %.42f" % (index, weight)
        for index, weight in enumerate(self.weight))
    if not self.debug:
        option_parts.append(" -q")
        self._log("Liblinear is now quiet!")

    # Imported lazily so the dependency is only needed when training.
    import liblinearutil
    param = liblinearutil.parameter("".join(option_parts))
    training_problem = liblinearutil.problem(self.labels, self.samples)
    self.calculate_classification_vector(
        liblinearutil.train(training_problem, param))

    if self.debug:
        print(self.print_w)
        print(self.b)
def train(self, data_train, data_test):
    """
    Trains Minitagger on the given train data. If test data is given, it reports the accuracy of the trained model
    and the F1_score (macro average of f1_score of each label)

    The predictions for ``data_test`` are written through
    ``self.__save_prediction_to_file`` and scored by ``report_fscore`` on
    ``self.prediction_path + "/predictions.txt"``.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source.  The evaluation block (and the ``return``) is assumed to sit
    inside the second ``if not self.quiet`` as in the upstream Minitagger
    layout; if so the method returns ``None`` in quiet mode - confirm against
    the real file.

    @type data_train: SequenceData
    @param data_train: the training data set
    @type data_test: SequenceData
    @param data_test: the test data set
    @return: ``(f1score, precision, recall)`` as produced by
        ``report_fscore`` when ``data_test`` is evaluated
    """
    # keep the training start timestamp
    start_time = time.time()
    assert (self.__feature_extractor.is_training
            ), "In order to train, is_training flag should be True"
    # Extract features only for labeled instances from data_train
    [label_list, features_list, _] = self.__feature_extractor.extract_features(data_train, False, [])
    # print some useful information about the data
    if not self.quiet:
        print("{0} labeled words (out of {1})".format(
            len(label_list), data_train.num_of_words))
        print("{0} label types".format(len(data_train.label_count)))
        print("{0} word types".format(len(data_train.word_count)))
        print("\"{0}\" feature template".format(
            self.__feature_extractor.feature_template))
        print("{0} feature types".format(
            self.__feature_extractor.num_feature_types()))
    # define problem to be trained using the parameters received from the feature_extractor
    problem = liblinearutil.problem(label_list, features_list)
    # train the model (-q stands for quiet = True in the liblinearutil)
    self.__liblinear_model = liblinearutil.train(
        problem, liblinearutil.parameter("-q"))
    # training is done, set is_training to False, so that prediction can be done
    self.__feature_extractor.is_training = False
    # print some useful information
    if not self.quiet:
        num_seconds = int(math.ceil(time.time() - start_time))  # how long the training took
        print("Training time: {0}".format(
            str(datetime.timedelta(seconds=num_seconds))))
        # perform prediction on the data_test and report accuracy
        if data_test is not None:
            # Silence predict() temporarily, then restore the old setting.
            quiet_value = self.quiet
            self.quiet = True
            pred_labels, acc = self.predict(data_test)
            self.quiet = quiet_value
            self.__save_prediction_to_file(data_test, pred_labels)
            f1score, precision, recall = report_fscore(self.prediction_path + "/predictions.txt", wikiner=self.wikiner)
            print("Accuracy: ", acc)
            # create some files useful for debugging
            if self.debug:
                self.__debug(data_test, pred_labels)
            return f1score, precision, recall
def train_regression(self, x, y):
    """Train a liblinear model with options ``-c 1 -s 0`` on raw targets.

    Feature names are translated to ids with ``self._features.setId``;
    ``y`` is passed to liblinear unchanged.  Sets ``self._regression`` to
    ``True`` and stores the trained model in ``self._model``.

    :param x: iterable of samples, each a mapping ``feature name -> value``.
    :param y: target values, parallel to ``x``.
    """
    data = [
        dict((self._features.setId(name), sample[name]) for name in sample)
        for sample in x
    ]
    self._regression = True
    param = liblinear.parameter("-c 1 -s 0")
    self._model = liblinear.train(liblinear.problem(y, data), param)
def train_regression(self, x, y):
    """Train a liblinear model with options ``-c 1 -s 0`` on raw targets.

    Feature names are translated to ids with ``self._features.setId``;
    ``y`` is passed to liblinear unchanged.  Sets ``self._regression`` to
    ``True`` and stores the trained model in ``self._model``.

    :param x: iterable of samples, each a mapping ``feature name -> value``.
    :param y: target values, parallel to ``x``.
    """
    data = [
        dict((self._features.setId(name), sample[name]) for name in sample)
        for sample in x
    ]
    self._regression = True
    param = liblinear.parameter('-c 1 -s 0')
    self._model = liblinear.train(liblinear.problem(y, data), param)
def train_log_regr_liblinear(features, responses):
    """Train an L2-regularized logistic regression model with liblinear.

    :param features: training feature vectors.
    :param responses: target values, parallel to ``features``.
    :returns: the model returned by ``liblinearutil.train``.
    """
    echo('Training with Liblinear')
    training_problem = liblinearutil.problem(responses, features)
    options = ' '.join([
        '-s 0',  # L2-regularized logistic regression (primal)
        '-B 1',  # fit a bias term
        '-q',    # quiet mode
    ])
    return liblinearutil.train(training_problem,
                               liblinearutil.parameter(options))
def train_SVR_liblinear(features, responses):
    """Train an L2-regularized L2-loss support vector regressor with liblinear.

    :param features: training feature vectors.
    :param responses: target values, parallel to ``features``.
    :returns: the model returned by ``liblinearutil.train``.
    """
    echo('Training with Liblinear')
    training_problem = liblinearutil.problem(responses, features)
    options = ' '.join([
        '-s 11',  # L2-regularized L2-loss support vector regression (primal)
        '-B 1',   # fit a bias term
        '-q',     # quiet mode
    ])
    return liblinearutil.train(training_problem,
                               liblinearutil.parameter(options))
def parallel_train_predict(args):
    """Train on one sub-problem and return decision values for the test set.

    Intended as a worker function: all inputs arrive in a single tuple.

    :param args: ``(x_train, y_train, x_test, y_test)``.
    :returns: ``p_val``, the third value returned by
        ``liblinearutil.predict`` (called with ``'-b 0'``).
    """
    print("A process begins.")
    x_train, y_train, x_test, y_test = args

    # time.clock() was removed in Python 3.8; use perf_counter() when it
    # exists and fall back to clock() only on interpreters that lack it.
    timer = getattr(time, "perf_counter", None) or time.clock

    problem = liblinearutil.problem(y_train, x_train)
    parameter = liblinearutil.parameter('-s 0 -c 1')

    time_start = timer()
    model = liblinearutil.train(problem, parameter)
    print("A process training finished in %f." % (timer() - time_start))

    time_start = timer()
    p_label, p_acc, p_val = liblinearutil.predict(y_test, x_test, model, '-b 0')
    print("A process predicting finished in %f." % (timer() - time_start))
    return p_val
def train(train_data, features, c):
    """Train a liblinear model on the items listed in ``train_data``.

    :param train_data: mapping ``key -> record``; ``record['class']`` is the
        label of that item.
    :param features: mapping ``key -> feature vector`` for the same keys.
    :param c: value passed to liblinear's ``-c`` option.
    :returns: the model returned by ``liblinearutil.train``.
    """
    keys = list(train_data)
    targets = [train_data[key]['class'] for key in keys]
    vectors = [features[key] for key in keys]

    training_problem = liblinearutil.problem(targets, vectors)
    options = liblinearutil.parameter('-q -c {0!s}'.format(c))
    return liblinearutil.train(training_problem, options)
def eval_SVM(X, y, Xhat, yhat):
    """Train an SVM on ``(X, y)`` and return its accuracy on ``(Xhat, yhat)``.

    :param X: training feature vectors.
    :param y: training labels.
    :param Xhat: evaluation feature vectors.
    :param yhat: evaluation labels.
    :returns: the first value returned by ``liblinearutil.evaluations``
        (the accuracy).
    """
    # Classification problem and SVM parameters.
    training_problem = liblinearutil.problem(y, X)
    options = liblinearutil.parameter('-s 3 -c 10 -q -B 1')

    classifier = liblinearutil.train(training_problem, options)

    # Only the predicted labels are needed; accuracy is recomputed below.
    predicted = liblinearutil.predict(yhat, Xhat, classifier, '-q')[0]
    return liblinearutil.evaluations(yhat, predicted)[0]
def best_C(self,x,y):
    """
    Search for the best C using liblinear's ``-C`` option.

    training using y=list,x=dict
    The option string is built from ``self.bias``, ``self.p`` and
    ``self.eps`` and printed before training.

    :param x: training feature vectors.
    :param y: training labels.
    :returns: ``(best_C, best_rate)`` as returned by ``lu.train``.

    NOTE(review): with ``-C`` the value returned by ``lu.train`` is the
    search result, so ``self.model`` ends up holding that result rather than
    a model usable for prediction - confirm this is what callers expect.
    """
    prob = lu.problem(y, x)
    para = "-s 2 -C -B %f -p %f -e %f" % (self.bias, self.p, self.eps)
    print(para)
    # Run the (expensive) parameter search once and reuse its result for
    # both the stored attribute and the return value, instead of training
    # a second time with identical options.
    search_result = lu.train(prob, lu.parameter(para))
    self.model = search_result
    best_C, best_rate = search_result
    return best_C, best_rate
def main(): if __name__ == "__main__": y, x = svm_read_problem(feature_file, return_scipy=True) # train:test = 7:3 train_X = x[:14000] train_y = y[:14000] test_X = x[14000:] test_y = y[14000:] prob = problem(train_y, train_X) param = parameter("-c 1 -s 2") model = train(prob, param) p_labs, p_acc, p_vals = predict(test_y, test_X, model) accuracy, precision, recall = metrics_result(test_y, p_labs) print print "accuracy: ", accuracy print "precision: ", precision print "recall: ", recall
def train(self,x,y):
    """
    training using y=list,x=dict

    The liblinear option string is assembled from ``self.L``, ``self.c``,
    ``self.bias``, ``self.p`` and ``self.eps``; ``-v`` is added when
    ``self.v`` is non-zero and ``-q`` when ``self.q`` is non-zero.  The
    string is printed and the result of ``lu.train`` is stored in
    ``self.model``.

    :returns: always ``True``.
    """
    prob = lu.problem(y, x)

    flags = ["-s %d -c %f -B %f -p %f -e %f" % (
        self.L, self.c, self.bias, self.p, self.eps)]
    if self.v != 0:
        flags.append(" -v %d" % self.v)
    if self.q != 0:
        flags.append(" -q")
    para = "".join(flags)
    print(para)

    self.model = lu.train(prob, lu.parameter(para))
    return True
def classify(ds_cur=None):
    """Interactively build feature sets, train a liblinear model and test it.

    Python 2 only (print statements, ``raw_input``).  The function prompts on
    stdin for which fields to exclude and whether to add word-count features,
    reads rows from MongoDB (database ``rmpdb`` on localhost), runs a
    cross-validation pass, trains a final model with ``-s 6`` and evaluates
    it on the rows of the second collection.

    :param ds_cur: optional iterable of training rows.  When ``None`` the
        rows come from ``rmpdb.dataset_profs_ten_over``.
    :returns: ``(model, p_acc, contingency_mat)`` where ``contingency_mat``
        is indexed ``[true class][predicted class]``.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source - confirm against the real file.
    NOTE(review): ``ds_cur2`` (and ``conn``) are only assigned when
    ``ds_cur`` is ``None``, so passing a cursor looks like it fails with a
    NameError - confirm.
    """
    from os import chdir, system
    # liblinearutil is imported from the bundled liblinear checkout, hence
    # the temporary change of working directory.
    chdir('./liblinear-2.1/python/')
    from liblinearutil import problem, parameter, train, predict
    chdir('../../')
    from pdb import set_trace
    from tqdm import tqdm
    from pymongo import MongoClient
    from json import dumps
    from bson.objectid import ObjectId
    # NOTE(review): drops into the debugger on every call - looks like a
    # leftover breakpoint.
    set_trace()
    # Doubles as the MongoDB projection (field -> 0 means "exclude").
    dont_include = {'_id': 0}
    print 'List of variables:\n'
    for key in variable_lookup:
        print key[1]
    ch1 = raw_input(
        'Input "s" to select custom fields (default selection - all fields):')
    if ch1 == 's':
        print 'Please input 0 for fields you would like to exclude, any other input would include it.'
        for key in variable_lookup:
            # The class label can never be excluded.
            if key[0] == 'class':
                continue
            ch2 = raw_input(key[1] + ':')
            if ch2 == '0':
                dont_include[key[0]] = 0
    if ds_cur == None:
        conn = MongoClient('mongodb://localhost:27017')
        dataset = conn['rmpdb']['dataset_profs_ten_over']
        ds_cur = dataset.find(filter={}, projection=dont_include)
        dataset2 = conn['rmpdb']['dataset_profs_five_over_less_ten']
        ds_cur2 = dataset2.find(filter={}, projection=dont_include)
    X = []  # Variables
    Y = []  # Classes
    ids = []  # Keep track of professor IDs
    X2 = []  # Variables
    Y2 = []  # Classes
    ids2 = []  # Keep track of professor IDs
    print 'Building training set according to selection..'
    # Every row becomes one sparse feature dict {int(field name): float value};
    # 'class' and 'prof_id' are split off, NaN values are left out.
    for row in tqdm(ds_cur):
        x_dict = dict()
        for key in row:
            if key == 'class':
                Y.append(int(row[key]))
            elif key == 'prof_id':
                ids.append(row[key])
            elif isNan(row[key]):
                continue
            else:
                x_dict[int(key)] = float(row[key])
        X.append(x_dict)
    # Same conversion for the second (test) collection.
    for row in tqdm(ds_cur2):
        x_dict2 = dict()
        for key in row:
            if key == 'class':
                Y2.append(int(row[key]))
            elif key == 'prof_id':
                ids2.append(row[key])
            elif isNan(row[key]):
                continue
            else:
                x_dict2[int(key)] = float(row[key])
        X2.append(x_dict2)
    ch = raw_input(
        'Include top words for males and females as features? (y/n) [n]: ')
    if ch == 'y':
        from glob import glob
        from json import loads
        vec_files = glob('../logs/*.vec')
        if not len(vec_files) == 0:
            print 'Word vector files found in ../logs: \n'
            print vec_files
            fch = raw_input(
                'Enter name of file without extension. [../logs/trial0.vec] Enter 0 to skip. ../logs/'
            )
            if fch == '0':
                male_vector, female_vector = build_vector()
            else:
                # NOTE(review): bare except falls back to trial0.vec on ANY
                # error, and neither file handle is closed.
                try:
                    f = open('../logs/' + fch + '.vec', 'r')
                    male_vector, female_vector = loads(f.read())
                except:
                    f = open('../logs/trial0.vec', 'r')
                    male_vector, female_vector = loads(f.read())
        else:
            male_vector, female_vector = build_vector()
        print 'Male vectors as (word, count)'
        print male_vector
        print "============================================="
        print 'Female vectors as (word, count)'
        print female_vector
        print "============================================="
        print 'Calculating word features for all professors in dataset. This shall take some time.'
        print 'Depending on your cutoff, this can take from 4 - 6 hours. Probably a good idea to get some other stuff done..'
        male_words = [tup[0] for tup in male_vector]
        female_words = [tup[0] for tup in female_vector]
        union_words = list(set(male_words).union(set(female_words)))
        final_words = list()
        print 'Select words you want to remove by entering "x".'
        for word in union_words:
            wch = raw_input(word + ':')
            if wch == 'x':
                continue
            else:
                final_words.append(word)
        from string import punctuation
        exclude = set(punctuation)
        rmpdb = MongoClient('mongodb://localhost:27017')['rmpdb']
        for i in tqdm(range(len(ids))):
            prof_id = ids[i]
            # (An earlier variant with separate male/female counters was
            # commented out here; a single combined counter is used now.)
            vec_dict = dict()
            for word in final_words:
                vec_dict[word] = 0
            prof_comments = rmpdb['profs'].find_one(
                {'_id': ObjectId(prof_id)}, {
                    '_id': 0,
                    'all comments.rComments': 1
                })
            # Count, per professor, how often each selected word occurs in
            # the punctuation-stripped, lower-cased comment tokens.
            for comment in prof_comments['all comments']:
                text = comment['rComments']
                no_punc_text = ''.join(ch for ch in text if ch not in exclude)
                toks = no_punc_text.split()
                for tok in toks:
                    if tok.lower() in vec_dict:
                        vec_dict[tok.lower()] += 1
            feature_counter = 53  # starts right after variable_lookup['53']
            # Word j gets feature index 54 + j; zero counts stay implicit.
            for j in range(len(final_words)):
                feature_counter += 1
                word = final_words[j]
                if not vec_dict[word] == 0:
                    X[i][feature_counter] = vec_dict[word]
        print "Building test set.."
        # Same word-count features for the test professors (written to X2).
        for i in tqdm(range(len(ids2))):
            prof_id = ids2[i]
            vec_dict = dict()
            for word in final_words:
                vec_dict[word] = 0
            prof_comments = rmpdb['profs'].find_one(
                {'_id': ObjectId(prof_id)}, {
                    '_id': 0,
                    'all comments.rComments': 1
                })
            for comment in prof_comments['all comments']:
                text = comment['rComments']
                no_punc_text = ''.join(ch for ch in text if ch not in exclude)
                toks = no_punc_text.split()
                for tok in toks:
                    if tok.lower() in vec_dict:
                        vec_dict[tok.lower()] += 1
            feature_counter = 53  # starts right after variable_lookup['53']
            for j in range(len(final_words)):
                feature_counter += 1
                word = final_words[j]
                if not vec_dict[word] == 0:
                    X2[i][feature_counter] = vec_dict[word]
        print 'Words used:'
        print final_words
    else:
        pass
    print 'Writing temp files for AUC calculation..'
    build_svm_file(X, Y)
    print 'Temp file written..'
    print 'Features used:'
    fstr = list()
    for key in variable_lookup:
        if key[0] in dont_include or key[0] == 'class':
            continue
        else:
            fstr.append(key[1])
    print dumps(fstr)
    print '======================================\n'
    prob = problem(Y, X)
    # Cross-validation pass (-v 10); its result `m` is not used afterwards.
    param = parameter('-s 6 -v 10')
    m = train(prob, param)
    print 'Evaluating..\n'
    # Runs the liblinear command-line trainer on the temp file as well.
    system('liblinear-2.1/train -s 6 -v 10 liblinear-2.1/temp_ds')
    # Final model, trained on the whole training set.
    model = train(prob, parameter('-s 6 -q'))
    #system('rm liblinear-2.1/temp_ds')
    print 'Testing model on test set..'
    p_Y2, p_acc, p_vals = predict(Y2, X2, model)
    # contingency_mat[true][predicted]; any pair that is not (0,0), (0,1)
    # or (1,0) is counted in [1][1].
    contingency_mat = [[0, 0], [0, 0]]
    for i in range(len(Y2)):
        if (Y2[i] == 0) and (p_Y2[i] == 0):
            contingency_mat[0][0] += 1
        elif (Y2[i] == 0) and (p_Y2[i] == 1):
            contingency_mat[0][1] += 1
        elif (Y2[i] == 1) and (p_Y2[i] == 0):
            contingency_mat[1][0] += 1
        else:
            contingency_mat[1][1] += 1
    return (model, p_acc, contingency_mat)
def classify(ds_cur = None):
    """Interactively build feature sets, train a liblinear model and test it.

    Python 2 only (print statements, ``raw_input``).  The function prompts on
    stdin for which fields to exclude and whether to add word-count features,
    reads rows from MongoDB (database ``rmpdb`` on localhost), runs a
    cross-validation pass, trains a final model with ``-s 6`` and evaluates
    it on the rows of the second collection.

    :param ds_cur: optional iterable of training rows.  When ``None`` the
        rows come from ``rmpdb.dataset_profs_ten_over``.
    :returns: ``(model, p_acc, contingency_mat)`` where ``contingency_mat``
        is indexed ``[true class][predicted class]``.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source - confirm against the real file.
    NOTE(review): ``ds_cur2`` (and ``conn``) are only assigned when
    ``ds_cur`` is ``None``, so passing a cursor looks like it fails with a
    NameError - confirm.
    """
    from os import chdir, system
    # liblinearutil is imported from the bundled liblinear checkout, hence
    # the temporary change of working directory.
    chdir('./liblinear-2.1/python/')
    from liblinearutil import problem, parameter, train, predict
    chdir('../../')
    from pdb import set_trace
    from tqdm import tqdm
    from pymongo import MongoClient
    from json import dumps
    from bson.objectid import ObjectId
    # NOTE(review): drops into the debugger on every call - looks like a
    # leftover breakpoint.
    set_trace()
    # Doubles as the MongoDB projection (field -> 0 means "exclude").
    dont_include = {'_id' : 0}
    print 'List of variables:\n'
    for key in variable_lookup:
        print key[1]
    ch1 = raw_input('Input "s" to select custom fields (default selection - all fields):')
    if ch1 == 's':
        print 'Please input 0 for fields you would like to exclude, any other input would include it.'
        for key in variable_lookup:
            # The class label can never be excluded.
            if key[0] == 'class':
                continue
            ch2 = raw_input(key[1] + ':')
            if ch2 == '0':
                dont_include[key[0]] = 0
    if ds_cur == None:
        conn = MongoClient('mongodb://localhost:27017')
        dataset = conn['rmpdb']['dataset_profs_ten_over']
        ds_cur = dataset.find(filter = {}, projection = dont_include)
        dataset2 = conn['rmpdb']['dataset_profs_five_over_less_ten']
        ds_cur2 = dataset2.find(filter = {}, projection = dont_include)
    X = []  # Variables
    Y = []  # Classes
    ids = []  # Keep track of professor IDs
    X2 = []  # Variables
    Y2 = []  # Classes
    ids2 = []  # Keep track of professor IDs
    print 'Building training set according to selection..'
    # Every row becomes one sparse feature dict {int(field name): float value};
    # 'class' and 'prof_id' are split off, NaN values are left out.
    for row in tqdm(ds_cur):
        x_dict = dict()
        for key in row:
            if key == 'class':
                Y.append(int(row[key]))
            elif key == 'prof_id':
                ids.append(row[key])
            elif isNan(row[key]):
                continue
            else:
                x_dict[int(key)] = float(row[key])
        X.append(x_dict)
    # Same conversion for the second (test) collection.
    for row in tqdm(ds_cur2):
        x_dict2 = dict()
        for key in row:
            if key == 'class':
                Y2.append(int(row[key]))
            elif key == 'prof_id':
                ids2.append(row[key])
            elif isNan(row[key]):
                continue
            else:
                x_dict2[int(key)] = float(row[key])
        X2.append(x_dict2)
    ch = raw_input('Include top words for males and females as features? (y/n) [n]: ')
    if ch == 'y':
        from glob import glob
        from json import loads
        vec_files = glob('../logs/*.vec')
        if not len(vec_files) == 0:
            print 'Word vector files found in ../logs: \n'
            print vec_files
            fch = raw_input('Enter name of file without extension. [../logs/trial0.vec] Enter 0 to skip. ../logs/')
            if fch == '0':
                male_vector, female_vector = build_vector()
            else:
                # NOTE(review): bare except falls back to trial0.vec on ANY
                # error, and neither file handle is closed.
                try:
                    f = open('../logs/' + fch + '.vec', 'r')
                    male_vector, female_vector = loads(f.read())
                except:
                    f = open('../logs/trial0.vec', 'r')
                    male_vector, female_vector = loads(f.read())
        else:
            male_vector, female_vector = build_vector()
        print 'Male vectors as (word, count)'
        print male_vector
        print "============================================="
        print 'Female vectors as (word, count)'
        print female_vector
        print "============================================="
        print 'Calculating word features for all professors in dataset. This shall take some time.'
        print 'Depending on your cutoff, this can take from 4 - 6 hours. Probably a good idea to get some other stuff done..'
        male_words = [tup[0] for tup in male_vector]
        female_words = [tup[0] for tup in female_vector]
        union_words = list(set(male_words).union(set(female_words)))
        final_words = list()
        print 'Select words you want to remove by entering "x".'
        for word in union_words:
            wch = raw_input(word + ':')
            if wch == 'x':
                continue
            else:
                final_words.append(word)
        from string import punctuation
        exclude = set(punctuation)
        rmpdb = MongoClient('mongodb://localhost:27017')['rmpdb']
        for i in tqdm(range(len(ids))):
            prof_id = ids[i]
            # (An earlier variant with separate male/female counters was
            # commented out here; a single combined counter is used now.)
            vec_dict = dict()
            for word in final_words:
                vec_dict[word] = 0
            prof_comments = rmpdb['profs'].find_one({'_id' : ObjectId(prof_id)}, {'_id' : 0, 'all comments.rComments' : 1})
            # Count, per professor, how often each selected word occurs in
            # the punctuation-stripped, lower-cased comment tokens.
            for comment in prof_comments['all comments']:
                text = comment['rComments']
                no_punc_text = ''.join(ch for ch in text if ch not in exclude)
                toks = no_punc_text.split()
                for tok in toks:
                    if tok.lower() in vec_dict:
                        vec_dict[tok.lower()] += 1
            feature_counter = 53  # starts right after variable_lookup['53']
            # Word j gets feature index 54 + j; zero counts stay implicit.
            for j in range(len(final_words)):
                feature_counter += 1
                word = final_words[j]
                if not vec_dict[word] == 0:
                    X[i][feature_counter] = vec_dict[word]
        print "Building test set.."
        # Same word-count features for the test professors (written to X2).
        for i in tqdm(range(len(ids2))):
            prof_id = ids2[i]
            vec_dict = dict()
            for word in final_words:
                vec_dict[word] = 0
            prof_comments = rmpdb['profs'].find_one({'_id' : ObjectId(prof_id)}, {'_id' : 0, 'all comments.rComments' : 1})
            for comment in prof_comments['all comments']:
                text = comment['rComments']
                no_punc_text = ''.join(ch for ch in text if ch not in exclude)
                toks = no_punc_text.split()
                for tok in toks:
                    if tok.lower() in vec_dict:
                        vec_dict[tok.lower()] += 1
            feature_counter = 53  # starts right after variable_lookup['53']
            for j in range(len(final_words)):
                feature_counter += 1
                word = final_words[j]
                if not vec_dict[word] == 0:
                    X2[i][feature_counter] = vec_dict[word]
        print 'Words used:'
        print final_words
    else:
        pass
    print 'Writing temp files for AUC calculation..'
    build_svm_file(X, Y)
    print 'Temp file written..'
    print 'Features used:'
    fstr = list()
    for key in variable_lookup:
        if key[0] in dont_include or key[0] == 'class':
            continue
        else:
            fstr.append(key[1])
    print dumps(fstr)
    print '======================================\n'
    prob = problem(Y, X)
    # Cross-validation pass (-v 10); its result `m` is not used afterwards.
    param = parameter('-s 6 -v 10')
    m = train(prob, param)
    print 'Evaluating..\n'
    # Runs the liblinear command-line trainer on the temp file as well.
    system('liblinear-2.1/train -s 6 -v 10 liblinear-2.1/temp_ds')
    # Final model, trained on the whole training set.
    model = train(prob, parameter('-s 6 -q'))
    #system('rm liblinear-2.1/temp_ds')
    print 'Testing model on test set..'
    p_Y2, p_acc, p_vals = predict(Y2, X2, model)
    # contingency_mat[true][predicted]; any pair that is not (0,0), (0,1)
    # or (1,0) is counted in [1][1].
    contingency_mat = [[0, 0], [0, 0]]
    for i in range(len(Y2)):
        if (Y2[i] == 0) and (p_Y2[i] == 0):
            contingency_mat[0][0] += 1
        elif (Y2[i] == 0) and (p_Y2[i] == 1):
            contingency_mat[0][1] += 1
        elif (Y2[i] == 1) and (p_Y2[i] == 0):
            contingency_mat[1][0] += 1
        else:
            contingency_mat[1][1] += 1
    return (model, p_acc, contingency_mat)
def simpleLibLinear(X_train, Y_train):
    """Train a liblinear model with the module-level cost ``c``.

    :param X_train: training feature vectors.
    :param Y_train: training labels, parallel to ``X_train``.
    :returns: the model returned by ``ll.train``.
    """
    training_problem = ll.problem(Y_train, X_train)
    # ``c`` is not a parameter; it is read from the enclosing scope.
    options = ll.parameter('-c {0!s}'.format(c))
    return ll.train(training_problem, options)
return problems def parallel_train_predict(args): print("A process begins.") x_train,y_train,x_test,y_test=args problem = liblinearutil.problem(y_train, x_train) parameter = liblinearutil.parameter('-s 0 -c 1') time_start = time.clock() model = liblinearutil.train(problem, parameter) print("A process training finished in %f."%(time.clock()-time_start)) time_start = time.clock() p_label, p_acc, p_val = liblinearutil.predict(y_test, x_test,model,'-b 0') print("A process predicting finished in %f."%(time.clock()-time_start)) return p_val if parallel_computing==0: problems = divide_problem(20000,20000,data_only=1) parameter = liblinearutil.parameter('-s 0 -c 1') models=[] time_start = time.clock() for i in range(len(problems)): t=[] for j in range(len(problems[i])): t.append(train(problems[i][j][1],problems[i][j][0])) models.append(t) print("Exercise 3 with MLP training finished in %f."%(time.clock()-time_start)) time_start = time.clock() p_val=[-1e100]*len(data.x_test) for i in range(len(problems)): t=[1e100]*len(data.x_test) for j in range(len(problems[i])): p_val_ij=predict(data.x_test, data.y_test,models[i][j]) for k in range(len(t)):
def train(self):
    """Build the training problem and fit ``self.model``.

    Reads ``self.labels``, ``self.contexts`` and the option string
    ``self.parameters``; progress messages go to standard error.

    NOTE(review): the inner ``train(...)`` call presumably resolves to the
    module-level liblinear ``train`` function, not to this method - confirm.
    """
    log = sys.stderr.write

    log('creating training problem...')
    prob = problem(self.labels, self.contexts)

    log('done\ntraining with option(s) "' + self.parameters + '"...')
    options = parameter(self.parameters)
    self.model = train(prob, options)

    log('done\n')
def simpleLibLinear(X_train,Y_train):
    """Train a liblinear model with the module-level cost ``c``.

    :param X_train: training feature vectors.
    :param Y_train: training labels, parallel to ``X_train``.
    :returns: the model returned by ``ll.train``.
    """
    training_problem = ll.problem(Y_train, X_train)
    # ``c`` is not a parameter; it is read from the enclosing scope.
    options = ll.parameter('-c {0!s}'.format(c))
    return ll.train(training_problem, options)