def testRegression(): y = data.loadSVMLightTarget("../feature_set/synthetic.reg") # we are using pseudo index instead of the real feature set to use the # precomputed kernel. x = range(y.shape[0]) dataset = zip(x, y) random.shuffle(dataset) # this precomputed kernel reads the index value and returns the covariance # matrix kernel = kernel_factory.precomputed("../feature_set/synthetic.kernel") for train, test in data.kFolds(dataset): x_train, y_train = zip(*train) x_train = np.atleast_2d(np.array(x_train)).T y_train = np.atleast_2d(np.array(y_train)).T reg = regress.GPRegressor(kernel) reg.fit(x_train, y_train) print "fitted" # test x_test, y_test = zip(*test) x_test = np.atleast_2d(np.array(x_test)).T y_test = np.atleast_2d(np.array(y_test)).T y_predict = reg.predict(x_test) print x_test.T print y_predict.T break
def run(self): #dataset = self.getFollowingData() dataset = self.loadData() for train, test in data.kFolds(dataset): # training m = self.trainModelOn(train) predicates = self.getPredicates(train) e = self.getMostEffective(m, predicates) print e
def getSeeds(self):
    """Return the distinct user ids ranked most effective across folds."""
    dataset = self.loadData()
    seeds = []
    for train, test in data.kFolds(dataset):
        # train a model per fold and collect the top users it surfaces
        model = self.trainModelOn(train)
        predicates = self.getPredicates(train)
        effective = self.getMostEffective(model, predicates)
        for tuple_list in effective.values():
            # each tuple's first element is a user id
            seeds.extend(map(operator.itemgetter(0), tuple_list))
    # de-duplicate before returning
    return list(set(seeds))
def outputOnlyMatched(): maxent.set_verbose(1) text_dataset = getTextData(False) following_dataset = getFollowingData(False) dataset = zip(text_dataset, following_dataset) random.shuffle(dataset) print 'finished loading dataset' tester = tests.tester(4) n_total = 0 n_emit = 0 for train, test in data.kFolds(dataset): text_train, following_train = zip(*train) # training t_model = trainedModelOn(text_train) f_model = trainedModelOn(following_train) # prediction trials = [] for datum in test: text_datum, following_datum = datum text_context, target, weight = text_datum following_context, target, weight = following_datum t_pre = t_model.predict(text_context) f_pre = f_model.predict(following_context) if t_pre == f_pre: trials.append((target, t_pre)) n_emit += 1 n_total += 1 trials = zip(*trials) tester.record(trials[0], trials[1]) print 'accuracy:', tester.accuracy() print 'confusion matrix:' print tester.confusionMatrix() print 'emitted portion:', float(n_emit) / float(n_total)
def getFreqplot(self, dataset):
    """Histogram classifier confidences per label over k-fold test sets.

    Returns a (4, n_buckets) array where entry [label][b] counts test
    evaluations whose score for `label` fell into bucket b.
    """
    n_buckets = 20
    cali_count = np.zeros((4, n_buckets))
    for train, test in data.kFolds(dataset):
        # training
        m = self.trainedModelOn(train)
        for datum in test:
            context, target, weight = datum
            pre = m.eval_all(context)
            for label, score in pre:
                # BUG FIX: a score of exactly 1.0 used to index bucket
                # n_buckets, past the end of the axis (valid buckets are
                # 0..n_buckets-1); clamp to the top bucket instead.
                bucket = min(int(floor(n_buckets * score)), n_buckets - 1)
                cali_count[int(label), bucket] += 1
    return cali_count
def scoreDistribution(): maxent.set_verbose(1) text_dataset = getTextData() for train, test in data.kFolds(text_dataset): model = trainedModelOn(train) for datum in test: context, target, weight = datum pred = model.predict(context) model.eval_all(context) if pred != target: prob = map(itemgetter(1), sorted(model.eval_all(context), key = itemgetter(0))) print prob, target break
def doCrossValidation(self, dataset, size_limit): tester = tests.tester(4) for train, test in data.kFolds(dataset): # training train = random.sample(train, size_limit) m = self.trainedModelOn(train) # prediction trials = [] for datum in test: context, target, weight = datum pre_target = m.predict(context) trials.append((target, pre_target)) trials = zip(*trials) tester.record(trials[0], trials[1]) print size_limit, tester.accuracy()
def seePredictionOnTrainingData(): maxent.set_verbose(1) dataset = getTextData() print 'finished loading dataset' for train, test in data.kFolds(dataset): m = trainedModelOn(train) print "Accuracy on Training Set" for datum in train: context, target, weight = datum print m.eval_all(context) print "Accuracy on Test Set" for datum in test: context, target, weight = datum print m.eval_all(context) break
def regressAgeWithGP(): y = data.loadSVMLightTarget("../feature_set/text.sreg") # we are using pseudo index instead of the real feature set to use the # precomputed kernel. x = range(y.shape[0]) dataset = zip(x, y) random.shuffle(dataset) # this precomputed kernel reads the index value and returns the covariance # matrix for tau in [1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3]: kernel = kernel_factory.precomputed("../feature_set/text.kernel", tau) rms_errors = [] diviations = [] for train, test in data.kFolds(dataset): x_train, y_train = zip(*train) x_train = np.atleast_2d(np.array(x_train)).T y_train = np.atleast_2d(np.array(y_train)).T reg = regress.GPRegressor(kernel) reg.fit(x_train, y_train) # test x_test, y_test = zip(*test) x_test = np.atleast_2d(np.array(x_test)).T y_test = np.atleast_2d(np.array(y_test)).T y_predict = reg.predict(x_test) rms_e = sqrt(np.mean((y_predict - y_test) ** 2)) div_e = np.mean(np.absolute(y_predict - y_test)) rms_errors.append(rms_e) diviations.append(div_e) print "tau:", tau print "rms error:", np.mean(np.array(rms_e)) print "diviation:", np.mean(np.array(div_e))
def doCrossValidation(dataset): tester = tests.tester(4) for train, test in data.kFolds(dataset): # training m = trainedModelOn(train) print 'train size', len(train) # prediction trials = [] for datum in test: context, target, weight = datum pre_target = m.predict(context) trials.append((target, pre_target)) trials = zip(*trials) tester.record(trials[0], trials[1]) print 'accuracy:', tester.accuracy() print 'confusion matrix:' print tester.confusionMatrix()
def simpleEnsemble(pickup): maxent.set_verbose(1) text_dataset = getTextData(False) following_dataset = getFollowingData(False) dataset = zip(text_dataset, following_dataset) random.shuffle(dataset) print 'finished loading dataset' tester = tests.tester(4) for train, test in data.kFolds(dataset): text_train, following_train = zip(*train) # training t_model = trainedModelOn(text_train) f_model = trainedModelOn(following_train) # prediction trials = [] for datum in test: text_datum, following_datum = datum text_context, target, weight = text_datum following_context, target, weight = following_datum t_conf = t_model.eval_all(text_context) f_conf = f_model.eval_all(following_context) pre_target = str(pickup(t_conf, f_conf)) trials.append((target, pre_target)) trials = zip(*trials) tester.record(trials[0], trials[1]) print 'accuracy:', tester.accuracy() print 'confusion matrix:' print tester.confusionMatrix()
def run(self):
    """Write low-confidence misclassified users, with their tweet text,
    to confused.txt for manual inspection."""
    dataset = self.loadData()
    # FIX: use a context manager so the file is closed even if a fold
    # raises mid-way; the explicit open/close pair leaked on error.
    with open('confused.txt', 'w') as f:
        for train, test in data.kFolds(dataset):
            # training
            m = self.trainedModelOn(train)
            dbcon = mlm.DBConnector()
            for datum in test:
                context, target, weight, meta = datum
                pre = m.eval_all(context)
                # take the first (top-ranked) label and its confidence
                label = pre[0][0]
                confi = pre[0][1]
                # keep only low-confidence mistakes
                if .2 <= confi <= .4 and label != target:
                    f.write('###')
                    f.write(' '.join(
                        map(str, [target, label, meta['user_id'],
                                  meta['screen_name'], confi])))
                    f.write('\n')
                    user_id = meta['user_id']
                    text, length = dbcon.loadText(user_id)
                    f.write(text)
                    f.write('\n\n\n')
def classifierEnsemble(): text_dataset = getTextData(False) following_dataset = getFollowingData(False) dataset = zip(text_dataset, following_dataset) random.shuffle(dataset) print 'finished loading dataset' tester = tests.tester(4) def _conf_to_feature(conf1, conf2): def _append_to_key(c): def _append(f): return (c + f[0], f[1]) return _append conf1 = map(_append_to_key('0'), conf1) conf2 = map(_append_to_key('1'), conf2) confs = conf1 confs.extend(conf2) return confs for train, test in data.kFolds(dataset): coffset = int(len(train) * .8) text_train, following_train = zip(*train[:coffset]) # training t_model = trainedModelOn(text_train) f_model = trainedModelOn(following_train) # train a chooser chooser = cmaxent.MaxentModel() chooser.begin_add_event() for datum in train[coffset:]: text_datum, following_datum = datum text_context, target, weight = text_datum following_context, target, weight = following_datum t_conf = t_model.eval_all(text_context) f_conf = f_model.eval_all(following_context) confs = _conf_to_feature(t_conf, f_conf) chooser.add_event(confs, target) chooser.end_add_event(0) chooser.train(50, 'lbfgs', 1e-1, 1e-4) # retrain the underlying classifiers text_train, following_train = zip(*train) t_model = trainedModelOn(text_train) f_model = trainedModelOn(following_train) # prediction trials = [] for datum in test: text_datum, following_datum = datum text_context, target, weight = text_datum following_context, target, weight = following_datum t_conf = t_model.eval_all(text_context) f_conf = f_model.eval_all(following_context) confs = _conf_to_feature(t_conf, f_conf) pre_target = chooser.predict(confs) trials.append((target, pre_target)) trials = zip(*trials) tester.record(trials[0], trials[1]) print 'accuracy:', tester.accuracy() print 'confusion matrix:' print tester.confusionMatrix()