def getTrainedModel(training_set, test_set, n_round):
    '''
    Train a maxent model on training_set and report its confusion matrix
    and accuracy on test_set.

    @param training_set This is a list of pairs, where each pair is (user_id, label)
    @param test_set This is in the same format as training_set
    @param n_round Round index, used only for the log output
    @return model Trained model which expose eval_all() method
    '''
    # {{{
    # TRAINING phase
    m = cmaxent.MaxentModel()
    m.begin_add_event()
    weight = 1.0  # every training instance carries equal weight
    # add event reading the instances one by one; users without a
    # retrievable context are silently skipped
    for user_id, label in training_set:
        context = getContext(user_id)
        if context is None:
            continue
        m.add_event(context, label, weight)
    m.end_add_event(1)
    m.train(100, 'lbfgs', 1e1, 1e-4)  # 100 iterations, L-BFGS, gaussian prior 1e1, tol 1e-4

    # TEST phase
    # rows = true label, columns = predicted label (4 classes)
    confusion_matrix = np.zeros([4, 4])
    for user_id, label in test_set:
        context = getContext(user_id)
        if context is None:
            continue
        predictions = m.eval_all(context)
        # predictions are sorted by score; [0][0] is the top label
        predicted_target = int(predictions[0][0])
        confusion_matrix[int(label)][predicted_target] += 1

    # guard: if every test instance was skipped the matrix sums to zero,
    # which would otherwise divide by zero / produce nan
    total = confusion_matrix.sum()
    accuracy = np.trace(confusion_matrix) / float(total) if total > 0 else 0.0

    # Write the log
    logAndPrint('Round %d; Confusion Matrix' % n_round)
    logAndPrint(str(confusion_matrix))
    logAndPrint('Test Accuracy: %f\n' % accuracy)
    return m
    # }}}
def caliplot():
    '''
    Print a calibration table: for each label and confidence bin, the
    fraction of evaluations in which that label was the top prediction.

    NOTE(review): despite the name, "correct_guess" counts the *predicted*
    top label; the ground-truth target parsed from the file is never
    compared against it — confirm this is the intended calibration measure.
    '''
    model = getLearner()
    test_file = '../../data/semi/train_test_hardlabel/test0'
    confidence_bins = np.zeros([4, BIN_SIZE], dtype=float)
    correct_guess_bins = np.zeros([4, BIN_SIZE], dtype=float)
    with open(test_file) as f:  # with-statement: file was previously leaked
        for line in f:
            user_id, target = line.rstrip('\n').split('\t')
            context = getContext(user_id)
            if context is None:
                continue
            predictions = model.eval_all(context)
            predicted_target = int(predictions[0][0])
            # clamp: a confidence of exactly 1.0 would otherwise index
            # bin BIN_SIZE, one past the end of the array
            win_conf_bin = min(int(predictions[0][1] * BIN_SIZE), BIN_SIZE - 1)
            for label, confidence in predictions:
                conf_bin = min(int(confidence * BIN_SIZE), BIN_SIZE - 1)
                confidence_bins[int(label)][conf_bin] += 1
            correct_guess_bins[predicted_target][win_conf_bin] += 1
    # bins that were never hit divide 0/0 and show up as nan
    print(correct_guess_bins / confidence_bins)
def addInstancesFromFile(model, file_name):
    '''
    Feed every labelled instance from file_name into the model.

    @param model Maxent model currently accepting events (add_event)
    @param file_name Path to a file with one "user_id<TAB>target" per line
    @return model The same model, for call chaining
    '''
    counter = 0  # count the number of instances successfully added
    weight = 1.0  # every instance carries equal weight
    with open(file_name) as f:  # with-statement: file was previously leaked
        for line in f:
            user_id, target = line.rstrip('\n').split('\t')
            context = getContext(user_id)
            if context is None:  # users without a retrievable context are skipped
                continue
            model.add_event(context, target, weight)
            counter += 1
    print("A total of %d instances have been added into the model." % counter)
    return model
def evaluate(model, unlabeled_set, n_round):
    # {{{
    '''
    Run the model over an unlabeled set and group the results by
    predicted label.

    @param model Trained model exposing eval_all()
    @param unlabeled_set Iterable of user_ids to pseudo-label
    @param n_round Round index, used only for the log output
    @return list of 4 lists; entry [label] holds (user_id, score) pairs
            for every user whose top prediction was that label
    '''
    evaluation_results = [[] for x in range(4)]
    print('start to evaluate unlabeled set of size %d.' % len(unlabeled_set))
    count = 1
    for user_id in unlabeled_set:
        context = getContext(user_id)
        if context is None:  # users without a retrievable context are skipped
            continue
        predictions = model.eval_all(context)
        # predictions are sorted by score; [0] is the top (label, score) pair
        predicted_target, score = predictions[0]
        predicted_target = int(predicted_target)
        evaluation_results[predicted_target].append((user_id, score))
        # lightweight progress indicator (counts only evaluated instances)
        if count % 100 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        if count % 1000 == 0:
            print('%d out of %d data evaluated.' % (count, len(unlabeled_set)))
        count += 1

    # let's see the confidence distribution of the pseudo labeled data
    num_bins = 10
    confidence_bins = np.zeros([4, num_bins], dtype=int)
    for label, eval_list in enumerate(evaluation_results):
        for user_id, score in eval_list:
            # clamp: a score of exactly 1.0 would otherwise index bin
            # num_bins, one past the end of the array
            bin_idx = min(int(score * num_bins), num_bins - 1)
            confidence_bins[label][bin_idx] += 1

    # write log
    logAndPrint('Round %d; Confidence distribution' % n_round)
    logAndPrint(str(confidence_bins) + '\n')
    return evaluation_results
    # }}}