def compare_manual_vs_model(): with open(DATA_FOLDER + "labels_int.p", "r") as f: y_dict = pickle.load(f) print "Loading test data" X_test, y_test, filenames_test = dataset.load_test() y_pred = joblib.load("../models/pred_ml_improved.pkl") relevant = [] for pred, correct, filename in zip(y_pred, y_test, filenames_test): if filename in FILES: relevant.append((pred, correct, filename, CLASSIFICATIONS[filename])) model_predictions, correct, filename, manual_predictions = zip(*relevant) manual_predictions = learn.multilabel_binary_y(manual_predictions) model_predictions = np.array(model_predictions) correct = learn.multilabel_binary_y(correct) rules = infer_topology.infer_topology_rules() improved_manual = infer_topology.apply_topology_rules(rules, manual_predictions) prediction_names = ["MODEL", "MANUAL", "IMPROVED_MANUAL"] predictions = [model_predictions, manual_predictions, improved_manual] for name, pred in zip(prediction_names, predictions): print "\n{}\n--".format(name) print "Zero-one classification loss", zero_one_loss(correct, pred) print "Hamming loss", hamming_loss(correct, pred) print "Precision:", precision_score(correct, pred, average="weighted", labels=label_list) print "Recall :", recall_score(correct, pred, average="weighted", labels=label_list) print "F1 score :", f1_score(correct, pred, average="weighted", labels=label_list)
def compare_manual_vs_model(): with open(DATA_FOLDER + 'labels_int.p', 'r') as f: y_dict = pickle.load(f) print "Loading test data" X_test, y_test, filenames_test = dataset.load_test() y_pred = joblib.load('../models/pred_ml_improved.pkl') relevant = [] for pred, correct, filename in zip(y_pred, y_test, filenames_test): if filename in FILES: relevant.append( (pred, correct, filename, CLASSIFICATIONS[filename])) model_predictions, correct, filename, manual_predictions = zip(*relevant) manual_predictions = learn.multilabel_binary_y(manual_predictions) model_predictions = np.array(model_predictions) correct = learn.multilabel_binary_y(correct) rules = infer_topology.infer_topology_rules() improved_manual = infer_topology.apply_topology_rules( rules, manual_predictions) prediction_names = ["MODEL", "MANUAL", "IMPROVED_MANUAL"] predictions = [model_predictions, manual_predictions, improved_manual] for name, pred in zip(prediction_names, predictions): print "\n{}\n--".format(name) print "Zero-one classification loss", zero_one_loss(correct, pred) print "Hamming loss", hamming_loss(correct, pred) print "Precision:", precision_score(correct, pred, average='weighted', labels=label_list) print "Recall :", recall_score(correct, pred, average='weighted', labels=label_list) print "F1 score :", f1_score(correct, pred, average='weighted', labels=label_list)
def improve_predictions( probability_predictions_file='../models/pred_ml_proba.pkl', out_file='../models/pred_ml_improved.pkl', use_infer_topology=True): print "> IMPROVING PREDICTIONS\n--- Forcing at least one label (most likely)" print "Loading probability predictions" y_pred_proba = joblib.load(probability_predictions_file) #Because we use a one-versus-rest classifier, there may be documents without any labels #We deal with this by adding the most likely labels y_pred_improved = np.zeros(y_pred_proba.shape, dtype=np.int_) print "Converting to binary predictions" y_pred = np.where(y_pred_proba >= 0.5, 1, 0) for i, (prediction, prediction_proba) in enumerate(tqdm(zip(y_pred, y_pred_proba))): if sum(prediction) == 0: most_likely_label_index = np.argmax(prediction_proba) y_pred_improved[i, most_likely_label_index] = 1 y_pred_improved[i] += prediction print np.sum(np.subtract(y_pred_improved, y_pred)), "labels added" if use_infer_topology: print "> IMPROVING PREDICTIONS\n--- Topology rules" print "Loading train set y-values" y_train, filenames_train = dataset.load_train_y() rules = infer_topology.infer_topology_rules(y_train) y_pred_improved = infer_topology.apply_topology_rules( rules, y_pred_improved) print "Saving to file" joblib.dump(y_pred_improved, out_file) print "Done!\n---"
def improve_predictions( probability_predictions_file="../models/pred_ml_proba.pkl", out_file="../models/pred_ml_improved.pkl", use_infer_topology=True, ): print "> IMPROVING PREDICTIONS\n--- Forcing at least one label (most likely)" print "Loading probability predictions" y_pred_proba = joblib.load(probability_predictions_file) # Because we use a one-versus-rest classifier, there may be documents without any labels # We deal with this by adding the most likely labels y_pred_improved = np.zeros(y_pred_proba.shape, dtype=np.int_) print "Converting to binary predictions" y_pred = np.where(y_pred_proba >= 0.5, 1, 0) for i, (prediction, prediction_proba) in enumerate(tqdm(zip(y_pred, y_pred_proba))): if sum(prediction) == 0: most_likely_label_index = np.argmax(prediction_proba) y_pred_improved[i, most_likely_label_index] = 1 y_pred_improved[i] += prediction print np.sum(np.subtract(y_pred_improved, y_pred)), "labels added" if use_infer_topology: print "> IMPROVING PREDICTIONS\n--- Topology rules" print "Loading train set y-values" y_train, filenames_train = dataset.load_train_y() rules = infer_topology.infer_topology_rules(y_train) y_pred_improved = infer_topology.apply_topology_rules(rules, y_pred_improved) print "Saving to file" joblib.dump(y_pred_improved, out_file) print "Done!\n---"