def improve_predictions(
        probability_predictions_file='../models/pred_ml_proba.pkl',
        out_file='../models/pred_ml_improved.pkl',
        use_infer_topology=True):
    """Post-process saved per-label probabilities into binary predictions.

    Thresholds the probability matrix at 0.5, then guarantees every row
    (document) has at least one positive label by promoting the single
    most likely label for rows that would otherwise be all zeros.
    Optionally applies topology rules inferred from the training labels,
    and pickles the result to `out_file`.

    probability_predictions_file -- joblib pickle of an (n_docs, n_labels)
        probability array (presumably floats in [0, 1] -- TODO confirm
        against the model that wrote it).
    out_file -- destination for the improved binary prediction matrix.
    use_infer_topology -- when True, also apply inferred topology rules.
    """
    print "> IMPROVING PREDICTIONS\n--- Forcing at least one label (most likely)"
    print "Loading probability predictions"
    y_pred_proba = joblib.load(probability_predictions_file)
    # Because we use a one-versus-rest classifier, there may be documents
    # without any labels.  We deal with this by adding the most likely labels.
    y_pred_improved = np.zeros(y_pred_proba.shape, dtype=np.int_)
    print "Converting to binary predictions"
    y_pred = np.where(y_pred_proba >= 0.5, 1, 0)
    for i, (prediction, prediction_proba) in enumerate(tqdm(zip(y_pred, y_pred_proba))):
        if sum(prediction) == 0:
            # No label cleared the threshold: force the single most likely one.
            most_likely_label_index = np.argmax(prediction_proba)
            y_pred_improved[i, most_likely_label_index] = 1
        # Copy the thresholded row on top of any forced label.
        y_pred_improved[i] += prediction
    # Row sums only ever grow, so this counts the forced labels.
    print np.sum(np.subtract(y_pred_improved, y_pred)), "labels added"
    if use_infer_topology:
        print "> IMPROVING PREDICTIONS\n--- Topology rules"
        print "Loading train set y-values"
        y_train, filenames_train = dataset.load_train_y()
        rules = infer_topology.infer_topology_rules(y_train)
        y_pred_improved = infer_topology.apply_topology_rules(
            rules, y_pred_improved)
    print "Saving to file"
    joblib.dump(y_pred_improved, out_file)
    print "Done!\n---"
def improve_predictions( probability_predictions_file="../models/pred_ml_proba.pkl", out_file="../models/pred_ml_improved.pkl", use_infer_topology=True, ): print "> IMPROVING PREDICTIONS\n--- Forcing at least one label (most likely)" print "Loading probability predictions" y_pred_proba = joblib.load(probability_predictions_file) # Because we use a one-versus-rest classifier, there may be documents without any labels # We deal with this by adding the most likely labels y_pred_improved = np.zeros(y_pred_proba.shape, dtype=np.int_) print "Converting to binary predictions" y_pred = np.where(y_pred_proba >= 0.5, 1, 0) for i, (prediction, prediction_proba) in enumerate(tqdm(zip(y_pred, y_pred_proba))): if sum(prediction) == 0: most_likely_label_index = np.argmax(prediction_proba) y_pred_improved[i, most_likely_label_index] = 1 y_pred_improved[i] += prediction print np.sum(np.subtract(y_pred_improved, y_pred)), "labels added" if use_infer_topology: print "> IMPROVING PREDICTIONS\n--- Topology rules" print "Loading train set y-values" y_train, filenames_train = dataset.load_train_y() rules = infer_topology.infer_topology_rules(y_train) y_pred_improved = infer_topology.apply_topology_rules(rules, y_pred_improved) print "Saving to file" joblib.dump(y_pred_improved, out_file) print "Done!\n---"
# NOTE(review): the two lines below are the tail of a function whose `def`
# line lies above this chunk and is not visible here -- presumably the
# rule-application routine; it reports how many labels were added and
# returns the corrected matrix.  Confirm against the full file.
    print np.sum(np.subtract(predictions_improved,predictions)), "labels added"
    return predictions_improved


def print_topology(rules):
    """Pretty-print topology rules as a parent -> children tree.

    rules -- iterable of (from_label, to_label) index pairs; the `to`
        side is treated as the parent node.  Label indices are resolved
        to names via dataset.load_labels().
    """
    label_list = dataset.load_labels()
    # Collect the distinct parents, then group each rule under its parent.
    root_nodes = set([y for x,y in rules])
    tree = {root:[] for root in root_nodes}
    for rule in rules:
        from_label, to_label = rule
        tree[to_label].append(from_label)
    for parent, children in tree.iteritems():  # Python 2 dict iteration
        print label_list[parent]
        for child in children:
            print " >", label_list[child]


if __name__ == "__main__":
    # Infer and display the label topology twice: first from the pickled
    # label dictionary on disk, then from the train-set y values.
    with open(DATA_FOLDER+'labels_int.p', 'r') as f:
        y_dict = pickle.load(f)
    rules = infer_topology_rules(y_dict.values(), verbose=False)
    print_topology(rules)
    y, _ = dataset.load_train_y()
    rules = infer_topology_rules(y, verbose=False)
    print_topology(rules)