예제 #1
0
def improve_predictions(
        probability_predictions_file='../models/pred_ml_proba.pkl',
        out_file='../models/pred_ml_improved.pkl',
        use_infer_topology=True):
    """Post-process probability predictions into improved binary labels.

    Steps:
      1. Load the probability matrix from ``probability_predictions_file``
         with ``joblib.load``.
      2. Threshold it at 0.5 to get binary predictions.
      3. For every row that ends up with no label at all, switch on the
         single most probable label.
      4. If ``use_infer_topology`` is true, infer rules from the train-set
         labels and apply them with ``infer_topology.apply_topology_rules``.
      5. Dump the resulting matrix to ``out_file`` with ``joblib.dump``.

    Args:
        probability_predictions_file: path of the joblib file holding the
            predicted probabilities (indexed as rows x labels).
        out_file: path the improved binary predictions are written to.
        use_infer_topology: whether to run the topology-rule step.

    Returns:
        None. The result is written to ``out_file``; progress is printed.
    """
    # Single-argument print(...) calls behave identically on Python 2 and 3.
    print("> IMPROVING PREDICTIONS\n--- Forcing at least one label (most likely)")
    print("Loading probability predictions")
    y_pred_proba = np.asarray(joblib.load(probability_predictions_file))

    print("Converting to binary predictions")
    y_pred = np.where(y_pred_proba >= 0.5, 1, 0)

    # Because we use a one-versus-rest classifier, there may be documents
    # without any labels. We deal with this by adding the most likely label.
    y_pred_improved = y_pred.astype(np.int_)  # astype() returns a fresh copy
    unlabeled = np.flatnonzero(y_pred.sum(axis=1) == 0)
    # One vectorized assignment: for each unlabeled row, set the column with
    # the highest probability (first maximum on ties, as np.argmax does).
    y_pred_improved[unlabeled, np.argmax(y_pred_proba[unlabeled], axis=1)] = 1

    # Exactly one label is added per previously unlabeled row.
    print("%s labels added" % unlabeled.size)

    if use_infer_topology:
        print("> IMPROVING PREDICTIONS\n--- Topology rules")
        print("Loading train set y-values")
        # Only the label matrix is needed here; the second value is unused.
        y_train, _ = dataset.load_train_y()

        rules = infer_topology.infer_topology_rules(y_train)
        y_pred_improved = infer_topology.apply_topology_rules(
            rules, y_pred_improved)

    print("Saving to file")
    joblib.dump(y_pred_improved, out_file)
    print("Done!\n---")
예제 #2
0
def improve_predictions(
    probability_predictions_file="../models/pred_ml_proba.pkl",
    out_file="../models/pred_ml_improved.pkl",
    use_infer_topology=True,
):
    """Turn stored probability predictions into improved binary predictions.

    The probabilities are loaded with ``joblib.load``, thresholded at 0.5,
    and every row left without any label gets its single most probable label
    switched on. When ``use_infer_topology`` is true, rules inferred from the
    train-set labels are then applied through
    ``infer_topology.apply_topology_rules``. The final matrix is written to
    ``out_file`` with ``joblib.dump``.

    Args:
        probability_predictions_file: path of the joblib file holding the
            predicted probabilities (indexed as rows x labels).
        out_file: path the improved binary predictions are written to.
        use_infer_topology: whether to run the topology-rule step.

    Returns:
        None. The result is written to ``out_file``; progress is printed.
    """
    # Single-argument print(...) calls behave identically on Python 2 and 3.
    print("> IMPROVING PREDICTIONS\n--- Forcing at least one label (most likely)")
    print("Loading probability predictions")
    y_pred_proba = np.asarray(joblib.load(probability_predictions_file))

    print("Converting to binary predictions")
    y_pred = np.where(y_pred_proba >= 0.5, 1, 0)

    # Because we use a one-versus-rest classifier, there may be documents
    # without any labels. We deal with this by adding the most likely label.
    y_pred_improved = y_pred.astype(np.int_)  # astype() returns a fresh copy
    rows_without_label = np.flatnonzero(y_pred.sum(axis=1) == 0)
    best_label = np.argmax(y_pred_proba[rows_without_label], axis=1)
    # Paired fancy indexing: sets (row, best label of that row) for each row.
    y_pred_improved[rows_without_label, best_label] = 1

    # Exactly one label is added per previously unlabeled row.
    print("%s labels added" % rows_without_label.size)

    if use_infer_topology:
        print("> IMPROVING PREDICTIONS\n--- Topology rules")
        print("Loading train set y-values")
        # Only the label matrix is needed here; the second value is unused.
        y_train, _ = dataset.load_train_y()

        rules = infer_topology.infer_topology_rules(y_train)
        y_pred_improved = infer_topology.apply_topology_rules(rules, y_pred_improved)

    print("Saving to file")
    joblib.dump(y_pred_improved, out_file)
    print("Done!\n---")
예제 #3
0
    print np.sum(np.subtract(predictions_improved,predictions)), "labels added"
    return predictions_improved

def print_topology(rules):
    """Print the parent/child structure described by ``rules``.

    ``rules`` is a re-iterable collection of ``(from_label, to_label)`` index
    pairs (it is traversed twice). Every distinct ``to_label`` is printed as
    a parent line, followed by one ``  > child`` line for each ``from_label``
    that points at it. Indices are turned into display names through the
    sequence returned by ``dataset.load_labels()``.

    Returns:
        None. Output goes to stdout.
    """
    label_list = dataset.load_labels()

    # Every rule target becomes a parent node with an initially empty
    # child list.
    parents = {to_label for _, to_label in rules}
    tree = {parent: [] for parent in parents}

    for from_label, to_label in rules:
        tree[to_label].append(from_label)

    # dict.items() exists on both Python 2 and 3 (iteritems() is 2-only).
    for parent, children in tree.items():
        print(label_list[parent])
        for child in children:
            # Wrapped in a 1-tuple so a tuple-valued label formats safely.
            print("  > %s" % (label_list[child],))

if __name__ == "__main__":
    # Script entry point: print the topology inferred from two label sources.

    # Pickle data is a byte stream, so the file must be opened in binary
    # mode: text mode fails on Python 3 and can corrupt binary-protocol
    # pickles on Windows.
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use this on a trusted, locally produced labels file.
    with open(DATA_FOLDER + 'labels_int.p', 'rb') as f:
        y_dict = pickle.load(f)

    # list(...) hands the callee a real list on Python 3 as well, which is
    # what dict.values() already returned on Python 2.
    rules = infer_topology_rules(list(y_dict.values()), verbose=False)
    print_topology(rules)

    # Same inference, this time on the train-set label matrix.
    y, _ = dataset.load_train_y()
    rules = infer_topology_rules(y, verbose=False)
    print_topology(rules)