def test_learner_on_data():
    """Fit a RegExLearner for one code on a shuffled 90/10 split and print metrics.

    Documents come from ``GwData`` and are tokenized with ``WordTokenizer``
    (stemming, spelling correction and stop-word removal all switched off).
    The first 90% of the shuffled instances are used for fitting (TD), the
    remaining 10% for validation (VD). Recall / precision / F1 are printed
    for both partitions along with the learnt rules. Returns nothing.
    """
    import GwData
    import WordTokenizer
    import numpy as np

    MINIMUM_COVERAGE_PCT = 2.0
    code = "53"

    # Single-argument parenthesised prints behave the same under the print
    # statement (Python 2) and the print function (Python 3).
    print("Learning rules for code: " + code)
    # '%%' is how you emit a literal '%' when using %-formatting
    print("Minimum coverage: %d%%\n" % (MINIMUM_COVERAGE_PCT))

    data = GwData.GwData()
    xs = WordTokenizer.tokenize(data.documents,
                                stem=False,
                                spelling_correct=False,
                                remove_stop_words=False,
                                min_word_count=1)
    ys = data.labels_for(code)

    def rule_score_fn(act_ys, predicted):
        # Alternative rule score (recall * sqrt(precision)).
        # NOTE(review): defined but not passed to the learner below.
        r, p, f1 = rpf1(act_ys, predicted)
        return r * (p ** 0.5)

    # Shuffle a single index array so xs and ys stay aligned.
    shuffled_ixs = np.arange(len(xs))
    np.random.shuffle(shuffled_ixs)

    shuffled_xs = np.array(xs)[shuffled_ixs]
    shuffled_ys = np.array(ys)[shuffled_ixs]

    td_size = int(len(xs) * 0.9)
    td_xs, td_ys = shuffled_xs[0:td_size], shuffled_ys[0:td_size]
    vd_xs, vd_ys = shuffled_xs[td_size:], shuffled_ys[td_size:]
    assert len(td_xs) + len(vd_xs) == len(xs), "|TD| + |VD| == |D|"

    learner = RegExLearner(precision, f1_score, MINIMUM_COVERAGE_PCT)
    learner.fit(td_xs, td_ys)

    print_positives(xs, ys)
    print(str(learner))

    # TD Performance
    td_pred = learner.predict(td_xs)
    r, p, f1 = rpf1(td_ys, td_pred)
    print("TD:\n\tRecall: {0}\n\tPrecision: {1}\n\tF1: {2}\n".format(r, p, f1))

    # VD performance
    vd_pred = learner.predict(vd_xs)
    r, p, f1 = rpf1(vd_ys, vd_pred)
    print("VD:\n\tRecall: {0}\n\tPrecision: {1}\n\tF1: {2}\n".format(r, p, f1))
def test_learner_on_data():
    """Train and evaluate a RegExLearner for code "53" using a 90/10 hold-out.

    Steps: load the corpus through ``GwData``, tokenize it with
    ``WordTokenizer`` (no stemming / spell-correction / stop-word removal),
    shuffle, fit on the first 90% and then print recall, precision and F1
    for the training portion (TD) and the held-out portion (VD).
    The function only prints; it has no return value.
    """
    import GwData
    import WordTokenizer
    import numpy as np

    MINIMUM_COVERAGE_PCT = 2.0
    code = "53"
    report = "{0}:\n\tRecall: {1}\n\tPrecision: {2}\n\tF1: {3}\n"

    # print(<one expression>) is valid with both the Python 2 print
    # statement and the Python 3 print function.
    print("Learning rules for code: " + code)
    # '%%' renders a literal percent sign under %-formatting
    print("Minimum coverage: %d%%\n" % (MINIMUM_COVERAGE_PCT))

    data = GwData.GwData()
    xs = WordTokenizer.tokenize(
        data.documents,
        stem=False,
        spelling_correct=False,
        remove_stop_words=False,
        min_word_count=1,
    )
    ys = data.labels_for(code)

    def rule_score_fn(act_ys, predicted):
        # Recall weighted by the square root of precision.
        # NOTE(review): not currently handed to RegExLearner.
        r, p, f1 = rpf1(act_ys, predicted)
        return r * (p ** 0.5)

    # One permutation indexes both arrays so labels follow their documents.
    shuffled_ixs = np.arange(len(xs))
    np.random.shuffle(shuffled_ixs)
    shuffled_xs = np.array(xs)[shuffled_ixs]
    shuffled_ys = np.array(ys)[shuffled_ixs]

    td_size = int(len(xs) * 0.9)
    td_xs, vd_xs = shuffled_xs[0:td_size], shuffled_xs[td_size:]
    td_ys, vd_ys = shuffled_ys[0:td_size], shuffled_ys[td_size:]
    assert len(td_xs) + len(vd_xs) == len(xs), "|TD| + |VD| == |D|"

    learner = RegExLearner(precision, f1_score, MINIMUM_COVERAGE_PCT)
    learner.fit(td_xs, td_ys)

    print_positives(xs, ys)
    print(str(learner))

    # TD Performance
    r, p, f1 = rpf1(td_ys, learner.predict(td_xs))
    print(report.format("TD", r, p, f1))

    # VD performance
    r, p, f1 = rpf1(vd_ys, learner.predict(vd_xs))
    print(report.format("VD", r, p, f1))
def test_learner_on_data():
    """Fit a RegExLearner for code "50" on the whole corpus and print TD metrics.

    The learner is trained and evaluated on the same data (no hold-out), so
    the printed recall / precision / F1 describe training performance only.
    """
    import GwData
    import WordTokenizer

    target_code = "50"
    corpus = GwData.GwData()
    xs = WordTokenizer.tokenize(corpus.documents, spelling_correct=False)
    ys = corpus.labels_for(target_code)

    def rule_score_fn(act_ys, predicted):
        # Precision scaled by sqrt(recall); defined here but not passed to
        # the learner constructed below.
        prec = precision(act_ys, predicted)
        rec = recall(act_ys, predicted)
        return prec * (rec ** 0.5)

    learner = RegExLearner(precision, f1_score, 2.5)
    learner.fit(xs, ys)
    predictions = learner.predict(xs)

    # TD Performance
    print_positives(xs, ys)
    rec, prec, f1 = rpf1(ys, predictions)
    summary = "TD:\n\tRecall: {0}\n\tPrecision: {1}\n\tF1: {2}\n".format(rec, prec, f1)
    print(summary)
    print(str(learner))
def test(epochs=1):
    """Train the module-level ``model`` and print per-tag test metrics.

    Predictions are thresholded at a fixed 0.5 for every tag in ``ix2tag``.
    The validation-tuned cutoff path is currently disabled (kept below as
    commented-out code), so no tuned-cutoff F1 scores are collected.

    Args:
        epochs: number of training epochs passed to ``model.fit``.

    Returns:
        A pair ``(mean_tuned_f1, mean_5050_f1)``. While the tuned path is
        disabled the first element is NaN; it is produced explicitly rather
        than via ``np.mean([])``, which emits a RuntimeWarning.
    """
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=epochs,
              validation_split=0.0, show_accuracy=True, verbose=1)

    #valid_probs = model.predict_proba(X_valid, batch_size=batch_size)
    test_probs = model.predict_proba(X_test, batch_size=batch_size)

    test_f1s = []
    test_f1s_50 = []
    cutoff = 0  # only reported in the output; the tuned cutoff is disabled

    for ix, tag in ix2tag.items():
        #valid_tag_predictions = valid_probs[:, ix]
        test_tag_predictions = test_probs[:, ix]

        #valid_tag_ys = y_valid[:, ix]
        test_tag_ys = y_test[:, ix]

        #r_v, p_v, f1_v, cutoff = find_cutoff(valid_tag_ys, valid_tag_predictions)
        #valid_f1s.append(f1_v)

        #test_classes = [1 if p >= cutoff else 0 for p in test_tag_predictions]
        test_classes_5050 = [1 if p >= 0.5 else 0 for p in test_tag_predictions]

        #r, p, f1 = rpf1(test_tag_ys, test_classes)
        r50, p50, f150 = rpf1(test_tag_ys, test_classes_5050)

        #print("VALIDATION:", tag.ljust(35), str(sum(valid_tag_ys)).ljust(3), "recall", rnd(r_v), "precision", rnd(p_v), "f1", rnd(f1_v), "cutoff", rnd(cutoff))
        #print("TEST :", tag.ljust(35), str(sum(test_tag_ys)).ljust(3), "recall", rnd(r), "precision", rnd(p), "f1", rnd(f1), "cutoff", rnd(cutoff))
        print("TEST 50/50:", tag.ljust(35), str(sum(test_tag_ys)).ljust(3),
              "recall", rnd(r50), "precision", rnd(p50), "f1", rnd(f150),
              "cutoff", rnd(cutoff))

        #test_f1s.append(f1)
        test_f1s_50.append(f150)

    # Guard against empty lists: np.mean([]) warns and yields NaN.
    nan = float("nan")
    mean_tuned = np.mean(test_f1s) if test_f1s else nan
    mean_5050 = np.mean(test_f1s_50) if test_f1s_50 else nan

    #print("MEAN VALID F1 : " + str(np.mean(valid_f1s)))
    #print("MEAN TEST F1 : " + str(np.mean(test_f1s)))
    print("MEAN TEST F1 50/50 : " + str(mean_5050))
    return mean_tuned, mean_5050
def test(epochs=1):
    """Train the multi-input ``model`` on shuffled data and report per-tag F1.

    The shuffled training matrix is repeated once per entry in
    ``ngram_filters`` to build the model's input list. Test predictions come
    from the module-level ``concat_X_test`` and are thresholded at 0.5.

    Args:
        epochs: number of training epochs passed to ``model.fit``.

    Returns:
        Mean F1 across all tags in ``ix2tag``.
    """
    # random.shuffle needs a mutable sequence; on Python 3 a bare range()
    # object raises TypeError, so materialise the indices first.
    ixs = list(range(len(X_train)))
    random.shuffle(ixs)
    x_shf = X_train[ixs]
    y_shf = y_train[ixs]

    # The same shuffled array is fed to every n-gram branch.
    concat_X_train = [x_shf] * len(ngram_filters)

    model.fit(concat_X_train, y_shf, batch_size=batch_size, nb_epoch=epochs,
              validation_split=0.0, show_accuracy=True, verbose=1)

    predictions = model.predict_proba(concat_X_test)
    print("Xp shape:", predictions.shape)

    f1s = []
    for ix, tag in ix2tag.items():
        tag_predictions = [1 if p >= 0.5 else 0 for p in predictions[:, ix]]
        tag_ys = y_test[:, ix]
        r, p, f1 = rpf1(tag_ys, tag_predictions)
        count = sum(tag_ys)
        print(tag.ljust(10), str(count).rjust(4), "recall", rnd(r),
              "precision", rnd(p), "f1", rnd(f1))
        f1s.append(f1)

    mean_f1 = np.mean(f1s)
    print("MEAN F1: " + str(mean_f1))
    return mean_f1
def test(epochs=1):
    """Fit ``model`` on a shuffled copy of the training set and score each tag.

    One copy of the shuffled inputs is supplied per n-gram filter (the model
    takes a list of inputs). Predicted probabilities for ``concat_X_test``
    are binarised at 0.5 and compared against ``y_test`` column by column.

    Args:
        epochs: number of training epochs passed to ``model.fit``.

    Returns:
        The mean of the per-tag F1 scores.
    """
    # list(...) keeps this working on Python 3, where random.shuffle cannot
    # shuffle an immutable range object.
    ixs = list(range(len(X_train)))
    random.shuffle(ixs)
    x_shf, y_shf = X_train[ixs], y_train[ixs]

    concat_X_train = [x_shf for _ in range(len(ngram_filters))]

    model.fit(
        concat_X_train,
        y_shf,
        batch_size=batch_size,
        nb_epoch=epochs,
        validation_split=0.0,
        show_accuracy=True,
        verbose=1,
    )

    predictions = model.predict_proba(concat_X_test)
    print("Xp shape:", predictions.shape)

    f1s = []
    for ix, tag in ix2tag.items():
        tag_ys = y_test[:, ix]
        tag_predictions = [1 if prob >= 0.5 else 0 for prob in predictions[:, ix]]
        r, p, f1 = rpf1(tag_ys, tag_predictions)
        count = sum(tag_ys)
        print(tag.ljust(10), str(count).rjust(4), "recall", rnd(r),
              "precision", rnd(p), "f1", rnd(f1))
        f1s.append(f1)

    mean_f1 = np.mean(f1s)
    print("MEAN F1: " + str(mean_f1))
    return mean_f1
def test(epochs=1):
    """Train the dict-input ``model`` on shuffled data and report per-tag F1.

    Args:
        epochs: number of training epochs passed to ``model.fit``.

    Returns:
        Mean F1 across all tags in ``ix2tag``, using a 0.5 threshold.
    """
    # list(...) is required on Python 3: random.shuffle cannot shuffle a range.
    ixs = list(range(len(X_train)))
    random.shuffle(ixs)
    x_shf = X_train[ixs]
    y_shf = y_train[ixs]

    # Train on the shuffled arrays. Previously they were computed and then
    # ignored in favour of the unshuffled X_train / y_train.
    model.fit({"input": x_shf, "output": y_shf}, nb_epoch=epochs)  #64 seems good for now

    predictions = model.predict({"input": X_test, "output": y_test})["output"]
    print("Xp shape:", predictions.shape)

    f1s = []
    for ix, tag in ix2tag.items():
        tag_predictions = [1 if p >= 0.5 else 0 for p in predictions[:, ix]]
        tag_ys = y_test[:, ix]
        r, p, f1 = rpf1(tag_ys, tag_predictions)
        count = sum(tag_ys)
        print(tag.ljust(10), str(count).rjust(4), "recall", rnd(r),
              "precision", rnd(p), "f1", rnd(f1))
        f1s.append(f1)

    mean_f1 = np.mean(f1s)
    print("MEAN F1: " + str(mean_f1))
    return mean_f1
def test_learner():
    """Smoke-test RegExLearner on a tiny hand-written token data set.

    Eleven token sequences (four positive, seven negative) are used both for
    fitting and for prediction; the resulting training recall / precision /
    F1 and the learnt rules are printed.
    """
    labelled = [
        (["a", "b", "c", "d"], 1),
        (["a", "b", "d", "c"], 1),
        (["c", "a", "b"], 1),
        (["a", "b"], 1),

        (["a", "b", "e"], 0),
        (["c", "b", "a"], 0),
        (["a", "c"], 0),
        (["a", "c"], 0),
        (["b", "c"], 0),
        (["b", "d"], 0),
        (["d"], 0),
    ]
    # Split the pairs into parallel tuples of token lists and labels.
    xs = tuple(tokens for tokens, _ in labelled)
    ys = tuple(label for _, label in labelled)

    learner = RegExLearner(precision, f1_score, 2.0)
    learner.fit(xs, ys)
    predicted = learner.predict(xs)

    print_positives(xs, ys)
    rec, prec, f1 = rpf1(ys, predicted)
    print("TD:\n\tRecall: {0}\n\tPrecision: {1}\n\tF1: {2}\n".format(rec, prec, f1))
    print(str(learner))
def test(epochs = 1):
    """Train ``model`` for ``epochs`` epochs and return binary F1 on the test set.

    Predicted probabilities are flattened and thresholded at 0.5 before
    recall, precision and F1 are computed with ``rpf1`` and printed.
    """
    _history = model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        nb_epoch=epochs,
        validation_split=0.0,
        show_accuracy=True,
        verbose=1,
    )

    raw_probs = model.predict_proba(X_test, batch_size=batch_size)

    # Binarise each probability at the 0.5 decision boundary.
    y_pred = []
    for prob in flatten(raw_probs):
        if prob >= 0.5:
            y_pred.append(1)
        else:
            y_pred.append(0)

    recall_, precision_, f1 = rpf1(y_test, y_pred)
    print("recall", recall_, "precision", precision_, "f1", f1)
    return f1
def test_learner():
    """Exercise RegExLearner end-to-end on a small synthetic data set.

    The learner is fitted on, and then asked to predict, the same eleven
    token sequences. Training metrics and the learner's string form are
    printed; nothing is returned.
    """
    positives = [
        ["a", "b", "c", "d"],
        ["a", "b", "d", "c"],
        ["c", "a", "b"],
        ["a", "b"],
    ]
    negatives = [
        ["a", "b", "e"],
        ["c", "b", "a"],
        ["a", "c"],
        ["a", "c"],
        ["b", "c"],
        ["b", "d"],
        ["d"],
    ]
    # Positives first, then negatives, as parallel tuples.
    xs = tuple(positives + negatives)
    ys = tuple([1] * len(positives) + [0] * len(negatives))

    learner = RegExLearner(precision, f1_score, 2.0)
    learner.fit(xs, ys)
    pred = learner.predict(xs)

    print_positives(xs, ys)
    scores = rpf1(ys, pred)
    r, p, f1 = scores
    template = "TD:\n\tRecall: {0}\n\tPrecision: {1}\n\tF1: {2}\n"
    print(template.format(r, p, f1))
    print(str(learner))
def find_cutoff(y_test, predictions):
    """Scan 20 evenly spaced thresholds and pick the one with the best F1.

    The scan range is bounded by the *rounded* minimum and maximum of
    ``predictions``; for probability-like scores this typically means the
    thresholds 0.05, 0.10, ..., 1.0. Ties are resolved in favour of the
    highest threshold.

    Args:
        y_test: true binary labels.
        predictions: real-valued scores, one per label.

    Returns:
        ``(recall, precision, f1, cutoff)`` evaluated at the chosen cutoff.
    """
    scale = 20.0
    min_val = round(min(predictions))
    max_val = round(max(predictions))
    inc = (max_val - min_val) / scale

    cutoff = -1
    best = -1
    for i in range(1, int(scale) + 1):
        # Offset by min_val so the scan covers [min_val, max_val]. The
        # previous `inc * i` was only correct when min_val happened to be 0.
        val = min_val + inc * i
        classes = [1 if score >= val else 0 for score in predictions]
        r, p, f1 = rpf1(y_test, classes)
        if f1 >= best:
            cutoff = val
            best = f1

    # Re-evaluate at the winning cutoff to return its metrics.
    classes = [1 if score >= cutoff else 0 for score in predictions]
    r, p, f1 = rpf1(y_test, classes)
    return r, p, f1, cutoff
def test(epochs=1):
    """Train ``model`` and return F1 of its predicted classes on the test set.

    Class predictions are taken directly from ``model.predict_classes``,
    flattened, and scored against ``y_test`` with ``rpf1``.
    """
    fit_kwargs = dict(
        batch_size=batch_size,
        nb_epoch=epochs,
        validation_split=0.0,
        show_accuracy=True,
        verbose=1,
    )
    _history = model.fit(X_train, y_train, **fit_kwargs)

    predicted = model.predict_classes(X_test, batch_size=batch_size)
    classes = flatten(predicted)

    recall_, precision_, f1 = rpf1(y_test, classes)
    print("recall", recall_, "precision", precision_, "f1", f1)
    return f1
def test(epochs=1):
    """Fit ``model`` and print recall / precision / F1 per tag at a 0.5 cutoff.

    The alternative path that tunes a per-tag cutoff on validation data is
    switched off; its code is retained below as comments.

    Args:
        epochs: number of training epochs passed to ``model.fit``.

    Returns:
        ``(mean_tuned_f1, mean_5050_f1)``. With the tuned path disabled, no
        tuned scores are collected and the first element is NaN. The NaN is
        returned explicitly instead of calling ``np.mean`` on an empty list,
        which would also raise a RuntimeWarning.
    """
    model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        nb_epoch=epochs,
        validation_split=0.0,
        show_accuracy=True,
        verbose=1,
    )

    #valid_probs = model.predict_proba(X_valid, batch_size=batch_size)
    test_probs = model.predict_proba(X_test, batch_size=batch_size)

    test_f1s = []      # stays empty while the tuned-cutoff path is disabled
    test_f1s_50 = []
    cutoff = 0

    for ix, tag in ix2tag.items():
        #valid_tag_predictions = valid_probs[:, ix]
        #valid_tag_ys = y_valid[:, ix]
        test_tag_predictions = test_probs[:, ix]
        test_tag_ys = y_test[:, ix]

        #r_v, p_v, f1_v, cutoff = find_cutoff(valid_tag_ys, valid_tag_predictions)
        #valid_f1s.append(f1_v)
        #test_classes = [1 if p >= cutoff else 0 for p in test_tag_predictions]
        #r, p, f1 = rpf1(test_tag_ys, test_classes)

        test_classes_5050 = [1 if prob >= 0.5 else 0 for prob in test_tag_predictions]
        r50, p50, f150 = rpf1(test_tag_ys, test_classes_5050)

        #print("VALIDATION:", tag.ljust(35), str(sum(valid_tag_ys)).ljust(3), "recall", rnd(r_v), "precision", rnd(p_v), "f1", rnd(f1_v), "cutoff", rnd(cutoff))
        #print("TEST :", tag.ljust(35), str(sum(test_tag_ys)).ljust(3), "recall", rnd(r), "precision", rnd(p), "f1", rnd(f1), "cutoff", rnd(cutoff))
        print("TEST 50/50:", tag.ljust(35), str(sum(test_tag_ys)).ljust(3),
              "recall", rnd(r50), "precision", rnd(p50), "f1", rnd(f150),
              "cutoff", rnd(cutoff))

        #test_f1s.append(f1)
        test_f1s_50.append(f150)

    def _mean_or_nan(values):
        # np.mean([]) warns ("Mean of empty slice") before returning NaN.
        return np.mean(values) if values else float("nan")

    mean_5050 = _mean_or_nan(test_f1s_50)

    #print("MEAN VALID F1 : " + str(np.mean(valid_f1s)))
    #print("MEAN TEST F1 : " + str(np.mean(test_f1s)))
    print("MEAN TEST F1 50/50 : " + str(mean_5050))
    return _mean_or_nan(test_f1s), mean_5050
def test(epochs=1):
    """Fit the dict-input ``model`` on shuffled training data and score each tag.

    Args:
        epochs: number of training epochs passed to ``model.fit``.

    Returns:
        The mean per-tag F1 on the test set, with predictions cut at 0.5.
    """
    # Materialise the indices: random.shuffle needs a mutable sequence, and
    # range() is immutable on Python 3.
    ixs = list(range(len(X_train)))
    random.shuffle(ixs)
    x_shf, y_shf = X_train[ixs], y_train[ixs]

    # Feed the shuffled copies. The original computed them but then passed
    # the unshuffled X_train / y_train, leaving the shuffle without effect.
    train_data = {"input": x_shf, "output": y_shf}
    model.fit(train_data, nb_epoch=epochs)  #64 seems good for now

    predictions = model.predict({"input": X_test, "output": y_test})["output"]
    print("Xp shape:", predictions.shape)

    f1s = []
    for ix, tag in ix2tag.items():
        tag_ys = y_test[:, ix]
        tag_predictions = [1 if prob >= 0.5 else 0 for prob in predictions[:, ix]]
        r, p, f1 = rpf1(tag_ys, tag_predictions)
        count = sum(tag_ys)
        print(tag.ljust(10), str(count).rjust(4), "recall", rnd(r),
              "precision", rnd(p), "f1", rnd(f1))
        f1s.append(f1)

    mean_f1 = np.mean(f1s)
    print("MEAN F1: " + str(mean_f1))
    return mean_f1
def test_learner_on_data():
    """Learn rules for code "50" over the full GwData corpus and print results.

    Training and prediction use the same instances, so the reported numbers
    are training-data (TD) performance.
    """
    import GwData
    import WordTokenizer

    code = "50"
    gw = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(gw.documents, spelling_correct=False)
    labels = gw.labels_for(code)

    def rule_score_fn(act_ys, predicted):
        # precision * sqrt(recall). This helper is declared but the learner
        # below is built with `precision` and `f1_score` instead.
        return precision(act_ys, predicted) * (recall(act_ys, predicted) ** 0.5)

    learner = RegExLearner(precision, f1_score, 2.5)
    learner.fit(tokenized_docs, labels)
    pred = learner.predict(tokenized_docs)

    # TD Performance
    print_positives(tokenized_docs, labels)
    metrics = rpf1(labels, pred)
    r, p, f1 = metrics
    print("TD:\n\tRecall: {0}\n\tPrecision: {1}\n\tF1: {2}\n".format(r, p, f1))
    print(str(learner))
def rule_score_fn(act_ys, predicted):
    """Score predictions as recall multiplied by the square root of precision."""
    rec, prec, _f1 = rpf1(act_ys, predicted)
    sqrt_prec = prec ** 0.5
    return rec * sqrt_prec
def test(epochs = 1):
    """Run ``epochs`` training epochs, then print and return test-set F1.

    Uses the model's own class predictions (``predict_classes``), flattened
    before being compared with ``y_test``.
    """
    _fit_result = model.fit(X_train, y_train,
                            batch_size=batch_size,
                            nb_epoch=epochs,
                            validation_split=0.0,
                            show_accuracy=True,
                            verbose=1)

    class_matrix = model.predict_classes(X_test, batch_size=batch_size)
    classes = flatten(class_matrix)

    scores = rpf1(y_test, classes)
    r, p, f1 = scores
    print("recall", r, "precision", p, "f1", f1)
    return f1
def rule_score_fn(act_ys, predicted):
    """Return recall * precision**0.5 for the given labels and predictions.

    The F1 value returned by ``rpf1`` is not used.
    """
    metrics = rpf1(act_ys, predicted)
    recall_value, precision_value, _ = metrics
    return recall_value * (precision_value ** 0.5)
def score_fn(expected, actual):
    """Return 1 - F1 for the predictions, so that lower values are better."""
    _recall, _precision, f1_value = rpf1(expected, actual)
    loss = 1.0 - f1_value
    return loss