def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Simulate protobowl games for one fold and pickle the results.

    Fetches the buzzes for ``model`` via ``get_buzzes``, loads the pickled
    RnnGuesser guesses for ``fold``, and calls ``simulate_game`` once per
    question in the fold. The per-question results are concatenated into a
    DataFrame with ``Possibility`` and ``Outcome`` columns, which is pickled
    to ``<model.model_dir>/<fold>_protobowl.pkl``.

    Args:
        model: Object exposing ``model_dir``; also passed to ``get_buzzes``.
        fold: Name of the fold to evaluate.
    """
    buzzes = get_buzzes(model, fold)

    # eval
    guesser_dir = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guess_path = AbstractGuesser.guess_path(guesser_dir, fold, 'char')
    with open(guess_path, 'rb') as guess_file:
        guesses_by_question = pickle.load(guess_file).groupby('qanta_id')

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]

    # NOTE(review): load_protobowl() is treated as returning a single
    # DataFrame here, while main() in this file unpacks a 2-tuple from it.
    # Confirm which signature is current.
    records_by_question = load_protobowl().groupby('qid')

    possibility = []
    outcome = []
    for question in tqdm(fold_questions):
        pos, out = simulate_game(
            guesses_by_question, buzzes, records_by_question, question)
        possibility.extend(pos)
        outcome.extend(out)

    result_df = pd.DataFrame({
        'Possibility': possibility,
        'Outcome': outcome,
    })

    result_path = os.path.join(
        model.model_dir, '{}_protobowl.pkl'.format(fold))
    with open(result_path, 'wb') as result_file:
        pickle.dump(result_df, result_file)
def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Run ``simulate_game`` over every question of ``fold`` and save a table.

    Inputs are the buzzes from ``get_buzzes(model, fold)``, the pickled
    RnnGuesser guesses grouped by ``qanta_id``, and the protobowl records
    grouped by ``qid``. Each ``simulate_game`` call yields a pair of lists;
    all first elements are concatenated into the ``Possibility`` column and
    all second elements into the ``Outcome`` column. The resulting DataFrame
    is pickled under ``model.model_dir`` as ``<fold>_protobowl.pkl``.
    """
    buzzes = get_buzzes(model, fold)

    # eval
    guesses_path = AbstractGuesser.guess_path(
        AbstractGuesser.output_path("qanta.guesser.rnn", "RnnGuesser", 0, ""),
        fold,
        "char",
    )
    with open(guesses_path, "rb") as fh:
        grouped_guesses = pickle.load(fh).groupby("qanta_id")

    questions_by_fold = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = questions_by_fold[fold]

    # NOTE(review): assumes load_protobowl() returns a DataFrame directly;
    # main() in this file unpacks a tuple from the same call. TODO confirm.
    grouped_records = load_protobowl().groupby("qid")

    worker = partial(simulate_game, grouped_guesses, buzzes, grouped_records)

    # Simulate every question first, then flatten the (pos, out) pairs in
    # question order.
    games = [worker(question) for question in tqdm(questions)]
    possibility = [item for pos, _ in games for item in pos]
    outcome = [item for _, out in games for item in out]

    result_df = pd.DataFrame({"Possibility": possibility, "Outcome": outcome})

    output_path = os.path.join(model.model_dir, "{}_protobowl.pkl".format(fold))
    with open(output_path, "wb") as fh:
        pickle.dump(result_df, fh)
def fit_curve(self):
    """Fit a degree-3 polynomial weight curve to the protobowl records.

    Each record contributes a ``relative_position`` and a ``result``; only
    results that are literally ``True`` count as correct. Positions are
    bucketed to 1/1000, and for each bucket the curve value is
    ``1 - correct_so_far / total_records``. A pipeline of
    ``PolynomialFeatures(degree=3, include_bias=False)`` followed by
    ``LinearRegression`` is fitted to those points.

    Side effects: prints the fitted coefficients, saves a plot of the points
    and the fitted curve to ``output/reporting/curve_score.pdf``, and draws
    it.

    Returns:
        The fitted ``Pipeline``.
    """
    # NOTE(review): load_protobowl() is treated as returning a single
    # DataFrame here, while main() in this file unpacks a 2-tuple from it.
    # Confirm which signature is current.
    df = load_protobowl()

    # Anything that is not literally True (e.g. a prompt) becomes False.
    df.result = df.result.apply(lambda x: x is True)

    # Sort by position only, so records at the same position keep their
    # original relative order.
    xy = sorted(
        zip(df.relative_position.tolist(), df.result.tolist()),
        key=lambda pair: pair[0],
    )

    # Bucket positions to 1/1000. A bucket ends up holding the number of
    # correct records seen before the last record that falls into it.
    correct_before = dict()
    cnt = 0
    for position, is_correct in xy:
        correct_before[int(position * 1000)] = cnt
        cnt += is_correct

    # NOTE(review): the denominator is the total number of records. The
    # previous code first computed the number of correct records under the
    # name ``ttl_correct`` and immediately overwrote it with ``len(xy)``; the
    # effective value is kept here. Confirm which normalisation is intended.
    total = len(xy)
    curve = [
        (bucket / 1000, 1 - correct / total)
        for bucket, correct in sorted(correct_before.items(), key=lambda kv: kv[0])
    ]

    X, y = list(map(list, zip(*curve)))
    X = np.asarray(X)
    y = np.asarray(y)

    degree = 3
    pipeline = Pipeline([
        ("polynomial_features",
         PolynomialFeatures(degree=degree, include_bias=False)),
        ("linear_regression", LinearRegression()),
    ])
    # The pipeline is fitted on a 2-D feature matrix with a single column.
    pipeline.fit(X[:, np.newaxis], y)
    print(pipeline.steps[1][1].coef_)

    def get_weight(x):
        # Predict the curve value for a single scalar position.
        return pipeline.predict(np.asarray([[x]]))[0]

    ddf = pd.DataFrame({"x": X, "y": y})
    p0 = (
        ggplot(ddf, aes(x="x", y="y"))
        + geom_point(size=0.3, color="blue", alpha=0.5, shape="+")
        + stat_function(fun=get_weight, color="red", size=2, alpha=0.5)
        + labs(x="Position", y="Weight")
    )
    p0.save("output/reporting/curve_score.pdf")
    p0.draw()

    return pipeline
def fit_curve(self):
    """Fit a cubic polynomial to the position/weight curve of protobowl data.

    The records' ``result`` column is reduced to booleans (only a literal
    ``True`` counts as correct). Records are sorted by ``relative_position``,
    positions are truncated to multiples of 1/1000, and each bucket gets the
    weight ``1 - correct_so_far / total_records``. Those points are fitted
    with ``PolynomialFeatures(degree=3, include_bias=False)`` followed by
    ``LinearRegression``.

    Side effects: prints the regression coefficients, writes the plot to
    ``output/reporting/curve_score.pdf``, and draws it.

    Returns:
        The fitted ``Pipeline``.
    """
    # Only the records frame is used; the second returned value is ignored.
    # NOTE(review): other functions in this file call load_protobowl() as if
    # it returned a single DataFrame. Confirm which signature is current.
    df, _ = load_protobowl()

    # Anything that is not literally True (e.g. a prompt) becomes False.
    df.result = df.result.apply(lambda x: x is True)

    positions = df.relative_position.tolist()
    results = df.result.tolist()
    # Sort on position alone; ties keep their original order.
    xy = sorted(zip(positions, results), key=lambda x: x[0])

    # Map each 1/1000 bucket to the running count of correct records seen
    # before the last record in that bucket.
    ratios = dict()
    cnt = 0
    for x, y in xy:
        ratios[int(x * 1000)] = cnt
        cnt += y

    # NOTE(review): the curve is normalised by the total record count. The
    # earlier code computed the number of correct records as ``ttl_correct``
    # and then overwrote it with ``len(xy)``; the effective value is kept.
    # Confirm which denominator is intended.
    n_records = len(xy)
    curve = [
        (key / 1000, 1 - correct / n_records)
        for key, correct in sorted(ratios.items(), key=lambda x: x[0])
    ]

    X, y = list(map(list, zip(*curve)))
    X = np.asarray(X)
    y = np.asarray(y)

    degree = 3
    polynomial_features = PolynomialFeatures(degree=degree,
                                             include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([('polynomial_features', polynomial_features),
                         ('linear_regression', linear_regression)])
    # The pipeline is fitted on a 2-D feature matrix with a single column.
    pipeline.fit(X[:, np.newaxis], y)
    print(pipeline.steps[1][1].coef_)

    def get_weight(x):
        # Evaluate the fitted curve at one scalar position.
        return pipeline.predict(np.asarray([[x]]))[0]

    ddf = pd.DataFrame({'x': X, 'y': y})
    p0 = (
        ggplot(ddf, aes(x='x', y='y'))
        + geom_point(size=0.3, color='blue', alpha=0.5, shape='+')
        + stat_function(fun=get_weight, color='red', size=2, alpha=0.5)
        + labs(x='Position', y='Weight')
    )
    p0.save('output/reporting/curve_score.pdf')
    p0.draw()

    return pipeline
def main():
    """Score the dev-fold buzzer output with every metric and print the means.

    Loads the questions of ``BUZZER_DEV_FOLD``, the guesser outputs (indexed
    by the first element of each entry), the pickled buzzer outputs and the
    protobowl records grouped by ``qid``. ``run_all_metrics`` is then mapped
    over the questions in a pool of 8 workers with the metrics
    ``_protobowl_scores`` and ``_curve_scores``. For each metric, the
    per-question results are transposed, ``None`` entries are dropped, and
    the list of column means is printed.
    """
    fold = BUZZER_DEV_FOLD

    # load questions
    print('loading questions')
    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = questions[fold]

    # load guesser outputs
    print('loading guesser outputs')
    guesses = {x[0]: x for x in read_data(fold)}

    # load buzzer outputs
    print('loading buzzer outputs')
    buzz_path = buzzes_dir.format(fold)
    with open(buzz_path, 'rb') as f:
        buzzes = pickle.load(f)

    # load protobowl records
    print('loading protobowl records')
    # NOTE(review): other functions in this file call load_protobowl() as if
    # it returned a single DataFrame. Confirm which signature is current.
    df, _ = load_protobowl()
    record_groups = df.groupby('qid')

    metrics = [_protobowl_scores, _curve_scores]
    worker = partial(run_all_metrics, guesses, buzzes, record_groups, metrics)
    # The context manager shuts the pool down once the blocking map() has
    # returned, so worker processes are not left behind.
    with Pool(8) as pool:
        scores = pool.map(worker, questions)

    def column_means(metric_scores):
        # Transpose per-question rows into columns and average each column,
        # ignoring None entries.
        return [
            np.mean([x for x in column if x is not None])
            for column in zip(*metric_scores)
        ]

    # One list per metric, in the same order as ``metrics``.
    all_scores = list(map(list, zip(*scores)))
    print(column_means(all_scores[0]))
    print(column_means(all_scores[1]))
def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Simulate protobowl games on ``fold`` and dump the results to disk.

    For every question in the fold, ``simulate_game`` is called with the
    grouped RnnGuesser guesses, the buzzes from ``get_buzzes`` and the
    grouped protobowl records. The two lists it returns are appended to the
    ``Possibility`` and ``Outcome`` columns respectively. The final
    DataFrame is pickled to ``<fold>_protobowl.pkl`` inside
    ``model.model_dir``.
    """
    buzzes = get_buzzes(model, fold)

    # eval
    guesses_dir = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guesses_file = AbstractGuesser.guess_path(guesses_dir, fold, 'char')
    with open(guesses_file, 'rb') as src:
        raw_guesses = pickle.load(src)
    guesses = raw_guesses.groupby('qanta_id')

    dataset = QuizBowlDataset(buzzer_train=True)
    questions = dataset.questions_by_fold()[fold]

    # NOTE(review): presumably load_protobowl() returns a DataFrame here;
    # main() in this file unpacks a tuple from the same call. Verify.
    records = load_protobowl()
    records = records.groupby('qid')

    worker = partial(simulate_game, guesses, buzzes, records)

    # Accumulate both result columns side by side, in insertion order.
    columns = {'Possibility': [], 'Outcome': []}
    for question in tqdm(questions):
        pos, out = worker(question)
        columns['Possibility'].extend(pos)
        columns['Outcome'].extend(out)

    result_df = pd.DataFrame(columns)

    target = os.path.join(
        model.model_dir, '{}_protobowl.pkl'.format(fold))
    with open(target, 'wb') as dst:
        pickle.dump(result_df, dst)