Example #1
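Builds a Wikipedia page lookup with Spark: parses dumped pages, keeps only titles that answer training-fold questions, and dumps the lookup to JSON.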
def create_wikipedia_cache(
        parsed_wiki_path='data/external/wikipedia/parsed-wiki',
        output_path=WIKI_LOOKUP_PATH):
    from qanta.spark import create_spark_context

    sc = create_spark_context()
    db = QuestionDatabase()
    questions = list(db.all_questions().values())
    train_questions = [
        q for q in questions
        if q.fold == 'guesstrain' or q.fold == 'buzzertrain'
    ]
    answers = {q.page for q in train_questions}
    b_answers = sc.broadcast(answers)
    # Paths used in Spark must be absolute, and the directory must exist
    page_path = os.path.abspath(parsed_wiki_path)
    page_pattern = os.path.join(page_path, '*', '*')

    def parse_page(json_text):
        page = json.loads(json_text)
        return {
            'id': int(page['id']),
            'title': page['title'].replace(' ', '_'),
            'text': page['text'],
            'url': page['url']
        }

    wiki_pages = sc.textFile(page_pattern).map(parse_page).filter(
        lambda p: p['title'] in b_answers.value).collect()
    wiki_lookup = {p['title']: p for p in wiki_pages}
    with open(output_path, 'w') as f:
        json.dump(wiki_lookup, f)

    return wiki_lookup
Example #2
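A requires method (presumably a Luigi-style task) that splits all questions into fixed-size batches and yields one TaggedQuestionBatch per batch.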
    def requires(self):
        db = QuestionDatabase()
        questions = list(db.all_questions().values())
        n_batches = int(math.ceil(len(questions) / BATCH_SIZE))

        for i in range(n_batches):
            yield TaggedQuestionBatch(question_batch=i)
Example #3
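Filters a Wikipedia redirect CSV to redirects whose target is a known answer page, skipping country names, WikiProject pages, and _topics/_(overview) targets, then pickles the mapping.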
def create_wikipedia_redirect_pickle(redirect_csv, output_pickle):
    countries = {}
    with open(COUNTRY_LIST_PATH) as f:
        for line in f:
            k, v = line.split('\t')
            countries[k] = v.strip()

    db = QuestionDatabase()
    pages = set(db.all_answers().values())

    with open(redirect_csv) as redirect_f:
        redirects = {}
        n_total = 0
        n_selected = 0
        for row in csv.reader(redirect_f, quotechar='"', escapechar='\\'):
            n_total += 1
            source = row[0]
            target = row[1]
            if (target not in pages or source in countries
                    or target.startswith('WikiProject')
                    or target.endswith("_topics")
                    or target.endswith("_(overview)")):
                continue
            else:
                redirects[source] = target
                n_selected += 1

        log.info(
            'Filtered {} raw wikipedia redirects to {} matching redirects'.
            format(n_total, n_selected))

    with open(output_pickle, 'wb') as output_f:
        pickle.dump(redirects, output_f)
Example #4
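Parses a guess file, groups guesses by (question, sentence, token), and logs recall against the database answers, warning about questions with fewer guesses than configured.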
def process_file(filename):
    with open(filename, 'r') as f:
        questions = defaultdict(set)
        for line in f:
            tokens = line.split()
            offset = 1 if int(tokens[0]) == -1 else 0
            ident = tokens[1 + offset].replace("'", "").split('_')
            q = int(ident[0])
            s = int(ident[1])
            t = int(ident[2])
            guess = tokens[3 + offset]
            questions[(q, s, t)].add(guess)
        qdb = QuestionDatabase('data/questions.db')
        answers = qdb.all_answers()
        recall = 0
        warn = 0
        for ident, guesses in questions.items():
            if len(guesses) < conf['n_guesses']:
                log.info("WARNING LOW GUESSES")
                log.info(
                    'Question {0} is missing guesses, only has {1}'.format(
                        ident, len(guesses)))
                warn += 1
            correct = answers[ident[0]].replace(' ', '_') in guesses
            recall += correct
        log.info('Recall: {0} Total: {1}'.format(recall / len(questions),
                                                 len(questions)))
        log.info('Warned lines: {0}'.format(warn))
Example #5
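Loads predictions and metadata, restricts candidate answers to pages seen at least min_count times, and writes the computed statistics to JSON.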
def generate(min_count, pred_file, meta_file, output):
    database = QuestionDatabase()
    data = load_data(pred_file, meta_file, database)
    dan_answers = set(database.page_by_count(min_count, True))
    answers = compute_answers(data, dan_answers)
    stats = compute_statistics(answers).cache()
    stats.to_json(safe_path(output), root_array=False)
    pprint.pprint(stats)
Example #6
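Computes highlights for every question in the guessdev fold and pickles the result.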
def main():
    questions = QuestionDatabase().all_questions()
    guessdev_questions = {k: v for k, v in questions.items()
                          if v.fold == 'guessdev'}
    highlights = {}
    for k, v in tqdm(guessdev_questions.items()):
        highlights[k] = get_highlights(v.flatten_text())
    with open('guessdev_highlight.pkl', 'wb') as f:
        pickle.dump(highlights, f)
Example #7
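Prints the guess plus color-highlighted Wikipedia and quiz bowl evidence returned by get_highlights for one question.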
def test():
    questions = QuestionDatabase().all_questions()
    guessdev_questions = [x for x in questions.values() if x.fold == 'guessdev']
    highlights = get_highlights(questions[0].flatten_text())
    print(highlights['guess'])
    for x in highlights['wiki']:
        print('WIKI|' + x.replace('<em>', color.RED).replace('</em>', color.END))
    for x in highlights['qb']:
        print('QUIZ|' + x.replace('<em>', color.RED).replace('</em>', color.END))
Example #8
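Yields a (page, text) pair per answer page, combining cached Wikipedia content, training question text, and optional gzipped source text, ordered by how many questions map to each page.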
def text_iterator(use_wiki, wiki_location,
                  use_qb, qb_location,
                  use_source, source_location,
                  limit=-1,
                  min_pages=0, country_list=COUNTRY_LIST_PATH):
    if isinstance(qb_location, str):
        qdb = QuestionDatabase(qb_location)
    else:
        qdb = qb_location
    doc_num = 0

    cw = CachedWikipedia(wiki_location, data_path(country_list))
    pages = qdb.questions_with_pages()

    for p in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
        # This filtering must mirror the logic of the
        # page_by_count function in qdb.py
        if len(pages[p]) < min_pages:
            continue

        if use_qb:
            train_questions = [x for x in pages[p] if x.fold == "train"]
            question_text = "\n".join(" ".join(x.raw_words()) for x in train_questions)
        else:
            question_text = ''

        if use_source:
            filename = '%s/%s' % (source_location, p)
            if os.path.isfile(filename):
                try:
                    with gzip.open(filename, 'rb') as f:
                        source_text = f.read()
                except zlib.error:
                    log.info("Error reading %s" % filename)
                    source_text = ''
            else:
                source_text = ''
        else:
            source_text = u''

        if use_wiki:
            wikipedia_text = cw[p].content
        else:
            wikipedia_text = u""

        total_text = wikipedia_text
        total_text += "\n"
        total_text += question_text
        total_text += "\n"
        total_text += str(source_text)

        yield p, total_text
        doc_num += 1

        if 0 < limit < doc_num:
            break
Example #9
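Identical to Example #6: pickles get_highlights output for every guessdev question.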
def main():
    questions = QuestionDatabase().all_questions()
    guessdev_questions = {
        k: v
        for k, v in questions.items() if v.fold == 'guessdev'
    }
    highlights = {}
    for k, v in tqdm(guessdev_questions.items()):
        highlights[k] = get_highlights(v.flatten_text())
    with open('guessdev_highlight.pkl', 'wb') as f:
        pickle.dump(highlights, f)
Example #10
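Trains a classifier of the requested type on features computed from the train fold of the question database.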
def train_classifier(class_type, question_db=None):
    if question_db is None:
        question_db = QuestionDatabase(QB_QUESTION_DB)

    log.info("Training classifier: {}".format(class_type))
    all_questions = question_db.questions_with_pages()
    train = compute_features(all_questions, 'train', class_type)
    train_x = train['text']
    train_y = train['label']
    classifier = pipeline_creators[class_type]().fit(train_x, train_y)
    return classifier
Example #11
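Warms the CachedWikipedia cache by accessing every answer page through a multiprocessing Pool.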
    def initialize_cache(path):
        """
        Iterate over all pages, accessing each one in the cache. This forces a
        prefetch of all wiki pages.
        """
        db = QuestionDatabase(QB_QUESTION_DB)
        pages = db.questions_with_pages()
        cw = CachedWikipedia(path)
        pool = Pool()

        input_data = [(format_guess(title), cw) for title in pages.keys()]
        pool.starmap(access_page, input_data)
Example #12
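Loads or builds processed guess data per fold: assembles the answer-option vocabulary, processes each question's guesses in parallel, and caches the results as pickles.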
def load_quizbowl(folds=c.BUZZER_INPUT_FOLDS) \
                    -> Tuple[Dict[str, int], Dict[str, list]]:
    # merge_dfs()
    log.info('Loading data')
    question_db = QuestionDatabase()
    quizbowl_db = QuizBowlDataset(bc.MIN_ANSWERS,
                                  guesser_train=True,
                                  buzzer_train=True)
    all_questions = question_db.all_questions()
    if not os.path.isfile(bc.OPTIONS_DIR):
        log.info('Loading the set of options')
        all_options = set(quizbowl_db.training_data()[1])

        id2option = list(all_options)
        with open(safe_path(bc.OPTIONS_DIR), 'wb') as outfile:
            pickle.dump(id2option, outfile)
    else:
        with open(safe_path(bc.OPTIONS_DIR), 'rb') as infile:
            id2option = pickle.load(infile)
    option2id = {o: i for i, o in enumerate(id2option)}
    num_options = len(id2option)
    log.info('Number of options {0}'.format(len(id2option)))

    guesses_by_fold = dict()
    for fold in folds:
        save_dir = '%s_processed.pickle' % (os.path.join(bc.GUESSES_DIR, fold))
        if os.path.isfile(save_dir):
            with open(safe_path(save_dir), 'rb') as infile:
                guesses_by_fold[fold] = pickle.load(infile)
            log.info('Loading {0} guesses'.format(fold))
            continue

        log.info('Processing {0} guesses'.format(fold))
        guesses = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=[fold])

        worker = partial(_process_question, option2id, all_questions)
        inputs = guesses.groupby('qnum')
        guesses_by_fold[fold] = _multiprocess(worker,
                                              inputs,
                                              info='df data',
                                              multi=True)
        guesses_by_fold[fold] = [
            x for x in guesses_by_fold[fold] if x is not None
        ]
        print(len(guesses_by_fold[fold]))

        with open(safe_path(save_dir), 'wb') as outfile:
            pickle.dump(guesses_by_fold[fold], outfile)

        log.info('Processed {0} guesses saved to {1}'.format(fold, save_dir))

    return option2id, guesses_by_fold
Example #13
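Identical to Example #7: prints color-highlighted Wikipedia and quiz bowl evidence for one question.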
def test():
    questions = QuestionDatabase().all_questions()
    guessdev_questions = [
        x for x in questions.values() if x.fold == 'guessdev'
    ]
    highlights = get_highlights(questions[0].flatten_text())
    print(highlights['guess'])
    for x in highlights['wiki']:
        print('WIKI|' +
              x.replace('<em>', color.RED).replace('</em>', color.END))
    for x in highlights['qb']:
        print('QUIZ|' +
              x.replace('<em>', color.RED).replace('</em>', color.END))
Example #14
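Writes test-fold questions to EXPO_QUESTIONS as CSV rows of id, answer, sentence index, and sentence text.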
    def run(self):
        db = QuestionDatabase(QB_QUESTION_DB)
        questions = db.all_questions()
        with open(safe_path(EXPO_QUESTIONS), 'w', newline='') as f:
            f.write('id,answer,sent,text\n')
            writer = csv.writer(f, delimiter=',')
            for q in questions.values():
                if q.fold != 'test':
                    continue
                max_sent = max(q.text.keys())
                for i in range(max_sent + 1):
                    writer.writerow(
                        [q.qnum, format_guess(q.page), i, q.text[i]])
Example #15
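Prefetches every answer page into the CachedWikipedia cache via the wikipedia package, using a multiprocessing Pool.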
def web_initialize_file_cache(path, remote_delay=1):
    """
    Initialize the cache by requesting each page with the wikipedia package.
    Iterates over all pages, accessing each one in the cache. This forces a
    prefetch of all wiki pages.
    """
    db = QuestionDatabase()
    pages = db.questions_with_pages()
    cw = CachedWikipedia(path, remote_delay=remote_delay)
    pool = Pool()

    input_data = [(title, cw) for title in pages.keys()]
    pool.starmap(access_page, input_data)
Example #16
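Turns per-feature Spark rows into Vowpal Wabbit input lines (label, feature values, and an @-separated metadata suffix) and saves one text file per fold.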
def create_output(path: str):
    df = read_dfs(path).cache()
    question_db = QuestionDatabase()
    answers = question_db.all_answers()
    for qnum in answers:
        answers[qnum] = format_guess(answers[qnum])

    sc = SparkContext.getOrCreate()  # type: SparkContext
    b_answers = sc.broadcast(answers)

    def generate_string(group):
        rows = group[1]
        result = ""
        feature_values = []
        meta = None
        qnum = None
        sentence = None
        token = None
        guess = None
        for name in FEATURE_NAMES:
            named_feature_list = list(
                filter(lambda r: r.feature_name == name, rows))
            if len(named_feature_list) != 1:
                raise ValueError(
                    'Expected exactly one row per feature name, '
                    'found {} rows'.format(len(named_feature_list)))
            named_feature = named_feature_list[0]
            if meta is None:
                qnum = named_feature.qnum
                sentence = named_feature.sentence
                token = named_feature.token
                guess = named_feature.guess
                meta = '{} {} {} {}'.format(qnum, named_feature.sentence,
                                            named_feature.token, guess)
            feature_values.append(named_feature.feature_value)
        assert '@' not in result, \
            '@ is a special character that is split on and not allowed in the feature line'

        vw_features = ' '.join(feature_values)
        if guess == b_answers.value[qnum]:
            vw_label = "1 '{}_{}_{} ".format(qnum, sentence, token)
        else:
            vw_label = "-1 '{}_{}_{} ".format(qnum, sentence, token)

        return vw_label + vw_features + '@' + meta

    for fold in VW_FOLDS:
        group_features(df.filter(df.fold == fold))\
            .map(generate_string)\
            .saveAsTextFile('output/vw_input/{0}.vw'.format(fold))
    sc.stop()
Example #17
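Builds one Task per unique (qnum, sentence, token) group of guesses, pairing each group with its question.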
def task_list():
    guess_df = AbstractGuesser.load_all_guesses()
    question_db = QuestionDatabase()
    question_map = question_db.all_questions()
    tasks = []
    guess_df = guess_df[['qnum', 'sentence', 'token', 'guess',
                         'fold']].drop_duplicates(
                             ['qnum', 'sentence', 'token', 'guess'])
    for name, guesses in guess_df.groupby(['qnum', 'sentence', 'token']):
        qnum = name[0]
        question = question_map[qnum]
        tasks.append(Task(question, guesses))

    return tasks
Example #18
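Writes flattened question texts and lowercased, underscore-joined answers to plain text files.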
def generate_questions():
    with open('data/100_possible_questions.pickle', 'rb') as f:
        qs = pickle.load(f)

    with open('data/qb_questions.txt', 'w') as f:
        for q in qs:
            f.write(q.flatten_text())
            f.write('\n')

    db = QuestionDatabase()
    answers = db.all_answers().values()
    with open('data/answers.txt', 'w') as f:
        for a in answers:
            f.write(a.lower().replace(' ', '_'))
            f.write('\n')
Example #19
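Runs a WikiNetworkGuesser on ten shuffled training questions and concatenates the per-question guess DataFrames.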
def evaluate():
    wiki = WikiNetworkGuesser()
    db = QuestionDatabase(QB_QUESTION_DB)
    questions = [q for q in db.all_questions().values() if q.fold == 'train']
    random.shuffle(questions)

    subset = questions[0:10]
    df = None
    for q in subset:
        tmp_df = wiki.generate_guesses(q.flatten_text(), q.page.lower().replace(' ', '_'), q.qnum)
        if df is None:
            df = tmp_df
        else:
            df = pd.concat([df, tmp_df])
    return df
Example #20
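Tokenizes guesser train/dev questions with spaCy, restricts the answer set to the most frequent pages, and returns (token list, answer id) pairs, optionally one per sentence.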
def load_quizbowl(split_sentences=True, num_answers=-1, min_answer_freq=-1):
    nlp = spacy.load("en")
    questions = QuestionDatabase().all_questions().values()
    answers = [x.page for x in questions]
    answer_counter = collections.Counter(answers)
    if num_answers != -1:
        answer_counter = sorted(answer_counter.items(), key=lambda x: x[1])[::-1]
        answers = [x for x, y in answer_counter[:num_answers]]
    else:
        answers = [x for x, y in answer_counter.items() if y >= min_answer_freq]
    answer_to_id = {x: i for i, x in enumerate(answers)}
    print("# class: {}".format(len(answers)))

    folds = [GUESSER_TRAIN_FOLD, GUESSER_DEV_FOLD]
    questions = [x for x in questions if x.fold in folds and x.page in answers]

    train, dev = [], []
    for q in tqdm(questions):
        text = nlp(clean_question(q.flatten_text()))
        answer = answer_to_id[q.page]
        if split_sentences:
            for sent in text.sents:
                sent = [w.lower_ for w in sent if w.is_alpha or w.is_digit]
                if q.fold == GUESSER_TRAIN_FOLD:
                    train.append((sent, answer))
                else:
                    dev.append((sent, answer))
        else:
            sent = [w.lower_ for w in text if w.is_alpha or w.is_digit]
            if q.fold == GUESSER_TRAIN_FOLD:
                train.append((sent, answer))
            else:
                dev.append((sent, answer))

    return train, dev, answers
Example #21
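Rewrites the sorted Wikipedia titles file, keeping only titles that match an answer page and replacing all others with '@'.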
def preprocess_titles():
    # stop_words = set(stopwords.words('english'))
    titles_file = open('data/titles-sorted.txt')
    db = QuestionDatabase()
    pages = {format_guess(page) for page in db.questions_with_pages().keys()}
    with open('data/processed-titles-sorted.txt', 'w') as f:
        for line in titles_file:
            page = format_guess(line.strip().lower())
            # if len(page) > 2 and re.match(r"^[a-zA-Z0-9_()']+$", page)\
            #         and page not in stop_words and page[0].isalnum():
            if page in pages:
                f.write(line.strip().lower())
            else:
                f.write('@')
            f.write('\n')
    titles_file.close()
Example #22
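Runs the buzzer over a set of configurations for one fold and pickles the resulting (config, buzzes) pairs.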
def hyper_search(fold):
    option2id, all_guesses = load_quizbowl()

    all_questions = QuestionDatabase().all_questions()
    answers = {k: v.page for k, v in all_questions.items()}
    guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=[fold])

    cfgs = get_cfgs()
    cfg_buzzes = []
    for i, cfg in enumerate(cfgs):
        print('**********{}**********'.format(i))
        buzzes = run(cfg, fold, all_guesses, option2id)
        cfg_buzzes.append((cfg, buzzes))

    with open('output/buzzer/cfg_buzzes_{}.pkl'.format(fold), 'wb') as outfile:
        pickle.dump(cfg_buzzes, outfile)
Example #23
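Wires up a game between a human agent and a guesser-plus-buzzer machine agent on dev-fold questions, attaches interface hooks, and runs the game.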
def main():
    buzzer = RNNBuzzer()

    # setup questions
    questions = list(QuestionDatabase().all_questions().values())
    dev_questions = [x for x in questions if x.fold == 'dev']

    # setup machine agent
    gspec = AbstractGuesser.list_enabled_guessers()[0]
    guesser_dir = AbstractGuesser.output_path(gspec.guesser_module,
                                              gspec.guesser_class, '')
    guesser = ElasticSearchWikidataGuesser.load(guesser_dir)
    guesser = ESGuesserWrapper(guesser)
    machine_agent = GuesserBuzzerAgent(guesser, buzzer)

    # setup human agent
    human_agent = HumanAgent()

    # setup hook
    hooks = []
    hooks.append(hook.NotifyBuzzingHook)
    hooks.append(hook.GameInterfaceHook)
    hooks.append(hook.VisualizeGuesserBuzzerHook(machine_agent))
    hooks.append(hook.HighlightHook)

    # setup game
    game = Game(dev_questions, [human_agent, machine_agent], hooks)

    game.run(10)
Example #24
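For each fold, loads guesses and saved buzzes, extracts top guesses in parallel, computes history and Protobowl statistics, and writes a summary report.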
def main(folds, model_name):

    all_questions = QuestionDatabase().all_questions()
    answers = {k: v.page for k, v in all_questions.items()}
    question_texts = {k: v.text for k, v in all_questions.items()}
    protobowl_ids = {
        k: all_questions[k].protobowl
        for k in all_questions if all_questions[k].protobowl != ""
    }
    protobowl_df = load_protobowl().groupby("qid")

    save_dir = "output/summary/new_performance/"
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # feature -> fold -> value
    variables = defaultdict(lambda: defaultdict())
    for fold in folds:
        guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=[fold])
        questions = guesses_df.groupby("qnum")

        buzzes_dir = bc.BUZZES_DIR.format(fold, model_name)
        with open(buzzes_dir, "rb") as infile:
            buzzes = pickle.load(infile)
        log.info("Buzzes loaded from {}.".format(buzzes_dir))

        # qnum -> n_guessers * length
        top_guesses = _multiprocess(_get_top_guesses,
                                    questions,
                                    info="Top guesses",
                                    multi=True)
        top_guesses = {k: v for k, v in top_guesses}
        inputs = [top_guesses, buzzes, answers, variables, fold, save_dir]

        # get_eop_stats(*inputs)
        get_his_stats(*inputs)
        # get_hyper_search(*inputs)

        p_inputs = [question_texts, protobowl_ids, protobowl_df, questions
                    ] + inputs
        get_protobowl(p_inputs)

    for key, value in variables.items():
        variables[key] = dict(value)
    variables = dict(variables)

    report(variables, save_dir, folds)
Example #25
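Zips predictions with their metadata, repairs malformed prediction rows from the meta line, and groups scored guesses into per-question Line records.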
def load_data(pred_file: str, meta_file: str, q_db: QuestionDatabase) -> Sequence:
    preds = load_predictions(pred_file)
    metas = load_meta(meta_file)
    answers = q_db.all_answers()

    def create_line(group):
        question = group[0]
        elements = group[1]
        st_groups = (
            seq(elements).group_by(lambda x: (x[0].sentence, x[0].token)).sorted()
        )
        st_lines = []
        for st, v in st_groups:
            scored_guesses = (
                seq(v)
                .map(lambda x: ScoredGuess(x[0].score, x[1].guess))
                .sorted(reverse=True)
                .list()
            )
            st_lines.append(
                Line(
                    question,
                    st[0],
                    st[1],
                    scored_guesses[0].score > 0,
                    scored_guesses[0].guess,
                    answers[question],
                    scored_guesses,
                )
            )
        return question, st_lines

    def fix_missing_label(pm):
        prediction = pm[0]
        meta = pm[1]
        if (
            prediction.question is None
            or prediction.token is None
            or prediction.sentence is None
        ):
            log.info(
                "WARNING: Prediction malformed, fixing with meta line: {0}".format(
                    prediction
                )
            )
            prediction = Prediction(
                prediction.score, meta.question, meta.sentence, meta.token
            )
        assert meta.question == prediction.question
        assert meta.sentence == prediction.sentence
        assert meta.token == prediction.token
        return prediction, meta

    return (
        preds.zip(metas)
        .map(fix_missing_label)
        .group_by(lambda x: x[0].question)
        .map(create_line)
    )
Example #26
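For answer pages with enough appearances, writes each question's partial sentences to per-sentence text files for the wikifier.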
def wikify(output_directory):
    database = QuestionDatabase(QB_QUESTION_DB)
    pages = database.questions_with_pages()

    total = 0
    for p in pages:
        if len(pages[p]) >= conf['wikifier']['min_appearances']:
            log.info('{} {}'.format(p, len(pages[p])))
            for q in pages[p]:
                total += 1
                for sentence, word, text in q.partials():
                    sentence -= 1
                    with open(
                            "%s/%i-%i.txt" %
                        (output_directory, q.qnum, sentence), 'w') as output:
                        output.write("%s\n" % text[sentence])
    log.info(str(total))
Example #27
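Uses Spark to read a Wikipedia JSON dump and write a page file for every title that appears as an answer in the question database.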
def create_wikipedia_cache(dump_path):
    from qanta.spark import create_spark_session

    spark = create_spark_session()
    db = QuestionDatabase()
    answers = set(db.all_answers().values())
    b_answers = spark.sparkContext.broadcast(answers)
    # Paths used in Spark must be absolute, and the directory must exist
    page_path = os.path.abspath(safe_path(WIKI_PAGE_PATH))

    def create_page(row):
        title = normalize_wikipedia_title(row.title)
        filter_answers = b_answers.value
        if title in filter_answers:
            page = WikipediaPage(title, row.text, None, None, row.id, row.url)
            write_page(page, page_path=page_path)

    spark.read.json(dump_path).rdd.foreach(create_page)
Example #28
File: test.py Project: nadesai/qb
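Feeds one question to the guesser word by word and prints the buzzer's decision after each additional word.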
def test_buzzer():
    questions = QuestionDatabase().all_questions()
    buzzer = RNNBuzzer(word_skip=conf['buzzer_word_skip'])

    # setup machine agent
    gspec = AbstractGuesser.list_enabled_guessers()[0]
    guesser_dir = AbstractGuesser.output_path(gspec.guesser_module,
                                              gspec.guesser_class, '')
    guesser = ElasticSearchWikidataGuesser.load(guesser_dir)
    guesser = ESGuesserWrapper(guesser)

    key = list(questions.keys())[4]
    question = questions[key].flatten_text().split()
    for i, word in enumerate(question):
        clue = ' '.join(question[:i])
        guesses = guesser.guess(clue)
        buzz = buzzer.buzz(guesses)
        print(buzz)
Example #29
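Splits all questions into fixed-size batches and pickles the batch lookup and batch count under output/tagme/.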
    def run(self):
        make_dirs('output/tagme/')
        db = QuestionDatabase()
        questions = list(db.all_questions().values())
        batch = 0
        batch_lookup = {}

        while batch * BATCH_SIZE < len(questions):
            batch_questions = questions[batch * BATCH_SIZE:(batch + 1) *
                                        BATCH_SIZE]
            batch_lookup[batch] = batch_questions
            batch += 1

        with open('output/tagme/batches.pickle', 'wb') as f:
            pickle.dump(batch_lookup, f)

        with open('output/tagme/meta.pickle', 'wb') as f:
            pickle.dump(batch, f)
Example #30
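Near-duplicate of Example #24: the same per-fold performance report over guesses, buzzes, and Protobowl data.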
def main(folds, model_name):
    
    all_questions = QuestionDatabase().all_questions()
    answers = {k: v.page for k, v in all_questions.items()}
    question_texts = {k: v.text for k, v in all_questions.items()}
    protobowl_ids = {k: all_questions[k].protobowl 
        for k in all_questions if all_questions[k].protobowl != ''}
    protobowl_df = load_protobowl().groupby('qid')

    save_dir = 'output/summary/new_performance/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # feature -> fold -> value
    variables = defaultdict(lambda: defaultdict())
    for fold in folds:
        guesses_df = AbstractGuesser.load_guesses(
                bc.GUESSES_DIR, folds=[fold])
        questions = guesses_df.groupby('qnum')

        buzzes_dir = bc.BUZZES_DIR.format(fold, model_name)
        with open(buzzes_dir, 'rb') as infile:
            buzzes = pickle.load(infile)
        log.info('Buzzes loaded from {}.'.format(buzzes_dir))

        # qnum -> n_guessers * length
        top_guesses = _multiprocess(_get_top_guesses, questions, 
            info='Top guesses', multi=True)
        top_guesses = {k: v for k, v in top_guesses}
        inputs = [top_guesses, buzzes, answers, variables, fold, save_dir]

        # get_eop_stats(*inputs)
        get_his_stats(*inputs)
        # get_hyper_search(*inputs)
        
        p_inputs = [question_texts, protobowl_ids, protobowl_df, questions] + inputs
        get_protobowl(p_inputs)

    for key, value in variables.items():
        variables[key] = dict(value)
    variables = dict(variables)

    report(variables, save_dir, folds)
Example #31
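Initializes a StatsExtractor with precomputed sentence statistics and per-page train-question counts, normalizing each count to a z-score.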
    def __init__(self):
        super(StatsExtractor, self).__init__()
        with open(SENTENCE_STATS, 'rb') as f:
            self.word_count_mean, self.word_count_std = pickle.load(f)

        self.guess_frequencies = {}
        question_db = QuestionDatabase(QB_QUESTION_DB)
        all_questions = question_db.questions_with_pages()
        for page in all_questions:
            self.guess_frequencies[page] = sum(1 for x in all_questions[page]
                                               if x.fold == "train")

        self.frequency_mean = np.mean(list(self.guess_frequencies.values()))
        self.frequency_std = np.std(list(self.guess_frequencies.values()))
        for page in all_questions:
            normalized_frequency = normalize(self.guess_frequencies[page],
                                             self.frequency_mean,
                                             self.frequency_std)
            self.guess_frequencies[page] = normalized_frequency
        self.normed_missing_guess = normalize(0, self.frequency_mean,
                                              self.frequency_std)
Example #32
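Scores an 'ultimate' buzzer on the buzzer dev fold against Protobowl data at several user-answer thresholds and prints the rewards.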
def report_ultimate():
    all_questions = QuestionDatabase().all_questions()
    answers = {k: v.page for k, v in all_questions.items()}
    question_texts = {k: v.text for k, v in all_questions.items()}
    protobowl_ids = {
        k: all_questions[k].protobowl
        for k in all_questions if all_questions[k].protobowl != ''
    }
    protobowl_df, user_count = load_protobowl()
    guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR,
                                              folds=[c.BUZZER_DEV_FOLD])
    questions = guesses_df.groupby('qnum')
    top_guesses = _multiprocess(_get_top_guesses,
                                questions,
                                info='Top guesses',
                                multi=True)
    top_guesses = {k: v for k, v in top_guesses}

    option2id, all_guesses = load_quizbowl()
    test_iter = QuestionIterator(all_guesses[c.BUZZER_DEV_FOLD],
                                 option2id,
                                 batch_size=128)
    buzzes = ultimate_buzzer(test_iter)

    save_dir = 'output/summary/new_performance/'
    inputs = [top_guesses, buzzes, answers, None, c.BUZZER_DEV_FOLD, save_dir]
    user_answers_thresholds = [1, 10, 50, 100, 500, 1000, 2000]
    threshold_stats = []
    for threshold in user_answers_thresholds:
        pdf1 = protobowl_df[protobowl_df.user_answers > threshold]
        p_inputs = [
            question_texts, protobowl_ids,
            pdf1.groupby('qid'), questions
        ] + inputs
        pstats = get_protobowl(p_inputs)
        threshold_stats.append(pstats)
        print('ultimate', threshold, pstats)
    print('ultimate', [x['reward'] for x in threshold_stats])
Example #33
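Scores saved buzzes on the buzzer dev fold against Protobowl data at several user-answer thresholds and pickles the per-threshold statistics.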
def report(buzzes_dir):
    all_questions = QuestionDatabase().all_questions()
    answers = {k: v.page for k, v in all_questions.items()}
    question_texts = {k: v.text for k, v in all_questions.items()}
    protobowl_ids = {
        k: all_questions[k].protobowl
        for k in all_questions if all_questions[k].protobowl != ''
    }
    protobowl_df, user_count = load_protobowl()
    guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR,
                                              folds=[c.BUZZER_DEV_FOLD])
    questions = guesses_df.groupby('qnum')
    top_guesses = _multiprocess(_get_top_guesses,
                                questions,
                                info='Top guesses',
                                multi=True)
    top_guesses = {k: v for k, v in top_guesses}

    with open(buzzes_dir, 'rb') as infile:
        buzzes = pickle.load(infile)

    save_dir = 'output/summary/new_performance/'
    inputs = [top_guesses, buzzes, answers, None, c.BUZZER_DEV_FOLD, save_dir]
    user_answers_thresholds = [1, 10, 50, 100, 500, 1000, 2000]
    threshold_stats = []
    for threshold in user_answers_thresholds:
        pdf1 = protobowl_df[protobowl_df.user_answers > threshold]
        p_inputs = [
            question_texts, protobowl_ids,
            pdf1.groupby('qid'), questions
        ] + inputs
        pstats = get_protobowl(p_inputs)
        threshold_stats.append(pstats)
        print(threshold, pstats)
    with open(buzzes_dir + '.pstats', 'wb') as f:
        pickle.dump(threshold_stats, f)
    print([x['reward'] for x in threshold_stats])
Example #34
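Launches a Dash app for browsing questions: a question inspector, answer-count histograms, and an answer lookup wired up with callbacks.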
def main():
    db = QuestionDatabase()
    question_lookup = db.all_questions()
    questions = list(question_lookup.values())


    guesser_train_questions = [q for q in questions if q.fold == 'guesstrain']
    guesser_train_answers = [q.page for q in guesser_train_questions]
    answer_counts = Counter(guesser_train_answers)
    answer_set = set(answer_counts.keys())

    app = dash.Dash()
    app.layout = html.Div(children=[
        html.H1(children='Quiz Bowl Question Explorer'),
        compute_stats(questions, db.location),

        html.H2('Question Inspector'),
        dcc.Dropdown(
            options=[{'label': q.qnum, 'value': q.qnum} for q in questions],
            value=questions[0].qnum,
            id='question-selector'
        ),
        html.Div([
            html.Div(id='question-display')
        ]),

        dcc.Graph(
            id='answer-count-plot',
            figure=go.Figure(
                data=[go.Histogram(x=list(answer_counts.values()), name='Answer Counts')],
                layout=go.Layout(
                    title='Answer Count Distribution',
                    showlegend=True
                )
            )
        ),
        dcc.Graph(
            id='answer-count-cum-plot',
            figure=go.Figure(
                data=[go.Histogram(
                    x=list(answer_counts.values()),
                    name='Answer Counts Cumulative',
                    cumulative=dict(enabled=True, direction='decreasing'),
                    histnorm='percent'
                )],
                layout=go.Layout(
                    title='Answer Count Cumulative Distribution',
                    showlegend=True
                )
            )
        ),

        html.Label('Answer Selection'),
        dcc.Dropdown(
            options=sorted([{'label': a, 'value': a} for a in answer_set], key=lambda k: k['label']),
            id='answer-list'
        ),
        html.Div(id='answer-count')
    ])

    @app.callback(
        Output(component_id='answer-count', component_property='children'),
        [Input(component_id='answer-list', component_property='value')]
    )
    def update_answer_count(answer):
        return f'Answer: {answer} Question Count: {answer_counts[answer]}'

    @app.callback(
        Output(component_id='question-display', component_property='children'),
        [Input(component_id='question-selector', component_property='value')]
    )
    def update_question(qb_id):
        qb_id = int(qb_id)
        question = question_lookup[qb_id]
        sentences, answer, _ = question.to_example()
        return [
            html.P(f'ID: {qb_id} Fold: {question.fold}'),
            html.H3('Sentences')
        ] + [html.P(f'{i}: {sent}') for i, sent in enumerate(sentences)] + [
            html.H3('Answer'), html.P(answer)
        ]

    app.css.append_css({"external_url": "https://codepen.io/chriddyp/pen/bWLwgP.css"})
    app.run_server(debug=True)