Example #1
def process_file(filename):
    with open(filename, 'r') as f:
        questions = defaultdict(set)
        for line in f:
            tokens = line.split()
            # When the first token is -1, the real fields start one position later
            offset = 1 if int(tokens[0]) == -1 else 0
            # The identifier appears to encode question, sentence, and token indices joined by '_'
            ident = tokens[1 + offset].replace("'", "").split('_')
            q = int(ident[0])
            s = int(ident[1])
            t = int(ident[2])
            guess = tokens[3 + offset]
            questions[(q, s, t)].add(guess)
        qdb = QuestionDatabase('data/questions.db')
        answers = qdb.all_answers()
        recall = 0
        warn = 0
        for ident, guesses in questions.items():
            if len(guesses) < N_GUESSES:
                print("WARNING LOW GUESSES")
                print('Question {0} is missing guesses, only has {1}'.format(ident, len(guesses)))
                warn += 1
            correct = answers[ident[0]].replace(' ', '_') in guesses
            recall += correct
        print('Recall: {0} Total: {1}'.format(recall / len(questions), len(questions)))
        print('Warned lines: {0}'.format(warn))
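
The snippet above omits its imports and the N_GUESSES constant. A minimal sketch of the assumed context; the QuestionDatabase import path is the one shown in Example #15, everything else is a placeholder:

# Assumed context for process_file; values are illustrative, not from the project.
from collections import defaultdict

from qanta.util.qdb import QuestionDatabase   # import path shown in Example #15

N_GUESSES = 200                               # hypothetical threshold

process_file('output/expo/test.guess')        # hypothetical prediction file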
Example #2
def generate(min_count, qdb, pred_file, meta_file, output):
    database = QuestionDatabase(qdb)
    data = load_data(pred_file, meta_file, database)
    dan_answers = set(database.page_by_count(min_count=min_count, exclude_test=True))
    answers = compute_answers(data, dan_answers)
    stats = compute_statistics(answers).cache()
    stats.to_json(output, root_array=False)
    pp = pprint.PrettyPrinter()
    pp.pprint(stats)
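
A hedged sketch of invoking generate; every path below is a placeholder rather than a file shipped with the project:

# Hypothetical invocation; adjust the paths to the real prediction and metadata files.
generate(min_count=5,
         qdb='data/questions.db',
         pred_file='output/predictions.pred',
         meta_file='output/predictions.meta',
         output='output/summary/stats.json')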
Example #3
 def initialize_cache(path):
     """
     This function iterates over all pages, accessing them in the cache. This forces a
     prefetch of all wiki pages.
     """
     db = QuestionDatabase(QB_QUESTION_DB)
     pages = db.questions_with_pages()
     cw = CachedWikipedia(path)
     for p in pages:
         cw[p].content
Example #4
 def build(cls):
     ix = index.create_in(WHOOSH_WIKI_INDEX_PATH, cls.schema)
     writer = ix.writer()
     cw = CachedWikipedia(QB_WIKI_LOCATION, COUNTRY_LIST_PATH)
     qdb = QuestionDatabase(QB_QUESTION_DB)
     questions = qdb.questions_with_pages()
     pages = list(qdb.get_all_pages(exclude_test=True))
     print("Building whoosh wiki index from {0} pages".format(len(pages)))
     bar = progressbar.ProgressBar()
     for p in bar(pages):
         writer.add_document(page=p, content=cw[p].content)
     writer.commit()
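
build takes cls and reads cls.schema, so it presumably lives on an index wrapper class as a classmethod. A sketch of that assumed context; the class name and schema fields are guesses inferred from the writer.add_document(page=..., content=...) call above:

from whoosh.fields import Schema, ID, TEXT

class WhooshWikiIndex:                          # hypothetical name
    # Fields inferred from the add_document call; the real schema may differ
    schema = Schema(page=ID(unique=True, stored=True), content=TEXT)

    @classmethod
    def build(cls):
        ...                                     # body as defined above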
Example #5
def wikify(output_directory):
    database = QuestionDatabase(QB_QUESTION_DB)
    pages = database.questions_with_pages()

    total = 0
    for p in pages:
        if len(pages[p]) >= MIN_APPEARANCES:
            print(p, len(pages[p]))
            for q in pages[p]:
                total += 1
                for sentence, word, text in q.partials():
                    sentence -= 1
                    with open("%s/%i-%i.txt" % (output_directory, q.qnum, sentence),
                              'w') as output:
                        output.write("%s\n" % unidecode(text[sentence]))
    print(total)
Example #6
def compute_stats():
    qdb = QuestionDatabase(QB_QUESTION_DB)
    ir = IrExtractor()
    questions = qdb.guess_questions()
    test_guesses = pseq(questions, partition_size=100)\
        .filter(lambda q: q.fold == 'test')\
        .map(lambda q: (q.page, ir.text_guess(q.flatten_text())))
    correct = 0
    close = 0
    total = 0
    for page, guesses in test_guesses:
        top_guess = max(guesses.items(), key=lambda x: x[1], default=None)
        if top_guess is not None and page == top_guess[0]:
            correct += 1
        elif page in guesses:
            close += 1
        total += 1
    print("Total Correct: {0}, Percent Correct: {1}".format(correct, correct / total))
    print("Total Close: {0}, Percent Close: {1}".format(close, close / total))
Example #7
def build_classifier(class_type, output: str, bigram_thresh=1000):
    questions = QuestionDatabase(QB_QUESTION_DB)
    bigram_filename = "output/classifier/%s/bigrams.pkl" % class_type
    if os.path.exists(bigram_filename):
        bgset = pickle.load(open(bigram_filename, 'rb'))
        print("Using previous bigrams")
    else:
        print("computing bigrams...")
        bgset = compute_frequent_bigrams(bigram_thresh, questions)
        write_bigrams(bgset, bigram_filename)

    train_classifier(output, bgset, questions, class_type)
    evaluate(output, bgset, questions, class_type)
Example #8
def create_guesses(guess_db_path, processes=cpu_count()):
    q_db = QuestionDatabase(QB_QUESTION_DB)
    guess_list = GuessList(guess_db_path)

    deep_feature = instantiate_feature('deep', q_db)
    questions = q_db.guess_questions()
    tasks = []
    for q in questions:
        tasks.append((q, deep_feature))

    with Pool(processes=processes) as pool:
        question_guesses = pool.imap(parallel_generate_guesses, tasks)
        i, n = 0, len(tasks)
        log.info("Guess generation starting for {0} questions".format(n))
        for qnum, fold, guesses in question_guesses:
            guess_list.save_guesses('deep', qnum, fold, guesses)
            log.info("Progress: {0} / {1} questions completed".format(i, n))
            i += 1

    log.info("Guess generation completed, generating indices")
    guess_list.create_indexes()
    log.info("Guess generation done")
Example #9
def spark_batch(sc: SparkContext,
                feature_names,
                question_db: str,
                guess_db: str,
                granularity='sentence'):
    sql_context = SQLContext(sc)
    question_db = QuestionDatabase(question_db)

    log.info("Loading Questions")
    questions = question_db.guess_questions()

    log.info("Loading Guesses")
    guess_list = GuessList(guess_db)
    guess_lookup = guess_list.all_guesses(allow_train=True)

    log.info("Loading tasks")
    tasks = [Task(q, guess_lookup[q.qnum]) for q in questions]
    shuffle(tasks)
    log.info("Number of tasks: {0}".format(len(tasks)))

    features = {
        name: instantiate_feature(name, question_db)
        for name in feature_names
    }

    b_features = sc.broadcast(features)

    def f_eval(x):
        return evaluate_feature_question(x, b_features, granularity)

    log.info("Beginning feature job")
    feature_rdd = sc.parallelize(tasks)\
        .repartition(150 * len(feature_names))\
        .flatMap(f_eval)

    feature_df = sql_context.createDataFrame(feature_rdd, SCHEMA).cache()
    feature_df.count()
    log.info("Beginning write job")
    for fold in FOLDS:
        feature_df_with_fold = feature_df.filter(
            'fold = "{0}"'.format(fold)).cache()
        for name in feature_names:
            filename = 'output/features/{0}/sentence.{1}.parquet'.format(
                fold, name)
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            feature_df_with_fold.filter('feature_name = "{0}"'.format(name))\
                .write.save(filename, mode='overwrite')
        feature_df_with_fold.unpersist()
    log.info("Computation Completed, stopping Spark")
    sc.stop()
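
A hedged driver sketch for spark_batch; create_sc and FEATURE_NAMES appear in Example #10, and the database paths are the defaults used elsewhere in these examples:

# Hypothetical driver; assumes the qanta constants and create_sc from Example #10.
sc = create_sc()
spark_batch(sc,
            feature_names=FEATURE_NAMES,
            question_db='data/questions.db',
            guess_db='data/guesses.db',
            granularity='sentence')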
Example #10
def start_spark_streaming():
    question_db = QuestionDatabase(QB_QUESTION_DB)
    features = {name: instantiate_feature(name, question_db) for name in FEATURE_NAMES}

    sc = create_sc()
    b_features = sc.broadcast(features)
    ssc = StreamingContext(sc, 5)

    ssc.socketTextStream('localhost', 9999) \
        .repartition(QB_STREAMING_CORES - 1) \
        .flatMap(lambda line: generate_guesses(line, b_features)) \
        .map(lambda sg: evaluate_features(sg, b_features)) \
        .foreachRDD(score_and_save)

    ssc.start()
    ssc.awaitTermination()
    sc.stop()
Example #11
def instantiate_feature(feature_name: str, question_db: QuestionDatabase):
    """
    @param feature_name: The feature to instantiate
    @param question_db: question database
    """

    feature = None
    print("Loading feature %s ..." % feature_name)
    if feature_name == "lm":
        feature = LanguageModel(data_path(CLM_PATH))
    elif feature_name == "deep":
        page_dict = {}
        for page in question_db.get_all_pages():
            page_dict[page.lower().replace(' ', '_')] = page
        feature = DeepExtractor(C.DEEP_DAN_CLASSIFIER_TARGET,
                                C.DEEP_DAN_PARAMS_TARGET, C.DEEP_VOCAB_TARGET,
                                "data/internal/common/ners", page_dict)
    elif feature_name == "wikilinks":
        feature = WikiLinks()
    elif feature_name == "answer_present":
        feature = AnswerPresent()
    elif feature_name == "label":
        feature = Labeler(question_db)
    elif feature_name == "classifier":
        # TODO: Change this to depend on any given bigrams.pkl, which are atm all the same
        feature = Classifier(question_db)
    elif feature_name == "mentions":
        answers = set(x for x, y in text_iterator(False,
                                                  "",
                                                  False,
                                                  question_db,
                                                  False,
                                                  "",
                                                  limit=-1,
                                                  min_pages=MIN_APPEARANCES))
        feature = Mentions(answers)
    else:
        log.info("Don't know what to do with %s" % feature_name)
    log.info("done")
    return feature
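
A short usage sketch mirroring the call in Example #16; an unrecognized name falls through to the else branch and the function returns None:

# Usage sketch; QB_QUESTION_DB is the same constant used throughout these examples.
qdb = QuestionDatabase(QB_QUESTION_DB)
deep = instantiate_feature('deep', qdb)       # DeepExtractor
labeler = instantiate_feature('label', qdb)   # Labeler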
Example #12
def text_iterator(use_wiki,
                  wiki_location,
                  use_qb,
                  qb_location,
                  use_source,
                  source_location,
                  limit=-1,
                  min_pages=0,
                  country_list=COUNTRY_LIST_PATH):
    if isinstance(qb_location, str):
        qdb = QuestionDatabase(qb_location)
    else:
        qdb = qb_location
    doc_num = 0

    cw = CachedWikipedia(wiki_location, data_path(country_list))
    pages = qdb.questions_with_pages()

    for pp in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
        # This bit of code needs to line up with the logic in qdb.py
        # to have the same logic as the page_by_count function
        if len(pages[pp]) < min_pages:
            continue

        if use_qb:
            train_questions = [x for x in pages[pp] if x.fold == "train"]
            question_text = u"\n".join(u" ".join(x.raw_words())
                                       for x in train_questions)
        else:
            question_text = u''

        if use_source:
            filename = '%s/%s' % (source_location, pp)
            if os.path.isfile(filename):
                try:
                    with gzip.open(filename, 'rb') as f:
                        source_text = f.read()
                except zlib.error:
                    print("Error reading %s" % filename)
                    source_text = ''
            else:
                source_text = ''
        else:
            source_text = u''

        if use_wiki:
            wikipedia_text = cw[pp].content
        else:
            wikipedia_text = u""

        total_text = wikipedia_text
        total_text += "\n"
        total_text += question_text
        total_text += "\n"
        total_text += unidecode(str(source_text))

        yield pp, total_text
        doc_num += 1

        if 0 < limit <= doc_num:
            break
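
Example #11 calls this generator with every text source disabled, using only the page names it yields. A hedged sketch of a fuller call that pulls both Wikipedia and question text; the constants are the ones used in earlier examples:

# Sketch: iterate wiki + question text for frequent pages, capped at ten documents.
for page, text in text_iterator(True, QB_WIKI_LOCATION,
                                True, QB_QUESTION_DB,
                                False, "",
                                limit=10, min_pages=MIN_APPEARANCES):
    print(page, len(text))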
Example #13
import operator

kBAD_ANSWERS = ["", "red river", "the", "figaro", "normal", "s", "p"]

if __name__ == "__main__":
    args = argparse.ArgumentParser('Interactively assign pages to questions')
    args.add_argument('--database', type=str, default='data/questions.db',
                      help='sqlite3 database of questions')
    args.add_argument('--titles', type=str, default='data/wiki_index.pkl',
                      help='page title candidates')
    args.add_argument('--labels', type=str, default='data/map/ans_to_wiki',
                      help='write page assignment answers')
    args = args.parse_args()

    # Open up the database
    d = QuestionDatabase(args.database)
    page_diversity = d.answer_map(normalize)

    # Set up the active learner for writing assignments
    al = ActiveLearner(None, args.labels)
    existing_labels = set(x[0] for x in al.human_labeled())

    # get the candidates we want to assign to pages
    answers = d.unmatched_answers(existing_labels)
    print(list(answers.keys())[:10])

    # Open up the title finder
    tf = TitleFinder(open(args.titles))

    for ans, count in sorted(answers.items(), key=lambda x: sum(x[1].values()),
                             reverse=True):
Example #14
File: naqt.py  Project: cequencer/qb
            first, rest = ii.split('ID="', 1)
            id, rest = rest.split('" TITLE="', 1)
            title, rest = rest.split('"', 1)
            self.topics[int(id)] = title


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Import questions')
    parser.add_argument('--naqt_path', type=str)
    parser.add_argument('--db', type=str, default='data/questions.db')

    flags = parser.parse_args()

    qdb = QuestionDatabase(flags.db)
    conn = qdb._conn
    answer_map = qdb.answer_map()

    # Find existing naqt questions
    c = conn.cursor()
    command = 'SELECT naqt FROM questions WHERE naqt >= 0;'
    c.execute(command)
    existing = set(int(x[0]) for x in c)

    num_skipped = 0
    last_id = kNAQT_START
    if flags.naqt_path:
        for qq in naqt_reader(flags.naqt_path):
            if qq.answer in answer_map and len(answer_map[qq.answer]) == 1:
                page = list(answer_map[qq.answer].keys())[0]
Example #15
import time

from page_assignment.active_learning_for_matching import ActiveLearner
from qanta.util.qdb import QuestionDatabase


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="apply wikipedia pages")
    parser.add_argument("--db", default='data/questions.db', type=str,
                        help="The question database")
    parser.add_argument("--match_location", type=str,
                        default='data/map/ans_to_wiki_',
                        help="Where we read matches learned")

    flags = parser.parse_args()

    start = time.time()
    print("Loading db..")
    db = QuestionDatabase(flags.db)
    print("Loading classifier...")
    classifier = ActiveLearner(None, flags.match_location, [])

    for question, page in classifier.human_labeled():
        ans_type = ""
        db.set_answer_page(question, page, ans_type)
        print(question, page, "GIVEN", ans_type)
Example #16
def main():
    import argparse
    parser = argparse.ArgumentParser(description='')
    default_path = 'data/'
    parser.add_argument('--question_db',
                        type=str,
                        default=default_path + 'questions.db')
    parser.add_argument('--guess_db',
                        type=str,
                        default=default_path + 'guesses.db',
                        help="Guess database")
    parser.add_argument("--num_choices",
                        type=int,
                        default=4,
                        help="How many choices do we write")
    parser.add_argument("--train_out", type=str, default="sci_train.csv")
    parser.add_argument("--test_out", type=str, default="sci_test.csv")
    parser.add_argument("--key_out", type=str, default="sci_key.csv")
    flags = parser.parse_args()

    # Create database connections
    print("Opening %s" % flags.question_db)
    question_database = sqlite3.connect(flags.question_db)
    guess_database = sqlite3.connect(flags.guess_db)

    # First get answers of interest and put them in a dictionary where the value is their count
    query = 'select page from questions where page != "" and ('
    query += " or ".join("category='%s'" % x for x in CATEGORIES)
    query += ")"
    c = question_database.cursor()
    print(query)
    c.execute(query)

    answer_count = defaultdict(int)
    for pp, in c:
        answer_count[pp] += 1

    query = 'select page, id, naqt, fold from questions where page != ""'
    c = question_database.cursor()
    c.execute(query)

    print(list(x for x in answer_count if answer_count[x] >= COUNT_CUTOFF))
    print(len(list(x for x in answer_count
                   if answer_count[x] >= COUNT_CUTOFF)))

    # Load the DAN to generate guesses if they're missing from the database
    deep = instantiate_feature("deep", QuestionDatabase(flags.question_db))

    questions = {}
    question_num = 0
    for pp, ii, nn, ff in c:
        if nn >= 0 or answer_count[pp] < COUNT_CUTOFF:
            continue
        question_num += 1
        question = McScience(pp, ii, ff)
        question.add_text(question_first_sentence(question_database, ii))
        choices = question_top_guesses(question.text, deep, guess_database, ii,
                                       pp, flags.num_choices)
        question.add_choices(choices)
        questions[ii] = question
        if question_num % 100 == 0:
            print(pp, ii, question_num)
            print(choices)

    answer_choices = [
        "answer%s" % CHOICEIDS[x] for x in range(flags.num_choices)
    ]

    train_out = DictWriter(open(flags.train_out,
                                'w'), ["id", "question", "correctAnswer"] +
                           answer_choices)
    train_out.writeheader()

    test_out = DictWriter(open(flags.test_out, 'w'),
                          ["id", "question"] + answer_choices)
    test_out.writeheader()

    key_out = DictWriter(open(flags.key_out, 'w'), ["id", "correctAnswer"])
    key_out.writeheader()

    # Now write the questions out
    for qq in questions.values():
        print(qq.fold)
        if qq.fold == "devtest":
            test_out.writerow(qq.csv_line(CHOICEIDS, "test"))
            key_out.writerow(qq.csv_line(CHOICEIDS, "key"))
        else:
            train_out.writerow(qq.csv_line(CHOICEIDS, "train"))
Example #17
    args.add_argument('--database',
                      type=str,
                      default='data/questions.db',
                      help='sqlite3 database of questions')
    args.add_argument('--titles',
                      type=str,
                      default='data/wiki_index.pkl',
                      help='page title candidates')
    args.add_argument('--labels',
                      type=str,
                      default='data/map/ans_to_wiki',
                      help='write page assignment answers')
    args = args.parse_args()

    # Open up the database
    d = QuestionDatabase(args.database)
    page_diversity = d.answer_map(normalize)

    # Set up the active learner for writing assignments
    al = ActiveLearner(None, args.labels)
    existing_labels = set(x[0] for x in al.human_labeled())

    # get the candidates we want to assign to pages
    answers = d.unmatched_answers(existing_labels)
    print(list(answers.keys())[:10])

    # Open up the title finder
    tf = TitleFinder(open(args.titles))

    for ans, count in sorted(answers.items(),
                             key=lambda x: sum(x[1].values()),
Example #18
    return seen

if __name__ == "__main__":
    from util import flags

    flags.define_string("title_index", None, "Pickle of all titles")
    flags.define_string("label_path", None, "Where we write page associations")
    flags.define_string("database", None, "Question database")
    flags.define_string("performance_output", None, "Where we write user performance")
    flags.define_string("user", None, "User identifier")
    flags.InitFlags()

    seen = already_answered(flags.performance_output, flags.user)
    al = ActiveLearner(None, flags.label_path)
    print("Loading question db %s" % flags.database)
    db = QuestionDatabase(flags.database)
    pw = PerformanceWriter(flags.performance_output, flags.user)
    tf = TitleFinder(open(flags.title_index))


    questions = db.questions_by_tournament("High School Championship")
    for qid in questions:
        question = questions[qid]
        if question.fold == "train" or qid in seen:
            continue
        choices = list(tf.query(question.answer))

        # Get what and when the human answered
        wp, idx, ans = get_answer([question.text[x] for x in sorted(question.text)], question.answer, question.page)

        print("\n".join(question.text.values()))