Пример #1
0
def _extract_tokens(row: List[str], index: Index, question_text: str, doc_iid: int) -> Tuple[List[int], List[int]]:
    """Collect the token ids of a document and of a question and record both in ``row``.

    :param row: Output row. It is mutated in place: the JSON-encoded query token ids are appended first, followed by
        the JSON-encoded document token ids.
    :param index: Index object providing ``get_document_by_int_id``, ``tokenize`` and ``token2id``.
    :param question_text: Question text, tokenized with ``index.tokenize``.
    :param doc_iid: Integer id of the document, passed to ``index.get_document_by_int_id``.
    :return: The pair ``(doc_tokens, query_tokens)``. Note that the document ids come first here, which is the
        opposite of the order in which they are appended to ``row``.
    """
    document_ids = [*index.get_document_by_int_id(doc_iid)]

    # Tokens that are missing from ``index.token2id`` are mapped to id 0.
    question_ids = []
    for word in index.tokenize(question_text):
        question_ids.append(index.token2id.get(word, 0))

    serialized = [json.dumps(question_ids), json.dumps(document_ids)]
    row.extend(serialized)

    return document_ids, question_ids
Пример #2
0
def build():
    """Build the document id maps and pickle them together with the maps held by the index.

    Loads the ``'default'`` index into the module-level ``INDEX``, processes all document int ids in chunks of
    ``CHUNK_SIZE`` through ``_process_batch`` (via ``parallel.execute``) and merges the per-batch results into two
    dictionaries. The index maps (``token2id``, ``id2token``, ``id2df``, ``id2tf``) and the two merged maps are then
    pickled to their respective paths, each file being written exactly once.
    """
    global INDEX
    INDEX = Index(env='default')
    batches = parallel.chunk(CHUNK_SIZE, INDEX.document_int_ids())
    helpers.log(f'Building maps for {INDEX.count()} documents.')

    int2wid = {}
    wid2int = {}
    # Every batch yields a pair of partial maps; merge them into the global ones.
    for batch_int2wid, batch_wid2int in parallel.execute(_process_batch, batches):
        int2wid.update(batch_int2wid)
        wid2int.update(batch_wid2int)

    # (destination path, object to pickle) -- one entry per output file.
    dumps = (
        (TOKEN2ID, INDEX.token2id),
        (ID2TOKEN, INDEX.id2token),
        (ID2DF, INDEX.id2df),
        (ID2TF, INDEX.id2tf),
        (INT2WID, int2wid),
        (WID2INT, wid2int),
    )
    for path, mapping in dumps:
        with open(path, 'wb') as file:
            pickle.dump(mapping, file)

    helpers.log(f'Finished building maps. Mapped {len(int2wid)}/{INDEX.index.document_count()}')
Пример #3
0
def accuracies(database: str, table: str):
    """Compute accuracy over a dataset given a retrieval database.
    Statistics are split over question types and levels.

    Questions for which ``_fetch_prediction`` returns ``None`` are skipped. All resulting accuracies are logged
    through ``logging.info``.

    :param database: Path of the sqlite database holding the pickled predictions.
    :param table: Name of the table to read predictions from (forwarded to ``_fetch_prediction``).
    :return: A dictionary with the keys ``full``, ``half``, ``full_hard``, ``full_medium``, ``full_easy``,
        ``full_comparison`` and ``full_bridge``.
    """
    global INDEX
    dataset = Dataset.from_file(TRAIN_HOTPOT_SET)
    INDEX = Index()

    # shape: (question type [comparison, bridge], level [hard, medium, easy], number of found gold articles [0, 1, 2])
    hits = np.zeros((2, 3, 3))

    # ``with connection`` only commits/rolls back the transaction; it does not close the connection, so close it
    # explicitly once all predictions have been read.
    conn = sqlite3.connect(database)
    try:
        with conn:
            for question in tqdm(dataset, unit='questions'):
                target = _extract_target(question)
                prediction = _fetch_prediction(table, conn, question)
                if prediction is None:
                    continue

                (prediction, ) = prediction
                # NOTE(security): unpickling executes arbitrary code; only use databases from a trusted source.
                prediction = pickle.loads(prediction)
                _update_hits(prediction, target, question, hits)
    finally:
        conn.close()

    full, half, full_hard, full_medium, full_easy, full_comparison, full_bridge = _accuracies(hits)

    # Entries are logged in order; ``None`` marks a separator line.
    report = (
        ('Full Accuracy', full),
        ('Half Accuracy', half),
        None,
        ('Easy Question Full Accuracy', full_easy),
        ('Medium Question Full Accuracy', full_medium),
        ('Hard Question Full Accuracy', full_hard),
        None,
        ('Comparison Question Full Accuracy', full_comparison),
        ('Bridge Question Full Accuracy', full_bridge),
    )
    for entry in report:
        if entry is None:
            logging.info(f'[{"-" * 10}]')
            continue
        label, value = entry
        logging.info(f'[{datetime.now()}]\t[{label}: {round(value, 4)}]')

    return {
        'full': full,
        'half': half,
        'full_hard': full_hard,
        'full_medium': full_medium,
        'full_easy': full_easy,
        'full_comparison': full_comparison,
        'full_bridge': full_bridge
    }
Пример #4
0
def _bigram_unigram_5000(query: str,
                         index: Index,
                         n: int = 5000) -> List[Tuple[int, float]]:
    """ Retrieves the at most n candidates from the full set of articles based on query-document pair bigram/unigram
    matches. Uses pre-built inverted index. Assumed to be equivalent to Algorithm 2, Appendix C of HotpotQA paper.
    Possible mismatches:
        -- unigram/bigrams counts in our case are considered only over first at most 500 characters of the article. Not
        clear if they use full article or not.
        -- implementation does not follow algorithm exactly since that seems very inefficient. We made it better, but
        maybe some edge-cases result in different results.

    If no more than n documents match at all, every matching document is returned. Otherwise all documents whose
    overlap count does not exceed the count of the (n+1)-th best document are dropped, so ties at the cut-off are
    removed together and the result never exceeds n entries.

    :param query: A string of words to match.
    :param index: The prebuilt inverted index.
    :param n: The control threshold
    :return: A list of at most n ``(document id, overlap count)`` candidates, best first.
    """

    # tokenize, stem, filter stopwords (all delegated to the index) and collect unigrams and bigrams
    tokens = list(index.tokenize(query))
    query_unigrams = set(tokens)
    # consecutive token pairs
    query_bigrams = set(zip(tokens, tokens[1:]))

    # count the overlapping n-gram for each query-document pair
    overlap_counts = Counter()
    for first, second in query_bigrams:
        for (doc_id, _) in index.bigram_query(first, second, request=10000):
            overlap_counts[doc_id] += 1
    for unigram in query_unigrams:
        for (doc_id, _) in index.unigram_query(unigram, request=10000):
            overlap_counts[doc_id] += 1

    # Ask for one document more than allowed: the extra element tells us whether a cut-off is needed at all.
    most_common = overlap_counts.most_common(n + 1)
    if not most_common or len(most_common) <= n:
        # Nothing exceeds the threshold n, so there is nothing to filter out.
        return most_common

    # More than n matches: drop everything tied with the (n+1)-th document's count.
    cutoff = most_common[-1][1]
    return [candidate for candidate in most_common if candidate[1] > cutoff]
Пример #5
0
def process():
    """Filter the collection of 5 million document to just the top 5000 at most according to bigram/unigram
    filtering per question. Processed in parallel.

    Loads the ``'tfidf'`` index into the module-level ``INDEX``, creates ``DIR_NAME`` (``os.makedirs`` is called
    without ``exist_ok``, so an already existing directory raises ``FileExistsError``), creates the retrieval
    database and sums up the per-batch results of ``_process_question_batch``. Timings are reported through
    ``helpers.log``.
    """
    process_started = datetime.now()

    global INDEX
    INDEX = Index(env='tfidf')
    os.makedirs(DIR_NAME)
    # The middle element (number of batches) is not needed here.
    batches, _, no_queries = retrieve.load_dataset_batches()
    retrieve.create_retrieval_db(DB_NAME)

    helpers.log(f'Retrieving documents. Workers: {os.cpu_count()}')
    retrieval_started = datetime.now()
    total_retrieved = sum(parallel.execute(_process_question_batch, batches))
    retrieval_finished = datetime.now()
    elapsed = retrieval_finished - retrieval_started
    helpers.log(f'Finished retrieval in {elapsed}. Filtered {total_retrieved}/{no_queries}')

    process_finished = datetime.now()
    helpers.log(f'Finished process in {process_finished - process_started}.')
Пример #6
0
def build(skip_relevant: bool = True):
    """Build the candidate sets for the enabled dataset splits.

    Loads the ``'tfidf'`` index into the module-level ``INDEX`` and stores ``skip_relevant`` in ``SKIP_RELEVANT``.
    For every enabled split the candidate table is created in the split's database, the per-question counts found
    there are stored in ``QUESTION_COUNTS`` (keyed by the JSON-decoded question id), and the questions are then
    processed in chunks by ``_build_candidates`` via ``parallel.execute``.

    :param skip_relevant: Stored in the module-level ``SKIP_RELEVANT`` flag; it is not used directly here.
    """
    global INDEX, COLUMNS, QUESTION_COUNTS
    global SKIP_RELEVANT
    SKIP_RELEVANT = skip_relevant
    INDEX = Index('tfidf')
    helpers.log('Loaded index.')

    os.makedirs(ct.CANDIDATES_DIR, exist_ok=True)
    with open(ct.TRAIN_HOTPOT_SET, 'r') as file:
        question_set = json.load(file)
    # The train/dev splits are only referenced by the currently disabled entries of ``iterator`` below.
    train_question_set = question_set[:ct.TRAIN_DEV_SPLIT]
    dev_question_set = question_set[ct.TRAIN_DEV_SPLIT:]
    with open(ct.DEV_HOTPOT_SET, 'r') as file:
        test_question_set = json.load(file)

    # (questions, split name, candidate database path, chunk size)
    iterator: List[Tuple[list, str, str, int]] = [
        # (train_question_set, 'train', ct.TRAIN_CANDIDATES_DB, ct.CANDIDATES_CHUNK),
        # (dev_question_set, 'dev', ct.DEV_CANDIDATES_DB, ct.CANDIDATES_CHUNK),
        (test_question_set, 'test', ct.TEST_CANDIDATES_DB, ct.CANDIDATES_CHUNK)
    ]

    for (_set, split, candidate_db_path, chunk) in iterator:
        start = datetime.now()

        db = sqlite3.connect(candidate_db_path)
        try:
            cursor = db.cursor()
            try:
                cursor.execute(sql.create_candidate_table)
                db.commit()
                helpers.log('Created candidates table.')

                question_rows = cursor.execute(sql.count_question_rows).fetchall()
                QUESTION_COUNTS = {json.loads(_id): _count for (_id, _count) in question_rows}
                helpers.log(f'Retrieved question counts for {len(QUESTION_COUNTS)} questions.')
            finally:
                cursor.close()
        finally:
            # Always release the connection, even if table creation or the count query failed.
            db.close()

        helpers.log(f'Creating {split} candidate set with {len(_set)} question.')
        total_count = 0
        # Every question is paired with the name of its split before chunking.
        _set_generator = parallel.chunk(chunk, zip([split] * len(_set), _set))
        for batch_count in parallel.execute(_build_candidates, _set_generator):
            total_count += batch_count
        helpers.log(f'Created {split} candidate set with {total_count} questions in {datetime.now() - start}')
Пример #7
0
def build():
    """Build the feature tables for the enabled candidate databases.

    Loads the ``'tfidf'`` index into ``INDEX``, instantiates the feature extractors selected in
    ``constants.FEATURE_EXTRACTORS`` into ``EXTRACTORS`` and derives the column names into ``COLUMNS``. For every
    enabled (candidate db, feature db, chunk size) triple the features table is created, the build resumes after the
    highest id already present in that table, and the remaining candidate ids are processed by ``_build_candidates``
    in grand batches of ``constants.GRAND_CHUNK`` chunks.
    """
    assert constants.TRAIN_FEATURES_CHUNK > 1
    assert constants.DEV_FEATURES_CHUNK > 1

    global INDEX
    INDEX = Index('tfidf')
    helpers.log('Loaded index.')

    global EXTRACTORS
    EXTRACTORS = []
    # (configuration key, factory) in the order in which the extractors contribute feature columns.
    extractor_factories = (
        ('entity', lambda: EntityExtractor(INDEX)),
        ('ibm1', lambda: IBM1FeatureExtractor(normalized=False)),
        ('nibm1', lambda: IBM1FeatureExtractor(normalized=True)),
        ('bigram', lambda: BigramOverlapFeatureExtractor(normalized=False)),
        ('nbigram', lambda: BigramOverlapFeatureExtractor(normalized=True)),
        ('qword', lambda: QuestionWordFeatureExtractor()),
        ('doclen', lambda: DocumentLengthFeatureExtractor()),
    )
    for key, factory in extractor_factories:
        if key in constants.FEATURE_EXTRACTORS:
            EXTRACTORS.append(factory())
    helpers.log('Loaded extractors.')

    global COLUMNS
    COLUMNS = copy.copy(constants.FEATURE_BASE_COLUMN_NAMES)
    COLUMNS.extend(feature for extractor in EXTRACTORS for feature in extractor.feature_name)
    COLUMNS.append(constants.FEATURE_TARGET_COLUMN_NAME)
    helpers.log('Loaded column names.')

    os.makedirs(constants.FEATURES_DIR, exist_ok=True)
    # (candidate database path, feature database path, chunk size)
    iterator: List[Tuple[str, str, int]] = [
        # (constants.TRAIN_CANDIDATES_DB, constants.TRAIN_FEATURES_DB, constants.TRAIN_FEATURES_CHUNK),
        # (constants.DEV_CANDIDATES_DB, constants.DEV_FEATURES_DB, constants.DEV_FEATURES_CHUNK),
        (constants.TEST_CANDIDATES_DB, constants.TEST_FEATURES_DB, constants.TEST_FEATURES_CHUNK)
    ]

    for (candidate_db_path, feature_db_path, chunk) in iterator:
        start_time = datetime.now()
        # The split name is the second dot-separated part of the database file name.
        _set = candidate_db_path.split("/")[-1].split(".")[1]

        feature_db = sqlite3.connect(feature_db_path)
        try:
            cursor = feature_db.cursor()
            try:
                # NOTE(review): this retries forever and without delay on any exception; a permanent failure
                # (e.g. invalid SQL) would spin here -- confirm whether a bounded retry is wanted.
                done = False
                while not done:
                    try:
                        cursor.execute(sql.create_features_table(COLUMNS))
                        feature_db.commit()
                        done = True
                    except Exception as e:
                        helpers.log(e)
                helpers.log(f'Created {_set} features table.')

                (start,) = cursor.execute('SELECT MAX(id) FROM features').fetchone()
                # Highest id already stored in the features table; 0 when the table is still empty.
                start = start if start is not None else 0
            finally:
                cursor.close()
        finally:
            feature_db.close()
        helpers.log(f'Starting feature build at {start}.')

        candidate_db = sqlite3.connect(candidate_db_path)
        try:
            cursor = candidate_db.cursor()
            try:
                # The row count is used as the last id -- presumably candidate ids are contiguous and start at 1;
                # TODO confirm against the candidates table definition.
                (stop,) = cursor.execute('SELECT COUNT(*) FROM candidates').fetchone()
            finally:
                cursor.close()
        finally:
            candidate_db.close()
        id_range = range(start + 1, stop + 1)
        helpers.log(f'Retrieved {len(id_range)} candidate indices for {_set} set.')

        total_count = 0
        # Pair every candidate id with the split name, chunk the pairs, then group the chunks into grand batches.
        _set_generator = parallel.chunk(chunk, zip([_set] * len(id_range), id_range))
        _batched_set_generator = parallel.chunk(constants.GRAND_CHUNK, _set_generator)
        for grand_batch_idx, _batch_set in _batched_set_generator:
            grand_batch_count = 0
            for batch_count in parallel.execute(_build_candidates, _batch_set, _as='process'):
                grand_batch_count += batch_count
                total_count += batch_count
            helpers.log(f'Processed {_set} batch of features no {grand_batch_idx} with {grand_batch_count} pairs.')
        helpers.log(f'Created {_set} features set with {total_count} pairs in {datetime.now() - start_time}')