def init(top_k, get_text):
    """Worker initializer: build the shared document DB and TF-IDF ranker as
    process-level globals, and record the retrieval parameters.

    Args:
        top_k: number of documents each retrieval call should return.
        get_text: flag/callable controlling whether document text is fetched.
    """
    global PROCESS_DB, PROCESS_RANKER, TOP_K, GET_TEXT
    PROCESS_DB = DocDB()
    # Ensure the DB connection is closed when this worker process exits.
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)
    PROCESS_RANKER = TfidfDocRanker()
    TOP_K = top_k
    GET_TEXT = get_text
def init(db_path, full_doc_db_path, ranker_path):
    """Worker initializer: set up tokenizer, document DBs, ranker and the
    title cache as process-level globals, registering exit-time cleanup.

    Args:
        db_path: path for the paragraph-level DocDB.
        full_doc_db_path: path for the full-document DocDB.
        ranker_path: path to the serialized TF-IDF ranker.
    """
    global PROCESS_TOK, PROCESS_DB, PROCESS_RANKER, PROCESS_FULL_DOC_DB, DOC_TITLES
    PROCESS_TOK = CoreNLPTokenizer()
    # Each resource gets a finalizer so worker shutdown releases it cleanly.
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
    PROCESS_DB = DocDB(db_path)
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)
    PROCESS_FULL_DOC_DB = DocDB(full_doc_db_path, full_docs=True)
    Finalize(PROCESS_FULL_DOC_DB, PROCESS_FULL_DOC_DB.close, exitpriority=100)
    PROCESS_RANKER = TfidfDocRanker(ranker_path)
    # Cache all document titles once per worker for later lookups.
    DOC_TITLES = PROCESS_FULL_DOC_DB.get_doc_titles()
def main(dataset_path, k_list_to_check, ranker_path=None, normalize_ranker=False,
         num_workers=1, filter=False, tokenizer='corenlp'):
    """Evaluate top-k retrieval coverage of a TF-IDF ranker on a dataset.

    Retrieval is performed once for max(k_list_to_check) and the results are
    re-scored for each requested k; overall and per-category tables are printed.

    Args:
        dataset_path: path to the dataset to load.
        k_list_to_check: iterable of k values to score.
        ranker_path: optional path to a pre-built TF-IDF ranker.
        normalize_ranker: whether to normalize the ranker's vectors.
        num_workers: number of worker processes used for retrieval.
        filter: if True, drop questions containing no '?'.
            (Shadows the builtin; name kept for interface compatibility.)
        tokenizer: tokenizer name passed to the ranker.
    """
    dataset = load_dataset(dataset_path)
    if filter:
        print("Filtering questions without question mark.")
        old_len = len(dataset)
        dataset = [x for x in dataset if '?' in x['question']]
        # Bug fix: report old - new (the original printed a negative count).
        print(f"Removed {old_len - len(dataset)} questions.")
    ranker = TfidfDocRanker(tfidf_path=ranker_path, normalize_vectors=normalize_ranker,
                            tokenizer=tokenizer)
    gold_dict = build_gold_dict(dataset)
    regular_table = prettytable.PrettyTable(
        ['Top K', 'Hits', 'Perfect Questions', 'At Least One'])
    cat_table_dict = {
        cat: prettytable.PrettyTable(
            ['Top K', 'Hits', 'Perfect Questions', 'At Least One'])
        for cat in CATEGORIES
    }
    max_k = max(k_list_to_check)
    print(f"Retrieving top {max_k} ...")
    start = time.time()
    result_dict = get_top_k(dataset, ranker, max_k, num_workers)
    # Bug fix: time.time() differences are seconds, not milliseconds.
    print(f"Done, took {time.time()-start} seconds.")
    for k in k_list_to_check:
        print(f"Calculating scores for top {k}...")
        start = time.time()
        scores, category_scores = top_k_coverage_score(gold_dict, result_dict, k)
        print(f"Done, took {time.time()-start} seconds.")
        regular_table.add_row([
            k, scores['Hits'], scores['Perfect Questions'], scores['At Least One']
        ])
        for cat in cat_table_dict:
            cat_table_dict[cat].add_row([
                k, category_scores[cat]['Hits'],
                category_scores[cat]['Perfect Questions'],
                category_scores[cat]['At Least One']
            ])
    print("Overall Results:")
    print(regular_table)
    for cat, table in cat_table_dict.items():
        print('\n**********************************************\n')
        print(f"Category: {cat} Results:")
        print(table)
def main():
    """Evaluate TF-IDF scoring on full SQuAD.

    Ranks each question's gold paragraph among its distractors and reports
    mean rank and precision@1. With --per-doc, delegates to
    main_for_document instead.
    """
    parser = argparse.ArgumentParser(description='Evaluate tf-idf scoring on full squad.')
    parser.add_argument('--ranker', action='store_true',
                        help='Whether to use bi-gram hashing or not')
    parser.add_argument('--per-doc', action='store_true')
    parser.add_argument('--num-workers', type=int, default=1)
    args = parser.parse_args()
    ranker = None
    if args.ranker:
        print("Loading ranker...")
        ranker = TfidfDocRanker()
    if args.per_doc:
        return main_for_document(ranker, args.num_workers)
    print("Loading data...")
    corpus = SquadRelevanceCorpus()
    questions = corpus.get_dev()
    question_preprocessor = SquadTextLengthPreprocessor(600)
    # Preprocess each question exactly once (the original called preprocess
    # twice per item: once in the condition and once for the result).
    preprocessed = (question_preprocessor.preprocess(x) for x in questions)
    questions = [q for q in preprocessed if q is not None]
    if args.num_workers <= 1:
        if args.ranker:
            init()
        gold_ranks = [get_rank_in_distractors(q) for q in tqdm(questions)]
    else:
        # Worker pool; each worker builds its own ranker via init() if needed.
        workers = ProcessPool(
            args.num_workers,
            initializer=init if args.ranker else None,
            initargs=[]
        )
        gold_ranks = []
        # Single progress bar — the original also wrapped the iterator in a
        # second tqdm, producing two nested/duplicate bars.
        with tqdm(total=len(questions)) as pbar:
            for rank in workers.imap_unordered(get_rank_in_distractors, questions):
                gold_ranks.append(rank)
                pbar.update()
        # Release the pool once all results are in.
        workers.close()
        workers.join()
    mean_rank = np.mean(gold_ranks)
    precision_at_1 = Counter(gold_ranks)[1] / len(gold_ranks)
    print(f"Mean Rank: {mean_rank}")
    print(f"Precision @ 1: {precision_at_1}")
def init():
    """Worker initializer: create a process-local TfidfDocRanker global so
    each multiprocessing worker has its own ranker instance."""
    global PROCESS_RANKER
    PROCESS_RANKER = TfidfDocRanker()
from hotpot.tfidf_retriever.tfidf_doc_ranker import TfidfDocRanker

# Root-logger setup: INFO level with a timestamped console handler.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
fmt = logging.Formatter('%(asctime)s: [ %(message)s ]', '%m/%d/%Y %I:%M:%S %p')
console = logging.StreamHandler()
console.setFormatter(fmt)
logger.addHandler(console)

parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default=None)
parser.add_argument('--normalize', action='store_true')
args = parser.parse_args()

logger.info('Initializing ranker...')
ranker = TfidfDocRanker(tfidf_path=args.model, normalize_vectors=args.normalize)

# ------------------------------------------------------------------------------
# Drop in to interactive
# ------------------------------------------------------------------------------


def process(query, k=1):
    """Retrieve the k closest documents for *query* and print a ranked table
    of document ids and TF-IDF scores."""
    doc_names, doc_scores = ranker.closest_docs(query, k)
    table = prettytable.PrettyTable(['Rank', 'Doc Id', 'Doc Score'])
    for i in range(len(doc_names)):
        table.add_row([i + 1, doc_names[i], '%.5g' % doc_scores[i]])
    print(table)


# NOTE(review): this string literal continues past the visible chunk.
banner = """
def init(top_k):
    """Worker initializer: open the DrQA document DB and TF-IDF ranker as
    process-level globals and remember how many documents to retrieve.

    Args:
        top_k: number of documents each retrieval call should return.
    """
    global PROCESS_DB, PROCESS_RANKER, TOP_K
    PROCESS_DB = OldDocbDB(DRQA_DOC_DB)
    # Close the DB connection when this worker process shuts down.
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)
    PROCESS_RANKER = TfidfDocRanker(DRQA_RANKER, strict=False, tokenizer='simple')
    TOP_K = top_k
# Module-level tokenizer shared by the helper functions below; shut down
# cleanly at interpreter exit.
PROCESS_TOK = CoreNLPTokenizer()
Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)


def tokenize_words(text):
    """Tokenize *text* with the shared CoreNLP tokenizer and return its words."""
    global PROCESS_TOK
    return PROCESS_TOK.tokenize(text).words()


def tokenize_sentences(sentences):
    """Tokenize each sentence; empty strings map to empty token lists
    (the tokenizer is not called on '')."""
    global PROCESS_TOK
    return [PROCESS_TOK.tokenize(s).words() if s != '' else [] for s in sentences]


print("Loading TF-IDF...")
tfidf_ranker = TfidfDocRanker()
db = DocDB()
loader = ResourceLoader()
# loader = HotpotQuestions().get_resource_loader()
# Merge body-text and title word counts into a single vocabulary.
word_counts = load_counts(join(LOCAL_DATA_DIR, 'hotpot', 'wiki_word_counts.txt'))
title_counts = load_counts(join(LOCAL_DATA_DIR, 'hotpot', 'wiki_title_word_counts.txt'))
word_counts.update(title_counts)
voc = set(word_counts.keys())
print("Loading encoder...")
spec = QuestionAndParagraphsSpec(batch_size=None, max_num_contexts=2,
                                 max_num_question_words=None, max_num_context_words=None)
# NOTE(review): this constructor call continues past the visible chunk.
encoder = SentenceEncoderIterativeModel(model_dir_path=args.encoder_model, vocabulary=voc,
                                        spec=spec, loader=loader, use_char_inputs=False,
def modified_main(dataset_path, k_list_to_check, ranker_path=None, normalize_ranker=False,
                  num_workers=1, tokenizer='corenlp', docdb_path=None, out=None):
    """Compare second-paragraph retrieval hits for regular vs. modified queries.

    Retrieval is performed once for max(k_list_to_check) with both the regular
    and the modified (ranked-gold) queries, then re-scored per k. Results are
    printed, or written to *out* when given.

    Args:
        dataset_path: path to the dataset to load.
        k_list_to_check: iterable of k values to score.
        ranker_path: optional path to a pre-built TF-IDF ranker.
        normalize_ranker: whether to normalize the ranker's vectors.
        num_workers: number of worker processes used for retrieval.
        tokenizer: tokenizer name passed to the ranker.
        docdb_path: path for the DocDB used to build modified queries.
        out: optional output file path; printed to stdout when None.
    """
    dataset = load_dataset(dataset_path)
    ranker = TfidfDocRanker(tfidf_path=ranker_path, normalize_vectors=normalize_ranker,
                            tokenizer=tokenizer)
    docdb = DocDB(docdb_path)
    print("Building modified queries...")
    ranked_gold_dict = build_ranked_golds(dataset, docdb=docdb, ranker=ranker)
    regular_table = prettytable.PrettyTable([
        'Top K', 'Second Paragraph Hits', 'Second Paragraph Hits Modified Query'
    ])
    cat_table_dict = {
        cat: prettytable.PrettyTable([
            'Top K', 'Second Paragraph Hits', 'Second Paragraph Hits Modified Query'
        ])
        for cat in CATEGORIES
    }
    max_k = max(k_list_to_check)
    print(f"Retrieving top {max_k} ...")
    start = time.time()
    reg_result_dict, ranked_result_dict = get_ranked_top_k(
        dataset, ranked_gold_dict, ranker, max_k, num_workers)
    # Bug fix: time.time() differences are seconds, not milliseconds.
    print(f"Done, took {time.time()-start} seconds.")
    for k in k_list_to_check:
        print(f"Calculating scores for top {k}...")
        start = time.time()
        reg_scores, reg_category_scores = modified_top_k_coverage_score(
            ranked_gold_dict, reg_result_dict, k)
        mod_scores, mod_category_scores = modified_top_k_coverage_score(
            ranked_gold_dict, ranked_result_dict, k)
        print(f"Done, took {time.time()-start} seconds.")
        regular_table.add_row([
            k, reg_scores['Second Paragraph Hits'],
            mod_scores['Second Paragraph Hits']
        ])
        for cat in cat_table_dict:
            cat_table_dict[cat].add_row([
                k, reg_category_scores[cat]['Second Paragraph Hits'],
                mod_category_scores[cat]['Second Paragraph Hits']
            ])
    # Build the report once, then route it to stdout or the output file.
    output_str = 'Overall Results:\n'
    output_str += str(regular_table) + '\n'
    for cat, table in cat_table_dict.items():
        output_str += '\n**********************************************\n'
        output_str += f"Category: {cat} Results:\n"
        output_str += str(table) + '\n'
    if out is None:
        print(output_str)
    else:
        with open(out, 'w') as f:
            f.write(output_str)