def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()[:args.num_docs]
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    unigrams, bigrams = [], []
    hash2gram = {}
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for (b_row, b_col, b_data, b_unigrams,
             b_bigrams, b_hash2gram) in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
            unigrams.extend(b_unigrams)
            bigrams.extend(b_bigrams)
            hash2gram.update(b_hash2gram)
    workers.close()
    workers.join()

    # Deduplicate the collected n-grams
    unigrams = list(set(unigrams))
    bigrams = list(set(bigrams))

    logger.info('Creating sparse matrix...')
    count_matrix = None
    if args.matrix_type == 'csr':
        count_matrix = sp.csr_matrix((data, (row, col)),
                                     shape=(args.hash_size, len(doc_ids)))
        count_matrix.sum_duplicates()
    elif args.matrix_type == 'csc':
        count_matrix = sp.csc_matrix((data, (row, col)),
                                     shape=(args.hash_size, len(doc_ids)))
        count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids), (unigrams, bigrams, hash2gram)
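# Minimal sketch (not part of the pipeline above) of how the COO triplets
# collected by the workers become the inverted index: hashed word ids are the
# rows, document indexes are the columns, and sum_duplicates() merges repeated
# (word, doc) pairs into counts. All names and values below are illustrative.
def _count_matrix_sketch():
    import scipy.sparse as sp

    _row = [3, 3, 7]      # hashed word ids (word 3 appears twice in doc 0)
    _col = [0, 0, 1]      # document indexes from DOC2IDX
    _data = [1, 1, 1]     # one entry per occurrence
    _m = sp.csr_matrix((_data, (_row, _col)), shape=(16, 2))
    _m.sum_duplicates()
    assert _m[3, 0] == 2  # M[i, j] = # times word i appears in document j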
def read_docs_from_db(args, db, db_opts):
    """Retrieve docs from the sqlite db."""
    logger.info('Retrieving docs from db...')
    data = {}
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        titles = doc_db.get_doc_ids()
        # Limit the number of docs when testing
        if args.num_docs > 0:
            titles = titles[:args.num_docs]
        for title in titles:
            data[title] = doc_db.get_doc_text(title)
    return data
def build_simhash(args, source='db'):
    """Compute a (title, simhash) fingerprint pair for every document."""
    title2text = {}
    titles = {}
    # Retrieve docs from db
    if source == 'db':
        title2text = read_docs_from_db(args, args.doc_db, args.db_opts)
    # Retrieve docs from json
    elif source == 'json':
        title2text = read_drqa_format_dataset_as_dict(args.json_path)
    titles = list(title2text.keys())
    # Limit the number of docs when testing code
    if args.num_docs > 0:
        titles = titles[:args.num_docs]
        title2text = {title: title2text[title] for title in titles}

    logger.info('Mapping...')
    title2hash = []
    tok_class = tokenizers.get_class(args.tokenizer)
    # Multiprocessing
    if args.work_type == 'multi':
        # Setup worker pool
        workers = ProcessPool(
            args.num_workers,
            initializer=init,
            initargs=(tok_class, retriever.get_class(args.doc_db),
                      {'db_path': args.doc_db}))
        step = max(int(len(title2text) / 10), 1)
        batches = [titles[i:i + step] for i in range(0, len(titles), step)]
        _convert = partial(title2text_dic_2_title2hash_dic, title2text)
        # Map doc text to simhash using multiple processes
        for i, batch in enumerate(batches):
            logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
            for title, simhash in workers.imap_unordered(_convert, batch):
                title2hash.append((title, simhash))
        workers.close()
        workers.join()
    # Single process
    elif args.work_type == 'single':
        with tqdm(total=len(title2text)) as pbar:
            for (k, v) in title2text.items():
                title2hash.append(
                    title2text_dic_2_title2hash_dic(title2text, k))
                pbar.update()
    return title2hash
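# Hedged sketch of how the (title, simhash) pairs returned by build_simhash
# could be used for near-duplicate detection, assuming each fingerprint is a
# plain 64-bit integer. If title2text_dic_2_title2hash_dic returns a
# library-specific object instead, its own distance method should be used;
# the helper names and the threshold below are illustrative.
def hamming_distance(fp1, fp2):
    """Number of differing bits between two integer fingerprints."""
    return bin(fp1 ^ fp2).count('1')


def near_duplicate_pairs(title2hash, max_distance=3):
    """Return title pairs whose fingerprints differ in at most max_distance bits."""
    pairs = []
    for i in range(len(title2hash)):
        for j in range(i + 1, len(title2hash)):
            (t1, h1), (t2, h2) = title2hash[i], title2hash[j]
            if hamming_distance(h1, h2) <= max_distance:
                pairs.append((t1, t2))
    return pairs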
def get_title_tfidf_matrix(args, db, db_opts, doc_freqs):
    """Build a sparse hashed-ngram x document TF-IDF matrix over doc titles."""
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()[:args.num_docs]
    tokenizer = tokenizers.get_class(args.tokenizer)()
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}
    rows, cols, datas = [], [], []
    for doc_id in doc_ids:
        words = tokenizer.tokenize(doc_id).ngrams(
            n=args.ngram, uncased=True, filter_fn=retriever.utils.filter_ngram)
        wids = [retriever.utils.hash(w, args.hash_size) for w in words]
        if len(wids) == 0:
            logger.warning('No valid word in: %s' % doc_id)
            continue
        # Count TF
        wids_unique, wids_counts = np.unique(wids, return_counts=True)
        tfs = np.log1p(wids_counts)
        # Count IDF
        Ns = doc_freqs[wids_unique]
        idfs = np.log((len(doc_ids) - Ns + 0.5) / (Ns + 0.5))
        idfs[idfs < 0] = 0
        # TF-IDF
        data = np.multiply(tfs, idfs)
        # Add row indices, column indices and data
        rows.extend(wids_unique)
        cols.extend([DOC2IDX[doc_id]] * len(data))
        datas.extend(data)
    # Build scipy sparse csr_matrix
    tfidf_matrix = sp.csr_matrix((datas, (rows, cols)),
                                 shape=(args.hash_size, len(doc_ids)))
    tfidf_matrix.sum_duplicates()
    return tfidf_matrix
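# Worked example (illustrative values only) of the weighting used above:
# tf = log(1 + count), idf = log((N - Nt + 0.5) / (Nt + 0.5)) clipped at 0,
# so common terms contribute nothing while rare terms keep a positive weight.
def _tfidf_weighting_sketch():
    import numpy as np

    N = 1000                       # total number of documents
    counts = np.array([3, 1])      # term counts inside one title
    Ns = np.array([10, 600])       # document frequencies of those terms
    tfs = np.log1p(counts)         # ~[1.386, 0.693]
    idfs = np.log((N - Ns + 0.5) / (Ns + 0.5))
    idfs[idfs < 0] = 0             # second term's idf is negative -> clipped to 0
    return tfs * idfs              # only the rare term gets a non-zero weight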
"""Document retriever based on bm25 for comparision with default weight-tfidf model.""" import sys sys.path.append('/home/zrx/projects/MbaQA/') from tqdm import tqdm from gensim import corpora from gensim.summarization import bm25 from mbaqa import retriever, tokenizers docdb = retriever.get_class('sqlite')() tokenizer = tokenizers.get_class('ltp')() titles = docdb.get_doc_ids()[:] IDX2TITLE = {idx: titles[idx] for idx in range(len(titles))} stop_words_path = '../../data/stopwords/stopwords.txt' stopwords = [] with open(stop_words_path, encoding='utf8') as file: for line in file: stopwords.append(line.replace('\n', '').strip()) corpus = [] with tqdm(total=len(titles)) as pbar: for title in titles: # Tokenize tokens = tokenizer.tokenize(retriever.utils.normalize(docdb.get_doc_text(title))) # Get ngrams from tokens, with stopword/punctuation filtering.
from mbaqa import retriever

logger = logging.getLogger()
logger.setLevel(logging.INFO)
fmt = logging.Formatter('%(asctime)s: [ %(message)s ]', '%m/%d/%Y %I:%M:%S %p')
console = logging.StreamHandler()
console.setFormatter(fmt)
logger.addHandler(console)

parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default=None)
args = parser.parse_args()

logger.info('Initializing ranker...')
ranker = retriever.get_class('tfidf')(tfidf_path=args.model)


# ------------------------------------------------------------------------------
# Drop in to interactive
# ------------------------------------------------------------------------------


def process(query, k=1):
    doc_names, doc_scores = ranker.closest_docs(query, k)
    table = prettytable.PrettyTable(['Rank', 'Doc Id', 'Doc Score'])
    for i in range(len(doc_names)):
        table.add_row([i + 1, doc_names[i], '%.5g' % doc_scores[i]])
    print(table)


banner = """
"""A flask web wrapper for document retriever interactive mode.""" import json import re import traceback import random from flask import Flask, jsonify from flask_cors import CORS import sys sys.path.append('/home/zrx/projects/MbaQA') from mbaqa import retriever from mbaqa.tokenizers import LtpTokenizer import scripts.dataset.utils as utils ranker = retriever.get_class('tfidf')(tfidf_path='../../data/retriever/model/mba-tfidf-ngram=2-hash=16777216-tokenizer=ltp-numdocs=78259.npz') doc_db = retriever.doc_db.DocDB(db_path='../../data/db/mba.db') tokenizer = LtpTokenizer() app = Flask(__name__) CORS(app) @app.route('/') def index(): """Show Some titles of doc set.""" titles = doc_db.get_doc_ids() titles = random.sample(titles, 10) return json.dumps(titles)
# Build question -> doc_id (title) mapping
question2title = {}
with open(JSON_PATH, encoding='utf8') as f:
    dataset_json = json.load(f)
    docs = dataset_json['data']
    for doc in docs:
        title = doc['title']
        for para in doc['paragraphs']:
            for qa in para['qas']:
                question2title[qa['question']] = title

# Get doc-scores, title-scores, label-doc-index for each query
logger.info('Computing scores ...')
ranker = retriever.get_class('tfidf')()
ranker.strict = False
query_doc_scores = {}
all_title_scores = []
all_doc_scores = []
labels = []
with tqdm(total=len(questions)) as pbar:
    for q in questions:
        doc_scores = ranker.get_doc_scores(q)  # .toarray().reshape(76437,)
        all_doc_scores.append(doc_scores)
        # print('top: {} - {}'.format(np.max(doc_scores), ranker.doc_dict[1][np.where(doc_scores == np.max(doc_scores))[0][0]]))
        title_scores = ranker.get_title_scores(q)  # .toarray().reshape(76437,)
        all_title_scores.append(title_scores)
        # print('top: {} - {}'.format(np.max(title_scores), ranker.doc_dict[1][np.where(title_scores == np.max(title_scores))[0][0]]))