def __init__(self):
    # Init pyltp tokenizer with NER and POS annotators.
    annotators = {'ner', 'pos'}
    ltp_tokenizer = tokenizers.get_class('ltp')
    self.tokenizer = ltp_tokenizer(annotators=annotators)
    # Regex pattern for date detection. Needs to be improved.
    self.date_pattern = r'\d{2,4}(年|月|日|世纪|年代)'
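# Quick standalone illustration (not part of the class) of what the date
# pattern above is intended to catch in raw Chinese text. Note that findall
# returns only the captured unit group, one reason the pattern still needs work.
import re

date_pattern = r'\d{2,4}(年|月|日|世纪|年代)'
print(re.findall(date_pattern, '公司成立于1998年, 即20世纪90年代'))  # ['年', '世纪', '年代']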
def __init__(self, tfidf_path=None, strict=True):
    """
    Args:
        tfidf_path: path to saved model file
        strict: fail on empty queries or continue (and return empty result)
    """
    # Load from disk
    tfidf_path = tfidf_path or DEFAULTS['tfidf_path']
    logger.info('Loading %s' % tfidf_path)
    matrix, metadata = utils.load_sparse_csr(tfidf_path)
    self.doc_mat = matrix
    self.ngrams = metadata['ngram']
    self.hash_size = metadata['hash_size']
    self.tokenizer = tokenizers.get_class(metadata['tokenizer'])()
    self.doc_freqs = metadata['doc_freqs'].squeeze()
    self.doc_dict = metadata['doc_dict']
    self.num_docs = len(self.doc_dict[0])
    self.strict = strict
    self.csc_matrix = None
    self.unigrams = metadata['unigrams']
    self.bigrams = metadata['bigrams']
    self.hash2gram = metadata['hash2gram']
    self.title_tfidf = metadata['title_tfidf']
    self.titles_tokens = []
    self.title_csc_matrix = None
    self.titles_lens = None
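# Minimal usage sketch, assuming this __init__ belongs to the ranker class
# registered under 'tfidf' in retriever.get_class and that a saved model
# exists at DEFAULTS['tfidf_path']; both are assumptions for illustration.
ranker = retriever.get_class('tfidf')(tfidf_path=None)  # falls back to DEFAULTS['tfidf_path']
doc_ids, doc_scores = ranker.batch_closest_docs(['工商管理硕士 报考条件'], k=5)[0]
print(doc_ids)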
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()[:args.num_docs]
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    unigrams, bigrams = [], []
    hash2gram = {}
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for (b_row, b_col, b_data, b_unigrams, b_bigrams,
             b_hash2gram) in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
            unigrams.extend(b_unigrams)
            bigrams.extend(b_bigrams)
            hash2gram.update(b_hash2gram)
    workers.close()
    workers.join()

    unigrams = list(set(unigrams))
    bigrams = list(set(bigrams))

    logger.info('Creating sparse matrix...')
    count_matrix = None
    if args.matrix_type == 'csr':
        count_matrix = sp.csr_matrix((data, (row, col)),
                                     shape=(args.hash_size, len(doc_ids)))
        count_matrix.sum_duplicates()
    elif args.matrix_type == 'csc':
        count_matrix = sp.csc_matrix((data, (row, col)),
                                     shape=(args.hash_size, len(doc_ids)))
        count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids), (unigrams, bigrams, hash2gram)
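# Minimal sketch (synthetic data, not the project's pipeline) of the inverted
# index layout produced above: rows are hashed n-gram ids, columns are
# document indexes, and values are term counts.
import scipy.sparse as sp

rows = [3, 3, 7]          # hashed n-gram ids
cols = [0, 1, 1]          # document indexes
vals = [2, 1, 4]          # term counts
m = sp.csr_matrix((vals, (rows, cols)), shape=(16, 2))
print(m[3, 1])            # -> 1: n-gram 3 occurs once in document 1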
def build_simhash(args, source='db'):
    title2text = {}
    titles = {}
    # Retrieve docs from db
    if source == 'db':
        title2text = read_docs_from_db(args, args.doc_db, args.db_opts)
    # Retrieve docs from json
    elif source == 'json':
        title2text = read_drqa_format_dataset_as_dict(args.json_path)

    titles = list(title2text.keys())
    # Control the number of docs when testing code
    if args.num_docs > 0:
        titles = titles[:args.num_docs]
        title2text = {title: title2text[title] for title in titles}

    logger.info('Mapping...')
    title2hash = []
    tok_class = tokenizers.get_class(args.tokenizer)

    # Multiprocessing
    if args.work_type == 'multi':
        # Setup worker pool
        workers = ProcessPool(args.num_workers,
                              initializer=init,
                              initargs=(tok_class,
                                        retriever.get_class(args.doc_db),
                                        {'db_path': args.doc_db}))
        step = max(int(len(title2text) / 10), 1)
        batches = [titles[i:i + step] for i in range(0, len(titles), step)]
        _convert = partial(title2text_dic_2_title2hash_dic, title2text)

        # Map doc text to simhash using multiple processes
        for i, batch in enumerate(batches):
            logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
            for title, simhash in workers.imap_unordered(_convert, batch):
                title2hash.append((title, simhash))
        workers.close()
        workers.join()
    # Single process
    elif args.work_type == 'single':
        with tqdm(total=len(title2text)) as pbar:
            for (k, v) in title2text.items():
                title2hash.append(
                    title2text_dic_2_title2hash_dic(title2text, k))
                pbar.update()
    return title2hash
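# Minimal sketch of how the (title, simhash) pairs returned above could be
# used for near-duplicate detection with the simhash package; the sample
# titles and texts are made up for illustration.
from simhash import Simhash, SimhashIndex

pairs = [('docA', Simhash('工商管理硕士 课程 设置')),
         ('docB', Simhash('工商管理硕士 的 课程 设置'))]
index = SimhashIndex(pairs, k=3)            # k = tolerated Hamming distance
print(index.get_near_dups(Simhash('工商管理硕士 课程 设置')))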
def process_dataset(data, tokenizer, workers=None):
    """Iterate processing (tokenize, parse, etc) dataset multithreaded."""
    tokenizer_class = tokenizers.get_class(tokenizer)
    make_pool = partial(Pool, workers, initializer=init)

    workers = make_pool(initargs=(tokenizer_class, {'annotators': {'lemma'}}))
    q_tokens = workers.map(tokenize, data['questions'])
    workers.close()
    workers.join()

    workers = make_pool(
        initargs=(tokenizer_class, {'annotators': {'lemma', 'pos', 'ner'}}))
    c_tokens = workers.map(tokenize, data['contexts'])
    workers.close()
    workers.join()

    for idx in range(len(data['qids'])):
        question = q_tokens[idx]['words']
        qlemma = q_tokens[idx]['lemma']
        document = c_tokens[data['qid2cid'][idx]]['words']
        offsets = c_tokens[data['qid2cid'][idx]]['offsets']
        lemma = c_tokens[data['qid2cid'][idx]]['lemma']
        pos = c_tokens[data['qid2cid'][idx]]['pos']
        ner = c_tokens[data['qid2cid'][idx]]['ner']
        ans_tokens = []
        if len(data['answers']) > 0:
            for ans in data['answers'][idx]:
                found = find_answer(offsets,
                                    ans['answer_start'],
                                    ans['answer_start'] + len(ans['text']))
                if found:
                    ans_tokens.append(found)
        yield {
            'id': data['qids'][idx],
            'question': question,
            'document': document,
            'offsets': offsets,
            'answers': ans_tokens,
            'qlemma': qlemma,
            'lemma': lemma,
            'pos': pos,
            'ner': ner,
        }
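# find_answer is referenced above but not shown in this excerpt. A minimal
# sketch of a DrQA-style implementation, assuming `offsets` is a list of
# (char_start, char_end) pairs per token; treat this as illustrative only.
def find_answer(offsets, begin_offset, end_offset):
    """Match token offsets with the char begin/end offsets of the answer."""
    start = [i for i, tok in enumerate(offsets) if tok[0] == begin_offset]
    end = [i for i, tok in enumerate(offsets) if tok[1] == end_offset]
    assert len(start) <= 1
    assert len(end) <= 1
    if len(start) == 1 and len(end) == 1:
        return start[0], end[0]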
def get_title_tfidf_matrix(args, db, db_opts, doc_freqs):
    """Build a hashed TF-IDF matrix over document titles."""
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()[:args.num_docs]

    tokenizer = tokenizers.get_class(args.tokenizer)()
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    rows, cols, datas = [], [], []
    for doc_id in doc_ids:
        words = tokenizer.tokenize(doc_id).ngrams(
            n=args.ngram, uncased=True,
            filter_fn=retriever.utils.filter_ngram)
        wids = [retriever.utils.hash(w, args.hash_size) for w in words]

        if len(wids) == 0:
            logger.warning('No valid word in: %s' % doc_id)
            continue

        # Count TF
        wids_unique, wids_counts = np.unique(wids, return_counts=True)
        tfs = np.log1p(wids_counts)

        # Count IDF
        Ns = doc_freqs[wids_unique]
        idfs = np.log((len(doc_ids) - Ns + 0.5) / (Ns + 0.5))
        idfs[idfs < 0] = 0

        # TF-IDF
        data = np.multiply(tfs, idfs)

        # Add row indexes, column indexes and data
        rows.extend(wids_unique)
        cols.extend([DOC2IDX[doc_id]] * len(data))
        datas.extend(data)

    # Build scipy sparse csr_matrix
    tfidf_matrix = sp.csr_matrix((datas, (rows, cols)),
                                 shape=(args.hash_size, len(doc_ids)))
    tfidf_matrix.sum_duplicates()
    return tfidf_matrix
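# Tiny numeric sketch of the weighting used above: TF = log(1 + count) and a
# BM25-style IDF, log((N - Nt + 0.5) / (Nt + 0.5)), clipped at zero. The
# numbers are made up for illustration.
import numpy as np

N = 1000                       # total number of documents
counts = np.array([3, 1])      # term counts in one title
Ns = np.array([10, 700])       # document frequencies of those terms
tfs = np.log1p(counts)
idfs = np.log((N - Ns + 0.5) / (Ns + 0.5))
idfs[idfs < 0] = 0             # very common terms contribute nothing
print(tfs * idfs)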
"""Document retriever based on bm25 for comparision with default weight-tfidf model.""" import sys sys.path.append('/home/zrx/projects/MbaQA/') from tqdm import tqdm from gensim import corpora from gensim.summarization import bm25 from mbaqa import retriever, tokenizers docdb = retriever.get_class('sqlite')() tokenizer = tokenizers.get_class('ltp')() titles = docdb.get_doc_ids()[:] IDX2TITLE = {idx: titles[idx] for idx in range(len(titles))} stop_words_path = '../../data/stopwords/stopwords.txt' stopwords = [] with open(stop_words_path, encoding='utf8') as file: for line in file: stopwords.append(line.replace('\n', '').strip()) corpus = [] with tqdm(total=len(titles)) as pbar: for title in titles: # Tokenize tokens = tokenizer.tokenize(retriever.utils.normalize(docdb.get_doc_text(title))) # Get ngrams from tokens, with stopword/punctuation filtering.
import logging

from functools import partial
from multiprocessing.util import Finalize

from simhash import Simhash, SimhashIndex

from mbaqa import tokenizers

logger = logging.getLogger()
logger.setLevel(logging.INFO)
fmt = logging.Formatter('%(asctime)s: [ %(message)s ]', '%m/%d/%Y %I:%M:%S %p')
console = logging.StreamHandler()
console.setFormatter(fmt)
logger.addHandler(console)


# ------------------------------------------------------------------------------
# Multiprocessing functions
# ------------------------------------------------------------------------------


DOC2IDX = None
PROCESS_TOK = tokenizers.get_class('ltp')()
PROCESS_DB = None


def init(tokenizer_class, db_class, db_opts):
    global PROCESS_TOK, PROCESS_DB
    PROCESS_TOK = tokenizer_class()
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
    PROCESS_DB = db_class(**db_opts)
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)


def fetch_text(doc_id):
    global PROCESS_DB
    return PROCESS_DB.get_doc_text(doc_id)
logger.info('Ranking...')
closest_docs = ranker.batch_closest_docs(questions,
                                         k=args.n_docs,
                                         title_weight=args.title_weight,
                                         num_workers=args.num_workers)
# closest_docs = []
# with tqdm(total=len(questions)) as pbar:
#     for question in questions:
#         closest_docs.append(ranker.closest_docs_by_content_and_title(
#             question, title_weight=args.title_weight, k=5))
#         pbar.update()
answers_docs = zip(answers, closest_docs)

# Define processes
tok_class = tokenizers.get_class(args.tokenizer)
tok_opts = {}
db_class = retriever.DocDB
db_opts = {'db_path': args.doc_db}
processes = ProcessPool(processes=args.num_workers,
                        initializer=init,
                        initargs=(tok_class, tok_opts, db_class, db_opts))

# Compute the scores for each pair, and print the statistics
logger.info('Retrieving and computing scores...')
get_score_partial = partial(get_score, match=args.match)
scores = processes.map(get_score_partial, answers_docs)

# Get failing questions
failing_questions = [(questions[i], answers[i][0], closest_docs[i])
                     for i in range(len(scores)) if scores[i] == 0]
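# Minimal sketch of how the binary per-question scores above are typically
# summarized into a top-k retrieval accuracy; the reporting format here is
# illustrative, not the project's own output code.
accuracy = 100.0 * sum(scores) / len(scores)
logger.info('Top-%d retrieval accuracy: %.2f%% (%d/%d questions)' %
            (args.n_docs, accuracy, sum(scores), len(scores)))
logger.info('Failing questions: %d' % len(failing_questions))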