Example #1
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()[:args.num_docs]
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    unigrams, bigrams = [], []
    hash2gram = {}
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) +
                    '-' * 25)
        for b_row, b_col, b_data, b_unigrams, b_bigrams, b_hash2gram in workers.imap_unordered(
                _count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
            unigrams.extend(b_unigrams)
            bigrams.extend(b_bigrams)
            hash2gram.update(b_hash2gram)
    workers.close()
    workers.join()

    unigrams = list(set(unigrams))
    bigrams = list(set(bigrams))

    logger.info('Creating sparse matrix...')

    count_matrix = None
    if args.matrix_type == 'csr':
        count_matrix = sp.csr_matrix((data, (row, col)),
                                     shape=(args.hash_size, len(doc_ids)))
        count_matrix.sum_duplicates()
    elif args.matrix_type == 'csc':
        count_matrix = sp.csc_matrix((data, (row, col)),
                                     shape=(args.hash_size, len(doc_ids)))
        count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids), (unigrams, bigrams, hash2gram)
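
The map step above hands each doc_id to a worker-side `count` function that is not shown in this example. A minimal sketch of what such a worker might look like, assuming DrQA-style process globals (`PROCESS_TOK`, `PROCESS_DB`) set up by `init` and the same hashing/filtering helpers used in the other examples; the real implementation may differ:

from collections import Counter

def count(ngram, hash_size, doc_id):
    """Hypothetical worker: hash the n-grams of one document into sparse triples."""
    global DOC2IDX, PROCESS_TOK, PROCESS_DB   # assumed to be set by init() / inherited via fork

    # Tokenize the document text and extract filtered n-grams.
    text = retriever.utils.normalize(PROCESS_DB.get_doc_text(doc_id))
    grams = PROCESS_TOK.tokenize(text).ngrams(
        n=ngram, uncased=True, filter_fn=retriever.utils.filter_ngram)

    # Keep plain unigram/bigram lists and a hash -> gram lookup for inspection.
    unigrams = [g for g in grams if len(g.split()) == 1]
    bigrams = [g for g in grams if len(g.split()) == 2]
    hash2gram = {retriever.utils.hash(g, hash_size): g for g in grams}

    # Hash every n-gram into the fixed-size vocabulary and count occurrences.
    counts = Counter(retriever.utils.hash(g, hash_size) for g in grams)
    row = list(counts.keys())
    col = [DOC2IDX[doc_id]] * len(counts)
    data = list(counts.values())
    return row, col, data, unigrams, bigrams, hash2gram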
Example #2
def read_docs_from_db(args, db, db_opts):
    """retrieve docs from sqlite db"""
    logger.info('Retrieving docs from db...')

    data = {}
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        titles = doc_db.get_doc_ids()
        # control number for test
        if args.num_docs > 0:
            titles = titles[:args.num_docs]
        for title in titles:
            data[title] = doc_db.get_doc_text(title)
    return data
Example #3
def build_simhash(args, source='db'):
    title2text = {}
    # retrieve docs from db
    if source == 'db':
        title2text = read_docs_from_db(args, args.doc_db, args.db_opts)
    # retrieve docs from json
    elif source == 'json':
        title2text = read_drqa_format_dataset_as_dict(args.json_path)
        # control number when testing code
        if args.num_docs > 0:
            titles = list(title2text.keys())[:args.num_docs]
            title2text = {title: title2text[title] for title in titles}
    # titles are needed as a list for batching in the multiprocessing branch below
    titles = list(title2text.keys())

    logger.info('Mapping...')
    title2hash = []
    tok_class = tokenizers.get_class(args.tokenizer)
    # multiprocessing
    if args.work_type == 'multi':
        # Setup worker pool
        workers = ProcessPool(args.num_workers,
                              initializer=init,
                              initargs=(tok_class,
                                        retriever.get_class(args.doc_db), {
                                            'db_path': args.doc_db
                                        }))
        step = max(int(len(title2text) / 10), 1)
        batches = [titles[i:i + step] for i in range(0, len(titles), step)]
        _convert = partial(title2text_dic_2_title2hash_dic, title2text)

        # map doc text to simhash using multiprocess

        for i, batch in enumerate(batches):
            logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) +
                        '-' * 25)
            for title, simhash in workers.imap_unordered(_convert, batch):
                title2hash.append((title, simhash))
        workers.close()
        workers.join()

    # single processing
    elif args.work_type == 'single':
        with tqdm(total=len(title2text)) as pbar:
            for (k, v) in title2text.items():
                title2hash.append(
                    title2text_dic_2_title2hash_dic(title2text, k))
                pbar.update()
    return title2hash
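
The helper `title2text_dic_2_title2hash_dic` is not shown in this example. A minimal sketch of what it might do, assuming the third-party `simhash` package and simple character n-gram features; the actual helper presumably builds features with the project tokenizer instead:

from simhash import Simhash

def title2text_dic_2_title2hash_dic(title2text, title):
    """Hypothetical helper: map one title to a (title, simhash) pair."""
    text = title2text[title]
    # Plain character 4-gram sliding window as simhash features (assumption).
    features = [text[i:i + 4] for i in range(max(len(text) - 3, 1))]
    return title, Simhash(features)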
Example #4
def get_title_tfidf_matrix(args, db, db_opts, doc_freqs):
    """"""
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()[:args.num_docs]
    tokenizer = tokenizers.get_class(args.tokenizer)()
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    rows, cols, datas = [], [], []
    for doc_id in doc_ids:
        words = tokenizer.tokenize(doc_id).ngrams(
            n=args.ngram, uncased=True, filter_fn=retriever.utils.filter_ngram)
        wids = [retriever.utils.hash(w, args.hash_size) for w in words]

        if len(wids) == 0:
            logger.warning('No valid word in: %s' % doc_id)
            continue

        # Count TF
        wids_unique, wids_counts = np.unique(wids, return_counts=True)
        tfs = np.log1p(wids_counts)

        # Count IDF
        Ns = doc_freqs[wids_unique]
        idfs = np.log((len(doc_ids) - Ns + 0.5) / (Ns + 0.5))
        idfs[idfs < 0] = 0

        # TF-IDF
        data = np.multiply(tfs, idfs)

        # add row num, col num and data
        rows.extend(wids_unique)
        cols.extend([DOC2IDX[doc_id]] * len(data))
        datas.extend(data)

    # build scipy sparse csr_matrix
    tfidf_matrix = sp.csr_matrix((datas, (rows, cols)),
                                 shape=(args.hash_size, len(doc_ids)))
    tfidf_matrix.sum_duplicates()

    return tfidf_matrix
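
As a quick sanity check of the weighting above: a hashed term that appears 3 times in a title and occurs in 10 of 1,000 titles gets tf = log1p(3) ≈ 1.39 and idf = log((1000 - 10 + 0.5) / (10 + 0.5)) ≈ 4.55, giving a matrix entry of about 6.3. The same arithmetic with toy numbers:

import numpy as np

# Toy check of the TF (log1p) and Okapi-style IDF (clipped at 0) used above.
num_docs = 1000
wids_counts = np.array([3, 1])     # term frequencies within one title
Ns = np.array([10, 950])           # document frequencies of the same terms

tfs = np.log1p(wids_counts)                          # ~[1.39, 0.69]
idfs = np.log((num_docs - Ns + 0.5) / (Ns + 0.5))    # ~[4.55, -2.93]
idfs[idfs < 0] = 0                                   # very common term -> 0
print(np.multiply(tfs, idfs))                        # ~[6.30, 0.00]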
Example #5
"""Document retriever based on bm25 for comparision with default weight-tfidf model."""

import sys
sys.path.append('/home/zrx/projects/MbaQA/')

from tqdm import tqdm
from gensim import corpora
from gensim.summarization import bm25

from mbaqa import retriever, tokenizers

docdb = retriever.get_class('sqlite')()
tokenizer = tokenizers.get_class('ltp')()

titles = docdb.get_doc_ids()[:]
IDX2TITLE = {idx: titles[idx] for idx in range(len(titles))}

stop_words_path = '../../data/stopwords/stopwords.txt'
stopwords = []
with open(stop_words_path, encoding='utf8') as file:
    for line in file:
        stopwords.append(line.replace('\n', '').strip())


corpus = []
with tqdm(total=len(titles)) as pbar:
    for title in titles:
        # Tokenize
        tokens = tokenizer.tokenize(retriever.utils.normalize(docdb.get_doc_text(title)))

        # Get ngrams from tokens, with stopword/punctuation filtering.
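        # Hypothetical continuation (assumptions: a DrQA-style Tokens.words()
        # API on the tokenizer, and gensim < 4.0; some gensim versions also
        # require an average_idf argument to BM25.get_scores).
        corpus.append([w for w in tokens.words(uncased=True) if w not in stopwords])
        pbar.update()

# Build the BM25 index over the tokenized corpus.
bm25_model = bm25.BM25(corpus)

def rank_bm25(query, k=5):
    """Score every document against a tokenized query and return the top-k titles."""
    query_tokens = tokenizer.tokenize(retriever.utils.normalize(query)).words(uncased=True)
    scores = bm25_model.get_scores(query_tokens)
    top = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
    return [(IDX2TITLE[i], scores[i]) for i in top]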
Example #6
import argparse
import logging

import prettytable

from mbaqa import retriever

logger = logging.getLogger()
logger.setLevel(logging.INFO)
fmt = logging.Formatter('%(asctime)s: [ %(message)s ]', '%m/%d/%Y %I:%M:%S %p')
console = logging.StreamHandler()
console.setFormatter(fmt)
logger.addHandler(console)

parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default=None)
args = parser.parse_args()

logger.info('Initializing ranker...')
ranker = retriever.get_class('tfidf')(tfidf_path=args.model)

# ------------------------------------------------------------------------------
# Drop in to interactive
# ------------------------------------------------------------------------------


def process(query, k=1):
    doc_names, doc_scores = ranker.closest_docs(query, k)
    table = prettytable.PrettyTable(['Rank', 'Doc Id', 'Doc Score'])
    for i in range(len(doc_names)):
        table.add_row([i + 1, doc_names[i], '%.5g' % doc_scores[i]])
    print(table)
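
# Hypothetical quick check (arbitrary query string); the actual script
# presumably drops into an interactive session via the banner below instead:
#   process('工商管理硕士', k=5)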


banner = """
Example #7
"""A flask web wrapper for document retriever interactive mode."""

import json
import re
import traceback
import random
from flask import Flask, jsonify
from flask_cors import CORS

import sys
sys.path.append('/home/zrx/projects/MbaQA')
from mbaqa import retriever
from mbaqa.tokenizers import LtpTokenizer
import scripts.dataset.utils as utils

ranker = retriever.get_class('tfidf')(tfidf_path='../../data/retriever/model/mba-tfidf-ngram=2-hash=16777216-tokenizer=ltp-numdocs=78259.npz')
doc_db = retriever.doc_db.DocDB(db_path='../../data/db/mba.db')
tokenizer = LtpTokenizer()

app = Flask(__name__)
CORS(app)


@app.route('/')
def index():
    """Show Some titles of doc set."""
    titles = doc_db.get_doc_ids()
    titles = random.sample(titles, 10)
    return json.dumps(titles)
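
Only the index route is shown in this example. A hedged sketch of what a retrieval endpoint in the same wrapper could look like (the route path and response shape are assumptions, not the project's actual API):

@app.route('/query/<string:question>')
def query(question):
    """Hypothetical endpoint: return the top-5 documents for a question."""
    try:
        doc_names, doc_scores = ranker.closest_docs(question, k=5)
        results = [{'title': n, 'score': float(s)}
                   for n, s in zip(doc_names, doc_scores)]
        return jsonify(results)
    except Exception:
        traceback.print_exc()
        return jsonify([])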

Example #8
# build question -> doc_id(title) mapping
question2title = {}
with open(JSON_PATH, encoding='utf8') as f:
    dataset_json = json.load(f)
docs = dataset_json['data']
for doc in docs:
    title = doc['title']
    for para in doc['paragraphs']:
        for qa in para['qas']:
            question2title[qa['question']] = title


# get doc-scores, title-scores, label-doc-index for each query
logger.info('Computing scores ...')
ranker = retriever.get_class('tfidf')()
ranker.strict = False

query_doc_scores = {}
all_title_scores = []
all_doc_scores = []
labels = []
with tqdm(total=len(questions)) as pbar:
    for q in questions:
        doc_scores = ranker.get_doc_scores(q)  # .toarray().reshape(76437,)
        all_doc_scores.append(doc_scores)
        # print('top: {} - {}'.format(np.max(doc_scores), ranker.doc_dict[1][np.where(doc_scores == np.max(doc_scores))[0][0]]))
        title_scores = ranker.get_title_scores(q)  # .toarray().reshape(76437,)
        all_title_scores.append(title_scores)
        # print('top: {} - {}'.format(np.max(title_scores), ranker.doc_dict[1][np.where(title_scores == np.max(title_scores))[0][0]]))
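        # Hypothetical continuation (assumptions: the scores above are dense
        # 1-D numpy arrays, e.g. after .toarray().reshape(-1), and
        # ranker.doc_dict[0] maps a title to its column index, matching the
        # doc_dict[1] lookups in the commented prints).
        pbar.update()

DOC2IDX = ranker.doc_dict[0]
labels = [DOC2IDX[question2title[q]] for q in questions]

def top_k_accuracy(all_scores, labels, k=5):
    """Fraction of questions whose gold doc is among the k highest-scored docs."""
    hits = 0
    for scores, label in zip(all_scores, labels):
        if label in np.argsort(-scores)[:k]:
            hits += 1
    return hits / len(labels)

logger.info('doc top-5 acc: %.4f' % top_k_accuracy(all_doc_scores, labels))
logger.info('title top-5 acc: %.4f' % top_k_accuracy(all_title_scores, labels))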