Example No. 1
    def __init__(self):
        dk.set_default('corenlp_classpath', corenlp_path)
        dr.set_default('model', model_path)

        # DrQA retriever
        self.retriever = ret.get_class('tfidf')(tfidf_path=tfidf_path)

        # DrQA reader
        self.reader = dr.Predictor(model_path, "corenlp", normalize=True)

        # Answerability classifier
        self.tokenizer = BertTokenizer.from_pretrained(
            model_name,
            do_lower_case="uncased" in model_name)  # , cache_dir=cache_directory)
        self.pretrained_model = Model()

        checkpoint = torch.load(load_name,
                                map_location=lambda storage, loc: storage)
        self.pretrained_model.load_state_dict(checkpoint['state_dict'])

        self.pretrained_model.zero_grad()
        self.pretrained_model.eval()
        self.pretrained_model.freeze()
        torch.set_grad_enabled(False)

        # Create a map from document id to document text
        self.docs_txt = {}
        with open(docs_json_path, encoding='utf-8') as docs_text:
            for line in docs_text:
                line = json.loads(line)  # parse the JSON-lines record (safer than eval)
                self.docs_txt[line["id"]] = line["text"]
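A minimal sketch (not part of the original snippet) of how this class might chain the retriever, the document map, and the reader into a single query method; the answer name and top_k parameter are hypothetical, while closest_docs and Predictor.predict are the public DrQA calls used throughout these examples, and the answerability classifier is not exercised here.

    def answer(self, question, top_k=5):
        """Hypothetical helper: TF-IDF retrieval followed by the DrQA reader."""
        # closest_docs returns parallel lists of doc ids and TF-IDF scores
        doc_names, _ = self.retriever.closest_docs(question, k=top_k)

        candidates = []
        for doc_id in doc_names:
            text = self.docs_txt.get(doc_id, '')
            if not text:
                continue
            # Predictor.predict returns a list of (answer span, score) pairs
            for span, score in self.reader.predict(text, question, top_n=1):
                candidates.append((doc_id, span, float(score)))

        # Best-scoring span over all retrieved documents (None if nothing matched)
        return max(candidates, key=lambda c: c[2]) if candidates else None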
Example No. 2
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    init(tok_class, db_class, db_opts)

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) +
                    '-' * 25)
        for b_row, b_col, b_data in map(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)

    logger.info('Creating sparse matrix...')
    count_matrix = sp.csr_matrix((data, (row, col)),
                                 shape=(args.hash_size, len(doc_ids)))
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
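The count matrix built above is normally re-weighted into TF-IDF form before retrieval; below is a minimal numpy/scipy sketch of that step using the weighting quoted in Example No. 17, tfidf = log(tf + 1) * log((N - Nt + 0.5) / (Nt + 0.5)). The function name is illustrative and not necessarily what this repository calls it.

import numpy as np
import scipy.sparse as sp


def tfidf_from_counts(cnts):
    """Illustrative: turn a (hash_size x num_docs) count matrix into TF-IDF weights."""
    # Nt: number of documents each hashed word appears in
    Nt = np.array((cnts > 0).astype(int).sum(axis=1)).squeeze()
    N = cnts.shape[1]  # total number of documents

    # idf = log((N - Nt + 0.5) / (Nt + 0.5)), clipped at zero
    idfs = np.log((N - Nt + 0.5) / (Nt + 0.5))
    idfs[idfs < 0] = 0

    # tf = log(1 + raw count), applied directly to the stored non-zeros
    tfs = cnts.astype(np.float64)
    tfs.data = np.log1p(tfs.data)

    # scale every row (hashed word) by its idf
    return sp.diags(idfs, 0).dot(tfs)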
Example No. 3
 def __init__(self, predictor, rankerPath, dbPath, ebdPath=None):
     self.predictor = predictor
     self.ranker = retriever.get_class('tfidf')(tfidf_path=rankerPath)
     conn = sqlite3.connect(dbPath)
     self.db = conn.cursor()
     self.filter = filtText('drqa/features/map.txt')
     self.score = contextScore(ebdPath)
Example No. 4
 def __init__(self, db_path, model):
     '''
     Args:
         model: tfidf model path
     '''
     self.doc_db = retriever.DocDB(db_path=db_path)
     self.ranker = retriever.get_class('tfidf')(tfidf_path=model)
Example No. 5
def rank(args):
    logger.info('Initializing ranker...')
    ranker = retriever.get_class('tfidf')(tfidf_path=args.model)

    basename = os.path.splitext(os.path.basename(args.data_path))[0]
    dump_path = os.path.join(args.out_dir, f'{basename}-{args.k}.rank')
    logger.info(f'Dumping rank jsons to {dump_path}')

    with io.open(args.data_path) as json_file:
        for idx, line in enumerate(json_file):

            input_json = json.loads(line.strip('\n'))
            doc_id, doc = input_json['id'], input_json['text']

            doc_names, doc_scores = ranker.closest_docs(query=doc, k=args.k)

            dump_json = {
                'doc_id': doc_id,
                'rank_ids': list(doc_names),
                'rank_scores': list(doc_scores),
            }
            json_str = json.dumps(dump_json, ensure_ascii=False)

            with open(dump_path, 'a') as f:
                f.write(json_str + '\n')

            if idx and idx % 1000 == 0:
                logger.info(f'\t{idx} finished...')
                logger.info(f'\tExample: {json_str}')
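A hedged sketch of a command-line wrapper that could drive rank(); the flag names simply mirror the attributes the function reads (model, data_path, out_dir, k) and are assumptions beyond that.

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Rank documents against a TF-IDF index')
    parser.add_argument('--model', type=str, required=True,
                        help='path to the *.npz TF-IDF index')
    parser.add_argument('--data-path', type=str, required=True,
                        help='jsonl input, one {"id": ..., "text": ...} object per line')
    parser.add_argument('--out-dir', type=str, default='.',
                        help='directory for the <basename>-<k>.rank output')
    parser.add_argument('--k', type=int, default=10,
                        help='number of closest documents to keep per query')
    args = parser.parse_args()

    rank(args)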
Example No. 6
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    # NOTE: unlike the stock DrQA version, document ids are pulled from an
    # Elasticsearch index via the scroll API rather than from the local doc db;
    # db_class / db_opts are still used below to initialize the worker pool.
    doc_ids = []
    res = es.search(index="htts", doc_type="htts",
                    body={"size": 500, "query": {"match_all": {}}},
                    scroll='10m')
    scroll_id = res['_scroll_id']
    for ref in scrollr(es, scroll_id, extract_references):
        doc_ids.append(ref)

    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(
        args.num_workers,
        initializer=init,
        initargs=(tok_class, db_class, db_opts)
    )

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    count_matrix = sp.csr_matrix(
        (data, (row, col)), shape=(args.hash_size, len(doc_ids))
    )
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
Example No. 7
def init():
    global ranker, nlp, df_topic_keywords, lda_model, vectorizer
    print('Initializing app')
    ranker = retriever.get_class('tfidf')(tfidf_path=MODEL)
    print('ranker:', ranker)
    df_topic_keywords = pd.read_pickle(ROOT_DIR / 'model' / 'df_topic_keywords.pkl')
    lda_model = pickle.load(open(ROOT_DIR / 'model' / 'best_lda_model.pkl', 'rb'))
    vocabulary = pickle.load(open(ROOT_DIR / 'model' / 'tm_features.pkl', 'rb'))
    vectorizer = CountVectorizer(decode_error='replace', vocabulary=vocabulary)
    nlp = spacy.load('en', disable=['parser', 'ner'])
Example No. 8
 def __init__(self, tfidf_path,
              tokenizer,
              use_stopwords=False,
              qclassifier=None):

     Answerer.__init__(self, qclassifier)
     self.tokenizer = tokenizer
     self.ranker = retriever.get_class('tfidf')(tfidf_path=tfidf_path)
     self.stopwords = stopwords
     self.use_stopwords = use_stopwords
Example No. 9
def my_sample_fever():
    logger = logging.getLogger()
    dictConfig({
        'version': 1,
        'formatters': {
            'default': {
                'format':
                '[%(asctime)s] %(levelname)s in %(module)s: %(message)s',
            }
        },
        'handlers': {
            'wsgi': {
                'class': 'logging.StreamHandler',
                'stream': 'ext://sys.stderr',
                'formatter': 'default'
            }
        },
        'root': {
            'level': 'INFO',
            'handlers': ['wsgi']
        },
        # per-logger config must sit under 'loggers' for dictConfig to apply it
        'loggers': {
            'allennlp': {
                'level': 'INFO',
                'handlers': ['wsgi']
            },
        },
    })

    logger.info("Columbia FEVER application")
    config = json.load(
        open(os.getenv("CONFIG_PATH", "configs/system_config.json")))

    ner_predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/fine-grained-ner-model-elmo-2018.12.21.tar.gz"
    )
    google_config = GoogleConfig(**config['retrieval']['google'])
    ranker = retriever.get_class('tfidf')(
        tfidf_path=config['retrieval']['tfidf']['index'])

    predictors = {}
    for key in ('page_model', 'state_model'):
        path = config[key].pop('path')
        predictors[key] = ColumbiaPredictor(path, config['cuda_device'],
                                            **config[key])

    # The prediction function that is passed to the web server for FEVER2.0
    def predict(instances):
        predictions = getDocsSingle(instances, google_config, ner_predictor,
                                    ranker)
        for key in ('page_model', 'state_model'):
            predictions = list(predictors[key].predict(predictions))
        return predictions

    return fever_web_api(predict)
Example No. 10
    def __init__(self, db, n_docs, n_sents, whole_docs, compat, model):
        super().__init__(db)
        self.n_docs = n_docs
        self.n_sents = n_sents
        self.whole_docs = whole_docs
        self.compat = compat
        self.ranker = retriever.get_class('tfidf')(tfidf_path=model)
        self.onlineranker_args = self.RankArgs()

        self.doc_titles = [
            self.ranker.get_doc_id(i) for i in range(self.ranker.num_docs)
        ]
        self.ner_retriever = NER_Retriever(self.doc_titles)
Example No. 11
    def __init__(self, name, retriever_model, num_threads):
        super().__init__(name)

        self.num_threads = min(num_threads, int(multiprocessing.cpu_count()))

        # initialize a ranker per thread
        self.arguments = []
        for id in tqdm(range(self.num_threads)):
            self.arguments.append({
                "id": id,
                "ranker": retriever.get_class("tfidf")(tfidf_path=retriever_model),
            })
Example No. 12
    def __init__(
        self,
        db,
        model,
        max_page,
        max_sent,
    ):
        self.db = db
        self.n_docs = max_page
        self.n_sents = max_sent
        self.model = model

        self.ranker = retriever.get_class('tfidf')(tfidf_path=model)
        self.onlineranker_args = self.RankArgs()
Example No. 13
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    # multiprocess worker pool
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))
    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    # fill the matrix in batches
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    # functools.partial:
    """
    A function normally has to be called with all of its required arguments. Sometimes,
    though, some of them are known ahead of time; partial fixes those arguments in
    advance so the function can later be called with fewer of them.
    """
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) +
                    '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    count_matrix = sp.csr_matrix((data, (row, col)),
                                 shape=(args.hash_size, len(doc_ids)))
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
Example No. 14
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}
    # 5075182
    logger.info('the number of docs is %s' % (len(DOC2IDX)))

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping......')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 24 + 'Batch %d/%d' % (i + 1, len(batches)) +
                    '-' * 24)
        k = 0
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
            k += 1
            if k % 10000 == 0:
                logger.info('Processed %d docs in this batch...' % k)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix......')
    count_matrix = sp.csr_matrix((data, (row, col)),
                                 shape=(args.hash_size, len(doc_ids)))
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
Example No. 15
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(
        args.num_workers,
        initializer=init,
        initargs=(tok_class, db_class, db_opts)
    )

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    count_matrix = sp.csr_matrix(
        (data, (row, col)), shape=(args.hash_size, len(doc_ids))
    )
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
Example No. 16
def get_count_matrix_sklearn(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    hashvec = HashingVectorizer(n_features=2**24,
                                dtype=np.int8,
                                ngram_range=(1, 2),
                                norm=None,
                                non_negative=True)
    chunk_size = 100000

    texts = []
    chunks = []
    db = db_class(**db_opts)
    for i, doc_id in enumerate(doc_ids):
        #if i == 100000: break
        texts.append(db.get_doc_text(doc_id))
        if i % chunk_size == 0:
            if i > 0:
                print(i, 'fitting hashvec...')
                chunks.append(hashvec.transform(texts))
                del texts[:]
    chunks.append(hashvec.transform(texts))

    count_matrix = sp.vstack(chunks)
    count_matrix = count_matrix.transpose()

    print(count_matrix.shape)
    print(count_matrix.dtype)

    return count_matrix, (DOC2IDX, doc_ids)
Example No. 17
class MyTfidfDocRanker(retriever.get_class('tfidf')):
    def text2spvec(self, query, data_val=False):
        """Create a sparse tfidf-weighted word vector from query.

        tfidf = log(tf + 1) * log((N - Nt + 0.5) / (Nt + 0.5))
        """
        # Get hashed ngrams
        words = self.parse(utils.normalize(query))
        wids = [utils.hash(w, self.hash_size) for w in words]

        if len(wids) == 0:
            if self.strict:
                raise RuntimeError('No valid word in: %s' % query)
            else:
                logger.warning('No valid word in: %s' % query)
                return sp.csr_matrix((1, self.hash_size))

        # Count TF
        wids_unique, wids_counts = np.unique(wids, return_counts=True)
        tfs = np.log1p(wids_counts)

        # Count IDF
        Ns = self.doc_freqs[wids_unique]
        idfs = np.log((self.num_docs - Ns + 0.5) / (Ns + 0.5))
        idfs[idfs < 0] = 0

        # TF-IDF
        data = np.multiply(tfs, idfs)

        if data_val:
            return data, wids_unique

        # One row, sparse csr matrix
        indptr = np.array([0, len(wids_unique)])
        spvec = sp.csr_matrix(
            (data, wids_unique, indptr), shape=(1, self.hash_size)
        )

        return spvec
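A brief usage sketch for the subclass above; the .npz index path is a placeholder in the style of the other examples here, not a real artifact.

# Hypothetical usage; the index path is a placeholder.
ranker = MyTfidfDocRanker(tfidf_path='data/docs-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz')

# One sparse 1 x hash_size query vector, ready to be dotted against the doc matrix
spvec = ranker.text2spvec('who wrote the declaration of independence')

# With data_val=True the raw tf-idf weights and their hashed word ids are returned instead
weights, wids = ranker.text2spvec('declaration of independence', data_val=True)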
Example No. 18
    start = time.time()

    # read all the data and store it
    logger.info("Reading data ...")
    questions = []
    answers = []
    for line in open(args.dataset):
        data = json.loads(line)
        question = data["question"]
        answer = data["answer"]
        questions.append(question)
        answers.append(answer)

    # get the closest docs for each question.
    logger.info("Initializing ranker...")
    ranker = retriever.get_class("tfidf")(tfidf_path=args.model)

    logger.info("Ranking...")
    closest_docs = ranker.batch_closest_docs(
        questions, k=args.n_docs, num_workers=args.num_workers
    )
    answers_docs = zip(answers, closest_docs)

    # define processes
    tok_class = tokenizers.get_class(args.tokenizer)
    tok_opts = {}
    db_class = retriever.DocDB
    db_opts = {"db_path": args.doc_db}
    processes = ProcessPool(
        processes=args.num_workers,
        initializer=init,
Example No. 19
import time
import sqlite3
from drqa import retriever
import numpy as np
from functools import partial
from concurrent.futures import ThreadPoolExecutor
from itertools import chain
import pandas as pd
from drqa.retriever import utils
import os

db = "/home/giuseppe/Scrivania/HLT_Project/Retriver/Process_gnq/gnq_articles.db"
connection = sqlite3.connect(db, check_same_thread=False)
tfidf = "/home/giuseppe/Scrivania/HLT_Project/Retriver/DrQA/gnq_articles-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz"
ranker = retriever.get_class('tfidf')(tfidf_path=tfidf)
qa_db = "/home/giuseppe/Scrivania/HLT_Project/Retriver/Process_gnq/gnq_qa.db"


def get_doc_text(doc_id):
    """Fetch the raw text of the doc for 'doc_id'."""
    cursor = connection.cursor()
    cursor.execute("SELECT text FROM documents WHERE id = ?",
                   (utils.normalize(doc_id), ))
    result = cursor.fetchone()
    cursor.close()
    return result if result is None else result[0]


def _split_doc(doc):
    """Given a doc, split it into chunks (by paragraph)."""
Example No. 20
        question = data['question']
        answer = data['answer']

        # Make sure the regex compiles
        if args.regex:
            try:
                re.compile(answer[0])
            except BaseException:
                logger.warning('Regex failed to compile: %s' % answer)
                continue

        questions.append(question)
        answers.append(answer)

    # Get classes
    ranker_class = retriever.get_class(args.ranker)
    db_class = retriever.get_class(args.db)
    tokenizer_class = tokenizers.get_class(args.tokenizer)

    # Form options
    search_keys = ('regex', 'match_threshold', 'char_max',
                   'char_min', 'window_sz')
    opts = {
        'ranker_class': retriever.get_class(args.ranker),
        'tokenizer_class': tokenizers.get_class(args.tokenizer),
        'db_class': retriever.get_class(args.db),
        'search': {k: vars(args)[k] for k in search_keys},
    }
    opts.update(vars(args))

    # Process!
Example No. 21
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)  # get the specified db class used to fetch documents
    with db_class(**db_opts) as doc_db:  # context management
        doc_ids = doc_db.get_doc_ids()  # get all doc ids
    '''
        enumerate(seq) yields (index, value) pairs, e.g.
        list(enumerate(['a', 'b', 'c'])) == [(0, 'a'), (1, 'b'), (2, 'c')],
        so iterating it gives the index (starting from 0) and the value from the original list.
    '''
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}  # map doc id -> index

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)  # get count of steps
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)
               ]  # calc the batch range of each step

    # partial pre-binds the first arguments (ngram, hash_size) of `count` and returns
    # a callable that only needs the remaining doc_id argument.
    # reference: http://www.wklken.me/posts/2013/08/18/python-extra-functools.html
    _count = partial(count, args.ngram, args.hash_size)

    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) +
                    '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            # each worker returns one (row, col, data) triple per processed doc
            row.extend(b_row)    # hashed n-gram ids
            col.extend(b_col)    # document indexes
            data.extend(b_data)  # n-gram counts per document
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    '''Layout of the resulting sparse matrix:
       rows    = hashed n-gram ids (hash_size of them)
       columns = document indexes (len(doc_ids) of them)
       each element = count of that n-gram in that document
    '''
    count_matrix = sp.csr_matrix((data, (row, col)),
                                 shape=(args.hash_size, len(doc_ids)))

    # merge duplicate (row, col) entries by summing their values
    count_matrix.sum_duplicates()
    # return the matrix together with the doc-id mappings
    return count_matrix, (DOC2IDX, doc_ids)
Example No. 22
        question = data['question']
        answer = data['answer']

        # Make sure the regex compiles
        if args.regex:
            try:
                re.compile(answer[0])
            except BaseException:
                logger.warning('Regex failed to compile: %s' % answer)
                continue

        questions.append(question)
        answers.append(answer)

    # Get classes
    ranker_class = retriever.get_class(args.ranker)
    db_class = retriever.get_class(args.db)
    tokenizer_class = tokenizers.get_class(args.tokenizer)

    # Form options
    search_keys = ('regex', 'match_threshold', 'char_max', 'char_min',
                   'window_sz')
    opts = {
        'ranker_class': retriever.get_class(args.ranker),
        'tokenizer_class': tokenizers.get_class(args.tokenizer),
        'db_class': retriever.get_class(args.db),
        'search': {k: vars(args)[k]
                   for k in search_keys},
    }
    opts.update(vars(args))
Example No. 23
def eval_model(db: FeverDocDB, args) -> Model:
    archive = load_archive(args.archive_file,
                           cuda_device=args.cuda_device,
                           overrides=args.overrides)

    config = archive.config
    ds_params = config["dataset_reader"]

    model = archive.model
    model.eval()

    reader = FEVERReader(db,
                         sentence_level=ds_params.pop("sentence_level", False),
                         wiki_tokenizer=Tokenizer.from_params(
                             ds_params.pop('wiki_tokenizer', {})),
                         claim_tokenizer=Tokenizer.from_params(
                             ds_params.pop('claim_tokenizer', {})),
                         token_indexers=TokenIndexer.dict_from_params(
                             ds_params.pop('token_indexers', {})))

    # Load the TF-IDF ranker once, outside the interactive loop (the index is
    # expensive to load).
    ranker = retriever.get_class('tfidf')(tfidf_path=args.model)

    while True:

        claim = input("enter claim (or q to quit) >>")
        if claim.lower() == "q":
            break

        p_lines = []
        pages, _ = ranker.closest_docs(claim, 5)

        for page in pages:
            lines = db.get_doc_lines(page)
            lines = [
                line.split("\t")[1] if len(line.split("\t")[1]) > 1 else ""
                for line in lines.split("\n")
            ]

            p_lines.extend(zip(lines, [page] * len(lines), range(len(lines))))

        scores = tf_idf_sim(claim, [pl[0] for pl in p_lines])
        scores = list(
            zip(scores, [pl[1] for pl in p_lines], [pl[2] for pl in p_lines],
                [pl[0] for pl in p_lines]))
        scores = list(filter(lambda score: len(score[3].strip()), scores))
        sentences_l = list(
            sorted(scores, reverse=True, key=lambda elem: elem[0]))

        sentences = [s[3] for s in sentences_l[:5]]
        evidence = " ".join(sentences)

        print("Best pages: {0}".format(repr(pages)))

        print("Evidence:")
        for idx, sentence in enumerate(sentences_l[:5]):
            print("{0}\t{1}\t\t{2}\t{3}".format(idx + 1, sentence[0],
                                                sentence[1], sentence[3]))

        item = reader.text_to_instance(evidence, claim)

        prediction = model.forward_on_instance(item, args.cuda_device)
        cls = model.vocab._index_to_token["labels"][np.argmax(
            prediction["label_probs"])]
        print("PREDICTED: {0}".format(cls))
        print()
Example No. 24
 def __init__(self, saved_model_path):
     self.ranker = retriever.get_class('tfidf')(tfidf_path=saved_model_path)
Example No. 25
def process(ranker, query, k=1):
    doc_names, doc_scores = ranker.closest_docs(query, k)

    return doc_names


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--in-file', type=str)
    parser.add_argument('--out-file', type=str)
    parser.add_argument('--index', type=str)
    parser.add_argument('--count', type=int, default=1)
    args = parser.parse_args()

    k = args.count
    ranker = retriever.get_class('tfidf')(tfidf_path=args.index)

    with open(args.in_file) as f:
        with open(args.out_file, "w+") as f2:
            for line in tqdm(f.readlines()):
                line = json.loads(line)

                if line["label"] == "NOT ENOUGH INFO":
                    pages = process(ranker, line['claim'], k=k)
                    pp = list(pages)

                    for idx, evidence_group in enumerate(line['evidence']):
                        for evidence in evidence_group:
                            if idx < len(pp):
                                evidence[2] = pp[idx]
                                evidence[3] = -1
Example No. 26
        question = data["question"]
        answer = data["answer"]

        # Make sure the regex compiles
        if args.regex:
            try:
                re.compile(answer[0])
            except BaseException:
                logger.warning("Regex failed to compile: %s" % answer)
                continue

        questions.append(question)
        answers.append(answer)

    # Get classes
    ranker_class = retriever.get_class(args.ranker)
    db_class = retriever.get_class(args.db)
    tokenizer_class = tokenizers.get_class(args.tokenizer)

    # Form options
    search_keys = ("regex", "match_threshold", "char_max", "char_min",
                   "window_sz")
    opts = {
        "ranker_class": retriever.get_class(args.ranker),
        "tokenizer_class": tokenizers.get_class(args.tokenizer),
        "db_class": retriever.get_class(args.db),
        "search": {k: vars(args)[k]
                   for k in search_keys},
    }
    opts.update(vars(args))
Example No. 27
 def __init__(self, db, n_docs, n_sents, model):
     super().__init__(db)
     self.n_docs = n_docs
     self.n_sents = n_sents
     self.ranker = retriever.get_class('tfidf')(tfidf_path=model)
     self.onlineranker_args = self.RankArgs()
Example No. 28
 def __init__(self, database, index, n_docs, n_sents):
     super().__init__(database)
     self.n_docs = n_docs
     self.n_sents = n_sents
     self.ranker = retriever.get_class('tfidf')(tfidf_path=index)
     self.onlineranker_args = self.RankArgs()
Example No. 29
 def __init__(self, db, k, model):
     self.db = db
     self.k = k
     self.model = model
     self.ranker = retriever.get_class('tfidf')(tfidf_path=self.model)
Example No. 30
    # read all the data and store it
    logger.info('Reading data ...')
    questions = []
    answers = []

    for line in open(args.dataset):
        data = json.loads(line)
        question = data['question']
        answer = data['answer']
        questions.append(question)
        answers.append(answer)

    # get the closest docs for each question.
    logger.info('Initializing ranker...')
    ranker = retriever.get_class('tfidf')(tfidf_path=args.model)

    logger.info('Ranking...')
    closest_docs = ranker.batch_closest_docs(questions,
                                             k=args.n_docs,
                                             num_workers=args.num_workers)
    ranker = []  # drop the ranker to free its large TF-IDF matrix before the processing below

    tok_class = tokenizers.get_class(args.tokenizer)
    tok_opts = {}
    db_class = retriever.DocDB
    db_opts = {'db_path': args.doc_db}
    PROCESS_TOK = tok_class(**tok_opts)
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
    PROCESS_DB = db_class(**db_opts)
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)
Example No. 31
import argparse
import logging

import prettytable

from drqa import retriever

logger = logging.getLogger()
logger.setLevel(logging.INFO)
fmt = logging.Formatter('%(asctime)s: [ %(message)s ]', '%m/%d/%Y %I:%M:%S %p')
console = logging.StreamHandler()
console.setFormatter(fmt)
logger.addHandler(console)

parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default=None)
args = parser.parse_args()

logger.info('Initializing ranker...')
ranker = retriever.get_class('tfidf')(tfidf_path=args.model)


# ------------------------------------------------------------------------------
# Drop in to interactive
# ------------------------------------------------------------------------------


def process(query, k=1):
    doc_names, doc_scores = ranker.closest_docs(query, k)
    table = prettytable.PrettyTable(
        ['Rank', 'Doc Id', 'Doc Score']
    )
    for i in range(len(doc_names)):
        table.add_row([i + 1, doc_names[i], '%.5g' % doc_scores[i]])
    print(table)
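The comment block above announces an interactive mode but the snippet stops at process(); one plausible continuation, mirroring the upstream DrQA interactive scripts, is to drop into a REPL with process() in scope.

# Assumed continuation: open an interactive session with process() available.
import code

banner = """
Interactive TF-IDF Retriever
>> process(question, k=1)
"""
code.interact(banner=banner, local=locals())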
Example No. 32
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)  # drqa/retriever/__init__.py --> doc_db.py
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()  # Fetch all ids of docs stored in the db.
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}  # {'3255': 0, '8902': 1, ...}

    # Setup worker pool
    # e.g. 'corenlp': drqa/tokenizers/__init__.py --> corenlp_tokenizer.py
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]  # total 10 batches
    _count = partial(count, args.ngram, args.hash_size)  # hash_size default: int(math.pow(2, 24))
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) +
                    '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    """
    csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
            where ``data``, ``row_ind`` and ``col_ind`` satisfy the
            relationship ``a[row_ind[k], col_ind[k]] = data[k]``.
    
    Examples:
        >>> row = np.array([0, 0, 1, 2, 2, 2])
        >>> col = np.array([0, 2, 2, 0, 1, 2])
        >>> data = np.array([1, 2, 3, 4, 5, 6])
        >>> csr_matrix((data, (row, col)), shape=(3, 3)).toarray()
        array([[1, 0, 2],
               [0, 0, 3],
               [4, 5, 6]])
    
    count_matrix: shape=(args.hash_size, len(doc_ids))
    
              doc_1   doc_2  ...   doc_m
    word_1    [[1,      0,   ...    2],
    word_2     [0,      0,   ...    3],
     ...                ...
    word_n     [4,      5,   ...    6]]
    
    i.e., (word_1, doc_m) denotes word 'word_1' appear 2 times in doc 'doc_m'.
    
    Reference: https://towardsdatascience.com/machine-learning-to-big-data-scaling-inverted-indexing-with-solr-ba5b48833fb4
    """
    count_matrix = sp.csr_matrix(  # import scipy.sparse as sp
        (data, (row, col)),
        shape=(args.hash_size, len(doc_ids)))
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
Example No. 33
def configurate_server(server, tfidf_path):
    server.handler_params = {
        "ranker": get_class("tfidf")(tfidf_path=tfidf_path, strict=False)
    }
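A hedged sketch of a handler that could consume the ranker stored in handler_params above; the handler signature and the way the server dispatches to it are assumptions, and only closest_docs is the documented DrQA call.

# Hypothetical handler; only ranker.closest_docs(...) is a known DrQA call,
# the signature and return format are assumptions about this server.
def handle_query(handler_params, query, k=5):
    ranker = handler_params["ranker"]
    doc_names, doc_scores = ranker.closest_docs(query, k)
    return [{"doc_id": name, "score": float(score)}
            for name, score in zip(doc_names, doc_scores)]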