Example #1
def preprocess_squad(file_name, output_name, skip_answer=False):
    data = load_squad(file_name)
    
    # Tokenize questions in parallel with a lightweight annotator set (lemmas only).
    tokenizer_class = tokenizers.get_class('spacy')
    make_pool = partial(Pool, 8, initializer=init)
    workers = make_pool(initargs=(tokenizer_class, {'annotators': {'lemma'}}))
    q_tokens = workers.map(tokenize, data['questions'])
    workers.close()
    workers.join()

    # Tokenize contexts with the richer annotator set (lemma, POS, NER).
    workers = make_pool(
        initargs=(tokenizer_class, {'annotators': {'lemma', 'pos', 'ner'}})
    )
    c_tokens = workers.map(tokenize, data['contexts'])
    workers.close()
    workers.join()

    # Build one example per question, aligning answer character spans to
    # token spans via find_answer.
    examples = []
    for idx in range(len(data['qids'])):
        question = q_tokens[idx]['words']
        qlemma = q_tokens[idx]['lemma']
        document = c_tokens[data['qid2cid'][idx]]['words']
        offsets = c_tokens[data['qid2cid'][idx]]['offsets']
        lemma = c_tokens[data['qid2cid'][idx]]['lemma']
        pos = c_tokens[data['qid2cid'][idx]]['pos']
        ner = c_tokens[data['qid2cid'][idx]]['ner']
        ans_tokens = []
        if len(data['answers']) > 0:
            for ans in data['answers'][idx]:
                found = find_answer(offsets,
                                    ans['answer_start'],
                                    ans['answer_start'] + len(ans['text']))
                if found:
                    ans_tokens.append(found)
        examples.append({
            'id': data['qids'][idx],
            'question': question,
            'document': document,
            'offsets': offsets,
            'answers': ans_tokens,
            'qlemma': qlemma,
            'lemma': lemma,
            'pos': pos,
            'ner': ner,
        })
    with open(output_name, 'w') as f:
        json.dump(examples, f)
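The find_answer helper used above is not shown in this example. A minimal sketch of what it does, assuming offsets is the tokenizer's list of (start, end) character spans for the context: it returns the token span whose character boundaries exactly match the answer's character span, or nothing if there is no exact match.

def find_answer(offsets, begin_offset, end_offset):
    """Match a character-level answer span to token indices (illustrative sketch)."""
    # Index of the token whose span starts exactly at the answer start.
    start = [i for i, tok in enumerate(offsets) if tok[0] == begin_offset]
    # Index of the token whose span ends exactly at the answer end.
    end = [i for i, tok in enumerate(offsets) if tok[1] == end_offset]
    if len(start) == 1 and len(end) == 1:
        return start[0], end[0]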
Example #2
    def __init__(self, tfidf_path=None, strict=True):
        """
        Args:
            tfidf_path: path to saved model file
            strict: fail on empty queries or continue (and return empty result)
        """
        # Load from disk (fall back to the module-level default path).
        tfidf_path = tfidf_path or DEFAULTS['tfidf_path']
        logger.info('Loading %s' % tfidf_path)
        matrix, metadata = utils.load_sparse_csr(tfidf_path)
        self.doc_mat = matrix
        self.ngrams = metadata['ngram']
        self.hash_size = metadata['hash_size']
        self.tokenizer = tokenizers.get_class(metadata['tokenizer'])()
        self.doc_freqs = metadata['doc_freqs'].squeeze()
        self.doc_dict = metadata['doc_dict']
        self.num_docs = len(self.doc_dict[0])
        self.strict = strict
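A hedged usage sketch for the constructor above, assuming it belongs to a ranker class named TfidfDocRanker (the class name is not shown in the snippet) and that the .npz path is a placeholder for a matrix saved by the matching build step:

ranker = TfidfDocRanker(tfidf_path='data/wikipedia-tfidf.npz')
print('Loaded %d docs, ngram=%d, hash size=%d' %
      (ranker.num_docs, ranker.ngrams, ranker.hash_size))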
Example #3
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) +
                    '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    count_matrix = sp.csr_matrix((data, (row, col)),
                                 shape=(args.hash_size, len(doc_ids)))
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
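The count matrix returned here is normally reweighted before it can back a ranker like the one in Example #2. A minimal sketch of that step, assuming numpy is available as np and sp is the same scipy.sparse alias used above; the exact weighting in the original pipeline may differ:

import numpy as np

def get_tfidf_matrix(cnts):
    """Turn raw word-document counts into tf-idf weights (illustrative sketch)."""
    # Document frequency: number of documents each hashed word appears in.
    binary = (cnts > 0).astype(int)
    doc_freqs = np.array(binary.sum(1)).squeeze()
    # Smoothed idf, clipped at zero; log-scaled tf.
    idfs = np.log((cnts.shape[1] - doc_freqs + 0.5) / (doc_freqs + 0.5))
    idfs[idfs < 0] = 0
    return sp.diags(idfs, 0).dot(cnts.log1p())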
Example #4
    def __init__(self, model=None, tokenizer=None, embedding_file=None,
                 num_workers=None, normalize=True):
        """
        Args:
            model: path to saved model file.
            tokenizer: option string to select tokenizer class.
            embedding_file: if provided, will expand dictionary to use all
              available pretrained vectors in this file.
            num_workers: number of CPU processes to use to preprocess batches.
            normalize: squash output score to 0-1 probabilities with a softmax.
        """
        logger.info('Initializing model...')
        self.model = DocReader.load(model or DEFAULTS['model'],
                                    normalize=normalize)

        if embedding_file:
            logger.info('Expanding dictionary...')
            words = utils.index_embedding_words(embedding_file)
            added = self.model.expand_dictionary(words)
            self.model.load_embeddings(added, embedding_file)

        logger.info('Initializing tokenizer...')
        annotators = tokenizers.get_annotators_for_model(self.model)
        if not tokenizer:
            tokenizer_class = DEFAULTS['tokenizer']
        else:
            tokenizer_class = tokenizers.get_class(tokenizer)

        if num_workers is None or num_workers > 0:
            self.workers = ProcessPool(
                num_workers,
                initializer=init,
                initargs=(tokenizer_class, annotators),
            )
        else:
            self.workers = None
            self.tokenizer = tokenizer_class(annotators=annotators)
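A hedged usage sketch for this constructor, assuming the enclosing class is a DrQA-style Predictor that also exposes a predict(document, question, top_n=1) method (not shown in this example); the model path is a placeholder:

predictor = Predictor(model='data/reader/single.mdl',
                      tokenizer='spacy',
                      num_workers=0)
answers = predictor.predict(
    document='The 2015 final was played in Berlin, Germany.',
    question='Where was the 2015 final played?',
    top_n=1,
)
print(answers)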
Example #5
import sys
import unicodedata
from tqdm import tqdm
import pickle

sys_dir = './'
sys.path.append(sys_dir)

import json
import tokenizers
from multiprocessing.util import Finalize
tokenizers.set_default('corenlp_classpath', sys_dir + '/data/corenlp/*')

import string
import regex as re

tok_class = tokenizers.get_class("corenlp")
tok_opts = {}
PROCESS_TOK = tok_class(**tok_opts)
Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)


def tokenizer_text(my_str='', uncased=True):
    '''
    :param my_str: string
    :param uncased: bool, lowercase the returned tokens if True
    :return: list[str]
    '''
    text = unicodedata.normalize('NFD', my_str)
    answer = PROCESS_TOK.tokenize(text)

    # The original snippet is truncated here; this completion assumes a
    # DrQA-style Tokens object whose words() accepts an `uncased` flag.
    if uncased:
        return answer.words(uncased=True)
    return answer.words()
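An illustrative call to the function above; it requires the CoreNLP jars configured via 'corenlp_classpath' earlier to be present on disk, and with uncased=True the tokens come back lowercased:

print(tokenizer_text('Stanford CoreNLP splits this into tokens.'))
# e.g. ['stanford', 'corenlp', 'splits', 'this', 'into', 'tokens', '.']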
Example #6

if __name__ == '__main__':

    client = pymongo.MongoClient(host='192.168.1.145', port=27017)
    db = client['AttnReader']
    collection_train = db[args.collection_train_name]
    collection_val = db[args.collection_val_name]

    if args.raw_data:
        train = load_origin_data(collection_train,
                                 args,
                                 single_answer=args.single_answer_train)
        dev = load_origin_data(collection_val, args, single_answer=False)
    else:
        tokenizer = tokenizers.get_class(args.tokenizer)()
        train = load_data_tokenize(collection_train,
                                   tokenizer,
                                   args,
                                   single_answer=args.single_answer_train)
        dev = load_data_tokenize(collection_val,
                                 tokenizer,
                                 args,
                                 single_answer=False)
        if args.tokenizer == 'ltp':
            tokenizer.release()

    context_ents = list(train.netags) + list(dev.netags)
    context_tags = list(train.postags) + list(dev.postags)

    question_tokens = list(train.query_words) + list(dev.query_words)
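The token and tag lists collected above are typically folded into lookup vocabularies before feature generation. A minimal sketch of that step, assuming each entry in these lists is itself a sequence of string symbols (this is an illustration, not the continuation of the original script):

def build_vocab(sequences):
    """Map every distinct symbol in a collection of sequences to an integer id."""
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for seq in sequences:
        for sym in seq:
            if sym not in vocab:
                vocab[sym] = len(vocab)
    return vocab

ent_vocab = build_vocab(context_ents)
tag_vocab = build_vocab(context_tags)
word_vocab = build_vocab(question_tokens)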