Exemplo n.º 1
0
 def fast_qa_by_cmkb():
     """Answer a JSON question request with up to two candidate paragraphs.

     The question is looked up through ``cmkb_elk_retriever.search_elk`` and
     only the first two hits are kept. When nothing is retrieved, a canned
     sample is read from ``./data/docs/fake_db.jsonl`` instead, matched by a
     hard-coded keyword. Candidate paragraphs are then narrowed with
     ``WordMatchSelector`` and randomly sampled.

     Returns:
         A ``jsonify`` response. On success it wraps a dict with the keys
         ``question``, ``algo_version`` and ``answers``; each answer carries
         ``paragraph``, ``answer`` (always empty), ``answer_pos``, ``title``
         and ``url``. When the request body is not JSON, or the fallback
         file has no matching sample, a ``{'result': 'failed', ...}`` payload
         is returned instead.
     """
     from qa.para_select import WordMatchSelector
     import random
     if request.json is None:
         return jsonify({
             'result': 'failed',
             'message': 'request is not json'
         })
     req = RequestQA(request.json)
     res = cmkb_elk_retriever.search_elk(req.question)[0:2]
     print('retrieve %d docs' % (len(res)))
     if len(res) == 0:
         # Nothing retrieved: fall back to the canned sample database.
         keyword = '高血壓'
         if '糖尿病' in req.question:
             keyword = '糖尿病'
         target_sample = None
         for sample in jsonl_reader('./data/docs/fake_db.jsonl'):
             if keyword in sample['question']:
                 target_sample = sample
                 break
         if target_sample is None:
             # Explicit failure instead of `assert`, which is stripped
             # when Python runs with -O.
             return jsonify({
                 'result': 'failed',
                 'message': 'no fallback sample found'
             })
         SimpleParagraphTransform().transform(target_sample)
         x = DureaderRawExample(target_sample)
         records = x.flatten(['question', 'qid'], ['url', 'title'])
         results = WordMatchSelector(k=1).paragraph_selection(records)
     else:
         # Each retrieved document gets its own synthetic question id.
         records = []
         for i, x in enumerate(res):
             q = 'q:%d' % (i)
             for j, p in enumerate(x['paragraphs']):
                 obj = {
                     'qid': i,
                     'doc_id': i,
                     'question': q,
                     'passage': p,
                     'url': x['url'],
                     'title': x['title']
                 }
                 records.append(obj)
         # One paragraph per document when two documents were retrieved,
         # otherwise two paragraphs from the single document.
         k = 1 if len(res) > 1 else 2
         results = WordMatchSelector(k=k).paragraph_selection(records)
     # random.sample raises ValueError when asked for more items than the
     # population holds, so never request more than are available.
     results = random.sample(results, k=min(2, len(results)))
     response = {
         'question': req.question,
         'algo_version': req.algo_version,
         'answers': []
     }
     for x in results:
         response['answers'].append({
             'paragraph': x['passage'],
             'answer': '',
             'answer_pos': [0, -1],
             'title': x['title'],
             'url': x['url']
         })
     return jsonify(wrap_response(response))
Exemplo n.º 2
0
def load_beta_file(path):
    """Read a JSONL file into a flat list of question/passage records.

    Every paragraph of an entry becomes one ``{'question', 'passage'}`` dict.
    Entries whose question has length zero are skipped.
    """
    print('load file %s' % (path))
    pairs = []
    for entry in jsonl_reader(path):
        query = entry['question']
        if len(query) != 0:
            pairs.extend(
                {'question': query, 'passage': para}
                for para in entry['paragraphs'])
    return pairs
Exemplo n.º 3
0
def test_mrc_model():
    """Run the mock multi-document model on the first MRC fixture record."""
    from mrc_server import create_app, multi_doc_model_factory
    from preprocessing import SimpleParagraphTransform
    sample = next(jsonl_reader('./data/test/test_mrc.jsonl'))
    mock_model = multi_doc_model_factory({'model_type': 'mock'})
    # The fixture starts in 'raw' format ...
    is_raw = util.check_input_format(sample, 'raw')
    assert True == is_raw
    # ... and must pass the 'multi_mrc' check once transformed in place.
    transformer = SimpleParagraphTransform()
    transformer.transform(sample)
    assert util.check_input_format(sample, 'multi_mrc')
    answers = mock_model.get_answer_list(sample)
    print(answers)
Exemplo n.º 4
0
def test_mock_mrc_server():
    """Post the first MRC fixture record to ``/qa`` on a mock-model app."""
    from mrc_server import create_app
    from qa.ranker import RankerFactory
    from preprocessing import SimpleParagraphTransform
    sample = next(jsonl_reader('./data/test/test_mrc.jsonl'))
    # The fixture starts in 'raw' format and is converted in place.
    is_raw = util.check_input_format(sample, 'raw')
    assert True == is_raw
    transformer = SimpleParagraphTransform()
    transformer.transform(sample)
    assert util.check_input_format(sample, 'multi_mrc')
    app = create_app({'model_type': 'mock'})
    payload = {
        'mrc_input': sample,
        'answer_num': 3,
        'algo_version': 0
    }
    with app.test_client() as client:
        reply = client.post('/qa', json=payload)
    print(reply.get_json())
Exemplo n.º 5
0
def load_examples_from_scratch(path,sample_stg=None,concat=False,attach_label=None):
    """Build (question, paragraph) examples, optionally labelled, from JSONL.

    Args:
        path: JSONL file passed to ``jsonl_reader``.
        sample_stg: Optional callable taking one record and returning
            ``(pos_examples, neg_examples)``; positives are labelled 1 and
            negatives 0. When omitted, every paragraph of every document in
            the record is used.
        concat: When labels exist, return ``(question, paragraph, label)``
            triples instead of two parallel lists.
        attach_label: ``None`` for no labels, ``'most_related_para'`` to mark
            each document's ``most_related_para`` paragraph, or
            ``'answer_docs'`` to mark it only for documents whose index is
            listed in the record's ``answer_docs``. With any non-``None``
            value, documents lacking ``most_related_para`` are skipped.

    Returns:
        The list of examples when no labels were collected; otherwise either
        ``(examples, labels)`` or, with ``concat``, a list of triples.
    """
    pairs = []
    targets = []
    want_labels = attach_label is not None
    for line_no, record in enumerate(jsonl_reader(path)):
        # Progress message every 2000 records.
        if line_no % 2000 == 0:
            print('load %dth line'%(line_no))
        if sample_stg is not None:
            positives, negatives = sample_stg(record)
            targets.extend([1]*len(positives)+[0]*len(negatives))
            passages = positives + negatives
        else:
            passages = []
            doc_labels = []
            for doc_idx, doc in enumerate(record['documents']):
                if want_labels and 'most_related_para' not in doc:
                    continue
                passages.extend(doc['paragraphs'])
                if not want_labels:
                    continue
                flags = [0] * len(doc['paragraphs'])
                # Both labelling modes mark the same paragraph; they differ
                # only in which documents qualify.
                mark = attach_label == 'most_related_para' or (
                    attach_label == 'answer_docs'
                    and 'answer_docs' in record
                    and doc_idx in record['answer_docs'])
                if mark:
                    flags[doc['most_related_para']] = 1
                assert sum(flags) <2
                doc_labels.extend(flags)
            if want_labels:
                assert len(doc_labels) == len(passages)
            targets.extend(doc_labels)
        # The question is looked up per paragraph, so a record that yields
        # no paragraphs never touches its 'question' key.
        pairs.extend([(record['question'].strip(), p) for p in passages])
    print('total %d examples'%(len(pairs)))
    if len(targets) == 0:
        return pairs
    if not concat:
        return pairs, targets
    return [(q, p, lb) for (q, p), lb in zip(pairs, targets)]
Exemplo n.º 6
0
def test_mrc_server():
    """Post the first MRC fixture record to ``/qa`` on a CPU pipeline app."""
    from mrc_server import create_app
    from preprocessing import SimpleParagraphTransform
    sample = next(jsonl_reader('./data/test/test_mrc.jsonl'))
    pipeline_config = {
        'model_type': 'pipeline',
        'device': 'cpu',
        'ranker_config_path': './data/model/pointwise/answer_doc/config.json',
        'reader_config_path': './data/model/reader/bert_default/config.json'
    }
    # The fixture starts in 'raw' format and is converted in place.
    is_raw = util.check_input_format(sample, 'raw')
    assert True == is_raw
    transformer = SimpleParagraphTransform()
    transformer.transform(sample)
    assert util.check_input_format(sample, 'multi_mrc')
    app = create_app(pipeline_config)
    print('send data')
    payload = {
        'mrc_input': sample,
        'answer_num': 3,
        'algo_version': 0
    }
    with app.test_client() as client:
        reply = client.post('/qa', json=payload)
    print(reply.get_json())
Exemplo n.º 7
0
 def _load_file(self, path):
     """Return the paragraphs selected from every raw sample in ``path``.

     Each record yielded by ``jsonl_reader`` is passed to
     ``self._para_selection`` and the results are concatenated in file order.
     """
     return [
         selected
         for raw_sample in jsonl_reader(path)
         for selected in self._para_selection(raw_sample)
     ]
Exemplo n.º 8
0
from .dureader import DureaderRawExample, DureaderRawDocument, DureaderLoader
from common.util import jsonl_reader

# Module-level checks: load the first record of the fake Dureader fixture.
json_obj = next(jsonl_reader('./data/unittest/dureader_fake.json'))

# Flatten the raw example; this fixture is expected to yield 5 samples, each
# carrying the question text and question id of the source record.
example = DureaderRawExample(json_obj)
flatten_samples = example.flatten()
assert len(flatten_samples) == 5
assert json_obj['question'] == flatten_samples[0]['question']
assert json_obj['question_id'] == flatten_samples[0]['question_id']

# Hand-written records with the keys passage / passage_id / doc_id /
# question / question_id. o1 holds one record and o2 holds two.
# NOTE(review): neither list is used in the visible part of this script;
# presumably they feed later checks -- confirm.
o1 = [{
    'passage': 'abcde',
    'passage_id': 0,
    'doc_id': 0,
    'question': 'wtf is this?',
    'question_id': 12345
}]
o2 = [{
    'passage': 'abcde',
    'passage_id': 0,
    'doc_id': 0,
    'question': 'wtf is this?',
    'question_id': 12345
}, {
    'passage': 'ccc',
    'passage_id': 2,
    'doc_id': 1,
    'question': 'wtf is this?',
    'question_id': 12345
}]
Exemplo n.º 9
0
def test_check_format():
    """Check the untransformed fixture is 'raw' and not 'multi_mrc' format."""
    sample = next(jsonl_reader('./data/test/test_mrc.jsonl'))
    expectations = (('raw', True), ('multi_mrc', False))
    for format_name, expected in expectations:
        assert expected == util.check_input_format(sample, format_name)