예제 #1
0
def index_ngram_word():
    """Build the Whoosh index in ``index_ngram_word`` from the Q/A file.

    Each record of ``PATH_QUESTION_ANSWER`` that passes
    ``convenion.is_valid_qa`` is added with its ``id_cmt`` as ``id`` and with
    the question and answer run through
    ``convenion.customize_and_remove_stopword``.  Both text fields are
    ``NGRAMWORDS`` fields (n-gram sizes 2 to 7) using a
    ``SpaceSeparatedTokenizer``.

    Any existing index in the directory is replaced by ``create_in``.
    If reading or processing a record fails, the pending write is cancelled
    so the index write lock is released, and the exception propagates.
    """
    print('Indexing ngram word...')
    schema = Schema(id=ID(stored=True),
                    question=NGRAMWORDS(minsize=2,
                                        maxsize=7,
                                        tokenizer=SpaceSeparatedTokenizer()),
                    answer=NGRAMWORDS(minsize=2,
                                      maxsize=7,
                                      tokenizer=SpaceSeparatedTokenizer()))
    # exist_ok=True avoids the check-then-create race of exists() + mkdir().
    os.makedirs('index_ngram_word', exist_ok=True)
    ix = create_in('index_ngram_word', schema)

    # Used as a context manager, the writer commits on a clean exit and
    # cancels (releasing the write lock) when an exception escapes the block.
    with ix.writer() as writer:
        # Read explicitly as UTF-8 instead of the platform default encoding.
        with open(PATH_QUESTION_ANSWER, 'r', encoding='utf-8') as f:
            for qa in json_lines.reader(f):
                if not convenion.is_valid_qa(qa):
                    continue
                question = convenion.customize_and_remove_stopword(
                    qa['question'])
                answer = convenion.customize_and_remove_stopword(
                    qa['answer'])
                writer.add_document(id=qa['id_cmt'],
                                    question=question,
                                    answer=answer)
        # The input file is closed by now; the commit happens on leaving
        # the enclosing ``with`` block.
        print('Commit ngram word...')
예제 #2
0
def raw_index_file():
    """Convert the Q/A file into alternating action/document JSON lines.

    Reads ``PATH_QUESTION_ANSWER`` and, for every record accepted by
    ``convenion.is_valid_qa``, writes two lines to
    ``PATH_QUESTION_ANSWER_INDEXER``:

    1. ``{"index": {"_id": <id_cmt>}}``
    2. a document holding the raw question/answer, their
       ``convenion.customize_string`` forms and their
       ``convenion.customize_and_remove_stopword`` forms.

    NOTE(review): this two-line layout looks like the Elasticsearch bulk
    format - confirm against the consumer of the output file.
    """
    # The output is opened first, exactly as before, so it is truncated even
    # if opening the input subsequently fails.
    with jsonlines.open(PATH_QUESTION_ANSWER_INDEXER, mode='w') as sink, \
            jsonlines.open(PATH_QUESTION_ANSWER) as source:
        accepted = (record for record in source
                    if convenion.is_valid_qa(record))
        for record in accepted:
            action = {"index": {"_id": record['id_cmt']}}
            raw_question = record['question']
            raw_answer = record['answer']
            body = {
                "question": raw_question,
                "answer": raw_answer,
                "question_custom": convenion.customize_string(raw_question),
                "answer_custom": convenion.customize_string(raw_answer),
                "question_removed_stopword":
                    convenion.customize_and_remove_stopword(raw_question),
                "answer_removed_stopword":
                    convenion.customize_and_remove_stopword(raw_answer),
            }
            sink.write(action)
            sink.write(body)
예제 #3
0
def index_basic():
    """Build the Whoosh index in ``index_basic`` from the Q/A file.

    Each record of ``PATH_QUESTION_ANSWER`` that passes
    ``convenion.is_valid_qa`` is added with:

    * ``id``: the record's ``id_cmt`` (stored),
    * ``question`` / ``answer``: the raw text (stored only),
    * ``question_custom`` / ``answer_custom``: the text after
      ``convenion.customize_and_remove_stopword`` (stored ``TEXT`` fields).

    Any existing index in the directory is replaced by ``create_in``.
    If reading or processing a record fails, the pending write is cancelled
    so the index write lock is released, and the exception propagates.

    NOTE(review): the original comment said "Use scoring method BM25F";
    nothing in this function selects a scoring method, so that presumably
    refers to the default used at search time - confirm.
    """
    print('Indexing basic...')
    schema = Schema(id=ID(stored=True),
                    question=STORED,
                    answer=STORED,
                    question_custom=TEXT(stored=True),
                    answer_custom=TEXT(stored=True))

    # exist_ok=True avoids the check-then-create race of exists() + mkdir().
    os.makedirs('index_basic', exist_ok=True)
    ix = create_in('index_basic', schema)

    # Used as a context manager, the writer commits on a clean exit and
    # cancels (releasing the write lock) when an exception escapes the block.
    with ix.writer() as writer:
        # Read explicitly as UTF-8 instead of the platform default encoding.
        with open(PATH_QUESTION_ANSWER, 'r', encoding='utf-8') as f:
            for qa in json_lines.reader(f):
                if not convenion.is_valid_qa(qa):
                    continue
                question = qa['question']
                answer = qa['answer']
                question_custom = convenion.customize_and_remove_stopword(
                    question)
                answer_custom = convenion.customize_and_remove_stopword(
                    answer)
                print(question_custom)
                print(answer_custom)
                writer.add_document(id=qa['id_cmt'],
                                    question=question,
                                    answer=answer,
                                    question_custom=question_custom,
                                    answer_custom=answer_custom)
        # The input file is closed by now; the commit happens on leaving
        # the enclosing ``with`` block.
        print('Commit basic...')
예제 #4
0
def raw_query_pool(pool_path='elastic/query_pool.json', target_size=250):
    """Interactively grow the query pool stored in *pool_path*.

    The existing pool (a JSON list of ``{'id', 'question', 'searched'}``
    dicts) is loaded, then questions from ``PATH_QUESTION_ANSWER`` that pass
    ``convenion.is_valid_qa`` are shown one at a time in random order.  The
    operator answers:

    * ``'1'`` - add the question to the pool,
    * ``'0'`` - stop collecting,
    * anything else - skip the question.

    Collection stops when the pool holds at least *target_size* entries,
    when the operator enters ``'0'``, when stdin reaches EOF, or when every
    candidate has been shown once.  The pool is then written back to
    *pool_path*.

    Args:
        pool_path: JSON file holding the query pool; read and rewritten.
        target_size: number of pool entries at which collection stops.
    """
    # Load and close the pool file before it is reopened for writing below.
    with open(pool_path, encoding='utf-8') as f:
        queries = json.load(f)
    print("Current queries len: ", len(queries))
    print("\n")

    # Ids already in the pool or already shown to the operator.  A set keeps
    # the membership test O(1) instead of scanning a growing list.
    seen_ids = {query['id'] for query in queries}

    with jsonlines.open(PATH_QUESTION_ANSWER) as reader:
        candidates = [qa for qa in reader if convenion.is_valid_qa(qa)]
    # Sample print kept from the original; guarded because random.choice
    # raises IndexError on an empty list.
    if candidates:
        print(random.choice(candidates))

    # Shuffling once and walking the list shows each question at most once in
    # random order and always terminates, unlike repeated random.choice which
    # spins forever once every candidate has been seen.
    random.shuffle(candidates)
    for qa_checking in candidates:
        # ">=" rather than "!=": a pool that already exceeds the target must
        # not keep prompting.
        if len(queries) >= target_size:
            break
        if qa_checking['id_cmt'] in seen_ids:
            continue
        seen_ids.add(qa_checking['id_cmt'])

        try:
            user_judge = input(qa_checking['question'] + '\n')
        except EOFError:
            # Treat a closed stdin as "stop" so collected queries are saved.
            break
        if user_judge == '0':
            break
        if user_judge != '1':
            print("Collecting next question...\n")
            continue

        print("Add to query...\n")
        queries.append({
            'id': qa_checking['id_cmt'],
            'question': qa_checking['question'],
            'searched': 0
        })
        print("Current queries len: ", len(queries))
        print("\n")

    with open(pool_path, 'w', encoding='utf-8') as outfile:
        json.dump(queries, outfile)