Exemplo n.º 1
0
def split_trainingsdata_into_sentences():
    """Build a sentence-level training file from 'trainings_data.json'.

    For every (question, sentence, answer) combination a line of the form
    "<question> <sentence> <keyword-count> <similarity> ::: <relevant>"
    is appended to the file 'trainings_data_sentence'.

    Bug fix: both files were previously opened without ever being closed
    (resource leak); they are now managed with context managers. The dead
    ``trainings_data_sentence`` list and commented-out code were removed.
    """
    with open('trainings_data.json') as json_file:
        data = json.load(json_file)
    with open('trainings_data_sentence', 'w') as f:
        for entry in data['data']:
            # Naive split: assumes '.' only terminates sentences.
            sentences = entry['text'].split('.')
            for qas in entry['qas']:
                question = qas['question']
                qp_result = process_question(question, NLPToolkit())
                keywords = qp_result.question_model.get_keywords()
                answer_type = qp_result.answer_type  # currently unused; kept for future NE features
                for sentence in sentences:
                    relevant = False
                    for answer in qas['answers']:
                        if answer in sentence:
                            relevant = True
                        # NOTE(review): one line is written per answer, so a
                        # sentence with k answers yields k lines — confirm
                        # this duplication is intended.
                        count_keywords = get_number_of_keywords(sentence, keywords)
                        similarity = get_similiarity(question, answer)
                        f.write(question + " " + sentence + " " +
                                str(count_keywords) + " " + str(similarity) +
                                " ::: " + str(relevant) + '\n')
Exemplo n.º 2
0
 def test_get_context_can_return_multiple_sentences(self):
     """The context for a best span may cover more than one sentence."""
     predictor = AnswerPredictor(NLPToolkit())
     actual_context = predictor._get_context_of_best_span(passage, 259, 270)
     expected_context = "This contrasts with expendable launch " \
               "systems, where each launch vehicle is launched once and then discarded. No completely reusable " \
               "orbital launch system has ever been created."
     self.assertEqual(actual_context, expected_context)
def preprocessing_pipeline(docs: Documents, qp_result: QPResult,
                           nlp_toolkit: NLPToolkit):
    """Split every document into sentences, keep only the sentences that
    pass ``filter_passages`` for the question's answer type, and return
    one joined string per document.
    """
    processed_docs = []
    # All docs are kept for now; ranking happens later via the softmax
    # probability distribution.
    for document in docs.docs:
        kept_sentences = []
        for candidate in nlp_toolkit.text_to_sentences(document.text):
            if filter_passages(candidate, qp_result.answer_type, nlp_toolkit):
                kept_sentences.append(candidate)
        processed_docs.append(' '.join(kept_sentences))

    return processed_docs
Exemplo n.º 4
0
def process_question(question: str, nlp_toolkit: NLPToolkit) -> QPResult:
    """Classify a question's answer type and extract its keywords.

    Parameters:
        question: raw question string.
        nlp_toolkit: toolkit used to extract headwords as keywords.

    Returns:
        QPResult wrapping a QuestionModel (keywords + question) and the
        predicted AnswerType.
    """
    # start logging
    Logger.info('started')
    start = datetime.now()

    # start question processing: pick a classifier, load it, predict label
    clf_name = get_clf_name(question)
    clf = get_clf_from_disk(clf_name)
    label = get_predicted_label(question, clf)
    keywords = nlp_toolkit.get_headwords(question)

    # end logging
    end = datetime.now()
    diff = end - start
    Logger.info('AnswerType: ' + str(AnswerType[label]))
    # Bug fix: str(diff.microseconds)[0:2] was not zero-padded, so e.g.
    # 5000 µs printed as ".50 s". Format the true elapsed seconds instead.
    Logger.info('finished (' + format(diff.total_seconds(), '.2f') + ' s)')
    Logger.small_seperator()

    return QPResult(QuestionModel(keywords, question), AnswerType[label])
Exemplo n.º 5
0
import unittest

from utils.nlptoolkit import NLPToolkit

# Module-level toolkit shared by all tests; built once — presumably because
# construction is expensive (TODO confirm).
nlp_toolkit = NLPToolkit()


class TestNLPToolkit(unittest.TestCase):
    """Smoke tests for the NLPToolkit wrapper."""

    def test_get_headwords(self):
        """get_headwords should return a result for a simple question."""
        headwords = nlp_toolkit.get_headwords('What is question answering')
        print(headwords)
        # Bug fix: the test previously only printed and asserted nothing,
        # so it could never fail.
        self.assertIsNotNone(headwords)

# Allow running this test module directly (python <file>.py).
if __name__ == '__main__':
    unittest.main()
Exemplo n.º 6
0
def text_contains_any_answer(text, answers):
    """Return True if any answer's 'text' value occurs as a substring of text."""
    return any(answer['text'] in text for answer in answers)


# Configure logging verbosity for this analysis run.
Logger.config('info')

Logger.info("Start analysis")

# Running tallies over the whole dataset.
question_counter = 0
correct_answers_counter = 0

# Built once up front — presumably construction is expensive (TODO confirm).
nlptoolkit = NLPToolkit()

# `data` is assumed to be a SQuAD-style JSON dict loaded elsewhere — TODO confirm.
for dataset in data['data']:
    title = dataset['title']
    Logger.info('Dataset: ' + title)

    # Only the first five paragraphs of each article are analysed.
    for paragraph in dataset['paragraphs'][:5]:
        context = paragraph['context']
        for question_answer_set in paragraph['qas']:
            question_counter += 1
            question = question_answer_set['question']
            correct_answers = question_answer_set['answers']
            Logger.info(question)

            doc = Document(title, context)
            docs = Documents()
            # NOTE(review): snippet appears truncated here — the loop body
            # continues beyond the visible lines.
Exemplo n.º 7
0
    def test_returns_the_sentence_containing_the_answer(self):
        """The predicted result's context must be the expected sentence."""
        predictor = AnswerPredictor(NLPToolkit())
        prediction = predictor.predict(passage, question)

        self.assertEqual(prediction['context'], correct_context)
Exemplo n.º 8
0
    def test_returns_the_expected_answer(self):
        """The predicted result's answer must match the known answer."""
        predictor = AnswerPredictor(NLPToolkit())
        prediction = predictor.predict(passage, question)

        self.assertEqual(prediction['answer'], correct_answer)
# Make the project's src directory importable relative to this script.
sys.path.append(DIR + '/../../src')
# SQuAD v1.1 dev set; expects the JSON file in the working directory.
data = json.load(open('dev-v1.1.json'))


def text_contains_any_answer(text, answers):
    """True when at least one answer's 'text' field is contained in text."""
    hits = (candidate['text'] in text for candidate in answers)
    return any(hits)


# Log only errors during the bulk run.
Logger.config('error')

Logger.error("Start analysis")

nlp = NLPToolkit()
# Accuracy bookkeeping for the run.
question_counter = 0
correct_answers_counter = 0
# Hit counts keyed 0-4 — presumably top-5 answer ranks; verify against the
# (not visible) usage further down.
correct_answer_dict = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0}

false_answer = 0

# `data` is assumed to be SQuAD-style JSON loaded elsewhere — TODO confirm.
for dataset in data['data']:
    title = dataset['title']
    # Wiki titles use underscores; keep a space-separated variant as well.
    title_ext = title.replace('_', ' ')
    Logger.error('Dataset: ' + title)

    # Only the first five paragraphs of each article are analysed.
    for paragraph in dataset['paragraphs'][:5]:
        for question_answer_set in paragraph['qas']:
            Logger.error(
                '####################################################')
            # NOTE(review): snippet appears truncated here — the loop body
            # continues beyond the visible lines.
Exemplo n.º 10
0
 def test_returns_main_article(self):
     """Retrieval for a person's name should rank their article first."""
     query = "Arnold Schwarzenegger"
     model = QuestionModel(query.split(), query)
     retrieved = document_retrieval.receive_docs(model, NLPToolkit())
     self.assertEqual(retrieved.get_doc_with_highest_rank().title,
                      'Arnold Schwarzenegger')
Exemplo n.º 11
0
# WordNet synset identifiers ('lemma.pos.sense') for the fine-grained answer
# classes — presumably the Li & Roth question-type taxonomy; TODO confirm.
FINE_CLASSES_SYNSETS = [
    'abbreviation.n.01', 'formula.n.01', 'animal.n.01', 'body.n.01',
    'color.n.01', 'creative.a.01', 'currency.n.01', 'event.n.01', 'food.n.01',
    'musical_instrument.n.01', 'speech.n.02', 'letter.n.02', 'plant.n.02',
    'merchandise.n.01', 'religion.n.01', 'sport.n.01', 'substance.n.01',
    'symbol.n.01', 'technique.n.01', 'term.n.01', 'vehicle.n.01', 'word.n.01',
    'definition.n.01', 'description.n.01', 'manner.n.01', 'reason.n.02',
    'group.n.01', 'person.n.01', 'city.n.01', 'state.n.04', 'mountain.n.01',
    'code.v.02', 'count.n.01', 'date.n.01', 'distance.n.01', 'money.n.01',
    'rate.v.01', 'period.n.05', 'percentage.n.01', 'speed.n.01',
    'temperature.n.01', 'size.n.01', 'weight.n.01', 'disease.n.01',
    'entity.n.01', 'title.n.06', 'description.n.02', 'location.n.01',
    'state.n.01', 'numeral.n.01'
]

# Shared toolkit instance, constructed once at import time.
nlp = NLPToolkit()


def get_features(questions):
    """Enrich question strings with extra lexical features.

    For a "who" question the head word of the question's noun phrase is
    appended to the question text. NOTE(review): this snippet appears
    truncated — within the visible lines `feature_enriched_questions` is
    never appended to or returned.
    """
    feature_enriched_questions = []
    for question in questions:
        doc = get_doc(question)
        wh_word = str(get_wh_word(doc))
        enriched_question = question
        if wh_word == "how":
            pass
        elif wh_word == "who":
            # Append the head word of the noun phrase as an extra token.
            enriched_question = enriched_question + " " + str(
                get_head_word_noun_phrase(doc))
        elif wh_word == "why":
            pass
Exemplo n.º 12
0
def text_contains_any_answer(text, answers):
    """Report whether text contains the 'text' of any answer dict."""
    return any(ans['text'] in text for ans in answers)


# Configure logging verbosity for this analysis run.
Logger.config('info')

Logger.info("Start analysis")

# Running tallies over the whole dataset.
question_counter = 0
correct_answers_counter = 0

# Built once up front — presumably construction is expensive (TODO confirm).
nlptoolkit = NLPToolkit()

# `data` is assumed to be a SQuAD-style JSON dict loaded elsewhere — TODO confirm.
for dataset in data['data']:
    title = dataset['title']
    Logger.info('Dataset: ' + title)

    # Only the first five paragraphs of each article are analysed.
    for paragraph in dataset['paragraphs'][:5]:
        context = paragraph['context']
        for question_answer_set in paragraph['qas']:
            question_counter += 1
            question = question_answer_set['question']
            correct_answers = question_answer_set['answers']
            Logger.info(question)

            doc = Document(title, context)
            docs = Documents()
            # NOTE(review): snippet appears truncated here — the loop body
            # continues beyond the visible lines.
Exemplo n.º 13
0
            return True
    return False


def is_correct_article(title, correct_title) -> bool:
    """Return True if `title` equals `correct_title` with underscores read as spaces.

    Wiki-style titles use underscores while retrieved titles use spaces.
    Fix: collapsed the redundant `if/else: return True/False` into a
    direct boolean return.
    """
    return title == correct_title.replace("_", " ")


# Only log errors during the bulk run.
Logger.config('error')

Logger.info("Start document_retrieval analysis")

nlpToolkit = NLPToolkit()
# Accuracy counters: how often the correct article appears at each rank.
question_counter = 0
correct_article_counter = 0
correct_firstArticle_counter = 0
correct_secondArticle_counter = 0
correct_thirdArticle_counter = 0
correct_answers_counter = 0

# `data` is assumed to be SQuAD-style JSON loaded elsewhere — TODO confirm.
for dataset in data['data']:
    title = dataset['title']
    Logger.info('Dataset: ' + title)

    # Only the first five paragraphs of each article are analysed.
    for paragraph in dataset['paragraphs'][:5]:
        for question_answer_set in paragraph['qas']:
            question_counter += 1
            question = question_answer_set['question']
            # NOTE(review): snippet appears truncated here — the loop body
            # continues beyond the visible lines.