def rankDocument(doc, q):
    titles = doc.find_all("content", attrs={"name": ["title", "altTitle"]})
    hl = 0   # count of terms highlighted by the search engine
    sim = 0  # query-defined similarity with the titles
    # We keep the maximum score over all the title entries
    for t in titles:
        c = BeautifulSoup(t.get_text(), "html.parser")
        hl = max(hl, len(c.find_all("span")))
        sim = max(sim, q.similarity(Analysis(c.get_text(), superficial=True)))
    return hl + sim
def rankResult(result, q):
    title_similarity = q.similarity(Analysis(result['title'], superficial=True))
    snippet = BeautifulSoup(result['snippet'], 'html.parser')
    matches = snippet.find_all('span', attrs={"class": "searchmatch"})
    match_score = len(set(m.get_text() for m in matches))
    #print('{}: {}+{}'.format(result['title'], title_similarity, match_score))
    title_weight = 1
    match_weight = 0.1
    return title_weight * title_similarity + match_weight * match_score
def response(data_file):
    with open(data_file, 'r') as file:
        data = json.load(file)
    #print(json.dumps(data, indent=4, sort_keys=True))
    ground = Ground()
    for i in data['questions']:
        question = Analysis(i['body'])
        question_type = question.graph.question_type
        doc = Analysis(retrieveDocument(question))
        ground.teardown()
        ground.add_text(doc)
        answers = ground.ask_question(question)
        print('------------------------------------')
        print('Question: ' + i['body'])
        print('Question type: ' + question_type)
        print('Ideal answer: ', i['ideal_answer'])
        if len(answers) == 0:
            print("Answer: Sorry, I don't know the answer.")
        else:
            print('Answer: ', '\n'.join(compose_answer(question, a) for a in answers))
    results = json.loads(response.text)['query']['search']
    best_result = max(results, key=lambda r: rankResult(r, q))
    page = wikiRetrieve(best_result['pageid'])
    content = BeautifulSoup(
        json.loads(page.text)['parse']['text']['*'], 'html.parser')
    return ' '.join(p.get_text() for p in content.find_all('p'))


def retrieveDocument(q):
    r = wikiSearch(q.content_words())
    doc = getMostRelevantDocument(r, q)
    return ' '.join(doc.split())


if __name__ == "__main__":
    q = Analysis("What is the internet of things?")
    print(q.content_words())
    r = wikiSearch(q.content_words())
    print("{} -> {}".format(r.url, r.status_code))
    results = json.loads(r.text)['query']['search']
    print([(r['title'], r['snippet']) for r in results])
    page = wikiRetrieve(results[0]['pageid'])
    print(json.loads(page.text)['parse']['text']['*'])
    doc = getMostRelevantDocument(r, q)
    print(doc)
    print(retrieveDocument(Analysis("What is a heart attack?")))
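The listing above relies on wikiSearch and wikiRetrieve, which are not defined in this fragment. The sketch below shows one plausible implementation on top of the standard MediaWiki API; the endpoint and query parameters are the documented ones, but the helper names and exact behaviour are assumptions made for illustration.

import requests

WIKI_API = 'https://en.wikipedia.org/w/api.php'

def wikiSearch(words):
    # Full-text search; returns the raw requests.Response whose JSON body is
    # read as json.loads(r.text)['query']['search'] above.
    return requests.get(WIKI_API, params={
        'action': 'query',
        'list': 'search',
        'srsearch': ' '.join(words),
        'format': 'json'
    })

def wikiRetrieve(pageid):
    # Fetches the rendered HTML of a page by its numeric id, exposed under
    # ['parse']['text']['*'] in the JSON response.
    return requests.get(WIKI_API, params={
        'action': 'parse',
        'pageid': pageid,
        'prop': 'text',
        'format': 'json'
    })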
    for result in cur.fetchall():
        tmp = {}
        tmp['content'] = str(list(result)[0])
        data.append(tmp)
        index = index + 1
    disconnect(conn)
    client = MongoClient('')
    client.tfgchat.test.delete_many({})
    client.tfgchat.test.insert_many(data)


def getMostRelevantDocument(q):
    client = MongoClient('')
    query = ' '.join(q.content_words())
    client.tfgchat.test.create_index([('content', "text")])
    cursor = client.tfgchat.test.find_one(
        {"$text": {"$search": query}},
        {"score": {"$meta": "textScore"}})
    return BeautifulSoup(cursor.get('content'), 'html.parser').get_text()


def retrieveDocument(q):
    #createDatabase(2000)
    return getMostRelevantDocument(q)


if __name__ == "__main__":
    q = Analysis("What is a heart attack?")
    print(retrieveDocument(q))
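Note that find_one combined with a $text filter returns an arbitrary matching document rather than the highest-scoring one. If relevance ranking is desired, the query can sort on the projected text score; a minimal alternative sketch reusing the same collection is shown below (an illustration, not the project's actual code).

def getMostRelevantDocumentSorted(q):
    # Alternative sketch: sort explicitly by the $text relevance score so the
    # top-ranked document is returned.
    client = MongoClient('')
    query = ' '.join(q.content_words())
    client.tfgchat.test.create_index([('content', "text")])
    cursor = (client.tfgchat.test
              .find({"$text": {"$search": query}},
                    {"score": {"$meta": "textScore"}})
              .sort([("score", {"$meta": "textScore"})])
              .limit(1))
    best = next(cursor, None)
    return BeautifulSoup(best['content'], 'html.parser').get_text() if best else ''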
    docs = parsed.find_all('document')
    doc = max(docs, key=lambda d: rankDocument(d, q))
    html = doc.find("content", attrs={"name": "FullSummary"}).get_text()
    return BeautifulSoup(html, "html.parser").get_text(" ")


'''
Retrieves the documents that may contain the answer to the question
introduced by the user. To do so, it uses the result of the analysis
of that question (parameter q).
'''
def retrieveDocument(q):
    r = medlineSearch(q.content_words())
    doc = getMostRelevantDocument(r, q)
    return ' '.join(doc.split())


if __name__ == "__main__":
    q = Analysis("What are the causes of blood infection?")
    print(q.content_words())
    r = medlineSearch(q.content_words())
    print("{} -> {}".format(r.url, r.status_code))
    doc = getMostRelevantDocument(r, q)
    print(doc)
    print(retrieveDocument(Analysis("What are the causes of a heart attack?")))
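As with the Wikipedia module, medlineSearch is defined outside this fragment. A minimal sketch against the public MedlinePlus Web Service follows; the endpoint is the documented one, while the helper name and its parameters are assumptions mirroring how it is called above.

import requests

MEDLINE_API = 'https://wsearch.nlm.nih.gov/ws/query'

def medlineSearch(words):
    # Searches MedlinePlus health topics; the XML response carries the
    # <document> / <content name="..."> elements parsed by rankDocument and
    # getMostRelevantDocument above.
    return requests.get(MEDLINE_API, params={
        'db': 'healthTopics',
        'term': ' '.join(words)
    })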
def test_init():
    global Analysis, nlg
    import nbimporter
    from natural_language import Analysis
    from grafeno import linearizers
    nlg = linearizers.get_pipeline(['node_edges'])


# Test case showing the behavior of the Ground class
if __name__ == "__main__":
    test_init()
    ground = Ground()
    ground.teardown()
    ground.add_text(
        Analysis(
            "John loves Mary. John loves very cute dogs. Peter hates Susan. "
            "Susan loves John. Paul loves Joana. Joana loves Paul."
        ))
    answers = ground.ask_question(Analysis("Who loves John"))
    for answer in answers:
        print(answer.linearize(linearizer=nlg))
    print('##########################################################')
    answers = ground.ask_question(Analysis("John loves who"))
    for answer in answers:
        print(answer.linearize(linearizer=nlg))
def do_tests(original_path, destination_path, type_filter='none'):
    json_file = json.load(open(original_path))
    all_tests = []
    errors = {}
    # We create a single Ground
    ground = Ground()
    for question in json_file['questions']:
        # Error handling
        snipped_errors = ""
        question_errors = ""
        compose_answer_errors = False
        question_text = question['body']
        ideal_answer = question['ideal_answer']
        # Clears the Ground
        ground.teardown()
        # Adds all the snippets to the current knowledge base
        for snippet in question['snippets']:
            snippet_text = snippet['text']
            try:
                ground.add_text(Analysis(snippet_text))
            except KeyError:
                snipped_errors = snippet_text
        try:
            # Analyzes the query
            q = Analysis(question['body'])
            question_type = q.graph.question_type
            # If the filter excludes this question type, skip to the next iteration
            if type_filter != 'none' and type_filter != question_type:
                continue
            print('------------------------------------')
            print('Question: ' + question_text)
            print('Question type: ' + question_type)
            print('Ideal answer: ' + ideal_answer[0])
            # We ask a question to the current knowledge base
            answers = ground.ask_question(q)
        except:
            question_errors = question['body']
            print(' Question errors: ' + question_errors)
        try:
            # Translates the answer into a natural language sentence
            answers = response(q, answers, question_type)
        except:
            print(' Compose answer errors')
            compose_answer_errors = True
        print('Answer: ' + answers)
        # We collect all the relevant information
        all_tests.append({
            'question': question_text,
            'answer': answers,
            'ideal_answer': ideal_answer,
            'snipped_errors': snipped_errors,
            'question_errors': question_errors,
            'compose_answer_errors': compose_answer_errors
        })
    # We collect all the questions
    full = {"questions": all_tests}
    # We save all the questions to the specified file
    dump = json.dumps(full)
    f = open(destination_path, "w")
    f.write(dump)
    f.close()
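A typical invocation, with hypothetical file names, would look like the following; type_filter can be set to one of the question types reported by Analysis(...).graph.question_type to restrict the run.

# Hypothetical paths; the actual dataset and output files depend on the setup.
do_tests('BioASQ-training.json', 'test_results.json')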