def get_candidate_answers(question, text, W2vecextractor, q_verb, sgraphs, useWord2Vec=False, useVerb=True):
    """Score every sentence of ``text`` against the question and rank them.

    Exactly one scoring strategy is applied per call:

    * ``useWord2Vec`` and not ``useVerb``: cosine similarity between
      ``W2vecextractor.sent2vec(question)`` and ``sent2vec`` of each sentence.
    * ``useWord2Vec`` and ``useVerb``: cosine similarity between
      ``W2vecextractor.word2v(q_verb)`` and ``word2v`` of the word returned
      by ``find_main(sgraphs[i])['word']`` for sentence ``i``.
    * otherwise: the number of words shared by the question's and the
      sentence's bag-of-words (``get_bow`` with NLTK English stopwords).

    Args:
        question: The question text.
        text: The passage to search; split with ``nltk.sent_tokenize``.
        W2vecextractor: Object providing ``sent2vec`` and ``word2v``.
        q_verb: Main word of the question (only used when ``useVerb``).
        sgraphs: Per-sentence dependency graphs, indexed by sentence position
            (only used when ``useVerb``).
        useWord2Vec: Use embedding similarity instead of word overlap.
        useVerb: With ``useWord2Vec``, compare main words rather than
            whole-sentence vectors.

    Returns:
        A list of ``(score, sentence_index, sentence)`` tuples sorted by
        ``score`` in descending order; empty when ``text`` has no sentences.
    """
    sentences = nltk.sent_tokenize(text)
    candidate_answers = []

    if useWord2Vec and not useVerb:
        # Whole-sentence embedding similarity.
        q_feat = W2vecextractor.sent2vec(question)
        for i, sent in enumerate(sentences):
            a_feat = W2vecextractor.sent2vec(sent)
            dist = cosine_similarity(q_feat, a_feat)
            candidate_answers.append((dist, i, sent))
    elif useWord2Vec and useVerb:
        # `elif` (not a second `if`): previously the bag-of-words `else`
        # below also ran after the sent2vec branch, appending every sentence
        # twice and mixing cosine scores with integer overlap counts.
        q_feat = W2vecextractor.word2v(q_verb)
        for i, sent in enumerate(sentences):
            # NOTE(review): assumes sgraphs[i] is the parse of sentences[i],
            # i.e. the dep file and sent_tokenize agree on sentence
            # boundaries -- confirm against the caller.
            s_verb = find_main(sgraphs[i])['word']
            a_feat = W2vecextractor.word2v(s_verb)
            dist = cosine_similarity(q_feat, a_feat)
            candidate_answers.append((dist, i, sent))
    else:
        # Bag-of-words overlap; stopwords are only needed on this path.
        stopwords = set(nltk.corpus.stopwords.words("english"))
        qbow = get_bow(question, stopwords)
        for i, sent in enumerate(sentences):
            sbow = get_bow(sent, stopwords)
            # & is the set intersection operator
            overlap = len(qbow & sbow)
            candidate_answers.append((overlap, i, sent))

    # sorted() is ascending by default, so reverse to put the best score first.
    return sorted(candidate_answers, key=lambda x: x[0], reverse=True)
def baseline_word2vec_verb(question, sentences, stopwords, W2vecextractor, q_verb, sgraphs):
    """Return the sentence whose main word is most similar to the question's.

    Each sentence ``i`` is scored by the cosine similarity between
    ``W2vecextractor.word2v(q_verb)`` and ``word2v`` of the word returned by
    ``find_main(sgraphs[i])['word']``. The roots are printed as a trace.

    Args:
        question: Unused; kept for signature compatibility.
        sentences: Candidate sentences, indexable in step with ``sgraphs``.
        stopwords: Unused; kept for signature compatibility.
        W2vecextractor: Object providing ``word2v``.
        q_verb: Main word of the question.
        sgraphs: Per-sentence dependency graphs; ``sgraphs[i]`` is looked up
            for every ``sentences[i]``.

    Returns:
        The highest-scoring sentence (the earliest one on ties), or ``None``
        when ``sentences`` is empty.
    """
    q_feat = W2vecextractor.word2v(q_verb)
    print("ROOT of question: " + str(q_verb))

    candidate_answers = []
    for i, sent in enumerate(sentences):
        s_verb = find_main(sgraphs[i])['word']
        print("ROOT of sentence: " + str(s_verb))
        a_feat = W2vecextractor.word2v(s_verb)
        # Inputs are wrapped in lists and the first row of the result is
        # kept as the score, as in the original call.
        dist = cosine_similarity([q_feat], [a_feat])
        candidate_answers.append((dist[0], sent))

    # Guard against an empty candidate list: indexing answers[0] below would
    # otherwise raise IndexError when there are no sentences to rank.
    if not candidate_answers:
        return None

    # Stable descending sort, so the earliest sentence wins among equal scores.
    answers = sorted(candidate_answers, key=operator.itemgetter(0), reverse=True)
    return answers[0][1]
qgraphs = read_dep_parses(fname + ".questions.dep") for j in range(0, len(questions)): qname = "{0}-{1}".format(fname, j + 1) if qname in questions: print("QuestionID: " + qname) question = questions[qname]['Question'] print(question) qtypes = questions[qname]['Type'] # Get the question dep graph qgraph = qgraphs[i] # Get main verb in the question q_verb = find_main(qgraph)['word'] answer = None # qtypes can be "Story", "Sch", "Sch | Story" for qt in qtypes.split("|"): qt = qt.strip().lower() # These are the text data where you can look for answers. raw_text = data_dict[qt] par_text = data_dict[qt + ".par"] dep_text = data_dict[qt + ".dep"] # get the applicable dep file for finding the answer ans_dep_file = fname + "." + str(qt) + ".dep" # get the dep graphs for all sentences in the answer file sgraphs = read_dep_parses(ans_dep_file)