def calculate_idf_dictionary(pair_attributes):
    """Build a document-frequency dictionary from (text, hypothesis) pairs.

    Each pair forms one "document": the set of lower-cased lemmas drawn from
    both its text node and its hypothesis node.  The returned dict maps every
    lemma seen anywhere to the number of documents it occurs in.
    """
    documents = []
    for t, h, id_num, e, ta in pair_attributes:
        t_lemmas, pos = xml_util.get_lemmas_from_text_node(t)
        h_lemmas, pos = xml_util.get_lemmas_from_text_node(h)
        doc = set()
        for word in t_lemmas:
            doc.add(word.lower())
        for word in h_lemmas:
            doc.add(word.lower())
        documents.append(doc)
    # Count document frequency in a single pass over the documents instead of
    # scanning every document once per distinct word (was O(|vocab| * |docs|)).
    # Every word in the old word_set occurs in at least one document, so the
    # resulting counts are identical to the original's.
    idf_dict = {}
    for document in documents:
        for word in document:
            idf_dict[word] = idf_dict.get(word, 0) + 1
    return idf_dict
def calculatePolarity(self):
    """For every processed pair, emit 0 when text and hypothesis contain the
    same parity (odd/even count) of negation words, and 1 otherwise.
    """
    negatives = frozenset([
        "not", "refuse", "wrong", "deny", "no", "false", "ignore",
        "cannot", "can't", "never", "unsuccessfully",
    ])
    results = []
    for t, h, id_num, e, ta in self.processedPairs[:]:
        id_num = int(id_num)
        text, pos = xml_util.get_lemmas_from_text_node(t)
        hypothesis, pos = xml_util.get_lemmas_from_text_node(h)
        t_neg = sum(1 for word in text if word in negatives)
        h_neg = sum(1 for word in hypothesis if word in negatives)
        # Same parity on both sides -> 0, differing parity -> 1.
        results.append(0 if t_neg % 2 == h_neg % 2 else 1)
    return results
def calculatePolarity(self):
    """Return 0 per pair when text and hypothesis hold an equal parity of
    negation cues, 1 when the parities differ.
    """
    NEGATION_WORDS = ["not", "refuse", "wrong", "deny", "no", "false",
                      "ignore", "cannot", "can't", "never", "unsuccessfully"]

    def parity_differs(text_words, hyp_words):
        # Compare the odd/even parity of negation-word counts on each side.
        t_neg = len([w for w in text_words if w in NEGATION_WORDS])
        h_neg = len([w for w in hyp_words if w in NEGATION_WORDS])
        return 1 if t_neg % 2 != h_neg % 2 else 0

    results = []
    for pair in self.processedPairs[:]:
        t, h, id_num, e, ta = pair
        id_num = int(id_num)
        text, pos = xml_util.get_lemmas_from_text_node(t)
        hypothesis, pos = xml_util.get_lemmas_from_text_node(h)
        results.append(parity_differs(text, hypothesis))
    return results
def calculateBigrams(self):
    """Return the modified BLEU score between text and hypothesis lemmas for
    every processed pair, in pair order.
    """
    # The original built an `entailments` list and converted `id_num` to int,
    # but never used either; both dead locals are removed.
    results = []
    for t, h, id_num, e, ta in self.processedPairs[:]:
        t_lemmas, pos = xml_util.get_lemmas_from_text_node(t)
        h_lemmas, pos = xml_util.get_lemmas_from_text_node(h)
        results.append(bleu.modified_bleu(t_lemmas, h_lemmas))
    return results
def calculateBigrams(self):
    """Compute the modified BLEU score of (text lemmas, hypothesis lemmas)
    for each processed pair and return the scores in pair order.
    """
    pair_attributes = self.processedPairs[:]
    results = []
    # Dead code removed: the `entailments` list and the int(id_num)
    # conversion in the original were never read.
    for pair in pair_attributes:
        t, h, id_num, e, ta = pair
        t_lemmas, pos = xml_util.get_lemmas_from_text_node(t)
        h_lemmas, pos = xml_util.get_lemmas_from_text_node(h)
        results.append(bleu.modified_bleu(t_lemmas, h_lemmas))
    return results
def bleu_matching(threshold, pairs): pair_attributes = pairs[:] n = len(pair_attributes) results = [0 for foo in range(n+1)] entailments = [0 for foo in range(n+1)] for i in range(n): t,h,id_num,e,ta = pair_attributes[i] id_num = int(id_num) t_lemmas,pos = xml_util.get_lemmas_from_text_node(t) h_lemmas,pos = xml_util.get_lemmas_from_text_node(h) entailments[id_num] = calculate_entailment(t_lemmas,h_lemmas,threshold) if e == entailments[id_num]: results[id_num] = 1 lexical.output_rte(entailments) print "Threshold: " + "%.2f"%threshold + " Accuracy: " + str(float(sum(results)) / float(n))
def bleu_matching(threshold, pairs): pair_attributes = pairs[:] n = len(pair_attributes) results = [0 for foo in range(n + 1)] entailments = [0 for foo in range(n + 1)] for i in range(n): t, h, id_num, e, ta = pair_attributes[i] id_num = int(id_num) t_lemmas, pos = xml_util.get_lemmas_from_text_node(t) h_lemmas, pos = xml_util.get_lemmas_from_text_node(h) entailments[id_num] = calculate_entailment(t_lemmas, h_lemmas, threshold) if e == entailments[id_num]: results[id_num] = 1 lexical.output_rte(entailments) print "Threshold: " + "%.2f" % threshold + " Accuracy: " + str( float(sum(results)) / float(n))
def idf_weighting(threshold, pairs): pair_attributes = pairs[:] words = [] documents = [] n = len(pair_attributes) results = [0 for foo in range(n + 1)] entailments = [0 for foo in range(n + 1)] # Starts by adding all the words to the list 'words' and then making a set of these words # Also makes a list of documents where each document is a set of all the words in a given # (text, hypothesis) pair. for i in range(n): t, h, id_num, e, ta = pair_attributes[i] t_lemmas, pos = xml_util.get_lemmas_from_text_node(t) h_lemmas, pos = xml_util.get_lemmas_from_text_node(h) doc = [] for word in t_lemmas: words.append(word.lower()) doc.append(word.lower()) for word in h_lemmas: words.append(word.lower()) doc.append(word.lower()) documents.append(set(doc)) word_set = set(words) # Creates a dictionary 'idf_dict' that can be used to count how many document each word is present in idf_dict = {} # Starts by initiating the count for all words to 0 for word in word_set: idf_dict[word] = 0 # Then calculates the number of documents each word in the word_set appears in for word in word_set: for document in documents: if word in document: idf_dict[word] += 1 print "dict done" for i in range(n): t, h, id_num, e, ta = pair_attributes[i] id_num = int(id_num) t_lemmas, pos = xml_util.get_lemmas_from_text_node(t) h_lemmas, pos = xml_util.get_lemmas_from_text_node(h) entailments[id_num] = calculate_entailment(t_lemmas, h_lemmas, idf_dict, threshold) results[id_num] = 1 if e == entailments[id_num] else 0 lexical.output_rte(entailments) print "Threshold: " + "%.2f" % threshold + " Accuracy: " + str( float(sum(results)) / float(n))
def idf_weighting(threshold, pairs): pair_attributes = pairs[:] words = [] documents = [] n = len(pair_attributes) results = [0 for foo in range(n + 1)] entailments = [0 for foo in range(n + 1)] # Starts by adding all the words to the list 'words' and then making a set of these words # Also makes a list of documents where each document is a set of all the words in a given # (text, hypothesis) pair. for i in range(n): t, h, id_num, e, ta = pair_attributes[i] t_lemmas, pos = xml_util.get_lemmas_from_text_node(t) h_lemmas, pos = xml_util.get_lemmas_from_text_node(h) doc = [] for word in t_lemmas: words.append(word.lower()) doc.append(word.lower()) for word in h_lemmas: words.append(word.lower()) doc.append(word.lower()) documents.append(set(doc)) word_set = set(words) # Creates a dictionary 'idf_dict' that can be used to count how many document each word is present in idf_dict = {} # Starts by initiating the count for all words to 0 for word in word_set: idf_dict[word] = 0 # Then calculates the number of documents each word in the word_set appears in for word in word_set: for document in documents: if word in document: idf_dict[word] += 1 print "dict done" for i in range(n): t, h, id_num, e, ta = pair_attributes[i] id_num = int(id_num) t_lemmas, pos = xml_util.get_lemmas_from_text_node(t) h_lemmas, pos = xml_util.get_lemmas_from_text_node(h) entailments[id_num] = calculate_entailment(t_lemmas, h_lemmas, idf_dict, threshold) results[id_num] = 1 if e == entailments[id_num] else 0 lexical.output_rte(entailments) print "Threshold: " + "%.2f" % threshold + " Accuracy: " + str(float(sum(results)) / float(n))
def calculateIdf(self):
    """Return, per pair, the IDF-weighted fraction of hypothesis words that
    have a synonym appearing in the text.

    Each hypothesis word contributes weight 1/df (df = its document
    frequency), so rarer words count proportionally more.
    """
    pair_attributes = self.processedPairs[:]
    idf_dict = syntax_matching.calculate_idf_dictionary(pair_attributes)
    results = []
    for t, h, id_num, e, ta in pair_attributes:
        text, pos = xml_util.get_lemmas_from_text_node(t)
        hypothesis, pos = xml_util.get_lemmas_from_text_node(h)
        idf_counter = 0.0
        idf_divider = 0.0
        for word in hypothesis:
            # Compute the word's IDF weight once (the original evaluated
            # 1.0 / idf_dict[word.lower()] twice per word).
            weight = 1.0 / float(idf_dict[word.lower()])
            for w in Synonyms.FindAllSynonyms(word):
                if w in text:
                    idf_counter += weight
                    break
            idf_divider += weight
        # Guard: an empty hypothesis left idf_divider at 0.0 and crashed the
        # original with ZeroDivisionError; report a 0.0 match instead.
        results.append(idf_counter / idf_divider if idf_divider else 0.0)
    return results
def calculateIdf(self):
    """IDF-weighted synonym-match score per pair: the fraction (by 1/df
    weight) of hypothesis words whose synonyms occur in the text.
    """
    pair_attributes = self.processedPairs[:]
    idf_dict = syntax_matching.calculate_idf_dictionary(pair_attributes)
    results = []
    n = len(pair_attributes)
    for i in range(n):
        t, h, id_num, e, ta = pair_attributes[i]
        id_num = int(id_num)
        text, pos = xml_util.get_lemmas_from_text_node(t)
        hypothesis, pos = xml_util.get_lemmas_from_text_node(h)
        idf_counter = 0.0
        idf_divider = 0.0
        for word in hypothesis:
            # Hoisted: the original computed this reciprocal twice per word.
            weight = 1.0 / float(idf_dict[word.lower()])
            for w in Synonyms.FindAllSynonyms(word):
                if w in text:
                    idf_counter += weight
                    break
            idf_divider += weight
        # Empty hypothesis previously raised ZeroDivisionError; score 0.0.
        idf_word_match = idf_counter / idf_divider if idf_divider else 0.0
        results.append(idf_word_match)
    return results
def calculateLemmas(self):
    """Return, per pair, the fraction of distinct hypothesis lemmas that also
    occur among the text lemmas.
    """
    # Dead locals removed: the original's `entailments` list and int(id_num)
    # conversion were never used.
    results = []
    for t, h, id_num, e, ta in self.processedPairs[:]:
        text, pos = xml_util.get_lemmas_from_text_node(t)
        hypothesis, pos = xml_util.get_lemmas_from_text_node(h)
        # De-duplicate hypothesis lemmas preserving first-seen order; the
        # `seen` set avoids the original's O(n^2) list-membership scan.
        seen = set()
        unique_hyp = []
        for word in hypothesis:
            if word not in seen:
                seen.add(word)
                unique_hyp.append(word)
        matching_words = sum(1 for word in unique_hyp if word in text)
        # Guard: an empty hypothesis crashed the original with
        # ZeroDivisionError; report 0.0 overlap instead.
        if unique_hyp:
            results.append(float(matching_words) / float(len(unique_hyp)))
        else:
            results.append(0.0)
    return results
def calculatePOS(self):
    """Return, per pair, the fraction of hypothesis (lemma, POS) tokens that
    also occur among the text's (lemma, POS) tokens.
    """
    results = []
    for t_node, h_node, id_num, e, ta in self.processedPairs[:]:
        t_lemmas, t_pos = xml_util.get_lemmas_from_text_node(t_node)
        h_lemmas, h_pos = xml_util.get_lemmas_from_text_node(h_node)
        # zip pairs each lemma with its POS tag, replacing the original's
        # index loops whose variable `i` shadowed the outer loop index.
        text = list(zip(t_lemmas, t_pos))
        hypothesis = list(zip(h_lemmas, h_pos))
        matching_words = sum(1 for token in hypothesis if token in text)
        # Guard: empty hypothesis crashed the original with ZeroDivisionError.
        if hypothesis:
            results.append(float(matching_words) / float(len(hypothesis)))
        else:
            results.append(0.0)
    return results
def calculateLemmas(self):
    """Lemma-overlap score per pair: the fraction of distinct hypothesis
    lemmas that appear among the text lemmas.
    """
    pair_attributes = self.processedPairs[:]
    results = []
    # Removed dead code from the original: the `entailments` list and the
    # int(id_num) conversion were never read.
    for pair in pair_attributes:
        t, h, id_num, e, ta = pair
        text, pos = xml_util.get_lemmas_from_text_node(t)
        hypothesis, pos = xml_util.get_lemmas_from_text_node(h)
        # Keep only the first occurrence of each lemma; a set gives O(1)
        # membership versus the original's O(n) list scans.
        seen = set()
        deduped = []
        for word in hypothesis:
            if word not in seen:
                seen.add(word)
                deduped.append(word)
        matching_words = 0
        for word in deduped:
            if word in text:
                matching_words += 1
        # Empty hypothesis previously raised ZeroDivisionError; return 0.0.
        if deduped:
            results.append(float(matching_words) / float(len(deduped)))
        else:
            results.append(0.0)
    return results
def lemma_matching(threshold, pairs): pair_attributes = pairs[:] n = len(pair_attributes) results = [0 for foo in range(n+1)] entailments = [0 for foo in range(n+1)] for i in range(n): t,h,id_num,e,ta = pair_attributes[i] id_num = int(id_num) t_lemmas,t_pos = xml_util.get_lemmas_from_text_node(t) h_lemmas,h_pos = xml_util.get_lemmas_from_text_node(h) text = [] for i in range(len(t_lemmas)): text.append((t_lemmas[i],t_pos[i])) hypothesis = [] for i in range(len(h_lemmas)): hypothesis.append((h_lemmas[i],h_pos[i])) entailments[id_num] = calculate_entailment(text,hypothesis,threshold) if (e == entailments[id_num]): results[id_num] = 1 else: results[id_num] = 0 lexical.output_rte(entailments) print "Threshold: " + "%.2f"%threshold + " Accuracy: " + str(float(sum(results)) / float(n))
def calculatePOS(self):
    """POS-aware overlap score per pair: the fraction of hypothesis
    (lemma, POS) tokens also found among the text's (lemma, POS) tokens.
    """
    pair_attributes = self.processedPairs[:]
    n = len(pair_attributes)
    results = []
    for i in range(n):
        text_node, hyp_node, id_num, e, ta = pair_attributes[i]
        id_num = int(id_num)
        t_lemmas, t_pos = xml_util.get_lemmas_from_text_node(text_node)
        h_lemmas, h_pos = xml_util.get_lemmas_from_text_node(hyp_node)
        # zip replaces the original's manual index loops, whose loop variable
        # `i` shadowed the outer loop index.
        text = list(zip(t_lemmas, t_pos))
        hypothesis = list(zip(h_lemmas, h_pos))
        matching_words = 0
        for token in hypothesis:
            if token in text:
                matching_words += 1
        # Empty hypothesis previously raised ZeroDivisionError; score 0.0.
        if hypothesis:
            results.append(float(matching_words) / float(len(hypothesis)))
        else:
            results.append(0.0)
    return results
def lemma_matching(threshold, pairs): pair_attributes = pairs[:] n = len(pair_attributes) results = [0 for foo in range(n + 1)] entailments = [0 for foo in range(n + 1)] for i in range(n): t, h, id_num, e, ta = pair_attributes[i] id_num = int(id_num) t_lemmas, t_pos = xml_util.get_lemmas_from_text_node(t) h_lemmas, h_pos = xml_util.get_lemmas_from_text_node(h) text = [] for i in range(len(t_lemmas)): text.append((t_lemmas[i], t_pos[i])) hypothesis = [] for i in range(len(h_lemmas)): hypothesis.append((h_lemmas[i], h_pos[i])) entailments[id_num] = calculate_entailment(text, hypothesis, threshold) if (e == entailments[id_num]): results[id_num] = 1 else: results[id_num] = 0 lexical.output_rte(entailments) print "Threshold: " + "%.2f" % threshold + " Accuracy: " + str( float(sum(results)) / float(n))