示例#1
0
def calculate_idf_dictionary(pair_attributes):
    """Build a document-frequency dictionary over all (text, hypothesis) pairs.

    Each pair counts as one document: the set of lower-cased lemmas that
    appear in either its text or its hypothesis.  The returned dictionary
    maps every lemma seen anywhere in the corpus to the number of documents
    containing it.

    pair_attributes -- sequence of (t, h, id_num, e, ta) tuples whose t and
                       h entries are XML nodes understood by xml_util.
    Returns a dict {lowercase lemma: document count}.
    """
    words = []
    documents = []
    for t, h, id_num, e, ta in pair_attributes:
        t_lemmas, pos = xml_util.get_lemmas_from_text_node(t)
        h_lemmas, pos = xml_util.get_lemmas_from_text_node(h)
        # One document = all lemmas from both sides of the pair, lower-cased.
        doc = [w.lower() for w in t_lemmas] + [w.lower() for w in h_lemmas]
        words.extend(doc)
        documents.append(set(doc))
    # Count how many documents contain each distinct word.  A single pass
    # over the documents is O(total words), replacing the original
    # O(|vocabulary| * |documents|) scan of every word against every document.
    idf_dict = dict.fromkeys(words, 0)
    for document in documents:
        for word in document:
            idf_dict[word] += 1
    return idf_dict
示例#2
0
    def calculatePolarity(self):
        """Return a 0/1 polarity-mismatch feature for every processed pair.

        A pair scores 1 when its text and hypothesis differ in negation
        parity (one side contains an odd number of negation words, the
        other an even number), suggesting the hypothesis flips the text's
        polarity; otherwise it scores 0.
        """
        # frozenset gives O(1) membership tests instead of scanning a list
        # for every word.
        negatives = frozenset([
            "not", "refuse", "wrong", "deny", "no", "false", "ignore",
            "cannot", "can't", "never", "unsuccessfully"
        ])

        def polarity_mismatch(text, hypothesis):
            # Counting modulo 2 captures whether negations cancel out.
            text_neg = sum(1 for word in text if word in negatives)
            hyp_neg = sum(1 for word in hypothesis if word in negatives)
            return 0 if (text_neg % 2) == (hyp_neg % 2) else 1

        results = []
        for t, h, id_num, e, ta in self.processedPairs[:]:
            text, pos = xml_util.get_lemmas_from_text_node(t)
            hypothesis, pos = xml_util.get_lemmas_from_text_node(h)
            results.append(polarity_mismatch(text, hypothesis))
        return results
    def calculatePolarity(self):
        """Return a list with one 0/1 polarity feature per processed pair.

        The feature is 1 when text and hypothesis disagree on negation
        parity (odd vs. even count of negation words), else 0.
        """
        negation_words = ["not", "refuse", "wrong", "deny", "no", "false",
                          "ignore", "cannot", "can't", "never",
                          "unsuccessfully"]

        def parity_differs(text_words, hyp_words):
            # Compare the parity (odd/even) of negation-word counts.
            in_text = sum(1 for w in text_words if w in negation_words)
            in_hyp = sum(1 for w in hyp_words if w in negation_words)
            return 1 if (in_text % 2) != (in_hyp % 2) else 0

        features = []
        for pair in self.processedPairs[:]:
            t, h, id_num, e, ta = pair
            id_num = int(id_num)
            text_words, pos = xml_util.get_lemmas_from_text_node(t)
            hyp_words, pos = xml_util.get_lemmas_from_text_node(h)
            features.append(parity_differs(text_words, hyp_words))
        return features
示例#4
0
    def calculateBigrams(self):
        """Return the modified-BLEU bigram score for every processed pair.

        Each score compares the text's lemmas against the hypothesis's
        lemmas via bleu.modified_bleu, in input order.
        """
        # The original allocated an `entailments` buffer and converted
        # id_num to int without ever using either; both were dead code.
        results = []
        for t, h, id_num, e, ta in self.processedPairs[:]:
            t_lemmas, pos = xml_util.get_lemmas_from_text_node(t)
            h_lemmas, pos = xml_util.get_lemmas_from_text_node(h)
            results.append(bleu.modified_bleu(t_lemmas, h_lemmas))
        return results
    def calculateBigrams(self):
        """Score every processed pair with the modified BLEU bigram metric.

        Returns one bleu.modified_bleu(text_lemmas, hypothesis_lemmas)
        score per pair, in input order.
        """
        # Dead code removed: the unused `entailments` list and the unused
        # int(id_num) conversion.
        results = []
        for t, h, id_num, e, ta in self.processedPairs[:]:
            t_lemmas, pos = xml_util.get_lemmas_from_text_node(t)
            h_lemmas, pos = xml_util.get_lemmas_from_text_node(h)
            results.append(bleu.modified_bleu(t_lemmas, h_lemmas))
        return results
示例#6
0
def bleu_matching(threshold, pairs):
    pair_attributes = pairs[:]
    n = len(pair_attributes)
    results = [0 for foo in range(n+1)]
    entailments = [0 for foo in range(n+1)]
    for i in range(n):
        t,h,id_num,e,ta = pair_attributes[i]
        id_num = int(id_num)
        t_lemmas,pos = xml_util.get_lemmas_from_text_node(t)
        h_lemmas,pos = xml_util.get_lemmas_from_text_node(h)
        entailments[id_num] = calculate_entailment(t_lemmas,h_lemmas,threshold)
        if e == entailments[id_num]:
            results[id_num] = 1
    lexical.output_rte(entailments)
    print "Threshold: " + "%.2f"%threshold + " Accuracy: " + str(float(sum(results)) / float(n))
示例#7
0
def bleu_matching(threshold, pairs):
    pair_attributes = pairs[:]
    n = len(pair_attributes)
    results = [0 for foo in range(n + 1)]
    entailments = [0 for foo in range(n + 1)]
    for i in range(n):
        t, h, id_num, e, ta = pair_attributes[i]
        id_num = int(id_num)
        t_lemmas, pos = xml_util.get_lemmas_from_text_node(t)
        h_lemmas, pos = xml_util.get_lemmas_from_text_node(h)
        entailments[id_num] = calculate_entailment(t_lemmas, h_lemmas,
                                                   threshold)
        if e == entailments[id_num]:
            results[id_num] = 1
    lexical.output_rte(entailments)
    print "Threshold: " + "%.2f" % threshold + " Accuracy: " + str(
        float(sum(results)) / float(n))
def idf_weighting(threshold, pairs):
    pair_attributes = pairs[:]
    words = []
    documents = []
    n = len(pair_attributes)
    results = [0 for foo in range(n + 1)]
    entailments = [0 for foo in range(n + 1)]
    # Starts by adding all the words to the list 'words' and then making a set of these words
    # Also makes a list of documents where each document is a set of all the words in a given
    # (text, hypothesis) pair.
    for i in range(n):
        t, h, id_num, e, ta = pair_attributes[i]
        t_lemmas, pos = xml_util.get_lemmas_from_text_node(t)
        h_lemmas, pos = xml_util.get_lemmas_from_text_node(h)
        doc = []
        for word in t_lemmas:
            words.append(word.lower())
            doc.append(word.lower())
        for word in h_lemmas:
            words.append(word.lower())
            doc.append(word.lower())
        documents.append(set(doc))
    word_set = set(words)
    # Creates a dictionary 'idf_dict' that can be used to count how many document each word is present in
    idf_dict = {}
    # Starts by initiating the count for all words to 0
    for word in word_set:
        idf_dict[word] = 0
    # Then calculates the number of documents each word in the word_set appears in
    for word in word_set:
        for document in documents:
            if word in document:
                idf_dict[word] += 1
    print "dict done"
    for i in range(n):
        t, h, id_num, e, ta = pair_attributes[i]
        id_num = int(id_num)
        t_lemmas, pos = xml_util.get_lemmas_from_text_node(t)
        h_lemmas, pos = xml_util.get_lemmas_from_text_node(h)
        entailments[id_num] = calculate_entailment(t_lemmas, h_lemmas,
                                                   idf_dict, threshold)
        results[id_num] = 1 if e == entailments[id_num] else 0
    lexical.output_rte(entailments)
    print "Threshold: " + "%.2f" % threshold + " Accuracy: " + str(
        float(sum(results)) / float(n))
def idf_weighting(threshold, pairs):
    pair_attributes = pairs[:]
    words = []
    documents = []
    n = len(pair_attributes)
    results = [0 for foo in range(n + 1)]
    entailments = [0 for foo in range(n + 1)]
    # Starts by adding all the words to the list 'words' and then making a set of these words
    # Also makes a list of documents where each document is a set of all the words in a given
    # (text, hypothesis) pair.
    for i in range(n):
        t, h, id_num, e, ta = pair_attributes[i]
        t_lemmas, pos = xml_util.get_lemmas_from_text_node(t)
        h_lemmas, pos = xml_util.get_lemmas_from_text_node(h)
        doc = []
        for word in t_lemmas:
            words.append(word.lower())
            doc.append(word.lower())
        for word in h_lemmas:
            words.append(word.lower())
            doc.append(word.lower())
        documents.append(set(doc))
    word_set = set(words)
    # Creates a dictionary 'idf_dict' that can be used to count how many document each word is present in
    idf_dict = {}
    # Starts by initiating the count for all words to 0
    for word in word_set:
        idf_dict[word] = 0
    # Then calculates the number of documents each word in the word_set appears in
    for word in word_set:
        for document in documents:
            if word in document:
                idf_dict[word] += 1
    print "dict done"
    for i in range(n):
        t, h, id_num, e, ta = pair_attributes[i]
        id_num = int(id_num)
        t_lemmas, pos = xml_util.get_lemmas_from_text_node(t)
        h_lemmas, pos = xml_util.get_lemmas_from_text_node(h)
        entailments[id_num] = calculate_entailment(t_lemmas, h_lemmas, idf_dict, threshold)
        results[id_num] = 1 if e == entailments[id_num] else 0
    lexical.output_rte(entailments)
    print "Threshold: " + "%.2f" % threshold + " Accuracy: " + str(float(sum(results)) / float(n))
示例#10
0
 def calculateIdf(self):
     """Return one idf-weighted word-overlap score per processed pair.

     For every hypothesis word the inverse-document-frequency weight
     1/df(word) is added to the denominator; it is also added to the
     numerator when the word (or any of its synonyms, per
     Synonyms.FindAllSynonyms) occurs in the text.  The score is
     numerator/denominator: the idf-weighted fraction of hypothesis
     words covered by the text.
     """
     pair_attributes = self.processedPairs[:]
     idf_dict = syntax_matching.calculate_idf_dictionary(pair_attributes)
     results = []
     for t, h, id_num, e, ta in pair_attributes:
         text, pos = xml_util.get_lemmas_from_text_node(t)
         hypothesis, pos = xml_util.get_lemmas_from_text_node(h)
         idf_counter = 0.0
         idf_divider = 0.0
         for word in hypothesis:
             # The weight was computed twice per word in the original;
             # hoist it.  df comes from the same corpus, so every
             # hypothesis word has a positive count.
             weight = 1.0 / float(idf_dict[word.lower()])
             for synonym in Synonyms.FindAllSynonyms(word):
                 if synonym in text:
                     idf_counter += weight
                     break
             idf_divider += weight
         # Guard against an empty hypothesis, which would otherwise
         # raise ZeroDivisionError.
         results.append(idf_counter / idf_divider if idf_divider else 0.0)
     return results
 def calculateIdf(self):
     """Compute the idf-weighted hypothesis coverage for each processed pair.

     A hypothesis word contributes 1/df(word) to the denominator, and the
     same amount to the numerator when the word or one of its synonyms
     appears in the text; the pair's score is their ratio.
     """
     pair_attributes = self.processedPairs[:]
     idf_dict = syntax_matching.calculate_idf_dictionary(pair_attributes)
     results = []
     for index in range(len(pair_attributes)):
         t, h, id_num, e, ta = pair_attributes[index]
         id_num = int(id_num)
         text, pos = xml_util.get_lemmas_from_text_node(t)
         hypothesis, pos = xml_util.get_lemmas_from_text_node(h)
         numerator = 0.0
         denominator = 0.0
         for word in hypothesis:
             weight = 1.0 / float(idf_dict[word.lower()])
             if any(s in text for s in Synonyms.FindAllSynonyms(word)):
                 numerator += weight
             denominator += weight
         results.append(numerator / denominator)
     return results
    def calculateLemmas(self):
        """Return, for each processed pair, the fraction of distinct
        hypothesis lemmas that also occur in the text.
        """
        # Dead code removed: the `entailments` buffer and int(id_num)
        # conversion were never used.
        results = []
        for t, h, id_num, e, ta in self.processedPairs[:]:
            text, pos = xml_util.get_lemmas_from_text_node(t)
            hypothesis, pos = xml_util.get_lemmas_from_text_node(h)
            # Compare distinct lemmas only, so duplicates in the
            # hypothesis cannot inflate the score.
            distinct_hyp = set(hypothesis)
            if not distinct_hyp:
                # An empty hypothesis would otherwise divide by zero.
                results.append(0.0)
                continue
            matching = len(distinct_hyp.intersection(text))
            results.append(float(matching) / float(len(distinct_hyp)))
        return results
示例#13
0
    def calculatePOS(self):
        """Return, per processed pair, the fraction of hypothesis
        (lemma, POS) tokens that also occur in the text.

        Matching on (lemma, POS) pairs is stricter than plain lemma
        matching: a word only counts when it appears with the same tag.
        """
        results = []
        for text_node, hyp_node, id_num, e, ta in self.processedPairs[:]:
            t_lemmas, t_pos = xml_util.get_lemmas_from_text_node(text_node)
            h_lemmas, h_pos = xml_util.get_lemmas_from_text_node(hyp_node)
            # zip pairs each lemma with its tag and avoids the original's
            # reuse of the outer loop index; a set gives O(1) membership.
            text_tokens = set(zip(t_lemmas, t_pos))
            hyp_tokens = list(zip(h_lemmas, h_pos))
            if not hyp_tokens:
                # Empty hypothesis: avoid ZeroDivisionError.
                results.append(0.0)
                continue
            matching = sum(1 for token in hyp_tokens if token in text_tokens)
            results.append(float(matching) / float(len(hyp_tokens)))

        return results
示例#14
0
    def calculateLemmas(self):
        """Return the per-pair fraction of unique hypothesis lemmas that
        are found in the corresponding text.
        """
        # The original's unused `entailments` list and int(id_num)
        # conversion were dead code and are dropped.
        results = []
        for t, h, id_num, e, ta in self.processedPairs[:]:
            text, pos = xml_util.get_lemmas_from_text_node(t)
            hypothesis, pos = xml_util.get_lemmas_from_text_node(h)
            # A set replaces the manual de-duplication loop; duplicates
            # in the hypothesis must not be counted twice.
            unique_hyp = set(hypothesis)
            if unique_hyp:
                overlap = len(unique_hyp.intersection(text))
                results.append(float(overlap) / float(len(unique_hyp)))
            else:
                # Define coverage of an empty hypothesis as 0 rather
                # than raising ZeroDivisionError.
                results.append(0.0)
        return results
示例#15
0
def lemma_matching(threshold, pairs):
    pair_attributes = pairs[:]
    n = len(pair_attributes)
    results = [0 for foo in range(n+1)]
    entailments = [0 for foo in range(n+1)]
    for i in range(n):
        t,h,id_num,e,ta = pair_attributes[i]
        id_num = int(id_num)
        t_lemmas,t_pos = xml_util.get_lemmas_from_text_node(t)
        h_lemmas,h_pos = xml_util.get_lemmas_from_text_node(h)
        text = []
        for i in range(len(t_lemmas)):
            text.append((t_lemmas[i],t_pos[i]))
        hypothesis = []
        for i in range(len(h_lemmas)):
            hypothesis.append((h_lemmas[i],h_pos[i]))
        entailments[id_num] = calculate_entailment(text,hypothesis,threshold)
        if (e == entailments[id_num]):
            results[id_num] = 1
        else:
            results[id_num] = 0
    lexical.output_rte(entailments)
    print "Threshold: " + "%.2f"%threshold + " Accuracy: " + str(float(sum(results)) / float(n))
    def calculatePOS(self):
        """Return, per pair, the fraction of hypothesis (lemma, POS)
        tokens also present in the text.
        """
        results = []
        for t_node, h_node, id_num, e, ta in self.processedPairs[:]:
            t_lemmas, t_pos = xml_util.get_lemmas_from_text_node(t_node)
            h_lemmas, h_pos = xml_util.get_lemmas_from_text_node(h_node)
            # Pair each lemma with its tag via zip (the original index
            # loops shadowed the outer loop variable); a set makes the
            # membership tests O(1).
            text_tokens = set(zip(t_lemmas, t_pos))
            hyp_tokens = list(zip(h_lemmas, h_pos))
            if not hyp_tokens:
                # Guard the division below against an empty hypothesis.
                results.append(0.0)
                continue
            matches = sum(1 for tok in hyp_tokens if tok in text_tokens)
            results.append(float(matches) / float(len(hyp_tokens)))

        return results
示例#17
0
def lemma_matching(threshold, pairs):
    pair_attributes = pairs[:]
    n = len(pair_attributes)
    results = [0 for foo in range(n + 1)]
    entailments = [0 for foo in range(n + 1)]
    for i in range(n):
        t, h, id_num, e, ta = pair_attributes[i]
        id_num = int(id_num)
        t_lemmas, t_pos = xml_util.get_lemmas_from_text_node(t)
        h_lemmas, h_pos = xml_util.get_lemmas_from_text_node(h)
        text = []
        for i in range(len(t_lemmas)):
            text.append((t_lemmas[i], t_pos[i]))
        hypothesis = []
        for i in range(len(h_lemmas)):
            hypothesis.append((h_lemmas[i], h_pos[i]))
        entailments[id_num] = calculate_entailment(text, hypothesis, threshold)
        if (e == entailments[id_num]):
            results[id_num] = 1
        else:
            results[id_num] = 0
    lexical.output_rte(entailments)
    print "Threshold: " + "%.2f" % threshold + " Accuracy: " + str(
        float(sum(results)) / float(n))