def remove_noise(tweet_tokens, stop_words=()):
    """Normalize a POS-tagged token list for sentiment/topic analysis.

    Each token is lemmatized with a WordNet POS hint derived from its
    Penn Treebank tag (NN* -> noun, VB* -> verb, anything else -> adjective).
    If lemmatization leaves the token unchanged (it still appears in the
    original token list), the token is additionally Snowball-stemmed.
    Punctuation, empty tokens and stop words are dropped; surviving tokens
    are lower-cased.

    :param tweet_tokens: list of word tokens (already tokenized).
    :param stop_words: container of lower-case stop words to exclude.
    :return: list of cleaned, lower-cased tokens.
    """
    cleaned_tokens = []

    # Hoisted out of the loop: both objects are stateless, and the original
    # rebuilt a WordNetLemmatizer (and sometimes a SnowballStemmer) for
    # every single token.  Distinct names also avoid the confusing reuse of
    # `lemmatizer` for the stemmer.
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer('english')

    for token, tag in pos_tag(tweet_tokens):
        # Map the Penn Treebank tag onto a WordNet POS letter.
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)

        # If the lemma is still present verbatim in the input token list
        # (i.e. lemmatization did not change it), fall back to stemming.
        if token in tweet_tokens:
            token = stemmer.stem(token)

        if len(token) > 0 and token not in string.punctuation and token.lower(
        ) not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens
	def lemmatize(self, token, v=1):
		"""Reduce *token* to a base form with the algorithm selected by *v*.

		*v* is compared against the STEMMER_* class constants; the matching
		NLTK stemmer/lemmatizer is imported lazily and applied.  If *v*
		matches none of the known selectors, None is returned implicitly
		(same as the original behavior).
		"""
		if v == self.STEMMER_WORDNET_ENGLISH:
			from nltk.stem.wordnet import WordNetLemmatizer
			return WordNetLemmatizer().lemmatize(token)

		if v == self.STEMMER_SNOWBALL_ENGLISH:
			from nltk import stem
			return stem.snowball.EnglishStemmer().stem(token)

		if v == self.STEMMER_LANCASTER_ENGLISH:
			from nltk.stem.lancaster import LancasterStemmer
			return LancasterStemmer().stem(token)

		if v == self.STEMMER_PORTER_ENGLISH:
			from nltk.stem.porter import PorterStemmer
			return PorterStemmer().stem(token)
# ---- 示例#3 (Example #3) — separator from the scraped example site; "0" was its vote count ----
    'psychobiology', 'magical', 'magically', 'adulatory', 'mandatory'
]

tokens += [
    'microchemistry', 'scorningly', 'excystate', 'execrable', 'statued',
    'statuary', 'sparringly'
]

## or compare with random bag of words

#random_range =  randint(0, 236736)
#
#tokens = words.words()[random_range-10:random_range]

# Alias lemmatize() as stem() so the lemmatizer exposes the same interface
# as the stemmers below and can live in the same STEMMERS table.
wnl = WordNetLemmatizer()
wnl.stem = wnl.lemmatize

# Stemmer name -> instance; every value answers .stem(word).
STEMMERS = {
    #    'Regexp': RegexpStemmer(regexp=),
    'Lancaster': LancasterStemmer(),
    'Porter': PorterStemmer(),
    'Modified-Porter': ModifiedPorterStemmer(),
    'Snowball-EN': SnowballStemmer('english'),
    'WordNet-Lemma': wnl,
}

# list() is required on Python 3, where dict.keys() returns a view that
# cannot be concatenated to a list; it is a harmless no-op on Python 2.
table_headers = ["Word/Stemmer"] + list(STEMMERS.keys())

table = []
for t in tokens:
    result = [t]
# ---- 示例#4 (Example #4) — separator from the scraped example site; "0" was its vote count ----
class RTEInferenceTagger(object):
    """
    Predict whether a hypothesis can be inferred from a text,
    based on the degree of word overlap.

    Pipeline: parse both sentences to first-order logic via DRT glue
    semantics, try to prove T -> H with Prover9, and on failure retry with
    WordNet-derived background knowledge (synonym / hypernym / synset-sister
    axioms), followed by consistency checks.  (Python 2 code; uses the
    pre-2.x NLTK wordnet interface.)
    """
    def __init__(self, threshold=33, stop=True):
        # Overlap threshold (percent); stored but never read in this class.
        self.threshold = threshold
        # When True, self.stopwords are removed from both bags of words.
        self.stop = stop
        self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'is', 'are', 'were', 'and'])
        # NOTE(review): WordNetLemmatizer exposes lemmatize(), not stem();
        # _stem() below calls self.stemmer.stem(word) -- confirm this does
        # not raise AttributeError (elsewhere in this file the lemmatizer is
        # monkey-patched with a stem alias).
        self.stemmer = WordNetLemmatizer()
    
    def tag(self, rtepair, verbose=False):
        """
        Tag a RTEPair as to whether the hypothesis can be inferred from the text.
        """
        return self.tag_sentences(rtepair.text, rtepair.hyp )

    def tag_sentences(self, text, hyp, verbose=False):
        """
        Tag a RTEPair as to whether the hypothesis can be inferred from the text.

        Tries prove(T -> H); on failure, prove((T & BK) -> H); on failure,
        model-builds (BK & T) and (BK & T & H) as consistency checks.
        """
        glueclass = DrtGlue()
        text_drs_list = glueclass.parse_to_meaning(text)
        if text_drs_list:
            # Only the first reading is used, simplified and converted to FOL.
            text_ex = text_drs_list[0].simplify().toFol()
        else:
            # NOTE(review): text_ex stays unbound on this branch, so the
            # prove() call below raises NameError -- confirm intended.
            if verbose: print 'ERROR: No readings were generated for the Text'
        
        hyp_drs_list = glueclass.parse_to_meaning(hyp)
        if hyp_drs_list:
            hyp_ex = hyp_drs_list[0].simplify().toFol()
        else:
            # NOTE(review): same unbound-name hazard as text_ex above.
            if verbose: print 'ERROR: No readings were generated for the Hypothesis'

        #1. proof T -> H
        #2. proof (BK & T) -> H
        #3. proof :(BK & T)
        #4. proof :(BK & T & H)
        #5. satisfy BK & T
        #6. satisfy BK & T & H
            
        result = inference.Prover9().prove(hyp_ex, [text_ex])
        if verbose: print 'prove: T -> H: %s' % result
        
        if not result:
            # Direct proof failed: add WordNet background knowledge axioms.
            bk = self._generate_BK(text, hyp, verbose)
            bk_exs = [bk_pair[0] for bk_pair in bk]
            
            if verbose: 
                print 'Generated Background Knowledge:'
                for bk_ex in bk_exs:
                    print bk_ex
                
            result = inference.Prover9().prove(hyp_ex, [text_ex]+bk_exs)
            if verbose: print 'prove: (T & BK) -> H: %s' % result
            
            if not result:
                # NOTE(review): the consistency results are only printed;
                # they do not affect the returned value.
                consistent = self.check_consistency(bk_exs+[text_ex])                
                if verbose: print 'consistency check: (BK & T): %s' % consistent

                if consistent:
                    consistent = self.check_consistency(bk_exs+[text_ex, hyp_ex])                
                    if verbose: print 'consistency check: (BK & T & H): %s' % consistent
                    
        return result
    
    def check_consistency(self, assumptions, verbose=False):
        # Satisfiable iff the model builder finds a model of the assumptions.
        return inference.ParallelProverBuilderCommand(assumptions=assumptions).build_model()
        
    def _tag(self, text, hyp, verbose=False):
        # Debug helper: builds the background knowledge; result is discarded.
        self._generate_BK(text, hyp, verbose)
    
    def _generate_BK(self, text, hyp, verbose=False):
        """
        Generate background-knowledge axioms for every word of text and
        hypothesis known to WordNet; returns (expression, distance) pairs.
        """
        text = word_tokenize(text)
        hyp = word_tokenize(hyp)
        
        if self.stemmer:
            textbow = set(self._stem(word) for word in text)
            hypbow = set(self._stem(word) for word in hyp)
        else:
            textbow = set(word.lower() for word in text)
            hypbow = set(word.lower() for word in hyp)
        
        if verbose:
            print 'textbow: %s' % textbow
            print 'hypbow: %s' % hypbow
        
        if self.stop:
            textbow = textbow - self.stopwords
            hypbow = hypbow - self.stopwords

        bk = []
        fullbow = textbow|hypbow
        for word_text in fullbow:
            pos = None
            # NOTE(review): wordnet.N / V / ADJ / ADV is the pre-2.x NLTK
            # wordnet dictionary API -- confirm the installed NLTK supports it.
            if word_text in wordnet.N:
                bk.extend(self._generate_BK_word(word_text, wordnet.N, fullbow))
            if word_text in wordnet.V:
                bk.extend(self._generate_BK_word(word_text, wordnet.V, fullbow))
            if word_text in wordnet.ADJ:
                bk.extend(self._generate_BK_word(word_text, wordnet.ADJ, fullbow))
            if word_text in wordnet.ADV:
                bk.extend(self._generate_BK_word(word_text, wordnet.ADV, fullbow))
                
        return bk
        
    def _generate_BK_word(self, word_text, pos, fullbow):
        """
        Build axioms for one word in one part of speech: synonyms,
        hypernyms and synset-sisters, restricted to words in fullbow.
        """
        bk = []
        synonyms = set()
        hypernyms = set()
                
        for synset in pos[word_text]:
            for synonym_text in synset:
                if synonym_text != word_text and synonym_text.lower() in fullbow \
                                             and word_text.lower() in fullbow:
                    synonyms.add(synonym_text)
            for hypernymset in synset[wordnet.HYPERNYM]:
                for hypernym_text in hypernymset:
                    if hypernym_text != word_text and hypernym_text.lower() in fullbow \
                                                  and word_text.lower() in fullbow:
                        hypernyms.add(hypernym_text)
                    
        ######################################
        # synonym: all x.((synonym x) -> (word x))
        # hypernym: all x.((word x) -> (hypernym x))
        # synset-sister: all x.((word x) -> (not (sister x)))
        ######################################            
        
        # NOTE(review): `synset` below is the loop variable left over from the
        # for-loop above, i.e. always the word's LAST synset, not the synset
        # each synonym/hypernym was found in -- confirm this is intended.
        for synonym_text in synonyms:
            bk.append(self._create_axiom_reverse(word_text, synset, synonym_text, pos, 'implies'))

        for hypernym_text in hypernyms - synonyms:
            bk.append(self._create_axiom(word_text, synset, hypernym_text, pos, 'implies'))

        # Create synset-sisters
        for i in range(len(pos[word_text])):
            synset1 = pos[word_text][i]
            j = i+1
            while j < len(pos[word_text]):
                synset2 = pos[word_text][j]
                for word1 in synset1:
                    if word1 != word_text and word1.lower() in fullbow:
                        for word2 in synset2:
                            if word2 != word_text and word2 != word1 and word2.lower() in fullbow:
                                bk.append(self._create_axiom_synset_sisters(word1, synset1, word2, synset2, pos))
                j = j+1
        
        return bk
        
    def _common_BK():
        # NOTE(review): missing `self` parameter -- calling this through an
        # instance raises TypeError.  The final axiom also uses `x` outside
        # its quantifier ('all e y.' but subj(e,x)) -- confirm intended.
        # From Recognising Textual Entailment by Bos&Markert
        return [LogicParser().parse('all x y z.((in(x,y) & in(y,z)) -> in(x,z))'),
                LogicParser().parse('all e x y.((event(e) & subj(e,x) & in(e,y)) -> in(x,y))'),
                LogicParser().parse('all e x y.((event(e) & obj(e,x) & in(e,y)) -> in(x,y))'),
                LogicParser().parse('all e x y.((event(e) & theme(e,x) & in(e,y)) -> in(x,y))'),
                LogicParser().parse('all x y.(in(x,y) -> some e.(locate(e) & obj(e,x) & in(e,y)))'),
                LogicParser().parse('all x y.(of(x,y) -> some e.(have(e) & subj(e,y) & obj(e,x)))'),
                LogicParser().parse('all e y.((event(e) & subj(e,x)) -> by(e,x))')]
    
    def _create_axiom(self, word_text, word_synset, nym_text, pos, operator):
        """Return (axiom, distance) of the form 'all x.(word(x) op nym(x))'."""
        # Strip any '(sense)' suffix before using the nym as a predicate name.
        nym_text = nym_text.split('(')[0];
        
        nym_word = pos[nym_text]
        dist = 1#min([word_synset.shortest_path_distance(nym_synset) for nym_synset in nym_word])

        # Dots are not legal in predicate names; drop them.
        word_text = word_text.replace('.', '')
        nym_text = nym_text.replace('.', '')

        exp_text = 'all x.(%s(x) %s %s(x))' % (word_text, operator, nym_text)
        return (LogicParser().parse(exp_text), dist)

    def _create_axiom_reverse(self, word_text, word_synset, nym_text, pos, operator):
        """Return (axiom, distance) of the form 'all x.(nym(x) op word(x))'."""
        nym_text = nym_text.split('(')[0];

        nym_word = pos[nym_text]
        dist = 1#min([word_synset.shortest_path_distance(nym_synset) for nym_synset in nym_word])

        word_text = word_text.replace('.', '')
        nym_text = nym_text.replace('.', '')

        exp_text = 'all x.(%s(x) %s %s(x))' % (nym_text, operator, word_text)
        return (LogicParser().parse(exp_text), dist)

    def _create_axiom_synset_sisters(self, text1, word1_synset, text2, word2_synset, pos):
        """
        Return an expression of the form 'all x.(word(x) -> (not sister(x)))'.
        The reverse is not needed because it is equal to 'all x.((not word(x)) or (not sister(x)))'
        """
        
        text2 = text2.split('(')[0];

        dist = 1#word1_synset.shortest_path_distance(word2_synset)

        text1 = text1.replace('.', '')
        text2 = text2.replace('.', '')

        exp_text = 'all x.(%s(x) -> (not %s(x)))' % (text1, text2)
        return (LogicParser().parse(exp_text), dist)
    
    def _stem(self, word):
        # Fall back to the raw word when the stemmer returns a falsy value.
        stem = self.stemmer.stem(word)
        if stem:
            return stem
        else:
            return word
# ---- 示例#5 (Example #5) — separator from the scraped example site; "0" was its vote count ----
class RTEInferenceTagger(object):
    """
    Predict whether a hypothesis can be inferred from a text,
    based on the degree of word overlap.

    Auto-formatted duplicate of the class above: parse both sentences to
    first-order logic via DRT glue semantics, try to prove T -> H with
    Prover9, retry with WordNet background knowledge on failure, then run
    consistency checks.  (Python 2 code; pre-2.x NLTK wordnet interface.)
    """
    def __init__(self, threshold=33, stop=True):
        # Overlap threshold (percent); stored but never read in this class.
        self.threshold = threshold
        # When True, self.stopwords are removed from both bags of words.
        self.stop = stop
        self.stopwords = set(
            ['a', 'the', 'it', 'they', 'of', 'in', 'is', 'are', 'were', 'and'])
        # NOTE(review): WordNetLemmatizer exposes lemmatize(), not stem();
        # _stem() below calls self.stemmer.stem(word) -- confirm this does
        # not raise AttributeError.
        self.stemmer = WordNetLemmatizer()

    def tag(self, rtepair, verbose=False):
        """
        Tag a RTEPair as to whether the hypothesis can be inferred from the text.
        """
        return self.tag_sentences(rtepair.text, rtepair.hyp)

    def tag_sentences(self, text, hyp, verbose=False):
        """
        Tag a RTEPair as to whether the hypothesis can be inferred from the text.

        Tries prove(T -> H); on failure, prove((T & BK) -> H); on failure,
        model-builds (BK & T) and (BK & T & H) as consistency checks.
        """
        glueclass = DrtGlue()
        text_drs_list = glueclass.parse_to_meaning(text)
        if text_drs_list:
            # Only the first reading is used, simplified and converted to FOL.
            text_ex = text_drs_list[0].simplify().toFol()
        else:
            # NOTE(review): text_ex stays unbound on this branch, so the
            # prove() call below raises NameError -- confirm intended.
            if verbose: print 'ERROR: No readings were generated for the Text'

        hyp_drs_list = glueclass.parse_to_meaning(hyp)
        if hyp_drs_list:
            hyp_ex = hyp_drs_list[0].simplify().toFol()
        else:
            # NOTE(review): same unbound-name hazard as text_ex above.
            if verbose:
                print 'ERROR: No readings were generated for the Hypothesis'

        #1. proof T -> H
        #2. proof (BK & T) -> H
        #3. proof :(BK & T)
        #4. proof :(BK & T & H)
        #5. satisfy BK & T
        #6. satisfy BK & T & H

        result = inference.Prover9().prove(hyp_ex, [text_ex])
        if verbose: print 'prove: T -> H: %s' % result

        if not result:
            # Direct proof failed: add WordNet background knowledge axioms.
            bk = self._generate_BK(text, hyp, verbose)
            bk_exs = [bk_pair[0] for bk_pair in bk]

            if verbose:
                print 'Generated Background Knowledge:'
                for bk_ex in bk_exs:
                    print bk_ex

            result = inference.Prover9().prove(hyp_ex, [text_ex] + bk_exs)
            if verbose: print 'prove: (T & BK) -> H: %s' % result

            if not result:
                # NOTE(review): the consistency results are only printed;
                # they do not affect the returned value.
                consistent = self.check_consistency(bk_exs + [text_ex])
                if verbose:
                    print 'consistency check: (BK & T): %s' % consistent

                if consistent:
                    consistent = self.check_consistency(bk_exs +
                                                        [text_ex, hyp_ex])
                    if verbose:
                        print 'consistency check: (BK & T & H): %s' % consistent

        return result

    def check_consistency(self, assumptions, verbose=False):
        # Satisfiable iff the model builder finds a model of the assumptions.
        return inference.ParallelProverBuilderCommand(
            assumptions=assumptions).build_model()

    def _tag(self, text, hyp, verbose=False):
        # Debug helper: builds the background knowledge; result is discarded.
        self._generate_BK(text, hyp, verbose)

    def _generate_BK(self, text, hyp, verbose=False):
        """
        Generate background-knowledge axioms for every word of text and
        hypothesis known to WordNet; returns (expression, distance) pairs.
        """
        text = word_tokenize(text)
        hyp = word_tokenize(hyp)

        if self.stemmer:
            textbow = set(self._stem(word) for word in text)
            hypbow = set(self._stem(word) for word in hyp)
        else:
            textbow = set(word.lower() for word in text)
            hypbow = set(word.lower() for word in hyp)

        if verbose:
            print 'textbow: %s' % textbow
            print 'hypbow: %s' % hypbow

        if self.stop:
            textbow = textbow - self.stopwords
            hypbow = hypbow - self.stopwords

        bk = []
        fullbow = textbow | hypbow
        for word_text in fullbow:
            pos = None
            # NOTE(review): wordnet.N / V / ADJ / ADV is the pre-2.x NLTK
            # wordnet dictionary API -- confirm the installed NLTK supports it.
            if word_text in wordnet.N:
                bk.extend(self._generate_BK_word(word_text, wordnet.N,
                                                 fullbow))
            if word_text in wordnet.V:
                bk.extend(self._generate_BK_word(word_text, wordnet.V,
                                                 fullbow))
            if word_text in wordnet.ADJ:
                bk.extend(
                    self._generate_BK_word(word_text, wordnet.ADJ, fullbow))
            if word_text in wordnet.ADV:
                bk.extend(
                    self._generate_BK_word(word_text, wordnet.ADV, fullbow))

        return bk

    def _generate_BK_word(self, word_text, pos, fullbow):
        """
        Build axioms for one word in one part of speech: synonyms,
        hypernyms and synset-sisters, restricted to words in fullbow.
        """
        bk = []
        synonyms = set()
        hypernyms = set()

        for synset in pos[word_text]:
            for synonym_text in synset:
                if synonym_text != word_text and synonym_text.lower() in fullbow \
                                             and word_text.lower() in fullbow:
                    synonyms.add(synonym_text)
            for hypernymset in synset[wordnet.HYPERNYM]:
                for hypernym_text in hypernymset:
                    if hypernym_text != word_text and hypernym_text.lower() in fullbow \
                                                  and word_text.lower() in fullbow:
                        hypernyms.add(hypernym_text)

        ######################################
        # synonym: all x.((synonym x) -> (word x))
        # hypernym: all x.((word x) -> (hypernym x))
        # synset-sister: all x.((word x) -> (not (sister x)))
        ######################################

        # NOTE(review): `synset` below is the loop variable left over from the
        # for-loop above, i.e. always the word's LAST synset, not the synset
        # each synonym/hypernym was found in -- confirm this is intended.
        for synonym_text in synonyms:
            bk.append(
                self._create_axiom_reverse(word_text, synset, synonym_text,
                                           pos, 'implies'))

        for hypernym_text in hypernyms - synonyms:
            bk.append(
                self._create_axiom(word_text, synset, hypernym_text, pos,
                                   'implies'))

        # Create synset-sisters
        for i in range(len(pos[word_text])):
            synset1 = pos[word_text][i]
            j = i + 1
            while j < len(pos[word_text]):
                synset2 = pos[word_text][j]
                for word1 in synset1:
                    if word1 != word_text and word1.lower() in fullbow:
                        for word2 in synset2:
                            if word2 != word_text and word2 != word1 and word2.lower(
                            ) in fullbow:
                                bk.append(
                                    self._create_axiom_synset_sisters(
                                        word1, synset1, word2, synset2, pos))
                j = j + 1

        return bk

    def _common_BK():
        # NOTE(review): missing `self` parameter -- calling this through an
        # instance raises TypeError.  The final axiom also uses `x` outside
        # its quantifier ('all e y.' but subj(e,x)) -- confirm intended.
        # From Recognising Textual Entailment by Bos&Markert
        return [
            LogicParser().parse('all x y z.((in(x,y) & in(y,z)) -> in(x,z))'),
            LogicParser().parse(
                'all e x y.((event(e) & subj(e,x) & in(e,y)) -> in(x,y))'),
            LogicParser().parse(
                'all e x y.((event(e) & obj(e,x) & in(e,y)) -> in(x,y))'),
            LogicParser().parse(
                'all e x y.((event(e) & theme(e,x) & in(e,y)) -> in(x,y))'),
            LogicParser().parse(
                'all x y.(in(x,y) -> some e.(locate(e) & obj(e,x) & in(e,y)))'
            ),
            LogicParser().parse(
                'all x y.(of(x,y) -> some e.(have(e) & subj(e,y) & obj(e,x)))'
            ),
            LogicParser().parse('all e y.((event(e) & subj(e,x)) -> by(e,x))')
        ]

    def _create_axiom(self, word_text, word_synset, nym_text, pos, operator):
        """Return (axiom, distance) of the form 'all x.(word(x) op nym(x))'."""
        # Strip any '(sense)' suffix before using the nym as a predicate name.
        nym_text = nym_text.split('(')[0]

        nym_word = pos[nym_text]
        dist = 1  #min([word_synset.shortest_path_distance(nym_synset) for nym_synset in nym_word])

        # Dots are not legal in predicate names; drop them.
        word_text = word_text.replace('.', '')
        nym_text = nym_text.replace('.', '')

        exp_text = 'all x.(%s(x) %s %s(x))' % (word_text, operator, nym_text)
        return (LogicParser().parse(exp_text), dist)

    def _create_axiom_reverse(self, word_text, word_synset, nym_text, pos,
                              operator):
        """Return (axiom, distance) of the form 'all x.(nym(x) op word(x))'."""
        nym_text = nym_text.split('(')[0]

        nym_word = pos[nym_text]
        dist = 1  #min([word_synset.shortest_path_distance(nym_synset) for nym_synset in nym_word])

        word_text = word_text.replace('.', '')
        nym_text = nym_text.replace('.', '')

        exp_text = 'all x.(%s(x) %s %s(x))' % (nym_text, operator, word_text)
        return (LogicParser().parse(exp_text), dist)

    def _create_axiom_synset_sisters(self, text1, word1_synset, text2,
                                     word2_synset, pos):
        """
        Return an expression of the form 'all x.(word(x) -> (not sister(x)))'.
        The reverse is not needed because it is equal to 'all x.((not word(x)) or (not sister(x)))'
        """

        text2 = text2.split('(')[0]

        dist = 1  #word1_synset.shortest_path_distance(word2_synset)

        text1 = text1.replace('.', '')
        text2 = text2.replace('.', '')

        exp_text = 'all x.(%s(x) -> (not %s(x)))' % (text1, text2)
        return (LogicParser().parse(exp_text), dist)

    def _stem(self, word):
        # Fall back to the raw word when the stemmer returns a falsy value.
        stem = self.stemmer.stem(word)
        if stem:
            return stem
        else:
            return word