import string

from nltk import pos_tag
from nltk.stem import SnowballStemmer, WordNetLemmatizer


def remove_noise(tweet_tokens, stop_words=()):
    """Lemmatize, optionally stem, and filter a list of tweet tokens."""
    cleaned_tokens = []
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer('english')
    for token, tag in pos_tag(tweet_tokens):
        # token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
        #                '(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        # token = re.sub("(@[A-Za-z0-9_]+)", "", token)

        # Map the Penn Treebank tag to a WordNet part of speech.
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)
        # If the lemma still appears among the original tokens
        # (i.e. lemmatization did not change it), also apply the Snowball stemmer.
        if token in tweet_tokens:
            token = stemmer.stem(token)

        if len(token) > 0 and token not in string.punctuation \
                and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens
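# As a quick check, remove_noise() can be exercised on a tokenized sample tweet with
# NLTK's English stopword list. A minimal sketch, assuming a made-up sample sentence and
# that the punkt, averaged_perceptron_tagger, stopwords and wordnet resources are downloaded:
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

sample = "Loving the clearer skies over #London today!"
sample_tokens = TweetTokenizer().tokenize(sample)
print(remove_noise(sample_tokens, stop_words=stopwords.words('english')))
# Exact output depends on the tagger model and on whether the Snowball fallback fires,
# e.g. something like ['love', 'clearer', 'sky', '#london', 'today'].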
    def lemmatize(self, token, v=1):
        if v == self.STEMMER_WORDNET_ENGLISH:
            from nltk.stem.wordnet import WordNetLemmatizer
            lmtzr = WordNetLemmatizer()
            return lmtzr.lemmatize(token)
        if v == self.STEMMER_SNOWBALL_ENGLISH:
            from nltk import stem
            lmtzr = stem.snowball.EnglishStemmer()
            return lmtzr.stem(token)
        if v == self.STEMMER_LANCASTER_ENGLISH:
            from nltk.stem.lancaster import LancasterStemmer
            lmtzr = LancasterStemmer()
            return lmtzr.stem(token)
        if v == self.STEMMER_PORTER_ENGLISH:
            from nltk.stem.porter import PorterStemmer
            lmtzr = PorterStemmer()
            return lmtzr.stem(token)
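# The STEMMER_* flags and the enclosing class are not part of this excerpt. A minimal
# hypothetical wrapper, assuming the flags are plain integer constants (names taken from
# the method above, values invented here), shows how the dispatcher is meant to be called:
class StemmerSwitch(object):
    # Hypothetical constants -- the original class defining them is not shown.
    STEMMER_WORDNET_ENGLISH = 1
    STEMMER_SNOWBALL_ENGLISH = 2
    STEMMER_LANCASTER_ENGLISH = 3
    STEMMER_PORTER_ENGLISH = 4

    # lemmatize() as defined above would be attached here.

# e.g. StemmerSwitch().lemmatize('running', v=StemmerSwitch.STEMMER_PORTER_ENGLISH) -> 'run'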
    'psychobiology', 'magical', 'magically', 'adulatory', 'mandatory'
]
tokens += [
    'microchemistry', 'scorningly', 'excystate', 'execrable', 'statued',
    'statuary', 'sparringly'
]

## or compare with a random bag of words
#random_range = randint(0, 236736)
#
#tokens = words.words()[random_range-10:random_range]

wnl = WordNetLemmatizer()
wnl.stem = wnl.lemmatize  # give the lemmatizer a stem() alias so it fits the stemmer interface

STEMMERS = {
    # 'Regexp': RegexpStemmer(regexp=),
    'Lancaster': LancasterStemmer(),
    'Porter': PorterStemmer(),
    'Modified-Porter': ModifiedPorterStemmer(),
    'Snowball-EN': SnowballStemmer('english'),
    'WordNet-Lemma': wnl,
}

table_headers = ["Word/Stemmer"] + list(STEMMERS.keys())
table = []
for t in tokens:
    result = [t]
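# The loop above is cut off at the excerpt boundary. A plausible continuation, stemming
# each word with every stemmer and printing the comparison (the tabulate import is an
# assumption about how the table gets rendered -- any table printer would do):
    for stemmer in STEMMERS.values():
        result.append(stemmer.stem(t))
    table.append(result)

from tabulate import tabulate
print(tabulate(table, headers=table_headers))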
class RTEInferenceTagger(object):
    """
    Predict whether a hypothesis can be inferred from a text,
    based on the degree of word overlap.
    """
    def __init__(self, threshold=33, stop=True):
        self.threshold = threshold
        self.stop = stop
        self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'is',
                              'are', 'were', 'and'])
        self.stemmer = WordNetLemmatizer()

    def tag(self, rtepair, verbose=False):
        """
        Tag a RTEPair as to whether the hypothesis can be inferred from the text.
        """
        return self.tag_sentences(rtepair.text, rtepair.hyp)

    def tag_sentences(self, text, hyp, verbose=False):
        """
        Tag a RTEPair as to whether the hypothesis can be inferred from the text.
        """
        glueclass = DrtGlue()
        text_drs_list = glueclass.parse_to_meaning(text)
        if text_drs_list:
            text_ex = text_drs_list[0].simplify().toFol()
        else:
            if verbose:
                print 'ERROR: No readings were generated for the Text'

        hyp_drs_list = glueclass.parse_to_meaning(hyp)
        if hyp_drs_list:
            hyp_ex = hyp_drs_list[0].simplify().toFol()
        else:
            if verbose:
                print 'ERROR: No readings were generated for the Hypothesis'

        #1. proof T -> H
        #2. proof (BK & T) -> H
        #3. proof :(BK & T)
        #4. proof :(BK & T & H)
        #5. satisfy BK & T
        #6. satisfy BK & T & H

        result = inference.Prover9().prove(hyp_ex, [text_ex])
        if verbose:
            print 'prove: T -> H: %s' % result

        if not result:
            bk = self._generate_BK(text, hyp, verbose)
            bk_exs = [bk_pair[0] for bk_pair in bk]

            if verbose:
                print 'Generated Background Knowledge:'
                for bk_ex in bk_exs:
                    print bk_ex

            result = inference.Prover9().prove(hyp_ex, [text_ex] + bk_exs)
            if verbose:
                print 'prove: (T & BK) -> H: %s' % result

            if not result:
                consistent = self.check_consistency(bk_exs + [text_ex])
                if verbose:
                    print 'consistency check: (BK & T): %s' % consistent
                if consistent:
                    consistent = self.check_consistency(bk_exs + [text_ex, hyp_ex])
                    if verbose:
                        print 'consistency check: (BK & T & H): %s' % consistent

        return result

    def check_consistency(self, assumptions, verbose=False):
        return inference.ParallelProverBuilderCommand(
            assumptions=assumptions).build_model()

    def _tag(self, text, hyp, verbose=False):
        self._generate_BK(text, hyp, verbose)

    def _generate_BK(self, text, hyp, verbose=False):
        text = word_tokenize(text)
        hyp = word_tokenize(hyp)

        if self.stemmer:
            textbow = set(self._stem(word) for word in text)
            hypbow = set(self._stem(word) for word in hyp)
        else:
            textbow = set(word.lower() for word in text)
            hypbow = set(word.lower() for word in hyp)

        if verbose:
            print 'textbow: %s' % textbow
            print 'hypbow: %s' % hypbow

        if self.stop:
            textbow = textbow - self.stopwords
            hypbow = hypbow - self.stopwords

        bk = []
        fullbow = textbow | hypbow
        for word_text in fullbow:
            pos = None
            if word_text in wordnet.N:
                bk.extend(self._generate_BK_word(word_text, wordnet.N, fullbow))
            if word_text in wordnet.V:
                bk.extend(self._generate_BK_word(word_text, wordnet.V, fullbow))
            if word_text in wordnet.ADJ:
                bk.extend(self._generate_BK_word(word_text, wordnet.ADJ, fullbow))
            if word_text in wordnet.ADV:
                bk.extend(self._generate_BK_word(word_text, wordnet.ADV, fullbow))

        return bk

    def _generate_BK_word(self, word_text, pos, fullbow):
        bk = []
        synonyms = set()
        hypernyms = set()

        for synset in pos[word_text]:
            for synonym_text in synset:
                if synonym_text != word_text and synonym_text.lower() in fullbow \
                        and word_text.lower() in fullbow:
                    synonyms.add(synonym_text)
            for hypernymset in synset[wordnet.HYPERNYM]:
                for hypernym_text in hypernymset:
                    if hypernym_text != word_text and hypernym_text.lower() in fullbow \
                            and word_text.lower() in fullbow:
                        hypernyms.add(hypernym_text)

        ######################################
        # synonym: all x.((synonym x) -> (word x))
        # hypernym: all x.((word x) -> (hypernym x))
        # synset-sister: all x.((word x) -> (not (sister x)))
        ######################################

        for synonym_text in synonyms:
            bk.append(self._create_axiom_reverse(word_text, synset, synonym_text,
                                                 pos, 'implies'))

        for hypernym_text in hypernyms - synonyms:
            bk.append(self._create_axiom(word_text, synset, hypernym_text,
                                         pos, 'implies'))

        # Create synset-sisters
        for i in range(len(pos[word_text])):
            synset1 = pos[word_text][i]
            j = i + 1
            while j < len(pos[word_text]):
                synset2 = pos[word_text][j]
                for word1 in synset1:
                    if word1 != word_text and word1.lower() in fullbow:
                        for word2 in synset2:
                            if word2 != word_text and word2 != word1 \
                                    and word2.lower() in fullbow:
                                bk.append(self._create_axiom_synset_sisters(
                                    word1, synset1, word2, synset2, pos))
                j = j + 1

        return bk

    def _common_BK():
        # From Recognising Textual Entailment by Bos & Markert
        return [LogicParser().parse('all x y z.((in(x,y) & in(y,z)) -> in(x,z))'),
                LogicParser().parse('all e x y.((event(e) & subj(e,x) & in(e,y)) -> in(x,y))'),
                LogicParser().parse('all e x y.((event(e) & obj(e,x) & in(e,y)) -> in(x,y))'),
                LogicParser().parse('all e x y.((event(e) & theme(e,x) & in(e,y)) -> in(x,y))'),
                LogicParser().parse('all x y.(in(x,y) -> some e.(locate(e) & obj(e,x) & in(e,y)))'),
                LogicParser().parse('all x y.(of(x,y) -> some e.(have(e) & subj(e,y) & obj(e,x)))'),
                LogicParser().parse('all e y.((event(e) & subj(e,x)) -> by(e,x))')]

    def _create_axiom(self, word_text, word_synset, nym_text, pos, operator):
        nym_text = nym_text.split('(')[0]
        nym_word = pos[nym_text]
        dist = 1  # min([word_synset.shortest_path_distance(nym_synset) for nym_synset in nym_word])

        word_text = word_text.replace('.', '')
        nym_text = nym_text.replace('.', '')

        exp_text = 'all x.(%s(x) %s %s(x))' % (word_text, operator, nym_text)
        return (LogicParser().parse(exp_text), dist)

    def _create_axiom_reverse(self, word_text, word_synset, nym_text, pos, operator):
        nym_text = nym_text.split('(')[0]
        nym_word = pos[nym_text]
        dist = 1  # min([word_synset.shortest_path_distance(nym_synset) for nym_synset in nym_word])

        word_text = word_text.replace('.', '')
        nym_text = nym_text.replace('.', '')

        exp_text = 'all x.(%s(x) %s %s(x))' % (nym_text, operator, word_text)
        return (LogicParser().parse(exp_text), dist)

    def _create_axiom_synset_sisters(self, text1, word1_synset, text2, word2_synset, pos):
        """
        Return an expression of the form 'all x.(word(x) -> (not sister(x)))'.
        The reverse is not needed because it is equal to
        'all x.((not word(x)) or (not sister(x)))'
        """
        text2 = text2.split('(')[0]
        dist = 1  # word1_synset.shortest_path_distance(word2_synset)

        text1 = text1.replace('.', '')
        text2 = text2.replace('.', '')

        exp_text = 'all x.(%s(x) -> (not %s(x)))' % (text1, text2)
        return (LogicParser().parse(exp_text), dist)

    def _stem(self, word):
        # WordNetLemmatizer has no stem() method; lemmatize() is the intended call here.
        stem = self.stemmer.lemmatize(word)
        if stem:
            return stem
        else:
            return word
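# This class targets the old NLTK 0.9-era APIs (DrtGlue glue semantics, inference.Prover9,
# the dictionary-style wordnet module) and Python 2 print statements, so it will not run
# against a modern NLTK install. Under those legacy assumptions, and with Prover9 available
# on the PATH, usage would look roughly like this (the pair index is arbitrary):
from nltk.corpus import rte

pair = rte.pairs(['rte1_dev.xml'])[33]   # an arbitrary text/hypothesis pair
tagger = RTEInferenceTagger()
print 'entailed: %s' % tagger.tag(pair, verbose=True)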