import os
import re
import string
from string import punctuation
from collections import defaultdict
from HTMLParser import HTMLParser

import enchant

# Repo-local dependencies, assumed importable from the surrounding project:
#   Cache              - on-disk memoization helper
#   interface_nlp      - wrapper around the twitter_nlp tagger
#   interface_twitter  - wrapper around the Twitter REST API
#   enabled_modules    - config dict mapping tool names to install paths
#   edit_distance      - string edit distance (e.g. nltk's edit_distance)
#   do_not_alter, elongated_characters, remove_duplicates - spelling helpers

_debug = False


class TwitterNLP:

    def __init__(self, tagger, data=None):
        if data is None:
            data = []

        # Lookup cache (constantly rerunning the tagger takes time)
        cache_file = os.path.join(enabled_modules['caches'], 'twitter_nlp')
        self.cache = Cache(cache_file)

        # Output from the tagger
        self._words    = {}
        self._entities = {}
        self._pos      = {}
        self._events   = {}

        # Unescape data
        self.h = HTMLParser()

        # Resolve tweets
        self.tagger = tagger
        self.resolve(data)

    def resolve(self, data):
        data = [ self.h.unescape(twt).strip() for twt in set(data) ]

        # Tag the data
        if self.tagger:
            # Tag all uncached data
            uncached = [ twt for twt in data if not self.cache.has_key(twt) ]
            if uncached:
                if self.tagger == 'cache':
                    msg = 'Uncached twitter_nlp data. Tagger must be installed.'
                    raise Exception(msg)
                partial = interface_nlp.resolve(self.tagger, uncached)
                for twt, tag in zip(uncached, partial):
                    self.cache.add_map(twt, tag)

            # Look up all tags
            tagged = [ self.cache.get_map(twt) for twt in data ]
        else:
            tagged = []

        # Store the tagger output. Each tag has the form
        # 'token/entity/pos/event', so split from the right in case the
        # token itself contains slashes.
        for twt, tags in zip(data, tagged):
            self._words[twt]    = [ '/'.join(t.split('/')[:-3]) for t in tags ]
            self._entities[twt] = [ t.split('/')[-3]            for t in tags ]
            self._pos[twt]      = [ t.split('/')[-2]            for t in tags ]
            self._events[twt]   = [ t.split('/')[-1]            for t in tags ]

    def tokens(self, twt):
        twt = self.h.unescape(twt).strip()
        if twt not in self._words:
            print 'not in: ', twt
            return []
        else:
            return self._words[twt]

    def entities(self, twt):
        """Return a list of (entity_type, entity_text) pairs for a tweet."""
        twt = self.h.unescape(twt).strip()

        etype = None
        ents  = []
        curr  = []

        if twt not in self._words:
            return []

        for i in range(len(self._words[twt])):
            w   = self._words[twt][i]
            tag = self._entities[twt][i]

            # Assumes well-formed IOB tags (an 'I' never follows an 'O')
            if tag[0] == 'I':
                curr.append(w)
            else:
                if curr:
                    ents.append( (etype, ' '.join(curr)) )
                    curr = []
                if tag[0] == 'B':
                    etype = tag[2:]
                    curr  = [w]

        # Flush remaining entity (if necessary)
        if curr:
            ents.append( (etype, ' '.join(curr)) )

        return ents

    def brown(self, twt):
        """Return the token sequence with each entity span collapsed to its label."""
        twt = self.h.unescape(twt).strip()

        ents = []
        curr = []

        if twt not in self._words:
            return []

        for i in range(len(self._words[twt])):
            w   = self._words[twt][i]
            tag = self._entities[twt][i]

            # Replace non-'O' spans with their entity label
            if tag[0] != 'I':
                if curr:
                    ents.append( ' '.join(curr) )
                    curr = []
                if tag[0] == 'B':
                    curr = [tag[2:]]
                else:
                    curr = [w]

        # Flush remaining entity (if necessary)
        if curr:
            ents.append( ' '.join(curr) )

        return ents

    def update(self, data):
        """
        update()

        Purpose: Run the tagger on a batch of tweets (rather than individually)

        @param data. A list of strings (each string is the text of a tweet)
        """
        self.resolve(data)

    def features(self, twt):
        """
        features()

        Purpose: Get twitter_nlp features

        @param twt. The string text of a tweet.
        @return     A feature dictionary.
        """
        # Feature dictionary
        feats = {}

        # Unescape text if not already done
        twt = self.h.unescape(twt).strip()

        # Feature: Entity types
        ents = self.entities(twt)
        for ent in ents:
            feats[ ('entity_type', ent[0]) ] = .5
            feats[ ('entity'     , ent[1]) ] = .5

        # Feature: Brown cluster bigrams
        clustered = self.brown(twt)
        for i in range(len(clustered)-1):
            feats[ ('brown_bigram', (clustered[i], clustered[i+1])) ] = .5

        # Feature: POS counts (guard against unresolved tweets)
        pos_counts = defaultdict(int)
        for pos in self._pos.get(twt, []):
            if pos not in string.punctuation:
                pos_counts[pos] += 1
        for pos, count in pos_counts.items():
            featname = 'pos_count-%s' % pos
            feats[featname] = count

        return feats
class TwitterData:

    def __init__(self, sids=None, data=None):
        # Tweet cache
        self.cache = Cache('twitter_data')

        # Cache all given data
        self.resolve(sids or [], data or [])

    def resolve(self, sids, data):
        """
        resolve()

        Purpose: Wrapper for interface_twitter.resolve() (to use the object's cache)

        @param sids. A list of twitter IDs.
        @param data. A list of tweet texts, parallel to sids.
        @return      A list of tweets (None where the cached text does not match).
        """
        # Compile the list of tweets that need to be queried with the API
        uncached = [ sid for sid in sids if not self.cache.has_key(sid) ]

        # Use the API to look up uncached tweets
        if uncached:
            partial = interface_twitter.resolve(uncached)
            for sid, twt in zip(uncached, partial):
                self.cache.add_map(sid, twt)

        # Get all tweets
        resolved = []
        for txt, sid in zip(data, sids):
            twt = self.cache.get_map(sid)
            # Only trust the cached tweet if its text matches the given text
            if txt == twt['text']:
                res = twt
            else:
                res = None
            resolved.append(res)
        return resolved

    def lookup(self, sids):
        """
        lookup()

        Purpose: Wrapper for interface_twitter.resolve() (to use the object's cache)

        @param sids. A list of twitter IDs.
        @return      A list of tweets.
        """
        # Compile the list of tweets that need to be queried with the API
        uncached = [ sid for sid in sids if not self.cache.has_key(sid) ]

        # Use the API to look up uncached tweets
        if uncached:
            partial = interface_twitter.resolve(uncached)
            for sid, twt in zip(uncached, partial):
                self.cache.add_map(sid, twt)

        # Get all tweets
        resolved = []
        for sid in sids:
            twt = self.cache.get_map(sid)
            resolved.append(twt)
        return resolved

    def features(self, sid):
        """
        features()

        Purpose: Get features from tweet metadata

        @param sid. A tweet ID
        @return     A dictionary of metadata features.
        """
        # Get tweet
        tweet = self.cache.get_map(sid)
        if tweet is None:
            return {}

        # Extract features
        feats = {}

        # Not available
        if tweet['text'] == 'Not Available':
            return {}

        # Features: Retweet & favorite counts
        feats['favorite_count'] = tweet['favorite_count']
        feats['retweet_count']  = tweet['retweet_count']

        # Feature: Whether the username contains the word 'news'
        if 'news' in tweet['user']['screen_name'].lower():
            feats['is_news'] = 1
        if 'news' in tweet['user']['name'].lower():
            feats['is_news'] = 1

        # Feature: Whether the tweet is a reply
        if tweet['in_reply_to_status_id_str']:
            feats['is_reply'] = 1

        return feats
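
# Usage sketch (illustrative only): the status ID below is a placeholder,
# and the demo assumes interface_twitter is configured with API credentials.
def _demo_twitter_data():
    sids = ['123456789']            # placeholder tweet ID
    td = TwitterData()
    tweets = td.lookup(sids)        # resolve IDs -> tweet objects (cached)
    print tweets
    print td.features(sids[0])      # metadata features for a classifier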
class SpellChecker:

    def __init__(self):
        # Global spell checker
        #self.d = enchant.DictWithPWL("en_US", '/data1/wboag/ml/twitvec/twitvec/spelling/output.txt')
        self.d = enchant.Dict("en_US")

        # Common abbreviations and mistakes
        self.common = {}
        abbrevs = os.path.join(enabled_modules['spell'], 'abbrv.txt')
        with open(abbrevs, 'r') as f:
            for line in f:
                if line == '\n': continue
                abbrev, full = tuple(line.strip('\n').split(' || '))
                self.common[abbrev] = full

        # Load cache of spell-corrected words
        self.cache = Cache('B-enchant')

    def correct_spelling(self, phrase, pos=None):
        # Memoized?  (disabled; flip to self.cache.has_key(key) to re-enable)
        key = tuple(phrase)
        if False:
            return self.cache.get_map(key)

        cands = []

        # Build all possible candidates
        for i, w in enumerate(phrase):
            if _debug: print w

            # Special form
            if do_not_alter(w, pos, i):
                if _debug: print '\tSTATIC'
                cands.append([w])

            # Numbers
            elif re.search('\d', w):
                if _debug: print '\tNUMBER'
                cands.append(['000'])

            # Regexes
            elif re.search('^a*(?:h+q?a+)+h*$', w):
                if _debug: print '\tHAHA'
                cands.append(['haha'])
            elif re.search('^(?:h+e+)*$', w):
                if _debug: print '\tHEHE'
                cands.append(['haha'])
            elif re.search('^o*(?:xo)+x*$', w):
                if _debug: print '\tXOXO'
                cands.append(['xoxo'])
            elif re.search('^l(?:ol)+$', w):
                if _debug: print '\tLOLOL'
                cands.append(['lol'])

            # Common abbreviations / mistakes
            elif w.lower() in self.common:
                if _debug: print '\tCOMMON'
                cands.append([self.common[w.lower()]])

            # Normal
            else:
                # Strip contraction suffixes  (FIXME: do this during tokenization)
                cand = w
                if   w[-2:] == "'s":  cand = w[:-2]
                elif w[-2:] == "'m":  cand = w[:-2]
                elif w[-3:] == "'ve": cand = w[:-3]
                elif w[-3:] == "'ll": cand = w[:-3]
                if len(cand):
                    w = cand

                # Ends in an exclamation mark context
                exclamation = False
                if re.search('^[^!]*!$', w):
                    if _debug: print '\tEXCLAMATION'
                    w = w.strip('!')
                    exclamation = True

                # Capitalized often means proper noun
                if w[0].isupper():
                    if _debug: print '\tMAYBE PROPER NOUN'
                    possible = [w]

                # Spelled correctly?
                elif self.d.check(w):
                    if _debug: print '\tCORRECT!'
                    possible = [w]

                # Try fixing repeated characters
                elif elongated_characters(w):
                    # Remove duplicated characters down to just 2 remaining
                    if _debug: print '\tELONGATED'
                    possible = [remove_duplicates(w)]

                # Trailing apostrophe
                elif (w[-1] == "'") and (self.d.check(w[:-1])):
                    if _debug: print '\tAPOSTROPHE!'
                    possible = [w[:-1]]

                # Word not separated from punctuation
                elif (w[0] in punctuation) or (w[-1] in punctuation):
                    # Separate word from leading and trailing punctuation
                    match = re.search("([^a-zA-Z]*)([a-zA-Z']*)([^a-zA-Z]*)", w)
                    leading, word, trailing = match.groups()
                    possible = []
                    if leading: possible.append(leading)
                    if word in self.common:
                        possible.append(self.common[word])
                    else:
                        possible.append(word)
                    if trailing: possible.append(trailing)

                # Back off to spell checker suggestions
                else:
                    if _debug: print '\tCHECKING SUGGESTIONS'
                    # (per-word caching disabled; flip to self.cache.has_key(w)
                    #  to re-enable)
                    if True:
                        # Run spell correction
                        possible = self.d.suggest(w)

                        # If no matches, then use the original
                        if possible == []:
                            possible = [w]

                        # Good prediction? Only cache the suggestion if it is a
                        # plausible fix; key per word to match the lookup below.
                        if enabled_modules['caches'] is not None:
                            if edit_distance(w, possible[0]) <= 2:
                                self.cache.add_map(w, possible)
                            else:
                                self.cache.add_map(w, [w])

                    # Look up cached spell corrections
                    else:
                        possible = self.cache.get_map(w)

                # Trailing exclamation
                if exclamation:
                    possible = [ p + ' !' for p in possible ]

                cands.append(possible)

        # Select the proper candidate (currently: take the top suggestion)
        corrected = [ choices[0] for choices in cands ]

        # Memoize
        if enabled_modules['caches'] is not None:
            self.cache.add_map(key, corrected)

        return corrected
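
# Usage sketch (illustrative only): assumes pyenchant is installed, the
# abbrv.txt lookup table exists where enabled_modules['spell'] points, and
# the do_not_alter / elongated_characters / remove_duplicates helpers are
# defined as elsewhere in this repo.
def _demo_spell_checker():
    sc = SpellChecker()
    tokens = ['I', 'luvvvv', 'yooou', 'sooo', 'much', '!!!']
    print sc.correct_spelling(tokens)   # one corrected token per input token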