import time

from tokenizer import TwitterTokenizer
# `db` (a pymongo database handle) and `Persister` are assumed to be
# provided by other modules in this project.


class Pairer(object):
    """Scan every tweet in the collection and record, for each token,
    the tokens that appear immediately before and after it."""

    def __init__(self, num_tweets=False, debug=False, time=False):
        # NB: the `time` argument shadows the time module, but only
        # inside __init__; run() still sees the module.
        self.collection = db.twictionary_models_tweets
        self.tres = db.twictionary_models_tres
        # Legacy pymongo find() options: exclude fields we never read
        # and disable the cursor timeout for this long-running scan.
        opts = dict(fields=dict(date=0, tid=0, user=0), timeout=False)
        if num_tweets:
            opts['limit'] = num_tweets
        if debug or time:
            opts['limit'] = 5000
        self.tweets = self.collection.find(**opts)
        self.persister = Persister(debug=debug)
        self.debug = debug
        self.time = time
        self.tokenizer = TwitterTokenizer()
        self.run()

    def run(self):
        total = 0
        run_start = time.time()    # overall run timer
        batch_start = time.time()  # timer for each 100,000-document batch
        for t in self.tweets:
            # Report throughput every 100,000 documents.
            if total % 100000 == 0:
                print "processed %s documents!" % total
                print "these documents took %s seconds to process" % (
                    time.time() - batch_start)
                batch_start = time.time()
            self.pair(t)
            total += 1
        if self.time:
            print "Run time: %s" % (time.time() - run_start)
        self.tres.remove({})

    def pair(self, tweet):
        text = tweet['text']
        id = tweet['_id']
        tokenized = self.tokenizer.tokenize(text)
        length = len(tokenized)
        for i in range(length):
            token = tokenized[i]
            if self.tokenizer.is_bad_token(token):
                continue
            # Look up the neighbouring tokens; "bad" tokens and tweet
            # boundaries are both recorded as None.
            previous = None
            next = None
            if i != 0:
                previous = tokenized[i - 1]
                if self.tokenizer.is_bad_token(previous):
                    previous = None
            if i < (length - 1):
                next = tokenized[i + 1]
                if self.tokenizer.is_bad_token(next):
                    next = None
            self.persister.add(main=token, previous=previous,
                               next=next, id=id)
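# The neighbour lookup in pair() is just a sliding window over the token
# list. A minimal, self-contained sketch of that logic -- pair_tokens and
# the is_bad stub below are illustrative names, not part of the project:

def pair_tokens(tokens, is_bad):
    """Yield (previous, main, next) triples for every good token,
    recording bad neighbours and tweet boundaries as None."""
    for i, token in enumerate(tokens):
        if is_bad(token):
            continue
        previous = tokens[i - 1] if i > 0 else None
        nxt = tokens[i + 1] if i < len(tokens) - 1 else None
        if previous is not None and is_bad(previous):
            previous = None
        if nxt is not None and is_bad(nxt):
            nxt = None
        yield previous, token, nxt

# Treating URLs as bad tokens:
is_bad = lambda t: t.startswith('http')
print list(pair_tokens(['i', 'love', 'http://t.co/x'], is_bad))
# [(None, 'i', 'love'), ('i', 'love', None)]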
class TwitterTextPreprocessor(object):
    # Other preprocessing methods (decode_html_entities, normalize_urls,
    # ...) are defined elsewhere in the class.

    def normalize_mentions(self, s, repl_func='@MENTION'):
        # re.sub accepts either a replacement string or a callable here.
        return MENTION_REGEX.sub(repl_func, s)

    def normalize_repeated_chars(self, s):
        # Collapse runs of a repeated character to exactly three,
        # e.g. 'loooooove' -> 'looove'.
        return REPEATED_CHAR_REGEX.sub(r'\1\1\1', s)


if __name__ == '__main__':
    # Builds 'h(g(f(x)))': each step wraps the accumulator in the next
    # function name.
    print reduce(lambda x, y: '{0}({1})'.format(y, x), ['f', 'g', 'h'], 'x')

    preprocessor = TwitterTextPreprocessor()

    from tokenizer import TwitterTokenizer
    tok = TwitterTokenizer()

    from pymongo import MongoClient
    client = MongoClient()
    db = client.twitter_database
    db_labeled_tweets = db.labeled_tweets
    for tweet in db_labeled_tweets.find({u'text': {'$exists': True}}):
        text = tweet.get(u'text')
        print tweet.get(u'_id')
        print text
        # print decode_html_entities(text)
        # print normalize_urls(text)
        # print preprocessor.normalize_repeated_chars(text)
        # print preprocessor.normalize_mentions(text)
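# MENTION_REGEX and REPEATED_CHAR_REGEX are defined elsewhere in the
# module. Based on how they are used above, they plausibly look something
# like the following; both patterns are assumptions, not the project's
# actual definitions:

import re

# Assumed: a Twitter @mention.
MENTION_REGEX = re.compile(r'@\w+')
# Assumed: a character repeated three or more times; the r'\1\1\1'
# replacement then collapses the run to exactly three characters.
REPEATED_CHAR_REGEX = re.compile(r'(\w)\1{2,}')

print MENTION_REGEX.sub('@MENTION', 'hey @alice')         # hey @MENTION
print REPEATED_CHAR_REGEX.sub(r'\1\1\1', 'soooooo good')  # sooo good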