def connect(feed="http://stream.twitter.com/1/statuses/filter.json", track='apple', username=None, password=None, max=5, encoding='utf-8'):
    """Read a streaming feed and print an analysis of each status received.

    The ``track`` value is url-encoded and sent as the request body to
    ``feed``.  Every line read from the response is counted; lines that
    decode to a JSON object carrying a ``text`` entry are printed (encoded
    with ``encoding``), followed by the output of ``TweetTokenizer.tokenize``,
    ``TweetParser.parse`` and ``TweetTagger.tag`` for that text.

    Parameters:
        feed: URL of the streaming endpoint.
        track: value sent as the ``track`` form field.
        username, password: HTTP Basic credentials.  The ``Authorization``
            header is only sent when ``username`` is not None.
        max: number of lines to read before stopping (malformed lines and
            messages without text count towards it).
        encoding: encoding used when printing the status text.

    Returns:
        None.  All results are written to standard output.
    """
    values = dict(track=track)
    data = urllib.urlencode(values)
    request = urllib2.Request(feed, data)
    if username is not None:
        # b64encode never inserts line breaks; encodestring wraps its output
        # at 76 characters, which corrupts the header for long credentials.
        base64string = base64.b64encode('%s:%s' % (username, password))
        request.add_header("Authorization", "Basic %s" % base64string)
    s = urllib2.urlopen(request)
    try:
        tt = TweetTokenizer()
        tp = TweetParser()
        tagger = TweetTagger()
        count = 0
        # readline() returns '' (never None) at end of stream, so '' is the
        # sentinel that lets the loop finish when the server disconnects.
        for line in iter(s.readline, ''):
            count += 1
            print('%s of %s' % (count, max))
            try:
                tweet = simplejson.loads(line)
            except ValueError:
                # Only decoding errors are expected here; anything else
                # (including KeyboardInterrupt) must propagate.
                print('JSON load failed on line===>%s ' % line)
                tweet = None
            # Some payloads are not status objects or carry no text; skip
            # the analysis for those instead of failing on None.
            text = tweet.get('text') if isinstance(tweet, dict) else None
            if text is not None:
                print(text.encode(encoding, 'replace'))
                tokens = tt.tokenize(text)
                parsed = tp.parse(text, debug=False)
                tags = tagger.tag(text, debug=False)
                print(tokens)
                print(parsed)
                print(tags)
            # Checked on every line so a malformed line cannot skip the limit.
            if count >= max:
                break
    finally:
        # Always release the HTTP connection, even on error or interrupt.
        s.close()
def tag(cls, tweet, texttagger=BasicTagger, debug=False):
    """Tag a tweet (or other text), merging in its hashtags.

    The input is run through ``TweetParser.parse``; the parsed ``text``
    entry is handed to ``texttagger.tag`` and each parsed hashtag, with its
    first character removed, is appended to that result unless it is
    already present.

    Parameters:
        tweet: the text passed to ``TweetParser.parse``.
        texttagger: object whose ``tag`` produces the initial tag list.
        debug: forwarded to ``TweetParser.parse``.

    Returns:
        The object returned by ``texttagger.tag``, extended in place.
    """
    parsed = TweetParser.parse(tweet, debug=debug)
    result = texttagger.tag(parsed.get('text'))
    # Drop the leading '#' from every hashtag found by the parser.
    stripped = [hashtag[1:] for hashtag in parsed.get('hashtags', [])]
    for name in stripped:
        if name not in result:
            result.append(name)
    return result