# Falcon-style GET responder; assumes module-level imports of json and
# collections.Counter, plus the tweets collection, the stop_words set, and
# the want_spam/is_spam helpers.
def on_get(self, req, resp, keyword, start, end):
    """Returns words and their counts in all tweets for keyword."""
    # Fetch only the fields needed for counting.
    tw = tweets.find(
        {"keywords": keyword, "datetime": {"$gte": start, "$lt": end}},
        projection={"tweet.text": True, "spam": True, "_id": False},
    )
    words = Counter()
    skip_spam = not want_spam(req)
    for t in tw:
        if skip_spam and is_spam(t):
            continue
        tokens = tokenizeRawTweetText(t["tweet"]["text"])
        words.update([w for w in tokens if w.lower() not in stop_words])
    data = [{"word": w, "count": c} for w, c in words.most_common()]
    resp.body = json.dumps(data)
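# --- Sketch (not from the source): one way a responder like this could be
# wired up in Falcon. The resource class name, route, hook, and date format
# below are hypothetical; the before-hook shows how start/end can reach
# on_get as extra responder arguments.
import falcon
from datetime import datetime

def parse_time_range(req, resp, resource, params):
    # Falcon before-hook: entries added to params are passed on to the
    # responder as keyword arguments.
    fmt = "%Y-%m-%d-%H-%M-%S"  # assumed timestamp format
    params["start"] = datetime.strptime(req.get_param("start", required=True), fmt)
    params["end"] = datetime.strptime(req.get_param("end", required=True), fmt)

@falcon.before(parse_time_range)
class KeywordWords(object):
    def on_get(self, req, resp, keyword, start, end):
        ...  # the responder body shown above

app = falcon.API()  # legacy constructor, matching the resp.body usage above
app.add_route("/keywords/{keyword}/words", KeywordWords())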
def process_details(prod, params, force_refresh=False, cache_time=CACHE_TIME):
    # Pass the cache_time argument through instead of the module constant,
    # so callers can actually override the cache duration.
    tweets = cache(tweety.get_keyword, prod, force_refresh=force_refresh, cache_time=cache_time, **params)

    tweetList = []
    imagesList = []
    URLList = []
    wordCloudDict = Counter()
    tsDict = Counter()
    mapLocations = []

    for tweet in tweets:
        tweetList.append(tweet["id_str"])
        tokens = tokenizeRawTweetText(tweet["text"].lower())
        wordCloudDict.update(tokens)

        # Bucket tweets per hour for the time series.
        dt = datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
        tsDict.update([(dt.year, dt.month, dt.day, dt.hour)])

        try:
            for obj in tweet["entities"]["media"]:
                imagesList.append(obj["media_url"])
        except KeyError:
            pass

        try:
            for obj in tweet["entities"]["urls"]:
                # Expanding short URLs synchronously here would slow everything
                # down tremendously, so take the expanded_url Twitter provides.
                url = obj["expanded_url"]
                if url is not None:
                    URLList.append(url)
        except KeyError:
            pass

        try:
            if tweet["coordinates"] is not None:
                if tweet["coordinates"]["type"] == "Point":
                    coords = tweet["coordinates"]["coordinates"]
                    mapLocations.append({"lng": coords[0], "lat": coords[1]})
        except KeyError:
            pass

    wordCloud = []
    for token in wordCloudDict:
        if token not in _stop_words and "http" not in token and len(token) > 1:
            wordCloud.append({"text": token, "weight": wordCloudDict[token]})

    # Build an hourly time series from the first to the last active hour,
    # filling hours without tweets with zero counts.
    ts = []
    tsStart = sorted(tsDict)[0]
    tsEnd = sorted(tsDict)[-1]
    temp = datetime(tsStart[0], tsStart[1], tsStart[2], tsStart[3], 0, 0)
    end = datetime(tsEnd[0], tsEnd[1], tsEnd[2], tsEnd[3], 0, 0)
    while temp <= end:  # <= so the final hour bucket is included
        key = (temp.year, temp.month, temp.day, temp.hour)
        ts.append({
            "year": temp.year,
            "month": temp.month,
            "day": temp.day,
            "hour": temp.hour,
            "value": tsDict[key],  # Counter returns 0 for missing keys
        })
        temp += timedelta(hours=1)

    # Average the tweet coordinates; fall back to a default centre
    # (roughly the Netherlands) when no tweets are geo-tagged.
    if mapLocations:
        lng = sum(loc["lng"] for loc in mapLocations) / len(mapLocations)
        lat = sum(loc["lat"] for loc in mapLocations) / len(mapLocations)
        avLoc = {"lng": lng, "lat": lat}
    else:
        avLoc = {"lng": 5, "lat": 52}

    images = [{"link": url, "occ": count} for (url, count) in Counter(imagesList).most_common()]
    urls = [{"link": url, "occ": count} for (url, count) in Counter(URLList).most_common()]

    data = {
        "tweets": tweetList[::-1],
        "timeSeries": ts,
        "URLs": urls,
        "photos": images,
        "tagCloud": wordCloud,
        "locations": mapLocations,
        "centerloc": avLoc,
    }
    return data
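# --- Sketch (not from the source): calling process_details and the shape of
# the summary it returns. The product name and params keys are placeholders;
# the exact parameters depend on the tweety API.
params = {"start": "2017-05-01-00-00-00", "end": "2017-05-08-00-00-00"}
details = process_details("rose", params)

print(len(details["tweets"]))    # tweet id strings, in reversed fetch order
print(details["timeSeries"][0])  # {"year": ..., "month": ..., "day": ..., "hour": ..., "value": ...}
print(details["tagCloud"][:3])   # [{"text": token, "weight": count}, ...]
print(details["centerloc"])      # average location of the geo-tagged tweets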
# One-off backfill script: re-tokenize every stored tweet and refresh its
# keyword and group annotations.
from streamer import get_db, get_keywords, find_keywords_and_groups

from hortiradar import tokenizeRawTweetText

tweets = get_db().tweets
keywords = get_keywords()

for t in tweets.find():
    tokens = tokenizeRawTweetText(t["tweet"]["text"])
    kws, groups = find_keywords_and_groups(tokens, keywords)
    tweets.update_one({"_id": t["_id"]}, {
        "$set": {"keywords": kws, "groups": groups, "num_keywords": len(kws)}
    })
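# --- Sketch (an assumption, not part of the original script): on a large
# collection the per-document update_one round trips dominate, so the same
# backfill can batch its writes with PyMongo's bulk_write. The batch size of
# 1000 is an arbitrary illustrative choice.
from pymongo import UpdateOne

batch = []
for t in tweets.find(projection={"tweet.text": True}):
    tokens = tokenizeRawTweetText(t["tweet"]["text"])
    kws, groups = find_keywords_and_groups(tokens, keywords)
    batch.append(UpdateOne({"_id": t["_id"]}, {
        "$set": {"keywords": kws, "groups": groups, "num_keywords": len(kws)}
    }))
    if len(batch) == 1000:
        tweets.bulk_write(batch, ordered=False)
        batch = []
if batch:  # flush the final partial batch
    tweets.bulk_write(batch, ordered=False)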