# Falcon-style GET responder; assumes module-level imports of json and
# collections.Counter, plus the tweets collection, the stop_words set, and
# the want_spam/is_spam helpers.
def on_get(self, req, resp, keyword, start, end):
    """Returns words and their counts in all tweets for keyword."""
    # Fetch only the fields needed for counting.
    tw = tweets.find(
        {"keywords": keyword, "datetime": {"$gte": start, "$lt": end}},
        projection={"tweet.text": True, "spam": True, "_id": False},
    )
    words = Counter()
    skip_spam = not want_spam(req)
    for t in tw:
        if skip_spam and is_spam(t):
            continue
        tokens = tokenizeRawTweetText(t["tweet"]["text"])
        words.update([w for w in tokens if w.lower() not in stop_words])
    data = [{"word": w, "count": c} for w, c in words.most_common()]
    resp.body = json.dumps(data)
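# --- Sketch (not from the source): one way a responder like this could be
# wired up in Falcon. The resource class name, route, hook, and date format
# below are hypothetical; the before-hook shows how start/end can reach
# on_get as extra responder arguments.
import falcon
from datetime import datetime

def parse_time_range(req, resp, resource, params):
    # Falcon before-hook: entries added to params are passed on to the
    # responder as keyword arguments.
    fmt = "%Y-%m-%d-%H-%M-%S"  # assumed timestamp format
    params["start"] = datetime.strptime(req.get_param("start", required=True), fmt)
    params["end"] = datetime.strptime(req.get_param("end", required=True), fmt)

@falcon.before(parse_time_range)
class KeywordWords(object):
    def on_get(self, req, resp, keyword, start, end):
        ...  # the responder body shown above

app = falcon.API()  # legacy constructor, matching the resp.body usage above
app.add_route("/keywords/{keyword}/words", KeywordWords())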
def process_details(prod, params, force_refresh=False, cache_time=CACHE_TIME):
    # Pass the cache_time argument through instead of the module constant,
    # so callers can actually override the cache duration.
    tweets = cache(tweety.get_keyword, prod, force_refresh=force_refresh, cache_time=cache_time, **params)

    tweetList = []
    imagesList = []
    URLList = []
    wordCloudDict = Counter()
    tsDict = Counter()
    mapLocations = []

    for tweet in tweets:
        tweetList.append(tweet["id_str"])
        tokens = tokenizeRawTweetText(tweet["text"].lower())
        wordCloudDict.update(tokens)

        # Bucket tweets per hour for the time series.
        dt = datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
        tsDict.update([(dt.year, dt.month, dt.day, dt.hour)])

        try:
            for obj in tweet["entities"]["media"]:
                imagesList.append(obj["media_url"])
        except KeyError:
            pass

        try:
            for obj in tweet["entities"]["urls"]:
                # Expanding short URLs synchronously here would slow everything
                # down tremendously, so take the expanded_url Twitter provides.
                url = obj["expanded_url"]
                if url is not None:
                    URLList.append(url)
        except KeyError:
            pass

        try:
            if tweet["coordinates"] is not None:
                if tweet["coordinates"]["type"] == "Point":
                    coords = tweet["coordinates"]["coordinates"]
                    mapLocations.append({"lng": coords[0], "lat": coords[1]})
        except KeyError:
            pass

    wordCloud = []
    for token in wordCloudDict:
        if token not in _stop_words and "http" not in token and len(token) > 1:
            wordCloud.append({"text": token, "weight": wordCloudDict[token]})

    # Build an hourly time series from the first to the last active hour,
    # filling hours without tweets with zero counts.
    ts = []
    tsStart = sorted(tsDict)[0]
    tsEnd = sorted(tsDict)[-1]
    temp = datetime(tsStart[0], tsStart[1], tsStart[2], tsStart[3], 0, 0)
    end = datetime(tsEnd[0], tsEnd[1], tsEnd[2], tsEnd[3], 0, 0)
    while temp <= end:  # <= so the final hour bucket is included
        key = (temp.year, temp.month, temp.day, temp.hour)
        ts.append({
            "year": temp.year,
            "month": temp.month,
            "day": temp.day,
            "hour": temp.hour,
            "value": tsDict[key],  # Counter returns 0 for missing keys
        })
        temp += timedelta(hours=1)

    # Average the tweet coordinates; fall back to a default centre
    # (roughly the Netherlands) when no tweets are geo-tagged.
    if mapLocations:
        lng = sum(loc["lng"] for loc in mapLocations) / len(mapLocations)
        lat = sum(loc["lat"] for loc in mapLocations) / len(mapLocations)
        avLoc = {"lng": lng, "lat": lat}
    else:
        avLoc = {"lng": 5, "lat": 52}

    images = [{"link": url, "occ": count} for (url, count) in Counter(imagesList).most_common()]
    urls = [{"link": url, "occ": count} for (url, count) in Counter(URLList).most_common()]

    data = {
        "tweets": tweetList[::-1],
        "timeSeries": ts,
        "URLs": urls,
        "photos": images,
        "tagCloud": wordCloud,
        "locations": mapLocations,
        "centerloc": avLoc,
    }
    return data
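# --- Sketch (not from the source): calling process_details and the shape of
# the summary it returns. The product name and params keys are placeholders;
# the exact parameters depend on the tweety API.
params = {"start": "2017-05-01-00-00-00", "end": "2017-05-08-00-00-00"}
details = process_details("rose", params)

print(len(details["tweets"]))    # tweet id strings, in reversed fetch order
print(details["timeSeries"][0])  # {"year": ..., "month": ..., "day": ..., "hour": ..., "value": ...}
print(details["tagCloud"][:3])   # [{"text": token, "weight": count}, ...]
print(details["centerloc"])      # average location of the geo-tagged tweets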
# One-off backfill script: re-tokenize every stored tweet and refresh its
# keyword and group annotations.
from streamer import get_db, get_keywords, find_keywords_and_groups

from hortiradar import tokenizeRawTweetText

tweets = get_db().tweets
keywords = get_keywords()

for t in tweets.find():
    tokens = tokenizeRawTweetText(t["tweet"]["text"])
    kws, groups = find_keywords_and_groups(tokens, keywords)
    tweets.update_one({"_id": t["_id"]}, {
        "$set": {"keywords": kws, "groups": groups, "num_keywords": len(kws)}
    })
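# --- Sketch (an assumption, not part of the original script): on a large
# collection the per-document update_one round trips dominate, so the same
# backfill can batch its writes with PyMongo's bulk_write. The batch size of
# 1000 is an arbitrary illustrative choice.
from pymongo import UpdateOne

batch = []
for t in tweets.find(projection={"tweet.text": True}):
    tokens = tokenizeRawTweetText(t["tweet"]["text"])
    kws, groups = find_keywords_and_groups(tokens, keywords)
    batch.append(UpdateOne({"_id": t["_id"]}, {
        "$set": {"keywords": kws, "groups": groups, "num_keywords": len(kws)}
    }))
    if len(batch) == 1000:
        tweets.bulk_write(batch, ordered=False)
        batch = []
if batch:  # flush the final partial batch
    tweets.bulk_write(batch, ordered=False)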