def process_user(uid, follower_of=None, top_level_followee=None, nest_level=0, no_followers=False):
    """Record one user in the crawl tables and enqueue Twitter API jobs for them.

    Args:
        uid: Twitter user id being processed.
        follower_of: uid this user follows (edge recorded when set).
        top_level_followee: root uid of the crawl this user belongs to.
        nest_level: depth of this user in the follower tree.
        no_followers: when True, skip enqueueing the followers/ids job.
    """
    # Upsert crawl metadata; on conflict keep the smallest nest_level seen so far.
    cur.execute(
        "insert into metadata (uid, follower_of, top_level_followee, nest_level) values (%s, %s, %s, %s) on conflict (uid) do update set nest_level = least(metadata.nest_level, excluded.nest_level)",
        (uid, follower_of, top_level_followee, nest_level))

    # Record the follower edge. NOTE(review): column name "folowee_uid" is
    # misspelled but used consistently here and in the conflict target, so it
    # presumably matches the actual schema — do not "correct" it in SQL only.
    if follower_of:
        cur.execute(
            "insert into followers (follower_uid, folowee_uid) values (%s, %s) on conflict (follower_uid, folowee_uid) do nothing",
            (uid, follower_of))

    # Once 100 uids have accumulated in the "users" aggregate, flush them as a
    # single users/lookup batch job (comma-separated id list).
    if aggregate("users", uid) >= 100:
        batch = get_aggregate("users", 100)
        command("post", "users/lookup", {"user_id": ",".join(batch)}, "users")

    # Enqueue a followers/ids job unless the caller opted out.
    if not no_followers:
        followers_params = {
            "user_id": uid,
            "stringify_ids": True
        }
        followers_meta = {
            "user_id": uid,
            "top_level_followee": top_level_followee,
            "nest_level": nest_level
        }
        command("get", "followers/ids", followers_params, "followers", followers_meta)

    # Always enqueue a timeline job for this user's most recent 200 tweets.
    timeline_params = {
        "user_id": uid,
        "trim_user": True,
        "count": 200,
        "include_rts": True,
        "exclude_replies": False
    }
    command("get", "statuses/user_timeline", timeline_params, "tweets", {"user_id": uid})
def insert_tweet(tweet):
    """Insert one tweet row, first recursively inserting any retweeted or
    quoted status so the retweet_of / quote_of references exist.

    Duplicate tweets are ignored via "on conflict (twid) do nothing".
    NOTE(review): no commit here — presumably the caller commits; confirm.
    """
    # Recurse into the retweeted status (if present and non-empty) before
    # referencing its id.
    retweet_of = None
    retweeted = tweet.get("retweeted_status")
    if retweeted:
        insert_tweet(retweeted)
        retweet_of = retweeted["id_str"]

    # Same for a quoted status.
    quote_of = None
    quoted = tweet.get("quoted_status")
    if quoted:
        insert_tweet(quoted)
        quote_of = quoted["id_str"]

    author_uid = tweet["user"]["id_str"]
    entities = tweet["entities"]

    # Entity lists and geo structures are stored as JSON text; free-form text
    # fields go through s() (a sanitizer defined elsewhere in the project).
    row = (
        tweet["id_str"],
        author_uid,
        s(tweet["text"]),
        tweet["created_at"],
        tweet["truncated"],
        json.dumps(entities["hashtags"]),
        json.dumps(entities["symbols"]),
        json.dumps(entities["user_mentions"]),
        json.dumps(entities["urls"]),
        tweet["in_reply_to_status_id"],
        tweet["in_reply_to_user_id"],
        tweet["in_reply_to_screen_name"],
        json.dumps(tweet["geo"]),
        json.dumps(tweet["coordinates"]),
        json.dumps(tweet["place"]),
        retweet_of,
        quote_of,
        tweet["retweet_count"],
        tweet["favorite_count"],
        # possibly_sensitive is optional in the payload; default to None.
        tweet.get("possibly_sensitive"),
        tweet["lang"],
    )
    cur.execute("""
        insert into tweets (
        twid, uid, tweet, created_at, truncated, hashtags, symbols, user_mentions,
        urls, in_reply_to_status_id, in_reply_to_user_id, in_reply_to_screen_name,
        geo, coordinates, place, retweet_of, quote_of, retweet_count,
        favorite_count, possibly_sensitive, lang
        ) values (
        %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
        %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        on conflict (twid) do nothing
        """, row)
def handle_users_response(response):
    """Insert every user record from a users/lookup response, then commit.

    Args:
        response: queue message whose "result" key is a list of Twitter user
            objects.

    Returns:
        True (always — signals the message was handled).
    """
    # New rows only; existing uids are left untouched.
    insert_sql = """
        insert into users (
        uid, name, profile_image_url, location, created_at, favourites_count,
        utc_offset, profile_use_background_image, lang, followers_count,
        protected, geo_enabled, description, verified, notifications,
        time_zone, statuses_count, friends_count, screen_name
        ) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                  %s, %s, %s, %s, %s, %s, %s, %s, %s)
        on conflict (uid) do nothing
        """
    for user in response["result"]:
        # Free-form text fields pass through s() (project sanitizer).
        cur.execute(insert_sql, (
            s(user["id_str"]),
            s(user["name"]),
            s(user["profile_image_url"]),
            user["location"],
            user["created_at"],
            user["favourites_count"],
            user["utc_offset"],
            user["profile_use_background_image"],
            user["lang"],
            user["followers_count"],
            user["protected"],
            user["geo_enabled"],
            s(user["description"]),
            user["verified"],
            user["notifications"],
            user["time_zone"],
            user["statuses_count"],
            user["friends_count"],
            s(user["screen_name"]),
        ))
    # One commit for the whole batch.
    conn.commit()
    return True
# Script: builds a networkx graph of the most related hashtags.
# NOTE(review): this chunk is truncated — the final cur.execute() call is cut
# off mid-statement (its parameter tuple and closing paren are missing).
import time
import os
import networkx as nx
import sys

# Make the project root importable so `common` (shared DB cursor) resolves.
parent_path = os.path.realpath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(parent_path)
from common import cur

print("Initializing graph...")
G = nx.Graph()

# Fetch the max tweet count once so node weights can be normalized to [0, 1].
print("Max hashtag...")
cur.execute("select max(total_tweets) from hashtags")
print("Requesting hashtags...")
# Select hashtags that appear in the 500 strongest co-occurrence relations
# (either side of the pair), with weight = total_tweets / max(total_tweets).
cur.execute(
    "select id, hashtag, total_tweets/%s from hashtags where id in (select h1 from hashtags_relations order by tweets_with_both desc limit 500) or id in (select h2 from hashtags_relations order by tweets_with_both desc limit 500)",
    (float(cur.fetchone()[0]), ))
print("Transforming hashtags into nodes...")
# One node per hashtag id, labelled with the hashtag text, weighted as above.
G.add_nodes_from(((v[0], {"label": v[1], "weight": v[2]}) for v in cur))

# Same normalization trick for edge weights (tweets_with_both / max).
print("Max relation...")
cur.execute("select max(tweets_with_both) from hashtags_relations")
print("Requesting relations...")
cur.execute(
    "select h1, h2, tweets_with_both/%s from hashtags_relations order by tweets_with_both desc limit 500",
    # NOTE(review): truncated here in the visible source — the parameter
    # tuple for this execute() (and anything after it) is not in view.
# Worker: consumes screen names from the 'users_to_download' queue, resolves
# each to a Twitter user id, records it in the users table, and fans out two
# follow-up jobs (timeline download, friends-ids download).
from common import conn, cur, get_response, ack_response, nack_response, get_raw, command, get_raw_nb

# statsd client for timing/metric reporting under the user_processor prefix.
c = statsd.StatsClient('localhost', 8125, prefix='user_processor')

while True:
    logging.info("Waiting for requests in 'users_to_download' queue...")
    # Blocking fetch of the next screen name to process.
    meta, screen_name = get_raw("users_to_download")
    with c.timer("valid_jobs"):
        logging.info("Got request for user=%s, requesting info from Twitter...", screen_name)
        # Resolve the screen name to a full user object via users/show,
        # then wait synchronously for the response on 'user_details'.
        params = { "screen_name": screen_name }
        q = command("get", "users/show", params, "user_details")
        meta_user, resp = get_response("user_details")
        twid = resp["result"]["id_str"]
        # Insert the user and capture the generated local row id.
        # NOTE(review): plain insert — a duplicate screen name would
        # presumably raise on a unique constraint; confirm intent.
        cur.execute("insert into users (twid, screen_name) values (%s, %s) returning id", (twid, screen_name))
        uid = cur.fetchone()[0]
        conn.commit()
        ack_response(meta_user)
        # start job for downloading user tweets
        params = { "user_id": twid, "count": 200, "trim_user": "******", "include_rts": "false" }
        metadata = { "user_id": uid, "params": params, "collected": 0, "hashtags": {} }
        command("get", "statuses/user_timeline", params, "user_tweets", metadata=metadata)
        # start job for downloading friends
        params = { "user_id": twid, "count": 5000, "stringify_ids": True }
        metadata = { "params": params, "parent": twid, "parent_level": 0}
        command("get", "friends/ids", params, "friends_ids", metadata=metadata)
# Worker: consumes friends/ids responses and enqueues a timeline-download job
# for each discovered friend.
# NOTE(review): this chunk is truncated — the final command("get", ...) call
# is cut off mid-statement in the visible source.
c = statsd.StatsClient('localhost', 8125, prefix='friends_ids_processor')

while True:
    logging.info("Waiting for data from 'friends_ids' queue...")
    meta, resp = get_response("friends_ids")
    # Skip empty results and Twitter error code 34 ("page does not exist",
    # i.e. the user was not found).
    if resp["result"] != {} and not ("code" in resp["result"] and resp["result"]["code"] == 34):
        with c.timer("valid_jobs"):
            logging.info(
                "Got friends ids for %s (parent_level: %s) requesting tweets...",
                resp["metadata"]["parent"], resp["metadata"]["parent_level"])
            for user_id in resp["result"]["ids"]:
                # Record the friend and capture the generated local row id.
                cur.execute(
                    "insert into users (twid) values (%s) returning id",
                    (user_id, ))
                uid = cur.fetchone()[0]
                # Enqueue a timeline job for this friend (latest 200 tweets).
                params = {
                    "user_id": user_id,
                    "count": 200,
                    "trim_user": "******",
                    "include_rts": "false"
                }
                metadata = {
                    "params": params,
                    "collected": 0,
                    "hashtags": {},
                    "user_id": uid
                }
                command("get",
                        # NOTE(review): source truncated here — the remaining
                        # arguments of this command() call are not in view.
with c.timer("valid_jobs"): c.incr("tweets", len(resp["result"])) for t in resp["result"]: if cmd_max_id == t["id_str"]: continue if not metadata["max_id"] or t["id_str"] < metadata["max_id"]: metadata["max_id"] = t["id_str"] metadata["collected"] += 1 hashtags = set( [v["text"].lower() for v in t["entities"]["hashtags"]]) fav_count = t["favorite_count"] rt_count = t["retweet_count"] cur.execute( "insert into number_of_hashtags_tmp (number) values (%s)", (len(hashtags), )) for hashtag in hashtags: to_update[hashtag]["favs"] += fav_count to_update[hashtag]["rts"] += rt_count to_update[hashtag]["count"] += 1 for (h1, h2) in itertools.combinations(hashtags, 2): to_update[min(h1, h2)]["combinations"][max(h1, h2)] += 1 c.incr("hashtags_upserts", len(to_update.keys())) for hashtag, var in ((k, to_update[k]) for k in sorted(to_update.keys())): if hashtag not in metadata["hashtags"]: metadata["hashtags"][hashtag] = 0 metadata["hashtags"][hashtag] += var["count"]