def crawlmapper(screen_name):
    """Harvest friend/follower ids for screen_name into Redis and return a
    sampled list of screen names to crawl next.

    Users crawled in the last hour are throttled via the per-user
    'crawled_in_60min' key (3600s TTL); an empty batch (rate limit or a
    protected account) also triggers the one-hour back-off.
    """
    if r.get(getRedisIdByScreenName(screen_name, 'crawled_in_60min')) is None:
        friends_info = getFriendsBatch(screen_name, friends_limit)
        # Plain loop instead of map(): we only want the sadd side effect,
        # not a throwaway list of return values.
        for friend in friends_info:
            r.sadd(getRedisIdByScreenName(screen_name, 'friend_ids'),
                   friend['id'])
        scard = r.scard(getRedisIdByScreenName(screen_name, 'friend_ids'))
        print >> sys.stderr, 'Fetched %s ids for %s' % (scard, screen_name)

        followers_info = getFollowersBatch(screen_name, followers_limit)
        for follower in followers_info:
            r.sadd(getRedisIdByScreenName(screen_name, 'follower_ids'),
                   follower['id'])
        scard = r.scard(getRedisIdByScreenName(screen_name, 'follower_ids'))
        print >> sys.stderr, 'Fetched %s ids for %s' % (scard, screen_name)

        # Empty batch: back off from this user for an hour.
        if not friends_info or not followers_info:
            r.set(getRedisIdByScreenName(screen_name, 'crawled_in_60min'), '1')
            r.expire(getRedisIdByScreenName(screen_name, 'crawled_in_60min'),
                     3600)
    else:
        # Crawled recently -- rebuild the info lists from the cached id sets.
        friends_info = map(RedisUserId2UserInfoWraper,
                           list(r.smembers(getRedisIdByScreenName(
                               screen_name, 'friend_ids'))))
        followers_info = map(RedisUserId2UserInfoWraper,
                             list(r.smembers(getRedisIdByScreenName(
                                 screen_name, 'follower_ids'))))

    sampled = flat(map(samplemapper,
                       [friends_info, followers_info],
                       [friends_sample, followers_sample]))
    # Filter out "Public Intellectual" and "Zombie" accounts
    # (>= 1000 followers or friends) before returning screen names.
    return [info['screen_name'] for info in sampled
            if info is not None
            and info['followers_count'] < 1000
            and info['friends_count'] < 1000]
def calculate(): r = redis.Redis() # Default connection settings on localhost follower_ids = list( r.smembers(getRedisIdByScreenName(SCREEN_NAME, 'follower_ids'))) followers = r.mget([ getRedisIdByUserId(follower_id, 'info.json') for follower_id in follower_ids ]) followers = [json.loads(f) for f in followers if f is not None] freqs = {} for f in followers: cnt = f['followers_count'] if not freqs.has_key(cnt): freqs[cnt] = [] freqs[cnt].append({ 'screen_name': f['screen_name'], 'user_id': f['id'] }) # It could take a few minutes to calculate freqs, so store a snapshot for later use r.set(getRedisIdByScreenName(SCREEN_NAME, 'follower_freqs'), json.dumps(freqs)) keys = freqs.keys() keys.sort() print 'The top 10 followers from the sample:' field_names = ['Date', 'Count'] pt = PrettyTable(field_names=field_names) pt.align = 'l' for (user, freq) in reversed([(user['screen_name'], k) for k in keys[-10:] for user in freqs[k]]): pt.add_row([user, pp(freq)]) print pt all_freqs = [k for k in keys for user in freqs[k]] avg = reduce(lambda x, y: x + y, all_freqs) / len(all_freqs) print "\nThe average number of followers for %s's followers: %s" \ % (SCREEN_NAME, pp(avg))
def calculate(): r = redis.Redis() # Default connection settings on localhost follower_ids = list(r.smembers(getRedisIdByScreenName(SCREEN_NAME, 'follower_ids'))) followers = r.mget([getRedisIdByUserId(follower_id, 'info.json') for follower_id in follower_ids]) followers = [json.loads(f) for f in followers if f is not None] freqs = {} for f in followers: cnt = f['followers_count'] if not freqs.has_key(cnt): freqs[cnt] = [] freqs[cnt].append({'screen_name': f['screen_name'], 'user_id': f['id']}) # It could take a few minutes to calculate freqs, so store a snapshot for later use r.set(getRedisIdByScreenName(SCREEN_NAME, 'follower_freqs'), json.dumps(freqs)) keys = freqs.keys() keys.sort() print 'The top 10 followers from the sample:' fields = ['Date', 'Count'] pt = PrettyTable(fields=fields) [pt.set_field_align(f, 'l') for f in fields] for (user, freq) in reversed([(user['screen_name'], k) for k in keys[-10:] for user in freqs[k]]): pt.add_row([user, pp(freq)]) pt.printt() all_freqs = [k for k in keys for user in freqs[k]] avg = reduce(lambda x, y: x + y, all_freqs) / len(all_freqs) print "\nThe average number of followers for %s's followers: %s" \ % (SCREEN_NAME, pp(avg))
def friendsFollowersInCommon(screen_names): print "method gets called with args: ", json.dumps(screen_names) r.sinterstore('temp$friends_in_common', [getRedisIdByScreenName(screen_name, 'friends_ids') for screen_name in screen_names] ) r.sinterstore("temp$followers_in_common", [getRedisIdByScreenName(screen_name, 'follower_ids') for screen_name in screen_names] ) print 'Friends in common for %s: %s' % (', '.join(screen_names), pp(r.scard('temp$friends_in_common'))) print 'Followers in common for %s: %s' % (', '.join(screen_names), pp(r.scard('temp$followers_in_common'))) r.delete('temp$friends_in_common') r.delete('temp$followers_in_common')
def crawlmapper(screen_name):
    """Fetch friend/follower ids for screen_name into Redis and return the
    screen names of a filtered sample of them for the next crawl round.

    A per-user 'crawled_in_60min' key (TTL 3600s) throttles re-crawls; it is
    set when a batch comes back empty (rate limit / protected account).
    """
    if r.get(getRedisIdByScreenName(screen_name, 'crawled_in_60min')) is None:
        friends_info = getFriendsBatch(screen_name, friends_limit)
        # for-loop rather than map(): the call is made for its side effect.
        for user in friends_info:
            r.sadd(getRedisIdByScreenName(screen_name, 'friend_ids'),
                   user['id'])
        scard = r.scard(getRedisIdByScreenName(screen_name, 'friend_ids'))
        print >> sys.stderr, 'Fetched %s ids for %s' % (scard, screen_name)

        followers_info = getFollowersBatch(screen_name, followers_limit)
        for user in followers_info:
            r.sadd(getRedisIdByScreenName(screen_name, 'follower_ids'),
                   user['id'])
        scard = r.scard(getRedisIdByScreenName(screen_name, 'follower_ids'))
        print >> sys.stderr, 'Fetched %s ids for %s' % (scard, screen_name)

        # Empty batch: mark this user so we leave it alone for an hour.
        if not friends_info or not followers_info:
            r.set(getRedisIdByScreenName(screen_name, 'crawled_in_60min'),
                  '1')
            r.expire(getRedisIdByScreenName(screen_name, 'crawled_in_60min'),
                     3600)
    else:
        # Recently crawled: reload user info from the cached Redis sets.
        friends_info = map(RedisUserId2UserInfoWraper,
                           list(r.smembers(getRedisIdByScreenName(
                               screen_name, 'friend_ids'))))
        followers_info = map(RedisUserId2UserInfoWraper,
                             list(r.smembers(getRedisIdByScreenName(
                                 screen_name, 'follower_ids'))))

    candidates = flat(map(samplemapper,
                          [friends_info, followers_info],
                          [friends_sample, followers_sample]))
    # Drop "Public Intellectual" and "Zombie" accounts
    # (>= 1000 followers or friends).
    return [info['screen_name'] for info in candidates
            if info is not None
            and info['followers_count'] < 1000
            and info['friends_count'] < 1000]
def friendsFollowersInCommon(screen_names): r.sinterstore('temp$friends_in_common', [getRedisIdByScreenName(screen_name, 'friend_ids') for screen_name in screen_names] ) r.sinterstore('temp$followers_in_common', [getRedisIdByScreenName(screen_name, 'follower_ids') for screen_name in screen_names] ) print 'Friends in common for %s: %s' % (', '.join(screen_names), pp(r.scard('temp$friends_in_common'))) print 'Followers in common for %s: %s' % (', '.join(screen_names), pp(r.scard('temp$followers_in_common'))) # Clean up scratch workspace r.delete('temp$friends_in_common') r.delete('temp$followers_in_common')
def calculate(): r = redis.Redis() # Default connection settings on localhost follower_ids = list(r.smembers(getRedisIdByScreenName(SCREEN_NAME, "follower_ids"))) followers = r.mget([getRedisIdByUserId(follower_id, "info.json") for follower_id in follower_ids]) followers = [json.loads(f) for f in followers if f is not None] freqs = {} for f in followers: cnt = f["followers_count"] if not freqs.has_key(cnt): freqs[cnt] = [] freqs[cnt].append({"screen_name": f["screen_name"], "user_id": f["id"]}) # It could take a few minutes to calculate freqs, so store a snapshot for later use r.set(getRedisIdByScreenName(SCREEN_NAME, "follower_freqs"), json.dumps(freqs)) keys = freqs.keys() keys.sort() print "The top 10 followers from the sample:" field_names = ["Date", "Count"] pt = PrettyTable(field_names=field_names) pt.align = "l" for (user, freq) in reversed([(user["screen_name"], k) for k in keys[-10:] for user in freqs[k]]): pt.add_row([user, pp(freq)]) print pt all_freqs = [k for k in keys for user in freqs[k]] avg = reduce(lambda x, y: x + y, all_freqs) / len(all_freqs) print "\nThe average number of followers for %s's followers: %s" % (SCREEN_NAME, pp(avg))
t = login() r = redis.Redis() getFriends = functools.partial(_getFriendsOrFollowersUsingFunc, t.friends.ids, 'friend_ids', t, r) getFollowers = functools.partial(_getFriendsOrFollowersUsingFunc, t.followers.ids, 'follower_ids', t, r) screen_name = SCREEN_NAME # data retrieval print >> sys.stderr, 'Getting friends for %s...' % (screen_name, ) getFriends(screen_name, limit=MAXINT) print >> sys.stderr, 'Getting followers for %s...' % (screen_name, ) getFollowers(screen_name, limit=MAXINT) # redis calculations n_friends = r.scard(getRedisIdByScreenName(screen_name, 'friend_ids')) n_followers = r.scard(getRedisIdByScreenName(screen_name, 'follower_ids')) n_friends_diff_followers = r.sdiffstore('temp', [getRedisIdByScreenName(screen_name, 'friend_ids'), getRedisIdByScreenName(screen_name, 'follower_ids')]) r.delete('temp') n_followers_diff_friends = r.sdiffstore('temp', [getRedisIdByScreenName(screen_name, 'follower_ids'), getRedisIdByScreenName(screen_name, 'friend_ids')]) r.delete('temp') n_friends_inter_followers = r.sinterstore('temp', [getRedisIdByScreenName(screen_name, 'follower_ids'), getRedisIdByScreenName(screen_name, 'friend_ids')]) r.delete('temp') print '%s is following %s' % (screen_name, locale.format('%d', n_friends, True)) print '%s is being followed by %s' % (screen_name, locale.format('%d', n_followers, True)) print '%s of %s are not following %s back' % (locale.format('%d', n_friends_diff_followers, True),
response = makeTwitterRequest(t.friends.ids, screen_name=screen_name, cursor=cursor) ids += response['ids'] cursor = response['next_cursor'] print >> sys.stderr, 'Fetched %i ids for %s' % (len(ids), screen_name) except twitter.api.TwitterHTTPError, e: if e.e.code == 404: print >> sys.stderr, "404 Error with screen_name '%s'. Continuing." % screen_name break # Store the ids into Redis [ r.sadd(getRedisIdByScreenName(screen_name, 'friend_ids'), _id) for _id in ids ] count += 1 print >> sys.stderr, '\t\tFetched friends for %s / %s' % ( count, len(screen_names)) print >> sys.stderr, 'Done fetching friend ids...' #################################### # Resolve screen_names for user_ids #################################### while len(screen_names) > 0: (screen_names_str, screen_names) = (','.join(screen_names[:100]),
# Partially-applied fetcher for follower ids (mirrors the getFriends partial
# defined earlier in this script).
getFollowers = functools.partial(_getFriendsOrFollowersUsingFunc,
                                 t.followers.ids, 'follower_ids', t, r)

screen_name = SCREEN_NAME

# get the data
print >> sys.stderr, 'Getting friends for %s...' % (screen_name, )
getFriends(screen_name, limit=MAXINT)
print >> sys.stderr, 'Getting followers for %s...' % (screen_name, )
getFollowers(screen_name, limit=MAXINT)

# use redis to compute the numbers
n_friends = r.scard(getRedisIdByScreenName(screen_name, 'friend_ids'))
n_followers = r.scard(getRedisIdByScreenName(screen_name, 'follower_ids'))

# NOTE(review): relies on redis-py's sdiffstore returning the size of the
# stored difference set -- confirm for the redis-py version in use.
# 'temp' is a scratch key, deleted immediately after each use.
n_friends_diff_followers = r.sdiffstore(
    'temp', [getRedisIdByScreenName(screen_name, 'friend_ids'),
             getRedisIdByScreenName(screen_name, 'follower_ids')])
r.delete('temp')
n_followers_diff_friends = r.sdiffstore(
    'temp', [getRedisIdByScreenName(screen_name, 'follower_ids'),
             getRedisIdByScreenName(screen_name, 'friend_ids')])
import sys import redis import json from twitter__util import getRedisIdByScreenName from twitter__util import getRedisIdByUserId EGO = sys.argv[1] r = redis.Redis() normalized_locations = [] friend_ids = list(r.smembers(getRedisIdByScreenName(EGO, 'friend_ids'))) ego_id = json.loads(r.get(getRedisIdByScreenName(EGO, 'info.json')))['id'] ids = [ego_id] + friend_ids for user_id in ids: redis_id = getRedisIdByUserId(str(user_id), 'info.json') location_json = r.get(redis_id) if location_json: location = json.loads(location_json)['location'] if location: normalized_location = location.lower().encode("utf-8") normalized_locations.append(normalized_location) unique_locations = set(normalized_locations) for ul in unique_locations: print ul
while cursor != 0: try: response = makeTwitterRequest(t.friends.ids, screen_name=screen_name, cursor=cursor) ids += response['ids'] cursor = response['next_cursor'] print >> sys.stderr, 'Fetched %i ids for %s' % (len(ids), screen_name) except twitter.api.TwitterHTTPError, e: if e.e.code == 404: print >> sys.stderr, "404 Error with screen_name '%s'. Continuing." % screen_name break # Store the ids into Redis [r.sadd(getRedisIdByScreenName(screen_name, 'friend_ids'), _id) for _id in ids] count += 1 print >> sys.stderr, '\t\tFetched friends for %s / %s' % (count, len(screen_names)) print >> sys.stderr, 'Done fetching friend ids...' #################################### # Resolve screen_names for user_ids #################################### while len(screen_names) > 0: (screen_names_str, screen_names) = (','.join(screen_names[:100]), screen_names[100:])
t.followers.ids, 'follower_ids', t, r, limit=200) screen_names = SCREEN_NAME friends_ids = [] followers_ids = [] union = [] # get the data for screen_name in screen_names: if screen_name != None: print >> sys.stderr, 'Getting friends for %s...' % (screen_name, ) friends_ids = getFriends(screen_name, limit=200) print >> sys.stderr, 'Getting followers for %s...' % (screen_name, ) followers_ids = getFollowers(screen_name, limit=200) # make union of friends and followers union = r.sunion([getRedisIdByScreenName(screen_name,'friends_ids'),getRedisIdByScreenName(screen_name, 'follower_ids')]) # convert from set to list union = list(union) # we need just 200 of them union = union[1:200] # get info for all 200; needed for the location friends_info = getUserInfo(t, r, user_ids=union, sample=1.0) print "Now harvesting ", screen_name,"'s friends subgraphs" for current_friend in friends_info: if current_friend != None: print "+",current_friend['screen_name']," From ", if current_friend['location'] != None and current_friend['location']!= "" :
import sys
import redis
import networkx as nx
import json
import matplotlib.pyplot as plt
from twitter__util import getRedisIdByScreenName
from twitter__util import getRedisIdByUserId

# Build an ego graph for EGO from locally cached Redis data: EGO connects to
# every friend/follower, and friends/followers connect to each other where
# their friend sets intersect EGO's friend+follower union.
r = redis.Redis()
graph = nx.Graph()

EGO = sys.argv[1]
EGO_ID = json.loads(r.get(getRedisIdByScreenName(EGO, 'info.json')))['id']
TEMP_UNION_KEY = 'temp$union'

# union EGO's friends and followers
r.sunionstore(TEMP_UNION_KEY, [getRedisIdByScreenName(EGO, 'friend_ids'),
                               getRedisIdByScreenName(EGO, 'follower_ids')])
friend_follower_ids = list(r.smembers(TEMP_UNION_KEY))

# for each friend/follower :
for friend_follower_id in friend_follower_ids:
    # intersect friend/follower's friends with ego's union
    # NB: the getFriends function used for mining only uses screen_name keys
    # NOTE(review): bare except silently skips users whose info.json is not
    # cached -- consider narrowing to `except Exception`.
    try:
        screen_name = json.loads(
            r.get(getRedisIdByUserId(friend_follower_id,
                                     'info.json')))['screen_name']
    except:
        continue
    intersecting_ids = list(
        r.sinter(TEMP_UNION_KEY,
                 getRedisIdByScreenName(screen_name, 'friend_ids')))
    # add edge between EGO and the current friend/follower
    graph.add_edge(EGO_ID, friend_follower_id)
    # add edges between each id in the intersection and the id of the current friend/follower
import redis import couchdb import sys from twitter__util import getRedisIdByScreenName from twitter__util import getRedisIdByUserId SCREEN_NAME = sys.argv[1] THRESHOLD = int(sys.argv[2]) # Connect using default settings for localhost r = redis.Redis() # Compute screen_names for friends friend_ids = r.smembers(getRedisIdByScreenName(SCREEN_NAME, 'friend_ids')) friend_screen_names = [] for friend_id in friend_ids: try: friend_screen_names.append(json.loads(r.get(getRedisIdByUserId(friend_id, 'info.json')))['screen_name'].lower()) except TypeError, e: continue # not locally available in Redis - look it up or skip it # Pull the list of (entity, frequency) tuples from CouchDB server = couchdb.Server('http://localhost:5984') db = server['tweets-user-timeline-' + SCREEN_NAME] entities_freqs = sorted([(row.key, row.value) for row in db.view('index/entity_count_by_doc', group=True)],
# Use the user's default locale so counts print with locale-aware grouping.
locale.setlocale(locale.LC_ALL, "")

t = login()
r = redis.Redis()

# Partially-applied fetchers: same harvesting helper, parameterized by the
# Twitter API endpoint and the Redis key suffix to store the ids under.
getFriends = functools.partial(_getFriendsOrFollowersUsingFunc,
                               t.friends.ids, "friend_ids", t, r)
getFollowers = functools.partial(_getFriendsOrFollowersUsingFunc,
                                 t.followers.ids, "follower_ids", t, r)

screen_name = SCREEN_NAME

print >> sys.stderr, "Getting friends for %s... " % (screen_name,)
getFriends(screen_name, limit=MAXINT)
print >> sys.stderr, "Getting followers for %s..." % (screen_name,)
getFollowers(screen_name, limit=MAXINT)

# Number of Friends and Followers
n_friends = r.scard(getRedisIdByScreenName(screen_name, "friend_ids"))
n_followers = r.scard(getRedisIdByScreenName(screen_name, "follower_ids"))

# NOTE(review): relies on redis-py's sdiffstore returning the size of the
# stored set -- confirm for the redis-py version in use.  "temp" is only a
# scratch key and is deleted right after each computation.
n_friends_diff_followers = r.sdiffstore(
    "temp", [getRedisIdByScreenName(screen_name, "friend_ids"),
             getRedisIdByScreenName(screen_name, "follower_ids")]
)
r.delete("temp")
n_followers_diff_friends = r.sdiffstore(
    "temp", [getRedisIdByScreenName(screen_name, "follower_ids"),
             getRedisIdByScreenName(screen_name, "friend_ids")]
)
r.delete("temp")
import sys import json import networkx as nx import redis from twitter__util import getRedisIdByScreenName from twitter__util import getRedisIdByUserId SCREEN_NAME = sys.argv[1] g = nx.Graph() r = redis.Redis() # Compute all ids for nodes appearing in the graph friend_ids = list(r.smembers(getRedisIdByScreenName(SCREEN_NAME, 'friend_ids'))) id_for_screen_name = json.loads(r.get(getRedisIdByScreenName(SCREEN_NAME, 'info.json')))['id'] ids = [id_for_screen_name] + friend_ids # Pickle the graph to disk... if not os.path.isdir('out'): os.mkdir('out') filename = os.path.join('out', SCREEN_NAME + '.gpickle') nx.write_gpickle(g, filename) print 'Pickle file stored in: %s' % filename # You can un-pickle like so...
try: response = makeTwitterRequest(t, t.friends.ids, screen_name=screen_name, cursor=cursor) ids += response['ids'] cursor = response['next_cursor'] print >> sys.stderr, 'Fetched %i ids for %s' % (len(ids), screen_name) except twitter.api.TwitterHTTPError, e: if e.e.code == 404: print >> sys.stderr, "404 Error with screen_name '%s'. Continuing." % screen_name break # Store the ids into Redis [r.sadd(getRedisIdByScreenName(screen_name, 'friend_ids'), _id) for _id in ids] count += 1 print >> sys.stderr, '\t\tFetched friends for %s / %s' % (count, len(screen_names)) print >> sys.stderr, 'Done fetching friend ids...' #################################### # Resolve screen_names for user_ids #################################### while len(screen_names) > 0: (screen_names_str, screen_names) = (','.join(screen_names[:100]), screen_names[100:])
MAXINT = sys.maxint
t = login()
r = redis.Redis()

# get info and friends for central user
getUserInfo(t, r, [SCREEN_NAME])

# Partially-applied fetchers around the shared harvesting helper.
getFriends = functools.partial(_getFriendsOrFollowersUsingFunc,
                               t.friends.ids, 'friend_ids', t, r)
getFollowers = functools.partial(_getFriendsOrFollowersUsingFunc,
                                 t.followers.ids, 'follower_ids', t, r)

# get friends and followers of central user
friend_ids = getFriends(SCREEN_NAME)
follower_ids = getFollowers(SCREEN_NAME)

# do union of friends and followers
ids = list(r.sunion(getRedisIdByScreenName(SCREEN_NAME, 'friend_ids'),
                    getRedisIdByScreenName(SCREEN_NAME, 'follower_ids')))

# get user info for friends and followers
getUserInfo(t, r, user_ids=ids)

# get friends of friends and followers, best-effort per user
for user_id in ids:
    screen_name = json.loads(r.get(getRedisIdByUserId(user_id,
                                                      'info.json')))['screen_name']
    try:
        getFriends(screen_name)
    except Exception:
        # Narrowed from a bare `except:` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed; any API failure for a single
        # user is still skipped, preserving the best-effort behavior.
        continue
# Partially-applied fetcher for follower ids (companion to the getFriends
# partial defined earlier in this script).
getFollowers = functools.partial(_getFriendsOrFollowersUsingFunc,
                                 t.followers.ids, 'follower_ids', t, r)

screen_name = SCREEN_NAME

# get the data
print >> sys.stderr, 'Getting friends for %s...' % (screen_name, )
getFriends(screen_name, limit=MAXINT)
print >> sys.stderr, 'Getting followers for %s...' % (screen_name, )
getFollowers(screen_name, limit=MAXINT)

# use redis to compute the numbers
n_friends = r.scard(getRedisIdByScreenName(screen_name, 'friend_ids'))
n_followers = r.scard(getRedisIdByScreenName(screen_name, 'follower_ids'))

# NOTE(review): assumes redis-py's sdiffstore returns the cardinality of the
# stored set -- confirm for the installed redis-py version.  'temp' is a
# scratch key, deleted immediately after each computation.
n_friends_diff_followers = r.sdiffstore('temp', [
    getRedisIdByScreenName(screen_name, 'friend_ids'),
    getRedisIdByScreenName(screen_name, 'follower_ids')
])
r.delete('temp')
n_followers_diff_friends = r.sdiffstore('temp', [
    getRedisIdByScreenName(screen_name, 'follower_ids'),
    getRedisIdByScreenName(screen_name, 'friend_ids')
])
r.delete('temp')
import redis import couchdb import sys from twitter__util import getRedisIdByScreenName from twitter__util import getRedisIdByUserId SCREEN_NAME = 'timoreilly' # XXX: iPython Notebook cannot prompt for input THRESHOLD = 15 # XXX: iPython Notebook cannot prompt for input # Connect using default settings for localhost r = redis.Redis() # Compute screen_names for friends friend_ids = r.smembers(getRedisIdByScreenName(SCREEN_NAME, 'friend_ids')) friend_screen_names = [] for friend_id in friend_ids: try: friend_screen_names.append( json.loads(r.get(getRedisIdByUserId( friend_id, 'info.json')))['screen_name'].lower()) except TypeError, e: continue # not locally available in Redis - look it up or skip it # Pull the list of (entity, frequency) tuples from CouchDB server = couchdb.Server('http://localhost:5984') db = server['tweets-user-timeline-' + SCREEN_NAME] entities_freqs = sorted(
import sys import json import networkx as nx import redis from twitter__util import getRedisIdByScreenName from twitter__util import getRedisIdByUserId SCREEN_NAME = sys.argv[1] g = nx.Graph() r = redis.Redis() # Compute all ids for nodes appearing in the graph friend_ids = list(r.smembers(getRedisIdByScreenName(SCREEN_NAME, 'friend_ids'))) id_for_screen_name = json.loads( r.get(getRedisIdByScreenName(SCREEN_NAME, 'info.json')))['id'] ids = [id_for_screen_name] + friend_ids for current_id in ids: print >> sys.stderr, 'Processing user with id', current_id try: current_info = json.loads( r.get(getRedisIdByUserId(current_id, 'info.json'))) current_screen_name = current_info['screen_name'] friend_ids = list( r.smembers( getRedisIdByScreenName(current_screen_name, 'friend_ids')))
for current_id in ids: print >> sys.stderr, 'Processing user with id', current_id try: raw_current_info = r.get(getRedisIdByUserId(current_id, 'info.json' )) if not raw_current_info: # try to get it one more time print "Making req to Twitter API" os.system('python friends_followers__get_user_info_by_id.py ' + current_id) current_info = json.loads(r.get(getRedisIdByUserId(current_id, 'info.json' ))) current_screen_name = current_info['screen_name'] friend_ids = list(r.smembers(getRedisIdByScreenName(current_screen_name, 'friend_ids'))) # filter out ids for this person if they aren't also SCREEN_NAME's friends too, # which is the basis of the query friend_ids = [fid for fid in friend_ids if fid in starting_ids] # TODO czy to nie dziala? except Exception, e: print >> sys.stderr, 'Problems with', current_id for friend_id in friend_ids: if friend_id in starting_ids: try: raw_friend_info = r.get(getRedisIdByUserId(friend_id, 'info.json')) if not raw_friend_info: