def build_graph_encoded():
    """Build a directed graph of the top-100 pairs with UTF-8 encoded text.

    Edges are added exactly as returned by ``TwitterUser.get_top_100_pair()``.
    Every node is then annotated with fields read from
    ``TwitterUser.get_by_id``; the text fields (label, screen_name, name,
    description, location) are encoded to UTF-8.

    Returns:
        The populated ``nx.DiGraph``.
    """
    from collections import Counter

    pair_list = TwitterUser.get_top_100_pair()
    print('')  # the original emitted one blank line here
    DG = nx.DiGraph()
    DG.add_edges_from(pair_list)

    # Tally each column of the pairs once instead of rescanning pair_list
    # for every node.
    first_counts = Counter(first for first, _ in pair_list)
    second_counts = Counter(second for _, second in pair_list)
    today = date.today()

    for twitter_id in DG.nodes():
        t = TwitterUser.get_by_id(twitter_id)
        node = DG.node[twitter_id]
        screen_name = t.scrn_name.encode('utf-8')
        age_days = (today - t.created_at).days

        node['twitter_id'] = t.user_id
        node['label'] = screen_name
        node['screen_name'] = screen_name
        node['name'] = t.name.encode('utf-8')
        node['follower_count'] = t.foer_cnt
        node['friend_count'] = t.friend_cnt
        node['status_count'] = t.status_cnt
        node['description'] = t.desc.encode('utf-8')
        node['location'] = t.location.encode('utf-8')
        node['created_at'] = str(t.created_at)
        node['verified'] = t.verified
        node['twitter_age'] = age_days
        # An age of 0 days used to raise ZeroDivisionError; such an account
        # is averaged over a single day instead.
        node['daily_tweet'] = t.status_cnt * 1.0 / max(age_days, 1)
        node['follower_count_top100'] = first_counts[twitter_id]
        node['friend_count_top100'] = second_counts[twitter_id]
    return DG
def build_graph():
    """Build the top-100 graph annotated with centrality measures.

    Each pair from ``TwitterUser.get_top_100_pair()`` is inserted reversed,
    i.e. as an edge from the pair's second element to its first. Nodes get
    fields read from ``TwitterUser.get_by_id`` plus betweenness, closeness,
    clustering (computed on the undirected copy) and PageRank; every edge
    gets its edge betweenness.

    Returns:
        The populated ``nx.DiGraph``.
    """
    from collections import Counter

    pair_list = TwitterUser.get_top_100_pair()
    DG = nx.DiGraph()
    DG.add_edges_from([(foer, twitter_user) for twitter_user, foer in pair_list])

    betweenness = nx.betweenness_centrality(DG)
    closeness = nx.closeness_centrality(DG)
    edge_betweenness = nx.edge_betweenness(DG)
    clustering_co = nx.clustering(nx.Graph(DG))
    page_rank = nx.pagerank(DG)

    # Tally each column of the pairs once instead of rescanning pair_list
    # for every node.
    first_counts = Counter(first for first, _ in pair_list)
    second_counts = Counter(second for _, second in pair_list)
    today = date.today()

    for twitter_id in DG.nodes():
        t = TwitterUser.get_by_id(twitter_id)
        node = DG.node[twitter_id]
        age_days = (today - t.created_at).days

        node['user_id'] = t.user_id
        node['label'] = t.scrn_name
        node['follower_count'] = t.foer_cnt
        node['friend_count'] = t.friend_cnt
        node['status_count'] = t.status_cnt
        node['location'] = t.location
        node['verified'] = t.verified
        node['twitter_age'] = age_days
        # An age of 0 days used to raise ZeroDivisionError; such an account
        # is averaged over a single day instead.
        node['daily_tweet'] = t.status_cnt * 1.0 / max(age_days, 1)
        node['indegree'] = first_counts[twitter_id]
        node['outdegree'] = second_counts[twitter_id]
        node['cluster'] = clustering_co[twitter_id]
        node['betweenness'] = betweenness[twitter_id]
        node['closeness'] = closeness[twitter_id]
        node['page_rank'] = page_rank[twitter_id]

    for out_n, in_n in DG.edges():
        DG[out_n][in_n]['edge_betweenness'] = edge_betweenness[(out_n, in_n)]
    return DG
def _initial_populate(self):
    '''Seed the community and the node pool with the root user, save, then
    continue with _resume_populate().'''
    root_data = TwitterUser(self._root_user_id).get_all_data()
    self._community_members.append(root_data)

    # a user's score expresses how interconnected that user is
    pool_entry = {
        'user': root_data,
        'score': self._filled_user_score(root_data),
    }
    self._node_pool[self._root_user_id] = pool_entry
    logging.debug('Adding root to node_pool')

    self.save()
    self._resume_populate()
def _initial_populate(self):
    '''Start a fresh population run: register the root user as the first
    community member and node-pool entry, persist, then delegate to
    _resume_populate().'''
    seed = TwitterUser(self._root_user_id).get_all_data()
    self._community_members.append(seed)

    # the score tells how interconnected the user is
    seed_score = self._filled_user_score(seed)
    self._node_pool[self._root_user_id] = {'user': seed, 'score': seed_score}
    logging.debug('Adding root to node_pool')

    self.save()
    self._resume_populate()
def save_user_followers(user):
    """Page through the followers of ``user`` and record the unseen ones.

    A follower is skipped when ``TwitterUser.get_by_id`` or ``is_in_no_chn``
    already knows its id. Otherwise, if it is unprotected or its
    ``following`` flag is set, ``is_chn`` decides whether it is stored with
    ``TwitterUser.save_tweepy_user`` or recorded through ``save_non_chn``.
    When the cursor is exhausted ``user.update_scanned()`` is called.

    Args:
        user: object exposing ``user_id``, ``scrn_name`` and
            ``update_scanned()``.

    Raises:
        TweepError: if the follower cursor cannot be created.

    NOTE(review): the block nesting was reconstructed from a flattened
    source line; in particular, whether the rate-limit report runs per
    follower (as here) or once per page should be confirmed.
    """
    try:
        c = Cursor(api.followers, user.user_id)
    except TweepError as error:
        print("tweep breaks!")
        print(error)
        # Without a cursor there is nothing to page through; the original
        # carried on and failed with a NameError on ``c``.
        raise

    while True:
        print('taking a rest before move to next page')
        sleep(10)
        try:
            page = next(c.pages())
        except TweepError as error:
            print("tweep breaks!")
            # Report the caught error, not the TweepError class attribute.
            print(error)
            continue  # try again after the next pause
        except StopIteration:
            print("Move to next unscanned")
            break
        print("start a new page of user  %s page %s"
              % (user.scrn_name, c.pages().count))

        for tweepy_user in page:
            print("follower ----- %s ----- found......"
                  % tweepy_user.screen_name)
            if (TwitterUser.get_by_id(tweepy_user.id)
                    or is_in_no_chn(tweepy_user.id)):
                print('ALREADY in DB!!, skip')
                continue
            try:
                # ``not p or (p and f)`` is equivalent to ``not p or f``.
                if not tweepy_user.protected or tweepy_user.following:
                    if is_chn(tweepy_user):
                        print("and speaks Chinese! Saving....")
                        TwitterUser.save_tweepy_user(tweepy_user)
                    else:
                        save_non_chn(tweepy_user.id)
                        print("pitty, s/he is not Chinese Speaker, next...")
                        continue
            except TweepError as error:
                print("tweep breaks!")
                print(error)
            try:
                print("the remaining hit is  %s"
                      % api.rate_limit_status()['remaining_hits'])
            except TweepError as error:
                print("tweep breaks!")
                print(error)

    user.update_scanned()
def fetch():
    """Scan followers of every user ``TwitterUser.get_next_unscanned()`` yields.

    Keeps asking for the next unscanned user and passes each one to
    ``save_user_followers`` until a falsy value comes back, then prints
    ``done!``.

    The original recursed once per user, so a long crawl would exceed the
    interpreter's recursion limit; a loop does the same work with constant
    stack depth.
    """
    while True:
        current_user = TwitterUser.get_next_unscanned()
        if not current_user:
            break
        print("analyzing  %s ......" % current_user.scrn_name)
        save_user_followers(current_user)
    print("done!")
def get_relation(top_list):
    """Store follower relationships between the users in ``top_list``.

    Walks ``top_list`` from last to first. For each id the follower ids are
    fetched with ``get_follower_ids``; those also present in ``TOP_100`` are
    stored with ``TwitterUser.save_relationship(twitter_id, follower_id)``.
    A ``TweepError`` while handling one user is reported and the loop moves
    on to the next user.

    Args:
        top_list: sequence of Twitter user ids. It is no longer reversed in
            place, so the caller's list is left untouched.
    """
    for twitter_id in reversed(top_list):
        print("%s analyzing....." % (twitter_id,))
        sleep(3)
        print('getting followers id...')
        twitter_user = TwitterUser.get_by_id(twitter_id)
        try:
            foer_ids = get_follower_ids(twitter_user.tweepy_obj)
            # NOTE(review): the value is discarded; the call is kept because
            # it can raise TweepError and so skip saving for this user.
            api.rate_limit_status()['remaining_hits']
            top_100_foer = list(set(foer_ids).intersection(TOP_100))
            print('saving relation....')
            for follower_id in top_100_foer:
                TwitterUser.save_relationship(twitter_id, follower_id)
        except TweepError as error:
            print("tweep breaks!")
            # Report the caught error, not the TweepError class attribute.
            print(error)
from twitter_user import TwitterUser from tweepy import Cursor, TweepError from time import sleep from twitter_api import get_api2 as get_api TOP_100 = TwitterUser.get_top_100_by_foer() api = get_api() def get_follower_ids(tweepy_obj): ids_list = [] try: c = Cursor(api.followers_ids, tweepy_obj.id) except TweepError: print 'tweepy breaks!' while(True): try: print 'new page...' page = c.pages().next() sleep(2) except TweepError: print "tweep breaks!" except StopIteration: print 'done with', tweepy_obj.id break ids_list.extend(page) try: print "the remaining hit is ", \ api.rate_limit_status()['remaining_hits'] except TweepError:
def _resume_populate(self):
    '''Grow the community from the current node pool.

    While the node pool is non-empty and the community is smaller than
    _max_population: rescore the pool, take the highest scoring entry and
    try to add that user's friends (ordered by _sort_by_empty_score) to
    both the community and the node pool, then remove the entry from the
    pool. Picks up where a previous _populate() call stopped.

    Returns None early when a TwitterHTTPError whose text contains '400'
    or '420' is caught; the code treats that as rate limiting.

    NOTE(review): block nesting was reconstructed from a flattened source
    line -- confirm against the original file.
    '''
    while self._node_pool and len(self._community_members) < \
            self._max_population:
        ''' choose person on list with highest interconnection first 0 at end of line is to take first user, second 0 get's user's id from tuple result '''
        self._rescore_node_pool()
        # sort pool entries by score, best first, and keep the id of the top one
        highest_scoring_id = sorted(self._node_pool.items(), key=lambda item: item[1]['score'], reverse=True)[0][0]
        curr_node = self._node_pool[highest_scoring_id]['user']
        #choose friends by rank to add to community, seems to work
        #better than followers
        friend_ids = self._sort_by_empty_score(curr_node['friend_ids'])
        added_count = 0
        # snapshot of current member ids; it is not updated while friends
        # are added in the loop below
        community_ids = []
        for member in self._community_members:
            community_ids.append(member['uid'])
        #flag to delete the id we are using from the node pool when finished with it
        # NOTE(review): the flag is assigned below but never read in this
        # function -- the del after the loop is unconditional. Confirm intent.
        delete_highest_scoring_id = True
        for friend_id in friend_ids:
            # NOTE(review): `<=` lets _max_friends_per_user + 1 friends be
            # added per node, and the loop keeps iterating (no break) once
            # the cap is reached -- confirm intended.
            if added_count <= self._max_friends_per_user:
                try:
                    if friend_id not in community_ids:
                        tu = TwitterUser(friend_id)
                        sleep(1)
                        new_user = tu.get_all_data()
                        #TODO add some sort of conditions for addition to the community
                        self._community_members.append(new_user)
                        logging.debug('TwitterUser accepted to community')
                        new_user_score = self._filled_user_score(new_user)
                        self._node_pool[friend_id] = {'user':new_user, 'score':new_user_score}
                        if self._safe:
                            self.save()
                        print len(self._community_members), "members"
                        added_count += 1
                        logging.debug('TwitterUser accepted to node_pool')
                        delete_highest_scoring_id = True
                except TwitterHTTPError as error:
                    logging.debug('Twitter error: %s' % error)
                    # save progress before deciding how to react to the error
                    self.save()
                    print "Number of members: ", len(self._community_members)
                    #rate limiting error
                    if '400' in str(error) or '420' in str(error):
                        delete_highest_scoring_id = False
                        logging.debug('Hit rate limit, quitting')
                        return
                    #unauthorized for user error
                    elif '401' in str(error) or '404' in str(error):
                        delete_highest_scoring_id = True
                        if self._safe:
                            self.save()
                    #otherwise it's probably just a twitter server issue
                    else:
                        delete_highest_scoring_id = False
                        logging.debug('Server error, sleeping for 5 secs')
                        sleep(5)
                except BadUser as error:
                    delete_highest_scoring_id = True
                    logging.debug('TwitterUser rejected: %s' % error)
                except URLError:
                    delete_highest_scoring_id = False
                    logging.debug('URLError, sleeping for 5 secs')
                    sleep(5)
        #once a user is chosen for evaluation pop him off the node list
        del self._node_pool[highest_scoring_id]
        if self._safe:
            self.save()
        logging.debug('Deleting node from node pool')
    # loop finished: the pool ran dry or the community reached _max_population
    self.save()
    print "Maximum community size reached."
    print "Number of members: ", len(self._community_members)
def init():
    """Initialise the database, then save the user returned by ``api.me()``."""
    db.init()
    TwitterUser.save_tweepy_user(api.me())
def _resume_populate(self):
    '''Grow the community from the current node pool.

    While the node pool is non-empty and the community is smaller than
    _max_population: rescore the pool, take the highest scoring entry and
    try to add that user's friends (ordered by _sort_by_empty_score) to
    both the community and the node pool, then remove the entry from the
    pool. Picks up where a previous _populate() call stopped.

    Returns None early when a TwitterHTTPError whose text contains '400'
    or '420' is caught; the code treats that as rate limiting.

    NOTE(review): block nesting was reconstructed from a flattened source
    line -- confirm against the original file.
    '''
    while self._node_pool and len(self._community_members) < \
            self._max_population:
        ''' choose person on list with highest interconnection first 0 at end of line is to take first user, second 0 get's user's id from tuple result '''
        self._rescore_node_pool()
        # sort pool entries by score, best first, and keep the id of the top one
        highest_scoring_id = sorted(self._node_pool.items(),
                                    key=lambda item: item[1]['score'],
                                    reverse=True)[0][0]
        curr_node = self._node_pool[highest_scoring_id]['user']
        #choose friends by rank to add to community, seems to work
        #better than followers
        friend_ids = self._sort_by_empty_score(curr_node['friend_ids'])
        added_count = 0
        # snapshot of current member ids; it is not updated while friends
        # are added in the loop below
        community_ids = []
        for member in self._community_members:
            community_ids.append(member['uid'])
        #flag to delete the id we are using from the node pool when finished with it
        # NOTE(review): the flag is assigned below but never read in this
        # function -- the del after the loop is unconditional. Confirm intent.
        delete_highest_scoring_id = True
        for friend_id in friend_ids:
            # NOTE(review): `<=` lets _max_friends_per_user + 1 friends be
            # added per node, and the loop keeps iterating (no break) once
            # the cap is reached -- confirm intended.
            if added_count <= self._max_friends_per_user:
                try:
                    if friend_id not in community_ids:
                        tu = TwitterUser(friend_id)
                        sleep(1)
                        new_user = tu.get_all_data()
                        #TODO add some sort of conditions for addition to the community
                        self._community_members.append(new_user)
                        logging.debug('TwitterUser accepted to community')
                        new_user_score = self._filled_user_score(new_user)
                        self._node_pool[friend_id] = {
                            'user': new_user,
                            'score': new_user_score
                        }
                        if self._safe:
                            self.save()
                        print len(self._community_members), "members"
                        added_count += 1
                        logging.debug('TwitterUser accepted to node_pool')
                        delete_highest_scoring_id = True
                except TwitterHTTPError as error:
                    logging.debug('Twitter error: %s' % error)
                    # save progress before deciding how to react to the error
                    self.save()
                    print "Number of members: ", len(
                        self._community_members)
                    #rate limiting error
                    if '400' in str(error) or '420' in str(error):
                        delete_highest_scoring_id = False
                        logging.debug('Hit rate limit, quitting')
                        return
                    #unauthorized for user error
                    elif '401' in str(error) or '404' in str(error):
                        delete_highest_scoring_id = True
                        if self._safe:
                            self.save()
                    #otherwise it's probably just a twitter server issue
                    else:
                        delete_highest_scoring_id = False
                        logging.debug('Server error, sleeping for 5 secs')
                        sleep(5)
                except BadUser as error:
                    delete_highest_scoring_id = True
                    logging.debug('TwitterUser rejected: %s' % error)
                except URLError:
                    delete_highest_scoring_id = False
                    logging.debug('URLError, sleeping for 5 secs')
                    sleep(5)
        #once a user is chosen for evaluation pop him off the node list
        del self._node_pool[highest_scoring_id]
        if self._safe:
            self.save()
        logging.debug('Deleting node from node pool')
    # loop finished: the pool ran dry or the community reached _max_population
    self.save()
    print "Maximum community size reached."
    print "Number of members: ", len(self._community_members)