Exemplo n.º 1
0
def build_graph_encoded():
    """Build the top-100 relationship digraph with UTF-8 encoded text fields.

    Edges are added exactly as returned by
    ``TwitterUser.get_top_100_pair()``.  Every node is then annotated with
    profile data loaded through ``TwitterUser.get_by_id``; the free-text
    fields (screen name, name, description, location) are stored UTF-8
    encoded.

    Returns:
        The populated ``nx.DiGraph``.
    """
    pair_list = TwitterUser.get_top_100_pair()
    DG = nx.DiGraph()
    DG.add_edges_from(pair_list)

    # Tally both columns of the pair list once instead of rescanning the
    # whole list twice for every node.
    first_counts = {}
    second_counts = {}
    for first_id, second_id in pair_list:
        first_counts[first_id] = first_counts.get(first_id, 0) + 1
        second_counts[second_id] = second_counts.get(second_id, 0) + 1

    today = date.today()
    for twitter_id in DG.nodes():
        t = TwitterUser.get_by_id(twitter_id)
        screen_name = t.scrn_name.encode('utf-8')
        age_days = (today - t.created_at).days

        node = DG.node[twitter_id]
        node['twitter_id'] = t.user_id
        node['label'] = screen_name
        node['screen_name'] = screen_name
        node['name'] = t.name.encode('utf-8')
        node['follower_count'] = t.foer_cnt
        node['friend_count'] = t.friend_cnt
        node['status_count'] = t.status_cnt
        node['description'] = t.desc.encode('utf-8')
        node['location'] = t.location.encode('utf-8')
        node['created_at'] = str(t.created_at)
        node['verified'] = t.verified
        node['twitter_age'] = age_days
        # An account created today has age 0; count it as one day so the
        # rate stays finite instead of raising ZeroDivisionError.
        node['daily_tweet'] = t.status_cnt * 1.0 / max(age_days, 1)
        # Number of pairs in which this id is the first element.
        node['follower_count_top100'] = first_counts.get(twitter_id, 0)
        # Number of pairs in which this id is the second element.
        node['friend_count_top100'] = second_counts.get(twitter_id, 0)

    return DG
Exemplo n.º 2
0
def build_graph():
    """Build the top-100 relationship digraph annotated with graph metrics.

    Each pair ``(twitter_user, foer)`` from
    ``TwitterUser.get_top_100_pair()`` becomes an edge ``foer ->
    twitter_user``.  Betweenness, closeness and PageRank are computed on the
    directed graph, clustering on an undirected copy, and all of them are
    stored on the nodes next to profile data loaded through
    ``TwitterUser.get_by_id``.  Edge betweenness is stored on every edge.

    Returns:
        The populated ``nx.DiGraph``.
    """
    pair_list = TwitterUser.get_top_100_pair()
    DG = nx.DiGraph()
    DG.add_edges_from([(foer, twitter_user) for twitter_user, foer in
        pair_list])
    betweenness = nx.betweenness_centrality(DG)
    closeness = nx.closeness_centrality(DG)
    edge_betweenness = nx.edge_betweenness(DG)
    clustering_co = nx.clustering(nx.Graph(DG))
    page_rank = nx.pagerank(DG)

    # Tally both columns of the pair list once instead of rescanning the
    # whole list twice for every node.
    first_counts = {}
    second_counts = {}
    for first_id, second_id in pair_list:
        first_counts[first_id] = first_counts.get(first_id, 0) + 1
        second_counts[second_id] = second_counts.get(second_id, 0) + 1

    today = date.today()
    for twitter_id in DG.nodes():
        t = TwitterUser.get_by_id(twitter_id)
        age_days = (today - t.created_at).days

        node = DG.node[twitter_id]
        node['user_id'] = t.user_id
        node['label'] = t.scrn_name
        node['follower_count'] = t.foer_cnt
        node['friend_count'] = t.friend_cnt
        node['status_count'] = t.status_cnt
        node['location'] = t.location
        node['verified'] = t.verified
        node['twitter_age'] = age_days
        # An account created today has age 0; count it as one day so the
        # rate stays finite instead of raising ZeroDivisionError.
        node['daily_tweet'] = t.status_cnt * 1.0 / max(age_days, 1)
        # Number of pairs in which this id is the first element.
        node['indegree'] = first_counts.get(twitter_id, 0)
        # Number of pairs in which this id is the second element.
        node['outdegree'] = second_counts.get(twitter_id, 0)
        node['cluster'] = clustering_co[twitter_id]
        node['betweenness'] = betweenness[twitter_id]
        node['closeness'] = closeness[twitter_id]
        node['page_rank'] = page_rank[twitter_id]

    for out_n, in_n in DG.edges():
        DG[out_n][in_n]['edge_betweenness'] = edge_betweenness[(out_n, in_n)]

    return DG
Exemplo n.º 3
0
 def _initial_populate(self):
     '''Seed the community with the root user and start populating.

     The root user's data is fetched, appended to the community members
     and entered into the node pool together with its score; the state is
     then saved and _resume_populate() takes over.'''
     seed_id = self._root_user_id
     seed = TwitterUser(seed_id).get_all_data()
     self._community_members.append(seed)
     #the score expresses how interconnected the user is
     pool_entry = {'user': seed, 'score': self._filled_user_score(seed)}
     self._node_pool[seed_id] = pool_entry
     logging.debug('Adding root to node_pool')
     self.save()
     self._resume_populate()
Exemplo n.º 4
0
 def _initial_populate(self):
     '''Start a fresh population run from the root user.

     Fetches the root user's data, registers it as the first community
     member and as the first node-pool entry (with its score), saves, and
     hands over to _resume_populate().'''
     root_id = self._root_user_id
     root_data = TwitterUser(root_id).get_all_data()
     self._community_members.append(root_data)
     #a user's score says how interconnected that user is
     interconnection = self._filled_user_score(root_data)
     self._node_pool[root_id] = dict(user=root_data, score=interconnection)
     logging.debug('Adding root to node_pool')
     self.save()
     self._resume_populate()
Exemplo n.º 5
0
def save_user_followers(user):
    try:
        c = Cursor(api.followers,user.user_id)
    except TweepError:
        print "tweep breaks!"
        print TweepError.message
    while(True):
        try:
            print 'taking a rest before move to next page'
            sleep(10)
            page = c.pages().next()
            print "start a new page of user ", user.scrn_name, \
                'page', c.pages().count
        except TweepError:
            print "tweep breaks!"
            print TweepError.message
            continue
        except StopIteration:
            print "Move to next unscanned"
            break
        
        for tweepy_user in page:
            print "follower -----", tweepy_user.screen_name, "----- found......"
            if TwitterUser.get_by_id(tweepy_user.id) or \
                is_in_no_chn(tweepy_user.id):
                print 'ALREADY in DB!!, skip'
                continue
            try:
                if not tweepy_user.protected or \
                        (tweepy_user.protected and tweepy_user.following):
                        if is_chn(tweepy_user):
                            print "and speaks Chinese! Saving...."
                            TwitterUser.save_tweepy_user(tweepy_user)
                        else:
                            save_non_chn(tweepy_user.id)
                            print "pitty, s/he is not Chinese Speaker, next..."
                            continue
            except TweepError:
                print "tweep breaks!"
                print TweepError.message
            try:
                print "the remaining hit is ", \
                    api.rate_limit_status()['remaining_hits']
            except TweepError:
                print "tweep breaks!"
                print TweepError.message
        page =[]
    user.update_scanned()
Exemplo n.º 6
0
def fetch():
    current_user = TwitterUser.get_next_unscanned()
    if current_user:
        print "analyzing ", current_user.scrn_name, "......"
        save_user_followers(current_user)
        fetch()
    else:
        print "done!"
Exemplo n.º 7
0
def get_relation(top_list):
    top_list.reverse()

    for twitter_id in top_list:
        print twitter_id, 'analyzing.....'
        sleep(3)
        print 'getting followers id...'
        twitter_user = TwitterUser.get_by_id(twitter_id)
        try:
            tweepy_obj = twitter_user.tweepy_obj
            foer_ids = get_follower_ids(tweepy_obj)
            api.rate_limit_status()['remaining_hits']
            top_100_foer = list(set(foer_ids).intersection(TOP_100))
            print 'saving relation....'
            for id in top_100_foer:
                TwitterUser.save_relationship(twitter_id, id)
        except TweepError:
            print "tweep breaks!"
            print TweepError.message
Exemplo n.º 8
0
from twitter_user import TwitterUser
from tweepy import Cursor, TweepError
from time import sleep
from twitter_api import get_api2 as get_api


# Module-level state, evaluated once when this module is imported.
# TOP_100 holds whatever TwitterUser.get_top_100_by_foer() returns
# (presumably the ids of the 100 most-followed users -- TODO confirm).
TOP_100 = TwitterUser.get_top_100_by_foer()
# Shared API handle; get_api is the import alias of twitter_api.get_api2.
api = get_api()

# Collect the follower ids of ``tweepy_obj`` by paging a tweepy Cursor over
# ``api.followers_ids`` and appending every page to ``ids_list``.
def get_follower_ids(tweepy_obj):
    ids_list = []
    try:
        c = Cursor(api.followers_ids, tweepy_obj.id)
    except TweepError:
        # NOTE(review): this only logs; ``c`` stays unbound, so the loop
        # below would raise NameError if this handler ever runs.
        print 'tweepy breaks!'
    while(True):
        try:
            print 'new page...'
            page = c.pages().next()
            # Pause two seconds after each successfully fetched page.
            sleep(2)
        except TweepError:
            # NOTE(review): there is no ``continue`` here, so control falls
            # through to ``ids_list.extend(page)`` with the previous page
            # (duplicating its ids) or with ``page`` unbound on the first
            # iteration.
            print "tweep breaks!"
        except StopIteration:
            # The page iterator is exhausted: stop collecting.
            print 'done with', tweepy_obj.id
            break
        ids_list.extend(page)
        
    # Report the remaining API quota once paging has finished.
    try:
        print "the remaining hit is ", \
            api.rate_limit_status()['remaining_hits']
    except TweepError:
Exemplo n.º 9
0
 def _resume_populate(self):
     '''Gather a group of TwitterUsers. Tries to choose a highly
     interconnected group. Picks up where a previous _populate()
     call stopped'''
     while self._node_pool and len(self._community_members) < \
       self._max_population:
         '''
         choose person on list with highest interconnection
         first 0 at end of line is to take first user, second
         0 get's user's id from tuple result
         '''
         self._rescore_node_pool()
         highest_scoring_id = sorted(self._node_pool.items(), key=lambda item: item[1]['score'], reverse=True)[0][0]
         curr_node = self._node_pool[highest_scoring_id]['user']
         #choose friends by rank to add to community, seems to work
         #better then followers
         friend_ids = self._sort_by_empty_score(curr_node['friend_ids'])
         added_count = 0
         community_ids = []
         for member in self._community_members:
             community_ids.append(member['uid'])
         #flag to delete the id we are using from the node pool when finished with it
         delete_highest_scoring_id = True
         for friend_id in friend_ids:
             if added_count <= self._max_friends_per_user:
                 try:
                     if friend_id not in community_ids:
                         tu = TwitterUser(friend_id)
                         sleep(1)
                         new_user = tu.get_all_data()
                         #TODO add some sort of conditions for addition to the community
                         self._community_members.append(new_user)
                         logging.debug('TwitterUser accepted to community')
                         new_user_score = self._filled_user_score(new_user)
                         self._node_pool[friend_id] = {'user':new_user, 'score':new_user_score}
                         if self._safe:
                             self.save()
                         print len(self._community_members), "members"
                         added_count += 1
                         logging.debug('TwitterUser accepted to node_pool')
                         delete_highest_scoring_id = True
                 except TwitterHTTPError as error:
                     logging.debug('Twitter error: %s' % error)
                     self.save()
                     print "Number of members: ", len(self._community_members)
                     #rate limiting error
                     if '400' in str(error) or '420' in str(error):
                        delete_highest_scoring_id = False
                        logging.debug('Hit rate limit, quitting')
                        return
                     #unauthorized for user error
                     elif '401' in str(error) or '404' in str(error):
                         delete_highest_scoring_id = True
                         if self._safe:
                             self.save()
                     #otherwise it's probably just a twitter server issue
                     else:
                         delete_highest_scoring_id = False
                         logging.debug('Server error, sleeping for 5 secs')
                         sleep(5)
                 except BadUser as error:
                     delete_highest_scoring_id = True
                     logging.debug('TwitterUser rejected: %s' % error)
                 except URLError:
                     delete_highest_scoring_id = False
                     logging.debug('URLError, sleeping for 5 secs')
                     sleep(5)
         #once a user is chosen for evaluation pop him off the node list
         del self._node_pool[highest_scoring_id]
         if self._safe:
             self.save()
         logging.debug('Deleting node from node pool')
     self.save()
     print "Maximum community size reached."
     print "Number of members: ", len(self._community_members)
Exemplo n.º 10
0
def init():
    """Initialise the database and store the user returned by ``api.me()``."""
    db.init()
    TwitterUser.save_tweepy_user(api.me())
Exemplo n.º 11
0
 def _resume_populate(self):
     '''Gather a group of TwitterUsers. Tries to choose a highly
     interconnected group. Picks up where a previous _populate()
     call stopped'''
     while self._node_pool and len(self._community_members) < \
       self._max_population:
         '''
         choose person on list with highest interconnection
         first 0 at end of line is to take first user, second
         0 get's user's id from tuple result
         '''
         self._rescore_node_pool()
         highest_scoring_id = sorted(self._node_pool.items(),
                                     key=lambda item: item[1]['score'],
                                     reverse=True)[0][0]
         curr_node = self._node_pool[highest_scoring_id]['user']
         #choose friends by rank to add to community, seems to work
         #better then followers
         friend_ids = self._sort_by_empty_score(curr_node['friend_ids'])
         added_count = 0
         community_ids = []
         for member in self._community_members:
             community_ids.append(member['uid'])
         #flag to delete the id we are using from the node pool when finished with it
         delete_highest_scoring_id = True
         for friend_id in friend_ids:
             if added_count <= self._max_friends_per_user:
                 try:
                     if friend_id not in community_ids:
                         tu = TwitterUser(friend_id)
                         sleep(1)
                         new_user = tu.get_all_data()
                         #TODO add some sort of conditions for addition to the community
                         self._community_members.append(new_user)
                         logging.debug('TwitterUser accepted to community')
                         new_user_score = self._filled_user_score(new_user)
                         self._node_pool[friend_id] = {
                             'user': new_user,
                             'score': new_user_score
                         }
                         if self._safe:
                             self.save()
                         print len(self._community_members), "members"
                         added_count += 1
                         logging.debug('TwitterUser accepted to node_pool')
                         delete_highest_scoring_id = True
                 except TwitterHTTPError as error:
                     logging.debug('Twitter error: %s' % error)
                     self.save()
                     print "Number of members: ", len(
                         self._community_members)
                     #rate limiting error
                     if '400' in str(error) or '420' in str(error):
                         delete_highest_scoring_id = False
                         logging.debug('Hit rate limit, quitting')
                         return
                     #unauthorized for user error
                     elif '401' in str(error) or '404' in str(error):
                         delete_highest_scoring_id = True
                         if self._safe:
                             self.save()
                     #otherwise it's probably just a twitter server issue
                     else:
                         delete_highest_scoring_id = False
                         logging.debug('Server error, sleeping for 5 secs')
                         sleep(5)
                 except BadUser as error:
                     delete_highest_scoring_id = True
                     logging.debug('TwitterUser rejected: %s' % error)
                 except URLError:
                     delete_highest_scoring_id = False
                     logging.debug('URLError, sleeping for 5 secs')
                     sleep(5)
         #once a user is chosen for evaluation pop him off the node list
         del self._node_pool[highest_scoring_id]
         if self._safe:
             self.save()
         logging.debug('Deleting node from node pool')
     self.save()
     print "Maximum community size reached."
     print "Number of members: ", len(self._community_members)