示例#1
0
    def _fetch_member_friends(self, user_subset):
        """Download friend lists for the selected members and cache them.

        Returns immediately when ``self.member_friends`` is already set.
        Otherwise one ``friends.get`` call is queued per member whose id is
        in ``user_subset``. From each successful response only the friends
        whose id is *not* in ``user_subset`` are kept; each kept friend is
        tagged with ``is_member = False``. The mapping (member id -> list of
        friend dicts) is stored on ``self.member_friends``, then
        ``self._compress_users()`` is run and the mapping is pickled.

        Args:
            user_subset: container of user ids; used both to pick the
                members to query and to filter the returned friends.
        """
        if self.member_friends is not None:
            return
        log_method_begin()

        selected = [m for m in self.members if m['id'] in user_subset]
        print('{} users to fetch'.format(len(selected)))

        # (member id, pending request) pairs; results are only read after
        # the pool's ``with`` block has been left.
        queued = []
        with vk_api.VkRequestsPool(self.vk_session) as pool:
            for m in selected:
                request = pool.method('friends.get', {
                    'user_id': m['id'],
                    'fields': 'photo'
                })
                queued.append((m['id'], request))

        self.member_friends = defaultdict(list)
        friends_by_member = self.member_friends
        for owner_id, request in queued:
            if not request.ok:
                # Failed requests contribute nothing for this member.
                continue
            for friend in request.result['items']:
                if friend['id'] in user_subset:
                    continue
                friend['is_member'] = False
                friends_by_member[owner_id].append(friend)

        self._compress_users()

        self._save_pickle('raw_users_data.member_friends', self.member_friends)

        log_method_end()
示例#2
0
 def fit(self, post_subset=None):
     """Fit the like and repost models on the action table.

     The table comes from ``self.action_data.get_all()``. The identifier
     and target columns are dropped to form the feature matrix; the
     ``is_liked`` and ``is_reposted`` columns are the respective targets.
     Sets ``self.is_fitted`` to ``True`` when both models have been fitted.

     Args:
         post_subset: optional collection of post ids; when given, only
             rows whose ``post_id`` is in it are used.
     """
     table = self.action_data.get_all()
     if post_subset is not None:
         wanted = table['post_id'].isin(post_subset)
         table = table[wanted]
     log_method_begin()
     # Everything that is not a feature: ids, membership flag and targets.
     non_feature_columns = [
         'user_id', 'post_id', 'is_member', 'is_liked', 'is_reposted'
     ]
     features = table.drop(non_feature_columns, axis=1)
     self.like_model.fit(features, table['is_liked'])
     self.repost_model.fit(features, table['is_reposted'])
     self.is_fitted = True
     log_method_end()
示例#3
0
    def get_true(self, subset=None):
        """Count observed likes and reposts per post, split by membership.

        For every post in ``self.raw_wall_data.posts`` (optionally limited
        to ``subset``) each liking / reposting user id is looked up with
        ``self.raw_users_data.find_user``. Unknown users (lookup returns
        ``None``) are ignored; known users are counted as "direct" when
        their ``is_member`` flag is truthy and "non direct" otherwise.

        Args:
            subset: optional container of post ids to restrict the count to.

        Returns:
            pandas.DataFrame indexed by post id with the columns
            ``direct_likes_count``, ``direct_reposts_count``,
            ``non_direct_likes_count`` and ``non_direct_reposts_count``.
            Only posts that received at least one counted action appear.
        """
        print('GroupPredict.get_true for group {}'.format(self.group_id))
        log_method_begin()

        likes_by_members = Counter()
        reposts_by_members = Counter()
        likes_by_others = Counter()
        reposts_by_others = Counter()

        # (post field, tally for members, tally for non-members)
        tallies = (
            ('likes', likes_by_members, likes_by_others),
            ('reposts', reposts_by_members, reposts_by_others),
        )

        for post in tqdm(self.raw_wall_data.posts):
            pid = post['id']
            if subset is not None and pid not in subset:
                continue

            for field, member_tally, other_tally in tallies:
                for uid in post[field]['user_ids']:
                    user = self.raw_users_data.find_user(uid)
                    if user is None:
                        continue
                    tally = member_tally if user['is_member'] else other_tally
                    tally[pid] += 1

        post_ids = list(likes_by_members.keys()
                        | reposts_by_members.keys()
                        | likes_by_others.keys()
                        | reposts_by_others.keys())
        # Counter returns 0 for posts missing from a particular tally.
        rows = [[
            likes_by_members[pid], reposts_by_members[pid],
            likes_by_others[pid], reposts_by_others[pid]
        ] for pid in post_ids]
        column_names = [
            'direct_likes_count', 'direct_reposts_count',
            'non_direct_likes_count', 'non_direct_reposts_count'
        ]
        result = pd.DataFrame(rows, index=post_ids, columns=column_names)
        log_method_end()
        return result
示例#4
0
    def _fetch_members(self):
        """Load the group's member list into ``self.members``.

        Skipped when ``self.members`` is already set. Otherwise the
        ``items`` of a ``groups.getMembers`` query made through
        ``self.vk_tools.get_all`` are stored, and every member dict is
        tagged with ``is_member = True``.
        """
        if self.members is not None:
            return
        log_method_begin()

        query = {
            'group_id': self.group_id,
            'fields': self.member_fields
        }
        response = self.vk_tools.get_all('groups.getMembers', 1000, query)
        self.members = response['items']
        print('{} members'.format(len(self.members)))

        for entry in self.members:
            entry['is_member'] = True

        log_method_end()
示例#5
0
    def fit(self):
        """Build the user/post action table and store it in ``self.table``.

        One row is produced for every (member, post) pair where the member
        has a ``groups`` entry. When a member reposted a post, one row is
        also produced for each of that member's friends that has a
        ``groups`` entry, at most once per (friend, post) pair. Friend rows
        are emitted before the row of the reposting member. Rows are built
        by ``self.get_row`` and labelled with ``self.get_labels()``.

        Returns:
            pandas.DataFrame holding all generated rows (also saved as
            ``self.table``).
        """
        log_method_begin()
        members = self.raw_users_data.members
        posts = self.raw_wall_data.posts
        print("{} members, {} posts".format(len(members), len(posts)))

        table_rows = []

        # (friend id, post id) pairs that already have a row.
        seen_friend_posts = set()
        for member in tqdm(members, 'ActionData.get_all: for members'):
            if 'groups' not in member:
                continue
            member_id = member['id']
            for post in posts:
                likers = post['likes']['user_ids']
                reposters = post['reposts']['user_ids']
                liked = member_id in likers
                reposted = member_id in reposters

                if reposted:
                    friends = self.raw_users_data.member_friends[member_id]
                    for friend in friends:
                        if 'groups' not in friend:
                            continue
                        pair = (friend['id'], post['id'])
                        if pair in seen_friend_posts:
                            continue
                        friend_liked = friend['id'] in likers
                        friend_reposted = friend['id'] in reposters
                        table_rows.append(
                            self.get_row(friend, post, False, friend_liked,
                                         friend_reposted))
                        seen_friend_posts.add(pair)

                table_rows.append(
                    self.get_row(member, post, True, liked, reposted))

        result = pd.DataFrame(table_rows, columns=self.get_labels())
        self.table = result
        print("{} rows".format(len(result)))
        print("{} liked, {} reposted".format(sum(result['is_liked']),
                                             sum(result['is_reposted'])))
        member_rows = result[result['is_member']]
        print("{} liked, {} reposted by members".format(
            sum(member_rows['is_liked']), sum(member_rows['is_reposted'])))
        log_method_end()
        return result
示例#6
0
 def predict(self, post_subset=None):
     """Predict like and repost flags for every row of the action table.

     The table comes from ``self.action_data.get_all()``; identifier and
     target columns are dropped to form the feature matrix passed to
     ``self.like_model`` and ``self.repost_model``.

     Args:
         post_subset: optional collection of post ids; when given, only
             rows whose ``post_id`` is in it are predicted.

     Returns:
         pandas.DataFrame with the columns ``user_id``, ``post_id``,
         ``is_member`` (copied from the table) and ``is_liked``,
         ``is_reposted`` (model predictions).
     """
     table = self.action_data.get_all()
     if post_subset is not None:
         wanted = table['post_id'].isin(post_subset)
         table = table[wanted]
     log_method_begin()
     id_columns = ['user_id', 'post_id', 'is_member']
     target_columns = ['is_liked', 'is_reposted']
     features = table.drop(id_columns + target_columns, axis=1)
     # One entry per output column; transposed below so rows are samples.
     stacked = np.array([
         table['user_id'], table['post_id'], table['is_member'],
         self.like_model.predict(features),
         self.repost_model.predict(features)
     ])
     result = pd.DataFrame(stacked.T, columns=id_columns + target_columns)
     log_method_end()
     return result
示例#7
0
    def predict(self, post_subset=None):
        """Count predicted likes and reposts per post, split by membership.

        Takes the per-row predictions of
        ``self.predict_action_model.predict(post_subset)`` and, for each row
        with a truthy ``is_liked`` / ``is_reposted`` value, increments the
        "direct" tally when ``is_member`` is truthy and the "non direct"
        tally otherwise.

        Args:
            post_subset: forwarded unchanged to the action model's
                ``predict``.

        Returns:
            pandas.DataFrame indexed by post id with the columns
            ``direct_likes_count``, ``direct_reposts_count``,
            ``non_direct_likes_count`` and ``non_direct_reposts_count``.
            Only posts with at least one predicted action appear.
        """
        log_method_begin()

        likes_by_members = Counter()
        reposts_by_members = Counter()
        likes_by_others = Counter()
        reposts_by_others = Counter()

        # (prediction column, tally for members, tally for non-members)
        tallies = (
            ('is_liked', likes_by_members, likes_by_others),
            ('is_reposted', reposts_by_members, reposts_by_others),
        )

        pred_df = self.predict_action_model.predict(post_subset)
        for _, row in pred_df.iterrows():
            for flag, member_tally, other_tally in tallies:
                if not row[flag]:
                    continue
                tally = member_tally if row['is_member'] else other_tally
                tally[row['post_id']] += 1

        post_ids = list(likes_by_members.keys()
                        | reposts_by_members.keys()
                        | likes_by_others.keys()
                        | reposts_by_others.keys())
        # Counter returns 0 for posts missing from a particular tally.
        rows = [[
            likes_by_members[pid], reposts_by_members[pid],
            likes_by_others[pid], reposts_by_others[pid]
        ] for pid in post_ids]
        column_names = [
            'direct_likes_count', 'direct_reposts_count',
            'non_direct_likes_count', 'non_direct_reposts_count'
        ]
        result = pd.DataFrame(rows, index=post_ids, columns=column_names)
        log_method_end()
        return result
示例#8
0
    def _fetch_activity(self):
        """Attach the ids of liking and reposting users to every post.

        For each post in ``self.posts`` two ``likes.getList`` calls are
        queued, one with filter ``likes`` and one with filter ``copies``.
        Afterwards ``post['likes']['user_ids']`` and
        ``post['reposts']['user_ids']`` are set to the returned ids as a
        set, or to an empty set when the corresponding request did not
        succeed. Missing ``likes`` / ``reposts`` dicts are created.
        """
        log_method_begin()
        print('{} posts to fetch'.format(len(self.posts)))

        queued = []

        with vk_api.VkRequestsPool(self.vk_session) as pool:
            for post in self.posts:
                # Same request twice, differing only in the filter value.
                likes_request, reposts_request = [
                    pool.method('likes.getList', {
                        'item_id': post['id'],
                        'owner_id': -self.group_id,
                        'type': 'post',
                        'count': 1000,
                        'filter': kind
                    }) for kind in ('likes', 'copies')
                ]
                queued.append((post, likes_request, reposts_request))

        for post, likes_request, reposts_request in queued:
            answers = (('likes', likes_request), ('reposts', reposts_request))
            for field, request in answers:
                bucket = post.setdefault(field, {})
                bucket['user_ids'] = (set(request.result['items'])
                                      if request.ok else set())

        log_method_end()
示例#9
0
 def fit(self):
     """Create ``self.lda_maker`` from this object's corpora.

     The ``LdaMaker`` is constructed with the result of
     ``self._get_corpora_for_lda()`` and ``self.num_topics``.
     """
     log_method_begin()
     corpora = self._get_corpora_for_lda()
     self.lda_maker = LdaMaker(corpora, self.num_topics)
     log_method_end()
示例#10
0
 def _fetch_wall(self):
     """Load the group's wall posts into ``self.posts``.

     Stores the ``items`` of a ``wall.get`` query made through
     ``self.vk_tools.get_all``; the owner id passed is the negated
     ``self.group_id``.
     """
     log_method_begin()
     wall_query = {'owner_id': -self.group_id, 'extended': 1}
     response = self.vk_tools.get_all('wall.get', 100, wall_query)
     self.posts = response['items']
     print('{} posts'.format(len(self.posts)))
     log_method_end()
示例#11
0
    def _fetch_groups(self, user_subset):
        """Fetch group descriptions for the selected users, chunk by chunk.

        Users come from ``self.get_all_users()`` filtered to ids contained
        in ``user_subset`` and are processed 1000 at a time. For every user
        that has no ``groups`` key yet, a ``groups.get`` request is queued;
        each request that ends up both ``ok`` and ``ready`` sets
        ``user['groups']`` to a list of ``{'description': ...}`` dicts (only
        groups that carry a ``description`` are kept).

        Failure handling per chunk: an exception triggers a retry after a
        60 second sleep, unless the previous error happened less than 120
        seconds earlier, in which case the chunk is abandoned and
        ``self.mark_fetch_groups_error()`` is called.

        After every chunk ``self.members`` and ``self.member_friends`` are
        pickled.

        Args:
            user_subset: container of user ids to fetch groups for.
        """
        log_method_begin()

        all_users = [
            user for user in self.get_all_users() if user['id'] in user_subset
        ]
        print('{} users to fetch'.format(len(all_users)))

        # Number of users handled per request pool / per pickle save.
        all_users_processing_step = 1000
        fetch_start = time.time()
        for i in range(0, len(all_users), all_users_processing_step):
            # The printed upper bound is not clamped to len(all_users).
            print('Fetching from {} to {}...'.format(
                i, i + all_users_processing_step))
            users = all_users[i:i + all_users_processing_step]

            # After more than two hours since the last (re)start of the
            # timer, pause for 30 minutes and restart the timer.
            if time.time() - fetch_start > 2 * 60 * 60:
                print('Cooldown for 30 minutes')
                time.sleep(30 * 60)
                fetch_start = time.time()

            do_fetch = True
            # -1 makes the "less than 120 s ago" test below false for the
            # first error of a chunk, so the first failure always retries.
            last_error_time = -1
            while do_fetch:
                try:
                    pool_results = []
                    with vk_api.VkRequestsPool(self.vk_session) as pool:
                        for user in users:
                            # Users that already have 'groups' (e.g. filled
                            # in by an earlier attempt) are not re-queued.
                            if 'groups' not in user:
                                pool_results.append(
                                    (user,
                                     pool.method(
                                         'groups.get', {
                                             'user_id': user['id'],
                                             'count': 1000,
                                             'extended': 1,
                                             'fields': self.group_fields
                                         })))
                    do_fetch = False
                    self.unmark_fetch_groups_error()
                except Exception as e:
                    print('Can\'t fetch groups because of', e)
                    traceback.print_exc()
                    if time.time() - last_error_time < 120:
                        # Second error in quick succession: stop retrying
                        # this chunk.
                        # NOTE(review): despite the message saying "exit",
                        # the outer loop continues with the next chunk, and
                        # a later successful chunk calls
                        # unmark_fetch_groups_error() - confirm intended.
                        print(
                            'Can\'t do anything, exit. Restart will reuse fetched users'
                        )
                        do_fetch = False
                        self.mark_fetch_groups_error()
                    else:
                        print('Trying again in 1 minute')
                        time.sleep(60)
                    last_error_time = time.time()
                finally:
                    # Runs after success and after failure alike, so any
                    # request that did complete is still stored.
                    for user, groups_request in pool_results:
                        if groups_request.ok and groups_request.ready:
                            user['groups'] = []
                            for group in groups_request.result['items']:
                                if 'description' in group:
                                    user['groups'].append(
                                        {'description': group['description']})

            # Persist progress after every chunk.
            self._save_pickle('raw_users_data.members', self.members)
            self._save_pickle('raw_users_data.member_friends',
                              self.member_friends)

        log_method_end()