Пример #1
0
class BoardTest(unittest.TestCase):
    """Tests for the class that parses board topics and their comments."""

    def setUp(self):
        """Create the Board instance for group GID used by every test."""
        self.board = Board(GID)

    def test_get(self):
        """get_topics() returns topics and each has a positive 'count'."""
        topics = self.board.get_topics()

        self.assertGreater(len(topics), 0)
        # all() instead of len(filter(...)): filter() returns an iterator on
        # Python 3, so taking its len() would raise TypeError there.
        self.assertTrue(all(topic['count'] > 0 for topic in topics))

    def test_getComments(self):
        """get_comments() returns comments, at least one of them dated."""
        comments = self.board.get_comments(TOPIC_ID)

        self.assertGreater(len(comments), 0)
        self.assertTrue(
            any(comment.date is not None for comment in comments)
        )

    def test_count(self):
        """get_count() returns a positive number."""
        count = self.board.get_count()

        self.assertIsNotNone(count)
        self.assertGreater(count, 0)
Пример #2
0
 def setUp(self):
     """Create the Board instance for group GID and store it on self.board."""
     self.board = Board(GID)
Пример #3
0
    def run(self):
        """Run one complete crawl of the group.

        Steps:
          1. Pick an account via choose_account() and build the API client
             and the Board parser from its access token.
          2. Compare the current wall/photo/video/board counts with the ones
             stored on the VkGroup row and widen every scan depth by the
             number of new objects.
          3. Take a snapshot, turn it into add/update tasks, fetch the new
             likes, shares and comments and write them to the database.

        Raises:
            Exception: when choose_account() returns None.
        """
        self.account = self.choose_account()
        if self.account is None:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise Exception("No valid account for parsing")

        self.api = API(self.account.access_token)
        self.board = Board(self.gid, api=self.api, resolve_names=False)

        # When True, user actions are recorded for all (new and updated)
        # items; otherwise only for updated ones.
        # exists() asks the database for a single row instead of loading
        # the whole Item table just to test whether it is empty.
        self.save_actions = (
            Item.objects.all().exists() or CREATE_ACTIONS_WHEN_EMPTY
        )
        vk_group = VkGroup.objects.get(gid=self.gid)

        logging.info("Start crawling for group %s", self.gid)
        logging.info("Save actions = %s", self.save_actions)
        logging.info("Wall Scanning Depth = %s", vk_group.wall_max)
        logging.info("Video Scanning Depth = %s", vk_group.video_max)
        logging.info("Photo Scanning Depth = %s", vk_group.photo_max)
        logging.info("Board Scanning Depth = %s", vk_group.board_max)

        snapshoter = GroupSnapshoter(self.gid, self.api)

        logging.info("Old Wall Posts Count = %s", vk_group.wall_count)
        logging.info("Old Photos Count = %s", vk_group.photos_count)
        logging.info("Old Videos Count = %s", vk_group.videos_count)
        logging.info("Old Board Topics Count = %s", vk_group.board_count)

        wall_count = snapshoter.get_wall_count()
        photos_count = snapshoter.get_photos_count()
        videos_count = snapshoter.get_videos_count()
        board_count = self.board.get_count()

        # Number of objects that appeared since the stored counts; the scan
        # depth is increased by this amount so that all new objects are
        # collected and the old ones are still refreshed.
        new_posts = max(wall_count - vk_group.wall_count, 0)
        new_videos = max(videos_count - vk_group.videos_count, 0)
        new_photos = max(photos_count - vk_group.photos_count, 0)
        new_topics = max(board_count - vk_group.board_count, 0)

        # NOTE(review): the counters are only assigned on the instance; no
        # vk_group.save() call is visible in this method - confirm that they
        # are persisted elsewhere.
        vk_group.wall_count = wall_count
        vk_group.photos_count = photos_count
        vk_group.videos_count = videos_count
        vk_group.board_count = board_count

        logging.info("New Wall Posts Count = %s", wall_count)
        logging.info("New Photos Count = %s", photos_count)
        logging.info("New Videos Count = %s", videos_count)
        logging.info("New Board Topics Count = %s", board_count)

        snapshot = snapshoter.make_for_wall(size=vk_group.wall_max + new_posts)
        snapshot.extend(snapshoter.make_for_videos(size=vk_group.video_max + new_videos))
        snapshot.extend(snapshoter.make_for_photos(size=vk_group.photo_max + new_photos))
        snapshot.extend(snapshoter.make_for_board(size=vk_group.board_max + new_topics))

        logging.info("Crawling finished")
        logging.info("%s items are at snapshot", len(snapshot))

        logging.info("Creating tasks for updating")

        tasks = self.create_update_task(snapshot)

        logging.info("Created tasks for updating")
        # sum() over a generator instead of len(filter(...)), which fails on
        # Python 3 where filter() returns an iterator.
        logging.info("%s items are to be added",
                     sum(1 for task in tasks if task.action == 'add'))
        logging.info("%s items are to be updated",
                     sum(1 for task in tasks if task.action == 'update'))

        # Expected amount of new activity: size of every [first, last] range.
        new_comments, new_likes, new_shares = 0, 0, 0
        for task in tasks:
            if task.comments_change:
                new_comments += task.comments_change[1] - task.comments_change[0] + 1
            if task.likes_change:
                new_likes += task.likes_change[1] - task.likes_change[0] + 1
            if task.shares_change:
                new_shares += task.shares_change[1] - task.shares_change[0] + 1

        logging.info("New comments: %s", new_comments)
        logging.info("New likes: %s", new_likes)
        logging.info("New shares: %s", new_shares)

        logging.info("Start fetching updates")

        tasks = self.fetch_updates(tasks)

        logging.info("Completed fetching updates")

        # Amount of activity that was actually fetched.
        new_comments, new_likes, new_shares = 0, 0, 0
        for task in tasks:
            if task.comments_change:
                new_comments += len(task.comments)
            if task.likes_change:
                new_likes += len(task.likes)
            if task.shares_change:
                new_shares += len(task.shares)

        logging.info("New comments are fetched: %s", new_comments)
        logging.info("New likes are fetched: %s", new_likes)
        logging.info("New shares are fetched: %s", new_shares)

        logging.info("Start fixing updates")

        self.fix_updates(tasks)

        logging.info("Completed fixing updates")
Пример #4
0
class Crawler(object):
    '''
    Класс создания снимка группы
    '''    
    
    def __init__(self, gid):
        """Prepare a crawler for the group *gid*.

        The group id is kept as a string. The account, API client and board
        parser start as None; run() assigns them.
        """
        self.vk_accounts = VkAccount.objects.all()
        self.gid = str(gid)

        # Assigned by run().
        self.account = None
        self.api = None
        self.board = None

        # run() recomputes this flag before any task is created.
        self.save_actions = True
    
    def choose_account(self):
        """Return the first account that has a valid access token.

        Accounts are probed in the order of self.vk_accounts. An account
        whose token fails test_token() gets one refresh_token() attempt.
        Accounts for which AccountError or HTTPError is raised are skipped.

        Returns None when no account is usable.
        """
        for vk_acc in self.vk_accounts:
            try:
                # Stop at the first usable account: probing the remaining
                # ones would only issue needless token checks and refreshes.
                if self.test_token(vk_acc) or self.refresh_token(vk_acc):
                    return vk_acc
            except (AccountError, HTTPError):
                continue

        return None
    
    @staticmethod
    def test_token(vk_account):
        """Return True if the access token of vk_account is valid.

        The token is probed with a friends.get request for uid=1; an
        APIError or a None response means the token is not usable.
        """
        client = API(vk_account.access_token)
        try:
            response = client.friends.get(uid=1)
        except APIError:
            return False
        return response is not None
            
    @staticmethod
    def refresh_token(vk_account):
        """Return True if a new access token was obtained for vk_account.

        Logs in with the stored login/password, requests an OAuth token for
        APP_ID with APP_PERMISSION_SCOPE and, on success, saves the token on
        vk_account.
        """
        session = Account(vk_account.login, vk_account.password)
        session.auth()
        fresh_token = session.oauth(APP_ID, APP_PERMISSION_SCOPE)

        # No token was issued: report the failure to the caller.
        if fresh_token is None:
            return False

        # Store the new token in the database.
        vk_account.access_token = fresh_token
        vk_account.save()
        return True
    
    @staticmethod
    def find(item):
        """Return the stored Item whose name equals item.name, or None.

        item is an entry of a fresh snapshot; only its name is used for the
        lookup.
        """
        try:
            stored = Item.objects.get(name=item.name)
        except Item.DoesNotExist:
            stored = None
        return stored
            
    def create_update_task(self, snapshot):
        """Return the list of tasks needed to bring the database up to date.

        snapshot -- fresh snapshot of the group

        Every snapshot item yields one task with action 'add' (no stored
        Item with that name) or 'update'. Each task carries likes_change,
        comments_change and shares_change: a [first, last] range of
        positions to fetch, or None when there is nothing to fetch.
        """
        tasks = []
        for fresh in snapshot:
            stored = Crawler.find(fresh)

            if stored is None:
                # Unknown item: it has to be added. Its existing activity is
                # fetched only when self.save_actions is set.
                task = BaseDict(action='add', item=fresh)
                task.likes_change = (
                    [1, fresh.likes]
                    if fresh.likes > 0 and self.save_actions else None
                )
                task.comments_change = (
                    [1, fresh.comments]
                    if fresh.comments > 0 and self.save_actions else None
                )
                task.shares_change = (
                    [1, fresh.shares]
                    if fresh.shares > 0 and self.save_actions else None
                )
            else:
                # Known item: refresh it and fetch only the activity that
                # appeared since the stored counters.
                task = BaseDict(action='update', item=fresh)

                # Likes: the range covers the first (new - old) entries.
                task.likes_change = (
                    [1, fresh.likes - stored.likes]
                    if fresh.likes > stored.likes else None
                )
                # Comments: positions old + 1 .. new.
                task.comments_change = (
                    [stored.comments + 1, fresh.comments]
                    if fresh.comments > stored.comments else None
                )
                # Shares: positions old + 1 .. new.
                # NOTE(review): shares use a different range form than likes
                # although both are fetched through fetch_likes() - confirm
                # that this is intended.
                task.shares_change = (
                    [stored.shares + 1, fresh.shares]
                    if fresh.shares > stored.shares else None
                )

            tasks.append(task)

        return tasks
    
    
    def fetch_likes(self, item, change, shares=False):
        """Return the users who liked (or reposted) item.

        change -- [<start of the likes list>, <end of the likes list>]
        shares -- if True, request the users who reposted the item
                  (filter 'copies') instead of the ones who liked it

        The range is split into requests by
        GroupSnapshoter.create_requests_sequence with a page size of
        GroupSnapshoter.LIKES_COUNT. A request that fails with APIError is
        skipped, so the result may be incomplete.
        """
        list_filter = 'copies' if shares else 'likes'
        pages = GroupSnapshoter.create_requests_sequence(
            change[0],
            change[1],
            GroupSnapshoter.LIKES_COUNT
        )

        users = []
        for offset, count in pages:
            try:
                response = self.api.likes.getList(
                    type=item.type,
                    owner_id=item.owner,
                    item_id=item.data.internal_id,
                    offset=offset,
                    count=count,
                    filter=list_filter
                )
                users += response['users']
            except APIError:
                continue

        return users
            
    def fetch_comments(self, item, change):
        """Return the list of comments for item.

        change -- [<start of the comments list>, <end of the comments list>]

        Supported item types are 'post', 'topic', 'photo' and 'video'. The
        range is split into requests by
        GroupSnapshoter.create_requests_sequence; a request that fails with
        APIError is skipped, so the result may be incomplete.

        Every returned comment is normalized in place:
          * 'uid' is copied to 'from_id' and 'message' to 'text' when present;
          * a string or int 'date' is converted with date.fromtimestamp();
          * a missing or unparsable 'date' becomes None.

        Raises ValueError for an unsupported item.type.
        """
        # NOTE: the [1:] slices below drop the first element of the API
        # response (presumably a total-count header - TODO confirm).
        if item.type == 'post':
            page_size = GroupSnapshoter.WALL_COUNT
            get = lambda offset, count: self.api.wall.getComments(
                    owner_id=item.owner,
                    post_id=item.data.internal_id,
                    offset=offset,
                    count=count
                )[1:]
        elif item.type == 'topic':
            page_size = GroupSnapshoter.BOARD_COMMENT_COUNT
            # The board parser is called with an offset only; count is unused.
            get = lambda offset, count: self.board.get_comments(
                    topic_id=item.data.internal_id,
                    offset=offset,
            )
        elif item.type == 'photo':
            page_size = GroupSnapshoter.PHOTOS_COUNT
            get = lambda offset, count: self.api.photos.getComments(
                    owner_id=item.owner,
                    pid=item.data.internal_id,
                    offset=offset,
                    count=count
                )[1:]
        elif item.type == 'video':
            page_size = GroupSnapshoter.VIDEOS_COUNT
            get = lambda offset, count: self.api.video.getComments(
                    owner_id=item.owner,
                    vid=item.data.internal_id,
                    offset=offset,
                    count=count
                )[1:]
        else:
            # Fail with a clear message instead of an UnboundLocalError on
            # the first use of the request sequence.
            raise ValueError(
                "Unsupported item type for comments: %r" % (item.type,)
            )

        sequence = GroupSnapshoter.create_requests_sequence(
            change[0],
            change[1],
            page_size
        )

        retval = []
        for offset, count in sequence:
            try:
                comments = get(offset, count)
            except APIError:
                continue

            for comment in comments:
                if 'uid' in comment:
                    comment['from_id'] = comment['uid']

                if 'message' in comment:
                    comment['text'] = comment['message']

                # The lookup itself is guarded: a comment without a date
                # gets None instead of aborting the whole fetch.
                try:
                    raw_date = comment['date']
                except KeyError:
                    comment['date'] = None
                    continue

                if isinstance(raw_date, (basestring, int)):
                    try:
                        comment['date'] = date.fromtimestamp(int(raw_date))
                    except ValueError:
                        comment['date'] = None

            retval.extend(comments)

        return retval
                    
    def fetch_updates(self, tasks):
        """Fetch the new likes, shares and comments for every task.

        For each task:
          * a newly added topic gets its author (from_id) and date from
            self.board.get_topic_info(); on APIError from_id is set to None;
          * task.likes, task.shares and task.comments are filled for every
            non-empty likes_change / shares_change / comments_change.

        Afterwards the user ids of new topics and of topic comments are
        translated through self.api.resolve_names(). Topics whose author is
        unknown (from_id is None) are left untouched.

        Returns the same tasks list.
        """
        for task in tasks:

            # Look up the author of a newly added topic.
            if task.action == 'add' and task.item.type == 'topic':
                try:
                    info = self.board.get_topic_info(task.item.data.internal_id)
                    task.item.from_id = info.from_id
                    task.item.date = info.date
                except APIError:
                    task.item.from_id = None

            if task.likes_change:
                logging.info("Fetching likes for %s from %s to %s", task.item.name, task.likes_change[0], task.likes_change[1])

                task.likes = self.fetch_likes(task.item, task.likes_change, False)

                logging.info("%s likes are fetched for %s", len(task.likes), task.item.name)

            if task.shares_change:
                logging.info("Fetching shares for %s from %s to %s", task.item.name, task.shares_change[0], task.shares_change[1])

                task.shares = self.fetch_likes(task.item, task.shares_change, True)

                logging.info("%s shares are fetched for %s", len(task.shares), task.item.name)

            if task.comments_change:
                logging.info('Fetching comments for %s from %s to %s', task.item.name, task.comments_change[0], task.comments_change[1])

                task.comments = self.fetch_comments(task.item,
                                  task.comments_change)

                logging.info('%s comments are fetched for %s', len(task.comments), task.item.name)

        # Resolve the user ids of topics and of their comments.
        logging.info("Start resolving names for topics and its comments")

        ids = set()
        for task in tasks:
            if task.item.type != 'topic':
                continue
            # from_id is None when the topic info request failed above; such
            # a placeholder must not be sent to the resolver.
            if task.action == 'add' and task.item.from_id is not None:
                ids.add(task.item.from_id)
            if task.comments_change:
                ids.update(comment.from_id for comment in task.comments)

        ids_map = self.api.resolve_names(list(ids))

        for task in tasks:
            if task.item.type != 'topic':
                continue
            if task.action == 'add' and task.item.from_id is not None:
                task.item.from_id = ids_map[task.item.from_id]
            if task.comments_change:
                for comment in task.comments:
                    comment.from_id = ids_map[comment.from_id]

        logging.info("Completed resolving names for topics and its comments")

        return tasks
    
    def fix_updates(self, tasks):
        """Write the fetched updates to the database.

        An 'add' task creates the Item and, when the author is known, an
        'added' Action. An 'update' task with at least one non-empty change
        range refreshes the counters and data of the stored Item. In both
        cases one Action is created per fetched like, share and comment.
        """
        def record(kind, target, who, when, **extra):
            # Create a single Action row of the given kind for target.
            Action.objects.create(
                who=who,
                type=Action.TYPES[kind],
                item=target,
                date=when,
                **extra
            )

        for task in tasks:
            entry = task.item

            if task.action == 'add':
                item = Item.objects.create(
                    type=Item.TYPES[entry.type],
                    name=entry.name,
                    owner=entry.owner,
                    likes=entry.likes,
                    comments=entry.comments,
                    shares=entry.shares,
                    data=entry.data,
                )

                if entry.from_id:
                    record('added', item, entry.from_id, entry.date)

            elif task.action == 'update' and (task.likes_change or task.shares_change or task.comments_change):
                item = Item.objects.get(name=entry.name)

                item.likes = entry.likes
                item.comments = entry.comments
                item.shares = entry.shares
                item.data = entry.data

                item.save()

            # Likes and shares carry no timestamp of their own, so they are
            # dated with the day they are recorded.
            if task.likes_change:
                for user in task.likes:
                    record('like', item, user, date.today())

            if task.shares_change:
                for user in task.shares:
                    record('share', item, user, date.today())

            if task.comments_change:
                for comment in task.comments:
                    record(
                        'comment',
                        item,
                        comment['from_id'],
                        comment['date'],
                        data={'text': comment['text']}
                    )

    def run(self):
        """Run one complete crawl of the group.

        Steps:
          1. Pick an account via choose_account() and build the API client
             and the Board parser from its access token.
          2. Compare the current wall/photo/video/board counts with the ones
             stored on the VkGroup row and widen every scan depth by the
             number of new objects.
          3. Take a snapshot, turn it into add/update tasks, fetch the new
             likes, shares and comments and write them to the database.

        Raises:
            Exception: when choose_account() returns None.
        """
        self.account = self.choose_account()
        if self.account is None:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise Exception("No valid account for parsing")

        self.api = API(self.account.access_token)
        self.board = Board(self.gid, api=self.api, resolve_names=False)

        # When True, user actions are recorded for all (new and updated)
        # items; otherwise only for updated ones.
        # exists() asks the database for a single row instead of loading
        # the whole Item table just to test whether it is empty.
        self.save_actions = (
            Item.objects.all().exists() or CREATE_ACTIONS_WHEN_EMPTY
        )
        vk_group = VkGroup.objects.get(gid=self.gid)

        logging.info("Start crawling for group %s", self.gid)
        logging.info("Save actions = %s", self.save_actions)
        logging.info("Wall Scanning Depth = %s", vk_group.wall_max)
        logging.info("Video Scanning Depth = %s", vk_group.video_max)
        logging.info("Photo Scanning Depth = %s", vk_group.photo_max)
        logging.info("Board Scanning Depth = %s", vk_group.board_max)

        snapshoter = GroupSnapshoter(self.gid, self.api)

        logging.info("Old Wall Posts Count = %s", vk_group.wall_count)
        logging.info("Old Photos Count = %s", vk_group.photos_count)
        logging.info("Old Videos Count = %s", vk_group.videos_count)
        logging.info("Old Board Topics Count = %s", vk_group.board_count)

        wall_count = snapshoter.get_wall_count()
        photos_count = snapshoter.get_photos_count()
        videos_count = snapshoter.get_videos_count()
        board_count = self.board.get_count()

        # Number of objects that appeared since the stored counts; the scan
        # depth is increased by this amount so that all new objects are
        # collected and the old ones are still refreshed.
        new_posts = max(wall_count - vk_group.wall_count, 0)
        new_videos = max(videos_count - vk_group.videos_count, 0)
        new_photos = max(photos_count - vk_group.photos_count, 0)
        new_topics = max(board_count - vk_group.board_count, 0)

        # NOTE(review): the counters are only assigned on the instance; no
        # vk_group.save() call is visible in this method - confirm that they
        # are persisted elsewhere.
        vk_group.wall_count = wall_count
        vk_group.photos_count = photos_count
        vk_group.videos_count = videos_count
        vk_group.board_count = board_count

        logging.info("New Wall Posts Count = %s", wall_count)
        logging.info("New Photos Count = %s", photos_count)
        logging.info("New Videos Count = %s", videos_count)
        logging.info("New Board Topics Count = %s", board_count)

        snapshot = snapshoter.make_for_wall(size=vk_group.wall_max + new_posts)
        snapshot.extend(snapshoter.make_for_videos(size=vk_group.video_max + new_videos))
        snapshot.extend(snapshoter.make_for_photos(size=vk_group.photo_max + new_photos))
        snapshot.extend(snapshoter.make_for_board(size=vk_group.board_max + new_topics))

        logging.info("Crawling finished")
        logging.info("%s items are at snapshot", len(snapshot))

        logging.info("Creating tasks for updating")

        tasks = self.create_update_task(snapshot)

        logging.info("Created tasks for updating")
        # sum() over a generator instead of len(filter(...)), which fails on
        # Python 3 where filter() returns an iterator.
        logging.info("%s items are to be added",
                     sum(1 for task in tasks if task.action == 'add'))
        logging.info("%s items are to be updated",
                     sum(1 for task in tasks if task.action == 'update'))

        # Expected amount of new activity: size of every [first, last] range.
        new_comments, new_likes, new_shares = 0, 0, 0
        for task in tasks:
            if task.comments_change:
                new_comments += task.comments_change[1] - task.comments_change[0] + 1
            if task.likes_change:
                new_likes += task.likes_change[1] - task.likes_change[0] + 1
            if task.shares_change:
                new_shares += task.shares_change[1] - task.shares_change[0] + 1

        logging.info("New comments: %s", new_comments)
        logging.info("New likes: %s", new_likes)
        logging.info("New shares: %s", new_shares)

        logging.info("Start fetching updates")

        tasks = self.fetch_updates(tasks)

        logging.info("Completed fetching updates")

        # Amount of activity that was actually fetched.
        new_comments, new_likes, new_shares = 0, 0, 0
        for task in tasks:
            if task.comments_change:
                new_comments += len(task.comments)
            if task.likes_change:
                new_likes += len(task.likes)
            if task.shares_change:
                new_shares += len(task.shares)

        logging.info("New comments are fetched: %s", new_comments)
        logging.info("New likes are fetched: %s", new_likes)
        logging.info("New shares are fetched: %s", new_shares)

        logging.info("Start fixing updates")

        self.fix_updates(tasks)

        logging.info("Completed fixing updates")