def test_error_processing(self):
        """Check that a request failed by a user-unrelated error is issued again.

        Requests for users 1 and 2 are acknowledged as successful, while the
        request for user 3 is reported through ``user_unrelated_error``.
        """
        requester = self._create()
        requester.add_users([User(1), User(2), User(3)])

        succeeded_ids = (1, 2)
        for request in requester.get_requests():
            if request.user.id not in succeeded_ids:
                requester.user_unrelated_error(request)
                continue
            requester.request_succeed(request.user, request.req_type)

        friends_request_users, groups_request_users = [], []
        for request in requester.get_requests():
            is_friends = isinstance(request, FriendsRequest)
            bucket = friends_request_users if is_friends else groups_request_users
            bucket.append(request.user.id)

        # Users 1 and 2 succeeded, so they advance to the second (groups)
        # request; the request for user 3 failed, so its friends request has
        # to show up again.
        for user_id in succeeded_ids:
            self.assertIn(user_id, groups_request_users)
        self.assertIn(3, friends_request_users)
    def test_memory_usage(self):
        """Add many non-fully parsed users and check that heap growth stays bounded.

        Every user gets only groups (odd index) or only friends (even index),
        so no user in this test ends up with both kinds of data saved.
        """
        data_manager = RAMDataManager(MockLongTermSaver(),
                                      dmp_long_term_every=10**8)
        memory_checker = guppy.hpy()
        # baseline heap size; growth is measured relative to this value
        memory_usage_before = memory_checker.heap().size
        # per-user byte budget used to build the upper bound asserted below
        bytes_per_user = 20
        nb_friends = 10
        nb_groups = 10
        # budget of 4 bytes per saved friend / group entry
        bytes_for_friends = nb_friends * 4
        bytes_for_groups = nb_groups * 4

        nb_unpaired_users = 10**3
        for not_fully_parsed_idx in range(nb_unpaired_users):
            user = User(not_fully_parsed_idx)

            # odd indices get groups only, even indices get friends only
            if not_fully_parsed_idx % 2:
                data_manager.save_user_groups(user,
                                              self._create_groups(nb_groups))
            else:
                data_manager.save_user_friends(
                    user, self._create_friends(nb_friends))

        memory_used = memory_checker.heap().size - memory_usage_before

        # NOTE: the bound budgets both friends and groups for every user,
        # although each user above stores only one of the two.
        self.assertLessEqual(
            memory_used,
            nb_unpaired_users *
            (bytes_per_user + bytes_for_friends + bytes_for_groups))
    def test_dumps_only_fully_parsed_users(self):
        """Save friends and groups for some users and inspect the long term save.

        The save handed to the long term saver must contain the expected
        number of users, each with both "friends" and "groups" set.
        """
        save_inspected = False
        expected_users_nb = 100
        friends_to_save = self._create_friends()
        groups_to_save = self._create_groups()

        def _inspect_save(save: UsersData):
            # Records that the saver was invoked and validates what it got.
            nonlocal save_inspected
            save_inspected = True

            self.assertEqual(len(save), expected_users_nb)
            for _user_id, saved_data in save.items():
                for field in ("friends", "groups"):
                    self.assertIsNotNone(saved_data[field])

        data_manager = RAMDataManager(
            long_term_saver=MockLongTermSaver(_inspect_save),
            dmp_long_term_every=expected_users_nb)
        for user in map(User, range(expected_users_nb)):
            data_manager.save_user_friends(user, friends_to_save)
            data_manager.save_user_groups(user, groups_to_save)

        # the inspection callback must have run at least once
        self.assertTrue(save_inspected)
    def test_low_number_of_not_fully_parsed_users(self):
        """Run many add/get/succeed loops and bound the non-fully-parsed users.

        A non-fully-parsed user has had only one request made so far - only
        groups or only friends. After every loop their number must not exceed
        the limit passed to ``self._create``.
        """
        groups_done, friends_done = set(), set()
        all_users = [User(usr_id) for usr_id in range(10 ** 5)]
        batch_size = 10 ** 4

        max_requests = 500
        requester = self._create(max_requests)

        # feed the users to the requester batch by batch
        for offset in range(0, len(all_users), batch_size):
            requester.add_users(all_users[offset:offset + batch_size])

            for request in requester.get_requests():
                requester.request_succeed(request.user, request.req_type)
                done = friends_done if request.req_type == "friends" else groups_done
                done.add(request.user)

            # users present in exactly one of the two sets
            unpaired_users_nb = len(groups_done ^ friends_done)
            self.assertLessEqual(unpaired_users_nb, max_requests)
Пример #5
0
    def __call__(self, users: List[User]) -> List[User]:
        """Return users whose ids are not in ``self.already_added`` yet.

        Duplicates inside ``users`` are collapsed by id, ids found in
        ``self.already_added`` are dropped, and the remaining ids are handed
        to ``self._add`` as a numpy array.

        Args:
            users: candidate users; may be empty.

        Returns:
            Newly built ``User`` objects, one per id that was not found in
            ``self.already_added``. The order follows set iteration order,
            not the order of ``users``.
        """
        if not users:
            # np.array([]) defaults to float64; do not hand such an array to
            # self._add when there is nothing to register.
            return []

        uniq_user_ids = list({user.id for user in users})

        users_arr = np.array(uniq_user_ids)
        isin_mask = np.isin(users_arr, self.already_added)
        # the same mask drives both the returned users and the ids to add
        uniq_users = [User(user_id)
                      for user_id, seen in zip(uniq_user_ids, isin_mask)
                      if not seen]
        self._add(users_arr[~isin_mask])

        return uniq_users
    def test_functional_adding_and_getting_users(self):
        """Imitate real operation with a few add_users, get_requests and
        request_succeed rounds."""
        requester = self._create()
        old_users = [User(1), User(2), User(3)]
        requester.add_users(old_users)

        # First round: only friends requests are expected. Each one is marked
        # as successful so that groups requests for the same users come next.
        for request in requester.get_requests():
            self.assertIsInstance(request, FriendsRequest)
            requester.request_succeed(request.user, req_type=request.req_type)

        # New users are added, but the groups requests of the old users must
        # come before the friends requests of the new ones.
        new_users = [User(4), User(5), User(6)]
        requester.add_users(new_users)

        requests = requester.get_requests()
        split_at = len(old_users)
        expectations = (
            (requests[:split_at], GroupsRequest, old_users),
            (requests[split_at:], FriendsRequest, new_users),
        )
        for batch, request_cls, owners in expectations:
            for request in batch:
                self.assertIsInstance(request, request_cls)
                self.assertIn(request.user, owners)
    def test_checkpointing(self):
        """Round-trip a checkpoint through JSON and load it back into the data manager."""
        manager = RAMDataManager(MockLongTermSaver(), dmp_long_term_every=10**8)
        user = User(id=1)
        manager.save_user_groups(user, groups=[])

        # serialise and parse the checkpoint before handing it back
        restored = json.loads(json.dumps(manager.get_checkpoint()))
        manager.load_checkpoint(restored)

        # friends arrive after the reload; the user must now count as fully parsed
        manager.save_user_friends(user, friends=[])
        self.assertEqual(manager.cnt_fully_parsed, 1)
    def test_deletes_fully_parsed_from_mem(self):
        """Save many fully parsed users and check that the heap does not keep growing.

        Every user gets both groups and friends saved. The heap growth relative
        to the baseline is sampled periodically and compared with a fixed limit.
        """
        max_parsed_users_in_ram = 100
        groups_to_save = self._create_groups()
        friends_to_save = self._create_friends()

        data_manager = RAMDataManager(
            MockLongTermSaver(), dmp_long_term_every=max_parsed_users_in_ram)

        memory_checker = guppy.hpy()
        # baseline heap size taken after the data manager has been created
        memory_usage_before = memory_checker.heap().size

        for fully_parsed_idx in range(max_parsed_users_in_ram * 100):
            user = User(fully_parsed_idx)
            data_manager.save_user_groups(user, groups_to_save)
            data_manager.save_user_friends(user, friends_to_save)

            # sample the heap on every (max_parsed_users_in_ram * 10)-th user,
            # index 0 included
            if fully_parsed_idx % (max_parsed_users_in_ram * 10) == 0:
                memory_used = memory_checker.heap().size - memory_usage_before
                # 1000 bytes is some basic cost without any data
                self.assertLessEqual(memory_used, 1000)
    def test_counts_fully_parsed(self):
        """Check the value of cnt_fully_parsed as friends and groups get saved."""
        data_manager = RAMDataManager(None, dmp_long_term_every=100)

        user1, user2, user3, user4 = map(User, range(4))
        friends = self._create_friends()
        groups = self._create_groups()

        # first pass: every user receives exactly one kind of data
        for user in (user1, user2):
            data_manager.save_user_friends(user, friends)
        data_manager.save_user_groups(user3, groups)
        data_manager.save_user_friends(user4, friends)

        # user1 now has both friends and groups
        data_manager.save_user_groups(user1, groups)
        self.assertEqual(data_manager.cnt_fully_parsed, 1)

        # groups are saved again for user3 (it still has no friends), then
        # user2 receives its groups
        for user in (user3, user2):
            data_manager.save_user_groups(user, groups)
        self.assertEqual(data_manager.cnt_fully_parsed, 2)
 def _create_friends(self, nb=2):
     """Build ``nb`` users with consecutive ids starting at a random offset."""
     first_id = random.randint(10**6, 10**7)
     friend_ids = range(first_id, first_id + nb)
     return [User(friend_id) for friend_id in friend_ids]
Пример #11
0
 def parse(self, items) -> List[User]:
     """Wrap every element of ``items`` into a ``User`` built with that id."""
     return [User(id=raw_id) for raw_id in items]
Пример #12
0
    def __init__(self,
                 start_user_id: int,
                 proxy_storage: ProxyStorage,
                 creds_storage: CredsStorage,
                 long_term_save_pth: str,
                 data_backup_path: str,
                 logs_pth: str = "../logs.txt",
                 tracker=None,
                 requester_max_requests_per_loop=10000,
                 tracker_response_freq=500,
                 access_resource_reload_hours=1,
                 use_async=True,
                 nb_sessions=1,
                 dmp_long_term_steps=2000):
        """Build and wire together all collaborators of the crawler.

        Args:
            start_user_id: id of the user placed into ``self.candidates`` as
                the only initial candidate.
            proxy_storage: storage handed to ``ProxyManager``.
            creds_storage: storage handed to ``CredsManager``.
            long_term_save_pth: first argument of ``DataLongTermSaver``.
            data_backup_path: second argument of ``DataLongTermSaver``.
            logs_pth: ``log_pth`` of the default ``TerminalEventsTracker``;
                only used when ``tracker`` is None.
            tracker: events tracker shared by most collaborators; a
                ``TerminalEventsTracker`` is created when None.
            requester_max_requests_per_loop: passed to ``EconomicRequester``
                as ``max_requests_per_call``.
            tracker_response_freq: ``report_every_responses_nb`` of the
                default tracker; only used when ``tracker`` is None.
            access_resource_reload_hours: ``hours_for_resource_reload`` for
                both the proxy manager and the creds manager.
            use_async: selects an async executor with ``AioVkResponsesFactory``
                when true, otherwise ``VkApiPoolExecutor`` with
                ``VkApiResponsesFactory``.
            nb_sessions: only consulted when ``use_async`` is true; 1 selects
                ``AsyncVkApiPoolExecutor``, any other value selects
                ``MultiSessionAsyncVkApiPoolExecutor`` with that many sessions.
            dmp_long_term_steps: passed to ``RAMDataManager`` as
                ``dmp_long_term_every``.
        """

        if tracker is None:
            tracker = TerminalEventsTracker(
                log_pth=logs_pth,
                report_every_responses_nb=tracker_response_freq)
        self.tracker = tracker

        # NOTE: events_tracker refers to the very same object as self.tracker
        self.events_tracker = tracker
        CrawlRunner.__init__(self, tracker=tracker)

        requests_creator = VkApiRequestsCreator()

        # NOTE: checkpoint file locations are hard-coded relative paths
        friends_req_storage = RequestedUsersFileStorage(
            "./resources/checkpoints/dumped_friends_requests.txt")
        groups_req_storage = RequestedUsersFileStorage(
            "./resources/checkpoints/dumped_groups_requests.txt")
        users_filter = DuplicateUsersFilter()
        self.requester = EconomicRequester(
            requests_creator,
            friends_req_storage=friends_req_storage,
            groups_req_storage=groups_req_storage,
            users_filter=users_filter,
            max_requests_per_call=requester_max_requests_per_loop)

        errors_handler = VkApiErrorsHandler(tracker)

        # proxies and credentials share the same reload interval
        proxy_manager = ProxyManager(
            proxy_storage,
            tracker,
            hours_for_resource_reload=access_resource_reload_hours)
        creds_manager = CredsManager(
            creds_storage,
            tracker,
            hours_for_resource_reload=access_resource_reload_hours)

        tester = ResourceTester(errors_handler)
        self.session_manager = SessionManagerImpl(errors_handler,
                                                  proxy_manager, creds_manager,
                                                  tester)
        # pick the executor implementation from use_async / nb_sessions
        if use_async:
            responses_factory = AioVkResponsesFactory()
            if nb_sessions == 1:
                self.executor = AsyncVkApiPoolExecutor(self.session_manager,
                                                       responses_factory,
                                                       errors_handler)
            else:

                self.executor = MultiSessionAsyncVkApiPoolExecutor(
                    self.session_manager,
                    responses_factory,
                    errors_handler,
                    nb_sessions=nb_sessions)
        else:
            responses_factory = VkApiResponsesFactory()
            self.executor = VkApiPoolExecutor(self.session_manager,
                                              responses_factory)

        long_term_saver = DataLongTermSaver(long_term_save_pth,
                                            data_backup_path)
        self.data_manager = RAMDataManager(
            long_term_saver, dmp_long_term_every=dmp_long_term_steps)

        self.parsed_processor = ParsedProcessorWithHooks(
            self.data_manager, tracker, errors_handler=errors_handler)

        # the requester is registered as a listener of the success hook that
        # is attached to the parsed processor
        success_request_notifier_hook = HookSuccessParseNotifier()
        success_request_notifier_hook.register_request_success_listener(
            self.requester)

        self.parsed_processor.add_process_success_hook(
            success_request_notifier_hook)

        # the errors handler notifies the session manager about session errors
        # and the requester about user-unrelated errors
        errors_handler.register_session_error_listener(self.session_manager)
        errors_handler.register_user_unrelated_listener(self.requester)

        self.continue_crawling = True
        self.has_to_break_parsing = False
        # crawling starts from a single candidate user
        self.candidates = [User(id=start_user_id)]
 def _create_users(self, start=0, end=10 ** 3):
     """Build users with ids from ``int(start)`` up to, not including, ``int(end)``."""
     id_range = range(int(start), int(end))
     return [User(user_id) for user_id in id_range]
Пример #14
0
 def _create_users(self, ids: List[int]):
     """Build one ``User`` per id in ``ids``, keeping the given order."""
     users = []
     for user_id in ids:
         users.append(User(id=user_id))
     return users