def test_error_processing(self):
    """A request that failed with a user-unrelated error is issued again later."""
    requester = self._create()
    requester.add_users([User(1), User(2), User(3)])

    # First round: users 1 and 2 succeed, the request for user 3 is reported
    # back to the requester as failed for reasons unrelated to the user.
    succeeded_ids = (1, 2)
    for request in requester.get_requests():
        if request.user.id not in succeeded_ids:
            requester.user_unrelated_error(request)
            continue
        requester.request_succeed(request.user, request.req_type)

    # Second round: sort the returned requests by their kind.
    friends_request_users, groups_request_users = [], []
    for request in requester.get_requests():
        if not isinstance(request, FriendsRequest):
            groups_request_users.append(request.user.id)
        else:
            friends_request_users.append(request.user.id)

    # Users 1 and 2 were marked as successful, so they move on to the second
    # (groups) request. The request for user 3 failed, so its friends request
    # is tried again.
    for moved_on_id in succeeded_ids:
        self.assertIn(moved_on_id, groups_request_users)
    self.assertIn(3, friends_request_users)
def test_memory_usage(self):
    """Store many half-parsed users and check the heap growth stays in budget.

    Every user gets only one kind of data (groups for odd indices, friends
    for even ones), so none of them becomes fully parsed.
    """
    data_manager = RAMDataManager(MockLongTermSaver(),
                                  dmp_long_term_every=10**8)
    memory_checker = guppy.hpy()
    memory_usage_before = memory_checker.heap().size

    # Per-user memory budget: a fixed overhead plus 4 bytes per stored id.
    bytes_per_user = 20
    nb_friends = 10
    nb_groups = 10
    bytes_for_friends = nb_friends * 4
    bytes_for_groups = nb_groups * 4
    nb_unpaired_users = 10**3

    for not_fully_parsed_idx in range(nb_unpaired_users):
        user = User(not_fully_parsed_idx)
        if not_fully_parsed_idx % 2 == 0:
            data_manager.save_user_friends(
                user, self._create_friends(nb_friends))
        else:
            data_manager.save_user_groups(
                user, self._create_groups(nb_groups))

    memory_used = memory_checker.heap().size - memory_usage_before
    allowed_bytes = nb_unpaired_users * (
        bytes_per_user + bytes_for_friends + bytes_for_groups)
    self.assertLessEqual(memory_used, allowed_bytes)
def test_dumps_only_fully_parsed_users(self):
    """Check that a long-term save holds only users with friends and groups.

    Saves both kinds of data for ``nb_users_saved`` users and inspects what
    the data manager hands to the long-term saver.
    """
    nb_users_saved = 100
    inspect_was_called = False
    friends_to_save = self._create_friends()
    groups_to_save = self._create_groups()

    def _inspect_save(save: UsersData):
        # Callback given to the mock saver; records that it ran and validates
        # every entry of the dumped data.
        nonlocal inspect_was_called
        inspect_was_called = True
        self.assertEqual(len(save), nb_users_saved)
        for _user_id, saved_data in save.items():
            self.assertIsNotNone(saved_data["friends"])
            self.assertIsNotNone(saved_data["groups"])

    data_manager = RAMDataManager(
        long_term_saver=MockLongTermSaver(_inspect_save),
        dmp_long_term_every=nb_users_saved)

    for user in map(User, range(nb_users_saved)):
        data_manager.save_user_friends(user, friends_to_save)
        data_manager.save_user_groups(user, groups_to_save)

    self.assertTrue(inspect_was_called)
def test_low_number_of_not_fully_parsed_users(self):
    """Check the requester keeps the number of half-requested users bounded.

    Users are fed to the requester batch by batch. After each batch the
    number of users with only one request made (only groups or only friends)
    must not exceed the ``max_requests_per_call`` value the requester was
    created with.
    """
    add_users_step = 10 ** 4
    max_requests = 500
    users_friends_req_made = set()
    users_groups_req_made = set()
    users_to_request = [User(usr_id) for usr_id in range(10 ** 5)]
    requester = self._create(max_requests)

    for batch_start in range(0, len(users_to_request), add_users_step):
        batch_end = batch_start + add_users_step
        requester.add_users(users_to_request[batch_start:batch_end])

        for request in requester.get_requests():
            requester.request_succeed(request.user, request.req_type)
            if request.req_type != "friends":
                users_groups_req_made.add(request.user)
            else:
                users_friends_req_made.add(request.user)

        # Users present in exactly one of the two sets are half-requested.
        unpaired_users_nb = len(
            users_groups_req_made ^ users_friends_req_made)
        self.assertLessEqual(unpaired_users_nb, max_requests)
def __call__(self, users: List[User]) -> List[User]:
    """Return the users that were not seen before and remember their ids.

    Duplicates inside ``users`` are collapsed by id, ids already present in
    ``self.already_added`` are dropped, and the remaining ids are handed to
    ``self._add`` before the matching ``User`` objects are returned.

    Args:
        users: candidate users; only their ``id`` attribute is read.

    Returns:
        Newly built ``User`` objects, one per id that was not in
        ``self.already_added``. The order follows set iteration order of the
        ids and is therefore not guaranteed to match the input order.
    """
    uniq_user_ids = list({user.id for user in users})
    if not uniq_user_ids:
        # Nothing to filter. Returning early also avoids building
        # np.array([]), which defaults to float64 and would be passed to
        # self._add as an empty float array.
        return []

    users_arr = np.array(uniq_user_ids)
    # True for every id that has already been added earlier.
    isin_mask = np.isin(users_arr, self.already_added)

    # isin_mask is aligned with uniq_user_ids, so zip pairs each id with its
    # "already seen" flag.
    uniq_users = [
        User(user_id)
        for user_id, already_seen in zip(uniq_user_ids, isin_mask)
        if not already_seen
    ]
    self._add(users_arr[~isin_mask])
    return uniq_users
def test_functional_adding_and_getting_users(self):
    """Run a few add_users / get_requests / request_succeed rounds.

    Imitates real operation: requests for users added earlier must come
    before requests for users added later.
    """
    requester = self._create()
    some_users = [User(1), User(2), User(3)]
    requester.add_users(some_users)

    # The first round must consist of friends requests only. Once they are
    # marked as successful, the groups requests for the same users come first
    # in the next round.
    for request in requester.get_requests():
        self.assertIsInstance(request, FriendsRequest)
        requester.request_succeed(request.user, req_type=request.req_type)

    # New users are added now, but their requests have to be placed after
    # the pending groups requests of some_users.
    new_users = [User(4), User(5), User(6)]
    requester.add_users(new_users)
    requests = requester.get_requests()

    nb_old_users = len(some_users)
    expectations = (
        (requests[:nb_old_users], GroupsRequest, some_users),
        (requests[nb_old_users:], FriendsRequest, new_users),
    )
    for batch, expected_type, expected_users in expectations:
        for request in batch:
            self.assertIsInstance(request, expected_type)
            self.assertIn(request.user, expected_users)
def test_checkpointing(self):
    """Round-trip a checkpoint through JSON and load it back.

    After the checkpoint is restored, saving the missing friends data for the
    user must make that user count as fully parsed.
    """
    data_manager = RAMDataManager(MockLongTermSaver(),
                                  dmp_long_term_every=10**8)
    the_user = User(id=1)
    data_manager.save_user_groups(the_user, groups=[])

    # Serialise and deserialise to make sure the checkpoint survives JSON.
    serialized_checkp = json.dumps(data_manager.get_checkpoint())
    data_manager.load_checkpoint(json.loads(serialized_checkp))

    data_manager.save_user_friends(the_user, friends=[])
    self.assertEqual(data_manager.cnt_fully_parsed, 1)
def test_deletes_fully_parsed_from_mem(self):
    """Check that heap usage stays flat while fully parsed users are saved.

    Both groups and friends are saved for every user, and the heap growth
    since the start of the test is checked periodically.
    """
    max_parsed_users_in_ram = 100
    groups_to_save = self._create_groups()
    friends_to_save = self._create_friends()
    data_manager = RAMDataManager(
        MockLongTermSaver(),
        dmp_long_term_every=max_parsed_users_in_ram)
    memory_checker = guppy.hpy()
    memory_usage_before = memory_checker.heap().size

    for fully_parsed_idx in range(max_parsed_users_in_ram * 100):
        user = User(fully_parsed_idx)
        data_manager.save_user_groups(user, groups_to_save)
        data_manager.save_user_friends(user, friends_to_save)

        # Heap inspection is only done on every (limit * 10)-th user.
        if fully_parsed_idx % (max_parsed_users_in_ram * 10) != 0:
            continue
        memory_used = memory_checker.heap().size - memory_usage_before
        # 1000 bytes is some basic cost without any data
        self.assertLessEqual(memory_used, 1000)
def test_counts_fully_parsed(self):
    """Check that cnt_fully_parsed grows only when a user has both data kinds."""
    data_manager = RAMDataManager(None, dmp_long_term_every=100)
    user1, user2, user3, user4 = (User(usr_id) for usr_id in range(4))
    friends_to_save = self._create_friends()
    groups_to_save = self._create_groups()

    # Every user receives exactly one kind of data first.
    for user in (user1, user2):
        data_manager.save_user_friends(user, friends_to_save)
    data_manager.save_user_groups(user3, groups_to_save)
    data_manager.save_user_friends(user4, friends_to_save)

    # user1 now has friends and groups -> first fully parsed user.
    data_manager.save_user_groups(user1, groups_to_save)
    self.assertEqual(data_manager.cnt_fully_parsed, 1)

    # Saving groups for user3 a second time must not count; user2 becomes
    # the second fully parsed user.
    for user in (user3, user2):
        data_manager.save_user_groups(user, groups_to_save)
    self.assertEqual(data_manager.cnt_fully_parsed, 2)
def _create_friends(self, nb=2):
    """Build ``nb`` users with consecutive ids starting at a random offset.

    The first id is drawn with ``random.randint(10**6, 10**7)``.
    """
    first_id = random.randint(10**6, 10**7)
    friend_ids = range(first_id, first_id + nb)
    return list(map(User, friend_ids))
def parse(self, items) -> List[User]:
    """Wrap every element of ``items`` into a ``User`` built with ``id=element``."""
    return [User(id=friend_id) for friend_id in items]
def __init__(self, start_user_id: int, proxy_storage: ProxyStorage,
             creds_storage: CredsStorage, long_term_save_pth: str,
             data_backup_path: str, logs_pth: str = "../logs.txt",
             tracker=None, requester_max_requests_per_loop=10000,
             tracker_response_freq=500, access_resource_reload_hours=1,
             use_async=True, nb_sessions=1, dmp_long_term_steps=2000):
    """Wire together every component the crawler uses.

    Builds the requester, the session manager, the executor, the data
    manager and the parsed-results processor, and connects their listeners.

    Args:
        start_user_id: id of the user placed into ``self.candidates`` as the
            single initial candidate.
        proxy_storage: storage handed to ``ProxyManager``.
        creds_storage: storage handed to ``CredsManager``.
        long_term_save_pth: first argument of ``DataLongTermSaver``.
        data_backup_path: second argument of ``DataLongTermSaver``.
        logs_pth: ``log_pth`` of the default ``TerminalEventsTracker``; only
            used when ``tracker`` is None.
        tracker: events tracker shared by all components; a
            ``TerminalEventsTracker`` is created when None.
        requester_max_requests_per_loop: passed to ``EconomicRequester`` as
            ``max_requests_per_call``.
        tracker_response_freq: ``report_every_responses_nb`` of the default
            tracker; only used when ``tracker`` is None.
        access_resource_reload_hours: ``hours_for_resource_reload`` for both
            the proxy manager and the creds manager.
        use_async: selects an async executor when true, otherwise
            ``VkApiPoolExecutor``.
        nb_sessions: with ``use_async``, 1 selects ``AsyncVkApiPoolExecutor``
            and any other value selects
            ``MultiSessionAsyncVkApiPoolExecutor``; not read when
            ``use_async`` is false.
        dmp_long_term_steps: passed to ``RAMDataManager`` as
            ``dmp_long_term_every``.
    """
    if tracker is None:
        tracker = TerminalEventsTracker(
            log_pth=logs_pth,
            report_every_responses_nb=tracker_response_freq)
    # The same tracker object is stored under two attribute names.
    self.tracker = tracker
    self.events_tracker = tracker
    # NOTE(review): explicit initialiser call; presumably CrawlRunner is the
    # base class of this class - confirm against the class header.
    CrawlRunner.__init__(self, tracker=tracker)

    # --- requester: decides which requests to make for which users ---
    requests_creator = VkApiRequestsCreator()
    # Checkpoint file paths are hard-coded relative paths.
    friends_req_storage = RequestedUsersFileStorage(
        "./resources/checkpoints/dumped_friends_requests.txt")
    groups_req_storage = RequestedUsersFileStorage(
        "./resources/checkpoints/dumped_groups_requests.txt")
    users_filter = DuplicateUsersFilter()
    self.requester = EconomicRequester(
        requests_creator,
        friends_req_storage=friends_req_storage,
        groups_req_storage=groups_req_storage,
        users_filter=users_filter,
        max_requests_per_call=requester_max_requests_per_loop)

    # --- access resources (proxies, credentials) and sessions ---
    errors_handler = VkApiErrorsHandler(tracker)
    proxy_manager = ProxyManager(
        proxy_storage, tracker,
        hours_for_resource_reload=access_resource_reload_hours)
    creds_manager = CredsManager(
        creds_storage, tracker,
        hours_for_resource_reload=access_resource_reload_hours)
    tester = ResourceTester(errors_handler)
    self.session_manager = SessionManagerImpl(errors_handler, proxy_manager,
                                              creds_manager, tester)

    # --- executor: async (single or multi session) or synchronous ---
    if use_async:
        responses_factory = AioVkResponsesFactory()
        if nb_sessions == 1:
            self.executor = AsyncVkApiPoolExecutor(self.session_manager,
                                                   responses_factory,
                                                   errors_handler)
        else:
            self.executor = MultiSessionAsyncVkApiPoolExecutor(
                self.session_manager, responses_factory, errors_handler,
                nb_sessions=nb_sessions)
    else:
        # The synchronous executor is built without errors_handler, and
        # nb_sessions is not used on this path.
        responses_factory = VkApiResponsesFactory()
        self.executor = VkApiPoolExecutor(self.session_manager,
                                          responses_factory)

    # --- data storage and processing of parsed responses ---
    long_term_saver = DataLongTermSaver(long_term_save_pth, data_backup_path)
    self.data_manager = RAMDataManager(
        long_term_saver, dmp_long_term_every=dmp_long_term_steps)
    self.parsed_processor = ParsedProcessorWithHooks(
        self.data_manager, tracker, errors_handler=errors_handler)

    # The requester is registered as a request-success listener on a hook
    # that is attached to the parsed processor.
    success_request_notifier_hook = HookSuccessParseNotifier()
    success_request_notifier_hook.register_request_success_listener(
        self.requester)
    self.parsed_processor.add_process_success_hook(
        success_request_notifier_hook)

    # Session errors go to the session manager, user-unrelated errors to the
    # requester.
    errors_handler.register_session_error_listener(self.session_manager)
    errors_handler.register_user_unrelated_listener(self.requester)

    # Initial crawl-loop state.
    self.continue_crawling = True
    self.has_to_break_parsing = False
    self.candidates = [User(id=start_user_id)]
def _create_users(self, start=0, end=10 ** 3):
    """Build users with ids ``int(start)`` up to, but excluding, ``int(end)``.

    Both bounds are converted with ``int`` first, so values such as ``1e3``
    are accepted.
    """
    id_range = range(int(start), int(end))
    return list(map(User, id_range))
def _create_users(self, ids: List[int]):
    """Build one ``User`` per entry of ``ids``, keeping the given order."""
    created_users = []
    for one_id in ids:
        created_users.append(User(id=one_id))
    return created_users