def test_search(self, mock_paging):
    """Check that ``_search`` yields one wrapped item per record of paged data.

    ``mock_paging`` is configured to return ``self._search_test_data``; it is
    presumably the patched paging helper (the patch decorator is not visible
    here -- TODO confirm).
    """
    kind = "TestKind"
    mock_paging.return_value = self._search_test_data
    expected_url = "https://test-domain.pushshift.io/reddit/{}/search".format(
        kind)

    api = PushshiftAPIMinimal(
        domain="test-domain",
        rate_limit_per_minute=self._rate_limit,
        detect_local_tz=False,
    )
    result_gen = api._search(kind)

    # Walk every record of every page, in order, as one flat stream
    expected_records = (
        record
        for page in self._search_test_data
        for record in page["data"]
    )
    for expected in expected_records:
        actual = next(result_gen)

        self.assertIn(kind, str(actual))
        self.assertEqual(expected["created_utc"], actual.created)
        self.assertDictEqual(expected, actual.d_)
        # Each field of the raw record is also exposed as an attribute
        for field, value in expected.items():
            self.assertEqual(value, getattr(actual, field))

    mock_paging.assert_called_once_with(expected_url, {})

    # Make sure everything is complete
    try:
        next(result_gen)
    except StopIteration:
        pass
    else:
        self.fail("Expected StopIteration")
def test_limited(self):
    """Check which single-argument payloads ``_limited`` flags as limited."""
    is_limited = PushshiftAPIMinimal._limited

    # Test all of the arguments listed at
    # https://pushshift.io/api-parameters/
    for name in self._pushshift_args:
        self.assertFalse(is_limited({name: True}))

    # Test the limited arguments
    for name in PushshiftAPIMinimal._limited_args:
        self.assertTrue(is_limited({name: True}))
def test_handle_paging_no_limit(self, mock_get):
    """Check ``_handle_paging`` keeps requesting pages when no limit is given.

    ``mock_get`` always returns the same five-record page; it is presumably
    the patched request helper (decorator not visible here -- TODO confirm).
    """
    expected_last_timestamp = 1530047819
    test_url = "example.com/route"

    rows = [
        (1530046703, "e1ccvn7", 1),
        (1530047319, "e1ccvn8", 2),
        (1530047619, "e1ccvn9", -3),
        (1530047719, "e1ccvna", 5),
        (expected_last_timestamp, "e1ccvnb", 8),
    ]
    page = {
        "data": [
            {"created_utc": created, "id": ident, "score": score}
            for created, ident, score in rows
        ]
    }
    mock_get.return_value = page

    api = PushshiftAPIMinimal(max_results_per_request=10,
                              rate_limit_per_minute=self._rate_limit)
    results = api._handle_paging(test_url, {})

    # Run the first call outside of the loop.
    # The call values will vary slightly after the first call.
    self.assertEqual(page, next(results))
    self.assertEqual(expected_last_timestamp, api._last_timestamp)
    mock_get.assert_called_once_with(test_url, {"limit": 10})

    # This could go on forever. We stop after 15 calls.
    for call_count in range(2, 15):
        self.assertEqual(page, next(results))
        self.assertEqual(call_count, mock_get.call_count)
        self.assertEqual(expected_last_timestamp, api._last_timestamp)
        mock_get.assert_called_with(
            test_url,
            {"limit": 10, "before": expected_last_timestamp},
        )
def test_search_stop_cond_batch(self, mock_paging):
    """Check ``_search`` in batch mode with a stop condition.

    Batches are requested with ``return_batch=True`` and a ``stop_condition``
    that triggers on items created after a fixed cutoff.  ``mock_paging``
    returns ``self._search_test_data`` (presumably the patched paging helper;
    decorator not visible here -- TODO confirm).
    """
    cutoff = 1530049619
    kind = "TestKind"
    mock_paging.return_value = self._search_test_data
    expected_url = "https://test-domain.pushshift.io/reddit/{}/search".format(
        kind)

    api = PushshiftAPIMinimal(
        domain="test-domain",
        rate_limit_per_minute=self._rate_limit,
        detect_local_tz=False,
    )
    result_gen = api._search(
        kind,
        stop_condition=lambda item: item.created > cutoff,
        return_batch=True,
    )

    for page in self._search_test_data:
        source_records = page["data"]
        # Transform our source data to match what we expect with the stop condition
        expected_batch = [
            record for record in source_records
            if record["created_utc"] <= cutoff
        ]
        actual_batch = next(result_gen)

        self.assertEqual(len(expected_batch), len(actual_batch))
        for position, expected in enumerate(expected_batch):
            actual = actual_batch[position]

            self.assertIn(kind, str(actual))
            self.assertEqual(expected["created_utc"], actual.created)
            self.assertDictEqual(expected, actual.d_)
            for field, value in expected.items():
                self.assertEqual(value, getattr(actual, field))

        # Indicates that we hit the stop condition
        if len(expected_batch) < len(source_records):
            break

    mock_paging.assert_called_once_with(expected_url, {})

    # Make sure everything is complete
    try:
        next(result_gen)
    except StopIteration:
        pass
    else:
        self.fail("Expected StopIteration")
def test_utc_offset_secs(self):
    """Check ``utc_offset_secs`` with and without local-timezone detection.

    With ``detect_local_tz=False`` the offset must be 0.  With detection
    enabled, the property is compared against the offset Python reports after
    switching the process timezone to each entry of ``pytz.common_timezones``.

    The ``TZ`` environment variable is changed while the test runs; it is put
    back (followed by ``time.tzset()``) in a ``finally`` clause so the last
    timezone does not leak into tests that run afterwards.
    """
    api = PushshiftAPIMinimal(detect_local_tz=False,
                              rate_limit_per_minute=self._rate_limit)
    self.assertEqual(0, api.utc_offset_secs)

    api = PushshiftAPIMinimal(detect_local_tz=True,
                              rate_limit_per_minute=self._rate_limit)

    original_tz = os.environ.get("TZ")
    try:
        for timezone in pytz.common_timezones:
            # Clear the stored offset so it is looked up again for this
            # timezone (presumably the property caches it -- TODO confirm).
            api._utc_offset_secs = None
            os.environ["TZ"] = timezone
            time.tzset()

            # NOTE(review): assuming ``dt`` is ``datetime.datetime``, the
            # naive ``utcnow()`` value is interpreted as local time by
            # ``astimezone()``; kept as-is to match the original expectation.
            expected_secs = (
                dt.utcnow().astimezone().utcoffset().total_seconds())
            actual_secs = api.utc_offset_secs
            self.assertEqual(expected_secs, actual_secs, msg=timezone)
    finally:
        # Restore the process timezone exactly as it was before the test
        if original_tz is None:
            os.environ.pop("TZ", None)
        else:
            os.environ["TZ"] = original_tz
        time.tzset()
def test_handle_paging_low_limit(self, mock_get):
    """Check ``_handle_paging`` stops after one page for a small limit.

    The requested limit (5) is below ``max_results_per_request`` (10), so a
    single request with that limit is expected and the generator must then be
    exhausted.  ``mock_get`` is presumably the patched request helper
    (decorator not visible here -- TODO confirm).
    """
    expected_last_timestamp = 1530047819
    test_url = "example.com/route"

    rows = [
        (1530046703, "e1ccvn7", 1),
        (1530047319, "e1ccvn8", 2),
        (1530047619, "e1ccvn9", -3),
        (1530047719, "e1ccvna", 5),
        (expected_last_timestamp, "e1ccvnb", 8),
    ]
    page = {
        "data": [
            {"created_utc": created, "id": ident, "score": score}
            for created, ident, score in rows
        ]
    }
    mock_get.return_value = page

    api = PushshiftAPIMinimal(max_results_per_request=10,
                              rate_limit_per_minute=self._rate_limit)
    results = api._handle_paging(test_url, {"limit": 5})

    self.assertEqual(page, next(results))
    self.assertEqual(expected_last_timestamp, api._last_timestamp)
    mock_get.assert_called_once_with(test_url, {"limit": 5})

    # No further pages may be produced once the limit is satisfied
    try:
        next(results)
    except StopIteration:
        pass
    else:
        self.fail("Expected StopIteration")
def test_epoch_utc_to_local(self):
    """Check ``_epoch_utc_to_local`` with and without timezone detection.

    With ``detect_local_tz=False`` a timestamp must come back unchanged.  With
    detection enabled, the result is compared against the timestamp minus the
    UTC offset Python reports after switching the process timezone to each
    entry of ``pytz.common_timezones``.

    The ``TZ`` environment variable is changed while the test runs; it is put
    back (followed by ``time.tzset()``) in a ``finally`` clause so the last
    timezone does not leak into tests that run afterwards.
    """
    timestamps = [
        1429981843,
        1519981843,
        1528981843,
        1529781843,
        1529881843,
        1529931843,
        1529981843,
    ]

    original_tz = os.environ.get("TZ")
    try:
        for timestamp in timestamps:
            api = PushshiftAPIMinimal(detect_local_tz=False,
                                      rate_limit_per_minute=self._rate_limit)
            self.assertEqual(timestamp, api._epoch_utc_to_local(timestamp))

            api = PushshiftAPIMinimal(detect_local_tz=True,
                                      rate_limit_per_minute=self._rate_limit)
            for timezone in pytz.common_timezones:
                # Clear the stored offset so it is looked up again for this
                # timezone (presumably it is cached -- TODO confirm).
                api._utc_offset_secs = None
                os.environ["TZ"] = timezone
                time.tzset()

                # NOTE(review): assuming ``dt`` is ``datetime.datetime``, the
                # naive ``utcnow()`` value is interpreted as local time by
                # ``astimezone()``; kept as-is to match the original
                # expectation.
                expected_secs = (
                    timestamp -
                    dt.utcnow().astimezone().utcoffset().total_seconds())
                actual_secs = api._epoch_utc_to_local(timestamp)
                self.assertEqual(
                    expected_secs,
                    actual_secs,
                    msg="{} @ {}".format(timezone, timestamp),
                )
    finally:
        # Restore the process timezone exactly as it was before the test
        if original_tz is None:
            os.environ.pop("TZ", None)
        else:
            os.environ["TZ"] = original_tz
        time.tzset()
def test_get(self, mock_get, mock_rate_limit):
    """Check ``_get`` on a 200 response: one request, one rate-limit call.

    ``mock_get`` and ``mock_rate_limit`` are presumably injected by patch
    decorators that are not visible here -- TODO confirm their targets.
    """
    retry_budget = 7
    body = "test_text"
    url = "example.com/route"

    api = PushshiftAPIMinimal(max_retries=retry_budget,
                              rate_limit_per_minute=self._rate_limit)

    response = MockResponse(status_code=200, text=json.dumps(body))
    mock_get.return_value = response

    self.assertEqual(body, api._get(url))

    # Ensure the correct count of retries triggered
    mock_get.assert_called_with(url, params={})
    self.assertEqual(1, mock_get.call_count)

    # Ensure the rate limit was applied
    self.assertEqual(1, mock_rate_limit.call_count)
    mock_rate_limit.assert_has_calls([mock.call(0)])

    response.raise_for_status.assert_called_once()
def test_wrap_thing(self):
    """Check ``_wrap_thing`` exposes a raw record via attributes and ``d_``."""
    kind = "TestKind"
    raw = {
        "created_utc": dt.utcnow().timestamp(),
        "some": 12,
        "arbitrary": True,
        "Set": "of random",
        "keys": "to",
        "test": 15.0,
    }

    api = PushshiftAPIMinimal(detect_local_tz=False,
                              rate_limit_per_minute=self._rate_limit)
    thing = api._wrap_thing(raw, kind)

    self.assertIn(kind, str(thing))
    self.assertEqual(raw["created_utc"], thing.created)
    self.assertDictEqual(raw, thing.d_)
    # Every key of the raw record must also be readable as an attribute
    for field, value in raw.items():
        self.assertEqual(value, getattr(thing, field))
def test_get_raise_for_status(self, mock_get, mock_rate_limit):
    """Check ``_get`` raises ``HTTPError`` for a set of failing status codes.

    For each code the test expects ``max_retries`` further requests and
    rate-limit calls, and exactly one ``raise_for_status`` call on the
    response.  ``mock_get`` and ``mock_rate_limit`` are presumably injected by
    patch decorators that are not visible here -- TODO confirm.
    """
    max_retries = 7
    body = "test_text"
    url = "example.com/route"
    failing_codes = [400, 401, 403, 404, 405, 500, 502, 503, 504]

    api = PushshiftAPIMinimal(max_retries=max_retries,
                              rate_limit_per_minute=self._rate_limit)

    # Test a subset of codes that should cause an outright failure
    for round_number, status_code in enumerate(failing_codes, start=1):
        response = MockResponse(status_code=status_code,
                                text=json.dumps(body))
        mock_get.return_value = response

        try:
            api._get(url)
        except HTTPError as exc:
            error_kind = "Server" if status_code >= 500 else "Client"
            self.assertIn(
                "{} {} Error".format(status_code, error_kind),
                str(exc),
            )
        else:
            self.fail("call failed to trigger expected exception")

        # Call counts accumulate across rounds because the mocks are shared
        expected_calls = max_retries * round_number

        # Ensure the correct count of retries triggered
        mock_get.assert_called_with(url, params={})
        self.assertEqual(expected_calls, mock_get.call_count)

        # Ensure the rate limit was applied
        self.assertEqual(expected_calls, mock_rate_limit.call_count)
        mock_rate_limit.assert_has_calls(
            [mock.call(attempt) for attempt in range(0, max_retries)])

        response.raise_for_status.assert_called_once()
def test_get_429(self, mock_get, mock_rate_limit):
    """Check ``_get`` against a response that always has status 429.

    The test expects ``max_retries`` requests and rate-limit calls, the decoded
    body as the return value, and no ``raise_for_status`` call.  ``mock_get``
    and ``mock_rate_limit`` are presumably injected by patch decorators that
    are not visible here -- TODO confirm.
    """
    max_retries = 7
    body = "test_text"
    url = "example.com/route"

    api = PushshiftAPIMinimal(max_retries=max_retries,
                              rate_limit_per_minute=self._rate_limit)

    response = MockResponse(status_code=429, text=json.dumps(body))
    mock_get.return_value = response

    self.assertEqual(body, api._get(url))

    # Ensure the correct count of retries triggered
    mock_get.assert_called_with(url, params={})
    self.assertEqual(max_retries, mock_get.call_count)

    # Ensure the rate limit was applied
    self.assertEqual(max_retries, mock_rate_limit.call_count)
    expected_rate_limit_calls = [
        mock.call(attempt) for attempt in range(0, max_retries)
    ]
    mock_rate_limit.assert_has_calls(expected_rate_limit_calls)

    # This is the key difference with code 429
    response.raise_for_status.assert_not_called()
def test_apply_timestamp(self):
    """Check how ``_apply_timestamp`` adds ``before``/``after`` to a payload."""
    api = PushshiftAPIMinimal(rate_limit_per_minute=self._rate_limit)

    # Without a stored timestamp the payload must come back unchanged
    api._last_timestamp = None
    self.assertDictEqual(
        {"rand_field": "rand_val"},
        api._apply_timestamp({"rand_field": "rand_val"}),
    )

    last_seen = 12307501
    api._last_timestamp = last_seen

    # (payload passed in, payload expected back)
    cases = [
        (
            {"rand_field": "rand_val"},
            {"rand_field": "rand_val", "before": last_seen},
        ),
        (
            {"rand_field": "rand_val", "sort": "desc"},
            {"rand_field": "rand_val", "sort": "desc", "before": last_seen},
        ),
        (
            {"rand_field": "rand_val", "sort": "asc"},
            {"rand_field": "rand_val", "sort": "asc", "after": last_seen},
        ),
    ]
    for payload, expected in cases:
        self.assertDictEqual(expected, api._apply_timestamp(payload))
def test_init(self):
    """Build the API from the shared init kwargs and run the base init checks."""
    self._test_base_init(PushshiftAPIMinimal(**self._base_init_kwargs))
def test_base_url(self):
    """Check ``base_url`` embeds the configured domain."""
    init_kwargs = {
        "domain": "test-domain",
        "rate_limit_per_minute": self._rate_limit,
    }
    api = PushshiftAPIMinimal(**init_kwargs)

    expected = "https://test-domain.pushshift.io/{endpoint}"
    self.assertEqual(expected, api.base_url)
def test_impose_rate_limit(self, mock_sleep):
    """Check the sleep duration ``_impose_rate_limit`` requests in four cases.

    ``mock_sleep`` is presumably the patched sleep function (decorator not
    visible here -- TODO confirm).
    """
    sleep_cap = 69
    backoff_step = 11
    rlcache = mock.NonCallableMock(blocked=False, interval=13)

    api = PushshiftAPIMinimal(max_sleep=sleep_cap,
                              backoff=backoff_step,
                              rate_limit_per_minute=self._rate_limit)
    api._rlcache = rlcache

    # Cache not blocked: no sleep expected
    api._impose_rate_limit()
    mock_sleep.assert_not_called()

    # Cache blocked: sleep for the cache interval
    rlcache.blocked = True
    api._impose_rate_limit()
    mock_sleep.assert_called_with(13)

    # Interval above max_sleep: sleep for max_sleep
    rlcache.interval = 87
    api._impose_rate_limit()
    mock_sleep.assert_called_with(sleep_cap)

    # Zero interval with an explicit argument of 6: sleep for 6 * backoff
    rlcache.interval = 0
    api._impose_rate_limit(6)
    mock_sleep.assert_called_with(6 * backoff_step)
def test_handle_paging_high_limit(self, mock_get):
    """Check ``_handle_paging`` splits a limit of 25 into three requests.

    With ``max_results_per_request=10`` the expected request limits are 10,
    10 and 5, and every request after the first carries the previous page's
    last timestamp as ``before``.  ``mock_get`` is presumably the patched
    request helper (decorator not visible here -- TODO confirm).
    """
    test_url = "example.com/route"

    def build_page(rows):
        # Expand (created_utc, id, score) tuples into a response payload
        return {
            "data": [
                {"created_utc": created, "id": ident, "score": score}
                for created, ident, score in rows
            ]
        }

    test_data = [
        build_page([
            (1530046703, "e1ccvn7", 1),
            (1530047319, "e1ccvn8", 2),
            (1530047619, "e1ccvn9", -3),
            (1530047719, "e1ccvna", 5),
            (1530047819, "e1ccvnb", 8),
        ]),
        build_page([
            (1530048703, "e1cdvn7", -1),
            (1530049319, "e1cdvn8", -2),
            (1530049619, "e1cdvn9", 3),
            (1530049719, "e1cdvna", -5),
            (1530049819, "e1cdvnb", -8),
        ]),
        build_page([
            (1530148703, "e1cdvn7", -1),
            (1530149319, "e1cdvn8", -2),
            (1530149619, "e1cdvn9", 3),
            (1530149719, "e1cdvna", -5),
            (1530149819, "e1cdvnb", -8),
        ]),
    ]
    mock_get.side_effect = test_data

    api = PushshiftAPIMinimal(max_results_per_request=10,
                              rate_limit_per_minute=self._rate_limit)
    results = api._handle_paging(test_url, {"limit": 25})

    # (payload expected in the request, last timestamp stored afterwards)
    expectations = [
        ({"limit": 10}, 1530047819),
        ({"limit": 10, "before": 1530047819}, 1530049819),
        ({"limit": 5, "before": 1530049819}, 1530149819),
    ]
    for call_number, (page, (payload, last_timestamp)) in enumerate(
            zip(test_data, expectations), start=1):
        self.assertEqual(page, next(results))
        self.assertEqual(last_timestamp, api._last_timestamp)
        self.assertEqual(call_number, mock_get.call_count)
        mock_get.assert_called_with(test_url, payload)

    # All 25 requested results were delivered; nothing more may follow
    try:
        next(results)
    except StopIteration:
        pass
    else:
        self.fail("Expected StopIteration")
def test_raise_for_unpageable(self):
    """Check which payloads ``_raise_for_unpageable`` accepts and rejects.

    Rejected payloads must raise ``NotImplementedError`` whose message holds
    the general paging error text plus a detail that depends on whether the
    payload carried a ``limit``.
    """
    max_results_per_request = 10

    valid_payloads = [
        {},
        {"sort_type": "created_utc"},
        {"sort_type": "created_utc", "sort": "desc"},
        {"sort_type": "created_utc", "sort": "asc"},
        {"sort_type": "score", "sort": "desc", "limit": 2},
        {"sort_type": "num_comments", "sort": "asc", "limit": 5},
        {"sort_type": "whatever", "sort": "desc", "limit": 8},
        {"sort_type": "seriously_whatever", "sort": "desc", "limit": 10},
    ]
    invalid_payloads = [
        {"sort_type": "score"},
        {"sort_type": "num_comments", "sort": "desc"},
        {"sort_type": "whatever", "sort": "asc"},
        {"sort_type": "score", "sort": "desc", "limit": 11},
        {"sort_type": "num_comments", "sort": "asc", "limit": 15},
        {"sort_type": "whatever", "sort": "desc", "limit": 18},
        {"sort_type": "seriously_whatever", "sort": "desc", "limit": 110},
    ]

    api = PushshiftAPIMinimal(
        max_results_per_request=max_results_per_request,
        rate_limit_per_minute=self._rate_limit,
    )

    # Everything should page fine
    for payload in valid_payloads:
        api._raise_for_unpageable(payload)

    for payload in invalid_payloads:
        try:
            api._raise_for_unpageable(payload)
        except NotImplementedError as exc:
            msg = str(exc)

            # General error
            self.assertIn(PushshiftAPIMinimal._page_error_msg, msg)

            # Error specifics
            detail = (
                "queries require limit <= max_results_per_request"
                if "limit" in payload
                else "must provide a limit"
            )
            self.assertIn(detail, msg)
        else:
            self.fail("Expected exception failed to trigger")
def test_init_none_rate_limit(self, mock_get):
    """Check the rate limit is taken from the server when none is passed.

    ``mock_get`` reports ``server_ratelimit_per_minute``; it is presumably the
    patched request helper (decorator not visible here -- TODO confirm).
    """
    server_limit = 420
    mock_get.return_value = {"server_ratelimit_per_minute": server_limit}

    api = PushshiftAPIMinimal(rate_limit_per_minute=None)

    self.assertEqual(server_limit, api._rlcache.max_storage)
def test_add_nec_args(self):
    """Check how ``_add_nec_args`` fills in ``limit`` and ``filter``."""
    max_results_per_request = 127
    api = PushshiftAPIMinimal(
        max_results_per_request=max_results_per_request,
        rate_limit_per_minute=self._rate_limit,
    )

    # Ensure limited calls aren't altered
    limited_payload = dict.fromkeys(PushshiftAPIMinimal._limited_args, True)
    self.assertDictEqual(
        limited_payload,
        # Pass a copy so the expected dict is a separate object
        api._add_nec_args(dict(limited_payload)),
    )

    # (comment, payload passed in, payload expected back)
    cases = [
        (
            "Ensure limit is added as expected",
            {"arbitrary": "value"},
            {"arbitrary": "value", "limit": max_results_per_request},
        ),
        (
            "Ensure created_utc is appended to filter",
            {"more_arbitrary": "more_value", "filter": []},
            {
                "more_arbitrary": "more_value",
                "limit": max_results_per_request,
                "filter": ["created_utc"],
            },
        ),
        (
            "Ensure string filter turned to list",
            {"more_arbitrary": "more_value", "filter": "some_string"},
            {
                "more_arbitrary": "more_value",
                "limit": max_results_per_request,
                "filter": ["some_string", "created_utc"],
            },
        ),
        (
            "Ensure iterable-but-not-list filter turned to list",
            {"more_arbitrary": "more_value", "filter": set(range(3))},
            {
                "more_arbitrary": "more_value",
                "limit": max_results_per_request,
                "filter": [0, 1, 2, "created_utc"],
            },
        ),
        (
            'Ensure "created_utc" string filter turned to list',
            {"more_arbitrary": "more_value", "filter": "created_utc"},
            {
                "more_arbitrary": "more_value",
                "limit": max_results_per_request,
                "filter": ["created_utc"],
            },
        ),
    ]
    for _comment, payload, expected in cases:
        self.assertDictEqual(expected, api._add_nec_args(payload))