def test_retrieves_in_batches(self):
    """Verify queryset_iterator issues one filter() call per batch and
    passes exactly the expected primary keys for each batch.

    Runs every combination of test result set and batch size. Python 2
    code: relies on integer `/` division and `generator.next()`.
    """
    for result_set, batchsize in product(VALID_TEST_DATA, VALID_BATCH_SIZES):
        queryset_mock = create_queryset_mock(result_set)
        generator = queryset_iterator(queryset_mock, batchsize=batchsize)
        # Group the expected items into batches: index // batchsize gives
        # the batch number (Python 2 integer division via '/').
        for batch_number, batch in groupby(enumerate(result_set),
                                           lambda x: x[0] / batchsize):
            # The 'pk' values the iterator should request for this batch.
            keys = [item[1]['pk'] for item in batch]
            try:
                # Drain one full batch from the generator; the final batch
                # may be short, in which case StopIteration fires mid-loop.
                for _ in xrange(batchsize):
                    generator.next()
            except StopIteration:
                break
            finally:
                # Assertions run even for the partial last batch (the
                # filter call happens before the items are exhausted).
                queryset_mock.filter.calls |should| equal_to(batch_number + 1)
                (queryset_mock.filter.last_kwargs['pk__in']
                    |should| equal_to(keys))
def test_gc_collect_at_end_of_batch(self):
    """Verify gc.collect() is invoked exactly once per completed batch.

    gc.collect is mocked with a counting stub; the expected call count is
    derived from how many items have been consumed so far. Python 2 code:
    integer `/` division and `generator.next()`.
    """
    counter = Counter()
    # Replace gc.collect with the counter so each collection is recorded.
    flexmock(gc, collect=counter.increment)
    for result_set, batchsize in product(VALID_TEST_DATA, VALID_BATCH_SIZES):
        counter.reset()
        queryset_mock = create_queryset_mock(result_set)
        generator = queryset_iterator(queryset_mock, batchsize=batchsize)
        i = 0  # number of items successfully consumed
        while True:
            try:
                generator.next()
                i += 1
            except StopIteration:
                # At exhaustion: one collect per full batch, plus one for
                # a trailing partial batch if any items remain.
                call_count = i / batchsize
                if i % batchsize:
                    call_count += 1
                counter.count |should| equal_to(call_count)
                break
            else:
                # Mid-iteration: only fully completed batches before the
                # current item should have triggered a collect.
                counter.count |should| equal_to((i - 1) / batchsize)
def handle(self, *args, **options):
    """Fetch and record the redirect chain for URLs embedded in tweets.

    Filters TwitterUserItems by optional --twitter_user, --start_date,
    --end_date and --refetch options, resolves each tweet's URLs with
    HEAD (falling back to GET on 405), and saves a TwitterUserItemUrl
    row per URL with the full redirect history. Stops after --limit
    items (counting only items that had URLs).

    Raises CommandError on an unknown user, an unparsable date, or a
    start date after the end date.
    """
    twitter_user = None
    start_dt = None
    end_dt = None
    if options['twitter_user']:
        try:
            twitter_user = TwitterUser.objects.get(
                name=options['twitter_user'])
        except TwitterUser.DoesNotExist:
            raise CommandError('TwitterUser %s does not exist' %
                               options['twitter_user'])
    if options['start_date']:
        # make_date_aware returns a falsy value on parse failure.
        start_dt = make_date_aware(options['start_date'])
        if not start_dt:
            raise CommandError('dates must be in the format YYYY-MM-DD')
    else:
        start_dt = None
    if options['end_date']:
        end_dt = make_date_aware(options['end_date'])
        if not end_dt:
            raise CommandError('dates must be in the format YYYY-MM-DD')
    else:
        end_dt = None
    if start_dt and end_dt:
        if end_dt < start_dt:
            raise CommandError('start date must be earlier than end date')
    # Build the base queryset, then narrow it by the provided options.
    if twitter_user:
        qs = twitter_user.items.all()
    else:
        qs = TwitterUserItem.objects.all()
    if not options['refetch']:
        # Skip items that already have resolved urls unless refetching.
        qs = qs.filter(urls__isnull=True)
    if start_dt:
        qs = qs.filter(date_published__gte=start_dt)
    if end_dt:
        qs = qs.filter(date_published__lte=end_dt)
    # Iterate in batches to keep memory bounded on large result sets.
    qs = queryset_iterator(qs)
    count = 0
    for tui in qs:
        urls = []
        urls.extend(tui.tweet['entities']['urls'])
        if 'media' in tui.tweet['entities'].keys():
            urls.extend(tui.tweet['entities']['media'])
        if not urls:
            # use of entities.urls was spotty at first
            for u in tui.links:
                # Skip truncated ('...') links in retweeted items; the
                # original tweet should carry the full URL.
                if ('...'
                        in unicodedata.normalize('NFKD', u).encode(
                            'ascii', 'ignore')
                        and tui.tweet['retweet_count'] > 0):
                    continue
                urls.append({'url': u, 'expanded_url': u})
        for url in urls:
            try:
                r = requests.head(url['expanded_url'],
                                  allow_redirects=True, timeout=10)
                if r.status_code == 405:
                    # Some servers reject HEAD; retry with a streamed GET
                    # and close immediately so the body is never read.
                    r = requests.get(url['expanded_url'],
                                     allow_redirects=True, stream=True,
                                     timeout=10)
                    r.close()
                # Record (status, url, headers) for every redirect hop.
                req_history_headers = []
                for req in r.history:
                    req_headers = self.decode_headers(req.headers,
                                                      req.encoding)
                    req_history_headers.append((
                        req.status_code, req.url, req_headers))
                final_req_headers = self.decode_headers(r.headers,
                                                        r.encoding)
                tuiu = TwitterUserItemUrl(
                    item=tui,
                    start_url=url['url'],
                    expanded_url=url['expanded_url'],
                    history=json.dumps(req_history_headers),
                    final_url=r.url,
                    final_status=r.status_code,
                    final_headers=json.dumps(final_req_headers),
                    duration_seconds=r.elapsed.total_seconds())
                tuiu.save()
            except (requests.RequestException) as e:
                # TODO: consider trapping/recording
                # requests.exceptions.ConnectionError,
                # requests.exceptions.TooManyRedirects etc.
                # and flagging records as having errored out
                print("Request Exceptions Error fetching %s: %s" %
                      (url['expanded_url'].encode('utf-8'), e))
            except (requests.packages.urllib3.exceptions.HTTPError) as e:
                print("HTTPError fetching %s: %s" %
                      (url['expanded_url'].encode('utf-8'), e))
            except (socket_error) as e:
                # On a raw socket failure, record the URL as gone (410)
                # so it is not refetched endlessly.
                print("Socket error fetching %s: %s" %
                      (url['expanded_url'].encode('utf-8'), e))
                tuiu = TwitterUserItemUrl(
                    item=tui,
                    start_url=url['url'],
                    expanded_url=url['expanded_url'],
                    final_url=url['url'],
                    final_status=410)
                tuiu.save()
        if urls:
            # Only items that actually had URLs count toward --limit.
            count += 1
        if options['limit']:
            if count >= options['limit']:
                sys.exit()
def handle(self, *args, **options):
    """Fetch and record the redirect chain for URLs embedded in tweets.

    Filters TwitterUserItems by optional --twitter_user, --start_date,
    --end_date and --refetch options, resolves each tweet's URLs with
    HEAD (falling back to GET on 405), and saves a TwitterUserItemUrl
    row per URL with the full redirect history. Stops after --limit
    items (counting only items that had URLs).

    Raises CommandError on an unknown user, an unparsable date, or a
    start date after the end date.
    """
    twitter_user = None
    start_dt = None
    end_dt = None
    if options['twitter_user']:
        try:
            twitter_user = TwitterUser.objects.get(
                name=options['twitter_user'])
        except TwitterUser.DoesNotExist:
            raise CommandError('TwitterUser %s does not exist' %
                               options['twitter_user'])
    if options['start_date']:
        # make_date_aware returns a falsy value on parse failure.
        start_dt = make_date_aware(options['start_date'])
        if not start_dt:
            raise CommandError('dates must be in the format YYYY-MM-DD')
    else:
        start_dt = None
    if options['end_date']:
        end_dt = make_date_aware(options['end_date'])
        if not end_dt:
            raise CommandError('dates must be in the format YYYY-MM-DD')
    else:
        end_dt = None
    if start_dt and end_dt:
        if end_dt < start_dt:
            raise CommandError('start date must be earlier than end date')
    # Build the base queryset, then narrow it by the provided options.
    if twitter_user:
        qs = twitter_user.items.all()
    else:
        qs = TwitterUserItem.objects.all()
    if not options['refetch']:
        # Skip items that already have resolved urls unless refetching.
        qs = qs.filter(urls__isnull=True)
    if start_dt:
        qs = qs.filter(date_published__gte=start_dt)
    if end_dt:
        qs = qs.filter(date_published__lte=end_dt)
    # Iterate in batches to keep memory bounded on large result sets.
    qs = queryset_iterator(qs)
    count = 0
    for tui in qs:
        urls = []
        urls.extend(tui.tweet['entities']['urls'])
        if 'media' in tui.tweet['entities'].keys():
            urls.extend(tui.tweet['entities']['media'])
        if not urls:
            # use of entities.urls was spotty at first
            for u in tui.links:
                # Skip truncated ('...') links in retweeted items; the
                # original tweet should carry the full URL.
                if ('...'
                        in unicodedata.normalize('NFKD', u).encode(
                            'ascii', 'ignore')
                        and tui.tweet['retweet_count'] > 0):
                    continue
                urls.append({'url': u, 'expanded_url': u})
        for url in urls:
            try:
                r = requests.head(url['expanded_url'],
                                  allow_redirects=True, timeout=10)
                if r.status_code == 405:
                    # Some servers reject HEAD; retry with a streamed GET
                    # and close immediately so the body is never read.
                    r = requests.get(url['expanded_url'],
                                     allow_redirects=True, stream=True,
                                     timeout=10)
                    r.close()
                # Record (status, url, headers) for every redirect hop.
                req_history_headers = []
                for req in r.history:
                    req_headers = self.decode_headers(
                        req.headers, req.encoding)
                    req_history_headers.append(
                        (req.status_code, req.url, req_headers))
                final_req_headers = self.decode_headers(
                    r.headers, r.encoding)
                tuiu = TwitterUserItemUrl(
                    item=tui,
                    start_url=url['url'],
                    expanded_url=url['expanded_url'],
                    history=json.dumps(req_history_headers),
                    final_url=r.url,
                    final_status=r.status_code,
                    final_headers=json.dumps(final_req_headers),
                    duration_seconds=r.elapsed.total_seconds())
                tuiu.save()
            except (requests.RequestException) as e:
                # TODO: consider trapping/recording
                # requests.exceptions.ConnectionError,
                # requests.exceptions.TooManyRedirects etc.
                # and flagging records as having errored out
                print("Request Exceptions Error fetching %s: %s" %
                      (url['expanded_url'].encode('utf-8'), e))
            except (requests.packages.urllib3.exceptions.HTTPError) as e:
                print("HTTPError fetching %s: %s" %
                      (url['expanded_url'].encode('utf-8'), e))
            except (socket_error) as e:
                # On a raw socket failure, record the URL as gone (410)
                # so it is not refetched endlessly.
                print("Socket error fetching %s: %s" %
                      (url['expanded_url'].encode('utf-8'), e))
                tuiu = TwitterUserItemUrl(item=tui,
                                          start_url=url['url'],
                                          expanded_url=url['expanded_url'],
                                          final_url=url['url'],
                                          final_status=410)
                tuiu.save()
        if urls:
            # Only items that actually had URLs count toward --limit.
            count += 1
        if options['limit']:
            if count >= options['limit']:
                sys.exit()
def test_return_values_correct(self):
    """Every value yielded by the iterator must match the corresponding
    raw result, in order, for each valid data set."""
    for expected_results in VALID_TEST_DATA:
        mock_qs = create_queryset_mock(expected_results)
        iterator = queryset_iterator(mock_qs)
        for expected, actual in zip(expected_results, iterator):
            actual |should| equal_to(expected)
def test_fails_on_bad_type_for_batch_size(self):
    """Consuming the iterator must raise TypeError for every
    non-integer batchsize value."""
    for invalid_batchsize in BAD_TYPE_TEST_DATA:
        mock_qs = create_queryset_mock(VALID_RESULT_SET)
        iterator = queryset_iterator(mock_qs, batchsize=invalid_batchsize)
        (list, iterator) |should| throw(TypeError)
def test_fails_on_batch_size_lt_eq_to_zero(self):
    """Consuming the iterator must raise ValueError whenever the
    batchsize is zero or negative."""
    for nonpositive in LT_EQ_ZERO_TEST_DATA:
        mock_qs = create_queryset_mock(VALID_RESULT_SET)
        iterator = queryset_iterator(mock_qs, batchsize=nonpositive)
        (list, iterator) |should| throw(ValueError)