Example #1
    def test_retrieves_in_batches(self):
        for result_set, batchsize in product(VALID_TEST_DATA, VALID_BATCH_SIZES):
            queryset_mock = create_queryset_mock(result_set)
            generator = queryset_iterator(queryset_mock, batchsize=batchsize)
            # group the expected rows into consecutive batches of batchsize
            for batch_number, batch in groupby(enumerate(result_set),
                                               lambda x: x[0] / batchsize):
                keys = [item[1]['pk'] for item in batch]
                try:
                    for _ in xrange(batchsize):
                        generator.next()
                except StopIteration:
                    break
                finally:
                    # one filter() call per batch, with exactly this batch's pks
                    queryset_mock.filter.calls |should| equal_to(batch_number + 1)
                    (queryset_mock.filter.last_kwargs['pk__in']
                        |should| equal_to(keys))
Example #2
    def test_gc_collect_at_end_of_batch(self):
        counter = Counter()
        # stub out gc.collect with a counter so batch-end calls can be asserted
        flexmock(gc, collect=counter.increment)
        for result_set, batchsize in product(VALID_TEST_DATA, VALID_BATCH_SIZES):
            counter.reset()
            queryset_mock = create_queryset_mock(result_set)
            generator = queryset_iterator(queryset_mock, batchsize=batchsize)
            i = 0

            while True:
                try:
                    generator.next()
                    i += 1
                except StopIteration:
                    call_count = i / batchsize
                    if i % batchsize:
                        call_count += 1

                    counter.count |should| equal_to(call_count)
                    break
                else:
                    counter.count |should| equal_to((i - 1) / batchsize)
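
The two tests above pin down the contract of `queryset_iterator`: Example #1 expects one `filter(pk__in=...)` call per batch with exactly that batch's primary keys, and Example #2 expects a `gc.collect()` at the end of every batch, including a short final one. A minimal sketch that would satisfy those expectations is shown below; the name and signature come from the examples, but the body is an assumption, not the code actually under test.

    import gc


    def queryset_iterator(queryset, batchsize=500):
        """Yield queryset rows in primary-key batches.

        Minimal sketch consistent with the tests in this listing; the
        default batchsize of 500 is an assumption.
        """
        if not isinstance(batchsize, int):
            raise TypeError('batchsize must be an integer')
        if batchsize <= 0:
            raise ValueError('batchsize must be greater than zero')
        pks = list(queryset.values_list('pk', flat=True).order_by('pk'))
        for start in range(0, len(pks), batchsize):
            # one filter() call per batch, as asserted in Example #1
            for row in queryset.filter(pk__in=pks[start:start + batchsize]):
                yield row
            # release memory at the end of each batch, as asserted in Example #2
            gc.collect()

Because this is a generator function, the type and value checks only run on first iteration, which is consistent with the `(list, generator) |should| throw(...)` assertions in Examples #5 and #6 below.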
Example #3
    def handle(self, *args, **options):
        twitter_user = None
        start_dt = None
        end_dt = None
        if options['twitter_user']:
            try:
                twitter_user = TwitterUser.objects.get(
                    name=options['twitter_user'])
            except TwitterUser.DoesNotExist:
                raise CommandError('TwitterUser %s does not exist' %
                                   options['twitter_user'])

        if options['start_date']:
            start_dt = make_date_aware(options['start_date'])
            if not start_dt:
                raise CommandError('dates must be in the format YYYY-MM-DD')
        else:
            start_dt = None
        if options['end_date']:
            end_dt = make_date_aware(options['end_date'])
            if not end_dt:
                raise CommandError('dates must be in the format YYYY-MM-DD')
        else:
            end_dt = None
        if start_dt and end_dt:
            if end_dt < start_dt:
                raise CommandError('start date must be earlier than end date')

        if twitter_user:
            qs = twitter_user.items.all()
        else:
            qs = TwitterUserItem.objects.all()

        if not options['refetch']:
            qs = qs.filter(urls__isnull=True)

        if start_dt:
            qs = qs.filter(date_published__gte=start_dt)
        if end_dt:
            qs = qs.filter(date_published__lte=end_dt)

        qs = queryset_iterator(qs)

        count = 0
        for tui in qs:
            urls = []
            urls.extend(tui.tweet['entities']['urls'])
            if 'media' in tui.tweet['entities'].keys():
                urls.extend(tui.tweet['entities']['media'])
            if not urls:
                # use of entities.urls was spotty at first
                for u in tui.links:
                    if ('...' in unicodedata.normalize('NFKD', u).encode(
                            'ascii', 'ignore')
                            and tui.tweet['retweet_count'] > 0):
                        continue
                    urls.append({'url': u, 'expanded_url': u})
            for url in urls:
                try:
                    r = requests.head(url['expanded_url'],
                                      allow_redirects=True,
                                      timeout=10)
                    if r.status_code == 405:
                        r = requests.get(url['expanded_url'],
                                         allow_redirects=True,
                                         stream=True,
                                         timeout=10)
                        r.close()
                    req_history_headers = []
                    for req in r.history:
                        req_headers = self.decode_headers(
                            req.headers, req.encoding)

                        req_history_headers.append(
                            (req.status_code, req.url, req_headers))

                    final_req_headers = self.decode_headers(
                        r.headers, r.encoding)

                    tuiu = TwitterUserItemUrl(
                        item=tui,
                        start_url=url['url'],
                        expanded_url=url['expanded_url'],
                        history=json.dumps(req_history_headers),
                        final_url=r.url,
                        final_status=r.status_code,
                        final_headers=json.dumps(final_req_headers),
                        duration_seconds=r.elapsed.total_seconds())
                    tuiu.save()
                except (requests.RequestException) as e:
                    # TODO: consider trapping/recording
                    # requests.exceptions.ConnectionError,
                    # requests.exceptions.TooManyRedirects etc.
                    # and flagging records as having errored out
                    print("Request Exceptions Error fetching %s: %s" %
                          (url['expanded_url'].encode('utf-8'), e))
                except (requests.packages.urllib3.exceptions.HTTPError) as e:
                    print("HTTPError fetching %s: %s" %
                          (url['expanded_url'].encode('utf-8'), e))
                except (socket_error) as e:
                    print("Socket error fetching %s: %s" %
                          (url['expanded_url'].encode('utf-8'), e))

                    tuiu = TwitterUserItemUrl(item=tui,
                                              start_url=url['url'],
                                              expanded_url=url['expanded_url'],
                                              final_url=url['url'],
                                              final_status=410)
                    tuiu.save()

            if urls:
                count += 1
            if options['limit']:
                if count >= options['limit']:
                    sys.exit()
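
The command above leans on two helpers that are not part of the snippet: `make_date_aware`, which turns a `YYYY-MM-DD` option into a timezone-aware datetime (and returns something falsy on bad input), and `self.decode_headers`, which makes response headers JSON-serialisable. The sketches below are hypothetical reconstructions built only from how the helpers are called here, assuming Django's timezone utilities; `decode_headers` is a method on the command class in the snippet but is shown as a plain function for brevity, and the project's real helpers may differ.

    from datetime import datetime

    from django.utils import timezone


    def make_date_aware(date_string):
        # Hypothetical sketch: parse 'YYYY-MM-DD' and attach the current
        # timezone; return None so the command's 'if not start_dt' check
        # fires on malformed input.
        try:
            naive = datetime.strptime(date_string, '%Y-%m-%d')
        except (TypeError, ValueError):
            return None
        return timezone.make_aware(naive, timezone.get_current_timezone())


    def decode_headers(headers, encoding):
        # Hypothetical sketch: decode header names and values to unicode so
        # json.dumps() in the command above does not choke on byte strings.
        charset = encoding or 'utf-8'
        decoded = {}
        for key, value in headers.items():
            if isinstance(key, bytes):
                key = key.decode(charset, 'replace')
            if isinstance(value, bytes):
                value = value.decode(charset, 'replace')
            decoded[key] = value
        return decoded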
Example #4
    def test_return_values_correct(self):
        for valid_data in VALID_TEST_DATA:
            queryset_mock = create_queryset_mock(valid_data)
            generator = queryset_iterator(queryset_mock)
            for raw_value, queryset_value in zip(valid_data, generator):
                queryset_value |should| equal_to(raw_value)
Example #5
    def test_fails_on_bad_type_for_batch_size(self):
        for bad_type_arg in BAD_TYPE_TEST_DATA:
            queryset_mock = create_queryset_mock(VALID_RESULT_SET)
            generator = queryset_iterator(queryset_mock, batchsize=bad_type_arg)
            (list, generator) |should| throw(TypeError)
Example #6
    def test_fails_on_batch_size_lt_eq_to_zero(self):
        for leq_number in LT_EQ_ZERO_TEST_DATA:
            queryset_mock = create_queryset_mock(VALID_RESULT_SET)
            generator = queryset_iterator(queryset_mock, batchsize=leq_number)
            (list, generator) |should| throw(ValueError)