def handle(self, *args, **options):
    # FIXME: why use options.get again and again?
    twitter_user = None
    user_set = None
    start_dt = None
    end_dt = None
    if options.get('twitter_user', False):
        try:
            twitter_user = TwitterUser.objects.get(
                name=options.get('twitter_user'))
        except TwitterUser.DoesNotExist:
            raise CommandError('TwitterUser %s does not exist' %
                               options.get('twitter_user'))
    elif options.get('set_name', False):
        try:
            user_set = TwitterUserSet.objects.get(
                name=options.get('set_name'))
        except TwitterUserSet.DoesNotExist:
            raise CommandError('TwitterUserSet %s does not exist' %
                               options.get('set_name'))
    else:
        raise CommandError('please specify a twitter user or set name')
    if options.get('start_date', False):
        start_dt = make_date_aware(options.get('start_date'))
        if not start_dt:
            raise CommandError('dates must be in the format YYYY-MM-DD')
    if options.get('end_date', False):
        end_dt = make_date_aware(options.get('end_date'))
        if not end_dt:
            raise CommandError('dates must be in the format YYYY-MM-DD')
    if start_dt and end_dt:
        if end_dt < start_dt:
            raise CommandError('start date must be earlier than end date')
    if twitter_user:
        qs = twitter_user.items.all()
    elif user_set:
        qs = TwitterUserItem.objects.filter(
            twitter_user__sets__in=[user_set])
    if start_dt:
        qs = qs.filter(date_published__gte=start_dt)
    if end_dt:
        qs = qs.filter(date_published__lte=end_dt)
    # tweak for python 2.7 to avoid having to set PYTHONIOENCODING=utf8
    # in environment, see Graham Fawcett's comment/suggestion at:
    # nedbatchelder.com/blog/200401/printing_unicode_from_python.html
    writer_class = codecs.getwriter('utf-8')
    sys.stdout = writer_class(sys.stdout, 'replace')
    for tui in qs:
        print '\t'.join(tui.csv)
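
# NOTE: make_date_aware is imported from elsewhere in this codebase and is
# not shown in this excerpt. The sketch below is an assumption about its
# behavior, inferred from the callers above: parse a YYYY-MM-DD string and
# return a timezone-aware datetime, or a falsy value on bad input.
import datetime

from django.utils import timezone


def make_date_aware(date_str):
    try:
        naive = datetime.datetime.strptime(date_str, '%Y-%m-%d')
    except (TypeError, ValueError):
        # callers treat a falsy return as "bad date format"
        return None
    return timezone.make_aware(naive, timezone.get_default_timezone())
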
def handle(self, *args, **options):
    twitter_user = user_set = start_dt = end_dt = xls = filename = None
    if options['filename']:
        filename = options.get('filename')
    xls = options['xls']
    if xls and filename is None:
        raise CommandError('When --xls is specified, '
                           '--filename=FILENAME is required')
    if not xls and filename is not None:
        raise CommandError('Writing CSV files is not yet supported; '
                           'recommend piping output to a file')
    if options['twitter_user']:
        try:
            twitter_user = TwitterUser.objects.get(
                name__iexact=options.get('twitter_user'))
            qs = twitter_user.items.all()
        except TwitterUser.DoesNotExist:
            raise CommandError('TwitterUser %s does not exist' %
                               options.get('twitter_user'))
    elif options['set_name']:
        try:
            user_set = TwitterUserSet.objects.get(
                name=options.get('set_name'))
            qs = TwitterUserItem.objects.filter(
                twitter_user__sets__in=[user_set])
        except TwitterUserSet.DoesNotExist:
            raise CommandError('TwitterUserSet %s does not exist' %
                               options['set_name'])
    else:
        raise CommandError('please provide either a twitter user '
                           'or a set name')
    if options['start_date']:
        start_dt = make_date_aware(options.get('start_date'))
        if not start_dt:
            raise CommandError('dates must be in the format YYYY-MM-DD')
        qs = qs.filter(date_published__gte=start_dt)
    if options['end_date']:
        end_dt = make_date_aware(options.get('end_date'))
        if not end_dt:
            raise CommandError('dates must be in the format YYYY-MM-DD')
        qs = qs.filter(date_published__lte=end_dt)
    if start_dt and end_dt:
        if end_dt < start_dt:
            raise CommandError('start date must be earlier than end date')
    # tweak for python 2.7 to avoid having to set PYTHONIOENCODING=utf8
    # in environment, see Graham Fawcett's comment/suggestion at:
    # nedbatchelder.com/blog/200401/printing_unicode_from_python.html
    writer_class = codecs.getwriter('utf-8')
    sys.stdout = writer_class(sys.stdout, 'replace')
    if xls:
        tworkbook = xls_tweets_workbook(qs, TwitterUserItem.csv_headers)
        tworkbook.save(filename)
    else:
        for tui in qs:
            print '\t'.join(tui.csv)
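
# NOTE: xls_tweets_workbook is defined elsewhere in this codebase; this is
# a hypothetical sketch of what it is assumed to do, using xlwt (the usual
# .xls writer for Python 2 projects): one header row, then one row per
# tweet, mirroring the tab-separated output of the print branch above.
import xlwt


def xls_tweets_workbook(qs, headers):
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet('tweets')
    # header row
    for col, header in enumerate(headers):
        sheet.write(0, col, header)
    # one row per tweet, columns aligned with the headers
    for row, tui in enumerate(qs, start=1):
        for col, value in enumerate(tui.csv):
            sheet.write(row, col, value)
    return workbook
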
def handle(self, *args, **options):
    # FIXME: why use options.get again and again?
    twitter_user = None
    user_set = None
    start_dt = None
    end_dt = None
    if options.get('twitter_user', False):
        try:
            twitter_user = TwitterUser.objects.get(
                name=options.get('twitter_user'))
        except TwitterUser.DoesNotExist:
            raise CommandError('TwitterUser %s does not exist' %
                               options.get('twitter_user'))
    elif options.get('set_name', False):
        try:
            user_set = TwitterUserSet.objects.get(
                name=options.get('set_name'))
        except TwitterUserSet.DoesNotExist:
            raise CommandError('TwitterUserSet %s does not exist' %
                               options.get('set_name'))
    else:
        raise CommandError('please specify a twitter user or set name')
    if options.get('start_date', False):
        start_dt = make_date_aware(options.get('start_date'))
        if not start_dt:
            raise CommandError('dates must be in the format YYYY-MM-DD')
    if options.get('end_date', False):
        end_dt = make_date_aware(options.get('end_date'))
        if not end_dt:
            raise CommandError('dates must be in the format YYYY-MM-DD')
    if start_dt and end_dt:
        if end_dt < start_dt:
            raise CommandError('start date must be earlier than end date')
    if twitter_user:
        qs = twitter_user.items.all()
    elif user_set:
        qs = TwitterUserItem.objects.filter(
            twitter_user__sets__in=[user_set])
    if start_dt:
        qs = qs.filter(date_published__gte=start_dt)
    if end_dt:
        qs = qs.filter(date_published__lte=end_dt)
    for tui in qs:
        print '\t'.join(tui.csv)
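
# For context: the option keys read by handle() above would be declared on
# the Command class roughly like this in optparse-era Django. The dest
# names match the keys used above; help text and defaults are assumptions.
from optparse import make_option

from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = 'Exports tweets for a twitter user or user set as delimited text'
    option_list = BaseCommand.option_list + (
        make_option('--twitter-user', dest='twitter_user', default=None,
                    help='name of twitter user to export'),
        make_option('--set-name', dest='set_name', default=None,
                    help='name of twitter user set to export'),
        make_option('--start-date', dest='start_date', default=None,
                    help='earliest date (YYYY-MM-DD) to include'),
        make_option('--end-date', dest='end_date', default=None,
                    help='latest date (YYYY-MM-DD) to include'),
    )
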
def handle(self, *args, **options):
    twitter_user = user_set = start_dt = end_dt = filename = None
    fmt = options['format'].lower()
    if fmt not in ['csv', 'json', 'xls']:
        raise CommandError('format must be csv, json, or xls')
    if options['filename']:
        filename = options.get('filename')
    if fmt == 'xls' and filename is None:
        raise CommandError('When --format is xls, '
                           '--filename=FILENAME is required')
    if options['twitter_user']:
        try:
            twitter_user = TwitterUser.objects.get(
                name__iexact=options.get('twitter_user'))
            qs = twitter_user.items.all()
        except TwitterUser.DoesNotExist:
            raise CommandError('TwitterUser %s does not exist' %
                               options.get('twitter_user'))
    elif options['set_name']:
        try:
            user_set = TwitterUserSet.objects.get(
                name=options.get('set_name'))
            qs = TwitterUserItem.objects.filter(
                twitter_user__sets__in=[user_set])
        except TwitterUserSet.DoesNotExist:
            raise CommandError('TwitterUserSet %s does not exist' %
                               options['set_name'])
    else:
        raise CommandError('please provide either a twitter user '
                           'or a set name')
    if options['start_date']:
        start_dt = make_date_aware(options.get('start_date'))
        if not start_dt:
            raise CommandError('dates must be in the format YYYY-MM-DD')
        qs = qs.filter(date_published__gte=start_dt)
    if options['end_date']:
        end_dt = make_date_aware(options.get('end_date'))
        if not end_dt:
            raise CommandError('dates must be in the format YYYY-MM-DD')
        qs = qs.filter(date_published__lte=end_dt)
    if start_dt and end_dt:
        if end_dt < start_dt:
            raise CommandError('start date must be earlier than end date')
    # tweak for python 2.7 to avoid having to set PYTHONIOENCODING=utf8
    # in environment, see Graham Fawcett's comment/suggestion at:
    # nedbatchelder.com/blog/200401/printing_unicode_from_python.html
    # when --filename is given, redirect stdout to that file instead
    if filename:
        sys.stdout = codecs.open(filename, 'w', 'utf-8')
    else:
        writer_class = codecs.getwriter('utf-8')
        sys.stdout = writer_class(sys.stdout, 'replace')
    if fmt == 'xls':
        tworkbook = xls_tweets_workbook(qs, TwitterUserItem.csv_headers)
        tworkbook.save(filename)
    elif fmt == 'json':
        for tui in qs:
            print tui.item_json
    else:
        print '\t'.join(TwitterUserItem.csv_headers)
        for tui in qs:
            print '\t'.join(tui.csv)
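
# Example of driving the exporter from code rather than the shell;
# call_command is standard Django, but the command name 'export_tweets'
# is an assumption -- substitute the actual module name of this command.
from django.core.management import call_command

call_command('export_tweets',
             twitter_user='someuser',
             format='json',
             start_date='2013-01-01',
             end_date='2013-12-31')
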
def handle(self, *args, **options):
    twitter_user = None
    start_dt = None
    end_dt = None
    if options['twitter_user']:
        try:
            twitter_user = TwitterUser.objects.get(
                name=options['twitter_user'])
        except TwitterUser.DoesNotExist:
            raise CommandError('TwitterUser %s does not exist' %
                               options['twitter_user'])
    if options['start_date']:
        start_dt = make_date_aware(options['start_date'])
        if not start_dt:
            raise CommandError('dates must be in the format YYYY-MM-DD')
    if options['end_date']:
        end_dt = make_date_aware(options['end_date'])
        if not end_dt:
            raise CommandError('dates must be in the format YYYY-MM-DD')
    if start_dt and end_dt:
        if end_dt < start_dt:
            raise CommandError('start date must be earlier than end date')
    if twitter_user:
        qs = twitter_user.items.all()
    else:
        qs = TwitterUserItem.objects.all()
    if not options['refetch']:
        # skip items whose urls have already been checked
        qs = qs.filter(urls__isnull=True)
    if start_dt:
        qs = qs.filter(date_published__gte=start_dt)
    if end_dt:
        qs = qs.filter(date_published__lte=end_dt)
    qs = queryset_iterator(qs)
    count = 0
    for tui in qs:
        urls = []
        urls.extend(tui.tweet['entities']['urls'])
        if 'media' in tui.tweet['entities']:
            urls.extend(tui.tweet['entities']['media'])
        if not urls:
            # use of entities.urls was spotty at first
            for u in tui.links:
                # skip truncated ('...') urls in retweets; they can't be
                # fetched as-is
                if ('...' in unicodedata.normalize('NFKD', u).encode(
                        'ascii', 'ignore')
                        and tui.tweet['retweet_count'] > 0):
                    continue
                urls.append({'url': u, 'expanded_url': u})
        for url in urls:
            try:
                r = requests.head(url['expanded_url'],
                                  allow_redirects=True, timeout=10)
                if r.status_code == 405:
                    # server refuses HEAD; retry with a streamed GET
                    r = requests.get(url['expanded_url'],
                                     allow_redirects=True, stream=True,
                                     timeout=10)
                    r.close()
                req_history_headers = []
                for req in r.history:
                    req_headers = self.decode_headers(req.headers,
                                                      req.encoding)
                    req_history_headers.append(
                        (req.status_code, req.url, req_headers))
                final_req_headers = self.decode_headers(r.headers,
                                                        r.encoding)
                tuiu = TwitterUserItemUrl(
                    item=tui,
                    start_url=url['url'],
                    expanded_url=url['expanded_url'],
                    history=json.dumps(req_history_headers),
                    final_url=r.url,
                    final_status=r.status_code,
                    final_headers=json.dumps(final_req_headers),
                    duration_seconds=r.elapsed.total_seconds())
                tuiu.save()
            except requests.RequestException as e:
                # TODO: consider trapping/recording
                # requests.exceptions.ConnectionError,
                # requests.exceptions.TooManyRedirects, etc.
                # and flagging records as having errored out
                print('RequestException fetching %s: %s' %
                      (url['expanded_url'].encode('utf-8'), e))
            except requests.packages.urllib3.exceptions.HTTPError as e:
                print('HTTPError fetching %s: %s' %
                      (url['expanded_url'].encode('utf-8'), e))
            except socket_error as e:
                print('Socket error fetching %s: %s' %
                      (url['expanded_url'].encode('utf-8'), e))
                # record a tombstone so this url is not retried endlessly
                tuiu = TwitterUserItemUrl(
                    item=tui,
                    start_url=url['url'],
                    expanded_url=url['expanded_url'],
                    final_url=url['url'],
                    final_status=410)
                tuiu.save()
        if urls:
            count += 1
        if options['limit']:
            if count >= options['limit']:
                sys.exit()
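
# NOTE: queryset_iterator is imported from elsewhere in this codebase.
# The sketch below is the widely used chunk-by-primary-key recipe it is
# assumed to follow, which keeps Django from caching the entire result
# set in memory; the chunksize default and the gc call are assumptions.
import gc


def queryset_iterator(queryset, chunksize=1000):
    pk = 0
    # assumes a non-empty queryset with an integer pk
    last_pk = queryset.order_by('-pk')[0].pk
    queryset = queryset.order_by('pk')
    while pk < last_pk:
        for row in queryset.filter(pk__gt=pk)[:chunksize]:
            pk = row.pk
            yield row
        # drop references to the chunk just consumed
        gc.collect()
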
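
# NOTE: decode_headers is a method on this Command class (called via
# self.decode_headers above) and is not shown in this excerpt. This is a
# hypothetical sketch of its assumed job: coerce raw header values to
# unicode so that json.dumps on the history/header dicts cannot fail.
def decode_headers(headers, encoding):
    encoding = encoding or 'utf-8'
    decoded = {}
    for key, value in headers.items():
        if isinstance(value, bytes):
            value = value.decode(encoding, 'replace')
        decoded[key] = value
    return decoded
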
def handle(self, *args, **options):
    twitter_user = None
    start_dt = None
    end_dt = None
    if options['twitter_user']:
        try:
            twitter_user = TwitterUser.objects.get(
                name=options['twitter_user'])
        except TwitterUser.DoesNotExist:
            raise CommandError('TwitterUser %s does not exist' %
                               options['twitter_user'])
    if options['start_date']:
        start_dt = make_date_aware(options['start_date'])
        if not start_dt:
            raise CommandError('dates must be in the format YYYY-MM-DD')
    if options['end_date']:
        end_dt = make_date_aware(options['end_date'])
        if not end_dt:
            raise CommandError('dates must be in the format YYYY-MM-DD')
    if start_dt and end_dt:
        if end_dt < start_dt:
            raise CommandError('start date must be earlier than end date')
    if twitter_user:
        qs = twitter_user.items.all()
    else:
        qs = TwitterUserItem.objects.all()
    if start_dt:
        qs = qs.filter(date_published__gte=start_dt)
    if end_dt:
        qs = qs.filter(date_published__lte=end_dt)
    # be sure we move through the list in a consistent order
    qs = qs.order_by('date_published')
    # reuse one session so connections are pooled across requests
    session = requests.Session()
    count = 0
    for tui in qs:
        urls = []
        urls.extend(tui.tweet['entities']['urls'])
        if not urls:
            # use of entities.urls was spotty at first
            for u in tui.links:
                urls.append({'url': u, 'expanded_url': u})
        for url in urls:
            # use filter because 0-to-many might already exist
            qs_tuiu = TwitterUserItemUrl.objects.filter(
                item=tui, start_url=url['url'],
                expanded_url=url['expanded_url'])
            # if any already exist, and we're not refetching, move on
            if qs_tuiu.count() > 0 and not options['refetch']:
                continue
            # otherwise, create a new one from scratch
            try:
                r = session.get(url['url'], allow_redirects=True,
                                stream=False)
                r.close()
            except Exception:
                # TODO: consider trapping/recording
                # requests.exceptions.ConnectionError,
                # requests.exceptions.TooManyRedirects, etc.
                # and flagging records as having errored out
                tuiu = TwitterUserItemUrl(
                    item=tui,
                    start_url=url['url'],
                    expanded_url=url['url'],
                    final_url=url['url'],
                    final_status=410)
                tuiu.save()
                continue
            tuiu = TwitterUserItemUrl(
                item=tui,
                start_url=url['url'],
                expanded_url=url['expanded_url'],
                history=json.dumps([(req.status_code, req.url,
                                     dict(req.headers))
                                    for req in r.history]),
                final_url=r.url,
                final_status=r.status_code,
                final_headers=json.dumps(dict(r.headers)),
                duration_seconds=r.elapsed.total_seconds())
            tuiu.save()
            count += 1
            if options['limit']:
                if count >= options['limit']:
                    sys.exit()
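
# Standalone illustration of the redirect-history capture pattern used
# above: requests records each intermediate response in r.history, and
# wrapping headers in dict() converts requests' CaseInsensitiveDict into
# something json.dumps can serialize. The URL here is just an example.
import json

import requests

r = requests.get('http://example.com/', allow_redirects=True)
history = [(req.status_code, req.url, dict(req.headers))
           for req in r.history]
print(json.dumps(history))
print('%s %s %.3fs' % (r.url, r.status_code, r.elapsed.total_seconds()))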