def captions_action(namespace, output_file):

    # Handling output
    single_video = namespace.file is sys.stdin and sys.stdin.isatty()

    if single_video:
        edit_namespace_with_csv_io(namespace, 'video')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=YOUTUBE_CAPTIONS_CSV_HEADERS,
        keep=namespace.select
    )

    loading_bar = LoadingBar('Retrieving captions', unit='video')

    for row, video in enricher.cells(namespace.column, with_rows=True):
        result = get_video_captions(video, langs=namespace.lang)
        loading_bar.update()

        if result is None:
            continue

        track, lines = result

        prefix = [track.lang, '1' if track.generated else '']

        for line in lines:
            enricher.writerow(row, prefix + list(line))

    loading_bar.close()
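# A minimal, self-contained sketch (hypothetical column and header names, not
# part of minet itself) of the casanova enricher pattern shared by the CLI
# actions in this file: read the input CSV, iterate over a column's cells and
# write each input row back out with extra columns appended.
def _enricher_pattern_sketch(input_file, output_file):
    enricher = casanova.enricher(
        input_file,
        output_file,
        add=['result'],   # columns appended to each output row
        keep=None         # or a subset of input columns to keep
    )

    for row, value in enricher.cells('url', with_rows=True):
        # Compute whatever should be appended for this row
        enricher.writerow(row, [value.strip()])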
def comments_action(namespace, output_file):

    # Handling output
    single_video = namespace.file is sys.stdin and sys.stdin.isatty()

    if single_video:
        edit_namespace_with_csv_io(namespace, 'video')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=YOUTUBE_COMMENT_CSV_HEADERS,
        keep=namespace.select
    )

    loading_bar = LoadingBar('Retrieving comments', unit='comment', stats={'videos': 0})

    def before_sleep_until_midnight(seconds):
        loading_bar.print('API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        namespace.key,
        before_sleep_until_midnight=before_sleep_until_midnight
    )

    for row, video in enricher.cells(namespace.column, with_rows=True):
        generator = client.comments(video)

        for comment in generator:
            loading_bar.update()
            enricher.writerow(row, comment.as_csv_row())

        loading_bar.inc('videos')

    loading_bar.close()
def crowdtangle_summary_action(namespace, output_file):
    if not namespace.start_date:
        die('Missing --start-date!')

    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'url')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select.split(',') if namespace.select else None,
        add=CROWDTANGLE_SUMMARY_CSV_HEADERS
    )

    posts_writer = None

    if namespace.posts is not None:
        posts_writer = csv.writer(namespace.posts)
        posts_writer.writerow(CROWDTANGLE_POST_CSV_HEADERS_WITH_LINK)

    loading_bar = tqdm(
        desc='Collecting data',
        dynamic_ncols=True,
        total=namespace.total,
        unit=' urls'
    )

    client = CrowdTangleAPIClient(namespace.token, rate_limit=namespace.rate_limit)

    for row, url in enricher.cells(namespace.column, with_rows=True):
        url = url.strip()

        try:
            stats = client.summary(
                url,
                start_date=namespace.start_date,
                with_top_posts=namespace.posts is not None,
                sort_by=namespace.sort_by,
                format='csv_row',
                platforms=namespace.platforms
            )

        except CrowdTangleInvalidTokenError:
            die([
                'Your API token is invalid.',
                'Check that you indicated a valid one using the `--token` argument.'
            ])

        except Exception as err:
            raise err

        if namespace.posts is not None:
            stats, posts = stats

            if posts is not None:
                for post in posts:
                    posts_writer.writerow([url] + post)

        enricher.writerow(row, stats)

        loading_bar.update()
def facebook_comments_action(namespace):

    # Handling output
    output_file = open_output_file(namespace.output)

    # Handling input
    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'post_url')

    try:
        scraper = FacebookCommentScraper(namespace.cookie)
    except FacebookInvalidCookieError:
        if namespace.cookie in ['firefox', 'chrome']:
            die('Could not extract cookies from %s.' % namespace.cookie)

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to access Facebook post comments.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=FACEBOOK_COMMENT_CSV_HEADERS
    )

    # Loading bar
    loading_bar = tqdm(
        desc='Scraping comments',
        dynamic_ncols=True,
        unit=' comments'
    )

    for i, (row, url) in enumerate(enricher.cells(namespace.column, with_rows=True)):
        if not is_facebook_post_url(url):
            loading_bar.close()
            die('Given url (line %i) is not a Facebook post url: %s' % (i + 1, url))

        batches = scraper(url, per_call=True, detailed=True, format='csv_row')

        for details, batch in batches:
            for comment in batch:
                enricher.writerow(row, comment)

            loading_bar.update(len(batch))
            loading_bar.set_postfix(
                calls=details['calls'],
                replies=details['replies'],
                q=details['queue_size'],
                posts=i + 1
            )

    loading_bar.close()
def search_action(namespace, output_file):

    # Handling output
    output_file = open_output_file(namespace.output)

    edit_namespace_with_csv_io(namespace, 'keyword')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=CSV_HEADERS
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit='videos',
    )

    http = create_pool()
    error_file = DummyTqdmFile(sys.stderr)

    limit = namespace.limit

    for (row, keyword) in enricher.cells(namespace.column, with_rows=True):
        url = URL_template_accurate % {'subject': keyword, 'key': namespace.key}
        next_page = True

        while next_page:
            if next_page is True:
                err, response, result = request_json(http, url)
            else:
                url_next = url + '&pageToken=' + next_page
                err, response, result = request_json(http, url_next)

            if err:
                die(err)
            elif response.status == 403:
                error_file.write('Running out of API points. You will have to wait until midnight, Pacific time!')
                time.sleep(seconds_to_midnight_pacific_time())
                continue
            elif response.status >= 400:
                die(response.status)

            next_page, data_l = get_data(result)

            for data in data_l:
                if limit is not None:
                    if limit == 0:
                        return True
                    else:
                        limit -= 1
                        enricher.writerow(row, data)
                else:
                    enricher.writerow(row, data)
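# Illustrative only: the seconds_to_midnight_pacific_time helper used above is
# assumed to compute how long to sleep until the YouTube API quota resets at
# midnight Pacific time. One possible implementation (not necessarily the one
# this project uses), kept here as a hypothetical sketch:
def _seconds_to_pacific_midnight_sketch():
    from datetime import datetime, timedelta
    from zoneinfo import ZoneInfo  # Python 3.9+

    now = datetime.now(ZoneInfo('America/Los_Angeles'))
    next_midnight = (now + timedelta(days=1)).replace(hour=0, minute=0, second=0, microsecond=0)

    return (next_midnight - now).total_seconds()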
def facebook_url_likes_action(namespace):
    output_file = open_output_file(namespace.output)

    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'url')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=REPORT_HEADERS
    )

    if namespace.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.' % namespace.column
        ])

    loading_bar = tqdm(
        desc='Retrieving likes',
        dynamic_ncols=True,
        unit=' urls',
        total=namespace.total
    )

    http = create_pool()

    for row, url in enricher.cells(namespace.column, with_rows=True):
        loading_bar.update()

        url = url.strip()
        err, html = make_request(http, url)

        if err is not None:
            loading_bar.close()
            die('An error occurred while fetching like button for this url: %s' % url)

        scraped = scrape(html)

        if scraped is None:
            loading_bar.close()
            die('Could not extract Facebook likes from this url\'s like button: %s' % url)

        enricher.writerow(row, scraped)
def search_action(namespace, output_file):

    # Handling output
    single_query = namespace.file is sys.stdin and sys.stdin.isatty()

    if single_query:
        edit_namespace_with_csv_io(namespace, 'query')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=YOUTUBE_VIDEO_SNIPPET_CSV_HEADERS,
        keep=namespace.select
    )

    loading_bar = LoadingBar(
        'Searching videos',
        unit='video'
    )

    def before_sleep_until_midnight(seconds):
        loading_bar.print('API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        namespace.key,
        before_sleep_until_midnight=before_sleep_until_midnight
    )

    for row, query in enricher.cells(namespace.column, with_rows=True):
        loading_bar.print('Searching for "%s"' % query)

        searcher = client.search(query, order=namespace.order)

        if namespace.limit:
            searcher = islice(searcher, namespace.limit)

        for video in searcher:
            loading_bar.update()
            enricher.writerow(row, video.as_csv_row())

    loading_bar.close()
def facebook_comments_action(namespace):

    # Handling output
    output_file = open_output_file(namespace.output)

    # Handling input
    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'post_url')

    try:
        scraper = FacebookMobileScraper(namespace.cookie, throttle=namespace.throttle)
    except FacebookInvalidCookieError:
        if namespace.cookie in COOKIE_BROWSERS:
            die([
                'Could not extract relevant cookie from "%s".' % namespace.cookie
            ])

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to access Facebook post comments.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=FACEBOOK_COMMENT_CSV_HEADERS
    )

    # Loading bar
    loading_bar = tqdm(
        desc='Scraping comments',
        dynamic_ncols=True,
        unit=' comments'
    )

    for i, (row, url) in enumerate(enricher.cells(namespace.column, with_rows=True)):
        if not has_facebook_comments(url):
            tqdm.write(
                'Given url (line %i) probably cannot have Facebook comments: %s' % (i + 1, url),
                file=sys.stderr
            )
            continue

        batches = scraper.comments(url, per_call=True, detailed=True)

        for details, batch in batches:
            for comment in batch:
                enricher.writerow(row, comment.as_csv_row())

            loading_bar.update(len(batch))
            loading_bar.set_postfix(
                calls=details['calls'],
                replies=details['replies'],
                q=details['queue_size'],
                posts=i + 1
            )

    loading_bar.close()
def comments_action(namespace, output_file):

    # Handling output
    output_file = open_output_file(namespace.output)

    # Handling input
    if is_youtube_video_id(namespace.column):
        edit_namespace_with_csv_io(namespace, 'video_id')
    elif is_youtube_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'video_url')

    # Enricher
    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=CSV_HEADERS
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' comments',
    )

    http = create_pool()
    error_file = DummyTqdmFile(sys.stderr)

    def make_requests(current_url, http=http):
        return (request_json(http, current_url), current_url)

    for (row, url_id) in enricher.cells(namespace.column, with_rows=True):
        if is_youtube_url(url_id):
            yt_id = extract_video_id_from_youtube_url(url_id)
            if yt_id:
                url = URL_TEMPLATE % {'id': yt_id, 'key': namespace.key}
        elif is_youtube_video_id(url_id):
            url = URL_TEMPLATE % {'id': url_id, 'key': namespace.key}
        else:
            continue

        url_queue = deque([url])

        while len(url_queue) != 0:
            couche = []

            with ThreadPoolExecutor(max_workers=25) as executor:
                time.sleep(0.01)
                couche = executor.map(make_requests, url_queue)

            url_queue = deque()

            for resp in couche:
                ((err, response, result), current_url) = resp

                if err:
                    error_file.write('{} for {}'.format(err, current_url))
                    continue
                elif (
                    response.status == 403 and
                    result.get('error').get('errors')[0].get('reason') == 'commentsDisabled'
                ):
                    error_file.write('Comments are disabled for {}'.format(current_url))
                    continue
                elif response.status == 403:
                    error_file.write('Running out of API points. You will have to wait until midnight, Pacific time!')
                    time.sleep(seconds_to_midnight_pacific_time())
                    continue
                elif response.status >= 400:
                    error_file.write('Error {} for {}'.format(response.status, current_url))
                    continue

                kind = result.get('kind', None)
                next_page = result.get('nextPageToken', None)

                if next_page:
                    url_next = current_url + '&pageToken=' + next_page
                    url_queue.append(url_next)

                if kind == 'youtube#commentThreadListResponse':

                    # Handling comment thread pagination
                    items = result.get('items', None)

                    for item in items:
                        snippet = item['snippet']
                        replies = item.get('replies')

                        if replies:

                            # Checking whether YouTube's API sent only a subset of the replies
                            if snippet['totalReplyCount'] != len(replies['comments']) and namespace.full:

                                # If we want all the replies and the API did not return them all,
                                # we add the url specific to this topLevelComment to the queue
                                # and only handle the topLevelComment itself for now
                                new_url = URL_PARENTID_TEMPLATE % {
                                    'id': snippet['topLevelComment']['id'],
                                    'key': namespace.key
                                }
                                url_queue.append(new_url)

                                data = get_data_full(snippet, True)
                                enricher.writerow(row, data)
                            else:
                                dataTop = get_data_full(snippet, True)
                                enricher.writerow(row, dataTop)

                                for rep in replies['comments']:
                                    enricher.writerow(row, get_data_full(rep, False))
                        else:
                            # If there is no 'replies' key, the fetched comment is only a topLevelComment
                            top_comment = get_data_full(snippet, True)
                            enricher.writerow(row, top_comment)
                else:
                    # Handling a commentListResponse: nothing fancy here, we deal with the comments one by one
                    items = result.get('items', None)

                    for item in items:
                        data = get_data_full(item, False)
                        enricher.writerow(row, data)
def twitter_scrape_action(namespace, output_file):
    single_query = namespace.file is sys.stdin and sys.stdin.isatty()

    if single_query:
        edit_namespace_with_csv_io(namespace, 'query', attr_name='query')

    scraper = TwitterAPIScraper()

    # Stats
    loading_bar = LoadingBar(
        'Collecting tweets',
        total=namespace.limit,
        unit='tweet',
        stats={'tokens': 1, 'queries': 0}
    )

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=TWEET_FIELDS + ADDITIONAL_TWEET_FIELDS,
        keep=namespace.select
    )

    def before_sleep(retry_state):
        exc = retry_state.outcome.exception()

        if isinstance(exc, TwitterPublicAPIRateLimitError):
            loading_bar.inc('tokens')
        else:
            loading_bar.inc('failures')
            loading_bar.print(
                'Failed to call Twitter search. Will retry in %s' % prettyprint_seconds(retry_state.idle_for)
            )

    for row, query in enricher.cells(namespace.query, with_rows=True):

        # Templating?
        if namespace.query_template is not None:
            query = CUSTOM_FORMATTER.format(namespace.query_template, value=query)

        loading_bar.print('Searching for "%s"' % query)
        loading_bar.inc('queries')

        iterator = scraper.search(
            query,
            limit=namespace.limit,
            before_sleep=before_sleep,
            include_referenced_tweets=namespace.include_refs,
            with_meta=True
        )

        try:
            for tweet, meta in iterator:
                loading_bar.update()

                tweet_row = format_tweet_as_csv_row(tweet)
                enricher.writerow(row, tweet_row + format_meta_row(meta))
        except TwitterPublicAPIOverCapacityError:
            loading_bar.die('Got an "Over Capacity" error. Shutting down...')

    loading_bar.close()
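# Illustrative only: --query-template above is applied through CUSTOM_FORMATTER
# with the CSV cell exposed as {value}. Assuming it behaves like str.format
# (the actual formatter may add conveniences), templating works roughly like
# this hypothetical helper:
def _query_template_sketch(template, cell_value):
    # e.g. _query_template_sketch('{value} lang:fr', 'médialab') -> 'médialab lang:fr'
    return template.format(value=cell_value)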
def fetch_action(namespace):

    # Are we resuming
    resuming = namespace.resume

    if resuming and not namespace.output:
        die(['Cannot --resume without specifying -o/--output.'])

    # Do we need to fetch only a single url?
    single_url = namespace.file is sys.stdin and is_url(namespace.column)

    if single_url:
        edit_namespace_with_csv_io(namespace, 'url')

        # If we are hitting a single url we enable contents_in_report
        if namespace.contents_in_report is None:
            namespace.contents_in_report = True

    # HTTP method
    http_method = namespace.method

    # Cookie grabber
    get_cookie = None
    if namespace.grab_cookies:
        get_cookie = grab_cookies(namespace.grab_cookies)

    # Global headers
    global_headers = None
    if namespace.headers:
        global_headers = {}

        for header in namespace.headers:
            k, v = parse_http_header(header)
            global_headers[k] = v

    flag = 'w'
    if namespace.output is not None and resuming and isfile(namespace.output):
        flag = 'r+'

    output_file = open_output_file(namespace.output, flag=flag)

    # Resume listener
    listener = None
    resuming_reader_loading = None
    skipped = 0

    if resuming:
        resuming_reader_loading = tqdm(desc='Resuming', dynamic_ncols=True, unit=' lines')

        def listener(event, row):
            nonlocal skipped

            if event == 'resume.output':
                resuming_reader_loading.update()

            if event == 'resume.input':
                skipped += 1
                loading_bar.set_postfix(skipped=skipped)
                loading_bar.update()

    # Enricher
    enricher = casanova.threadsafe_enricher(
        namespace.file,
        output_file,
        resumable=resuming,
        auto_resume=False,
        add=OUTPUT_ADDITIONAL_HEADERS + (['raw_contents'] if namespace.contents_in_report else []),
        keep=namespace.select,
        listener=listener
    )

    if namespace.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.' % namespace.column
        ])

    url_pos = enricher.pos[namespace.column]

    filename_pos = None

    if namespace.filename is not None:
        if namespace.filename not in enricher.pos:
            die([
                'Could not find the "%s" column containing the filenames in the given CSV file.' % namespace.filename
            ])

        filename_pos = enricher.pos[namespace.filename]

    indexed_input_headers = {h: i for i, h in enumerate(enricher.fieldnames)}

    if resuming:
        enricher.resume()
        resuming_reader_loading.close()

    # Loading bar
    total = namespace.total

    loading_bar = tqdm(
        desc='Fetching pages',
        total=total,
        dynamic_ncols=True,
        unit=' urls'
    )

    def url_key(item):
        url = item[1][url_pos].strip()

        if not url:
            return

        # Url templating
        if namespace.url_template:
            return namespace.url_template.format(value=url)

        return url

    def request_args(url, item):
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None

        if global_headers:
            headers = global_headers

        return {
            'method': http_method,
            'cookie': cookie,
            'headers': headers
        }

    def write_output(index, row, resolved=None, status=None, error=None,
                     filename=None, encoding=None, data=None):

        addendum = [
            resolved or '',
            status or '',
            error or '',
            filename or '',
            encoding or ''
        ]

        if namespace.contents_in_report:
            addendum.append(data or '')

        enricher.writerow(index, row, addendum)

    errors = 0
    status_codes = Counter()

    fetch_kwargs = {
        'threads': namespace.threads,
        'throttle': namespace.throttle,
        'domain_parallelism': namespace.domain_parallelism
    }

    if namespace.timeout is not None:
        fetch_kwargs['timeout'] = namespace.timeout

    multithreaded_iterator = multithreaded_fetch(
        enricher,
        key=url_key,
        request_args=request_args,
        **fetch_kwargs
    )

    for result in multithreaded_iterator:
        index, row = result.item

        if not result.url:
            write_output(index, row)

            loading_bar.update()
            continue

        response = result.response
        data = response.data if response is not None else None

        content_write_flag = 'wb'

        # Updating stats
        if result.error is not None:
            errors += 1
        else:
            if response.status >= 400:
                status_codes[response.status] += 1

        postfix = {'errors': errors}

        for code, count in status_codes.most_common(1):
            postfix[str(code)] = count

        loading_bar.set_postfix(**postfix)
        loading_bar.update()

        # No error
        if result.error is None:

            filename = None

            # Building filename
            if data:
                if filename_pos is not None or namespace.filename_template:
                    if namespace.filename_template:
                        filename = CUSTOM_FORMATTER.format(
                            namespace.filename_template,
                            value=row[filename_pos] if filename_pos is not None else None,
                            ext=result.meta['ext'],
                            line=LazyLineDict(indexed_input_headers, row)
                        )
                    else:
                        filename = row[filename_pos] + result.meta['ext']
                else:
                    # NOTE: it would be nice to have an id that can be sorted by time
                    filename = str(uuid4()) + result.meta['ext']

            # Standardize encoding?
            encoding = result.meta['encoding']

            if data and namespace.standardize_encoding or namespace.contents_in_report:
                if encoding is None or encoding != 'utf-8' or namespace.contents_in_report:
                    data = data.decode(encoding if encoding is not None else 'utf-8', errors='replace')
                    encoding = 'utf-8'
                    content_write_flag = 'w'

            # Writing file on disk
            if data and not namespace.contents_in_report:

                if namespace.compress:
                    filename += '.gz'

                resource_path = join(namespace.output_dir, filename)
                resource_dir = dirname(resource_path)

                os.makedirs(resource_dir, exist_ok=True)

                with open(resource_path, content_write_flag) as f:

                    # TODO: what if standardize_encoding + compress?
                    f.write(gzip.compress(data) if namespace.compress else data)

            # Reporting in output
            resolved_url = response.geturl()

            write_output(
                index,
                row,
                resolved=resolved_url if resolved_url != result.url else None,
                status=response.status,
                filename=filename,
                encoding=encoding,
                data=data
            )

        # Handling potential errors
        else:
            error_code = report_error(result.error)

            write_output(index, row, error=error_code)

    # Closing files
    output_file.close()
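# Regarding the "what if standardize_encoding + compress?" TODO in fetch_action:
# once the body has been decoded to a str (standardized to utf-8), gzip.compress
# expects bytes, so the text would need to be re-encoded before compression.
# A hedged sketch of one way to handle that combination (illustrative only,
# not the project's actual behavior):
def _compress_payload_sketch(data):
    if isinstance(data, str):
        data = data.encode('utf-8')

    return gzip.compress(data)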