def videos_action(namespace, output_file):
    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=REPORT_HEADERS,
        keep=namespace.select
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' videos',
    )

    http = create_pool()

    def rows_with_videos_id():
        for row, ytb_data in enricher.cells(namespace.column, with_rows=True):
            video_id = None

            if is_youtube_video_id(ytb_data):
                video_id = ytb_data
            elif is_youtube_url(ytb_data):
                video_id = extract_video_id_from_youtube_url(ytb_data)

            yield row, video_id

    for chunk in chunks_iter(rows_with_videos_id(), 50):
        all_ids = [video_id for _, video_id in chunk if video_id]
        list_id = ",".join(all_ids)

        url = URL_TEMPLATE % {'list_id': list_id, 'key': namespace.key}
        err, response, result = request_json(http, url)

        if err:
            die(err)
        elif response.status == 403:
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        data = get_data(result)

        id_available = set(data)
        not_available = set(all_ids).difference(id_available)

        loading_bar.update(len(chunk))

        for row, video_id in chunk:
            if video_id is None or video_id in not_available:
                enricher.writerow(row)
            else:
                enricher.writerow(row, data[video_id])
def __init__(self, cookie):

    # Grabbing cookie
    cookie = grab_facebook_cookie(cookie)

    if cookie is None:
        raise FacebookInvalidCookieError

    self.cookie = cookie
    self.http = create_pool()
def videos_action(namespace, output_file):
    enricher = CSVEnricher(
        namespace.file,
        namespace.column,
        output_file,
        report_headers=REPORT_HEADERS,
        select=namespace.select.split(',') if namespace.select else None
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' videos',
    )

    http = create_pool()

    for chunk in gen_chunks(enricher):
        all_ids = [row[0] for row in chunk if row[0]]
        list_id = ",".join(all_ids)

        url = URL_TEMPLATE % {'list_id': list_id, 'key': namespace.key}
        err, response, result = request_json(http, url)

        if err:
            die(err)
        elif response.status == 403:
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        data = get_data(result)

        id_available = set(data)
        not_available = set(all_ids).difference(id_available)

        loading_bar.update(len(chunk))

        for item in chunk:
            video_id, line = item

            if video_id is None:
                enricher.write_empty(line)
            elif video_id in not_available:
                line_empty = [video_id] + [''] * (len(REPORT_HEADERS) - 1)
                enricher.write(line, line_empty)
            else:
                enricher.write(line, data[video_id])
def __init__(self, token, rate_limit=None):
    if rate_limit is None:
        rate_limit = CROWDTANGLE_DEFAULT_RATE_LIMIT
        summary_rate_limit = CROWDTANGLE_LINKS_DEFAULT_RATE_LIMIT
    else:
        summary_rate_limit = rate_limit

    self.token = token
    self.rate_limiter_state = RateLimiterState(rate_limit, period=60)
    self.summary_rate_limiter_state = RateLimiterState(summary_rate_limit, period=60)
    self.http = create_pool(timeout=CROWDTANGLE_DEFAULT_TIMEOUT)
def search_action(namespace, output_file):

    # Handling output
    output_file = open_output_file(namespace.output)

    edit_namespace_with_csv_io(namespace, 'keyword')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=CSV_HEADERS
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit='videos',
    )

    http = create_pool()
    error_file = DummyTqdmFile(sys.stderr)

    limit = namespace.limit

    for (row, keyword) in enricher.cells(namespace.column, with_rows=True):
        url = URL_template_accurate % {'subject': keyword, 'key': namespace.key}
        next_page = True

        while next_page:
            if next_page is True:
                err, response, result = request_json(http, url)
            else:
                url_next = url + '&pageToken=' + next_page
                err, response, result = request_json(http, url_next)

            if err:
                die(err)
            elif response.status == 403:
                error_file.write('Running out of API points. You will have to wait until midnight, Pacific time!')
                time.sleep(seconds_to_midnight_pacific_time())
                continue
            elif response.status >= 400:
                die(response.status)

            next_page, data_l = get_data(result)

            for data in data_l:
                if limit is not None:
                    if limit == 0:
                        return True
                    else:
                        limit -= 1
                        enricher.writerow(row, data)
                else:
                    enricher.writerow(row, data)
def facebook_url_likes_action(namespace):
    output_file = open_output_file(namespace.output)

    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'url')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=REPORT_HEADERS
    )

    if namespace.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.' % namespace.column
        ])

    loading_bar = tqdm(
        desc='Retrieving likes',
        dynamic_ncols=True,
        unit=' urls',
        total=namespace.total
    )

    http = create_pool()

    for row, url in enricher.cells(namespace.column, with_rows=True):
        loading_bar.update()

        url = url.strip()

        err, html = make_request(http, url)

        if err is not None:
            loading_bar.close()
            die('An error occurred while fetching like button for this url: %s' % url)

        scraped = scrape(html)

        if scraped is None:
            loading_bar.close()
            die('Could not extract Facebook likes from this url\'s like button: %s' % url)

        enricher.writerow(row, scraped)
def comments_action(namespace, output_file):
    output_file = open_output_file(namespace.output)

    writer = csv.writer(output_file)
    writer.writerow(CSV_HEADERS)

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' comments',
    )

    http = create_pool()

    url = URL_TEMPLATE % {'id': namespace.id, 'key': namespace.key}
    next_page = True

    while next_page:
        if next_page is True:
            err, response, result = request_json(http, url)
        else:
            url_next = url + '&pageToken=' + next_page
            err, response, result = request_json(http, url_next)

        if err:
            die(err)
        elif response.status == 403:
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        next_page, data = get_data(result)

        for comment in data:
            loading_bar.update()
            writer.writerow(comment)
def __init__(self, spec=None, spider=None, spiders=None, start_jobs=None,
             queue_path=None, threads=25,
             buffer_size=DEFAULT_GROUP_BUFFER_SIZE, throttle=DEFAULT_THROTTLE):

    # NOTE: crawling could work depth-first but:
    # buffer_size should be 0 (requires to fix quenouille issue #1)

    # Params
    self.start_jobs = start_jobs
    self.queue_path = queue_path
    self.threads = threads
    self.buffer_size = buffer_size
    self.throttle = throttle

    self.using_persistent_queue = queue_path is not None
    self.http = create_pool(threads=threads)
    self.state = CrawlerState()
    self.started = False

    # Memory queue
    if not self.using_persistent_queue:
        queue = Queue()

    # Persistent queue
    else:
        queue = SQLiteQueue(queue_path, multithreading=True, auto_commit=False)

    # Creating spiders
    if spec is not None:
        if 'spiders' in spec:
            spiders = {
                name: DefinitionSpider(s, name=name)
                for name, s in spec['spiders'].items()
            }
            self.single_spider = False
        else:
            spiders = {'default': DefinitionSpider(spec)}
            self.single_spider = True

    elif spider is not None:
        spiders = {'default': spider}

    elif spiders is None:
        raise TypeError(
            'minet.Crawler: expecting either `spec`, `spider` or `spiders`.'
        )

    # Solving function spiders
    for name, s in spiders.items():
        if callable(s) and not isinstance(s, Spider):
            spiders[name] = FunctionSpider(s, name)

    self.queue = queue
    self.spiders = spiders
def facebook_post_stats_action(namespace):

    # Handling output
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=REPORT_HEADERS,
        keep=namespace.select
    )

    http = create_pool()

    def fetch_facebook_page_stats(url):
        err, response = request(http, url, cookie='locale=en_US')

        if err:
            return 'http-error', None

        if response.status == 404:
            return 'not-found', None

        if response.status >= 400:
            return 'http-error', None

        html = response.data

        if CAPTCHA in html:
            die([
                'Rate limit reached!',
                'Last url: %s' % url
            ])

        if (
            CURRENT_AVAILABILITY_DISCLAIMER in html or
            AVAILABILITY_DISCLAIMER in html
        ):
            return 'unavailable', None

        if LOGIN_DISCLAIMER in html:
            return 'private-or-unavailable', None

        # TODO: integrate into ural
        bpost_id = url.rsplit('/', 1)[-1].encode()

        # Extracting metadata
        meta_extractor = re.compile(META_EXTRACTOR_TEMPLATE % bpost_id)

        match = meta_extractor.search(html)

        if match is None:
            return 'extraction-failed', None

        data = json5.loads(match.group(1).decode())
        data = nested_get(
            ['jsmods', 'pre_display_requires', 0, 3, 1, '__bbox', 'result', 'data', 'feedback'],
            data
        )

        if data is None:
            return 'extraction-failed', None

        # TODO: remove, this is here as a test
        # TODO: try to find a post where comments are disabled
        if get_count(data['seen_by_count']):
            print_err('Found seen_by_count: %i for %s' % (get_count(data['seen_by_count']), url))

        if 'political_figure_data' in data and data['political_figure_data']:
            print_err('Found political_figure_data:')
            print_err(data['political_figure_data'])

        if get_count(data['reaction_count']) != get_count(data['reactors']):
            print_err('Found different reactions/reactors for %s' % url)

        # Extracting data from hidden html
        hidden_html_extractor = re.compile(HTML_EXTRACTOR_TEMPLATE % bpost_id)
        match = hidden_html_extractor.search(html)

        if match is not None:
            hidden_html = match.group(1).decode()
            soup = BeautifulSoup(hidden_html, 'lxml')

            # Sometimes fetching a post behaves weirdly
            if soup.select_one('h5 a') is None:
                return 'extraction-failed', None

            data['scraped'] = {}

            timestamp_elem = soup.select_one('[data-utime]')
            timestamp = int(timestamp_elem.get('data-utime'))

            data['scraped']['account_name'] = soup.select_one('h5 a').get_text().strip()
            data['scraped']['timestamp'] = timestamp
            data['scraped']['time'] = datetime.fromtimestamp(timestamp).isoformat()

            # TODO: use a context manager
            try:
                data['scraped']['aria_label'] = timestamp_elem.parent.get('aria-label')
            except:
                pass

            try:
                data['scraped']['text'] = soup.select_one('[data-testid="post_message"]').get_text()
            except:
                pass

            # try:
            #     data['scraped']['link'] = soup.select_one('[data-lynx-uri]').get('href')
            # except:
            #     pass

        return None, data

    # Loading bar
    loading_bar = tqdm(
        desc='Fetching post stats',
        dynamic_ncols=True,
        unit=' posts',
        total=namespace.total
    )

    for row, post_url in enricher.cells(namespace.column, with_rows=True):
        loading_bar.update()

        if (
            not post_url or
            not is_facebook_post_url(post_url) or
            not is_facebook_url(post_url)
        ):
            enricher.writerow(row, format_err('not-facebook-post'))
            continue

        err, data = fetch_facebook_page_stats(post_url)

        if err:
            enricher.writerow(row, format_err(err))
        else:
            enricher.writerow(row, format(data))

        # Throttling
        sleep_with_entropy(FACEBOOK_WEB_DEFAULT_THROTTLE, 5.0)
URLS = [
    # 'https://www.google.com/url?q=https://www.facebook.com/Contaniunamenos/&sa=D&ust=1603455678482000&usg=AFQjCNFSANkezX4k8Fk4sY6xg30u6CHO2Q',

    # Invalid URL
    'http://www.outremersbeyou.com/talent-de-la-semaine-la-designer-comorienne-aisha-wadaane-je-suis-fiere-de-mes-origines/',

    # Refresh header
    'http://la-grange.net/2015/03/26/refresh/',

    # GET & UA nonsense
    'https://ebay.us/BUkuxU',

    # Incorrect refresh header
    'http://ow.ly/csT350v7mRc',

    # Utf-8 location header
    'http://ow.ly/2awz50v1JkO',
    'http://xfru.it/v2uFaC',

    # IP Host redirect
    'https://bit.ly/2ANzJNW'
]

http = create_pool()

for url in URLS:
    print()
    error, stack = resolve(http, url, follow_meta_refresh=True)

    print(error)

    for item in stack:
        print(item)
from collections import OrderedDict

from minet.utils import RateLimiterState, create_pool

FACEBOOK_URL = 'https://www.facebook.com'
FACEBOOK_MOBILE_URL = 'https://m.facebook.com'

FACEBOOK_OUTPUT_FORMATS = {'raw', 'csv_row'}

FACEBOOK_MOBILE_DEFAULT_THROTTLE = 0.5
FACEBOOK_WEB_DEFAULT_THROTTLE = 20.0

FACEBOOK_MOBILE_RATE_LIMITER_STATE = RateLimiterState(1, FACEBOOK_MOBILE_DEFAULT_THROTTLE)
FACEBOOK_WEB_RATE_LIMITER_STATE = RateLimiterState(1, FACEBOOK_WEB_DEFAULT_THROTTLE)
FACEBOOK_DEFAULT_POOL = create_pool()

FACEBOOK_COMMENT_CSV_HEADERS = [
    'post_id',
    'comment_id',
    'user_id',
    'user_handle',
    'user_url',
    'user_label',
    'comment_text',
    'comment_html',
    'formatted_date',
    'date',
    'reactions',
    'replies',
    'in_reply_to'
]

FACEBOOK_POST_STATS_CSV_HEADERS = [
    'error',
    'canonical',
    'account_name',
    'timestamp',
    'time',
    'link',
    'aria_label',
    'text',
    'share_count',
    'comment_count',
    'reaction_count',
    'video_view_count'
]

FACEBOOK_REACTION_KEYS = OrderedDict({
    1: 'like',
def __init__(self, token):
    self.token = token
    self.http = create_pool(timeout=MEDIACLOUD_DEFAULT_TIMEOUT)
def hyphe_dump_action(namespace):

    # Paths
    output_dir = 'hyphe_corpus_%s' % namespace.corpus

    if namespace.output_dir is not None:
        output_dir = namespace.output_dir

    os.makedirs(output_dir, exist_ok=True)

    webentities_output_path = join(output_dir, 'webentities.csv')
    pages_output_path = join(output_dir, 'pages.csv')

    if namespace.body:
        body_output_dir = join(output_dir, 'content')
        os.makedirs(body_output_dir, exist_ok=True)

    # Fixing trailing slash
    if not namespace.url.endswith('/'):
        namespace.url += '/'

    http = create_pool()
    jsonrpc = create_corpus_jsonrpc(http, namespace.url, namespace.corpus)

    # First we need to start the corpus
    ensure_corpus_is_started(jsonrpc)

    # Then we gather some handy statistics
    err, stats = jsonrpc('get_status')

    # Then we fetch webentities
    webentities_file = open(webentities_output_path, 'w')
    webentities_writer = csv.writer(webentities_file)
    webentities_writer.writerow(WEBENTITY_HEADERS)

    loading_bar = tqdm(
        desc='Paginating web entities',
        unit=' webentities',
        dynamic_ncols=True,
        total=count_total_webentities(stats)
    )

    webentities = {}

    for webentity in webentities_iter(jsonrpc):
        loading_bar.update()
        webentities[webentity['id']] = webentity
        webentities_writer.writerow(format_webentity_for_csv(webentity))

    webentities_file.close()
    loading_bar.close()

    # Finally we paginate pages
    pages_file = open(pages_output_path, 'w')
    pages_writer = csv.writer(pages_file)
    pages_writer.writerow(PAGE_HEADERS + (ADDITIONAL_PAGE_HEADERS if namespace.body else []))

    loading_bar = tqdm(
        desc='Dumping pages',
        unit=' pages',
        dynamic_ncols=True,
        total=count_total_pages(stats)
    )

    for webentity, page in pages_iter(jsonrpc, webentities, body=namespace.body):
        loading_bar.update()

        filename = None

        if namespace.body and 'body' in page:
            filename = format_page_filename(webentity, page)
            filepath = join(body_output_dir, filename)
            os.makedirs(dirname(filepath), exist_ok=True)

            with open(filepath, 'wb') as f:
                binary = base64.b64decode(page['body'])
                binary = zlib.decompress(binary)
                binary = gzip.compress(binary)

                f.write(binary)

        pages_writer.writerow(
            format_page_for_csv(webentity, page, filename=filename, body=namespace.body)
        )
from minet.utils import create_pool, raw_resolve

SSL_ISSUES = [
    'https://lemde.fr/2zmunsV',
    'https://buff.ly/2Nnaevg',
    'http://www.plateforme1418.com/',
    'https://www.silverday-normandie.fr',
    'http://swll.to/rJjizGY',
    'http://ow.ly/zpnt30mdb9N'
]

http = create_pool(insecure=True)

for url in SSL_ISSUES:
    print(url)
    err, stack, response = raw_resolve(http, url, return_response=True)

    print('Error', err, type(err))

    for r in stack:
        print(r)

    print()
def multithreaded_resolve(iterator, key=None, resolve_args=None, threads=25,
                          throttle=DEFAULT_THROTTLE, max_redirects=5,
                          follow_refresh_header=True, follow_meta_refresh=False,
                          follow_js_relocation=False,
                          buffer_size=DEFAULT_GROUP_BUFFER_SIZE,
                          insecure=False, timeout=None):
    """
    Function returning a multithreaded iterator over resolved urls.

    Args:
        iterator (iterable): An iterator over urls or arbitrary items.
        key (callable, optional): Function extracting url from yielded items.
        resolve_args (callable, optional): Function returning specific
            arguments to pass to the resolve util per yielded item.
        threads (int, optional): Number of threads to use. Defaults to 25.
        throttle (float or callable, optional): Per-domain throttle in seconds.
            Or a function taking domain name and item and returning the
            throttle to apply. Defaults to 0.2.
        max_redirects (int, optional): Max number of redirections to follow.
        follow_refresh_header (bool, optional): Whether to follow refresh
            headers. Defaults to True.
        follow_meta_refresh (bool, optional): Whether to follow meta refresh.
            Defaults to False.
        buffer_size (int, optional): Max number of items per domain to enqueue
            into memory in hope of finding a new domain that can be processed
            immediately. Defaults to 1.
        insecure (bool, optional): Whether to ignore SSL certification errors
            when performing requests. Defaults to False.
        timeout (float or urllib3.Timeout, optional): Custom timeout for every
            request.

    Yields:
        ResolveWorkerResult

    """

    # Creating the http pool manager
    http = create_pool(threads=threads, insecure=insecure, timeout=timeout)

    # Thread worker
    def worker(payload):
        http, item, url = payload

        if url is None:
            return ResolveWorkerResult(
                url=None,
                item=item,
                error=None,
                stack=None
            )

        kwargs = resolve_args(url, item) if resolve_args is not None else {}

        error, stack = resolve(
            http,
            url,
            max_redirects=max_redirects,
            follow_refresh_header=follow_refresh_header,
            follow_meta_refresh=follow_meta_refresh,
            follow_js_relocation=follow_js_relocation,
            **kwargs
        )

        return ResolveWorkerResult(
            url=url,
            item=item,
            error=error,
            stack=stack
        )

    # Group resolver
    def grouper(payload):
        if payload.url is None:
            return

        return get_domain_name(payload.url)

    # Thread payload iterator
    def payloads():
        for item in iterator:
            url = item if key is None else key(item)

            if not url:
                yield FetchWorkerPayload(
                    http=http,
                    item=item,
                    url=None
                )

                continue

            # Url cleanup
            url = ensure_protocol(url.strip())

            yield FetchWorkerPayload(
                http=http,
                item=item,
                url=url
            )

    return imap_unordered(
        payloads(),
        worker,
        threads,
        group=grouper,
        group_parallelism=DEFAULT_GROUP_PARALLELISM,
        group_buffer_size=buffer_size,
        group_throttle=throttle
    )
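# Usage sketch for multithreaded_resolve above. This is an illustrative
# example, not part of the original module: it assumes the function is
# importable from the minet package root and only relies on the
# ResolveWorkerResult fields built by the worker (url, item, error, stack).
from minet import multithreaded_resolve

urls = [
    'https://bit.ly/2ANzJNW',
    'http://la-grange.net/2015/03/26/refresh/'
]

for result in multithreaded_resolve(urls, threads=10, follow_meta_refresh=True):
    if result.error is not None:
        print('error for %s: %s' % (result.url, result.error))
    else:
        # The stack holds one entry per redirection step followed
        print('%s resolved through %i step(s)' % (result.url, len(result.stack)))
        for step in result.stack:
            print(' ', step)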
def multithreaded_fetch(iterator, key=None, request_args=None, threads=25,
                        throttle=DEFAULT_THROTTLE, guess_extension=True,
                        guess_encoding=True,
                        buffer_size=DEFAULT_GROUP_BUFFER_SIZE,
                        insecure=False, timeout=None,
                        domain_parallelism=DEFAULT_GROUP_PARALLELISM):
    """
    Function returning a multithreaded iterator over fetched urls.

    Args:
        iterator (iterable): An iterator over urls or arbitrary items.
        key (callable, optional): Function extracting url from yielded items.
        request_args (callable, optional): Function returning specific
            arguments to pass to the request util per yielded item.
        threads (int, optional): Number of threads to use. Defaults to 25.
        throttle (float or callable, optional): Per-domain throttle in seconds.
            Or a function taking domain name and item and returning the
            throttle to apply. Defaults to 0.2.
        guess_extension (bool, optional): Attempt to guess the resource's
            extension? Defaults to True.
        guess_encoding (bool, optional): Attempt to guess the resource's
            encoding? Defaults to True.
        domain_parallelism (int, optional): Max number of urls per domain to
            hit at the same time. Defaults to 1.
        buffer_size (int, optional): Max number of items per domain to enqueue
            into memory in hope of finding a new domain that can be processed
            immediately. Defaults to 1.
        insecure (bool, optional): Whether to ignore SSL certification errors
            when performing requests. Defaults to False.
        timeout (float or urllib3.Timeout, optional): Custom timeout for every
            request.

    Yields:
        FetchWorkerResult

    """

    # Creating the http pool manager
    http = create_pool(threads=threads, insecure=insecure, timeout=timeout)

    # Thread worker
    def worker(payload):
        http, item, url = payload

        if url is None:
            return FetchWorkerResult(
                url=None,
                item=item,
                response=None,
                error=None,
                meta=None
            )

        kwargs = request_args(url, item) if request_args is not None else {}

        error, response = request(http, url, **kwargs)

        if error:
            return FetchWorkerResult(
                url=url,
                item=item,
                response=response,
                error=error,
                meta=None
            )

        # Forcing urllib3 to read data in thread
        data = response.data

        # Meta
        meta = extract_response_meta(
            response,
            guess_encoding=guess_encoding,
            guess_extension=guess_extension
        )

        return FetchWorkerResult(
            url=url,
            item=item,
            response=response,
            error=error,
            meta=meta
        )

    # Group resolver
    def grouper(payload):
        if payload.url is None:
            return

        return get_domain_name(payload.url)

    # Thread payload iterator
    def payloads():
        for item in iterator:
            url = item if key is None else key(item)

            if not url:
                yield FetchWorkerPayload(
                    http=http,
                    item=item,
                    url=None
                )

                continue

            # Url cleanup
            url = ensure_protocol(url.strip())

            yield FetchWorkerPayload(
                http=http,
                item=item,
                url=url
            )

    return imap_unordered(
        payloads(),
        worker,
        threads,
        group=grouper,
        group_parallelism=domain_parallelism,
        group_buffer_size=buffer_size,
        group_throttle=throttle
    )
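# Usage sketch for multithreaded_fetch above. Again an illustrative example,
# not part of the original module: it assumes the function is importable from
# the minet package root and only relies on the FetchWorkerResult fields built
# by the worker (url, item, response, error, meta).
from minet import multithreaded_fetch

urls = [
    'https://www.lemonde.fr/',
    'https://www.liberation.fr/'
]

for result in multithreaded_fetch(urls, threads=10):
    if result.error is not None:
        print('error for %s: %s' % (result.url, result.error))
    else:
        # response.data has already been read by the worker thread
        print(result.url, result.response.status, result.meta)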
def test_bad_protocol(self):
    http = create_pool()
    err, _ = request(http, 'ttps://lemonde.fr')

    assert type(err) is InvalidURLError
def facebook_comments_action(namespace):

    # Reformatting url to hit mobile website
    url = force_protocol(namespace.url, 'https')
    url = convert_facebook_url_to_mobile(url)

    # Grabbing cookie
    cookie = grab_facebook_cookie(namespace)

    # Handling output
    output_file = open_output_file(namespace.output)

    writer = csv.writer(output_file)
    writer.writerow(CSV_HEADERS)

    http = create_pool()

    def request_page(target):
        error, result = request(http, target, cookie=cookie)

        if error is not None:
            raise error

        return result.data.decode('utf-8')

    # Loading bar
    loading_bar = tqdm(
        desc='Scraping comments',
        dynamic_ncols=True,
        unit=' comments'
    )

    url_queue = deque([(url, None)])

    url_count = 0
    replies_count = 0

    while len(url_queue) != 0:
        current_url, in_reply_to = url_queue.popleft()

        html = request_page(current_url)
        data = scrape_comments(html, in_reply_to)

        url_count += 1

        for reply_url, commented_id in data['replies']:
            url_queue.append((reply_url, commented_id))

        if data['next'] is not None:
            url_queue.append((data['next'], in_reply_to))

        for comment in data['comments']:
            loading_bar.update()
            writer.writerow(format_csv_row(comment))

            if in_reply_to is not None:
                replies_count += 1

        loading_bar.set_postfix(
            urls=url_count,
            replies=replies_count,
            q=len(url_queue)
        )

        # Don't be too greedy
        time.sleep(FACEBOOK_MOBILE_DEFAULT_THROTTLE)

    loading_bar.close()
def comments_action(namespace, output_file):

    # Handling output
    output_file = open_output_file(namespace.output)

    # Handling input
    if is_youtube_video_id(namespace.column):
        edit_namespace_with_csv_io(namespace, 'video_id')
    elif is_youtube_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'video_url')

    # Enricher
    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=CSV_HEADERS
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' comments',
    )

    http = create_pool()
    error_file = DummyTqdmFile(sys.stderr)

    def make_requests(current_url, http=http):
        return (request_json(http, current_url), current_url)

    for (row, url_id) in enricher.cells(namespace.column, with_rows=True):

        if is_youtube_url(url_id):
            yt_id = extract_video_id_from_youtube_url(url_id)
            if yt_id:
                url = URL_TEMPLATE % {'id': yt_id, 'key': namespace.key}
        elif is_youtube_video_id(url_id):
            url = URL_TEMPLATE % {'id': url_id, 'key': namespace.key}
        else:
            continue

        url_queue = deque([url])

        while len(url_queue) != 0:
            couche = []

            with ThreadPoolExecutor(max_workers=25) as executor:
                time.sleep(0.01)
                couche = executor.map(make_requests, url_queue)

            url_queue = deque()

            for resp in couche:
                ((err, response, result), current_url) = resp

                if err:
                    error_file.write('{} for {}'.format(err, current_url))
                    continue
                elif response.status == 403 and result.get('error').get('errors')[0].get('reason') == 'commentsDisabled':
                    error_file.write('Comments are disabled for {}'.format(current_url))
                    continue
                elif response.status == 403:
                    error_file.write('Running out of API points. You will have to wait until midnight, Pacific time!')
                    time.sleep(seconds_to_midnight_pacific_time())
                    continue
                elif response.status >= 400:
                    error_file.write('Error {} for {}'.format(response.status, current_url))
                    continue

                kind = result.get('kind', None)
                next_page = result.get('nextPageToken', None)

                if next_page:
                    url_next = current_url + '&pageToken=' + next_page
                    url_queue.append(url_next)

                if kind == 'youtube#commentThreadListResponse':
                    # Handling comments pagination
                    items = result.get('items', None)

                    for item in items:
                        snippet = item['snippet']
                        replies = item.get('replies')

                        if replies:
                            # Checking whether YouTube's API sends only a subset of the replies
                            if snippet['totalReplyCount'] != len(replies['comments']) and namespace.full:
                                # If we want the replies and they are not all given by the API,
                                # we add the URL specific to the topLevelComment to the queue,
                                # and we deal with that topLevelComment
                                new_url = URL_PARENTID_TEMPLATE % {
                                    'id': snippet['topLevelComment']['id'],
                                    'key': namespace.key
                                }
                                url_queue.append(new_url)

                                data = get_data_full(snippet, True)
                                enricher.writerow(row, data)
                            else:
                                dataTop = get_data_full(snippet, True)
                                enricher.writerow(row, dataTop)

                                for rep in replies['comments']:
                                    enricher.writerow(row, get_data_full(rep, False))
                        else:
                            # If there is no 'replies' key, the comment we fetched is only a topLevelComment
                            top_comment = get_data_full(snippet, True)
                            enricher.writerow(row, top_comment)
                else:
                    # Handling commentList: nothing special here, dealing with comments one by one
                    items = result.get('items', None)

                    for item in items:
                        data = get_data_full(item, False)
                        enricher.writerow(row, data)
def captions_action(namespace, output_file):
    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=REPORT_HEADERS
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' videos',
    )

    http = create_pool()

    for line, video_id in enricher.cells(namespace.column, with_rows=True):
        url_caption = ''
        url_inf = INFO_URL_TEMPLATE % {'id': video_id}

        err1, info_vid = request(http, url_inf)
        info_vid_dec = unquote(str(info_vid.data))

        captionsTracks = re.findall(get_info, info_vid_dec)

        if captionsTracks:
            dict_captions = json.loads(captionsTracks[0][0] + '}')['captionTracks']

            for i in range(len(dict_captions)):
                if namespace.lang and namespace.lang == dict_captions[i]['languageCode']:
                    url_caption = dict_captions[i]['baseUrl']
                    break

            if not url_caption and dict_captions:
                url_caption = dict_captions[0]['baseUrl']

        else:
            url_vid = VIDEO_CALL_TEMPLATE % {'id': video_id}

            time.sleep(0.01)
            err, result = request(http, url_vid)

            timedtext = re.findall(timed, str(result.data))

            for x in timedtext:
                proper_timed = x.replace("\\\\u0026", "&")
                if proper_timed[-2:] == namespace.lang:
                    url_caption = API_BASE_URL % {'temp': proper_timed}
                    break

            if not url_caption and timedtext and not namespace.lang:
                url_caption = API_BASE_URL % {'temp': timedtext[1].replace("\\\\u0026", "&")}

        if not url_caption:
            print_err('no subtitles for {}'.format(video_id))
            continue

        time.sleep(0.01)
        err, result_caption = request(http, url_caption)

        if err is not None:
            print_err(err)
        elif result_caption.status >= 400:
            print_err(f'error, status : {result_caption.status} for id : {video_id}')
            enricher.writerow(line)
        else:
            soup = BeautifulSoup(result_caption.data, 'lxml')

            caption_text = " ".join(item.get_text() for item in soup.find_all('text'))

            enricher.writerow(line, [caption_text])

        loading_bar.update()
def __init__(self):
    self.http = create_pool(timeout=TWITTER_PUBLIC_API_DEFAULT_TIMEOUT)
    self.reset()