Example #1
def videos_action(namespace, output_file):

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 add=REPORT_HEADERS,
                                 keep=namespace.select)

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' videos',
    )

    http = create_pool()
    column = namespace.column

    def rows_with_videos_id():

        for row, ytb_data in enricher.cells(namespace.column, with_rows=True):
            video_id = None

            if is_youtube_video_id(ytb_data):
                video_id = ytb_data
            elif is_youtube_url(ytb_data):
                video_id = extract_video_id_from_youtube_url(ytb_data)

            yield row, video_id

    for chunk in chunks_iter(rows_with_videos_id(), 50):

        all_ids = [video_id for _, video_id in chunk if video_id]
        list_id = ",".join(all_ids)

        url = URL_TEMPLATE % {'list_id': list_id, 'key': namespace.key}
        err, response, result = request_json(http, url)

        if err:
            die(err)
        elif response.status == 403:
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        data = get_data(result)

        id_available = set(data)
        not_available = set(all_ids).difference(id_available)

        loading_bar.update(len(chunk))

        for row, video_id in chunk:
            if video_id is None or video_id in not_available:
                enricher.writerow(row)
            else:
                enricher.writerow(row, data[video_id])
Example #2
    def __init__(self, cookie):

        # Grabbing cookie
        cookie = grab_facebook_cookie(cookie)

        if cookie is None:
            raise FacebookInvalidCookieError

        self.cookie = cookie
        self.http = create_pool()
Example #3
def videos_action(namespace, output_file):

    enricher = CSVEnricher(
        namespace.file,
        namespace.column,
        output_file,
        report_headers=REPORT_HEADERS,
        select=namespace.select.split(',') if namespace.select else None
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' videos',
    )

    http = create_pool()

    for chunk in gen_chunks(enricher):

        all_ids = [row[0] for row in chunk if row[0]]
        list_id = ",".join(all_ids)

        url = URL_TEMPLATE % {'list_id': list_id, 'key': namespace.key}
        err, response, result = request_json(http, url)

        if err:
            die(err)
        elif response.status == 403:
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        data = get_data(result)

        id_available = set(data)
        not_available = set(all_ids).difference(id_available)

        loading_bar.update(len(chunk))

        for item in chunk:
            video_id, line = item

            if video_id is None:
                enricher.write_empty(line)

            elif video_id in not_available:
                line_empty = [video_id] + [''] * (len(REPORT_HEADERS) - 1)
                enricher.write(line, line_empty)

            else:
                enricher.write(line, data[video_id])
Example #4
    def __init__(self, token, rate_limit=None):
        if rate_limit is None:
            rate_limit = CROWDTANGLE_DEFAULT_RATE_LIMIT
            summary_rate_limit = CROWDTANGLE_LINKS_DEFAULT_RATE_LIMIT
        else:
            summary_rate_limit = rate_limit

        self.token = token
        self.rate_limiter_state = RateLimiterState(rate_limit, period=60)
        self.summary_rate_limiter_state = RateLimiterState(summary_rate_limit, period=60)
        self.http = create_pool(timeout=CROWDTANGLE_DEFAULT_TIMEOUT)
Example #5
def search_action(namespace, output_file):

    # Handling output
    output_file = open_output_file(namespace.output)

    edit_namespace_with_csv_io(namespace, 'keyword')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=CSV_HEADERS
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit='videos',
    )
    http = create_pool()
    error_file = DummyTqdmFile(sys.stderr)
    limit = namespace.limit

    for (row, keyword) in enricher.cells(namespace.column, with_rows=True):
        url = URL_template_accurate % {'subject': keyword, 'key': namespace.key}
        next_page = True
        while next_page:
            if next_page is True:
                err, response, result = request_json(http, url)
            else:
                url_next = url + '&pageToken=' + next_page
                err, response, result = request_json(http, url_next)
            if err:
                die(err)
            elif response.status == 403:
                error_file.write('Running out of API points. You will have to wait until midnight, Pacific time!')
                time.sleep(seconds_to_midnight_pacific_time())
                continue
            elif response.status >= 400:
                die(response.status)
            next_page, data_l = get_data(result)
            for data in data_l:
                if limit is not None:
                    if limit == 0:
                        return True
                    else:
                        limit -= 1
                        enricher.writerow(row, data)
                else:
                    enricher.writerow(row, data)
Example #6
def facebook_url_likes_action(namespace):
    output_file = open_output_file(namespace.output)

    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'url')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=REPORT_HEADERS
    )

    if namespace.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.' % namespace.column
        ])

    loading_bar = tqdm(
        desc='Retrieving likes',
        dynamic_ncols=True,
        unit=' urls',
        total=namespace.total
    )

    http = create_pool()

    for row, url in enricher.cells(namespace.column, with_rows=True):
        loading_bar.update()

        url = url.strip()

        err, html = make_request(http, url)

        if err is not None:
            loading_bar.close()
            die('An error occurred while fetching like button for this url: %s' % url)

        scraped = scrape(html)

        if scraped is None:
            loading_bar.close()
            die('Could not extract Facebook likes from this url\'s like button: %s' % url)

        enricher.writerow(row, scraped)
Example #7
def comments_action(namespace, output_file):

    output_file = open_output_file(namespace.output)

    writer = csv.writer(output_file)
    writer.writerow(CSV_HEADERS)

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' comments',
    )

    http = create_pool()

    url = URL_TEMPLATE % {'id': namespace.id, 'key': namespace.key}
    next_page = True

    while next_page:

        if next_page is True:
            err, response, result = request_json(http, url)
        else:
            url_next = url + '&pageToken=' + next_page
            err, response, result = request_json(http, url_next)

        if err:
            die(err)
        elif response.status == 403:
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        next_page, data = get_data(result)

        for comment in data:
            loading_bar.update()
            writer.writerow(comment)
Example #8
    def __init__(self,
                 spec=None,
                 spider=None,
                 spiders=None,
                 start_jobs=None,
                 queue_path=None,
                 threads=25,
                 buffer_size=DEFAULT_GROUP_BUFFER_SIZE,
                 throttle=DEFAULT_THROTTLE):

        # NOTE: crawling could work depth-first but:
        # buffer_size should be 0 (requires fixing quenouille issue #1)

        # Params
        self.start_jobs = start_jobs
        self.queue_path = queue_path
        self.threads = threads
        self.buffer_size = buffer_size
        self.throttle = throttle

        self.using_persistent_queue = queue_path is not None
        self.http = create_pool(threads=threads)
        self.state = CrawlerState()
        self.started = False

        # Memory queue
        if not self.using_persistent_queue:
            queue = Queue()

        # Persistent queue
        else:
            queue = SQLiteQueue(queue_path,
                                multithreading=True,
                                auto_commit=False)

        # Creating spiders
        if spec is not None:
            if 'spiders' in spec:
                spiders = {
                    name: DefinitionSpider(s, name=name)
                    for name, s in spec['spiders'].items()
                }
                self.single_spider = False
            else:
                spiders = {'default': DefinitionSpider(spec)}
                self.single_spider = True

        elif spider is not None:
            spiders = {'default': spider}

        elif spiders is None:
            raise TypeError(
                'minet.Crawler: expecting either `spec`, `spider` or `spiders`.'
            )

        # Solving function spiders
        for name, s in spiders.items():
            if callable(s) and not isinstance(s, Spider):
                spiders[name] = FunctionSpider(s, name)

        self.queue = queue
        self.spiders = spiders
Example #9
def facebook_post_stats_action(namespace):

    # Handling output
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 add=REPORT_HEADERS,
                                 keep=namespace.select)

    http = create_pool()

    def fetch_facebook_page_stats(url):
        err, response = request(http, url, cookie='locale=en_US')

        if err:
            return 'http-error', None

        if response.status == 404:
            return 'not-found', None

        if response.status >= 400:
            return 'http-error', None

        html = response.data

        if CAPTCHA in html:
            die(['Rate limit reached!', 'Last url: %s' % url])

        if (CURRENT_AVAILABILITY_DISCLAIMER in html
                or AVAILABILITY_DISCLAIMER in html):
            return 'unavailable', None

        if LOGIN_DISCLAIMER in html:
            return 'private-or-unavailable', None

        # TODO: integrate into ural
        bpost_id = url.rsplit('/', 1)[-1].encode()

        # Extracting metadata
        meta_extractor = re.compile(META_EXTRACTOR_TEMPLATE % bpost_id)

        match = meta_extractor.search(html)

        if match is None:
            return 'extraction-failed', None

        data = json5.loads(match.group(1).decode())
        data = nested_get([
            'jsmods', 'pre_display_requires', 0, 3, 1, '__bbox', 'result',
            'data', 'feedback'
        ], data)

        if data is None:
            return 'extraction-failed', None

        # TODO: remove, this is here as a test
        # TODO: try to find a post where comments are disabled
        if get_count(data['seen_by_count']):
            print_err('Found seen_by_count: %i for %s' %
                      (get_count(data['seen_by_count']), url))

        if 'political_figure_data' in data and data['political_figure_data']:
            print_err('Found political_figure_data:')
            print_err(data['political_figure_data'])

        if get_count(data['reaction_count']) != get_count(data['reactors']):
            print_err('Found different reactions/reactors for %s' % url)

        # Extracting data from hidden html
        hidden_html_extractor = re.compile(HTML_EXTRACTOR_TEMPLATE % bpost_id)
        match = hidden_html_extractor.search(html)

        if match is not None:
            hidden_html = match.group(1).decode()
            soup = BeautifulSoup(hidden_html, 'lxml')

            # Sometimes fetching a post behaves weirdly
            if soup.select_one('h5 a') is None:
                return 'extraction-failed', None

            data['scraped'] = {}

            timestamp_elem = soup.select_one('[data-utime]')
            timestamp = int(timestamp_elem.get('data-utime'))

            data['scraped']['account_name'] = soup.select_one(
                'h5 a').get_text().strip()
            data['scraped']['timestamp'] = timestamp
            data['scraped']['time'] = datetime.fromtimestamp(
                timestamp).isoformat()

            # TODO: use a context manager
            try:
                data['scraped']['aria_label'] = timestamp_elem.parent.get(
                    'aria-label')
            except Exception:
                pass

            try:
                data['scraped']['text'] = soup.select_one(
                    '[data-testid="post_message"]').get_text()
            except Exception:
                pass

            # try:
            #     data['scraped']['link'] = soup.select_one('[data-lynx-uri]').get('href')
            # except:
            #     pass

        return None, data

    # Loading bar
    loading_bar = tqdm(desc='Fetching post stats',
                       dynamic_ncols=True,
                       unit=' posts',
                       total=namespace.total)

    for row, post_url in enricher.cells(namespace.column, with_rows=True):
        loading_bar.update()

        if (not post_url or not is_facebook_post_url(post_url)
                or not is_facebook_url(post_url)):
            enricher.writerow(row, format_err('not-facebook-post'))
            continue

        err, data = fetch_facebook_page_stats(post_url)

        if err:
            enricher.writerow(row, format_err(err))
        else:
            enricher.writerow(row, format(data))

        # Throttling
        sleep_with_entropy(FACEBOOK_WEB_DEFAULT_THROTTLE, 5.0)
Example #10
# NOTE: assumed import (create_pool and resolve are used below)
from minet.utils import create_pool, resolve

URLS = [
    # 'https://www.google.com/url?q=https://www.facebook.com/Contaniunamenos/&sa=D&ust=1603455678482000&usg=AFQjCNFSANkezX4k8Fk4sY6xg30u6CHO2Q',

    # Invalid URL
    'http://www.outremersbeyou.com/talent-de-la-semaine-la-designer-comorienne-aisha-wadaane-je-suis-fiere-de-mes-origines/',

    # Refresh header
    'http://la-grange.net/2015/03/26/refresh/',

    # GET & UA nonsense
    'https://ebay.us/BUkuxU',

    # Incorrect refresh header
    'http://ow.ly/csT350v7mRc',

    # Utf-8 location header
    'http://ow.ly/2awz50v1JkO',
    'http://xfru.it/v2uFaC',

    # IP Host redirect
    'https://bit.ly/2ANzJNW'
]

http = create_pool()

for url in URLS:
    print()
    error, stack = resolve(http, url, follow_meta_refresh=True)
    print(error)
    for item in stack:
        print(item)
Example #11
from collections import OrderedDict

from minet.utils import RateLimiterState, create_pool

FACEBOOK_URL = 'https://www.facebook.com'
FACEBOOK_MOBILE_URL = 'https://m.facebook.com'

FACEBOOK_OUTPUT_FORMATS = {'raw', 'csv_row'}

FACEBOOK_MOBILE_DEFAULT_THROTTLE = 0.5
FACEBOOK_WEB_DEFAULT_THROTTLE = 20.0

FACEBOOK_MOBILE_RATE_LIMITER_STATE = RateLimiterState(
    1, FACEBOOK_MOBILE_DEFAULT_THROTTLE)
FACEBOOK_WEB_RATE_LIMITER_STATE = RateLimiterState(
    1, FACEBOOK_WEB_DEFAULT_THROTTLE)

FACEBOOK_DEFAULT_POOL = create_pool()

FACEBOOK_COMMENT_CSV_HEADERS = [
    'post_id', 'comment_id', 'user_id', 'user_handle', 'user_url',
    'user_label', 'comment_text', 'comment_html', 'formatted_date', 'date',
    'reactions', 'replies', 'in_reply_to'
]

FACEBOOK_POST_STATS_CSV_HEADERS = [
    'error', 'canonical', 'account_name', 'timestamp', 'time', 'link',
    'aria_label', 'text', 'share_count', 'comment_count', 'reaction_count',
    'video_view_count'
]

FACEBOOK_REACTION_KEYS = OrderedDict({
    1: 'like',
Example #12
    def __init__(self, token):
        self.token = token
        self.http = create_pool(timeout=MEDIACLOUD_DEFAULT_TIMEOUT)
Example #13
def hyphe_dump_action(namespace):

    # Paths
    output_dir = 'hyphe_corpus_%s' % namespace.corpus

    if namespace.output_dir is not None:
        output_dir = namespace.output_dir

    os.makedirs(output_dir, exist_ok=True)

    webentities_output_path = join(output_dir, 'webentities.csv')
    pages_output_path = join(output_dir, 'pages.csv')

    if namespace.body:
        body_output_dir = join(output_dir, 'content')
        os.makedirs(body_output_dir, exist_ok=True)

    # Fixing trailing slash
    if not namespace.url.endswith('/'):
        namespace.url += '/'

    http = create_pool()
    jsonrpc = create_corpus_jsonrpc(http, namespace.url, namespace.corpus)

    # First we need to start the corpus
    ensure_corpus_is_started(jsonrpc)

    # Then we gather some handy statistics
    err, stats = jsonrpc('get_status')

    # Then we fetch webentities
    webentities_file = open(webentities_output_path, 'w')
    webentities_writer = csv.writer(webentities_file)
    webentities_writer.writerow(WEBENTITY_HEADERS)

    loading_bar = tqdm(desc='Paginating web entities',
                       unit=' webentities',
                       dynamic_ncols=True,
                       total=count_total_webentities(stats))

    webentities = {}

    for webentity in webentities_iter(jsonrpc):
        loading_bar.update()
        webentities[webentity['id']] = webentity
        webentities_writer.writerow(format_webentity_for_csv(webentity))

    webentities_file.close()
    loading_bar.close()

    # Finally we paginate pages
    pages_file = open(pages_output_path, 'w')
    pages_writer = csv.writer(pages_file)
    pages_writer.writerow(PAGE_HEADERS +
                          (ADDITIONAL_PAGE_HEADERS if namespace.body else []))

    loading_bar = tqdm(desc='Dumping pages',
                       unit=' pages',
                       dynamic_ncols=True,
                       total=count_total_pages(stats))

    for webentity, page in pages_iter(jsonrpc,
                                      webentities,
                                      body=namespace.body):
        loading_bar.update()
        filename = None

        if namespace.body and 'body' in page:
            filename = format_page_filename(webentity, page)
            filepath = join(body_output_dir, filename)
            os.makedirs(dirname(filepath), exist_ok=True)

            with open(filepath, 'wb') as f:
                binary = base64.b64decode(page['body'])
                binary = zlib.decompress(binary)
                binary = gzip.compress(binary)

                f.write(binary)

        pages_writer.writerow(
            format_page_for_csv(webentity,
                                page,
                                filename=filename,
                                body=namespace.body))
Example #14
from minet.utils import create_pool, raw_resolve

SSL_ISSUES = [
    'https://lemde.fr/2zmunsV', 'https://buff.ly/2Nnaevg',
    'http://www.plateforme1418.com/', 'https://www.silverday-normandie.fr',
    'http://swll.to/rJjizGY', 'http://ow.ly/zpnt30mdb9N'
]

http = create_pool(insecure=True)

for url in SSL_ISSUES:
    print(url)
    err, stack, response = raw_resolve(http, url, return_response=True)
    print('Error', err, type(err))

    for r in stack:
        print(r)

    print()
Example #15
def multithreaded_resolve(iterator,
                          key=None,
                          resolve_args=None,
                          threads=25,
                          throttle=DEFAULT_THROTTLE,
                          max_redirects=5,
                          follow_refresh_header=True,
                          follow_meta_refresh=False,
                          follow_js_relocation=False,
                          buffer_size=DEFAULT_GROUP_BUFFER_SIZE,
                          insecure=False,
                          timeout=None):
    """
    Function returning a multithreaded iterator over resolved urls.

    Args:
        iterator (iterable): An iterator over urls or arbitrary items.
        key (callable, optional): Function extracting url from yielded items.
        resolve_args (callable, optional): Function returning specific
            arguments to pass to the resolve util per yielded item.
        threads (int, optional): Number of threads to use. Defaults to 25.
        throttle (float or callable, optional): Per-domain throttle in seconds.
            Or a function taking domain name and item and returning the
            throttle to apply. Defaults to 0.2.
        max_redirects (int, optional): Max number of redirections to follow.
        follow_refresh_header (bool, optional): Whether to follow refresh
            headers. Defaults to True.
        follow_meta_refresh (bool, optional): Whether to follow meta refresh.
            Defaults to False.
        follow_js_relocation (bool, optional): Whether to follow javascript
            relocation. Defaults to False.
        buffer_size (int, optional): Max number of items per domain to enqueue
            into memory in hope of finding a new domain that can be processed
            immediately. Defaults to 1.
        insecure (bool, optional): Whether to ignore SSL certification errors
            when performing requests. Defaults to False.
        timeout (float or urllib3.Timeout, optional): Custom timeout for every
            request.

    Yields:
        ResolveWorkerResult

    """

    # Creating the http pool manager
    http = create_pool(threads=threads, insecure=insecure, timeout=timeout)

    # Thread worker
    def worker(payload):
        http, item, url = payload

        if url is None:
            return ResolveWorkerResult(url=None,
                                       item=item,
                                       error=None,
                                       stack=None)

        kwargs = resolve_args(url, item) if resolve_args is not None else {}

        error, stack = resolve(http,
                               url,
                               max_redirects=max_redirects,
                               follow_refresh_header=follow_refresh_header,
                               follow_meta_refresh=follow_meta_refresh,
                               follow_js_relocation=follow_js_relocation,
                               **kwargs)

        return ResolveWorkerResult(url=url,
                                   item=item,
                                   error=error,
                                   stack=stack)

    # Group resolver
    def grouper(payload):
        if payload.url is None:
            return

        return get_domain_name(payload.url)

    # Thread payload iterator
    def payloads():
        for item in iterator:
            url = item if key is None else key(item)

            if not url:
                yield FetchWorkerPayload(http=http, item=item, url=None)

                continue

            # Url cleanup
            url = ensure_protocol(url.strip())

            yield FetchWorkerPayload(http=http, item=item, url=url)

    return imap_unordered(payloads(),
                          worker,
                          threads,
                          group=grouper,
                          group_parallelism=DEFAULT_GROUP_PARALLELISM,
                          group_buffer_size=buffer_size,
                          group_throttle=throttle)
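
A minimal usage sketch for the resolver above; the list of urls and the printing logic are illustrative assumptions, not part of the library:

urls = ['https://lemonde.fr', 'https://www.theguardian.com']

for result in multithreaded_resolve(urls, threads=4):
    if result.error is not None:
        print('Error resolving', result.url, result.error)
        continue

    # `stack` holds the successive redirection steps recorded for this url
    for step in result.stack:
        print(result.url, step)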
Example #16
def multithreaded_fetch(iterator,
                        key=None,
                        request_args=None,
                        threads=25,
                        throttle=DEFAULT_THROTTLE,
                        guess_extension=True,
                        guess_encoding=True,
                        buffer_size=DEFAULT_GROUP_BUFFER_SIZE,
                        insecure=False,
                        timeout=None,
                        domain_parallelism=DEFAULT_GROUP_PARALLELISM):
    """
    Function returning a multithreaded iterator over fetched urls.

    Args:
        iterator (iterable): An iterator over urls or arbitrary items.
        key (callable, optional): Function extracting url from yielded items.
        request_args (callable, optional): Function returning specific
            arguments to pass to the request util per yielded item.
        threads (int, optional): Number of threads to use. Defaults to 25.
        throttle (float or callable, optional): Per-domain throttle in seconds.
            Or a function taking domain name and item and returning the
            throttle to apply. Defaults to 0.2.
        guess_extension (bool, optional): Attempt to guess the resource's
            extension? Defaults to True.
        guess_encoding (bool, optional): Attempt to guess the resource's
            encoding? Defaults to True.
        domain_parallelism (int, optional): Max number of urls per domain to
            hit at the same time. Defaults to 1.
        buffer_size (int, optional): Max number of items per domain to enqueue
            into memory in hope of finding a new domain that can be processed
            immediately. Defaults to 1.
        insecure (bool, optional): Whether to ignore SSL certification errors
            when performing requests. Defaults to False.
        timeout (float or urllib3.Timeout, optional): Custom timeout for every
            request.

    Yields:
        FetchWorkerResult

    """

    # Creating the http pool manager
    http = create_pool(threads=threads, insecure=insecure, timeout=timeout)

    # Thread worker
    def worker(payload):
        http, item, url = payload

        if url is None:
            return FetchWorkerResult(url=None,
                                     item=item,
                                     response=None,
                                     error=None,
                                     meta=None)

        kwargs = request_args(url, item) if request_args is not None else {}

        error, response = request(http, url, **kwargs)

        if error:
            return FetchWorkerResult(url=url,
                                     item=item,
                                     response=response,
                                     error=error,
                                     meta=None)

        # Forcing urllib3 to read data in thread
        data = response.data

        # Meta
        meta = extract_response_meta(response,
                                     guess_encoding=guess_encoding,
                                     guess_extension=guess_extension)

        return FetchWorkerResult(url=url,
                                 item=item,
                                 response=response,
                                 error=error,
                                 meta=meta)

    # Group resolver
    def grouper(payload):
        if payload.url is None:
            return

        return get_domain_name(payload.url)

    # Thread payload iterator
    def payloads():
        for item in iterator:
            url = item if key is None else key(item)

            if not url:
                yield FetchWorkerPayload(http=http, item=item, url=None)

                continue

            # Url cleanup
            url = ensure_protocol(url.strip())

            yield FetchWorkerPayload(http=http, item=item, url=url)

    return imap_unordered(payloads(),
                          worker,
                          threads,
                          group=grouper,
                          group_parallelism=domain_parallelism,
                          group_buffer_size=buffer_size,
                          group_throttle=throttle)
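
Similarly, a minimal usage sketch for the fetcher above; the urls and the way results are handled are illustrative assumptions:

urls = ['https://lemonde.fr', 'https://www.theguardian.com']

for result in multithreaded_fetch(urls, threads=4):
    if result.error is not None:
        print('Error fetching', result.url, result.error)
        continue

    # The body was already read in the worker thread, so touching
    # result.response.data here does not trigger an extra network call
    print(result.url, result.response.status, len(result.response.data))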
Example #17
    def test_bad_protocol(self):
        http = create_pool()
        err, _ = request(http, 'ttps://lemonde.fr')

        assert type(err) is InvalidURLError
Example #18
def facebook_comments_action(namespace):

    # Reformatting url to hit mobile website
    url = force_protocol(namespace.url, 'https')
    url = convert_facebook_url_to_mobile(url)

    # Grabbing cookie
    cookie = grab_facebook_cookie(namespace)

    # Handling output
    output_file = open_output_file(namespace.output)

    writer = csv.writer(output_file)
    writer.writerow(CSV_HEADERS)

    http = create_pool()

    def request_page(target):
        error, result = request(http, target, cookie=cookie)

        if error is not None:
            raise error

        return result.data.decode('utf-8')

    # Loading bar
    loading_bar = tqdm(desc='Scraping comments',
                       dynamic_ncols=True,
                       unit=' comments')

    url_queue = deque([(url, None)])

    url_count = 0
    replies_count = 0

    while len(url_queue) != 0:
        current_url, in_reply_to = url_queue.popleft()

        html = request_page(current_url)
        data = scrape_comments(html, in_reply_to)

        url_count += 1

        for reply_url, commented_id in data['replies']:
            url_queue.append((reply_url, commented_id))

        if data['next'] is not None:
            url_queue.append((data['next'], in_reply_to))

        for comment in data['comments']:
            loading_bar.update()
            writer.writerow(format_csv_row(comment))

            if in_reply_to is not None:
                replies_count += 1

        loading_bar.set_postfix(urls=url_count,
                                replies=replies_count,
                                q=len(url_queue))

        # Don't be too greedy
        time.sleep(FACEBOOK_MOBILE_DEFAULT_THROTTLE)

    loading_bar.close()
Example #19
def comments_action(namespace, output_file):

    # Handling output
    output_file = open_output_file(namespace.output)

    # Handling input
    if is_youtube_video_id(namespace.column):
        edit_namespace_with_csv_io(namespace, 'video_id')
    elif is_youtube_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'video_url')

    # Enricher
    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 keep=namespace.select,
                                 add=CSV_HEADERS)

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' comments',
    )

    http = create_pool()
    error_file = DummyTqdmFile(sys.stderr)

    def make_requests(current_url, http=http):
        return (request_json(http, current_url), current_url)

    for (row, url_id) in enricher.cells(namespace.column, with_rows=True):

        if is_youtube_url(url_id):
            yt_id = extract_video_id_from_youtube_url(url_id)
            if yt_id:
                url = URL_TEMPLATE % {'id': yt_id, 'key': namespace.key}
        elif is_youtube_video_id(url_id):
            url = URL_TEMPLATE % {'id': url_id, 'key': namespace.key}
        else:
            continue
        url_queue = deque([url])
        while len(url_queue) != 0:
            couche = []
            with ThreadPoolExecutor(max_workers=25) as executor:
                time.sleep(0.01)
                couche = executor.map(make_requests, url_queue)
            url_queue = deque()
            for resp in couche:
                ((err, response, result), current_url) = resp
                if err:
                    error_file.write('{} for {}'.format(err, current_url))
                    continue
                elif response.status == 403 and result.get('error').get(
                        'errors')[0].get('reason') == 'commentsDisabled':
                    error_file.write(
                        'Comments are disabled for {}'.format(current_url))
                    continue
                elif response.status == 403:
                    error_file.write(
                        'Running out of API points. You will have to wait until midnight, Pacific time!'
                    )
                    time.sleep(seconds_to_midnight_pacific_time())
                    continue
                elif response.status >= 400:
                    error_file.write('Error {} for {}'.format(
                        response.status, current_url))
                    continue
                kind = result.get('kind', None)
                next_page = result.get('nextPageToken', None)
                if next_page:
                    url_next = current_url + '&pageToken=' + next_page
                    url_queue.append(url_next)
                if kind == 'youtube#commentThreadListResponse':
                    # Handling comments pagination
                    items = result.get('items', None)
                    for item in items:
                        snippet = item['snippet']
                        replies = item.get('replies')
                        if replies:
                            # Checking whether YouTube's API sent only a subset of the replies
                            if snippet['totalReplyCount'] != len(
                                    replies['comments']) and namespace.full:
                                # If we want all the replies and the API did not return them all, we add
                                # the URL specific to the topLevelComment to the queue, then we handle the topLevelComment here
                                new_url = URL_PARENTID_TEMPLATE % {
                                    'id': snippet['topLevelComment']['id'],
                                    'key': namespace.key
                                }
                                url_queue.append(new_url)
                                data = get_data_full(snippet, True)
                                enricher.writerow(row, data)
                            else:
                                dataTop = get_data_full(snippet, True)
                                enricher.writerow(row, dataTop)
                                for rep in replies['comments']:
                                    enricher.writerow(
                                        row, get_data_full(rep, False))
                        else:
                            # If there is no 'replies' key, the comment we fetched is only a topLevelComment
                            top_comment = get_data_full(snippet, True)
                            enricher.writerow(row, top_comment)
                else:
                    # Handling commentList: nothing special here, we deal with the comments one by one
                    items = result.get('items', None)
                    for item in items:
                        data = get_data_full(item, False)
                        enricher.writerow(row, data)
Example #20
def captions_action(namespace, output_file):

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=REPORT_HEADERS
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' videos',
    )

    http = create_pool()

    for line, video_id in enricher.cells(namespace.column, with_rows=True):
        url_caption = ''
        url_inf = INFO_URL_TEMPLATE % {'id': video_id}
        err1, info_vid = request(http, url_inf)
        info_vid_dec = unquote(str(info_vid.data))
        captionsTracks = re.findall(get_info, info_vid_dec)
        if captionsTracks:
            dict_captions = json.loads(captionsTracks[0][0] + '}')['captionTracks']
            for i in range(len(dict_captions)):
                if namespace.lang and namespace.lang == dict_captions[i]['languageCode']:
                    url_caption = dict_captions[i]['baseUrl']
                    break
            if not url_caption and dict_captions:
                url_caption = dict_captions[0]['baseUrl']

        else:
            url_vid = VIDEO_CALL_TEMPLATE % {'id': video_id}
            urls = []
            time.sleep(0.01)
            err, result = request(http, url_vid)
            timedtext = re.findall(timed, str(result.data))
            for x in timedtext:
                proper_timed = x.replace("\\\\u0026", "&")
                if proper_timed[-2:] == namespace.lang:
                    url_caption = API_BASE_URL % {'temp': proper_timed}
                    break
            if not url_caption and timedtext and not namespace.lang:
                url_caption = API_BASE_URL % {'temp': timedtext[1].replace("\\\\u0026", "&")}
        if not url_caption:
            print_err('no subtitles for {}'.format(video_id))
            continue

        time.sleep(0.01)
        err, result_caption = request(http, url_caption)

        if err is not None:
            print_err(err)
        elif result_caption.status >= 400:
            print_err(f'error, status : {result_caption.status} for id : {video_id}')
            enricher.writerow(line)
        else:
            soup = BeautifulSoup(result_caption.data, 'lxml')

            caption_text = " ".join(item.get_text() for item in soup.find_all('text'))

            enricher.writerow(line, [caption_text])

        loading_bar.update()
Example #21
    def __init__(self):
        self.http = create_pool(timeout=TWITTER_PUBLIC_API_DEFAULT_TIMEOUT)
        self.reset()