Example #1
def get_video_captions(video_target, langs):
    if not isinstance(langs, list):
        raise TypeError

    video_id = ensure_video_id(video_target)

    if video_id is None:
        raise YouTubeInvalidVideoId

    tracks = get_caption_tracks(video_id)

    if not tracks:
        return

    best_track = select_caption_track(tracks, langs=langs)

    if best_track is None:
        return

    err, response = request(best_track.url, pool=YOUTUBE_SCRAPER_POOL)

    if err:
        raise err

    soup = BeautifulSoup(response.data.decode('utf-8'), 'lxml')

    captions = []

    for item in soup.select('text'):
        captions.append((item.get('start'), item.get('dur'),
                         unescape(item.get_text().strip())))

    return best_track, captions
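
A minimal usage sketch for the function above, assuming get_video_captions is importable from the surrounding module; the video URL and language preferences are illustrative:

result = get_video_captions('https://www.youtube.com/watch?v=dQw4w9WgXcQ', langs=['en', 'fr'])

# The function returns None when no suitable caption track is found
if result is not None:
    best_track, captions = result

    for start, duration, text in captions:
        print(start, duration, text)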
Example #2
def get_caption_tracks(video_id):

    # First we try to retrieve it from video info
    url = 'https://www.youtube.com/get_video_info?video_id=%s' % video_id

    err, response = request(url)

    if err:
        raise err

    data = unquote(response.data.decode('utf-8'))

    m = CAPTION_TRACKS_RE.search(data)

    if m is not None:
        data = json.loads(m.group(0) + '}')['captionTracks']

        return [
            YouTubeCaptionTrack(item['languageCode'], item['baseUrl'],
                                item.get('kind') == 'asr') for item in data
        ]

    # Then we try to scrape it directly from the video page
    # url = 'https://www.youtube.com/watch?v=%s' % video_id

    # err, response = request(url)

    # if err:
    #     raise err

    # timedtexts = TIMEDTEXT_RE.findall(response.data)

    return []
Example #3
def make_request(url):
    err, response = request(forge_url(url), headers={'Accept-Language': 'en'})

    # NOTE: `response` is None whenever `err` is set, so the error has to be
    # checked before the status code
    if err:
        return 'http-error', None

    if response.status == 404:
        return 'not-found', None

    return None, response.data
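
A short consumer sketch for make_request above; the URL is illustrative and forge_url is assumed to accept it unchanged:

error, data = make_request('https://www.lemonde.fr')

if error is not None:
    print('request failed:', error)
else:
    print('fetched %i bytes' % len(data))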
Example #4
    def request_page(self, url):
        error, result = request(url,
                                pool=self.pool,
                                cookie=self.cookie,
                                headers={'User-Agent': 'curl/7.68.0'})

        if error is not None:
            raise error

        return result.data.decode('utf-8')
Example #5
    def work(self, job):
        self.state.inc_working()

        spider = self.spiders.get(job.spider)

        if spider is None:
            raise UnknownSpiderError('Unknown spider "%s"' % job.spider)

        err, response = request(job.url, pool=self.pool)

        if err:
            return CrawlWorkerResult(
                job=job,
                scraped=None,
                error=err,
                response=response,
                meta=None,
                content=None,
                next_jobs=None
            )

        meta = spider.extract_meta_from_response(job, response)

        # Decoding response content
        content = spider.process_content(job, response, meta)

        if isinstance(spider, FunctionSpider):
            scraped, next_jobs = spider.process(job, response, content, meta)
        else:

            # Scraping items
            scraped = spider.scrape(job, response, content, meta)

            # Finding next jobs
            next_jobs = spider.next_jobs(job, response, content, meta)

        # Enqueuing next jobs
        if next_jobs is not None:

            # Consuming so that multiple agents may act on this
            next_jobs = list(next_jobs)
            self.enqueue(next_jobs)

        self.state.dec_working()

        return CrawlWorkerResult(
            job=job,
            scraped=scraped,
            error=None,
            response=response,
            meta=meta,
            content=content,
            next_jobs=next_jobs
        )
Example #6
    def worker(payload):
        item, url = payload

        if url is None:
            return FetchWorkerResult(
                url=None,
                item=item,
                response=None,
                error=None,
                meta=None
            )

        kwargs = request_args(url, item) if request_args is not None else {}

        error, response = request(
            url,
            pool=pool,
            max_redirects=max_redirects,
            **kwargs
        )

        if error:
            return FetchWorkerResult(
                url=url,
                item=item,
                response=response,
                error=error,
                meta=None
            )

        # Forcing urllib3 to read data in thread
        # TODO: this is probably useless and should be replaced by preload_content at the right place
        data = response.data

        # Meta
        meta = extract_response_meta(
            response,
            guess_encoding=guess_encoding,
            guess_extension=guess_extension
        )

        return FetchWorkerResult(
            url=url,
            item=item,
            response=response,
            error=error,
            meta=meta
        )
Example #7
def step(pool, url, item_key):
    err, result = request(url, pool=pool)

    # Debug
    if err:
        raise err

    # Bad auth
    if result.status == 401:
        raise CrowdTangleInvalidTokenError

    elif result.status == 429:
        raise CrowdTangleRateLimitExceeded

    # Bad params
    if result.status >= 400:
        data = result.data.decode('utf-8')

        try:
            data = json.loads(data)
        except json.decoder.JSONDecodeError:
            raise CrowdTangleInvalidRequestError(data)

        raise CrowdTangleInvalidRequestError(data['message'],
                                             code=data['code'],
                                             status=result.status)

    try:
        data = json.loads(result.data)['result']
    except (json.decoder.JSONDecodeError, TypeError, KeyError):
        raise CrowdTangleInvalidJSONError

    items = None

    if item_key in data:
        items = data[item_key]

        if len(items) == 0:
            items = None

    # Extracting next link
    pagination = data['pagination']
    next_page = pagination['nextPage'] if 'nextPage' in pagination else None

    return items, next_page
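
A hypothetical pagination loop built on step(); the pool, the starting URL and the item key are placeholders rather than part of the original code:

url = FIRST_PAGE_URL  # placeholder for the initial API endpoint

while url is not None:
    items, url = step(pool, url, 'posts')

    if items is not None:
        for item in items:
            handle(item)  # hypothetical downstream processing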
Example #8
    def __request(self, url):
        err, response = request(url, pool=self.pool)

        # Debug
        if err:
            raise err

        # Bad auth
        if response.status == 401:
            raise CrowdTangleInvalidTokenError

        # Rate limited
        if response.status == 429:
            raise CrowdTangleRateLimitExceeded

        # Server error
        if response.status >= 500:
            raise CrowdTangleServerError(url=url, status=response.status)

        # Bad params
        if response.status >= 400:
            data = response.data.decode('utf-8')

            try:
                data = json.loads(data)
            except json.decoder.JSONDecodeError:
                raise CrowdTangleInvalidRequestError(data,
                                                     url=url,
                                                     status=response.status)

            raise CrowdTangleInvalidRequestError(data['message'],
                                                 url=url,
                                                 code=data.get('code'),
                                                 status=response.status)

        try:
            data = json.loads(response.data)['result']
        except (json.decoder.JSONDecodeError, TypeError, KeyError):
            raise CrowdTangleInvalidJSONError

        return data
Example #9
    def __call__(self, payload):
        item, domain, url = payload

        result = FetchResult(*payload)

        if url is None:
            return result

        # NOTE: request_args must be threadsafe
        kwargs = {}

        if self.request_args is not None:
            kwargs = self.request_args(domain, url, item)

        error, response = request(
            url,
            pool=self.pool,
            max_redirects=self.max_redirects,
            **kwargs
        )

        if error:
            result.error = error
        else:

            # Forcing urllib3 to read data in thread
            # TODO: this is probably useless and should be replaced by preload_content at the right place
            data = response.data

            # Meta
            meta = extract_response_meta(response)

            result.response = response
            result.meta = meta

            if self.callback is not None:
                self.callback(result)

        return result
Example #10
    def test_bad_protocol(self):
        err, _ = request('ttps://lemonde.fr')

        assert type(err) is InvalidURLError
Example #11
    def fetch_facebook_page_stats(url):
        err, response = request(url, cookie='locale=en_US')

        if err:
            return 'http-error', None

        if response.status == 404:
            return 'not-found', None

        if response.status >= 400:
            return 'http-error', None

        html = response.data

        if CAPTCHA in html:
            die(['Rate limit reached!', 'Last url: %s' % url])

        if (CURRENT_AVAILABILITY_DISCLAIMER in html
                or AVAILABILITY_DISCLAIMER in html):
            return 'unavailable', None

        if LOGIN_DISCLAIMER in html:
            return 'private-or-unavailable', None

        # TODO: integrate into ural
        bpost_id = url.rsplit('/', 1)[-1].encode()

        # Extracting metadata
        meta_extractor = re.compile(META_EXTRACTOR_TEMPLATE % bpost_id)

        match = meta_extractor.search(html)

        if match is None:
            return 'extraction-failed', None

        data = json5.loads(match.group(1).decode())
        data = getpath(data, [
            'jsmods', 'pre_display_requires', 0, 3, 1, '__bbox', 'result',
            'data', 'feedback'
        ])

        if data is None:
            return 'extraction-failed', None

        # TODO: remove, this is here as a test
        # TODO: try to find a post where comments are disabled
        if get_count(data['seen_by_count']):
            print_err('Found seen_by_count: %i for %s' %
                      (get_count(data['seen_by_count']), url))

        if 'political_figure_data' in data and data['political_figure_data']:
            print_err('Found political_figure_data:')
            print_err(data['political_figure_data'])

        if get_count(data['reaction_count']) != get_count(data['reactors']):
            print_err('Found different reactions/reactors for %s' % url)

        # Extracting data from hidden html
        hidden_html_extractor = re.compile(HTML_EXTRACTOR_TEMPLATE % bpost_id)
        match = hidden_html_extractor.search(html)

        if match is not None:
            hidden_html = match.group(1).decode()
            soup = BeautifulSoup(hidden_html, 'lxml')

            # Sometimes fetching a post behaves weirdly
            if soup.select_one('h5 a') is None:
                return 'extraction-failed', None

            data['scraped'] = {}

            timestamp_elem = soup.select_one('[data-utime]')
            timestamp = int(timestamp_elem.get('data-utime'))

            data['scraped']['account_name'] = soup.select_one(
                'h5 a').get_text().strip()
            data['scraped']['timestamp'] = timestamp
            data['scraped']['time'] = datetime.fromtimestamp(
                timestamp).isoformat()

            # TODO: use a context manager
            try:
                data['scraped']['aria_label'] = timestamp_elem.parent.get(
                    'aria-label')
            except:
                pass

            try:
                data['scraped']['text'] = soup.select_one(
                    '[data-testid="post_message"]').get_text()
            except:
                pass

            # try:
            #     data['scraped']['link'] = soup.select_one('[data-lynx-uri]').get('href')
            # except:
            #     pass

        return None, data
Example #12
    def request(self, url):
        return request(url, pool=self.pool, spoof_ua=True)
Example #13
def export_google_sheets_as_csv(url,
                                cookie=None,
                                authuser=None,
                                max_authuser_attempts=4):
    if is_url(url):
        parsed = parse_google_drive_url(url)

        if parsed is None or parsed.type != 'spreadsheets':
            raise GoogleSheetsInvalidTargetError
    else:
        parsed = GoogleDriveFile('spreadsheets', url)

    base_export_url = parsed.get_export_url()
    export_url = base_export_url

    if authuser is not None:
        if not isinstance(authuser, int) or authuser < 0:
            raise TypeError('authuser should be an int >= 0')

        export_url = append_authuser(export_url, authuser)
        max_authuser_attempts = 1
    else:
        authuser = 0

    if cookie is not None and cookie in COOKIE_BROWSERS:
        jar = getattr(browser_cookie3, cookie)()
        resolver = CookieResolver(jar)
        cookie = resolver(export_url)

        if cookie is None:
            raise GoogleSheetsMissingCookieError

    attempts = max_authuser_attempts

    while True:
        attempts -= 1

        err, response = request(export_url, cookie=cookie)

        if err:
            raise err

        if response.status == 404:
            raise GoogleSheetsNotFoundError

        if response.status == 401:
            raise GoogleSheetsUnauthorizedError

        if response.status == 403:
            authuser += 1

            if attempts != 0:
                export_url = append_authuser(base_export_url, authuser)
                continue

            raise GoogleSheetsMaxAttemptsExceeded

        if 'csv' not in response.headers.get('Content-Type', '').lower():
            raise GoogleSheetsInvalidContentTypeError

        break

    return response.data.decode('utf-8')
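
A minimal sketch of consuming the returned CSV text with the standard library; the spreadsheet URL is illustrative:

import csv
import io

csv_text = export_google_sheets_as_csv('https://docs.google.com/spreadsheets/d/SPREADSHEET_ID/edit')

for row in csv.DictReader(io.StringIO(csv_text)):
    print(row)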
Example #14
from minet.web import request, extract_response_meta, looks_like_html

err, response = request('https://news.ycombinator.com/')
# Content-Type is dropped here, presumably to force extract_response_meta
# to infer the mime type and encoding from the body itself
del response.headers['Content-Type']

print(response.status)
meta = extract_response_meta(response)
# print(response.data)
print(meta)
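
The common pattern running through all of these examples, as a standalone minimal sketch (the URL is illustrative):

from minet.web import request

err, response = request('https://example.com')

if err is not None:
    raise err

print(response.status, len(response.data))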