Example #1
    async def _request(self, method, url, headers, data, verify, stream):
        # Note: when using aiobotocore with DynamoDB, requests fail on crc32
        # checksum validation as soon as the response data reaches ~5KB.
        # When the AWS response is gzip compressed:
        # 1. aiohttp automatically decompresses the data
        #    (http://aiohttp.readthedocs.io/en/stable/client.html#binary-response-content)
        # 2. botocore computes crc32 on the decompressed bytes and fails,
        #    because AWS computed the crc32 over the compressed data.
        # The following line forces AWS not to use gzip compression.
        # If there were a way to configure aiohttp not to perform the
        # decompression, we could remove this line and take advantage of
        # AWS gzip compression.
        # https://github.com/boto/botocore/issues/1255
        headers['Accept-Encoding'] = 'identity'
        headers_ = MultiDict(
            (z[0], text_(z[1], encoding='utf-8')) for z in headers.items())

        # botocore does this during the request so we do this here as well
        proxy = self.proxies.get(urlparse(url.lower()).scheme)

        if isinstance(data, io.IOBase):
            # Wrap file-like objects so that aiohttp's close() becomes a
            # no-op and the caller's stream stays open.
            data = _IOBaseWrapper(data)

        url = URL(url, encoded=True)
        resp = await self._aio_session.request(method,
                                               url=url,
                                               headers=headers_,
                                               data=data,
                                               proxy=proxy,
                                               verify_ssl=verify)

        # If we're not streaming, read the content so we can retry any timeout
        #  errors, see:
        # https://github.com/boto/botocore/blob/develop/botocore/vendored/requests/sessions.py#L604
        if not stream:
            await resp.read()

        return resp
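
The workaround above hints at the cleaner fix it wishes for: newer aiohttp releases accept an auto_decompress flag on ClientSession, which leaves compressed bodies untouched. A minimal sketch, assuming aiohttp >= 2.3 (the fetch_raw helper is illustrative, not part of the example above):

import aiohttp

async def fetch_raw(url):
    # auto_decompress=False keeps the body exactly as it came off the wire,
    # so a crc32 over these bytes matches one computed on the compressed data.
    async with aiohttp.ClientSession(auto_decompress=False) as session:
        async with session.get(url) as resp:
            return await resp.read()  # still gzip-compressed if the server compressed it
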
Example #2
File: endpoint.py Project: atapp/OCR
def _aiohttp_do_redirect(session, method, url, headers, data, resp):
    # This is the redirect code from aiohttp, remove once
    # https://github.com/aio-libs/aiobotocore/issues/267 is supported

    # For 301 and 302, mimic IE behaviour (now permitted by the RFC):
    # https://github.com/kennethreitz/requests/pull/269

    if (resp.status == 303 and
            resp.method != hdrs.METH_HEAD) \
            or (resp.status in (301, 302) and
                resp.method == hdrs.METH_POST):
        method = hdrs.METH_GET
        data = None
        if headers.get(hdrs.CONTENT_LENGTH):
            headers.pop(hdrs.CONTENT_LENGTH)

    r_url = (resp.headers.get(hdrs.LOCATION) or
             resp.headers.get(hdrs.URI))
    if r_url is None:
        return None

    r_url = URL(
        r_url, encoded=not session.requote_redirect_url)

    scheme = r_url.scheme
    if scheme not in ('http', 'https', ''):
        resp.close()
        raise ValueError(
            'Can redirect only to http or https')
    elif not scheme:
        r_url = url.join(r_url)

    url = r_url
    params = None
    resp.release()

    return method, url, headers, params, data
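
A hedged usage sketch for the helper above: a manual redirect loop that issues each request with allow_redirects=False and feeds the response through _aiohttp_do_redirect until it reports no further hop (follow_redirects and max_redirects are illustrative names, not part of the project):

from yarl import URL

async def follow_redirects(session, method, url, headers=None,
                           data=None, max_redirects=10):
    headers = headers or {}
    params = None
    url = URL(url)  # the helper joins relative redirects, so it needs a URL
    for _ in range(max_redirects):
        resp = await session.request(method, url, headers=headers,
                                     data=data, params=params,
                                     allow_redirects=False)
        if resp.status not in (301, 302, 303, 307, 308):
            return resp  # final response; the caller releases it
        redirect = _aiohttp_do_redirect(session, method, url,
                                        headers, data, resp)
        if redirect is None:  # no Location/URI header: treat as final
            return resp
        method, url, headers, params, data = redirect
    raise RuntimeError("Too many redirects")
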
Example #3
async def download_if_not_exist(session,
                                path,
                                url,
                                site_settings,
                                cancellable_pool,
                                with_extension=True,
                                session_kwargs=None,
                                allowed_extensions=None,
                                forbidden_extensions=None,
                                checksum=None,
                                signal_handler=None,
                                unique_key=None):
    if session_kwargs is None:
        session_kwargs = {}

    if allowed_extensions is None:
        allowed_extensions = []

    if forbidden_extensions is None:
        forbidden_extensions = []

    allowed_extensions += site_settings.allowed_extensions
    forbidden_extensions += site_settings.forbidden_extensions

    if isinstance(url, str):
        url = URL(url)

    domain = url.host

    timeout = aiohttp.ClientTimeout(total=0)
    if os.path.isabs(path):
        raise ValueError("Absolutes paths are not allowed")

    absolute_path = os.path.join(site_settings.base_path, path)

    if not with_extension:
        guess_extension = await cache.check_extension(
            session, str(url), session_kwargs=session_kwargs)
        if guess_extension is None:
            logger.warning(f"Could not retrieve the extension for {url}")
            return

        absolute_path += "." + guess_extension

    force = False
    if checksum is not None:
        force = not cache.is_checksum_same(absolute_path, checksum)
    elif site_settings.force_download and domain not in FORCE_DOWNLOAD_BLACKLIST:
        force = True

    if os.path.exists(absolute_path) and not force:
        return

    if os.path.exists(absolute_path):
        headers = session_kwargs.get("headers", {})
        etag = cache.get_etag(absolute_path)
        if etag is not None:
            headers["If-None-Match"] = etag
        if headers:
            session_kwargs["headers"] = headers

    if os.path.exists(absolute_path):
        action = ACTION_REPLACE
    else:
        action = ACTION_NEW

    file_name = os.path.basename(absolute_path)
    file_extension = get_extension(file_name)
    if is_extension_forbidden(extension=file_extension,
                              forbidden_extensions=forbidden_extensions,
                              allowed_extensions=allowed_extensions):
        return

    async with session.get(url, timeout=timeout, **session_kwargs) as response:
        response.raise_for_status()
        response_headers = response.headers

        if response.status == 304:
            logger.debug(f"File '{absolute_path}' not modified")
            cache.save_checksum(absolute_path, checksum)
            return

        if file_extension and file_extension.lower() in MOVIE_EXTENSIONS:
            logger.info(f"Starting to download {file_name}")

        pathlib.Path(os.path.dirname(absolute_path)).mkdir(parents=True,
                                                           exist_ok=True)

        if action == ACTION_REPLACE and site_settings.keep_replaced_files:
            dir_path = os.path.dirname(absolute_path)
            pure_name, extension = split_name_extension(file_name)
            old_file_name = f"{pure_name}-old.{extension}"
            old_absolute_path = os.path.join(dir_path, old_file_name)
            os.replace(absolute_path, old_absolute_path)

        try:
            with open(absolute_path, 'wb') as f:
                while True:
                    chunk = await response.content.read(8192)
                    if not chunk:
                        break
                    f.write(chunk)
        except BaseException as e:
            os.remove(absolute_path)
            logger.debug(f"Removed file {absolute_path}")
            raise e

    if site_settings.highlight_difference and \
            action == ACTION_REPLACE and \
            site_settings.keep_replaced_files and \
            file_extension and \
            file_extension.lower() == "pdf":
        logger.debug(f"Adding highlights to {absolute_path}")

        temp_file_name = f"{pure_name}-temp.{extension}"
        temp_absolute_path = os.path.join(dir_path, temp_file_name)

        future = cancellable_pool.apply(
            functools.partial(pdf_highlighter.add_differ_highlight,
                              new_path=absolute_path,
                              old_path=old_absolute_path,
                              out_path=temp_absolute_path))
        try:
            await future
            os.replace(temp_absolute_path, old_absolute_path)
        except asyncio.CancelledError as e:
            os.replace(old_absolute_path, absolute_path)
            logger.debug(f"Reverted old file {absolute_path}")
            raise e
        except Exception as e:
            logger.warning(
                f"Could not add pdf highlight to {absolute_path}. {type(e).__name__}: {e}"
            )
            signal_handler.got_warning(
                unique_key,
                f"Could not add pdf highlight to {absolute_path}. {type(e).__name__}: {e}"
            )
        finally:
            if os.path.exists(temp_absolute_path):
                os.remove(temp_absolute_path)
                logger.debug(f"Removed temp file {temp_absolute_path}")

    if "ETag" in response_headers:
        cache.save_etag(absolute_path, response.headers["ETag"])
    elif domain not in FORCE_DOWNLOAD_BLACKLIST:
        logger.warning(
            f"url: {url} had not an etag and is not in the blacklist")

    cache.save_checksum(absolute_path, checksum)

    if action == ACTION_REPLACE:
        if site_settings.keep_replaced_files and os.path.exists(
                old_absolute_path):
            signal_handler.replaced_file(unique_key, absolute_path,
                                         old_absolute_path)
        else:
            signal_handler.replaced_file(unique_key, absolute_path)
        method_msg = "Replaced"
    elif action == ACTION_NEW:
        signal_handler.added_new_file(unique_key, absolute_path)
        method_msg = "Added new"
    else:
        method_msg = "Unexpected action"

    start = {
        "name": f"{method_msg} file: '{{}}'",
        "var": file_name,
        "priority": 100,
        "cut": "back",
    }

    end = {
        "name": " in '{}'",
        "var": os.path.dirname(absolute_path),
        "priority": -100,
        "cut": "front",
    }

    logger.info(fit_sections_to_console(start, end, margin=1))
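
Both versions of download_if_not_exist lean on HTTP conditional requests: a stored ETag is replayed via If-None-Match, and a 304 Not Modified response lets the client skip the body entirely. A minimal self-contained sketch of that pattern (the .etag sidecar file stands in for the cache module used above and is purely illustrative):

import os
import aiohttp

async def conditional_get(session, url, path):
    # Replay the validator from the last successful download, if any.
    headers = {}
    if os.path.exists(path + ".etag"):
        with open(path + ".etag") as f:
            headers["If-None-Match"] = f.read().strip()

    async with session.get(url, headers=headers) as resp:
        if resp.status == 304:
            return False  # server confirms our copy is current

        resp.raise_for_status()
        with open(path, "wb") as f:
            f.write(await resp.read())

        # Persist the new validator for the next run.
        if "ETag" in resp.headers:
            with open(path + ".etag", "w") as f:
                f.write(resp.headers["ETag"])
        return True
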
Example #4
async def download_if_not_exist(session,
                                path,
                                url,
                                download_settings,
                                cancellable_pool,
                                with_extension=True,
                                session_kwargs=None,
                                allowed_extensions=None,
                                forbidden_extensions=None,
                                checksum=None,
                                signal_handler=None,
                                unique_key=None):
    if session_kwargs is None:
        session_kwargs = {}

    if allowed_extensions is None:
        allowed_extensions = []

    if forbidden_extensions is None:
        forbidden_extensions = []

    if download_settings.allowed_extensions is not None:
        allowed_extensions += download_settings.allowed_extensions

    if download_settings.forbidden_extensions is not None:
        forbidden_extensions += download_settings.forbidden_extensions

    if isinstance(url, str):
        url = URL(url)

    domain = url.host

    if os.path.isabs(path):
        raise ValueError("Absolutes paths are not allowed")

    absolute_path = os.path.join(download_settings.save_path, path)

    if not with_extension:
        guess_extension = await cache.check_extension(
            session, str(url), session_kwargs=session_kwargs)
        if guess_extension is None:
            logger.warning(f"Could not retrieve the extension for {url}")
            return

        absolute_path += "." + guess_extension

    file_name = os.path.basename(absolute_path)
    dir_path = os.path.dirname(absolute_path)
    file_extension = core.utils.get_extension(file_name)

    temp_file_name = core.utils.add_extension(f"{random.getrandbits(64)}",
                                              file_extension)
    temp_absolute_path = os.path.join(core.utils.get_temp_path(),
                                      temp_file_name)

    old_file_name = core.utils.insert_text_before_extension(file_name, "-old")
    old_absolute_path = os.path.join(dir_path, old_file_name)

    diff_file_name = core.utils.insert_text_before_extension(
        file_name, "-diff")
    diff_absolute_path = os.path.join(dir_path, diff_file_name)

    force = False
    if checksum is not None:
        force = not cache.is_checksum_same(absolute_path, checksum)
    elif download_settings.force_download and domain not in FORCE_DOWNLOAD_BLACKLIST:
        force = True

    if os.path.exists(absolute_path) and not force:
        return

    if os.path.exists(absolute_path):
        headers = session_kwargs.get("headers", {})
        etag = cache.get_etag(absolute_path)
        if etag is not None:
            headers["If-None-Match"] = etag
        if headers:
            session_kwargs["headers"] = headers

    if os.path.exists(absolute_path):
        action = ACTION_REPLACE
    else:
        action = ACTION_NEW

    if is_extension_forbidden(extension=file_extension,
                              forbidden_extensions=forbidden_extensions,
                              allowed_extensions=allowed_extensions):
        return

    try:
        async with session.get(url,
                               timeout=aiohttp.ClientTimeout(total=0),
                               **session_kwargs) as response:
            response.raise_for_status()
            response_headers = response.headers

            if response.status == 304:
                logger.debug(f"File '{absolute_path}' not modified")
                cache.save_checksum(absolute_path, checksum)
                return

            if file_extension and file_extension.lower() in MOVIE_EXTENSIONS:
                logger.info(f"Starting to download {file_name}")

            pathlib.Path(os.path.dirname(absolute_path)).mkdir(parents=True,
                                                               exist_ok=True)

            if action == ACTION_REPLACE:
                shutil.move(absolute_path, temp_absolute_path)

            file_hash = hashlib.md5()
            try:
                with open(absolute_path, 'wb') as f:
                    while True:
                        chunk = await response.content.read(8192)
                        if not chunk:
                            break
                        f.write(chunk)
                        file_hash.update(chunk)
            except BaseException as e:
                os.remove(absolute_path)
                logger.debug(f"Removed file {absolute_path}")
                if action == ACTION_REPLACE:
                    logger.debug(
                        f"Reverting temp file to new file: {absolute_path}")
                    shutil.move(temp_absolute_path, absolute_path)
                raise e

        if action == ACTION_REPLACE and cache.is_own_checksum_same(
                absolute_path, file_hash.hexdigest()):
            logger.debug(
                f"own_checksum is same for {url}. Skipping processing")
            if "ETag" in response_headers:
                cache.save_etag(absolute_path, response.headers["ETag"])
            elif domain not in FORCE_DOWNLOAD_BLACKLIST:
                logger.warning(
                    f"url: {url} had not an etag and is not in the blacklist")
            cache.save_checksum(absolute_path, checksum)
            return

        if download_settings.highlight_difference and \
                action == ACTION_REPLACE and \
                file_extension and \
                file_extension.lower() == "pdf":
            await _add_pdf_highlights(download_settings=download_settings,
                                      cancellable_pool=cancellable_pool,
                                      signal_handler=signal_handler,
                                      unique_key=unique_key,
                                      absolute_path=absolute_path,
                                      old_absolute_path=temp_absolute_path,
                                      out_path=diff_absolute_path)

        if action == ACTION_REPLACE and download_settings.keep_replaced_files:
            shutil.move(temp_absolute_path, old_absolute_path)

        cache.save_own_checksum(absolute_path, file_hash.hexdigest())

        if "ETag" in response_headers:
            cache.save_etag(absolute_path, response.headers["ETag"])
        elif domain not in FORCE_DOWNLOAD_BLACKLIST:
            logger.warning(
                f"url: {url} had not an etag and is not in the blacklist")

        cache.save_checksum(absolute_path, checksum)

        if action == ACTION_REPLACE:
            signal_old_path, signal_diff_path = None, None
            if (download_settings.keep_replaced_files
                    and os.path.exists(old_absolute_path)):
                signal_old_path = old_absolute_path
            if (download_settings.highlight_difference
                    and os.path.exists(diff_absolute_path)):
                signal_diff_path = diff_absolute_path

            signal_handler.replaced_file(unique_key, absolute_path,
                                         signal_old_path, signal_diff_path)
        elif action == ACTION_NEW:
            signal_handler.added_new_file(unique_key, absolute_path)

        if action == ACTION_REPLACE:
            method_msg = "Replaced"
        elif action == ACTION_NEW:
            method_msg = "Added new"
        else:
            method_msg = "Unexpected action"

        start = {
            "name": f"{method_msg} file: '{{}}'",
            "var": file_name,
            "priority": 100,
            "cut": "back",
        }

        end = {
            "name": " in '{}'",
            "var": os.path.dirname(absolute_path),
            "priority": -100,
            "cut": "front",
        }

        logger.info(core.utils.fit_sections_to_console(start, end, margin=1))

    finally:
        if os.path.exists(temp_absolute_path):
            os.remove(temp_absolute_path)
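
Compared with Example #3, this version adds two refinements: the existing file is parked in a temp location so a failed download can be rolled back, and an MD5 is accumulated chunk by chunk while streaming, so unchanged content is detected without re-reading the file from disk. A minimal sketch of that combination (stream_to_file_with_hash is an illustrative name, not the project's API):

import hashlib
import os
import shutil

async def stream_to_file_with_hash(response, path, temp_path):
    # Park any existing file so it can be restored if the download fails.
    had_old = os.path.exists(path)
    if had_old:
        shutil.move(path, temp_path)

    file_hash = hashlib.md5()
    try:
        with open(path, "wb") as f:
            while True:
                chunk = await response.content.read(8192)
                if not chunk:
                    break
                f.write(chunk)
                file_hash.update(chunk)  # hash while writing: no second pass
    except BaseException:
        os.remove(path)
        if had_old:
            shutil.move(temp_path, path)  # roll back to the previous version
        raise
    return file_hash.hexdigest()
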
Example #5
async def download(session,
                   queue,
                   base_path,
                   url,
                   password=None,
                   file_name=None):
    domain = re.match(r"https?://([^.]*\.?)zoom\.us", url).group(1)

    agent_header = {
        "referer": f"https://{domain}zoom.us/",
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/74.0.3729.169 "
                       "Safari/537.36")
    }

    async with session.get(url, headers=agent_header) as response:
        html = await response.text()

    if password is not None:
        # Pull the hidden meetId value out of the page's <input> tags.
        meet_id = None
        meet_id_regex = re.compile("<input[^>]*")
        for inp in meet_id_regex.findall(html):
            input_split = inp.split()
            if len(input_split) > 3 and input_split[2] == 'id="meetId"':
                meet_id = input_split[3][7:-1]  # strip 'value="' and the trailing '"'
                break
        if meet_id is None:
            raise LoginError(f"Could not find meetId on {url}")

        data = {
            "id": meet_id,
            "passwd": password,
            "action": "viewdetailpage",
            "recaptcha": ""
        }

        check_url = f"https://{domain}zoom.us/rec/validate_meet_passwd"
        async with session.post(check_url, data=data,
                                headers=agent_header) as response:
            pass  # the response body is unused; the POST just unlocks the recording for this session

        async with session.get(url, headers=agent_header) as response:
            html = await response.text()

    metadata = _get_page_meta(html, ("viewMp4Url", "topic"))
    if metadata is None:
        logger.warning(f"Zoom url: {url} has no video")
        return None

    vid_url = metadata.get("viewMp4Url", None)
    if vid_url is None:
        raise LoginError("Could not Login")
    extension = get_extension(vid_url.split("?")[0].split("/")[-1])
    name = file_name or metadata.get("topic")

    # We need to disable decoding of the URL, because Zoom is not RFC-compliant (btw f**k zoom).
    await queue.put({
        "url": URL(vid_url, encoded=True),
        "path": safe_path_join(base_path, add_extension(name, extension)),
        "session_kwargs": dict(headers=agent_header),
    })
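
The _get_page_meta helper this example calls is not shown. Zoom's recording page embeds its metadata as key: "value" pairs in inline JavaScript, so a plausible sketch of such a helper looks like the following (this is an assumption about the page format, not the project's actual implementation):

import re

def _get_page_meta(html, keys):
    # Assumed page format: inline JS such as
    #   viewMp4Url: "https://...", topic: "Lecture 1", ...
    # Returns a dict with whatever keys matched, or None if none did.
    meta = {}
    for key in keys:
        match = re.search(rf'{re.escape(key)}\s*:\s*"([^"]*)"', html)
        if match:
            meta[key] = match.group(1)
    return meta or None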