Python HTTPutils.getFinalUrl примеры использования

Язык программирования: Python

Класс/Тип: HTTPutils

Метод/Функция: getFinalUrl

Примеров на hotexamples.com: 2

Python HTTPutils.getFinalUrl - 2 примера найдено. Это лучшие примеры Python кода для HTTPutils.getFinalUrl, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

getFinalUrl(2)

getContentType(1)

getEncoding(1)

Пример #1

Показать файл

Файл: yokluktaGider.py Проект: yasar11732/Belge--ndir

    def check_url(url, check_cache={}):
        """
        Decide whether ``url`` should be downloaded.

        Returns the URL to download: the final location reported by
        ``HTTPutils.getFinalUrl`` after ``getUrlWithoutFragments`` has been
        applied to it.  Returns None when ``url`` itself, or its final
        location, does not start with ``initial_url``.

        Every answer (including None) is stored in ``check_cache`` under
        the original ``url`` so that a repeated lookup is answered from the
        cache.
        """
        # NOTE: the mutable default appears to be intentional -- it serves
        # as the memo table that persists between calls.
        if url in check_cache:
            return check_cache[url]

        verdict = None
        if url.startswith(initial_url):
            resolved = HTTPutils.getFinalUrl(url)
            # The final location has to stay under the start URL as well.
            if resolved.startswith(initial_url):
                verdict = myurlparse(resolved).getUrlWithoutFragments()

        check_cache[url] = verdict
        return verdict

Пример #2

Показать файл

Файл: yokluktaGider.py Проект: yasar11732/Belge--ndir

def main(initial_url):
    """
    Download everything reachable under ``initial_url`` into a local folder.

    Starting from ``initial_url`` the function walks a ``DownloadQueue``,
    saves every resource whose content type is listed in
    ``allowed_downloads`` below a directory inside the current working
    directory named after the host (dots replaced by underscores), and
    collects further links from each HTML page.  After the queue loop has
    finished, every saved HTML page is fed through ``HTMLReferenceFixer``
    and the fixer's output is written back using the page's encoding.

    Parameters:
        initial_url -- URL to start from.  Only links that start with it
                       (after a trailing "/" is added to a path-less URL)
                       are queued for download.

    Returns None.  If the start URL redirects to a location that does not
    start with ``initial_url`` a critical message is logged and nothing is
    downloaded.
    """
    # List of 3-item tuples.
    # (file_path, encoding, base_url)
    to_be_processed = []

    queue = DownloadQueue()

    init_url = myurlparse(initial_url)

    # A URL without any path gets a trailing slash so that the prefix
    # comparisons below are made against "scheme://host/".
    if init_url.path == "":
        initial_url += "/"
        init_url = myurlparse(initial_url)

    final_location = HTTPutils.getFinalUrl(init_url.geturl())

    if not final_location.startswith(initial_url):
        main_logger.critical("The page you have given redirects to %s" % final_location)
        main_logger.critical("Aborting...")
        # Actually abort: nothing below makes sense for a foreign location.
        return
    final_location = myurlparse(final_location)

    queue.append(final_location.getUrlWithoutFragments())

    # Only the host name is sanitised; dots in the working directory path
    # itself must be left untouched.
    download_dir = os.path.join(os.getcwd(), init_url.netloc.replace(".", "_"))

    if not os.path.isdir(download_dir):
        os.mkdir(download_dir)

    def check_url(url, check_cache={}):
        """
        Decide whether ``url`` should be downloaded.

        Returns the fragment-less final location of ``url`` when both the
        URL and its final location start with ``initial_url``; otherwise
        returns None.  Results are memoised in ``check_cache``.
        """
        # NOTE: the mutable default appears to be intentional -- it is the
        # memo table shared by all calls made during this run of main().
        if url in check_cache:
            return check_cache[url]

        result = None
        if url.startswith(initial_url):
            final_location = HTTPutils.getFinalUrl(url)
            if final_location.startswith(initial_url):
                result = myurlparse(final_location).getUrlWithoutFragments()

        check_cache[url] = result
        return result

    # NOTE(review): the queue is appended to while it is being iterated;
    # presumably DownloadQueue is designed for that -- confirm.
    for link in queue:

        link = myurlparse(link)

        if link.netloc != init_url.netloc:
            main_logger.info("Skipping link from other internet location: %s" % link.geturl())
            continue

        content = HTTPutils.getContentType(link.geturl())
        if not content:
            main_logger.warning("Couldn\'t get content type for %s, skipping" % link.geturl())
            continue

        if content == "text/html" and not link.geturl().startswith(initial_url):
            main_logger.info("Skipping %s, because not in download subdirectory." % link.geturl())
            continue

        if content not in allowed_downloads:
            main_logger.info("Skipping %s because it is not in allowed downloads." % link.geturl())
            continue

        try:
            url = urlopen(link.geturl(), timeout=5)

        except HTTPError as e:
            main_logger.warning("Server couldn\'t fullfill the request. [%s], Skipping" % e.code)
            continue

        except URLError as e:
            main_logger.warning("We failed to reach %s because %s" % (link.geturl(), e.reason))
            main_logger.warning("Skipping %s" % link.geturl())
            continue

        main_logger.info("Downloading %s" % link.geturl())
        try:
            response = url.read()
        finally:
            # Release the connection even when reading fails.
            url.close()
        file_path = os.path.join(download_dir, *link.path.split("/"))

        # handle directories.
        if link.path.endswith("/"):
            file_path = os.path.join(file_path, "index.html")

        if not os.path.isdir(os.path.dirname(file_path)):
            os.makedirs(os.path.dirname(file_path))

        # The payload is raw bytes (it may be an image or another binary
        # resource), so it must be written in binary mode.
        with open(file_path, "wb") as output_file:
            output_file.write(response)

        if content == "text/html":
            main_logger.info("Parsing page for further links, could take a while.")

            link_collect = LinkCollector()
            encoding = HTTPutils.getEncoding(link.geturl())
            if not encoding:
                main_logger.debug("Couldn\'t get encoding in http headers for %s" % link.geturl())
                # If http headers doesn't mention charset,
                # we parse html file to see meta headers
                a = encodingFinder()
                a.feed(response)
                encoding = a.encoding
            if not encoding:
                main_logger.debug("Set default encoding for %s" % link.geturl())
                encoding = "iso-8859-1"

            try:
                response_to_be_parsed = response.decode(encoding)
            except (LookupError, UnicodeDecodeError):
                main_logger.debug("Decoding failed for %s, feeding raw binary data" % link.geturl())
                response_to_be_parsed = response

            # Feed the text prepared above so that the raw-data fallback
            # is really used when decoding was not possible.
            try:
                link_collect.feed(response_to_be_parsed)
            except HTMLParseError:
                main_logger.warning("HTML Parse error, could't get all the links.")

            for new_link in link_collect.links:
                new_link = check_url(urljoin(link.geturl(), new_link))
                if new_link:
                    queue.append(new_link)

            # Directory URLs were stored as ".../index.html" above, so the
            # base URL recorded for the fix-up phase gets the same suffix.
            base_url = link.geturl()
            if base_url.endswith("/"):
                base_url += "index.html"
            to_be_processed.append((file_path, encoding, base_url))
            main_logger.info("Done parsing for links.")

    main_logger.info("Starting to fix references, this could take a while...")

    for file_path, encoding, url in to_be_processed:
        main_logger.info("processing %s" % file_path)
        # Binary mode: the bytes are decoded explicitly with ``encoding``.
        with open(file_path, "rb") as html_file:
            html_contents = html_file.read()

        a = HTMLReferenceFixer()
        a.setbaseurl(url)
        a.filepath = file_path

        try:
            a.feed(unicode(html_contents, encoding))
        except (HTMLParseError, LookupError, UnicodeDecodeError):
            # A page that cannot be decoded or parsed is left as
            # downloaded instead of aborting the whole fix-up phase.
            main_logger.warning("Couldn\'t parse %s, skipping" % (file_path))
            continue

        with open(file_path, "wb") as processed_file:
            processed_file.write(a.output.encode(encoding))