Python LinkCollector примеры использования

Язык программирования: Python

Пространство имен/Пакет: parsers

Класс/Тип: LinkCollector

Примеров на hotexamples.com: 2

Python LinkCollector - 2 примера найдено. Это лучшие примеры Python кода для parsers.LinkCollector, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

feed(2)

Пример #1

Показать файл

Файл: yokluktaGider.py Проект: yasar11732/Belge--ndir

def main(initial_url):

    # List of 3-item tuples.
    # (file_path, encoding, base_url)
    # (dosya_yolu, kodlama, temel_url)
    to_be_processed = []
        
    queue = DownloadQueue()
    
    init_url = myurlparse(initial_url)

    if init_url.path  == "":
        initial_url += "/"
        init_url = myurlparse(initial_url)

    final_location = HTTPutils.getFinalUrl(init_url.geturl())

    if not final_location.startswith(initial_url):
        main_logger.critical("The page you have given redirects to %s")
        main_logger.critical("Aborting...")
    final_location = myurlparse(final_location)
    
    queue.append(final_location.getUrlWithoutFragments())
    
    download_dir = os.path.join(os.getcwd(), init_url.netloc).replace(".", "_")


    if not os.path.isdir(download_dir):
        os.mkdir(download_dir)

    def check_url(url,check_cache = {}):
        """
        Verilen url indirelecek mi diye bakar. Eğer, indirelecekse,
        gereken düzenlemeler de yapılmış olarak url döndürür, eğer
        indirilmeyecekse, None döndürür.
        ------------------------------------------------------------------
        Checks to see if url is ok for download
        """
        try:
            return check_cache[url]
        except KeyError:
            if not url.startswith(initial_url):
                check_cache[url] = None
                return None
            final_location = HTTPutils.getFinalUrl(url)

            if not final_location.startswith(initial_url):
                check_cache[url] = None
                return None
            new_link = myurlparse(final_location).getUrlWithoutFragments()
            check_cache[url] = new_link
            return new_link

    for link in queue:
        
        link = myurlparse(link)
        
        if link.netloc != init_url.netloc:
            main_logger.info("Skipping link from other internet location: %s" % link.geturl())
            continue
        
        content = HTTPutils.getContentType(link.geturl())
        if not content:
            main_logger.warning("Couldn\'t get content type for %s, skipping" % link.geturl())
            continue
        
        if content == "text/html" and not link.geturl().startswith(initial_url):
            main_logger.info("Skipping %s, because not in download subdirectory." % link.geturl())
            continue

        if content not in allowed_downloads:
            main_logger.info("Skipping %s because it is not in allowed downloads." % link.geturl())
            continue
        
        try:
            url = urlopen(link.geturl(), timeout=5)

        except HTTPError as e:
            main_logger.warning("Server couldn\'t fullfill the request. [%s], Skipping" % e.code)
            continue

        except URLError as e:
            main_logger.warning("We failed to reach %s because %s" % (link.geturl(), e.reason))
            main_logger.warning("Skipping %s" % link.geturl())
            continue
        
        main_logger.info("Downloading %s" % link.geturl())
        response = url.read()
        url.close()
        file_path = os.path.join(download_dir,*link.path.split("/"))

        #handle directories.
        if link.path.endswith("/"):
            file_path = os.path.join(file_path, "index.html")

        if not os.path.isdir(os.path.dirname(file_path)):
            os.makedirs(os.path.dirname(file_path))

        with open(file_path, "w") as output_file:
            output_file.write(response)
            
        if content == "text/html":
            main_logger.info("Parsing page for further links, could take a while.")

            link_collect = LinkCollector()
            encoding = HTTPutils.getEncoding(link.geturl())
            if not encoding:
                main_logger.debug("Couldn\'t get encoding in http headers for %s" % link.geturl())
                # If http headers doesn't mention charset,
                # we parse html file to see meta headers
                a = encodingFinder()
                a.feed(response)
                encoding = a.encoding
            if not encoding:
                main_logger.debug("Set default encoding for %s" % link.geturl())
                encoding = "iso-8859-1"

            try:
                response_to_be_parsed = response.decode(encoding)
            except (LookupError, UnicodeDecodeError):
                main_logger.debug("Decoding failed for %s, feeding raw binary data" % link.geturl())
                response_to_be_parsed = response

            try:
                link_collect.feed(unicode(response, encoding))
            except HTMLParseError:
                main_logger.warning("HTML Parse error, could't get all the links.")

            for new_link in link_collect.links:
                new_link = check_url(urljoin(link.geturl(), new_link))
                if new_link:
                    queue.append(new_link)

            base_url = link.geturl()
            if base_url.endswith("/"):
                base_url += "index.html"
            to_be_processed.append((file_path, encoding, base_url))
            main_logger.info("Done parsing for links.")

    main_logger.info("Starting to fix references, this could take a while...")

    for file_path, encoding, url in to_be_processed:
        main_logger.info("processing %s" % file_path)
        with open(file_path, "r") as html_file:
            html_contents = html_file.read()

        a = HTMLReferenceFixer()
        a.setbaseurl(url)
        a.filepath = file_path

        try:
            a.feed(unicode(html_contents, encoding))
        except HTMLParseError:
            main_logger.warning("Couldn\'t parse %s, skipping" % (file_path))
            continue

        with open(file_path, "w") as processed_file:
            processed_file.write(a.output.encode(encoding))

Пример #2

Показать файл

Файл: yokluktaGider.py Проект: yasar11732/Belge--ndir

def main(initial_url):

    # List of 3-item tuples.
    # (file_path, encoding, base_url)
    # (dosya_yolu, kodlama, temel_url)
    to_be_processed = []
        
    queue = DownloadQueue()
    
    init_url = myurlparse(initial_url)

    if init_url.path  == "":
        initial_url += "/"
        init_url = myurlparse(initial_url)

    final_location = getFinalUrl(init_url.geturl())

    if not final_location.startswith(initial_url):
        sys.stderr.write("Your page redirects to unwanted url.")
        sys.stderr.write("I refuse to donwload!")
    final_location = myurlparse(final_location)
    
    queue.append(final_location.getUrlWithoutFragments())
    
    download_dir = os.path.join(os.getcwd(), init_url.netloc).replace(".", "_")


    if not os.path.isdir(download_dir):
        os.mkdir(download_dir)

    def check_url(url,check_cache = {}):
        """
        Verilen url indirelecek mi diye bakar. Eğer, indirelecekse,
        gereken düzenlemeler de yapılmış olarak url döndürür, eğer
        indirilmeyecekse, None döndürür.
        ------------------------------------------------------------------
        Checks to see if url is ok for download
        """
        try:
            return check_cache[url]
        except KeyError:
            if not url.startswith(initial_url):
                check_cache[url] = None
                return None
            final_location = getFinalUrl(url)

            if not final_location.startswith(initial_url):
                check_cache[url] = None
                return None
            new_link = myurlparse(final_location).getUrlWithoutFragments()
            check_cache[url] = new_link
            return new_link

    for link in queue:
        
        link = myurlparse(link)
        
        if link.netloc != init_url.netloc:
            sys.stderr.write("Skipping %s\n" % link.geturl())
            sys.stderr.write("Reason: Link from different location\n")
            continue
        
        content = getContentType(link.geturl())
        if not content:
            print("Failed to get content type from the server.")
            print("Skipping...")
            continue
        
        if content == "text/html" and not link.geturl().startswith(initial_url):
            sys.stderr.write("Skipping %s\n" % link.geturl())
            sys.stderr.write("Reason: Not inside range.\n")
            continue

        if content not in allowed_downloads:
            sys.stderr.write("Skipping %s\n" % link.geturl())
            sys.stderr.write("Reason: Not allowed download.\n")
            continue
        
        try:
            url = urlopen(link.geturl(), timeout=5)

        except HTTPError as e:
            print("The server couldn\'t fullfill the request.")
            print("Error Code: ", e.code)
            print("Skipping...")
            continue

        except URLError as e:
            print("We failed to reach the server.")
            print("Reason: ", e.reason)
            continue
        
        print("Downloading -- İndiriliyor: %s\n" % link.geturl())
        response = url.read()
        url.close()
        file_path = os.path.join(download_dir,*link.path.split("/"))

        #handle directories.
        if link.path.endswith("/"):
            file_path = os.path.join(file_path, "index.html")

        if not os.path.isdir(os.path.dirname(file_path)):
            os.makedirs(os.path.dirname(file_path))

        with open(file_path, "wb") as output_file:
            output_file.write(response)
            
        if content == "text/html":
            print("Searching and checking links, could take a while.")
            print("-------------------------------------------------")
            print("Linkler bulunup kontrol ediliyor, uzun sürebilir.")

            link_collect = LinkCollector()
            encoding = getEncoding(link.geturl())
            if not encoding:
                # If http headers doesn't mention charset,
                # we parse html file to see meta headers
                a = encodingFinder()
                a.feed(response.decode("iso-8859-1"))
                encoding = a.encoding

            # If we still don't have any charset, we go with default.
            encoding = encoding or "iso-8859-1"

            try:
                response_to_be_parsed = response.decode(encoding)
            except (LookupError, UnicodeDecodeError):
                response_to_be_parsed = response

            try:
                link_collect.feed(str(response, encoding))
            except HTMLParseError:
                sys.stderr.write("HTML Parse error, could't get all the links.")

            for new_link in link_collect.links:
                new_link = check_url(urljoin(link.geturl(), new_link))
                if new_link:
                    queue.append(new_link)

            base_url = link.geturl()
            if base_url.endswith("/"):
                base_url += "index.html"
            to_be_processed.append((file_path, encoding, base_url))
            print("Done! -- Tamam!")

    print("Beginning to try to fix references, in some cases,")
    print("this could a really long time.")
    print("--------------------------------------------------")
    print("Referansları düzeltme işlemi başlıyor, bu bazen")
    print("bayağı fazla zaman alabilir.")
    print("--------------------------------------------------")

    for file_path, encoding, url in to_be_processed:
        print(file_path, encoding, url)
        print(("Processing - İşleniyor: %s" % file_path))
        with open(file_path, "r") as html_file:
            html_contents = html_file.read()

        a = HTMLReferenceFixer()
        a.setbaseurl(url)
        a.filepath = file_path

        try:
            a.feed(html_contents)
        except HTMLParseError:
            sys.stderr.write("Couldn\'t parse html file, skipping...")
            continue

        with open(file_path, "w") as processed_file:
            processed_file.write(a.output)