# Exemplo n.º 1 (Example no. 1)
# 0
def main(initial_url):
    """Mirror the site rooted at *initial_url* into a local directory.

    Downloads every allowed resource reachable from *initial_url* that
    lives under the same URL prefix, saves each one below a directory
    derived from the host name, and finally rewrites the references
    inside the saved HTML files so the mirror is browsable offline.
    """
    # List of 3-item tuples: (file_path, encoding, base_url) for every
    # saved HTML page whose references must be fixed afterwards.
    to_be_processed = []

    queue = DownloadQueue()

    init_url = myurlparse(initial_url)

    # Normalize "http://host" to "http://host/" so the prefix checks
    # below behave consistently.
    if init_url.path == "":
        initial_url += "/"
        init_url = myurlparse(initial_url)

    final_location = HTTPutils.getFinalUrl(init_url.geturl())

    if not final_location.startswith(initial_url):
        # BUG FIX: the original message contained a %s placeholder but
        # never supplied a value, and execution fell through after
        # "Aborting..." instead of actually aborting.
        main_logger.critical("The page you have given redirects to %s", final_location)
        main_logger.critical("Aborting...")
        return
    final_location = myurlparse(final_location)

    queue.append(final_location.getUrlWithoutFragments())

    download_dir = os.path.join(os.getcwd(), init_url.netloc).replace(".", "_")

    if not os.path.isdir(download_dir):
        os.mkdir(download_dir)

    def check_url(url, check_cache={}):
        """Return the canonical downloadable form of *url*, or None.

        Returns None when the url (or its redirect target) falls outside
        the download prefix.  *check_cache* is an intentional mutable
        default: it memoizes results across calls so every url is
        resolved over the network at most once.
        """
        try:
            return check_cache[url]
        except KeyError:
            if not url.startswith(initial_url):
                check_cache[url] = None
                return None
            final_location = HTTPutils.getFinalUrl(url)

            if not final_location.startswith(initial_url):
                check_cache[url] = None
                return None
            new_link = myurlparse(final_location).getUrlWithoutFragments()
            check_cache[url] = new_link
            return new_link

    # DownloadQueue grows while we iterate: pages parsed below push the
    # links they contain back onto the queue.
    for link in queue:

        link = myurlparse(link)

        if link.netloc != init_url.netloc:
            main_logger.info("Skipping link from other internet location: %s" % link.geturl())
            continue

        content = HTTPutils.getContentType(link.geturl())
        if not content:
            main_logger.warning("Couldn't get content type for %s, skipping" % link.geturl())
            continue

        if content == "text/html" and not link.geturl().startswith(initial_url):
            main_logger.info("Skipping %s, because not in download subdirectory." % link.geturl())
            continue

        if content not in allowed_downloads:
            main_logger.info("Skipping %s because it is not in allowed downloads." % link.geturl())
            continue

        try:
            url = urlopen(link.geturl(), timeout=5)

        except HTTPError as e:
            main_logger.warning("Server couldn't fullfill the request. [%s], Skipping" % e.code)
            continue

        except URLError as e:
            main_logger.warning("We failed to reach %s because %s" % (link.geturl(), e.reason))
            main_logger.warning("Skipping %s" % link.geturl())
            continue

        main_logger.info("Downloading %s" % link.geturl())
        response = url.read()
        url.close()
        file_path = os.path.join(download_dir, *link.path.split("/"))

        # Directory URLs are stored as .../index.html
        if link.path.endswith("/"):
            file_path = os.path.join(file_path, "index.html")

        if not os.path.isdir(os.path.dirname(file_path)):
            os.makedirs(os.path.dirname(file_path))

        # BUG FIX: the response is raw bytes; write in binary mode so
        # newline translation cannot corrupt binary downloads.
        with open(file_path, "wb") as output_file:
            output_file.write(response)

        if content == "text/html":
            main_logger.info("Parsing page for further links, could take a while.")

            link_collect = LinkCollector()
            encoding = HTTPutils.getEncoding(link.geturl())
            if not encoding:
                main_logger.debug("Couldn't get encoding in http headers for %s" % link.geturl())
                # If the http headers don't mention a charset, parse the
                # html itself for a <meta> charset declaration.
                a = encodingFinder()
                a.feed(response)
                encoding = a.encoding
            if not encoding:
                main_logger.debug("Set default encoding for %s" % link.geturl())
                encoding = "iso-8859-1"

            try:
                response_to_be_parsed = response.decode(encoding)
            except (LookupError, UnicodeDecodeError):
                main_logger.debug("Decoding failed for %s, feeding raw binary data" % link.geturl())
                response_to_be_parsed = response

            # BUG FIX: feed the already-decoded text instead of decoding
            # a second time with unicode(), which could raise an uncaught
            # UnicodeDecodeError the fallback above was meant to avoid.
            try:
                link_collect.feed(response_to_be_parsed)
            except HTMLParseError:
                main_logger.warning("HTML Parse error, couldn't get all the links.")

            for new_link in link_collect.links:
                new_link = check_url(urljoin(link.geturl(), new_link))
                if new_link:
                    queue.append(new_link)

            base_url = link.geturl()
            if base_url.endswith("/"):
                base_url += "index.html"
            to_be_processed.append((file_path, encoding, base_url))
            main_logger.info("Done parsing for links.")

    main_logger.info("Starting to fix references, this could take a while...")

    for file_path, encoding, url in to_be_processed:
        main_logger.info("processing %s" % file_path)
        # Read back the raw bytes we saved; they are decoded below with
        # the encoding recorded at download time.
        with open(file_path, "rb") as html_file:
            html_contents = html_file.read()

        a = HTMLReferenceFixer()
        a.setbaseurl(url)
        a.filepath = file_path

        # BUG FIX: the decode step could raise LookupError or
        # UnicodeDecodeError that the original left uncaught.
        try:
            a.feed(unicode(html_contents, encoding))
        except (LookupError, UnicodeDecodeError):
            main_logger.warning("Couldn't decode %s, skipping" % (file_path))
            continue
        except HTMLParseError:
            main_logger.warning("Couldn't parse %s, skipping" % (file_path))
            continue

        with open(file_path, "wb") as processed_file:
            processed_file.write(a.output.encode(encoding))
# Exemplo n.º 2 (Example no. 2)
# 0
def main(initial_url):
    """Mirror the site rooted at *initial_url* into a local directory.

    Downloads every allowed resource reachable from *initial_url* that
    lives under the same URL prefix, saves each one below a directory
    derived from the host name, and finally rewrites the references
    inside the saved HTML files so the mirror is browsable offline.
    """
    # List of 3-item tuples: (file_path, encoding, base_url) for every
    # saved HTML page whose references must be fixed afterwards.
    to_be_processed = []

    queue = DownloadQueue()

    init_url = myurlparse(initial_url)

    # Normalize "http://host" to "http://host/" so the prefix checks
    # below behave consistently.
    if init_url.path == "":
        initial_url += "/"
        init_url = myurlparse(initial_url)

    final_location = getFinalUrl(init_url.geturl())

    if not final_location.startswith(initial_url):
        # BUG FIX: the original printed the refusal but then carried on
        # downloading anyway; also fixed the "donwload" typo.
        sys.stderr.write("Your page redirects to unwanted url.")
        sys.stderr.write("I refuse to download!")
        return
    final_location = myurlparse(final_location)

    queue.append(final_location.getUrlWithoutFragments())

    download_dir = os.path.join(os.getcwd(), init_url.netloc).replace(".", "_")

    if not os.path.isdir(download_dir):
        os.mkdir(download_dir)

    def check_url(url, check_cache={}):
        """Return the canonical downloadable form of *url*, or None.

        Returns None when the url (or its redirect target) falls outside
        the download prefix.  *check_cache* is an intentional mutable
        default: it memoizes results across calls so every url is
        resolved over the network at most once.
        """
        try:
            return check_cache[url]
        except KeyError:
            if not url.startswith(initial_url):
                check_cache[url] = None
                return None
            final_location = getFinalUrl(url)

            if not final_location.startswith(initial_url):
                check_cache[url] = None
                return None
            new_link = myurlparse(final_location).getUrlWithoutFragments()
            check_cache[url] = new_link
            return new_link

    # DownloadQueue grows while we iterate: pages parsed below push the
    # links they contain back onto the queue.
    for link in queue:

        link = myurlparse(link)

        if link.netloc != init_url.netloc:
            sys.stderr.write("Skipping %s\n" % link.geturl())
            sys.stderr.write("Reason: Link from different location\n")
            continue

        content = getContentType(link.geturl())
        if not content:
            print("Failed to get content type from the server.")
            print("Skipping...")
            continue

        if content == "text/html" and not link.geturl().startswith(initial_url):
            sys.stderr.write("Skipping %s\n" % link.geturl())
            sys.stderr.write("Reason: Not inside range.\n")
            continue

        if content not in allowed_downloads:
            sys.stderr.write("Skipping %s\n" % link.geturl())
            sys.stderr.write("Reason: Not allowed download.\n")
            continue

        try:
            url = urlopen(link.geturl(), timeout=5)

        except HTTPError as e:
            print("The server couldn't fullfill the request.")
            print("Error Code: ", e.code)
            print("Skipping...")
            continue

        except URLError as e:
            print("We failed to reach the server.")
            print("Reason: ", e.reason)
            continue

        print("Downloading -- İndiriliyor: %s\n" % link.geturl())
        response = url.read()
        url.close()
        file_path = os.path.join(download_dir, *link.path.split("/"))

        # Directory URLs are stored as .../index.html
        if link.path.endswith("/"):
            file_path = os.path.join(file_path, "index.html")

        if not os.path.isdir(os.path.dirname(file_path)):
            os.makedirs(os.path.dirname(file_path))

        # The response is raw bytes, so write in binary mode.
        with open(file_path, "wb") as output_file:
            output_file.write(response)

        if content == "text/html":
            print("Searching and checking links, could take a while.")
            print("-------------------------------------------------")
            print("Linkler bulunup kontrol ediliyor, uzun sürebilir.")

            link_collect = LinkCollector()
            encoding = getEncoding(link.geturl())
            if not encoding:
                # If the http headers don't mention a charset, parse the
                # html itself for a <meta> charset declaration.
                # latin-1 maps every byte, so this decode cannot fail.
                a = encodingFinder()
                a.feed(response.decode("iso-8859-1"))
                encoding = a.encoding

            # If we still don't have any charset, we go with default.
            encoding = encoding or "iso-8859-1"

            # BUG FIX: on decode failure the original kept raw bytes and
            # then called str(response, encoding) anyway, which either
            # raised an uncaught error or handed bytes to feed().
            # Fall back to latin-1 with replacement, which always
            # produces text.
            try:
                response_to_be_parsed = response.decode(encoding)
            except (LookupError, UnicodeDecodeError):
                response_to_be_parsed = response.decode("iso-8859-1", errors="replace")

            try:
                link_collect.feed(response_to_be_parsed)
            except HTMLParseError:
                sys.stderr.write("HTML Parse error, couldn't get all the links.")

            for new_link in link_collect.links:
                new_link = check_url(urljoin(link.geturl(), new_link))
                if new_link:
                    queue.append(new_link)

            base_url = link.geturl()
            if base_url.endswith("/"):
                base_url += "index.html"
            to_be_processed.append((file_path, encoding, base_url))
            print("Done! -- Tamam!")

    print("Beginning to try to fix references, in some cases,")
    print("this could take a really long time.")
    print("--------------------------------------------------")
    print("Referansları düzeltme işlemi başlıyor, bu bazen")
    print("bayağı fazla zaman alabilir.")
    print("--------------------------------------------------")

    for file_path, encoding, url in to_be_processed:
        print(file_path, encoding, url)
        print(("Processing - İşleniyor: %s" % file_path))
        # BUG FIX: the file was saved as raw bytes in the page's own
        # encoding; reading/writing it with the platform default could
        # raise UnicodeDecodeError/UnicodeEncodeError or corrupt text.
        with open(file_path, "r", encoding=encoding) as html_file:
            html_contents = html_file.read()

        a = HTMLReferenceFixer()
        a.setbaseurl(url)
        a.filepath = file_path

        try:
            a.feed(html_contents)
        except HTMLParseError:
            sys.stderr.write("Couldn't parse html file, skipping...")
            continue

        with open(file_path, "w", encoding=encoding) as processed_file:
            processed_file.write(a.output)