    def fixlink(self, link):
        """
        The method that does the real work of this class: every reference to
        be processed is passed here.  It works out where the file pointed to
        by ``link`` should live on the local file system.  If the file really
        exists there, a relative link from the object's baseurl to that file
        is returned; otherwise the file's full url on the internet is
        returned, so broken links are avoided.  Used for correcting
        references in anchors, scripts, styles and images.  HTTP redirections
        cannot fool us, since we are checking them! :)
        """
        linked_target = urljoin(self.baseurl.geturl(), link)
        try:
            real_target = myurlparse(getFinalUrl(linked_target))
        except RuntimeError:
            stderr.write("Failed to get final url for %s\n" % linked_target)
            return linked_target
        expected_target = real_target
        if real_target.path.endswith("/"):
            # Directory urls are stored locally as their index file.
            expected_target = myurlparse(urljoin(real_target.geturl(),
                                                 "./index.html"))
        if real_target.netloc != self.baseurl.netloc:
            return real_target.geturl()
        target_file = os.path.join(self.downloaddir,
                                   *expected_target.path.split("/"))
        if expected_target.path.endswith("/"):
            target_file = os.path.join(target_file, "index.html")
        if os.path.isfile(target_file):
            return self.relurl(expected_target)
        else:
            return real_target.geturl()
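# fixlink() above relies on self.relurl(), which is defined elsewhere in the
# class.  As a rough illustration of how a relative reference from the page's
# base url to a local target could be computed (a sketch under the assumption
# that plain path arithmetic is enough; the helper name is hypothetical, not
# the project's actual method):
import posixpath
from urllib.parse import urlparse


def _relative_reference_sketch(baseurl, target):
    """Return a relative link from the page at ``baseurl`` to ``target``."""
    base_dir = posixpath.dirname(urlparse(baseurl).path) or "/"
    target_path = urlparse(target).path
    # posixpath.relpath works on url paths because they use "/" separators,
    # e.g. base "http://example.com/docs/page.html" and target
    # "http://example.com/images/logo.png" give "../images/logo.png".
    return posixpath.relpath(target_path, base_dir)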
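# getFinalUrl() is called both above and in main() below but is defined
# elsewhere in this module.  A minimal sketch of what such a helper might do,
# assuming urlopen's automatic redirect handling is sufficient (the name below
# is hypothetical, not the project's implementation):
def _get_final_url_sketch(url, timeout=5):
    """Return the address a request ends up at after HTTP redirections."""
    from urllib.request import urlopen
    response = urlopen(url, timeout=timeout)
    # urlopen follows 3xx responses itself; geturl() reports where we landed.
    final = response.geturl()
    response.close()
    return final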
def main(initial_url):
    # List of 3-item tuples: (file_path, encoding, base_url).
    to_be_processed = []
    queue = DownloadQueue()
    init_url = myurlparse(initial_url)
    if init_url.path == "":
        initial_url += "/"
        init_url = myurlparse(initial_url)
    final_location = getFinalUrl(init_url.geturl())
    if not final_location.startswith(initial_url):
        sys.stderr.write("Your page redirects to an unwanted url.\n")
        sys.stderr.write("I refuse to download!\n")
        return
    final_location = myurlparse(final_location)
    queue.append(final_location.getUrlWithoutFragments())
    download_dir = os.path.join(os.getcwd(), init_url.netloc.replace(".", "_"))
    if not os.path.isdir(download_dir):
        os.mkdir(download_dir)

    def check_url(url, check_cache={}):
        """
        Checks whether the given url should be downloaded.  If it should,
        the url is returned with the necessary adjustments applied;
        otherwise None is returned.
        """
        # The mutable default argument deliberately serves as a cache of
        # urls that have already been checked.
        try:
            return check_cache[url]
        except KeyError:
            if not url.startswith(initial_url):
                check_cache[url] = None
                return None
            final_location = getFinalUrl(url)
            if not final_location.startswith(initial_url):
                check_cache[url] = None
                return None
            new_link = myurlparse(final_location).getUrlWithoutFragments()
            check_cache[url] = new_link
            return new_link

    for link in queue:
        link = myurlparse(link)
        if link.netloc != init_url.netloc:
            sys.stderr.write("Skipping %s\n" % link.geturl())
            sys.stderr.write("Reason: Link from different location\n")
            continue
        content = getContentType(link.geturl())
        if not content:
            print("Failed to get content type from the server.")
            print("Skipping...")
            continue
        if content == "text/html" and not link.geturl().startswith(initial_url):
            sys.stderr.write("Skipping %s\n" % link.geturl())
            sys.stderr.write("Reason: Not inside range.\n")
            continue
        if content not in allowed_downloads:
            sys.stderr.write("Skipping %s\n" % link.geturl())
            sys.stderr.write("Reason: Not an allowed download.\n")
            continue
        try:
            url = urlopen(link.geturl(), timeout=5)
        except HTTPError as e:
            print("The server couldn't fulfill the request.")
            print("Error code: ", e.code)
            print("Skipping...")
            continue
        except URLError as e:
            print("We failed to reach the server.")
            print("Reason: ", e.reason)
            continue
        print("Downloading: %s\n" % link.geturl())
        response = url.read()
        url.close()
        file_path = os.path.join(download_dir, *link.path.split("/"))
        # Handle directory urls: store them as their index file.
        if link.path.endswith("/"):
            file_path = os.path.join(file_path, "index.html")
        if not os.path.isdir(os.path.dirname(file_path)):
            os.makedirs(os.path.dirname(file_path))
        with open(file_path, "wb") as output_file:
            output_file.write(response)
        if content == "text/html":
            print("Searching and checking links, this could take a while.")
            print("-------------------------------------------------------")
            link_collect = LinkCollector()
            encoding = getEncoding(link.geturl())
            if not encoding:
                # If the http headers don't mention a charset,
                # parse the html itself to look for meta headers.
                a = encodingFinder()
                a.feed(response.decode("iso-8859-1"))
                encoding = a.encoding
            # If we still don't have a charset, go with the default.
            encoding = encoding or "iso-8859-1"
            try:
                response_to_be_parsed = response.decode(encoding)
            except (LookupError, UnicodeDecodeError):
                # Unknown or wrong charset: fall back to a decoding that
                # cannot fail, so the parser still gets text to work with.
                response_to_be_parsed = response.decode("iso-8859-1")
            try:
                link_collect.feed(response_to_be_parsed)
            except HTMLParseError:
                sys.stderr.write("HTML parse error, couldn't get all the links.\n")
            for new_link in link_collect.links:
                new_link = check_url(urljoin(link.geturl(), new_link))
                if new_link:
                    queue.append(new_link)
            base_url = link.geturl()
            if base_url.endswith("/"):
                base_url += "index.html"
            to_be_processed.append((file_path, encoding, base_url))

    print("Done!")
    print("Beginning to fix references; in some cases,")
    print("this could take a really long time.")
    print("--------------------------------------------------")
    for file_path, encoding, url in to_be_processed:
        print(file_path, encoding, url)
        print("Processing: %s" % file_path)
        with open(file_path, "r", encoding=encoding) as html_file:
            html_contents = html_file.read()
        a = HTMLReferenceFixer()
        a.setbaseurl(url)
        a.filepath = file_path
        try:
            a.feed(html_contents)
        except HTMLParseError:
            sys.stderr.write("Couldn't parse html file, skipping...\n")
            continue
        with open(file_path, "w", encoding=encoding) as processed_file:
            processed_file.write(a.output)
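# The command-line entry point is not part of this section.  A minimal sketch
# of how main() might be invoked, assuming the start url is given as the first
# command-line argument (an illustration only; the script's real entry point
# may differ):
if __name__ == "__main__":
    import sys
    if len(sys.argv) != 2:
        sys.stderr.write("Usage: %s <initial_url>\n" % sys.argv[0])
        sys.exit(1)
    main(sys.argv[1])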