from urllib import parse  # parse.unquote() is used below

# LinkUtility, LinkChecker, LinkAttrs and ArchiveExplorer are project classes assumed
# to be imported in the surrounding module.


def _map_res_str(captured: list, root_domain: str, page: LinkAttrs, current_match) -> str:
    """Rewrite one matched resource link and record it in `captured` for later download."""
    returned = None
    level = page.level
    try:
        link = current_match.group(0)
        # print("cap:", link)
        match2 = current_match.group(2)
        current_link = current_match.group(1) + match2
        # Keep everything before the first "/" as a prefix and everything after the
        # matched link as a suffix, so only the link itself is replaced.
        begin_index = str(link).index("/")
        begin_mark = str(link[:begin_index]).strip()
        end_index = begin_index + len(current_link)
        if end_index >= len(link):
            end_mark = ""
        else:
            end_mark = str(link[end_index:]).strip()
        # if "%3" in current_link:  # transform encoded url
        inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(current_link)
        if len(inner_link) > 0:
            if root_domain in domain or link_class != LinkUtility.EXT_WEBPAGE:
                # Internal links and any non-webpage resource are saved in the file system.
                is_internal = root_domain in domain
                path_decoded = parse.unquote(path)
                if len(path_decoded) > ArchiveExplorer.MAX_PATH_LEN:
                    short_path, ext = LinkChecker.get_shorter_url_path(path)
                    short_path += ext
                else:
                    short_path = path
                if link_class == LinkUtility.EXT_WEBPAGE:
                    # Saved webpages always end in ".html", whatever the original extension.
                    if len(ext) > 0 and ext != ".html":
                        valid_short_path = short_path.replace(ext, ".html")
                    else:
                        valid_short_path = short_path
                else:
                    valid_short_path = short_path
                file_path, ref_path = LinkUtility.make_valid_web_res_path(path, fragment)
                short_file_path, short_ref_path = LinkUtility.make_valid_web_res_path(valid_short_path, fragment)
                current_link = current_link.replace("\\/", "/")
                captured.append(LinkAttrs(ArchiveExplorer.ARCHIVE_DOMAIN + current_link,
                                          short_file_path, short_ref_path, ref_path,
                                          page.path, link_class, level + 1,
                                          is_internal=is_internal))
                returned = begin_mark + short_ref_path + end_mark
            else:  # root_domain not in domain and link_class == LinkUtility.EXT_WEBPAGE
                # External webpage: keep the decoded original link instead of a local path.
                returned = begin_mark + parse.unquote(match2) + end_mark
            # else:  # capture other resources except external webpage
            #     file_path, ref_path = LinkUtility.make_valid_web_res_path(path)
            #     captured.append(LinkAttrs(ArchiveExplorer.ARCHIVE_DOMAIN + current_link, file_path, ref_path, file_path, ext, level + 1))
            #     returned = begin_mark + ref_path + end_mark
        else:
            returned = begin_mark + parse.unquote(current_link) + end_mark
    except Exception as ex:
        print("ex in mapping:", ex)
    finally:
        # Always return a string so the caller (e.g. re.sub) never receives None.
        if isinstance(returned, str):
            # print("sub:", returned)
            return returned
        else:
            return ""
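# --- Usage sketch (assumption, not from the source) --------------------------------
# _map_res_str has the shape of a re.sub() replacement callback: it receives a match
# object, appends a LinkAttrs record to `captured`, and returns the replacement text
# (or "" on failure). The helper and regex below are hypothetical; only the two-group
# shape (prefix group + remainder group) is implied by the function itself.
import functools
import re


def _rewrite_archive_links(page_source: str, root_domain: str, page: LinkAttrs):
    captured = []  # LinkAttrs collected here would be downloaded later by the caller
    # Hypothetical pattern: group(1) + group(2) form the captured link, while group(0)
    # keeps the surrounding attribute text so begin_mark/end_mark can be preserved.
    link_pattern = re.compile(r"""(?:href|src)=["'](/web/\d+[a-z_]*/)(https?://[^"']+)["']""")
    repl = functools.partial(_map_res_str, captured, root_domain, page)
    rewritten = link_pattern.sub(repl, page_source)
    return rewritten, captured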
def testShortUrl2(self):
    urls = [
        "http://gamblingaddiction.cc/salendine-%e0%b8%99%e0%b8%b8%e0%b9%8a%e0%b8%81%e0%b8%a5%e0%b8%b4%e0%b8%99%e0%b8%94%e0%b9%8c%e0%b8%a1%e0%b8%b2%e0%b8%a3%e0%b9%8c%e0%b8%8a%e0%b8%82%e0%b9%88%e0%b8%b2%e0%b8%a7%e0%b8%a3%e0%b8%b4%e0%b8%9f.html",
        "/salendine-%e0%b8%99%e0%b8%b8%e0%b9%8a%e0%b8%81%e0%b8%a5%e0%b8%b4%e0%b8%99%e0%b8%.html",
        "/salendine-%e0%b8%99%e0%b8%b8%e0%b9%8a%e0%b8%81%e0%b8%a5%e0%b8%b4%e0%b8%99%e0%b8%",
        "/中国人民解放军/中国人民解放军/中国人民解放军.html",
        "strongholeqp4tfq;eafak;faf",
    ]
    for url in urls:
        short_path, ext = LinkChecker.get_shorter_url_path(url)
        print("doing:", url)
        print("new path:", short_path)
        print("extension:", ext)
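# testShortUrl2 takes `self`, so in the original test module it is presumably a method
# on a unittest.TestCase. A minimal standalone harness (the class name is an assumption):
import unittest


class ShortUrlTest(unittest.TestCase):
    def test_short_url_2(self):
        testShortUrl2(self)  # delegate to the function defined above


if __name__ == "__main__":
    unittest.main()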