def _map_res_str(captured: list, root_domain: str, page: LinkAttrs, current_match) -> str:
     """Return the replacement text for one link matched inside an archived page.

     ``current_match`` must expose the whole matched text as group 0 and the
     link itself as group 1 + group 2.  The text of group 0 that precedes the
     first "/" and the text that follows the link are stripped and re-attached
     around the rewritten link.  (Presumably this function is used as the
     replacement callback of a regex substitution -- confirm against callers.)

     Outcomes:

     * ``LinkUtility.get_link_detail`` yields an empty ``inner_link``: the
       percent-decoded link is returned between the two marks.
     * The link is on ``root_domain`` or is not a webpage: a ``LinkAttrs``
       entry one level below ``page`` is appended to ``captured`` and the
       (possibly shortened) local reference path is returned between the marks.
     * The link is a webpage on another domain: only percent-decoded group 2
       is returned between the marks; nothing is captured.

     :param captured: list mutated in place; receives one ``LinkAttrs`` per
         link that is to be stored locally.
     :param root_domain: domain text; a link counts as internal when this is
         contained in the link's domain.
     :param page: page the link was found in; its ``level`` and ``path`` are
         copied into the captured entry.
     :param current_match: regex match object with at least groups 1 and 2.
     :return: the replacement text, or ``""`` if mapping the link failed.
     """
     replacement = ""
     level = page.level
     try:
         link = current_match.group(0)
         match2 = current_match.group(2)
         current_link = current_match.group(1) + match2
         # The link is taken to start at the first "/" of the matched text;
         # everything before it is the leading mark (raises ValueError, handled
         # below, when the match contains no "/").
         begin_index = link.index("/")
         begin_mark = link[:begin_index].strip()
         # Slicing past the end yields "", so no explicit bounds check is needed.
         end_mark = link[begin_index + len(current_link):].strip()
         inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(current_link)
         if len(inner_link) == 0:
             replacement = begin_mark + parse.unquote(current_link) + end_mark
         else:
             is_internal = root_domain in domain
             if is_internal or link_class != LinkUtility.EXT_WEBPAGE:  # data will be saved in file system
                 # The length limit is checked on the decoded path, but the
                 # shortening itself works on the raw path.
                 if len(parse.unquote(path)) > ArchiveExplorer.MAX_PATH_LEN:
                     short_path, ext = LinkChecker.get_shorter_url_path(path)
                     short_path += ext
                 else:
                     short_path = path
                 if link_class == LinkUtility.EXT_WEBPAGE and len(ext) > 0 and ext != ".html":
                     # Webpages are stored as ".html".  Only the last occurrence of
                     # the extension is swapped; str.replace() would also rewrite
                     # directory names or file stems that contain the same text.
                     head, found, tail = short_path.rpartition(ext)
                     if found:
                         short_path = head + ".html" + tail
                 _, ref_path = LinkUtility.make_valid_web_res_path(path, fragment)
                 short_file_path, short_ref_path = LinkUtility.make_valid_web_res_path(short_path, fragment)
                 # Turn backslash-escaped slashes ("\/") back into plain "/".
                 current_link = current_link.replace("\\/", "/")
                 captured.append(LinkAttrs(ArchiveExplorer.ARCHIVE_DOMAIN + current_link, short_file_path,
                                           short_ref_path, ref_path,
                                           page.path, link_class, level + 1, is_internal=is_internal))
                 replacement = begin_mark + short_ref_path + end_mark
             else:  # external webpage: keep the link, only decode it
                 replacement = begin_mark + parse.unquote(match2) + end_mark
     except Exception as ex:
         # Best effort: a link that cannot be mapped is replaced by "".
         print("ex in mapping:", ex)
     # Returning here instead of from a ``finally`` clause lets KeyboardInterrupt
     # and SystemExit propagate instead of being silently discarded.
     return replacement
 def testShortUrl2(self):
     """Print what ``LinkChecker.get_shorter_url_path`` yields for sample paths.

     Smoke check only: nothing is asserted, the values are printed for
     manual inspection.
     """
     # Samples: a full URL with a long percent-encoded file name, the same
     # kind of path cut off after a dangling "%" (with and without ".html"),
     # a non-ASCII path, and a string that contains no "/" at all.
     samples = (
         "http://gamblingaddiction.cc/salendine-%e0%b8%99%e0%b8%b8%e0%b9%8a%e0%b8%81%e0%b8%a5%e0%b8%b4%e0%b8%99%e0%b8%94%e0%b9%8c%e0%b8%a1%e0%b8%b2%e0%b8%a3%e0%b9%8c%e0%b8%8a%e0%b8%82%e0%b9%88%e0%b8%b2%e0%b8%a7%e0%b8%a3%e0%b8%b4%e0%b8%9f.html",
         "/salendine-%e0%b8%99%e0%b8%b8%e0%b9%8a%e0%b8%81%e0%b8%a5%e0%b8%b4%e0%b8%99%e0%b8%.html",
         "/salendine-%e0%b8%99%e0%b8%b8%e0%b9%8a%e0%b8%81%e0%b8%a5%e0%b8%b4%e0%b8%99%e0%b8%",
         "/中国人民解放军/中国人民解放军/中国人民解放军.html",
         "strongholeqp4tfq;eafak;faf",
     )
     for candidate in samples:
         shortened, extension = LinkChecker.get_shorter_url_path(candidate)
         for label, value in (("doing:", candidate),
                              ("new path:", shortened),
                              ("extension:", extension)):
             print(label, value)
Exemplo n.º 3
0
 def testShortUrl2(self):
     """Run ``LinkChecker.get_shorter_url_path`` over sample paths and print the results.

     No assertions are made; for every sample the input, the shortened path
     and the extension are printed, one per line.
     """
     labels = ("doing:", "new path:", "extension:")
     # Inputs cover a long percent-encoded URL, paths ending in an incomplete
     # percent escape, a non-ASCII path and a string without any "/".
     sample_urls = [
         "http://gamblingaddiction.cc/salendine-%e0%b8%99%e0%b8%b8%e0%b9%8a%e0%b8%81%e0%b8%a5%e0%b8%b4%e0%b8%99%e0%b8%94%e0%b9%8c%e0%b8%a1%e0%b8%b2%e0%b8%a3%e0%b9%8c%e0%b8%8a%e0%b8%82%e0%b9%88%e0%b8%b2%e0%b8%a7%e0%b8%a3%e0%b8%b4%e0%b8%9f.html",
         "/salendine-%e0%b8%99%e0%b8%b8%e0%b9%8a%e0%b8%81%e0%b8%a5%e0%b8%b4%e0%b8%99%e0%b8%.html",
         "/salendine-%e0%b8%99%e0%b8%b8%e0%b9%8a%e0%b8%81%e0%b8%a5%e0%b8%b4%e0%b8%99%e0%b8%",
         "/中国人民解放军/中国人民解放军/中国人民解放军.html",
         "strongholeqp4tfq;eafak;faf",
     ]
     for raw_url in sample_urls:
         new_path, suffix = LinkChecker.get_shorter_url_path(raw_url)
         for label, value in zip(labels, (raw_url, new_path, suffix)):
             print(label, value)
Exemplo n.º 4
0
 def _map_res_str(captured: list, root_domain: str, page: LinkAttrs,
                  current_match) -> str:
     """Return the replacement text for one link matched inside an archived page.

     ``current_match`` must expose the whole matched text as group 0 and the
     link itself as group 1 + group 2.  The text of group 0 that precedes the
     first "/" and the text that follows the link are stripped and re-attached
     around the rewritten link.  (Presumably this function is used as the
     replacement callback of a regex substitution -- confirm against callers.)

     Outcomes:

     * ``LinkUtility.get_link_detail`` yields an empty ``inner_link``: the
       percent-decoded link is returned between the two marks.
     * The link is on ``root_domain`` or is not a webpage: a ``LinkAttrs``
       entry one level below ``page`` is appended to ``captured`` and the
       (possibly shortened) local reference path is returned between the marks.
     * The link is a webpage on another domain: only percent-decoded group 2
       is returned between the marks; nothing is captured.

     :param captured: list mutated in place; receives one ``LinkAttrs`` per
         link that is to be stored locally.
     :param root_domain: domain text; a link counts as internal when this is
         contained in the link's domain.
     :param page: page the link was found in; its ``level`` and ``path`` are
         copied into the captured entry.
     :param current_match: regex match object with at least groups 1 and 2.
     :return: the replacement text, or ``""`` if mapping the link failed.
     """
     replacement = ""
     level = page.level
     try:
         link = current_match.group(0)
         match2 = current_match.group(2)
         current_link = current_match.group(1) + match2
         # The link is taken to start at the first "/" of the matched text;
         # everything before it is the leading mark (raises ValueError, handled
         # below, when the match contains no "/").
         begin_index = link.index("/")
         begin_mark = link[:begin_index].strip()
         # Slicing past the end yields "", so no explicit bounds check is needed.
         end_mark = link[begin_index + len(current_link):].strip()
         inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(
             current_link)
         if len(inner_link) == 0:
             replacement = begin_mark + parse.unquote(current_link) + end_mark
         else:
             is_internal = root_domain in domain
             if is_internal or link_class != LinkUtility.EXT_WEBPAGE:  # data will be saved in file system
                 # The length limit is checked on the decoded path, but the
                 # shortening itself works on the raw path.
                 if len(parse.unquote(path)) > ArchiveExplorer.MAX_PATH_LEN:
                     short_path, ext = LinkChecker.get_shorter_url_path(
                         path)
                     short_path += ext
                 else:
                     short_path = path
                 if (link_class == LinkUtility.EXT_WEBPAGE and len(ext) > 0
                         and ext != ".html"):
                     # Webpages are stored as ".html".  Only the last occurrence of
                     # the extension is swapped; str.replace() would also rewrite
                     # directory names or file stems that contain the same text.
                     head, found, tail = short_path.rpartition(ext)
                     if found:
                         short_path = head + ".html" + tail
                 _, ref_path = LinkUtility.make_valid_web_res_path(
                     path, fragment)
                 short_file_path, short_ref_path = LinkUtility.make_valid_web_res_path(
                     short_path, fragment)
                 # Turn backslash-escaped slashes ("\/") back into plain "/".
                 current_link = current_link.replace("\\/", "/")
                 captured.append(
                     LinkAttrs(ArchiveExplorer.ARCHIVE_DOMAIN +
                               current_link,
                               short_file_path,
                               short_ref_path,
                               ref_path,
                               page.path,
                               link_class,
                               level + 1,
                               is_internal=is_internal))
                 replacement = begin_mark + short_ref_path + end_mark
             else:  # external webpage: keep the link, only decode it
                 replacement = begin_mark + parse.unquote(match2) + end_mark
     except Exception as ex:
         # Best effort: a link that cannot be mapped is replaced by "".
         print("ex in mapping:", ex)
     # Returning here instead of from a ``finally`` clause lets KeyboardInterrupt
     # and SystemExit propagate instead of being silently discarded.
     return replacement