def u_test_is_link():
    assert not (is_link("anyCharactersThatDontEndWithBAD_TYPES", [], []))  # faulty URL
    assert not (is_link("", [], []))  # empty string
    assert not (is_link(" ", [], []))  # whitespace only
    assert not (is_link(";&&#!!!!", [], []))  # special characters
    assert (is_link("http://www.example.com", [], []))  # valid link
    assert (is_link("https://github.com/viliau", [], []))  # valid link
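The assertions above pin down the behaviour expected of is_link() from core/utils.py: reject empty, whitespace-only, or malformed strings and accept well-formed URLs, while routing static assets into the files collection. A minimal sketch that satisfies these assertions is shown below; the BAD_TYPES tuple and the exact validation rules are illustrative assumptions, not Photon's actual implementation.

BAD_TYPES = ('.jpg', '.jpeg', '.png', '.gif', '.css', '.ico', '.pdf', '.zip')  # hypothetical values

def is_link(url, processed, files):
    """Sketch: return True only for crawlable, not-yet-processed URLs."""
    url = url.strip()
    # Reject empty strings, whitespace-only input, and anything already handled
    if not url or url in processed:
        return False
    # Require the string to actually look like a URL or a path
    if not (url.startswith(('http://', 'https://', '//', '/')) or '.' in url):
        return False
    # Record static assets as files and skip them during crawling
    if url.lower().endswith(BAD_TYPES):
        files.append(url)
        return False
    return True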
def extractor(url): """Extract details from the response body.""" response = requester(url, main_url, delay, cook, headers, timeout, host, proxies, user_agents, failed, processed) if clone: mirror(url, response) matches = rhref.findall(response) for link in matches: # Remove everything after a "#" to deal with in-page anchors link = link[1].replace('\'', '').replace('"', '').split('#')[0] # Checks if the URLs should be crawled if is_link(link, processed, files): if link[:4] == 'http': if link.startswith(main_url): verb('Internal page', link) internal.add(link) else: verb('External page', link) external.add(link) elif link[:2] == '//': if link.split('/')[2].startswith(host): verb('Internal page', link) internal.add(schema + '://' + link) else: verb('External page', link) external.add(link) elif link[:1] == '/': verb('Internal page', link) internal.add(remove_file(url) + link) else: verb('Internal page', link) usable_url = remove_file(url) if usable_url.endswith('/'): internal.add(usable_url + link) elif link.startswith('/'): internal.add(usable_url + link) else: internal.add(usable_url + '/' + link) if not only_urls: intel_extractor(url, response) js_extractor(response) if args.regex and not supress_regex: regxy(args.regex, response, supress_regex, custom) if api: matches = rentropy.findall(response) for match in matches: if entropy(match) >= 4: verb('Key', match) keys.add(url + ': ' + match)
def extractor(url): """从响应体中提取具体的信息""" response = requester( url, main_url, delay, cook, headers, timeout, host, ninja, user_agents, failed, processed) # 这里涉及到 core/requester.py 文件中的 requester() 函数 matches = re.findall(r'<[aA].*(href|HREF)=([^\s>]+)', response) for link in matches: # 移除"#"后的所有内容以处理页内锚点 link = link[1].replace('\'', '').replace('"', '').split('#')[0] # 检查这些 URLs 是否应该被爬取 if is_link(link, processed, files): # 这里涉及到 core/utils.py 文件中的 is_link() 函数 if link[:4] == 'http': if link.startswith(main_url): internal.add(link) else: external.add(link) elif link[:2] == '//': if link.split('/')[2].startswith(host): internal.add(schema + link) else: external.add(link) elif link[:1] == '/': internal.add(main_url + link) else: internal.add(main_url + '/' + link) if not only_urls: intel_extractor(response) js_extractor(response) if args.regex and not supress_regex: regxy(args.regex, response, supress_regex, custom) # 这里涉及到 core/utils.py 文件中的 regxy() 函数 if api: matches = re.findall(r'[\w-]{16,45}', response) for match in matches: if entropy(match) >= 4: # 这里涉及到 core/utils.py 文件中的 entropy() 函数 keys.add(url + ': ' + match)