def _get_links(self, elements, attribute, base_url_split, original_url_split): links = [] for element in elements: if attribute in element.attrs: url = element[attribute] if not self.worker_config.strict_mode: url = url.strip() if not is_link(url): continue abs_url_split = get_absolute_url_split(url, base_url_split) if not is_supported_scheme( abs_url_split, self.worker_config.ignore_bad_tel_urls): continue link = Link(type=unicode(element.name), url_split=abs_url_split, original_url_split=original_url_split, source_str=unicode(element)) links.append(link) return links
def test_get_absolute_url(self):
    """Check how ``get_absolute_url_split`` resolves different reference forms.

    Every reference is resolved against the same page URL and the result
    of ``geturl()`` is compared with the expected absolute URL.
    """
    base_url_split = get_clean_url_split(
        "https://www.example.com/hello/index.html")

    # (expected absolute URL, reference to resolve)
    cases = (
        # Scheme-relative reference.
        ("https://www.example2.com/test.js", "//www.example2.com/test.js"),
        # Reference starting at the host root.
        ("https://www.example.com/hello2/test.html", "/hello2/test.html"),
        # Reference relative to the page's directory.
        ("https://www.example.com/hello/test.html", "test.html"),
        # Reference climbing to the parent directory.
        ("https://www.example.com/test.html", "../test.html"),
    )

    for expected, reference in cases:
        resolved = get_absolute_url_split(reference, base_url_split)
        self.assertEqual(expected, resolved.geturl())
def _add_urls_from_single_content_check(self, start_urls, single_content_check): for key in single_content_check.keys(): if key == PREFIX_ALL: continue if key.netloc and key not in start_urls: start_urls.append(key) else: for url_split in start_urls: new_url = get_absolute_url_split(key.geturl(), url_split) if new_url not in start_urls: start_urls.append(new_url)
def _get_links(self, elements, attribute, base_url_split, original_url_split): links = [] for element in elements: if attribute in element.attrs: url = element[attribute] if not self.worker_config.strict_mode: url = url.strip() if not is_link(url): continue abs_url_split = get_absolute_url_split(url, base_url_split) if abs_url_split.scheme not in SUPPORTED_SCHEMES: continue link = Link( type=unicode(element.name), url_split=abs_url_split, original_url_split=original_url_split, source_str=unicode(element)) links.append(link) return links