def get_selector(self):
    """Return a namespace-free XML ``Selector`` for the single document
    stored inside the zip archive at ``self.path``.

    The archive is expected to contain exactly one member; anything else
    trips the assertion. The member is decoded as latin1 (lossless for
    arbitrary bytes) before being handed to the selector.
    """
    with zipfile.ZipFile(self.path) as archive:
        members = archive.infolist()
        assert len(members) == 1, f'Unexpected zip content in {self.path}'
        with archive.open(members[0]) as member:
            raw = member.read()
    selector = Selector(raw.decode('latin1'), type='xml')
    selector.remove_namespaces()
    return selector
def _extract_link_dicts(selector: Selector, base_url: str, only_urls: bool = False):
    """
    Extract dicts with link information::

    {
        'url': '<absolute URL>',
        'attrs': {'<attribute name>': '<value>', ...},
        'inside_text': '<text inside link>',
        # 'before_text': '<text preceeding this link>',
    }

    If only_urls is true, extract only links as strings.

    Note that ``base_url`` argument must contain page base URL, which can be
    different from page URL. Use w3lib.html.get_base_url to get it::

        from w3lib.html import get_base_url
        base_url = get_base_url(html[:4096], page_url)
        links = list(extract_link_dicts(Selector(html), base_url))

    If you're using Scrapy, and Response object is available, then
    scrapy.utils.response.get_base_url should be faster::

        from scrapy.utils.response import get_base_url
        base_url = get_base_url(response)
        links = list(extract_link_dicts(response.selector, base_url))
    """
    selector.remove_namespaces()

    for a in selector.xpath('//a'):
        link: Dict = {}

        attrs = a.root.attrib
        if 'href' not in attrs:
            continue

        href = strip_html5_whitespace(attrs['href'])
        # Skip mailto links. Checking the prefix (not substring) avoids
        # discarding regular URLs that merely embed "mailto:" in a query
        # string, e.g. https://example.com/?share=mailto:a@b.
        if href.startswith('mailto:'):
            continue

        # Unwrap javascript-wrapped URLs (e.g. onclick-style hrefs).
        js_link = extract_js_link(href)
        if js_link:
            href = js_link
            link['js'] = True

        # Non-HTTP schemes we never want to follow.
        if href.startswith(('tel:', 'skype:', 'fb:', 'javascript:')):
            continue

        url = urljoin(base_url, href)
        if url_has_any_extension(url, _IGNORED):
            continue

        if only_urls:
            yield url
        else:
            link['url'] = url
            link['attrs'] = dict(attrs)

            link_text = a.xpath('normalize-space()').extract_first(default='')
            img_link_text = a.xpath('./img/@alt').extract_first(default='')
            link['inside_text'] = ' '.join([link_text, img_link_text]).strip()

            # TODO: fix before_text and add after_text
            # link['before_text'] = a.xpath('./preceding::text()[1]').extract_first(default='').strip()[-100:]

            yield link