def _extract_links(self, selector, response_url, response_encoding, base_url):
    links = []
    # hacky way to get the underlying lxml parsed document
    for el, attr, attr_val in self._iter_links(selector._root):
        if not self._is_valid_link(el, attr, attr_val):
            continue
        attr_val = urljoin(base_url, attr_val)
        url = self.process_attr(attr_val)
        if url is None:
            continue
        if isinstance(url, unicode):
            url = url.encode(response_encoding)
        # to fix relative links after process_value
        url = urljoin(response_url, url)
        if el.tag != 'a':
            link = AssetLink(url, _collect_string_content(el) or u'',
                             nofollow=True if el.get('rel') == 'nofollow' else False)
        else:
            link = PageLink(url, _collect_string_content(el) or u'',
                            nofollow=True if el.get('rel') == 'nofollow' else False)
        links.append(link)
    return unique_list(links, key=lambda link: link.url) \
        if self.unique else links
def _extract_links(self, selector, response_url, response_encoding, base_url):
    links = []
    # hacky way to get the underlying lxml parsed document
    for el, attr, attr_val in self._iter_links(selector._root):
        # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
        try:
            attr_val = urljoin(base_url, attr_val)
        except ValueError:
            continue  # skipping bogus links
        else:
            url = self.process_attr(attr_val)
            if url is None:
                continue
        if isinstance(url, unicode):
            url = url.encode(response_encoding)
        # to fix relative links after process_value
        url = urljoin(response_url, url)
        link = Link(
            url, _collect_string_content(el) or u'',
            nofollow=True if el.get('rel') == 'nofollow' else False)
        links.append(link)
    return unique_list(links, key=lambda link: link.url) \
        if self.unique else links
def extract_links(self, response):
    result = json.loads(response.text)
    for pattern in self.patterns:
        extractors = pattern.get('extractors')
        format = pattern.get('format')
        data = result
        for extractor in extractors:
            type = extractor.get('type')
            if isinstance(data, dict):
                if type == 'value':
                    data = self.get_value(*([data] + extractor.get('args')))
            elif isinstance(data, list):
                if type == 'value':
                    data = [
                        self.get_value(*([item] + extractor.get('args')))
                        for item in data
                    ]
                elif type == 'slice':
                    data = self.get_slice(*([data] + extractor.get('args')))
        if not isinstance(data, list):
            data = [data]
        all_links = [
            Link(response.urljoin(format.format(*[item])))
            if not isinstance(item, list)
            else Link(response.urljoin(format.format(*item)))
            for item in data
        ]
    return unique_list(all_links)
def _extract_links(self, selector, response_url, response_encoding, base_url):
    ''' Pretty much the same function, just added 'ignore' to url.encode '''
    links = []
    # hacky way to get the underlying lxml parsed document
    for el, attr, attr_val in self._iter_links(selector.root):
        # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
        try:
            attr_val = urljoin(base_url, attr_val)
        except ValueError:
            continue  # skipping bogus links
        else:
            url = self.process_attr(attr_val)
            if url is None:
                continue
        if isinstance(url, unicode):
            # add 'ignore' to encoding errors
            url = url.encode(response_encoding, 'ignore')
        # to fix relative links after process_value
        url = urljoin(response_url, url)
        link = Link(url, _collect_string_content(el) or u'',
                    nofollow=True if el.get('rel') == 'nofollow' else False)
        links.append(link)
    return unique_list(links, key=lambda link: link.url) \
        if self.unique else links
def _process_links(self, links): """ Normalize and filter extracted links The subclass should override it if necessary """ links = unique_list(links, key=lambda link: link.url) if self.unique else links return links
def _process_links(self, links): """ Normalize and filter extracted links The subclass should override it if neccessary """ links = unique_list(links, key=lambda link: link.url) if self.unique else links return links
def extract_links(self, response):
    if not self.base_url:
        self.base_url = get_base_url(response)
    items = re.findall(self.restrict_re, response.text)
    all_links = [
        Link(response.urljoin(self.base_url.format(str(item))))
        for item in items
    ]
    return unique_list(all_links)
def extract_links(self, response):
    base_url = get_base_url(response)
    if self.restrict_xpaths:
        docs = [subdoc
                for x in self.restrict_xpaths
                for subdoc in response.xpath(x)]
    else:
        docs = [response.selector]
    all_links = []
    for doc in docs:
        links = self._extract_links(doc, response.url, response.encoding, base_url)
        all_links.extend(self._process_links(links))
    return unique_list(all_links)
def _extract_links(self, response_text, response_url):
    html = lxml.html.fromstring(response_text)
    html.make_links_absolute(response_url)
    for e, a, l, p in html.iterlinks():
        if self.scan_tag(e.tag):
            if self.scan_attr(a):
                link = Link(self.process_attr(l), text=e.text)
                self.links.append(link)
    links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
    return links
def _extract_links(self, response_text, response_url):
    html = lxml.html.fromstring(response_text)
    html.make_links_absolute(response_url)
    for e, a, l, p in html.iterlinks():
        if self.scan_tag(e.tag):
            if self.scan_attr(a):
                link = Link(self.process_attr(l), text=e.text)
                self.links.append(link)
    links = unique_list(self.links, key=lambda link: link.url) \
        if self.unique else self.links
    return links
def _extract_links(self, response_text, response_url, response_encoding):
    self.base_url, self.links = etree.HTML(response_text, self.parser)
    links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
    ret = []
    base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
    for link in links:
        link.url = urljoin_rfc(base_url, link.url, response_encoding)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = str_to_unicode(link.text, response_encoding)
        ret.append(link)
    return ret
def extract_links(self, response):
    html = Selector(response)
    try:
        base_url = response.xpath("//base/@href").extract()[0]
    except IndexError:
        base_url = get_base_url(response)
    if self.restrict_xpaths:
        docs = [subdoc
                for x in self.restrict_xpaths
                for subdoc in html.xpath(x)]
    else:
        docs = [html]
    all_links = []
    for doc in docs:
        links = self._extract_links(doc, response.url, response.encoding, base_url)
        all_links.extend(self._process_links(links))
    return unique_list(all_links)
def merge_clusters(merged, all_urls, min_cluster_size):
    res = {'clusters': {}, 'unclustered': []}
    unclustered = all_urls
    for regex in merged:
        matches = apply_reg_ex_to_urls(regex, all_urls)
        matches = unique_list(matches)
        if len(matches) >= min_cluster_size:
            for match in matches:
                try:
                    unclustered.remove(match)
                except ValueError:  # match was already removed from unclustered
                    continue
            human = regex.replace('([^/]+)', '[...]').replace('([^&=?]+)', '[...]').replace(r'(\d+)', '[NUMBER]')
            res['clusters'].update({(regex, human): matches})
    res['unclustered'] = unclustered
    return res
def _extract_links(self, response_text, response_url, response_encoding):
    self.reset()
    self.feed(response_text)
    self.close()
    links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
    ret = []
    base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
    for link in links:
        link.url = urljoin_rfc(base_url, link.url, response_encoding)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = link.text.decode(response_encoding)
        ret.append(link)
    return ret
def get_urls(_url, _html=None, headers=None):
    if _html is None:
        response = requests.get(_url, verify=False, headers=headers)
        _html = response.content
    page = str(BeautifulSoup(_html))
    url_list = []
    while True:
        url, n = get_url(page)
        page = page[n:]
        if url:
            url = urljoin(_url, url)
            url_list.append(url)
        else:
            break
    url_list = unique_list(url_list)
    return url_list
def _extract_links(self, response_text, response_url, response_encoding):
    self.reset()
    self.feed(response_text)
    self.close()
    links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
    ret = []
    base_url = self.base_url if self.base_url else response_url
    for link in links:
        link.url = urljoin_rfc(base_url, link.url, response_encoding)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = link.text.decode(response_encoding)
        ret.append(link)
    return ret
def extract_links(self, response):
    base_url = get_base_url(response)
    if self.restrict_xpaths:
        links = [link
                 for xpath in self.restrict_xpaths
                 for link in response.xpath(xpath)]
    else:
        links = [response.selector, ]
    all_links = [Link(response.url), ]
    for link in links:
        new_link = self._extract_links(link, response.url, response.encoding, base_url)
        all_links.extend(self._process_links(new_link))
    return unique_list(all_links)
def _extract_links(self, response_text, response_url, response_encoding):
    links = []
    html = lxml.html.fromstring(response_text)
    html.make_links_absolute(response_url)
    for e, a, l, p in html.iterlinks():
        if self.tag_func(e.tag):
            if self.attr_func(a):
                l = safe_url_string(l, response_encoding)
                text = u''
                if e.text:
                    text = str_to_unicode(e.text, response_encoding, errors='replace').strip()
                link = Link(self.process_func(l), text=text)
                links.append(link)
    links = unique_list(links, key=lambda link: link.url) \
        if self.unique else links
    return links
def _extract_links(self, response_text, response_url, response_encoding):
    self.base_url, self.links = etree.HTML(response_text, self.parser)
    links = unique_list(
        self.links, key=lambda link: link.url) if self.unique else self.links
    ret = []
    base_url = urljoin_rfc(
        response_url, self.base_url) if self.base_url else response_url
    for link in links:
        link.url = urljoin_rfc(base_url, link.url, response_encoding)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = str_to_unicode(link.text, response_encoding, errors='replace')
        ret.append(link)
    return ret
def _extract_links(self, selector, response_url, response_encoding, base_url):
    links = []
    # hacky way to get the underlying lxml parsed document
    for el, attr, attr_val in self._iter_links(selector._root):
        # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
        attr_val = urljoin(base_url, attr_val)
        url = self.process_attr(attr_val)
        if url is None:
            continue
        if isinstance(url, unicode):
            url = url.encode(response_encoding, 'ignore')
        # to fix relative links after process_value
        url = urljoin(response_url, url)
        link = Link(url, _collect_string_content(el) or u'',
                    nofollow=True if el.get('rel') == 'nofollow' else False)
        links.append(link)
    return unique_list(links, key=lambda link: link.url) \
        if self.unique else links
def extract_links(self, response):
    base_url = get_base_url(response)
    print "base_url", base_url
    domain_name = tldextract.extract(base_url).domain
    if domain_name in self.crawledPagesPerSite and self.crawledPagesPerSite[domain_name] > self.maximumPagesPerSite:
        return []
    if self.restrict_xpaths:
        docs = [subdoc
                for x in self.restrict_xpaths
                for subdoc in response.xpath(x)]
    else:
        docs = [response.selector]
    all_links = []
    for doc in docs:
        links = self._extract_links(doc, response.url, response.encoding, base_url)
        all_links.extend(self._process_links(links))
    all_links = unique_list(all_links)
    new_all_links = []
    for link in all_links:
        url = link.url
        domain_name = tldextract.extract(url).domain
        suffix = tldextract.extract(url).suffix
        domain_and_suffix = domain_name + "." + suffix
        if domain_and_suffix not in self.allow_domains:
            continue
        if domain_name in self.crawledPagesPerSite:
            self.crawledPagesPerSite[domain_name] += 1
        else:
            self.crawledPagesPerSite[domain_name] = 1
        if self.crawledPagesPerSite[domain_name] > self.maximumPagesPerSite:
            break
        else:
            print "have crawled ", self.crawledPagesPerSite[domain_name], "pages"
            new_all_links.append(link)
    return new_all_links
def _extract_links(self, response_text, response_url, response_encoding):
    self.reset()
    self.feed(response_text)
    self.close()
    links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
    ret = []
    base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
    for link in links:
        if isinstance(link.url, unicode):
            link.url = link.url.encode(response_encoding)
        try:
            link.url = urljoin(base_url, link.url)
        except ValueError:
            continue
        link.url = safe_url_string(link.url, response_encoding)
        link.text = link.text.decode(response_encoding)
        ret.append(link)
    return ret
def extract_links(self, response):
    # TODO: remove debug code
    with open('/export/home/asanakoy/tmp/response.txt', 'w') as f:
        f.write(response.body)
    assert False, 'enough ;)'
    base_url = self.base_url if self.base_url else get_base_url(response)
    if self.restrict_xpaths:
        docs = [
            subdoc
            for x in self.restrict_xpaths
            for subdoc in response.xpath(x)
        ]
    else:
        docs = [response.selector]
    all_links = []
    for doc in docs:
        links = self._extract_links(doc, response.url, response.encoding, base_url)
        print 'Num links before filter:', len(links)
        all_links.extend(self._process_links(links))
    print 'Num links:', len(all_links)
    return unique_list(all_links)
def extract_links(self, response): """Returns a list of :class:`~scrapy.link.Link` objects from the specified :class:`response <scrapy.http.Response>`. Only links that match the settings passed to the ``__init__`` method of the link extractor are returned. Duplicate links are omitted. """ base_url = get_base_url(response) if self.restrict_xpaths: docs = [subdoc for x in self.restrict_xpaths for subdoc in response.xpath(x)] else: docs = [response.selector] all_links = [] for doc in docs: links = self._extract_links(doc, response.url, response.encoding, base_url) all_links.extend(self._process_links(links)) return unique_list(all_links)
def extract_links(self, response): """Returns a list of :class:`~scrapy.link.Link` objects from the specified :class:`response <scrapy.http.Response>`. Only links that match the settings passed to the ``__init__`` method of the link extractor are returned. Duplicate links are omitted. """ base_url = get_base_url(response) if self.restrict_xpaths: docs = [ subdoc for x in self.restrict_xpaths #大循环 有多少条restrict_xpaths for subdoc in response.xpath(x) #小循环 符合这个xpath的条目的 有多少个字条目 ]#[a for i in range(3) for a in range(10)]》》》[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9] else: docs = [response.selector] all_links = [] for doc in docs: links = self._extract_links(doc, response.url, response.encoding, base_url) #实际上是调用 LxmlParserLinkExtractor._extract_links all_links.extend(self._process_links(links)) return unique_list(all_links)
def _extract_links(self, selector, response_url, response_encoding, base_url):
    links = []
    # hacky way to get the underlying lxml parsed document
    for el, attr, attr_val in self._iter_links(selector.root):
        if self.scan_tag(el.tag) and self.scan_attr(attr):
            # pseudo root.make_links_absolute(base_url)
            # START PATCH: Added check to filter links before making absolute
            if not _is_valid_link(attr_val):
                continue
            # END PATCH
            attr_val = urljoin(base_url, attr_val)
            url = self.process_attr(attr_val)
            if url is None:
                continue
            # to fix relative links after process_value
            url = urljoin(response_url, url)
            link = Link(
                url, _collect_string_content(el) or '',
                nofollow=True if el.get('rel') == 'nofollow' else False
            )
            links.append(link)
    return unique_list(links, key=lambda link: link.url) \
        if self.unique else links
def _extract_from_js(self, doc, response_url, response_encoding, base_url):
    data = doc.re('siblings:\s*(?P<data>.*)\s*\, registryURL')
    if not data:
        return []
    links = []
    articles = json.loads(data[0])
    for article in articles['articleList']:
        try:
            attr_val = urljoin(base_url, article['uri'])
        except ValueError:
            continue  # skipping bogus links
        else:
            url = self.link_extractor.process_attr(attr_val)
            if url is None:
                continue
        if isinstance(url, unicode):
            url = url.encode(response_encoding)
        url = urljoin(response_url, url)
        link = Link(url, u'', nofollow=False)
        links.append(link)
    return unique_list(links, key=lambda link: link.url) \
        if self.link_extractor.unique else links
def _deduplicate_if_needed(self, links):
    if self.unique:
        return unique_list(links, key=lambda link: link.url)
    return links
def extract_links(self, response):
    links = super(RmDupliLinkExtractor, self).extract_links(response)
    base_url = get_base_url(response)
    return unique_list([
        link for link in links
        if not (link.url.startswith(base_url)
                and base_url.endswith(link.url[len(base_url):]))
    ])
def _deduplicate_if_needed(self, links):
    if self.unique:
        return unique_list(links, key=self.link_key)
    return links
def _process_links(self, links): """ Normalize and filter extracted links The subclass should override it if necessary """ return unique_list(links, key=self.link_key) if self.unique else links
def extract_links(self, response):
    all_links = []
    for json_path in self.json_paths:
        links = self._extract_links(json_path, response)
        all_links.extend(self._process_links(links))
    return unique_list(all_links)
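# Every snippet above relies on a `unique_list(iterable, key=...)` helper that
# deduplicates while preserving order. A minimal sketch of what such a helper
# typically looks like follows (in Scrapy it is usually imported as
# `from scrapy.utils.python import unique as unique_list`); this version is an
# illustrative assumption, not the canonical implementation.
def unique_list(list_, key=lambda x: x):
    """Return the items of list_ in order, dropping later duplicates by key()."""
    seen = set()
    result = []
    for item in list_:
        seenkey = key(item)
        if seenkey in seen:
            continue
        seen.add(seenkey)
        result.append(item)
    return result


# Example: deduplicating Link-like objects by URL, as the extractors above do.
# links = unique_list(links, key=lambda link: link.url)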