def whois(self, url, timeout=10):
    """Return the text of the whois query for this URL's domain
    """
    domain = common.get_domain(url)
    if domain:
        text = ''
        key = 'whois_%s' % domain
        try:
            if self.cache:
                text = self.cache[key]
            else:
                raise KeyError()
        except KeyError:
            # cache miss - try the local whois command
            r = subprocess.Popen(['whois', domain], stdout=subprocess.PIPE)
            start = time.time()
            while r.poll() is None:
                time.sleep(0.5)
                if time.time() - start > timeout:
                    # command is taking too long - kill it and give up
                    try:
                        r.kill()
                    except Exception:
                        pass
                    break
            if r.poll() != 1:
                text = r.communicate()[0]
            if '@' in text:
                # result contains contact details, so looks valid - cache it
                if self.cache:
                    self.cache[key] = text
        return text
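# Hedged sketch (not part of the library): the poll-with-timeout pattern used by
# whois() above, shown standalone. Assumes a `whois` binary is on the PATH; the
# command and the 10 second limit are illustrative only.
import subprocess
import time

def run_with_timeout(args, timeout=10, poll_interval=0.5):
    """Run a command and kill it if it runs longer than `timeout` seconds."""
    proc = subprocess.Popen(args, stdout=subprocess.PIPE)
    start = time.time()
    while proc.poll() is None:
        time.sleep(poll_interval)
        if time.time() - start > timeout:
            try:
                proc.kill()
            except Exception:
                pass
            break
    # only read the output if the process exited cleanly
    return proc.communicate()[0] if proc.poll() == 0 else ''

# text = run_with_timeout(['whois', 'example.com'])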
def throttle(self, url, delay, proxy=None, variance=0.5):
    """Delay a minimum time for each domain per proxy by storing the last access time

    `url' is the URL we intend to download
    `delay' is the minimum amount of time (in seconds) to wait after downloading content from this domain
    `proxy' is the proxy to download through
    `variance' is the amount of randomness in the delay, 0-1
    """
    key = str(proxy) + ":" + common.get_domain(url)
    start = datetime.now()
    while datetime.now() < Download.domains.get(key, start):
        time.sleep(SLEEP_TIME)
    # update domain timestamp to when it can next be queried
    Download.domains[key] = datetime.now() + timedelta(seconds=delay * (1 + variance * (random.random() - 0.5)))
def crawl(self, D, url, html):
    """Crawl this website HTML and return the list of new URLs to crawl
    """
    def normalize(link):
        """Normalize the link to avoid duplicates
        """
        if '#' in link:
            # strip the fragment so in-page anchors do not create duplicates
            link = link[:link.index('#')]
        link = common.unescape(link)  # decode HTML entities such as &amp;
        return urlparse.urljoin(url, link)  # support relative links

    def valid(link):
        """Check whether this link should be crawled
        """
        # check it is not a media file
        if common.get_extension(link) not in common.MEDIA_EXTENSIONS:
            # check it is a proper HTTP link
            if link.lower().startswith('http'):
                # only crawl within this website
                if common.same_domain(domain, link):
                    # passes the URL regexes
                    if self.allowed_urls.match(link) and not self.banned_urls.match(link):
                        # not blocked by robots.txt
                        if not self.robots or self.robots.can_fetch(settings.user_agent, link):
                            # allowed to recrawl
                            if self.crawl_existing or (D.cache and link not in D.cache):
                                return True
        return False

    domain = common.get_domain(url)
    depth = CrawlerCallback.found[url]
    outstanding = []
    if depth != self.max_depth:
        # extract links to continue crawling
        links_re = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
        for link in links_re.findall(html):
            link = normalize(link)
            if link not in CrawlerCallback.found:
                CrawlerCallback.found[link] = depth + 1
                if valid(link):
                    # is a new link
                    outstanding.append(link)
                    if len(outstanding) == self.max_links:
                        break
    return outstanding
def throttle(self, url, delay, proxy=None, variance=0.5):
    """Delay a minimum time for each domain per proxy by storing the last access time

    `url' is the URL we intend to download
    `delay' is the minimum amount of time (in seconds) to wait after downloading content from this domain
    `proxy' is the proxy to download through
    `variance' is the amount of randomness in the delay, 0-1
    """
    if delay > 0:
        key = str(proxy) + ':' + common.get_domain(url)
        start = datetime.datetime.now()
        while datetime.datetime.now() < Download.domains.get(key, start):
            time.sleep(SLEEP_TIME)
        # update domain timestamp to when it can next be queried
        Download.domains[key] = datetime.datetime.now() + datetime.timedelta(seconds=delay * (1 + variance * (random.random() - 0.5)))
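# Hedged illustration (not library code): the jitter term above spreads each wait
# around `delay`. With the default variance of 0.5 the effective delay always falls
# between 0.75 * delay and 1.25 * delay; the check below just demonstrates that range.
import random

def jittered_delay(delay, variance=0.5):
    return delay * (1 + variance * (random.random() - 0.5))

samples = [jittered_delay(4.0) for _ in range(1000)]
assert all(3.0 <= s <= 5.0 for s in samples)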
def crawl(self, D, url, html):
    """Crawl this website HTML and return the list of new URLs to crawl
    """
    def valid(link):
        """Check whether this link should be crawled
        """
        # check it is not a media file
        if common.get_extension(link) not in common.MEDIA_EXTENSIONS:
            # check it is a proper HTTP link
            if link.lower().startswith('http'):
                # only crawl within this website
                if common.same_domain(domain, link):
                    # passes the URL regexes
                    if self.allowed_urls.match(link) and not self.banned_urls.match(link):
                        # not blocked by robots.txt
                        if not self.robots or self.robots.can_fetch(settings.user_agent, link):
                            # allowed to recrawl
                            if self.crawl_existing or (D.cache and link not in D.cache):
                                return True
        return False

    domain = common.get_domain(url)
    depth = self.found[url]
    outstanding = []
    if depth != self.max_depth:
        # extract links to continue crawling
        links_re = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
        for link in links_re.findall(html):
            try:
                link = self.normalize(url, link)
            except UnicodeDecodeError as e:
                # unicode error when joining the URL
                common.logger.info(e)
            else:
                if link not in self.found:
                    self.found[link] = depth + 1
                    if valid(link):
                        # is a new link
                        outstanding.append(link)
                        if len(outstanding) == self.max_links:
                            break
    return outstanding
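# Hedged sketch of the normalize() helper that crawl() above relies on: it mirrors
# the inline normalize() from the earlier version of crawl() and may not match the
# exact implementation behind self.normalize.
def normalize(self, url, link):
    """Normalize the link to avoid duplicates
    """
    if '#' in link:
        # strip the fragment so in-page anchors do not create duplicates
        link = link[:link.index('#')]
    link = common.unescape(link)  # decode HTML entities such as &amp;
    return urlparse.urljoin(url, link)  # resolve relative links against the page URL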
def whois(self, url, timeout=10):
    """Query the whois info for this URL's domain
    """
    domain = common.get_domain(url)
    if domain:
        text = ''
        key = 'whois_%s' % domain
        try:
            if self.cache:
                text = self.cache[key]
            else:
                raise KeyError()
        except KeyError:
            # cache miss - try an online whois app first
            query_url = 'http://whois.chinaz.com/%s' % domain
            html = self.get(query_url)
            match = re.compile(r"<script src='(request\.aspx\?domain=.*?)'></script>").search(html)
            if match:
                script_url = urlparse.urljoin(query_url, match.groups()[0])
                text = self.get(script_url, read_cache=False)
            if '@' not in text:
                # online lookup returned no contact details, so discard the cached page
                if self.cache:
                    del self.cache[query_url]
                # failed, so try the local whois command
                r = subprocess.Popen(['whois', domain], stdout=subprocess.PIPE)
                start = time.time()
                while r.poll() is None:
                    time.sleep(0.5)
                    if time.time() - start > timeout:
                        # command is taking too long - kill it and give up
                        try:
                            r.kill()
                        except Exception:
                            pass
                        break
                if r.poll() != 1:
                    text = r.communicate()[0]
            if '@' in text:
                # result contains contact details, so looks valid - cache it
                if self.cache:
                    self.cache[key] = text
        return text
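# Hedged illustration (the HTML fragment is an assumption): how the regex above
# extracts the whois data endpoint embedded in the chinaz.com page before it is
# resolved against the query URL.
import re
import urlparse

sample_html = "<script src='request.aspx?domain=example.com'></script>"
match = re.compile(r"<script src='(request\.aspx\?domain=.*?)'></script>").search(sample_html)
if match:
    script_url = urlparse.urljoin('http://whois.chinaz.com/example.com', match.groups()[0])
    # script_url == 'http://whois.chinaz.com/request.aspx?domain=example.com'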
def throttle(self, url, delay, proxy=None, variance=0.5):
    """Delay a minimum time for each domain per proxy by storing the last access time

    url:
        the URL we intend to download
    delay:
        the minimum amount of time (in seconds) to wait after downloading content from this domain
    proxy:
        the proxy to download through
    variance:
        the amount of randomness in the delay, 0-1
    """
    if delay > 0:
        key = str(proxy) + ':' + common.get_domain(url)
        if key in Download._domains:
            # wait until the next allowed access time for this domain/proxy
            while datetime.datetime.now() < Download._domains.get(key):
                time.sleep(SLEEP_TIME)
        # update domain timestamp to when it can next be queried
        Download._domains[key] = datetime.datetime.now() + datetime.timedelta(seconds=delay * (1 + variance * (random.random() - 0.5)))
def throttle(self, url, delay, proxy=None, variance=0.5):
    """Delay a minimum time for each domain per proxy by storing the last access time

    url:
        the URL we intend to download
    delay:
        the minimum amount of time (in seconds) to wait after downloading content from this domain
    proxy:
        the proxy to download through
    variance:
        the amount of randomness in the delay, 0-1
    """
    if delay > 0:
        key = ':'.join([str(proxy), self.throttle_additional_key or '', common.get_domain(url)])
        if key in Download._domains:
            # wait until the next allowed access time for this domain/proxy
            while datetime.datetime.now() < Download._domains.get(key):
                time.sleep(SLEEP_TIME)
        # update domain timestamp to when it can next be queried
        Download._domains[key] = datetime.datetime.now() + datetime.timedelta(seconds=delay * (1 + variance * (random.random() - 0.5)))
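# Hedged illustration (values are assumptions, not from the library): how the
# composite key above keeps separate throttle state, so the same domain fetched
# through different proxies or sessions is timed independently.
proxy = '203.0.113.7:8080'
additional_key = 'session-a'
domain = 'example.com'
key = ':'.join([str(proxy), additional_key or '', domain])
# key == '203.0.113.7:8080:session-a:example.com'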