def __init__(self):
    self.unvisitedURLs = set()
    self.visitedURLs = set()
    self.buggyURLs = set()
    self.robotParser = robotparser.RobotFileParser()
    self.contentDigest = {}
    self.http = httplib2.Http(".cache")
def check_robots(url):
    robots_parser = robotparser.RobotFileParser()
    robots_parser.set_url(Spider.currentBaseUrl + '/robots.txt')
    robots_parser.read()
    return robots_parser.can_fetch(Spider.AGENT_NAME, url)
def craw(self, root_url):
    count = 1
    self.urls.add_new_url(root_url)
    throttle = self.downloader.Throttle(0)
    rp = robotparser.RobotFileParser()
    rp.set_url('https://baike.baidu.com/robots.txt')
    rp.read()
    user_agent = 'wswp'
    while self.urls.has_new_url():
        try:
            new_url = self.urls.get_new_url()
            print 'page %d: %s' % (count, new_url)
            if rp.can_fetch(user_agent, new_url):
                throttle.wait(new_url)
                self.craw_isrunning(new_url)
                if count == 20:
                    break
                count += 1
            else:
                print 'Blocked by robots.txt:', new_url
        except Exception, e:
            print 'craw failed'
            print e.message
def connect_mozrepl(url_addr):
    quit = False
    t = telnetlib.Telnet("localhost", 4242)
    t.read_until("repl>")
    # verify the page may be fetched; the parser must be pointed at the
    # site's robots.txt and read before can_fetch() is meaningful
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url_addr, '/robots.txt'))
    rp.read()
    fetched = rp.can_fetch("*", url_addr)
    print fetched
    if fetched:
        delay = random.random() * 500
        print delay
        time.sleep(delay)  # WAIT FOR WEBPAGE TO LOAD
        cmd = "content.location.href='" + url_addr.strip() + "'\n"
        print cmd
        t.write(cmd)
        t.read_until("repl>")
    else:
        print "unable to fetch web page, exiting!!!"
        quit = True
    t.write("content.document.body.innerHTML\n")
    body = t.read_until("repl>")
    t.close()
    return body, quit
def _get_robotparser(self, link):
    """Return the proper robots parser for the given url or None if one
    cannot be constructed. Robot parsers are cached per scheme and netloc."""
    # only some schemes have a meaningful robots.txt file
    if link.scheme != 'http' and link.scheme != 'https':
        debugio.debug(
            'crawler._get_robotparser() called with unsupported scheme (%s)'
            % link.scheme)
        return None
    # split out the key part of the url
    location = urlparse.urlunsplit((link.scheme, link.netloc, '', '', ''))
    # try to create a new robotparser if we don't already have one
    if location not in self._robotparsers:
        import httplib
        debugio.info('  getting robots.txt for %s' % location)
        self._robotparsers[location] = None
        try:
            rp = robotparser.RobotFileParser()
            rp.set_url(urlparse.urlunsplit(
                (link.scheme, link.netloc, '/robots.txt', '', '')))
            rp.read()
            self._robotparsers[location] = rp
        except (TypeError, IOError, httplib.HTTPException):
            # ignore any problems setting up robot parser
            pass
    return self._robotparsers[location]
def link_crawler(seed_url, delay=1, link_regex=None, proxies=None, max_depth=2,
                 user_agent='wswp', num_retries=2, scrape_callback=None, cache=None):
    crawl_queue = [seed_url]
    seen = {seed_url: 0}
    # point the robots parser at the seed domain's robots.txt
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(seed_url, '/robots.txt'))
    rp.read()
    # construct the defaults here rather than as shared default arguments
    if scrape_callback is None:
        scrape_callback = ScrapeCallback()
    # initialize downloader
    if cache is None:
        cache = MongoCache()
    D = downloader(cache=cache)
    while crawl_queue:
        url = crawl_queue.pop()
        # detect whether this url is banned
        if not rp.can_fetch(user_agent, url):
            print 'Blocked by robots.txt:', url
            continue
        depth = seen[url]
        if depth != max_depth:
            html = D(url)
            # parse html and convert to csv
            if scrape_callback:
                scrape_callback(url, html)
            for link in get_links(html):
                if link_regex is None or re.search(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)
def link_crawler(seed_url, link_regex):
    crawl_queue = [seed_url]
    # Earlier version: to avoid crawler traps, seen was changed from a set to
    # a dict. seen records links that have already been accepted, to avoid
    # infinite loops caused by pages linking to each other.
    # set() builds a collection without duplicates:
    # seen = set(crawl_queue)
    #
    # Set max_depth to a negative number to disable the crawler-trap
    # protection. The protection keeps a <url, depth> dict and stops
    # following links once the depth reaches the maximum.
    max_depth = 1
    seen = {seed_url: 0}
    # Honour robots.txt: all three lines below are required, none may be
    # omitted.
    rp = robotparser.RobotFileParser()
    rp.set_url('http://example.webscraping.com/robots.txt')
    rp.read()
    # rate limiting; a single Throttle must be shared across iterations
    throttle = Throttle(delay=2000)
    while crawl_queue:
        url = crawl_queue.pop()
        # a user agent such as 'BadCrawler' would be blocked here
        if rp.can_fetch('Crawler', url):
            throttle.wait(url)
            html = download(url)
            depth = seen[url]
            if depth != max_depth:
                for link in get_links(html):
                    if re.match(link_regex, link):
                        # urljoin (from the imported package) turns relative
                        # paths into absolute ones
                        link = urlparse.urljoin(seed_url, link)
                        if link not in seen:
                            seen[link] = depth + 1
                            crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt:', url
def Allowed(self, url, UserAgentString):
    try:
        parsed = urlparse(url)
        port = ""
        if parsed.port:
            port = ":" + str(parsed.port)
    except ValueError:
        print("ValueError: " + url)
        return True  # cannot parse the url, assume it is allowed
    try:
        roboturl = parsed.scheme + "://" + parsed.hostname + port + "/robots.txt"
    except TypeError:
        print(parsed)
        return True  # no hostname, assume it is allowed
    if roboturl not in self.RuleDict:
        self.RuleDict[roboturl] = robotparser.RobotFileParser(roboturl)
        try:
            self.RuleDict[roboturl].read()
        except IOError:
            del self.RuleDict[roboturl]
            return True
    try:
        return self.RuleDict[roboturl].can_fetch(UserAgentString, url)
    except KeyError:
        print("KeyError: " + url)
        return True
def get_robotparser(url):
    # a function named robotparser would shadow the module it imports,
    # so a distinct name is used here
    import robotparser
    import urlparse
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp
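# Usage sketch for get_robotparser above; example.com is a placeholder for a
# site that actually serves a robots.txt:
def demo_get_robotparser():
    rp = get_robotparser('http://example.com')
    print rp.can_fetch('*', 'http://example.com/some/page.html')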
def check_robots(self, msg):
    data = json.loads(msg.body)
    self.ch.basic_ack(msg.delivery_tag)
    # get the robots.txt URL
    url = self.get_robots_url(data['url'])
    logging.info("Using robots url: %s", url)
    try:
        # fetch robots.txt
        robots_txt = requests.get(url, headers=self.headers)
        # pass the content to the robots.txt parser
        rbp = robotparser.RobotFileParser()
        rbp.parse(robots_txt.text.splitlines())
        # check to see if we're allowed in - test using OrgProbe's useragent
        if not rbp.can_fetch(self.config.get('daemon', 'probe_useragent'),
                             data['url']):
            logging.warn("Disallowed: %s", data['url'])
            # write rejection to DB
            self.set_url_status(data['url'], 'disallowed-by-robots-txt')
            return True
        else:
            # we're allowed in.
            logging.info("Allowed: %s", data['url'])
    except Exception, v:
        # if anything bad happens, log it but continue
        logging.error("Exception: %s", v)
def get_robots(url): """解析robots.txt文件 """ rp = robotparser.RobotFileParser() rp.set_url(urlparse.urljoin(url, '/robots.txt')) rp.read() return rp
def _can_fetch(self, url):
    # return True  # uncomment to bypass robots.txt checking entirely
    robots_file = self._get_robots_file_url(url)
    rp = robotparser.RobotFileParser()
    rp.set_url(robots_file)
    rp.read()
    return rp.can_fetch(USER_AGENT, url)
def link_crawler(seed_url, link_regex): """ crawlfrom the given seed URL following links matched by link_regex :param seed_url: :param link_regex: :return: """ #read the robots.txt rp = robotparser.RobotFileParser() rp.set_url('http://example.webscraping.com/robots.txt') rp.read() #set the agent's name user_agent = "667's Python Spider" #set the delay for crawl speed 5 second th = Throttle.Throttle(5) #set the crawl queue for crawled url crawl_queue = [seed_url] visited = set(crawl_queue) while crawl_queue: url = crawl_queue.pop() if rp.can_fetch(user_agent, url): th.wait(url) html = download_network_page(url) print html # filter for links matching out regular expression for link in get_links(html): if re.match(link_regex, link): link = urlparse.urljoin(seed_url, link) if link not in visited: visited.add(link) crawl_queue.append(link)
def check_robots(url, user_agent, robots_name='robots.txt'):
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, robots_name))
    rp.read()
    return rp.can_fetch(user_agent, url)
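# Hypothetical call for check_robots above; the agent name and URL are
# placeholder values, not from the original code:
def demo_check_robots():
    if check_robots('http://example.com/page.html', 'MyCrawler'):
        print 'allowed to fetch'
    else:
        print 'blocked by robots.txt'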
def check_robot(self, url):
    """Check robots.txt on the root of the URL."""
    global robot_trace
    global lock_robottrace
    robot_url = urlparse.urljoin(url, '/robots.txt')
    fetch_robot = False
    lock_robottrace.acquire()
    if robot_url not in robot_trace:
        fetch_robot = True
    else:
        ret = robot_trace[robot_url].can_fetch("*", url)
    lock_robottrace.release()
    rp = None
    try:
        if fetch_robot:
            rp = robotparser.RobotFileParser()
            rp.set_url(robot_url)
            rp.read()
        else:
            return ret
    except:
        # if robots.txt cannot be fetched, assume the URL is allowed
        return True
    else:
        lock_robottrace.acquire()
        if robot_url not in robot_trace:
            robot_trace[robot_url] = rp
            print "Found robots.txt at %s" % robot_url
        ret = robot_trace[robot_url].can_fetch("*", url)
        lock_robottrace.release()
        return ret
def add_to_redis(r_conn, url):
    # url is the site's base address; the original body referenced an
    # undefined name 'id' here, and the unused redis_server/redis_db
    # parameters have been dropped
    robots = rParser.RobotFileParser()
    robots.set_url(url + "/robots.txt")
    robots.read()
    # cache the pickled parser in redis for two hours
    r_conn.set(url, cP.dumps(robots))
    r_conn.expire(url, 7200)
    return robots
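# A sketch of the matching read-back path for the cache above, assuming the
# rParser/cP aliases stand for robotparser/cPickle; check_cached_robots is a
# hypothetical helper name, not part of the original code:
def check_cached_robots(r_conn, url):
    cached = r_conn.get(url)
    if cached is not None:
        return cP.loads(cached)        # reuse the parser pickled earlier
    return add_to_redis(r_conn, url)   # miss: fetch, cache and return fresh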
def link_crawler1(seed_url, link_regex, user_agent='lcy', max_depth=2):
    crawl_queue = [seed_url]
    seen = {}  # record URLs we have already seen
    seen[seed_url] = 1
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(seed_url, '/robots.txt'))
    rp.read()
    while crawl_queue:
        url = crawl_queue.pop()
        # check robots.txt to decide whether this agent may fetch the URL
        if rp.can_fetch(user_agent, url):
            html = download1(url, user_agent)
            # get_links extracts every static URL in the html;
            # link_regex keeps target URLs and filters out other pages.
            # Links are only extracted while below the depth limit; set
            # max_depth to a negative number so the test below never
            # matches, which disables the depth limit.
            depth = seen[url]
            if depth != max_depth:
                for link in get_links(html):
                    if re.match(link_regex, link):
                        # Browsers can resolve relative hrefs in a page, but
                        # urllib2 cannot, so use urljoin to build an
                        # absolute link.
                        link = urlparse.urljoin(seed_url, link)
                        if link not in seen:
                            seen[link] = depth + 1
                            crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt:', url
def robot_fetch(path):
    # build a one-rule parser for each configured exclusion and test the path
    for rule in kw["robots_exclusions"]:
        robot = robotparser.RobotFileParser()
        robot.parse(["User-Agent: *", "Disallow: {0}".format(rule)])
        if not robot.can_fetch("*", '/' + path):
            return False  # not robot food
    return True
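# Illustrative call for robot_fetch above; the exclusion rules are made-up
# values, and kw is assumed to be a dict visible in the enclosing scope:
def demo_robot_fetch():
    global kw
    kw = {"robots_exclusions": ["/cgi-bin/", "/tmp/"]}
    print robot_fetch("tmp/cache.html")      # False: matches the /tmp/ rule
    print robot_fetch("public/index.html")   # True: no rule matches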
def testPythonOrg(self):
    test_support.requires('network')
    parser = robotparser.RobotFileParser(
        "http://www.python.org/robots.txt")
    parser.read()
    self.assertTrue(
        parser.can_fetch("*", "http://www.python.org/robots.txt"))
def get_robots(self, url):
    # Extract the domain base url and check for the existence of robots.txt.
    # Process robots.txt (User-agent, Allow, Disallow, Crawl-delay and Sitemap)??
    # If a sitemap is defined, should all the URLs within it be added to the
    # frontier exclusively or additionally?
    # If the site is not already in the DB, write it there;
    # else just try to find the site's RP object in the local cache.
    cursor = self.cursor
    conn = self.db_conn
    parsed_uri = urlparse(url)
    domain = Crawler_worker.remove_www(parsed_uri.netloc)
    domain_url = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
    # restore from cache if stored, else create
    if domain in Crawler_worker.cache_robots:
        rp = Crawler_worker.cache_robots[domain]
    else:
        robots_url = domain_url + 'robots.txt'
        rp = robotparser.RobotFileParser()
        rp.set_url(robots_url)
        try:
            rp.read()
        except Exception as e:
            print(self.id, 'EXCEPTION get_robots()', e)
        Crawler_worker.cache_robots_lock.acquire()
        self.cache_robots_lock_timestamp = time.time()
        if domain not in Crawler_worker.cache_robots:
            Crawler_worker.cache_robots[domain] = rp
        Crawler_worker.cache_robots_lock.release()
        self.cache_robots_lock_timestamp = None
    return rp
def get_robots(url): """ 返回robots.txt中的url限制判断器 """ rp = robotparser.RobotFileParser() rp.set_url(urlparse.urljoin(url, '/robots.txt')) rp.read() return rp
def __init__(self, url, urlLimit, delay):
    #self.s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    #self.s.settimeout(3600)
    #self.s.connect(("localhost", 1337))
    self.sleeptime = delay
    self.limit = urlLimit
    self.url = url
    self.db = MySql.Database("127.0.0.1", "pythondb", "pythonUser", "pythondb")
    self.filename = self.url
    self.urlfile = open(self.filename + '.txt', 'wb')
    self.url = 'http://' + self.url
    self.urls = [self.url]
    self.counter = 0
    self.rp = robotparser.RobotFileParser()
    self.rp.set_url(self.url + '/robots.txt')
    self.rp.read()
    print 'Set robotparser url to:', self.url + '/robots.txt'
    self.urlOpener = myOpener()
def link_crawler(seed_url, link_regex, max_depth=1, scrape_callback=None):
    crawl_queue = [seed_url]
    seen = {seed_url: 0}
    user_agent = 'wswp'
    throttle = Throttle.Throttle(5)
    # read robots.txt once for the seed domain rather than once per URL
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(seed_url, '/robots.txt'))
    rp.read()
    while crawl_queue:
        url = crawl_queue.pop()
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            depth = seen[url]
            if depth != max_depth:
                for link in get_links(html):
                    if re.match(link_regex, link):
                        link = urlparse.urljoin(seed_url, link)
                        if link not in seen:
                            seen[link] = depth + 1
                            crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt:', url
def get_robots(url): """Initialize robots parser for this domain """ rp = robotparser.RobotFileParser() rp.set_url(urlparse.urljoin(url, '/robots.txt')) rp.read() return rp
def link_crawler(seed_url, max_depth=2):
    """Crawl from the given seed URL, following all discovered links."""
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(seed_url, '/robots.txt'))
    rp.read()
    user_agent = 'Mozilla/5.0'
    throttle = Throttle(2)
    crawl_queue = [seed_url]
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url)
            for link in get_links(html):
                link = urlparse.urljoin(seed_url, link)
                # check if crawler has already seen this link
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt:', url
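# Several crawlers above rely on a Throttle helper that is never defined in
# this collection. A minimal sketch, assuming it enforces a per-domain delay
# between requests (in the style of the wswp examples):
import time
import datetime
import urlparse

class Throttle(object):
    """Pause between downloads to the same domain."""

    def __init__(self, delay):
        self.delay = delay   # minimum seconds between requests to one domain
        self.domains = {}    # domain -> timestamp of the last request

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (
                datetime.datetime.now() - last_accessed).total_seconds()
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()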
def testPasswordProtectedSite(self):
    test_support.requires('network')
    with test_support.transient_internet('mueblesmoraleda.com'):
        url = 'http://mueblesmoraleda.com'
        robots_url = url + "/robots.txt"
        # First check the URL is usable for our purposes, since the
        # test site is a bit flaky.
        try:
            urlopen(robots_url)
        except HTTPError as e:
            if e.code not in {401, 403}:
                self.skipTest(
                    "%r should return a 401 or 403 HTTP error, not %r"
                    % (robots_url, e.code))
        else:
            self.skipTest(
                "%r should return a 401 or 403 HTTP error, not succeed"
                % (robots_url))
        parser = robotparser.RobotFileParser()
        parser.set_url(robots_url)
        try:
            parser.read()
        except IOError:
            self.skipTest('%s is unavailable' % url)
        self.assertEqual(parser.can_fetch("*", robots_url), False)
def get_links(self, soup, base_url):
    """Extract the urls from a parsed html."""
    base = soup.find("base", href=True)
    if base:
        base_url = urljoin(base_url, base.get("href"))
    all_links = [urljoin(base_url, i.get('href').strip())
                 for i in soup.find_all('a', href=True)]
    # Remove urls starting with "/". For debugging purposes this is the long
    # version of:
    # links = [l for l in all_links if not l.startswith("/")]
    links = []
    for l in all_links:
        if l.startswith("/"):
            logging.debug("skipping url starting with '/'. base: %s link: %s"
                          % (base_url, l))
        else:
            links.append(l)
    if self.nofollow_compliant:
        nofollow = set(
            urljoin(base_url, i.get('href'))
            for i in soup.find_all('a', {"rel": "nofollow"}, href=True))
        links = [l for l in links if l not in nofollow]
    allowed_links = set()
    domains = set()
    for l in links:
        domain = self.get_domain(l)
        if len(self.allowed_domains) == 0 or domain in self.allowed_domains:
            norm_l = self.normalize_url(l)
            if [True for ex in self.exclude_pages if re.match(ex, norm_l)]:
                continue
            rb = None
            domains.add(domain)
            if self.robots_compliant:
                rb = self.robotparser_cache.get(domain)
                if rb is None:
                    rb = robotparser.RobotFileParser()
                    # TODO: fix this; assumes http and the default port
                    rb.set_url("http://" + domain + "/robots.txt")
                    rb.read()
                    self.robotparser_cache[domain] = rb
            # This try-except is here because can_fetch sometimes raises
            # unicode-related KeyErrors; it seems to be a library bug.
            try:
                # Using "*" because the user agent is not known here for now.
                if not rb or rb.can_fetch("*", norm_l):
                    allowed_links.add(norm_l)
            except KeyError:
                pass
    return list(allowed_links)
def get_robots(url): """Return True if both URL's belong to same domain """ rp = robotparser.RobotFileParser() rp.set_url(urlparse.urljoin(url, '/robots.txt')) rp.read() return rp
def testPythonOrg(self):
    test_support.requires('network')
    with test_support.transient_internet('www.python.org'):
        parser = robotparser.RobotFileParser(
            "https://www.python.org/robots.txt")
        parser.read()
        self.assertTrue(
            parser.can_fetch("*", "https://www.python.org/robots.txt"))
def robot_file(domain):
    rp = robotparser.RobotFileParser(urlparse.urljoin(domain, "/robots.txt"))
    rp.read()
    def _clos(url):
        return rp.can_fetch("*", url)
    return _clos
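# Usage sketch for the robot_file closure above; example.com is a placeholder:
def demo_robot_file():
    allowed = robot_file("http://example.com/")
    print allowed("http://example.com/index.html")
    print allowed("http://example.com/admin/")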