def download_pages_in_queue(self, queue):
    current_page_url = queue.get()
    robot = RobotsCache()
    if robot.allowed(current_page_url, "*"):
        print current_page_url
        if len(current_page_url) < 10:
            return
        current_page_html = download_page_by_url(current_page_url)
        bs = BeautifulSoup(current_page_html, "html.parser")
        links = bs.find_all('a', href=True)
        post_links = [link['href'] for link in links]
        for post_link in post_links:
            if len(post_link) < 10:
                continue
            if str(post_link).find('http') != 0:
                post_link = str(self.start_url) + str(post_link)
            queue.put(post_link)
        self.sites_num = self.sites_num + 1
        page = Pages(url=current_page_url,
                     parsed_text=get_text_from_html(current_page_html),
                     is_indexed=False)
        page.save()
    else:
        print "Page can't be indexed because of the rules in ROBOTS.TXT"
class RobotsMiddleware(BaseMiddleware):
    def __init__(self, *args, **kwargs):
        self.cache = RobotsCache(*args, **kwargs)
        self.visited = collections.defaultdict(dict)

    def check_disallow(self, url, agent):
        if not self.cache.allowed(url, agent):
            raise RobotsDisallowedError

    def check_crawl_delay(self, url, agent):
        delay = self.cache.delay(url, agent)
        if delay is None:
            return
        now = datetime.datetime.utcnow()
        host = urlparse.urlparse(url).hostname
        try:
            last_visit = self.visited[agent][host]
            if (now - last_visit).seconds < delay:
                raise RobotsThrottledError
        except KeyError:
            pass
        self.visited[agent][host] = now

    def before_send(self, request, *args, **kwargs):
        url = request.url
        agent = request.headers.get('User-Agent')
        self.check_disallow(url, agent)
        self.check_crawl_delay(url, agent)
def robot_pass(self, page):
    """
    Accepts page [object]
    Creates instance of RobotsCache (from reppy)
    Passes URL of page as string into robots.allowed method
    Returns True or False
    """
    robots = RobotsCache()
    return robots.allowed(page.get_url(), '*')
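A brief, hedged usage sketch for robot_pass above: the Page class is a stand-in for whatever page object the surrounding crawler passes in (only its get_url method is assumed), and the URL is purely illustrative.

class Page(object):
    # Minimal stand-in page object; robot_pass() only relies on get_url().
    def __init__(self, url):
        self._url = url

    def get_url(self):
        return self._url

# crawler.robot_pass(Page("https://example.com/some/path"))
# -> True if example.com's robots.txt lets agent '*' fetch that path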
def get_scanner_mock(request_limit):
    robots_cache = RobotsCache()
    robots_cache.fetch = MagicMock(return_value=robots_cache)
    robots_cache.allowed = MagicMock(return_value=True)
    robots_validator = RobotsValidator(agent='*')
    robots_validator.robots = robots_cache
    scanner = UrlScanner(request_limit)
    scanner.url_fetcher = get_url_fetcher_mock(request_limit)
    scanner.robots_validator = robots_validator
    return scanner
def check_for_robot_access(self, page):
    self.f.write('--- checking for robots %s\n' % page)
    robots = RobotsCache()
    try:
        if robots.allowed(page + 'robots.txt', 'my-agent'):
            print 'robots allowed'
            self.f.write('robots allowed. \n')
            return True
    except ServerError, r:
        print 'error ', r
    return False
def get_text_by_base_url(self):
    robots = RobotsCache(capacity=100)
    if not robots.allowed(self.base_url, "python-requests"):
        return ["Crawling this site is not allowed by robots.txt"]
    text_list = []
    for slug in self.__get_links_by_url_depth():
        sleep(0.5)
        text_list.append(
            remove_emoji(
                remove_url(self.__get_text_by_url(self.base_url + slug))).strip())
    return text_list
def confirm_robots_txt(target_url, max_capacity):
    '''confirm that target url is allowed to crawl

    :type target_url: str
    :param target_url: url the agent wants to crawl
    :type max_capacity: int
    :param max_capacity: limit of max crawling pages
    :rtype: bool
    :return: whether it is possible to scrape
    '''
    robots = RobotsCache(max_capacity)
    return robots.allowed(target_url, 'python program')
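A small usage sketch for confirm_robots_txt; the target URL and the capacity value are illustrative, not taken from the original project.

if __name__ == '__main__':
    # Illustrative target; any URL could be checked the same way.
    target = 'https://www.python.org/'
    if confirm_robots_txt(target, 100):
        print('Allowed to crawl {}'.format(target))
    else:
        print('robots.txt forbids crawling {}'.format(target))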
def run(self):
    global terminator
    pattern = '(http://)(\w*\.)+\w+(/\w*)*'
    # Initialize RobotsCache object
    robots = RobotsCache()
    while 1:
        if terminator:
            break
        cur_raw_tweet = raw_tweets.get(True)
        curtweet = json.loads(cur_raw_tweet)
        if DEBUG:
            print "Got an item from raw_tweets", current_thread().getName()
        # Check if twitter has rate limited you by sending a blank tweet
        if u'text' in curtweet.keys():
            text = curtweet[u'text']
        else:
            print "Rate limited by twitter. Continuing"
            continue
        # Get text and check if it has links using regex.
        link = re.search(pattern, text)
        if link:
            if DEBUG:
                print "match"
            flink = link.group()
            # Check if crawling is allowed
            try:
                if robots.allowed(flink, 'tweetbot'):
                    soup = BeautifulSoup(urllib2.urlopen(flink), "lxml")
                    # Check if page has title
                    if soup.title:
                        curtweet[u'linkTitle'] = soup.title.string
            except reppy.ReppyException:
                print "Error fetching robots.txt. Continuing"
                continue
            except urllib2.URLError:
                print "Bad Url. Report to the developer. Continuing"
                continue
            except urllib2.HTTPError:
                print "Error Fetching Web Page. Continuing"
                continue
        else:
            if DEBUG:
                print "not match"
        processed_tweets.put(json.dumps(curtweet), True)
        if DEBUG:
            print "Put on processed queue. ProcessedSize", processed_tweets.qsize()
class EZWS:
    """
    SELF:

    config  json config file
    ua      user agent
    robo    robotcache obj
    link    current link
    urlp    url parse object for current link
    soup    current html page soup obj
    req     requests obj
    raw     raw html from req.get()
    check   check for robot files, keep true
    output  name of output csv file
    """
    def __init__(self, file, ua, check=True, output="output.csv"):
        # setting output to false disables file output
        if check:  # only setup robot checker if robot checking is enabled
            self.ua = ua  # user agent
            self.robo = RobotsCache(capacity=100)

        # check var disables or enables robots.txt checking
        # recommended to keep default True value
        self.check = check

        self.req = requests  # request obj for parsing url
        self.output = output  # where to output file
        self.data = []  # init array of grabbed sites

        self.configarr = []  # empty array of all configs
        if type(file) is list:
            self.configarr = file
        else:
            self.configarr.append(file)

    def allowed(self, url):  # checks if url is ok to download
        if self.check:
            if self.robo.allowed(url, self.ua):  # checks robot file
                return True
            else:
                print(url, "is not allowed")  # notify user if url isnt allowed
                return False
        else:
            return True  # if robot checking is off, return true regardless

    @property  # when url is called, return it
    def url(self):
        if hasattr(self, "link"):  # handles whether self has link attribute
            return self.link
        else:
            return ""  # if not return empty string

    @url.setter  # when url is set, parse it
    def url(self, url):
        self.link = url
        self.urlp = urlparse(url)

    def download(self, url):
        if self.allowed(url):
            self.raw = self.req.get(url).content
            self.soup = BeautifulSoup(self.raw, "html.parser")  # loads html into soup obj

    def xpath(self, html, xp):  # takes html and returns data from xpath
        tree = lxmlhtml.fromstring(html)  # generates tree
        return tree.xpath(xp)  # returns data from tree

    def select(self, html, obj):  # determines whether to grab using css or xpath
        if "xpath" in obj:  # if xpath
            items = self.xpath(html.getText(), obj["xpath"])  # return xpath selector arr
        else:  # css
            items = html.select(obj["css"])  # return a css selector arr

        if self.config["header"]:  # if theres a header keep data to one column
            items = items[:1]

        if "css" in obj:  # if data is css attribute(s) from element
            row = []
            for item in items:
                cont = []  # arr for storing attribs from each css selected element
                if type(obj["contents"]) is str:  # if contents is a string, put it into an array
                    obj["contents"] = [obj["contents"]]
                for content in obj["contents"]:
                    if content:  # if not empty, get the element from tag
                        cont.append(item[content])
                    else:  # if empty, get the text from tag
                        cont.append(item.text)
                row += cont  # append attribs to attrib array
            return row  # return all the attribs (css)
        else:
            return items  # return xpath

    def clear(self):
        self.data = []

    def load(self, index):
        tmp = self.configarr[index]
        if type(tmp) is dict:  # if file is json obj, load it
            self.config = tmp
        else:  # assume it is a file and load it
            if os.path.exists(tmp):
                with open(tmp) as f:
                    self.config = json.load(f)  # opens and parses json file

    def grab(self, index=None):
        if index == None:  # using grab() with no params will grab all configs passed
            for i in range(len(self.configarr)):
                self.grab(i)  # grab "i" config file
        else:
            self.load(index)  # get current file obj
            if self.output:  # only create simplecsv obj if file outputting is on
                sc = simplecsv(self.output, mode="w+")  # using w+ mode to remove old output
                if self.config["header"]:
                    sc.writerow(self.config["header"])  # add header from config to csv
            for link in self.config["links"]:  # loop through links
                samelinks = []  # empty list of links for now
                if type(link["url"]) is str:
                    samelinks.append(link["url"])  # if url is a single str not array append it to an array
                else:  # assume it is an array
                    samelinks = link["url"]
                for samelink in samelinks:  # passing "url" an array of urls will do the same params on all the links
                    if self.allowed(samelink):  # check if url is allowed
                        self.download(samelink)  # if so download it
                        for divs in self.soup.select(link["container"]):
                            add = []
                            for get in link["grab"]:  # grabs each element from inside each div
                                add += self.select(divs, get)
                            self.data += add  # update internal data
                            if self.output:
                                sc.writerow(add)  # only write to disk if file output is on
            if self.output:
                sc.close()  # only close "sc" if file output is on
class EZWS:
    """
    SELF:

    config  json config file
    ua      user agent
    txt     path to current robot file
    robo    robotcache obj
    link    current link
    urlp    url parse object for current link
    soup    current html page soup obj
    req     requests obj
    raw     raw html from req.get()
    check   check for robot files, keep true
    output  name of output csv file
    """
    def __init__(self, file, ua, check=True, output="output.csv"):
        if check:  # only setup robot checker if robot checking is enabled
            self.ua = ua  # user agent
            self.robo = RobotsCache(capacity=0)

        # check disables or enables robots.txt checking
        # recommended to keep default True value
        self.check = check

        self.req = requests

        if os.path.exists(file):
            with open(file) as f:
                self.config = json.load(f)  # opens and parses json file

    def allowed(self, url):  # checks if url is ok to download
        if self.check:
            if self.robo.allowed(url, self.ua):  # checks robot file
                return True
            else:
                print(url, "is not allowed")  # notify user if url isnt allowed
                return False
        else:
            return True  # if robot checking is off, return true regardless

    @property  # when url is called, return it
    def url(self):
        if hasattr(self, "link"):  # handles whether self has link attribute
            return self.link
        else:
            return ""  # if not return empty string

    @url.setter  # when url is set, parse it
    def url(self, url):
        self.link = url
        self.urlp = urlparse(url)

    def download(self, url):
        if self.allowed(url):
            self.raw = self.req.get(url).content
            self.soup = BeautifulSoup(self.raw, "html.parser")  # loads html into soup obj

    def grab(self):
        sc = simplecsv("output.csv", mode="w+")  # using w+ mode to remove old output
        sc.writerow(self.config["header"])  # add header from config to csv
        for link in self.config["links"]:  # loop through links
            if self.allowed(link["url"]):  # check if url is allowed
                self.download(link["url"])  # if so download it
                for divs in self.soup.select(link["container"]):
                    row = []
                    for get in link["grab"]:  # grabs each element from inside each div
                        item = divs.select(get["css"])[0]
                        if get["content"]:  # if not empty, get the element from tag
                            row.append(item[get["content"]])
                        else:  # if empty, get the text from tag
                            row.append(item.text)
                    sc.writerow(row)
        sc.close()
・Image crawler: http://qiita.com/komakomako/items/dd380f980e56e70fa321

Targets:
・https://reverb.com/jp/marketplace/electric-guitars
・https://www.yahoo.co.jp
"""

# (1) Decide which URL to crawl
target_url = "https://www.yahoo.co.jp"

# (2) Create the instance used to read robots.txt
robots = RobotsCache(100)

# (3) If robots.txt grants permission to crawl, proceed with the rest of the processing
if robots.allowed(target_url, 'python program'):
    # (4) Create a PhantomJS instance so that content generated by JavaScript can also be crawled
    driver = webdriver.PhantomJS()

    # (5) Pass the target url to the instance's GET-request method to obtain the DOM
    driver.get(target_url)
    # <selenium.webdriver.phantomjs.webdriver.WebDriver (session="b140b9a0-74d3-11e7-b434-8b9f5b309f17")>
    # type(driver)
    # <class 'selenium.webdriver.phantomjs.webdriver.WebDriver'>

    # (6) Encode the DOM obtained above as utf-8 and keep the crawled page's content as bytes
    html = driver.page_source.encode('utf-8')
    # type(html)
    # <class 'bytes'>

    # html = requests.get(target_url)
    # < Response [200]>
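A possible continuation of the example above, shown only as a hedged sketch: it assumes BeautifulSoup (bs4) is available, which the original snippet does not import, and it reuses the html bytes produced in step (6) inside the same allowed-branch.

    # Hedged continuation sketch: bs4 is an assumption, not part of the original example.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "html.parser")  # parse the utf-8 bytes from step (6)
    print(soup.title.string if soup.title else "no <title> found")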
while len(url_frontier) != 0:
    # pop any random url
    url = url_frontier.pop()
    try:
        print("\n---------------------------------------------------------")
        print("Crawling:", url)
        print("---------------------------------------------------------")

        # get crawl delay
        r = robots_cache.fetch(Robots.robots_url(url))[1]

        # check if its allowed to crawl that url? If not, then skip this url
        if not robots_cache.allowed(url, '*'):
            print("This URL is restricted to be crawled.")
            continue

        # insert this link to database
        cur.execute("INSERT OR IGNORE INTO crawled_urls (url_link) values(?)", (url,))

        # if its allowed to crawl, then get the crawling delay
        crawl_delay = r.agent("*").delay
        if crawl_delay is not None:
            time.sleep(crawl_delay)
        else:
            time.sleep(default_crawl_delay)

        #################################################
class Hodor(object):
    def __init__(self, url, config={}, proxies={}, auth=None, ua=DEFAULT_HODOR_UA,
                 pagination_max_limit=DEFAULT_HODOR_MAX_PAGES,
                 crawl_delay=DEFAULT_CRAWL_DELAY, ssl_verify=False,
                 trim_values=True, robots=True, reppy_capacity=100):
        self.content = None
        self.url = url
        self.domain = self._get_domain()
        self.proxies = proxies
        self.auth = auth
        self.ua = ua
        self.trim_values = trim_values
        self.ssl_verify = ssl_verify
        self.config = {}
        self.extra_config = {}
        self.robots = RobotsCache(capacity=reppy_capacity) if robots else None
        self._pages = []
        self._page_count = 0
        self._pagination_max_limit = pagination_max_limit
        self.crawl_delay = self._crawl_delay(crawl_delay)

        for k, v in config.items():
            if k.startswith("_"):
                self.extra_config[k.lstrip("_")] = v
            else:
                self.config[k] = v

    def _get_domain(self):
        parsed_uri = urlparse(self.url)
        return '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

    def _crawl_delay(self, crawl_delay):
        if self.robots not in EMPTY_VALUES:
            expiry, robots = self.robots.fetch('{}robots.txt'.format(self.domain))
            delay = robots.agent(self.ua).delay
            try:
                crawl_delay = max(filter(partial(is_not, None), [delay, crawl_delay]))
            except ConnectionException:
                pass
        return crawl_delay

    def _fetch(self, url):
        '''Does the requests fetching and stores result in self.content'''
        if self.robots in EMPTY_VALUES or self.robots.allowed(url, self.ua):
            session = requests.session()
            headers = {'User-Agent': self.ua}
            if len(self.proxies) > 0:
                session.proxies = self.proxies
            if self.auth:
                r = session.get(url, headers=headers, auth=self.auth, verify=self.ssl_verify)
            else:
                r = session.get(url, headers=headers, verify=self.ssl_verify)
            self.content = r.content
        return self.content

    @staticmethod
    def _get_value(content, rule):
        '''Returns result for a specific xpath'''
        try:
            tree = html.fromstring(content)
        except TypeError:
            tree = None

        post_processing = rule.get('transform', lambda data: data)
        data = ""

        if tree not in EMPTY_VALUES:
            if 'xpath' in rule:
                data = tree.xpath(rule['xpath'])
            elif 'css' in rule:
                data = [node.text_content() for node in tree.cssselect(rule['css'])]

            many = rule.get('many', True)
            if not many:
                if len(data) == 0:
                    data = None
                else:
                    data = post_processing(data[0])
            else:
                data = [post_processing(d) for d in data]
        return data

    @staticmethod
    def _group_data(data, groups, config):
        del_fields = []
        for dest, group_fields in groups.items():
            if '__all__' in group_fields or group_fields == '__all__':
                group_fields = [rule for rule in config.keys() if not rule.startswith('_')]
                del_fields.extend(group_fields)
            gdata = []
            for field in group_fields:
                gdata.append(data[field])
            data[dest] = []
            for gd in zip(*gdata):
                d = {}
                for i, field in enumerate(group_fields):
                    d[field] = gd[i]
                data[dest].append(d)

        if len(del_fields) == 0:
            del_fields = [field for field_set in groups.values() for field in field_set]
        for field in del_fields:
            if field in data:
                del data[field]

    def _package_pages(self):
        self._data = {}
        if len(self._pages) == 1:
            self._data = self._pages[0]
        else:
            self._data = {key: [] for key in self._pages[0].keys()}
            for page in self._pages:
                for k, v in page.items():
                    if hasattr(v, '__iter__'):
                        self._data[k].extend(v)
                    else:
                        self._data[k].append(v)
        return self._data

    @classmethod
    def _parse(cls, content, config={}, extra_config={}, trim_values=True):
        '''Parses the content based on the config set'''
        if len(config) == 0:
            _data = {'content': content}
        else:
            _data = {}
            try:
                str_class = basestring
            except NameError:
                str_class = str
            for key, rule in config.items():
                value = cls._get_value(content, rule)
                if trim_values and value not in EMPTY_VALUES:
                    if 'many' in rule and rule['many']:
                        value = [v.strip() if isinstance(v, str_class) else v for v in value]
                    else:
                        value = value.strip() if isinstance(value, str_class) else value
                _data[key] = value

        paginate_by = extra_config.get('paginate_by')
        if paginate_by:
            paginate_by = cls._get_value(content, paginate_by)

        groups = extra_config.get('groups', {})
        if groups:
            cls._group_data(_data, groups, config)
        return _data, paginate_by

    def _get(self, url):
        self._fetch(url)
        data, paginate_by = self._parse(self.content, self.config, self.extra_config, self.trim_values)
        if paginate_by not in EMPTY_VALUES:
            paginate_by = urljoin(self.domain, paginate_by)
        return data, paginate_by

    def get(self, url=None):
        url = url if url else self.url
        self._data, paginate_by = self._get(url)
        self._pages.append(self._data)
        self._page_count += 1

        if paginate_by and self._page_count < self._pagination_max_limit:
            time.sleep(self.crawl_delay)
            self.get(paginate_by)

        self._package_pages()
        return self._data

    @property
    def data(self):
        if not hasattr(self, '_data'):
            self.get()
        return self._data
class Crawler:
    def __init__(self, db_name):
        """Initialises the crawler with the name of the database"""
        self.con = sqlite.connect(db_name)
        self.stemmer = nltk.stem.porter.PorterStemmer()
        self.headers = {"User-Agent": "Faizan Bhat's Web Crawler"}
        self.robots = RobotsCache()

    def __del__(self):
        self.con.close()

    def db_commit(self):
        self.con.commit()

    def get_entry_id(self, table, field, value, create_new=True):
        """Auxiliary function for getting an entry id and adding it if it is not present"""
        # Construct query
        cur = self.con.execute("select rowid from %s where %s='%s'" % (table, field, value))
        # Fetch
        res = cur.fetchone()
        # If not found
        if res == None:
            cur = self.con.execute("insert into %s (%s) values ('%s')" % (table, field, value))
            return cur.lastrowid
        else:
            return res[0]

    def add_to_index(self, url, soup):
        """Indexes an individual page"""
        if self.is_indexed(url):
            return
        print 'Indexing ' + url
        # Get text from soup
        text = self.get_text_only(soup)
        # Separate words
        words = self.separate_words(text)
        # Stem the list of words
        words = map(self.stem_word, words)
        # Get the url ID
        url_id = self.get_entry_id('urllist', 'url', url)
        # Link each word to this url
        for i in range(len(words)):
            word = words[i]
            if word in ignore_words:
                continue
            word_id = self.get_entry_id('wordlist', 'word', word)
            self.con.execute('insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)' % (url_id, word_id, i))

    def get_text_only(self, soup):
        """Extracts the text from an HTML page (without tags)"""
        v = soup.string
        if v == None:
            c = soup.contents
            result_text = ''
            for t in c:
                sub_text = self.get_text_only(t)
                result_text = result_text + sub_text + '\n'
            return result_text
        else:
            return v.strip()

    def separate_words(self, text):
        """Separates the words by any non-whitespace characters"""
        splitter = re.compile('\\W*')
        return [s.lower() for s in splitter.split(text) if s != '']

    def stem_word(self, word):
        """Uses NLTK porter stemming algorithm to stem a word"""
        return self.stemmer.stem(word)

    def is_indexed(self, url):
        """Return True if url is already indexed"""
        u = self.con.execute(
            "select rowid from urllist where url='%s'" % url).fetchone()
        if u != None:
            # Check if it has been crawled
            v = self.con.execute(
                'select * from wordlocation where urlid=%d' % u[0]).fetchone()
            if v != None:
                return True
        return False

    def add_link_ref(self, url_from, url_to, link_text):
        """Adds a link between two pages"""
        words = self.separate_words(link_text)
        from_id = self.get_entry_id('urllist', 'url', url_from)
        to_id = self.get_entry_id('urllist', 'url', url_to)
        if from_id == to_id:
            return
        cur = self.con.execute("insert into link(fromid,toid) values (%d,%d)" % (from_id, to_id))
        link_id = cur.lastrowid
        for word in words:
            if word in ignore_words:
                continue
            word_id = self.get_entry_id('wordlist', 'word', word)
            self.con.execute("insert into linkwords(linkid,wordid) values (%d,%d)" % (link_id, word_id))

    def crawl(self, pages, depth=2):
        """Does a breadth first search on a given list of pages and indexes as we go"""
        for i in range(depth):
            print "Depth = " + str(i)
            newpages = set()
            for page in pages:
                if not self.robots.allowed(page, "*"):
                    print "%s disallows robots. Moving on." % page
                    continue
                try:
                    req = urllib2.Request(page, None, self.headers)
                    c = urllib2.urlopen(req)
                except:
                    print "Could not open %s" % page
                    continue
                soup = BeautifulSoup(c.read())
                self.add_to_index(page, soup)
                links = soup('a')
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == 'http' and not self.is_indexed(url):
                            newpages.add(url)
                        link_text = self.get_text_only(link)
                        self.add_link_ref(page, url, link_text)
                self.db_commit()
            pages = newpages

    def create_index_tables(self):
        """Creates the database tables"""
        self.con.execute('create table urllist(url)')
        self.con.execute('create table wordlist(word)')
        self.con.execute('create table wordlocation(urlid,wordid,location)')
        self.con.execute('create table link(fromid integer, toid integer)')
        self.con.execute('create table linkwords(wordid, linkid)')
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.db_commit()
class RobotsTxtMiddleware(object):
    DOWNLOAD_PRIORITY = 1000

    def __init__(self, crawler):
        if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
            raise NotConfigured
        self.completeblacklist = crawler.settings.get('ROBOTSTXT_BLACKLIST', ())
        self.blacklist = []
        self.generalblacklist = crawler.settings.get('GENERAL_BLACKLIST', ())
        self.hasblacklist = False
        self.whitelist = crawler.settings.get('ROBOTSTXT_WHITELIST', ())
        self.crawler = crawler
        self._useragent = crawler.settings.get('USER_AGENT')
        self._parsers = {}
        self._spider_netlocs = set()
        self.robots = RobotsCache()
        self.stoprepetitionsrearg = re.compile(ur'.*?\&(.*?\&)\1{1,}.*')
        self.stoprepetitionsreslash = re.compile(ur'.*?\/(.*?\/)\1{1,}.*')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        useragent = self._useragent
        if not self.hasblacklist:
            self.hasblacklist = True
            if ('http://' + spider.domain) in self.completeblacklist and self.completeblacklist['http://' + spider.domain] != None:
                self.blacklist = [el.lower() for el in self.completeblacklist['http://' + spider.domain]]
                log.msg(format="Got blacklist from DB for domain", level=log.DEBUG, request=request)
            else:
                log.msg(format="Didn't get a blacklist from DB for domain", level=log.DEBUG, request=request)
            self.blacklist.extend([el.lower() for el in self.generalblacklist])

        # Check for silly repeating arguments
        if self.stoprepetitionsrearg.match(request.url) != None or self.stoprepetitionsreslash.match(request.url) != None:
            log.msg(format="URL is suspicious: %(request)s", level=log.DEBUG, request=request)
            raise IgnoreRequest

        # Blacklist overrides whitelist and robots
        if any(bl in request.url.lower() for bl in self.blacklist):
            log.msg(format="Forbidden by blacklist: %(request)s", level=log.DEBUG, request=request)
            raise IgnoreRequest

        if not any(wl in request.url for wl in self.whitelist) and self.robots and not self.robots.allowed(request.url, useragent):
            log.msg(format="Forbidden by robots.txt: %(request)s", level=log.DEBUG, request=request)
            raise IgnoreRequest
class TestCache(unittest.TestCase):
    def setUp(self):
        self.robots = RobotsCache()

    def test_404(self):
        '''When we get a 404, assume free range'''
        with asis.Server('tests/asis/test_404', port=8080):
            self.assertEqual(
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot'), True)

    def test_caching(self):
        '''We should be able to cache results'''
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_context_manager(self):
        '''When using as a context manager, it should clear afterwards'''
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
                self.assertNotEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
            # And now, we should have it no longer cached
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_expires(self):
        '''Should be able to recognize expired rules'''
        with asis.Server('tests/asis/test_expires', port=8080):
            old_ttl = self.robots.min_ttl
            self.robots.min_ttl = 0
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo', fetch_if_missing=True), None)
            # If we ignore the TTL, it should still be there.
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo', fetch_if_missing=False,
                                 honor_ttl=False), None)
            # However, if we honor the TTL, it should be missing in the cache.
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo', fetch_if_missing=False), None)
            self.robots.min_ttl = old_ttl

    def test_clear(self):
        '''Should be able to explicitly clear rules'''
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'), None)
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_fetch(self):
        '''Ensure that 'fetch' doesn't cache'''
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertNotEqual(self.robots.fetch('http://localhost:8080/foo'), None)
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_cache(self):
        '''Ensure we can ask it to cache a result'''
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
            self.assertNotEqual(self.robots.cache('http://localhost:8080/foo'), None)
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_add(self):
        '''We should be able to add rules that we get'''
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
            self.robots.add(self.robots.fetch('http://localhost:8080/foo'))
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_server_error(self):
        '''Make sure we can catch server errors'''
        with mock.patch.object(self.robots.session, 'get', side_effect=TypeError):
            self.assertRaises(ServerError, self.robots.allowed,
                              'http://localhost:8080/foo', 'rogerbot')

    def test_disallowed(self):
        '''Check the disallowed interface'''
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(
                self.robots.disallowed('http://localhost:8080/foo', 'rogerbot'))
            urls = ['http://localhost:8080/foo', 'http://localhost:8080/bar']
            self.assertEqual(self.robots.allowed(urls, 'rogerbot'), urls)
            self.assertEqual(self.robots.disallowed(urls, 'rogerbot'), [])

    def test_delay(self):
        '''Check the delay interface'''
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(
                self.robots.delay('http://localhost:8080/foo', 'rogerbot'), 5)

    def test_sitemaps(self):
        '''Check the sitemaps interface'''
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps('http://localhost:8080/foo'), [
                    'http://localhost:8080/a',
                    'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])

    def test_dns_exception(self):
        '''Raises an exception if url does not resolve.'''
        self.assertRaises(ConnectionException, self.robots.allowed,
                          'http://does-not-resolve', 'rogerbot')

    def test_malformed_url(self):
        '''Raises an exception if the url is malformed.'''
        self.assertRaises(MalformedUrl, self.robots.allowed,
                          'hhttp://moz.com', 'rogerbot')

    def test_ssl_exception(self):
        '''Raises an exception if there is an ssl error.'''
        with asis.Server('tests/asis/test_ssl_exception', port=8080):
            self.assertRaises(SSLException, self.robots.allowed,
                              'https://localhost:8080', 'rogerbot')

    def test_excessive_redirects(self):
        '''Raises an exception if there are too many redirects.'''
        with asis.Server('tests/asis/test_excessive_redirects', port=8080):
            self.assertRaises(ExcessiveRedirects, self.robots.allowed,
                              'http://localhost:8080/one', 'rogerbot')

    def test_bad_status_codes(self):
        '''Raises an exception if there is a 5xx status code.'''
        with asis.Server('tests/asis/test_bad_status_codes', port=8080):
            self.assertRaises(BadStatusCode, self.robots.allowed,
                              'http://localhost:8080', 'rogerbot')
class TestCache(unittest.TestCase):
    def setUp(self):
        self.robots = RobotsCache()

    def test_404(self):
        '''When we get a 404, assume free range'''
        with asis.Server('tests/asis/test_404', port=8080):
            self.assertEqual(self.robots.allowed(
                'http://localhost:8080/foo', 'rogerbot'), True)

    def test_caching(self):
        '''We should be able to cache results'''
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_context_manager(self):
        '''When using as a context manager, it should clear afterwards'''
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
                self.assertNotEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
            # And now, we should have it no longer cached
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_expires(self):
        '''Should be able to recognize expired rules'''
        with asis.Server('tests/asis/test_expires', port=8080):
            old_ttl = self.robots.min_ttl
            self.robots.min_ttl = 0
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo', True), None)
            # Now, it shouldn't be cached, so when we find it again, it should
            # be missing (or at least, requiring a refetch)
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo', False), None)
            self.robots.min_ttl = old_ttl

    def test_clear(self):
        '''Should be able to explicitly clear rules'''
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_fetch(self):
        '''Ensure that 'fetch' doesn't cache'''
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertNotEqual(
                self.robots.fetch('http://localhost:8080/foo'), None)
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_cache(self):
        '''Ensure we can ask it to cache a result'''
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.assertNotEqual(
                self.robots.cache('http://localhost:8080/foo'), None)
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_add(self):
        '''We should be able to add rules that we get'''
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.add(self.robots.fetch(
                'http://localhost:8080/foo'))
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_server_error(self):
        '''Make sure we can catch server errors'''
        self.assertRaises(ServerError, self.robots.allowed,
                          'http://localhost:8080/foo', 'rogerbot')

    def test_disallowed(self):
        '''Check the disallowed interface'''
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(self.robots.disallowed(
                'http://localhost:8080/foo', 'rogerbot'))
            urls = [
                'http://localhost:8080/foo',
                'http://localhost:8080/bar'
            ]
            self.assertEqual(self.robots.allowed(urls, 'rogerbot'), urls)
            self.assertEqual(self.robots.disallowed(urls, 'rogerbot'), [])

    def test_delay(self):
        '''Check the delay interface'''
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(self.robots.delay(
                'http://localhost:8080/foo', 'rogerbot'), 5)

    def test_sitemaps(self):
        '''Check the sitemaps interface'''
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps('http://localhost:8080/foo'), [
                    'http://localhost:8080/a',
                    'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])
class Archiver(object):
    ARCHIVE_SUBFORUM_SUBURL_TEMPLATE = 'index.php/f-{forum_code}.html'
    ARCHIVE_SUBFORUM_SUBURL_RE_TEMPLATE = 'index.php/f-{forum_code}[^(.html)]?.html'
    ARCHIVE_THREAD_SUBURL_RE = 'index.php/t-[^(.html)]*.html'
    ARCHIVE_CSS_RE = '[^(.css)]*.css'

    def __init__(self, base_url, forum_codes, archive_location, user_agent, worker_count):
        archiver_logger.info('Archiver initialized.')
        self.base_url = base_url
        self.archive_base_url = urljoin(self.base_url, ScraperConfig.ARCHIVE_SUBURL)
        self.forum_codes = forum_codes
        self.archive_location = archive_location
        self.user_agent = user_agent
        self.robot_parser = RobotsCache()
        self.scraper_timer = None
        self.shutdown_event = threading.Event()
        self.delay_time = 1
        self.workers = []
        self.worker_count = worker_count
        self.pages_need_visiting = Queue()
        self.pages_need_analysis_counter = RachetingCounter()
        self.pages_visited_lock = threading.Lock()
        self.pages_visited = []
        self.page_re_filters = []

    def setup(self):
        archiver_logger.info('Beginning Archiver setup.')
        success = True
        archiver_logger.info('Building page filters.')
        # Build regular expression filters for pages to attempt to crawl.
        archive_base_url = self.archive_base_url
        # Build regular expression for sub-forums we're interested in.
        for forum_code in self.forum_codes:
            regex = urljoin(
                archive_base_url,
                self.ARCHIVE_SUBFORUM_SUBURL_RE_TEMPLATE.format(forum_code=forum_code))
            self.page_re_filters.append(re.compile(regex))
        # Add a regular expression for thread pages.
        thread_regex = urljoin(archive_base_url, self.ARCHIVE_THREAD_SUBURL_RE)
        self.page_re_filters.append(re.compile(thread_regex))
        # Finally add a regular expression to grab the archive CSS.
        css_regex = urljoin(archive_base_url, self.ARCHIVE_CSS_RE)
        self.page_re_filters.append(re.compile(css_regex))

        archiver_logger.info('Adding seed pages.')
        for fc in self.forum_codes:
            subforum_url = urljoin(
                self.archive_base_url,
                self.ARCHIVE_SUBFORUM_SUBURL_TEMPLATE.format(forum_code=fc))
            self.pages_need_visiting.put(subforum_url)
            self.pages_need_analysis_counter.increment()
            archiver_logger.info('Archiver seeded with page {}.'.format(subforum_url))

        archiver_logger.info('Checking archive location...')
        # Setup archive location.
        base_path, new_archive = os.path.split(self.archive_location)
        if not os.path.exists(base_path) or not os.path.isdir(base_path):
            success = False
            archiver_logger.error(
                'Base path {} does not exist or is not a directory! Aborting!')
            return success
        elif (os.path.exists(self.archive_location)
              and (not os.path.isdir(self.archive_location)
                   or os.listdir(self.archive_location))):
            success = False
            archiver_logger.error(
                'Archive location {} is either a not a directory or is not empty! Aborting!'
                ''.format(self.archive_location))
            return success
        elif not os.path.exists(self.archive_location):
            archiver_logger.info(
                'Creating archive directory {}.'.format(self.archive_location))
            try:
                os.mkdir(self.archive_location)
            except OSError:
                success = False
                archiver_logger.exception(
                    'Faulted attempting to create archive directory! Aborting!')
                return success
        else:
            archiver_logger.info(
                'Empty archive directory {} exists. Proceeding...'.format(
                    self.archive_location))

        # Attempt to retrieve robots.txt information about target site.
        if not self.robot_parser.allowed(self.base_url, self.user_agent):
            success = False
            archiver_logger.error(
                'Not allowed to scrape {}! Aborting!'.format(self.base_url))
            return success
        else:
            archiver_logger.info(
                'Successfully polled {} for robots.txt, can scrape.'.format(self.base_url))

        # Get crawl delay and build scraper timer.
        delay_time = self.robot_parser.delay(self.base_url, self.user_agent)
        if delay_time:
            archiver_logger.info('Site crawl-delay: {} seconds.'.format(delay_time))
        else:
            delay_time = ScraperConfig.DEFAULT_CRAWL_DELAY
            archiver_logger.info(
                'No crawl delay for this site. Using default crawl delay of {} seconds.'
                ''.format(delay_time))
        archiver_logger.info('Initializing Scraper timer.')
        self.scraper_timer = ScraperTimer(delay_time)
        self.delay_time = delay_time

        if success:
            archiver_logger.info('Archiver setup success!')
        else:
            archiver_logger.error('Archiver setup failure! Check logs!')

        archiver_logger.info('Building workers...')
        for i in xrange(self.worker_count):
            archiver_logger.info('Adding worker {}.'.format(i + 1))
            worker = ArchiverWorker(
                self.shutdown_event, self.user_agent, self.robot_parser,
                self.scraper_timer, self.pages_need_visiting, self.pages_visited,
                self.pages_visited_lock, self.page_re_filters,
                self.pages_need_analysis_counter, self.archive_location)
            worker.daemon = True
            self.workers.append(worker)
        return success

    def run(self):
        archiver_logger.info('Starting workers...')
        [worker.start() for worker in self.workers]
        while not self.pages_need_analysis_counter.empty():
            time.sleep(0.1)
        archiver_logger.info('Finished archiving all possible pages. Shutting down.')
        archiver_logger.info('Waiting for threads to finish up.')
        self.shutdown_event.set()
        self.scraper_timer.wait()
        return True

    def teardown(self):
        if not self.shutdown_event.is_set():
            self.shutdown_event.set()
        return True
import time
from contextlib import contextmanager

from reppy.cache import RobotsCache
from reppy.parser import Rules

content = '''
User-agent: '*'
Allow: /
'''

cache = RobotsCache()
cache.add(Rules('http://example.com/', 200, content, float('inf')))


@contextmanager
def timer(count):
    '''Time this block.'''
    start = time.time()
    try:
        yield count
    finally:
        duration = time.time() - start
        print('Total: %s' % duration)
        print('  Avg: %s' % (duration / count))
        print(' Rate: %s' % (count / duration))


with timer(100000) as count:
    for _ in range(count):
        cache.allowed('http://example.com/page', 'agent')
class WebConnexion(object):
    """Manage the web connexion with the page to crawl."""
    def __init__(self):
        self.reqrobots = RobotsCache()
        self.parser_encoding = parsers.ExtractEncoding()

    def get_code(self, url):
        """Get source code of given url.

        :param url: url of webpage
        :type url: str
        :return: source code, True if links must not be taken, score and new url (redirection)
        """
        nofollow, url = connexion.is_nofollow(url)
        result = self.send_request(url)
        if not isinstance(result, requests.models.Response):
            return None, result, None, None, url
        else:
            request = result
            del result
        allowed = self.check_robots_perm(url)
        if request.status_code == requests.codes.ok and request.headers.get(
                'Content-Type', '').startswith('text/html') and allowed:
            # Search encoding of webpage:
            request.encoding, score = self.search_encoding(request.headers, request.text)
            new_url, code = self.duplicate_content(request, url)  # new_url is clean and maybe without params
            all_urls = connexion.all_urls(request)  # List of urls to delete
            if new_url in all_urls:  # new_url must not be deleted
                all_urls.remove(new_url)
            return new_url, code, nofollow, score, all_urls
        else:
            tell('Webpage infos: status code=' + str(request.status_code) +
                 ', Content-Type=' + request.headers.get('Content-Type', '') +
                 ', robots perm=' + str(allowed), severity=0)
            # All redirections urls, the first and the last:
            all_urls = connexion.all_urls(request)
            all_urls.append(request.url)
            all_urls.append(url)
            return None, 'ignore', None, None, remove_duplicates(all_urls)

    def send_request(self, url):
        try:
            request = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
        except requests.packages.urllib3.exceptions.ReadTimeoutError:
            tell('Read timeout error (urllib3): ' + url, 3)
            return None
        except requests.exceptions.Timeout:
            tell('Timeout error: ' + url, 4)
            return None
        except requests.exceptions.RequestException as error:
            tell('Connexion failed: {}, {}'.format(str(error), url), 5)
            if connexion.no_connexion():
                return 'no connexion'
            else:
                return None
        else:
            return request

    def search_encoding(self, headers, code):
        """Search encoding of webpage in source code.

        If an encoding is found in source code, score is 1,
        but if not, score is 0 and encoding is utf-8.

        :param headers: headers of requests
        :type headers: dict
        :param code: source code
        :type code: str
        :return: encoding of webpage and its score
        """
        # Search in headers:
        headers = str(headers).lower()
        charset = headers.find('charset')
        end_charset = headers.find('\'', charset)
        if charset != -1 and end_charset != -1:
            return headers[charset + 8:end_charset], 1
        else:
            # Search in source code:
            self.parser_encoding.feed(code)
            if self.parser_encoding.encoding != '':
                return self.parser_encoding.encoding, 1
            else:
                tell('No encoding', 9, severity=0)
                return 'utf-8', 0

    def check_robots_perm(self, url):
        """Check robots.txt for permission.

        :param url: webpage url
        :type url: str
        :return: True if can crawl
        """
        try:
            allowed = self.reqrobots.allowed(url, USER_AGENT)
        except ServerError as error:
            tell('Error robots.txt (reppy): ' + str(error) + ' ' + url, 6)
            allowed = True
        except requests.exceptions.Timeout:
            tell('Error robots.txt (timeout): ' + url)
            allowed = True
        except requests.exceptions.RequestException as error:
            tell('Error robots.txt (requests): ' + str(error) + ' ' + url, 7)
            allowed = True
        except Exception as error:
            tell('Unknown robots.txt error: ' + str(error) + ' ' + url, 8)
            allowed = True
        return allowed

    def duplicate_content(self, request1, url):
        """Avoid param duplicate.

        Compare source codes with params and without.
        Return url without params if it's the same content.

        :param request: request
        :type request: requests.models.Response
        :return: url, source code
        """
        url1 = clean_link(request1.url)
        if url1 is None:
            return url, request1.text
        infos_url = urlparse(url1)
        if infos_url.query != '':
            new_url = infos_url.scheme + '://' + infos_url.netloc + infos_url.path
            request2 = self.send_request(new_url)
            if not isinstance(request2, requests.models.Response):
                return url1, request1.text
            request2.encoding = self.search_encoding(request2.headers, request2.text)[0]
            url2 = clean_link(request2.url)
            if url2 is None:
                return url1, request1.text
            if connexion.duplicate_content(request1.text, request2.text):
                tell("Same content: " + url1 + " and " + url2)  # Tests
                return url2, request2.text
            else:
                return url1, request1.text
        else:
            return url1, request1.text
class WebCrawler():
    """ Web crawler class crawls a specific website """
    def __init__(self, url="file:///Users/tharak/Dropbox/code/Python/webcrawler/mock_website/example.org/index.html",
                 useragent="User Agent", outdir="out", max_depth=1000, debug=0):
        self.url = url
        self.useragent = useragent
        self.siteMap = {self.url: ""}
        self.outdir = outdir.rstrip("/") + "/"
        self.depth = 0
        self.MaxDepth = max_depth
        self.crawled = Set([])
        self.debug = debug
        self.domains = Set([urlparse(self.url).netloc.lower()])
        self.robots = RobotsCache()

    def __crawl_site(self, url_key=""):
        """Recursively crawls the url passed and populates the sitemap datastructure """
        # Do not continue crawling if we are at maximum allowed depth
        if self.depth > self.MaxDepth:
            return
        if url_key == "":
            url = self.url
        else:
            url = url_key
        # Check the site's robot.txt to figure the list of allowed locs
        # Do not check robots.txt if the file is located locally
        if "http" in urlparse(url).scheme:
            if not self.robots.allowed(url, self.useragent):
                if self.debug > 0:
                    print "Page disallowed in robots.txt %s" % (url)
                return
        if self.debug > 0:
            print "Now crawling: %s" % (url)
        url_list = []
        # When we cycle through the siteMap datastructure we convert to a url_list
        # Otherwise, the interpreter complains that dictionary is constantly changing
        for key in self.siteMap:
            url_list.append(key)
        for key in url_list:
            # Fetch the URLs in the webpage and append to siteMap for URLs that have not yet been crawled.
            if self.siteMap[key] == "":
                urls = self.__extract_url(url)
                self.siteMap[key] = urls
                for url_key in urls:
                    # If the URL has already been crawled or has a # tag, dont crawl it.
                    if self.debug > 1:
                        print "url_key: %s, crawled: %s" % (url_key, self.crawled)
                    if url_key in self.crawled:
                        continue
                    if "#" in url_key:
                        continue
                    # We do not want to crawl external domains.
                    parsed = urlparse(url_key)
                    if self.debug > 1:
                        print parsed.netloc
                    # If netloc is empty or is the main domain then the page is part of local domain and needs to be crawled.
                    if parsed.netloc.lower() in self.domains:
                        if self.debug > 1:
                            print "\ndepth=%s,URL=%s\n" % (self.depth, url_key)
                        self.siteMap[url_key] = ""
                        self.crawled.add(url_key)
                        self.depth = self.depth + 1
                        self.__crawl_site(url_key)
                        self.depth = self.depth - 1

    def __print_siteMap(self):
        """Prints the siteMap datastructure in an XML like format """
        # Dump Sitemap to an XML file
        try:
            fd = open(self.outdir + "site.xml", "w")
            try:
                fd.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
                fd.write("<WEBSITE>\n")
                for key in self.siteMap:
                    fd.write("\t<WEBPAGE>\n")
                    fd.write("\t\t<ADDRESS>\"%s\"</ADDRESS>\n" % (key))
                    for loc in self.siteMap[key]:
                        fd.write("\t\t<LINK>\"%s\"</LINK>\n" % (loc))
                    fd.write("\t</WEBPAGE>\n")
                fd.write("</WEBSITE>\n")
            finally:
                fd.close()
        except IOError:
            pass
        # Dump siteMap to a json file
        import json
        with open(self.outdir + 'site.json', 'w') as fp:
            json.dump(self.siteMap, fp, indent=4)

    def get_siteMap(self):
        """Initiates the crawler and populates the siteMap """
        from os import makedirs
        from shutil import rmtree
        rmtree(self.outdir)
        makedirs(self.outdir)
        self.__crawl_site()
        self.__print_siteMap()
        return self.siteMap

    def __extract_url(self, url):
        """Extracts the links in the input URL """
        import urllib2
        from urllister import URLLister
        from sgmllib import SGMLParseError
        req = urllib2.Request(url, headers={'User-Agent': self.useragent})
        try:
            usock = urllib2.urlopen(req)
            parser = URLLister(url)
            try:
                parser.feed(usock.read())
                parser.close()
            except Exception as exception:
                if self.debug > 0:
                    print "sgmllib: Unable to parse web page.\n sgmllib: Raised exception %s" % (type(exception).__name__)
                fd = open(self.outdir + "%s.err" % type(exception).__name__, "a")
                fd.write("%s\n" % (url))
                fd.close()
                pass
            usock.close()
            return parser.urls
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as exception:
            if self.debug > 0:
                print "urllib2: Page does not exist or Malformed web address.\n sgmllib: Raised exception %s" % (type(exception).__name__)
            fd = open(self.outdir + "%s.err" % type(exception).__name__, "a")
            fd.write("%s\n" % (url))
            fd.close()
            return []
class Mole:
    """ fetch web page based on robots.txt """
    def __init__(self):
        self.agent = "jerry's crawler"
        self.robots = RobotsCache()
        self.pool = None
        self.cookieJar = cookielib.CookieJar()
        timeout = 60
        socket.setdefaulttimeout(timeout)

    def fetch(self, uri):
        # timeout in seconds
        if self.robots.allowed(uri, self.agent):
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookieJar))
            req = urllib2.Request(uri)
            req.add_header('User-Agent', self.agent)
            response = opener.open(req)
            if response.code == 200:
                return response.read()
        return None

    def filter_punctuation(self, tokens):
        non_punct = re.compile('.*[A-Za-z0-9].*')
        return [w for w in tokens if non_punct.match(w)]

    def get_sitexml_robots(self, url):
        robot_url = '/'.join([url, 'robots.txt'])
        content = self.fetch(robot_url)
        lines = content.split('\n')
        site = []
        for line in lines:
            line = line.lower()
            index = line.find("sitemap")
            if index < 0:
                continue
            m = re.search('sitemap\s*:\s*(\S+)', line[index:])
            site.append(m.group(1))
        return site

    def is_within_days(self, d, days=1):
        ago = date.today() - timedelta(days)
        return ago <= d

    def read_sitemap_file(self, mapfile):
        content = self.fetch(mapfile)
        if content is None:
            return None
        if mapfile.endswith('.gz'):
            d = zlib.decompressobj(16 + zlib.MAX_WBITS)
            content = d.decompress(content)
        return content

    def create_thread_pool(self, size=10):
        self.pool = WorkerPool(size)

    def page2tokens(self, content):
        return nltk.word_tokenize(nltk.clean_html(content))
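A hedged usage sketch for the Mole class above (Python 2, matching its urllib2/cookielib dependencies); the URL is illustrative only, and whether the target site actually lists sitemaps in its robots.txt is an assumption.

# Illustrative only: example.com and its robots.txt/sitemap layout are assumptions.
mole = Mole()
page = mole.fetch('http://example.com/')
if page is not None:
    print 'fetched %d bytes' % len(page)
    for sitemap in mole.get_sitexml_robots('http://example.com'):
        print 'sitemap:', sitemap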