def is_page_robot_scannable(self):
    """Check robots.txt and report whether this page may be scraped.

    Returns a boolean that tells whether the page is robot scrapeable.
    """
    # Build the robots.txt URL from the scheme and netloc of the parsed URL.
    robots_url = '%s://%s/robots.txt' % (self.urlparse[0], self.urlparse[1])
    parser = RobotFileParser()
    parser.set_url(robots_url)
    parser.read()  # fetches robots.txt over the network
    return parser.can_fetch(settings.SPIDER_USER_AGENT, self.url)
def test_parse(self):
    """Verify that sogou.com's robots.txt denies 'mozilla' access to a sub-path."""
    from robotparser import RobotFileParser
    parser = RobotFileParser()
    parser.set_url("http://www.sogou.com/robots.txt")
    parser.read()
    allowed = parser.can_fetch("mozilla", "http://www.sogou.com/sohu/robots.txt")
    self.assertEqual(allowed, False)
def _get_robot_parser(self): if self.robot_parser_pickle is not None: return pickle.loads(base64.b64decode(self.robot_parser_pickle)) else: parser = RobotFileParser() parser.set_url(self.protocol + "://" + self.domain + "/robots.txt") self.robot_parser = parser return parser
def _get_robot_parser(self): try: return pickle.loads(str(self.robot_parser_pickle)) except (TypeError, IndexError): parser = RobotFileParser() parser.set_url(str(self.protocol) + "://" + str(self.domain) + \ "/robots.txt") self.robot_parser = parser return parser
def _get_robot_parser(self): if self.robot_parser_pickle is not None: return pickle.loads(base64.b64decode(self.robot_parser_pickle)) else: parser = RobotFileParser() parser.set_url(self.protocol + "://" + self.domain + "/robots.txt") self.robot_parser = parser return parser
def checkRobots(URL):
    """Ask the target site's robots.txt whether any agent ('*') may fetch URL."""
    time.sleep(1)  # simple politeness delay between robots.txt fetches
    parts = urlparse(URL)
    robots_url = "%s://%s/robots.txt" % (parts.scheme, parts.netloc)
    parser = RobotFileParser()
    parser.set_url(robots_url)
    parser.read()  # network fetch of robots.txt
    return parser.can_fetch("*", URL)
def checkRobots(URL):
    """Return True when robots.txt on URL's host permits '*' to fetch URL."""
    time.sleep(1)  # throttle robots.txt requests
    scheme, netloc = urlparse(URL)[0:2]
    robotParser = RobotFileParser()
    robotParser.set_url(scheme + "://" + netloc + "/robots.txt")
    robotParser.read()
    allowed = robotParser.can_fetch("*", URL)
    return allowed
def get_robots(url):
    '''
    Initialize robots parser for this domain
    :param url: any URL on the target domain
    :return: a RobotFileParser that has already read the domain's robots.txt
    '''
    parser = RobotFileParser()
    parser.set_url(urlparse.urljoin(url, '/robots.txt'))
    parser.read()  # network fetch of robots.txt
    return parser
def can_fetch(self, url):
    """Return True when ``self.agent`` may fetch ``url`` per that host's robots.txt.

    One RobotFileParser is cached per host in ``self.rules`` so each
    robots.txt is downloaded only once.

    :param url: absolute URL to test.
    :return: bool result of RobotFileParser.can_fetch.
    """
    host, path = urlparse.urlparse(url)[1:3]
    # BUGFIX: dict.has_key() was removed in Python 3 -- the `in` operator
    # is the portable membership test (works on Python 2 as well).
    if host in self.rules:
        return self.rules[host].can_fetch(self.agent, url)
    rp = RobotFileParser()
    robot_url = "http://" + host + "/robots.txt"
    rp.set_url(robot_url)
    rp.read()  # fetch robots.txt once, then cache the parser
    self.rules[host] = rp
    return rp.can_fetch(self.agent, url)
def can_fetch(self, url):
    """Check ``url`` against the robots.txt rules of its host.

    Caches one RobotFileParser per host in ``self.rules`` so robots.txt is
    only downloaded the first time a host is seen.

    :param url: absolute URL to test.
    :return: bool result of RobotFileParser.can_fetch.
    """
    host, path = urlparse.urlparse(url)[1:3]
    # BUGFIX: has_key() no longer exists on Python 3 dicts; use `in`.
    if host in self.rules:
        return self.rules[host].can_fetch(self.agent, url)
    else:
        rp = RobotFileParser()
        robot_url = "http://" + host + "/robots.txt"
        rp.set_url(robot_url)
        rp.read()  # network fetch; parser is cached for later calls
        self.rules[host] = rp
        return rp.can_fetch(self.agent, url)
class HolidayScrapper:
    # Scrapes per-country public-holiday tables from timeanddate.com,
    # honouring the site's robots.txt via RobotFileParser.
    # NOTE(review): Python 2 syntax (print statements); depends on module-level
    # `init_url`, `headers`, `pd`, `requests` and `BeautifulSoup`.

    def __init__(self):
        # Refuse to run at all if robots.txt forbids the start page.
        self.rp = RobotFileParser()
        self.rp.set_url('https://www.timeanddate.com/robots.txt')
        self.rp.read()
        if not self.rp.can_fetch('WasThereAHoliday', init_url):
            raise RuntimeError('Scrapping forbidden due to robots.txt file')
        self.countries = self.get_countries(self.get_page(init_url))
        try:
            # removing entries which are not countries
            self.countries.remove('un')
        except ValueError:
            pass
        try:
            # removing entries which are not countries
            self.countries.remove('world')
        except ValueError:
            pass

    def get_data(self):
        # Fetch the 2016 holiday table for every country and concatenate the
        # per-country DataFrames into one result.
        all_data = pd.DataFrame()
        for cntr in self.countries:
            print 'Fetching data for ' + cntr
            try:
                # Re-check robots.txt for each country page before fetching.
                url = 'https://www.timeanddate.com/holidays/' + cntr + '/2016#!hol=8389401'
                if not self.rp.can_fetch('WasThereAHoliday', url):
                    raise RuntimeError(
                        'Scrapping forbidden due to robots.txt file')
                soup = self.get_page('https://www.timeanddate.com/holidays/' + cntr + '/2016#!hol=8389401')
                html_table = soup.find('table')
                df_table = pd.read_html(str(html_table))[0]
                df_table['country'] = cntr  # tag rows with their country code
                all_data = all_data.append(df_table)
            except ValueError:
                # pd.read_html raises ValueError when no table is found.
                print 'Problem occured when fetching data for ' + cntr
                pass
        return all_data

    @staticmethod
    def get_page(url):
        # Download `url` (with the module-level `headers`) and parse it.
        page = requests.get(url, headers=headers)
        soup = BeautifulSoup(page.text, 'lxml')
        return soup

    @staticmethod
    def get_countries(soup):
        # Read the country codes out of the <select id="co"> drop-down.
        countries = []
        select_list = soup.find(id="co")
        for cntr in select_list.children:
            countries.append(cntr['value'])
        return countries
def robots_check(url):
    """Return whether any crawler ('*') may fetch ``url`` per the site's robots.txt."""
    # creating url for robots.txt from the registered domain of `url`
    robots_url = "http://www." + tld.get_tld(url) + "/robots.txt"
    # checking url validity against the fetched rules
    parser = RobotFileParser()
    parser.set_url(robots_url)
    parser.read()
    return parser.can_fetch("*", url)
class Host(object):
    '''
    Represents one host. Responsible for parsing and analyzing
    ``robots.txt``.

    :param hostname: the name of the host extracted from an URL.
    '''

    def __init__(self, hostname):
        self.hostname = hostname
        self.rp = RobotFileParser()
        self.rp.set_url('http://%s/robots.txt' % self.hostname)

    def url_allowed(self, url):
        '''
        Checks if the given url is allowed to crawl.

        :param url: URL to check.
        '''
        # BUGFIX: the original never called read(), so the parser had no
        # rules loaded and can_fetch() always returned False.  Fetch and
        # parse robots.txt lazily, the first time a check is made (keeps
        # the constructor free of network I/O).
        if self.rp.mtime() == 0:
            self.rp.read()
        return self.rp.can_fetch(USER_AGENT, url)
def can_read(url):
    """Return True when the '*' agent may fetch ``url`` per its domain's robots.txt.

    Parsers are cached in the global ``Permissions`` map, one per domain.
    Any failure to download or evaluate robots.txt yields False.
    """
    domain = domain_name(url)
    if domain not in Permissions:
        parser = RobotFileParser()
        parser.set_url(urljoin('http://' + domain, 'robots.txt'))
        try:
            parser.read()
        except:
            # robots.txt could not be fetched -- treat the page as unreadable
            return False
        Permissions[domain] = parser
    try:
        return Permissions[domain].can_fetch("*", url)
    except:
        return False
def _get_soup(path):
    """Gets soup from the given path, respecting robots.txt"""
    full_path = BASE_URL + path

    # Identify ourselves politely with a versioned user-agent.
    user_agent = 'dcnotify/%s' % __version__
    http_headers = {'User-Agent': '%s' % user_agent}

    # Consult robots.txt before touching the page itself.
    robots = RobotFileParser()
    robots.set_url("%s/robots.txt" % BASE_URL)
    robots.read()
    if not robots.can_fetch(user_agent, full_path):
        raise ValueError("Path disallowed by robots.txt")

    # Fetch the page, surfacing any HTTP error as an exception.
    request = get(full_path, headers=http_headers)
    request.raise_for_status()
    return bs(request.text)
def urlopen(self, host):
    # Fetch host's page only when permitted by the host's robots.txt,
    # caching one RobotFileParser per robots.txt URL in self.robotdict.
    # NOTE(review): Python 2 print statements; `host` is a project object
    # exposing get_robots_url()/get_url().
    robo_url = host.get_robots_url()
    print self.robotdict

    cached_parser = self.robotdict.get(robo_url)
    if cached_parser:
        logging.info("Found in Cache: " + robo_url)
    else:
        logging.info("Fetching: " + robo_url)
        cached_parser = RobotFileParser()
        # NOTE(review): the parser is cached *before* read() -- if read()
        # raises, an empty parser stays cached for this robots.txt URL.
        self.robotdict.put(robo_url, cached_parser)
        cached_parser.set_url(robo_url)
        cached_parser.read()

    # '*' = any user agent
    if cached_parser.can_fetch('*', host.get_url()):
        print 'Going to fetch:', host.get_url()
        return self.fetch_file(host.get_url())
    else:
        logging.info("Forbidden by Robots.txt")
        return None
def _get_soup(path):
    """Gets soup from the given path, respecting robots.txt"""
    full_path = BASE_URL + path

    # Set a user-agent
    user_agent = 'dcnotify/%s' % __version__
    http_headers = {'User-Agent': '%s' % user_agent}

    # Honor robots.txt
    checker = RobotFileParser()
    checker.set_url("%s/robots.txt" % BASE_URL)
    checker.read()
    if not checker.can_fetch(user_agent, full_path):
        raise ValueError("Path disallowed by robots.txt")

    # Make a request, raising any HTTP errors that might occur
    response = get(full_path, headers=http_headers)
    response.raise_for_status()
    return bs(response.text)
class spider(object):
    """Single-page crawler that collects the page's headings and the
    outgoing anchors that robots.txt allows a generic agent ('*') to follow.
    """

    def __init__(self, link):
        """:param link: absolute URL of the page to crawl."""
        self.CurLink = link
        # BUGFIX: ``linknText`` and ``headings`` used to be *class*
        # attributes, so every spider instance shared (and appended to) the
        # same two lists.  They are now per-instance state.
        self.linknText = []
        self.headings = []
        self.r = RobotFileParser()

    def crawl(self):
        """Download CurLink, collect h1-h6 text and allowed outgoing links."""
        # NOTE(review): set_url() is given the page URL, not the site's
        # /robots.txt -- can_fetch() below is evaluated against whatever
        # that URL serves.  Kept as-is; confirm intent before changing.
        self.r.set_url(urlparse.unquote(self.CurLink))
        self.r.read()

        self.html = urlopen(self.CurLink).read()
        self.bs = BeautifulSoup(self.html, "lxml")

        # Gather the text of every heading level on the page.
        for level in ("h1", "h2", "h3", "h4", "h5", "h6"):
            for tag in self.bs.findAll(level, text=True):
                self.headings.append(tag.text)

        for link in self.bs.findAll('a', href=True):
            aLink = urlparse.urljoin(self.CurLink, link['href'])
            # Keep only links robots.txt permits any agent to fetch.
            if self.r.can_fetch("*", aLink):
                self.linknText.append({
                    "URL": aLink,
                    "AnchorText": link.string
                })
class Crawler():
    # Sitemap-generating crawler: breadth-first walk of a single domain,
    # emitting <url><loc/><lastmod/></url> entries and (optionally)
    # honouring robots.txt.  Depends on a module-level `config` object.

    # Variables (class-level defaults; instance values set in __init__)
    parserobots = False
    output = None
    report = False
    config = None
    domain = ""
    exclude = []
    skipext = []
    drop = []
    debug = False
    tocrawl = set([])     # frontier of URLs still to visit
    crawled = set([])     # URLs already visited
    excluded = set([])    # URLs rejected by robots.txt / extension / word
    marked = {}           # HTTP status -> [urls], for reporting

    # TODO also search for window.location={.*?}
    linkregex = re.compile(b'<a href=[\'|"](.*?)[\'"].*?>')

    rp = None
    response_code = {}    # HTTP status -> count
    nb_url = 1            # Number of url.
    nb_rp = 0             # Number of url blocked by the robots.txt
    nb_exclude = 0        # Number of url excluded by extension or word

    output_file = None
    target_domain = ""

    # NOTE(review): mutable default arguments (exclude=[], skipext=[],
    # drop=[]) are shared across calls -- safe only because they are never
    # mutated here; consider None sentinels.
    def __init__(self, parserobots=False, output=None, report=False, domain="", exclude=[], skipext=[], drop=[], debug=False):
        self.parserobots = parserobots
        self.output = output
        self.report = report
        self.domain = domain
        self.exclude = exclude
        self.skipext = skipext
        self.drop = drop
        self.debug = debug
        if self.debug:
            logging.basicConfig(level=logging.DEBUG)
        self.tocrawl = set([domain])
        try:
            # netloc of the start domain; links outside it are skipped.
            self.target_domain = urlparse.urlparse(domain)[1]
        except:
            raise ValueError("Invalid domain")
        if self.output:
            try:
                self.output_file = open(self.output, 'w')
            except:
                logging.debug("Output file not available.")
                exit(255)

    def run(self):
        # Drive the BFS until the frontier is empty, wrapping the output in
        # the sitemap XML header/footer.
        # NOTE(review): `file if file else self.output_file` relies on the
        # `file` builtin (removed in Python 3, always truthy in Python 2) --
        # presumably `file=self.output_file` was intended; confirm.
        print (config.xml_header, file if file else self.output_file)
        logging.debug("Start the crawling process")
        while len(self.tocrawl) != 0:
            self.__crawling()
        logging.debug("Crawling as reach the end of all found link")
        print (config.xml_footer, file if file else self.output_file)

    def __crawling(self):
        # Visit one URL from the frontier: fetch it, emit its sitemap entry,
        # extract and filter its outgoing links.
        crawling = self.tocrawl.pop()
        url = urlparse.urlparse(crawling)
        self.crawled.add(crawling)
        request = Request(crawling, headers={"User-Agent": config.crawler_user_agent})
        try:
            response = urlopen(request)
        except Exception as e:
            # Count the HTTP error code and optionally remember the URL.
            if hasattr(e, 'code'):
                if e.code in self.response_code:
                    self.response_code[e.code] += 1
                else:
                    self.response_code[e.code] = 1
                # Track marked urls for the report
                if self.report:
                    if e.code in self.marked:
                        self.marked[e.code].append(crawling)
                    else:
                        self.marked[e.code] = [crawling]
            logging.debug("{1} ==> {0}".format(e, crawling))
            return self.__continue_crawling()

        # Read the response
        try:
            msg = response.read()
            if response.getcode() in self.response_code:
                self.response_code[response.getcode()] += 1
            else:
                self.response_code[response.getcode()] = 1
            response.close()

            # Get the last modify date (fall back to the Date header)
            if 'last-modified' in response.headers:
                date = response.headers['Last-Modified']
            else:
                date = response.headers['Date']
            date = datetime.strptime(date, '%a, %d %b %Y %H:%M:%S %Z')
        except Exception as e:
            logging.debug("{1} ===> {0}".format(e, crawling))
            return None

        # Emit the sitemap entry for this page.
        # NOTE(review): same suspicious `file if file else ...` as in run().
        print ("<url><loc>"+url.geturl()+"</loc><lastmod>"+date.strftime('%Y-%m-%dT%H:%M:%S+00:00')+"</lastmod></url>", file if file else self.output_file)
        if self.output_file:
            self.output_file.flush()

        # Found links
        links = self.linkregex.findall(msg)
        for link in links:
            link = link.decode("utf-8")
            #logging.debug("Found : {0}".format(link))
            # Normalise relative / anchor / schemeless links to absolute.
            if link.startswith('/'):
                link = 'http://' + url[1] + link
            elif link.startswith('#'):
                link = 'http://' + url[1] + url[2] + link
            elif not link.startswith('http'):
                link = 'http://' + url[1] + '/' + link

            # Remove the anchor part if needed
            if "#" in link:
                link = link[:link.index('#')]

            # Drop attributes if needed
            for toDrop in self.drop:
                link = re.sub(toDrop, '', link)

            # Parse the url to get domain and file extension
            parsed_link = urlparse.urlparse(link)
            domain_link = parsed_link.netloc
            target_extension = os.path.splitext(parsed_link.path)[1][1:]

            if (link in self.crawled):
                continue
            if (link in self.tocrawl):
                continue
            if (link in self.excluded):
                continue
            if (domain_link != self.target_domain):
                continue
            if ("javascript" in link):
                continue

            # Count one more URL
            self.nb_url += 1

            # Check if the navigation is allowed by the robots.txt
            if (not self.can_fetch(link)):
                self.exclude_link(link)
                self.nb_rp += 1
                continue

            # Check if the current file extension is allowed or not.
            if (target_extension in self.skipext):
                self.exclude_link(link)
                self.nb_exclude += 1
                continue

            # Check if the current url doesn't contain an excluded word
            if (not self.exclude_url(link)):
                self.exclude_link(link)
                self.nb_exclude += 1
                continue

            self.tocrawl.add(link)
        return None

    def __continue_crawling(self):
        # Recursive continuation after a fetch error.
        if self.tocrawl:
            self.__crawling()

    def exclude_link(self, link):
        # Remember a rejected link (robots.txt / extension / word filter).
        if link not in self.excluded:
            self.excluded.add(link)

    def checkRobots(self):
        # Load <domain>/robots.txt into self.rp for later can_fetch checks.
        if self.domain[len(self.domain)-1] != "/":
            self.domain += "/"
        # NOTE(review): `request` is built but never used -- read() performs
        # its own fetch without the custom User-Agent.
        request = Request(self.domain+"robots.txt", headers={"User-Agent": config.crawler_user_agent})
        self.rp = RobotFileParser()
        self.rp.set_url(self.domain+"robots.txt")
        self.rp.read()

    def can_fetch(self, link):
        # True when robots.txt allows `link` (or robots parsing is disabled
        # or fails -- errors deliberately allow the crawl to continue).
        try:
            if self.parserobots:
                if self.rp.can_fetch("*", link):
                    return True
                else:
                    logging.debug("Crawling of {0} disabled by robots.txt".format(link))
                    return False
            if not self.parserobots:
                return True
            return True
        except:
            # On error continue!
            logging.debug("Error during parsing robots.txt")
            return True

    def exclude_url(self, link):
        # False when `link` contains any excluded word.
        for ex in self.exclude:
            if ex in link:
                return False
        return True

    def make_report(self):
        # Print crawl statistics to stdout.
        print ("Number of found URL : {0}".format(self.nb_url))
        print ("Number of link crawled : {0}".format(len(self.crawled)))
        if self.parserobots:
            print ("Number of link block by robots.txt : {0}".format(self.nb_rp))
        if self.skipext or self.exclude:
            print ("Number of link exclude : {0}".format(self.nb_exclude))
        for code in self.response_code:
            print ("Nb Code HTTP {0} : {1}".format(code, self.response_code[code]))
        for code in self.marked:
            print ("Link with status {0}:".format(code))
            for uri in self.marked[code]:
                print ("\t- {0}".format(uri))
class WebPage(object):
    # One downloadable web page: fetches it (honouring robots.txt), strips
    # non-content tags, detects language, and exposes link/stem extraction.
    # Depends on module-level `urlparse`, `requests`, `BeautifulSoup`,
    # `snowballstemmer`, `detect` (langdetect) and `LANGUAGES`.

    def __init__(self, url):
        self.page_url = url                       # original URL as given
        self.parsed_url = urlparse.urlparse(url)  # split form used for joins
        self.lang = ""                            # detected language code
        self.isDownload = False                   # True after download_page() succeeds
        self.title = ""
        self.text = ""
        self.soup = None
        self.robot = RobotFileParser()

    def __normalize_link__(self, link):
        # Convert an href into an absolute http(s) URL, or None for
        # unusable targets (irc:, fragments, javascript:).
        if not link:
            return None
        if link.startswith('//'):
            # protocol-relative: inherit this page's scheme
            return self.parsed_url.scheme + ':' + link
        elif link.startswith('/'):
            return self.parsed_url.scheme + '://' + self.parsed_url.hostname + link
        elif link.startswith('http://') or link.startswith('https://'):
            return link
        elif link.startswith("irc://"):
            return None
        elif link.startswith('#') or link.startswith('javascript:'):
            return None
        else:
            return urlparse.urljoin(self.page_url, link)

    def __delete_unnecessary_tags(self):
        # Record the <title> text, then drop tags that carry no visible text.
        if self.soup is None:
            return
        if self.soup.title is None:
            self.title = ""
        else:
            self.title = self.soup.title.string
        for tag in self.soup(
                ['style', 'script', '[document]', 'head', 'title']):
            tag.decompose()

    def __get_stems(self, text):
        # Count stem frequencies of `text` using the Snowball stemmer for
        # the detected language.  Raises NotImplementedError for languages
        # missing from LANGUAGES.
        if self.lang in LANGUAGES:
            stemer = snowballstemmer.stemmer(LANGUAGES[self.lang])
        else:
            raise NotImplementedError("That lang not implemented")
        stems_dict = dict()
        # Replace common punctuation with spaces before splitting.
        # NOTE(review): "/n" looks like a typo for "\n" -- confirm.
        for char in [",", ". ", "!", "?", " - ", "/n"]:
            text = text.replace(char, " ")
        for word in text.split():
            stem_word = stemer.stemWord(word.lower())
            if stem_word in stems_dict:
                stems_dict[stem_word] += 1
            else:
                stems_dict[stem_word] = 1
        return stems_dict

    def download_page(self):
        # Fetch the page when robots.txt allows '*'; on success, parse it,
        # strip boilerplate, extract text and detect language.
        # Returns True on success, False on any failure.
        try:
            self.robot.set_url("{0}://{1}/robots.txt".format(
                self.parsed_url.scheme, self.parsed_url.hostname))
            self.robot.read()
            if self.robot.can_fetch("*", self.page_url):
                # NOTE(review): verify=False disables TLS certificate checks.
                response = requests.get(self.page_url, verify=False)
            else:
                return False
        except requests.exceptions.InvalidSchema:
            return False
        except KeyError:
            return False
        except Exception:
            return False
        if response.status_code == 200:
            self.soup = BeautifulSoup(response.text, "html.parser")
            self.__delete_unnecessary_tags()
            self.text = "".join(self.soup.strings)
            try:
                self.lang = detect(self.text)
            except Exception:
                self.lang = "en"  # fall back to English when detection fails
            self.isDownload = True
            return True
        else:
            return False

    def get_links(self):
        # Lazily yield all normalized outgoing links of the downloaded page.
        if not self.isDownload:
            raise Exception("You should download page")

        def get_links_generator():
            for link in self.soup.find_all("a"):
                normalized_link = self.__normalize_link__(link.get("href"))
                if normalized_link is None:
                    continue
                else:
                    yield normalized_link
        return get_links_generator()

    def get_text_stems(self):
        # Stem-frequency dict for the page body text.
        if not self.isDownload:
            raise Exception("You should download page")
        return self.__get_stems(self.text)

    def get_title_stems(self):
        # Stem-frequency dict for the page title.
        if not self.isDownload:
            raise Exception("You should download page")
        return self.__get_stems(self.title)

    def get_domain(self):
        # Hostname component of the page URL.
        return self.parsed_url.hostname
class SiteMap():
    # BFS crawler for a single domain that records last-modified dates and
    # static assets per page, then renders a sitemap.xml.
    # NOTE(review): the exact whitespace of the multi-line string literals
    # below was reconstructed from a collapsed source -- verify against the
    # original file.

    def __init__(self, main_page=None, robotrules=True):
        """
        Constuctor method that initializes the members that are used during
        crawling process
        :param main_page: The root page that needs to be crawled for
            generation of sitemap
        """
        logging.info("Consider Robot.txt ? ==> "+str(robotrules))
        self.robotrules = robotrules
        self.site_map = {}    # map that records the visits of urls, datemodified and assets
        self.network = {}     # map that maintains the network/graph of webpages visited
                              # The intention of this map is for visual rendering using d3.js
        self.unvisited = set([])   # a set to keep the list of urls yet to be visited
        self.start_page = None     # the root page, this is used to avoid cycle and keeping crawl
                                   # process limited to single domain.
        self.robot_txt_rules = None

        if main_page:
            self.unvisited.add(main_page)
            try:
                self.start_page = urlparse(main_page).netloc
            except:
                logging.error("Improper URL, Please provide a Valid Url:"+main_page)
                exit(0)

        # NOTE(review): compares against the *string* "True", not the bool.
        if self.robotrules == "True":
            try:
                logging.info("robot.txt respected")
                self.robot_txt_rules = RobotFileParser()
                self.robot_txt_rules.set_url(main_page + "/robots.txt")
                self.robot_txt_rules.read()
            except:
                logging.error("Unable to read the robot.txt file")
                self.robotrules = False  # error reading robot.txt, ignore it forever

    @timeit
    def generate(self, site_map=None):
        """
        This method holds the invoking control of the crawler method and
        drives the crawling process.  Basically a BFS style method that
        keeps popping the elements from the queue [self.unvisited set] and
        scraping the urls.  Once the crawling process is done, this creates
        sitemap using the self.site_map dictionary with just url,
        date-modified tags with dummy frequency and priorities.
        :param site_map: name of the site_map file so as to create xml entries.
        :return:
        """
        while self.unvisited:
            self.crawl()
        # create xml from the site_map dictionary
        header = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
    xmlns:xhtml="http://www.w3.org/1999/xhtml"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
"""
        footer = """\n</urlset>\n"""
        entry = "\t<url>\n\
\t\t<loc>%s</loc>\n\
\t\t<lastmod>%s</lastmod>\n\
\t\t<changefreq>monthly</changefreq>\n\
\t\t<priority> 1 </priority>\n\
\t</url>\
"

        xml = header
        for url in self.site_map.keys():
            xml += entry % (url, self.site_map[url]['date']) + "\n"

        xml += footer
        if site_map != None:
            self.write_to_file(site_map, xml)
        else:
            self.write_to_file("sitemap.xml", xml)
        return xml

    def write_to_file(self, file_name, content):
        """
        A utility method to just write the contents of the file into a given
        file name.  Alert: This overwrites if the file does exist in the
        current directory.
        :param file_name: name of the file, sitemap in our case.
        :param content: contents of the file
        :return: None
        """
        f = open(file_name, 'w')
        f.write(content)
        f.close()

    def compose_url_from_href(self, url, href):
        """
        There are different ways a href could specify a location and it
        varies in different ways based on how the page is designed.  This
        method takes few styles into consideration and ignores some, cleans
        and creates a valid url link so as to keep it ready for the crawl
        method.
        :param url: basae url of the current page
        :param href: one of the hyper links of the page
        :return: a well formed and valid http link
        """
        if href.startswith('/'):
            return "http://%s%s"%(url.netloc, href)
        elif href.startswith('#'):
            return "http://%s%s%s"%(url.netloc, url.path, href)
        elif href.startswith('./'):
            return "http://%s%s"%(url.netloc, href[1:])
        elif not href.startswith('http'):
            return "http://" + url.netloc + '/' + href
        return href

    def get_out_going_edges(self, url, html_body):
        """
        This method encompasses the BFS along with the coupling with crawl
        and generator as it changes the state of the unvisited map.
        Basically this method extracts the links that belong to the same
        domain as the start page, cleans them with compose_url_from_href
        method and updates the map.  This also avoids unnecessary traps like
        href links pointing to 'javascript', 'mailto' etc.
        :param url: current page url
        :param html_body: current page's html content
        :return: returns all the valid and wellformed out going links from
            this page
        """
        soup = BeautifulSoup(html_body, "html.parser")
        valid_links_for_this_page = []
        for a in soup.find_all('a', href=True):
            href = a['href']
            href = self.compose_url_from_href(url, href.decode("utf-8"))

            # clean the href so that it will have legitimate urls instead of
            # cluttered ones and q=param prints
            href = urldefrag(href)[0]  # skip intra links [this took time to find out !] ##1
            # remove query params as only the path matters
            if href.find('?') != -1:
                href = href[:href.find('?')]  ##2

            new_page = urlparse(href)

            # add to the queue only it it doesn't cause a cycle
            # assumption: if a link ends with domain.com, assuming it can be
            # crawled to make sitemap complete
            if not str(new_page.netloc).endswith(self.start_page):  # doesn't belong to domain
                continue

            if self.robot_allows(href) and \
                not href in self.site_map.keys() and \
                not href in self.unvisited and \
                not 'javascript:' in href and \
                not 'mailto:' in href:
                self.unvisited.add(href)
                valid_links_for_this_page.append(href)

        return valid_links_for_this_page

    def record_visit(self, url, headers, html_body):
        """
        Any time a specific url of a site is changed, its last-modified date
        and time are kept in the page headers.  This info helps bots and
        crawlers to not to crawl the page if it has not been updated since
        last crawl.  This method is used to preserve the url crawled and its
        last-modified time along with assets scraped into the container
        dictionary for later usage to generate sitemap and visualization
        network.
        :param url: url of the just finished crawling page
        :param headers: header information of the crawled page
        :param html_body: html content of the page
        :return: None
        """
        if 'last-modified' in headers:
            date = headers['Last-Modified']
        else:
            date = headers['Date']

        self.site_map[url] = {
            'date': date,
            'assets': self.get_static_assets(html_body)
        }

    def get_static_assets(self, html_body):
        """
        A html page could contain other links such as .css, .img. .mp4 and
        .js.  All these files are not dynamic though they could produce
        dynamic results.  The code or text that exists in these files is
        constant and static.  These files are referred as static assets and
        for the definition of this challenge, I have chosen to keep all the
        info in a single dictionary and extract them at the end for reports,
        results and stats.
        :param html_body: html content of the page.
        :return: returns a dictionary that encompasses .css, .img, ijs files
            as lists.
        """
        # add static assets of the page .css, .js and image urls may be ?
        soup = BeautifulSoup(html_body, "html.parser")
        img = soup.findAll("img")
        css = soup.findAll("link", {"rel": "stylesheet"})

        # js is tricky: I faced an issue with inline javascript and ignoring
        # it for the time being.  an extract like html_body with just needed
        # parts is a must for excluding inline scripts and styles.
        jss = []
        for x in soup.findAll('script'):
            try:
                # NOTE(review): `list.append(x['src'])` calls the unbound
                # builtin -- raises TypeError (not caught below); presumably
                # `jss.append(x['src'])` was intended.  Left unchanged here.
                list.append(x['src'])
            except KeyError:
                pass

        csss = []
        imgs = []
        # NOTE(review): `jss` is reset here, so the loop below iterates an
        # empty list and 'js' is always [].
        jss = []
        for link in css:
            csss.append(link['href'])

        for link in img:
            imgs.append(link['src'])

        for link in jss:
            jss.append(link['src'])

        return {
            'css': csss,
            'img': imgs,
            'js': jss
        }

    def crawl(self):
        """
        The main driver method that crawls the pages.  This main does below
        steps:
        for every unvisited [vertex|page] that belongs to the requested
        domain:
            crawl the page
            record valid links and their last-modified-dates
        :return: None
        """
        page = self.unvisited.pop()
        # if robot.txt is defined, use Disallow to avoid pages.
        # domain.robot.txt doesn't exist so the crawler must find all the
        # pages for report.
        logging.info("Starting to Crawl Page: " + page)

        url = urlparse(page)
        try:
            response = urlopen(page)
        except:
            logging.debug("Issue with the url: " + page)
            return None
        try:
            html_body = response.read()  # response.getcode()
            response.close()
            # record visit ans assets
            self.record_visit(page, response.headers, html_body)
            logging.debug("Queued Pages: {0}, Crawled Pages: {1}".format(len(self.unvisited), len(self.site_map)))
        except:
            logging.debug("Issue while opening url: " + page)
            return None
        connects = self.get_out_going_edges(url, html_body)

        # simple Graph that keeps the order of the pages crawled.
        for i, url in enumerate(connects):
            self.network[page] = {
                'to': connects,
                'assets': {
                    'css': self.site_map[page]['assets']['css'],
                    'js': self.site_map[page]['assets']['js'],
                    'img': self.site_map[page]['assets']['img']
                }
            }
        return None

    def get_site_map(self):
        """
        Returns the compiled sitemap structure
        :return: sitemap data structure
        """
        return self.site_map

    def get_network_graph(self):
        """
        Returns the compiled network in the order of the crawled pages
        :return: network graph
        """
        return self.network

    def get_network_json_format(self):
        """
        Returns the crawl traverse order sequence in json format
        :return: network in json format
        """
        return json.dumps(self.network)

    def set_start_page(self, url):
        """
        This could be useful if one is testing
        :param url: start page to start the crawling.
        :return:
        """
        self.start_page = url

    def robot_allows(self, link):
        # True when robots.txt permits '*' to fetch `link`; any error while
        # evaluating the rules deliberately allows the crawl to proceed.
        if not self.robotrules:
            return True
        try:
            if self.robot_txt_rules.can_fetch("*", link):
                return True
            return False
        except:
            return True
class SimpleCrawler:
    # Single-host HTTP crawler (Python 2: `print >>`, urllib2, httplib) that
    # keeps one persistent HTTPConnection, optional cookies, and honours the
    # start host's robots.txt.
    # NOTE(review): this snippet is TRUNCATED -- get1() below is cut off
    # mid-method (dangling `else:`); the remainder is not in this source.

    USER_AGENT = 'SimpleCrawler/0.1'
    HEADERS = {
        'User-Agent': USER_AGENT,
        'Accept-Encoding': 'gzip',
        'Connection': 'keep-alive'
    }
    CONTENT_TYPE_PAT = re.compile(r'([^\s;]+)(.*charset=([^\s;]+))?', re.I)

    def __init__(self, starturl, index_html='', maxlevel=1,
                 cookie_file=None, acldb=None, urldb=None, default_charset=None,
                 delay=0, timeout=300, debug=0):
        # Only plain http starting URLs are supported.
        (proto, self.hostport, _x, _y, _z) = urlsplit(starturl)
        assert proto == 'http'
        #Thread.__init__(self)
        self.debug = debug
        self.index_html = index_html
        if cookie_file:
            self.cookiejar = MozillaCookieJar(cookie_file)
            self.cookiejar.load()
        else:
            self.cookiejar = None
        # Fetch robots.txt once for the whole crawl.
        self.robotstxt = RobotFileParser()
        self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
        self.robotstxt.read()
        self.conn = None           # lazily-opened persistent connection
        self.urldb = urldb
        self.acldb = acldb
        self.curlevel = 0          # remaining crawl depth for current URL
        self.delay = delay
        self.timeout = timeout
        self.default_charset = default_charset
        if starturl.endswith('/'):
            starturl += self.index_html
        self.urls = [(starturl, maxlevel)]
        self.crawled = {}  # 1:injected, 2:crawled
        return

    def accept_url(self, url):
        # Normalise directory URLs and apply the optional ACL database.
        if url.endswith('/'):
            url += self.index_html
        if self.acldb and not self.acldb.allowed(url):
            return None
        return url

    def inject_url(self, url):
        # Queue `url` for crawling unless depth is exhausted, it was seen
        # already, or robots.txt forbids it.
        if (not self.curlevel) or (not url) or (url in self.crawled):
            return False
        if not self.robotstxt.can_fetch(self.USER_AGENT, url):
            if self.debug:
                print >>stderr, 'DISALLOW: %r' % url
            return None
        if self.debug:
            print >>stderr, 'INJECT: %r' % url
        self.crawled[url] = 1
        self.urls.append((url, self.curlevel-1))
        return True

    def get1(self, url, maxretry=3, maxredirect=3):
        # Fetch one URL over the persistent connection, retrying on dropped
        # connections and following redirects (up to the given limits).
        if self.debug:
            print >>stderr, 'GET: %r' % url
        # loop
        for rtry in range(maxredirect):
            # forge urllib2.Request object.
            req = Request(url)
            # add cookie headers if necessary.
            if self.cookiejar:
                self.cookiejar.add_cookie_header(req)
                headers = req.unredirected_hdrs
                headers.update(self.HEADERS)
            else:
                headers = self.HEADERS
            # get response.
            for ctry in range(maxretry):
                try:
                    if not self.conn:
                        print >>stderr, 'Making connection: %r...' % (self.hostport,)
                        self.conn = HTTPConnection(self.hostport)
                    self.conn.request('GET', req.get_selector().replace(' ',''), '', headers)
                    self.conn.sock.settimeout(self.timeout)
                    resp = self.conn.getresponse()
                    break
                except BadStatusLine, x:
                    # connection closed unexpectedly
                    print >>stderr, 'Connection closed unexpectedly.'
                    # it restarts the connection...
                    self.conn.close()
                    self.conn = None
                except socket.error, x:
                    # connection closed unexpectedly
                    print >>stderr, 'Socket error:', x
                    self.conn.close()
                    self.conn = None
            else:
class Webpage(object):
    """
    Objects that refer to individual webpages. If the url is scrapeable the
    object will be filled with that data, indexed, and inserted into a
    database to be searched.
    """
    number_of_scraped_pages = 0  # class-wide counter of successful fetches

    def __init__(self, url):
        """
        Creates a webpage object and assigns it the provided url.
        """
        self.url = url
        # Pages already scraped or black-listed don't need another visit.
        if self.url not in black_list and self.url not in scraped_urls:
            self.needs_to_be_scraped = True
        else:
            self.needs_to_be_scraped = False

    def page_robot_scannable(self):
        """
        Checks whether the page is allowed to be crawled
        """
        # BUGFIX: the original read/wrote ``self.need_to_be_scraped``
        # (missing "s"), an attribute __init__ never set -- every call
        # raised AttributeError.  ``needs_to_be_scraped`` is used
        # consistently now.
        if self.needs_to_be_scraped is True:
            # REFACTOR to remove try statement.
            try:
                self.urlparse = urlparse.urlparse(self.url)
                self.robotcheck = RobotFileParser()
                self.robotcheck.set_url('http://'+self.urlparse[1]+'/robots.txt') # Only works with http right now.
                self.robotcheck.read()
                self.needs_to_be_scraped = self.robotcheck.can_fetch(settings.SPIDER_USER_AGENT, self.url)
            except:
                self.needs_to_be_scraped = False

    def get_page(self):
        """
        The url is requested with a GET request. The page html is scraped
        directly, while elements of it are scraped in parse_page
        """
        self.headers = {'User-agent': settings.SPIDER_USER_AGENT}
        #REFACTOR to remove try
        try:
            # BUGFIX: the original passed the undefined local ``headers``.
            self.request = requests.get(self.url, headers=self.headers)
            self.pagehtml = BeautifulSoup(self.request.text) #REFACTOR, don't use BeautifulSoup
            self.count = self.instanceID.next()
            Webpage.number_of_scraped_pages += 1
        except:
            raise Exception

    def get_visible_elements(self, element):
        """
        Checks that the element is not contained in <style>, <script>,
        <head>, <title> or [document]. It also cannot be commented out.
        """
        if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
            return False
        elif re.match('<!--.*-->', str(element)):
            return False
        return True

    def parse_page(self):
        """
        This method parses the HTML page and extracts the title of the page,
        the outgoing links, the number of outgoing links, and the text.
        """
        self.title = self.pagehtml.find('title').text
        # BUGFIX: ``text=true`` was a NameError; the element list and the
        # accumulated text were also mixed up between ``page_text`` and
        # ``pagetext``.  Visible text is accumulated into ``page_text``.
        self.page_text = ''
        for item in self.pagehtml.findAll(text=True):
            if self.get_visible_elements(item) and item != '\n':
                self.page_text += item
        self.pagelinks = {}
        # BUGFIX: the original iterated an undefined ``soup``.
        for link in self.pagehtml.findAll('a'):
            self.pagelinks[link.get('href')] = 1
        # TODO: determine if link is relative or absolute; if relative,
        # change it to absolute.

    def inverted_index_page_text(self):
        """
        Iterates through the words in the page text and creates and adds
        them to an index.
        """
        self.pagetextlist = self.page_text.split(' ') #Noted error: This catches punctuation along with words.
        for index, word in enumerate(self.pagetextlist):
            if word not in STOP_WORDS:
                if not inverted_index.get(word):
                    inverted_index[word] = {'url': self.url, 'offsets': [index]}
                else:
                    inverted_index[word]['offsets'].append(index)

    def set_page_scraped(self):
        """
        Once the page is scraped it is flagged as such
        """
        self.needs_to_be_scraped = False
class SiteMap():
    """
    Breadth-first crawler limited to a single domain. Records every
    reachable page with its last-modified date and static assets, then
    emits a sitemap.xml and a crawl-order network graph (for d3.js).
    """

    def __init__(self, main_page=None, robotrules=True):
        """
        Constructor that initializes the members used during crawling.

        :param main_page: the root page that needs to be crawled for
            generation of the sitemap
        :param robotrules: whether robots.txt should be respected; accepts
            the boolean True or the string "True"
        """
        logging.info("Consider Robot.txt ? ==> " + str(robotrules))
        self.robotrules = robotrules
        self.site_map = {}        # url -> {'date': ..., 'assets': ...}
        self.network = {}         # page -> outgoing links + assets, for d3.js rendering
        self.unvisited = set([])  # urls discovered but not yet crawled
        self.start_page = None    # root domain: keeps the crawl on one domain, avoids cycles
        self.robot_txt_rules = None
        if main_page:
            self.unvisited.add(main_page)
            try:
                self.start_page = urlparse(main_page).netloc
            except:
                logging.error("Improper URL, Please provide a Valid Url:" + main_page)
                exit(0)
        # BUG FIX: the original compared only against the string "True",
        # so the documented boolean default silently disabled robots.txt.
        if self.robotrules is True or self.robotrules == "True":
            try:
                logging.info("robot.txt respected")
                self.robot_txt_rules = RobotFileParser()
                self.robot_txt_rules.set_url(main_page + "/robots.txt")
                self.robot_txt_rules.read()
            except:
                logging.error("Unable to read the robot.txt file")
                self.robotrules = False  # error reading robot.txt, ignore it forever

    @timeit
    def generate(self, site_map=None):
        """
        Drives the crawling process (BFS style: keeps popping urls from the
        self.unvisited set and scraping them), then writes a sitemap xml
        with url and date-modified tags plus dummy frequency/priority.

        :param site_map: name of the sitemap file for the xml entries
        :return: the generated xml as a string
        """
        while self.unvisited:
            self.crawl()
        # create xml from the site_map dictionary
        header = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
    xmlns:xhtml="http://www.w3.org/1999/xhtml"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
    http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
"""
        footer = """\n</urlset>\n"""
        entry = ("\t<url>\n"
                 "\t\t<loc>%s</loc>\n"
                 "\t\t<lastmod>%s</lastmod>\n"
                 "\t\t<changefreq>monthly</changefreq>\n"
                 "\t\t<priority> 1 </priority>\n"
                 "\t</url>")
        xml = header
        for url in self.site_map.keys():
            xml += entry % (url, self.site_map[url]['date']) + "\n"
        xml += footer
        if site_map is not None:
            self.write_to_file(site_map, xml)
        else:
            self.write_to_file("sitemap.xml", xml)
        return xml

    def write_to_file(self, file_name, content):
        """
        Writes the contents into the given file name.
        Alert: this overwrites the file if it exists in the current directory.

        :param file_name: name of the file, sitemap in our case
        :param content: contents of the file
        :return: None
        """
        f = open(file_name, 'w')
        try:
            f.write(content)
        finally:
            # BUG FIX: close even if the write raises.
            f.close()

    def compose_url_from_href(self, url, href):
        """
        An href can specify a location in several styles; this cleans the
        common ones into a valid absolute http link for the crawler.

        :param url: base url (parsed) of the current page
        :param href: one of the hyper links of the page
        :return: a well formed and valid http link
        """
        if href.startswith('/'):
            return "http://%s%s" % (url.netloc, href)
        elif href.startswith('#'):
            return "http://%s%s%s" % (url.netloc, url.path, href)
        elif href.startswith('./'):
            return "http://%s%s" % (url.netloc, href[1:])
        elif not href.startswith('http'):
            return "http://" + url.netloc + '/' + href
        return href

    def get_out_going_edges(self, url, html_body):
        """
        Extracts the links that belong to the same domain as the start
        page, cleans them with compose_url_from_href, and queues the ones
        not yet seen. Skips traps such as 'javascript:' and 'mailto:'.

        :param url: current page url (parsed)
        :param html_body: current page's html content
        :return: all valid, well-formed outgoing links from this page
        """
        soup = BeautifulSoup(html_body, "html.parser")
        valid_links_for_this_page = []
        for a in soup.find_all('a', href=True):
            href = a['href']
            # clean the href so that it will have legitimate urls instead of
            # cluttered ones and q=param prints
            href = self.compose_url_from_href(url, href.decode("utf-8"))
            href = urldefrag(href)[0]  # skip intra links [this took time to find out!]
            # remove query params as only the path matters
            if href.find('?') != -1:
                href = href[:href.find('?')]
            new_page = urlparse(href)
            # add to the queue only if it doesn't cause a cycle
            # assumption: if a link ends with domain.com, assume it can be
            # crawled to make the sitemap complete
            if not str(new_page.netloc).endswith(self.start_page):
                continue  # doesn't belong to domain
            if self.robot_allows(href) and \
                    href not in self.site_map and \
                    href not in self.unvisited and \
                    'javascript:' not in href and \
                    'mailto:' not in href:
                self.unvisited.add(href)
                valid_links_for_this_page.append(href)
        return valid_links_for_this_page

    def record_visit(self, url, headers, html_body):
        """
        Preserves the crawled url with its last-modified time (falling back
        to the response Date header) along with the scraped assets, for
        later sitemap generation and visualization.

        :param url: url of the just finished crawling page
        :param headers: header information of the crawled page
        :param html_body: html content of the page
        :return: None
        """
        if 'last-modified' in headers:
            date = headers['Last-Modified']
        else:
            date = headers['Date']
        self.site_map[url] = {
            'date': date,
            'assets': self.get_static_assets(html_body)
        }

    def get_static_assets(self, html_body):
        """
        Collects the static assets (.css, .js and image urls) referenced by
        the page.

        :param html_body: html content of the page
        :return: dict with 'css', 'img' and 'js' lists
        """
        soup = BeautifulSoup(html_body, "html.parser")
        # BUG FIX: the original appended script srcs to the *builtin* name
        # `list`, then reset `jss` to [] and iterated that empty list, so
        # javascript assets were never collected.
        jss = []
        # inline javascript has no 'src'; skip it.
        for script in soup.findAll('script'):
            if script.has_attr('src'):
                jss.append(script['src'])
        csss = [link['href'] for link in soup.findAll("link", {"rel": "stylesheet"})
                if link.has_attr('href')]
        imgs = [img['src'] for img in soup.findAll("img") if img.has_attr('src')]
        return {'css': csss, 'img': imgs, 'js': jss}

    def crawl(self):
        """
        The main driver method. For every unvisited page belonging to the
        requested domain: crawl the page, record valid links and their
        last-modified dates.

        :return: None
        """
        page = self.unvisited.pop()
        logging.info("Starting to Crawl Page: " + page)
        url = urlparse(page)
        try:
            response = urlopen(page)
        except:
            logging.debug("Issue with the url: " + page)
            return None
        try:
            html_body = response.read()
            response.close()
            # record visit and assets
            self.record_visit(page, response.headers, html_body)
            logging.debug("Queued Pages: {0}, Crawled Pages: {1}".format(
                len(self.unvisited), len(self.site_map)))
        except:
            logging.debug("Issue while opening url: " + page)
            return None
        connects = self.get_out_going_edges(url, html_body)
        # simple graph that keeps the order of the pages crawled.
        # BUG FIX: the original assigned the identical dict once per
        # outgoing link; a single assignment is equivalent.
        if connects:
            self.network[page] = {
                'to': connects,
                'assets': {
                    'css': self.site_map[page]['assets']['css'],
                    'js': self.site_map[page]['assets']['js'],
                    'img': self.site_map[page]['assets']['img']
                }
            }
        return None

    def get_site_map(self):
        """
        Returns the compiled sitemap structure.

        :return: sitemap data structure
        """
        return self.site_map

    def get_network_graph(self):
        """
        Returns the compiled network in the order of the crawled pages.

        :return: network graph
        """
        return self.network

    def get_network_json_format(self):
        """
        Returns the crawl traverse order sequence in json format.

        :return: network in json format
        """
        return json.dumps(self.network)

    def set_start_page(self, url):
        """
        Overrides the start page; useful for testing.

        :param url: start page to start the crawling
        :return: None
        """
        self.start_page = url

    def robot_allows(self, link):
        """
        Returns True when robots.txt permits crawling the link (or when
        robots handling is disabled / the rules could not be consulted).
        """
        if not self.robotrules:
            return True
        try:
            if self.robot_txt_rules.can_fetch("*", link):
                return True
            return False
        except:
            return True
class MarioDepth:
    # Depth-limited concurrent crawler. Seeds a work queue with the start
    # url, then repeatedly fans batches of fetch jobs out to a coroutine
    # pool via MarioBatch, injecting newly discovered links (depth - 1)
    # until the queue drains. Relies on the external Mario/eventlet stack
    # (MarioBatch, coros, URL, LinkTitleDB) defined elsewhere.
    def __init__(self, starturl, callback, callpre=None, callfail=None, concount=MAXCONCOUNT, depth=2, accept_url_patterns=None, reject_url_patterns=None):
        # :param starturl: root url; also used as the initial referer.
        # :param callback: called with each successful response.
        # :param callpre/callfail: pre-fetch and failure callbacks passed
        #     through to MarioBatch.
        # :param concount: concurrency level (defaults to MAXCONCOUNT).
        # :param depth: maximum crawl depth from the start url.
        # :param accept_url_patterns/reject_url_patterns: regex lists used
        #     by reject_url to filter candidate links.
        self.concount = concount
        self.callback = callback
        self.callpre = callpre
        self.callfail = callfail
        self.depth = depth
        self.starturl = starturl
        self.baseurl = URL.baseurl(starturl)
        self.urls = []          # queue of (url, remaining_depth) pairs
        self.crawled = {}       # url -> 1 (injected) or 2 (crawled)
        self.link_title_db = LinkTitleDB()
        self.accept_url_patterns = accept_url_patterns
        self.reject_url_patterns = reject_url_patterns
        self.robotstxt = RobotFileParser()
        self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
        self.referer = starturl
        # Best-effort: a missing/unreadable robots.txt must not stop the crawl.
        try:
            self.robotstxt.read()
        except:
            logger.debug(Traceback())
        #self.lightcloud = LightCloud.connect('n0')

    def __call__(self, n=None):
        # Runs the crawl; n optionally overrides the concurrency level.
        if n: self.concount = n
        current_depth = self.depth
        self.urls.append((self.starturl, current_depth))
        while self.urls:
            self.depth_get()
            logger.debug('%d unprocessed urls'%(len(self.urls)))

    def depth_get(self):
        # Drains the queue in batches of at most 10 jobs, scheduling
        # add_job on a coroutine pool, then runs the batch with the
        # configured concurrency.
        mario = MarioBatch(callback=self.next_depth, callpre=self.callpre, callfail=self.callfail)
        pool = coros.CoroutinePool(max_size=len(self.urls))
        while self.urls:
            waiters = []
            #self.add_job(mario)
            counter = 0
            while self.urls:
                # cap each batch at 10 jobs
                if counter > 9: break;
                counter += 1
                waiters.append(pool.execute(self.add_job, mario))
            logger.debug('Depth break')
            for waiter in waiters:
                waiter.wait()
            mario(self.concount)

    def add_job(self, mario):
        # Pops one queued url and hands it to the batch unless it was
        # already visited; the remaining depth rides along as job args.
        if not self.urls:
            return
        url, depth = self.urls.pop()
        if self.visited(url, depth):
            return
        mario.add_job(url, args=depth)

    def visited(self, url, depth):
        # True when the url should be skipped.
        # NOTE(review): `is_duplicate` is never assigned in this method —
        # the commented-out LightCloud line above was presumably meant to
        # define it, so this raises NameError whenever depth == 0; verify
        # against the Mario library before relying on it.
        #is_duplicate = URL.is_duplicate(url, self.lightcloud)
        return depth==0 and is_duplicate or depth < self.depth and self.crawled.has_key(url) and self.crawled[url] == 2

    def next_depth(self, response):
        # Batch callback: records link titles, injects the page's outgoing
        # links at depth-1, invokes the user callback, and marks both the
        # effective and original urls as crawled (they differ on redirect).
        #with_timeout(1, self.lightcloud.set, LightCloud.crawled_url_key(response.effective_url), response.url, timeout_value=None)
        for link, title in URL.link_title(response.body, response.effective_url):
            if not self.inject_url(link, response.args):continue
            self.link_title_db.add(link, response.effective_url, title)
        if callable(self.callback):
            self.callback(response)
        self.crawled[response.effective_url] = 2
        if response.effective_url != response.url:
            self.crawled[response.url] = 2
        self.referer = response.effective_url

    def inject_url(self, url, depth):
        # Queues a candidate url for the next depth level. Returns True on
        # injection, None when filtered (exhausted depth, duplicate,
        # rejected pattern, or disallowed by robots.txt).
        if not (depth and url and url not in self.crawled):
            #logger.debug('IGNORE(%d): %r'%(depth, url))
            return None
        if isinstance(url, unicode):
            url = url.encode('utf-8')
        if self.reject_url(url):
            logger.debug('REJECT: %r' % url)
            return None
        # Treat robots.txt lookup failures as "allowed" (best effort).
        try:
            can_fetch = self.robotstxt.can_fetch(USER_AGENT['safari'], url)
        except:
            can_fetch = True
        # NOTE(review): hard-coded exemption — hi.baidu.com bypasses the
        # robots.txt check entirely.
        if self.baseurl!='http://hi.baidu.com/' and not can_fetch:
            logger.debug('DISALLOW: %r' % url)
            return None
        logger.debug('INJECT(%d): %r' % (depth-1, url))
        self.crawled[url] = 1
        self.urls.append((url, depth-1))
        return True

    def reject_url(self, url):
        # Rejects off-site urls unless they pass the accept patterns and
        # survive the reject patterns.
        return self.baseurl != URL.baseurl(url) and (not self.accept_url_patterns or not re.match('|'.join(self.accept_url_patterns), url) or self.reject_url_patterns or re.match('|'.join(self.reject_url_patterns), url))
class spider(object):
    """
    Single-page crawler: extracts the visible text, the outgoing link set
    (preferring the site's sitemap when one exists) and page metadata,
    honouring the site's robots.txt for discovered links.
    """

    def __init__(self, link):
        """
        :param link: absolute url of the page to crawl
        """
        # BUG FIX: these were mutable *class* attributes in the original,
        # so every spider instance shared (and appended to) the same
        # lists/dict. They are now per-instance state.
        self.CurLink = link
        self.linkURI = []
        self.texts = []
        self.Meta = {}
        self.r = RobotFileParser()

    def crawl(self):
        """
        Fetches the page, collects its visible text chunks into
        self.texts, fills self.linkURI from the sitemap (or the page's
        anchors, filtered by robots.txt) and self.Meta from page metadata.
        """
        split = urlparse.urlsplit(self.CurLink)
        site_root = split.scheme + "://" + split.netloc
        # BUG FIX: the original pointed the parser at the page url itself
        # (via unquote), so no robots.txt rules were ever loaded and
        # can_fetch always answered from an empty rule set.
        self.r.set_url(site_root + "/robots.txt")
        self.r.read()

        self.html = urlopen(self.CurLink).read()
        self.bs = BeautifulSoup(self.html, "lxml")
        # Strip non-visible content before extracting text.
        for fragment in self.bs(["script", "style"]):
            fragment.extract()
        text = self.bs.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        self.texts.extend(chunk for chunk in chunks if chunk)

        # Prefer a published sitemap over scraping anchors. Each candidate
        # is fetched once (the original issued every request twice, and its
        # first branch checked sitemap.aspx but then parsed sitemap.xml).
        sitemap_resp = None
        for sitemap_path in ("/sitemap.aspx", "/sitemap.xml"):
            resp = requests.get(site_root + sitemap_path)
            if resp.ok:
                sitemap_resp = resp
                break
        if sitemap_resp is not None:
            root = etree.fromstring(sitemap_resp.content)
            for sitemap in root:
                children = sitemap.getchildren()
                self.linkURI.append(children[0].text)
        else:
            # No sitemap: fall back to the page's anchors, keeping only
            # those robots.txt allows.
            for link in self.bs.findAll('a', href=True):
                aLink = urlparse.urljoin(self.CurLink, link['href'])
                if self.r.can_fetch("*", aLink):
                    self.linkURI.append(aLink)

        page = metadata_parser.MetadataParser(url=self.CurLink)
        meta = page.metadata
        keyw = "null"
        descr = "null"
        if meta.get('meta').get('Keywords'):
            keyw = meta['meta']['Keywords'].split(', ')
        if meta.get('meta').get('Description'):
            descr = meta['meta']['Description']
        self.Meta = {
            'title': meta['page']['title'],
            'url': meta['_internal']['url_actual'],
            'description': descr,
            'keyword': keyw
        }
class SimpleCrawler:
    # Minimal single-host crawler over a persistent HTTPConnection, with
    # optional cookie jar, ACL/url databases and robots.txt filtering.
    # NOTE(review): this class is truncated in this excerpt — get1()
    # continues beyond the visible text.
    USER_AGENT = 'SimpleCrawler/0.1'
    HEADERS = {
        'User-Agent': USER_AGENT,
        'Accept-Encoding': 'gzip',
        'Connection': 'keep-alive'
    }
    # Parses a Content-Type header into mime type and optional charset.
    CONTENT_TYPE_PAT = re.compile(r'([^\s;]+)(.*charset=([^\s;]+))?', re.I)

    def __init__(self, starturl, index_html='', maxlevel=1, cookie_file=None, acldb=None, urldb=None, default_charset=None, delay=0, timeout=300, debug=0):
        # :param starturl: seed url; its host:port is reused for every request.
        # :param index_html: filename appended to urls ending in '/'.
        # :param maxlevel: crawl depth for the seed url.
        # :param cookie_file: optional Mozilla-format cookie jar to load.
        # :param acldb: optional ACL database consulted by accept_url.
        # :param urldb: optional url database (stored, not used in this excerpt).
        (proto, self.hostport, _x, _y, _z) = urlsplit(starturl)
        # assert proto == 'http'
        #Thread.__init__(self)
        self.debug = debug
        self.index_html = index_html
        if cookie_file:
            self.cookiejar = MozillaCookieJar(cookie_file)
            self.cookiejar.load()
        else:
            self.cookiejar = None
        self.robotstxt = RobotFileParser()
        self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
        # self.robotstxt.read()
        self.conn = None          # lazily-opened persistent HTTP connection
        self.urldb = urldb
        self.acldb = acldb
        self.curlevel = 0
        self.delay = delay
        self.timeout = timeout
        self.default_charset = default_charset
        if starturl.endswith('/'):
            starturl += self.index_html
        self.urls = [(starturl, maxlevel)]  # queue of (url, remaining_level)
        self.crawled = {}                   # 1:injected, 2:crawled
        return

    def accept_url(self, url):
        # Normalizes a directory url and applies the ACL database;
        # returns the url to fetch, or None when disallowed.
        if url.endswith('/'):
            url += self.index_html
        if self.acldb and not self.acldb.allowed(url):
            return None
        return url

    def inject_url(self, url):
        # Queues a url at the next level down. Returns False when the
        # level is exhausted / url empty / already seen, None when
        # robots.txt disallows it, True when queued.
        if (not self.curlevel) or (not url) or (url in self.crawled):
            return False
        if not self.robotstxt.can_fetch(self.USER_AGENT, url):
            if self.debug:
                print >> stderr, 'DISALLOW: %r' % url
            return None
        if self.debug:
            print >> stderr, 'INJECT: %r' % url
        self.crawled[url] = 1
        self.urls.append((url, self.curlevel - 1))
        return True

    def get1(self, url, maxretry=5, maxredirect=5):
        # Fetches one url over the shared connection, retrying on dropped
        # connections and following redirects up to maxredirect times.
        if self.debug:
            print >> stderr, 'GET: %r' % url
        # loop
        for rtry in range(maxredirect):
            # forge urllib2.Request object.
            req = Request(url)
            # add cookie headers if necessary.
            if self.cookiejar:
                self.cookiejar.add_cookie_header(req)
                headers = req.unredirected_hdrs
                headers.update(self.HEADERS)
            else:
                headers = self.HEADERS
            # get response.
            for ctry in range(maxretry):
                try:
                    if not self.conn:
                        print >> stderr, 'Making connection: %r...' % (self.hostport,)
                        self.conn = HTTPConnection(self.hostport)
                    self.conn.request('GET', req.get_selector().replace(' ', ''), '', headers)
                    # self.conn.sock.settimeout(self.timeout)
                    resp = self.conn.getresponse()
                    break
                except BadStatusLine, x:
                    # connection closed unexpectedly
                    print >> stderr, 'Connection closed unexpectedly.'
                    # it restarts the connection...
                    self.conn.close()
                    self.conn = None
                except socket.error, x:
                    # connection closed unexpectedly
                    print >> stderr, 'Socket error:', x
                    self.conn.close()
                    self.conn = None
            else:
                # (method continues beyond this excerpt)
def test_parse(self):
    """The sogou.com robots rules must deny 'mozilla' access to /sohu/robots.txt."""
    from robotparser import RobotFileParser
    parser = RobotFileParser()
    parser.set_url("http://www.sogou.com/robots.txt")
    parser.read()
    allowed = parser.can_fetch("mozilla", "http://www.sogou.com/sohu/robots.txt")
    self.assertEqual(allowed, False)