def crawl_sitemaps(url, max_depth=1):
    robots_url = parse.urljoin(url, "/robots.txt")
    robots = Robots.fetch(robots_url)
    sitemap_stack = []
    seen_sitemaps = set()
    for sitemap_url in robots.sitemaps:
        sitemap_stack.append((sitemap_url, 0))
    all_urls = []
    while sitemap_stack:
        sitemap_url, depth = sitemap_stack.pop()
        if depth >= max_depth:
            continue
        if sitemap_url in seen_sitemaps:
            continue
        seen_sitemaps.add(sitemap_url)
        sitemap = Sitemap(sitemap_url)
        all_urls.extend(sitemap.get_urls())
        for child_url in sitemap.get_child_sitemap_urls():
            sitemap_stack.append((child_url, depth + 1))
    return all_urls
def allows_crawl(url):
    try:
        url = str(url)
        parsed = urlparse(url)
        base_url = parsed.scheme + "://" + parsed.netloc
        # Set up a parser backed by the site's robots.txt file.
        if base_url in robot_files:  # robots.txt for this host was already downloaded
            rparser = robot_files[base_url]
        else:
            try:
                robot_url = base_url + "/robots.txt"
                rparser = Robots.fetch(robot_url)  # fetch and parse the robots.txt file
                robot_files[base_url] = rparser
            except IOError:
                print("IOError while fetching robots.txt")
                return False
            except Exception:
                print("Unexpected error while fetching robots.txt")
                return False
        try:
            # reppy's allowed() takes the URL first and the user agent second
            return rparser.allowed(url, "*")
        except KeyError:
            return False
    except IOError:
        return False
def __init__(self, url, limit_pages_counter=1, storage=False, meta={}, debug=False,
             user_agent='Spidar/1.1', allow_external_link_crawling=False,
             selenium_chrome_driver=None):
    self.__start_url = url
    parse_location = urlparse(url)
    self.__initial_domain_name = parse_location.netloc
    self.__pages = []
    self.__url_to_discover = set()
    self.__url_discovered = set()
    self.__max_counter_pages = limit_pages_counter
    self.__storage = storage
    self.__PATH_STORAGE = '__storage/'
    self.__PATH_SOURCE = self.__PATH_STORAGE + 'sources/'
    self.__PATH_INFO = self.__PATH_STORAGE + 'infos/'
    self.__meta = meta
    self.__debug = debug
    self.__user_agent = user_agent
    self.__allow_external_link_crawling = allow_external_link_crawling
    self.__selenium_chrome_driver = selenium_chrome_driver
    self.__selenium_driver = None
    if self.__selenium_chrome_driver is not None:
        self.__selenium_driver = webdriver.Chrome(
            executable_path=self.__selenium_chrome_driver)
    self.__rp = Robots.fetch(Robots.robots_url(self.__start_url))
    if self.__storage:
        self.__set_up_folders(self.__initial_domain_name)
def scrape_page(self, url):
    options = webdriver.ChromeOptions()
    options.add_argument("headless")
    options.add_experimental_option(
        "prefs", {"profile.default_content_settings.cookies": 2})  # disable cookies
    driver = webdriver.Chrome(options=options)
    root_url = '{}://{}'.format(urlparse(url).scheme, urlparse(url).netloc)  # canonical root
    if root_url in self.robots:
        robots = self.robots.get(root_url)
    else:
        robots = Robots.fetch(root_url + '/robots.txt')
        self.robots.update({root_url: robots})
    cdelay = robots.agent('*').delay
    if cdelay is None:
        crawl_delay = 6
    else:
        crawl_delay = int(cdelay)
    driver.implicitly_wait(crawl_delay)
    try:
        driver.get(url)
        res = {'url': url, 'driver': driver, 'robots': robots}
        return res  # result passed to the callback function
    except Exception:
        print('PROBLEM: ', url)
        return
def fetch_url_content(self, url, scheme='http'):
    client = MongoClient('172.25.0.70', 27017)
    db = client.crawled_urls
    collection = db.results
    stripped_url = url.strip()
    full_url = scheme + '://' + stripped_url
    try:
        robots = Robots.fetch(full_url + '/robots.txt')
    except RequestException:
        return 'FAILURE: ' + stripped_url + ' : Unable to fetch robots.txt'
    # allowed() expects the full URL, not just the bare host
    if not robots.allowed(full_url, 'just-some-user-agent'):
        return 'FAILURE: ' + stripped_url + ' : Robots say no'
    try:
        r = requests.get(full_url)
        collection.insert({
            'url': url,
            'scheme': scheme,
            'status': r.status_code,
            'create_time': time.time(),
            'result': r.text
        })
    except Exception as exc:
        raise self.retry(exc=exc)
    client.close()
    return 'SUCCESS: ' + stripped_url
def __init__(self, url, links=False, content=False, depth=5, exclusion_pattern=None,
             *args, **kwargs):
    domain = urlparse(url).netloc
    # Setup the rules for link extraction
    if exclusion_pattern:
        self._rules = [
            Rule(LinkExtractor(allow='.*' + domain + '/.*', deny=exclusion_pattern),
                 callback=self.parse_url, follow=True)
        ]
    else:
        self._rules = [
            Rule(LinkExtractor(allow='.*' + domain + '/.*'),
                 callback=self.parse_url, follow=True)
        ]
    self.allowed_domains = [domain]
    self.start_urls = [url]
    self.links = links        # Should we store links?
    self.content = content    # Should we store content?
    self.depth = depth        # How deep should we go?
    # robots.txt enhanced
    self.robots = Robots.fetch(urlparse(url).scheme + '://' + domain + '/robots.txt')
def filter(self, ua):
    """Remove all of the urls in URLS that UA is not allowed to crawl,
    and fill in the .crawl_delay and .robots_url properties."""
    rules = None
    for url in sorted(self.urls):
        robots_url = Robots.robots_url(url)
        if self.robots_url != robots_url:
            if self.robots_url is None:
                try:
                    rules = Robots.fetch(robots_url, headers={
                        'User-Agent': ua
                    }).agent(ua)
                except Exception as e:
                    sys.stderr.write(
                        "warning: failed to fetch and parse {}: {}\n".format(
                            robots_url, e))
                    rules = DummyAgent()
                self.robots_url = robots_url
                self.crawl_delay = rules.delay or 1
            else:
                raise ValueError("robots.txt for {} is {}, not {}".format(
                    url, robots_url, self.robots_url))
        if not rules.allowed(url):
            self.urls.remove(url)
def scanRightmove():
    db = DB()
    headers = {'User-Agent': 'Dan070Bot([email protected])'}
    url = "http://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=USERDEFINEDAREA%5E%7B%22id%22%3A4703045%7D&maxBedrooms=3&minBedrooms=2&maxPrice=1200&dontShow=retirement&furnishTypes=furnished"
    robots = Robots.fetch('http://www.rightmove.co.uk/robots.txt', headers=headers)
    allowed = robots.allowed('http://www.rightmove.co.uk/property-to-rent/find.html?*',
                             'Dan070Bot([email protected])')
    if not allowed:
        return
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    data = soup.select('div.l-searchResult.is-list')
    t_url = 'https://api.telegram.org/bot538125304:AAEodL7ns7iuTbbpRPgpLteOs-o4UAunV6k/sendMessage'
    headers = {'Content-Type': 'application/json'}
    for property in soup.select('div.l-searchResult.is-list'):
        added = property.select('span.propertyCard-branchSummary-addedOrReduced')[0].get_text()
        payload = {'chat_id': '@rightmove_alerts'}
        if added == 'Added today' or added == 'Reduced today':
            link = property.select('a.propertyCard-link')[0].get('href')
            full_link = 'http://www.rightmove.co.uk' + link
            if db.checkURL(full_link) == 0:
                db.addURL(full_link)
                payload['text'] = full_link
                requests.post(t_url, data=json.dumps(payload), headers=headers)
    db.close()
    print('Cycle')
def get_links(url):
    base = url
    links = []
    res = requests.get(base)
    soup = BeautifulSoup(res.text, 'html.parser')  # Parsing the html file
    # Getting all the links in the page with an href attribute
    a_tags = soup.find_all('a', href=True)
    # Starting the robots parser
    rp = Robots.fetch(get_robots_url(base))
    agent = rp.agent('*')
    # Looping over all the links found in the page
    for a_tag in a_tags:
        link = a_tag.get('href')  # Getting the href attribute
        if is_absolute(link) is False:
            link = fix_relative_url(link, base)  # Fixing relative urls
        # If the robots parser allows the link to be scraped, append it to the list
        if agent.allowed(link):
            links.append(link)
        else:
            print('Link is not allowed for scraping ({})'.format(link))
    return set(links)  # Casting to a set to eliminate duplicates
def get_crawl_status(url):
    robots_url = os.path.join(url, 'robots.txt')
    robot = Robots.fetch(robots_url)
    agent = robot.agent('msc-cookie-monster')
    try:
        return 1 if agent.allowed(url) else 0
    except Exception:
        return -1
def _remember(self, url):
    log.debug('')
    urlparsed = urllib.parse.urlparse(url)
    robots_url = url.replace(urlparsed.path, '/robots.txt')
    print('[ INFO ]: Reading robots.txt file at: {}'.format(robots_url))
    robots = Robots.fetch(robots_url)
    checker = robots.agent(self.user_agent)
    self.index[urlparsed.hostname] = checker
def robots_txt():
    robots = Robots.fetch('http://www.e-prostor.gov.si/robots.txt')
    print(robots)
    print(robots.allowed('http://www.e-prostor.gov.si/nc/', '*'))
    print(robots.allowed('http://www.e-prostor.gov.si/fileadmin/global/', '*'))
    print('Crawl-delay: ', robots.agent('*').delay)
    print(robots.sitemaps)
    print(robots.agent('my-user-agent').delay)
    robots = Robots.fetch('http://www.e-prostor.gov.si/robots.txt')
    print(robots)
    lst = list(robots.sitemaps)
    r = requests.get(lst[0])
    root = etree.fromstring(r.content)
    for sitemap in root:
        children = sitemap.getchildren()
        print(children[0].text)
def is_allowed_access(uri, base_uri):
    # Check that access to this URI is not disallowed by robots.txt
    robots = Robots.fetch(base_uri + "/robots.txt")
    agent = robots.agent("*")
    return agent.allowed(uri)
def get_sitemaps(url):
    logging.info('Trying to get sitemaps from robots.txt')
    robots_url = urljoin(url, '/robots.txt')
    try:
        return list(Robots.fetch(robots_url).sitemaps)
    except ReppyException:
        logging.warning('Can not get access to: %s', robots_url)
        return [urljoin(url, 'sitemap.xml')]
def get_robots_delay(url):
    # robots.txt lives at the site root, so join with a leading slash
    robots_url = urllib.parse.urljoin(url, '/robots.txt')
    try:
        robots = Robots.fetch(robots_url)
        delay = Right(robots.agent('None').delay)
    except ReppyException:
        delay = Left(None)
    return delay
def get_robots_txt_checker(start_url, agent="Py4Seo Parse"):
    robots_txt_url = f"{start_url}/robots.txt".replace("//robots.txt", "/robots.txt")
    try:
        robots = Robots.fetch(robots_txt_url)  # fetch the URL, not the literal string
        agent = robots.agent(agent)
        return lambda url: agent.allowed(url)
    except Exception:
        return lambda _: True
def _remember(self, url):
    urlparsed = urllib.parse.urlparse(url)
    robots_url = urlparsed.scheme + '://' + urlparsed.netloc + '/robots.txt'
    write_log('ROBOTS', 'Reading robots.txt file at: {0}'.format(robots_url),
              package='reppy')
    robots = Robots.fetch(robots_url)
    checker = robots.agent(self.user_agent)
    self.index[urlparsed.hostname] = checker
def robot_parser(url):
    '''
    Parse the site's robots.txt.
    Return True if the URL is allowed to be crawled.
    '''
    robotstxt = get_robots_url(url)
    parser = Robots.fetch(robotstxt)
    validation = parser.allowed(url, '*')
    return validation
def crawler(domain, pathseed, uniqueId, maxSize=5000):
    pq = queue.PriorityQueue()
    visited = []
    links = []
    pq.put((value(domain + pathseed), domain + pathseed))
    visited.append(domain + pathseed)
    rp = Robots.fetch(domain + '/robots.txt', verify=False)
    driver = webdriver.PhantomJS(
        service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'])
    while not pq.empty() and pq.qsize() < maxSize * 5:
        a = pq.get()[1]
        print("! " + str(len(links)) + " " + a)
        if len(links) < maxSize:
            links.append(a)
            ls = get_all_links(domain, a, maxSize, rp, driver)
            for l in ls:
                if l not in visited:
                    if value(l) == 1 or value(l) == 2:
                        visited.append(l)
                        pq.put((value(l), l))
        else:
            while not pq.empty():
                pq.get()
    while len(links) < maxSize and not pq.empty():
        links.append(pq.get()[1])
    os.makedirs('Docs/HTMLPages/Heuristic2/' + folder(domain) + '/True/', exist_ok=True)
    os.makedirs('Docs/HTMLPages/Heuristic2/' + folder(domain) + '/False/', exist_ok=True)
    print(len(links))
    v = 0
    clf = Classifier()
    pos = 0
    res = ""
    for l in links:
        v += 1
        driver.get(l)
        time.sleep(1)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        res = str(clf.classify(soup))
        print(str(v) + " " + l + " " + res)
        #print(driver.page_source)
        if res == 'True':
            pos += 1
            extractorMain.extractor(soup, folder(domain), "Heuristic2",
                                    folder(domain).lower(), l, uniqueId)
            uniqueId += 1
        with open('Docs/HTMLPages/Heuristic2/' + folder(domain) + '/' + res + '/' +
                  str(v) + '-' + l.replace('/', '*') + '.html', 'wb') as f:
            f.write(bytes(driver.page_source, 'UTF-8'))
    hr = pos / maxSize
    with open('Docs/HTMLPages/Heuristic2/' + folder(domain) + '/' + 'hr.txt', 'wb') as f:
        f.write(bytes(str(hr), 'UTF-8'))
    return 0
def __init__(self, domain, schema):
    self.domain = domain
    self.schema = schema
    self.home_url = f'{self.schema}://{self.domain}'
    self.robots_url = self.home_url + '/robots.txt'
    self.real_domain = self.domain.split('/')[0]
    self.robots = Robots.fetch(self.robots_url)
    self.queue_urls = {self.home_url: 1}
    self.PARSED_URLS = set()
    self.browser = get_browser()
def request(self, url: str) -> None:
    """Perform the robots.txt request."""
    if self.state is not None:
        return
    try:
        self.state = Robots.fetch(url)
    except ReppyException:
        self.state = False
def get_robots_parser(self, url):
    robots_url = parse.urljoin(url, "/robots.txt")
    r_parser = self._robots_parsers.get(robots_url)
    if not r_parser:
        logger.info("Reading robots.txt: %s", robots_url)
        r_parser = Robots.fetch(robots_url)
        self._robots_parsers[robots_url] = r_parser
    return r_parser
def check_robots_txt(sitemap, url, user_agent="googlebot"):
    '''
    Arguments:
        sitemap: String, URL passed to Robots.fetch (i.e. the robots.txt location,
                 despite the parameter name)
        url: String, URL to check
        user_agent: String, default: googlebot
    '''
    robots = Robots.fetch(sitemap)
    agent = robots.agent(user_agent)
    return agent.allowed(url)
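A brief usage sketch for the checker above; the URLs are illustrative assumptions, not values taken from the original code:

# Hypothetical example: check one page against a site's robots.txt rules
allowed = check_robots_txt('https://example.com/robots.txt',
                           'https://example.com/some/page',
                           user_agent='googlebot')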
def init_robot_checker(respect_robots, user_agent, start_url):
    if respect_robots:
        start_path = urllib.parse.urlparse(start_url).path
        robots_url = start_url.replace(start_path, '/robots.txt')
        write_log('[INFO]: Reading robots file: {0}'.format(robots_url))
        robots = Robots.fetch(robots_url)
        checker = robots.agent(user_agent)
        return checker.allowed
    else:
        return True
def _remember(self, url):
    # Solved issue when path was just '/'
    urlparsed = urlparse(url)
    robots_url = urlparsed.scheme + "://" + urlparsed.netloc + '/robots.txt'
    write_log('ROBOTS', 'Reading robots.txt file at: {0}'.format(robots_url),
              package='reppy')
    robots = Robots.fetch(robots_url)
    checker = robots.agent(self.user_agent)
    self.index[urlparsed.hostname] = checker
def CardsCrawler(url):
    print('Dominio: ' + url.geturl())
    count = 0
    paginas = [url]
    paginasVisitadas = []
    robots = Robots.fetch(url.geturl() + '/robots.txt')
    while count < 1000 and len(paginas) > 0:
        count = count + 1
        primeiraPagina = paginas.pop(0)
        print(count, '-', primeiraPagina.geturl(), ' Qtd Paginas=', len(paginas))
        # If the page was already visited, move on to the next unvisited link in the list.
        # Check that each page's hostname matches the domain; if not, move on to the next link (crawl only inside the site).
        # Check that the hostname exists; if not, look for the next link that has one.
        # Check whether the page may be crawled according to the robots.txt rules; if not, move on to the next link.
        while Visitou(primeiraPagina) == True or primeiraPagina.hostname != url.hostname \
                or primeiraPagina.hostname == None \
                or robots.allowed(primeiraPagina.geturl(), '*') == False:
            if len(paginas) == 0:
                print('lascou@!')
                return
            primeiraPagina = paginas.pop(0)
        # Check that the request works.
        if verificarRequest(primeiraPagina.geturl()) == True:
            # Check whether the site is dynamic.
            if (Dominio[3] in url.geturl()) or (Dominio[6] in url.geturl()):
                texto = CrawlerDinâmico(primeiraPagina.geturl())
            else:
                # Request, using the content type.
                codigo_fonte = rq.get(primeiraPagina.geturl(),
                                      headers={'content-type': 'text/html'})
                # Text of the page source.
                texto = codigo_fonte.text
            soup = BeautifulSoup(texto, 'html.parser')
            # Find all links on the page (url).
            # attrs={'href': re.compile("^http://")}
            for link in soup.findAll('a'):
                href = link.get('href')
                # Convert relative hrefs to absolute ones.
                href = urllib.parse.urljoin(url.geturl(), href)
                paginas.append(urlparse(href))
            # Mark the page as visited.
            paginasVisitadas.append(primeiraPagina)
            # Wait 300 milliseconds before visiting the next link.
            time.sleep(.300)
        else:
            print('Requisição falhou!')
        Tabela(count, primeiraPagina.geturl(), url.hostname)
def check_robots(self, page_id):
    result_page = self.session.query(Page).filter(Page.id == page_id).first()
    result_site = self.session.query(Site).filter(Site.id == result_page.site_id).first()
    try:
        robots = Robots.fetch(f'http://{result_site.domain}/robots.txt', timeout=3)
    except Exception:
        return True
    return robots.allowed(result_page.url, 'fri-ieps-kslk')
def website_allows_scraping(url):
    """
    Args:
        url: The URL of the website you are trying to check.

    Returns:
        True if the website's policy allows you to scrape. Otherwise False.
    """
    robot_url = Robots.robots_url(url)
    robot = Robots.fetch(robot_url)
    return robot.allowed(url, USER_AGENT)
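A minimal usage sketch for the helper above; the example URL is illustrative and USER_AGENT is assumed to be defined elsewhere in the original module:

# Hypothetical example: only fetch the page when robots.txt permits it
if website_allows_scraping('https://example.com/some/page'):
    response = requests.get('https://example.com/some/page')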
def agent(self):
    if self._agent is not None:
        return self._agent
    try:
        self._agent = Robots.fetch(self.robots_txt_url)
        self._agent.agent(self.agent_name)
        return self._agent
    except Exception as e:
        logging.debug(
            f"Getting agent `{self.agent_name}` for `{self.robots_txt_url}` failed with {e}")
        raise e
def __init__(self, url, links=False, links_unique=True, content=False, depth=5,
             exclusion_pattern=None, check_lang=False, extractors=None,
             store_request_headers=False, store_response_headers=False,
             http_user=None, http_pass=None, *args, **kwargs):
    domain = urlparse(url).netloc
    # Setup the rules for link extraction
    if exclusion_pattern:
        self._rules = [
            Rule(LinkExtractor(allow='.*' + domain + '/.*', deny=exclusion_pattern),
                 callback=self.parse_url, follow=True)
        ]
    else:
        self._rules = [
            Rule(LinkExtractor(allow='.*' + domain + '/.*'),
                 callback=self.parse_url, follow=True)
        ]
    self.allowed_domains = [domain]
    self.start_urls = [url]
    self.links = links                  # Should we store links?
    self.links_unique = links_unique    # Should we store only unique links?
    self.content = content              # Should we store content?
    self.depth = depth                  # How deep should we go?
    self.check_lang = check_lang        # Store check-lang results?
    self.extractors = extractors        # Custom extractors?
    self.store_request_headers = store_request_headers
    self.store_response_headers = store_response_headers
    # HTTP Auth
    if http_user and http_pass:
        self.http_user = http_user
        self.http_pass = http_pass
    if self.check_lang:
        # Should we check content language?
        import fasttext
        self.model = fasttext.load_model('data/lid.176.bin')
    # robots.txt enhanced
    self.robots = Robots.fetch(urlparse(url).scheme + '://' + domain + '/robots.txt')