def filter(self, ua):
    """Remove all of the urls in URLS that UA is not allowed to crawl,
    and fill in the .crawl_delay and .robots_url properties."""
    rules = None
    for url in sorted(self.urls):
        robots_url = Robots.robots_url(url)
        if self.robots_url != robots_url:
            if self.robots_url is None:
                try:
                    rules = Robots.fetch(robots_url, headers={
                        'User-Agent': ua
                    }).agent(ua)
                except Exception as e:
                    sys.stderr.write(
                        "warning: failed to fetch and parse {}: {}\n"
                        .format(robots_url, e))
                    rules = DummyAgent()
                self.robots_url = robots_url
                self.crawl_delay = rules.delay or 1
            else:
                raise ValueError(
                    "robots.txt for {} is {}, not {}"
                    .format(url, robots_url, self.robots_url))
        if not rules.allowed(url):
            self.urls.remove(url)
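# A minimal sketch of the DummyAgent fallback referenced above; the class is
# assumed here (it is not part of reppy): when robots.txt cannot be fetched,
# it allows every URL and reports no crawl delay, so `rules.delay or 1`
# falls back to a one-second delay.
class DummyAgent:
    delay = None

    def allowed(self, url):
        return True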
def filter_urls(urls, ua):
    """Partition URLS (an iterable) into sites, and then filter out all of
    the urls in each site that UA is not allowed to crawl.  Returns a list
    of Site objects."""
    sites = defaultdict(Site)
    for url in urls:
        url = canon_url_syntax(url)
        robots_url = Robots.robots_url(url)
        sites[robots_url].add(url)
    for site in sites.values():
        site.filter(ua)
    return sorted(sites.values(), key=lambda s: s.robots_url)
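# A hypothetical call to filter_urls(); the URLs and the user-agent string are
# invented for illustration and assume the Site class used above.
for site in filter_urls(['https://example.com/a', 'https://example.com/b'],
                        'example-crawler/1.0'):
    print(site.robots_url, site.crawl_delay, sorted(site.urls))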
def parse_webpages(webpages):
    for page in webpages:
        # obtain the robots.txt url
        r = Robots.robots_url(page)
        robots = Robots.fetch(r)
        if robots.allowed(page, '*'):
            # sitemaps is a list of all the sitemaps for a website
            sitemaps = robots.sitemaps
            sitemaps_list = list(sitemaps)
            html = requests.get(page)  # html of the webpage
            soup = bs4.BeautifulSoup(html.text, "html.parser")
            outlinks = soup.find_all("a")  # all the outlinks
            links = [str(i.get('href')) for i in outlinks]
            outlinks = [str(i) for i in outlinks]
            docs = []  # the documents on the page
            for file in links:
                directory = page.rsplit("/", 1)[0]
                link = directory + '/' + file
                # can be expanded to other file types with a comma
                if file.endswith(('txt', 'md')):
                    if file.startswith(('http://', 'www.')):
                        text = bs4.BeautifulSoup(
                            requests.get(file).text, "html.parser")
                        ext = file.rsplit(".", 1)[-1]
                        text = [file, ext, text]
                        # text = {'link': link, 'ext': ext, 'text': text}
                        docs.append(text)
                    else:
                        text = bs4.BeautifulSoup(
                            requests.get(link).text, "html.parser")
                        ext = link.rsplit(".", 1)[-1]
                        text = [link, ext, text]
                        # text = {'link': link, 'ext': ext, 'text': text}
                        docs.append(text)
                elif file.endswith('pdf'):  # special case if PDF
                    x = file
                    try:
                        if file.startswith(('http://', 'www.')):
                            pdf = file.rsplit("/", 1)[-1]
                            response = urlopen(file)
                        else:
                            pdf = file.rsplit("/", 1)[-1]
                            # must first check if the pdf is found
                            response = urlopen(link)
                    except urllib.error.HTTPError as e:
                        # if a 404 error occurs, store "404" as the text
                        text = [link, "pdf", "404"]
                        # text = {'link': link, 'ext': 'pdf', 'text': "404"}
                        docs.append(text)
                    else:
                        # otherwise save the pdf so pdfminer can extract its text
                        file = open(pdf, 'wb')
                        file.write(response.read())
                        file.close()
                        if x.startswith('http://'):
                            link = x
                        txt = ""
                        file = open(pdf, 'rb')
                        parser = PDFParser(file)
                        document = PDFDocument(parser)
                        rsrcmgr = PDFResourceManager()
                        laparams = LAParams()
                        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        for p in PDFPage.create_pages(document):
                            # the interpreter processes each page stored in the PDFDocument
                            interpreter.process_page(p)
                            # the device renders the layout produced by the interpreter
                            layout = device.get_result()
                            # of the many LT objects in the layout, we only want
                            # LTTextBox and LTTextLine
                            for lt_obj in layout:
                                if isinstance(lt_obj, LTTextBox) or isinstance(
                                        lt_obj, LTTextLine):
                                    txt += lt_obj.get_text()
                        # close the pdf file
                        file.close()
                        name = [link, "pdf", txt]
                        # name = {'link': link, 'ext': 'pdf', 'text': txt}
                        os.remove(pdf)  # remove the saved file when done
                        docs.append(name)
            docs = [[str(i) for i in lis] for lis in docs]
            timestamp = datetime.datetime.now().isoformat()
            output = {
                'url': page,
                'timestamp': timestamp,
                'outlinks': outlinks,
                'html': html.text,
                'docs': docs,
                'sitemaps': sitemaps_list
            }
            with Crawling_L_REST.app.app_context():
                Crawling_L_REST.add_webpage(output)
            return output
def robots(self, path):
    self.ro = Robots.fetch(self.url + path)
    return self.ro
def __init__(self, seed_url, user_agent):
    self.seed_url = seed_url
    self.user_agent = user_agent
    self.robots_url = Robots.robots_url(seed_url)
    self.robots = Robots.fetch(self.robots_url)
    self.accepted_header_content_type = "text/html"
def __init__(self, robotstxt_body, spider):
    from reppy.robots import Robots
    self.spider = spider
    self.rp = Robots.parse('', robotstxt_body)
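# A hedged sketch of the lookup such a parser wrapper typically pairs with the
# constructor above; the method name and signature are assumed, not taken from
# the original snippet.
def allowed(self, url, user_agent):
    # Delegate the decision to the parsed reppy Robots object.
    return self.rp.allowed(url, user_agent)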
from threading import Thread, Lock
from requests_html import HTMLSession
from time import sleep
from reppy.robots import Robots
from time import strftime, gmtime, time

domain = input('Enter the domain: ')
domain_link = f'https://{domain}'
new_urls = {domain_link}
scaned_urls = set()
locker = Lock()
robots = Robots.fetch(f'{domain_link}/robots.txt')
agent = 'Googlebot'
timestamp = strftime("__%d_%b_%H_%M_%S", gmtime())
file = open(f'{domain}_{timestamp}.txt', 'a', encoding='utf-8')


def worker():
    with HTMLSession() as session:
        while True:
            if len(new_urls) == 0:
                sleep(10)
                if len(new_urls) == 0:
                    break
            try:
                url = new_urls.pop()
                response = session.get(url, timeout=1)
                url_links = response.html.absolute_links
def reppy_robot(url):
    robot_url = urljoin(get_domain_name(url), "robots.txt")
    rp = Robots.fetch(robot_url)
    # print(rp.allowed(href, '*'))
    yield rp.allowed(url, '*')
Allow: /serv
Allow: /~mak
Disallow: /
'''


@contextmanager
def timer(name, count):
    '''Time this block.'''
    start = time.time()
    try:
        yield count
    finally:
        duration = time.time() - start
        print(name)
        print('=' * 10)
        print('Total: %s' % duration)
        print('  Avg: %s' % (duration / count))
        print(' Rate: %s' % (count / duration))
        print('')


with timer('Parse', 100000) as count:
    for _ in range(count):
        Robots.parse('http://example.com/robots.txt', content)

parsed = Robots.parse('http://example.com/robots.txt', content)
with timer('Evaluate', 100000) as count:
    for _ in range(count):
        parsed.allowed('/org/example.html', 'other-bot')
# In[28]:

options = webdriver.ChromeOptions()
options.add_argument("headless")
driver = webdriver.Chrome(
    "C:\\Users\\shree\\Downloads\\Softwares\\chromedriver_win32\\chromedriver.exe",
    options=options)

q = queue.Queue()
urlsAlreadyVisited = set()
seed = "https://www.foodrepublic.com/recipes"
q.put(seed)
print(urldefrag(seed)[0])
urlsAlreadyVisited.add(urldefrag(seed)[0])
robots = Robots.fetch('http://www.foodrepublic.com/robots.txt')
agent = robots.agent('User-agent')

# In[1]:

while True:
    url = q.get()
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # saveContentInHeirarchy(str(soup.extract()), url)
    links = soup.find_all('a')
    for link in links:
        u = link.get('href')
        if not is_absolute(u):
            u = urljoin(url, u)
        if "foodrepublic.com" in u and "@foodrepublic.com" not in u:
import random
from time import sleep
from threading import Lock
from queue import Queue
from requests_html import HTMLSession
from concurrent.futures import ThreadPoolExecutor
from reppy.robots import Robots  # This utility uses `requests` to fetch the content

DOMAIN = 'goodreads.com'
scaned_urls = set()
locker = Lock()
robots = Robots.fetch(f'https://www.{DOMAIN}/robots.txt')


def worker(queue):
    session = HTMLSession()
    while True:
        if queue.qsize() == 0:
            sleep(30)
            if queue.qsize() == 0:
                break
        try:
            url = queue.get()
            print('Send request to', url)
            resp = session.get(url)
            title = resp.html.xpath('//title/text()')[0].strip()
            print(title)
def __extract_info(self, url):
    self.__print_debug('crawling page', url)
    parsed_url = urlparse(url)
    if parsed_url.netloc == self.__initial_domain_name:
        if not self.__rp.allowed(url, self.__user_agent):
            self.__print_debug('disallowed by user agent')
            return None
    else:
        current_robot = Robots.fetch(Robots.robots_url(url))
        if not current_robot.allowed(url, self.__user_agent):
            self.__print_debug('disallowed by user agent')
            return None
    content, is_html, language = self.__crawl_page(url)
    if content is None:
        return None
    path = urlparse(url).path.replace('/', '_')
    if path is None or path == '':
        path = '__index__'
    if self.__storage:
        self.__set_up_folders(parsed_url.netloc)
        fsource = open(
            self.__PATH_SOURCE + parsed_url.netloc + '/' + path + '.html',
            'wb')
        fsource.write(content)
        fsource.close()
    if not is_html:
        self.__pages.append({
            'content': content,
            'language': language,
            'url': url,
            'html': content
        })
        return content
    soup = BeautifulSoup(content, 'html.parser')
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None or '#' in href:
            continue
        if href.startswith('http'):
            self.__add_url(href)
            continue
        if href.startswith('mailto'):
            continue
        new_url = str(urljoin(url, href))
        self.__add_url(new_url)
    texts = soup.findAll(text=True)
    visible_texts = filter(self.__tag_visible, texts)
    visible_texts = ' '.join(t.strip() for t in visible_texts
                             if t.strip() != '')
    if self.__storage:
        fout = open(
            self.__PATH_INFO + parsed_url.netloc + '/' + path + '.json', 'w')
        fout.write(
            json.dumps({
                'url': url,
                'domain_name': parsed_url.netloc,
                'html': content.decode('utf-8'),
                'language': language,
                'content': visible_texts,
                'meta': self.__meta,
            }))
        fout.close()
    self.__pages.append({
        'content': visible_texts,
        'language': language,
        'url': url,
        'html': content
    })
domain = input('Enter the domain to crawl: ')
first_link = f'http://{domain}/'
prepared_response = session.get(first_link, proxies={})
first_link = prepared_response.url
domain = first_link.split('/')[2]
robots_link = f'https://{domain}/robots.txt'
crawled_links = set()
links_to_crawl = set()
links_to_crawl.add(first_link)
robots = Robots.fetch(robots_link)
file_results = open('checking_results.txt', 'w', encoding='utf-8')

while True:
    if len(links_to_crawl) == 0:
        break
    url = links_to_crawl.pop()
    try:
        proxies = get_random_proxy()
        t1 = time()
        response = session.get(url, proxies=proxies, timeout=8)
        t2 = time()
def is_robot_valid(self, url):
    robot_url = urljoin(get_domain_name(url), "robots.txt")
    rp = Robots.fetch(robot_url)
    yield rp.allowed(url, self.hdr['User-Agent'])
def parse(self, content, name):
    '''Parse the robots.txt in content and return the agent of the
    provided name.'''
    return Robots.parse('http://example.com', content).agent(name)
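# A hedged usage sketch of the same reppy call that the helper above wraps;
# the robots.txt body and agent name are invented for illustration.
agent = Robots.parse('http://example.com',
                     'User-agent: *\nDisallow: /private\n').agent('my-bot')
assert agent.allowed('/public/page.html')
assert not agent.allowed('/private/secret.html')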
def crawl_page(url):
    if not check_url(url):
        return
    try:
        content = requests.get(url).text
    except requests.exceptions.ConnectionError:
        return
    soup = BeautifulSoup(content, "html.parser")
    print("Start URL: ", url)
    site = get_site_information(url)
    rows = dbm.get_all_rows("sites")
    for i in range(100):
        try:
            if is_existing(site.link, rows):
                dbm.update_column(site)
                break
            else:
                dbm.insert_into_sites(site)
                break
        except:
            pass
    links = soup.find_all('a')
    index = 0
    for link in links:
        links[index] = link.get('href')
        index += 1
    links = list(filter(None, links))  # remove empty strings
    links = list(set(links))  # remove duplicates
    filtered_links = list()
    for link in links:
        if url_pattern.match(link):
            filtered_links.append(link)
            links.remove(link)
        elif sub_url_pattern.match(link):
            if str(link).startswith("/") and url.endswith("/"):
                filtered_links.append(get_url(get_domain(url[:-1])) + link)
            else:
                filtered_links.append(get_url(get_domain(url)) + link)
            links.remove(link)
    index = 0
    for link in filtered_links:
        if "?" in link:  # found a GET parameter
            filtered_links[index] = filtered_links[index][:filtered_links[index].find("?")]  # remove GET parameters
        if "#" in link:
            filtered_links[index] = filtered_links[index][:filtered_links[index].find("#")]
        if "%" in link:
            filtered_links[index] = filtered_links[index][:filtered_links[index].find("%")]
        index += 1
    filtered_links.append(url)
    print("Links found: ", len(filtered_links))
    for link in filtered_links:
        if not check_url(link):
            filtered_links.remove(link)
    sites = list()
    for link in filtered_links:
        if link in get_url(get_domain(link)):
            pass
        elif link not in url:
            domain = get_domain(link)
            robots_url = get_url_to_robots(get_url(domain))
            try:
                robots = Robots.fetch(robots_url)
                if not robots.allowed(link, user_agent):
                    if link.endswith("/"):
                        # retry without the trailing slash before discarding the link
                        if robots.allowed(link[:-1], user_agent):
                            pass
                        else:
                            filtered_links.remove(link)
                    else:
                        filtered_links.remove(link)
                else:
                    print("link: ", link)
            except:
                filtered_links.remove(link)
    for link in filtered_links:
        if get_url(get_domain(link)) + str("/sitemap") in link:
            print(1)
            continue
        sites.append(get_site_information(link))
        filtered_links.remove(link)
    # add all sites to the db in a separate thread
    threading.Thread(target=insert_into_db, args=(sites, )).start()
    print("Links found: ", len(sites))
    if len(sites) > 0:
        with stopit.ThreadingTimeout(100) as to_ctx_mgr:
            assert to_ctx_mgr.state == to_ctx_mgr.EXECUTING
            choice = random.choice(sites).link
            if not choice == url:
                crawl_page(choice)
from requests_html import HTMLSession
from reppy.robots import Robots

print('Start working')
domain = input('Enter domain name: ')
home_url = f'http://{domain}/'
robots_url = f'http://{domain}/robots.txt'
robots = Robots.fetch(robots_url)
links_to_scan = set()
links_to_scan.add(home_url)
scaned_links = set()
session = HTMLSession()
result_file = open('results.csv', 'w')
result_file.write('Is Duplicate\tURL\tTitle\tDescription\tH1\tCanonical\n')
all_titles = set()


def make(s):
    try:
        s = s[0].strip()
    except IndexError:
        s = ''
    return s
def main():
    downloader = None

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--insecure", help="use HTTP instead of HTTPS",
                        action="store_true")
    parser.add_argument("-e", "--export",
                        help="export immediately without downloading (only useful if you "
                             "already downloaded something to the .pickle file)",
                        action="store_true")
    parser.add_argument('-E', '--Exchange',
                        help='Only export ticker symbols from this exchange '
                             '(the filtering is done during the export phase)')
    parser.add_argument('type', nargs='?', default='generic',
                        help='The type to download, this can be: ' + " ".join(list(options.keys())))
    parser.add_argument("-s", "--sleep",
                        help="The time to sleep in seconds between requests",
                        type=float, default=0)
    parser.add_argument("-p", "--pandantic",
                        help="Stop and warn the user if some rare assertion fails",
                        action="store_true")
    args = parser.parse_args()

    protocol = 'http' if args.insecure else 'https'
    if args.insecure:
        print("Using insecure connection")
    if args.export:
        print("Exporting pickle file")

    tickerType = args.type = args.type.lower()

    print("Checking if we can resume an old download session")
    try:
        downloader = loadDownloader(tickerType)
        print("Downloader found on disk, resuming")
    except:
        print("No old downloader found on disk")
        print("Starting a new session")
        if tickerType not in options:
            print("Error: " + tickerType + " is not a valid type option. See --help")
            exit(1)
        else:
            downloader = options[tickerType]

    robotsUrl = protocol + '://finance.yahoo.com/robots.txt'
    robots = Robots.fetch(robotsUrl)
    try:
        if not args.export:
            if not robots.allowed(protocol + '://finance.yahoo.com/_finance_doubledown/api/resource/searchassist',
                                  user_agent):
                print('Execution of script halted due to ' + robotsUrl)
                return 1

            if not downloader.isDone():
                print("Downloading " + downloader.type)
                print("")
                downloadEverything(downloader, tickerType, args.insecure, args.sleep, args.pandantic)
                print("Saving downloader to disk...")
                saveDownloader(downloader, tickerType)
                print("Downloader successfully saved.")
                print("")
            else:
                print("The downloader has already finished downloading everything")
                print("")

    except Exception as ex:
        print("An exception occurred while downloading. Suspending downloader to disk")
        saveDownloader(downloader, tickerType)
        print("Successfully saved download state")
        print("Try removing {type}.pickle file if this error persists")
        print("Issues can be reported on https://github.com/Benny-/Yahoo-ticker-symbol-downloader/issues")
        print("")
        raise
    except KeyboardInterrupt as ex:
        print("Suspending downloader to disk as .pickle file")
        saveDownloader(downloader, tickerType)

    if downloader.isDone() or args.export:
        print("Exporting " + downloader.type + " symbols")

        data = tablib.Dataset()
        data.headers = downloader.getRowHeader()

        for symbol in downloader.getCollectedSymbols():
            if args.Exchange is None:
                data.append(symbol.getRow())
            elif symbol.exchange == args.Exchange:
                data.append(symbol.getRow())

        with io.open(downloader.type + '.csv', 'w', encoding='utf-8') as f:
            f.write(text.join(u',', data.headers) + '\n')
            writer = csv.writer(f)
            for i in range(0, len(data)):
                row = [text(y) if y is not None else u"" for y in data[i]]
                writer.writerow(row)

        try:
            with open(downloader.type + '.xlsx', 'wb') as f:
                f.write(data.xlsx)
        except:
            print("Could not export .xlsx due to an internal error")

        try:
            with open(downloader.type + '.json', 'wb') as f:
                f.write(data.json.encode('UTF-8'))
        except:
            print("Could not export .json due to an internal error")

        try:
            with open(downloader.type + '.yaml', 'wb') as f:
                f.write(data.yaml.encode('UTF-8'))
        except:
            print("Could not export .yaml due to an internal error")
from reppy.robots import Robots

# grab the robots.txt url
url = Robots.robots_url('https://science.rpi.edu/computer-science')
if 'http' in url:
    # print(url)
    robots = Robots.fetch(url)
    # print(robots)
    print(robots.allowed('https://science.rpi.edu/computer-science/', 'agent'))
    print(robots.allowed('https://science.rpi.edu/admin/', 'agent'))
def check_allow(urls):
    robots = Robots.fetch(f'{start_url}/robots.txt')
    return {url for url in urls if robots.allowed(url, 'Googlebot')}
def parse_robots_txt(self, link_list):
    host, port = self.config.cache_server
    robotsURL = ''
    robots = None
    links = []
    for link_url in link_list:
        parsed_link = parse.urlparse(link_url)
        link_base = '{0.scheme}://{0.netloc}/'.format(parsed_link)
        if robots is None or link_base not in robotsURL:
            if 'today.uci.edu' in link_base:
                robots = Robots.parse(
                    'https://today.uci.edu/department/information_computer_sciences/robots.txt', '''
User-agent: *
Disallow: /*/calendar/*?*types*
Disallow: /*/browse*?*types*
Disallow: /*/calendar/200*
Disallow: /*/calendar/2015*
Disallow: /*/calendar/2016*
Disallow: /*/calendar/2017*
Disallow: /*/calendar/2018*
Disallow: /*/calendar/2019*
Disallow: /*/calendar/202*
Disallow: /*/calendar/week
Disallow: /*/search
Disallow: /*?utm
Allow: /
Allow: /*/search/events.ics
Allow: /*/search/events.xml
Allow: /*/calendar/ics
Allow: /*/calendar/xml
''')
            else:
                robotsURL = link_base + 'robots.txt'
                time.sleep(0.5)
                # get the robots.txt file
                try:
                    robots = Robots.fetch(
                        f"http://{host}:{port}/",
                        params=[("q", f"{robotsURL}"),
                                ("u", f"{self.config.user_agent}")],
                        timeout=20)
                except Exception as e:
                    print(e)
                    robots = None
                # WARNING: UNCOMMENTING BYPASSES CACHE
                # if the robots is empty, get the robots.txt from the actual server
                # robots_str = str(robots)
                # robots_str = robots_str.split(': ')[1].split('}')[0]
                # if robots_str == '[]':
                #     robots = Robots.fetch(robotsURL, timeout=20)
                #     print(robots)
        if robots is None:
            links.append(link_url)
            continue
        if parsed_link.params == '':
            if parsed_link.query == '':
                query_only = '{0.path}/'.format(parsed_link)
            else:
                query_only = '{0.path}/?{0.query}'.format(parsed_link)
        else:
            if parsed_link.query == '':
                query_only = '{0.path}/{0.params}/'.format(parsed_link)
            else:
                query_only = '{0.path}/{0.params}/?{0.query}'.format(parsed_link)
        if robots.allowed(query_only, self.config.user_agent):
            links.append(link_url)
    return links
from reppy.robots import Robots

url = "http://www.amazon.com"
robots = Robots.fetch(url + "/robots.txt")

paths = [
    '/',
    '/gp/dmusic/',
    '/gp/dmusic/promotions/PrimeMusic/',
    '/gp/registry/wishlist/'
]

for path in paths:
    print("{0}: {1}".format(robots.allowed(path, '*'), url + path))
from reppy.robots import Robots

# %%
robots = Robots.fetch('https://allabout.co.jp/robots.txt')
agent = robots.agent('*')
# check whether the URL may be crawled
agent.allowed('https://allabout.co.jp/r_finance/')
# agent.allowed('https://allabout.co.jp/ranking/daily/')

# %% get the crawl delay
robots = Robots.fetch('https://allabout.co.jp/robots.txt')
agent = robots.agent('bingbot')
agent.delay