def get_all_links(domain, path, maxSize):
    #response = requests.get(domain+path, headers={'User-Agent': 'Mozilla/5.0'})
    driver = webdriver.PhantomJS()
    driver.get(domain + path)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    links = []
    rp = RobotsCache(10000)
    for div in soup.findAll('div'):
        for link in div.findAll('a', href=True):
            #print(link.get('href'))
            if (rrobots(domain, link.get('href'), rp)):
                regex = re.compile(
                    r'^(?:http|ftp)s?://'  # http:// or https://
                    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
                    r'localhost|'  # localhost...
                    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
                    r'(?::\d+)?'  # optional port
                    r'(?:/?|[/?]\S+)$', re.IGNORECASE)
                if re.match(regex, domain + link.get('href')) is not None:
                    if (len(link.get('href')) > 0):
                        if ((link.get('href')[0] >= 'a' and link.get('href')[0] <= 'z') or
                                (link.get('href')[0] >= '1' and link.get('href')[0] <= '9')):
                            links.append('/' + link.get('href'))
                        else:
                            links.append(link.get('href'))
    return links
def __init__(self, url, config={}, proxies={}, auth=None, ua=DEFAULT_HODOR_UA,
             pagination_max_limit=DEFAULT_HODOR_MAX_PAGES,
             crawl_delay=DEFAULT_CRAWL_DELAY, ssl_verify=False,
             trim_values=True, robots=True, reppy_capacity=100):

    self.content = None
    self.url = url
    self.domain = self._get_domain()
    self.proxies = proxies
    self.auth = auth
    self.ua = ua
    self.trim_values = trim_values
    self.ssl_verify = ssl_verify
    self.config = {}
    self.extra_config = {}
    self.robots = RobotsCache(capacity=reppy_capacity) if robots else None

    self._pages = []
    self._page_count = 0
    self._pagination_max_limit = pagination_max_limit
    self.crawl_delay = self._crawl_delay(crawl_delay)

    for k, v in config.items():
        if k.startswith("_"):
            self.extra_config[k.lstrip("_")] = v
        else:
            self.config[k] = v
def download_pages_in_queue(self, queue):
    current_page_url = queue.get()
    robot = RobotsCache()
    if (robot.allowed(current_page_url, "*")):
        print current_page_url
        if len(current_page_url) < 10:
            return
        current_page_html = download_page_by_url(current_page_url)
        bs = BeautifulSoup(current_page_html, "html.parser")
        links = bs.find_all('a', href=True)
        post_links = [link['href'] for link in links]
        for post_link in post_links:
            if len(post_link) < 10:
                continue
            if str(post_link).find('http') != 0:
                post_link = str(self.start_url) + str(post_link)
            queue.put(post_link)
        self.sites_num = self.sites_num + 1
        page = Pages(url=current_page_url,
                     parsed_text=get_text_from_html(current_page_html),
                     is_indexed=False)
        page.save()
    else:
        print "Page can't be indexed because of the rules in ROBOTS.TXT"
def __init__(self, file, ua, check=True, output="output.csv" ): #setting output to false disables file output if check: #only setup robot checker if robot checking is enabled self.ua = ua #user agent self.robo = RobotsCache(capacity=100) #check var disables or enables robots.txt checking #recommended to keep default True value self.check = check self.req = requests #request obj for parsing url self.output = output #where to output file self.data = [] #init array of grabbed sites self.configarr = [] #empty array of all configs if type(file) is list: self.configarr = file else: self.configarr.append(file)
def get_robot_agent(root_domain: str, protocol="http") -> Rules: if root_domain.startswith("http"): root_domain = LinkChecker.get_root_domain(root_domain)[4] versions = ["http://", "https://", "http://www.", "https://www."] suffix = "/robots.txt" current = "" found = False for version in versions: temp_link = version + root_domain + suffix try: status_code, content_type = LinkChecker.get_response(temp_link) if status_code == ResponseCode.LinkOK: current = temp_link found = True break else: raise ConnectionError except: pass if found: try: robots = RobotsCache() req = robots.session.get(current) ttl = max(robots.min_ttl, Utility.get_ttl(req.headers, robots.default_ttl)) # And now parse the thing and return it return parser.Rules(current, req.status_code, req.content, time.time() + ttl) # rules = robots.fetch(current) # return rules except: return None else: return None
def __FetchRobotFileInfo__(self, url, robotDictForDomains, timeStamp):
    domainName = self.__GetComSubdomainOfUrl__(url)
    robotUrl = ""
    if robotDictForDomains.has_key(domainName) == False:
        robotUrl = self.__GetRobotUrlForUrl__(domainName)
        cache = RobotsCache()
        try:
            timeStamp[domainName] = datetime.datetime.now()
            robotFileObj = cache.fetch(robotUrl)
            doesUrlExistOnline = self.__DoesUrlExistOnline__(robotUrl)
        except:
            doesUrlExistOnline = False
            robotDictForDomains[domainName] = (doesUrlExistOnline, object)
        if doesUrlExistOnline == True:
            robotDictForDomains[domainName] = (doesUrlExistOnline, robotFileObj)
        else:
            robotDictForDomains[domainName] = (doesUrlExistOnline, object)
    doesUrlExistOnline = robotDictForDomains[domainName][0]
    robotFileObj = robotDictForDomains[domainName][1]
    # print "heyyy",robotUrl, doesUrlExistOnline, robotFileObj, robotDictForDomains
    return doesUrlExistOnline, robotFileObj, robotDictForDomains, timeStamp, domainName
def testRobot3(self):
    robots = RobotsCache()
    rules = robots.fetch("http://www.realwire.com/")
    crawl_delay = rules.delay("idiot")
    print("delay is:", crawl_delay)
    for i in range(1, 1000):
        print(rules.allowed("http://api.google.com/search/", agent="idiot"))
def check_for_robot_access(self, page):
    self.f.write('--- checking for robots %s\n' % page)
    robots = RobotsCache()
    try:
        if robots.allowed(page + 'robots.txt', 'my-agent'):
            print 'robots allowed'
            self.f.write('robots allowed. \n')
            return True
    except ServerError, r:
        print 'error ', r
    return False
def robot_rules(_url_scheme, _url_netloc):
    # return a robot rules object
    _domain = urlunparse((_url_scheme, _url_netloc, '', '', '', ''))
    robots = RobotsCache()
    try:
        rules = robots.fetch(_domain, timeout=5)
    except Exception as exc:
        print('FAIL to fetch robots.txt {},{}'.format(_url_scheme, _url_netloc))
        print(exc)
        return None
    return rules
def get_text_by_base_url(self):
    robots = RobotsCache(capacity=100)
    if not robots.allowed(self.base_url, "python-requests"):
        return ["Crawling this site is not allowed by robots.txt"]

    text_list = []
    for slug in self.__get_links_by_url_depth():
        sleep(0.5)
        text_list.append(
            remove_emoji(
                remove_url(self.__get_text_by_url(self.base_url + slug))).strip())
    return text_list
def confirm_robots_txt(target_url, max_capacity):
    '''confirm that the target url is allowed to crawl

    :type target_url: str
    :param target_url: url the agent wants to crawl
    :type max_capacity: int
    :param max_capacity: limit of max crawling pages
    :rtype: bool
    :return: whether it is possible to scrape
    '''
    robots = RobotsCache(max_capacity)
    return robots.allowed(target_url, 'python program')
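A minimal usage sketch of the helper above; the URL, capacity, and printed messages are illustrative assumptions, not values from the original project.

# hypothetical call of confirm_robots_txt defined above
if confirm_robots_txt("https://example.com/some/page", 100):
    print("allowed to crawl")
else:
    print("blocked by robots.txt")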
def robot_rules(_url_scheme, _url_netloc):
    # return a robot rules object
    #_parsed_url = urlparse(_url)
    _domain = urlunparse((_url_scheme, _url_netloc, '', '', '', ''))
    robots = RobotsCache()
    try:
        #print('DOMAIN: {}'.format(_domain))
        rules = robots.fetch(_domain)
    except Exception as exc:
        print('FAIL to fetch robots.txt')
        print(_url_scheme, _url_netloc)
        print(exc)
        return None
    return rules
def __init__(self, robots_url=None):
    if robots_url:
        robots = RobotsCache()
        self._rules = robots.fetch(robots_url)
        self.is_use_robots = True
    else:
        self.is_use_robots = False
    self._url_norm = UrlNorm()
    self.counter = 0
    self.urls = dict()
    self.connections = defaultdict(set)
    self._lock = RLock()
def __init__(self, file, ua, check=True, output="output.csv"): if check: #only setup robot checker if robot checking is enabled self.ua = ua #user agent self.robo = RobotsCache(capacity=0) #check disables or enables robots.txt checking #recommended to keep default True value self.check = check self.req = requests if os.path.exists(file): with open(file) as f: self.config = json.load(f) #opens and parses json file
def setup_method(self, _):
    """Configure the app."""
    self.url = "http://aetfiws.ovh"
    self.code1 = test_data.CODE1
    self.code2 = test_data.CODE2
    self.code3 = test_data.CODE3
    self.parser = parsers.ExtractData()
    self.parser_encoding = parsers.ExtractEncoding()
    self.STOPWORDS = {'fr': ('mot', 'pour', 'de')}
    self.BADWORDS = {'fr': ('pipe', 'xxx')}
    self.is_title = True
    self.title = 'letter'
    self.headers = {'status': '200 OK',
                    'content-type': 'text/html; charset=utf-8',
                    'vary': 'X-PJAX, Accept-Encoding'}
    self.reqrobots = RobotsCache(capacity=100)
def allowed(self, url):
    surl = urlparse(url)
    rurl = surl.scheme + '://' + surl.hostname + '/robots.txt'
    if rurl in self.__robot:
        if not self.__robot[rurl].expired:
            return self.__robot[rurl].allowed(url, UA)
    try:
        r = RobotsCache().fetch(rurl)
    except:
        return False
    else:
        self.__robot[rurl] = r  # add a rule object
    return self.__robot[rurl].allowed(url, UA)
def __init__(self, crawler):
    if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
        raise NotConfigured
    self.completeblacklist = crawler.settings.get('ROBOTSTXT_BLACKLIST', ())
    self.blacklist = []
    self.generalblacklist = crawler.settings.get('GENERAL_BLACKLIST', ())
    self.hasblacklist = False
    self.whitelist = crawler.settings.get('ROBOTSTXT_WHITELIST', ())
    self.crawler = crawler
    self._useragent = crawler.settings.get('USER_AGENT')
    self._parsers = {}
    self._spider_netlocs = set()
    self.robots = RobotsCache()
    self.stoprepetitionsrearg = re.compile(ur'.*?\&(.*?\&)\1{1,}.*')
    self.stoprepetitionsreslash = re.compile(ur'.*?\/(.*?\/)\1{1,}.*')
def __init__(self, base_url, forum_codes, archive_location, user_agent, worker_count):
    archiver_logger.info('Archiver initialized.')
    self.base_url = base_url
    self.archive_base_url = urljoin(self.base_url, ScraperConfig.ARCHIVE_SUBURL)
    self.forum_codes = forum_codes
    self.archive_location = archive_location
    self.user_agent = user_agent
    self.robot_parser = RobotsCache()
    self.scraper_timer = None
    self.shutdown_event = threading.Event()
    self.delay_time = 1
    self.workers = []
    self.worker_count = worker_count
    self.pages_need_visiting = Queue()
    self.pages_need_analysis_counter = RachetingCounter()
    self.pages_visited_lock = threading.Lock()
    self.pages_visited = []
    self.page_re_filters = []
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table("crawl-logs")
crawl_seeds_table = dynamodb.Table("crawl-seeds")


class DecimalEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, decimal.Decimal):
            if o % 1 > 0:
                return float(o)
            else:
                return int(o)
        return super(DecimalEncoder, self).default(o)


robots = RobotsCache()
user_agent = 'OpenHouseProject.co crawler'
sleep_time = .9
bucket = 'oh-crawl'
expiration_rules = {
    'default': datetime.datetime.now() + datetime.timedelta(days=1),
    'starts_with': {
        'http://www.everyhome.com/Home-For-Sale/': datetime.datetime(2099, 1, 1),
        'http://www.everyhome.com/Homes-For-Sale-By-Listing-Date/Listed-on-': datetime.datetime(2099, 1, 1)
    }
}
def __init__(self):
    self.reqrobots = RobotsCache()
    self.parser_encoding = parsers.ExtractEncoding()
try : fetch url
exception : broken urls

STOPPING CONDITION => after N (e.g. 10) urls, don't take urls into the url dictionary;
empty all dictionaries, put them into the queue and finish that queue
'''

''' data '''

import re
import datetime as dt
from bs4 import BeautifulSoup
import urllib2
from urlparse import urlparse
from urlparse import urljoin
from reppy.cache import RobotsCache

robots = RobotsCache()  ## creating object for caching robots.txt

''' url data '''


class Url_class:
    def __init__(self, url):
        self.url = url
        self.anchor = []
        self.anchor_win = []
        self.title = ""
        self.urldata = ""

    def add_anchor(self, anchortext, ancwintext):
        self.anchor.append(anchortext)
http://qiita.com/rusarusa/items/d7f014ba80d6fe7a3e07
・Batch-downloading images from the web with Python
http://www.dyesac.com/pythonでweb上の画像をまとめてダウンロード/
・Image crawler
http://qiita.com/komakomako/items/dd380f980e56e70fa321

Targets:
・https://reverb.com/jp/marketplace/electric-guitars
・https://www.yahoo.co.jp
"""

# (1) Decide the url to crawl
target_url = "https://www.yahoo.co.jp"

# (2) Create the instance used for reading robots.txt
robots = RobotsCache(100)

# (3) If robots.txt says crawling is allowed, proceed with the rest of the steps
if robots.allowed(target_url, 'python program'):

    # (4) Create a PhantomJS instance so that pages generated by JavaScript can also be crawled
    driver = webdriver.PhantomJS()

    # (5) Pass the target url to the instance's GET-request method to obtain the DOM
    driver.get(target_url)
    # <selenium.webdriver.phantomjs.webdriver.WebDriver (session="b140b9a0-74d3-11e7-b434-8b9f5b309f17")>
    # type(driver)
    # <class 'selenium.webdriver.phantomjs.webdriver.WebDriver'>

    # (6) Encode the DOM obtained above as utf-8 and keep the target page's content as bytes
    html = driver.page_source.encode('utf-8')
    # type(html)
    # <class 'bytes'>
#! /usr/bin/env python

from __future__ import print_function

from contextlib import contextmanager
import time

from reppy.cache import RobotsCache
from reppy.parser import Rules

content = '''
User-agent: '*'
Allow: /
'''

cache = RobotsCache()
cache.add(Rules('http://example.com/', 200, content, float('inf')))


@contextmanager
def timer(count):
    '''Time this block.'''
    start = time.time()
    try:
        yield count
    finally:
        duration = time.time() - start
        print('Total: %s' % duration)
        print('  Avg: %s' % (duration / count))
        print(' Rate: %s' % (count / duration))
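A hypothetical benchmark loop using the preloaded cache and the timer above; the iteration count, URL, and agent string are assumptions, not part of the original script.

# time repeated allowed() checks against the rules added for example.com above
with timer(100000) as count:
    for _ in range(count):
        cache.allowed('http://example.com/page', 'my-agent')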
import sqlite3
import urllib
import time

from bs4 import BeautifulSoup
from reppy.cache import RobotsCache
from reppy.robots import Robots

#################################################
default_crawl_delay = 5

# caching robots.txt files for fast access
robots_cache = RobotsCache(capacity=200)

# db commit rate
commit_rate = 1
current_r = 0

#################################################
db_location = 'content.db'
conn = sqlite3.connect(db_location)
cur = conn.cursor()

#################################################

#################################################
# populate url_frontier
url_frontier = set()

cur.execute("SELECT `url_link` FROM `crawled_urls` WHERE `is_scraped` = 0")
from reppy.cache import RobotsCache

agent = 'spoderman'
sandcrawler = RobotsCache(timeout=2)


def is_allowed(url):
    try:
        return sandcrawler.allowed(url, agent)
    except:
        return False


def crawl_delay(url):
    try:
        delay = sandcrawler.delay(url, agent)
        print('Crawl delay for', url, delay)
        return delay if delay else 1
    except:
        return 1
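A short usage sketch of the two wrappers above; the URL is a placeholder and the `time` import is an assumption not present in the original snippet.

import time

url = "https://example.com/some/page"  # placeholder URL
if is_allowed(url):
    time.sleep(crawl_delay(url))  # honor the advertised crawl delay before fetching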
def robots_parse():
    robots = RobotsCache()
    print robots.allowed("http://www.uky.edu/hr/employment", "my-agent")
def setUp(self):
    self.robots = RobotsCache()
class EZWS:
    robo = RobotsCache(capacity=100, cache_policy=ReraiseExceptionPolicy(0))
    data: List[str] = []

    """
    SELF:

    config  json config file
    ua      user agent
    robo    robotcache obj
    soup    current html page soup obj
    raw     raw html from req.get()
    check   check for robot files, keep true
    output  name of output csv file
    """

    def __init__(self, file: Union[str, Dict], ua: str = "", check: bool = True, output: str = "output.csv") -> None:
        self.ua = ua
        self.check = check

        #setting output to false disables file output
        self.output = output

        self.configarr = _listify(file)

    def allowed(self, url: str) -> bool:
        if not self.check:
            return True

        try:
            if self.robo.allowed(url, self.ua):
                return True
            print(url, "is not allowed")
        except ConnectionException:
            print(url, "seems to be down")
        return False

    def download(self, url: str) -> Optional[Any]:
        if not self.allowed(url):
            return None

        self.raw = requests.get(url).content
        return BeautifulSoup(self.raw, "html.parser")

    def xpath(self, html: str, xp: str) -> List[Any]:
        return cast(List[Any], lxmlhtml.fromstring(html).xpath(xp))

    def select(self, html: Any, json: Dict) -> List[str]:
        xpath = json.get("xpath", "")
        css = json.get("css", "")

        if xpath:
            found = self.xpath(html.getText(), xpath)
            return [found[0]] if self.config["header"] else found

        #assume css was passed
        found = html.select(css)
        if self.config["header"]:
            found = [found[0]]

        completed = []
        for item in found:
            output = []
            contents = _listify(json["contents"])
            for content in contents:
                if content and item.has_attr(content):
                    output.append(item[content])
                else:
                    output.append(item.text)
            completed += output
        return completed

    def clear(self) -> None:
        self.data = []

    def load(self, index: int) -> None:
        config = self.configarr[index]
        if isinstance(config, Dict):
            self.config = config
        else:
            if os.path.exists(config):
                with open(config) as f:
                    self.config = json.load(f)
        return None

    def grab(self, index: Optional[int] = None) -> None:
        if index is None:
            #using grab() with no params will grab all configs passed
            for i in range(len(self.configarr)):
                self.grab(i)
            return None

        self.load(index)

        if self.output:
            sc = simplecsv(self.output, mode="w+")
            if self.config["header"]:
                sc.writerow(self.config["header"])

        for json in self.config["links"]:
            for link in chain(*[explode(link) for link in _listify(json["urls"])]):
                if not self.allowed(link):
                    return None

                soup = self.download(link)
                if not soup:
                    print("could not download file")
                    return None

                for divs in soup.select(json["container"]):
                    data = []
                    for grab in json["grab"]:
                        data += self.select(divs, grab)

                    self.data += data
                    if self.output:
                        sc.writerow(data)

        if self.output:
            sc.close()
def _set_robot_rule(self):
    """
    Set the robots.txt rules
    """
    self.rules = RobotsCache().fetch(self.url)
scrape_path = "http://qiita.com/hmatsu47/items/" # 探索対象外URL文字列 exclude_str_list = [ "feed", "rss", "archive", "about", "revision", "like", "follow", "contribution", "comment", "reference", ".md" ] # 探索済みURL scrape_url_list = [] # 抽出した本文 summary_ap_text = [] # 探索する最大ページ数 crawl_limit = 100 # 本文を抽出する最大ページ数 item_limit = 50 # robots.txt判定用 robots_cache = RobotsCache(capacity=crawl_limit) # Watson認証情報 apikey = '【APIキー】' url = '【APIのURL】' # 対象外URLが含まれていないか判定 def is_crawlable_url(url): for es in exclude_str_list: if url.find(es) != -1: break else: robots_flag = robots_cache.allowed(domain, "*") return (robots_flag) return False