Example #1
def robotsTxtParse(url, config, logger):
    # Finds the robots.txt of a domain and subdomain (if one exists) and
    # stores it in DataStore.robotsCheck.
    scheme = urlparse(url).scheme  # scheme needed to read robots.txt

    domain = getDomain(url)
    #val=r.hget(robotsCheck,"bhh").decode('utf-8')
    if domain != '' and domain not in DataStore.robotsCheck:
    #if domain != '' and domain not in r.hexists(robotsCheck, domain):
        robotTxtUrl = f"{scheme}://{domain}/robots.txt"
        robot = RobotFileParser(config, logger)
        robot.set_url(robotTxtUrl)
        robot.read()
        #r.hset(robotsCheck, domain, robot)
        DataStore.robotsCheck[domain] = robot

    subdomain = getSubDomain(url)
    if subdomain != '' and subdomain not in DataStore.robotsCheck:
    #if subdomain != '' and not r.hexists(robotsCheck,subdomain):
        robotTxtUrl = f"{scheme}://{subdomain}/robots.txt"
        robot = RobotFileParser(config, logger)
        robot.set_url(robotTxtUrl)
        robot.read()
        #r.hset(robotsCheck, subdomain, robot)
        DataStore.robotsCheck[subdomain] = robot
Example #2
 def test_robot(self):
     # Test fetching and parsing robots.txt
     from urllib.robotparser import RobotFileParser
     rp = RobotFileParser()
     rp.set_url(test_url + 'robots.txt')
     rp.read()
     print(rp.can_fetch('*', test_url))
Example #3
File: website.py Project: rebryk/spbau-ir
class Website:
    def __init__(self, scheme: str, hostname: str):
        self.scheme = scheme
        self.hostname = hostname
        self.last_time = 0

        self._urls = set()
        self._queue = deque()

        # parse robots.txt
        self._robot_parser = RobotFileParser()
        self._robot_parser.set_url("{}://{}/robots.txt".format(
            scheme, hostname))
        self._robot_parser.read()

    def can_fetch(self, user_agent: str, url: str) -> bool:
        return self._robot_parser.can_fetch(user_agent, url)

    def add_url(self, url: str, depth: int = 0):
        if url not in self._urls:
            self._urls.add(url)
            self._queue.append((url, depth))

    def get_url(self) -> (str, int):
        return self._queue.popleft()

    def crawl_delay(self, user_agent: str) -> int:
        delay = self._robot_parser.crawl_delay(user_agent)
        return delay * 300 if delay is not None else None

    def is_empty(self) -> bool:
        return len(self._queue) == 0
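# A brief usage sketch of the Website class above (not part of the original
# project); the host name and user agent below are placeholders:
site = Website("https", "example.com")
site.add_url("https://example.com/", depth=0)
if not site.is_empty():
    url, depth = site.get_url()
    if site.can_fetch("my-crawler", url):
        print("allowed to fetch", url, "at depth", depth)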
Example #4
def allowed_to_crawl(url, host_url, scheme):
    '''
    url: the full URL, string
    host_url: the domain URL, string (e.g. wikipedia.org)
    scheme: the communication protocol, string (e.g. https)
    '''

    # if host URL is google, assume we are allowed to crawl
    if host_url == 'google.com':
        return True

    # if it is not a link, return False
    if host_url == '' or scheme == '':
        return False

    try:
        # get the robots.txt
        rp = RobotFileParser()
        rp.set_url(scheme + "://" + host_url + "/robots.txt")
        rp.read()

        return rp.can_fetch("*", url)

    except:
        pass

    return True
Example #5
def get_robots_parser_if_exists(url):
    """
    Attempts to parse the robots.txt file for a url.

    :param url: the url to request.
    :type url: str
    :return: a RobotFileParser object if a valid robots.txt is found; otherwise None.
    :rtype: RobotFileParser or None
    """

    if not url.startswith('http'):
        url = 'http://' + url

    parsed_url = urlparse(url)
    robot_path = '{url.scheme}://{url.netloc}/robots.txt'.format(
        url=parsed_url)

    try:
        r = requests.head(robot_path)
        if r.status_code < 300:
            rp = RobotFileParser()
            rp.set_url(robot_path)
            rp.read()
            return rp
        else:
            return None
    except RequestException:
        return None
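# A short usage sketch of the helper above (not part of the original example);
# the domain is a placeholder:
rp = get_robots_parser_if_exists('example.com')
if rp is not None:
    print(rp.can_fetch('*', 'http://example.com/some/page'))
else:
    print('no usable robots.txt found')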
Example #6
def can_fetch(url, robots_url, useragent):
    from urllib.robotparser import RobotFileParser

    parser = RobotFileParser()
    parser.set_url(robots_url)
    parser.read()
    return parser.can_fetch(useragent, url)
Example #7
def robotExclusion(link):
    # print("Robot exclusion.....")
    rp = RobotFileParser()
    rp.set_url(urljoin(link, '/robots.txt'))
    rp.read()
    # print(rp.can_fetch("*", link))
    return rp.can_fetch("*", link)
Example #8
def rootFileParser():
    rp = RobotFileParser()
    rp.set_url("http://www.jianshu.com/robots.txt")
    rp.read()
    print(rp.can_fetch("*", "https://www.jianshu.com/p/b67554025d7d"))
    print(
        rp.can_fetch(
            "*",
            "http://www.jianshu.com/search?q=python&page=1&type=collections"))
Example #9
 def get_robot_txt(cls, url):
     try:
         rp = RobotFileParser()
         parsed_url = urlparse(url)
         robots_url = "{0.scheme}://{0.netloc}/robot.txt".format(parsed_url)
         rp.set_url(robots_url)
         rp.read()
         return rp.can_fetch("*", url)
     except Exception as e:
         raise Exception(e.args[0])
Example #10
def set_url():  # Set the robots.txt URL via set_url().
    rp = RobotFileParser()
    rp.set_url('http://www.bilibili.com/robots.txt')
    rp.read()
    # Use can_fetch() to check whether a page may be crawled.
    print(rp.can_fetch('*', 'http://www.bilibili.com/vide/BV15J411T7WQ'))
    print(
        rp.can_fetch(
            '*',
            'http://www.bilibili.com/search?q=python&page=1&type=collections'))
Example #11
 def check_robots_txt(self, parsed_link):
     """
     Checks the site's robots.txt file to make sure our user agent is allowed to visit that url.
     :param parsed_link:
     :return: boolean. True if we're allowed to visit (or there's no robots.txt)
     """
     rp = RobotFileParser()
     rp.set_url(urljoin(parsed_link.geturl(), '/robots.txt'))
     rp.read()
     return rp.can_fetch(self.user_agent, parsed_link.geturl())
Example #12
def get_robots(url):
    """
    Get a robots.txt parser.
    :param url: URL where the robots.txt lives
    :return: a RobotFileParser instance
    """
    rp = RobotFileParser()
    rp.set_url(url)
    rp.read()
    return rp
Example #13
def parse_robots(site: ParseResult) -> RobotFileParser:
    """
    Process a robots file for the specified domain.
    :param site: parsed URL (ParseResult) of the site
    :return: a RobotFileParser populated from the site's robots.txt
    """
    robots = RobotFileParser()
    robots.set_url(f"{site.scheme}://{site.netloc}/robots.txt")
    robots.read()
    return robots
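# A short usage sketch (not part of the original example); parse_robots expects
# an already-parsed URL (ParseResult), and the address below is a placeholder:
from urllib.parse import urlparse

site = urlparse("https://example.com/some/page")
robots = parse_robots(site)
print(robots.can_fetch("*", site.geturl()))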
Example #14
def robot_parse():
    rp = RobotFileParser()
    rp.set_url('http://www.jianshu.com/robots.txt')
    rp.read()

    print(rp.can_fetch('*', 'http://www.jianshu.com/p/'))
    print(
        rp.can_fetch(
            '*',
            "http://www.jianshu.com/search?q=python&page=1&type=collections"))
Example #15
class Driver:
    def __init__(self,
                 root_url: str,
                 header: str,
                 access_delay: int = 3,
                 cookies: dict = None,
                 logger=None):
        self.logger = logger
        self.root_url = root_url
        self.cookies = cookies
        self.header = header
        self.access_delay = access_delay
        self.now_content = None
        self.robots = None
        self.load_robots_txt()

    def load_robots_txt(self):
        self.robots = RobotFileParser()
        self.robots.set_url(self.root_url + '/robots.txt')
        self.robots.read()

    def get(self, path):
        try:
            sleep(self.access_delay)
            url = f'{self.root_url}/{path}'
            if self.robots.can_fetch("*", url):
                res = requests.get(url,
                                   headers=self.header,
                                   cookies=self.cookies)
                if self.logger is not None:
                    self.logger.debug(f"Access to {url}.")
                self.now_content = BeautifulSoup(res.text, 'html.parser')
            else:
                if self.logger is not None:
                    self.logger.warning(
                        f"Access to this url is prohibited by robots.txt.\n<*>[URL={url}]"
                    )
        except Exception as e:
            if self.logger is not None:
                self.logger.warning(e)

    def find_element_by_class_name(self, name):
        return self.now_content.select('.' + name)[0]

    def find_elements_by_class_name(self, name):
        return self.now_content.select('.' + name)

    def find_element_by_id(self, name):
        return self.now_content.select('#' + name)[0]

    def find_elements_by_id(self, name):
        return self.now_content.select('#' + name)

    def find_element_by_tag(self, name):
        return self.now_content.find_all(name)
Example #16
 def __init__(self, page):
     self.root = page
     self.parsed_uri = urlparse(page)
     self.home_page = "{uri.scheme}://{uri.netloc}/".format(
         uri=self.parsed_uri)
     self.to_crawl = set()
     self.crawled = set()
     rp = RobotFileParser()
     rp.set_url(self.home_page + "robots.txt")
     rp.read()
     self.rp = rp
Example #17
class Exclusion(object):
    def __init__(self):
        self.robot_cache = {}
        self.rp = RobotFileParser()

    # Broken? Disallows anything listed in the robots.txt,
    # even entries that are marked with Allow:.
    def test_url(self, url):
        self.rp = RobotFileParser()
        robot_url = uu.domain_name(url) + '/robots.txt'
        self.rp.set_url(robot_url)
        self.rp.read()
        return self.rp.can_fetch('*', url)
Example #18
    def is_scraping_allowed(self):
        """
        Checks whether robots.txt forbids scraping. The check is not really
        complete, because it only examines the all-jobs URL.

        :return:
        """
        robot_parser = RobotFileParser()
        robots_url = urljoin(self.base_url, 'robots.txt')
        robot_parser.set_url(robots_url)
        robot_parser.read()
        return robot_parser.can_fetch('*', urljoin(
                self.base_url, self.all_job_url))
Example #19
 def retrieve_site_robots(self):
     url = url_normalize(self.domain + "/robots.txt")
     rp = RobotFileParser()
     rp.set_url(url)
     try:
         rp.read()
         self.robots_content = str(rp)
         if rp.site_maps():
             self.retrieve_sitemap_content(rp)
     except Exception as e:
         self.robots_content = ""
         print(f"ERROR retrieving robots.txt for {self.domain}")
         print(e)
Example #20
def get_robots(seed_url):
    """
    Fetch the site's robots.txt rules.
    :param seed_url:
    :return: a urllib.robotparser.RobotFileParser instance, or None on failure
    """
    rp = RobotFileParser()
    rp.set_url(urljoin(seed_url, '/robots.txt'))
    try:
        rp.read()
    except Exception as e:
        return None
    return rp
Example #21
def GetRobotsTxt(url):
    rp = RobotFileParser()
    rp.set_url(url)
    rp.read()
    print(
        rp.can_fetch(
            '*',
            'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'))
    print(rp.can_fetch('*', 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4'))
    print(
        rp.can_fetch(
            '*',
            'https://book.douban.com/tag/%E6%9D%91%E4%B8%8A%E6%98%A5%E6%A0%91?start=40&type=S'
        ))
Example #22
def can_fetch_url(robots_url, site_url, useragent="*"):
    """
    Using the robots.txt found at robots_url, decides whether useragent may
    fetch site_url.

    :param robots_url: robots.txt url
    :param site_url: to be fetched url
    :param useragent: useragent
    :return: True, if fetching is allowed
    """
    rfp = RobotFileParser()
    rfp.set_url(robots_url)
    rfp.read()
    return rfp.can_fetch(useragent=useragent, url=site_url)
Example #23
def create_sdomain_robot(url: str):
    url = urlparse(url)
    robot = RobotFileParser()
    robot.set_url(url.scheme + '://' + url.netloc + "/robots.txt")

    def can_crawl(url_with_path: str):
        return robot.can_fetch('*', url_with_path)

    # Reading robots.txt may fail; if so, we simply don't register a robot for
    # that subdomain and assume all of its pages can be crawled.
    try:
        robot.read()
        robots[url.netloc] = can_crawl
    except:
        pass
Example #24
def robotsTxtParseSeeds():
    # Stores the robots.txt parsers of the seed URLs in DataStore.robotsCheck.
    seedUrls = ['https://today.uci.edu/department/information_computer_sciences/',
    'https://www.ics.uci.edu',
    'https://www.cs.uci.edu',
    'https://www.informatics.uci.edu',
    'https://www.stat.uci.edu']
    for seedUrl in seedUrls:
        scheme = urlparse(seedUrl).scheme
        domain = getSubDomain(seedUrl)
        robotTxtUrl = f"{scheme}://{domain}/robots.txt"
        robot = RobotFileParser()
        robot.set_url(robotTxtUrl)
        robot.read()
        DataStore.robotsCheck[domain] = robot
Example #25
class RobotsTxt:
    def __init__(self) -> None:
        self.state = None  # type: Any

    def allowed(self, url: URL) -> bool:

        # We don't have any info about this domain yet, so request its
        # robots.txt.
        if self.state is None:
            self.request(url.link("/robots.txt"))

        # We couldn't determine whether a robots.txt exists at all,
        # so we allow everything in that case.
        if self.state is False or self.state.allow_all:
            return True

        if not self.state.last_checked and self.state.disallow_all:
            return False

        # find entry
        return allowed(matched_rules(self._entry(), url))

    def request(self, url: str) -> None:
        """ Perform robots.txt request """
        if self.state is not None:
            return

        try:
            self.state = RobotFileParser()
            self.state.set_url(url)
            self.state.read()

        except Exception:
            self.state = False

    # This is mostly logic transferred from robotparser.py, but we try to follow
    # the 2019 extension of Google's Robots Txt protocol, in which Allow rules
    # can override Disallow rules for more specific paths.
    # https://www.contentkingapp.com/blog/implications-of-new-robots-txt-rfc/
    # https://tools.ietf.org/html/draft-koster-rep-04

    def _entry(self) -> Any:

        for entry in self.state.entries:
            if entry.applies_to(user_agent):
                return entry

        return self.state.default_entry
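# The allowed(), matched_rules() and user_agent names used above come from
# elsewhere in the original module and are not shown. Below is a minimal sketch
# (an assumption, not the original code) of what the two helpers could look like
# under the longest-match rule of the draft REP, reusing the RuleLine objects
# that urllib.robotparser exposes on each entry; the str(url) conversion is also
# an assumption about the URL type used here.
from urllib.parse import quote, urlparse


def matched_rules(entry, url):
    # Collect every RuleLine of the entry whose path prefix matches the request path.
    path = quote(urlparse(str(url)).path) or "/"
    matches = [rule for rule in entry.rulelines if rule.applies_to(path)]
    # Longest (most specific) path wins; on equal length, Allow beats Disallow.
    return sorted(matches, key=lambda r: (len(r.path), r.allowance), reverse=True)


def allowed(rules):
    # With no matching rule, the URL is allowed by default.
    return rules[0].allowance if rules else True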
Example #26
 def get_robots_text(self, url):
     """get_robots_text
     url: site URL whose robots.txt should be checked
     """
     try:
         rp = RobotFileParser()
         # build the robots.txt URL
         parsed_url = urlparse(url)
         robots_url = "{0.scheme}://{0.netloc}/robots.txt".format(parsed_url)
         # fetch and parse robots.txt
         rp.set_url(robots_url)
         rp.read()
         # check whether fetching this URL is allowed
         return rp.can_fetch("*", url)
     except:
         return False
Example #27
def init_robots(allowed_domains):
    robot_filters = {}
    for domain in allowed_domains:
        rp = RobotFileParser()
        url = "https://{}/robots.txt".format(domain)
        rp.set_url(url)
        rp.read()
        robot_filters[domain] = rp

    def crawl_prohibited(robots, url):
        domain = urlparse(url).netloc
        if domain not in robots:
            return True
        return not robots[domain].can_fetch('*', url)

    return lambda url: crawl_prohibited(robot_filters, url)
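# A short usage sketch of the returned closure (not part of the original
# example); the domain is a placeholder:
is_prohibited = init_robots(["example.com"])
if not is_prohibited("https://example.com/some/page"):
    print("allowed to crawl")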
Example #28
 def get_robots_txt(cls, url):
     """ get_robots_txt
     url: robots.txt を確認するサイトURL
     """
     try:
         # robots.txt 用パーサー
         rp = RobotFileParser()
         # robots の url 取得
         parsed_url = urlparse(url)
         robots_url = "{0.scheme}://{0.netloc}/robots.txt".format(
             parsed_url)
         # robots.txt 取得
         rp.set_url(robots_url)
         rp.read()
         # 取得していいか確認
         return rp.can_fetch("*", url)
     except Exception as e:
         raise Exception(e.args[0])
Example #29
def can_read(url):
    domain = domain_name(url)
    if domain not in Permissions:
        rp = RobotFileParser()
        rp.set_url(urljoin('http://' + domain, 'robots.txt'))
        try:
            rp.read()
        except:
            return False

        Permissions[domain] = rp

    res = False
    try:
        res = Permissions[domain].can_fetch("*", url)
    except:
        return False

    return res
Example #30
def read_all_sitemaps(get_accredited: bool, get_page=0):
    """
    get all the companies using the sitemaps. get_page is used for batch processes.
    set it to -1 to get everything. returns a list of the companies in BBB
    """
    robot_parser = RobotFileParser()
    robot_parser.set_url(urljoin(ICFG.STARTING_URL, 'robots.txt'))
    robot_parser.read()
    site_maps = robot_parser.site_maps()

    sitemap_list = [
        i for i in site_maps
        if i.find("sitemap") + 1 and i.find('business-profiles-index') + 1
    ]
    if not get_accredited:
        [sitemap_list.pop(i.find("accredited")) for i in sitemap_list]
        sitemap = sitemap_list[0]
    else:
        sitemap = [
            sitemap_list.pop(i.find("accredited")) for i in sitemap_list
        ][0]

    sitemap_pages = requests.get(sitemap, headers=r.choice(ICFG.HEADERS))
    soup = bs(sitemap_pages.content, "lxml-xml")

    pages_with_business_profiles = [i.get_text() for i in soup.find_all('loc')]

    business_profile_list = []
    if get_page == -1:
        for page in pages_with_business_profiles:
            business_pages = requests.get(page, headers=r.choice(ICFG.HEADERS))
            soup = bs(business_pages.content, "lxml-xml")
            business_profile_list.extend(
                [i.get_text() for i in soup.find_all('loc')])
    else:
        page = pages_with_business_profiles[get_page]
        business_pages = requests.get(page, headers=r.choice(ICFG.HEADERS))
        soup = bs(business_pages.content, "lxml-xml")
        business_profile_list.extend(
            [i.get_text() for i in soup.find_all('loc')])

    return business_profile_list
Example #31
def can_read(url):

    domain = domain_name(url)
    if domain not in Permissions:
        rp = RobotFileParser()
        rp.set_url(urljoin("http://" + domain, "robots.txt"))
        try:
            rp.read()
        except:
            return False

        Permissions[domain] = rp

    res = False
    try:
        res = Permissions[domain].can_fetch("*", url)
    except:
        return False

    return res
Example #32
    def parse(self, response):
        le = LinkExtractor(unique=True)
        # Use robot file parser to check if links extracted are allowed
        rp = RobotFileParser()
        BASE_URL = response.request.url
        DOMAIN = extract(BASE_URL).domain
        rp.set_url(BASE_URL + '/robots.txt')
        rp.read()

        # Extract all the links
        all_links = le.extract_links(response)
        for link in all_links:
            # Ensure same domain
            if extract(link.url).domain == DOMAIN:
                if rp.can_fetch("*", link.url):
                    # Enforcing unique urls
                    if link.url not in self.unique_urls:
                        self.write_to_file(link.url, DOMAIN)
                        self.unique_urls.add(link.url)
                    yield Request(link.url, callback=self.parse)
Example #33
    def get_parsed_robots(self, base_url):
        rp = RobotFileParser()
        robots_url = base_url + '/robots.txt'
        try:
            is_valid = self.check_and_save_robots(robots_url)
            if (not is_valid):
                raise Exception("Not found robots")

            if (base_url in self.parsed_robots_domains.keys()):
                rp = self.parsed_robots_domains[base_url]
            else:
                rp.set_url(robots_url)
                rp.read()
                self.parsed_robots_domains[base_url] = rp
        except:
            # allow all
            rp.set_url('https://ku.ac.th')
            rp.read()
        finally:
            return rp
Example #34
File: arana.py Project: eksop/arana
def parse_robotstxt(url):
    """
    Parse robots.txt
    """

    parsed = urlsplit(url)

    if parsed.scheme not in ['http', 'https']:
        return False

    if parsed.netloc == '':
        return False

    robot = RobotFileParser()
    robot.set_url(parsed.scheme + "://" + parsed.netloc + "/robots.txt")
    robot.read()

    return dict(
        allowed=robot.can_fetch('*', url),
        rate=robot.request_rate('*'),
        delay=robot.crawl_delay('*'),
    )
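# A short usage sketch (not part of the original example); the URL is a
# placeholder. request_rate('*') returns a named tuple with requests and seconds
# attributes (or None if no Request-rate line exists), and crawl_delay('*')
# returns the delay in seconds or None:
info = parse_robotstxt("https://example.org/some/page")
if info:
    print("allowed:", info["allowed"])
    if info["rate"]:
        print("rate: {} requests per {}s".format(info["rate"].requests, info["rate"].seconds))
    print("crawl delay:", info["delay"])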
Example #35
    def can_fetch(self, url):
        parsed = urlparse(url)

        if parsed.netloc in self.robots_txt_dict:
            # 0 means robots.txt could not be fetched for that site,
            # so we assume everything on it may be crawled.
            if self.robots_txt_dict[parsed.netloc] == 0:
                return True
            return self.robots_txt_dict[parsed.netloc].can_fetch('*', url)

        try:
            rp = RobotFileParser()
            rp.set_url('{}://{}/robots.txt'.format(parsed.scheme,
                                                   parsed.netloc))
            rp.read()
            self.robots_txt_dict[parsed.netloc] = rp

            return rp.can_fetch('*', url)
        except:
            print('error getting robots.txt')
            self.robots_txt_dict[parsed.netloc] = 0
            return True
Example #36
class Crawler():
	
	# Variables
	parserobots = False
	output 	= None
	report 	= False

	config 	= None
	domain	= ""

	exclude = []
	skipext = []
	drop    = []
	
	debug	= False

	tocrawl = set([])
	crawled = set([])
	excluded = set([])
	# TODO also search for window.location={.*?}
	linkregex = re.compile(b'<a href=[\'|"](.*?)[\'"].*?>')

	rp = None
	response_code={}
	nb_url=1 # Number of URLs.
	nb_rp=0 # Number of URLs blocked by the robots.txt
	nb_exclude=0 # Number of URLs excluded by extension or word
	
	output_file = None

	target_domain = ""

	def __init__(self, parserobots=False, output=None, report=False ,domain="", exclude=[], skipext=[], drop=[], debug=False):
		self.parserobots = parserobots
		self.output 	= output
		self.report 	= report
		self.domain 	= domain
		self.exclude 	= exclude
		self.skipext 	= skipext
		self.drop		= drop
		self.debug		= debug

		if self.debug:
			logging.basicConfig(level=logging.DEBUG)

		self.tocrawl = set([domain])

		try:
			self.target_domain = urlparse(domain)[1]
		except:
			raise ("Invalid domain")


		if self.output:
			try:
				self.output_file = open(self.output, 'w')
			except:
				logging.debug ("Output file not available.")
				exit(255)

	def run(self):
		print (config.xml_header, file=self.output_file)

		logging.debug("Start the crawling process")
		self.__crawling()
		logging.debug("Crawling as reach the end of all found link")

		print (config.xml_footer, file=self.output_file)


	def __crawling(self):
		crawling = self.tocrawl.pop()

		url = urlparse(crawling)
		self.crawled.add(crawling)
		request = Request(crawling, headers={"User-Agent":config.crawler_user_agent})
		
		try:
			response = urlopen(request)
		except Exception as e:
			if hasattr(e,'code'):
				if e.code in self.response_code:
					self.response_code[e.code]+=1
				else:
					self.response_code[e.code]=1
			logging.debug ("{1} ==> {0}".format(e, crawling))
			return self.__continue_crawling()

		# Read the response
		try:
			msg = response.read()
			if response.getcode() in self.response_code:
				self.response_code[response.getcode()]+=1
			else:
				self.response_code[response.getcode()]=1
			response.close()

			# Get the last modify date
			if 'last-modified' in response.headers:
				date = response.headers['Last-Modified']
			else:
				date = response.headers['Date']

			date = datetime.strptime(date, '%a, %d %b %Y %H:%M:%S %Z')

		except Exception as e:
			logging.debug ("{1} ===> {0}".format(e, crawling))
			return self.__continue_crawling()


		print ("<url><loc>"+url.geturl()+"</loc><lastmod>"+date.strftime('%Y-%m-%dT%H:%M:%S')+"</lastmod></url>", file=self.output_file)
		if self.output_file:
			self.output_file.flush()

		# Found links
		links = self.linkregex.findall(msg)
		for link in links:
			link = link.decode("utf-8")
			#logging.debug("Found : {0}".format(link))		
			if link.startswith('/'):
				link = 'http://' + url[1] + link
			elif link.startswith('#'):
				link = 'http://' + url[1] + url[2] + link
			elif not link.startswith('http'):
				link = 'http://' + url[1] + '/' + link
			
			# Remove the anchor part if needed
			if "#" in link:
				link = link[:link.index('#')]

			# Drop attributes if needed
			for toDrop in self.drop:
				link=re.sub(toDrop,'',link)

			# Parse the url to get domain and file extension
			parsed_link = urlparse(link)
			domain_link = parsed_link.netloc
			target_extension = os.path.splitext(parsed_link.path)[1][1:]

			if (link in self.crawled):
				continue
			if (link in self.tocrawl):
				continue
			if (link in self.excluded):
				continue
			if (domain_link != self.target_domain):
				continue
			if ("javascript" in link):
				continue
			
			# Count one more URL
			self.nb_url+=1

			# Check if the navigation is allowed by the robots.txt
			if (not self.can_fetch(link)):
				self.exclude_link(link)
				self.nb_rp+=1
				continue

			# Check if the current file extension is allowed or not.
			if (target_extension in self.skipext):
				self.exclude_link(link)
				self.nb_exclude+=1
				continue

			# Check if the current url doesn't contain an excluded word
			if (not self.exclude_url(link)):
				self.exclude_link(link)
				self.nb_exclude+=1
				continue

			self.tocrawl.add(link)

		return self.__continue_crawling()

	def __continue_crawling(self):
		if self.tocrawl:
			self.__crawling()

	def exclude_link(self,link):
		if link not in self.excluded:
			self.excluded.add(link)

	def checkRobots(self):
		if self.domain[len(self.domain)-1] != "/":
			self.domain += "/"
		request = Request(self.domain+"robots.txt", headers={"User-Agent":config.crawler_user_agent})
		self.rp = RobotFileParser()
		self.rp.set_url(self.domain+"robots.txt")
		self.rp.read()

	def can_fetch(self, link):
		try:
			if self.parserobots:
				if self.rp.can_fetch("*", link):
					return True
				else:
					logging.debug ("Crawling of {0} disabled by robots.txt".format(link))
					return False

			if not self.parserobots:
				return True

			return True
		except:
			# On error continue!
			logging.debug ("Error during parsing robots.txt")
			return True

	def exclude_url(self, link):
		for ex in self.exclude:
			if ex in link:
				return False
		return True

	def make_report(self):
		print ("Number of found URL : {0}".format(self.nb_url))
		print ("Number of link crawled : {0}".format(len(self.crawled)))
		if self.parserobots:
			print ("Number of link block by robots.txt : {0}".format(self.nb_rp))
		if self.skipext or self.exclude:
			print ("Number of link exclude : {0}".format(self.nb_exclude))

		for code in self.response_code:
			print ("Nb Code HTTP {0} : {1}".format(code, self.response_code[code]))
			
Example #37
class Crawler():

	# Variables
	parserobots = False
	output 	= None
	report 	= False
	config 	= None
	domain	= ""
	
	exclude = []
	skipext = []
	drop    = []

	debug	= False

	tocrawl = set([])
	crawled = set([])
	excluded = set([])

	marked = {}

	not_parseable_ressources = (".epub", ".mobi", ".docx", ".doc", ".opf", ".7z", ".ibooks", ".cbr", ".avi", ".mkv", ".mp4", ".jpg", ".jpeg", ".png", ".gif" ,".pdf", ".iso", ".rar", ".tar", ".tgz", ".zip", ".dmg", ".exe")

	# TODO also search for window.location={.*?}
	linkregex = re.compile(b'<a [^>]*href=[\'|"](.*?)[\'"][^>]*?>')
	imageregex = re.compile (b'<img [^>]*src=[\'|"](.*?)[\'"].*?>')

	rp = None
	response_code={}
	nb_url=1 # Number of URLs.
	nb_rp=0 # Number of URLs blocked by the robots.txt
	nb_exclude=0 # Number of URLs excluded by extension or word

	output_file = None

	target_domain = ""
	scheme		  = ""

	def __init__(self, parserobots=False, output=None, report=False ,domain="",
				 exclude=[], skipext=[], drop=[], debug=False, verbose=False, images=False):
		self.parserobots = parserobots
		self.output 	= output
		self.report 	= report
		self.domain 	= domain
		self.exclude 	= exclude
		self.skipext 	= skipext
		self.drop		= drop
		self.debug		= debug
		self.verbose    = verbose
		self.images     = images

		if self.debug:
			log_level = logging.DEBUG
		elif self.verbose:
			log_level = logging.INFO
		else:
			log_level = logging.ERROR

		logging.basicConfig(level=log_level)

		self.tocrawl = set([self.clean_link(domain)])

		try:
			url_parsed = urlparse(domain)
			self.target_domain = url_parsed.netloc
			self.scheme = url_parsed.scheme
		except:
			logging.error("Invalide domain")
			raise ("Invalid domain")

		if self.output:
			try:
				self.output_file = open(self.output, 'w')
			except:
				logging.error ("Output file not available.")
				exit(255)

	def run(self):
		print(config.xml_header, file=self.output_file)

		if self.parserobots:
			self.check_robots()

		logging.info("Start the crawling process")

		while len(self.tocrawl) != 0:
			self.__crawling(first,domainname,all_link)

		logging.info("Crawling has reached end of all found links")

		print (config.xml_footer, file=self.output_file)


	def __crawling(self,frst,dmname,all_links):
		crawling = self.tocrawl.pop()
		if frst==0:
			dmname=obtaindomain(crawling)
			frst=1
		url = urlparse(crawling)
		self.crawled.add(crawling)
		logging.info("Crawling #{}: {}".format(len(self.crawled), url.geturl()))
		request = Request(crawling, headers={"User-Agent":config.crawler_user_agent})

		# Ignore resources listed in not_parseable_ressources;
		# this avoids downloading files like PDFs, etc.
		if not url.path.endswith(self.not_parseable_ressources):
			try:
				response = urlopen(request)
			except Exception as e:
				if hasattr(e,'code'):
					if e.code in self.response_code:
						self.response_code[e.code]+=1
					else:
						self.response_code[e.code]=1

					# Track URLs marked for error reporting
					if self.report:
						if e.code in self.marked:
							self.marked[e.code].append(crawling)
						else:
							self.marked[e.code] = [crawling]

				logging.debug ("{1} ==> {0}".format(e, crawling))
				return self.__continue_crawling()
		else:
			logging.debug("Ignore {0} content might be not parseable.".format(crawling))
			response = None

		# Read the response
		if response is not None:
			try:
				msg = response.read()
				if response.getcode() in self.response_code:
					self.response_code[response.getcode()]+=1
				else:
					self.response_code[response.getcode()]=1

				response.close()

				# Get the last modify date
				if 'last-modified' in response.headers:
					date = response.headers['Last-Modified']
				else:
					date = response.headers['Date']

				date = datetime.strptime(date, '%a, %d %b %Y %H:%M:%S %Z')

			except Exception as e:
				logging.debug ("{1} ===> {0}".format(e, crawling))
				return None
		else:
			# Response is None: the content was not downloaded, so just continue
			# and add the link to the sitemap
			msg = "".encode()
			date = None

		# Image sitemap enabled?
		image_list = ""
		if self.images:
			# Search for images in the current page.
			images = self.imageregex.findall(msg)
			for image_link in list(set(images)):
				image_link = image_link.decode("utf-8", errors="ignore")

				# Ignore link starting with data:
				if image_link.startswith("data:"):
					continue

				# If the path starts with //, reuse the current URL scheme
				if image_link.startswith("//"):
					image_link = url.scheme + ":" + image_link
				# Append domain if not present
				elif not image_link.startswith(("http", "https")):
					if not image_link.startswith("/"):
						image_link = "/{0}".format(image_link)
					image_link = "{0}{1}".format(self.domain.strip("/"), image_link.replace("./", "/"))

				# Ignore image if path is in the exclude_url list
				if not self.exclude_url(image_link):
					continue

				# Ignore other domain images
				image_link_parsed = urlparse(image_link)
				if image_link_parsed.netloc != self.target_domain:
					continue


				# Include the image only if robots.txt allows fetching it
				if self.can_fetch(image_link):
					logging.debug("Found image : {0}".format(image_link))
					image_list = "{0}<image:image><image:loc>{1}</image:loc></image:image>".format(image_list, self.htmlspecialchars(image_link))

		# Last mod fetched ?
		lastmod = ""
		if date:
			lastmod = "<lastmod>"+date.strftime('%Y-%m-%dT%H:%M:%S+00:00')+"</lastmod>"

		print ("<url><loc>"+self.htmlspecialchars(url.geturl())+"</loc>" + lastmod + image_list + "</url>", file=self.output_file)
		tempdom=obtaindomain(self.htmlspecialchars(url.geturl()))
		templink=self.htmlspecialchars(url.geturl())
		if tempdom==dmname:
			if templink not in all_links:
				outputprint(templink)
				all_links.append(templink)
		if self.output_file:
			self.output_file.flush()

		# Found links
		links = self.linkregex.findall(msg)
		for link in links:
			link = link.decode("utf-8", errors="ignore")
			link = self.clean_link(link)
			logging.debug("Found : {0}".format(link))

			if link.startswith('/'):
				link = url.scheme + '://' + url[1] + link
			elif link.startswith('#'):
				link = url.scheme + '://' + url[1] + url[2] + link
			elif link.startswith(("mailto", "tel")):
				continue
			elif not link.startswith(('http', "https")):
				link = url.scheme + '://' + url[1] + '/' + link

			# Remove the anchor part if needed
			if "#" in link:
				link = link[:link.index('#')]

			# Drop attributes if needed
			for toDrop in self.drop:
				link=re.sub(toDrop,'',link)

			# Parse the url to get domain and file extension
			parsed_link = urlparse(link)
			domain_link = parsed_link.netloc
			target_extension = os.path.splitext(parsed_link.path)[1][1:]

			if link in self.crawled:
				continue
			if link in self.tocrawl:
				continue
			if link in self.excluded:
				continue
			if domain_link != self.target_domain:
				continue
			if parsed_link.path in ["", "/"]:
				continue
			if "javascript" in link:
				continue
			if self.is_image(parsed_link.path):
				continue
			if parsed_link.path.startswith("data:"):
				continue

			# Count one more URL
			self.nb_url+=1

			# Check if the navigation is allowed by the robots.txt
			if not self.can_fetch(link):
				self.exclude_link(link)
				self.nb_rp+=1
				continue

			# Check if the current file extension is allowed or not.
			if (target_extension in self.skipext):
				self.exclude_link(link)
				self.nb_exclude+=1
				continue

			# Check if the current url doesn't contain an excluded word
			if (not self.exclude_url(link)):
				self.exclude_link(link)
				self.nb_exclude+=1
				continue

			self.tocrawl.add(link)

		return None

	def clean_link(self, link):
		l = urlparse(link)
		l_res = list(l)
		l_res[2] = l_res[2].replace("./", "/")
		l_res[2] = l_res[2].replace("//", "/")
		return urlunparse(l_res)

	def is_image(self, path):
		mt, me = mimetypes.guess_type(path)
		return mt is not None and mt.startswith("image/")

	def __continue_crawling(self):
		if self.tocrawl:
			self.__crawling(first,domainname,all_link)

	def exclude_link(self,link):
		if link not in self.excluded:
			self.excluded.add(link)

	def check_robots(self):
		robots_url = urljoin(self.domain, "robots.txt")
		self.rp = RobotFileParser()
		self.rp.set_url(robots_url)
		self.rp.read()

	def can_fetch(self, link):
		try:
			if self.parserobots:
				if self.rp.can_fetch("*", link):
					return True
				else:
					logging.debug ("Crawling of {0} disabled by robots.txt".format(link))
					return False

			if not self.parserobots:
				return True

			return True
		except:
			# On error continue!
			logging.debug ("Error during parsing robots.txt")
			return True

	def exclude_url(self, link):
		for ex in self.exclude:
			if ex in link:
				return False
		return True

	def htmlspecialchars(self, text):
		return text.replace("&", "&amp;").replace('"', "&quot;").replace("<", "&lt;").replace(">", "&gt;")

	def make_report(self):
		print ("Number of found URL : {0}".format(self.nb_url))
		print ("Number of link crawled : {0}".format(len(self.crawled)))
		if self.parserobots:
			print ("Number of link block by robots.txt : {0}".format(self.nb_rp))
		if self.skipext or self.exclude:
			print ("Number of link exclude : {0}".format(self.nb_exclude))

		for code in self.response_code:
			print ("Nb Code HTTP {0} : {1}".format(code, self.response_code[code]))

		for code in self.marked:
			print ("Link with status {0}:".format(code))
			for uri in self.marked[code]:
				print ("\t- {0}".format(uri))
Example #38
File: crawler.py Project: nchikkam/hk
class SiteMap():
    """ This class composes of all the functionalities needed to generate site_map"""

    def __init__(self, main_page=None, robotrules=True, threadcount=1):
        """ctor that checks args and decides to enable single or multithreaded
           generation of sitemap
        """
        logging.info("Consider Robot.txt ? ==> "+str(robotrules))
        self.robotrules = robotrules
        self.site_map = {}

        self.unvisited = set([])
        self.start_page = None

        self.robot_txt_rules = None

        if main_page:
            self.unvisited.add(main_page)
            try:
                self.start_page = urlparse(main_page).netloc
            except:
                logging.error("Improper URL, Please provide a Valid Url:"+main_page)
                exit(0)

        if self.robotrules == "True":
            try:
                logging.info("robot.txt respected")
                self.robot_txt_rules = RobotFileParser()
                self.robot_txt_rules.set_url(main_page + "/robots.txt")
                self.robot_txt_rules.read()
            except:
                logging.error("Unable to read the robot.txt file")
                self.robotrules = False # error reading robot.txt, ignore it forever

        self.threadcount = int(threadcount)

    def execute(self):
        if self.threadcount <= 1: # if single threaded model is chosen, avoid threading
            self.generate()
        else:
            self.start()          # fasten by multi threads

    def start(self):
        """This creates a pool of chosen limit so as to have the control and
           spawns the main function and waits until process and subsequently
           spawned process finish.
        """
        self.pool = pool.Pool(self.threadcount)
        self.pool.spawn(self.generate_parallels)
        self.pool.join()

        self.generate_reports()


    def generate(self):
        """Non multithreaded model method that crawls until all pages are
           crawled and assets are extracted. Once its done, it creates the
           sitemap and assets json file for the given domain.
        """
        while self.unvisited:
            self.crawl()

        self.generate_reports()

    def generate_reports(self):
        """composes the xml tags with the keys in site_map member which are
           nothing but the sitemap urls
        """
        header = """<?xml version="1.0" encoding="UTF-8"?>
                            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                            xmlns:xhtml="http://www.w3.org/1999/xhtml"
                            xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                            xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
                            http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
                        """
        footer = """\n</urlset>\n"""
        entry = "\t<url><loc>%s</loc></url>\n"

        xml = header
        for url in self.site_map.keys():
            xml += entry % (url)

        xml += footer
        name = self.start_page.replace(".", "_")
        self.create_file("%s.xml" % (name), xml)
        self.create_file("%s_assets.json" % (name), json.dumps(self.site_map, indent=2, sort_keys=True))


    def generate_parallels(self):
        """
            This method is similar to a recursive approach in that it crawls pages and
            drains the queue, self.unvisited. It stops when there are no URLs left to
            crawl and all threads in the pool are idle, i.e. no longer active because
            crawling has finished. Since it spawns a new thread instead of calling
            itself directly, this is a reasonable way to go about it for now :)
            [Note:] There is a recursion limit in Python; it can be increased with
            sys.setrecursionlimit(1500)

            An assumption made in this implementation is that a website with more than
            500 nested links needs a somewhat more elaborate design for storing the
            assets, which might grow beyond a hundred MB. In such cases this can simply
            be converted into a loop. Moreover, there are no extra stack variables.
        """
        self.crawl()
        while len(self.unvisited) > 0 and not self.pool.full():
            self.pool.spawn(self.generate_parallels)

    def create_file(self, file, content):
        """writes the given content to the file"""
        f = open(file, 'w')
        f.write(content)
        f.close()

    def compose_url_from_href(self, page, href):
        """composes a proper url from domainlink and intralinks with in the page"""
        url = urlparse(page)

        if href.startswith('/'):
            return "http://%s%s"%(url.netloc, href)
        elif href.startswith('#'):
            return "http://%s%s%s"%(url.netloc, url.path, href)
        elif href.startswith('./'):
            return "http://%s%s"%(url.netloc, href[1:])
        elif not href.startswith('http'):
            return "http://" + url.netloc + '/' + href
        elif href.endswith('/'):
            return href[:-1]

        return href

    def get_out_going_links(self, page, html_body):
        """extracts all the outgoing links and adds links that belong to
           main page domain for further crawling if they are not crawled yet
           This avoids:
            - links that are .zip files
            - links mentioned in href that are javascript methods
            - mailto: links

        """
        soup = BeautifulSoup(html_body, "html.parser")
        valid_links_for_this_page = []
        for a in soup.find_all('a', href=True):

            href = a['href'].lower()
            href = self.compose_url_from_href(page, href)

            # clean the href so that it will have legitimate urls instead of #cluttered ones and q=param prints
            href = urldefrag(href)[0]  # skip intra links [this took time to find out !] ##1
            # remove query params as only the path matters
            if href.find('?') != -1:
                href = href[:href.find('?')]  ##2

            new_page = urlparse(href)

            # add to the queue only if it doesn't cause a cycle
            # assumption: if a link's host ends with the start domain, it can be crawled to make the sitemap complete
            if  not str(new_page.netloc).endswith(self.start_page):          # doesn't belong to domain
                valid_links_for_this_page.append(href)
                continue

            if  self.robot_allows(href) and \
                not href in self.site_map            and \
                not href in self.unvisited                  and \
                not 'javascript:' in href           and \
                not 'mailto:' in href:
                if not ( href.endswith(".zip") or
                             href.endswith(".gz") or
                             href.endswith(".gzip") or
                             href.endswith(".tar") or
                             href.endswith(".bz2") or
                             href.endswith(".jpg") or
                             href.endswith(".png") or
                             href.endswith(".exe")
                         ):
                    self.unvisited.add(href)
                valid_links_for_this_page.append(href)

        return valid_links_for_this_page

    def get_assets(self, page, headers, html_body):
        """A nice feature of response header is that it reports the last-modified
           time of the link on the server. If we are doing regular crawling, we can
           avoid if the link is not updates since the last time. This method is
           useful for indexing the data so as to minimize the crawling effort to
           save execution time.
           It updates the site_map dictionary with the links, css, images and scripts
        """
        if 'last-modified' in headers:
            date = headers['Last-Modified']
        else:
            date = headers['Date']

        soup = BeautifulSoup(html_body, "html.parser")
        img = soup.findAll("img")
        css = soup.findAll("link", {"rel": "stylesheet"})
        js = soup.findAll('script')

        self.site_map[page] = {
            'date': date,
            'links': self.get_out_going_links(page, html_body),
            'css': [c['href'] for c in css],
            'img': [i['src'] for i in img],
            'js': [x.get('src', 'inline jscode') for x in js]
        }


    def crawl(self):
        """This actually opens the url and calls the assets method """
        if len(self.unvisited) <= 0:
            return
        page = self.unvisited.pop()
        if page in self.site_map:
            return
        logging.info("Starting to Crawl Page: " + page)

        try:
            response = self.access_page(page)
            if (response.status_code != 200):
                return None

            html_body = response.text

            self.get_assets(page, response.headers, html_body)
        except:
            logging.error("Issue while opening url: %s" + page)
            return None
        logging.debug("Crawled Pages: {}".format(len(self.site_map)))

    def access_page(self, url):
        """accesses the url from the server. This method was created
            to enable mock tests.
        """
        return requests.get(url)

    def get_site_map(self):
        """exposes site_map"""
        return self.site_map

    def set_start_page(self, url):
        """sets the start page for the crawler"""
        self.start_page = url

    def robot_allows(self, link):
        """method to check if link can be accessed as per robot rules"""
        if not self.robotrules: return True
        try:
            if self.robot_txt_rules.can_fetch("*", link):
                return True
            return False
        except:
            return True
Example #39
    def _get_robot_parser(self):
        parser = RobotFileParser()
        parser.set_url(self.protocol + "://" + self.domain + "/robots.txt")

        return parser