Example #1
 def test_parse(self):
     from robotparser import RobotFileParser
     rules = RobotFileParser()
     rules.set_url("http://www.sogou.com/robots.txt")
     rules.read()
     self.assertEqual(
         rules.can_fetch("mozilla", "http://www.sogou.com/sohu/robots.txt"),
         False)
Example #2
 def parse_robots(self, netloc, content):
     """ Parse the given robots.txt content and store against the given
         domain. If content is None, any URL will be allowed.
     """
     robot = RobotFileParser()
     if content is not None:
         robot.parse(content.split("\n"))
     self._robots[netloc] = robot
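A minimal sketch of how the parsers stored by parse_robots() might be consulted later; the is_allowed helper and the "*" default agent are illustrative additions, not part of the original project, and urlparse is assumed to be imported from the urlparse module:

 def is_allowed(self, url, agent="*"):
     # Look up the parser recorded for this URL's host; a host whose
     # robots.txt was never stored at all is treated as unrestricted.
     netloc = urlparse(url).netloc
     robot = self._robots.get(netloc)
     if robot is None:
         return True
     return robot.can_fetch(agent, url)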
Example #3
def accessible(url):
    u = urlparse(url)
    if u.netloc not in robots_cache:
        resp = requests.get('http://%s/robots.txt' % u.netloc)
        rp = RobotFileParser()
        rp.parse(resp.content.splitlines())
        robots_cache[u.netloc] = rp
    return robots_cache[u.netloc].can_fetch('*', url)
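For context, accessible() depends on module-level imports and a shared cache roughly like the following; this is a sketch of the assumed surroundings, not code taken from the original project:

import requests
from urlparse import urlparse            # urllib.parse on Python 3
from robotparser import RobotFileParser  # urllib.robotparser on Python 3

robots_cache = {}  # parsed robots.txt rules cached per host (netloc)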
Example #4
 def __init__(self, url):
     self.page_url = url
     self.parsed_url = urlparse.urlparse(url)
     self.lang = ""
     self.isDownload = False
     self.title = ""
     self.text = ""
     self.soup = None
     self.robot = RobotFileParser()
Example #5
 def parse_robots(self, netloc, content):
     """ Parse the given robots.txt content and store against the given
         domain. If content is None, any URL will be allowed.
     """
     robot = RobotFileParser()
     if content is not None:
         robot.parse(content.split("\n"))
     self.execute("UPDATE domain SET robots=? WHERE netloc=?", dumps(robot),
                  netloc)
Example #6
    def _get_robot_parser(self):
        if self.robot_parser_pickle is not None:
            return pickle.loads(base64.b64decode(self.robot_parser_pickle))
        else:
            parser = RobotFileParser()
            parser.set_url(self.protocol + "://" + self.domain + "/robots.txt")
            self.robot_parser = parser

            return parser
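The assignment self.robot_parser = parser implies a matching setter that keeps robot_parser_pickle up to date. One possible counterpart, shown purely as an illustration of the pickle/base64 round trip used by the getter:

    def _set_robot_parser(self, parser):
        # Serialize the parser so it can be stored next to the domain record.
        self.robot_parser_pickle = base64.b64encode(pickle.dumps(parser))

    robot_parser = property(_get_robot_parser, _set_robot_parser)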
Example #7
def get_robots(url):
    '''
    Initialize robots parser for this domain
    :param url:
    :return:
    '''
    rp = RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp
Example #8
 def http_open(self, request):
     #request -- urllib2.Request
     url = request.get_full_url()
     host = urlsplit(url)[1]
     robots_url = urlunsplit(('http', host, '/robots.txt', '', ''))
     rp = RobotFileParser(robots_url)
     rp.read()
     if not rp.can_fetch(self.agentname, url):
         raise RuntimeError('Forbidden by robots.txt')
     return urllib2.HTTPHandler.http_open(self, request)
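Assuming this http_open() override lives in a urllib2.HTTPHandler subclass with an agentname attribute (the class name RobotsHTTPHandler below is illustrative), it could be installed like any other handler:

opener = urllib2.build_opener(RobotsHTTPHandler())
response = opener.open('http://example.com/page')  # raises RuntimeError when robots.txt forbids the URL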
Example #9
def checkRobots(URL):

    time.sleep(1)
    parsed = urlparse(URL)
    robotsUrl = parsed.scheme + "://" + parsed.netloc + "/robots.txt"
    robotParser = RobotFileParser()
    robotParser.set_url(robotsUrl)
    robotParser.read()
    result = robotParser.can_fetch("*", URL)
    return result
Example #10
 def can_fetch(self, url):
     host, path = urlparse.urlparse(url)[1:3]
     if host in self.rules:
         return self.rules[host].can_fetch(self.agent, url)
     else:
         rp = RobotFileParser()
         robot_url = "http://" + host + "/robots.txt"
         rp.set_url(robot_url)
         rp.read()
         self.rules[host] = rp
         return rp.can_fetch(self.agent, url)
Example #11
 def try_add_robot(self, url):
     parsed_url = urlparse(url)
     if parsed_url.netloc not in self.robots:
         try:
             robot_url = parsed_url.scheme + '://' + parsed_url.netloc + \
                         '/robots.txt'
             rp = RobotFileParser(robot_url)
             rp.read()
             self.robots[parsed_url.netloc] = rp
         except Exception as e:
             print str(e)
Example #12
 def check_robots(self, url):
     '''check the robots.txt in this url's domain'''
     hostname = urlparse(url).netloc
     if hostname not in self.domain_list:  # no record for this domain yet
         rp = RobotFileParser('http://%s/robots.txt' % hostname)
         print("%s: fetching %s" % (url, rp.url))
         try:
             rp.read()  # fetch the new robots.txt
         except IOError as e:  # server not available (e.g. connection timeout)
             log.error(str(e))
             rp.disallow_all = True  # reject all requests
         self.domain_list[hostname] = rp  # add the domain entry to domain_list
Example #13
    def disallow(self, url):
        """Return True if fetching url is disallowed by its host's robots.txt."""
        robotFile = urljoin(url, "/robots.txt")
        key = hashlib.sha1(robotFile).hexdigest()
        if key not in self._dict:
            self._dict[key] = RobotFileParser(robotFile)
            try:
                self._dict[key].read()
            except Exception:
                # robots.txt could not be fetched; treat the host as disallowed
                self._dict[key] = None
        result = self._dict[key] is None or not self._dict[key].can_fetch(
            self._userAgent, url)
        return result
Example #14
 def __init__(self):
     self.rp = RobotFileParser()
     self.rp.set_url('https://www.timeanddate.com/robots.txt')
     self.rp.read()
     if not self.rp.can_fetch('WasThereAHoliday', init_url):
         raise RuntimeError('Scraping forbidden due to robots.txt file')
     self.countries = self.get_countries(self.get_page(init_url))
     try:
         # removing entries which are not countries
         self.countries.remove('un')
     except ValueError:
         pass
     try:
         # removing entries which are not countries
         self.countries.remove('world')
     except ValueError:
         pass
Example #15
def link_crawler(seed_url, link_regex):
    import re
    import urlparse
    from robotparser import RobotFileParser
    # download, get_links, user_agent and max_try come from the surrounding module
    # fetch robots.txt for the seed domain once, before crawling
    rp = RobotFileParser()
    rp.set_url(urlparse.urljoin(seed_url, '/robots.txt'))
    rp.read()
    crawler_queue = [seed_url]
    seen = {}
    while crawler_queue:
        url = crawler_queue.pop()
        html = download(url, now=1)
        depth = seen.get(url, 1)
        for link in get_links(html):
            if not re.match(link_regex, link):
                continue
            link = urlparse.urljoin(seed_url, link)
            if (link not in seen and depth != max_try
                    and rp.can_fetch(user_agent, link)):
                seen[link] = depth + 1
                crawler_queue.append(link)
Example #16
def can_read(url):

    domain = domain_name(url)
    if domain not in Permissions:
        rp = RobotFileParser()
        rp.set_url(urljoin('http://' + domain, 'robots.txt'))
        try:
            rp.read()
        except:
            return False

        Permissions[domain] = rp

    res = False
    try:
        res = Permissions[domain].can_fetch("*", url)
    except:
        return False

    return res
Example #17
def _get_soup(path):
    """Gets soup from the given path, respecting robots.txt"""

    full_path = BASE_URL + path

    # Set a user-agent
    user_agent = 'dcnotify/%s' % __version__
    http_headers = {'User-Agent': '%s' % user_agent}

    # Honor robots.txt
    robots = RobotFileParser()
    robots.set_url("%s/robots.txt" % BASE_URL)
    robots.read()
    if not robots.can_fetch(user_agent, full_path):
        raise ValueError("Path disallowed by robots.txt")

    # Make a request, raising any HTTP errors that might occur
    request = get(full_path, headers=http_headers)
    request.raise_for_status()

    return bs(request.text)
Example #18
 def __init__(self,
              starturl,
              index_html='',
              maxlevel=1,
              cookie_file=None,
              acldb=None,
              urldb=None,
              default_charset=None,
              delay=0,
              timeout=300,
              debug=0):
     (proto, self.hostport, _x, _y, _z) = urlsplit(starturl)
     #    assert proto == 'http'
     #Thread.__init__(self)
     self.debug = debug
     self.index_html = index_html
     if cookie_file:
         self.cookiejar = MozillaCookieJar(cookie_file)
         self.cookiejar.load()
     else:
         self.cookiejar = None
     self.robotstxt = RobotFileParser()
     self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
     try:
         self.robotstxt.read()
     except IOError:
         pass
     self.conn = None
     self.urldb = urldb
     self.acldb = acldb
     self.curlevel = 0
     self.delay = delay
     self.timeout = timeout
     self.default_charset = default_charset
     if starturl.endswith('/'):
         starturl += self.index_html
     self.urls = [(starturl, maxlevel)]
     self.crawled = {}  # 1:injected, 2:crawled
     return
Example #19
    def __init__(self, main_page=None, robotrules=True):
        """
            Constructor method that initializes the members used during the crawling process
        :param main_page: the root page that needs to be crawled for generation of the sitemap
        :param robotrules: whether robots.txt rules should be honoured
        """

        logging.info("Consider robots.txt ? ==> " + str(robotrules))
        self.robotrules = robotrules
        self.site_map = {}  # map that records the visits of urls, datemodified and assets
        self.network = {}  # map that maintains the network/graph of webpages visited
        # The intention of this map is for visual rendering using d3.js

        self.unvisited = set()  # a set to keep the list of urls yet to be visited
        self.start_page = None  # the root page; used to avoid cycles and to keep the
        # crawl limited to a single domain
        self.robot_txt_rules = None

        if main_page:
            self.unvisited.add(main_page)
            try:
                self.start_page = urlparse(main_page).netloc
            except:
                logging.error("Improper URL, Please provide a Valid Url:" +
                              main_page)
                exit(0)

        if self.robotrules:
            try:
                logging.info("robots.txt respected")
                self.robot_txt_rules = RobotFileParser()
                self.robot_txt_rules.set_url(main_page + "/robots.txt")
                self.robot_txt_rules.read()
            except:
                logging.error("Unable to read the robots.txt file")
                self.robotrules = False  # error reading robots.txt, ignore it from now on
Example #20
 def parse_robots(self, robots_text):
     # RobotFileParser() takes a URL, not raw content, and read() would try to
     # fetch that URL; robots.txt text must be fed through parse() instead.
     self.robots_parser = RobotFileParser()
     self.robots_parser.parse(robots_text.splitlines())
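For comparison, a standalone sketch of the same parse()-based flow with content fetched by hand; the example.com URLs are placeholders:

import urllib2
from robotparser import RobotFileParser

robots_body = urllib2.urlopen('http://example.com/robots.txt').read()
rp = RobotFileParser()
rp.parse(robots_body.splitlines())
print rp.can_fetch('*', 'http://example.com/some/page')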
Example #21
 def _create_robot_file_parser(self, url):
     host = urlparse.urlsplit(url)[1]
     robots_url = urlparse.urlunsplit(('http', host, '/robots.txt', '', ''))
     rp = RobotFileParser(robots_url)
     rp.read()
     return rp
Example #22
 def __init__(self, link):
     self.CurLink = link
     self.r = RobotFileParser()