def test_parse(self):
    from robotparser import RobotFileParser
    rules = RobotFileParser()
    rules.set_url("http://www.sogou.com/robots.txt")
    rules.read()
    self.assertEqual(
        rules.can_fetch("mozilla", "http://www.sogou.com/sohu/robots.txt"),
        False)
def parse_robots(self, netloc, content):
    """
    Parse the given robots.txt content and store against the given domain.
    If content is None, any URL will be allowed.
    """
    robot = RobotFileParser()
    if content is not None:
        robot.parse(content.split("\n"))
    self._robots[netloc] = robot
def accessible(url):
    u = urlparse(url)
    if u.netloc not in robots_cache:
        resp = requests.get('http://%s/robots.txt' % u.netloc)
        rp = RobotFileParser()
        rp.parse(resp.content.splitlines())
        robots_cache[u.netloc] = rp
    return robots_cache[u.netloc].can_fetch('*', url)
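# Hedged usage sketch for accessible() above (not part of the original
# snippet): it assumes a module-level robots_cache dict plus the requests /
# urlparse / RobotFileParser imports shown here; the example URL is made up.
import requests
from urlparse import urlparse
from robotparser import RobotFileParser

robots_cache = {}

if accessible('http://example.com/some/page'):
    print 'allowed by robots.txt'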
def __init__(self, url):
    self.page_url = url
    self.parsed_url = urlparse.urlparse(url)
    self.lang = ""
    self.isDownload = False
    self.title = ""
    self.text = ""
    self.soup = None
    self.robot = RobotFileParser()
def parse_robots(self, netloc, content):
    """
    Parse the given robots.txt content and store against the given domain.
    If content is None, any URL will be allowed.
    """
    robot = RobotFileParser()
    if content is not None:
        robot.parse(content.split("\n"))
    self.execute("UPDATE domain SET robots=? WHERE netloc=?", dumps(robot), netloc)
def _get_robot_parser(self):
    if self.robot_parser_pickle is not None:
        return pickle.loads(base64.b64decode(self.robot_parser_pickle))
    else:
        parser = RobotFileParser()
        parser.set_url(self.protocol + "://" + self.domain + "/robots.txt")
        self.robot_parser = parser
        return parser
def get_robots(url):
    '''
    Initialize robots parser for this domain
    :param url:
    :return:
    '''
    rp = RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp
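# Hedged usage sketch for get_robots() above; the example URLs and the '*'
# user agent are assumptions for illustration only.
rp = get_robots('http://example.com/')
if rp.can_fetch('*', 'http://example.com/private/page.html'):
    pass  # robots.txt allows downloading this page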
def http_open(self, request):
    # request -- urllib2.Request
    url = request.get_full_url()
    host = urlsplit(url)[1]
    robots_url = urlunsplit(('http', host, '/robots.txt', '', ''))
    rp = RobotFileParser(robots_url)
    rp.read()
    if not rp.can_fetch(self.agentname, url):
        raise RuntimeError('Forbidden by robots.txt')
    return urllib2.HTTPHandler.http_open(self, request)
def checkRobots(URL):
    time.sleep(1)
    parsed = urlparse(URL)
    robotsUrl = parsed.scheme + "://" + parsed.netloc + "/robots.txt"
    robotParser = RobotFileParser()
    robotParser.set_url(robotsUrl)
    robotParser.read()
    result = robotParser.can_fetch("*", URL)
    return result
def can_fetch(self, url):
    host, path = urlparse.urlparse(url)[1:3]
    if self.rules.has_key(host):
        return self.rules[host].can_fetch(self.agent, url)
    else:
        rp = RobotFileParser()
        robot_url = "http://" + host + "/robots.txt"
        rp.set_url(robot_url)
        rp.read()
        self.rules[host] = rp
        return rp.can_fetch(self.agent, url)
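# Minimal context sketch (an assumption, not from the original snippet): the
# object that owns can_fetch() above is expected to carry a user-agent string
# and a per-host parser cache. The class name and default agent are made up.
class RobotsAwareFetcher(object):
    def __init__(self, agent='*'):
        self.agent = agent  # user agent checked against robots.txt rules
        self.rules = {}     # host -> RobotFileParser cache reused across calls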
def try_add_robot(self, url):
    parsed_url = urlparse(url)
    if parsed_url.netloc not in self.robots:
        try:
            robot_url = parsed_url.scheme + '://' + parsed_url.netloc + \
                '/robots.txt'
            rp = RobotFileParser(robot_url)
            rp.read()
            self.robots[parsed_url.netloc] = rp
        except IOError as e:
            print str(e)
        except Exception as e:
            print str(e)
def check_robots(self, url):
    '''check the robots.txt in this url's domain'''
    hostname = urlparse(url).netloc
    if hostname not in self.domain_list.keys():  # no records in domain_list
        rp = RobotFileParser('http://%s/robots.txt' % hostname)
        print("%s: fetching %s" % (url, rp.url))
        try:
            rp.read()  # get new robots.txt
        except IOError, e:  # url's server not available (connection timeout)
            log.error(str(e))
            rp.disallow_all = True  # reject all requests
        self.domain_list[hostname] = rp  # add domain entry into domain_list
def disallow(self, url):
    """ TO BE DONE """
    robotFile = urljoin(url, "/robots.txt")
    key = hashlib.sha1(robotFile).hexdigest()
    if not self._dict.has_key(key):
        self._dict[key] = RobotFileParser(robotFile)
        try:
            self._dict[key].read()
        except:
            self._dict[key] = None
    result = self._dict[key] is None or not self._dict[key].can_fetch(
        self._userAgent, url)
    return result
def __init__(self):
    self.rp = RobotFileParser()
    self.rp.set_url('https://www.timeanddate.com/robots.txt')
    self.rp.read()
    if not self.rp.can_fetch('WasThereAHoliday', init_url):
        raise RuntimeError('Scraping forbidden due to robots.txt file')
    self.countries = self.get_countries(self.get_page(init_url))
    try:
        # removing entries which are not countries
        self.countries.remove('un')
    except ValueError:
        pass
    try:
        # removing entries which are not countries
        self.countries.remove('world')
    except ValueError:
        pass
def link_crawler(seed_url, link_regex):
    import re
    from robotparser import RobotFileParser
    crawler_queue = [seed_url]
    seen = {}
    # user_agent and max_try are expected to be defined at module level
    rp = RobotFileParser()
    rp.set_url(urlparse.urljoin(seed_url, '/robots.txt'))
    rp.read()
    while crawler_queue:
        url = crawler_queue.pop()
        html = download(url, now=1)
        for link in get_links(url):
            depth = seen.get(link, 1)
            if (re.match(link_regex, link) and link not in seen
                    and rp.can_fetch(user_agent, link) and depth != max_try):
                seen[link] = depth + 1
                link = urlparse.urljoin(seed_url, link)
                crawler_queue.append(link)
            else:
                seen[link] = depth
def can_read(url):
    domain = domain_name(url)
    if domain not in Permissions:
        rp = RobotFileParser()
        rp.set_url(urljoin('http://' + domain, 'robots.txt'))
        try:
            rp.read()
        except:
            return False
        Permissions[domain] = rp
    res = False
    try:
        res = Permissions[domain].can_fetch("*", url)
    except:
        return False
    return res
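# Sketch of the module-level names can_read() above relies on (illustrative
# assumptions): a Permissions cache dict and a domain_name() helper that
# extracts the host from a URL.
from urlparse import urlparse, urljoin
from robotparser import RobotFileParser

Permissions = {}

def domain_name(url):
    # assumed helper: return the network location (host[:port]) of the URL
    return urlparse(url).netloc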
def _get_soup(path):
    """Gets soup from the given path, respecting robots.txt"""
    full_path = BASE_URL + path
    # Set a user-agent
    user_agent = 'dcnotify/%s' % __version__
    http_headers = {'User-Agent': '%s' % user_agent}
    # Honor robots.txt
    robots = RobotFileParser()
    robots.set_url("%s/robots.txt" % BASE_URL)
    robots.read()
    if not robots.can_fetch(user_agent, full_path):
        raise ValueError("Path disallowed by robots.txt")
    # Make a request, raising any HTTP errors that might occur
    request = get(full_path, headers=http_headers)
    request.raise_for_status()
    return bs(request.text)
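# Hedged usage sketch for _get_soup() above: BASE_URL, __version__, get
# (requests.get) and bs (BeautifulSoup) must already exist in the module;
# the '/schedule' path is a hypothetical example.
soup = _get_soup('/schedule')
print soup.title  # the page's <title> tag, if present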
def __init__(self, starturl, index_html='', maxlevel=1,
             cookie_file=None, acldb=None, urldb=None, default_charset=None,
             delay=0, timeout=300, debug=0):
    (proto, self.hostport, _x, _y, _z) = urlsplit(starturl)
    # assert proto == 'http'
    # Thread.__init__(self)
    self.debug = debug
    self.index_html = index_html
    if cookie_file:
        self.cookiejar = MozillaCookieJar(cookie_file)
        self.cookiejar.load()
    else:
        self.cookiejar = None
    self.robotstxt = RobotFileParser()
    self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
    try:
        self.robotstxt.read()
    except IOError:
        pass
    self.conn = None
    self.urldb = urldb
    self.acldb = acldb
    self.curlevel = 0
    self.delay = delay
    self.timeout = timeout
    self.default_charset = default_charset
    if starturl.endswith('/'):
        starturl += self.index_html
    self.urls = [(starturl, maxlevel)]
    self.crawled = {}  # 1:injected, 2:crawled
    return
def __init__(self, main_page=None, robotrules=True):
    """
    Constructor method that initializes the members that are used during the
    crawling process
    :param main_page: The root page that needs to be crawled for generation
                      of sitemap
    """
    logging.info("Consider robots.txt ? ==> " + str(robotrules))
    self.robotrules = robotrules
    self.site_map = {}  # map that records the visits of urls, datemodified and assets
    self.network = {}   # map that maintains the network/graph of webpages visited;
                        # the intention of this map is for visual rendering using d3.js
    self.unvisited = set([])  # a set to keep the list of urls yet to be visited
    self.start_page = None    # the root page, used to avoid cycles and to keep the
                              # crawl process limited to a single domain
    self.robot_txt_rules = None
    if main_page:
        self.unvisited.add(main_page)
        try:
            self.start_page = urlparse(main_page).netloc
        except:
            logging.error("Improper URL, Please provide a Valid Url:" + main_page)
            exit(0)
    if self.robotrules == "True":
        try:
            logging.info("robots.txt respected")
            self.robot_txt_rules = RobotFileParser()
            self.robot_txt_rules.set_url(main_page + "/robots.txt")
            self.robot_txt_rules.read()
        except:
            logging.error("Unable to read the robots.txt file")
            self.robotrules = False  # error reading robots.txt, ignore it forever
def parse_robots(self, robots_text):
    # parse() expects the robots.txt body as a sequence of lines;
    # read() would instead try to fetch a URL, so it is not used here
    self.robots_parser = RobotFileParser()
    self.robots_parser.parse(robots_text.splitlines())
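# Hedged example of feeding parse_robots() above: fetch the robots.txt body
# with urllib2 and pass the text in. The URL and the 'crawler' instance are
# assumptions for illustration.
import urllib2
robots_text = urllib2.urlopen('http://example.com/robots.txt').read()
crawler.parse_robots(robots_text)
if crawler.robots_parser.can_fetch('*', 'http://example.com/page'):
    pass  # allowed by robots.txt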
def _create_robot_file_parser(self, url):
    host = urlparse.urlsplit(url)[1]
    robots_url = urlparse.urlunsplit(('http', host, '/robots.txt', '', ''))
    rp = RobotFileParser(robots_url)
    rp.read()
    return rp
def __init__(self, link):
    self.CurLink = link
    self.r = RobotFileParser()
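# Hedged sketch of how the parser created in __init__ above might be used
# later; the method name, the urljoin import and the '*' user agent are
# assumptions, not part of the original class.
def can_visit(self):
    from urlparse import urljoin
    self.r.set_url(urljoin(self.CurLink, '/robots.txt'))
    self.r.read()
    return self.r.can_fetch('*', self.CurLink)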