Example #1
def robotsTxtParse(url, config, logger):
    # Finds the robots.txt of a domain and subdomain (if one exists) and
    # stores it in DataStore.robotsCheck
    scheme = urlparse(url).scheme  # scheme needed to read robots.txt

    domain = getDomain(url)
    #val=r.hget(robotsCheck,"bhh").decode('utf-8')
    if domain != '' and domain not in DataStore.robotsCheck:
    #if domain != '' and domain not in r.hexists(robotsCheck, domain):
        robotTxtUrl = f"{scheme}://{domain}/robots.txt"
        robot = RobotFileParser(config, logger)
        robot.set_url(robotTxtUrl)
        robot.read()
        #r.hset(robotsCheck, domain, robot)
        DataStore.robotsCheck[domain] = robot

    subdomain = getSubDomain(url)
    if subdomain != '' and subdomain not in DataStore.robotsCheck:
    #if subdomain != '' and not r.hexists(robotsCheck,subdomain):
        robotTxtUrl = f"{scheme}://{subdomain}/robots.txt"
        robot = RobotFileParser(config, logger)
        robot.set_url(robotTxtUrl)
        robot.read()
        #r.hset(robotsCheck, subdomain, robot)
        DataStore.robotsCheck[subdomain] = robot
Example #2
def can_fetch(url, robots_url, useragent):
    from urllib.robotparser import RobotFileParser

    parser = RobotFileParser()
    parser.set_url(robots_url)
    parser.read()
    return parser.can_fetch(useragent, url)
Example #3
def get_robots_parser_if_exists(url):
    """
    Attempts to parse the robots.txt file for a URL.

    :param url: the URL to request.
    :type url: str
    :return: a RobotFileParser object if a valid robots.txt is found; otherwise None.
    :rtype: RobotFileParser or None
    """

    if not url.startswith('http'):
        url = 'http://' + url

    parsed_url = urlparse(url)
    robot_path = '{url.scheme}://{url.netloc}/robots.txt'.format(
        url=parsed_url)

    try:
        r = requests.head(robot_path)
        if r.status_code < 300:
            rp = RobotFileParser()
            rp.set_url(robot_path)
            rp.read()
            return rp
        else:
            return None
    except RequestException:
        return None
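
For reference, a minimal usage sketch of the helper above; the domain and page path are placeholders, not taken from the original code:

# Hypothetical usage; 'example.com' and the page path are placeholders.
rp = get_robots_parser_if_exists('example.com')
if rp is not None:
    print(rp.can_fetch('*', 'http://example.com/some/page'))
else:
    print('No robots.txt found; fall back to a default crawling policy.')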
Example #4
 def test_robot(self):
     # Test robots.txt parsing
     from urllib.robotparser import RobotFileParser
     rp = RobotFileParser()
     rp.set_url(test_url + 'robots.txt')
     rp.read()
     print(rp.can_fetch('*', test_url))
Example #5
File: models.py Project: Towhidn/quaero
	def is_allowed(self):
		if self.site.robots_status==200:
			parser = RobotFileParser()
			lines = io.StringIO(self.site.robots).readlines()
			parser.parse(lines)
			return parser.can_fetch(settings.USER_AGENT, self.get_url())
		return True
Example #6
def get_robot_file_parser(start_url: str,
                          **kwargs) -> Union[RobotFileParser, None]:
    """Returns :class:`~python:urllib.robotparser.RobotFileParser` object from given URL.
    If no ``robots.txt`` file is found or error occurs, returns ``None``.

    :param start_url: URL from which ``robots.txt`` will be collected.
    :param kwargs: Will be passed to :func:`get_html`.

    .. seealso:: :func:`async_get_robot_file_parser`
    """
    try:
        parsed_url = ParsedUrl(start_url)

        robot_txt_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
        rp = RobotFileParser(robot_txt_url)

        text = get_html(robot_txt_url,
                        check_http_content_type=False,
                        return_response_object=False,
                        raise_for_status=True,
                        **kwargs)

        lines = [line.strip() for line in text.split("\n") if line != '']
        rp.parse(lines)

        return rp
    except Exception as e:  # Exceptions from URL parsing, HTML retrieval and robot file parsing
        logging.warning(
            f"Unable to retrieve robots.txt from {start_url}. Reason: {e}")
        return None
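
A brief, hedged usage sketch of get_robot_file_parser(); the start URL and user agent below are placeholders:

# Hypothetical call; the URL and user agent are placeholders.
rp = get_robot_file_parser("https://example.com")
if rp is not None:
    print(rp.can_fetch("MyBot", "https://example.com/private/"))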
Example #7
 def get_robots(self):
     rp = RobotFileParser()
     if self.robots_content:
         rp.parse(self.robots_content)
     else:
         rp.allow_all = True
     return rp
Example #8
	def checkRobots(self):
		if self.domain[len(self.domain)-1] != "/":
			self.domain += "/"
		# Fetch robots.txt with the crawler's own User-Agent; RobotFileParser.read()
		# would otherwise fetch it with urllib's default User-Agent.
		request = Request(self.domain+"robots.txt", headers={"User-Agent": config.crawler_user_agent})
		self.rp = RobotFileParser()
		self.rp.set_url(self.domain+"robots.txt")
		self.rp.parse(urlopen(request).read().decode("utf-8").splitlines())
Example #9
def analyze_badbots(db, filename, max_entries=100000, max_bots=100):
    useragents = defaultdict(int)

    for i, entry in enumerate(db):
        parser = RobotFileParser(entry.url)
        parser.parse(entry.body.split("\n"))

        # Look for entries whose single rule is "Disallow: /", i.e. user agents
        # that are banned outright (uses RobotFileParser's internal attributes).
        bans = [
            e for e in parser.entries if len(e.rulelines) == 1
            and not e.rulelines[0].allowance and e.rulelines[0].path == '/'
        ]

        for ban in bans:
            for useragent in ban.useragents:
                useragents[useragent] += 1

        if i >= max_entries:
            break

    useragents = sorted(useragents.items(), key=lambda x: -x[1])
    with open(filename, "w") as output:
        output.write("useragent\tcount\ttype\tinfolink\tcompany\thomepage\n")
        for useragent, count in useragents[:max_bots]:
            agenttype, info, company, homepage = BOT_TYPES.get(
                useragent, ('', '', '', ''))
            if not info:
                info = DEFAULT_USERAGENT_URL % useragent
            output.write(
                "%s\t%s\t%s\t%s\t%s\t%s\n" %
                (useragent, count, agenttype, info, company, homepage))

    return useragents
Example #10
def allowed_to_crawl(url, host_url, scheme):
    '''
    url: the full url, string
    host_url: the domain url, string (eg. wikipedia.org)
    scheme: the communication protocol, string (eg. https)
    '''

    # if host URL is google, assume we are allowed to crawl
    if host_url == 'google.com':
        return True

    # if it is not a link, return False
    if host_url == '' or scheme == '':
        return False

    try:
        # get the robots.txt
        rp = RobotFileParser()
        rp.set_url(scheme + "://" + host_url + "/robots.txt")
        rp.read()

        return rp.can_fetch("*", url)

    except Exception:
        # If robots.txt cannot be fetched or parsed, assume crawling is allowed.
        pass

    return True
Example #11
    def __init__(self, location: str, get_accredited: bool):
        self._robot_parser = RobotFileParser()
        self._robot_parser.set_url(urljoin(ICFG.STARTING_URL, 'robots.txt'))
        self._robot_parser.read()
        self._site_maps = self._robot_parser.site_maps()

        self.location = location.lower()
        self._acc = get_accredited
Example #12
 def __init__(self, base_url):
     try:
         url = urljoin(base_url, 'robots.txt')
         self.rp = RobotFileParser()
         self.rp.set_url(url)
         self.rp.read()
     except Exception:
         self.rp = None
Example #13
 def __init__(self, robotstxt_body, spider):
     from urllib.robotparser import RobotFileParser
     self.spider = spider
     robotstxt_body = decode_robotstxt(robotstxt_body,
                                       spider,
                                       to_native_str_type=True)
     self.rp = RobotFileParser()
     self.rp.parse(robotstxt_body.splitlines())
Example #14
 def __init__(self, basicUrl, pageLimit, level):
     self._basicUrl = basicUrl
     self._robotParser = RobotFileParser()
     self._urlRegex = re.compile(basicUrl)
     self._pageCount = pageLimit  #A limit of pages
     self._pageHeap = PriorityQueue()
     self._visited = set()
     self._level = level
Example #15
 def _robots(self):
     robots = RobotFileParser()
     r = fetch_raw(self.url.site + 'robots.txt', strict=False)
     if r is None:
         robots.parse(self.DEFAULT_ROBOTS.splitlines())
     else:
         robots.parse(r.text.splitlines())
     return robots
Example #16
    def robot_rules(self, user_agent: str):
        robot_parser = RobotFileParser(url=self.url_robots)
        robot_parser.read()

        return {
            'can_fetch': robot_parser.can_fetch(user_agent, self.url),
            'crawl_delay': robot_parser.crawl_delay(user_agent),
            'request_rate': robot_parser.request_rate(user_agent),
        }
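
A minimal sketch of how the returned rule dictionary might be consumed; `scraper` (an object exposing robot_rules()) and the 'MyBot' user agent are assumptions for illustration:

# Hypothetical consumer; 'scraper' is assumed to expose robot_rules() as above.
rules = scraper.robot_rules('MyBot')
if rules['can_fetch']:
    print('allowed; crawl delay:', rules['crawl_delay'],
          'request rate:', rules['request_rate'])
else:
    print('blocked by robots.txt')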
Example #17
def rootFileParser():
    rp = RobotFileParser()
    rp.set_url("http://www.jianshu.com/robots.txt")
    rp.read()
    print(rp.can_fetch("*", "https://www.jianshu.com/p/b67554025d7d"))
    print(
        rp.can_fetch(
            "*",
            "http://www.jianshu.com/search?q=python&page=1&type=collections"))
Example #18
def set_url():  # Set the robots.txt URL with the set_url() method.
    rp = RobotFileParser()
    rp.set_url('http://www.bilibili.com/robots.txt')
    rp.read()
    # Use can_fetch() to check whether a page may be crawled.
    print(rp.can_fetch('*', 'http://www.bilibili.com/vide/BV15J411T7WQ'))
    print(
        rp.can_fetch(
            '*',
            'http://www.bilibili.com/search?q=python&page=1&type=collections'))
Example #19
def get_robots(url):
    """
    获取Robots解析器
    :param url: robots.txt存在路径
    :return: Robots解析器
    """
    rp = RobotFileParser()
    rp.set_url(url)
    rp.read()
    return rp
Example #20
def parse_robots(site: ParseResult) -> RobotFileParser:
    """
    Process a robots file for the specified domain.
    :param site: parsed URL (ParseResult) of the site to check
    :return: a RobotFileParser loaded from the site's robots.txt
    """
    robots = RobotFileParser()
    robots.set_url(f"{site.scheme}://{site.netloc}/robots.txt")
    robots.read()
    return robots
Example #21
 def get_robot_txt(cls, url):
     try:
         rp = RobotFileParser()
         parsed_url = urlparse(url)
         robots_url = "{0.scheme}://{0.netloc}/robots.txt".format(parsed_url)
         rp.set_url(robots_url)
         rp.read()
         return rp.can_fetch("*", url)
     except Exception as e:
         raise Exception(e.args[0])
Example #22
def parse():  # Read and analyze robots.txt yourself with the parse() method.
    rp = RobotFileParser()
    rp.parse(
        urlopen('http://www.bilibili.com/robots.txt').read().decode(
            'utf-8').split('\n'))
    print(rp.can_fetch('*', 'http://www.bilibili.com/vide/BV15J411T7WQ'))
    print(
        rp.can_fetch(
            '*',
            'http://www.bilibili.com/search?q=python&page=1&type=collections'))
Example #23
def robot_parse():
    rp = RobotFileParser()
    rp.set_url('http://www.jianshu.com/robots.txt')
    rp.read()

    print(rp.can_fetch('*', 'http://www.jianshu.com/p/'))
    print(
        rp.can_fetch(
            '*',
            "http://www.jianshu.com/search?q=python&page=1&type=collections"))
Example #24
def check_robot_txt(url):
    try:
        rp = RobotFileParser(url + '/robots.txt')
        rp.read()
        print(rp.can_fetch('*', url + '/stock/'))
    except URLError as e:
        print(e.reason)
    return
Example #25
 def can_fetch(self, url):
     parsed_url = urlparse(url)
     # Fetching and parsing the robots.txt file can be expensive in itself.
     # Let's cache the RobotFileParser instances, one per host, on the
     # scraper itself to reuse them for consecutive queries.
     rfp = self.robot_file_parsers.get(parsed_url.hostname)
     if rfp is None:
         rfp = RobotFileParser(self.get_robot_url(url))
         rfp.read()
         self.robot_file_parsers[parsed_url.hostname] = rfp
     return rfp.can_fetch(self.user_agent, parsed_url.path)
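
The same per-host caching idea as a self-contained sketch; the class name RobotsCache and its interface are illustrative, not part of the original scraper:

from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser


class RobotsCache:
    """Cache one RobotFileParser per host so robots.txt is fetched only once."""

    def __init__(self, user_agent='*'):
        self.user_agent = user_agent
        self._parsers = {}

    def can_fetch(self, url):
        parsed = urlparse(url)
        rfp = self._parsers.get(parsed.hostname)
        if rfp is None:
            rfp = RobotFileParser(f"{parsed.scheme}://{parsed.netloc}/robots.txt")
            rfp.read()
            self._parsers[parsed.hostname] = rfp
        return rfp.can_fetch(self.user_agent, url)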
Example #26
 def __init__(self, page):
     self.root = page
     self.parsed_uri = urlparse(page)
     self.home_page = "{uri.scheme}://{uri.netloc}/".format(
         uri=self.parsed_uri)
     self.to_crawl = set()
     self.crawled = set()
     rp = RobotFileParser()
     rp.set_url(self.home_page + "robots.txt")
     rp.read()
     self.rp = rp
Example #27
 def __getParser(self, url):
     if url == '':
         return False
     site = 'https://' + url + '/robots.txt'
     # print("robotparse: " + site)
     try:
         rp = RobotFileParser(site)
         rp.read()
     except Exception as e:
         return False
     else:
         return rp
Example #28
    def request(self, url: str) -> None:
        """ Perform robots.txt request """
        if self.state is not None:
            return

        try:
            self.state = RobotFileParser()
            self.state.set_url(url)
            self.state.read()

        except Exception:
            self.state = False
Example #29
def check_robots(base_url, ext_url):
    '''
    Check the robots.txt file.
    Prints whether base_url + ext_url may be crawled.
    '''
    bot = RobotFileParser(base_url + '/robots.txt')
    bot.read()
    if bot.can_fetch('*', base_url + ext_url):
        print('robots.txt permits parsing')
    else:
        print('Do not parse')
    return bot
Example #30
def robots_get(url, *args, **kwargs):
    u = urlparse(url)
    robot_url = '{scm}://{loc}/robots.txt'.format(scm=u.scheme, loc=u.netloc)
    robot = RobotFileParser(robot_url)
    robot.read()
    ua = kwargs.get('headers', dict()).get('User-Agent', '*')
    if not robot.can_fetch(ua, url):
        return 'Not Allowed By robots.txt'
    delay = robot.crawl_delay(ua)
    if delay:
        time.sleep(delay)
    return requests.get(url, *args, **kwargs)
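
A hedged usage example for robots_get(); the URL and User-Agent header below are placeholders:

# Hypothetical call; URL and User-Agent are placeholders.
result = robots_get('https://example.com/page',
                    headers={'User-Agent': 'MyBot/1.0'})
if isinstance(result, str):
    print(result)                  # 'Not Allowed By robots.txt'
else:
    print(result.status_code)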