Example #1
def manual_add_robot_policies():
    # because some critical sites serve an invalid robots.txt
    # (surprisingly many sites lack a valid robots.txt!)
    site_rp = RobotExclusionRulesParser()
    site_rp.parse('User-agent: * \n' + 'Disallow: /search\n' +
                  'Disallow: /advanced_search\n')
    robots_policies['findingaids.library.northwestern.edu'] = site_rp

    site_rp = RobotExclusionRulesParser()
    site_rp.parse('User-agent: * \n' + 'Disallow: /catalog\n' +
                  'Disallow: /contact\n' + 'Disallow: /downloads\n' +
                  'Disallow: /users\n')
    robots_policies['digitalhub.northwestern.edu'] = site_rp

    site_rp = RobotExclusionRulesParser()
    site_rp.parse('User-agent: * \n' + 'Disallow: /catalog\n')
    robots_policies['images.library.northwestern.edu'] = site_rp
    robots_policies['images.northwestern.edu'] = site_rp
    robots_policies['media.northwestern.edu'] = site_rp
    robots_policies['arch.library.northwestern.edu'] = site_rp

    site_rp = RobotExclusionRulesParser()
    site_rp.parse('User-agent: * \n' + 'Disallow: /?*\n')
    robots_policies['schedule.radiology.northwestern.edu'] = site_rp

    site_rp = RobotExclusionRulesParser()
    try:
        request = urllib2.Request('http://www.ctd.northwestern.edu/robots.txt')
        response = urllib2.urlopen(request, timeout=5)
        content = response.read()
    except Exception:
        # fall back to a minimal default if robots.txt cannot be fetched
        content = 'User-agent: * \n'
    content += ('Disallow: /courses?*\n')
    site_rp.parse(content)
    robots_policies['www.ctd.northwestern.edu'] = site_rp
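
These manually registered policies are presumably consulted elsewhere in the crawler before a URL is requested; a minimal sketch of such a lookup, reusing the same robots_policies dict (the helper name is hypothetical):

from urlparse import urlparse  # Python 2, matching the urllib2 usage above

def is_fetch_allowed(url, user_agent='*'):
    # Hypothetical helper: prefer a manually registered policy when one exists.
    policy = robots_policies.get(urlparse(url).hostname)
    if policy is None:
        return True  # no manual override recorded for this host
    return policy.is_allowed(user_agent, url)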
Example #2
 def __init__(self, info, instance, parent=None):
     super(CrawlerWorker, self).__init__(parent)
     self._instance = instance
     self.running = True
     self.base_url = info['base_url']  # main url of website
     self._links_to_crawl = []  # list of links yet to open
     self.crawled_links = {}  # dictionary of links opened/all links
     self.__parsed_crawled = {}  # list of urls and their html pages
     self.total = 0  # total number of found links
     self.total_crawled = 0  # total number of valid crawled links in website
     self.max_pages = info['max_crawl']  # max pages to crawl
     self.invalid_links_count = 0  # number of broken links found
     self.invalid_links_list = []  # list of broken links found
     self.dynamic = []
     self.info = info
     self.login_url = info['login_url']  # login page url if available
     self._user_agent = 'WASecBot'
     if info['robo_url']:
         self._rb_parser = RobotExclusionRulesParser()
         self._rb_parser.fetch(info['robo_url'])
     else:
         self._rb_parser = None
     self.browser = browser.RoboBrowser(parser="html.parser",
                                        user_agent="WASecBot")
     self.browser.session.verify = False
     self._logged_in = False
     self.running = True
     self._instance.btncrawlcancel.clicked.connect(self.pause)
     self._elapsed = 0
     self.delay = 15
     self._requests = 0
     self.start = None
     urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
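
Elsewhere in the worker these rules are presumably checked before each page is visited; a minimal sketch of such a guard on the same class (method name hypothetical):

 def _robots_allows(self, url):
     # Hypothetical helper: crawl freely when no robots.txt URL was supplied.
     if self._rb_parser is None:
         return True
     return self._rb_parser.is_allowed(self._user_agent, url)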
Example #3
 def __get_robot_handler(url):
     rp = RobotExclusionRulesParser()
     if Util.is_url(url):
         # fetch robots.txt from the site's base url once
         base_url = Util.get_base_url(url)
         rp.fetch(urljoin(base_url, 'robots.txt'))
     return rp
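
The returned parser is then queried once per URL before fetching it; a brief sketch, with placeholder URL and user agent, ignoring the name mangling that would apply if this helper is a class method:

page_url = 'https://example.com/articles/42'
rp = __get_robot_handler(page_url)
if rp.is_allowed('MyCrawler', page_url):
    pass  # allowed to fetch page_url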
Example #4
    def __init__(self, robots_doc):
        ''' Initialize from database document representation. '''
        self._updated_at = robots_doc['updated_at']
        self._robots = RobotExclusionRulesParser()

        if robots_doc['file'] is not None:
            try:
                self._robots.parse(robots_doc['file'])
            except Exception:
                # malformed robots.txt: leave the parser empty (allow all)
                pass
Example #5
 def __init__(self, content=None, expires=None):
     super(RerpWrapper, self).__init__(content, expires)
     if content:
         self.parser = RobotExclusionRulesParser()
         self.parser.use_local_time = False
         self.parser.expiration_date = self.expires
         self.parser.parse(content)
     else:
         self.parser = None
         self.my_super = super(RerpWrapper, self)
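
Because expiration_date is copied from self.expires, callers can later check whether the cached rules are still fresh; a minimal sketch, assuming a RerpWrapper instance named wrapper (is_expired is a read-only property of the underlying parser):

if wrapper.parser is not None and not wrapper.parser.is_expired:
    allowed = wrapper.parser.is_allowed('MyBot', 'http://example.com/private/')
else:
    allowed = True  # nothing cached, or the cache expired: refetch before trusting the rules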
Example #6
def crawl_website(website):
    website.update_robots_txt()  # only updates if necessary
    rules = RobotExclusionRulesParser()
    rules.parse(website.robots_content)

    # TODO add check for site last updated timestamp

    # Has the index been retrieved yet?
    if not website.webpage_set.exists():
        # get index
        if rules.is_allowed('*', '/'):
            webpage = Webpage.objects.create(
                local_url='/',
                robots_allowed=True,
                website=website,
            )
            crawl_existing_webpage(webpage, rules)
        else:
            # create a placeholder index webpage
            webpage = Webpage.objects.create(
                local_url='/',
                robots_allowed=False,
                website=website,
            )
            print 'Robots not allowed to index root'
            return None

    # Are there webpages to be accessed?
    allowed_webpages = website.webpage_set.filter(robots_allowed=True)
    if not allowed_webpages.exists():
        # print 'no allowed webpages found for {website}'.format(website=website.url)
        return None

    # Are there new links to try out?
    new_webpages = allowed_webpages.filter(exists=None)
    if new_webpages.exists():
        # start with the oldest first
        # created and updated are the same for newly-created webpages
        webpage = new_webpages.order_by('created').first()
        print 'crawling new'
        return crawl_existing_webpage(webpage, rules)

    # Crawl an existing webpage
    if rules.is_allowed('*', '/foo.html'):
        webpage = allowed_webpages.filter(
            exists=True).order_by('updated').first()
        print 'crawling existing'
        return crawl_existing_webpage(webpage, rules)
Example #7
    def allowed_url(self):
        #FIXME: Should use the geturl address as it may have been redirected
        scheme, netloc, path, query, fragment = urlsplit(self.url)
        robot_url = urlunsplit([scheme, netloc, "robots.txt", "", ""])

        #FIXME: Should cache robots.txt in a better persistent data structure
        if robot_url in ROBOT_CACHE:
            rp = ROBOT_CACHE[robot_url]
        else:
            rp = RobotExclusionRulesParser()
            try:
                rp.fetch(robot_url)
            # Currently if there's a problem we assume there is no robots.txt
            except IOError:
                # Should be catching the urllib2.URLError exception
                logging.debug("Couldn't retrieve robots.txt for %s" %
                              robot_url)
                rp = None
            except UnicodeDecodeError:
                logging.debug("Unicode decode error for robots.txt at %s" %
                              robot_url)
                rp = None
            except httplib.HTTPException:
                logging.debug("Generic HTTPException for robots.txt at %s" %
                              robot_url)
                rp = None
            ROBOT_CACHE[robot_url] = rp

        if rp is None or rp.is_allowed("*", self.url):
            base_url = urlunsplit([scheme, netloc, "", "", ""])

            # If there's a current delay on the site respect robots.txt and stall
            if self.db.exists(netloc):
                logging.debug("Obeying robot overlord for %s..." % netloc)
                URLHandler.add_to_busy(self.db, self.url)
                return False

            # Set a delay for any other requests to this site to respect robots.txt
            delay = rp.get_crawl_delay("*") if rp else None
            if delay:
                delay = int(math.ceil(float(rp.get_crawl_delay("*"))))
            else:
                delay = SETTINGS["DEFAULT_ROBOTS_DELAY"]
            self.db.setex(netloc, "1", delay)

            return True
        else:
            return False
Example #8
 def __init__(self, robotstxt_body, spider):
     from robotexclusionrulesparser import RobotExclusionRulesParser
     self.spider = spider
     self.rp = RobotExclusionRulesParser()
     try:
         robotstxt_body = robotstxt_body.decode('utf-8')
     except UnicodeDecodeError:
         # If we found garbage or robots.txt in an encoding other than UTF-8, disregard it.
         # Switch to 'allow all' state.
         logger.warning("Failure while parsing robots.txt using %(parser)s."
                        " File either contains garbage or is in an encoding other than UTF-8, treating it as an empty file.",
                        {'parser': "RobotExclusionRulesParser"},
                        exc_info=sys.exc_info(),
                        extra={'spider': self.spider})
         robotstxt_body = ''
     self.rp.parse(robotstxt_body)
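
In Scrapy's robots.txt parser interface this parse step is typically paired with a per-URL lookup; a rough sketch of such a companion method on the same class (its exact signature in Scrapy may differ):

 def allowed(self, url, user_agent):
     # Sketch: delegate the per-URL decision to the underlying parser.
     return self.rp.is_allowed(user_agent, url)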
Example #9
    def check(self, hostkey, relurl):
        """ Return True if allowed to fetch, False if not, None
        if we do not have robots.txt for this entry. """

        robotstxt, expiration = self.robots.get(hostkey, (None, None))

        if robotstxt is None:
            return None

        # FIXME: mtime?  we need to let robots.txt expire.

        robotparser = RobotExclusionRulesParser()
        if expiration is not None:
            robotparser.expiration_date = expiration
        if robotparser.is_expired:
            return None

        robotparser.parse(robotstxt)
        return robotparser.is_allowed('*', hostkey + relurl)
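
Note that RobotExclusionRulesParser exposes parse(text) and is_allowed(user_agent, url) rather than the standard library's set_url/can_fetch pair that the original snippet hinted at; a standalone version of the same check, with a hypothetical robots.txt body:

parser = RobotExclusionRulesParser()
parser.parse('User-agent: *\nDisallow: /private\n')
parser.is_allowed('*', 'http://example.com/private/page')  # False
parser.is_allowed('*', 'http://example.com/public/page')   # True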
Example #10
 def __init__(self, robotstxt_body, spider):
     from robotexclusionrulesparser import RobotExclusionRulesParser
     self.spider = spider
     self.rp = RobotExclusionRulesParser()
     robotstxt_body = decode_robotstxt(robotstxt_body, spider)
     self.rp.parse(robotstxt_body)
Example #11
def benchmark_rerp_parser(website):
    from robotexclusionrulesparser import RobotExclusionRulesParser
    rp = RobotExclusionRulesParser()
    rp.parse(website['robotstxt'])
    for link in website['links']:
        rp.is_allowed('googlebot', link)
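
To turn this into an actual timing, the call can be wrapped with timeit; a rough harness, with a hypothetical fixture matching the expected 'website' structure:

import timeit

website = {
    'robotstxt': 'User-agent: *\nDisallow: /search\n',
    'links': ['http://example.com/', 'http://example.com/search?q=x'] * 50,
}
elapsed = timeit.timeit(lambda: benchmark_rerp_parser(website), number=100)
print('%.3f s for 100 parse+check runs' % elapsed)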
Example #12
 def __init__(self, url):
     self.url = Url(urljoin(url, '/robots.txt'))
     self.rerp = RobotExclusionRulesParser()
     self.rerp.user_agent = 'Mozilla/5.0'
     self.rerp.fetch(self.url.url())
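
The user_agent set here is the agent string sent when fetching robots.txt; permission checks still take the agent explicitly. A minimal sketch, assuming the class above is named RobotsChecker (name hypothetical):

checker = RobotsChecker('http://example.com/some/page')
delay = checker.rerp.get_crawl_delay('Mozilla/5.0')  # None if robots.txt sets no Crawl-delay
if checker.rerp.is_allowed('Mozilla/5.0', 'http://example.com/some/page'):
    pass  # fetch the page, sleeping for `delay` between requests if it is set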
Example #13
class Crawler:
    start_page_url = ''
    rerp = RobotExclusionRulesParser()
    cFiles = CrawlerFiles()
    tld = ''
    waiting_url_set = set()
    crawled_url_set = set()
    bad_url_set = set()
    find_string_set = set()
    find_flname_set = set()
    found_flname_set = set()
    found_string_set = set()
    stop_request = False
    download_chunk_size = 0
    conn_timeout = 0
    delay = 0
    user_agent = ''
    bad_url_prefix = '->Bad Url '
    found_string_prefix = '->Found '
    found_flname_prefix = '->Saved '

    def __init__(self, save_dir, start_url, find_flname_set, find_string_set,
                 chunk_size, conn_timeout, default_delay, user_agent):
        logger.info('->Starting RERP')
        Crawler.rerp.fetch(start_url + '/robots.txt')
        Crawler.user_agent = user_agent
        delay = Crawler.rerp.get_crawl_delay(Crawler.user_agent)
        Crawler.conn_timeout = conn_timeout
        if delay is None:
            Crawler.delay = default_delay
        else:
            Crawler.delay = delay
        Crawler.cFiles = CrawlerFiles(save_dir, start_url)
        logger.info('->Getting Previous Session files (if any) ')
        Crawler.crawled_url_set = Crawler.cFiles.get_file_data(
            Crawler.cFiles.crawled_file)
        Crawler.found_flname_set = Crawler.cFiles.get_file_data(
            Crawler.cFiles.found_files_file)
        Crawler.found_string_set = Crawler.cFiles.get_file_data(
            Crawler.cFiles.found_strings_file)
        Crawler.bad_url_set = Crawler.cFiles.get_file_data(
            Crawler.cFiles.invalid_file)
        Crawler.waiting_url_set = Crawler.cFiles.get_file_data(
            Crawler.cFiles.waiting_file)
        info = Crawler.cFiles.get_file_data(Crawler.cFiles.info_file)

        Crawler.start_page_url = start_url

        Crawler.tld = url_func.return_tld(start_url)

        Crawler.find_string_set = find_string_set
        Crawler.find_flname_set = find_flname_set
        Crawler.download_chunk_size = chunk_size
        logger.info('Crawler Initiated')
        logger.info('->Loading Website Info')
        logger.debug('* ' * 20 + 'Website Info' + '* ' * 20)
        if info is None:
            info = url_func.get_domain_info(Crawler.tld)
            Crawler.cFiles.set_file_data(Crawler.cFiles.info_file, info)
        for key in info:
            val = info[key]
            if val:
                logger.debug("%-20s : %s" % (str(key).upper(), str(val)))
        logger.debug('* ' * 40)

    @staticmethod
    def crawl_page(t_name, page_url):
        # noinspection PyBroadException
        try:
            logger.debug("%s - %s" % (t_name, page_url))
            if not Crawler.rerp.is_allowed(Crawler.user_agent, page_url):
                logger.debug('->%s not allowed to crawl %s' %
                             (t_name, page_url))
                return
            Crawler.add_urls(page_url)
            if not Crawler.stop_request:
                Crawler.waiting_url_set.remove(page_url)
                Crawler.crawled_url_set.add(page_url)
                time.sleep(Crawler.delay)
        except requests.HTTPError as h:
            string = "HTTP Error %d - %s" % (h.response.status_code, page_url)
            logger.debug(Crawler.bad_url_prefix + string)
            Crawler.bad_url_set.add(string)
            Crawler.waiting_url_set.remove(page_url)
            Crawler.crawled_url_set.add(page_url)
        except requests.ReadTimeout:
            string = "Timeout %0.1f secs - %s " % (Crawler.conn_timeout,
                                                   page_url)
            logger.debug(Crawler.bad_url_prefix + string)
            Crawler.bad_url_set.add(string)
            Crawler.waiting_url_set.remove(page_url)
            Crawler.crawled_url_set.add(page_url)
        except requests.TooManyRedirects as t:
            string = "%s - %s" % (t, page_url)
            logger.debug(Crawler.bad_url_prefix + string)
            Crawler.bad_url_set.add(string)
            Crawler.waiting_url_set.remove(page_url)
            Crawler.crawled_url_set.add(page_url)
        except (requests.ConnectionError, requests.ConnectTimeout):
            if url_func.check_connection() != url_func.CONNECTION_OK:
                Crawler.wait(t_name)
        except Exception:
            logger.exception('Exception in %s ' % page_url)
            Crawler.waiting_url_set.remove(page_url)
            Crawler.crawled_url_set.add(page_url)

    @staticmethod
    def add_urls(page_url):
        not_html = False
        with closing(
                requests.get(page_url,
                             stream=True,
                             timeout=Crawler.conn_timeout)
        ) as page:  # html code of page
            type_of_page = page.headers[
                'Content-Type']  # get content type from header of html page
            page.raise_for_status()
            if 'html' in type_of_page:  # web page
                soup = BeautifulSoup(
                    page.content, "html.parser")  # parse the content of page
                text = soup.text
                for string in Crawler.find_string_set:
                    if Crawler.stop_request:  # if stop is requested by user
                        return
                    str_url = string + ' ' + page_url
                    if text is not None and string in text:
                        Crawler.found_string_set.add(str_url)
                        logger.debug(
                            '%s %s %s' %
                            (Crawler.found_string_prefix, string, page_url))
                for a_tag_content in soup.find_all('a'):
                    if Crawler.stop_request:  # if stop is requested by user
                        return
                    url = parse.urljoin(Crawler.start_page_url,
                                        a_tag_content.get('href'))
                    if '#' in url:
                        url = url.split('#')[0]
                    if ' ' in url:
                        url = url.replace(' ', '%20')
                    if url_func.return_tld(url) == Crawler.tld:
                        if url not in Crawler.crawled_url_set:
                            Crawler.waiting_url_set.add(url)
            else:
                not_html = True
        if not_html:
            f_name = page_url.split('/')[-1]
            download_file = False
            for string in Crawler.find_flname_set:
                if Crawler.stop_request:
                    break
                if string in f_name:
                    download_file = True
                    break
            if download_file:
                type_split = type_of_page.split('/')
                f_dir = Crawler.cFiles.save_dir + '/' + type_split[0]
                if not dir_exists(f_dir):
                    make_dir(f_dir)
                Crawler.found_flname_set.add(page_url)
                Crawler.file_download(page_url, f_dir, f_name)
                if not Crawler.stop_request:
                    logger.debug('%s %s' %
                                 (Crawler.found_flname_prefix, page_url))

    # wait
    @staticmethod
    def wait(t_name):
        logger.info('->%s waiting for connection...' % t_name)
        while True:
            if Crawler.stop_request:
                break
            if url_func.check_connection() == url_func.CONNECTION_OK:
                break
            time.sleep(2)

    @staticmethod
    def update_files():
        logger.info('Updating Files')
        Crawler.cFiles.set_file_data(Crawler.cFiles.crawled_file,
                                     Crawler.crawled_url_set)
        Crawler.cFiles.set_file_data(Crawler.cFiles.found_files_file,
                                     Crawler.found_flname_set)
        Crawler.cFiles.set_file_data(Crawler.cFiles.found_strings_file,
                                     Crawler.found_string_set)
        Crawler.cFiles.set_file_data(Crawler.cFiles.invalid_file,
                                     Crawler.bad_url_set)
        Crawler.cFiles.set_file_data(Crawler.cFiles.waiting_file,
                                     Crawler.waiting_url_set)

    @staticmethod
    def file_download(file_url, f_dir, f_name):
        f_path = get_file_path(f_dir, f_name)
        # logger.info('Saving  ', f_name)
        dl = file_size(f_path)
        resume_header = {'Range': 'bytes=%d-' % dl}
        with closing(
                requests.get(file_url,
                             stream=True,
                             headers=resume_header,
                             timeout=Crawler.conn_timeout)) as file:
            tl_str = file.headers.get('content-length')
            # no Content-Length header means the server doesn't support resuming, so start over
            mode = 'ab' if tl_str else 'wb'
            with open(f_path, mode) as handle:
                for chunk in file.iter_content(
                        chunk_size=Crawler.download_chunk_size):
                    if Crawler.stop_request:  # if stop is requested by user
                        return
                    if chunk:
                        handle.write(chunk)
Example #14
 def _parse_robots(self, response):
     rp = RobotExclusionRulesParser()
     rp.parse(response.body)
     self._parsers[urlparse_cached(response).netloc] = rp
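
The parser cached per netloc here is presumably consulted again when requests are scheduled; a minimal sketch of that companion lookup on the same class (the agent string 'MyBot' is a placeholder):

 def _is_allowed(self, request):
     # Sketch: fall back to "allowed" when no robots.txt has been parsed yet.
     rp = self._parsers.get(urlparse_cached(request).netloc)
     return rp is None or rp.is_allowed('MyBot', request.url)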
Example #15
    def crawl(self, in_url):
        global global_id, last_update, DOMAIN
        print("Crawler %d on P#%d: %s" % (self.id, url_ids[in_url], in_url))
        try:
            request = urllib2.Request(in_url)
            response = urllib2.urlopen(request, timeout=5)
            real_url = w3lib.url.canonicalize_url(response.geturl())
            real_uri = urlparse(real_url)
            extension = real_uri.path.lower().split('.')[-1]
            if response.info().maintype != 'text' or extension in skip_file_types:
                content = ''
            else:
                content = response.read()
        except:
            real_url = in_url
            content = ''

        if real_url == in_url:  # no redirect
            soup = BeautifulSoup(content, "html.parser")
            raw_urls = [link.get('href') for link in soup.find_all('a')]
        else:  # redirect
            raw_urls = [real_url]

        out_urls = set()
        for url in raw_urls:
            #print('parsing', url)
            if url is None or len(url) <= 1:
                continue

            url = url.strip()

            if url.startswith('/http://') or url.startswith('/https://'):
                # why would someone do this?
                url = url[1:]
            if url.startswith('mailto:') or url.startswith('mailto@'):
                continue

            fixed_url = w3lib.url.canonicalize_url(urljoin(in_url, url))
            if len(fixed_url) > 1000:  # long urls tend to be wrong urls
                continue
            uri = urlparse(fixed_url)
            if uri.scheme is not None and uri.scheme not in [
                    'http', 'https', ''
            ]:
                continue
            if uri.hostname is not None:
                if not uri.hostname.endswith(DOMAIN):
                    continue
                elif uri.hostname not in robots_policies:
                    site_rp = RobotExclusionRulesParser()
                    try:
                        site_rp.fetch('http://' + uri.hostname + '/robots.txt',
                                      timeout=3)
                    except:
                        print "error with", ('http://' + uri.hostname +
                                             '/robots.txt')
                    rp_lock.acquire()
                    robots_policies[uri.hostname] = site_rp
                    rp_lock.release()
                if not (robots_policies[uri.hostname].is_allowed(
                        "*", fixed_url)):
                    continue
            extension = uri.path.lower().split('.')[-1]
            if extension in skip_file_types:
                continue
            if 1 < len(extension) < 8 and '/' not in extension:
                urls_extensions.add(extension)

            out_urls.add(fixed_url)

        #print out_urls
        #get lock
        write_lock.acquire()
        out_ids = []
        for url in out_urls:
            if url in url_ids:
                out_ids.append(url_ids[url])
            else:
                url_ids[url] = global_id
                out_ids.append(global_id)
                url_id_file.write('%d\t%s\n' % (global_id, url))
                url_id_file.flush()
                global_id += 1
                url_tasks.put(url)
        transition_file.write('%d\t%s\n' % (url_ids[in_url], str(out_ids)))
        transition_file.flush()
        last_update = time.time()
        write_lock.release()
        #release lock
        print('%d urls in total reported by %d' % (global_id, self.id))