Example #1
def crawl_sitemaps(url, max_depth=1):
    robots_url = parse.urljoin(url, "/robots.txt")
    robots = Robots.fetch(robots_url)
    sitemap_stack = []
    seen_sitemaps = set()

    for url in robots.sitemaps:
        sitemap_stack.append((url, 0))

    all_urls = []
    while sitemap_stack:
        sitemap_url, depth = sitemap_stack.pop()
        if depth >= max_depth:
            continue

        if sitemap_url in seen_sitemaps:
            continue

        seen_sitemaps.add(sitemap_url)
        sitemap = Sitemap(sitemap_url)
        all_urls.extend(sitemap.get_urls())
        for url in sitemap.get_child_sitemap_urls():
            sitemap_stack.append((url, depth + 1))

    return all_urls
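Example #1 relies on a Sitemap helper with get_urls() and get_child_sitemap_urls() that is not shown. A minimal sketch of what such a class might look like, assuming standard sitemap XML (urlset / sitemapindex roots with loc entries); only the class and method names come from the example above, the rest is an assumption:

import requests
import xml.etree.ElementTree as ET

SITEMAP_NS = '{http://www.sitemaps.org/schemas/sitemap/0.9}'


class Sitemap:
    """Hypothetical stand-in: download a sitemap and expose its <loc> entries."""

    def __init__(self, url):
        self.url = url
        self._root = ET.fromstring(requests.get(url, timeout=10).content)

    def get_urls(self):
        # <url><loc> entries of a regular sitemap (<urlset> root)
        return [loc.text.strip()
                for entry in self._root.iter(SITEMAP_NS + 'url')
                for loc in entry.findall(SITEMAP_NS + 'loc')]

    def get_child_sitemap_urls(self):
        # <sitemap><loc> entries of a sitemap index (<sitemapindex> root)
        return [loc.text.strip()
                for entry in self._root.iter(SITEMAP_NS + 'sitemap')
                for loc in entry.findall(SITEMAP_NS + 'loc')]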
Example #2
def allows_crawl(url):
    try:
        #url = "https://www.ics.uci.edu"
        # print("This is the passed url: ", url)
        url = str(url)
        parsed = urlparse(url)
        base_url = parsed.scheme + "://" + parsed.netloc

        # set up parser with robots.txt file!
        if base_url in robot_files:  # if we already downloaded the robots.txt file
            # print("grabbing already found robots.txt")
            rparser = robot_files[base_url]
        else:
            try:

                robot_url = base_url + "/robots.txt"
                rparser = Robots.fetch(robot_url)

                # PUT ROBOTS.TXT FILE INTO HERE!
                robot_files[base_url] = rparser

            except IOError:
                print("IOERROR")
                return False
            except:
                print("some other error?")
                return False
        try:
            return rparser.allowed("*", url)
        except KeyError:
            return False
    except IOError:
        return False
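allows_crawl assumes a module-level robot_files cache shared across calls; something as simple as the following would satisfy the lookups above (the name comes from the example, the initialization is an assumption):

robot_files = {}  # maps base_url -> parsed robots.txt (reppy Robots object)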
Example #3
    def __init__(self,
                 url,
                 limit_pages_counter=1,
                 storage=False,
                 meta={},
                 debug=False,
                 user_agent='Spidar/1.1',
                 allow_external_link_crawling=False,
                 selenium_chrome_driver=None):
        self.__start_url = url
        parse_location = urlparse(url)
        self.__initial_domain_name = parse_location.netloc
        self.__pages = []
        self.__url_to_discover = set()
        self.__url_discovered = set()
        self.__max_counter_pages = limit_pages_counter
        self.__storage = storage
        self.__PATH_STORAGE = '__storage/'
        self.__PATH_SOURCE = self.__PATH_STORAGE + 'sources/'
        self.__PATH_INFO = self.__PATH_STORAGE + 'infos/'
        self.__meta = meta
        self.__debug = debug
        self.__user_agent = user_agent
        self.__allow_external_link_crawling = allow_external_link_crawling
        self.__selenium_chrome_driver = selenium_chrome_driver
        self.__selenium_driver = None
        if self.__selenium_chrome_driver is not None:
            self.__selenium_driver = webdriver.Chrome(
                executable_path=self.__selenium_chrome_driver)

        self.__rp = Robots.fetch(Robots.robots_url(self.__start_url))

        if self.__storage:
            self.__set_up_folders(self.__initial_domain_name)
Example #4
    def scrape_page(self, url):
        options = webdriver.ChromeOptions()
        options.add_argument("headless")
        options.add_experimental_option(
            "prefs",
            {"profile.default_content_settings.cookies": 2})  # disable cookies
        driver = webdriver.Chrome(options=options)
        root_url = '{}://{}'.format(
            urlparse(url).scheme,
            urlparse(url).netloc)  # canonical

        if root_url in self.robots:
            robots = self.robots.get(root_url)
        else:
            robots = Robots.fetch(root_url + '/robots.txt')
            self.robots.update({root_url: robots})

        cdelay = robots.agent('*').delay
        if cdelay is None:
            crawl_delay = 6
        else:
            crawl_delay = int(cdelay)

        driver.implicitly_wait(crawl_delay)

        try:
            driver.get(url)
            res = {'url': url, 'driver': driver, 'robots': robots}
            return res  # result passed to callback function
        except:
            print('PROBLEM: ', url)
            return
Example #5
def fetch_url_content(self, url, scheme='http'):
	client = MongoClient('172.25.0.70', 27017)
	db = client.crawled_urls
	collection = db.results
	stripped_url = url.strip()

	try:
		robots = Robots.fetch(scheme + '://' + stripped_url + '/robots.txt')
	except RequestException:
		return 'FAILURE: ' + stripped_url + ' : Unable to fetch robots.txt'

	if not robots.allowed(stripped_url, 'just-some-user-agent'):
		return 'FAILURE: ' + stripped_url + ' : Robots say no'

	try:
		r = requests.get(scheme + '://' + stripped_url)

		collection.insert({
			'url': url,
			'scheme': 'http',
			'status': r.status_code,
			"create_time": time.time(),
			"result": r.text
		})
	except Exception as exc:
		raise self.retry(exc=exc)

	client.close()
	return 'SUCCESS: ' + stripped_url
Example #6
 def __init__(self,
              url,
              links=False,
              content=False,
              depth=5,
              exclusion_pattern=None,
              *args,
              **kwargs):
     domain = urlparse(url).netloc
     # Setup the rules for link extraction
     if exclusion_pattern:
         self._rules = [
             Rule(LinkExtractor(allow='.*' + domain + '/.*',
                                deny=exclusion_pattern),
                  callback=self.parse_url,
                  follow=True)
         ]
     else:
         self._rules = [
             Rule(LinkExtractor(allow='.*' + domain + '/.*'),
                  callback=self.parse_url,
                  follow=True)
         ]
     self.allowed_domains = [domain]
     self.start_urls = [url]
     self.links = links  # Should we store links ?
     self.content = content  # Should we store content ?
     self.depth = depth  # How deep should we go ?
     # robots.txt enhanced
     self.robots = Robots.fetch(
         urlparse(url).scheme + '://' + domain + '/robots.txt')
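The Rule objects above point at a parse_url callback that is not shown; in a Scrapy CrawlSpider it could be as small as the following sketch (what it yields is an assumption):

    def parse_url(self, response):
        # Minimal callback: record each crawled URL as an item.
        yield {'url': response.url}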
Example #7
    def filter(self, ua):
        """Remove all of the urls in URLS that UA is not allowed to crawl,
           and fill in the .crawl_delay and .robots_url properties."""

        rules = None
        for url in sorted(self.urls):
            robots_url = Robots.robots_url(url)
            if self.robots_url != robots_url:
                if self.robots_url is None:
                    try:
                        rules = Robots.fetch(robots_url,
                                             headers={
                                                 'User-Agent': ua
                                             }).agent(ua)
                    except Exception as e:
                        sys.stderr.write(
                            "warning: failed to fetch and parse {}: {}\n".
                            format(robots_url, e))
                        rules = DummyAgent()

                    self.robots_url = robots_url
                    self.crawl_delay = rules.delay or 1

                else:
                    raise ValueError("robots.txt for {} is {}, not {}".format(
                        url, robots_url, self.robots_url))

            if not rules.allowed(url):
                self.urls.remove(url)
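The filter method above falls back to a DummyAgent that is not shown. Given how it is used (rules.allowed(url) and rules.delay), a plausible permissive stand-in could be:

class DummyAgent:
    """Hypothetical fallback for when robots.txt cannot be fetched:
    allow every URL and impose no crawl delay."""

    delay = None

    def allowed(self, url):
        return True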
Example #8
def scanRightmove():
    db = DB()
    headers = {'User-Agent': 'Dan070Bot([email protected])'}
    url = "http://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=USERDEFINEDAREA%5E%7B%22id%22%3A4703045%7D&maxBedrooms=3&minBedrooms=2&maxPrice=1200&dontShow=retirement&furnishTypes=furnished"

    robots = Robots.fetch('http://www.rightmove.co.uk/robots.txt', headers=headers)
    allowed = robots.allowed('http://www.rightmove.co.uk/property-to-rent/find.html?*', 'Dan070Bot([email protected])')

    if not allowed:
        return

    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')

    data = soup.select('div.l-searchResult.is-list')

    t_url = 'https://api.telegram.org/bot538125304:AAEodL7ns7iuTbbpRPgpLteOs-o4UAunV6k/sendMessage'
    headers = {'Content-Type': 'application/json'}

    for property in soup.select('div.l-searchResult.is-list'):
        added = property.select('span.propertyCard-branchSummary-addedOrReduced')[0].get_text()
        payload = {'chat_id': '@rightmove_alerts'}

        if added == 'Added today' or added == 'Reduced today':
            link = property.select('a.propertyCard-link')[0].get('href')
            full_link = 'http://www.rightmove.co.uk' + link

            if db.checkURL(full_link) == 0:
                db.addURL(full_link)
                payload['text'] = full_link
                requests.post(t_url, data=json.dumps(payload), headers=headers)

    db.close()
    print('Cycle')
Example #9
def get_links(url):

    base = url
    links = []

    res = requests.get(base)
    soup = BeautifulSoup(res.text, 'html.parser')  # Parsing the html file

    # Getting all the links in page with href attribute and extracting their href attributes
    a_tags = soup.find_all('a', href=True)

    # Starting the robots parser
    rp = Robots.fetch(get_robots_url(base))
    agent = rp.agent('*')

    # Looping all the links found in the page
    for a_tag in a_tags:
        link = a_tag.get('href')  # Getting the href attribute

        if is_absolute(link) is False:
            link = fix_relative_url(link, base)  # Fixing relative urls

        # If robot_parser allows link to be scraped, appending it to a list
        if agent.allowed(link):
            links.append(link)
        else:
            print('Link is not allowed for scraping ({})'.format(link))

    return set(links)  # Casting to set for eliminating duplicates
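get_links calls get_robots_url, is_absolute and fix_relative_url, none of which are shown. Minimal sketches built on urllib.parse and reppy's Robots.robots_url, assuming that is roughly what the original helpers did:

from urllib.parse import urljoin, urlparse

from reppy.robots import Robots


def get_robots_url(url):
    # Derive the robots.txt URL for the host serving this page.
    return Robots.robots_url(url)


def is_absolute(url):
    # An absolute URL carries both a scheme and a network location.
    parsed = urlparse(url)
    return bool(parsed.scheme and parsed.netloc)


def fix_relative_url(link, base):
    # Resolve a relative href against the page it was found on.
    return urljoin(base, link)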
Example #10
    def filter(self, ua):
        """Remove all of the urls in URLS that UA is not allowed to crawl,
           and fill in the .crawl_delay and .robots_url properties."""

        rules = None
        for url in sorted(self.urls):
            robots_url = Robots.robots_url(url)
            if self.robots_url != robots_url:
                if self.robots_url is None:
                    try:
                        rules = Robots.fetch(robots_url, headers={
                            'User-Agent': ua
                        }).agent(ua)
                    except Exception as e:
                        sys.stderr.write(
                            "warning: failed to fetch and parse {}: {}\n"
                            .format(robots_url, e))
                        rules = DummyAgent()

                    self.robots_url = robots_url
                    self.crawl_delay = rules.delay or 1

                else:
                    raise ValueError(
                        "robots.txt for {} is {}, not {}"
                        .format(url, robots_url, self.robots_url))

            if not rules.allowed(url):
                self.urls.remove(url)
Example #11
def get_crawl_status(url):
    robots_url = os.path.join(url, 'robots.txt')
    robot = Robots.fetch(robots_url)
    agent = robot.agent('msc-cookie-monster')
    try:
        return 1 if agent.allowed(url) else 0
    except Exception as _:
        return -1
Example #12
 def _remember(self, url):
     log.debug('')
     urlparsed = urllib.parse.urlparse(url)
     robots_url = url.replace(urlparsed.path, '/robots.txt')
     print('[ INFO ]: Reading robots.txt file at: {}'.format(robots_url))
     robots = Robots.fetch(robots_url)
     checker = robots.agent(self.user_agent)
     self.index[urlparsed.hostname] = checker
Example #13
def robots_txt():
    robots = Robots.fetch('http://www.e-prostor.gov.si/robots.txt')
    print(robots)
    print(robots.allowed('http://www.e-prostor.gov.si/nc/', '*'))
    print(robots.allowed('http://www.e-prostor.gov.si/fileadmin/global/', '*'))
    print('Crawl-delay: ', robots.agent('*').delay)
    print(robots.sitemaps)
    print(robots.agent('my-user-agent').delay)

    robots = Robots.fetch('http://www.e-prostor.gov.si/robots.txt')
    print(robots)
    lst = list(robots.sitemaps)
    r = requests.get(lst[0])
    root = etree.fromstring(r.content)
    for sitemap in root:
        children = list(sitemap)  # getchildren() is deprecated in newer ElementTree versions
        print(children[0].text)
Example #14
def is_allowed_access(uri, base_uri):
    # Check that robots.txt does not forbid access to this URI
    robots = Robots.fetch(base_uri + "/robots.txt")
    agent = robots.agent("*")
    return agent.allowed(uri)
Example #15
def get_sitemaps(url):
    logging.info('Trying to get sitemaps from robots.txt')
    robots_url = urljoin(url, '/robots.txt')
    try:
        return list(Robots.fetch(robots_url).sitemaps)
    except ReppyException:
        logging.warning('Can not get access to: %s', robots_url)
        return [urljoin(url, 'sitemap.xml')]
Example #16
    def get_robots_delay(url):
        robots_url = urllib.parse.urljoin(url, 'robots.txt')
        try:
            robots = Robots.fetch(robots_url)
            delay = Right(robots.agent('None').delay)
        except ReppyException:
            delay = Left(None)

        return delay
Example #17
def get_robots_txt_checker(start_url, agent="Py4Seo Parse"):
    robots_txt_url = f"{start_url}/robots.txt".replace("//robots.txt",
                                                       "/robots.txt")
    try:
        robots = Robots.fetch("robots_txt_url")
        agent = robots.agent(agent)
        return lambda url: agent.allowed(url)
    except:
        return lambda _: True
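A hypothetical use of the returned checker (the URLs are illustrative):

can_fetch = get_robots_txt_checker('https://example.com')
print(can_fetch('https://example.com/some/page'))  # True unless robots.txt disallows it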
Example #18
 def _remember(self, url):
     urlparsed = urllib.parse.urlparse(url)
     robots_url = urlparsed.scheme + '://' + urlparsed.netloc + '/robots.txt'
     write_log('ROBOTS',
               'Reading robots.txt file at: {0}'.format(robots_url),
               package='reppy')
     robots = Robots.fetch(robots_url)
     checker = robots.agent(self.user_agent)
     self.index[urlparsed.hostname] = checker
Example #19
def robot_parser(url):
    '''
    Parse the Robots.txt.
    Send True if it is allowed to crawl
    '''
    robotstxt = get_robots_url(url)
    parser = Robots.fetch(robotstxt)
    validation = parser.allowed(url, '*')
    return validation
Example #20
def crawler(domain, pathseed, uniqueId, maxSize=5000):
    pq = queue.PriorityQueue()
    visited = []
    links = []
    pq.put((value(domain + pathseed), domain + pathseed))
    visited.append(domain + pathseed)
    rp = Robots.fetch(domain + '/robots.txt', verify=False)
    driver = webdriver.PhantomJS(
        service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'])
    while (not pq.empty() and pq.qsize() < maxSize * 5):
        a = pq.get()[1]
        print("! " + str(len(links)) + " " + a)
        if (len(links) < maxSize):
            links.append(a)
            ls = get_all_links(domain, a, maxSize, rp, driver)
            for l in ls:
                if (l not in visited):
                    if (value(l) == 1 or value(l) == 2):
                        visited.append(l)
                        pq.put((value(l), l))
        else:
            while (not pq.empty()):
                pq.get()
    while (len(links) < maxSize and not pq.empty()):
        links.append(pq.get()[1])
    os.makedirs('Docs/HTMLPages/Heuristic2/' + folder(domain) + '/True/',
                exist_ok=True)
    os.makedirs('Docs/HTMLPages/Heuristic2/' + folder(domain) + '/False/',
                exist_ok=True)
    print(len(links))
    v = 0
    clf = Classifier()
    pos = 0
    res = ""
    for l in links:
        v += 1
        driver.get(l)
        time.sleep(1)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        res = str(clf.classify(soup))
        print(str(v) + " " + l + " " + res)
        #print(driver.page_source)
        if (res == 'True'):
            pos += 1
            extractorMain.extractor(soup, folder(domain), "Heuristic2",
                                    folder(domain).lower(), l, uniqueId)
            uniqueId += 1
        with open(
                'Docs/HTMLPages/Heuristic2/' + folder(domain) + '/' + res +
                '/' + str(v) + '-' + l.replace('/', '*') + '.html', 'wb') as f:
            f.write(bytes(driver.page_source, 'UTF-8'))
    hr = pos / maxSize
    with open('Docs/HTMLPages/Heuristic2/' + folder(domain) + '/' + 'hr.txt',
              'wb') as f:
        f.write(bytes(str(hr), 'UTF-8'))
    return 0
Example #21
 def __init__(self, domain, schema):
     self.domain = domain
     self.schema = schema
     self.home_url = f'{self.schema}://{self.domain}'
     self.robots_url = self.home_url + '/robots.txt'
     self.real_domain = self.domain.split('/')[0]
     self.robots = Robots.fetch(self.robots_url)
     self.queue_urls = {self.home_url: 1}
     self.PARSED_URLS = set()
     self.browser = get_browser()
Example #22
    def request(self, url: str) -> None:
        """ Perform robots.txt request """
        if self.state is not None:
            return

        try:
            self.state = Robots.fetch(url)

        except ReppyException:
            self.state = False
Example #23
    def get_robots_parser(self, url):
        robots_url = parse.urljoin(url, "/robots.txt")
        r_parser = self._robots_parsers.get(robots_url)

        if not r_parser:
            logger.info("Reading robots.txt: %s", robots_url)
            r_parser = Robots.fetch(robots_url)
            self._robots_parsers[robots_url] = r_parser

        return r_parser
Example #24
def check_robots_txt(sitemap, url, user_agent="googlebot"):
    '''
    Arguments:
    sitemap: String
    url: String
    user_agent: String, default: googlebot
    '''
    robots = Robots.fetch(sitemap)
    agent = robots.agent(user_agent)
    return agent.allowed(url)
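Despite its name, the sitemap argument is the URL handed to Robots.fetch, i.e. the robots.txt location. A hypothetical call (the URLs are illustrative):

print(check_robots_txt('https://example.com/robots.txt',
                       'https://example.com/some/page',
                       user_agent='googlebot'))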
Example #25
def init_robot_checker(respect_robots, user_agent, start_url):
    if respect_robots:
        start_path = urllib.parse.urlparse(start_url).path
        robots_url = start_url.replace(start_path, '/robots.txt')
        write_log('[INFO]: Reading robots file: {0}'.format(robots_url))
        robots = Robots.fetch(robots_url)
        checker = robots.agent(user_agent)
        return checker.allowed
    else:
        return True
Example #26
 def _remember(self, url):
     # Solved issue when path was just '/'
     urlparsed = urlparse(url)
     robots_url = urlparsed.scheme + "://" + urlparsed.netloc + '/robots.txt'
     write_log('ROBOTS',
               'Reading robots.txt file at: {0}'.format(robots_url),
               package='reppy')
     robots = Robots.fetch(robots_url)
     checker = robots.agent(self.user_agent)
     self.index[urlparsed.hostname] = checker
Example #27
def CardsCrawler(url):
    print('Dominio: ' + url.geturl())
    count = 0
    paginas = [url]
    paginasVisitadas = []
    robots = Robots.fetch(url.geturl() + '/robots.txt')
    while count < 1000 and len(paginas) > 0:
        count = count + 1
        primeiraPagina = paginas.pop(0)
        print(count, '-', primeiraPagina.geturl(), ' Qtd Paginas=',
              len(paginas))

        # Check whether the page was already visited; if so, look for the next unvisited link in the list.
        # Check whether each page's hostname matches the domain's; if not, look for the next link in the list (only navigate within the site).
        # Check whether the hostname exists; if not, look for the next link that has one.
        # Check whether the page may be crawled according to the robots.txt rules; if not, look for the next link.
        while (Visitou(primeiraPagina)
               or primeiraPagina.hostname != url.hostname
               or primeiraPagina.hostname is None
               or not robots.allowed(primeiraPagina.geturl(), '*')):
            if len(paginas) == 0:
                print('lascou@!')
                return
            primeiraPagina = paginas.pop(0)

        # Check whether the request succeeds.
        if verificarRequest(primeiraPagina.geturl()) == True:

            # Check whether the site is dynamic.
            if (Dominio[3] in url.geturl()) or (Dominio[6] in url.geturl()):
                texto = CrawlerDinâmico(primeiraPagina.geturl())
            else:
                # Request - using the content type header
                codigo_fonte = rq.get(primeiraPagina.geturl(),
                                      headers={'content-type': 'text/html'})
                # Text of the codigo_fonte response.
                texto = codigo_fonte.text
            soup = BeautifulSoup(texto, 'html.parser')

            # Find all links on the page (url).  # attrs={'href': re.compile("^http://")}
            for link in soup.findAll('a'):
                href = link.get('href')
                # Convert relative hrefs to absolute.
                href = urllib.parse.urljoin(url.geturl(), href)
                paginas.append(urlparse(href))

            # Mark the parsed URL as visited
            paginasVisitadas.append(primeiraPagina)
            # Wait 300 ms before visiting the next link.
            time.sleep(.300)

        else:
            print('Requisição falhou!')

        Tabela(count, primeiraPagina.geturl(), url.hostname)
Example #28
 def check_robots(self, page_id):
     result_page = self.session.query(Page).filter(
         Page.id == page_id).first()
     result_site = self.session.query(Site).filter(
         Site.id == result_page.site_id).first()
     try:
         robots = Robots.fetch(f'http://{result_site.domain}/robots.txt',
                               timeout=3)
     except Exception:
         return True
     return robots.allowed(result_page.url, 'fri-ieps-kslk')
Example #29
def website_allows_scraping(url):
    """
    Args:
        url: The URL of the website you are trying to check.

    Returns:
        True if the website's policy allows you to scrape. Otherwise False.
    """
    robot_url = Robots.robots_url(url)
    robot = Robots.fetch(robot_url)
    return robot.allowed(url, USER_AGENT)
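USER_AGENT is assumed to be a module-level constant; for example (the value is illustrative):

USER_AGENT = 'my-crawler/1.0 (+https://example.com/bot)'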
Example #30
 def agent(self):
     if self._agent is not None:
         return self._agent
     try:
         self._agent = Robots.fetch(self.robots_txt_url)
         self._agent.agent(self.agent_name)
         return self._agent
     except Exception as e:
         logging.debug(
             f"Getting agent `{self.agent_name}` for `{self.robots_txt_url}` failed with {e}"
         )
         raise e
Example #31
    def __init__(self,
                 url,
                 links=False,
                 links_unique=True,
                 content=False,
                 depth=5,
                 exclusion_pattern=None,
                 check_lang=False,
                 extractors=None,
                 store_request_headers=False,
                 store_response_headers=False,
                 http_user=None,
                 http_pass=None,
                 *args,
                 **kwargs):
        domain = urlparse(url).netloc
        # Setup the rules for link extraction
        if exclusion_pattern:
            self._rules = [
                Rule(LinkExtractor(allow='.*' + domain + '/.*',
                                   deny=exclusion_pattern),
                     callback=self.parse_url,
                     follow=True)
            ]
        else:
            self._rules = [
                Rule(LinkExtractor(allow='.*' + domain + '/.*'),
                     callback=self.parse_url,
                     follow=True)
            ]
        self.allowed_domains = [domain]
        self.start_urls = [url]
        self.links = links  # Should we store links ?
        self.links_unique = links_unique  # Should we store only unique links ?
        self.content = content  # Should we store content ?
        self.depth = depth  # How deep should we go ?
        self.check_lang = check_lang  # Store check-lang results ?
        self.extractors = extractors  # Custom extractors ?
        self.store_request_headers = store_request_headers
        self.store_response_headers = store_response_headers

        # HTTP Auth
        if http_user and http_pass:
            self.http_user = http_user
            self.http_pass = http_pass

        if self.check_lang:  # Should we check content language ?
            import fasttext
            self.model = fasttext.load_model('data/lid.176.bin')

        # robots.txt enhanced
        self.robots = Robots.fetch(
            urlparse(url).scheme + '://' + domain + '/robots.txt')