Example #1
    def filter(self, ua):
        """Remove all of the urls in URLS that UA is not allowed to crawl,
           and fill in the .crawl_delay and .robots_url properties."""

        rules = None
        for url in sorted(self.urls):
            robots_url = Robots.robots_url(url)
            if self.robots_url != robots_url:
                if self.robots_url is None:
                    try:
                        rules = Robots.fetch(robots_url,
                                             headers={
                                                 'User-Agent': ua
                                             }).agent(ua)
                    except Exception as e:
                        sys.stderr.write(
                            "warning: failed to fetch and parse {}: {}\n".
                            format(robots_url, e))
                        rules = DummyAgent()

                    self.robots_url = robots_url
                    self.crawl_delay = rules.delay or 1

                else:
                    raise ValueError("robots.txt for {} is {}, not {}".format(
                        url, robots_url, self.robots_url))

            if not rules.allowed(url):
                self.urls.remove(url)
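The DummyAgent fallback used above is not defined in the snippet. A minimal permissive stand-in that matches how rules.delay and rules.allowed(url) are used might look like the sketch below (an assumption, not the project's actual class):

class DummyAgent:
    """Hypothetical fallback: behave as if robots.txt imposes no restrictions."""
    delay = None                # so `rules.delay or 1` falls back to a 1-second delay

    def allowed(self, url):
        return True             # allow every URL when robots.txt could not be fetched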
Example #2
    def __init__(self,
                 url,
                 limit_pages_counter=1,
                 storage=False,
                 meta=None,
                 debug=False,
                 user_agent='Spidar/1.1',
                 allow_external_link_crawling=False,
                 selenium_chrome_driver=None):
        self.__start_url = url
        parse_location = urlparse(url)
        self.__initial_domain_name = parse_location.netloc
        self.__pages = []
        self.__url_to_discover = set()
        self.__url_discovered = set()
        self.__max_counter_pages = limit_pages_counter
        self.__storage = storage
        self.__PATH_STORAGE = '__storage/'
        self.__PATH_SOURCE = self.__PATH_STORAGE + 'sources/'
        self.__PATH_INFO = self.__PATH_STORAGE + 'infos/'
        self.__meta = meta if meta is not None else {}
        self.__debug = debug
        self.__user_agent = user_agent
        self.__allow_external_link_crawling = allow_external_link_crawling
        self.__selenium_chrome_driver = selenium_chrome_driver
        self.__selenium_driver = None
        if self.__selenium_chrome_driver is not None:
            self.__selenium_driver = webdriver.Chrome(
                executable_path=self.__selenium_chrome_driver)

        self.__rp = Robots.fetch(Robots.robots_url(self.__start_url))

        if self.__storage:
            self.__set_up_folders(self.__initial_domain_name)
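The robots.txt handling in this constructor reduces to two reppy calls; the standalone sketch below (placeholder URL, user agent taken from the default above) shows the same pattern outside the class:

from reppy.robots import Robots

start_url = 'https://example.com/some/page'       # placeholder URL
rp = Robots.fetch(Robots.robots_url(start_url))   # fetch and parse robots.txt
print(rp.allowed(start_url, 'Spidar/1.1'))        # True if this agent may crawl the page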
Example #3
def _check_robots(url):
    """Check that our crawler satisfies the robots exclusion standard."""
    try:
        robot_url = Robots.robots_url(url)
        parse = robotparser.RobotFileParser()
        parse.set_url(robot_url)
        parse.read()
        return parse.can_fetch('*', url)
    except Exception:
        return True
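This example resolves the robots.txt location with reppy but parses it with the standard library. A self-contained version of the same check (Python 3 imports, example.com as a placeholder) could look like:

from urllib import robotparser

from reppy.robots import Robots

page = 'https://example.com/page'                 # placeholder URL
parser = robotparser.RobotFileParser()
parser.set_url(Robots.robots_url(page))
parser.read()
print(parser.can_fetch('*', page))                # True if a generic agent may fetch it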
Example #4
def website_allows_scraping(url):
    """
    Args:
        url: The URL of the website you are trying to check.

    Returns:
        True if the website's policy allows you to scrape. Otherwise False.
    """
    robot_url = Robots.robots_url(url)
    robot = Robots.fetch(robot_url)
    return robot.allowed(url, USER_AGENT)
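website_allows_scraping relies on a module-level USER_AGENT constant that is not shown in the snippet; a possible invocation, with an assumed agent string, would be:

USER_AGENT = 'my-crawler/0.1'                     # assumed value, not part of the example

if website_allows_scraping('https://example.com/data'):
    print('scraping permitted by robots.txt')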
Example #5
def filter_urls(urls, ua):
    """Partition URLS (an iterable) into sites, and then filter out all of
    the urls in each site that UA is not allowed to crawl.  Returns a list
    of Site objects."""

    sites = defaultdict(Site)
    for url in urls:
        url = canon_url_syntax(url)
        robots_url = Robots.robots_url(url)
        sites[robots_url].add(url)

    for site in sites.values():
        site.filter(ua)
    return sorted(sites.values(), key=lambda s: s.robots_url)
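filter_urls assumes a Site container with an add() method and a filter() method like the one in Example #1, plus a canon_url_syntax helper for normalising URLs; none of these are shown here. Roughly, such a container might be shaped like this (a sketch, not the project's definition):

class Site:
    def __init__(self):
        self.urls = set()            # URLs that share one robots.txt
        self.robots_url = None       # filled in by filter()
        self.crawl_delay = None      # filled in by filter()

    def add(self, url):
        self.urls.add(url)

    # filter(self, ua) would then prune self.urls exactly as in Example #1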
Example #6
def findSitemap(url):
    '''
    Find the location of an XML sitemap.

    :param url: a URL in string format
    :return: a list of discovered XML sitemap URLs
    '''
    # Website is using best practices
    robotUrl = Robots.robots_url(url)
    robots = Robots.fetch(robotUrl)
    sitemaps = list(robots.sitemaps)
    if sitemaps:
        return sitemaps
    # Website is not using best practice so take some guesses
    common_locations = ['/sitemap.xml', '/sitemap_index.xml']
    discoveredSitemaps = []
    while len(common_locations) > 0:
        makeUrl = urlparse(url)
        guessed_sitemap = makeUrl._replace(path=common_locations.pop(0))
        guess = guessed_sitemap.geturl()
        r = requests.get(guess)
        # match guessed path of response to make sure page resolves
        guessPath = urlparse(guess).path
        responsePath = urlparse(r.url).path
        if r.status_code == 200 and guessPath == responsePath:
            discoveredSitemaps.append(guess)
    return discoveredSitemaps
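A possible call, using a placeholder domain:

for sitemap in findSitemap('https://example.com/'):
    print(sitemap)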
Example #7
def robots_check(link):
    """
    py:function:: robots_check(link)
    Checks if webpage is allowed to be crawled
    :param link: link from crawler_process
    :type link: string containing the url
    :returns: True if allowed, False if not
    """
    robots = None
    allow = True
    url = Robots.robots_url(link)
    if 'http' in url:
        try:
            robots = Robots.fetch(url)
        except requests.exceptions.SSLError:
            print("SSLError")
            allow = False
        except Exception:
            allow = False
    if robots is not None:
        allow = robots.allowed(link, 'agent')
    return allow
Example #8
#################################################
#################################################
# fetch content of url

while len(url_frontier) != 0:
    # pop any random url
    url = url_frontier.pop()
    
    try:        
        print("\n---------------------------------------------------------")
        print("Crawling:", url)
        print("---------------------------------------------------------")


        # get crawl delay
        r = robots_cache.fetch(Robots.robots_url(url))[1]

        # check whether it's allowed to crawl that url; if not, skip it
        if not robots_cache.allowed(url, '*'):
            print("This URL is restricted to be crawled.")
            continue

        # insert this link to database
        cur.execute("INSERT OR IGNORE INTO crawled_urls (url_link) values(?)", (url,))

        # if it's allowed to crawl, get the crawl delay
        crawl_delay = r.agent("*").delay

        if crawl_delay is not None:
            time.sleep(crawl_delay)
        else:
Example #9
def get_robot_instance(url):
    txt_location = Robots.robots_url(url)
    return Robots.fetch(txt_location)
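The returned Robots object exposes per-agent rules; a brief sketch of how it could be queried (placeholder URL and agent name):

robots = get_robot_instance('https://example.com/page')
print(robots.allowed('https://example.com/page', 'my-bot'))   # is this URL crawlable for my-bot?
print(robots.agent('my-bot').delay)                           # Crawl-delay for this agent, or None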
Example #10
def crawl(thread_id, conn):
    global is_running, robots_cache, history_cache

    # Init
    print("===Thread " + str(thread_id) + " started===")

    f_info = open("info.log", "a", encoding="utf-8")
    f_link = open("onclick.log", "a", encoding="utf-8")
    f_debug = open("debug.log", "a", encoding="utf-8")

    # Create Selenium webdriver and set User-Agent
    profile = webdriver.FirefoxProfile()
    profile.set_preference("general.useragent.override",
                           USER_AGENT)  # Set User-Agent
    profile.set_preference("devtools.jsonview.enabled",
                           False)  # Disable JSON viewer
    profile.set_preference("dom.enable_window_print",
                           False)  # Disable window.print()
    driver = wirewebdriver.Firefox(profile)
    driver.set_page_load_timeout(SELENIUM_TIMEOUT)

    # Create DB connection cursor
    cur = conn.cursor()

    # Save most recent page_url for debugging
    page_url = None

    # Wrap main loop in try/catch block for exception handling
    try:

        ###################
        # Main loop start #
        ###################

        while is_running:
            # Get next element in frontier
            if not thread_active[thread_id]:
                time.sleep(0.5)  # Add small delay while waiting
            with frontier_lock:
                cur.execute(
                    "SELECT id, url FROM crawler.page WHERE page_type_code = 'FRONTIER' ORDER BY accessed_time ASC"
                )
                res = cur.fetchone()
                if res != None:
                    cur.execute(
                        "UPDATE crawler.page SET page_type_code = 'PROCESSING' WHERE id = %s",
                        (res[0], ))
                frontier_element = res

                if frontier_element == None:
                    thread_active[thread_id] = False
                    if not any(thread_active):
                        break
                    continue
                thread_active[thread_id] = True

            # Get URL and domain
            page_id, page_url = frontier_element
            # print(f"[{thread_id}] Processing URL: " + page_url)
            page_url_parsed = urllib.parse.urlparse(page_url)
            domain = page_url_parsed.netloc

            # Get robots.txt
            robots_url = Robots.robots_url(page_url)
            robots = robots_agent = None
            with robots_lock:
                if robots_url in robots_cache:
                    robots, robots_agent = robots_cache[robots_url]
                else:
                    try:
                        robots = Robots.fetch(
                            robots_url, headers={"User-Agent": USER_AGENT})
                        robots_agent = robots.agent(USER_AGENT)
                        # Logging robots.txt
                        f = open("robots.log", "a")
                        if robots.sitemaps:
                            f.write(
                                f"[SITEMAP] {robots.sitemaps} on {domain}\n")
                        if robots_agent.delay != None:
                            f.write(
                                f"[CRAWL DELAY] {robots_agent.delay} on domain {domain}\n"
                            )
                        f.close()
                    except:  # Error fetching robots.txt
                        pass
                    robots_cache[robots_url] = (robots, robots_agent)

            # Get site data
            site_id = None
            cur.execute("SELECT * FROM crawler.site WHERE domain = %s",
                        (domain, ))
            res = cur.fetchone()
            if res == None:
                # Get robots.txt and sitemap.xml content
                robots_content = curl(robots_url)
                sitemap_content = None
                if robots != None and robots.sitemaps != None:  # Check sitemaps in robots.txt
                    for sitemap_url in robots.sitemaps:
                        sitemap_content = curl(sitemap_url)
                        if sitemap_content != None:
                            break
                if sitemap_content == None:  # Check /sitemap.xml
                    sitemap_url = f"{page_url_parsed.scheme}://{page_url_parsed.netloc}/sitemap.xml"
                    sitemap_content = curl(sitemap_url)

                # Insert site into DB
                cur.execute(
                    "INSERT INTO crawler.site(domain, robots_content, sitemap_content) VALUES (%s, %s, %s) RETURNING id",
                    (domain, robots_content, sitemap_content))
                site_id = cur.fetchone()[0]
            else:
                # Fetch site from DB
                site_id = res[0]

            # Check robots.txt compliance
            ## Allowed/disallowed
            if robots_agent != None:
                if not robots_agent.allowed(
                        page_url):  # Mark page as disallowed
                    with frontier_lock:
                        cur.execute(
                            "UPDATE crawler.page " +
                            "SET page_type_code = 'DISALLOWED', accessed_time = now() "
                            + "WHERE id = %s", (page_id, ))
                    continue
            ## Crawl delay
            crawl_delay = CRAWL_DELAY
            if robots_agent != None and robots_agent.delay != None:
                crawl_delay = robots_agent.delay
            with history_lock:
                if domain in history_cache:
                    last_time = history_cache[domain]
                    curr_time = time.time()
                    if curr_time - last_time > crawl_delay:  # Update last accessed time
                        history_cache[domain] = curr_time
                    else:  # Move page to the back of the frontier
                        with frontier_lock:
                            cur.execute(
                                "UPDATE crawler.page " +
                                "SET page_type_code = 'FRONTIER', accessed_time = now() "
                                + "WHERE id = %s", (page_id, ))
                        time.sleep(
                            0.1
                        )  # Small delay before getting next frontier element
                        continue
                else:
                    history_cache[domain] = time.time()

            # Get HTTP header
            http_status_code = None
            page_content_type = None
            page_content_disposition = None
            try:
                response = requests.head(page_url,
                                         allow_redirects=True,
                                         headers={"User-Agent": USER_AGENT},
                                         timeout=REQUEST_TIMEOUT)
                http_status_code = response.status_code
                if "Content-Type" in response.headers:
                    page_content_type = response.headers["Content-Type"]
                if "Content-Disposition" in response.headers:
                    page_content_disposition = response.headers[
                        "Content-Disposition"]
            except requests.exceptions.SSLError:
                pass
            except requests.exceptions.Timeout:
                pass
            except requests.exceptions.ConnectionError:
                pass
            except Exception as e:
                print(
                    f"[HEAD UNKNOWN] Error getting URL HEAD: {page_url}\n\t{e}"
                )
                f_debug.write(
                    f"[HEAD UNKNOWN] Error getting URL HEAD: {page_url}\n\t{e}\n"
                )

            # Check if page is a binary file
            if page_content_type != None and not page_content_type.startswith("text/") or \
               page_content_disposition != None and not page_content_disposition.startswith("inline"):
                cur.execute(
                    "UPDATE crawler.page " +
                    "SET site_id = %s, page_type_code = 'BINARY', " +
                    "http_status_code = %s, accessed_time = now() " +
                    "WHERE id = %s", (site_id, http_status_code, page_id))
                continue

            # Get HTTP body
            try:
                driver.get(page_url)
                time.sleep(SELENIUM_WAIT)
            except TimeoutException:
                timestamp = datetime.now().strftime("%H:%M:%S")
                print(
                    f"[GET TIMEOUT] [{timestamp}] Timeout on URL: {page_url}\n"
                )
                f_debug.write(
                    f"[GET TIMEOUT] [{timestamp}] Timeout on URL: {page_url}\n\n"
                )
                with frontier_lock:
                    cur.execute(
                        "UPDATE crawler.page " +
                        "SET site_id = %s, page_type_code = 'UNAVAILABLE', html_content = NULL, "
                        +
                        "html_hash = NULL, http_status_code = NULL, accessed_time = now() "
                        + "WHERE id = %s", (site_id, page_id))
                continue
            except Exception as e:
                print(f"[GET UNKNOWN] Unable to get URL: {page_url}\n\t{e}")
                f_debug.write(
                    f"[GET UNKNOWN] Unable to get URL: {page_url}\n\t{e}\n")
                with frontier_lock:
                    cur.execute(
                        "UPDATE crawler.page " +
                        "SET site_id = %s, page_type_code = 'UNAVAILABLE', html_content = NULL, "
                        +
                        "html_hash = NULL, http_status_code = NULL, accessed_time = now() "
                        + "WHERE id = %s", (site_id, page_id))
                continue
            # Close alert if present
            try:
                alert = driver.switch_to.alert
                alert.dismiss()  # alert.accept()
                f_debug.write(f"[ALERT PRESENT] On URL: {page_url}\n")
            except NoAlertPresentException as e:
                pass
            # Get HTML source
            html = driver.page_source
            driver_requests = {}
            for request in driver.requests:
                driver_requests[request.url] = request.response

            # Get html hash and check for duplicates
            html_hash = str(hash(html))
            cur.execute("SELECT id FROM crawler.page WHERE html_hash = %s",
                        (html_hash, ))
            res = cur.fetchone()
            if res != None:  # Duplicate detected
                duplicate_page_id = res[0]
                cur.execute(
                    "UPDATE crawler.page " +
                    "SET site_id = %s, page_type_code = 'DUPLICATE', html_content = NULL, "
                    +
                    "html_hash = NULL, http_status_code = %s, accessed_time = now() "
                    + "WHERE id = %s", (site_id, http_status_code, page_id))
                cur.execute(
                    "INSERT INTO crawler.link(from_page, to_page) VALUES (%s, %s)",
                    (page_id, duplicate_page_id))
                continue

            # Update page in DB
            cur.execute(
                "UPDATE crawler.page " +
                "SET site_id = %s, page_type_code = 'HTML', html_content = %s, "
                +
                "html_hash = %s, http_status_code = %s, accessed_time = now() "
                + "WHERE id = %s",
                (site_id, html, html_hash, http_status_code, page_id))

            # Find all href links
            # <area> link example - https://www.stopbirokraciji.gov.si/
            elems = driver.find_elements_by_xpath(
                "//a[@href] | //area[@href]")  # "//body//*[@href]"
            for elem in elems:
                try:
                    href = elem.get_attribute("href")
                    if href != None:
                        process_link(cur, page_id, page_url, href, f_info,
                                     f_link, f_debug)
                except StaleElementReferenceException as e:
                    f_debug.write(
                        f"[STALE ELEMENT] On URL: {page_url}\n\tException: {e}\n"
                    )
                    continue
            # Check if any non-<a> tags contain href for debugging purposes
            elems = driver.find_elements_by_xpath("//body//*[@href]")
            for elem in elems:
                try:
                    href = elem.get_attribute("href")
                    if elem.tag_name != "a" and elem.tag_name != "area" and href != None and not href.startswith(
                            "#") and not href.startswith("javascript:"):
                        f_debug.write(
                            f"[HREF TAG] <{elem.tag_name}>, href='{href}' on URL {page_url}\n"
                        )
                except StaleElementReferenceException as e:
                    f_debug.write(
                        f"[STALE ELEMENT] On URL: {page_url}\n\tException: {e}\n"
                    )
                    continue

            # Find all onclick links
            # document.location, self.location, window.location, location.href
            elems = driver.find_elements_by_xpath("//*[@onclick]")
            for elem in elems:
                onclick = elem.get_attribute("onclick")
                if onclick != None and onclick.strip() != "":
                    f_link.write(onclick + "\n--------\n")
                    # matches = re.findall(r"((document\.location)|(location\.href)|(self\.location)|(window\.location))(.|\n)*?(;|$)", onclick)
                    matches = re.findall(
                        r"(((document|window|self)\.location|location\.href)[^;]*)",
                        onclick)
                    for match in matches:
                        result = re.search("(\".*\")|('.*')|(`.*`)", match[0])
                        if result != None:
                            onclick_url = result.group()[1:-1]
                            f_link.write(onclick_url + "\n")
                            process_link(cur, page_id, page_url, onclick_url,
                                         f_info, f_link, f_debug)

                    f_link.write("\n\n")

            # Find all images
            elems = driver.find_elements_by_xpath("//img[@src]")
            for elem in elems:
                img_src = elem.get_attribute("src")
                if img_src != None:
                    # Parse src
                    img_src_parsed = urllib.parse.urlparse(img_src)
                    img_path = img_src_parsed.path
                    img_query = img_src_parsed.query

                    # Ignore empty src
                    if img_src.strip() == "":
                        continue
                    # Ignore base64 images
                    if img_src.startswith("data:"):
                        continue
                    # Ignore src="#"
                    if img_path == "/" and img_query == "" and img_src.endswith(
                            "#"):
                        continue

                    # Get image name
                    # https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img
                    img_exts = [
                        ".jpg", ".jpeg", ".png", ".svg", ".gif", ".webp",
                        ".apng", ".avif", ".bmp", ".ico", ".cur", ".tif",
                        ".tiff"
                    ]
                    img_name = None
                    img_ext = None
                    # Use a separate loop variable so img_ext stays None when no known
                    # extension matches; otherwise it would keep the last list entry and
                    # mislabel the content type further below
                    for ext in img_exts:
                        if img_path.lower().endswith(ext):
                            img_ext = ext
                            img_name = os.path.basename(img_path)
                            break
                    if img_name == None:
                        img_name = os.path.basename(img_path)
                        if img_name == "":
                            print(
                                f"[IMG NAME ERROR] src {img_src} on URL {page_url}"
                            )
                            f_debug.write(
                                f"[IMG NAME ERROR] src {img_src} on URL {page_url}\n"
                            )
                            img_name = None

                    # Get image content type
                    img_url = urllib.parse.urljoin(page_url, img_src)
                    img_content_type = None
                    if img_url in driver_requests:  # img_src
                        # Check Selenium request
                        response = driver_requests[img_url]
                        if response and response.status_code == 200 and "Content-Type" in response.headers:
                            img_content_type = response.headers["Content-Type"]
                    else:
                        # Manually send request
                        try:
                            response = requests.head(
                                img_url,
                                allow_redirects=True,
                                headers={"User-Agent": USER_AGENT},
                                timeout=REQUEST_TIMEOUT)
                            if response.status_code == 200 and "Content-Type" in response.headers:
                                img_content_type = response.headers[
                                    "Content-Type"]
                        except requests.exceptions.SSLError:
                            f_debug.write(
                                f"[IMG HEAD SSL] SSL exception on src: {img_url}\n"
                            )
                        except requests.exceptions.Timeout:
                            f_debug.write(
                                f"[IMG HEAD TIMEOUT] Timeout exception on src: {img_url}\n"
                            )
                        except requests.exceptions.ConnectionError:
                            f_debug.write(
                                f"[IMG HEAD CONNECTION] Connection error on src: {img_url}\n"
                            )
                            pass
                        except Exception as e:
                            print(
                                f"[IMG HEAD UNKNOWN] Unknown exception on src: {img_url}\n\tOn page: {page_url}\n\t{e}"
                            )
                            f_debug.write(
                                f"[IMG HEAD UNKNOWN] Unknown exception on src: {img_url}\n\t{e}\n"
                            )
                    # Check if file is an image
                    if img_content_type != None:
                        if not img_content_type.startswith("image/"):
                            f_debug.write(
                                f"[IMG CONTENT TYPE] On src: {img_url}\n\tOn url: {page_url}\n"
                            )
                            continue
                    # If requests failed, get content type from filename
                    if img_content_type == None and img_ext != None:
                        img_content_type = f"image/{img_ext[1:]}"

                    # Error check
                    if img_name == None or img_content_type == None:
                        print(
                            f"[IMG META ERROR] src {img_src} on URL {page_url}"
                        )
                        f_debug.write(
                            f"[IMG META ERROR] src {img_src} on URL {page_url}\n"
                        )

                    # Save image metadata to DB
                    cur.execute(
                        "INSERT INTO crawler.image(page_id, filename, content_type) "
                        + "VALUES (%s, %s, %s) ",
                        (page_id, img_name, img_content_type))

        #################
        # Main loop end #
        #################

    except Exception as e:
        # Print exception
        print(f"[{thread_id}] [UNHANDLED EXCEPTION] on URL: {page_url}\n\t{e}")
        f_debug.write(
            f"[{thread_id}] [UNHANDLED EXCEPTION] on URL: {page_url}\n\t{e}\n")

    # Cleanup
    f_info.close()
    f_link.close()
    f_debug.close()

    thread_active[thread_id] = False
    cur.close()
    driver.close()
    print("===Thread " + str(thread_id) + " finished===")
Example #11
    def __extract_info(self, url):

        self.__print_debug('crawling page', url)

        parsed_url = urlparse(url)
        if parsed_url.netloc == self.__initial_domain_name:
            if not self.__rp.allowed(url, self.__user_agent):
                self.__print_debug('disallowed by user agent')
                return None
        else:
            current_robot = Robots.fetch(Robots.robots_url(url))
            if not current_robot.allowed(url, self.__user_agent):
                self.__print_debug('disallowed by user agent')
                return None

        content, is_html, language = self.__crawl_page(url)
        if content is None:
            return None

        path = urlparse(url).path.replace('/', '_')
        if path is None or path == '':
            path = '__index__'

        if self.__storage:
            self.__set_up_folders(parsed_url.netloc)
            fsource = open(
                self.__PATH_SOURCE + parsed_url.netloc + '/' + path + '.html',
                'wb')
            fsource.write(content)
            fsource.close()

        if not is_html:
            self.__pages.append({
                'content': content,
                'language': language,
                'url': url,
                'html': content
            })
            return content
        soup = BeautifulSoup(content, 'html.parser')

        for link in soup.find_all('a'):
            href = link.get('href')
            if href is None or '#' in href:
                continue
            if href.startswith('http'):
                self.__add_url(href)
                continue

            if href.startswith('mailto'):
                continue

            new_url = str(urljoin(url, href))
            self.__add_url(new_url)
        texts = soup.findAll(text=True)
        visible_texts = filter(self.__tag_visible, texts)

        visible_texts = ' '.join(t.strip() for t in visible_texts
                                 if t.strip() != '')

        if self.__storage:
            fout = open(
                self.__PATH_INFO + parsed_url.netloc + '/' + path + '.json',
                'w')
            fout.write(
                json.dumps({
                    'url': url,
                    'domain_name': parsed_url.netloc,
                    'html': content.decode('utf-8'),
                    'language': language,
                    'content': visible_texts,
                    'meta': self.__meta,
                }))
            fout.close()

        self.__pages.append({
            'content': visible_texts,
            'language': language,
            'url': url,
            'html': content
        })
Example #12
from reppy.robots import Robots

#grab robots url
url = Robots.robots_url('https://science.rpi.edu/computer-science')

if 'http' in url:
    #print(url)
    robots = Robots.fetch(url)
    #print(robots)
    print(robots.allowed('https://science.rpi.edu/computer-science/', 'agent'))
    print(robots.allowed('https://science.rpi.edu/admin/', 'agent'))
Example #13
def parse_webpages(webpages):
    for page in webpages:
        # obtain the robots.txt url
        r = Robots.robots_url(page)
        robots = Robots.fetch(r)
        if (robots.allowed(page, '*')):
            # sitemaps is a list of all the sitemaps for a website
            sitemaps = robots.sitemaps
            sitemaps_list = list(sitemaps)
            html = requests.get(page)  # html of the webpage
            soup = bs4.BeautifulSoup(html.text, "html.parser")
            outlinks = soup.find_all("a")  # all the outlinks
            links = [str(i.get('href')) for i in outlinks]
            outlinks = [str(i) for i in outlinks]
            docs = []  # the documents on the page

            for file in links:
                directory = page.rsplit("/", 1)[0]
                link = directory + '/' + file

                # can be expanded to other file types with a comma
                if file.endswith(('txt', 'md')):
                    if file.startswith(('http://', 'www.')):
                        text = bs4.BeautifulSoup(
                            requests.get(file).text, "html.parser")
                        ext = file.rsplit(".", 1)[-1]
                        text = [file, ext, text]
                        # text = {'link': link, 'ext': ext, 'text': text}
                        docs.append(text)
                    else:
                        text = bs4.BeautifulSoup(
                            requests.get(link).text, "html.parser")
                        ext = link.rsplit(".", 1)[-1]
                        text = [link, ext, text]
                        # text = {'link': link, 'ext': ext, 'text': text}
                        docs.append(text)
                elif file.endswith(('pdf')):  # special case if PDF
                    x = file
                    try:
                        if file.startswith(('http://', 'www.')):
                            pdf = file.rsplit("/", 1)[-1]
                            response = urlopen(file)
                        else:
                            pdf = file.rsplit("/", 1)[-1]
                            # must first check if pdf is found
                            response = urlopen(link)

                    except urllib.error.HTTPError as e:
                        # if 404 error, put 404 as text
                        text = [link, "pdf", "404"]
                        # text = {'link': link, 'ext': 'pdf', 'text': "404"}
                        docs.append(text)

                    else:
                        # otherwise must save the pdf to run pypdf2
                        file = open(pdf, 'wb')
                        file.write(response.read())
                        file.close()
                        if x.startswith('http://'):
                            link = x
                        txt = ""
                        file = open(pdf, 'rb')
                        parser = PDFParser(file)
                        document = PDFDocument(parser)
                        rsrcmgr = PDFResourceManager()
                        laparams = LAParams()
                        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        for p in PDFPage.create_pages(document):
                            # As the interpreter processes the page stored in PDFDocument object
                            interpreter.process_page(p)
                            # The device renders the layout from interpreter
                            layout = device.get_result()
                            # Out of the many LT objects within layout, we are interested in LTTextBox and LTTextLine
                            for lt_obj in layout:
                                if isinstance(lt_obj, LTTextBox) or isinstance(
                                        lt_obj, LTTextLine):
                                    txt += lt_obj.get_text()

                        # close the pdf file
                        file.close()
                        name = [link, "pdf", txt]
                        # name = {'link': link, 'ext': 'pdf', 'text': txt}
                        os.remove(pdf)  # remove the saved file when done
                        docs.append(name)

            docs = [[str(i) for i in lis] for lis in docs]
            timestamp = datetime.datetime.now().isoformat()
            output = {
                'url': page,
                'timestamp': timestamp,
                'outlinks': outlinks,
                'html': html.text,
                'docs': docs,
                'sitemaps': sitemaps_list
            }

            with Crawling_L_REST.app.app_context():
                Crawling_L_REST.add_webpage(output)

            return output
Example #14
    def can_crawl(self, url: str) -> bool:
        robots_url = Robots.robots_url(url)
        robots = Robots.fetch(robots_url, headers={'user-agent': 'slurper'})
        return robots.allowed(url, 'slurper')

    def __init__(self, seed_url, user_agent):
        self.seed_url = seed_url
        self.user_agent = user_agent
        self.robots_url = Robots.robots_url(seed_url)
        self.robots = Robots.fetch(self.robots_url)
        self.accepted_header_content_type = "text/html"
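Assuming these two methods belong to a crawler class (hypothetically named Crawler here), usage would look like:

crawler = Crawler('https://example.com', 'slurper')
print(crawler.can_crawl('https://example.com/docs'))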