Example #1
    def proxy_address(self, flow: http.HTTPFlow) -> typing.Tuple[str, int]:
        # Check if the URL is known to the CDX server
        playback = False

        # Use the canonicalised URL
        r_url = str(urlcanon.whatwg(urlcanon.parse_url(flow.request.url)))

        # Query the CDX service for this URL:
        ctx.log.info("checking %s..." % r_url)
        r = self.s.get('http://cdxserver:8080/fc',
                       params={
                           'url': r_url,
                           'sort': 'reverse',
                           'limit': 10
                       })

        # Loop through response CDX lines:
        for cdxline in r.iter_lines(decode_unicode=True):
            cdx = cdxline.split(" ")
            # Compare canonicalised URLs (in case an intermediary e.g. adds a default :80 port)
            cdx_url = str(urlcanon.whatwg(urlcanon.parse_url(cdx[2])))
            if r_url == cdx_url:
                ctx.log.info("MATCH")
                playback = True
                break
            else:
                ctx.log.info("NO MATCH '%s' '%s'" % (r_url, cdx_url))

        # Either playback or record, depending on the outcome:
        if playback:
            ctx.log.info("PYWB")
            return ("pywb", 8080)
        else:
            ctx.log.info("WARCPROX")
            return ("warcprox", 8000)
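The CDX lookup above only works because both sides of the comparison go through the same WHATWG canonicalization. A minimal sketch of that comparison, using made-up URLs (per the WHATWG rules a default port such as :80 is expected to be elided, so the two forms below should compare equal):

import urlcanon

def same_resource(a, b):
    # canonicalize both URLs the same way proxy_address does before comparing
    return str(urlcanon.whatwg(urlcanon.parse_url(a))) == \
           str(urlcanon.whatwg(urlcanon.parse_url(b)))

# expected: True -- an intermediary adding the default :80 port should not break the match
print(same_resource("http://example.com/a", "http://example.com:80/a"))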
Example #2
 def _build_fresh_page(self, site, parent_page, url, hops_off=0):
     url_for_scoping = urlcanon.semantic(url)
     url_for_crawling = urlcanon.whatwg(url)
     hashtag = (url_for_crawling.hash_sign +
                url_for_crawling.fragment).decode('utf-8')
     urlcanon.canon.remove_fragment(url_for_crawling)
     page = brozzler.Page(
         self.rr, {
             'url': str(url_for_crawling),
             'site_id': site.id,
             'job_id': site.job_id,
             'hops_from_seed': parent_page.hops_from_seed + 1,
             'hop_path': str(parent_page.hop_path if parent_page.hop_path else "") + "L",
             'via_page_id': parent_page.id,
             'via_page_url': parent_page.url,
             'hops_off_surt': hops_off,
             'hashtags': [hashtag] if hashtag else []
         })
     return page
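The hash_sign/fragment handling in _build_fresh_page is the usual urlcanon idiom for splitting a fragment off a URL before it is stored. A standalone sketch of the same idiom (example URL is illustrative, output marked as expected rather than verified):

import urlcanon

url = urlcanon.whatwg("https://example.com/page#section-2")  # parse + canonicalize
hashtag = (url.hash_sign + url.fragment).decode("utf-8")
urlcanon.canon.remove_fragment(url)
print(hashtag, str(url))  # expected: #section-2 https://example.com/page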
Example #3
 def _scope_and_enforce_robots(self, site, parent_page, outlinks):
     '''
     Returns tuple (
         dict of {page_id: Page} of fresh `brozzler.Page` representing in
             scope links accepted by robots policy,
         set of in scope urls (canonicalized) blocked by robots policy,
         set of out-of-scope urls (canonicalized)).
     '''
     pages = {}  # {page_id: Page, ...}
     blocked = set()
     out_of_scope = set()
     for url in outlinks or []:
         url_for_scoping = urlcanon.semantic(url)
         url_for_crawling = urlcanon.whatwg(url)
         decision = site.accept_reject_or_neither(
                 url_for_scoping, parent_page=parent_page)
         if decision is True:
             hops_off = 0
         elif decision is None:
             decision = parent_page.hops_off < site.scope.get(
                     'max_hops_off', 0)
             hops_off = parent_page.hops_off + 1
         if decision is True:
             if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
                 fresh_page = self._build_fresh_page(
                         site, parent_page, url, hops_off)
                 if fresh_page.id in pages:
                     self._merge_page(pages[fresh_page.id], fresh_page)
                 else:
                     pages[fresh_page.id] = fresh_page
             else:
                 blocked.add(str(url_for_crawling))
         else:
             out_of_scope.add(str(url_for_crawling))
     return pages, blocked, out_of_scope
Example #4
 def extract_outlinks(self, timeout=60):
     self.logger.info('extracting outlinks')
     self.websock_thread.expect_result(self._command_id.peek())
     js = brozzler.jinja2_environment().get_template(
         'extract-outlinks.js').render()
     msg_id = self.send_to_chrome(method='Runtime.evaluate',
                                  params={'expression': js})
     self._wait_for(lambda: self.websock_thread.received_result(msg_id),
                    timeout=timeout)
     message = self.websock_thread.pop_result(msg_id)
     if ('result' in message and 'result' in message['result']
             and 'value' in message['result']['result']):
         if message['result']['result']['value']:
             out = []
             for link in message['result']['result']['value'].split('\n'):
                 try:
                     out.append(str(urlcanon.whatwg(link)))
                 except AddressValueError:
                     self.logger.warning('skip invalid outlink: %s', link)
             return frozenset(out)
         else:
             # no links found
             return frozenset()
     else:
         self.logger.error(
             'problem extracting outlinks, result message: %s', message)
         return frozenset()
Example #5
 def _build_fresh_pages(self, site, parent_page, urls):
     '''
     Returns a dict of page_id => brozzler.Page.
     '''
     pages = {}
     for url in urls:
         url_for_scoping = urlcanon.semantic(url)
         url_for_crawling = urlcanon.whatwg(url)
         hashtag = (url_for_crawling.hash_sign +
                    url_for_crawling.fragment).decode('utf-8')
         urlcanon.canon.remove_fragment(url_for_crawling)
         if not url_for_scoping.surt().startswith(
                 site.scope['surt'].encode('utf-8')):
             hops_off_surt = parent_page.hops_off_surt + 1
         else:
             hops_off_surt = 0
         page = brozzler.Page(
             self.rr, {
                 'url': str(url_for_crawling),
                 'site_id': site.id,
                 'job_id': site.job_id,
                 'hops_from_seed': parent_page.hops_from_seed + 1,
                 'via_page_id': parent_page.id,
                 'hops_off_surt': hops_off_surt,
                 'hashtags': []
             })
         if page.id in pages:
             pages[page.id].priority += page.priority
             page = pages[page.id]
         else:
             pages[page.id] = page
         if hashtag:
             page.hashtags = list(set(page.hashtags + [hashtag]))
     return pages
Example #6
def test_w3c_test_data(input, href, test):
    url = urlcanon.parse_url(input)
    urlcanon.whatwg(url)
    assert test['protocol'].encode('utf-8') == (url.scheme +
                                                url.colon_after_scheme)
    assert test['username'].encode('utf-8') == url.username
    assert test['password'].encode('utf-8') == url.password
    assert test['host'].encode('utf-8') == url.host_port
    assert test['hostname'].encode('utf-8') == url.host
    assert test['pathname'].encode('utf-8') == url.path
    assert test['search'].encode('utf-8') == (url.query and
                                              (url.question_mark + url.query)
                                              or b'')
    assert test['hash'].encode('utf-8') == (url.fragment and
                                            (url.hash_sign + url.fragment)
                                            or b'')
    assert test['href'] == str(url)
Example #7
 def url_to_canon(self, url):
     parsed_url = urlcanon.parse_url(url)
     urlcanon.whatwg(parsed_url)
     parsed_url = str(parsed_url)
     if parsed_url.lower().endswith("index.html"):
         parsed_url = parsed_url[:parsed_url.index("index.html")]
     neki2 = parsed_url.rsplit('/', 1)[1]
     if '#' in neki2:
         parsed_url = parsed_url[:parsed_url.index("#")]
     if neki2 != '' and '.' not in neki2 and not neki2.endswith('/'):
         parsed_url += '/'
     parsed_url = urllib.parse.unquote(parsed_url)
     if parsed_url.count(':') == 1:
         ena, dva = parsed_url.split(':')
         if ' ' in dva:
             parsed_url = ena + ':' + urllib.parse.quote(dva)
     parsed_url = url_normalize.url_normalize(parsed_url)
     return parsed_url
Example #8
 def visit_hashtags(self, page_url, hashtags, outlinks):
     _hashtags = set(hashtags or [])
     for outlink in outlinks:
         url = urlcanon.whatwg(outlink)
         hashtag = (url.hash_sign + url.fragment).decode('utf-8')
         urlcanon.canon.remove_fragment(url)
         if hashtag and str(url) == page_url:
             _hashtags.add(hashtag)
     # could inject a script that listens for HashChangeEvent to figure
     # out which hashtags were visited already and skip those
     for hashtag in _hashtags:
         # navigate_to_hashtag (nothing to wait for so no timeout?)
         self.logger.debug('navigating to hashtag %s', hashtag)
         url = urlcanon.whatwg(page_url)
         url.hash_sign = b'#'
         url.fragment = hashtag[1:].encode('utf-8')
         self.send_to_chrome(
                 method='Page.navigate', params={'url': str(url)})
         time.sleep(5) # um.. wait for idleness or something?
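Writing hash_sign and fragment directly, as visit_hashtags does above, is enough to serialize a fragment back onto a canonicalized URL. A minimal sketch (URL and fragment are illustrative):

import urlcanon

url = urlcanon.whatwg("https://example.com/page")
url.hash_sign = b"#"
url.fragment = b"comments"
print(str(url))  # expected: https://example.com/page#comments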
Example #9
def normalizeUrl(url, link_value):
    parsed_url_str = None
    for exc in PATH_EXCLUSIONS:
        if exc in url.path:
            return None

    if (url.query or url.fragment or
            url.scheme in ("mailto", "tel", "data", "javascript")):
        return None

    link_value = eliminateFromURL(link_value, EXTRAS)
    parsed_url = urlcanon.parse_url(link_value)
    urlcanon.whatwg(parsed_url)
    parsed_url_str = str(parsed_url)
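    # note: this replace also collapses the "//" after the scheme,
    # e.g. "http://example.com" becomes "http:/example.com"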
    parsed_url_str = parsed_url_str.replace('//', '/')
    if parsed_url_str:
        if parsed_url_str[0] == '.':
            parsed_url_str = parsed_url_str[1:]
        if parsed_url_str[-1] == '/':
            parsed_url_str = parsed_url_str[:-1]
    return parsed_url_str
Example #10
def test_url_matches_domain():
    assert urlcanon.url_matches_domain('http://1.2.3.4/', '1.2.3.4')
    assert urlcanon.url_matches_domain(b'scheme://1.2.3.4', '1.2.3.4')
    assert urlcanon.url_matches_domain('ftp://1.2.3.4/a/b/c/d', b'1.2.3.4')
    assert urlcanon.url_matches_domain(b'http://1.2.3.4', b'1.2.3.4')
    assert urlcanon.url_matches_domain(
            'http://foo.example.com', 'example.com')
    assert not urlcanon.url_matches_domain(
            'http://example.com', 'foo.example.com')
    assert not urlcanon.url_matches_domain(
            'http://foo.EXAMPLE.COM', 'example.com')
    assert urlcanon.url_matches_domain(
            urlcanon.whatwg('http://foo.EXAMPLE.COM'), 'example.com')
    assert not urlcanon.url_matches_domain('http://☃.net', 'xn--n3h.net')
    assert urlcanon.url_matches_domain('http://☃.net', '☃.net')
    assert urlcanon.url_matches_domain('http://😬.☃.net', '☃.net')
    assert not urlcanon.url_matches_domain(
            'http://😬.☃.net', urlcanon.normalize_host('☃.net'))
    assert urlcanon.url_matches_domain(
            urlcanon.whatwg('https://😬.☃.net'),
            urlcanon.normalize_host('☃.net'))
Example #11
 def _build_fresh_page(self, site, parent_page, url, hops_off=0):
     url_for_scoping = urlcanon.semantic(url)
     url_for_crawling = urlcanon.whatwg(url)
     hashtag = (url_for_crawling.hash_sign
                + url_for_crawling.fragment).decode('utf-8')
     urlcanon.canon.remove_fragment(url_for_crawling)
     page = brozzler.Page(self.rr, {
         'url': str(url_for_crawling),
         'site_id': site.id,
         'job_id': site.job_id,
         'hops_from_seed': parent_page.hops_from_seed + 1,
         'via_page_id': parent_page.id,
         'hops_off_surt': hops_off,
         'hashtags': [hashtag] if hashtag else []})
     return page
Example #12
    def _try_youtube_dl(self, ydl, site, page):
        try:
            self.logger.info("trying youtube-dl on {}".format(page))

            with brozzler.thread_accept_exceptions():
                # we do whatwg canonicalization here to avoid "<urlopen error
                # no host given>" resulting in ProxyError
                # needs automated test
                info = ydl.extract_info(str(urlcanon.whatwg(page.url)))
            self._remember_videos(page, ydl.brozzler_spy)
            # logging.info('XXX %s', json.dumps(info))
            if self._using_warcprox(site):
                info_json = json.dumps(info, sort_keys=True, indent=4)
                self.logger.info(
                    "sending WARCPROX_WRITE_RECORD request to warcprox "
                    "with youtube-dl json for %s", page)
                self._warcprox_write_record(
                    warcprox_address=self._proxy_for(site),
                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="metadata",
                    content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                    payload=info_json.encode("utf-8"),
                    extra_headers=site.extra_headers())
        except brozzler.ShutdownRequested as e:
            raise
        except BaseException as e:
            if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
                pass
            elif (hasattr(e, "exc_info")
                  and e.exc_info[0] == urllib.error.HTTPError
                  and hasattr(e.exc_info[1], "code")
                  and e.exc_info[1].code == 420):
                raise brozzler.ReachedLimit(e.exc_info[1])
            elif (hasattr(e, 'exc_info')
                  and e.exc_info[0] == urllib.error.URLError
                  and self._proxy_for(site)):
                # connection problem when using a proxy == proxy error (XXX?)
                raise brozzler.ProxyError(
                    'youtube-dl hit apparent proxy error from '
                    '%s' % page.url) from e
            else:
                raise
Example #13
def _try_youtube_dl(worker, ydl, site, page):
    try:
        logging.info("trying yt-dlp on %s", page)

        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            # and yt-dlp needs sanitize_info for extract_info
            ie_result = ydl.sanitize_info(ydl.extract_info(str(urlcanon.whatwg(page.url))))
        _remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
        if worker._using_warcprox(site):
            info_json = json.dumps(ie_result, sort_keys=True, indent=4)
            logging.info(
                    "sending WARCPROX_WRITE_RECORD request to warcprox "
                    "with yt-dlp json for %s", page)
            worker._warcprox_write_record(
                    warcprox_address=worker._proxy_for(site),
                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="metadata",
                    content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                    payload=info_json.encode("utf-8"),
                    extra_headers=site.extra_headers(page))
        return ie_result
    except brozzler.ShutdownRequested as e:
        raise
    except Exception as e:
        if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
            return None
        elif (hasattr(e, "exc_info")
                and e.exc_info[0] == urllib.error.HTTPError
                and hasattr(e.exc_info[1], "code")
                and e.exc_info[1].code == 420):
            raise brozzler.ReachedLimit(e.exc_info[1])
        elif (hasattr(e, 'exc_info')
                and e.exc_info[0] == urllib.error.URLError
                and worker._proxy_for(site)):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'yt-dlp hit apparent proxy error from '
                    '%s' % page.url) from e
        else:
            raise
Example #14
def _try_youtube_dl(worker, ydl, site, page):
    try:
        logging.info("trying youtube-dl on %s", page)

        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            ie_result = ydl.extract_info(str(urlcanon.whatwg(page.url)))
        _remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
        if worker._using_warcprox(site):
            info_json = json.dumps(ie_result, sort_keys=True, indent=4)
            logging.info(
                    "sending WARCPROX_WRITE_RECORD request to warcprox "
                    "with youtube-dl json for %s", page)
            worker._warcprox_write_record(
                    warcprox_address=worker._proxy_for(site),
                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="metadata",
                    content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                    payload=info_json.encode("utf-8"),
                    extra_headers=site.extra_headers())
        return ie_result
    except brozzler.ShutdownRequested as e:
        raise
    except Exception as e:
        if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
            return None
        elif (hasattr(e, "exc_info")
                and e.exc_info[0] == urllib.error.HTTPError
                and hasattr(e.exc_info[1], "code")
                and e.exc_info[1].code == 420):
            raise brozzler.ReachedLimit(e.exc_info[1])
        elif (hasattr(e, 'exc_info')
                and e.exc_info[0] == urllib.error.URLError
                and worker._proxy_for(site)):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'youtube-dl hit apparent proxy error from '
                    '%s' % page.url) from e
        else:
            raise
Example #15
 def _scope_and_enforce_robots(self, site, parent_page, outlinks):
     '''
     Returns tuple (
         set of in scope urls (uncanonicalized) accepted by robots policy,
         set of in scope urls (canonicalized) blocked by robots policy,
         set of out-of-scope urls (canonicalized)).
     '''
     in_scope = set()
     blocked = set()
     out_of_scope = set()
     for url in outlinks or []:
         url_for_scoping = urlcanon.semantic(url)
         url_for_crawling = urlcanon.whatwg(url)
         urlcanon.canon.remove_fragment(url_for_crawling)
         if site.is_in_scope(url_for_scoping, parent_page=parent_page):
             if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
                 in_scope.add(url)
             else:
                 blocked.add(str(url_for_crawling))
         else:
             out_of_scope.add(str(url_for_crawling))
     return in_scope, blocked, out_of_scope
Example #16
    def run(self):
        while not frontier.empty():
            # get next url from frontier
            url = frontier.get()

            # parse url to get base url and domain name
            split_url = urlsplit(url)
            base = "{0.netloc}".format(split_url)

            domain = base.replace("www.", "") if "www." in base else base
            base_url = "{0.scheme}://{0.netloc}/".format(split_url)

            # first check if can access page
            canAccess = self.checkIPAccessTime(domain)
            if canAccess is not None:
                if not canAccess:
                    # return url to frontier and move on to the next url
                    frontier.put(url)
                    continue
            else:
                continue

            # check if site already saved
            robotLock.acquire()
            site = self.findSiteByDomain(domain)
            if site:
                robotLock.release()
                siteID = site[0]
                robot_content = site[2]
            else:
                # retrieve robots.txt content
                try:
                    r = requests.get(parse.urljoin(base_url, 'robots.txt'))
                    robot_content = None

                    # if it exists, save it
                    if r.status_code == requests.codes.ok:
                        robot_content = r.text
                except(requests.exceptions.MissingSchema, requests.exceptions.ConnectionError, requests.exceptions.InvalidURL, requests.exceptions.InvalidSchema):
                    robot_content = None

                # wait some time
                time.sleep(MINOR_TIMEOUT)

                # get sitemap.xml
                try:
                    s = requests.get(parse.urljoin(base_url, 'sitemap.xml'))
                    sitemap_content = None

                    # if it exists save it
                    if s.status_code == requests.codes.ok:
                        sitemap_content = s.text
                except(requests.exceptions.MissingSchema, requests.exceptions.ConnectionError, requests.exceptions.InvalidURL, requests.exceptions.InvalidSchema):
                    sitemap_content = None

                # wait some time
                time.sleep(MINOR_TIMEOUT)

                # save site
                siteID = self.insertSite(domain, robot_content, sitemap_content)
                robotLock.release()

            # create robot file parser object
            robot = robotexclusionrulesparser.RobotExclusionRulesParser()
            if robot_content:
                robot.parse(robot_content)

            # check if current url is allowed by robots.txt
            duplicatesLock.acquire()
            if not robot.is_allowed(USER_AGENT, url):
                pageID = self.findPageByUrl(url)
                self.deleteLinkByID(pageID)
                self.deletePageByUrl(url)
                duplicatesLock.release()
                continue

            duplicatesLock.release()

            # download content from url
            try:
                self.webDriver.get(url)
                time.sleep(TIMEOUT)
            except TimeoutException:
                # save timeout (no response object is available here)
                pageID = self.findPageByUrl(url)
                if pageID:
                    # page already saved
                    self.updatePage(pageID, siteID, PAGE_TIMEOUT, None, None, datetime.now())
                else:
                    # save new page
                    pageID = self.insertPage(siteID, PAGE_TIMEOUT, url, None, None, datetime.now())

                # continue to next url in frontier
                del self.webDriver.requests
                print(f"Worker {self.threadID}: {url} done...")
                continue

            # retrieve request that loaded page
            req = None
            for request in self.webDriver.requests:
                if request.response and request.response.status_code >= 300 and request.response.status_code <= 399:
                    continue

                if request.response and request.path == url:
                    req = request
                    break

                if request.response and request.response.status_code == requests.codes.ok:
                    req = request
                    break

            if req is None:
                for request in self.webDriver.requests:
                    if request.response:
                        if request.response.status_code == 403 or request.response.status_code == 503:
                            req = request
                            break

                if not req:
                    req = self.webDriver.last_request

            # check page type and save page info
            pageID = self.findPageByUrl(url)
            if req and req.response:
                content_type = req.response.headers.get('Content-Type')
                if content_type:
                    if "text/html" in content_type:
                        # HTML page

                        # check for canonical link
                        try:
                            canonicalLink = self.webDriver.find_element_by_xpath("//link[@rel='canonical']")
                            if canonicalLink:
                                link = canonicalLink.get_attribute('href')

                                if link != url:
                                    # is duplicate
                                    duplicatesLock.acquire()

                                    # check if original page already saved
                                    originalPageID = self.findPageByUrl(link)
                                    if originalPageID:
                                        duplicatesLock.release()

                                        if pageID:
                                            # page already saved
                                            self.updatePage(pageID, None, DUPLICATE, None, None, datetime.now())
                                        else:
                                            # save new page and remember id
                                            pageID = self.insertPage(None, DUPLICATE, None, None, None, datetime.now())

                                        # add link to original page
                                        self.insertLink(pageID, originalPageID)

                                        # continue to next url in frontier
                                        del self.webDriver.requests
                                        print(f"Worker {self.threadID}: {url} done...")
                                        continue
                                    else:
                                        # create blank page
                                        originalPageID = self.insertPage(None, FRONTIER, link, None, None, None)
                                        duplicatesLock.release()

                                        if pageID:
                                            # page already saved
                                            self.updatePage(pageID, None, DUPLICATE, None, None, datetime.now())
                                        else:
                                            # save new page and remember id
                                            pageID = self.insertPage(None, DUPLICATE, None, None, None, datetime.now())

                                        # add link to original page
                                        self.insertLink(pageID, originalPageID)

                                        # add url to frontier
                                        frontier.put(link)

                                        # continue to next url in frontier
                                        del self.webDriver.requests
                                        print(f"Worker {self.threadID}: {url} done...")
                                        continue
                        except(NoSuchElementException, StaleElementReferenceException):
                            pass

                        # check for duplicate content
                        originalPageID = self.findPageByContent(self.webDriver.page_source)
                        if originalPageID:
                            # is duplicate
                            if pageID:
                                # page already saved
                                self.updatePage(pageID, None, DUPLICATE, None, None, datetime.now())
                            else:
                                # save new page and remember id
                                pageID = self.insertPage(None, DUPLICATE, None, None, None, datetime.now())

                            # add link to original page
                            self.insertLink(pageID, originalPageID)

                            # continue to next url in frontier
                            del self.webDriver.requests
                            print(f"Worker {self.threadID}: {url} done...")
                            continue

                        # not duplicate
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, FRONTIER_HTML, self.webDriver.page_source, req.response.status_code, datetime.now())
                        else:
                            # save new page and remember id
                            pageID = self.insertPage(siteID, FRONTIER_HTML, url, self.webDriver.page_source, req.response.status_code, datetime.now())

                        # let through only pages that loaded successfully
                        if req.response.status_code != requests.codes.ok:
                            del self.webDriver.requests
                            print(f"Worker {self.threadID}: {url} done...")
                            continue
                    elif "text/plain" in content_type:
                        # TXT content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, TXT)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "application/pdf" in content_type:
                        # PDF content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, PDF)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "application/msword" in content_type:
                        # DOC content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, DOC)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "application/vnd.openxmlformats-officedocument.wordprocessingml.document" in content_type:
                        # DOCX content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, DOCX)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "application/vnd.ms-powerpoint" in content_type:
                        # PPT content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, PPT)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "application/vnd.openxmlformats-officedocument.presentationml.presentation" in content_type:
                        # PPTX content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, PPTX)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "image" in content_type:
                        # IMAGE content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # parse file name
                        filename = urlparse(url)

                        # insert image data
                        self.insertImage(pageID, os.path.basename(filename.path), content_type, datetime.now())

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "text/css" in content_type:
                        # CSS content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, CSS)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "text/csv" in content_type:
                        # CSV content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, CSV)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "application/zip" in content_type:
                        # ZIP content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, ZIP)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    else:
                        # unknown BINARY content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, UNKNOWN)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                else:
                    # no content header -> mark page as UNDEFINED
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, UNDEFINED, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, UNDEFINED, url, None, req.response.status_code, datetime.now())

                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
            else:
                # some kind of error happened
                if pageID:
                    # page already saved
                    self.updatePage(pageID, siteID, NO_RESPONSE, None, None, datetime.now())
                else:
                    # save new page
                    pageID = self.insertPage(siteID, NO_RESPONSE, url, None, None, datetime.now())

                # continue to next url in frontier
                del self.webDriver.requests
                print(f"Worker {self.threadID}: {url} done...")
                continue

            # only if page is of HTML type
            # extract links

            # href
            elements = self.webDriver.find_elements_by_xpath("//*[@href]")
            for element in elements:
                try:
                    link = element.get_attribute('href')

                    # check if url allowed by robots.txt and if is from .gov.si
                    if self.isGov(link) and robot.is_allowed(USER_AGENT, link):
                        # canonicalize url
                        link = str(urlcanon.whatwg(urlcanon.parse_url(link)))

                        # add url to frontier
                        self.addUrlToFrontier(pageID, link)
                except(NoSuchElementException, StaleElementReferenceException):
                    continue

            # onclick
            elements = self.webDriver.find_elements_by_xpath("//*[@onclick]")
            for element in elements:
                try:
                    line = element.get_attribute('onclick')
                    if line:
                        link = ""
                        if "location.href='" in line:
                            rightLine = line.split("location.href='")[1]
                            link = rightLine.split("'")[0]
                        elif "document.location='" in line:
                            rightLine = line.split("document.location='")[1]
                            link = rightLine.split("'")[0]

                        if link != "":
                            # check if url allowed by robots.txt and if is from .gov.si
                            if self.isGov(link) and robot.is_allowed(USER_AGENT, link):
                                # canonicalize url
                                link = str(urlcanon.whatwg(urlcanon.parse_url(link)))

                                # add url to frontier
                                self.addUrlToFrontier(pageID, link)
                except(NoSuchElementException, StaleElementReferenceException):
                    continue

            # extract images
            elements = self.webDriver.find_elements_by_tag_name('img')
            for element in elements:
                try:
                    link = element.get_attribute('src')

                    # check if url allowed by robots.txt, if is from .gov.si and if src attribute has URL
                    if self.isGov(link) and robot.is_allowed(USER_AGENT, link) and re.match(self.urlValidator, link):
                        link = str(urlcanon.whatwg(urlcanon.parse_url(link)))

                        self.addUrlToFrontier(pageID, link)
                except(NoSuchElementException, StaleElementReferenceException):
                    continue

            del self.webDriver.requests
            print(f"Worker {self.threadID}: {url} done...")

        self.conn.close()
        self.webDriver.quit()
        print(f"Worker {self.threadID}: finished crawling.")
Example #17
def clean_url(s: str) -> str:
    s = s.strip()
    parsed = urlcanon.parse_url(s)
    if not parsed.port and parsed.colon_before_port:
        parsed.colon_before_port = b""
    return str(urlcanon.whatwg(parsed))
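A quick illustration of how clean_url might behave; the input is made up and the output is what I would expect from the code above, not a verified result:

print(clean_url("  https://Example.COM:/path  "))
# expected: https://example.com/path -- the dangling colon before the empty
# port is dropped, then WHATWG canonicalization lowercases the host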
Example #18
def test_supplemental_whatwg(uncanonicalized, canonicalized):
    url = urlcanon.parse_url(uncanonicalized)
    urlcanon.whatwg(url)
    assert url.__bytes__() == canonicalized
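The test relies on whatwg() mutating the parsed URL in place. A small sketch of that flow, assuming the usual WHATWG normalizations (lowercased scheme and host, default port elided, path dot-segments resolved):

import urlcanon

url = urlcanon.parse_url("HTTP://ExAmPlE.com:80/a/./b/../c")
urlcanon.whatwg(url)    # canonicalizes in place
print(url.__bytes__())  # expected: b'http://example.com/a/c'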
Example #19
def parse_record(path, node_id, edge_id, process_record, max_identifier_length,
                 dt14):
    with open(path, "rb") as infile:
        # loop on every record in WAT
        for record in ArchiveIterator(infile):
            record_array = []

            if record.rec_type != 'metadata':
                continue

            warc_target_uri = urlcanon.parse_url(
                record.rec_headers.get_header('WARC-Target-URI'))
            urlcanon.whatwg(warc_target_uri)  # canonicalization

            # select only members whose WARC-Target-URI begins with "https?://"
            if not re.search("^https?://", str(warc_target_uri)) or len(
                    str(warc_target_uri)) > max_identifier_length:
                continue

            dt = record.rec_headers.get_header('WARC-Date')

            if dt14:
                dt = dp.parse(dt).strftime('%Y%m%d%H%M%S')

            # construct node with timestamp (VersionNode)
            version_node = {
                "an": {
                    node_id: {
                        "identifier": str(warc_target_uri.ssurt(), encoding='utf-8'),
                        "timestamp": dt,
                        "TYPE": "VersionNode"
                    }
                }
            }

            record_array.append(json.dumps(version_node))
            record_array.append('\r\n')

            source_id = node_id
            node_id += 1

            content = json.loads(record.raw_stream.read().decode('utf-8'))

            try:
                links = content["Envelope"]["Payload-Metadata"][
                    "HTTP-Response-Metadata"]["HTML-Metadata"]["Links"]
            except (KeyError, TypeError):
                links = ''

            # loop on links if not empty and get all urls
            if links != '':
                for link in links:
                    # this is for empty outlink elements, maybe a bug in webarchive-commons used to generate WAT
                    try:
                        # convert relative outlink to absolute one
                        url = urljoin(str(warc_target_uri), link["url"])
                        url = str(urlcanon.whatwg(url))  # canonicalization

                        # match only urls that begin with "https?://"
                        if not re.search("^https?://", url) or len(
                                str(url)) > max_identifier_length:
                            continue

                        # construct node and edge
                        node = {
                            "an": {
                                node_id: {
                                    "identifier": str(
                                        urlcanon.parse_url(url).ssurt(),
                                        encoding="utf-8"),
                                    "TYPE": "Node"
                                }
                            }
                        }

                        edge = {
                            "ae": {
                                edge_id: {
                                    "directed": "true",
                                    "source": str(source_id),
                                    "target": str(node_id)
                                }
                            }
                        }

                        record_array.append(json.dumps(node))
                        record_array.append('\r\n')
                        record_array.append(json.dumps(edge))
                        record_array.append('\r\n')

                        node_id += 1
                        edge_id += 1
                    except Exception:
                        continue

            same_batch = process_record(record_array, node_id, edge_id)

            if not same_batch:
                node_id = edge_id = 1
Example #20
def canon(s: str) -> str:
    parsed = urlcanon.parse_url(s)
    return str(urlcanon.whatwg(parsed))
Example #21
    def gather_links(self):

        # Define Browser Options

        soup = BeautifulSoup(self.current_page_html, "lxml")

        # Extract links to profiles from TWDS Authors
        links = set()
        images = set()
        for link in soup.find_all("a"):
            current_url_relative = link.get('href')

            current_url = urllib.parse.urljoin(self.site_currently_crawling[1],
                                               current_url_relative)

            current_parsed_url_urlcanon = urlcanon.parse_url(current_url)
            urlcanon.whatwg(current_parsed_url_urlcanon)

            current_parsed_url = urllib.parse.urlparse(current_url)

            if (current_parsed_url.scheme != "http"
                    and current_parsed_url.scheme != "https"):
                continue

            #print("uglyurl: ", current_url, "CANON: ", current_parsed_url_urlcanon, "current_parsed_url: ", current_parsed_url)

            # print("DOMAIN", self.site_currently_crawling[1])
            # print("     URL------->", current_url, current_parsed_url.geturl())

            links.add(current_parsed_url)

        onclicks = soup.find_all(attrs={'onclick': True})

        if len(onclicks) > 0:
            for onclick in onclicks:
                try:
                    # work on the onclick attribute value, not the Tag itself
                    onclick_value = onclick.get("onclick") or ""
                    x = onclick_value.find("location=")
                    if x < 0:
                        continue
                    onclick_split = onclick_value.split(onclick_value[x + 9])
                    for index, string in enumerate(onclick_split):
                        if "location=" in string:
                            loc = onclick_split[index + 1]
                            current_url = urllib.parse.urljoin(
                                self.site_currently_crawling[1], loc)
                            current_parsed_url_urlcanon = urlcanon.parse_url(
                                current_url)
                            urlcanon.whatwg(current_parsed_url_urlcanon)
                            current_parsed_url = urllib.parse.urlparse(
                                current_url)
                            links.add(current_parsed_url)
                            break
                except Exception:
                    continue

        for image in soup.find_all("img"):
            current_url_relative = image.get('src')

            current_url = urllib.parse.urljoin(self.site_currently_crawling[1],
                                               current_url_relative)

            current_parsed_url = urllib.parse.urlparse(current_url)

            images.add(current_parsed_url)

        # print(images)

        for image in images:
            fullurl = urllib.parse.urljoin(self.site_currently_crawling[1],
                                           image.geturl())
            fullurl = urllib.parse.urlparse(fullurl)

            try:
                res = requests.get(fullurl.geturl())
            except Exception:
                continue

            content_type = res.headers['content-type']
            content = res.content
            url = image.geturl()
            path = urllib.parse.urlparse(url).path
            filename = os.path.basename(path)

            db.insert_image(self.page_currently_crawling[0], filename,
                            content_type, content, int(time.time()))

        return list(links)
Example #22
    def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
        decisions = {"accepted": set(), "blocked": set(), "rejected": set()}
        counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0}
        for url in outlinks or []:
            url_for_scoping = urlcanon.semantic(url)
            url_for_crawling = urlcanon.whatwg(url)
            hashtag = (url_for_crawling.hash_sign +
                       url_for_crawling.fragment).decode('utf-8')
            urlcanon.canon.remove_fragment(url_for_crawling)
            if site.is_in_scope(url_for_scoping, parent_page=parent_page):
                if brozzler.is_permitted_by_robots(site,
                                                   str(url_for_crawling)):
                    if not url_for_scoping.surt().startswith(
                            site.scope["surt"].encode("utf-8")):
                        hops_off_surt = parent_page.hops_off_surt + 1
                    else:
                        hops_off_surt = 0
                    new_child_page = brozzler.Page(
                        self.rr, {
                            'url': str(url_for_crawling),
                            'site_id': site.id,
                            'job_id': site.job_id,
                            'hops_from_seed': parent_page.hops_from_seed + 1,
                            'via_page_id': parent_page.id,
                            'hops_off_surt': hops_off_surt
                        })
                    existing_child_page = brozzler.Page.load(
                        self.rr, new_child_page.id)
                    if existing_child_page:
                        existing_child_page.priority += new_child_page.priority
                        if hashtag and existing_child_page.hashtags:
                            hashtags = set(existing_child_page.hashtags)
                            hashtags.add(hashtag)
                            existing_child_page.hashtags = list(hashtags)
                        elif hashtag:
                            existing_child_page.hashtags = [hashtag]
                        existing_child_page.save()
                        counts["updated"] += 1
                    else:
                        if hashtag:
                            new_child_page.hashtags = [
                                hashtag,
                            ]
                        new_child_page.save()
                        counts["added"] += 1
                    decisions["accepted"].add(str(url_for_crawling))
                else:
                    counts["blocked"] += 1
                    decisions["blocked"].add(str(url_for_crawling))
            else:
                counts["rejected"] += 1
                decisions["rejected"].add(str(url_for_crawling))

        parent_page.outlinks = {}
        for k in decisions:
            parent_page.outlinks[k] = list(decisions[k])
        parent_page.save()

        self.logger.info(
            "%s new links added, %s existing links updated, %s links "
            "rejected, %s links blocked by robots from %s", counts["added"],
            counts["updated"], counts["rejected"], counts["blocked"],
            parent_page)
Example #23
    def main(self):
        # The page contains HTML, lets scrape it --------------------------------------------------
        firefox_options = FirefoxOptions()

        # Adding a specific user agent
        firefox_options.add_argument("user-agent=fri-ieps-kslk")
        firefox_options.add_argument("--headless")

        print(f"[PageHandler] Retrieving web page URL '{self.page_url}'")
        self.driver = webdriver.Firefox(
            options=firefox_options,
            executable_path=Config.WEB_DRIVER_LOCATION_GECKO)
        self.driver.set_page_load_timeout(10)

        self.driver.get(self.page_url)

        # Timeout needed for Web page to render (read more about it)
        time.sleep(Config.RENDERING_TIMEOUT)

        self.html_content = self.driver.page_source

        # Checking for duplicates ------------------------------------------------------------------
        self.hashed_content = hashlib.md5(
            self.html_content.encode("utf-8")).hexdigest()

        is_duplicate = self.session.query(Page).filter(
            Page.content_hash == self.hashed_content).first()
        if is_duplicate:
            self.page_db.page_type_code = "DUPLICATE"
            self.page_db.http_status_code = self.status_code
            self.page_db.site_id = self.site_id
            self.page_db.url = self.page_url
            self.page_db.accessed_time = getTimestamp()
            self.page_db.content_hash = self.hashed_content
            self.session.commit()
            self.session.close()
            self.driver.quit()
            return

        # The page is valid html and its not a duplicate, now we extract all the links on the page ---
        links = []

        # First, we extract the links with tag name "a"
        elems = self.driver.find_elements_by_tag_name("a")
        for elem in elems:
            href = elem.get_attribute('href')
            if href is None:
                continue
            if href.startswith("/"):
                links.append(self.base_url + href)
            elif href is not None and ("http" in href or "https" in href):
                links.append(href)

        # We also extract links from the onclick sections
        onclicks = self.driver.find_elements_by_xpath("//*[@onclick]")
        for el in onclicks:
            temp = el.get_attribute("onclick")
            if "location.href=" in temp:
                temp = temp.replace("location.href=", "")\
                    .replace("\'", "")\
                    .replace("\"", "")
                links.append(temp)

        # Remove the links that point outside of gov.si
        links_truncated = []
        for el in links:
            if "gov.si/" in el:
                links_truncated.append(el)

        links = links_truncated

        # Put the links in the canonical form
        links_canonical = []
        for el in links:
            parsed_link = urlcanon.parse_url(el)
            urlcanon.whatwg(parsed_link)
            links_canonical.append(str(parsed_link))

        links = links_canonical

        # Save the links to the DB -----------------------------------------------------------------
        for link in links:
            # Check if link is already in the DB
            is_duplicate = self.session.query(Page).filter(
                Page.url == link).first()
            if is_duplicate is None:
                extracted_domain_name = get_domain_name_from_url(link)

                page = Page()
                page.site_id = self.get_site_id_for_page(extracted_domain_name)

                # Pages with status == None have yet to be visited
                page.status = None
                page.page_type_code = "FRONTIER"
                page.url = link
                self.session.add(page)
                self.session.commit()

                # Also add a Link to the DB
                link_ = Link()
                link_.from_page = self.page_id
                link_.to_page = self.session.query(Page).filter(
                    Page.url == link).first().id
                self.session.add(link_)
                self.session.commit()
            #else:
            #    print(f"Page {link} is already in the DB")

        # Finding and storing the images on the page --------------------------------------------------
        imgs = self.driver.find_elements_by_tag_name("img")
        for elem in imgs:
            src = elem.get_attribute("src")
            url = ""
            if src is None:
                continue
            if src.startswith("/"):
                url = self.base_url + src
            elif src is not None and ("http" in src or "https" in src):
                url = src
            if url != "" and len(url) <= 255:
                # Save the image
                image = Image()
                image.page_id = self.page_id
                image.filename = url
                image.content_type = "BINARY"
                image.accessed_time = getTimestamp()
                self.session.add(image)
                self.session.commit()

        # With all the data scraped, we can save the page to the DB -------------------------------------
        self.page_db.html_content = self.html_content
        self.page_db.accessed_time = getTimestamp()
        self.page_db.content_hash = self.hashed_content
        self.page_db.http_status_code = self.status_code
        self.page_db.site_id = self.site_id
        self.page_db.page_type_code = "HTML"
        self.page_db.url = self.page_url
        self.session.commit()

        # Let's be responsible and close the session and the driver
        self.session.close()
        self.driver.quit()