示例#1
0
    def crawl(self):
        """
        Crawl the URL set up in the crawler.

        This is the main entry point, and will block while it runs.

        The page at ``self.url`` is fetched and every ``<a href=...>`` found
        in it is requested once via ``self.get``, pausing ``self.delay``
        seconds after each request. Links matched by ``should_ignore`` are
        reported on stdout and skipped without a pause.
        """
        # Function-scope import so the method carries its own dependency;
        # the fallback covers Python 2, where urljoin lives in ``urlparse``.
        try:
            from urllib.parse import urljoin
        except ImportError:  # Python 2
            from urlparse import urljoin

        html = self.get(self.url)
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup.findAll('a', href=True):
            link = tag['href']
            if urlparse(link).scheme:
                # Already absolute (http:, https:, mailto:, ...): use as is.
                to_get = link
            else:
                # Resolve against the page URL the way a browser would.
                # Plain concatenation breaks root-relative ("/x"),
                # protocol-relative ("//host/x") and "../x" style links.
                to_get = urljoin(self.url, link)
            if should_ignore(self.ignore, to_get):
                print('Ignoring URL: {url}'.format(url=to_get))
                continue
            self.get(to_get)
            time.sleep(self.delay)
示例#2
0
    def crawl(self):
        """
        Crawl the URL set up in the crawler.

        This is the main entry point, and will block while it runs.

        The page at ``self.url`` is fetched and every ``<a href=...>`` found
        in it is requested once via ``self.get``, pausing ``self.delay``
        seconds after each request. Links matched by ``should_ignore`` are
        reported on stdout and skipped without a pause.
        """
        # Function-scope import so the method carries its own dependency;
        # the fallback covers Python 2, where urljoin lives in ``urlparse``.
        try:
            from urllib.parse import urljoin
        except ImportError:  # Python 2
            from urlparse import urljoin

        html = self.get(self.url)
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup.findAll('a', href=True):
            link = tag['href']
            if urlparse(link).scheme:
                # Already absolute (http:, https:, mailto:, ...): use as is.
                to_get = link
            else:
                # Resolve against the page URL the way a browser would.
                # Plain concatenation breaks root-relative ("/x"),
                # protocol-relative ("//host/x") and "../x" style links.
                to_get = urljoin(self.url, link)
            if should_ignore(self.ignore, to_get):
                print('Ignoring URL: {url}'.format(url=to_get))
                continue
            self.get(to_get)
            time.sleep(self.delay)
示例#3
0
def client_pushed():
    """Handle a scan request pushed by a client and queue the matching scan.

    The payload is read from the JSON body when the request's media type is
    ``application/json`` and from the submitted form otherwise. It is then
    dispatched on its shape:

    * ``eventType`` or ``EventType`` equal to ``Test``: logged only.
    * ``eventType`` equal to ``Manual``: ``filepath`` is mapped, checked
      against the ignore list and queued; an HTML result page is returned.
    * Sonarr, Radarr and Lidarr webhooks (rename and download/upgrade
      payloads): the reported path is mapped and queued.

    Returns:
        ``"OK"`` for test and webhook requests, an explanatory string when a
        manual request is ignored, or an HTML page for other manual requests.

    Aborts with HTTP 400 when the payload is empty, is not a dict, is a
    manual request without ``filepath``, or matches no known shape.
    """
    # ``mimetype`` is the content type without its parameters, so a header
    # such as "application/json; charset=utf-8" is still parsed as JSON.
    if request.mimetype == 'application/json':
        data = request.get_json(silent=True)
    else:
        data = request.form.to_dict()

    # Valid JSON that is not an object (list, string, number) cannot carry
    # the keys inspected below, so reject it together with empty payloads.
    if not data or not isinstance(data, dict):
        logger.error("Invalid scan request from: %r", request.remote_addr)
        abort(400)
    logger.debug("Client %r request dump:\n%s", request.remote_addr,
                 json.dumps(data, indent=4, sort_keys=True))

    event_type = data.get('eventType')

    if event_type == 'Test' or data.get('EventType') == 'Test':
        logger.info("Client %r made a test request, event: '%s'",
                    request.remote_addr, 'Test')
        return "OK"

    if event_type == 'Manual':
        filepath = data.get('filepath')
        if not filepath:
            # Previously surfaced as a KeyError (HTTP 500).
            logger.error("Manual scan request without 'filepath' from: %r",
                         request.remote_addr)
            abort(400)
        logger.info("Client %r made a manual scan request for: '%s'",
                    request.remote_addr, filepath)
        final_path = utils.map_pushed_path(conf.configs, filepath)
        # ignore this request?
        ignore, ignore_match = utils.should_ignore(final_path, conf.configs)
        if ignore:
            logger.info(
                "Ignored scan request for '%s' because '%s' was matched from SERVER_IGNORE_LIST",
                final_path, ignore_match)
            return "Ignoring scan request because %s was matched from your SERVER_IGNORE_LIST" % ignore_match
        if start_scan(final_path, 'Manual', 'Manual'):
            return _render_manual_scan_page(
                'Success', 'alert-info', final_path,
                'was added to scan queue.')
        # The error page echoes the path exactly as the client submitted it.
        return _render_manual_scan_page(
            'Error', 'alert-danger', filepath,
            'has already been added to the scan queue.')

    target = _resolve_webhook_target(data)
    if target is None:
        logger.error("Unknown scan request from: %r", request.remote_addr)
        abort(400)

    path, source, media = target
    # Computed once; it is both logged and passed to the scanner.
    event = "Upgrade" if data.get('isUpgrade') else event_type
    logger.info("Client %r scan request for %s: '%s', event: '%s'",
                request.remote_addr, media, path, event)
    final_path = utils.map_pushed_path(conf.configs, path)
    start_scan(final_path, source, event)
    return "OK"


def _nested_value(data, section, key):
    """Return ``data[section][key]``, or None when it is not available.

    None is also returned when ``data[section]`` is not a dict (for example
    a plain string value), so callers never index into a non-mapping.
    """
    inner = data.get(section)
    if isinstance(inner, dict):
        return inner.get(key)
    return None


def _resolve_webhook_target(data):
    """Map a webhook payload to ``(path, source, media)``, or None if unknown.

    ``source`` is the scanner label ('Sonarr', 'Radarr' or 'Lidarr') and
    ``media`` the noun used in the log line. Rename payloads are matched
    before download/upgrade payloads, Radarr downloads before Sonarr ones.
    """
    if 'eventType' not in data:
        return None

    series_path = _nested_value(data, 'series', 'path')
    movie_folder = _nested_value(data, 'movie', 'folderPath')

    if data['eventType'] == 'Rename':
        if series_path is not None:
            # sonarr Rename webhook: scan the whole series folder
            return series_path, 'Sonarr', 'series'
        if movie_folder is not None:
            # radarr Rename webhook: scan the whole movie folder
            return movie_folder, 'Radarr', 'movie'

    movie_file = _nested_value(data, 'movieFile', 'relativePath')
    if movie_folder is not None and movie_file is not None:
        # radarr download/upgrade webhook
        return os.path.join(movie_folder, movie_file), 'Radarr', 'movie'

    episode_file = _nested_value(data, 'episodeFile', 'relativePath')
    if series_path is not None and episode_file is not None:
        # sonarr download/upgrade webhook
        return os.path.join(series_path, episode_file), 'Sonarr', 'series'

    artist_path = _nested_value(data, 'artist', 'path')
    track_file = _nested_value(data, 'trackFile', 'relativePath')
    if artist_path is not None and track_file is not None:
        # lidarr download/upgrade webhook
        return os.path.join(artist_path, track_file), 'Lidarr', 'album track'

    return None


def _render_manual_scan_page(heading, alert_class, path, message):
    """Return the HTML result page for a manual scan request.

    ``path`` comes from the client, so it is HTML-escaped before being
    embedded; ``heading``, ``alert_class`` and ``message`` are trusted
    literals supplied by the caller.
    """
    try:
        from html import escape
    except ImportError:  # Python 2
        from cgi import escape

    # Tabs are written as \t escapes so the emitted markup keeps the same
    # whitespace without putting literal tab characters in the source.
    template = """<!DOCTYPE html>
            <html lang="en">
            <head>
            \t<title>Plex Autoscan</title>
            \t<meta charset="utf-8">
            \t<link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/css/bootstrap.min.css" rel="stylesheet">
            </head>
            <body>
            \t<div class="container">
            \t\t<div class="row justify-content-md-center">
            \t\t\t<div class="col-md-auto text-center" style="padding-top: 10px;">
            \t\t\t\t<h1 style="margin: 10px; margin-bottom: 150px;">Plex Autoscan</h1>
            \t\t\t\t<h3 class="text-left" style="margin: 10px;">{heading}</h3>
            \t\t\t\t<div class="alert {alert_class}" role="alert">
            \t\t\t\t\t<code style="color: #000;">'{path}'</code> {message}
            \t\t\t\t</div>
            \t\t\t</div>
            \t\t</div>
            \t</div>
            </body>
            </html>"""
    # quote=False: the value lands in element text, where only &, < and >
    # are significant, so apostrophes in file names are left readable.
    safe_path = escape('%s' % (path,), False)
    return template.format(heading=heading, alert_class=alert_class,
                           path=safe_path, message=message)
示例#4
0
def client_pushed():
    """Handle a scan request pushed by a client and queue the matching scan.

    The payload is read from the JSON body when the request's media type is
    ``application/json`` and from the submitted form otherwise. Supported
    payloads are a test event, a manual request carrying ``filepath``, and
    Sonarr / Radarr / Lidarr rename or download/upgrade webhooks.

    Returns:
        ``"OK"`` for test and webhook requests, or a short status message
        for manual requests (ignored, added, or failed to add).

    Aborts with HTTP 400 when the payload is empty, is not a dict, is a
    manual request without ``filepath``, or matches no known shape.
    """
    try:
        from html import escape
    except ImportError:  # Python 2
        from cgi import escape

    # ``mimetype`` is the content type without its parameters, so a header
    # such as "application/json; charset=utf-8" is still parsed as JSON.
    if request.mimetype == 'application/json':
        data = request.get_json(silent=True)
    else:
        data = request.form.to_dict()

    # Valid JSON that is not an object (list, string, number) cannot carry
    # the keys inspected below, so reject it together with empty payloads.
    if not data or not isinstance(data, dict):
        logger.error("Invalid scan request from: %r", request.remote_addr)
        abort(400)
    logger.debug("Client %r request dump:\n%s", request.remote_addr,
                 json.dumps(data, indent=4, sort_keys=True))

    def field(section, key):
        """Return data[section][key], or None if absent or not nested."""
        inner = data.get(section)
        return inner.get(key) if isinstance(inner, dict) else None

    event_type = data.get('eventType')

    if event_type == 'Test' or data.get('EventType') == 'Test':
        logger.info("Client %r made a test request, event: '%s'",
                    request.remote_addr, 'Test')
        return "OK"

    if event_type == 'Manual':
        filepath = data.get('filepath')
        if not filepath:
            # Previously surfaced as a KeyError (HTTP 500).
            logger.error("Manual scan request without 'filepath' from: %r",
                         request.remote_addr)
            abort(400)
        logger.info("Client %r made a manual scan request for: '%s'",
                    request.remote_addr, filepath)
        final_path = utils.map_pushed_path(conf.configs, filepath)
        # ignore this request?
        ignore, ignore_match = utils.should_ignore(final_path, conf.configs)
        if ignore:
            logger.info(
                "Ignored scan request for '%s' because '%s' was matched from SERVER_IGNORE_LIST",
                final_path, ignore_match)
            return "Ignoring scan request because %s was matched from your SERVER_IGNORE_LIST" % ignore_match
        # A string returned from a Flask view is served as text/html by
        # default, so the client-supplied path is escaped before being
        # echoed. quote=False leaves apostrophes in file names untouched.
        if start_scan(final_path, 'Manual', 'Manual'):
            return "'%s' was added to scan backlog." % escape(
                '%s' % (final_path,), False)
        return "Error adding '%s' to scan backlog." % escape(
            '%s' % (filepath,), False)

    # Work out which webhook this is: (path to scan, scanner label, log noun).
    series_path = field('series', 'path')
    movie_folder = field('movie', 'folderPath')
    movie_file = field('movieFile', 'relativePath')
    episode_file = field('episodeFile', 'relativePath')
    artist_path = field('artist', 'path')
    track_file = field('trackFile', 'relativePath')
    renamed = event_type == 'Rename'

    if 'eventType' not in data:
        target = None
    elif renamed and series_path is not None:
        # sonarr Rename webhook
        target = (series_path, 'Sonarr', 'series')
    elif renamed and movie_folder is not None:
        # radarr Rename webhook
        target = (movie_folder, 'Radarr', 'movie')
    elif movie_folder is not None and movie_file is not None:
        # radarr download/upgrade webhook
        target = (os.path.join(movie_folder, movie_file), 'Radarr', 'movie')
    elif series_path is not None and episode_file is not None:
        # sonarr download/upgrade webhook
        target = (os.path.join(series_path, episode_file), 'Sonarr', 'series')
    elif artist_path is not None and track_file is not None:
        # lidarr download/upgrade webhook
        target = (os.path.join(artist_path, track_file), 'Lidarr',
                  'album track')
    else:
        target = None

    if target is None:
        logger.error("Unknown scan request from: %r", request.remote_addr)
        abort(400)

    path, source, media = target
    # Computed once; it is both logged and passed to the scanner.
    event = "Upgrade" if data.get('isUpgrade') else event_type
    logger.info("Client %r scan request for %s: '%s', event: '%s'",
                request.remote_addr, media, path, event)
    final_path = utils.map_pushed_path(conf.configs, path)
    start_scan(final_path, source, event)
    return "OK"
示例#5
0
def test_should_ignore():
    """Check ``utils.should_ignore`` with no, one and several patterns."""
    filename = "somefile.abc"
    no_patterns = []

    # An empty pattern list never matches.
    assert not utils.should_ignore(filename, no_patterns), "No match"

    # A single glob that fits the file name matches.
    assert utils.should_ignore(filename, ["*.abc"]), "Match simple"

    # The empty list still does not match after the previous calls.
    assert not utils.should_ignore(filename, no_patterns)

    # A match on a later pattern is enough.
    assert utils.should_ignore(filename, ["**/*.py", "*.abc"]), "Match second"
示例#6
0
    def get_all_third_party_responses_by_site(self, top_url, lazy=False):
        """Return a dictionary containing third party data loaded on given top_url.

        Args:
            top_url: Site address without a scheme; ``http://`` is prepended
                before it is used in the query.
            lazy: When true, the blocklist lookups are skipped and the
                per-URL entries carry no ``is_tracker`` key.

        Returns:
            A dict mapping each response URL whose domain differs from the
            site's domain to a dict with the keys ``url_domain``, ``is_js``,
            ``is_img``, ``organization_name`` and, unless ``lazy``,
            ``is_tracker``. An empty dict is returned when no rows qualify.
            ``None`` is returned when the domain of ``top_url`` cannot be
            determined.
        """
        top_url = 'http://' + top_url
        tp_query = "SELECT r.url, h.value FROM http_responses_view AS r " \
                   "LEFT JOIN http_response_headers_view as h ON h.response_id = r.id " \
                   " WHERE r.top_url LIKE %s AND " \
                   "url not LIKE %s and h.name = 'Content-Type'"

        try:
            top_ps = utils.get_domain(top_url)
        except AttributeError:
            print("Error while finding public suffix of %s" % top_url)
            return None

        cur = self.connection.cursor()
        cur.itersize = 100000
        try:
            cur.execute(tp_query, (top_url, top_ps))
        except Exception:
            # Retry once on a fresh connection. ``Exception`` rather than a
            # bare ``except`` so KeyboardInterrupt and SystemExit propagate
            # instead of triggering a reconnect.
            self._reconnect()
            cur = self.connection.cursor()
            cur.itersize = 100000
            cur.execute(tp_query, (top_url, top_ps))

        response_data = {}
        try:
            for url, content_type in cur:
                if utils.should_ignore(url):
                    continue

                url_ps = utils.get_domain(url)
                if url_ps == top_ps:
                    # Same domain as the visited site: first party, skip.
                    continue

                url_data = {'url_domain': url_ps}

                is_js = utils.is_js(url, content_type)
                is_img = utils.is_img(url, content_type)
                if not lazy:
                    is_el_tracker = utils.is_tracker(url,
                                                     is_js=is_js,
                                                     is_img=is_img,
                                                     first_party=top_url,
                                                     blocklist='easylist')
                    is_ep_tracker = utils.is_tracker(url,
                                                     is_js=is_js,
                                                     is_img=is_img,
                                                     first_party=top_url,
                                                     blocklist='easyprivacy')
                    url_data['is_tracker'] = is_el_tracker or is_ep_tracker

                url_data['is_js'] = is_js
                url_data['is_img'] = is_img
                url_data['organization_name'] = utils.get_org(url)

                response_data[url] = url_data
        finally:
            # Release the cursor even if one of the helpers raises mid-scan.
            cur.close()
        return response_data