def crawl(self):
    """
    Crawl the URL set up in the crawler.

    This is the main entry point, and will block while it runs. The page at
    ``self.url`` is fetched with ``self.get`` and parsed; every ``<a>`` tag
    that carries an ``href`` is then turned into an absolute URL and fetched
    in turn. URLs for which ``should_ignore(self.ignore, url)`` is truthy are
    reported on stdout and skipped. ``time.sleep(self.delay)`` is called
    after each fetched link.
    """
    # Local import so this method does not rely on the module's import list.
    try:
        from urllib.parse import urljoin
    except ImportError:  # Python 2
        from urlparse import urljoin

    html = self.get(self.url)
    soup = BeautifulSoup(html, "html.parser")

    for tag in soup.findAll('a', href=True):
        link = tag['href']

        if urlparse(link).scheme:
            # Already absolute: use it untouched.
            to_get = link
        else:
            # Resolve against the base URL. Plain concatenation produced
            # malformed URLs for links such as "page", "../x" or "//host/x".
            to_get = urljoin(self.url, link)

        if should_ignore(self.ignore, to_get):
            print('Ignoring URL: {url}'.format(url=to_get))
            continue

        self.get(to_get)
        # NOTE(review): treated as per-link throttling (inside the loop);
        # confirm against the original indentation.
        time.sleep(self.delay)
def client_pushed():
    """
    Handle a scan request pushed by a client.

    The payload is read from the JSON body when the request's content type is
    exactly ``application/json`` and from the form fields otherwise. It is
    then dispatched on its keys:

    * ``eventType`` / ``EventType`` equal to ``'Test'``: logged only.
    * ``eventType`` equal to ``'Manual'``: ``filepath`` is mapped with
      ``utils.map_pushed_path``, checked with ``utils.should_ignore`` and
      queued with ``start_scan``. The reply is a plain message when the path
      is ignored, otherwise an HTML success or error page.
    * Sonarr / Radarr rename webhooks and Sonarr / Radarr / Lidarr
      download/upgrade webhooks: the path is taken from the payload, mapped
      and queued.

    Returns:
        ``"OK"`` for test and webhook requests, or the message/page described
        above for manual requests.

    Calls ``abort(400)`` when the payload is empty, matches none of the
    shapes above, or is a manual request without ``filepath``.
    """
    # Local import so this handler does not rely on the module's import list.
    try:
        from html import escape as html_escape
    except ImportError:  # Python 2 has no ``html`` module
        from cgi import escape as html_escape

    if request.content_type == 'application/json':
        data = request.get_json(silent=True)
    else:
        data = request.form.to_dict()

    if not data:
        logger.error("Invalid scan request from: %r", request.remote_addr)
        abort(400)
    logger.debug("Client %r request dump:\n%s", request.remote_addr,
                 json.dumps(data, indent=4, sort_keys=True))

    def scan_event():
        # A truthy 'isUpgrade' overrides whatever eventType the webhook sent.
        return "Upgrade" if ('isUpgrade' in data and data['isUpgrade']) else data['eventType']

    if ('eventType' in data and data['eventType'] == 'Test') or (
            'EventType' in data and data['EventType'] == 'Test'):
        logger.info("Client %r made a test request, event: '%s'", request.remote_addr, 'Test')

    elif 'eventType' in data and data['eventType'] == 'Manual':
        if 'filepath' not in data:
            # Reject instead of failing with a KeyError (HTTP 500) below.
            logger.error("Invalid scan request from: %r", request.remote_addr)
            abort(400)

        logger.info("Client %r made a manual scan request for: '%s'", request.remote_addr,
                    data['filepath'])
        final_path = utils.map_pushed_path(conf.configs, data['filepath'])

        # ignore this request?
        ignore, ignore_match = utils.should_ignore(final_path, conf.configs)
        if ignore:
            logger.info(
                "Ignored scan request for '%s' because '%s' was matched from SERVER_IGNORE_LIST",
                final_path, ignore_match)
            return "Ignoring scan request because %s was matched from your SERVER_IGNORE_LIST" % ignore_match

        # The path originates from the client and is echoed inside HTML, so
        # markup characters are escaped before formatting.
        if start_scan(final_path, 'Manual', 'Manual'):
            return """<!DOCTYPE html> <html lang="en"> <head> <title>Plex Autoscan</title> <meta charset="utf-8"> <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/css/bootstrap.min.css" rel="stylesheet"> </head> <body> <div class="container"> <div class="row justify-content-md-center"> <div class="col-md-auto text-center" style="padding-top: 10px;"> <h1 style="margin: 10px; margin-bottom: 150px;">Plex Autoscan</h1> <h3 class="text-left" style="margin: 10px;">Success</h3> <div class="alert alert-info" role="alert"> <code style="color: #000;">'{0}'</code> was added to scan queue. 
</div> </div> </div> </div> </body> </html>""".format(html_escape(final_path, quote=False))
        else:
            return """<!DOCTYPE html> <html lang="en"> <head> <title>Plex Autoscan</title> <meta charset="utf-8"> <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/css/bootstrap.min.css" rel="stylesheet"> </head> <body> <div class="container"> <div class="row justify-content-md-center"> <div class="col-md-auto text-center" style="padding-top: 10px;"> <h1 style="margin: 10px; margin-bottom: 150px;">Plex Autoscan</h1> <h3 class="text-left" style="margin: 10px;">Error</h3> <div class="alert alert-danger" role="alert"> <code style="color: #000;">'{0}'</code> has already been added to the scan queue. </div> </div> </div> </div> </body> </html>""".format(html_escape(data['filepath'], quote=False))

    elif 'series' in data and 'eventType' in data and data[
            'eventType'] == 'Rename' and 'path' in data['series']:
        # sonarr Rename webhook
        event = scan_event()
        logger.info("Client %r scan request for series: '%s', event: '%s'",
                    request.remote_addr, data['series']['path'], event)
        final_path = utils.map_pushed_path(conf.configs, data['series']['path'])
        start_scan(final_path, 'Sonarr', event)

    elif 'movie' in data and 'eventType' in data and data[
            'eventType'] == 'Rename' and 'folderPath' in data['movie']:
        # radarr Rename webhook
        event = scan_event()
        logger.info("Client %r scan request for movie: '%s', event: '%s'",
                    request.remote_addr, data['movie']['folderPath'], event)
        final_path = utils.map_pushed_path(conf.configs, data['movie']['folderPath'])
        start_scan(final_path, 'Radarr', event)

    elif 'movie' in data and 'movieFile' in data and 'folderPath' in data['movie'] and \
            'relativePath' in data['movieFile'] and 'eventType' in data:
        # radarr download/upgrade webhook
        path = os.path.join(data['movie']['folderPath'], data['movieFile']['relativePath'])
        event = scan_event()
        logger.info("Client %r scan request for movie: '%s', event: '%s'",
                    request.remote_addr, path, event)
        final_path = utils.map_pushed_path(conf.configs, path)
        start_scan(final_path, 'Radarr', event)

    elif 'series' in data and 'episodeFile' in data and 'eventType' in data:
        # sonarr download/upgrade webhook
        path = os.path.join(data['series']['path'], data['episodeFile']['relativePath'])
        event = scan_event()
        logger.info("Client %r scan request for series: '%s', event: '%s'",
                    request.remote_addr, path, event)
        final_path = utils.map_pushed_path(conf.configs, path)
        start_scan(final_path, 'Sonarr', event)

    elif 'artist' in data and 'trackFile' in data and 'eventType' in data:
        # lidarr download/upgrade webhook
        path = os.path.join(data['artist']['path'], data['trackFile']['relativePath'])
        event = scan_event()
        logger.info("Client %r scan request for album track: '%s', event: '%s'",
                    request.remote_addr, path, event)
        final_path = utils.map_pushed_path(conf.configs, path)
        start_scan(final_path, 'Lidarr', event)

    else:
        logger.error("Unknown scan request from: %r", request.remote_addr)
        abort(400)

    return "OK"
def client_pushed():
    """
    Handle a scan request pushed by a client.

    The payload is read from the JSON body when the request's content type is
    exactly ``application/json`` and from the form fields otherwise. It is
    then dispatched on its keys:

    * ``eventType`` / ``EventType`` equal to ``'Test'``: logged only.
    * ``eventType`` equal to ``'Manual'``: ``filepath`` is mapped with
      ``utils.map_pushed_path``, checked with ``utils.should_ignore`` and
      queued with ``start_scan``; the reply is a short status message.
    * Sonarr / Radarr rename webhooks and Sonarr / Radarr / Lidarr
      download/upgrade webhooks: the path is taken from the payload, mapped
      and queued.

    Returns:
        ``"OK"`` for test and webhook requests, or a status message for
        manual requests.

    Calls ``abort(400)`` when the payload is empty, matches none of the
    shapes above, or is a manual request without ``filepath``.
    """
    # Local import so this handler does not rely on the module's import list.
    try:
        from html import escape as html_escape
    except ImportError:  # Python 2 has no ``html`` module
        from cgi import escape as html_escape

    if request.content_type == 'application/json':
        data = request.get_json(silent=True)
    else:
        data = request.form.to_dict()

    if not data:
        logger.error("Invalid scan request from: %r", request.remote_addr)
        abort(400)
    logger.debug("Client %r request dump:\n%s", request.remote_addr,
                 json.dumps(data, indent=4, sort_keys=True))

    def scan_event():
        # A truthy 'isUpgrade' overrides whatever eventType the webhook sent.
        return "Upgrade" if ('isUpgrade' in data and data['isUpgrade']) else data['eventType']

    if ('eventType' in data and data['eventType'] == 'Test') or (
            'EventType' in data and data['EventType'] == 'Test'):
        logger.info("Client %r made a test request, event: '%s'", request.remote_addr, 'Test')

    elif 'eventType' in data and data['eventType'] == 'Manual':
        if 'filepath' not in data:
            # Reject instead of failing with a KeyError (HTTP 500) below.
            logger.error("Invalid scan request from: %r", request.remote_addr)
            abort(400)

        logger.info("Client %r made a manual scan request for: '%s'", request.remote_addr,
                    data['filepath'])
        final_path = utils.map_pushed_path(conf.configs, data['filepath'])

        # ignore this request?
        ignore, ignore_match = utils.should_ignore(final_path, conf.configs)
        if ignore:
            logger.info(
                "Ignored scan request for '%s' because '%s' was matched from SERVER_IGNORE_LIST",
                final_path, ignore_match)
            return "Ignoring scan request because %s was matched from your SERVER_IGNORE_LIST" % ignore_match

        # A returned str is sent as an HTML response by default, and the path
        # comes from the client, so markup characters are escaped before it
        # is echoed back.
        if start_scan(final_path, 'Manual', 'Manual'):
            return "'%s' was added to scan backlog." % html_escape(final_path, quote=False)
        else:
            return "Error adding '%s' to scan backlog." % html_escape(data['filepath'], quote=False)

    elif 'series' in data and 'eventType' in data and data[
            'eventType'] == 'Rename' and 'path' in data['series']:
        # sonarr Rename webhook
        event = scan_event()
        logger.info("Client %r scan request for series: '%s', event: '%s'",
                    request.remote_addr, data['series']['path'], event)
        final_path = utils.map_pushed_path(conf.configs, data['series']['path'])
        start_scan(final_path, 'Sonarr', event)

    elif 'movie' in data and 'eventType' in data and data[
            'eventType'] == 'Rename' and 'folderPath' in data['movie']:
        # radarr Rename webhook
        event = scan_event()
        logger.info("Client %r scan request for movie: '%s', event: '%s'",
                    request.remote_addr, data['movie']['folderPath'], event)
        final_path = utils.map_pushed_path(conf.configs, data['movie']['folderPath'])
        start_scan(final_path, 'Radarr', event)

    elif 'movie' in data and 'movieFile' in data and 'folderPath' in data['movie'] and \
            'relativePath' in data['movieFile'] and 'eventType' in data:
        # radarr download/upgrade webhook
        path = os.path.join(data['movie']['folderPath'], data['movieFile']['relativePath'])
        event = scan_event()
        logger.info("Client %r scan request for movie: '%s', event: '%s'",
                    request.remote_addr, path, event)
        final_path = utils.map_pushed_path(conf.configs, path)
        start_scan(final_path, 'Radarr', event)

    elif 'series' in data and 'episodeFile' in data and 'eventType' in data:
        # sonarr download/upgrade webhook
        path = os.path.join(data['series']['path'], data['episodeFile']['relativePath'])
        event = scan_event()
        logger.info("Client %r scan request for series: '%s', event: '%s'",
                    request.remote_addr, path, event)
        final_path = utils.map_pushed_path(conf.configs, path)
        start_scan(final_path, 'Sonarr', event)

    elif 'artist' in data and 'trackFile' in data and 'eventType' in data:
        # lidarr download/upgrade webhook
        path = os.path.join(data['artist']['path'], data['trackFile']['relativePath'])
        event = scan_event()
        logger.info("Client %r scan request for album track: '%s', event: '%s'",
                    request.remote_addr, path, event)
        final_path = utils.map_pushed_path(conf.configs, path)
        start_scan(final_path, 'Lidarr', event)

    else:
        logger.error("Unknown scan request from: %r", request.remote_addr)
        abort(400)

    return "OK"
def test_should_ignore():
    """Exercise ``utils.should_ignore`` with empty, single and multiple patterns."""
    target = "somefile.abc"
    exclusions = []

    # An empty exclusion list must not match.
    empty_result = utils.should_ignore(target, exclusions)
    assert not empty_result, "No match"

    # A single matching glob.
    single_result = utils.should_ignore(target, ["*.abc"])
    assert single_result, "Match simple"

    # Same empty list again, after a matching call has been made.
    repeat_result = utils.should_ignore(target, exclusions)
    assert not repeat_result

    # Only the second of two patterns matches.
    second_result = utils.should_ignore(target, ["**/*.py", "*.abc"])
    assert second_result, "Match second"
def get_all_third_party_responses_by_site(self, top_url, lazy=False):
    """Return a dictionary containing third party data loaded on given top_url.

    ``'http://'`` is prepended to ``top_url`` before it is used in the query
    and in the tracker checks.

    Args:
        top_url: Site address without the ``http://`` prefix.
        lazy: When true, the two ``utils.is_tracker`` blocklist checks are
            skipped and the ``is_tracker`` key is left out of each entry.

    Returns:
        A dict mapping each response URL to a dict with the keys
        ``url_domain``, ``is_js``, ``is_img``, ``organization_name`` and
        (unless ``lazy``) ``is_tracker``. URLs rejected by
        ``utils.should_ignore`` or whose domain equals the site's own domain
        are omitted. ``None`` is returned when ``utils.get_domain`` raises
        ``AttributeError`` for ``top_url``.
    """
    top_url = 'http://' + top_url
    tp_query = ("SELECT r.url, h.value FROM http_responses_view AS r "
                "LEFT JOIN http_response_headers_view as h ON h.response_id = r.id "
                " WHERE r.top_url LIKE %s AND "
                "url not LIKE %s and h.name = 'Content-Type'")

    try:
        top_ps = utils.get_domain(top_url)
    except AttributeError:
        print("Error while finding public suffix of %s" % top_url)
        return None

    query_args = (top_url, top_ps)
    cur = self.connection.cursor()
    cur.itersize = 100000
    try:
        cur.execute(tp_query, query_args)
    except Exception:
        # Reconnect and retry once; a second failure propagates. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # through instead of triggering a reconnect.
        self._reconnect()
        cur = self.connection.cursor()
        cur.itersize = 100000
        cur.execute(tp_query, query_args)

    # An empty result set is not treated as an error here.
    response_data = {}
    try:
        for url, content_type in cur:
            if utils.should_ignore(url):
                continue

            url_ps = utils.get_domain(url)
            if url_ps == top_ps:
                # Same domain as the visited site: not third party.
                continue

            url_data = {'url_domain': url_ps}
            is_js = utils.is_js(url, content_type)
            is_img = utils.is_img(url, content_type)

            if not lazy:
                is_el_tracker = utils.is_tracker(url, is_js=is_js, is_img=is_img,
                                                 first_party=top_url, blocklist='easylist')
                is_ep_tracker = utils.is_tracker(url, is_js=is_js, is_img=is_img,
                                                 first_party=top_url, blocklist='easyprivacy')
                url_data['is_tracker'] = is_el_tracker or is_ep_tracker

            organization = utils.get_org(url)
            url_data['is_js'] = is_js
            url_data['is_img'] = is_img
            url_data['organization_name'] = organization

            response_data[url] = url_data
    finally:
        # Release the cursor even when a row fails to process.
        cur.close()

    return response_data