def run_flask():
    CrawlGlobal.context().logger.info("start flask with host: %s", CRAWLER_HOSTNAME)
    app.run(debug=DEBUG_MODE, host=CRAWLER_HOSTNAME, port=PORT, use_reloader=False)

def run(self, text):
    CrawlGlobal.context().logger.info("start running the model")
    try:
        pred = self.make_prediction(text)
        return pred
    except Exception as e:
        CrawlGlobal.context().logger.info(
            "exception thrown while running model: %s", str(e))

def construct_model(self):
    try:
        CrawlGlobal.context().logger.info("deserializing the model")
        with open("vectorizer.pickle", "rb") as vec_file:
            self.new_vectorizer = pickle.load(vec_file)
        with open(self.model_file, "rb") as model_file:
            self.model = pickle.load(model_file)
    except Exception as e:
        CrawlGlobal.context().logger.info("error in deserializing: %s", str(e))

def test_store_in_redis(self):
    CrawlGlobal.context().cache.rediscache = fakeredis.FakeStrictRedis()
    links = ["x", "y", "z"]
    storageuri = "abc"
    self.scraper.store_in_redis(storageuri, links)
    val = CrawlGlobal.context().cache.get(self.url)
    self.assertEqual(val, {"storage_uri": "abc", "child_urls": ["x", "y", "z"]})

def crawl():
    url = flask.request.json['url']
    CrawlGlobal.context().logger.info('Crawling %s', url)
    if CrawlGlobal.context().active_thread_count.get() >= MAX_ACTIVE_THREADS:
        return flask.jsonify({'accepted': False})
    CrawlGlobal.context().active_thread_count.increment()
    crawljob = CrawlerJob(url)
    executor.submit(crawljob.execute, CRAWLER_MANAGER_ENDPOINT)
    return flask.jsonify({'accepted': True})

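# Illustrative only: a minimal sketch of how a client (e.g. the crawler
# manager) might submit work to the crawl() handler above, assuming it is
# registered as a POST route at '/crawl'. The route path and the
# crawler_endpoint argument are assumptions for the example, not taken from
# this module. The handler replies {'accepted': False} once MAX_ACTIVE_THREADS
# jobs are already in flight.
def _example_submit_crawl(crawler_endpoint, url):
    resp = requests.post(os.path.join(crawler_endpoint, 'crawl'),
                         json={'url': url})
    resp.raise_for_status()
    return resp.json()['accepted']
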
def start_scrape(self):
    url = self.base_url
    CrawlGlobal.context().logger.info('start scraping')
    key = 'crawl_pages/{}'.format(str(uuid.uuid4()))
    CrawlGlobal.context().logger.info('Generated key: %s', key)
    file_ext = self.get_extension(url)

    # scraper object is decided (FileScraper, WebScraper, BaseScraper)
    if file_ext:
        scraper = FileScraper(url, key, file_ext)
    elif not CrawlGlobal.context().is_dynamic_scrape():
        scraper = BaseScraper(url, key)
    else:
        scraper = WebScraper(url, key)

    # scrape the page
    data = scraper.do_scrape()

    # store
    if self.do_store(file_ext, data):
        CrawlGlobal.context().logger.info(
            "need to store the data for url: %s", self.base_url)
        self.storage_uri = scraper.store_in_gcs(data)
    else:
        CrawlGlobal.context().logger.info(
            "not storing the data for url: %s", self.base_url)

    # get child urls
    self.links = scraper.get_links(data)

    # put in cache
    scraper.store_in_redis(self.storage_uri, self.links)

def do_scrape(self):
    CrawlGlobal.context().logger.info("Using Base Scraper")
    CrawlGlobal.context().logger.info("Scraping URL: {}".format(self.base_url))
    try:
        response = requests.get(self.base_url)
        response.raise_for_status()
        return response.content
    except Exception as e:
        CrawlGlobal.context().logger.info("error in scraping: %s", str(e))
        return None

def setup():
    try:
        CrawlGlobal.context().logger.info("crawler end point: %s", ENDPOINT)
        CrawlGlobal.context().logger.info("crawler manager end point: %s",
                                          CRAWLER_MANAGER_ENDPOINT)
        res = requests.post(
            os.path.join(CRAWLER_MANAGER_ENDPOINT, 'register_crawler'),
            json={'endpoint': ENDPOINT})
        res.raise_for_status()
        CrawlGlobal.context().logger.info(
            "Registered successfully with crawler manager")
        CrawlGlobal.context().set_useroptions(res.json())
    except Exception as e:
        CrawlGlobal.context().logger.info(
            'Unable to register with crawler manager: %s', str(e))

def store_in_redis(self, storageuri, links):
    try:
        CrawlGlobal.context().logger.info('Caching storage_uri and child_urls')
        CrawlGlobal.context().cache.put(self.base_url, {
            'storage_uri': storageuri,
            'child_urls': links
        })
        CrawlGlobal.context().logger.info('Caching was successful')
    except Exception as e:
        CrawlGlobal.context().logger.error(
            'Unable to cache data for %s: %s', self.base_url, str(e))

def store_in_gcs(self, fpath):
    CrawlGlobal.context().logger.info('Attempting to store in GCS')
    if not fpath:
        CrawlGlobal.context().logger.info('no file to store in GCS')
        return ''
    try:
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(os.environ['GCS_BUCKET'])
        blob = bucket.blob(self.file_name + self.file_ext)
        CrawlGlobal.context().logger.info('Got the blob in GCS')
        blob.upload_from_filename(fpath)
        blob.make_public()
        uri = blob.public_url
        CrawlGlobal.context().logger.info('uri successfully generated!')
        return uri
    except Exception as e:
        CrawlGlobal.context().logger.error(
            'Unable to store webpage for %s: %s', self.base_url, str(e))
        return ''

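# Note (not from the original code): store_in_gcs reads the bucket name from
# the GCS_BUCKET environment variable and creates storage.Client() with
# default Google Cloud credentials, so a local run would need roughly:
#   export GCS_BUCKET=<your-bucket-name>
#   export GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json
# The bucket name and key path above are placeholders.
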
def execute(self, endpoint):
    CrawlGlobal.context().logger.info('Starting crawl thread for %s',
                                      self.base_url)
    try:
        if not self.is_cached() or CrawlGlobal.context().has_model():
            self.start_scrape()
        else:
            CrawlGlobal.context().logger.info('Url %s already cached',
                                              self.base_url)
        # callback manager
        self.send_response_to_manager(endpoint)
    except Exception as e:
        CrawlGlobal.context().logger.info("exception: {}".format(str(e)))
    CrawlGlobal.context().active_thread_count.decrement()

def do_scrape(self):
    if not self.is_valid():
        return super(WebScraper, self).do_scrape()
    CrawlGlobal.context().logger.info("Using Web Scraper")
    try:
        CrawlGlobal.context().logger.info("Scraping URL: {}".format(self.base_url))
        return CrawlGlobal.context().get_data(self.base_url)
    except Exception as e:
        CrawlGlobal.context().logger.info("error in scraping: {}".format(str(e)))
        return None

def test_connections():
    try:
        CrawlGlobal.context().cache.ping()
        CrawlGlobal.context().logger.info('connected to redis successfully')
    except Exception as e:
        CrawlGlobal.context().logger.info('could not initialize redis: %s', str(e))

def do_store(self, ext, data):
    docs_all = CrawlGlobal.context().scrape_all
    docs_pdf = CrawlGlobal.context().scrape_pdf
    docs_docx = CrawlGlobal.context().scrape_docx

    if ext or not CrawlGlobal.context().has_model():
        # File download, or no model configured: store when the user asked for
        # all documents, for this specific file type, or stated no preference.
        if (docs_all
                or (ext == '.pdf' and docs_pdf)
                or (ext == '.docx' and docs_docx)
                or (not docs_pdf and not docs_docx and not docs_all)):
            return True
        CrawlGlobal.context().logger.info('Non model: No matching doc type')
        return False
    else:
        # A model is configured and this is a regular page: let the model decide.
        if docs_all or (not docs_pdf and not docs_docx and not docs_all):
            cur_pred = CrawlGlobal.context().modelrunner.run(data)
            return cur_pred == -1 or CrawlGlobal.context().has_label(cur_pred)
        CrawlGlobal.context().logger.info('Model: No matching doc type')
        return False

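# Illustrative only: a few concrete outcomes of do_store above, assuming the
# scrape_all / scrape_pdf / scrape_docx options are plain booleans taken from
# the user options JSON:
#   ext='.pdf',  scrape_pdf=True,  no model            -> True  (requested file type)
#   ext=None,    scrape_all=True,  no model            -> True  (store every page)
#   ext='.docx', scrape_pdf=True,  others False, no model -> False (type not requested)
#   ext=None,    model loaded, scrape_all=True         -> modelrunner.run(data) decides
#                                                          (store if label matches or
#                                                           prediction failed with -1)
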
def ping_crawler_manager():
    try:
        CrawlGlobal.context().logger.info(
            'pinging CRAWLER_MANAGER_ENDPOINT - %s', CRAWLER_MANAGER_ENDPOINT)
        response = requests.get(CRAWLER_MANAGER_ENDPOINT)
        response.raise_for_status()
        CrawlGlobal.context().logger.info('ping successful!')
    except Exception as e:
        CrawlGlobal.context().logger.error(
            "Could not connect to crawler manager: %s", str(e))

def make_prediction(self, text):
    try:
        CrawlGlobal.context().logger.info("transforming and predicting")
        vectors = self.new_vectorizer.transform([text])
        prediction = self.model.predict(vectors)
        CrawlGlobal.context().logger.info("model predicted: %d", prediction[0])
        return prediction[0]
    except Exception as e:
        CrawlGlobal.context().logger.info("error in predicting: %s", str(e))
        return -1

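# Illustrative only: the rest of the crawler reaches this code through
# CrawlGlobal.context().modelrunner.run(data) (see do_store). The sketch below
# shows the implied call order; the ModelRunner class name and the
# "model.pickle" filename are assumptions for the example, not taken from this
# module.
def _example_model_prediction(page_text):
    runner = ModelRunner("model.pickle")  # hypothetical constructor
    runner.construct_model()              # loads vectorizer.pickle and the model file
    return runner.run(page_text)          # predicted label, or -1 on prediction error
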
def kill():
    if ENVIRONMENT == 'local':
        CrawlGlobal.context().logger.info(
            'Not killing crawler because running locally')
    else:
        CrawlGlobal.context().logger.info("Will kill flask server in 3 seconds")
        kill_thread = threading.Thread(target=kill_main_thread)
        kill_thread.start()
        CrawlGlobal.context().logger.info('Kill called')
    return "ok"

def send_response_to_manager(self, endpoint):
    links_api = os.path.join(endpoint, 'links')
    CrawlGlobal.context().logger.info('Endpoint on Crawler manager: %s', links_api)
    try:
        CrawlGlobal.context().logger.info(
            'Sending response back to crawler manager...')
        response = requests.post(links_api, json={
            'main_url': self.base_url,
            'storage_uri': self.storage_uri,
            'child_urls': self.links
        })
        response.raise_for_status()
        CrawlGlobal.context().logger.info('Response sent successfully!')
        return response
    except Exception as e:
        CrawlGlobal.context().logger.error(
            "Could not connect to crawler manager: %s", str(e))
        return None

def is_cached(self):
    CrawlGlobal.context().logger.info('connecting to redis')
    if CrawlGlobal.context().cache.exists(self.base_url):
        cache_val = CrawlGlobal.context().cache.get(self.base_url)
        if (cache_val is not None and 'storage_uri' in cache_val
                and 'child_urls' in cache_val):
            self.storage_uri = cache_val['storage_uri']
            self.links = cache_val['child_urls']
            if self.storage_uri is None:
                CrawlGlobal.context().logger.info(
                    'Error condition. storage_uri None for cached url: %s',
                    self.base_url)
                self.storage_uri = ''
            if self.links is None:
                CrawlGlobal.context().logger.info(
                    'Error condition. links None for cached url: %s',
                    self.base_url)
                self.links = []
            return True
        return False
    return False

def do_scrape(self):
    try:
        CrawlGlobal.context().logger.info("Scraping URL: {}".format(self.base_url))
        r = requests.get(self.base_url, stream=True)
        CrawlGlobal.context().logger.info("request status: %d", r.status_code)
        tmpfile = self.file_name
        if self.file_name.startswith('crawl_pages/'):
            tmpfile = self.file_name[len('crawl_pages/'):]
        fpath = '/tmp/' + tmpfile + self.file_ext
        CrawlGlobal.context().logger.info("file path is: %s", fpath)
        with open(fpath, 'wb') as fd:
            for chunk in r.iter_content(self.chunk_size):
                fd.write(chunk)
        return fpath
    except Exception as e:
        CrawlGlobal.context().logger.info("error in writing file: %s", str(e))
        return None

def store_in_gcs(self, data):
    if data is None:
        return ''
    CrawlGlobal.context().logger.info('Attempting to store in GCS')
    try:
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(os.environ['GCS_BUCKET'])
        blob = bucket.blob(self.file_name)
        blob.upload_from_string(data)
        blob.make_public()
        uri = blob.public_url
        CrawlGlobal.context().logger.info('uri successfully generated!')
        return uri
    except Exception as e:
        CrawlGlobal.context().logger.error(
            'Unable to store webpage for %s: %s', self.base_url, str(e))
        return ''

def get_links(self, data):
    if data is None:
        return []
    CrawlGlobal.context().logger.info('Parsing links...')
    try:
        bs_obj = BeautifulSoup(data, 'html.parser')
        # Use a dict keyed by href to de-duplicate links while keeping order.
        links_obj = {}
        for link in bs_obj.find_all('a'):
            if 'href' in link.attrs:
                links_obj[link.attrs['href']] = 1
        links = list(links_obj.keys())
        CrawlGlobal.context().logger.info('Found links in %s: %s',
                                          self.base_url, str(links))
        return links
    except Exception as e:
        CrawlGlobal.context().logger.error(
            "Could not list links in url: %s", str(e))
        return []

def is_valid(self):
    return bool(CrawlGlobal.context().get_driver())

def __init__(self, base_url, key):
    CrawlGlobal.context().logger.info("instantiating web scraper")
    BaseScraper.__init__(self, base_url, key)

def test_is_cached(self):
    CrawlGlobal.context().cache.rediscache = fakeredis.FakeStrictRedis()
    retval = self.crawljob.is_cached()
    self.assertIs(retval, False)

def __init__(self, base_url, key, ext):
    BaseScraper.__init__(self, base_url, key)
    CrawlGlobal.context().logger.info("instantiating file scraper")
    self.file_ext = ext
    self.chunk_size = 2000

def get_extension(self, url):
    CrawlGlobal.context().logger.info('url is: %s', self.base_url)
    # Return None when the URL has no recognized file extension so that
    # start_scrape falls back to a web/base scraper instead of failing.
    matches = [x for x in ALLOWABLE_EXTENSIONS if url.lower().endswith(x)]
    ext = matches[0] if matches else None
    CrawlGlobal.context().logger.info('extension is: %s', ext)
    return ext

def status():
    return flask.jsonify(
        {'active_threads': CrawlGlobal.context().active_thread_count.get()})

def kill_main_thread():
    time.sleep(3)
    CrawlGlobal.context().logger.info("Kill confirmed")
    os._exit(0)

if __name__ == "__main__":
    ping_crawler_manager()
    test_connections()
    CrawlGlobal.context().logger.info('ENVIRONMENT -- %s', ENVIRONMENT)
    CrawlGlobal.context().logger.info('starting flask app in separate thread')
    flask_thread = threading.Thread(target=run_flask)
    flask_thread.start()
    if ENVIRONMENT != 'local':
        setup()