class TaskManager:

    def __init__(self):
        self.search = ElasticSearchEngine("od-database")
        self.db = database.Database("db.sqlite3")

    def complete_task(self, file_list, task, task_result, crawler_name):

        # Replace any previously indexed documents for this website
        self.search.delete_docs(task_result.website_id)

        if file_list:
            def iter_lines():
                with open(file_list, "r") as f:
                    line = f.readline()
                    while line:
                        yield line
                        line = f.readline()

            self.search.import_json(iter_lines(), task.website_id)

        self.db.update_website_date_if_exists(task.website_id)

        task_result.server_id = crawler_name
        self.db.log_result(task_result)

    def queue_task(self, task: Task):
        self.db.put_task(task)
        print("Queued task and made it available to crawlers: " + str(task.website_id))

    def get_queued_tasks(self) -> list:
        return self.db.get_tasks()
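
# Usage sketch (not part of the project): how the manager above might be driven.
# The Task/TaskResult constructor signatures and the file-list path are assumed
# for illustration only.
#
#     task_manager = TaskManager()
#
#     task = Task(website_id=1, url="http://example.com/")   # assumed constructor
#     task_manager.queue_task(task)                           # persisted for crawlers
#
#     # Later, when a crawler reports back with an NDJSON file list on disk:
#     result = TaskResult()                                   # assumed constructor
#     result.website_id = 1
#     task_manager.complete_task("file_list.json", task, result, crawler_name="crawler-1")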
def index_file_list(path: str, website_id):

    # Note: reads the whole NDJSON file list into memory before indexing it.
    es = ElasticSearchEngine("od-database")
    with open(path, "r") as f:
        es.import_json(f.read(), website_id)
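
# For very large file lists the whole-file read above can be avoided. A minimal
# streaming variant (a sketch, not the project's implementation), assuming
# import_json also accepts an iterable of JSON lines, as the generator used in
# TaskManager.complete_task suggests:

def index_file_list_streaming(path: str, website_id):
    # Hypothetical helper: feed the NDJSON file to the indexer line by line.
    es = ElasticSearchEngine("od-database")

    def iter_lines():
        with open(path, "r") as f:
            for line in f:
                yield line

    es.import_json(iter_lines(), website_id)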
from apscheduler.schedulers.background import BackgroundScheduler

# ElasticSearchEngine, CrawlServer and Task come from the project's own modules.


class TaskDispatcher:

    def __init__(self):
        # Poll crawl servers for completed tasks every 10 seconds
        scheduler = BackgroundScheduler()
        scheduler.add_job(self.check_completed_tasks, "interval", seconds=10)
        scheduler.start()

        self.search = ElasticSearchEngine("od-database")

        # TODO load from config
        self.crawl_servers = [
            CrawlServer("http://localhost:5001", "OVH_VPS_SSD2 #1"),
        ]

    def check_completed_tasks(self):
        for server in self.crawl_servers:
            for task in server.fetch_completed_tasks():
                print("Completed task")
                file_list = server.fetch_website_files(task.website_id)
                if file_list:
                    self.search.import_json(file_list, task.website_id)

    def dispatch_task(self, task: Task):
        self._get_available_crawl_server().queue_task(task)

    def _get_available_crawl_server(self) -> CrawlServer:
        # TODO: Load balancing & health check for crawl servers
        return self.crawl_servers[0]

    def get_queued_tasks(self) -> list:
        queued_tasks = []
        for server in self.crawl_servers:
            queued_tasks.extend(server.fetch_queued_tasks())
        return queued_tasks

    def get_current_tasks(self) -> list:
        # TODO mem cache this
        current_tasks = []
        for server in self.crawl_servers:
            current_tasks.extend(server.fetch_current_tasks())
        return current_tasks

    def get_task_logs_by_server(self) -> dict:
        task_logs = dict()
        for server in self.crawl_servers:
            task_logs[server.name] = server.fetch_crawl_logs()
        return task_logs

    def get_stats_by_server(self) -> dict:
        stats = dict()
        for server in self.crawl_servers:
            server_stats = server.fetch_stats()
            if server_stats:
                stats[server.name] = server_stats
        return stats
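
# One way the load-balancing TODO in _get_available_crawl_server could be filled
# in: a simple round-robin over servers that answer their stats endpoint. This is
# a hypothetical sketch, not the project's implementation; it assumes
# fetch_stats() returns a falsy value when a server is unreachable.

import itertools


class RoundRobinDispatcher(TaskDispatcher):

    def __init__(self):
        super().__init__()
        self._server_cycle = itertools.cycle(self.crawl_servers)

    def _get_available_crawl_server(self) -> CrawlServer:
        # Try each server at most once per call, skipping unresponsive ones.
        for _ in range(len(self.crawl_servers)):
            server = next(self._server_cycle)
            if server.fetch_stats():
                return server
        # Fall back to the first server if none responded.
        return self.crawl_servers[0]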
import json
import time
from unittest import TestCase

# ElasticSearchEngine is a project-internal import (the search engine wrapper under test).


class SearchTest(TestCase):

    def setUp(self):
        self.search = ElasticSearchEngine("od-database-test")
        self.search.reset()
        time.sleep(0.5)

    def test_ping(self):
        self.assertTrue(self.search.ping(), "Search engine not running")

    def test_import_and_search(self):

        files = [
            {"name": "PaNopTicon", "size": 1000000000000000000, "path": "c/d", "mtime": 1528765672},
            {"name": "BLAckwAter.Park", "size": 123, "path": "", "mtime": None},
            {"name": "10'000 days", "size": -1, "path": "c", "mtime": 12345},
            {"name": "Dead Racer", "size": 1000, "path": "Speed Machine [FLAC]", "mtime": 12345}
        ]

        in_str = ""
        for file in files:
            in_str += json.dumps(file) + "\n"

        self.search.import_json(in_str, 123)
        time.sleep(2)
        self.assertEqual(4, self.search.es.count(self.search.index_name, "file")["count"])

        # Search for 'pan' in PaNopTicon and expect 1 result, a scroll id, and a highlight
        page = self.search.search("pan")
        self.assertIsNotNone(page["_scroll_id"])
        self.assertEqual(1, page["hits"]["total"])
        self.assertIsNotNone(page["hits"]["hits"][0]["highlight"]["name"])

        # Search for 'park' and expect BLAckwAter.Park
        page = self.search.search("park")
        self.assertEqual(1, page["hits"]["total"])

        # Search for 'fla' and expect Dead Racer
        page = self.search.search("fla")
        self.assertEqual(1, page["hits"]["total"])

        # Search for 10'000 and expect 10'000 days
        page = self.search.search("10'000")
        self.assertEqual(1, page["hits"]["total"])

    def test_scroll(self):

        files = [
            {"name": "PaNopTicon", "size": 1000000000000000000, "path": "c/d", "mtime": 1528765672},
            {"name": "BLAckwAter.Park", "size": 123, "path": "", "mtime": None},
            {"name": "10'000 days", "size": -1, "path": "c", "mtime": 12345},
            {"name": "Dead Racer", "size": 1000, "path": "Speed Machine [FLAC]", "mtime": 12345}
        ]

        in_str = ""
        for file in files:
            in_str += json.dumps(file) + "\n"

        self.search.import_json(in_str, 123)
        time.sleep(2)

        page = self.search.search("")
        scroll_id = page["_scroll_id"]

        # next page
        next_page = self.search.scroll(scroll_id)
        next_scroll_id = next_page["_scroll_id"]
        self.assertIsNotNone(next_scroll_id)

        # again
        next_page2 = self.search.scroll(next_scroll_id)
        self.assertIsNotNone(next_page2["_scroll_id"])

    def test_invalid_scroll(self):

        invalid_scroll = "blahblah"
        self.assertIsNone(self.search.scroll(invalid_scroll))
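
# The scroll test above only checks that consecutive scroll ids are returned. A
# small helper like this could walk every page of a result set; it is a sketch
# that assumes search()/scroll() return raw Elasticsearch response dicts, as the
# assertions in SearchTest imply, and that scroll() returns None on an invalid
# or expired scroll id (see test_invalid_scroll).

def iter_all_hits(search_engine, query=""):
    # Hypothetical helper: exhaust a scroll cursor page by page.
    page = search_engine.search(query)
    while page and page["hits"]["hits"]:
        for hit in page["hits"]["hits"]:
            yield hit
        page = search_engine.scroll(page["_scroll_id"])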
import os
import time
from multiprocessing.pool import ThreadPool
from threading import Thread
from uuid import uuid4

# config, database, format_file_name, ElasticSearchEngine, Task, Website,
# TaskTrackerApi, Worker, WsBucketApi and the module-level `logger` are
# project-internal or third-party names assumed to be imported elsewhere
# in this module.


class TaskManager:

    def __init__(self):
        self.search = ElasticSearchEngine("od-database")
        self.db = database.Database(config.DB_CONN_STR)
        self.tracker = TaskTrackerApi(config.TT_API)

        # Reuse the persisted task_tracker worker if one exists; otherwise
        # register a new one and request access to the crawl & index projects.
        self.worker = Worker.from_file(self.tracker)
        if not self.worker:
            self.worker = self.tracker.make_worker("oddb_master")
            self.worker.dump_to_file()
            self.worker.request_access(config.TT_CRAWL_PROJECT, False, True)
            self.worker.request_access(config.TT_INDEX_PROJECT, True, False)

        self.bucket = WsBucketApi(config.WSB_API, config.WSB_SECRET)
        self._indexer_threads = list()

    def start_indexer_threads(self):
        logger.info("Starting %s indexer threads" % (config.INDEXER_THREADS,))
        for _ in range(config.INDEXER_THREADS):
            t = Thread(target=self._do_indexing)
            t.daemon = True
            self._indexer_threads.append(t)
            t.start()

    def _do_indexing(self):

        while True:
            task = self.worker.fetch_task(project_id=config.TT_INDEX_PROJECT)

            if task:
                try:
                    recipe = task.json_recipe()
                    logger.debug("Got indexing task: " + str(recipe))
                    filename = os.path.join(config.WSB_PATH,
                                            format_file_name(recipe["website_id"], recipe["upload_token"]))
                    self._complete_task(filename, Task(recipe["website_id"], recipe["url"]))
                except Exception as e:
                    logger.error("Indexing task failed: %s" % e)
                    self.worker.release_task(task_id=task.id, result=1, verification=0)
                else:
                    # Report success only if indexing did not raise
                    try:
                        self.worker.release_task(task_id=task.id, result=0, verification=0)
                    except:
                        pass
            else:
                time.sleep(5)

    def _complete_task(self, file_list, task):

        self.search.delete_docs(task.website_id)

        if file_list:
            def iter_lines():
                with open(file_list, "r") as f:
                    line = f.readline()
                    while line:
                        yield line
                        line = f.readline()

            self.search.import_json(iter_lines(), task.website_id)
            os.remove(file_list)

        self.db.update_website_date_if_exists(task.website_id)

    def do_recrawl(self):
        logger.debug("Creating re-crawl tasks")
        self._generate_crawling_tasks()

    def _generate_crawling_tasks(self):

        # TODO: Insert more in-depth re-crawl logic here
        websites_to_crawl = self.db.get_oldest_updated_websites(config.RECRAWL_POOL_SIZE, prefix="http")

        def recrawl(website: Website):
            # Older websites get a higher priority (hours since last update)
            crawl_task = Task(website.id, website.url,
                              priority=int((time.time() - website.last_modified.timestamp()) / 3600))
            self.queue_task(crawl_task)

        pool = ThreadPool(processes=30)
        pool.map(func=recrawl, iterable=websites_to_crawl)
        pool.close()

    def queue_task(self, task: Task):
        max_assign_time = 24 * 4 * 3600
        upload_token = str(uuid4())

        task.upload_token = upload_token

        tracker_response = self.worker.submit_task(config.TT_CRAWL_PROJECT,
                                                   recipe=str(task),
                                                   priority=task.priority,
                                                   max_assign_time=max_assign_time,
                                                   hash64=task.website_id,
                                                   verification_count=1,
                                                   max_retries=3)
        logger.info("Queued task and made it available to crawlers: t=%s, r=%s" % (task, tracker_response.text))

        if not tracker_response.json()["ok"]:
            return

        bucket_response = self.bucket.allocate(upload_token,
                                               21474837499,  # ~20 GiB
                                               format_file_name(task.website_id, upload_token),
                                               to_dispose_date=int(time.time() + max_assign_time),
                                               upload_hook="")
        logger.info("Allocated upload bucket: %d, t=%s, r=%s" %
                    (task.website_id, upload_token, bucket_response.text))
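
# Startup wiring sketch for the tracker-backed manager above (illustrative only;
# the 1-hour re-crawl interval is an assumption). It reuses APScheduler, which
# TaskDispatcher already uses for its own periodic job.
#
#     from apscheduler.schedulers.background import BackgroundScheduler
#
#     task_manager = TaskManager()
#     task_manager.start_indexer_threads()   # background indexing of uploaded file lists
#
#     scheduler = BackgroundScheduler()
#     scheduler.add_job(task_manager.do_recrawl, "interval", hours=1)
#     scheduler.start()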