def submit_task_withdep(driver_handle, task_object_dependencies=None):
    '''Submit a single task that depends on the given list of object IDs.'''
    if task_object_dependencies is None:
        task_object_dependencies = []
    task = ray.local_scheduler.Task(
        ray.local_scheduler.ObjectID(random_string()),
        ray.local_scheduler.ObjectID(random_string()),
        task_object_dependencies,
        1,  # num_returns
        ray.local_scheduler.ObjectID(random_string()),
        0)
    logger.debug("[DRIVER]: submitting task %s", task.task_id())
    driver_handle.node_manager_client.submit(task)
    logger.debug("[DRIVER]: task return values %s", task.returns())
    return task.returns()
def call_scraper(params, ms_id):
    """Queue an asynchronous crawl task for a single store.

    Params:
    -------
    params: dict
        Store params to crawl
    ms_id: uuid
        Master scraper ID
    """
    logger.debug("Calling to scrape store: {} - {}".format(
        params.get('name'), params.get('external_id')))
    crawl_store.apply_async(args=(ms_id, params), queue=CELERY_QUEUE)
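
# For reference, a minimal sketch of how the `crawl_store` task invoked above
# might be declared on the worker side. The Celery app name, the broker URL,
# and the task body are assumptions for illustration only; only the call
# signature (ms_id, params) and the queue routing are taken from this file.
#
#     from celery import Celery
#
#     app = Celery('scrapers', broker='redis://localhost:6379/0')  # assumed broker
#
#     @app.task(name='crawl_store')
#     def crawl_store(ms_id, params):
#         # Crawl the store described by `params`, reporting progress under
#         # the master scraper run identified by `ms_id`.
#         ...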
def submit_tasks_nodep(driver_handle, num_tasks):
    '''Submit num_tasks independent tasks with no object dependencies.'''
    for i in range(num_tasks):
        task = ray.local_scheduler.Task(
            ray.local_scheduler.ObjectID(random_string()),
            ray.local_scheduler.ObjectID(random_string()),
            [],
            1,  # num_returns
            ray.local_scheduler.ObjectID(random_string()),
            0)
        logger.debug("[DRIVER]: submitting task %s", task.task_id())
        driver_handle.node_manager_client.submit(task)
        logger.debug("[DRIVER]: task return values %s", task.returns())
def submit_task_chains(num_chains, tasks_per_chain):
    '''Submit num_chains chains of tasks, where each task depends on the
    previous task's return value. Returns a task placement map keyed by
    chain number.'''
    chain_returns = []
    task_placement_map_ = {}
    for chain_num in range(num_chains):
        last_task_returns = []
        task_placement_map_[chain_num] = []
        for i in range(tasks_per_chain):
            task_returns = submit_task_withdep(
                driver, task_object_dependencies=last_task_returns)
            last_task_returns = task_returns
            task_placement_map_[chain_num].append(task_returns[0])
        chain_returns.append(last_task_returns)
    logger.debug("chain_returns=%s", chain_returns)
    chain_results = driver.get([r[0] for r in chain_returns],
                               timeout_ms=5000)
    print("[DRIVER]: chain return values: ", chain_results)
    return task_placement_map_
def TEST_run_task_chains(num_chains, tasks_per_chain):
    task_placement_map = submit_task_chains(num_chains=num_chains,
                                            tasks_per_chain=tasks_per_chain)
    logger.debug("[DRIVER]: task placement information, per chain:")
    task_placement_total = []
    for chain_num in range(len(task_placement_map)):
        task_placement_list = driver.get(task_placement_map[chain_num],
                                         timeout_ms=5000)
        task_placement_total += [t[1] for t in task_placement_list]
        logger.debug("%s %s", chain_num, task_placement_list)
    logger.debug("task placement overall: %s", task_placement_total)
    # Count how many tasks were placed on each node.
    task_placement_stats = [(v, task_placement_total.count(v))
                            for v in set(task_placement_total)]
    num_total_tasks = sum([t[1] for t in task_placement_stats])
    print("total tasks executed = ", num_total_tasks)
    assert num_total_tasks == num_chains * tasks_per_chain
    print("task placement breakdown: total=", task_placement_stats)
def TEST_run_tasks_nodep(num_tasks):
    # This test is the same as having num_tasks chains with 1 task per chain.
    # In this test we assume the num_tasks x 1 chain structure.
    task_placement_map = submit_task_chains(num_chains=num_tasks,
                                            tasks_per_chain=1)
    logger.debug("[DRIVER]: task placement information, per chain:")
    task_placement_total = []
    for chain_num in range(len(task_placement_map)):
        task_placement_list = driver.get(task_placement_map[chain_num],
                                         timeout_ms=5000)
        task_placement_total += [t[1] for t in task_placement_list]
        logger.debug("%s %s", chain_num, task_placement_list)
    logger.debug("task placement overall: %s", task_placement_total)
    task_placement_stats = [(v, task_placement_total.count(v))
                            for v in set(task_placement_total)]
    num_total_tasks = sum([t[1] for t in task_placement_stats])
    print("total tasks executed = ", num_total_tasks)
    assert num_total_tasks == num_tasks
    print("task placement breakdown: total=", task_placement_stats)
parser = argparse.ArgumentParser()
parser.add_argument("raylet_socket_name")
parser.add_argument("object_store_socket_name")

if __name__ == '__main__':
    args = parser.parse_args()
    driver = Worker(args.raylet_socket_name, args.object_store_socket_name,
                    is_worker=False)

    # Submit a first task with no dependencies.
    task = ray.local_scheduler.Task(
        ray.local_scheduler.ObjectID(random_string()),
        ray.local_scheduler.ObjectID(random_string()),
        [],
        1,
        ray.local_scheduler.ObjectID(random_string()),
        0)
    logger.debug("submitting %s", task.task_id())
    driver.node_manager_client.submit(task)
    logger.debug("Return values were %s", task.returns())

    # Submit a second task that depends on the first task's return value.
    task2 = ray.local_scheduler.Task(
        ray.local_scheduler.ObjectID(random_string()),
        ray.local_scheduler.ObjectID(random_string()),
        task.returns(),
        1,
        ray.local_scheduler.ObjectID(random_string()),
        0)
    logger.debug("Submitting dependent task 2 %s", task2.task_id())
    driver.node_manager_client.submit(task2)

    # Make sure the tasks get executed and we can get the result of the last
    # task.
    obj = driver.get(task2.returns(), timeout_ms=1000)
import argparse

import ray
from worker import Worker, logger
from ray.utils import random_string

parser = argparse.ArgumentParser()
parser.add_argument("raylet_socket_name")
parser.add_argument("object_store_socket_name")

if __name__ == '__main__':
    args = parser.parse_args()
    driver = Worker(args.raylet_socket_name, args.object_store_socket_name,
                    is_worker=False)

    task1 = ray.local_scheduler.Task(
        ray.local_scheduler.ObjectID(random_string()),
        ray.local_scheduler.ObjectID(random_string()),
        [],
        1,
        ray.local_scheduler.ObjectID(random_string()),
        0)
    logger.debug("submitting %s", task1.task_id())
    driver.node_manager_client.submit(task1)
    logger.debug("Return values were %s", task1.returns())
    print("[DRIVER] Return values were", task1.returns())

    # Make sure the task gets executed and we can get the result of the
    # last task.
    obj = driver.get(task1.returns(), timeout_ms=1000)
    print("[DRIVER]: task1 driver.get result ", obj)
        } for st in stores_d[:int(STORES)]]
        all_st.extend(stores_list)
        logger.info('Got {} stores for {}'.format(len(stores_list), retailer))
    return all_st


# Main method
if __name__ == '__main__':
    logger.info("Started master scraper: " + CELERY_QUEUE
                + " / scraper_type: " + str(SCRAPER_TYPE))
    if SCRAPER_TYPE and len(SCRAPER_TYPE) > 0:
        if SCRAPER_TYPE == 'price' or SCRAPER_TYPE == 'item':
            # Fetch valid stores
            sts_to_crawl = request_valid_stores(retailers_to_get,
                                                str(SCRAPER_TYPE))
            logger.debug(sts_to_crawl[0])
            # Indices of the stores to crawl
            num_stores = range(0, len(sts_to_crawl))
            ms_id = stream_monitor('master', params=sts_to_crawl[0],
                                   num_stores=len(sts_to_crawl))
            logger.info("Crawling {} stores!".format(len(sts_to_crawl)))
            # Call to crawl all stores asynchronously
            for s in num_stores:
                logger.debug("Calling to scrape")
                call_scraper(sts_to_crawl[s], ms_id)
                # call_parallel(sts_to_crawl[s], ms_id)
        elif SCRAPER_TYPE == 'store':
            logger.debug("CALLING STORES")
            ms_id = stream_monitor('master', params={})
            st_id = 1