def main():
    """Import sanitised report files into the database in parallel.

    Spawns one pool worker per CPU; a manager semaphore is acquired per
    submitted task (and, judging by the call signature, released by
    ReportInserter) so the number of in-flight imports stays bounded.
    """
    logfile = join(settings.sanitised_directory, "publish.log")
    fh = logging.FileHandler(logfile)
    log.addHandler(fh)

    manager = Manager()
    semaphore = manager.Semaphore(cpu_count())
    pool = Pool(processes=cpu_count())
    report_counter = 0

    # iterate over report files
    report_files = list_report_files(settings.sanitised_directory)
    while True:
        try:
            semaphore.acquire()
            # builtin next() instead of the Python-2-only .next() method;
            # works on any iterator in Python 2.6+ and Python 3.
            report_file = next(report_files)
            log.info("Importing %s" % report_file)
            pool.apply_async(ReportInserter, (report_file, semaphore))
            report_counter += 1
        except StopIteration:
            break

    log.info("Waiting for all the tasks to finish")
    pool.close()
    pool.join()
    log.info("Imported %d reports" % report_counter)
def main():
    """Build a network of nodes sharing managed resources and run them.

    Command line: <num_processes> <num_resources>.  Every node runs
    fire_node in its own pool worker; the main process blocks on the
    shared semaphore until all workers have signalled, then tears the
    pool down.
    """
    if len(sys.argv) < 3:
        print("Usage: %s <num_processes> <num_resources>" % sys.argv[0])
        return

    mgr = Manager()
    done_sema = mgr.Semaphore()
    proc_count = int(sys.argv[1])
    res_count = int(sys.argv[2])

    node_list = mgr.list([])
    shared_resources = mgr.list(
        [Resource(mgr, idx, node_list) for idx in range(res_count)])
    transmit_lock = mgr.Lock()
    node_list.extend([
        Node(idx, proc_count, mgr, shared_resources, done_sema,
             transmit_lock) for idx in range(proc_count)
    ])

    # the nodes stop after this many total requests are made
    request_cap = proc_count

    # one pool worker per node in the network
    workers = Pool(processes=len(node_list))
    workers.starmap_async(
        fire_node,
        zip(repeat(node_list), range(proc_count), repeat(request_cap)))

    # wait for num_process + 1 signals before terminating the pool
    for _ in range(proc_count + 1):
        done_sema.acquire()
    workers.terminate()
def synchronized(max_threads: int = 1):
    '''
    This decorator will allow only up to max_threads processes to run
    the decorated function simultaneously.

    :param max_threads: Maximum number of concurrent processes.
    '''
    m = Manager()
    s = m.Semaphore(max_threads)

    def locked(func):
        @wraps(func)
        def locked_func(*args, **kw_args):
            # try/finally guarantees the slot is released even when the
            # call raises.  The original only released inside an
            # `except Exception` handler, so a BaseException (e.g.
            # KeyboardInterrupt) leaked the semaphore; it also collected
            # the exception in a list and re-raised it manually, which
            # this plain re-propagation replaces.
            s.acquire()
            try:
                return func(*args, **kw_args)
            finally:
                s.release()

        return locked_func

    return locked
def convert_data(image_label_list, label_id_map, out_path, threads=8,
                 shards=1024):
    """Convert the sample list into shard files using worker processes.

    Splits the sample list into `threads` contiguous index ranges and
    starts one Process per range running convert_data_batch; a managed
    dict carries the shared 'sample_count' and a managed semaphore is
    handed to the workers.

    :param image_label_list: sequence of samples to convert
    :param label_id_map: mapping from labels to ids (passed through)
    :param out_path: output directory, created if missing
    :param threads: number of worker processes
    :param shards: shard count passed through to the workers
    :return: total number of samples written (d['sample_count'])
    """
    # create output path
    if not os.path.exists(out_path):
        os.makedirs(out_path)

    # split the sample list into `threads` contiguous index ranges.
    # np.int was removed in NumPy 1.24; the builtin int is the
    # supported spelling for the same conversion.
    spacing = np.linspace(0, len(image_label_list), threads + 1).astype(int)
    ranges = [[spacing[i], spacing[i + 1]] for i in range(len(spacing) - 1)]

    process_list = []
    manager = Manager()
    d = manager.dict()
    s = manager.Semaphore()
    d['sample_count'] = 0

    for x in range(threads):
        process = Process(target=convert_data_batch,
                          args=[
                              d, s, image_label_list, label_id_map,
                              out_path, ranges, x, shards
                          ])
        process.start()
        process_list.append(process)

    for x in process_list:
        x.join()

    logging.info('write finished')
    return d['sample_count']
def main(): if len(sys.argv) < 3: print("Usage:", sys.argv[0], "num_process", "num_resource") return m = Manager() num_process = int(sys.argv[1]) num_resource = int(sys.argv[2]) res_tab = [m.list([0] * num_process) for _ in range(num_resource)] res_tab = m.list(res_tab) res_tab_lock = m.Lock() res_sems = m.list([Resource(m, i) for i in range(num_resource)]) global_sema = m.Semaphore(0) nodes = [ Node(i, res_tab, res_tab_lock, num_resource, res_sems) for i in range(num_process) ] # the nodes stop after this many total requests are made max_req = num_process #killEvent = m.Event() # controller process controller = Process(target=check_for_deadlock, args=(res_tab, res_tab_lock, num_process, global_sema), daemon=True) controller.start() ''' processes = [] for i in range(num_process): processes.append(Process(target=check_for_deadlock, args=(), daemon=True)) processes[-1].start() ''' # the worker pool # it contains one process for each of the node in the # network. each process gets assigned to perform the # free -> request -> cs loop for one node. jobPool = Pool(processes=len(nodes)) jobPool.starmap_async( fire_node, zip(repeat(nodes), range(len(nodes)), repeat(max_req), repeat(global_sema))) #jobPool.close() # request done for _ in range(num_process): global_sema.acquire() #killEvent.wait() jobPool.terminate() #controller.close() controller.terminate() #controller.join() '''
def main():
    """Run a node network with a status printer and per-node workers.

    Takes the node count from argv.  A daemon printer periodically
    reports node statuses, one daemon per node runs init_processing,
    and a pool runs the free -> request -> cs loop via init_request.
    """
    node_count = int(sys.argv[1])  # number of nodes taken as argument

    mgr = Manager()
    shared_queue = mgr.list()
    shared_lock = mgr.Lock()
    shared_watch = mgr.Semaphore()

    # every node shares the same queue, lock and semaphore
    network = [
        Node(node_id, mgr, shared_queue, shared_lock, shared_watch,
             node_count) for node_id in range(node_count)
    ]

    # the nodes stop after this many total requests are made
    max_requests = node_count

    # printer process initiation: independent of the worker pool, it
    # wakes up at UPDATE_TIME intervals, queries and prints the status
    # of every node, and sleeps again
    printer = Process(target=print_status, args=(network, ), daemon=True)
    printer.start()

    handlers = []
    for node_id in range(node_count):
        proc = Process(target=init_processing,
                       args=(network, node_id),
                       daemon=True)
        handlers.append(proc)
        proc.start()

    # the worker pool contains one process per node in the network;
    # each gets assigned the free -> request -> cs loop for one node
    job_pool = Pool(processes=len(network))
    job_pool.starmap(
        init_request,
        zip(repeat(network), range(node_count), repeat(max_requests)))
    job_pool.close()
    job_pool.join()

    for proc in handlers:
        proc.join()
def main():
    """Sanitise raw report files in parallel and log the outcome.

    Validates the configured directories and mapping file (exiting with
    status 1 on any missing path), then feeds every file in the reports
    directory to sanitise_report via a process pool.  A manager
    semaphore of size cpu_count() is acquired per submission (and, per
    the call signature, released by sanitise_report) to bound the
    number of in-flight tasks.
    """
    if not os.path.isdir(settings.reports_directory):
        log.error(settings.reports_directory + " does not exist")
        sys.exit(1)

    logfile = os.path.join(settings.reports_directory, "sanitise.log")
    fh = logging.FileHandler(logfile)
    log.addHandler(fh)

    if not os.path.isdir(settings.archive_directory):
        log.error(settings.archive_directory + " does not exist")
        sys.exit(1)
    if not os.path.isfile(settings.bridge_db_mapping_file):
        log.error(settings.bridge_db_mapping_file + " does not exist")
        sys.exit(1)
    if not os.path.isdir(settings.sanitised_directory):
        log.error(settings.sanitised_directory + " does not exist")
        sys.exit(1)

    report_counter = 0
    manager = Manager()
    semaphore = manager.Semaphore(cpu_count())
    pool = Pool(processes=cpu_count())

    # iterate over report files
    report_files = list_report_files(settings.reports_directory)
    while True:
        try:
            semaphore.acquire()
            # builtin next() instead of the Python-2-only .next() method
            report_file = next(report_files)
            pool.apply_async(sanitise_report, (report_file, semaphore))
            report_counter += 1
        except StopIteration:
            break

    log.info("Waiting for all the tasks to finish")
    pool.close()
    pool.join()

    if report_counter > 0:
        log.info(str(report_counter) + " reports archived")
    else:
        log.info("No reports were found in the reports directory: " +
                 settings.reports_directory)
def get_objects_in_project_parallel(project, ccmpool=None, use_cache=False):
    """ Get all the objects and paths of project with use of multiple ccm sessions """
    sync_mgr = Manager()
    session_state = sync_mgr.dict()
    for session in ccmpool.sessionArray.values():
        session_state[session.getCCM_ADDR()] = {
            'free': True,
            'database': session.database
        }

    # grab one free session to bootstrap the traversal
    addr, db = get_and_lock_free_ccm_addr(session_state)
    session = SynergySession(db, ccm_addr=addr)
    session.keep_session_alive = True
    delim = session.delim()
    semaphore = sync_mgr.Semaphore(ccmpool.nr_sessions)

    # resolve the starting project object
    start_object = (ccm_cache.get_object(project, session)
                    if use_cache else SynergyObject(project, delim))

    # mark the bootstrap session free again; re-assign the whole entry
    # so the manager proxy notices the mutation
    state = session_state[addr]
    state['free'] = True
    session_state[addr] = state

    parent_queue = sync_mgr.Queue()
    child_queue = sync_mgr.Queue()
    child_queue.put((start_object, None))
    parent_queue.put(start_object)

    # producer/consumer pair walks the project hierarchy
    prod = Process(target=producer,
                   args=(child_queue, parent_queue, session_state))
    cons = Process(target=consumer,
                   args=(child_queue, parent_queue, session_state, semaphore,
                         delim, use_cache))
    prod.start()
    cons.start()
    logger.debug("Waiting to join")
    cons.join()
    hierarchy = parent_queue.get()
    prod.join()
    return hierarchy
def main(args):
    """Run the updater over the selected cases, one process per case.

    Builds the config, resolves the case list (from cfg.CASE_NAMES or
    the annotation folder), shards it by start_index/stride, launches
    run_updater in a Process per case, then optionally prints the
    summary statistics from the serialized results.
    """
    cfg = default_cfg.clone()
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_folder = mkdir(cfg.OUTPUT_FOLDER)
    assert_proper_output_dir(args.config_file, output_folder)

    start_time = time.time()
    processes = []

    case_names = cfg.CASE_NAMES
    if len(case_names) == 0:
        # default: every dot-free sub-directory of the annotation folder
        case_names = sorted([
            dir_name for dir_name in os.listdir(cfg.ANNOTATION_FOLDER)
            if os.path.isdir(os.path.join(cfg.ANNOTATION_FOLDER, dir_name))
            and "." not in dir_name
        ])

    if args.start_index is None:
        start_index = get_host_id() % args.stride
    else:
        start_index = args.start_index
    case_names = case_names[start_index::args.stride]

    manager = Manager()
    n_filter = manager.Semaphore(1)
    args.n_filter = n_filter

    # Launch one worker per case.  The stray `break  #ERASE` debug
    # leftover in the original made only the first case run; removed.
    for case_name in case_names:
        p = Process(target=run_updater, args=(cfg, args, case_name))
        processes.append(p)
        p.start()
    for p in processes:
        p.join()

    if not args.only_plot:
        for case_name in case_names:
            results = read_serialized(
                os.path.join(output_folder, "results",
                             "{}.json".format(case_name)))
            print(
                case_name, {
                    key: result
                    for key, result in results[case_name].items()
                    if key in ["sum", "mean", "max"]
                })
    print('| finish with time ', time.time() - start_time)
def __init__(self):
    """Set up the shared counter and the barrier semaphore."""
    self.n = 0
    mgr = Manager()
    # shared integer counter, starts at zero
    self.count = mgr.Value('i', 0)
    # semaphore used as a barrier; starts fully blocked
    self.barrier = mgr.Semaphore(0)
def __init__(self):
    """Create the manager-backed shared list and its guarding semaphore."""
    mgr = Manager()
    self._list = mgr.list()
    self._semaphore = mgr.Semaphore()
rowOf0 = True else: if (rowOf0): rowBoundaries.append(y) rowOf0 = False try: os.mkdir("pieces") except: pass threads = [] lock = Lock() manager = Manager() pieces = manager.list() semaphore = manager.Semaphore(numThreads) start = time.time() for y in range(0, len(rowBoundaries), 2): top = rowBoundaries[y] bottom = rowBoundaries[y + 1] for x in range(0, len(colBoundaries), 2): identifier = int((x + y * len(colBoundaries) / 2) / 2) + 1 try: os.mkdir("pieces/p%02d" % identifier) except: pass left = colBoundaries[x]
class AutoML(BaseEstimator): # TODO continuous multilabel # FIXME sparse config error # FIXME Submodule ConfigSpace # FIXME MAC memory_info # TODO pickle # TODO packaging # TODO repetitive update def __init__(self, time_limit_total=1000, time_limit_each=40, memory_limit=None, metalearning_k=10, ensemble_size=1, n_jobs=3, random_state=1, include_preprocessors=None, resampling='cv', log_filename=None, tmp_dir=None, debug=False): self.time_limit_total = time_limit_total self.time_limit_each = time_limit_each self.metalearning_k = metalearning_k self.ensemble_size = ensemble_size self.n_jobs = n_jobs self.memory_limit = memory_limit self.random_state = random_state self.include_estimators = ("xgradient_boosting", "extra_trees") self.include_preprocessors = include_preprocessors self.mode = 'alice' self.resampling = resampling self.log_filename = log_filename self.tmp_dir = tmp_dir self.debug = debug def fit(self, X, y, valid_X=None, valid_y=None, info=None, feat_type=None): # Input validation (raises ValueError) if valid_X is not None and valid_y is not None: self._has_valid_data = True elif valid_X is not None or valid_y is not None: raise ValueError("validation data is given improperly") else: self._has_valid_data = False if self.mode not in MODES: raise ValueError("Invalid mode setting") if self.resampling not in RESAMPLING: raise ValueError("Invalid resampling strategy setting") if self._has_valid_data and self.resampling == 'cv': warnings.warn( "cv does not use the predefined validation set. 
" "Valid set will be ignored", RuntimeWarning) X, y = check_X_y(X, y, accept_sparse='csr', force_all_finite=False, multi_output=True, copy=True, dtype=np.float32) X, y = shuffle(X, y, random_state=self.random_state) if self._has_valid_data: valid_X, valid_y = check_X_y(valid_X, valid_y, accept_sparse='csr', force_all_finite=False, multi_output=True, copy=True, dtype=np.float32) if info is None: info = dict() datamanager = get_datamanager(X, y, info=info, feat_type=feat_type) name_ = datamanager.name self._manager = datamanager self.task_ = self._manager.info['task'] self.label_num_ = self._manager.info["label_num"] self._time_budget = self.time_limit_total self._metric_with_default = self._manager.info['metric'] self.metric_ = self._metric_with_default if self.mode == 'metalearning': self.metric_ = None if self.task_ in CLASSIFICATION_TASKS: self._metric_available = CLASSIFICATION_METRICS elif self.task_ in REGRESSION_TASKS: self._metric_available = REGRESSION_METRICS else: raise NotImplementedError() else: self._metric_available = [self.metric_] if self.metalearning_k <= 0: logging.warning("Improper value of metalearning_k, setting to 0") self.metalearning_k = 0 # StopWatch setting self._stopwatch = StopWatch() self._stopwatch.start_task(name_) log_level = logging.INFO if self.debug: log_level = logging.DEBUG ########################################################### # setup logger by. 
bigwig ########################################################### self._logger = logging.getLogger(name_) formatter = logging.Formatter( '[%(levelname)s] [%(asctime)s:%(name)s] %(message)s', datefmt='%H:%M:%S') if not self.log_filename: print("logging at stream") logHandler = logging.StreamHandler() logHandler.setFormatter(formatter) else: print("logging at " + self.log_filename) logHandler = logging.FileHandler(self.log_filename, mode='w') logHandler.setFormatter(formatter) self._logger.setLevel(log_level) self._logger.addHandler(logHandler) print("logging with " + self._logger.name) # ensemble checking if self.ensemble_size == 1: self._logger.info("ensemble_size=1, Non-ensemble mode.") elif self.ensemble_size > 1: self._logger.info( "Now it will construct an ensemble with size %d." % self.ensemble_size) else: raise NotImplementedError("Invalid ensemble_size is given.") # Handling for small-size data if self._manager.info['train_num'] < 3000: if self.ensemble_size > 1: warnings.warn( "training data is too small to construct ensemble model." "ensemble_size will be set to 1", RuntimeWarning) self.ensemble_size = 1 if self.resampling == 'holdout': warnings.warn( "Now the resampling strategy is set to holdout, " "but it is recommended to use cv strategy" "due to its small data size", RuntimeWarning) elif self._manager.info['train_num'] > 100000: if self.resampling == 'cv': warnings.warn( "For a large dataset, cv is inefficient compared to holdout," "while the advantage is negligible." 
"So, we'll use holdout instead of cv.", RuntimeWarning) self.resampling = 'holdout' time_load_data = self._stopwatch.wall_elapsed(name_) self._print_time("LoadData", time_load_data) # Calculate metafeatures task_name = "CalculateMetafeatures" self._stopwatch.start_task(task_name) meta_features = self._manager.metafeatures(X, y) self._stopwatch.stop_task(task_name) self._print_time(task_name, self._stopwatch.wall_elapsed(task_name)) if meta_features is None: # Current : task is not in TASK_TYPES raise NotImplementedError() task_name = 'EncodeX' self._stopwatch.start_task(task_name) X = self._manager.encode_X(X) if self._has_valid_data: valid_X = self._manager.encode_X(valid_X, trans_only=True) self._stopwatch.stop_task(task_name) self._print_time(task_name, self._stopwatch.wall_elapsed(task_name)) task_name = 'CalculateMetafeaturesEncoded' self._stopwatch.start_task(task_name) meta_features_enc = datamanager.metafeatures(X, y) if meta_features_enc is None: raise ValueError("meta_features_encoded is None") meta_features.values.update(meta_features_enc.values) self._stopwatch.stop_task(task_name) self._print_time(task_name, self._stopwatch.wall_elapsed(task_name)) # Create a search space task_name = "CreateConfigSpace" from ConfigSpace.hyperparameters import CategoricalHyperparameter self._stopwatch.start_task(task_name) self.configuration_space = get_hyperspace( self._manager.info, include_estimators=self.include_estimators, include_preprocessors=self.include_preprocessors) #import networkx as nx #print(self.configuration_space.dag.node[("classifier:xgradient_boosting:max_delta_step")]["weight"]) #import matplotlib.pyplot as plt #nx.draw(self.configuration_space.dag) n_hyperparameters = len(self.configuration_space._hyperparameters) idx2hps = self.configuration_space._idx_to_hyperparameter hps = [ self.configuration_space.get_hyperparameter(idx2hps[idx]) for idx in range(n_hyperparameters) ] self._space_categorical = np.array( map(lambda x: isinstance(x, 
CategoricalHyperparameter), hps)) self._stopwatch.stop_task(task_name) self._print_time(task_name, self._stopwatch.wall_elapsed(task_name)) task_name = "MetaInitialize" self._stopwatch.start_task(task_name) _task = TASK_TYPES_TO_STRING[self.task_] if self.task_ == MULTILABEL_CLASSIFICATION: _task = "binary.classification" cur_dir = os.path.dirname(__file__) meta_dir = os.path.join( cur_dir, "metalearning/files", "%s_%s_%s" % (METRIC_TO_STRING[self._metric_with_default], _task, ['dense', 'sparse'][self._manager.info['is_sparse']])) meta_subset = (subsets['all']).copy() if self.task_ in CLASSIFICATION_TASKS: meta_subset -= EXCLUDE_META_CLASSIFICATION elif self.task_ in REGRESSION_TASKS: meta_subset -= EXCLUDE_META_REGRESSION meta_subset -= META_MISSING_VALUES meta_list = list(meta_subset) meta_opt = MetaLearningOptimizer( name=name_ + SENTINEL, configuration_space=self.configuration_space, meta_dir=meta_dir, metric='l1', seed=self.random_state, use_features=meta_list, subset='all', logger=self._logger) # TODO This is hacky, I must find a different way of adding a new dataset! # TODO db ? optimization point..! meta_opt.meta_base.add_dataset(name_ + SENTINEL, meta_features) runs = meta_opt.suggest_all(exclude_double_config=True) self.meta_initial_ = runs[:self.metalearning_k] self._stopwatch.stop_task(task_name) self._print_time(task_name, self._stopwatch.wall_elapsed(task_name)) self.validation_score_, self.model_best_ = defaultdict(lambda: 0), {} self.ensemble_ = {} self.score_history_ = [] task_name = "SelectModelSetting" self._stopwatch.start_task(task_name) self._dm = DumpManager(self.tmp_dir) mem_s = memory_usage() self._split_and_dump(X, y, valid_X, valid_y) del X, y, valid_X, valid_y n = gc.collect() self._logger.debug("Garbage Collecting... 
%d" % n) mem_data = mem_s - memory_usage() self._mem_expansion = 12 * mem_data self._stopwatch.stop_task(task_name) self._print_time(task_name, self._stopwatch.wall_elapsed(task_name)) ensemble_ = self.query_model() task_name = "FinalizeFit" self._stopwatch.start_task(task_name) softmax_sum = 0 for model, score in ensemble_.items(): softmax_i = np.exp(score) ensemble_[model] = softmax_i softmax_sum += softmax_i for model, sm_score in ensemble_.items(): ensemble_[model] = sm_score / softmax_sum for model, weight in ensemble_.items(): estimator = joblib.load(model) self.ensemble_[estimator] = weight for m in self._metric_available: model = self.model_best_[m] estimator = joblib.load(model) self.model_best_[m] = estimator if self.mode == 'metalearning': for m in self._metric_available: self.update_metalearning(meta_features, m) self._logger.info("A new metalearning data is completely updated") self._stopwatch.stop_task(task_name) self._print_time(task_name, self._stopwatch.wall_elapsed(task_name)) self._logger.info("Best model: %s" % str(self.model_best_)) self._logger.info("Final mode: \n%s" % str(self.ensemble_)) self._logger.info("Accuracy: %s" % self.validation_score_) self._logger.info("Cleaning the temporary folder") self._dm.clean_up() return self def predict(self, X): if not hasattr(self, 'ensemble_'): raise ValueError("Some training data should be fitted " "before calling predict method") if self.task_ in REGRESSION_TASKS: X = X.copy() if self._manager.encoder: X = self._manager.encode_X(X, trans_only=True) predictions = [] for model, weight in self.ensemble_.items(): prediction = model.predict(X) predictions.append(prediction * weight) prediction = np.sum(np.array(predictions), axis=0) elif self.task_ in CLASSIFICATION_TASKS: prediction = self.predict_proba(X) else: raise NotImplementedError() if self.task_ == BINARY_CLASSIFICATION: prediction = prediction[:, 1] #if self.task_ in [MULTICLASS_CLASSIFICATION]: # prediction = np.argmax(prediction, axis=1) #elif 
self.task_ in [BINARY_CLASSIFICATION, MULTILABEL_CLASSIFICATION]: # prediction = np.around(prediction) if len(prediction) == 0: raise ValueError( 'Something went wrong generating the predictions.') return prediction def predict_proba(self, X): if not hasattr(self, 'ensemble_'): raise ValueError("Some training data should be fitted " "before calling predict_proba method") X = X.copy() if self._manager.encoder: X = self._manager.encode_X(X, trans_only=True) predictions = [] if self.task_ in REGRESSION_TASKS: raise AttributeError( "Regression task cannot perform predict_proba") for model, weight in self.ensemble_.items(): prediction = model.predict_proba(X) predictions.append(prediction * weight) predictions = np.sum(np.array(predictions), axis=0) return predictions def score(self, X, y): if self.task_ in REGRESSION_TASKS: prediction = self.predict(X) else: prediction = self.predict_proba(X) return _calculate_score(y, prediction, self.task_, self.metric_) def query_model(self): from multiprocessing import Manager self._sync_manager = Manager() self._hash = {} defaults = get_default_configs(self._manager.info, self.configuration_space) initials = self.meta_initial_ window = {} self.early_stopped_ = False task_name = "SelectModelTest" self._stopwatch.start_task(task_name) # Do not evaluate default configurations more than once testee = [] for idx, config in enumerate(defaults + initials): if idx >= len(defaults) and config in defaults: continue testee.append(config) window, mfo_data, mfo_labels = self.test_configurations(testee, window) if self._check_early_stop(): self._logger.info("Early-stopped") return window def test_configurations(self, configurations, window): results, test_hash_inv = self._evaluate(configurations) if len(window) == 0: if len(results) == 0: raise NoModelFittedException( "Cannot find any profit model. 
" "Raise time_limit and just try to run again.") if len(results) < len(configurations) / 4: warnings.warn( "Can't gather enough models " "for this time limit setting. " "Relax the time_limit parameter.", UserWarning) window_ = self._update_window(results, window, test_hash_inv) if self.validation_score_[self._metric_with_default] <= 0: raise NoModelFittedException( "Cannot find any profit model. " "Raise time_limit and just try to run again.") mfo_data, mfo_labels = self._settle_test(configurations, results, self._metric_available) return window_, mfo_data, mfo_labels def _update_window(self, results, window, update_hash_inv): for score, model, idx in results: self._dm.keep_file(model) for score, model, idx in results: for m in self._metric_available: score_ = score[m] if self.mode == 'metalearning' else score if score_ > self.validation_score_[m]: self.validation_score_[m] = score_ self.model_best_[m] = model best_score = self.validation_score_[self._metric_with_default] self.score_history_.append(best_score) if self.mode == "metalearning" or self.ensemble_size == 1: window_ = {self.model_best_[self._metric_with_default]: best_score} return window_ # ensemble_size > 1 && not metalearning mode # In this case we updates windows as well as _hash raise NotImplementedError() for score, model in results: model_hash = update_hash_inv[model] if self._hash.has_key(model_hash): o_model = self._hash[model_hash] if score > window[o_model]: del window[o_model] window[model] = score else: if score > 0: window[model] = score survived = sorted(window, key=window.get, reverse=True)[:self.ensemble_size] if len(window) > self.ensemble_size: window = {k: v for k, v in window.items() if k in survived} # Re-update the best model, since the issue with (unordered) dictionary self.model_best_[self.metric_] = survived[0] self.validation_score_[self.metric_] = window[survived[0]] return window def _settle_test(self, testee, results, metrics): mfo_data, mfo_labels = [], defaultdict(list) 
success = [] for score, model, idx in results: success.append(idx) config = testee[idx] mfo_data.append(config) for m in metrics: score_ = score[m] if self.mode == "metalearning" else score label = 1 - max(0, score_) mfo_labels[m].append(label) fail = [id for id in range(len(testee)) if id not in success] for idx in fail: mfo_data.append(testee[idx]) for m in metrics: mfo_labels[m].append(1) return mfo_data, mfo_labels def _evaluate(self, configs): tasks = [] test_hash_inv = {} results = self._sync_manager.list() if self.memory_limit is None: self.memory_limit = self._mem_expansion + 4096 else: self.memory_limit = max(3096, self.memory_limit) mem_limit_base = memory_usage() self.memory_limit += mem_limit_base self._logger.info("Setting memory_limit to %s" % (self.memory_limit - mem_limit_base)) if self.n_jobs == 0: raise ValueError('n_jobs == 0 in Parallel has no meaning') if self.n_jobs < 0: from multiprocessing import cpu_count self.n_jobs = max(cpu_count() + 1 + self.n_jobs, 1) for idx, config in enumerate(configs): sema = self._sync_manager.Semaphore(1) model_file = self._dm.alloc_file() test_hash_inv[model_file] = self._hash_configuration(config) proc = EvalProcess(config, results, sema, model_file, idx, self.task_, self._datafile, self._logger, self.metric_, self.time_limit_each, self.memory_limit) tasks.append((proc, sema)) handle_tasks(tasks, self.time_limit_each, self.memory_limit, self.n_jobs, self._logger) return results, test_hash_inv def _hash_configuration(self, configuration): vector = configuration._vector categoricals = vector[self._space_categorical] return hash(categoricals.tostring()) def _split_and_dump(self, X, y, valid_X, valid_y): if not hasattr(self, '_dm'): raise ValueError( "It should be called after the dumpmanager _dm is set") if self.resampling == 'cv': pass elif self.resampling == 'holdout': if not self._has_valid_data: data_size = y.shape[0] if data_size >= 100000: valid_ratio = 0.3 elif 15000 <= data_size < 100000: valid_ratio = 0.2 
else: valid_ratio = 0.15 valid_size = int(data_size * valid_ratio) X, valid_X = X[valid_size:], X[:valid_size] y, valid_y = y[valid_size:], y[:valid_size] else: raise NotImplementedError() pkl = { "resampling": self.resampling, "X": X, "y": y, "valid_X": valid_X, "valid_y": valid_y } datafile = os.path.join(self._dm.dir, "data.pkl") joblib.dump(pkl, datafile, protocol=-1) self._datafile = datafile return datafile def update_metalearning(self, metafeatures, metric): import arff, csv from metalearning.aslib_simple import AlgorithmSelectionProblem cur_dir = os.path.dirname(__file__) meta_dir = os.path.join( cur_dir, "metalearning/files", "%s_%s_%s" % (METRIC_TO_STRING[metric], TASK_TYPES_TO_STRING[self.task_], ['dense', 'sparse'][self._manager.info["is_sparse"]])) meta_reader = AlgorithmSelectionProblem(meta_dir, self._logger) config_info = meta_reader.get_config_info() config = self.model_best_[metric].configuration with open(os.path.join(meta_dir, 'configurations.csv'), 'a') as f: wrt = csv.DictWriter(f, config_info['fields']) new_row = {} new_idx = int(config_info['last_idx']) + 1 new_row['idx'] = new_idx for key in config_info['fields']: if key == 'idx': continue new_row[key] = config[key] wrt.writerow(new_row) with open(os.path.join(meta_dir, 'algorithm_runs.arff'), 'r') as f: arff_dict = arff.load(f) data = [] for key, type_ in arff_dict['attributes']: if key.upper() == 'INSTANCE_ID': data.append(self._manager.name) elif key.upper() == "REPETITION": data.append(1) elif key.upper() == "ALGORITHM": data.append(new_idx) elif key.upper() == "RUNSTATUS": data.append('ok') else: val = self.validation_score_[metric] data.append(val) arff_dict['data'].append(data) with open(os.path.join(meta_dir, 'algorithm_runs.arff'), 'w') as f: arff.dump(arff_dict, f) with open(os.path.join(meta_dir, 'feature_values.arff'), 'r') as f: arff_dict = arff.load(f) data = [] for key, type_ in arff_dict['attributes']: if key.upper() == 'INSTANCE_ID': data.append(self._manager.name) elif 
key.upper() == "REPETITION": data.append(1) else: try: val = metafeatures.values[key].value except KeyError: val = '?' data.append(val) arff_dict['data'].append(data) with open(os.path.join(meta_dir, 'feature_values.arff'), 'w') as f: arff.dump(arff_dict, f) def _check_early_stop(self): score = self.validation_score_[self._metric_with_default] has_eps_loss = score > 1 - SCORE_EPSILON if has_eps_loss: self._logger.info("Found epsilon-loss model") return True if self.mode == 'alice': self._logger.info("Alice mode") return True def _print_time(self, task_name, time_check): self._time_budget = max(0, self._time_budget - time_check) self._logger.info( '%s | remaining time: %5.2f sec | memory usage: %5.2f MB' % (task_name, self._time_budget, memory_usage()))
def run_adept(cfg, rank, num_machines, tim_key, distributed):
    """Score ADEPT scenes and (optionally) aggregate relative scores.

    If cfg.MODULE_CFG.ANALYZE_RESULTS_FOLDER is the string "None",
    scene scores are computed here: serially in DEBUG mode, otherwise
    via a process pool over the shard all_scenes[rank::num_machines].
    Otherwise OUTPUT_DIR is pointed at the existing results folder.
    When not distributed, previously serialized per-scene results are
    grouped into control/surprise pairs, relative scores are
    bootstrapped, and a *_relative_scores.json is written per
    attributes key.
    """
    if cfg.MODULE_CFG.ANALYZE_RESULTS_FOLDER == "None":
        # collect every scene json across all attribute keys / datasets
        all_scenes = []
        for attributes_key in cfg.MODULE_CFG.ATTRIBUTES_KEYS:
            for dataset_name in cfg.MODULE_CFG.DATASETS.TEST:
                dataset_jsons_dir = get_jsons_directory(
                    cfg.DATA_CFG, "adept", attributes_key, dataset_name)
                dataset_files = sorted(os.listdir(dataset_jsons_dir))
                all_scenes.extend([{
                    "scene_file": os.path.join(dataset_jsons_dir, d),
                    "dataset_split": dn,
                    "perception": attr
                } for d, dn, attr in zip(dataset_files, repeat(dataset_name),
                                         repeat(attributes_key))])

        # shared semaphore handed to compute_scores; with count 1 it
        # acts as a mutex across workers
        manager = Manager()
        n_filter = manager.Semaphore(1)

        if cfg.MODULE_CFG.DEBUG:
            # debug: optional scene whitelist, then score serially
            if len(cfg.MODULE_CFG.DEBUG_VIDEOS) > 0:
                all_scenes = [
                    s for s in all_scenes
                    if s["scene_file"] in cfg.MODULE_CFG.DEBUG_VIDEOS
                ]
            results = [
                compute_scores(*w) for w in zip(
                    repeat(cfg),
                    all_scenes,  #[:3][rank::num_machines],
                    repeat(n_filter), repeat(cfg.MODULE_CFG.OUTPUT_DIR),
                    repeat(tim_key), repeat(distributed))
            ]
        else:
            # production: score this machine's shard in a process pool
            with Pool(int(cpu_count())) as p:
                results = p.starmap(
                    compute_scores,
                    zip(repeat(cfg), all_scenes[rank::num_machines],
                        repeat(n_filter), repeat(cfg.MODULE_CFG.OUTPUT_DIR),
                        repeat(tim_key), repeat(distributed)))
        # send_results_to_tim(cfg.MODULE_CFG.OUTPUT_DIR, tim_key)
        # write_serialized(results,os.path.join(cfg.MODULE_CFG.OUTPUT_DIR,
        # str(attributes_key)+"results.json"))
    else:
        # reuse a previous run's results folder
        cfg.MODULE_CFG.OUTPUT_DIR = cfg.MODULE_CFG.ANALYZE_RESULTS_FOLDER
        # results = read_serialized(os.path.join(cfg.MODULE_CFG.OUTPUT_DIR, "results.json"))

    # NOTE(review): in the mangled source the indentation of this
    # aggregation section is ambiguous; it is placed at function level
    # (running after either branch above) — confirm against the
    # original file.
    if not distributed:
        base_dir = cfg.MODULE_CFG.OUTPUT_DIR
        for attributes_key in cfg.MODULE_CFG.ATTRIBUTES_KEYS:
            # group by dataset
            attri_dir = os.path.join(base_dir, attributes_key)
            results = [
                read_serialized(os.path.join(attri_dir, v))
                for v in os.listdir(attri_dir)
            ]
            #group by matched surprise/control for relative scores
            group_by_control_surprise = CONTROL_SURPRISE_GROUPERS[
                cfg.DATA_CFG.BASE_NAME]
            grouped_dataset = group_by_control_surprise(results)
            scores_per_stimuli = defaultdict(list)
            for stimuli in grouped_dataset:
                for control_surprise_g in grouped_dataset[stimuli]:
                    g_score = relative_score(
                        grouped_dataset[stimuli][control_surprise_g])
                    scores_per_stimuli[stimuli].append(g_score)
                    scores_per_stimuli['total'].append(g_score)
            # bootstrap the mean relative score per stimuli class
            avg_relative_scores = {
                k: bs.bootstrap(np.array(v), stat_func=bs_stats.mean)
                for k, v in scores_per_stimuli.items()
            }
            write_serialized(
                avg_relative_scores,
                os.path.join(cfg.MODULE_CFG.OUTPUT_DIR,
                             str(attributes_key) + "_relative_scores.json"))
            print(scores_per_stimuli)
if self.waitingEvent.is_set(): self.waitingEvent.clear() self.waitingEvent.wait() def f(queueLock, idleSemaphore, waitingEvent, queue, time, i): s = ShiftQueue(queueLock, idleSemaphore, waitingEvent, queue, time, i) s.setShift(3) s.setIdle() assert(s.getTime() == 3) s.setShift(9) s.setShift(4) s.setIdle() assert(s.getTime() == 7) s.setIdle() assert(s.getTime() == 12) return if __name__ == '__main__': manager = Manager() queueLock = manager.Lock() idleSemaphore = manager.Semaphore(NUM_MASTER - 1) waitingEvent = manager.Event() queue = manager.list() time = manager.list([0]) processes = [Process(target=f, args=(queueLock, idleSemaphore, waitingEvent, queue, time, i)) for i in range(0, NUM_MASTER)] for p in processes: p.start() for p in processes: p.join()
class TestManager:
    """
    Class manages the testers and helps them to execute steps in
    synchronized order. Each tester is executed in a separated process.
    """

    def __init__(self):
        self.manager = Manager()
        self.lock = self.manager.Lock()
        # released once per tester that finished its current step
        self.process_done = self.manager.Semaphore(0)
        # testers report (proc, name, status) tuples here
        self.queue = self.manager.Queue()
        # pids of sub-processes that must be killed on failure
        self.sub_proc = self.manager.Queue()
        self._setup()

    def _setup(self):
        # per-run mutable state, reset on construction
        self.testers = []
        self.next_steps = []
        self.proc_ids = []
        self.subprocToKill = []

    def add_tester(self, tester):
        self.testers.append(tester)

    def start_processes(self, rand_sleep):
        """create process for each tester"""
        self.pids = self.manager.Array('l', range(len(self.testers)))
        for id in range(len(self.testers)):
            # pre-release so run()'s first wait loop passes immediately
            self.process_done.release()
            next_s = self.manager.Semaphore(0)
            p = Process(target=self.testers[id].run,
                        args=(self.process_done, next_s, rand_sleep,
                              self.lock, self.sub_proc, self.pids, id,
                              self.queue))
            self.proc_ids.append(p)
            self.next_steps.append(next_s)
            p.start()
            self.pids[id] = p.pid

    def wait_for_processes(self):
        """wait for all process to finish"""
        for p in self.proc_ids:
            p.join()
            p.terminate()
        self.lock.acquire()
        self.lock.release()

    def run(self, rand_sleep=True):
        """Execute tester steps"""
        self.start_processes(rand_sleep)
        step = -1
        # list(range(...)) rather than a bare range(): Python 3 ranges
        # are immutable, and both collections are appended to / sliced
        # below (the original only worked under Python 2).
        will_continue = list(range(len(self.next_steps)))
        wait_for = list(range(len(self.next_steps)))
        while True:
            if step >= 0:
                print("\n\n=================== TestManager step", step,
                      "testers:", wait_for, file=sys.stderr)
            for _ in wait_for:
                self.process_done.acquire()
                if step >= 0:
                    proc, name, status = self.queue.get()
                    print(("Received ", proc, name, status), file=sys.stderr)
                    if status == True:
                        will_continue.append(proc)
                    elif isinstance(status, BaseException):
                        # a tester failed: kill everything and re-raise
                        print("Error in tester", proc, name, "step", step)
                        for p in self.proc_ids:
                            p.terminate()
                        while not self.sub_proc.empty():
                            pid = self.sub_proc.get()
                            try:
                                os.kill(pid, signal.SIGKILL)
                            except OSError:
                                # process already gone / not killable;
                                # narrowed from a bare except
                                pass
                        raise status
            if len(will_continue) == 0:
                break
            # wake every tester that reported success for the next step
            for id in will_continue:
                self.next_steps[id].release()
            wait_for = will_continue[:]
            will_continue = []
            step += 1
        self.wait_for_processes()
outputList.append(myBlock) outputSemaphore.release() if __name__ == '__main__': freeze_support() Tk().withdraw() manager = Manager() dic = dictionaryLoader.getDictionaries(manager) a = 1 threads = cpu_count() * 2 procs = [] inputList = manager.list() output = manager.list() inputSemaphore = manager.Semaphore(0) outputSemaphore = manager.Semaphore(0) try: while a <= threads: p = Process(target=translationThread, args = (a, dic, inputList, output, inputSemaphore, outputSemaphore)) p.start() procs.append(p) a+=1 except Exception as e: print("Unable to start thread " + str(a) + ":" + str(e)) file = fileLoader.getFile() options = TranslationOptions() if basename(file) == "RPGMKTRANSPATCH": options.setRPGMakerTrans()