def multi_init_phot(my, f, b, chips):
    """Run ``init_phot_worker`` once per chip in a pathos process pool.

    Args:
        my: Opaque object forwarded unchanged to every worker.
        f: Forwarded to ``stack_tools.get_cuts`` and to every worker.
        b: Forwarded to ``stack_tools.get_cuts`` and to every worker.
        chips: Iterable of chips; one task is submitted per element.

    Returns:
        The list returned by ``pool.map``: one worker result per chip.
    """
    cuts = stack_tools.get_cuts(f, b)
    shared_args = [my, f, b, cuts]

    # active_children() joins already-finished child processes as a side
    # effect; the list it returns is not needed here.
    multiprocessing.active_children()

    pool = pp.ProcessPool(
        processes=multiprocessing.cpu_count() * 2,
        maxtasksperchild=2,
    )
    # NOTE(review): private pathos calls; presumably they drop a cached
    # (possibly closed) pool and serve a fresh one -- confirm against pathos.
    pool._clear()
    pool._serve()

    # Every task is a two-element list: [shared arguments, chip].
    all_args = [[shared_args, chip] for chip in chips]
    try:
        return pool.map(init_phot_worker, all_args)
    finally:
        # Shut the workers down even when a worker raised.
        pool.close()
        pool.join()
def multi_phot(my, f, chips, parsed_args):
    """Run ``phot_worker`` once per chip in a pathos process pool.

    Args:
        my: Opaque object forwarded unchanged to every worker.
        f: Forwarded unchanged to every worker.
        chips: Iterable of chips; one task is submitted per element.
        parsed_args: Forwarded unchanged to every worker.

    Returns:
        The list returned by ``pool.map``: one worker result per chip.
    """
    shared_args = [my, f, parsed_args]

    # active_children() joins already-finished child processes as a side
    # effect; the list it returns is not needed here.
    multiprocessing.active_children()

    pool = pp.ProcessPool(
        processes=multiprocessing.cpu_count() * 2,
        maxtasksperchild=2,
    )
    # NOTE(review): private pathos calls; presumably they drop a cached
    # (possibly closed) pool and serve a fresh one -- confirm against pathos.
    pool._clear()
    pool._serve()

    # Every task is a two-element list: [shared arguments, chip].
    all_args = [[shared_args, chip] for chip in chips]
    try:
        return pool.map(phot_worker, all_args)
    finally:
        # Shut the workers down even when a worker raised.
        pool.close()
        pool.join()
def run(self, ngen, freq, migr, startingGen=0):
    """Evolve every island for generations ``startingGen`` .. ``ngen - 1``.

    Args:
        ngen: Exclusive upper bound of the generation loop.
        freq: Migration is performed in generation index ``i`` when
            ``i % freq == 0`` and ``self.gen < ngen - 1``.
        migr: Number of times ``self.migration`` is applied per migration.
        startingGen: First generation index (default 0).

    Returns:
        Tuple ``(self.islands, self.hof, self.metrics, self.accMetrics,
        self.history)``.
    """
    self.metrics = []
    # NOTE(review): this pool is created but never used below (the built-in
    # ``map`` does the work). Kept so pool start-up side effects are
    # unchanged -- confirm whether ``pool.map`` was intended.
    pool = pools.ProcessPool(10)

    for i in range(startingGen, ngen):
        self.gen = i + 1
        if self.verbose:
            print("GEN: " + str(i + 1) + "/" + str(ngen))

        # Materialise the results: on Python 3 ``map`` returns a one-shot
        # iterator, which would be empty by the time the metrics are built.
        self.results = list(map(self.algorithm, self.islands))
        self.islands = [pop for pop, _ in self.results]

        n_results = len(self.results)
        self.metrics += map(
            self.genMetrics,
            [i] * n_results,
            range(n_results),
            [logbook for _, logbook in self.results],
        )

        for isle in self.islands:
            self.history.update(isle)

        self.beforeMigration(self)
        if i % freq == 0 and self.gen < ngen - 1:
            # ``_`` avoids shadowing the generation index ``i``.
            for _ in range(migr):
                self.islands = self.migration(self.islands)
        self.afterMigration(self)

    # Each genMetrics call contributed one iterable of records; flatten them.
    self.metrics = [val for sublist in self.metrics for val in sublist]
    # Order by generation, then by island within a generation.
    self.metrics = sorted(self.metrics,
                          key=lambda k: (k['gen'], k['island']))
    self.accMetrics = accumulateStats(self.metrics)
    return self.islands, self.hof, self.metrics, self.accMetrics, self.history
def get_collisions_(self, cam_idxs, LAFs):
    """Find, per camera, which patches intersect each other.

    Args:
        cam_idxs: Per-sample camera ids; must provide ``.numpy()`` and be
            iterable (presumably a 1-D torch tensor -- TODO confirm).
        LAFs: Indexable by an integer index array; each selected element is
            passed to ``get_points`` together with ``self.patch_size``.

    Returns:
        A flat list with one entry per sample, grouped camera by camera in
        ascending camera id. Each entry is the array of sample indices from
        the same camera whose polygon intersects that sample's polygon.
    """
    # ``np.int`` was only an alias of the builtin and no longer exists in
    # recent NumPy releases; the builtin ``int`` is equivalent.
    all_cams = np.unique(cam_idxs.numpy().astype(int)).tolist()
    # One array of sample indices per camera.
    cam_idxs = [
        np.array([i for i, x in enumerate(cam_idxs) if x == c])
        for c in all_cams
    ]
    printc.green('searching collisions')

    def wrap_fce(patch_size, LAFs):
        # Bind patch_size / LAFs into the function shipped to the workers.
        def fce(cur_sidxs):
            points = np.array(
                [get_points(patch_size, laf) for laf in LAFs[cur_sidxs]])
            # Build every polygon once instead of once per pairwise test.
            polygons = [Polygon(pts) for pts in points]
            collisions = []
            for polygon in polygons:
                collides = np.array(
                    [polygon.intersects(other) for other in polygons])
                collisions.append(cur_sidxs[collides])
            return collisions

        return fce

    p = pp.ProcessPool(multiprocessing.cpu_count())
    p.restart()
    collisions = p.map(wrap_fce(self.patch_size, LAFs), cam_idxs)
    p.close()
    return list(itertools.chain.from_iterable(collisions))
def multitask(s, w='stack'):
    """Run the stack or source worker once per chip of ``s`` in a pool.

    Args:
        s: Object exposing ``chips`` (iterable); forwarded to every worker.
        w: ``'stack'`` selects ``stack_worker``, ``'source'`` selects
            ``source_worker``.

    Returns:
        The list returned by ``pool.map``: one worker result per chip.

    Raises:
        ValueError: If ``w`` is neither ``'stack'`` nor ``'source'``.
    """
    # Validate before any process is started; previously an unknown value
    # surfaced as an UnboundLocalError after the pool had been spun up.
    if w == 'stack':
        worker = stack_worker
    elif w == 'source':
        worker = source_worker
    else:
        raise ValueError(
            "w must be 'stack' or 'source', got {!r}".format(w))

    # active_children() joins already-finished child processes as a side
    # effect; the list it returns is not needed here.
    multiprocessing.active_children()

    pool = pp.ProcessPool(
        processes=multiprocessing.cpu_count() * 2,
        maxtasksperchild=2,
    )
    # NOTE(review): private pathos calls; presumably they drop a cached
    # (possibly closed) pool and serve a fresh one -- confirm against pathos.
    pool._clear()
    pool._serve()

    logger = multiprocessing.get_logger()
    logger.setLevel(logging.INFO)
    shared_args = [s, logger]

    # Every task is a two-element list: [shared arguments, chip].
    all_args = [[shared_args, chip] for chip in s.chips]
    try:
        return pool.map(worker, all_args)
    finally:
        # Shut the workers down even when a worker raised.
        pool.close()
        pool.join()
def getAvgTimebwEventsUsers(self,selectedUsers=True, nCPU=1):
    """Map ``self.getMeanTimeHelper`` over every unique user in a pool.

    Args:
        selectedUsers: Forwarded to ``self.determineDf`` to pick the frame
            handed to each helper call.
        nCPU: Size passed to ``pp.ProcessPool``.

    Returns:
        The list returned by ``pool.map``: one helper result per unique
        value of ``self.df['user']``.
    """
    df = self.determineDf(selectedUsers)
    # NOTE(review): users are taken from ``self.df`` rather than the ``df``
    # selected above -- confirm this is intended.
    unique_users = self.df['user'].unique()

    # Each task pairs the selected frame with one user.
    tasks = []
    for user in unique_users:
        tasks.append((df, user))

    worker_pool = pp.ProcessPool(nCPU)
    return worker_pool.map(self.getMeanTimeHelper, tasks)
def compute_scores(self, estimator):
    """Score the training slice of reports in parallel and return the MAP.

    Stores ``estimator`` on ``self.cur_estimator`` (before the pool is
    used), maps ``self.get_report_score`` over
    ``reports[train_split_index_start:train_split_index_end]`` and prints
    the MAP, MRR and accuracy-at-k figures.

    Args:
        estimator: Stored on ``self.cur_estimator`` for the scoring calls.

    Returns:
        The value of ``self.MAP`` over the first element of every score.

    Raises:
        ZeroDivisionError: If the second elements of all scores sum to 0.
    """
    dp = DataProcessor()
    reports = dp.read_and_process_report_data(self.path_to_reports_data,
                                              self.project)
    reports_to_process = reports[self.train_split_index_start:
                                 self.train_split_index_end]

    # NOTE(review): pool size is fixed at 10 regardless of report count.
    pool = pp.ProcessPool(10)
    # Must be set before mapping: ``self`` travels with the bound method.
    self.cur_estimator = estimator
    all_scores = pool.map(self.get_report_score, reports_to_process)
    # The pool is deliberately left open (close()/join() were disabled in
    # the original). NOTE(review): presumably because pathos reuses pools
    # across calls -- confirm before adding a shutdown here.

    all_matrixes = [score[0] for score in all_scores]
    total_tried = sum(score[1] for score in all_scores)
    number_achieved = sum(score[2] for score in all_scores)

    # print() with a single pre-formatted argument emits the same text on
    # Python 2 and Python 3 (the old statements were Python-2-only syntax).
    print("finished pooling")
    print(all_scores)
    final_MAP_score = self.MAP(all_matrixes)
    final_MRR_score = self.MRR(all_matrixes)
    print("{}  final MAP score".format(final_MAP_score))
    print("{}  final MRR score".format(final_MRR_score))
    print("{}  final accuracy at k score".format(
        float(number_achieved) / float(total_tried)))
    return final_MAP_score
def upload_to_es(self, parallel=True, reindex_nested_features=False):
    """Upload every entry of ``self.fulltext``, serially or via a pool.

    inputs:
        parallel (bool): when False the entries are uploaded one by one in
            this process; otherwise a process pool sized by ``cpu_count()``
            is used and progress is shown with tqdm
        reindex_nested_features (bool): optional boolean selecting
            ``upload_single_file_with_reindex`` instead of
            ``upload_single_file``
    """
    if reindex_nested_features:
        upload = self.upload_single_file_with_reindex
    else:
        upload = self.upload_single_file

    if not parallel:
        for entry in self.fulltext:
            upload(entry)
        return

    # start multiprocessing pool
    worker_pool = pp.ProcessPool(cpu_count())
    progress = tqdm(worker_pool.imap(upload, self.fulltext),
                    total=len(self.fulltext))
    # Drain the lazy iterator so that every upload actually runs.
    list(progress)
    worker_pool.close()
    worker_pool.join()
def random_sweep_mp(self, num_trials, sweep_funcs, fixed_params=None):
    """Train ``num_trials`` randomly parameterised models in a process pool.

    Args:
        num_trials: Number of trials (one pool task each).
        sweep_funcs: Mapping of parameter name to a zero-argument callable;
            each callable is invoked once per trial to draw that parameter.
        fixed_params: Optional mapping of parameters shared by all trials.
            It is copied for every trial and never modified.

    Returns:
        ``(history, max_model)``. ``history`` maps the sorted tuple of drawn
        ``(name, value)`` pairs to the value returned by ``validate``;
        ``max_model`` is the first model with the highest such value, or
        ``None`` when there are no results.
    """

    def worker(_):
        # Copy so the caller's dict is not polluted with the drawn values.
        params = dict(fixed_params) if fixed_params is not None else {}
        random_params = {k: fn() for k, fn in sweep_funcs.items()}
        params.update(random_params)
        m = self.model_type(**params)
        m.train(self.X_train, self.y_train)
        combo_key = tuple(sorted(random_params.items()))
        acc = m.validate(self.X_val, self.y_val)
        return combo_key, acc, m

    pool = pp.ProcessPool(NUM_CPUS)
    results = []
    try:
        with tqdm(total=num_trials) as pbar:
            for res in pool.imap(worker, range(num_trials)):
                results.append(res)
                pbar.update()
    finally:
        # Shut the workers down even when a trial raised.
        pool.close()
        pool.join()

    max_score = float('-inf')
    max_model = None
    history = {}
    for combo_key, acc, model in results:
        history[combo_key] = acc
        # Strict comparison keeps the earliest model on ties.
        if acc > max_score:
            max_score = acc
            max_model = model
    return history, max_model
def __init__(self, args):
    """Configure the client from parsed command-line arguments.

    Reads ``threads``, ``images_list``, ``timeout``, ``batch_size``,
    ``request_format``, ``output_name``, ``input_name``, ``model_name``,
    ``rest_url`` and ``verbose`` from ``args``, sets the level of ``_log``,
    loads the lines of the image list file (repeating them until there are
    at least ``batch_size`` lines) and creates the process pool.

    Raises:
        ValueError: If the image list file has no lines while
            ``batch_size`` is positive.
    """
    self.args = args
    self.threads = int(args.threads)
    self.input_images = args.images_list
    self.timeout = int(args.timeout)
    self.batch_size = int(args.batch_size)
    self.request_format = args.request_format
    self.output_name = args.output_name
    self.input_name = args.input_name
    self.model_name = args.model_name
    self.address = "{}/v1/models/{}:predict".format(
        args.rest_url, self.model_name)
    self.verbose = args.verbose

    if self.verbose >= 3:
        _log.setLevel(logging.DEBUG)
    elif self.verbose == 2:
        _log.setLevel(logging.INFO)
    elif self.verbose == 1:
        _log.setLevel(logging.ERROR)
    else:
        _log.setLevel(logging.CRITICAL)

    # Load and validate the input before any worker process is started.
    with open(self.input_images) as f:
        self.lines = f.readlines()
    if not self.lines and self.batch_size > 0:
        # Doubling an empty list never reaches batch_size; fail fast
        # instead of looping forever.
        raise ValueError(
            "image list {!r} contains no lines".format(self.input_images))
    # Repeat the list until one full batch can be drawn from it.
    while self.batch_size > len(self.lines):
        self.lines += self.lines

    self.p = pp.ProcessPool(self.threads)
    _log.debug("Initialization completed")
def fit(self, URM):
    """Build ``self.W_sparse`` by mapping ``_partial_fit`` over every column.

    Args:
        URM: Matrix convertible by ``sps.csc_matrix``; stored in CSC form on
            ``self.URM_train``.

    The pool calls ``self._partial_fit(copy_of_URM_train, item_index)`` for
    each column index; every call must return a ``(values, rows, cols)``
    triple. The triples are concatenated into one float32 CSC matrix of
    shape ``(n_items, n_items)``.
    """
    self.URM_train = sps.csc_matrix(URM)

    if self.use_tail_boost:
        # NOTE(review): reads and writes ``self.URM`` although everything
        # below uses ``self.URM_train`` -- confirm which one is intended.
        self.URM = Helper().tail_boost(self.URM)

    n_items = self.URM_train.shape[1]
    print("Iterating for " + str(n_items) + "times")

    # Hand the workers a copy since each _partial_fit call modifies it.
    urm_copy = self.URM_train.copy()
    fit_column = partial(self._partial_fit, urm_copy)
    column_results = pp.ProcessPool(self.workers).map(
        fit_column, np.arange(n_items))

    # column_results holds one (values, rows, cols) triple per column.
    all_values, all_rows, all_cols = [], [], []
    for col_values, col_rows, col_cols in column_results:
        all_values.extend(col_values)
        all_rows.extend(col_rows)
        all_cols.extend(col_cols)

    self.W_sparse = sps.csc_matrix(
        (all_values, (all_rows, all_cols)),
        shape=(n_items, n_items),
        dtype=np.float32,
    )
def fetch_record_count(self, date_condition):
    """Run one per-table record-count Hive query in parallel.

    For every entry of ``self.input_conf_dict["table_list"]`` a
    ``hive -S -e`` command string is built and handed to
    ``self.run_hive_query`` through a process pool.

    Args:
        date_condition: Value interpolated into ``where dt > '...'``.

    Returns:
        ``self.trim_hive_line`` applied to the newline-joined messages of
        the successful queries, or ``None`` if anything raised (the
        traceback is logged). Tables whose query did not report
        ``"success"`` are appended to ``self.failed_table_list``.
    """
    query_list = []
    hive_results = ""
    try:
        logger.info("Fetching the record count ...")
        table_list = self.input_conf_dict["table_list"]
        final_query = ''' hive -S -e " set hive.exec.parallel=true; set hive.exec.parallel.thread.number=20 ; '''
        # NOTE(review): table name, database name and date_condition are
        # interpolated straight into a command string; make sure they never
        # come from untrusted input.
        for table_vals in table_list:
            query = " select dt, '{0}' as table_name, count(*) as rec_count from {2}.{0} where dt > '{1}' group by dt ; \" ".format(
                table_vals["table_name"],
                date_condition,
                self.input_conf_dict["common_config"]["database_name"].strip())
            query_list.append({
                "table_name": table_vals["table_name"],
                "query": final_query + query,
            })

        # Never start more workers than there are queries.
        pool_size = min(len(query_list), PARALLEL_POOL_SIZE)
        thread_pool = Pool.ProcessPool(pool_size)
        job_list_results = thread_pool.map(self.run_hive_query, query_list)

        for query_results in job_list_results:
            if query_results["status"] == "success":
                hive_results = hive_results + "\n" + query_results["message"]
            else:
                self.failed_table_list.append(query_results["table_name"])
        return self.trim_hive_line(hive_results)
    except Exception:
        # ``Exception`` rather than a bare except, so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        logger.error(str(traceback.format_exc()))
        return None
def multiprocess_array(ary, func, num_workers, **kwargs):
    """Applies multi-processing on a segmented array using the given function.

    Parameters
    ----------
    ary : list
        List 'like' object to be split for multiple workers.
    func : Object
        A function object that the workers should apply on the input data
        array. It is called with ONE argument, the tuple
        ``(segment, kwargs)``.
    num_workers : int
        The number of processes the program is allowed to use. This number
        is used to split up the input array into various segments; it is
        capped at ``len(ary)``.
    **kwargs
        Collected into a dict and handed to ``func`` as the second element
        of its single tuple argument.

    Returns
    -------
    list
        Containing the results from each of the workers; empty when ``ary``
        is empty.

    Raises
    ------
    ValueError
        If ``num_workers`` is smaller than 1 for a non-empty ``ary``.
    """
    # Nothing to split: avoid a zero-sized pool and a zero-way split.
    if len(ary) == 0:
        return []

    # Check out available worker count and adjust accordingly.
    num_workers = min(int(num_workers), len(ary))
    if num_workers < 1:
        raise ValueError("num_workers must be at least 1")

    # Divide the array into chunks for the workers.
    pool = pp.ProcessPool(nodes=num_workers)
    result = pool.amap(
        func,
        [(d, kwargs) for d in numpy.array_split(ary, num_workers)])
    return result.get()
def __init__(self, args):
    """Configure the client, fetch the sample dataset and build the payload.

    Reads ``threads``, ``timeout``, ``url``, ``batch_size`` and ``verbose``
    from ``args``, sets the level of ``_log``, creates the process pool,
    downloads/extracts the cats-and-dogs archive through
    ``tf.keras.utils.get_file`` and serialises one batch produced by
    ``self.setup_input_images`` as JSON into ``self.data``.
    """
    self.args = args
    self.threads = int(args.threads)
    self.timeout = int(args.timeout)
    self.url = args.url
    self.batch_size = int(args.batch_size)
    self.verbose = args.verbose

    # Translate the verbosity count into a logging level.
    verbosity = self.verbose
    if verbosity >= 3:
        log_level = logging.DEBUG
    elif verbosity == 2:
        log_level = logging.INFO
    elif verbosity == 1:
        log_level = logging.ERROR
    else:
        log_level = logging.CRITICAL
    _log.setLevel(log_level)

    self.p = pp.ProcessPool(int(self.threads))

    _URL = 'https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip'
    path_to_zip = tf.keras.utils.get_file(
        'cats_and_dogs.zip', origin=_URL, extract=True)
    # The images live next to the downloaded archive.
    archive_dir = os.path.dirname(path_to_zip)
    self.image_path = os.path.join(archive_dir, 'cats_and_dogs_filtered')

    self.batch = self.setup_input_images(self.batch_size)
    payload = {"instances": self.batch}
    self.data = json.dumps(payload)
    _log.debug("Initialization completed")
def _pool_map(func, *arrays):
    """Map ``func`` over ``arrays`` in a process pool; return the transpose.

    The mapped results are converted to a NumPy array and transposed. The
    pool is then closed, joined, terminated and restarted, in that order.
    """
    workers = pp.ProcessPool(NUM_THREADS)
    raw_results = workers.map(func, *arrays)
    transposed = np.array(raw_results).T

    # Shutdown/restart sequence; the order matters.
    workers.close()
    workers.join()
    workers.terminate()
    workers.restart()

    return transposed
def init_local(self):
    """Start the backend selected by ``self._boost``.

    ``"pathos"`` creates a process pool of ``CPUS_COUNT`` workers on
    ``self._id_pool``; ``"ray"`` calls ``ray.init`` with the configured
    memory, CPU and GPU counts. Any other value does nothing.
    """
    boost = self._boost
    if boost == "pathos":
        self._id_pool = pp.ProcessPool(self.CPUS_COUNT)
        return
    if boost == "ray":
        ray.init(
            memory=self._memory,
            num_cpus=self.CPUS_COUNT,
            num_gpus=self.GPUS_COUNT,
        )
def download(emb_name, set_name=None, vs_format=None):
    '''Download the shared embedding `emb_name` into the working directory.

    Args:
        emb_name(str): Name of the selected embedding
        set_name(opt, str): Specify if multiple embeddings exist with same name
        vs_format(opt, str): Passed to `_error_check`; the resolved value
            'large' selects the multi-part download, anything else a single
            file download.

    Returns:
        The result of `pd.read_csv` on `<emb_name>.csv`, which is written to
        the current working directory.

    Raises:
        KeyError: If the `DW_AUTH_TOKEN` environment variable is not set.
    '''
    DW_API_TOKEN = os.environ['DW_AUTH_TOKEN']
    payload, headers = "{}", {'authorization': 'Bearer ' + DW_API_TOKEN}

    def emb_download(appx_num):
        # Fetch one "-appx<N>" part. ``set_name`` is looked up when this
        # runs, i.e. after _error_check below has resolved it.
        part = "-appx" + str(appx_num)
        query_url = ("https://query.data.world/file_download/" + set_name +
                     part + "/" + emb_name + part + '.csv')
        response = requests.request("GET", query_url, data=payload,
                                    headers=headers)
        # The file is opened in binary mode, so write the raw bytes.
        with io.open(emb_name + part + '.csv', 'wb') as download_emb:
            download_emb.write(response.content)

    set_name, vs_format = _error_check(emb_name, set_name=set_name,
                                       vs_format=vs_format)
    if vs_format == 'large':
        # NOTE(review): the names are concatenated into the query unquoted
        # -- confirm the query dialect accepts this.
        num_appx = "SELECT app_num FROM " + info.INDEX_FILE + " WHERE embedding_name = " + emb_name + " and dataset_name = " + set_name
        app_count = int(dw.query(info.INDEXER, num_appx).dataframe.iloc[0][0])

        # Iterate over the part COUNT (the original passed the SQL string
        # to range()). The pool is only created when it is needed.
        multiproc = pp.ProcessPool(proc_cnt)
        multiproc.map(emb_download, list(range(app_count)))

        # Stitch the parts together; binary on both sides keeps the bytes
        # exactly as downloaded.
        with io.open(emb_name + '.csv', 'wb') as compiled:
            with io.open(emb_name + '-appx0.csv', 'rb') as first_appx:
                compiled.write(first_appx.read())
            for i in range(1, app_count):
                part_name = emb_name + "-appx" + str(i) + '.csv'
                with io.open(part_name, 'rb') as appx:
                    # Skip the header line repeated in every later part.
                    next(appx, None)
                    for line in appx:
                        compiled.write(line)
                os.remove(part_name)
    else:
        query_url = "https://query.data.world/file_download/" + set_name + "/" + emb_name + '.csv'
        response = requests.request("GET", query_url, data=payload,
                                    headers=headers)
        with io.open(emb_name + '.csv', 'wb') as download_emb:
            download_emb.write(response.content)

    return pd.read_csv(emb_name + '.csv')
def parallelize(rows, func):
    """Apply ``func`` to every element of ``rows`` in a process pool.

    The pool size is the number of CPUs this process may run on when
    ``os.sched_getaffinity`` exists, otherwise 4. The mapped results are
    discarded; the function returns ``None``.
    """
    print("Processing {} items".format(len(rows)))

    has_affinity = hasattr(os, 'sched_getaffinity')
    processCount = len(os.sched_getaffinity(0)) if has_affinity else 4
    print('processes {}'.format(processCount))

    with pp.ProcessPool(processes=processCount) as worker_pool:
        worker_pool.map(func, rows)
def generate_frequencies(self, word_estimator, fragment_estimators, alphabet):
    """Estimate fragment frequencies in parallel and assemble candidate strings.

    Args:
        word_estimator: Returned unchanged as the second tuple element.
        fragment_estimators: Mapping of key -> callable; each callable is
            invoked with a fragment and its return value is stored as that
            fragment's count. Keys are later looked up by the values of
            ``fragment_indices`` (presumably fragment start positions --
            TODO confirm against the caller).
        alphabet: Container with an ``add`` method; NOTE: it is mutated in
            place (``self.padding_char`` is added).

    Returns:
        Tuple ``(D, word_estimator, self.padding_char)`` where ``D`` is the
        list of strings joined from the per-position fragment combinations.
    """
    alphabet.add(self.padding_char)
    freq_oracle = word_estimator
    fragment_estimators = fragment_estimators  # no-op self-assignment
    D = []
    fragments = self.__generate_fragments(alphabet)
    # Missing keys yield an empty Counter.
    frequency_dict = defaultdict(lambda: Counter())

    # Computationally checking the frequency estimates of every possible fragment is slow
    # We use a process pool to make this quicker
    # We use the pathos library since the standard multiprocessing library doesn't allow pool maps in class methods
    pool = pp.ProcessPool()

    def estimate_fragments(key, frag_estimator):
        # Evaluate one estimator on every fragment. ``fragments`` is the
        # enclosing local, which is REBOUND further down; this function is
        # only used by the uimap call below, consumed before that rebinding.
        frag_dict = dict()
        for frag in fragments:
            frag_dict[frag] = frag_estimator(frag)
        return key, Counter(frag_dict)

    # estimate_fragments = lambda key, frag_estimator: (key, Counter({k:v for k,v in map(lambda x: (x, frag_estimator(x)), fragments)}))

    # Results are keyed explicitly, so they do not rely on arrival order.
    pool_map = pool.uimap(estimate_fragments, fragment_estimators.keys(),
                          fragment_estimators.values())
    for item in pool_map:
        frequency_dict[item[0]] = item[1]

    # hash_table[key][position] -> list of values from __split_fragment.
    hash_table = defaultdict(lambda: defaultdict(list))
    fragment_indices = np.arange(0, self.max_string_length,
                                 step=self.fragment_length)
    for l in fragment_indices:
        # NOTE(review): ``.get(l)`` returns None (not an empty Counter) for
        # a missing key, which would make ``most_common`` fail.
        fragments = frequency_dict.get(l).most_common(self.threshold)
        for fragment in fragments:
            # fragment is a (fragment, count) pair; only the fragment is split.
            key, value = self.__split_fragment(fragment[0])
            hash_table[key][l].append(value)

    for dictionary in hash_table.values():
        fragment_list = list(dictionary.values())
        # Only keys that collected values at every fragment position count.
        if len(dictionary.keys()
               ) == self.max_string_length / self.fragment_length:
            D += list(
                map(lambda x: str().join(x),
                    itertools.product(*fragment_list)))
    return D, freq_oracle, self.padding_char
def _pool_map(func, array):
    """Map ``func`` over ``array`` in a process pool; return the transpose.

    The mapped results are converted to a NumPy array and transposed. The
    pool is then closed, joined, cleared, terminated and restarted, in that
    order.
    """
    workers = pp.ProcessPool(NUM_PROCESSES)
    raw_results = workers.map(func, array)
    transposed = np.array(raw_results).T

    # Shutdown/restart sequence; the order matters.
    workers.close()
    workers.join()
    workers.clear()
    workers.terminate()
    workers.restart()

    return transposed
def GetProcessPool():
    """Creates a pathos.pools.ProcessPool with default arguments.

    Kept as its own function so unit tests can replace it: pathos can still
    run into pickling issues with the MagicMocks used in tests.

    Returns:
      A pathos.pools.ProcessPool instance.
    """
    process_pool = pools.ProcessPool()
    return process_pool
def multi_fn(lst):
    """Map ``worker2`` over ``lst`` in a pathos process pool.

    Args:
        lst: Iterable of task arguments; one ``worker2`` call per element.

    Returns:
        The list returned by ``pool.map`` (previously the results were
        computed and then dropped, so the function returned ``None``).
    """
    pool = pp.ProcessPool(
        processes=multiprocessing.cpu_count() * 2,
        maxtasksperchild=2,
    )
    # NOTE(review): private pathos calls; presumably they drop a cached
    # (possibly closed) pool and serve a fresh one -- confirm against pathos.
    pool._clear()
    pool._serve()
    try:
        return pool.map(worker2, lst)
    finally:
        # Release the workers, matching the other pool helpers in this file.
        pool.close()
        pool.join()
def parallellized_loop(self):
    """Run ``self.procedure`` for every entry index; report the first result.

    ``self.procedure`` is mapped over ``range(len(self.reshaped_entry_ids))``
    in a pool of ``self.num_threads`` processes. Only the first result is
    used; it must unpack into exactly three values.

    Returns:
        ``(label_shape, feature_shape, feature_sum)`` from the first call.
    """
    process_map = lambda x: self.procedure(x)
    import pathos.pools as pp

    entry_indices = range(len(self.reshaped_entry_ids))
    worker_pool = pp.ProcessPool(self.num_threads)
    outputs = worker_pool.map(process_map, entry_indices)

    first_output = outputs[0]
    label_shape, feature_shape, feature_sum = first_output
    return label_shape, feature_shape, feature_sum
def __init__(self, asset_service: AssetService, calendar_service: CalendarService, assets, start_date=cfg.DEFAULT_START_DATE, window_length=126):
    """Store the collaborators and settings and create the process pool.

    Args:
        asset_service: Stored on ``self.asset_service``.
        calendar_service: Stored on ``self.calendar_service``.
        assets: Stored on ``self.assets``.
        start_date: Stored on ``self._start_date``.
        window_length: Stored on ``self.window_length`` (default 126).
    """
    # Collaborating services.
    self.asset_service = asset_service
    self.calendar_service = calendar_service

    # Configuration.
    self.assets = assets
    self._start_date = start_date
    self.window_length = window_length

    # One worker per CPU reported by ``mp.cpu_count()``.
    cpu_total = mp.cpu_count()
    self.pool = pp.ProcessPool(cpu_total)
def __data_generation(self, batch_fIdA, batch_fIdB):
    """Generate one batch by mapping ``_generate_sample`` over two id lists.

    ``self._generate_sample`` is called with paired elements of
    ``batch_fIdA`` and ``batch_fIdB`` in a pool of 4 processes when
    ``self.mix`` is truthy, otherwise 1. Each call must return a
    ``(sound, label)`` pair.

    Returns:
        ``(sounds, labels)`` as NumPy arrays; one-dimensional labels are
        converted with ``keras.utils.to_categorical`` using
        ``self.n_classes``.
    """
    if self.mix:
        worker_count = 4
    else:
        worker_count = 1

    sample_pool = pp.ProcessPool(worker_count)
    sample_pairs = sample_pool.map(self._generate_sample, batch_fIdA,
                                   batch_fIdB)

    # Split the list of (sound, label) pairs into two parallel sequences.
    sound_seq, label_seq = zip(*sample_pairs)
    sounds, labels = np.array(sound_seq), np.array(label_seq)

    if labels.ndim == 1:
        labels = keras.utils.to_categorical(labels, self.n_classes)
    return sounds, labels
def run_qaqc(self, las_paths):
    """Run the QA/QC checks on ``las_paths``, in parallel if configured.

    With ``self.config.multiprocess`` set, each path is handed to
    ``self.run_qaqc_checks_multiprocess`` in a pool of half the CPU count
    (at least 1) with a tqdm progress bar; otherwise
    ``self.run_qaqc_checks`` receives the whole list.
    """
    if not self.config.multiprocess:
        self.run_qaqc_checks(las_paths)
        return

    worker_total = max(int(ph.cpu_count() / 2), 1)
    worker_pool = pp.ProcessPool(worker_total)
    las_total = len(las_paths)

    results = worker_pool.imap(self.run_qaqc_checks_multiprocess, las_paths)
    # Iterating drives the lazy imap; the results themselves are not used.
    for _ in tqdm(results, total=las_total, ascii=True):
        pass

    worker_pool.close()
    worker_pool.join()
    worker_pool.clear()
def process_all_files(self, cloned_repo_path, reports, all_processed_path, temp_path):
    # Check out the parent of the first report's commit, run process_file on
    # the repository's .java files into
    # <all_processed_path><first reportID>/, then copy a slice of the reports
    # to temp folders and finish them in a process pool.
    #
    # cloned_repo_path:   repository root; becomes the working directory.
    #                     Joined by plain string concatenation, so it
    #                     presumably must end with "/" -- TODO confirm.
    # reports:            sequence of objects with .commit and .reportID.
    # all_processed_path: output prefix (string concatenation, see above).
    # temp_path:          prefix of the per-report temp folders.
    #
    # NOTE: Python 2 syntax (print statements).
    os.chdir(cloned_repo_path)
    first_report = reports[0]
    base_commit = str(first_report.commit)
    # "<commit>~1" is the first parent of the report's commit.
    prev_current_commit = base_commit + "~1"
    print "calling_checkout"
    # NOTE(review): the command is built by string concatenation and run via
    # os.system; the exit status is ignored.
    os.system("git checkout " + prev_current_commit)
    print "done"
    base_path = all_processed_path + str(first_report.reportID) + "/"
    print base_path
    count = 0
    for dir_, _, files in os.walk(cloned_repo_path):
        for fileName in files:
            # Counts (and prints) every file seen, not only .java files.
            count += 1
            print count
            if not fileName.endswith(".java"):
                continue
            relDir = os.path.relpath(dir_, cloned_repo_path)
            relFile = os.path.join(relDir, fileName)
            infile_path = cloned_repo_path + relFile
            outfile_path = base_path + relFile
            to_create = os.path.dirname(outfile_path)
            try:
                os.makedirs(to_create)
            except:
                # NOTE(review): any failure -- including "directory already
                # exists" -- skips this file entirely, so only the first
                # .java file of each new directory reaches process_file.
                # Confirm whether this is intended.
                continue
            out_file = outfile_path + ".txt"
            if not os.path.isfile(
                    out_file
            ):  # avoids reprocessing when the output file already exists
                self.process_file(infile_path, out_file)
    # NOTE(review): only reports[14:15] (a single report) is handled below;
    # looks like a leftover debugging slice -- confirm.
    report_datas = [
        (report, all_processed_path, base_commit, base_path,
         temp_path + str(report.reportID) + "/", cloned_repo_path)
        for report in reports[14:15]
    ]
    report_datas_short = [
        (temp_path + str(report.reportID) + "/", all_processed_path,
         report.reportID) for report in reports[14:15]
    ]
    # The copy into the temp folders is done serially, one report at a time.
    for report_data in report_datas:
        self.insert_to_temp(report_data)
    # After copying to the temp folders has finished, do the processing in
    # parallel.
    pool = pp.ProcessPool(16)
    res = pool.map(self.temp_to_processed, report_datas_short)
    pool.close()
    pool.join()
def __init__(self, args):
    """Configure the exporter: logging, metric objects and the process pool.

    Reads ``threads``, ``timeout``, ``port`` and ``verbose`` from ``args``,
    sets the level of ``_log``, creates the Gauge/Counter/Histogram objects
    exposed by the exporter and finally a process pool of ``threads``
    workers.
    """
    self.args = args
    self.threads = args.threads
    self.timeout = int(args.timeout)
    self.port = int(args.port)
    self.verbose = args.verbose

    # Translate the verbosity count into a logging level.
    verbosity = self.verbose
    if verbosity >= 3:
        log_level = logging.DEBUG
    elif verbosity == 2:
        log_level = logging.INFO
    elif verbosity == 1:
        log_level = logging.ERROR
    else:
        log_level = logging.CRITICAL
    _log.setLevel(log_level)

    #
    # Set metrics to expose
    # Gauges
    self.dpdkexporter_busy_percent = Gauge(
        'dpdk_telemetry_busy_percent', '',
        ['socket', 'port', 'aggregate'])
    self.dpdkexporter_idle_status = Gauge(
        'dpdk_telemetry_idle_status', '',
        ['socket', 'type', 'direction', 'port', 'aggregate'])

    # Counter
    self.dpdkexporter_polls = Counter(
        'dpdk_telemetry_polls_total', '',
        ['socket', 'type', 'port', 'aggregate'])
    self.dpdkexporter_packets = Counter(
        'dpdk_telemetry_packets_total', '',
        ['socket', 'type', 'direction', 'priority', 'port', 'aggregate'])
    self.dpdkexporter_bytes = Counter(
        'dpdk_telemetry_bytes_total', '',
        ['socket', 'type', 'direction', 'port', 'aggregate'],
        unit='bytes')
    self.dpdkexporter_errors = Counter(
        'dpdk_telemetry_errors_total', '',
        ['socket', 'type', 'direction', 'port', 'aggregate'])
    self.dpdkexporter_idle_count = Counter(
        'dpdk_telemetry_idle_total', '',
        ['socket', 'type', 'direction', 'port', 'aggregate'])

    # Histogram
    self.buckets = (64, 128, 256, 512, 1024, 1522, float("inf"))
    self.dpdkexporter_packets_size = Histogram(
        'dpdk_telemetry_packets_size', '',
        ['socket', 'direction', 'port', 'aggregate'],
        buckets=self.buckets)

    worker_total = int(self.threads)
    self.p = pp.ProcessPool(worker_total)
def GetProcessPool(nodes=None):
    """Creates a pathos.pools.ProcessPool with the requested node count.

    Kept as its own function so unit tests can replace it: pathos can still
    run into pickling issues with the MagicMocks used in tests.

    Args:
      nodes: How many processes to spawn in the process pool; forwarded as
        the ``nodes`` keyword argument.

    Returns:
      A pathos.pools.ProcessPool instance.
    """
    process_pool = pools.ProcessPool(nodes=nodes)
    return process_pool
def log_density(self, n_smc = 100, v = None):
    """Estimate the log-likelihood of observations with a particle filter.

    Args:
        n_smc: Number of particles.
        v: Observation matrix whose column 0 holds the observation times and
            whose remaining columns hold the observed values; defaults to
            ``self.cvalue``.

    Returns:
        The accumulated sum of ``log(mean(exp(log-weights)))`` over the time
        steps, or ``-inf`` (after printing "Inf") as soon as all weights of
        a step are zero.
    """
    p = pp.ProcessPool()
    if v is None:
        v = self.cvalue
    jd = self.jdet  # read but not used below

    # Column 0 of the observations holds the observation times.
    times = v[:, 0]
    n_times = times.shape[0]
    # Every remaining column is one species.
    n_species = v.shape[1] - 1
    params = self.params

    # particles[k, t, :] is the state of particle k at time index t.
    particles = np.zeros((n_smc, n_times, n_species))
    # ``self.initial_state`` is read once per particle.
    for k in range(n_smc):
        particles[k, 0, :] = self.initial_state

    llhood_est = 0
    weights = np.full(n_smc, 1 / n_smc)

    # One SMC step per interval between consecutive observation times.
    for step, delta in enumerate(np.diff(times), start=1):
        # Resampling step
        picked = random.choices(range(n_smc), weights = weights, k = n_smc)
        resampled = particles[picked, step - 1, :]

        # Simulate forward in time
        res = list(p.map(lambda x: self.sim_delta(x, params, delta),
                         resampled))

        # A ``None`` result gets log-weight -inf and an all-zero state.
        new_weights = []
        new_states = []
        for r in res:
            if r is None:
                new_weights.append(-np.inf)
                new_states.append(np.zeros(n_species))
            else:
                new_weights.append(self.obs_log_likelihood(v[step, 1:], r))
                new_states.append(r)
        particles[:, step, :] = np.array(new_states)

        n = np.exp(new_weights)
        d = sum(n)
        if d == 0:
            print("Inf")
            return -np.inf
        weights = n / d

        # Update likelihood estimate
        llhood_est += np.log(np.mean(n))

    return llhood_est