def _featurize_complexes(self, df, featurizer, parallel=True,
                         worker_pool=None):
    """Generates circular fingerprints for dataset."""
    protein_pdbs = list(df["protein_pdb"])
    ligand_pdbs = list(df["ligand_pdb"])
    complexes = zip(ligand_pdbs, protein_pdbs)

    def featurize_wrapper(ligand_protein_pdb_tuple):
        ligand_pdb, protein_pdb = ligand_protein_pdb_tuple
        print("Featurizing %s" % ligand_pdb[0:2])
        molecule_features = featurizer.featurize_complexes(
            [ligand_pdb], [protein_pdb])
        return molecule_features

    if worker_pool is None:
        features = []
        for ligand_protein_pdb_tuple in zip(ligand_pdbs, protein_pdbs):
            features.append(featurize_wrapper(ligand_protein_pdb_tuple))
    else:
        if worker_pool is None:
            worker_pool = ProcessingPool(mp.cpu_count())
            features = worker_pool.map(featurize_wrapper,
                                       zip(ligand_pdbs, protein_pdbs))
        else:
            features = worker_pool.map_sync(featurize_wrapper,
                                            zip(ligand_pdbs, protein_pdbs))
    #features = featurize_wrapper(zip(ligand_pdbs, protein_pdbs))
    df[featurizer.__class__.__name__] = list(features)
def map(self, f, seq):
    """
    Parallel implementation of map.

    Parameters
    ----------
    f : callable
        A function to map to all the values in 'seq'
    seq : iterable
        An iterable of values to process with 'f'

    Returns
    -------
    results : list, shape=[len(seq)]
        The evaluated values
    """
    if self.n_jobs < 1:
        n_jobs = multiprocessing.cpu_count()
    elif self.n_jobs == 1:
        return list(map(f, seq))
    else:
        n_jobs = self.n_jobs

    pool = Pool(n_jobs)
    results = list(pool.map(f, seq))
    # Closing/joining is not really allowed because pathos sees pools as
    # lasting for the duration of the program.
    return results
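# A minimal, self-contained sketch (not part of the original module) of the
# pattern the method above wraps: pathos' ProcessingPool can map lambdas and
# closures because it serializes with dill, which stdlib multiprocessing cannot.
# The 2-node pool and the toy squaring job are illustrative assumptions.
from pathos.multiprocessing import ProcessingPool as Pool

def demo_parallel_map():
    pool = Pool(nodes=2)                                # 2 worker processes
    results = pool.map(lambda x: x * x, range(10))      # lambdas are fine with dill
    # pathos caches pools for reuse; close/join/clear release the workers
    pool.close()
    pool.join()
    pool.clear()
    return results

if __name__ == '__main__':
    print(demo_parallel_map())   # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]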
def image_division(self):
    image_rows, image_cols = self.__image.shape[:2]
    print self.__image.shape[:2]
    grid_indices = [
        np.array([x, y])
        for x in xrange(0, image_cols - self.__GRID_SIZE, self.__GRID_SIZE)
        for y in xrange(0, image_rows - self.__GRID_SIZE, self.__GRID_SIZE)
    ]
    pool = Pool()
    output = pool.map(self.grid_division, grid_indices)
    threshod_sucess_sample = 6
    ransacGrouper = RansacLine(1, threshod_sucess_sample, 25, 2)
    for index, edgels in enumerate(output):
        if len(edgels) > threshod_sucess_sample:
            ransacGrouper.edgels = edgels
            ransac_groups = ransacGrouper.applay_parallel_ransac()
            self.line_segment(ransac_groups)

    # print len(self.__lines)
    # for line in self.__lines:
    #     print (line.slope, line.intercept)
    #     coefficients = np.array([line.slope, line.intercept])
    #     # print "cof: ", coefficients
    #     x = np.array([20, 50], dtype=np.int32)
    #     polynomial = np.poly1d(coefficients)
    #     # print "Poly: ", polynomial
    #     y = polynomial(x)
    #     y = [int(e) for e in y]
    #     print "x: ", x, "y: ", y
    #     cv2.line(self.__image, (x[0], y[0]), (x[1], y[1]), (0, 255, 0), 1)

    cv2.imshow("image", self.__image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
def apply(values):
    pool = Pool()
    # result = []
    result = pool.map(func, values)
    # result.append(ret)
    # pool.close()
    # pool.join()
    return result
def compute_importance(self, alpha):
    """
    """
    pool = ProcessingPool(self._numJobs)
    errors = pool.map(self._computeImportanceOfTree,
                      [alpha] * self._numTree, range(self._numTree))
    return np.array(errors).mean(axis=0)
def alignAllShapes(self):
    import pathos.multiprocessing as mp
    start = time.time()
    pool = Pool()
    self.allShapes = pool.map(self.alignOneShape, self.allShapes)
    # for sh in self.allShapes:
    #     self.alignOneShape( sh )
    print 'alignAllShapes: %f' % (time.time() - start)
    return
def run_all_control_analysis(self):
    dirs = dir_walker(self.encode_root)
    control_dir = None
    for d in dirs:
        if 'control' in d.lower():
            control_dir = d
    assert control_dir is not None
    replicates = dir_walker(control_dir, level=1)
    pool = ProcessingPool(nodes=14)
    pool.map(self.control_analysis, tuple(replicates))
    return replicates
def register_stack_to_template(frames, template, regfn, njobs=4, **fnargs):
    """
    Given stack of frames (or a FSeq obj) and a template image,
    align every frame to template and return a list of functions,
    which take an image and return warped image, aligned to template.
    """
    if njobs > 1:
        pool = ProcessingPool(nodes=njobs)
        out = pool.map(partial(regfn, template=template, **fnargs), frames)
    else:
        out = np.array([regfn(img, template, **fnargs) for img in frames])
    return out
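# Hedged sketch (illustrative only): the same partial-plus-map pattern as above,
# with a stand-in registration function. `fake_register`, the `order` keyword,
# and the 4-node pool are assumptions for demonstration, not the original code.
from functools import partial
import numpy as np
from pathos.multiprocessing import ProcessingPool

def fake_register(img, template, order=1):
    # placeholder "registration": returns the per-pixel difference
    return np.asarray(img) - np.asarray(template)

def demo_register_stack(frames, template, njobs=4):
    pool = ProcessingPool(nodes=njobs)
    # freeze the non-iterable arguments, then map over the frames
    return pool.map(partial(fake_register, template=template, order=1), frames)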
def main(): from hyperopt import fmin,tpe,hp,Trials from hyperopt.mongoexp import MongoTrials import os fit_params=eval(open('fit_parameters.txt').read()) fit_params['root']=os.getcwd() directory=init_directory(fit_params) if fit_params['optimization']=='hyperopt': space=search_space(fit_params) trials=Trials() best=fmin(run,space=space,algo=tpe.suggest,max_evals=fit_params['max_evals'],trials=trials) plot_results(trials.trials) #https://github.com/hyperopt/hyperopt/wiki/Parallelizing-Evaluations-During-Search-via-MongoDB ''' commands for MongoDB mongod --dbpath . --port 1234 export PYTHONPATH=$PYTHONPATH:/home/pduggins/influence_susceptibility_conformity hyperopt-mongo-worker --mongo=localhost:1234/foo_db --poll-interval=0.1 ''' if fit_params['optimization']=='mongodb': space=search_space(fit_params) space['directory']=directory trials=MongoTrials('mongo://localhost:1234/foo_db/jobs', exp_key='exp4') best=fmin(run,space=space,algo=tpe.suggest,max_evals=fit_params['max_evals'],trials=trials) plot_results(trials.trials) if fit_params['optimization']=='evolve': from pathos.multiprocessing import ProcessingPool as Pool from pathos.helpers import freeze_support #for Windows import numpy as np import pandas as pd # freeze_support() evo_pop=init_evo_pop(fit_params) pool = Pool(nodes=fit_params['threads']) for g in range(fit_params['generations']): exp_params=[value['P'] for value in evo_pop.itervalues()] fitness_list=pool.map(run, exp_params) # new_gen_list=tournament_selection(fitness_list,fit_params) new_gen_list=rank_proportional_selection(fitness_list) remade_pop=remake(evo_pop,new_gen_list) mutated_pop=mutate(remade_pop,evo_pop,fit_params) evo_pop=mutated_pop # crossed_pop=crossover(mutated_pop) # evo_pop=crossed_pop mean_F=np.average([evo_pop[ind]['F'] for ind in evo_pop.iterkeys()]) std_F=np.std([evo_pop[ind]['F'] for ind in evo_pop.iterkeys()]) print '\nGeneration %s: mean_F=%s, std F=%s' %(g+1,mean_F,std_F) out_pop=pd.DataFrame([evo_pop]) out_pop.reset_index().to_json('evo_pop.json',orient='records')
def apply_warps(warps, frames, njobs=4):
    """
    returns result of applying warps for given frames (one warp per frame)
    """
    if njobs > 1:
        pool = ProcessingPool(nodes=njobs)
        out = np.array(pool.map(parametric_warp, frames, warps))
    else:
        out = np.array([parametric_warp(f, w) for f, w in itt.izip(frames, warps)])
    if isinstance(frames, fseq.FrameSequence):
        out = fseq.open_seq(out)
        out.meta = frames.meta
    return out
def multi_ray_sim(self, sources, procs=8): self.minener = 1e-10 # minimum energy threshold self.itmax = 1000 # stop iteration after this many ray bundles were generated (i.e. # after the original rays intersected some surface this many times). # The multiprocessing raytracing method to call from the original engine. if len(sources) != procs: raise Exception('Number of sources and processors do not agree') # Creates a pool of processes and makes them raytrace one different source each. The resm list returned is a list of copies of the original engine post raytrace. pool = Pool(processes=procs) resm = pool.map(self.trace, sources) # New tree container and length envaluation to redimension it. tree_len = N.zeros(len(resm), dtype=N.int) trees = [] for eng in xrange(len(resm)): # Get and regroup results in one tree and assembly only: S = resm[eng]._asm.get_surfaces() tree_len[eng] = len(resm[eng].tree._bunds) trees.append(resm[eng].tree) # Next loop is to get the optics callable objects and copy regroup their values without asumptions about what they are. for s in xrange(len(S)): part_res = S[s]._opt.__dict__ keys = S[s]._opt.__dict__.keys() for k in xrange(len(keys)): if (keys[k] == '_opt') or (keys[k] == '_abs'): continue if len(self._asm.get_surfaces()[s]._opt.__dict__[keys[k]]) < 1: self._asm.get_surfaces()[s]._opt.__dict__[keys[k]] = part_res[keys[k]] elif len(part_res[keys[k]]) < 1: continue else: self._asm.get_surfaces()[s]._opt.__dict__[keys[k]][0] = N.append(self._asm.get_surfaces()[s]._opt.__dict__[keys[k]][0], part_res[keys[k]][0], axis=1) # Regroup trees: self.tree = RayTree() # Create a new tree for all for t in xrange(N.amax(tree_len)): # Browse through general tree levels up to the maximum length that has been raytraced for eng in xrange(len(resm)): # Browse through bundles of each parallel engine. if t<(tree_len[eng]): # to not go over the length of the present parallel tree. if t==len(self.tree._bunds): # if the index is greater than the actual length of the general tree, add a new bundle to the general tree with the present parallel bundle to initialise it. bundt = trees[eng]._bunds[t] else: if t>0: # adapt parents indexing prior to concatenation trees[eng]._bunds[t].set_parents(trees[eng]._bunds[t].get_parents()+len(self.tree._bunds[t].get_parents())) bundt = concatenate_rays([bundt, trees[eng]._bunds[t]]) self.tree.append(bundt) trees = 0
def launch_simulation_parallel(simulation_config,
                               max_iterations,
                               parallel_blocks=gtconfig.parallel_blocks,
                               show_progress=True):
    """
    Parallel version of the simulation launch, to maximize CPU utilization.

    :param catalog_size: Number of defects present on the system.
    :param priority_generator: Generator for the priority of the defects.
    :param team_capacity:
    :param reporters_config:
    :param resolution_time_gen:
    :param max_iterations:
    :param max_time:
    :param dev_team_bandwidth:
    :param gatekeeper_config:
    :param inflation_factor:
    :param quota_system:
    :param parallel_blocks:
    :return:
    """
    pool = Pool(processes=parallel_blocks)
    samples_per_worker = max_iterations / parallel_blocks

    logger.info("Launching " + str(max_iterations) + " replications IN PARALLEL. Using " +
                str(parallel_blocks) + " workers with " + str(samples_per_worker) +
                " samples each.")

    worker_inputs = []
    for block_id in range(parallel_blocks):
        worker_input = {'simulation_config': simulation_config,
                        'max_iterations': samples_per_worker,
                        'block_id': block_id,
                        'show_progress': False}
        worker_inputs.append(worker_input)

    # Showing progress bar of first batch
    worker_inputs[0]['show_progress'] = show_progress

    worker_outputs = pool.map(launch_simulation_wrapper, worker_inputs)

    logger.info(str(max_iterations) + " replications finished. Starting output consolidation.")

    simulation_metrics = SimulationMetrics()
    for output in worker_outputs:
        simulation_metrics.append_results(output)

    return simulation_metrics
def preprocess_docs(self, docs):
    """
    Preprocess string or list of strings
    """
    if isinstance(docs, string_types):
        docs = [docs]

    if self.stemming is True:
        if not self.parallel:
            logger.info('preprocess %i documents without multiprocessing' % len(docs))
            docs_preprocess = list(map(self.preprocess, docs))
        else:
            if sys.version_info[0] == 3:
                from multiprocessing import Pool
                pool = Pool()
                n_processes = pool._processes
            else:
                logger.info('use pathos for multiprocessing')
                from pathos.multiprocessing import ProcessingPool
                pool = ProcessingPool()
                n_processes = pool.nodes
            logger.info('preprocess %i documents with %i workers' %
                        (len(docs), n_processes))
            docs_preprocess = pool.map(self.preprocess, docs)
    else:
        logger.info('no preprocess function applied')
        docs_preprocess = docs
    return docs_preprocess
def __init__(self, dynamics):
    # dynamics
    self.dynamics = dynamics

    # parallel
    self.pool = Pool(8)
def sample_function(function, value_range=(-1, 1), resolution=(1000, 1000),
                    grid=True, parallel=True, **params):
    """
    Sample a function over an xy plane with the given value range and resolution.
    Function is called with ((x,y), **params)
    Returns an array of shape (resolution_x, resolution_y, *function_shape),
    e.g. (1000,1000,3) if f(p)=[a,b,c]
    e.g. (1000,1000,3,3) if f(p).shape=(3,3)
    """
    # TODO make over any number of dimensions?
    xy = xy_plane(value_range, resolution, grid=grid)
    if parallel:
        # Flatten into array of 2d points [(x,y), ...]
        points = xy.reshape(-1, xy.shape[-1])
        with ProcessingPool() as pool:
            values = pool.map(lambda p: function(p, **params), points)
        sampled = np.resize(values, xy.shape[:-1])  # TODO Doesn't work for non-scalar functions
    else:
        sampled = np.apply_along_axis(lambda p: function(p, **params), 2, xy)
    # returns shape: (resolution_x, resolution_y, *function_shape)
    return sampled
def main(argv):
    logging.info('Building assignee features')
    config = configparser.ConfigParser()
    config.read([
        'config/database_config.ini', 'config/database_tables.ini',
        'config/inventor/build_assignee_features_sql.ini'
    ])

    # create output folder if it doesn't exist
    logging.info(
        'writing results to folder: %s',
        os.path.dirname(config['INVENTOR_BUILD_ASSIGNEE_FEAT']['feature_out']))
    os.makedirs(os.path.dirname(
        config['INVENTOR_BUILD_ASSIGNEE_FEAT']['feature_out']),
        exist_ok=True)

    feats = [n for n in ProcessingPool().imap(run, ['granted', 'pregranted'])]

    with open(config['INVENTOR_BUILD_ASSIGNEE_FEAT']['base_assignee_features'],
              'rb') as fin:
        features = pickle.load(fin)

    for i in range(0, len(feats)):
        features.update(feats[i])

    with open(
            config['INVENTOR_BUILD_ASSIGNEE_FEAT']['feature_out'] +
            '.%s.pkl' % 'both', 'wb') as fout:
        pickle.dump(features, fout)
def __init__(self, n_features, n_nodes, embedding_dim, rnn_dim,
             bidirectional=True, sinkhorn_iters=5, sinkhorn_tau=1,
             num_workers=4, cuda=True):
    super(SPGSequentialActor, self).__init__()
    self.use_cuda = cuda
    self.n_nodes = n_nodes
    self.embedding_dim = embedding_dim
    self.rnn_dim = rnn_dim
    self.num_workers = num_workers
    self.embedding = nn.Linear(n_features, embedding_dim)
    self.gru = nn.GRU(embedding_dim, rnn_dim, bidirectional=bidirectional)
    scale = 2 if bidirectional else 1
    self.fc2 = nn.Linear(scale * self.rnn_dim, n_nodes)
    self.sinkhorn = Sinkhorn(n_nodes, sinkhorn_iters, sinkhorn_tau)
    self.round = linear_assignment
    init_hx = torch.zeros(scale, self.rnn_dim)
    if cuda:
        init_hx = init_hx.cuda()
    self.init_hx = Variable(init_hx, requires_grad=False)
    if num_workers > 0:
        self.pool = Pool(num_workers)
def get_signal_bg_many_parallel(runList, detid, **kwargs):
    """
    Return the averaged signal and background (based on blank frames)
    over the given runs
    """
    def mapfunc(run_number):
        return get_signal_bg_one_run(run_number, detid, **kwargs)

    MAXNODES = 14
    pool = ProcessingPool(nodes=min(MAXNODES, len(runList)))
    bg = np.zeros(DIMENSIONS_DICT[detid])
    signal = np.zeros(DIMENSIONS_DICT[detid])
    run_data = pool.map(mapfunc, runList)
    for signal_increment, bg_increment in run_data:
        signal += (signal_increment / len(runList))
        bg += (bg_increment / len(runList))
    return signal, bg
def __init__(self, n_features, n_nodes, embedding_dim, rnn_dim,
             sinkhorn_iters=5, sinkhorn_tau=1., num_workers=4, cuda=True):
    super(SPGMatchingActor, self).__init__()
    self.use_cuda = cuda
    self.n_nodes = n_nodes
    self.rnn_dim = rnn_dim
    self.num_workers = num_workers
    self.embedding = nn.Linear(n_features, embedding_dim)
    self.gru = nn.GRU(n_nodes, rnn_dim)
    self.fc1 = nn.Linear(self.rnn_dim, n_nodes)
    self.sinkhorn = Sinkhorn(n_nodes, sinkhorn_iters, sinkhorn_tau)
    self.round = linear_assignment
    init_hx = torch.zeros(1, self.rnn_dim)
    if cuda:
        init_hx = init_hx.cuda()
    self.init_hx = Variable(init_hx, requires_grad=False)
    if num_workers > 0:
        self.pool = Pool(num_workers)
def __init__(self):
    """
    Initializes DataProcessing class with utilities and parallel processing
    """
    self.res = Res()
    self.pool = Pool()
def closure(rolling_groupby, func, *args, **kwargs):
    groups = list(rolling_groupby._groupby.groups.items())
    chunks = chunk(len(groups), nb_workers)
    object_id = plasma_client.put(rolling_groupby.obj)
    groups_id = plasma_client.put(groups)

    attribute2value = {
        attribute: getattr(rolling_groupby, attribute)
        for attribute in rolling_groupby._attributes
    }

    worker_args = [
        (
            plasma_store_name,
            object_id,
            groups_id,
            attribute2value,
            chunk,
            func,
            args,
            kwargs,
        )
        for chunk in chunks
    ]

    with ProcessingPool(nb_workers) as pool:
        result_workers = pool.map(RollingGroupby.worker, worker_args)

    result = pd.concat(
        [plasma_client.get(result_worker) for result_worker in result_workers],
        copy=False,
    )

    return result
def climByAveragingPeriods(urls, # list of (daily) granule URLs for a long time period (e.g. a year) nEpochs, # compute a climatology for every N epochs (days) by 'averaging' nWindow, # number of epochs in window needed for averaging variable, # name of primary variable in file mask, # name of mask variable coordinates, # names of coordinate arrays to read and pass on (e.g. 'lat' and 'lon') maskFn=qcMask, # mask function to compute mask from mask variable averager='pixelAverage', # averaging function to use, one of ['pixelAverage', 'gaussInterp'] mode='sequential', # Map across time periods of N-days for concurrent work, executed by: # 'sequential' map, 'multicore' using pool.map(), 'cluster' using pathos pool.map(), # or 'spark' using PySpark numNodes=1, # number of cluster nodes to use nWorkers=4, # number of parallel workers per node averagingFunctions=AveragingFunctions, # dict of possible averaging functions legalModes=ExecutionModes # list of possiblel execution modes ): '''Compute a climatology every N days by applying a mask and averaging function. Writes the averaged variable grid, attributes of the primary variable, and the coordinate arrays in a dictionary. ***Assumption: This routine assumes that the N grids will fit in memory.*** ''' try: averageFn = averagingFunctions[averager] except : averageFn = average print >>sys.stderr, 'climatology: Error, Averaging function must be one of: %s' % str(averagingFunctions) urlSplits = [s for s in fixedSplit(urls, nEpochs)] if VERBOSE: print >>sys.stderr, urlSplits def climsContoured(urls): n = len(urls) var = climByAveraging(urls, variable, mask, coordinates, maskFn, averageFn) return contourMap(var, variable, coordinates, n, urls[0]) if mode == 'sequential': plots = map(climsContoured, urlSplits) elif mode == 'multicore': pool = Pool(nWorkers) plots = pool.map(climsContoured, urlSplits) elif mode == 'cluster': pass elif mode == 'spark': pass plots = map(climsContoured, urlSplits) print plots return plots
def calculate_bleu(sess, trainable_model, data_loader): # bleu score implementation # used for performance evaluation for pre-training & adv. training # separate true dataset to the valid set # conditionally generate samples from the start token of the valid set # measure similarity with nltk corpus BLEU smoother = SmoothingFunction() data_loader.reset_pointer() bleu_avg = 0 references = [] hypotheses = [] for it in xrange(data_loader.num_batch): batch = data_loader.next_batch() # predict from the batch # TODO: which start tokens? # start_tokens = batch[:, 0] start_tokens = np.array([START_TOKEN] * BATCH_SIZE, dtype=np.int64) prediction = trainable_model.predict(sess, batch, start_tokens) # argmax to convert to vocab #prediction = np.argmax(prediction, axis=2) # cast batch and prediction to 2d list of strings batch_list = batch.astype(np.str).tolist() pred_list = prediction.astype(np.str).tolist() references.extend(batch_list) hypotheses.extend(pred_list) bleu = 0. # calculate bleu for each predicted seq # compare each predicted seq with the entire references # this is slow, use multiprocess def calc_sentence_bleu(hypothesis): return sentence_bleu(references, hypothesis, smoothing_function=smoother.method4) if __name__ == '__main__': p = Pool() result = (p.map(calc_sentence_bleu, hypotheses)) bleu = np.mean(result) return bleu
def makePower(): global c pMin, pMax = d["power"]["pMin"], d["power"]["pMax"] pPath = np.linspace(pMin, pMax, frameCount) pool = Pool(4) # Get interesting c while True: subIm = JuliaTools.subImage(c=c, n=10, iters=iters / 2, r=r, p=pMin, split=split, save=False, aura=False) isBlackList = pool.map(subIm, coords) if not all(isBlackList): break else: c *= 0.975 for frame in xrange(frameCount): subIm = JuliaTools.subImage(c=c, r=r, n=n, p=pPath[frame], iters=iters / 2, split=split) isBlackList = pool.map(subIm, coords) allBlack = all(isBlackList) if not allBlack: JuliaTools.makeFrame(frame, n, split, coords) pool.close() JuliaTools.prepareForFFmpeg(frameCount=frameCount, loop=True) with open("tweet.txt", "w") as out: out.write("woooooooooooooooooooo") stop = timeit.default_timer() print stop - start
def run(non_iter_args, do_multiprocessing):
    [
        weightcalcdata,
        weightcalculator,
        box,
        startindex,
        size,
        newconnectionmatrix,
        method,
        boxindex,
        filename,
        headerline,
        writeoutput,
    ] = non_iter_args

    partial_gaincalc_oneset = partial(
        calc_weights_oneset,
        weightcalcdata,
        weightcalculator,
        box,
        startindex,
        size,
        newconnectionmatrix,
        method,
        boxindex,
        filename,
        headerline,
        writeoutput,
    )

    if do_multiprocessing:
        pool = Pool(processes=pathos.multiprocessing.cpu_count())
        pool.map(partial_gaincalc_oneset, weightcalcdata.causevarindexes)

        # Current solution to no close and join methods on ProcessingPool
        # https://github.com/uqfoundation/pathos/issues/46
        s = pathos.multiprocessing.__STATE["pool"]
        s.close()
        s.join()
        pathos.multiprocessing.__STATE["pool"] = None
    else:
        for causevarindex in weightcalcdata.causevarindexes:
            partial_gaincalc_oneset(causevarindex)

    return None
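# Illustrative alternative (an assumption about more recent pathos releases,
# not the original author's code): ProcessingPool objects expose
# close()/join()/clear() directly, which usually replaces the module-level
# __STATE workaround above. The toy squaring job and the 2-node pool are
# placeholders.
from pathos.multiprocessing import ProcessingPool

def demo_pool_cleanup():
    pool = ProcessingPool(nodes=2)
    out = pool.map(lambda x: x ** 2, range(8))
    pool.close()   # stop accepting new work
    pool.join()    # wait for the workers to finish
    pool.clear()   # drop the cached pool so a fresh one can be created later
    return out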
def test_multiprocess():
    x_list = [1, 2, 3, 4, 5, 6, 7]
    y_list = ['1', '2', '3', '4', '5', '6', '7']
    epoch = 8
    pool = Pool(epoch)
    res = pool.amap(test_task, x_list, y_list)
    pool.pipe(test_task, '22', '222')
    pool.close()
    pool.join()
def _process_set_reads_library(self, input_object_info, genome_index_base, result_directory, cli_option_params): """ _process_set_reads_library: process set reads library """ reads_refs = self.fetch_reads_refs_from_sampleset( input_object_info['ref'], input_object_info['info']) set_object_name = input_object_info['info'][1] alignment_set_name = set_object_name + cli_option_params[ 'alignment_set_suffix'] arg_1 = [] arg_2 = [genome_index_base] * len(reads_refs) arg_3 = [result_directory] * len(reads_refs) arg_4 = [] conditions = [] for reads_ref in reads_refs: reads_input_object_info = self._get_input_object_info( reads_ref['ref']) option_params = cli_option_params.copy() option_params['reads_condition'] = reads_ref['condition'] conditions.append(reads_ref['condition']) arg_1.append(reads_input_object_info) arg_4.append(option_params) cpus = min(cli_option_params.get('num_threads'), multiprocessing.cpu_count()) pool = Pool(ncpus=cpus) log('running _process_alignment_object with {} cpus'.format(cpus)) reads_alignment_object_refs = pool.map( self._process_single_reads_library, arg_1, arg_2, arg_3, arg_4) for reads_alignment_object_ref in reads_alignment_object_refs: if reads_alignment_object_ref.startswith('ERROR'): error_msg = 'Caught exception in worker\n' error_msg += '{}'.format(reads_alignment_object_ref) raise ValueError(error_msg) workspace_name = cli_option_params['workspace_name'] reads_alignment_set_object_ref = self._save_alignment_set( reads_alignment_object_refs, workspace_name, alignment_set_name, conditions) return reads_alignment_set_object_ref
def pad_pdf(path, ratio, output_path=None):
    """Pad PDF with a <ratio>% white margin increase on the right.

    Takes a path to the original PDF file, converts them to PIL images,
    and pads them with the appropriate whitespace. Returns a path to the
    padded PDF.

    If a valid output_path is given, it will move the PDF to the given path
    and return the path.
    """
    images = pdf2image.convert_from_path(path)
    p = Pool(4)

    def overlay_and_store(img):
        """Pad the individual images by overlaying it on a white background.

        Passed to a multiprocessing pool as each individual PDF page is
        independent of each other. Saves the image in a temp path as a JPEG,
        and returns the absolute file path.
        """
        w, h = img.size
        padded_img = Image.new("RGB", (int(w * (1.0 + ratio)), h), "white")
        padded_img.paste(img, (0, 0))
        tmp_path = _generate_tmp_path(ext='.jpeg')
        padded_img.save(tmp_path, "JPEG")
        return tmp_path

    padded_images = p.map(overlay_and_store, images)

    # Output as PDF.
    output = _generate_tmp_path(ext='.pdf')
    with open(output, 'wb') as f:
        f.write(img2pdf.convert(padded_images))

    # Clean up temp image files used.
    for tmp_img in padded_images:
        os.remove(tmp_img)

    if output_path:
        os.rename(output, output_path)
        return output_path
    return output
def main(args):
    """Main function for calculating BD shift.

    Parameters
    ----------
    args : dict
        See ``BD_shift`` subcommand
    """
    sys.stderr.write('Loading KDE objects...\n')
    kde1 = Utils.load_kde(args['<kde1>'])
    kde2 = Utils.load_kde(args['<kde2>'])

    # adding top-level library ID if not present
    kde1 = kde_add_lib(kde1)
    kde2 = kde_add_lib(kde2)

    sys.stderr.write('Calculating BD shifts...\n')
    print '\t'.join(['lib1', 'lib2', 'taxon', 'BD_shift'])
    for libID1, d1 in kde1.items():
        for libID2, d2 in kde2.items():
            msg = ' Comparing libraries: "{}", "{}"\n'
            sys.stderr.write(msg.format(libID1, libID2))

            # overlap of taxa btw libraries
            taxa = taxon_overlap(d1, d2)

            # calculating BD shift (in parallel)
            pfunc = partial(kde_intersect,
                            start=float(args['--start']),
                            end=float(args['--end']),
                            step=float(args['--step']))
            pool = ProcessingPool(nodes=int(args['--np']))
            if args['--debug']:
                res = map(pfunc, [(taxon, d1[taxon], d2[taxon])
                                  for taxon in taxa])
            else:
                res = pool.amap(pfunc, [(taxon, d1[taxon], d2[taxon])
                                        for taxon in taxa])
                while not res.ready():
                    time.sleep(2)
                res = res.get()

            # writing out table
            for line in res:
                print '\t'.join([libID1, libID2] + \
                                [str(x) for x in line])
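# Minimal sketch of the amap pattern used above: submit asynchronously, poll
# ready(), then collect with get(). The `slow_double` function, the 2-node
# pool, and the 0.5 s poll interval are illustrative assumptions.
import time
from pathos.multiprocessing import ProcessingPool

def slow_double(x):
    time.sleep(0.1)
    return 2 * x

def demo_amap():
    pool = ProcessingPool(nodes=2)
    res = pool.amap(slow_double, range(10))   # returns an async result immediately
    while not res.ready():                    # poll instead of blocking on get()
        time.sleep(0.5)
    return res.get()                          # [0, 2, 4, ..., 18]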
def __init__( self, func, bounds, niter=500, population=10, ftol=0.001, workers=-1, restart=False, vec_dump=10, seed=None, aggressive_parasite=False ): """ Initialise a symbiotic organisms search instance Args: func (callable): Function to be minimised. f(x, *args) - x is the argument to be minimised, args is a tuple of any additional fixed parameters to specify the function bounds (list(Double)): list of pairs of (min,max) bounds for x niter (Int): number of iterations for optimiser population (Int): number of members in population ftol (Double) : convergence criteria for function workers (Int): number of multiprocessing workers to use. -1 sets workers to mp.cpu_count() vec_dump (Int): outputs restart file vec_dump number of steps restart (Bool): restart the run from a restart file seed (Int): seed for random number generator, useful for tests """ self.function = func self.niter = niter self.population = population self.particles = [] self.best_global_vec = None self.best_global_fit = math.inf self.ftol = ftol self.bounds = np.asarray(bounds) self.restart = restart self.vector_restart = VectorInOut(bounds, "sos.rst") self.vec_dump = vec_dump self.seed = seed self.aggressive_parasite = aggressive_parasite if workers == -1: self.pool = Pool(mp.cpu_count()) else: self.pool = Pool(workers)
def __init__(self, enable_compression=True, enable_s3=True, file_path=None,
             num_workers=30):
    """Initialise the S3 array IO interface.

    :param bool enable_s3: Flag to store objects in s3 or disk.
        True: store in S3
        False: store on disk (for testing purposes)
    :param str file_path: The root directory for the emulated s3 buckets
        when enable_s3 is set to False.
    :param int num_workers: The number of workers for parallel IO.
    """
    self.s3io = S3IO(enable_s3, file_path, num_workers)
    self.pool = ProcessingPool(num_workers)
    self.enable_compression = enable_compression
def fit(self, dataset):
    """
    Runs dataset through the designated pipeline, extracts features,
    and fits a conditional random field.

    :param training_data_loader: Instance of Dataset.
    :return model: a trained instance of a sklearn_crfsuite.CRF model.
    """
    if not isinstance(dataset, Dataset):
        raise TypeError(
            "Must pass in an instance of Dataset containing your training files"
        )
    if not isinstance(self.pipeline, BasePipeline):
        raise TypeError(
            "Model object must contain a medacy pipeline to pre-process data"
        )

    pool = Pool(nodes=self.n_jobs)

    results = [
        pool.apipe(self._extract_features, data_file, self.pipeline,
                   dataset.is_metamapped())
        for data_file in dataset.get_data_files()
    ]

    while any([i.ready() is False for i in results]):
        time.sleep(1)

    for idx, i in enumerate(results):
        X, y = i.get()
        self.X_data += X
        self.y_data += y

    logging.info("Currently Waiting")

    learner_name, learner = self.pipeline.get_learner()
    logging.info("Training: %s", learner_name)

    assert self.X_data, "Training data is empty."

    train_data = [x[0] for x in self.X_data]
    learner.fit(train_data, self.y_data)
    logging.info("Successfully Trained: %s", learner_name)

    self.model = learner
    return self.model
def delete_resources(self, id_list):
    """
    Batch Deletes stale resources from IBM topology service via DELETE
    Expects list of _ids that need to be deleted
    """
    def delete_single_resource(resource_id):
        """
        Multiprocess Worker Method to delete single resource from
        IBM topology service
        """
        r = requests.delete(self.api_prefix + 'resources/' + resource_id,
                            headers=self.headers, verify=False)

    sys.stdout.write('Deleting ' + str(len(id_list)) +
                     ' resources from IBM Topology Service\n')
    pool = Pool(NUM_PROC)
    pool.map(delete_single_resource, id_list)
def __init__(self, conf, bodies, frame=None, n_procs=1):
    self.conf = conf
    self.bodies = bodies
    self.n = len(bodies)
    self.frame = frame
    self.n_procs = n_procs
    self.pool = Pool(self.n_procs)
def run_service3(service, iterable, iterable_arguments, iterable_argument_names, worker_count, log_function=print): start = timer() args = list(iterable_arguments.keys()) args.extend(iterable_argument_names) if log_function is not None: log_function("[run_service] running service {} with {} workers".format( service, worker_count)) # add everything to work queue all_args = [] for x in iterable: if type(x) is not tuple: x = [x] args = dict(dict(zip(iterable_argument_names, x)), **iterable_arguments) all_args.append(args) pool = Pool(worker_count) results = pool.amap(service, all_args) final_results = results.get() # if example service model is used, metrics can be gathered in this way messages = [] total = len(final_results) failure = 0 for error, mem_usage in final_results: if error is not False: failure += 1 if type(error) is str: messages.append(error) # if we should be logging and if there is material to be logged if log_function is not None and (total + failure + len(messages)) > 0: log_function( "[run_service] Summary {}:\n[run_service]\tTime: {}s\n[run_service]\tTotal: {}\n[run_service]\tFailure: {}" .format(service, int(timer() - start), total, failure)) log_function("[run_service]\tMessages:\n[run_service]\t\t{}".format( "\n[run_service]\t\t".join(messages))) # return relevant info return total, failure, messages
def __enter__(self):
    """Runs the RPKI Validator"""
    utils.kill_port(self.port)
    # Must remove these to ensure a clean run
    utils.clean_paths(self.rpki_db_paths)
    cmds = [f"cd {self.rpki_package_path}",
            f"chown -R root:root {self.rpki_package_path}"]
    utils.run_cmds(cmds)
    # Writes validator file and serves it
    # Can't use context manager here since it returns it
    self._rpki_file = RPKI_File(self._table_input)
    self._rpki_file.spawn_process()
    self._process = ProcessingPool()
    self._process.apipe(self._start_validator)
    self.total_prefix_origin_pairs = self._rpki_file.total_lines
    return self
def run(nodes=1, filename="config.xml"):
    """
    Dispatch Wrapper to run xml by dispatching each login to a multiprocess pool

    :param nodes: sys argument 1
    :param filename: (determined) by sys argument 2
    :return: array of status of pool
    """
    file_xml = ""
    with open(filename, "r") as fd:
        lines = fd.readlines()
        for line in lines:
            file_xml += line
    xml = " ".join(file_xml.rsplit())
    pool = ProcessingPool(nodes)
    run_list = [[xml, value] for value in PATTERN_SEL.findall(xml)]
    return pool.map(helper, run_list)
def produce_classic(self, processes=1):
    self.create_output_file()
    self.produced = True
    if processes == 1:
        for i in range(len(self.root_objects)):
            self.create_result(i)
    else:
        from pathos.multiprocessing import ProcessingPool as Pool
        pool = Pool(processes=processes)
        self.root_objects = pool.map(self.create_result,
                                     range(len(self.root_objects)))
    for h in self.root_objects:  # write sequentially to prevent race conditions
        h.save(self.output_tree)
    logger.debug("Produced root objects %s",
                 [h.get_name() for h in self.root_objects])
    return self
def get_stats(self):
    """Get stats for all genomes. Concat the results into a DataFrame"""
    # pool.map needs an arg for each function that will be run
    dmx_mean = [self.dmx.mean()] * len(self.genome_paths)
    with ProcessingPool() as pool:
        results = pool.map(genome.mp_stats, self.genome_paths, dmx_mean)
    self.stats = pd.concat(results)
    self.stats.to_csv(self.stats_path)
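# Hedged sketch (not from the original project): pathos' map accepts multiple
# equal-length iterables, passing one element of each to the function per call.
# The `scaled_sum` function and the example inputs are illustrative assumptions.
from pathos.multiprocessing import ProcessingPool

def scaled_sum(values, weight):
    return weight * sum(values)

def demo_multi_iterable_map():
    groups = [[1, 2, 3], [4, 5], [6]]
    weights = [0.1, 0.2, 0.3]
    with ProcessingPool() as pool:
        return pool.map(scaled_sum, groups, weights)   # [0.6, 1.8, 1.8]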
def __init__(self, n_obj=2, aggregation='WS', n_point=5, n_job=1, *argv, **kwargs): """ Arguments --------- n_point : int, the number of evaluated points in each iteration aggregation: str or callable, the scalarization method/function. Supported options are: 'WS' : weighted sum 'Tchebycheff' : Tchebycheff scalarization """ super(MOBO_D, self).__init__(*argv, **kwargs) self.n_point = int(n_point) # TODO: perhaps leave this an input parameter self.mu = 2 * self.n_point # the number of generated points self.n_obj = int(n_obj) assert self.n_obj > 1 if isinstance(self.minimize, bool): self.minimize = [self.minimize] * self.n_obj elif hasattr(self.minimize, '__iter__'): assert len(self.minimize) == self.n_obj self.minimize = np.asarray(self.minimize) if hasattr(self.obj_func, '__iter__'): assert self.n_obj == len(self.obj_func) assert self.n_obj == len(self.surrogate) self.n_job = min(MOBO_D.__max_procs__, self.mu, n_job) # TODO: implement the Tchebycheff approach if isinstance(aggregation, str): assert aggregation in ['WS', 'Tchebycheff'] else: assert hasattr(aggregation, '__call__') self.aggregation = aggregation # generate weights self.weights = np.random.rand(self.mu, self.n_obj) self.weights /= np.sum(self.weights, axis=1).reshape(self.mu, 1) self.labels_ = KMeans(n_clusters=self.n_point).fit(self.weights).labels_ self.frange = np.zeros(self.n_obj) if self.n_job > 1: self.p = ProcessingPool(ncpus=self.n_job)
def run(self):
    files = os.listdir(self.folder)
    outfile_bed = self.outfile.replace('.txt', '.bed')
    output_file = open(self.outfile, 'w')
    output_file.write('circle_id\ttranscript_id\tskipped_exon\tintron\tread_names\tsplice_reads\texon_reads\n')
    output_file.close()
    output_file = open(outfile_bed, 'w')
    output_file.write('# bed12 format\n')
    output_file.close()
    from pathos.multiprocessing import ProcessingPool as Pool
    p = Pool(self.cpus)
    p.map(self.run_parallel, files)
def main(argv):
    logging.info('Building coinventor features')
    feats = [n for n in ProcessingPool().imap(run, ['granted', 'pregranted'])]
    features = feats[0]
    for i in range(1, len(feats)):
        features.update(feats[i])
    with open(FLAGS.feature_out + '.%s.pkl' % 'both', 'wb') as fout:
        pickle.dump(features, fout)
def collect_significances(self): with open(self.filename, 'w') as f: f.write( "Higgsino mass,Bino mass,Discovery Significance,Exclusion Limit\n" ) def get_disc_sig(signal, classifier, bdt_cut): try: table = BDTCutFlowTable(signal, classifier, bdt_cut) calc = table.initialize_significance_calculator() sig = calc.calculate_discovery_significance('bdt') return sig except: pass def get_excl_lim(signal, classifier, bdt_cut): try: table = BDTCutFlowTable(signal, classifier, bdt_cut) calc = table.initialize_significance_calculator() lim = calc.calculate_exclusion_limit('bdt') return lim except: pass mySignals = self.signals pbar = tqdm(total=len(mySignals) / 8) def write_sigs(signal): try: classifier = Classifier(signal.mass_combination_tuple) discs = map(lambda x: get_disc_sig(signal, classifier, x), np.arange(-10, 10, 0.1)) excls = map(lambda x: get_excl_lim(signal, classifier, x), np.arange(-10, 10, 0.1)) with open(self.filename, 'a') as f: f.write("{},{},{},{}\n".format(signal.higgsino_mass, signal.bino_mass, max(discs), max(excls))) pbar.update(1) except: pass p = Pool(8) p.map(write_sigs, mySignals)
def _calculate_s_powder_over_atoms_core(self, q_indx=None):
    """
    Helper function for _calculate_s_powder_1d.
    :returns: Python dictionary with S data
    """
    atoms_items = {}
    atoms = range(self._num_atoms)
    self._prepare_data(k_point=q_indx)

    if PATHOS_FOUND:
        p_local = ProcessingPool(nodes=AbinsModules.AbinsParameters.threads)
        result = p_local.map(self._calculate_s_powder_one_atom, atoms)
    else:
        result = [self._calculate_s_powder_one_atom(atom=atom) for atom in atoms]

    for atom in range(self._num_atoms):
        atoms_items["atom_%s" % atom] = {"s": result[atoms.index(atom)]}
        self._report_progress(msg="S for atom %s" % atom + " has been calculated.")
    return atoms_items
def _exec_sample(X):
    from pathos.multiprocessing import ProcessingPool
    try:
        p = ProcessingPool(n_cpus)
        X = np.array(X)
        x = np.array_split(X, n_cpus)
        pipe = []
        for i in range(n_cpus):
            pipe.append(p.apipe(func, x[i]))

        rs = []
        for i in range(n_cpus):
            rs.append(pipe[i].get())

        rs = [item for sublist in rs for item in sublist]
        return ot.NumericalSample(rs)
    except ValueError:
        # Get here if the chunk split left some single evaluations over
        return func(X)
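# Hedged sketch of the apipe (asynchronous single call) pattern used above,
# with a placeholder `chunk_sum` worker and 2 processes as assumptions.
import numpy as np
from pathos.multiprocessing import ProcessingPool

def chunk_sum(chunk):
    # placeholder per-chunk work: sum each row of the chunk
    return [float(np.sum(row)) for row in chunk]

def demo_apipe(X, n_cpus=2):
    pool = ProcessingPool(n_cpus)
    chunks = np.array_split(np.asarray(X), n_cpus)
    handles = [pool.apipe(chunk_sum, c) for c in chunks]   # submit asynchronously
    results = [h.get() for h in handles]                   # block on each handle
    return [item for sub in results for item in sub]       # flatten, order preserved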
def transpose_index(self):  # WORKS ONLY FOR TEST DATA
    """Transpose the data according to the index."""
    data = self.data
    indexes = list(set(data.index))
    names, datasets = [], []
    for name in indexes:
        names.append(name)
        datasets.append(data[[name in i for i in data.index]])
    plotSets = zip(names, datasets)
    pool = ProcessingPool()
    plots = []
    for name, dataset in plotSets:
        plots.append(pool.map(self.create_transposed_plot, [name], [dataset]))
    logging.debug('Index transposed')
    return plots
def _featurize_compounds(self, df, featurizer, parallel=True,
                         worker_pool=None):
    """Featurize individual compounds.

    Given a featurizer that operates on individual chemical compounds
    or macromolecules, compute & add features for that compound to the
    features dataframe
    """
    sample_smiles = df["smiles"].tolist()

    if worker_pool is None:
        features = []
        for ind, smiles in enumerate(sample_smiles):
            if ind % self.log_every_n == 0:
                log("Featurizing sample %d" % ind, self.verbose)
            mol = Chem.MolFromSmiles(smiles)
            features.append(featurizer.featurize([mol]))
    else:
        def featurize_wrapper(smiles, dilled_featurizer):
            print("Featurizing %s" % smiles)
            mol = Chem.MolFromSmiles(smiles)
            featurizer = dill.loads(dilled_featurizer)
            feature = featurizer.featurize([mol])
            return feature

        if worker_pool is None:
            dilled_featurizer = dill.dumps(featurizer)
            worker_pool = ProcessingPool(mp.cpu_count())
            featurize_wrapper_partial = partial(featurize_wrapper,
                                                dilled_featurizer=dilled_featurizer)
            features = []
            for smiles in sample_smiles:
                features.append(featurize_wrapper_partial(smiles))
        else:
            features = worker_pool.map_sync(featurize_wrapper, sample_smiles)
    df[featurizer.__class__.__name__] = features
def run(self): if self.args.jumpdists: n_bins=100. bin_width = 1/n_bins bins = np.arange(0,1+bin_width,1/n_bins) if self.args.file: user,vals = self.artist_jump_distributions(self.args.file,bins=bins,self_jumps=False) with open(self.args.resultdir+user,'w') as fout: fout.write(','.join(vals.astype(str))+'\n') else: raise('not implemented!') self.pool = Pool(self.args.n) self.rootLogger.info("Pool started") self.rootLogger.info("Starting jump distance analysis") func_partial = partial(self.artist_jump_distributions,bins=bins,self_jumps=False) with open(self.args.resultdir+'jumpdists','w') as fout: for user,vals in self.pool.imap(func_partial,self.listen_files): fout.write(user+'\t'+','.join(vals.astype(str))+'\n') self.pool.close() self.rootLogger.info("Pool closed") if self.args.blockdists: #self.rootLogger.info("Starting block distance analysis") self.mean_block_distances(self.args.file) if self.args.diversity_dists: bins = np.arange(0,1.01,.01) self.diversity_distributions(self.args.file,bins=bins) if self.args.clustering: self.clustering(self.args.file) if self.args.values: self.patch_values(self.args.file) if self.args.exp: self.explore_exploit(self.args.file) if self.args.patch_len_dists: self.patch_len_dists(self.args.file)
def parallelmap(func, data, nodes=None):
    """
    Return the averaged signal and background (based on blank frames)
    over the given runs
    """
    if not nodes:
        nodes = multiprocessing.cpu_count() - 2
    pool = ProcessingPool(nodes=nodes)
    try:
        return pool.map(func, data)
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
class C(object):

    def __init__(self, files):
        self.pool = Pool(4)
        self.files = files

    def raw_processor(self, fi, prefix, somedict):
        df = pd.read_table(
            fi, header=None, names=['artist_id', 'ts'],
            parse_dates=['ts']).sort_values(by='ts')
        user = fi.split('/')[-1][:-4]
        df.to_pickle(
            '/Users/jaredlorince/git/MusicForaging/testData/scrobbles_test/{}_{}.pkl'.format(prefix, user))
        rootLogger.info('preprocessing complete for user {} ({})'.format(user, fi))

    def run_p(self):
        func_partial = partial(self.raw_processor, prefix='blah', somedict=d)
        result = self.pool.amap(func_partial, self.files)
def parallelmap(func, lst, nodes=None):
    """
    Return the averaged signal and background (based on blank frames)
    over the given runs using multiprocessing (as opposed to MPI).
    """
    from pathos.multiprocessing import ProcessingPool
    from pathos import multiprocessing
    if not nodes:
        nodes = multiprocessing.cpu_count() - 2
    pool = ProcessingPool(nodes=nodes)
    try:
        return pool.map(func, lst)
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
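# Example usage of parallelmap above (illustrative, not from the original
# source): map a toy cube function over a short list with two worker nodes.
if __name__ == '__main__':
    def cube(x):
        return x ** 3

    print(parallelmap(cube, [1, 2, 3, 4], nodes=2))   # -> [1, 8, 27, 64]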
while True: try: get_states = requests.get(nation_url, timeout=(1,60)).text break except: sleep(1.5**wait) wait += 1 parsed = BeautifulSoup(get_states, 'html.parser') state_urls = [a['href'] for a in parsed.find('div', class_='newLocUSListArea').find_all('a')] ################ #Get town links# ################ print "Getting town URLs..." pool = Pool(10) result_iter = pool.imap(get_town_urls, state_urls) town_urls = [] for result in result_iter: town_urls += result #Clean up town URLs town_urls = [re.sub("st\.-","st-",url) for url in town_urls] ################# #Get paper links# ################# print "Getting paper URLs..." result_iter = pool.imap(get_paper_urls, town_urls)
def parallel_motif_analysis(self, samples_dirs):
    pool = ProcessingPool(nodes=16)
    pool.map(self.sample_motif_analysis, tuple(samples_dirs))
def analyse_samples_parallely(self, samples_dirs):
    pool = ProcessingPool(nodes=15)
    pool.map(self.sample_analysis, tuple(samples_dirs))
def genseq(idx):
    first = np.where(np.random.multinomial(1, pvals=pops) == 1)[0][0]
    last = first
    last_ts = datetime.now()
    result = {'artist_idx': [first], 'ts': [last_ts]}
    for i in xrange(seq_length - 1):
        next_listen = draw(last)
        last = next_listen
        gap_bin = 120 * np.where(np.random.multinomial(1, pvals=td) == 1)[0][0]
        gap = np.random.randint(gap_bin, gap_bin + 120)
        result['artist_idx'].append(next_listen)
        new_ts = last_ts + timedelta(0, gap)
        result['ts'].append(new_ts)
        last_ts = new_ts
    df = pd.DataFrame(result)
    df['block'] = ((df['artist_idx'].shift(1) != df['artist_idx']).astype(int).cumsum()) - 1
    df.to_pickle(str(idx) + '.pkl')
    logging.info('idx {} complete'.format(idx))

pool = Pool(cpu_count())
indices = range(n)
pool.map(genseq, indices)
pool.close()
#!/usr/bin/env python
#
# Author: Mike McKerns (mmckerns @caltech and @uqfoundation)
# Copyright (c) 1997-2014 California Institute of Technology.
# License: 3-clause BSD. The full license text is available at:
# - http://trac.mystic.cacr.caltech.edu/project/pathos/browser/pathos/LICENSE

from pathos.multiprocessing import ProcessingPool as Pool
from pathos.multiprocessing import ThreadingPool as TPool

pool = Pool()
tpool = TPool()

# pickle fails for nested functions
def adder(augend):
    zero = [0]
    def inner(addend):
        return addend + augend + zero[0]
    return inner

# build from inner function
add_me = adder(5)

# build from lambda functions
squ = lambda x: x**2

# test 'dilled' multiprocessing for inner
print "Evaluate 10 items on 2 proc:"
pool.ncpus = 2
print pool
print pool.map(add_me, range(10))
print ''
def applay_parallel_ransac(self):
    sample_indices = [i for i in xrange(25)]
    pool = Pool()
    output = pool.map(self.calculate_distance, sample_indices)
    return output
# creates a worker pool from given command line parameter. If the given
# parameter is too large all detectable CPUs will be utilised. If the given
# parameter is nonsense only 1 core will be utilized.
workers = 1
if len(sys.argv) >= 2 and sys.argv[1].isdigit() and int(sys.argv[1]) > 0:
    workers = cpu_count()
    if int(sys.argv[1]) <= workers:
        workers = int(sys.argv[1])

    print 'N: ' + str(N)
    print 'PW: ' + str(workers)
    sleep(3)  # just 3 seconds pause to read the input again.

    # All the magic happens here:
    pool = ProcessingPool(workers)
    Ys = pool.map(steadyState, y0)

    clock = time() - clock  # elapsed time
    print 'Seconds: ' + str(clock)  # Not essential but useful.

    # Serialisation of results and stats:
    ss = {'STrange': STrange,
          'PFDrange': PFDrange,
          'Ys': Ys,
          'Sec': clock,
          'PoolWorkers': workers}

    output = open('steadyStateAnalysisFixedST_MC_N' + str(N) + '.pkl', 'wb')
    dill.dump(ss, output, 2)
    output.close()
else:
    print('Well, something went wrong.')

#=================================================================
#
class analyze(setup.setup): def __init__(self,args,logging_level=logging.INFO): super(analyze, self ).__init__(args,logging_level) # set up processing pool and run all analyses specified in args def run(self): if self.args.jumpdists: n_bins=100. bin_width = 1/n_bins bins = np.arange(0,1+bin_width,1/n_bins) if self.args.file: user,vals = self.artist_jump_distributions(self.args.file,bins=bins,self_jumps=False) with open(self.args.resultdir+user,'w') as fout: fout.write(','.join(vals.astype(str))+'\n') else: raise('not implemented!') self.pool = Pool(self.args.n) self.rootLogger.info("Pool started") self.rootLogger.info("Starting jump distance analysis") func_partial = partial(self.artist_jump_distributions,bins=bins,self_jumps=False) with open(self.args.resultdir+'jumpdists','w') as fout: for user,vals in self.pool.imap(func_partial,self.listen_files): fout.write(user+'\t'+','.join(vals.astype(str))+'\n') self.pool.close() self.rootLogger.info("Pool closed") if self.args.blockdists: #self.rootLogger.info("Starting block distance analysis") self.mean_block_distances(self.args.file) if self.args.diversity_dists: bins = np.arange(0,1.01,.01) self.diversity_distributions(self.args.file,bins=bins) if self.args.clustering: self.clustering(self.args.file) if self.args.values: self.patch_values(self.args.file) if self.args.exp: self.explore_exploit(self.args.file) if self.args.patch_len_dists: self.patch_len_dists(self.args.file) # calculate distribution (using histogram with specified bins) # of sequential artist-to-artist distances def artist_jump_distributions(self,fi,bins,self_jumps=False): user = fi.split('/')[-1][:-4] df = pd.read_pickle(fi) if self_jumps: vals = np.histogram(df['dist'].dropna(),bins=bins)[0] else: vals = np.histogram(df['dist'][df['dist']>0],bins=bins)[0] self.rootLogger.info('artist jump distances done for user {} ({})'.format(user,fi)) return user,vals # calculate distribution (using histogram with specified bins) # of patch diversity for each user # awk 'FNR==1' * > diversity_dists_zeros # awk 'FNR==2' * > diversity_dists_nozeros def diversity_distributions(self,fi,bins): if 'patches' not in fi: raise('WRONG DATATYPE') user = fi.split('/')[-1].split('_')[0] df = pd.read_pickle(fi).dropna(subset=['diversity']) zeros = np.histogram(df[df['n']>=5]['diversity'],bins=bins)[0] nozeros = np.histogram(df[(df['n']>=5)&(df['diversity']>0)]['diversity'],bins=bins)[0] zeros = zeros/float(zeros.sum()) nozeros = nozeros/float(nozeros.sum()) with open(self.args.resultdir+user,'w') as fout: fout.write(user+'\t'+'zeros'+'\t'+','.join(zeros.astype(str))+'\n') fout.write(user+'\t'+'nozeros'+'\t'+','.join(nozeros.astype(str))+'\n') self.rootLogger.info('diversity distributions done for user {} ({})'.format(user,fi)) def mean_block_distances(self,fi,n=100): def cos_nan(arr1,arr2): if np.any(np.isnan(arr1)) or np.any(np.isnan(arr2)): return np.nan else: return cosine(arr1,arr2) user = fi.split('/')[-1].split('_')[0] df = pd.read_pickle(fi) blocks = df[df['n']>=5].dropna() result = [] for i in xrange(len(blocks)-n): first = blocks['centroid'].iloc[i] result.append(np.array(blocks['centroid'][i+1:i+n+1].apply(lambda val: cos_nan(val,first)))) result = np.nanmean(np.vstack(result),0) with open(self.args.resultdir+user,'w') as fout: fout.write('\t'.join([user,'patch',','.join(result.astype(str))])+'\n') self.rootLogger.info('Block distances for user {} processed successfully ({})'.format(user,fi)) # now shuffled # idx = np.array(blocks.index) # np.random.shuffle(idx) # blocks = blocks.reindex(idx) 
# result_random = [] # for i in xrange(len(blocks)-n): # first = blocks['centroid'].iloc[i] # result_random.append(np.array(blocks['centroid'][i+1:i+n+1].apply(lambda val: cos_nan(val,first)))) # result_random = np.nanmean(np.vstack(result_random),0) # with open(self.args.resultdir+user,'w') as fout: # fout.write('\t'.join([user,'patch',','.join(result.astype(str))])+'\n') # fout.write('\t'.join([user,'patch_random',','.join(result_random.astype(str))])+'\n') # self.rootLogger.info('Block distances for user {} processed successfully ({})'.format(user,fi)) def clustering(self,fi): df = pd.read_pickle(fi) user = fi.split('/')[-1].split('_')[0] mask = (df['centroid'].apply(lambda arr: ~np.any(np.isnan(arr))).values)&(df['n']>=5)&(df['diversity']<=0.2) clust_data = df[mask].reset_index() arr = np.vstack(clust_data['centroid']) Z = linkage(arr, 'complete') clusters = fcluster(Z,t=0.2,criterion='distance') assignments = np.repeat(np.nan,len(df)) assignments[np.where(mask)] = clusters df['patch_clust'] = assignments df.to_pickle('{}{}.pkl'.format(self.args.resultdir,user)) self.rootLogger.info('Patch clusters for user {} processed successfully ({})'.format(user,fi)) def patch_len_dists(self,fi): df = pd.read_pickle(fi) user = fi.split('/')[-1][:-4] explore = df[np.isnan(df['patch_clust'])] result_explore = explore['n'].value_counts() df['explore'] = np.isnan(df['patch_clust']).astype(int) df['explore-idx'] = df['explore'].cumsum() result_exploit = df.groupby('explore-idx').apply(lambda df: df.dropna()['n'].sum()).value_counts() result_explore = result_explore.reindex(xrange(1,max(result_explore.index)+1),fill_value=0.).values result_exploit = result_exploit.reindex(xrange(1,max(result_exploit.index)+1),fill_value=0.).values result_explore = sparse.csr_matrix(result_explore) result_exploit = sparse.csr_matrix(result_exploit) with open(self.args.resultdir+user,'w') as fout: fout.write(user+'\t'+'explore'+'\t'+':'.join([','.join(a.astype(str)) for a in result_explore.data,result_explore.indices,result_explore.indptr])+'\n') fout.write(user+'\t'+'exploit'+'\t'+':'.join([','.join(a.astype(str)) for a in result_exploit.data,result_exploit.indices,result_exploit.indptr])+'\n') self.rootLogger.info('User {} processed successfully ({})'.format(user,fi)) def explore_exploit(self,fi): user = fi.split('/')[-1][:-4] df_patches_raw = pd.read_pickle(fi) # add time in next bout df_patches_raw['next_n'] = df_patches_raw['n'].shift(-1) # add patch values # listensPerPatch = df_patches_raw.groupby('patch_clust')['n'].sum() # overall_prop = listensPerPatch/float(df_patches_raw['n'].sum()) # overall_prop.name = 'final_value' # df_patches_raw = df_patches_raw.join(overall_prop,on='patch_clust') """ # time in next exploit patch as function of exploration time result = df_patches_raw[np.isnan(df_patches_raw['patch_clust'])].groupby('n')['next_n'].mean() fout.write(user+'\t'+'next-exploit-vs-explore'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n') """ # total time exploiting as a function of time exploring df_patches_raw['explore'] = np.isnan(df_patches_raw['patch_clust']).astype(int) df_patches_raw['explore-idx'] = df_patches_raw['explore'].cumsum() # combine all exploit listens #grp_explore = df_patches_raw.groupby('explore-idx').apply(lambda df: pd.DataFrame({'n':[df['n'].iloc[0]],'n-exploit':[df['n'][1:].sum()]})) # only last exploit bout grp_explore = df_patches_raw.groupby('explore-idx').apply(lambda df: pd.DataFrame({'n':[df['n'].iloc[0]],'n-exploit':[df['n'].iloc[-1]]})) #result = 
grp_explore.groupby('n')['n-exploit'].mean() #fout.write(user+'\t'+'total-exploit-vs-explore'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n') """ # exploration time as a function of exploitation time grp_exploit = grp_explore.copy() grp_exploit['n-explore'] = grp_exploit['n'].shift(-1) result = grp_exploit.groupby('n-exploit')['n-explore'].mean() fout.write(user+'\t'+'explore-vs-exploit'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n') """ # prob exploit given explore time - already done # explore_only = df_patches_raw[np.isnan(df_patches_raw['patch_clust'])] # result = explore_only['n'][:-1].value_counts() # arr = result.reindex(xrange(1,max(result.index)+1),fill_value=0.).values # final_result = arr/(np.cumsum(arr[::-1])[::-1]) # final_result = sparse.csr_matrix(final_result) # with open(self.args.resultdir+user+'_exploit','w') as fout: # fout.write(user+'\t'+':'.join([','.join(a.astype(str)) for a in final_result.data,final_result.indices,final_result.indptr])+'\n') # prob explore given exploit time result = grp_explore['n-exploit'][grp_explore['n-exploit']>0].value_counts() arr = result.reindex(xrange(1,max(result.index)+1),fill_value=0.).values final_result = arr/np.cumsum(arr[::-1])[::-1] final_result = sparse.csr_matrix(final_result) with open(self.args.resultdir+user+'_explore','w') as fout: fout.write(user+'\t'+':'.join([','.join(a.astype(str)) for a in final_result.data,final_result.indices,final_result.indptr])+'\n') #fout.write(user+'\t'+'prob-explore-given-exploit'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n') """ # patch value as a function of exploration time df_patches_raw['final_value_next'] = df_patches_raw['final_value'].shift(-1) result = df_patches_raw[np.isnan(df_patches_raw['patch_clust'])].groupby('n')['final_value_next'].mean() fout.write(user+'\t'+'exploit-value-vs-explore'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n') """ self.rootLogger.info('User {} processed successfully ({})'.format(user,fi))