def distance_matrix(untrans, trans, minimum_length, steps=10,
                    parallel_compute=True, free_cores=2,
                    return_partials=False, gpu=False):
    """Build the symmetric pairwise matrix over the entries of ``untrans``.

    Every index pair ``(ii, jj)`` with ``ii <= jj`` is passed to
    ``distance_matrix_loop_func``. Each result is read as
    ``(row, col, value[, partial])``: ``value`` is mirrored into both
    triangles of the returned matrix, and with ``return_partials`` a second
    matrix is filled the same way from ``partial``.

    ``parallel_compute`` selects joblib over a plain loop; the worker count
    is the CPU count minus ``free_cores``, but never less than one.

    Returns ``d`` or, with ``return_partials``, the tuple ``(d, pd)``.
    """
    size = len(untrans)
    d = np.zeros((size, size))
    cpus = max(multiprocessing.cpu_count() - free_cores, 1)

    # Upper triangle, diagonal included, in row-major order.
    index_pairs = [(ii, jj) for ii in range(size) for jj in range(ii, size)]

    if parallel_compute:
        out = Parallel(n_jobs=cpus, verbose=5)(
            delayed(distance_matrix_loop_func)(
                ii, jj, untrans, trans, minimum_length, steps,
                return_partials, gpu)
            for ii, jj in index_pairs)
    else:
        out = [
            distance_matrix_loop_func(
                ii, jj, untrans, trans, minimum_length, steps,
                return_partials, gpu)
            for ii, jj in index_pairs
        ]

    for cell in out:
        row, col = cell[0], cell[1]
        d[row, col] = cell[2]
        d[col, row] = cell[2]

    if not return_partials:
        return d

    pd = np.zeros((size, size))
    for cell in out:
        row, col = cell[0], cell[1]
        pd[row, col] = cell[3]
        pd[col, row] = cell[3]
    return d, pd
def main_get_data(paths, parallel: bool = False, n_jobs: int = -2,
                  modules_name: list = modules_name,
                  skip_countries: list = None, gsheets_api=None):
    """Get data from sources and export to output folder.

    Is equivalent to script `run_python_scripts.py`.

    Every module in ``modules_name`` is run once through
    ``CountryDataGetter.run``. Modules whose result has
    ``success is False`` are then retried once, sequentially, and the ones
    that fail again are listed on stdout.

    Args:
        paths: Forwarded to ``CountryDataGetter``.
        parallel: Run the first pass through joblib's threading backend.
        n_jobs: joblib ``n_jobs`` for the parallel first pass.
        modules_name: Names of the modules to run.
        skip_countries: Country names to skip; lower-cased before use.
            ``None`` (the default) means nothing is skipped.
        gsheets_api: Forwarded to ``CountryDataGetter``.
    """
    print("-- Getting data... --")
    # Default is None rather than a shared mutable list.
    skip_countries = [x.lower() for x in (skip_countries or [])]
    country_data_getter = CountryDataGetter(paths, skip_countries, gsheets_api)

    if parallel:
        modules_execution_results = Parallel(n_jobs=n_jobs, backend="threading")(
            delayed(country_data_getter.run)(module_name)
            for module_name in modules_name
        )
    else:
        modules_execution_results = [
            country_data_getter.run(module_name) for module_name in modules_name
        ]

    # `is False` on purpose: the first pass only retries results that are
    # explicitly False.
    modules_failed = [
        m["module_name"] for m in modules_execution_results if m["success"] is False
    ]

    # Retry failed modules
    logger.info(f"\n---\n\nRETRIALS ({len(modules_failed)})")
    modules_execution_results = [
        country_data_getter.run(module_name) for module_name in modules_failed
    ]
    modules_failed_retrial = [
        m["module_name"] for m in modules_execution_results if m["success"] is False
    ]
    if len(modules_failed_retrial) > 0:
        failed_str = "\n".join([f"* {m}" for m in modules_failed_retrial])
        print(f"\n---\n\nThe following scripts failed to run ({len(modules_failed_retrial)}):\n{failed_str}")
    print_eoe()
def simulate_one_cv(lm_eye: pd.DataFrame, lm_mouse: pd.DataFrame, cv_run: int,
                    n_jobs: int = 4) -> List[Dict]:
    """Run every simulation of one cross-validation run and save the results.

    ``simulate_batch`` is called once per simulation ID
    (``1 .. constants.NUM_SIMS``) with the save flag set to ``False``. The
    calls go through joblib's loky backend first; if that raises, they are
    repeated one by one in the current process.

    Args:
        lm_eye: Passed to ``simulate_batch`` as its second argument.
        lm_mouse: Passed to ``simulate_batch`` as its third argument.
        cv_run: Label of this run; used in progress messages and forwarded
            to ``save_simulation_results``.
        n_jobs: Number of joblib workers.

    Returns:
        The list of ``simulate_batch`` results, in simulation-ID order.
    """
    start = time.time()
    sim_IDs = list(np.arange(1, constants.NUM_SIMS + 1))

    try:
        results = Parallel(n_jobs=n_jobs, backend='loky', verbose=True)(
            delayed(simulate_batch)(sim_ID, lm_eye, lm_mouse, False)
            for sim_ID in sim_IDs)
    except Exception:
        # Not a bare ``except``: Ctrl-C and SystemExit must abort the run
        # instead of silently starting the slow single-core fallback.
        print('Failed multiprocessing for single CV simulation. Attempting single core...')
        results = []
        for sim_ID in sim_IDs:
            results.append(simulate_batch(sim_ID, lm_eye, lm_mouse, False))
            print(f'Simulated {sim_ID} of {len(sim_IDs)} in CV run {cv_run}')

    save_simulation_results(results, sim_IDs, cv_run)
    print(f'CV run {cv_run} took {round((time.time() - start) / 60, 1)} minutes')
    return results
def load_training_set(video_set, grid_size, bins, skip_val, force_refresh=False):
    """Load and process all videos in provided training set.

    video_set: List of integers corresponding to video files
        (5 char left zero padded).
    grid_size, bins, skip_val, force_refresh: forwarded unchanged to
        ``process_video``.

    Returns the list of ``process_video`` results, in ``video_set`` order,
    with ``None`` results removed.
    """
    if force_refresh:
        # Process in parallel if we need to refresh (much faster)
        videos = Parallel(n_jobs=-1, prefer="threads")(
            delayed(process_video)(i, grid_size, bins, skip_val, force_refresh)
            for i in video_set)
    else:
        videos = [
            process_video(i, grid_size, bins, skip_val, force_refresh)
            for i in video_set
        ]

    # Drop None values on BOTH paths. The parallel path used to build the
    # filtered list and throw it away, so None entries leaked to callers.
    return [video for video in videos if video is not None]
def lgb_cv_tuning(grid, data, nfold, return_best=is_return_best, parallel=True, **kwargs):
    """Evaluate every parameter combination of ``grid`` with ``tune_ind_params``.

    Args:
        grid: Parameter grid; expanded with ``ParameterGrid``.
        data: Passed to ``tune_ind_params``.
        nfold: Number of CV folds; passed to ``tune_ind_params``.
        return_best: If true, return only the result whose element ``[1]``
            is smallest; otherwise return all results.
        parallel: Run the combinations through joblib (50 jobs) instead of
            a plain loop.
        **kwargs: Accepted for call compatibility; not used here.

    Returns:
        One ``tune_ind_params`` result (``return_best``) or the list of all
        of them, in grid order.
    """
    print('* Start hyperparameter tuning with {}-fold CV...'.format(nfold))
    print('* Hyperparameter grid:')
    # Print the grid actually being searched (was an unrelated name).
    print(grid)

    all_params = ParameterGrid(grid)

    if parallel:
        print("* Parallel mode activated")
        print("* Number of cores used: ", 50)
        print('* Begin CV')
        cv_results = Parallel(n_jobs=50)(
            delayed(tune_ind_params)(params, data, nfold)
            for params in all_params)
    else:
        # The original loop header was ``range(all_params[i])`` and used an
        # undefined ``params``; iterate the grid directly.
        cv_results = []
        for i, params in enumerate(all_params):
            print('Hyperparemeter set: {}'.format(i))
            print('* Begin CV')
            cv_results.append(tune_ind_params(params, data, nfold))

    if return_best:
        return min(cv_results, key=lambda x: x[1])
    return cv_results
def _find_centroid(X, Sx, n_samples, seedlen):
    """Pick the pair of sampled segments with the lowest cost.

    ``uniformset`` provides ``u.n_seg`` segments; every unordered pair of
    them is scored by ``_find_centroid_wrap``, which is read here as
    returning ``(cost, seg0, seg1)``. The module-level ``parallel`` flag
    selects joblib (4 jobs) over a tqdm-wrapped loop.

    Returns two regimes:
      * the result of ``fixed_sampling`` when there are no pairs to score,
      * two empty ``Regime`` objects when the best cost is infinite,
      * otherwise one single-segment ``Regime`` per winning segment.
    """
    u = uniformset(X, Sx, n_samples, seedlen)
    segment_pairs = combinations(range(u.n_seg), 2)

    if parallel is True:
        results = Parallel(n_jobs=4)([
            delayed(_find_centroid_wrap)(X, Sx, seedlen, first, second, u)
            for first, second in segment_pairs
        ])
    else:
        results = [
            _find_centroid_wrap(X, Sx, seedlen, first, second, u)
            for first, second in tqdm(segment_pairs, desc='SearchCentroid')
        ]

    if not results:
        print('fixed sampling')
        s0, s1 = fixed_sampling(X, Sx, seedlen)
        return s0, s1

    costs = [entry[0] for entry in results]
    best = np.argmin(costs)
    costMin, seg0, seg1 = results[best]

    if costMin == np.inf:
        print('!! --- centroid not found')
        return Regime(), Regime()

    s0 = Regime()
    s1 = Regime()
    s0.add_segment(seg0[0], seg0[1])
    s1.add_segment(seg1[0], seg1[1])
    return s0, s1
def create_beat_dataset_fixed(metadf, Xmat, tgrid, do_parallel=True, detrend=True):
    """Segment every row of ``Xmat`` into beats and stack the usable ones.

    ``segment_beat`` (algorithm ``"christov-aligned"``) is called once per
    row and is read here as returning ``(beats, lengths)``. Rows whose
    ``beats.shape[-1]`` is not 100 are treated as bad splits and dropped.

    Returns:
        beat_metadf: one row per kept beat, the source row of ``metadf``
            repeated, plus a ``beat_len`` column.
        beat_mat: the kept beats stacked along the first axis.
    """
    n_rows = Xmat.shape[0]
    if do_parallel:
        from joblib import Parallel, delayed
        bl_list = Parallel(n_jobs=30, verbose=5)(
            delayed(segment_beat)(Xmat[idx], tgrid,
                                  alg="christov-aligned", detrend=detrend)
            for idx in range(n_rows))
    else:
        bl_list = [
            segment_beat(Xmat[idx], tgrid,
                         alg="christov-aligned", detrend=detrend)
            for idx in range(n_rows)
        ]

    # Separate beats from their lengths and flag the bad splits.
    beat_list = [pair[0] for pair in bl_list]
    len_list = [pair[1] for pair in bl_list]
    idx_bad = np.array([beats.shape[-1] != 100 for beats in beat_list])
    idx_good = np.where(~idx_bad)[0]

    # Repeat each good row's metadata once per beat it produced.
    beat_meta = []
    beat_lens = []
    for idx in idx_good:
        beat_meta.extend([metadf.iloc[idx]] * len(beat_list[idx]))
        beat_lens.append(len_list[idx])
    beat_list = [beat_list[i] for i in idx_good]

    # Stack into dataframe + data matrix.
    beat_metadf = pd.DataFrame(beat_meta)
    beat_metadf.reset_index(inplace=True)
    beat_metadf['beat_len'] = np.concatenate(beat_lens)
    beat_mat = np.rollaxis(np.row_stack(beat_list), 0)
    return beat_metadf, beat_mat
def _nlp_sub(disc_clsdict, gold_clsdict, names, label, verbose, n_jobs):
    """Compute aggregated NED and coverage scores over the name sets.

    For every entry ``ns`` of ``names`` the discovered classes are
    restricted to ``ns`` and scored with ``NED``; coverage is scored on the
    discovered and gold classes restricted to ``ns``. With ``n_jobs > 1``
    the scores are computed through joblib, otherwise in a plain loop.

    Returns ``(ned_score, cov_score)`` as numpy arrays, after
    ``aggregate`` with default scores 1 (NED) and 0 (coverage).
    """
    ned = NED
    cov = coverage
    if verbose:
        # Single-argument print(...) behaves the same on Python 2 and 3;
        # the bare print statement is a SyntaxError on Python 3.
        print(' nlp ({2}): subsampled {0} files in {1} sets'
              .format(sum(map(len, names)), len(names), label))

    with verb_print(' nlp ({0}): calculating scores'.format(label),
                    verbose, False, True, False):
        if n_jobs > 1:
            joblib_verbose = 5 if verbose else 0
            ned_score = Parallel(n_jobs=n_jobs, verbose=joblib_verbose,
                                 pre_dispatch='n_jobs')(
                delayed(ned)(disc_clsdict.restrict(ns, True))
                for ns in names)
            cov_score = Parallel(n_jobs=n_jobs, verbose=joblib_verbose,
                                 pre_dispatch='n_jobs')(
                delayed(cov)(disc_clsdict.restrict(ns, False),
                             gold_clsdict.restrict(ns, False))
                for ns in names)
        else:
            ned_score = []
            cov_score = []
            for ns in names:
                ned_score.append(ned(disc_clsdict.restrict(ns, True)))
                cov_score.append(cov(disc_clsdict.restrict(ns, False),
                                     gold_clsdict.restrict(ns, False)))

    # don't replace nan's by 1, but ignore them, unless all values in
    # ned_score are nan
    ned_score = aggregate(np.array(ned_score), default_score=1)
    cov_score = aggregate(np.array(cov_score), default_score=0)
    return np.array(ned_score), np.array(cov_score)
def process_run(template_k, traindata, testdata, allsubs):
    """Run ``process_roi`` on every ROI of one parcellation column and save.

    ROIs are the positive unique labels in column ``template_k`` of the
    module-level ``parcellation_data``. Each ROI is processed with the
    single tuple argument ``(roi_number, indices, traindata, testdata,
    allsubs)``, through joblib when the module-level ``n_workers`` exceeds
    one. For every subject, element ``[0][sub_ind]`` and ``[1][sub_ind]``
    of each ROI result are collected under ``'R_common'`` and ``'S_stat'``
    and written with ``dd.io.save`` to ``testdata['target'][sub][0]``.
    """
    roi_num = np.unique(parcellation_data[:, template_k])
    roi_num = roi_num[roi_num > 0]
    roi_inds = [
        np.where(parcellation_data[:, template_k] == roi)[0]
        for roi in roi_num
    ]
    # ROI numbers passed to the workers are 1-based positions.
    arg_instances = [
        (position + 1, indices, traindata, testdata, allsubs)
        for position, indices in enumerate(roi_inds)
    ]

    print('starting computing with %i workers\n' % (n_workers), flush=True)
    start_time = time.time()
    if n_workers > 1:
        results = Parallel(n_jobs=n_workers, max_nbytes='500M', mmap_mode='r')(
            map(delayed(process_roi), arg_instances))
    else:
        results = [process_roi(roi_args) for roi_args in arg_instances]
    print('Done (took %.1fs)' % (time.time() - start_time), flush=True)

    print('writing aligned datasets')
    for sub_ind, sub in enumerate(allsubs):
        res = {'R_common': [], 'S_stat': []}
        for roi_res in results:
            res['R_common'].append(roi_res[0][sub_ind])
            res['S_stat'].append(roi_res[1][sub_ind])
        dd.io.save(testdata['target'][sub][0], res, compression=None)
    print('')
def read_video_sample(vid_files, fid, cam_range, calib_file, read_dist=True,
                      read_parallel=False):
    """Read one frame from every video together with the camera calibration.

    The calibration file is looked up next to the first video. For each
    camera id in ``cam_range`` the ``K`` matrix and the *inverse* of the
    ``M`` matrix are collected (keys ``'cam%d'``), plus the ``dist``
    entries when ``read_dist`` is set.

    Returns ``(img_list, K_list, M_list)``, with ``dist_list`` appended as
    a fourth element when ``read_dist`` is true.
    """
    # read calib
    calib_dir = os.path.dirname(vid_files[0])
    calib = json_load(os.path.join(calib_dir, calib_file))

    K_list = [np.array(calib['K']['cam%d' % cid]) for cid in cam_range]
    M_list = [
        np.linalg.inv(np.array(calib['M']['cam%d' % cid]))
        for cid in cam_range
    ]
    if read_dist:
        dist_list = [
            np.array(calib['dist']['cam%d' % cid]) for cid in cam_range
        ]

    # read image
    if read_parallel:
        img_list = Parallel(n_jobs=len(cam_range))(
            delayed(read_vid_frame)(vid, fid) for vid in vid_files)
    else:
        img_list = [read_vid_frame(vid, fid) for vid in vid_files]

    if not read_dist:
        return img_list, K_list, M_list
    return img_list, K_list, M_list, dist_list
def main(file_path, output_folder, file_type, download_n_files, max_size=None, n_jobs=1):
    """Download the resources of one file type listed in a ``;``-separated CSV.

    The catalogue is shuffled, filtered on ``format == file_type`` and
    optionally cut to the first ``download_n_files`` rows. Each remaining
    row is passed to ``downloader`` with its URL, an id built as
    ``"<dataset.id>--<id>"`` and a sanitised organisation name (``"NA"``
    when missing). The number of truthy ``downloader`` results is printed
    at the end.
    """
    catalogue = pds.read_csv(file_path, sep=";").sample(frac=1)

    # naively filter the df to get only the desired file_type
    catalogue = catalogue[catalogue.format == file_type]
    if download_n_files:
        catalogue = catalogue.iloc[:download_n_files]
    print(f"There are {len(catalogue)} resources of type {file_type}")

    urls = catalogue["url"].values
    resource_ids = catalogue["id"].values
    dataset_ids = catalogue["dataset.id"].values
    new_ids = dataset_ids + "--" + resource_ids
    organizations = catalogue["dataset.organization"].fillna("NA").apply(
        lambda name: unidecode.unidecode(get_valid_filename(name))).values
    assert len(urls) == len(new_ids)

    work_items = list(zip(urls, new_ids, organizations))
    if n_jobs > 1:
        outcomes = Parallel(n_jobs=n_jobs)(
            delayed(downloader)(url, new_id, org, output_folder, file_type, max_size)
            for url, new_id, org in tqdm(work_items))
    else:
        outcomes = [
            downloader(url, new_id, org, output_folder, file_type, max_size)
            for url, new_id, org in tqdm(work_items)
        ]

    print(
        f"I successfully downloaded {sum(outcomes)} of {len(outcomes)} files"
    )
def find_closest_auto(demofile, new_xyz):
    """Return the key of the demo segment with the lowest registration cost.

    The ``cloud_xyz`` entry of every segment in ``demofile`` is compared
    with ``new_xyz`` through ``registration_cost``; ``args.parallel``
    selects joblib (3 jobs) over a plain loop. With ``args.show_neighbors``
    the ``rgb`` images of up to five cheapest segments are shown in an
    OpenCV window until a key is pressed.
    """
    demo_clouds = [asarray(seg["cloud_xyz"]) for seg in demofile.values()]
    # Materialise the keys: they are indexed below, and a Python 3 keys
    # view does not support indexing.
    keys = list(demofile.keys())

    if args.parallel:
        from joblib import Parallel, delayed
        costs = Parallel(n_jobs=3, verbose=100)(
            delayed(registration_cost)(demo_cloud, new_xyz)
            for demo_cloud in demo_clouds)
    else:
        costs = []
        for (i, ds_cloud) in enumerate(demo_clouds):
            costs.append(registration_cost(ds_cloud, new_xyz))
            # Single-argument print(...) is valid on Python 2 and 3 alike.
            print("completed %i/%i" % (i + 1, len(demo_clouds)))

    print("costs\n%s" % (costs,))

    if args.show_neighbors:
        nshow = min(5, len(keys))
        import cv2, rapprentice.cv_plot_utils as cpu
        sortinds = np.argsort(costs)[:nshow]
        near_rgbs = [asarray(demofile[keys[i]]["rgb"]) for i in sortinds]
        bigimg = cpu.tile_images(near_rgbs, 1, nshow)
        cv2.imshow("neighbors", bigimg)
        print("press any key to continue")
        cv2.waitKey()

    ibest = np.argmin(costs)
    return keys[ibest]
def extract_patches(img, offsets, patch_size, extract_batch_parallel=False):
    """Extract patches from every image of a batch and stack the results.

    The input is permuted with ``(0, 3, 1, 2)`` and zero-padded on every
    side by half of ``patch_size[0]`` so that patches overlapping the
    border stay inside the tensor; ``offsets`` are shifted by the same
    amount. ``_extract_patches_batch`` then handles one batch element at a
    time, in a joblib pool when ``extract_batch_parallel`` is set.

    Returns ``torch.stack`` of the per-element results.
    """
    img = img.permute(0, 3, 1, 2)
    num_patches = offsets.shape[1]
    batch_size = img.shape[0]

    # Zero padding covers patches that fall partly outside the image.
    pad_const = int(patch_size[0].item() / 2)
    img = torch.nn.ConstantPad2d(pad_const, 0.0)(img)
    # Everything is now shifted by pad_const, so shift the offsets too.
    offsets = offsets + pad_const

    if extract_batch_parallel:
        # Kept as an option even though the source notes it is more
        # expensive than the sequential path.
        num_jobs = min(os.cpu_count(), batch_size)
        all_patches = Parallel(n_jobs=num_jobs)(
            delayed(_extract_patches_batch)(b, img, offsets, patch_size, num_patches)
            for b in range(batch_size))
    else:
        all_patches = [
            _extract_patches_batch(b, img, offsets, patch_size, num_patches)
            for b in range(batch_size)
        ]

    return torch.stack(all_patches)
def read_exif_data(images):
    """Apply ``add_exif_data`` to every entry of ``images``.

    The strategy is chosen from module-level settings:

    * ``use_joblib``: joblib with ``n_threads`` jobs;
    * otherwise, ``n_threads == 1``: a plain loop in this process;
    * otherwise a ``ThreadPool`` (``use_threads``) or process ``Pool`` of
      ``n_threads`` workers.

    Returns the list of ``add_exif_data`` results, in input order.
    """
    if use_joblib:
        from joblib import Parallel, delayed
        # Return right away: previously these results were always thrown
        # away and every image was processed a second time below.
        return Parallel(n_jobs=n_threads)(
            delayed(add_exif_data)(im) for im in images)

    if n_threads == 1:
        # Call the function (the original subscripted it: ``f[im]``).
        return [add_exif_data(im) for im in images]

    pool = ThreadPool(n_threads) if use_threads else Pool(n_threads)
    try:
        return list(pool.map(add_exif_data, images))
    finally:
        # Release the workers; the pool used to be leaked.
        pool.close()
        pool.join()
def auto_choose(actionfile, new_xyz, nparallel=-1):
    """
    @param actionfile: mapping of segment name -> segment data; each segment
                       provides a 'cloud_xyz' entry (e.g. an h5py.File object)
    @param new_xyz   : new rope point-cloud
    @param nparallel : number of parallel jobs to run for tps cost calculation
                       (capped at 8). If -1 only 1 job is used
                       (no parallelization).
    @return          : the name of the segment with the lowest warping cost.
    """
    if not nparallel == -1:
        from joblib import Parallel, delayed
        nparallel = min(nparallel, 8)

    # Materialise the items: they are measured with len() and indexed
    # below, which a Python 3 items view does not support.
    demo_data = list(actionfile.items())

    if nparallel != -1:
        before = time.time()
        redprint("auto choose parallel with njobs = %d" % nparallel)
        costs = Parallel(n_jobs=nparallel, verbose=0)(
            delayed(registration_cost)(ddata[1]['cloud_xyz'][:], new_xyz)
            for ddata in demo_data)
        after = time.time()
        # Single-argument print(...) is valid on Python 2 and 3 alike.
        print("Parallel registration time in seconds = %s" % (after - before,))
    else:
        costs = []
        redprint("auto choose sequential..")
        for i, ddata in enumerate(demo_data):
            costs.append(registration_cost(ddata[1]['cloud_xyz'][:], new_xyz))
            print("tps-cost completed %i/%i" % (i + 1, len(demo_data)))

    ibest = np.argmin(costs)
    redprint("auto choose returning..")
    return demo_data[ibest][0]
def analysis(self, permute=False):
    """
    Classify based an iteratively increasing the number of features
    (electrodes) included in the model. Starts with the single best
    electrode (N=1) and increase until N = the number of electrodes.

    Note: permute is not used in this analysis, but kept to match the same
    signature as super.

    The stacked results of ``_par_compute_and_run_split`` over
    ``num_rand_splits`` cross-validation label sets are stored in
    ``self.res['auc_x_n']``. If the subject data or the recall filter
    function is missing, a message is printed and nothing is computed.
    """
    if self.subject_data is None:
        print('%s: compute or load data first with .load_data()!' % self.subject)
        # Stop here: without data the code below can only crash.
        return

    # Get recalled or not labels
    if self.recall_filter_func is None:
        print('%s classifier: please provide a .recall_filter_func function.' % self.subject)
        # Stop here: calling a missing filter would raise a TypeError.
        return
    y = self.recall_filter_func(self.subject_data)

    # zscore the data by session
    x = self.zscore_data()

    # create the classifier
    classifier = LogisticRegression(C=self.C, penalty=self.norm, solver='liblinear')

    # create .num_rand_splits of cv_dicts
    cv_dicts = [self._make_cross_val_labels() for _ in range(self.num_rand_splits)]

    # run permutations with joblib
    f = _par_compute_and_run_split
    if self.use_joblib:
        aucs = Parallel(n_jobs=12, verbose=5)(
            delayed(f)(cv, classifier, x, y) for cv in cv_dicts)
    else:
        aucs = [f(cv, classifier, x, y) for cv in tqdm(cv_dicts)]

    # store results
    self.res['auc_x_n'] = np.stack(aucs)
def _extract_multi_patches_batch(b, imgs, offsets, patch_size, num_patches,
                                 samples_index, scales,
                                 extract_patch_parallel=False):
    """
    Extract the patches of a single batch element.

    Patch ``p`` is cut by ``_extract_multi_patch`` from
    ``imgs[samples_index[b, p]][b]`` at ``offsets[b, p]``. The function can
    be called in a for loop or in parallel; ``extract_patch_parallel`` runs
    the per-patch work in a joblib pool. ``scales`` is accepted but not
    used in this function.

    Returns ``torch.stack`` of the ``num_patches`` extracted patches.
    """
    def _patch(p):
        scale_index = samples_index[b, p]
        return imgs[scale_index][b], offsets[b, p]

    if extract_patch_parallel:
        # Kept as an option even though the source notes it is more
        # expensive than the sequential path.
        num_jobs = min(os.cpu_count(), num_patches)
        patches = Parallel(n_jobs=num_jobs)(
            delayed(_extract_multi_patch)(_patch(p)[0], _patch(p)[1], patch_size)
            for p in range(num_patches))
    else:
        # Run extraction sequentially
        patches = []
        for p in range(num_patches):
            source_img, offset = _patch(p)
            patches.append(_extract_multi_patch(source_img, offset, patch_size))

    return torch.stack(patches)
def evaluate_new_feature(self, prev_subset: list, new_feature, X_f: dict,
                         X_t: dict, y: np.ndarray) -> float:
    """Return the mean score of ``prev_subset + [new_feature]`` over the splits.

    ``split_dataset`` is called once per seed in
    ``self.seeds[:self.n_cv_ffs]`` with a test share of
    ``1 - self.train_share``. Each split is scored by
    ``self.score_function`` with a fresh clone of
    ``self.decision_function``. With ``self.n_jobs > 1`` the scoring runs
    through joblib.

    Args:
        prev_subset: Features selected so far.
        new_feature: Candidate feature appended to ``prev_subset``.
        X_f, X_t, y: Forwarded to ``split_dataset``.

    Returns:
        The arithmetic mean of the per-split scores as a float.
    """
    A = prev_subset + [new_feature]

    def _score_kwargs(result):
        # Both execution paths clone the estimator. Previously only the
        # parallel path did, so the sequential path reused (and mutated)
        # the shared instance across folds.
        return dict(
            A=A,
            X_f=result['train']['transformed'],
            X_f_test=result['test']['transformed'],
            X_t=result['train']['plain'],
            X_t_test=result['test']['plain'],
            y=result['train']['target'],
            y_test=result['test']['target'],
            decision_function=clone(self.decision_function),
        )

    splits = (
        split_dataset(X_t, X_f, y, self.seeds[i], 1 - self.train_share)
        for i in range(self.n_cv_ffs)
    )

    if self.n_jobs > 1:
        scores = Parallel(n_jobs=self.n_jobs)(
            delayed(self.score_function)(**_score_kwargs(result))
            for result in splits)
    else:
        scores = [
            self.score_function(**_score_kwargs(result)) for result in splits
        ]

    return float(np.mean(scores))
def basic_compute_loop(compute_function, looper, run_parallel=True, debug=None):
    """
    Canonical form of the basic compute loop.
    !!! remove this from contacts.py when it works

    Calls ``compute_function(**looper[i])`` for every index produced by
    ``framelooper(len(looper))`` and returns the list of results, through
    joblib (8 jobs) when ``run_parallel`` is set.

    ``debug`` selects a single entry of ``looper``: that entry is computed,
    an ipdb session is opened, and the process exits afterwards. ``None``,
    ``False`` and anything else equal to ``False`` (including ``0``) leave
    debugging off.
    """
    # send the frame as the debug argument
    # ``!= False`` is kept deliberately so values equal to False (e.g. 0)
    # still skip the debug branch, as before.
    if debug is not None and debug != False:  # noqa: E712
        fr = debug
        incoming = compute_function(**looper[fr])
        import ipdb
        ipdb.set_trace()
        # ``sys.quit`` does not exist; ``sys.exit`` is the intended call.
        sys.exit()

    start = time.time()
    if run_parallel:
        incoming = Parallel(n_jobs=8, verbose=10 if debug else 0)(
            delayed(compute_function, has_shareable_memory)(**looper[ll])
            for ll in framelooper(len(looper), start=start))
    else:
        incoming = [
            compute_function(**looper[ll]) for ll in framelooper(len(looper))
        ]
    return incoming
def data(self):
    '''Return the per-dataset downloads as a dict, computing them once.

    ``self.data_by_dataset`` is called for every id in
    ``self.dataset_ids`` (through joblib with one job per CPU when
    ``self.parallel`` is set) and is expected to return
    ``(dataset_id, data)`` pairs. The resulting mapping is cached on
    ``self._data`` and reused on later calls.

    Original note: data will be dataframes for csvs and Datasets for
    netcdf files; whether intake caching needs attention is an open
    question.
    '''
    if hasattr(self, '_data'):
        return self._data

    if self.parallel:
        num_cores = multiprocessing.cpu_count()
        downloads = Parallel(n_jobs=num_cores)(
            delayed(self.data_by_dataset)(dataset_id)
            for dataset_id in self.dataset_ids)
    else:
        downloads = [
            self.data_by_dataset(dataset_id)
            for dataset_id in self.dataset_ids
        ]

    self._data = {key: payload for (key, payload) in downloads}
    return self._data
def main():
    """Command-line entry point: register and align images to the first one.

    Parses the command line, creates the output directory if needed, runs
    ``register`` on every file after the first (through joblib with
    ``-njobs`` jobs), copies the reference image into the output directory
    and prints how many files were written.
    """
    option_table = [
        (('filenames',), dict(
            nargs='+',
            help='List of target files to register. Images are aligned to first in list.')),
        (('-odir',), dict(
            metavar='outdir', required=True, type=str,
            help='Output directory for files.')),
        (('-m',), dict(
            metavar='method', choices=('point', 'extended'), default='extended',
            help='Specify alignment method (point or extended); default=extended.')),
        (('-xy',), dict(
            nargs=2, type=float, default=None,
            help='Specify approximate "x y" pixel coordinate of object to centroid on. Required for point mode; useful for extended mode (default=center of image).')),
        (('-box',), dict(
            nargs=2, type=int, default=None,
            help='Specify box size (w h) to restrict alignment search. Useful for both point & extended modes (default=full size of array).')),
        (('--c',), dict(
            action='store_true',
            help='Clobber (overwrite) on output')),
        (('-njobs',), dict(
            type=int, default=1,
            help='Process images in parallel. "-1" is all CPUs (default=1).')),
    ]
    parser = argparse.ArgumentParser(description='Register & align images')
    for flags, options in option_table:
        parser.add_argument(*flags, **options)
    args = parser.parse_args()

    if args.m == 'point' and args.xy is None:
        parser.error("-m point requires -xy coordinate")

    # create output directory
    if args.odir not in ['', '.']:
        makedirs(args.odir, exist_ok=True)

    # align all images to first filename
    ref, align = args.filenames[0], args.filenames[1:]
    imref = partial(register, ref=ref, outdir=args.odir,
                    method=args.m, center=args.xy, size=args.box,
                    overwrite=args.c)
    outfiles = Parallel(n_jobs=args.njobs, verbose=11)(
        delayed(imref)(toshift=target) for target in align)

    # Write ref to outdir
    refnew = os.path.join(args.odir, os.path.basename(ref))
    copy(ref, refnew)
    outfiles.append(refnew)
    print('Wrote %i files to %s' % (len(outfiles), args.odir))
def meta(self):
    '''Return the metadata table for all datasets, computing it once.

    ``self.meta_by_dataset`` is called for every id in
    ``self.dataset_ids`` (through joblib with one job per CPU when
    ``self.parallel`` is set) and is expected to return one dict per
    dataset. The dicts are merged through a ``ChainMap`` and turned into a
    DataFrame indexed by the merged keys, with the columns
    ``database``, ``download_url``, ``self.columns`` and
    ``variable names``. The frame is cached on ``self._meta``.
    '''
    if hasattr(self, '_meta'):
        return self._meta

    if self.parallel:
        # get metadata for datasets; run in parallel to save time
        num_cores = multiprocessing.cpu_count()
        downloads = Parallel(n_jobs=num_cores)(
            delayed(self.meta_by_dataset)(dataset_id)
            for dataset_id in self.dataset_ids)
    else:
        downloads = [
            self.meta_by_dataset(dataset_id)
            for dataset_id in self.dataset_ids
        ]

    # make dict from individual dicts
    from collections import ChainMap
    merged = dict(ChainMap(*downloads))

    # Make dataframe of metadata; variable names are the column names
    header = ['database', 'download_url'] + self.columns + ['variable names']
    self._meta = pd.DataFrame.from_dict(merged, orient='index', columns=header)
    return self._meta
def find_closest_auto(demofile, new_xyz):
    """Return the key of the demo segment with the lowest registration cost.

    The ``cloud_xyz`` entry of every segment in ``demofile`` is compared
    with ``new_xyz`` through ``registration_cost``; ``args.parallel``
    selects joblib (3 jobs) over a plain loop. With ``args.show_neighbors``
    the ``rgb`` images of up to five cheapest segments are shown in an
    OpenCV window until a key is pressed.
    """
    demo_clouds = [asarray(seg["cloud_xyz"]) for seg in demofile.values()]
    # Materialise the keys: they are indexed below, and a Python 3 keys
    # view does not support indexing.
    keys = list(demofile.keys())

    if args.parallel:
        from joblib import Parallel, delayed
        costs = Parallel(n_jobs=3, verbose=100)(
            delayed(registration_cost)(demo_cloud, new_xyz)
            for demo_cloud in demo_clouds)
    else:
        costs = []
        for (i, ds_cloud) in enumerate(demo_clouds):
            costs.append(registration_cost(ds_cloud, new_xyz))
            # Single-argument print(...) is valid on Python 2 and 3 alike.
            print("completed %i/%i" % (i + 1, len(demo_clouds)))

    print("costs\n%s" % (costs,))

    if args.show_neighbors:
        nshow = min(5, len(keys))
        import cv2, rapprentice.cv_plot_utils as cpu
        sortinds = np.argsort(costs)[:nshow]
        near_rgbs = [asarray(demofile[keys[i]]["rgb"]) for i in sortinds]
        bigimg = cpu.tile_images(near_rgbs, 1, nshow)
        cv2.imshow("neighbors", bigimg)
        print("press any key to continue")
        cv2.waitKey()

    ibest = np.argmin(costs)
    return keys[ibest]
def fit_discover(self, D, return_tids=False):
    """Fit LCM on the transactional database and return its closed itemsets.

    The itemsets are the closed itemsets of ``D`` with respect to the
    minimum support. Different from ``fit_transform``, see the `Returns`
    section below.

    Parameters
    ----------
    D : pd.Series or Iterable
        The input transactional database, where every entry contains
        singular items. Items must be both hashable and comparable.

    return_tids : bool
        Whether to return transaction ids along with each itemset.
        Defaults to False, in which case supports are returned instead.

    Returns
    -------
    pd.DataFrame
        DataFrame with the following columns

            ==========  =================================
            itemset     a `tuple` of co-occurring items
            support     frequency of this itemset
            ==========  =================================

        if `return_tids=True` then

            ==========  =================================
            itemset     a `tuple` of co-occurring items
            tids        a bitmap tracking positions
            ==========  =================================

    Example
    -------
    ::

        from skmine.preprocessing import LCM
        D = [[1, 2, 3, 4, 5, 6], [2, 3, 5], [2, 5]]
        LCM(min_supp=2).fit_discover(D)
               itemset  support
        0     (2, 5)        3
        1  (2, 3, 5)        2
        LCM(min_supp=2).fit_discover(D, return_tids=True)
               itemset       tids
        0     (2, 5)  [0, 1, 2]
        1  (2, 3, 5)     [0, 1]
    """
    self._fit(D)

    def _n_tids(entry):
        return len(entry[1])

    # Explore items from the largest tid set to the smallest.
    ranked_items = sorted(self.item_to_tids.items(), key=_n_tids, reverse=True)

    frames = Parallel(n_jobs=self.n_jobs, prefer='processes')(
        delayed(self._explore_item)(item, tids) for item, tids in ranked_items)

    # An empty frame guarantees there is something to concatenate.
    frames.append(pd.DataFrame(columns=['itemset', 'tids']))
    df = pd.concat(frames, axis=0, ignore_index=True)

    if return_tids:
        return df

    df.loc[:, 'support'] = df['tids'].map(len).astype(np.uint32)
    df.drop('tids', axis=1, inplace=True)
    return df
def calc_fitness(self, use_parallel=False):
    """Evaluate ``self.fitness`` on every entry of ``self.poolarray``.

    The results are stored, in pool order, on ``self.fitnessmap``. With
    ``use_parallel`` the calls go through joblib using the module-level
    ``num_cores`` and a tqdm progress bar.
    """
    if not use_parallel:
        self.fitnessmap = [self.fitness(member) for member in self.poolarray]
        return

    self.fitnessmap = Parallel(n_jobs=num_cores)(
        delayed(self.fitness)(member) for member in tqdm(self.poolarray))
def run_jobs(jobs, joblib=True, n_jobs=4, chunks=1, chunk_callback=None, *args, **kwargs):
    """Run zero-argument callables in chunks, optionally through joblib.

    ``jobs`` is cut into consecutive slices of
    ``max(1, len(jobs) // chunks)`` items. Each slice is executed (with
    ``Parallel(n_jobs, *args, **kwargs)`` when ``joblib`` is true,
    otherwise one call after another) and, if ``chunk_callback`` is given,
    it is called as ``chunk_callback(results, args=[...], kwargs=[...])``
    with each job's ``.args`` and ``.keywords``.

    Returns ``(None, None)`` for an empty ``jobs``. Otherwise returns a
    list holding the FIRST chunk's results and, when a callback is given,
    the first chunk's callback return value. Later chunks are executed but
    not returned.
    """
    if len(jobs) == 0:
        return None, None

    def _slices(items):
        size = max(1, len(items) // chunks)
        return [items[pos:pos + size] for pos in range(0, len(items), size)]

    out = []
    if joblib:
        # Wrap everything up front, then execute slice by slice.
        wrapped = [delayed(job)() for job in jobs]
        for batch in _slices(wrapped):
            batch_out = Parallel(n_jobs=n_jobs, *args, **kwargs)(batch)
            if chunk_callback is None:
                out.append((batch_out, ))
                continue
            # task[0] is the original callable inside the delayed tuple.
            ret = chunk_callback(batch_out,
                                 args=[task[0].args for task in batch],
                                 kwargs=[task[0].keywords for task in batch])
            out.append((batch_out, ret))
    else:
        nr_chunks = chunks
        for j, batch in enumerate(_slices(jobs)):
            batch_out = []
            for i, job in enumerate(batch):
                if 'verbose' in kwargs and kwargs['verbose']:
                    keyword_text = ', '.join(
                        [str(tup) for tup in job.keywords.items()])
                    print('\r\r Chunk %d / %d' % (j, nr_chunks) +
                          '\n Working on job %d/%d, ' % (i, len(batch)) +
                          '\n args: %s, \n kwargs: %s' %
                          (', '.join(job.args), keyword_text))
                batch_out.append(job())
            if chunk_callback is None:
                out.append((batch_out, ))
                continue
            ret = chunk_callback(batch_out,
                                 args=[job.args for job in batch],
                                 kwargs=[job.keywords for job in batch])
            out.append((batch_out, ret))

    return [column[0] for column in zip(*out)]
def prepare_data(self, midi_paths):
    """Turn MIDI files into grouped ``(x, y)`` word segments.

    When ``self.train_from_scratch`` is set, the event vocabulary is
    rebuilt first: every path is expanded by ``data_augmentation``, events
    are extracted from the expanded set, and the two lookup tables
    (``self.event2word`` / ``self.word2event``) are created and pickled to
    ``self.dictionary_path``.

    Then, for every path in ``midi_paths``, events are extracted and
    mapped to words. Unknown ``'Note Velocity'`` events are replaced with
    the word for ``'Note Velocity_21'``; other unknown events are reported
    on stdout and skipped. Words are cut into pairs of length
    ``self.x_len`` with ``y`` shifted by one position, and the pairs are
    grouped in blocks of ``self.group_size`` (set to 5 here).

    Returns the segments as a numpy array. In ``train_from_scratch`` mode
    ``self.n_token`` is set and ``self.load_model()`` is called before
    returning.
    """
    if self.train_from_scratch:
        n_workers = len(os.sched_getaffinity(0))
        augmented = Parallel(n_jobs=n_workers)(
            delayed(data_augmentation)(midi_paths[midi])
            for midi in tqdm.trange(len(midi_paths)))
        midis = [item for sublist in augmented for item in sublist]

        extracted = Parallel(n_jobs=len(os.sched_getaffinity(0)))(
            delayed(extract_events)(midis[path])
            for path in tqdm.trange(len(midis)))
        extracted = [events for events in extracted if events]
        total_events = [item for sublist in extracted for item in sublist]

        dictionary = list(set(total_events))
        self.word2event = dict(enumerate(dictionary))
        self.event2word = {event: word for word, event in enumerate(dictionary)}
        with open(self.dictionary_path, "wb") as handle:
            pickle.dump((self.event2word, self.word2event), handle)

    # extract events
    all_events = [extract_events(path, self.use_chord) for path in midi_paths]

    # event to word
    all_words = []
    for events in all_events:
        words = []
        for event in events:
            if event in self.event2word:
                words.append(self.event2word[event])
            elif event.name == 'Note Velocity':
                # OOV: replace with max velocity based on our training data
                words.append(self.event2word['Note Velocity_21'])
            else:
                # something is wrong
                # you should handle it for your own purpose
                print('something is wrong! {}'.format(event))
        all_words.append(words)

    # to training data
    self.group_size = 5
    segments = []
    for words in all_words:
        pairs = []
        for begin in range(0, len(words) - self.x_len - 1, self.x_len):
            x = words[begin:begin + self.x_len]
            y = words[begin + 1:begin + self.x_len + 1]
            pairs.append([x, y])
        pairs = np.array(pairs)
        # abandon the last
        for begin in np.arange(0, len(pairs) - self.group_size, self.group_size * 2):
            group = pairs[begin:begin + self.group_size]
            if len(group) == self.group_size:
                segments.append(group)
    segments = np.array(segments)

    if self.train_from_scratch:
        self.n_token = len(self.event2word)
        self.load_model()
    return segments
def main_get_data(parallel: bool = False, n_jobs: int = -2):
    """Get data from sources and export to output folder.

    Is equivalent to script `run_python_scripts.py`.

    Every module in the module-level ``modules_name`` is imported and its
    ``main()`` is run, unless the last dotted component of its name is in
    ``SCRAPING_SKIP_COUNTRIES``. Modules whose ``main()`` raised are
    retried once, sequentially; the names of those that fail again are
    printed at the end.

    Args:
        parallel: Run the first pass through joblib's threading backend.
        n_jobs: joblib ``n_jobs`` for the parallel first pass.
    """
    def _get_data_country(module_name):
        country = module_name.split(".")[-1]
        if country.lower() in SCRAPING_SKIP_COUNTRIES:
            logger.info(f"{module_name} skipped!")
            return {
                "module_name": module_name,
                "success": None,
                "skipped": True
            }
        logger.info(f"{module_name}: started")
        module = importlib.import_module(module_name)
        try:
            module.main()
        except Exception as err:
            success = False
            logger.error(f"{module_name}: {err}", exc_info=True)
        else:
            success = True
            logger.info(f"{module_name}: SUCCESS")
        return {
            "module_name": module_name,
            "success": success,
            "skipped": False
        }

    if parallel:
        modules_execution_results = Parallel(n_jobs=n_jobs, backend="threading")(
            delayed(_get_data_country)(module_name)
            for module_name in modules_name
        )
    else:
        modules_execution_results = [
            _get_data_country(module_name) for module_name in modules_name
        ]

    # `is False` on purpose: skipped modules report success=None and must
    # not be retried.
    modules_failed = [
        m["module_name"] for m in modules_execution_results if m["success"] is False
    ]

    # Retry failed modules
    print(f"\n---\n\nRETRIALS ({len(modules_failed)})")
    modules_failed_retrial = []
    for module_name in modules_failed:
        date_str = datetime.now().strftime("%Y-%m-%d %X")
        print(f">> {date_str} - {module_name} - (RETRIAL)")
        module = importlib.import_module(module_name)
        try:
            module.main()
        except Exception as err:
            # Record the NAME: the module object used to be stored, so the
            # report below printed "<module ... from ...>" reprs.
            modules_failed_retrial.append(module_name)
            logger.error(err, exc_info=True)
    print()

    if len(modules_failed_retrial) > 0:
        print(f"\n---\n\nThe following scripts failed to run ({len(modules_failed_retrial)}):")
        print("\n".join([f"* {m}" for m in modules_failed_retrial]))
def run(self, parallel=True, combine=True):
    """Run ``self.simulate`` ``self.n_sims`` times and store the outcome.

    With ``parallel`` the calls go through joblib using all CPUs
    (``n_jobs=-1``); otherwise they run one after another. The results are
    wrapped as ``Simulations(results, combine)`` on ``self.simulations``.

    Returns ``self`` so calls can be chained.
    """
    if parallel:
        outcomes = Parallel(n_jobs=-1)(
            delayed(self.simulate)() for _ in range(self.n_sims))
    else:
        outcomes = [self.simulate() for _ in range(self.n_sims)]

    self.simulations = Simulations(outcomes, combine)
    return self
def fit_Gaussian2D_wrapper(self, PSF_List, scale=5, internal_parallel_flag=False):
    """
    PSF localization using fit_Gaussian2D.

    Parameters
    ----------
    PSF_List: list or pandas dataframe
        PSF locations (x, y, frame, sigma). A list is converted with
        ``data_handeling.list2dataframe``; a data frame is used as is.

    scale: int
        The ROI around PSFs is defined using this scale, which is based on
        their sigmas.

    internal_parallel_flag: bool
        Internal flag for activating parallel computation. Default is
        False. Parallel fitting is only used when ``self.cpu.parallel_active``
        is also true.

    Returns
    -------
    df: pandas dataframe or None
        The data frame contains PSFs locations ('y', 'x', 'frame',
        'center_intensity', 'sigma', 'Sigma_ratio') and fitting information
        ('Fit_Amplitude', 'Fit_X-Center', 'Fit_Y-Center', 'Fit_X-Sigma',
        'Fit_Y-Sigma', 'Fit_Bias', 'Fit_errors_Amplitude',
        'Fit_errors_X-Center', 'Fit_errors_Y-Center', 'Fit_errors_X-Sigma',
        'Fit_errors_Y-Sigma', 'Fit_errors_Bias'). ``None`` when there is
        nothing to fit.

    Raises
    ------
    ValueError
        If ``PSF_List`` is neither a list nor a data frame.
    """
    # isinstance instead of an exact type match, so subclasses of list and
    # DataFrame are accepted as well.
    if isinstance(PSF_List, list):
        df_PSF = data_handeling.list2dataframe(feature_position=PSF_List, video=self.video)
    elif isinstance(PSF_List, pd.core.frame.DataFrame):
        df_PSF = PSF_List
    else:
        raise ValueError('PSF_List does not have correct bin_type')

    self.df2numpy = df_PSF.to_numpy()
    n_psf = self.df2numpy.shape[0]

    if self.cpu.parallel_active and internal_parallel_flag:
        print('\n---Fitting 2D gaussian with parallel loop---')
        list_df = Parallel(n_jobs=self.cpu.n_jobs, backend=self.cpu.backend,
                           verbose=self.cpu.verbose)(
            delayed(self.fit_2D_gussian_kernel)(i_, scale)
            for i_ in tqdm(range(n_psf)))
    else:
        print('\n---Fitting 2D gaussian without parallel loop---')
        list_df = [
            self.fit_2D_gussian_kernel(i_, scale) for i_ in tqdm(range(n_psf))
        ]

    df2numpy = np.asarray(list_df)
    if df2numpy.shape[0] == 0:
        return None

    return pd.DataFrame(
        data=df2numpy,
        columns=['y', 'x', 'frame', 'center_intensity', 'sigma', 'Sigma_ratio',
                 'Fit_Amplitude', 'Fit_X-Center', 'Fit_Y-Center',
                 'Fit_X-Sigma', 'Fit_Y-Sigma', 'Fit_Bias',
                 'Fit_errors_Amplitude', 'Fit_errors_X-Center',
                 'Fit_errors_Y-Center', 'Fit_errors_X-Sigma',
                 'Fit_errors_Y-Sigma', 'Fit_errors_Bias'])
def custom_query_validation(query, request, request_page):
    """Search product details for the query words and render one result page.

    The distinct space-separated words of ``query`` are OR-ed into a
    case-insensitive ``details`` filter over the products available to
    ``request.user``. Per-word match counts are accumulated in the global
    ``query_appendix``, the match count is stored in the global ``total``,
    and ``check_similarity`` is run on every match (shared-memory joblib)
    to fill the global ``product_appendix``, which is then sorted by
    descending ``similarity`` and reduced to primary keys.

    Returns a list laid out for pagination: the integers
    ``0 .. start-1`` as placeholders, the rendered items of the requested
    page, then the primary keys of the remaining products. Returns ``[]``
    when nothing matches.
    """
    global query_appendix
    global total
    global product_appendix

    if product_appendix:
        product_appendix = []

    available = products_with_details(request.user)
    words = list(set(query.split(' ')))
    queryset = Q()
    for word in words:
        query_appendix[word] = 0
        queryset = queryset | Q(details__icontains=word)

    product_details = CleanProductDetails.objects.filter(
        product_id__in=available).filter(queryset)
    if not product_details:
        return []

    for word in query_appendix:
        for item in product_details:
            if word in item.details:
                query_appendix[word] += 1
    total = len(product_details)

    n_jobs = psutil.cpu_count() * 2
    Parallel(n_jobs=n_jobs, verbose=50, require='sharedmem')(
        map(delayed(check_similarity), product_details))
    print('Job Done')

    ranked = sorted(product_appendix, key=itemgetter('similarity'), reverse=True)
    product_appendix = [entry['id'].pk for entry in ranked]
    print('Sorted')

    start = (settings.PAGINATE_BY * (request_page - 1))
    end = start + (settings.PAGINATE_BY)
    page_ids = product_appendix[start:end]

    rendered = Parallel(
        n_jobs=psutil.cpu_count() * 2,
        verbose=50,
        require='sharedmem',
        backend="threading")(
            delayed(render_item)(Product.objects.get(id=product_id),
                                 request.discounts, request.currency)
            for product_id in page_ids)

    # Placeholders keep the list positions aligned with the page offsets.
    results = list(range(start)) + rendered
    results.extend(product_appendix[end:])
    return results
def _load_corpus(self, **kwargs):
    """
    Generic loader for corpus or contents

    Keyword arguments are forwarded to ``file.load`` / ``multi.load``, except
    ``multiprocess`` (popped here; defaults to ``self.is_parsed``) which is
    passed through ``multi.how_many`` to get the number of workers.

    Returns a ``{path: text}`` dict for unparsed corpora, otherwise a
    ``Dataset`` built from the concatenation of every loaded file.
    """
    from .corpus import Corpus
    from .dataset import Dataset
    from . import multi

    # current favourite line in buzz codebase :P
    multiprocess = multi.how_many(kwargs.pop("multiprocess", self.is_parsed))

    to_iter = self.files if isinstance(self, Corpus) else self
    order = {f.path: i for i, f in enumerate(to_iter, start=1)}

    # i would love to only ever use joblib, and therefore just use the first
    # part of these conditionals, but django and joblib don't play nice.
    if multiprocess and multiprocess > 1:
        chunks = np.array_split(to_iter, multiprocess)
        if self.is_parsed:
            delay = (multi.load(x, i, order=order, **kwargs) for i, x in enumerate(chunks))
        else:
            delay = (multi.read(x, i) for i, x in enumerate(chunks))
        loaded = Parallel(n_jobs=multiprocess)(delay)
        # unpack the nested list that multiprocessing creates
        loaded = [item for sublist in loaded for item in sublist]
    else:
        kwa = dict(ncols=120, unit="file", desc="Loading", total=len(self))
        t = tqdm(**kwa) if len(to_iter) > 1 else None
        loaded = list()
        for i, file in enumerate(to_iter, start=1):
            data = file.load(**kwargs) if file.is_parsed else file.read()
            if data is not None:
                # NOTE(review): assumes ``data`` exposes ``.columns`` even for
                # unparsed files -- confirm what ``file.read()`` returns.
                if "order" not in data.columns:
                    data["order"] = i
                loaded.append(data)
            _tqdm_update(t)
        _tqdm_close(t)

    # for unparsed corpora, we give a dict of {path: text}
    # this used to be an OrderedDict, but dict order is now guaranteed.
    if not self.is_parsed:
        # Inside this branch ``self.is_parsed`` is always falsy, so the paths
        # can only come from ``self.files``.
        keys = [i.path for i in self.files]
        return dict(sorted(zip(keys, loaded)))

    # for parsed corpora, we merge each file contents into one huge dataframe
    df = pd.concat(loaded, sort=False)
    df["_n"] = range(len(df))
    if kwargs.get("set_data_types", True):
        df = _set_best_data_types(df)
    df = _order_df_columns(df)
    print("\n" * multiprocess)  # not sure if this really helps
    return Dataset(df, reference=df, name=self.name)
def extract(self, plot=True):
    """Extract, equalize, normalize and save one instance batch per coordinate.

    One job is built for every row of ``self.coords`` whose ``coordZ`` is not
    null. Jobs run through ``self._processJob`` either with joblib (when
    ``self.parallelize``) or sequentially. In the sequential path a failing
    sample is reported and skipped. The collected instances are histogram
    equalized, mean-centred and scaled by the value range, then written to
    ``self.dst_path``. Equalization and normalization parameters are stored
    under ``self.norm_save_dir``.

    :param plot: when true, ``self.plot_sample`` is called on the final array.
    """
    # Prep jobs (one per coordinate)
    print("preparing jobs...")
    J = []  # jobs
    for _, sample in self.coords.iterrows():
        coord = np.array([sample.coordZ, sample.coordY, sample.coordX])
        if not pd.isnull(sample.coordZ):
            # job: (path to scan, coordinate, instance shape, coord system 'vox' or 'world')
            J.append([
                os.path.join(self.src_dir, sample.seriesuid + '.mhd'), coord,
                config['cube_shape'], self.coordSystem
            ])

    print("extracting and augmenting samples...")
    if self.parallelize:
        num_cores = int(
            np.ceil(
                min(np.ceil(multiprocessing.cpu_count() * 0.75), len(J))))
        X = Parallel(n_jobs=num_cores)(delayed(self._processJob)(j)
                                       for j in J)
    else:
        X = []
        for job in J:
            try:
                X.append(self._processJob(job))  # this step can raise
            except Exception as exc:
                # Best effort: skip the sample, but say why it failed and let
                # KeyboardInterrupt / SystemExit propagate.
                print("Failed to process sample:", exc)

    # each job creates a batch of augmented instances: so collect them
    instances = np.array(list(itertools.chain.from_iterable(X)))
    print('instance_shape:', instances.shape)

    # Histogram Equalization:
    print("equalizing the data...")
    eq = histEq(instances)
    instances = eq.equalize(instances)
    os.makedirs(self.norm_save_dir, exist_ok=True)
    eq.save(path=os.path.join(self.norm_save_dir, 'equalization.pkl'))

    # -1 1 Normalization
    print("normalizing the data...")
    min_v = np.min(instances)
    max_v = np.max(instances)
    mean_v = np.mean(instances)
    norm_data = np.array([mean_v, min_v, max_v])
    instances = (instances - mean_v) / (max_v - min_v)
    np.save(os.path.join(self.norm_save_dir, 'normalization.npy'),
            norm_data)

    if plot:
        self.plot_sample(instances)

    print("saving the dataset")
    np.save(self.dst_path, instances)
def preprocess_from_ray_parallel_inference(dirpath, mode, use_parallel=True):
    """Run ``process_audio_files_inference`` on every entry of ``dirpath/mode``.

    :param dirpath: root directory of the data set.
    :param mode: sub-directory of ``dirpath`` whose entries are processed.
    :param use_parallel: use one joblib worker per CPU when true, otherwise
        process the entries one after another.
    :return: list with one result per directory entry, in ``os.listdir`` order.
    """
    filenames = os.listdir(os.path.join(dirpath, mode))

    if not use_parallel:
        return [process_audio_files_inference(name, dirpath, mode)
                for name in tqdm(filenames)]

    num_cores = multiprocessing.cpu_count()
    job = delayed(process_audio_files_inference)
    return Parallel(n_jobs=num_cores)(
        job(name, dirpath, mode) for name in tqdm(filenames))
def basic_compute_loop(compute_function,looper,run_parallel=True,debug=False):
    """Canonical form of the basic compute loop.

    Calls ``compute_function(**looper[ll])`` for every index ``ll`` produced by
    ``framelooper(len(looper))`` and returns the results as a list.

    :param compute_function: callable applied to each entry of ``looper``.
    :param looper: sequence of keyword-argument dicts, one per call.
    :param run_parallel: dispatch the calls through joblib (8 workers) when
        true, otherwise run them sequentially.
    :param debug: raise joblib's verbosity to 10 in the parallel path.
    """
    start = time.time()
    if run_parallel:
        # ``delayed`` takes only the function. The second positional argument
        # used here previously was the legacy ``check_pickle`` flag, which
        # current joblib releases no longer accept.
        incoming = Parallel(n_jobs=8,verbose=10 if debug else 0)(
            delayed(compute_function)(**looper[ll])
            for ll in framelooper(len(looper),start=start))
    else:
        incoming = []
        for ll in framelooper(len(looper)):
            incoming.append(compute_function(**looper[ll]))
    return incoming
def auto_choose(actionfile, new_xyz, softmin_k = 1, softmin_alpha = 1, nparallel=-1):
    """
    @param actionfile    : h5py.File object (any mapping with ``items()`` whose
                           values expose a 'cloud_xyz' entry)
    @param new_xyz       : new rope point-cloud
    @param softmin_k     : use softmin distribution over the first <softmin_k>
                           lowest-cost demonstrations;
                           set to 1 (or less) for nearest neighbor
    @param softmin_alpha : scale applied to the costs inside the softmin,
                           i.e. weights are exp(-softmin_alpha * cost)
    @param nparallel     : number of parallel jobs to run for tps cost calculation (capped at 8);
                           set to -1 for no parallelization

    @return              : the name of the chosen segment: the one with the
                           lowest warping cost when softmin_k <= 1, otherwise a
                           random draw from the softmin distribution.
    """
    # Materialise once: the pairs are indexed below and dict views are not
    # subscriptable on Python 3.
    demo_data = list(actionfile.items())

    if nparallel != -1:
        from joblib import Parallel, delayed
        nparallel = min(nparallel, 8)
        before = time.time()
        redprint("auto choose parallel with njobs = %d"%nparallel)
        costs = Parallel(n_jobs=nparallel, verbose=100)(delayed(registration_cost)(ddata[1]['cloud_xyz'][:], new_xyz) for ddata in demo_data)
        after = time.time()
        print("Parallel registration time in seconds = %s" % (after - before))
    else:
        costs = []
        redprint("auto choose sequential..")
        for i, ddata in enumerate(demo_data):
            costs.append(registration_cost(ddata[1]['cloud_xyz'][:], new_xyz))
            print(("tps-cost completed %i/%i" % (i + 1, len(demo_data))))

    if softmin_k <= 1:
        ibest = np.argmin(costs)
        return demo_data[ibest][0]

    # use a random draw from the softmin distribution over the k cheapest demos
    costs = np.asarray(costs, dtype=float)
    best_k = np.argsort(costs)[:softmin_k]
    # multiply by -1 b/c we're actually min-ing; subtracting the smallest cost
    # leaves the normalised weights unchanged and avoids underflow.
    best_k_exps = np.exp(-1 * softmin_alpha * (costs[best_k] - costs[best_k[0]]))
    mass_fn = best_k_exps / best_k_exps.sum()

    draw = random.random()
    # Rounding can leave ``draw`` marginally above the accumulated mass; fall
    # back to the last candidate instead of leaving the result unbound.
    ret_val = demo_data[best_k[-1]][0]
    for idx, mass in zip(best_k, mass_fn):
        if draw <= mass:
            ret_val = demo_data[idx][0]
            break
        draw -= mass

    redprint ("auto choose returning..")
    return ret_val
def train(self):
    """Fit one model per bin of ``self.OD.bins`` and store it in ``self.svr``.

    Every bin ``b`` is trained with ``trainBin`` on ``self.params[b]``, the
    column form of ``self.ind``, ``self.dep[b]`` and ``self.indWeights``.
    With ``self.parallel`` set the fits are dispatched through joblib on all
    cores, otherwise they run one after another.
    """
    if self.parallel:
        fit = delayed(trainBin)
        regressors = Parallel(n_jobs=-1)(
            fit(self.params[b], np.atleast_2d(self.ind).T,
                self.dep[b], self.indWeights)
            for b in self.OD.bins)
    else:
        regressors = [
            trainBin(self.params[b], np.atleast_2d(self.ind).T,
                     self.dep[b], self.indWeights)
            for b in self.OD.bins
        ]

    # Results come back in bin order, so position i belongs to bins[i].
    for position, fitted in enumerate(regressors):
        self.svr[self.OD.bins[position]] = fitted
def run_all(cnf, samples, process_one, finalize_one, finalize_all):
    """Process every sample, summarise the results and clean up work dirs.

    A single sample is run directly through ``run_one``. Several samples are
    run with ``multiple_samples=True``, in parallel through joblib when
    ``cnf['parallel']`` is set, and ``finalize_all`` is then called with the
    collected results. Afterwards each sample's ``tx`` directory is removed,
    and its work directory too unless ``keep_intermediate`` is set.

    :param cnf: run configuration (only ``parallel`` is read here).
    :param samples: mapping of sample name -> sample configuration dict.
    :param process_one: callback forwarded to ``run_one``.
    :param finalize_one: callback forwarded to ``run_one``.
    :param finalize_all: called as ``finalize_all(cnf, samples, results)``.
    """
    if len(samples) == 1:
        # ``dict.items()`` is a view on Python 3 and cannot be indexed.
        sample_name, sample_cnf = next(iter(samples.items()))
        run_one(sample_cnf, process_one, finalize_one)
    else:
        results = []
        if cnf.get('parallel'):
            try:
                from joblib import Parallel, delayed
            except ImportError:
                critical(
                    '\nERROR: Joblib not found. You may want samples to be processed '
                    'in parallel, in this case, make sure python joblib intalled. '
                    '(pip install joblib).')
            else:
                for sample_name, sample_cnf in samples.items():
                    sample_cnf['verbose'] = False

                # joblib rejects n_jobs == 0, which is what an empty sample
                # set would otherwise request.
                results = Parallel(n_jobs=max(1, len(samples))) \
                    (delayed(run_one)(sample_cnf, process_one, finalize_one,
                                      multiple_samples=True)
                     for sample_name, sample_cnf in samples.items())
        else:
            for sample_name, sample_cnf in samples.items():
                results.append(
                    run_one(sample_cnf, process_one, finalize_one,
                            multiple_samples=True))

        if samples:
            info('')
            info('*' * 70)
            info('Results for each sample:')
            finalize_all(cnf, samples, results)

    # Cleaning
    for name, data in samples.items():
        work_dirpath = data['work_dir']
        tx_dirpath = join(work_dirpath, 'tx')

        if isdir(tx_dirpath):
            shutil.rmtree(tx_dirpath)

        if not data.get('keep_intermediate') \
                and isdir(work_dirpath):
            shutil.rmtree(work_dirpath)
def auto_choose(demofile, new_xyz, only_original_segments):
    """Return the segment with the lowest warping cost. Takes about 2 seconds.

    @param demofile: mapping of segment name -> segment data (iterated with ``items()``)
    @param new_xyz: new point cloud; it is downsampled before registration
    @param only_original_segments: if true, then only the original_segments will be registered with
                                   (segments that carry a "derived" key are skipped)
    @return: key of the segment whose downsampled cloud has the lowest registration cost
    """
    parallel = True
    if parallel:
        from joblib import Parallel, delayed

    items = demofile.items()
    if only_original_segments:
        # remove all derived segments from items
        print("Only registering with the original segments")
        items = [item for item in items if "derived" not in item[1].keys()]

    # ``zip`` is lazy on Python 3, so materialise it before indexing.
    unzipped_items = list(zip(*items))
    keys = unzipped_items[0]
    values = unzipped_items[1]

    ds_clouds, shapes = get_downsampled_clouds(values)
    ds_new = clouds.downsample(new_xyz, 0.01 * DS_SIZE)

    if parallel:
        before = time.time()
        # TODO: change back n_jobs=12 ?
        costs = Parallel(n_jobs=8, verbose=0)(delayed(registration_cost)(ds_cloud, ds_new) for ds_cloud in ds_clouds)
        after = time.time()
        print("Parallel registration time in seconds = %s" % (after - before))
    else:
        costs = []
        for (i, ds_cloud) in enumerate(ds_clouds):
            costs.append(registration_cost(ds_cloud, ds_new))
            print(("completed %i/%i" % (i + 1, len(ds_clouds))))

    ibest = np.argmin(costs)
    # The doubled spaces reproduce what the former ``print "x = ", y``
    # statements emitted.
    print("ibest =  %s" % (ibest,))
    print("best key =  %s" % (keys[ibest],))
    print("best cost =  %s" % (costs[ibest],))

    return keys[ibest]
def find_TADs(self, data, gammalist=range(10, 110, 10), segmentation='potts',
              minlen=3, drop_gamma=False, n_jobs='auto'):
    '''
    Finds TADs in data with a list of gammas. Returns a pandas DataFrame
    with columns 'Start', 'End' and 'Gamma'. Use genome_intervals_to_chr on
    the returned object to get coordinates in bed-style format and not in
    coordinates of concatenated genome.
    If *drop_gamma*, drops the 'Gamma' column (useful when using 1 gamma)

    NOTE: this method is disabled. It raises DeprecationWarning
    unconditionally, so none of the code after the ``raise`` is executed.
    '''
    raise DeprecationWarning('Will be deprecated or rewritten to use '
                             'lavaburst: github.com/nezar-compbio/lavaburst')
    # ------------------------------------------------------------------
    # Unreachable legacy implementation, kept for reference. It still has to
    # parse, so Python-2-only syntax has been removed.
    # ------------------------------------------------------------------
    if n_jobs == 'auto':  # Empirical values on my computer; with >8 Gb memory try increasing n_jobs
        if segmentation == 'potts':
            n_jobs = 3
        elif segmentation == 'armatus':
            n_jobs = 6
    # Parenthesised so that the mask is negated, not the result of ``any()``.
    if (~np.isfinite(data)).any():
        print('Non-finite values in data, substituting them with zeroes')
        data[~np.isfinite(data)] = 0
    Wcomm, Wnull, pass_mask, length = _precalculate_TADs_in_array(data)
    f = _calculate_TADs
    if n_jobs >= 1:
        from joblib import Parallel, delayed
        domains = Parallel(n_jobs=n_jobs, max_nbytes=1e6)(
            delayed(f)(Wcomm, Wnull, pass_mask, length, g, segmentation)
            for g in gammalist)
    elif n_jobs is None or n_jobs == False or n_jobs == 0:
        domains = []
        for g in gammalist:
            domains_g = f(Wcomm, Wnull, pass_mask, length, g, segmentation)
            domains.append(domains_g)
    domains = pd.concat(domains, ignore_index=True)
    domains = domains.query('End-Start>='+str(minlen)).copy()
    # NOTE(review): ``DataFrame.sort`` only exists in old pandas releases;
    # ``sort_values(by=...)`` is the replacement if this is ever revived.
    domains = domains.sort(columns=['Gamma', 'Start', 'End'])
    domains.reset_index(drop=True, inplace=True)
    domains[['Start', 'End']] = domains[['Start', 'End']].astype(int)
    domains[['Start', 'End']] *= self.resolution
    domains = domains[['Start', 'End', 'Score', 'Gamma']]
    if drop_gamma:
        domains.drop('Gamma', axis=1, inplace=True)
    domains = self.genome_intervals_to_chr(domains).reset_index(drop=True)
    return domains
def main():
    """
    Main function.

    1. Setup logging
    2. Get arguments
    3. Get index
    4. Process files (through joblib when ``args.is_parallel`` is set)
    5. Write output
    """
    setup_logging()
    logger = logging.getLogger("stats." + __name__)

    args = get_args()
    index = get_index(args)

    logger.warning("Positions not in annotation will be ignored.")
    logger.info("Found {} input file(s):".format(len(args.inputs)))
    for path in sorted(args.inputs):
        logger.debug(path)

    if args.is_parallel:
        pool = Parallel(n_jobs=args.parallel, verbose=100, batch_size=1)
        stats = pool(
            delayed(process_file)(path, args.type, index, args.is_parallel)
            for path in args.inputs)
    else:
        stats = [process_file(path, args.type, index, args.is_parallel)
                 for path in args.inputs]

    write_stats(args.out, stats)
def findPeaks(imgdict, maplist, params, maptype="ccmaxmap", pikfile=True):
    """Find peaks in every map of ``maplist`` and merge them into one peak tree.

    ``runFindPeaks`` is called once per map index, in parallel through joblib
    (``params['nproc']`` workers) on numpy releases newer than 1.7 and
    sequentially otherwise. The per-map results are merged with
    ``mergePeakTrees`` and, when ``params['maxthresh']`` is not None, filtered
    with ``maxThreshPeaks``.

    Keys read from ``params``: thresh, bin, diam, apix, overlapmult, maxpeaks,
    maxthresh, maxsize, peaktype, background and (parallel path only) nproc.

    Returns the merged (and optionally thresholded) peak tree.
    """
    import re

    thresh = float(params["thresh"])
    bin = int(params["bin"])
    diam = float(params["diam"])
    apix = float(params["apix"])
    olapmult = float(params["overlapmult"])
    maxpeaks = int(params["maxpeaks"])
    maxthresh = params["maxthresh"]
    maxsizemult = float(params["maxsize"])
    peaktype = params["peaktype"]
    msg = not params['background']
    pixdiam = diam/apix/float(bin)
    pixrad = diam/apix/2.0/float(bin)

    # Compare (major, minor) as integers: taking the first three characters of
    # the version string reads "1.10" .. "1.26" as 1.1 / 1.2 and would wrongly
    # select the sequential fallback on those releases.
    version_match = re.match(r"(\d+)\.(\d+)", numpy.version.version)
    if version_match:
        numpyVersion = tuple(int(part) for part in version_match.groups())
    else:
        numpyVersion = (0, 0)

    if numpyVersion > (1, 7):
        peaktreelist = Parallel(n_jobs=params['nproc'])(
            delayed(runFindPeaks)(params,
                maplist,maptype,pikfile,thresh,pixdiam,count,olapmult,maxpeaks,maxsizemult,
                msg,bin,peaktype,pixrad,imgdict)
            for count in range(0,len(maplist)))
    else:
        ## backup for AttributeError: 'memmap' object has no attribute 'offset', bug #3322
        peaktreelist = []
        for count in range(0,len(maplist)):
            mappeaktree = runFindPeaks(params,maplist,maptype,pikfile,thresh,pixdiam,count,olapmult,
                maxpeaks,maxsizemult,msg,bin,peaktype,pixrad,imgdict)
            peaktreelist.append(mappeaktree)

    peaktree = mergePeakTrees(imgdict, peaktreelist, params, msg, pikfile)

    #max threshold
    if maxthresh is not None:
        precount = len(peaktree)
        peaktree = maxThreshPeaks(peaktree, maxthresh)
        postcount = len(peaktree)
        #if precount != postcount:
        #    apDisplay.printMsg("Filtered %d particles above threshold %.2f"%(precount-postcount,maxthresh))

    return peaktree
def pmultiquery(corpus,
                search,
                show='words',
                query='any',
                sort_by='total',
                save=False,
                multiprocess='default',
                just_speakers=False,
                root=False,
                note=False,
                print_info=True,
                **kwargs
               ):
    """
    Parallel process multiple queries or corpora.

    This function is used by corpkit.interrogator.interrogator()
    for multiprocessing.
    There's no reason to call this function yourself.

    Exactly one "iterable" is chosen, in this order: several corpora
    (``corpus`` is iterable), several queries (``query`` is a list/dict),
    several searches (``search`` is a mapping of dicts), or several speakers
    (``just_speakers``). One keyword dict per item is built and passed to
    ``interrogator``, through joblib when ``multiprocess`` is truthy and
    ``root`` is falsy, otherwise in a plain loop.

    Returns a ``Concordance`` when ``conc='only'``, an ``Interrodict`` when the
    individual results are not all Series, otherwise the merged and edited
    interrogation. Returns ``None`` when no speaker names are found or when
    the results cannot be concatenated.

    Raises ``ValueError`` if ``save`` is ``True`` (a name string is required).
    """
    import os
    from pandas import DataFrame, Series
    import pandas as pd
    import collections
    from collections import namedtuple, OrderedDict
    from time import strftime, localtime
    import corpkit
    from corpkit.interrogator import interrogator
    from corpkit.interrogation import Interrogation
    try:
        from joblib import Parallel, delayed
    except ImportError:
        pass
    import multiprocessing

    # Snapshot of the arguments (plus kwargs); stored later on the result as
    # its ``query`` attribute.
    locs = locals()
    for k, v in kwargs.items():
        locs[k] = v
    in_notebook = locs.get('in_notebook')

    def best_num_parallel(num_cores, num_queries):
        """decide how many parallel processes to run

        the idea, more or less, is to balance the load when possible"""
        import corpkit
        if num_queries <= num_cores:
            return num_queries
        if num_queries > num_cores:
            if (num_queries / num_cores) == num_cores:
                return int(num_cores)
            if num_queries % num_cores == 0:
                try:
                    return max([int(num_queries / n) for n in range(2, num_cores) \
                               if int(num_queries / n) <= num_cores])
                except ValueError:
                    return num_cores
            else:
                import math
                if (float(math.sqrt(num_queries))).is_integer():
                    square_root = math.sqrt(num_queries)
                    if square_root <= num_queries / num_cores:
                        return int(square_root)
        return num_cores

    num_cores = multiprocessing.cpu_count()

    # what is our iterable? ...
    multiple_option = False
    multiple_queries = False
    multiple_speakers = False
    multiple_corpora = False
    multiple_search = False
    mult_corp_are_subs = False
    denom = 1

    if hasattr(corpus, '__iter__'):
        multiple_corpora = True
        num_cores = best_num_parallel(num_cores, len(corpus))
        denom = len(corpus)
        if all(c.__class__ == corpkit.corpus.Subcorpus for c in corpus):
            mult_corp_are_subs = True
    elif (isinstance(query, (list, dict)) and not hasattr(search, '__iter__')):
        multiple_queries = True
        num_cores = best_num_parallel(num_cores, len(query))
        denom = len(query)
    elif hasattr(search, '__iter__') and all(isinstance(i, dict) for i in list(search.values())):
        multiple_search = True
        num_cores = best_num_parallel(num_cores, len(list(search.keys())))
        denom = len(list(search.keys()))
    elif just_speakers:
        from build import get_speaker_names_from_xml_corpus
        multiple_speakers = True
        if just_speakers == 'each' or just_speakers == ['each']:
            just_speakers = get_speaker_names_from_xml_corpus(corpus.path)
        if len(just_speakers) == 0:
            print('No speaker name data found.')
            return
        num_cores = best_num_parallel(num_cores, len(just_speakers))
        denom = len(just_speakers)

    if multiple_corpora and any(x is True for x in [multiple_speakers, multiple_queries,
                                                    multiple_search, multiple_option]):
        from corpkit.corpus import Corpus, Corpora
        if isinstance(corpus, Corpora):
            multiprocess = False
        else:
            corpus = Corpus(corpus)

    # An explicit integer overrides the balanced worker count chosen above.
    if isinstance(multiprocess, int):
        num_cores = multiprocess
    if multiprocess is False:
        num_cores = 1

    # make sure saves are right type
    if save is True:
        raise ValueError('save must be string when multiprocessing.')

    # the options that don't change
    d = {'function': 'interrogator',
         'root': root,
         'note': note,
         'denominator': denom}

    # add kwargs to query
    for k, v in list(kwargs.items()):
        d[k] = v

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    ds = []
    if multiple_corpora:
        for index, p in enumerate(corpus):
            name = p.name
            a_dict = dict(d)
            a_dict['corpus'] = p
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name.replace('-parsed', '')
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_queries:
        for index, (name, q) in enumerate(query.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = q
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_speakers:
        for index, name in enumerate(just_speakers):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = [name]
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_search:
        for index, (name, val) in enumerate(search.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = val
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)

    # NOTE(review): when 'conc' is absent from kwargs, ``kwargs.get('conc')`` is
    # None and the ``.lower()`` call in the last branch raises AttributeError.
    if kwargs.get('conc') is False:
        message = 'Interrogating'
    elif kwargs.get('conc') is True:
        message = 'Interrogating and concordancing'
    elif kwargs.get('conc').lower() == 'only':
        message = 'Concordancing'

    time = strftime("%H:%M:%S", localtime())
    # Build a short, human-readable rendering of the search/query for the banner.
    sformat = ''
    if multiple_queries:
        to_it_over = query
    else:
        to_it_over = search
    for i, (k, v) in enumerate(list(to_it_over.items())):
        if isinstance(v, list):
            vformat = ', '.join(v[:5])
            if len(v) > 5:
                vformat += ' ...'
        elif isinstance(v, dict):
            vformat = ''
            for kk, vv in v.items():
                if isinstance(vv, list):
                    vv = ', '.join(vv[:5])
                vformat += '\n %s: %s' % (kk, vv)
                if len(vv) > 5:
                    vformat += ' ...'
        else:
            try:
                vformat = v.pattern
            except AttributeError:
                vformat = v
        sformat += '%s: %s' %(k, vformat)
        if i < len(to_it_over.keys()) - 1:
            sformat += '\n '

    if print_info:
        # proper printing for plurals
        # in truth this needs to be revised, it's horrible.
        if num_cores == 1:
            add_es = ''
        else:
            add_es = 'es'
        if multiple_corpora and not multiple_option:
            corplist = "\n ".join([i.name for i in corpus[:20]])
            if len(corpus) > 20:
                corplist += '\n ... and %d more ...\n' % (len(corpus) - 20)
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s):\n %s" \
                   "\n Query: %s\n %s corpus ... \n" % (time, len(corpus), num_cores, add_es, corplist, sformat, message)))
        elif multiple_queries:
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s" \
                   "\n Queries: %s\n %s corpus ... \n" % (time, len(query), num_cores, add_es, corpus.name, sformat, message) ))
        elif multiple_search:
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s" \
                   "\n Queries: %s\n %s corpus ... \n" % (time, len(list(search.keys())), num_cores, add_es, corpus.name, sformat, message)))
        elif multiple_option:
            print(("\n%s: Beginning %d parallel corpus interrogation%s (multiple options): %s" \
                   "\n Query: %s\n %s corpus ... \n" % (time, num_cores, add_es.lstrip('e'), corpus.name, sformat, message) ))
        elif multiple_speakers:
            print(("\n%s: Beginning %d parallel corpus interrogation%s: %s" \
                   "\n Query: %s\n %s corpus ... \n" % (time, num_cores, add_es.lstrip('e'), corpus.name, sformat, message) ))

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    #import sys
    #reload(sys)
    #stdout=sys.stdout
    failed = False
    terminal = False
    used_joblib = False
    #ds = ds[::-1]
    if not root and print_info:
        from blessings import Terminal
        terminal = Terminal()
        print('\n' * (len(ds) - 2))
        for dobj in ds:
            linenum = dobj['paralleling']
            # this try handles nosetest problems in sublime text
            try:
                with terminal.location(0, terminal.height - (linenum + 1)):
                    # this is a really bad idea.
                    thetime = strftime("%H:%M:%S", localtime())
                    num_spaces = 26 - len(dobj['outname'])
                    print('%s: QUEUED: %s' % (thetime, dobj['outname']))
            except:
                pass

    if not root and multiprocess:
        #res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
        try:
            #ds = sorted(ds, key=lambda k: k['paralleling'], reverse = True)
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
            used_joblib = True
        except:
            failed = True
            print('Multiprocessing failed.')
            raise
        if not res:
            failed = True
    else:
        # Sequential fallback: each job is given its starting progress value.
        res = []
        for index, d in enumerate(ds):
            d['startnum'] = (100 / denom) * index
            res.append(interrogator(**d))
        try:
            res = sorted([i for i in res if i])
        except:
            pass

    # remove unpicklable bits from query
    from types import ModuleType, FunctionType, BuiltinMethodType, BuiltinFunctionType
    badtypes = (ModuleType, FunctionType, BuiltinFunctionType, BuiltinMethodType)
    qlocs = {k: v for k, v in locs.items() if not isinstance(v, badtypes)}

    if hasattr(qlocs['corpus'], 'name'):
        qlocs['corpus'] = qlocs['corpus'].path
    else:
        qlocs['corpus'] = list([i.path for i in qlocs['corpus']])

    from corpkit.interrogation import Concordance
    if kwargs.get('conc') == 'only':
        concs = pd.concat([x for x in res])
        thetime = strftime("%H:%M:%S", localtime())
        concs = concs.reset_index(drop=True)
        lines = Concordance(concs)
        if save:
            lines.save(save, print_info=print_info)

        if print_info:
            print('\n\n%s: Finished! %d results.\n\n' % (thetime, len(concs.index)))

        return lines

    # Results that are not all Series cannot be merged into one table, so they
    # are returned keyed by output name instead.
    if not all(isinstance(i.results, Series) for i in res):
        out = OrderedDict()
        for interrog, d in zip(res, ds):
            for unpicklable in ['note', 'root']:
                interrog.query.pop(unpicklable, None)
            try:
                out[interrog.query['outname']] = interrog
            except KeyError:
                out[d['outname']] = interrog

        from corpkit.interrogation import Interrodict
        idict = Interrodict(out)

        if print_info:
            time = strftime("%H:%M:%S", localtime())
            print("\n\n%s: Finished! Output is a dictionary with keys:\n\n '%s'\n" % \
                (time, "'\n '".join(sorted(out.keys()))))
        idict.query = qlocs
        if save:
            idict.save(save, print_info=print_info)
        return idict
    # make query and total branch, save, return
    # todo: standardise this so we don't have to guess transposes
    else:
        if multiple_corpora and not mult_corp_are_subs:
            sers = [i.results for i in res]
            out = DataFrame(sers, index=[i.query['outname'] for i in res])
            out = out.reindex_axis(sorted(out.columns), axis=1) # sort cols
            out = out.fillna(0) # nan to zero
            out = out.astype(int) # float to int
            out = out.T
        else:
            try:
                out = pd.concat([r.results for r in res], axis=1)
                out = out.T
                out.index = [i.query['outname'] for i in res]
            except ValueError:
                return None
            # format like normal
            # this sorts subcorpora, which are cls
            out = out[sorted(list(out.columns))]
            # puts subcorpora in the right place
            if not mult_corp_are_subs:
                out = out.T
            out = out.fillna(0) # nan to zero
            out = out.astype(int)
            if 'c' in show and mult_corp_are_subs:
                out = out.sum()
                out.index = sorted(list(out.index))

        # sort by total
        if isinstance(out, DataFrame):
            out = out[list(out.sum().sort_values(ascending=False).index)]

            # really need to figure out the deal with tranposing!
            if all(x.endswith('.xml') for x in list(out.columns)) \
            or all(x.endswith('.txt') for x in list(out.columns)):
                out = out.T
        out = out.edit(sort_by=sort_by, print_info=False, keep_stats=False, \
                       df1_always_df=kwargs.get('df1_always_df'))
        out.query = qlocs

        if len(out.results.columns) == 1:
            out.results = out.results.sort_index()
        if kwargs.get('conc') is True:
            concs = pd.concat([x.concordance for x in res], ignore_index=True)
            concs = concs.sort_values(by='c')
            concs = concs.reset_index(drop=True)
            out.concordance = Concordance(concs)
        thetime = strftime("%H:%M:%S", localtime())
        if terminal and print_info:
            with terminal.location(0, terminal.height):
                print('\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n'))
        else:
            if print_info:
                print('\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n'))
        if save:
            out.save(save, print_info = print_info)
        return out
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    """Align every assembly to the reference and collect the alignment reports.

    The aligner is compiled first (and checked with the E-MEM functionality
    test in ``--test`` mode). If that fails, every assembly is marked FAILED
    and ``(statuses, None)`` is returned. Otherwise ``align_and_analyze`` runs
    once per assembly: across assemblies with joblib, or one assembly at a
    time with ``parallel_by_chr=True`` when a split reference has more parts
    than there are assemblies or memory-efficient mode is on.

    Returns ``(nucmer_statuses, aligned_lengths_per_fpath)``, both dicts keyed
    by contigs file path.
    """
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    num_nf_errors = logger._num_nf_errors

    success_compilation = compile_aligner(logger)
    if qconfig.test and is_emem_aligner():
        success_compilation = check_emem_functionality(logger)
    if not success_compilation:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict.fromkeys(contigs_fpaths, NucmerStatus.FAILED), None

    create_nucmer_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    threads = 1 if qconfig.memory_efficient else max(1, qconfig.max_threads // n_jobs)

    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed

    # Parallelise over assemblies unless a split reference makes it cheaper to
    # parallelise over chromosomes inside each single alignment.
    across_assemblies = (
        not qconfig.splitted_ref
        or (len(contigs_fpaths) >= len(qconfig.splitted_ref)
            and not qconfig.memory_efficient))
    fpath_pairs = zip(contigs_fpaths, old_contigs_fpaths)
    if across_assemblies:
        statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(
            delayed(align_and_analyze)(
                is_cyclic, i, contigs_fpath, output_dir, reference,
                old_contigs_fpath, bed_fpath, threads=threads)
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(fpath_pairs))
    else:
        statuses_results_lengths_tuples = [
            align_and_analyze(
                is_cyclic, i, contigs_fpath, output_dir, reference,
                old_contigs_fpath, bed_fpath, parallel_by_chr=True,
                threads=qconfig.max_threads)
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(fpath_pairs)
        ]

    # unzipping
    statuses, results, aligned_lengths = [], [], []
    for outcome in statuses_results_lengths_tuples:
        statuses.append(outcome[0])
        results.append(outcome[1])
        aligned_lengths.append(outcome[2])

    reports = []
    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        status = statuses[index]
        if status == NucmerStatus.OK:
            reports.append(save_result(results[index], report, fname))
        elif status == NucmerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    nucmer_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))

    if NucmerStatus.OK in nucmer_statuses.values():
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        if qconfig.draw_plots:
            from . import plotter
            plotter.draw_misassembl_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.is_combined_ref:
            save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    status_values = list(nucmer_statuses.values())
    oks = status_values.count(NucmerStatus.OK)
    not_aligned = status_values.count(NucmerStatus.NOT_ALIGNED)
    failed = status_values.count(NucmerStatus.FAILED)
    errors = status_values.count(NucmerStatus.ERROR)
    problems = not_aligned + failed + errors
    total = len(nucmer_statuses)

    logger._num_nf_errors = num_nf_errors + errors

    if oks == total:
        logger.main_info('Done.')
    if oks < total and problems < total:
        logger.main_info('Done for ' + str(total - problems) + ' out of ' + str(total) + '. For the rest, only basic stats are going to be evaluated.')
    if problems == total:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        if not qconfig.test and is_emem_aligner():
            logger.warning('Please rerun QUAST using --test option to ensure that E-MEM aligner works properly.')

    return nucmer_statuses, aligned_lengths_per_fpath
def _bivar_factor_operation(phi1, phi2, operation, n_jobs=1):
    """
    Returns product (or quotient) of two factors.

    Parameters
    ----------
    phi1: factors

    phi2: factors

    operation: M | D
            M: multiplies phi1 and phi2
            D: divides phi1 by phi2

    n_jobs: int
            Number of joblib threads used when the factors share variables.
            With ``n_jobs == 1`` (default), or when joblib is not installed,
            the values are computed in-process.

    Returns
    -------
    Factor over the variables of phi1 followed by the variables that only
    occur in phi2.

    Raises
    ------
    ValueError
            If division is requested for factors with no common variable.

    Notes
    -----
    This function installs a numpy error callback and switches numpy's
    floating point error handling (``np.seterrcall`` / ``np.seterr``). Those
    settings stay in effect after the call.
    """
    try:
        from joblib import Parallel, delayed
        use_joblib = True
    except ImportError:
        use_joblib = False

    def err_handler(err_type, flag):
        raise Exceptions.InvalidValueError(err_type)

    np.seterrcall(err_handler)
    np.seterr(divide='raise', over='raise', under='raise', invalid='call')

    def index_strides(cardinality):
        # Multiplier of each variable's state when flattening an assignment
        # into an offset of the value array (last variable has stride 1).
        # The arrays are 1-D, so they are joined along axis 0; asking for
        # axis=1 is an error on current numpy.
        reversed_products = np.concatenate(
            (np.array([1]), np.cumprod(cardinality[::-1])))
        return np.delete(reversed_products[::-1], 0)

    phi1_vars = list(phi1.variables)
    phi2_vars = list(phi2.variables)
    common_var_list = [var for var in phi1_vars if var in phi2_vars]
    if common_var_list:
        variables = phi1_vars
        variables.extend([var for var in phi2.variables
                          if var not in common_var_list])
        cardinality = list(phi1.cardinality)
        cardinality.extend(phi2.get_cardinality(var) for var in phi2.variables
                           if var not in common_var_list)

        phi1_indexes = list(range(len(phi1.variables)))
        phi2_indexes = [variables.index(var) for var in phi2.variables]
        values = []
        phi1_cumprod = index_strides(phi1.cardinality)
        phi2_cumprod = index_strides(phi2.cardinality)

        if operation == 'M':
            if use_joblib and n_jobs != 1:
                values = Parallel(n_jobs=n_jobs, backend='threading')(
                    delayed(_parallel_helper_m)(index, phi1, phi2,
                                                phi1_indexes, phi2_indexes,
                                                phi1_cumprod, phi2_cumprod)
                    for index in product(*[range(card) for card in cardinality]))
            else:
                # TODO: @ankurankan Make this cleaner
                indexes = np.array(list(map(list, product(*[range(card) for card in cardinality]))))
                values = (phi1.values[np.sum(indexes[:, phi1_indexes] * phi1_cumprod, axis=1).ravel()] *
                          phi2.values[np.sum(indexes[:, phi2_indexes] * phi2_cumprod, axis=1).ravel()])
        elif operation == 'D':
            if use_joblib and n_jobs != 1:
                values = Parallel(n_jobs, backend='threading')(
                    delayed(_parallel_helper_d)(index, phi1, phi2,
                                                phi1_indexes, phi2_indexes,
                                                phi1_cumprod, phi2_cumprod)
                    for index in product(*[range(card) for card in cardinality]))
            else:
                # TODO: @ankurankan Make this cleaner and handle case of division by zero
                for index in product(*[range(card) for card in cardinality]):
                    index = np.array(index)
                    try:
                        values.append(phi1.values[np.sum(index[phi1_indexes] * phi1_cumprod)] /
                                      phi2.values[np.sum(index[phi2_indexes] * phi2_cumprod)])
                    except (Exceptions.InvalidValueError, FloatingPointError):
                        # zero division error should return 0 if both operands
                        # equal to 0. Ref Koller page 365, Fig 10.7
                        values.append(0)

        phi = Factor(variables, cardinality, values)
        return phi
    else:
        values = np.zeros(phi1.values.shape[0] * phi2.values.shape[0])
        phi2_shape = phi2.values.shape[0]
        if operation == 'M':
            # No shared variables: the result is the outer product, laid out
            # with phi2's values varying fastest.
            for value_index in range(phi1.values.shape[0]):
                values[value_index * phi2_shape: (value_index + 1) * phi2_shape] = (phi1.values[value_index] *
                                                                                    phi2.values)
        elif operation == 'D':
            # reference: Koller Defination 10.7
            raise ValueError("Factors Division not defined for factors with no"
                             " common scope")
        variables = phi1_vars + phi2_vars
        cardinality = list(phi1.cardinality) + list(phi2.cardinality)
        phi = Factor(variables, cardinality, values)
        return phi
def getAllPredictions(self, mode = 'multi'):
    """Evaluate a grid of classifier configurations and log them ranked.

    A list of ``[family_name, params, estimator]`` entries is built for SVM,
    random forest, AdaBoost, decision tree, multinomial/gaussian naive Bayes,
    logistic regression, k-nearest-neighbours and LDA. Every estimator is
    passed to ``self.predictor.predict(estimator, self.X, self.y_labels)``
    and the ``(model, result)`` pairs are written to ``results.log`` in
    descending order of the result.

    Parameters
    ----------
    mode : str
        ``'multi'`` evaluates the models through joblib with one job per CPU
        and forces ``n_jobs=1`` inside the individual estimators; any other
        value evaluates the models one after another and lets the estimators
        use ``n_jobs=-1``.
    """
    logging.basicConfig(filename='results.log', level=logging.INFO)

    ## if mode is multiprocessing, individual algorithms must perform in one
    ## job, otherwise joblib library would throw an exception. if mode is
    ## sequential individual algorithms can perform in parallel
    n_jobs = 1 if mode == 'multi' else -1

    models = []

    ## SVM configs
    svm_C = [0.1, 1, 10, 100]
    svm_gamma = ['auto', 0.03, 0.003]
    svm_kernel = ['rbf', 'linear', 'poly', 'sigmoid']
    svm_parameters = [(x, y, z) for x in svm_C for y in svm_gamma for z in svm_kernel]
    for params in svm_parameters:
        models.append(['SVM', params,
                       svm.SVC(gamma=params[1], C=params[0], kernel=params[2])])

    ## random forest configs
    rf_nestimators = [10, 100, 300, 500]
    rf_max_features = ['auto', 'sqrt', 'log2']
    rf_max_depth = [None, 5]
    rf_parameters = [(x, y, z) for x in rf_nestimators for y in rf_max_features for z in rf_max_depth]
    for params in rf_parameters:
        models.append(['RandomForest', params,
                       RandomForestClassifier(n_estimators=params[0], max_features=params[1],
                                              max_depth=params[2], n_jobs=n_jobs)])

    ## adaboost configs
    ab_nestimators = [10, 100, 300, 500]
    ab_learning_rate = [0.1, 0.3, 1]
    ab_base_estimator = [DecisionTreeClassifier(max_depth=2, max_features='auto'),
                         DecisionTreeClassifier(max_depth=5, max_features='auto'),
                         DecisionTreeClassifier(max_features='auto')]
    ab_parameters = [(x, y, z) for x in ab_nestimators for y in ab_learning_rate for z in ab_base_estimator]
    for params in ab_parameters:
        # Use the base estimator selected by the grid (params[2]); a fixed
        # ab_base_estimator[2] would train the same configuration three times.
        models.append(['AdaBoost', params,
                       AdaBoostClassifier(n_estimators=params[0], learning_rate=params[1],
                                          base_estimator=params[2])])

    ## decisiontrees configs
    dt_max_depth = [None, 2, 5]
    dt_max_features = ['auto', 'sqrt', 'log2']
    dt_parameters = [(x, y) for x in dt_max_depth for y in dt_max_features]
    for params in dt_parameters:
        models.append(['DecisionTrees', params,
                       DecisionTreeClassifier(max_depth=params[0], max_features=params[1])])

    ## MultinomialNB configs
    mnb_alpha = [0.1, 0.3, 1]
    for params in mnb_alpha:
        models.append(['MultinomialNB', params, MultinomialNB(alpha=params)])

    ## GaussianNB configs
    models.append(['GaussianNB', '', GaussianNB()])

    ## LogisticRegression configs
    lr_C = [0.1, 1, 10, 100]
    lr_multi_class = ['ovr']
    lr_parameters = [(x, y) for x in lr_C for y in lr_multi_class]
    for params in lr_parameters:
        models.append(['LogisticRegression', params,
                       LogisticRegression(C=params[0], multi_class=params[1], n_jobs=n_jobs)])

    ## KNeighborsClassifier configs
    knn_n_neighbors = [3, 5, 7]
    knn_p = [1, 2, 3]
    knn_algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
    knn_parameters = [(x, y, z) for x in knn_n_neighbors for y in knn_p for z in knn_algorithm]
    for params in knn_parameters:
        models.append(['KNeighbors', params,
                       KNeighborsClassifier(n_neighbors=params[0], p=params[1],
                                            algorithm=params[2], n_jobs=n_jobs)])

    ## LinearDiscriminantAnalysis configs
    lda_solver = ['svd', 'lsqr', 'eigen']
    lda_n_components = [3, 5, 8]
    lda_parameters = [(x, y) for x in lda_solver for y in lda_n_components]
    for params in lda_parameters:
        models.append(['LinearDiscriminantAnalysis', params,
                       LinearDiscriminantAnalysis(solver=params[0], n_components=params[1])])

    ## run models in multiprocessing or sequential way
    if mode == 'multi':
        num_cores = multiprocessing.cpu_count()
        results = Parallel(n_jobs=num_cores)(
            delayed(self.predictor.predict)(entry[2], self.X, self.y_labels)
            for entry in models)
    else:
        results = [self.predictor.predict(entry[2], self.X, self.y_labels)
                   for entry in models]
    results_all = zip(models, results)

    # Best result first.
    sorted_results = sorted(results_all, key=lambda item: item[1], reverse=True)
    for ranked in sorted_results:
        logging.info(ranked)
def get_isochrone_grid(grid_feh, grid_logt, model='parsec12s', phot='sloan',
                       Zsun=0.0152, parflag=True, n_jobs=8, **kwargs):
    """ get a list of isochrones using EZPADOVA

    Parameters
    ----------
    grid_feh: array
        [Fe/H] grid
    grid_logt: array
        logt grid
    model: string
        default is 'parsec12s'
    phot: string
        default is 'sloan'
    Zsun: float
        default is 0.0152
    parflag: bool
        default is True
        if True, use JOBLIB to get isochrones in parallel
    n_jobs: int
        if parflag is True, specify number of jobs in JOBLIB
    **kwargs:
        forwarded unchanged to ``cmd.get_one_isochrone``

    Returns
    -------
    vgrid_feh, vgrid_logt, isoc_list, grid_list
        ``grid_list`` holds one ``(10**logt, 10**feh * Zsun)`` pair per
        requested isochrone, [Fe/H]-major; ``isoc_list`` is in the same order.

    """
    # validate grid
    vgrid_feh, vgrid_logt = _find_valid_grid(grid_feh, grid_logt, Zsun=Zsun)

    # construct list
    grid_list = [(10.**grid_logt_, 10.**grid_feh_*Zsun)
                 for grid_feh_ in vgrid_feh
                 for grid_logt_ in vgrid_logt]

    # the count has to be interpolated, otherwise a literal "%s" is printed
    print('@Cham: you have requested for %s isochrones!' % len(grid_list))
    print('@Cham: -----------------------------------------------------------')

    # get isochrones
    if parflag:
        # get isochrones in parallel
        # NOTE(review): unlike the sequential branch, the results are not
        # wrapped in Table(...data) here -- confirm the raw return value
        # exposes `.colnames` as used below.
        isoc_list = Parallel(n_jobs=n_jobs, verbose=True)(
            delayed(cmd.get_one_isochrone)(
                grid_list_[0], grid_list_[1], model=model, phot=phot, **kwargs)
            for grid_list_ in grid_list)
    else:
        # get isochrones sequentially
        isoc_list = []
        for i, grid_list_ in enumerate(grid_list):
            print('@Cham: sending request for isochrone (logt=%s, [Fe/H]=%s) (t=%s, Z=%s) [%s/%s]...'
                  % (np.log10(grid_list_[0]), np.log10(grid_list_[1]/Zsun),
                     grid_list_[0], grid_list_[1], i+1, len(grid_list)))
            isoc_list.append(
                Table(cmd.get_one_isochrone(grid_list_[0], grid_list_[1],
                                            model=model, phot=phot,
                                            **kwargs).data))

    print('@Cham: got all requested isochrones!')
    print('@Cham: -----------------------------------------------------------')
    if isoc_list:
        # nothing to report for an empty grid
        print('@Cham: colnames are:')
        print(isoc_list[0].colnames)
        print('@Cham: -----------------------------------------------------------')

    return vgrid_feh, vgrid_logt, isoc_list, grid_list
def mab_eval(bandit, T, pol_cfg, N_trials=100, seed=None, parallel=False):
    """Run the configured policies on ``N_trials`` resampled bandits.

    Parameters
    ----------
    bandit :
        Object providing ``resample_arms()``, ``arm_prior`` and ``ibest``.
    T :
        Passed through to the evaluation helper (presumably the horizon --
        confirm against ``eval_helper``).
    pol_cfg : dict
        Keyword arguments for ``extract_policies``; only policies whose name
        appears in ``pol_cfg['names']`` are evaluated.
    N_trials : int
        Number of resampled arm distributions.
    seed : int or None
        Base seed; trial ``i`` receives ``seed + i``. When ``None`` a base
        seed is drawn from numpy's global generator, because the helpers are
        always handed an integer seed.
    parallel : {False, 1, 2}
        ``1`` dispatches through an ipyparallel client (profile ``'ssh'``),
        ``2`` through joblib with 7 jobs, anything else runs serially.

    Notes
    -----
    The aggregation loop calls ``.get()`` on every entry of ``results``.
    NOTE(review): the per-policy statistics collected at the end are not
    returned by the visible code -- confirm whether a ``return`` was lost.
    """
    if seed is not None:
        np.random.seed(seed)
    else:
        # `seed + inum` below needs an integer; None used to raise TypeError.
        seed = int(np.random.randint(0, 2 ** 30))

    all_policies = extract_policies(**pol_cfg)
    policies = [p for p in all_policies if p.name in pol_cfg['names']]
    names = [p.name for p in policies]

    arm_dists = [bandit.resample_arms() for _ in range(N_trials)]
    results = []
    print('Evaluating Policies {}'.format(names))

    if parallel == 1:
        rc = ipp.Client(profile='ssh')
        dv = rc[:]
        with dv.sync_imports():
            import mab
        v = rc.load_balanced_view()
        results = v.map(eval_helper, arm_dists,
                        [bandit.arm_prior] * N_trials,
                        [T] * N_trials,
                        [pol_cfg] * N_trials,
                        [frozenset(names)] * N_trials,
                        [seed + inum for inum in range(N_trials)])
        start = time.time()
        # Poll until every job has finished, printing a progress line.
        while rc.outstanding:
            try:
                rc.wait(rc.outstanding, 1e-1)
            except ipp.TimeoutError:
                # ignore timeouterrors
                pass
            n_complete = N_trials - len(rc.outstanding)
            if n_complete > 0:
                est_remaining = ((time.time() - start) / n_complete) * len(rc.outstanding)
            else:
                est_remaining = 'No Estimate'
            sys.stdout.write('\rFinished {} / {} jobs\tEstimated Time Remaining: {}'.format(
                n_complete, N_trials, est_remaining))
            sys.stdout.flush()
    elif parallel == 2:
        from joblib import Parallel, delayed
        results = Parallel(n_jobs=7, verbose=50)(
            delayed(_eval_helper)(ad, bandit.arm_prior, T, pol_cfg, names, seed + inum)
            for inum, ad in enumerate(arm_dists))
    else:
        for inum, ad in enumerate(arm_dists):
            results.append(eval_helper(ad, bandit.arm_prior, T, pol_cfg, names,
                                       seed=seed + inum))
            sys.stdout.write("{} / {}\t".format(inum, N_trials))
            sys.stdout.flush()

    means = []
    variances = []
    avg_err = []
    discounted_mean = []
    for j in range(len(policies)):
        try:
            regrets, choices, discounted = results[0].get()
        except CompositeError as e:
            print(e)
            import IPython; IPython.embed()
            # Without a first result the statistics below would be computed
            # from unbound or stale data, so propagate after the debug shell.
            raise
        regrets = regrets[j]
        choices = choices[j]
        discounted = discounted[j]
        errors = np.array(choices != bandit.ibest, dtype=int)
        for i in range(1, N_trials):
            regrets_i, choices, discounted_i = results[i].get()
            regrets = np.c_[regrets, regrets_i[j]]
            errors += (choices[j] != bandit.ibest)
            discounted += discounted_i[j]
        discounted /= N_trials
        discounted_mean.append(discounted)
        means.append(np.mean(regrets, axis=1))
        variances.append(np.var(regrets, axis=1))
        # float() keeps this a true division on Python 2 as well; an
        # int-array / int division would floor the error rate to 0.
        avg_err.append(errors / float(N_trials))
def glm(conditions, onsets, TR, Y, drifts=None, basis='3hrf', mode='r1glm',
        hrf_length=20, oversample=5, rtol=1e-8, verbose=False, maxiter=500,
        callback=None, method='L-BFGS-B', n_jobs=1, hrfs=None,
        return_design_matrix=False):
    """
    Perform a GLM from BOLD signal, given the conditions, onsets,
    TR (repetition time of the scanner) and the BOLD signal.

    This method is able to fit a variety of models, available
    through the `mode` keyword. These are:

        - glm: standard GLM
        - glms: GLM with separate designs
        - r1glm: Rank-1 GLM
        - r1glms: Rank-1 GLM with separate designs

    basis:

        - hrf: single element basis
        - 3hrf: basis with 3 elements
        - fir: basis with hrf_length elements (in multiples of TR)

    **Note** the output parameters are not normalized.
    Rank-1 models are specified up to a constant term between
    the betas and the HRF. This implies that some normalization
    must be done prior to interpreting the activation coefficients.
    Typically the HRF is normalized to have unit amplitude and
    to correlate positively with a reference HRF. For ``mode='r1glm'``
    with basis '3hrf', '2hrf' or 'fir' this normalization is applied
    before returning.

    Parameters
    ----------
    conditions: array-like, shape (n_trials)
        array of conditions

    onsets: array-like, shape (n_trials)
        array of onsets

    TR: float
        Repetition Time, the delay between two successive
        acquisitions of the same image.

    Y : array-like, shape (n_scans, n_voxels)
        Time-series vector. A 1-D input is treated as a single voxel.

    drifts : array, optional
        Columns appended to the canonical design matrix and passed to the
        rank-1 solver. Defaults to a single column of ones (n_scans, 1).

    mode: {'r1glm', 'r1glms', 'glms', 'glm'}
        Different GLM models.

    rtol : float
        Relative tolerance for stopping criterion.

    maxiter : int
        maximum number of iterations

    verbose : {0, 1, 2}
        Different levels of verbosity

    n_jobs: int
        Number of CPUs to use. Use -1 to use all available CPUs.

    method: {'L-BFGS-B', 'TNC'}
        Different algorithmic solvers, only used for 'r1*' modes.
        All should yield the same result but their efficiency might vary.

    return_design_matrix : bool
        If True, the dense design matrix is appended to the output.

    Returns
    -------
    U : array
        Estimated HRF. Will be of shape (basis_len, n_voxels) for rank-1
        methods and of (basis_len, n_conditions, n_voxels) for the other
        methods.

    V : array, shape (p, n_voxels)
        Estimated activation coefficients (beta-map).

    dmtx: array,
        Design matrix. Only returned if return_design_matrix=True

    Raises
    ------
    NotImplementedError
        If `mode` is not one of the supported models.
    ValueError
        If `conditions` and `onsets` differ in size.
    """
    if mode not in ('glm', 'r1glm', 'r1glms', 'glms'):
        raise NotImplementedError

    conditions = np.asarray(conditions)
    onsets = np.asarray(onsets)
    if conditions.size != onsets.size:
        raise ValueError('array conditions and onsets should have the same size')

    Y = np.asarray(Y)
    n_scans = Y.shape[0]
    verbose = int(verbose)
    if verbose > 0:
        print('.. creating design matrix ..')
    if drifts is None:
        drifts = np.ones((n_scans, 1))

    X_design, Q = create_design_matrix(
        conditions, onsets, TR, n_scans, basis, oversample, hrf_length)
    if verbose > 0:
        print('.. done creating design matrix ..')

    if Y.ndim == 1:
        Y = Y.reshape((-1, 1))
    n_task = Y.shape[1]
    size_u = Q.shape[1]
    size_v = X_design.shape[1] // size_u

    if mode == 'glms':
        U, V = utils.glms_from_glm(
            X_design, Q, n_jobs, False, Y)
    elif mode == 'glm':
        U, V = utils.glm(
            X_design, Q, Y, convolve=False)
    elif mode in ('r1glm', 'r1glms'):
        U = np.zeros((size_u, n_task))
        V = np.zeros((size_v, n_task))
        if verbose > 0:
            print('.. computing initialization ..')

        # Initialise from a least-squares fit against the canonical HRF.
        X_design_canonical, Q_canonical = create_design_matrix(
            conditions, onsets, TR, n_scans, [hrf.spmt], oversample,
            hrf_length)
        X_design_canonical = np.concatenate(
            (X_design_canonical, drifts), axis=1)
        V_init = linalg.lstsq(X_design_canonical, Y)[0]
        U_init = np.tile(linalg.lstsq(Q, Q_canonical)[0], n_task)
        if mode == 'r1glm':
            W_init = np.concatenate((U_init, V_init))
        else:
            # XXX TODO intercept
            W_init = np.concatenate((U_init, V_init[:-1], V_init[:-1]))
        if verbose > 0:
            print('.. done initialization ..')

        if n_jobs == -1:
            n_jobs = cpu_count()
        # One chunk of voxels (columns) per job.
        Y_split = np.array_split(Y, n_jobs, axis=1)
        W_init_split = np.array_split(W_init, n_jobs, axis=1)
        X_design = sparse.csr_matrix(X_design)

        out = Parallel(n_jobs=n_jobs)(
            delayed(rank_one)(
                X_design, y_i, size_u, w_i, drifts=drifts, callback=callback,
                maxiter=maxiter, method=method, rtol=rtol, verbose=verbose,
                mode=mode, hrfs=hrfs, basis=basis)
            for y_i, w_i in zip(Y_split, W_init_split))

        # Stitch the per-chunk solutions back together, one column per voxel.
        counter = 0
        for tmp in out:
            u, v = tmp
            u = u.T
            v = v.T
            for i in range(len(u)):
                U[:, counter] = u[i]
                V[:, counter] = v[i]
                counter += 1

    # normalize
    if mode in ('r1glm',) and basis == '3hrf':
        xx = np.linspace(0, hrf_length * TR)
        generated_hrfs = U[0] * hrf.spmt(xx)[:, None] + \
            U[1] * hrf.dspmt(xx)[:, None] + U[2] * hrf.ddspmt(xx)[:, None]
        sign = np.sign(np.dot(generated_hrfs.T, hrf.spmt(xx)))
        norm = np.abs(generated_hrfs).max(0)
        U = U * sign / norm
        V = V * sign * norm
    elif mode in ('r1glm',) and basis == '2hrf':
        xx = np.linspace(0, hrf_length * TR)
        generated_hrfs = U[0] * hrf.spmt(xx)[:, None] + \
            U[1] * hrf.dspmt(xx)[:, None]
        sign = np.sign(np.dot(generated_hrfs.T, hrf.spmt(xx)))
        norm = np.abs(generated_hrfs).max(0)
        U = U * sign / norm
        V = V * sign * norm
    elif mode == 'r1glm' and basis == 'fir':
        xx = np.arange(0, TR * hrf_length, TR)
        sign = np.sign(np.dot(U.T, hrf.spmt(xx)))
        norm = np.abs(U).max(0)
        U = U * sign / norm
        V = V * sign * norm

    out = [U, V]
    if return_design_matrix:
        # X_design is only converted to a sparse matrix in the rank-1 modes;
        # for 'glm'/'glms' it is still whatever create_design_matrix returned.
        if sparse.issparse(X_design):
            out.append(X_design.toarray())
        else:
            out.append(np.asarray(X_design))
    return out
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    """Align every assembly against the reference and save the reports.

    Parameters
    ----------
    reference :
        Reference passed to ``align_and_analyze``, ``save_result`` and
        ``plotter.frc_plot``.
    contigs_fpaths : list
        Assemblies to analyse; used as keys of the returned dictionaries.
    is_cyclic :
        Forwarded to ``align_and_analyze``.
    output_dir : str
        Directory for the results; created when it does not exist.
    old_contigs_fpaths : list
        Paired element-wise with ``contigs_fpaths``.
    bed_fpath : optional
        Forwarded to ``align_and_analyze``.

    Returns
    -------
    (nucmer_statuses, aligned_lengths_per_fpath)
        Two dicts keyed by assembly path. When the aligner cannot be
        compiled, every status is ``NucmerStatus.FAILED`` and the second
        element is ``None``.

    Side effects visible here: sets ``misc.contigs_aligned_lengths`` and
    updates ``logger._num_nf_errors``.
    """
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    success_compilation = compile_aligner(logger)
    if not success_compilation:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict(zip(contigs_fpaths, [NucmerStatus.FAILED] * len(contigs_fpaths))), None

    if qconfig.draw_plots:
        compile_gnuplot(logger, only_clean=False)

    num_nf_errors = logger._num_nf_errors
    create_nucmer_output_dir(output_dir)
    # At least one job, so an empty assembly list cannot divide by zero below.
    n_jobs = max(1, min(len(contigs_fpaths), qconfig.max_threads))
    threads = max(1, qconfig.max_threads // n_jobs)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed

    # Assemblies are processed in parallel unless memory-efficient mode is
    # on, or the reference is split into more parts than there are
    # assemblies (then each assembly is parallelised by chromosome instead).
    parallel_by_assembly = not qconfig.memory_efficient and (
        not qconfig.splitted_ref
        or len(contigs_fpaths) >= len(qconfig.splitted_ref))
    if parallel_by_assembly:
        statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)(
            is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads)
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
    else:
        statuses_results_lengths_tuples = []
        for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)):
            statuses_results_lengths_tuples.append(align_and_analyze(
                is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath,
                parallel_by_chr=True, threads=qconfig.max_threads))

    # unzipping
    statuses, results, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        [[x[i] for x in statuses_results_lengths_tuples] for i in range(5)]
    reports = []

    nucmer_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))
    misc.contigs_aligned_lengths = dict(zip(contigs_fpaths, aligned_lengths_by_contigs))

    any_ok = NucmerStatus.OK in nucmer_statuses.values()
    if any_ok and qconfig.is_combined_ref:
        save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == NucmerStatus.OK:
            reports.append(save_result(results[index], report, fname, reference))
        elif statuses[index] == NucmerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    if any_ok:
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        from . import plotter
        if qconfig.draw_plots:
            plotter.draw_misassemblies_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.draw_plots or qconfig.html_report:
            misassemblies_in_contigs = dict((contigs_fpaths[i], misassemblies_in_contigs[i])
                                            for i in range(len(contigs_fpaths)))
            plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths, misc.contigs_aligned_lengths,
                             misassemblies_in_contigs, join(output_dir, 'misassemblies_frcurve_plot'),
                             'misassemblies')

    status_values = list(nucmer_statuses.values())
    oks = status_values.count(NucmerStatus.OK)
    not_aligned = status_values.count(NucmerStatus.NOT_ALIGNED)
    failed = status_values.count(NucmerStatus.FAILED)
    errors = status_values.count(NucmerStatus.ERROR)
    problems = not_aligned + failed + errors
    total = len(nucmer_statuses)

    logger._num_nf_errors = num_nf_errors + errors

    if oks == total:
        logger.main_info('Done.')
    if oks < total and problems < total:
        logger.main_info('Done for ' + str(total - problems) + ' out of ' + str(total) +
                         '. For the rest, only basic stats are going to be evaluated.')
    if problems == total:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')

    return nucmer_statuses, aligned_lengths_per_fpath
# Persist the candidate windows; the context manager closes the file handle.
with open("test-windows.pkl", "wb") as windows_file:
    pickle.dump(lis, windows_file)
#lis = pickle.load( open( "test-windows.pkl", "rb" ) )

##Finding-windows
# Chunk size giving roughly ten jobs. Floor division keeps it an integer on
# Python 3, and max() keeps the range step non-zero for short lists.
k = max(1, len(lis) // 10)
iterator = range(0, len(lis)-k, k)

from joblib import Parallel, delayed
from parr_test import myfunc

# (a leftover pdb.set_trace() breakpoint used to halt the script here)
results = Parallel(n_jobs=-1)(delayed(myfunc)(lis[i:i+k]) for i in iterator)
# The trailing chunk is processed in this process, but only when it holds at
# least two items.
if len(lis[iterator[-1]+k:]) >= 2:
    results.append(myfunc(lis[iterator[-1]+k:]))
detects = np.concatenate(results)

##Plotting result
# Every (h, w) combination, x_list-major, in the order indexed by `detects`.
windows = [(h, w) for w in x_list for h in y_list]
ind = np.where(detects==1)[0]
ws1 = [windows[i] for i in ind]
def mab_eval(bandit, T, pol_cfg, N_trials=100, seed=None, parallel=False):
    """Run the configured policies on ``N_trials`` resampled bandits.

    Parameters
    ----------
    bandit :
        Object providing ``resample_arms()`` and ``theta_prior``.
    T :
        Passed through to the evaluation helper (presumably the horizon --
        confirm against ``eval_helper``).
    pol_cfg : dict
        Keyword arguments for ``extract_policies``; only policies whose name
        appears in ``pol_cfg['names']`` are evaluated.
    N_trials : int
        Number of resampled arm distributions.
    seed : int or None
        Seeds numpy's global generator; trial ``i`` then receives
        ``seed + 1 + i``. When ``None`` a base seed is drawn from numpy's
        global generator, because the helpers are always handed an integer.
    parallel : {False, 1, 2}
        ``1`` dispatches through an ipyparallel client (profile ``'ssh'``),
        ``2`` through joblib with 7 jobs, anything else runs serially.

    Notes
    -----
    NOTE(review): the visible code ends after unwrapping list results; the
    ``means`` / ``variances`` / ``avg_err`` / ``discounted_mean`` lists are
    created but never filled or returned -- confirm whether code was lost.
    """
    if seed is not None:
        np.random.seed(seed)
        seed += 1
    else:
        # `seed + inum` below needs an integer; None used to raise TypeError.
        seed = int(np.random.randint(0, 2 ** 30))

    all_policies = extract_policies(**pol_cfg)
    policies = [p for p in all_policies if p.name in pol_cfg['names']]
    names = [p.name for p in policies]

    arm_dists = [bandit.resample_arms() for _ in range(N_trials)]
    # print [U for (ad, U) in arm_dists]
    results = []

    if parallel == 1:
        rc = ipp.Client(profile='ssh')
        dv = rc[:]
        with dv.sync_imports():
            import mab
        v = rc.load_balanced_view()
        print('Evaluating Policies {}'.format(names))
        results = v.map(eval_helper, arm_dists,
                        [bandit.theta_prior] * N_trials,
                        [T] * N_trials,
                        [pol_cfg] * N_trials,
                        [frozenset(names)] * N_trials,
                        [(seed + inum) for inum in range(N_trials)])
        start = time.time()
        rate = 0
        n_complete = 0
        # Poll until every job has finished, printing a progress line.
        while rc.outstanding:
            try:
                rc.wait(rc.outstanding, 1e-1)
            except ipp.TimeoutError:
                # ignore timeouterrors
                pass
            # Refresh the seconds-per-job rate only when more jobs finished.
            if n_complete < N_trials - len(rc.outstanding):
                n_complete = N_trials - len(rc.outstanding)
                rate = ((time.time() - start) / n_complete)
            if n_complete > 0:
                # Four significant digits, applied to the number only: the
                # same spec on the placeholder would cut it down to 'No E'.
                est_remaining = '{:.4}'.format(rate * len(rc.outstanding))
            else:
                est_remaining = 'No Estimate'
            sys.stdout.write(
                '\rFinished {} / {} jobs\tEstimated Time Remaining: {}'.format(
                    n_complete, N_trials, est_remaining))
            sys.stdout.flush()
    elif parallel == 2:
        from joblib import Parallel, delayed
        print('Evaluating Policies {}'.format(names))
        results = Parallel(n_jobs=7, verbose=50)(
            delayed(_eval_helper)(ad, bandit.theta_prior, T, pol_cfg, names, seed + inum)
            for inum, ad in enumerate(arm_dists))
    else:
        for inum, ad in enumerate(arm_dists):
            results.append(
                eval_helper(
                    ad, bandit.theta_prior, T, pol_cfg, names, seed=seed + inum))
            sys.stdout.write("{} / {}\t".format(inum, N_trials))
            sys.stdout.flush()

    means = []
    variances = []
    avg_err = []
    discounted_mean = []
    try:
        # Some backends wrap each result in a one-element list; unwrap it.
        if isinstance(results[0], list):
            results = [x[0] for x in results]
    except CompositeError as e:
        print(e)
        import IPython; IPython.embed()
def pmultiquery(corpus, search, show = 'words', query = 'any', sort_by = 'total', quicksave = False, multiprocess = 'default', function_filter = False, just_speakers = False, root = False, note = False, print_info = True, **kwargs):
    """Parallel process multiple queries or corpora.

    This function is used by interrogator() if:

        a) path is a list of paths
        b) query is a dict of named queries
        c) just speakers == 'each', or a list of speakers with len(list) > 1

    One keyword dictionary is built per item of whichever argument is
    iterable (corpora, queries, searches, function filters or speakers) and
    `interrogator` is called once per dictionary, through joblib when
    `root` is falsy and `multiprocess` is truthy, otherwise in a plain loop.

    Returns, depending on the results: a `Concordance` (when
    ``do_concordancing == 'only'``), an `Interrodict` keyed by output name,
    or the edited, merged interrogation. Returns None when no speaker names
    are found.

    Raises ValueError when `quicksave` is True rather than a string.

    This function needs joblib 0.8.4 or above in order to run properly.
    There's no reason to call it yourself."""
    import collections
    import os
    import pandas as pd
    import collections
    from collections import namedtuple
    from time import strftime, localtime
    import corpkit
    from interrogator import interrogator
    from editor import editor
    from other import save
    from interrogation import Interrogation
    # joblib is optional at import time; a missing joblib only surfaces later
    # as a NameError when Parallel is used.
    try:
        from joblib import Parallel, delayed
    except:
        pass
        #raise ValueError('joblib, the module used for multiprocessing, cannot be found. ' \
        #                 'Install with:\n\n pip install joblib')
    import multiprocessing

    def best_num_parallel(num_cores, num_queries):
        # Decide how many parallel processes to run; the string below sits
        # after the import, so it is a plain expression, not the docstring.
        import corpkit
        """decide how many parallel processes to run the idea, more or less, is to balance the load when possible"""
        if num_queries <= num_cores:
            return num_queries
        if num_queries > num_cores:
            if (num_queries / num_cores) == num_cores:
                return int(num_cores)
            if num_queries % num_cores == 0:
                # max() of an empty list raises ValueError -> use every core
                try:
                    return max([int(num_queries / n) for n in range(2, num_cores) if int(num_queries / n) <= num_cores])
                except ValueError:
                    return num_cores
            else:
                import math
                if (float(math.sqrt(num_queries))).is_integer():
                    square_root = math.sqrt(num_queries)
                    if square_root <= num_queries / num_cores:
                        return int(square_root)
        return num_cores

    num_cores = multiprocessing.cpu_count()

    # what is our iterable? ...
    # Exactly one of these flags is set by the if/elif chain below.
    multiple_option = False
    multiple_queries = False
    multiple_speakers = False
    multiple_corpora = False
    multiple_search = False
    mult_corp_are_subs = False
    denom = 1
    if hasattr(corpus, '__iter__'):
        multiple_corpora = True
        num_cores = best_num_parallel(num_cores, len(corpus))
        denom = len(corpus)
        if all(c.__class__ == corpkit.corpus.Subcorpus for c in corpus):
            mult_corp_are_subs = True
    elif (type(query) == list or type(query) == dict) and not hasattr(search, '__iter__'):
        multiple_queries = True
        num_cores = best_num_parallel(num_cores, len(query))
        denom = len(query)
    elif hasattr(search, '__iter__') and type(search) != dict:
        # NOTE(review): this branch excludes dicts yet calls search.keys();
        # looks like it can only work for dict-like non-dict objects -- confirm.
        multiple_search = True
        num_cores = best_num_parallel(num_cores, len(list(search.keys())))
        denom = len(list(search.keys()))
    elif hasattr(function_filter, '__iter__'):
        multiple_option = True
        num_cores = best_num_parallel(num_cores, len(list(function_filter.keys())))
        denom = len(list(function_filter.keys()))
    elif just_speakers:
        from build import get_speaker_names_from_xml_corpus
        multiple_speakers = True
        if just_speakers == 'each' or just_speakers == ['each']:
            just_speakers = get_speaker_names_from_xml_corpus(corpus.path)
        if len(just_speakers) == 0:
            print('No speaker name data found.')
            return
        num_cores = best_num_parallel(num_cores, len(just_speakers))
        denom = len(just_speakers)

    # an explicit integer overrides the computed process count
    if type(multiprocess) == int:
        num_cores = multiprocess
    if multiprocess is False:
        num_cores = 1

    # make sure quicksaves are right type
    if quicksave is True:
        raise ValueError('quicksave must be string when using pmultiquery.')

    # the options that don't change
    d = {
         #'paralleling': True,
         'function': 'interrogator',
         'root': root,
         'note': note,
         'denominator': denom}

    # add kwargs to query
    for k, v in list(kwargs.items()):
        d[k] = v

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    ds = []
    if multiple_corpora:
        for index, p in enumerate(corpus):
            name = p.name
            a_dict = dict(d)
            a_dict['corpus'] = p
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name.replace('-parsed', '')
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_queries:
        # NOTE(review): query may be a list here (see the flag above), but
        # .items() only exists on dicts -- confirm lists are never passed.
        for index, (name, q) in enumerate(query.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = q
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_option:
        for index, (name, q) in enumerate(function_filter.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['function_filter'] = q
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_speakers:
        for index, name in enumerate(just_speakers):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = [name]
            a_dict['function_filter'] = function_filter
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_search:
        for index, val in enumerate(search):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = val
            a_dict['query'] = query
            a_dict['show'] = show
            # NOTE(review): `name` is not assigned anywhere in this branch,
            # so this line looks like it raises a NameError -- confirm.
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['function_filter'] = function_filter
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)

    # NOTE(review): when 'do_concordancing' is absent, .get() returns None and
    # .lower() raises AttributeError; any other string leaves `message`
    # unassigned -- confirm callers always pass False, True or 'only'.
    if kwargs.get('do_concordancing') is False:
        message = 'Interrogating'
    elif kwargs.get('do_concordancing') is True:
        message = 'Interrogating and concordancing'
    elif kwargs.get('do_concordancing').lower() == 'only':
        message = 'Concordancing'

    time = strftime("%H:%M:%S", localtime())

    # human-readable summary of the search dict, lists truncated to 5 items
    sformat = ''
    for i, (k, v) in enumerate(list(search.items())):
        if type(v) == list:
            vformat = ', '.join(v[:5])
            if len(v) > 5:
                vformat += ' ...'
        else:
            vformat = v
        sformat += '%s: %s' %(k, vformat)
        if i < len(search.keys()) - 1:
            sformat += '\n '

    # announce what is about to run
    if multiple_corpora and not multiple_option:
        corplist = "\n ".join([i.name for i in corpus[:20]])
        if len(corpus) > 20:
            corplist += '\n ... and %d more ...\n' % (len(corpus) - 20)
        print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes):\n %s" \
               "\n Query: '%s'\n %s corpus ... \n" % (time, len(corpus), num_cores, corplist, sformat, message)))
    elif multiple_queries:
        # NOTE(review): this reports len(search)/search.values() although the
        # iterable in this branch is `query` -- confirm which is intended.
        print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s" \
               "\n Queries: '%s'\n %s corpus ... \n" % (time, len(search), num_cores, corpus.name, "', '".join(list(search.values())), message) ))
    elif multiple_search:
        print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s" \
               "\n Queries: '%s'\n %s corpus ... \n" % (time, len(list(search.keys())), num_cores, corpus.name, str(list(search.values())), message)))
    elif multiple_option:
        print(("\n%s: Beginning %d parallel corpus interrogations (multiple options): %s" \
               "\n Query: '%s'\n %s corpus ... \n" % (time, num_cores, corpus.name, sformat, message) ))
    elif multiple_speakers:
        print(("\n%s: Beginning %d parallel corpus interrogations: %s" \
               "\n Query: '%s'\n %s corpus ... \n" % (time, num_cores, corpus.name, sformat, message) ))

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    #import sys
    #reload(sys)
    #stdout=sys.stdout
    failed = False
    terminal = False
    used_joblib = False
    #ds = ds[::-1]
    if not root:
        # print one "QUEUED" status line per job at a fixed terminal row
        from blessings import Terminal
        terminal = Terminal()
        print('\n' * (len(ds) - 2))
        for dobj in ds:
            linenum = dobj['paralleling']
            # this try handles nosetest problems in sublime text
            try:
                with terminal.location(0, terminal.height - (linenum + 1)):
                    # this is a really bad idea.
                    thetime = strftime("%H:%M:%S", localtime())
                    num_spaces = 26 - len(dobj['outname'])
                    print('%s: QUEUED: %s' % (thetime, dobj['outname']))
            except:
                pass

    if not root and multiprocess:
        #res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
        try:
            #ds = sorted(ds, key=lambda k: k['paralleling'], reverse = True)
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
            used_joblib = True
        except:
            failed = True
            print('Multiprocessing failed.')
            raise
        if not res:
            failed = True
    else:
        # sequential fallback; note the loop variable rebinds `d`
        res = []
        for index, d in enumerate(ds):
            d['startnum'] = (100 / denom) * index
            res.append(interrogator(**d))
        # best effort: results that cannot be ordered are left as they are
        try:
            res = sorted(res)
        except:
            pass

    # (an abandoned multiprocessing.Process/Queue variant of the block above
    # was kept here as commented-out code)

    import corpkit
    from interrogation import Concordance
    if kwargs.get('do_concordancing') == 'only':
        concs = pd.concat([x for x in res])
        thetime = strftime("%H:%M:%S", localtime())
        print('\n\n%s: Finished! %d results.\n\n' % (thetime, len(concs.index)))
        return Concordance(concs)

    from collections import OrderedDict
    # unless every result is a Series, return the results keyed by outname
    if not all(type(i.results) == pd.core.series.Series for i in res):
        out = OrderedDict()
        for interrog, d in zip(res, ds):
            for unpicklable in ['note', 'root']:
                interrog.query.pop(unpicklable, None)
            out[interrog.query['outname']] = interrog

        if quicksave:
            fullpath = os.path.join('saved_interrogations', quicksave)
            # keep asking until the target directory is free or overwritten
            while os.path.isdir(fullpath):
                selection = input("\nSave error: %s already exists in %s.\n\nType 'o' to overwrite, or enter a new name: " % (quicksave, 'saved_interrogations'))
                if selection == 'o' or selection == 'O':
                    import shutil
                    shutil.rmtree(fullpath)
                else:
                    import os
                    fullpath = os.path.join('saved_interrogations', selection)

            for k, v in list(out.items()):
                save(v, k, savedir = fullpath, print_info = False)

            time = strftime("%H:%M:%S", localtime())
            print("\n%s: %d files saved to %s" % ( time, len(list(out.keys())), fullpath))

        time = strftime("%H:%M:%S", localtime())
        print("\n\n%s: Finished! Output is a dictionary with keys:\n\n '%s'\n" % (time, "'\n '".join(sorted(out.keys()))))
        from interrogation import Interrodict
        return Interrodict(out)
    # make query and total branch, save, return
    else:
        #print sers
        #print ds
        if multiple_corpora and not mult_corp_are_subs:
            sers = [i.results for i in res]
            out = pd.DataFrame(sers, index = [i.query['outname'] for i in res])
            out = out.reindex_axis(sorted(out.columns), axis=1) # sort cols
            out = out.fillna(0) # nan to zero
            out = out.astype(int) # float to int
            out = out.T
        else:
            out = pd.concat([r.results for r in res], axis = 1)
            # format like normal
            out = out[sorted(list(out.columns))]
            out = out.T
            out = out.fillna(0) # nan to zero
            out = out.astype(int)
            if 'c' in show and mult_corp_are_subs:
                out = out.sum()
                out.index = sorted(list(out.index))

        # sort by total
        if type(out) == pd.core.frame.DataFrame:
            # temporary row of column sums, used to order columns descending
            out.ix['Total-tmp'] = out.sum()
            tot = out.ix['Total-tmp']
            out = out[tot.argsort()[::-1]]
            out = out.drop('Total-tmp', axis = 0)

        out = out.edit(sort_by = sort_by, print_info = False, keep_stats = False, \
                      df1_always_df = kwargs.get('df1_always_df'))

        if len(out.results.columns) == 1:
            out.results = out.results.sort_index()
        if kwargs.get('do_concordancing') is True:
            concs = pd.concat([x.concordance for x in res], ignore_index = True)
            concs = concs.sort_values(by='c')
            concs = concs.reset_index(drop=True)
            out.concordance = Concordance(concs)
        thetime = strftime("%H:%M:%S", localtime())
        if terminal:
            # print below the per-job status lines
            with terminal.location(0, terminal.height):
                print('\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n'))
        else:
            print('\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n'))
        #if used_joblib:
        if quicksave:
            from other import save
            save(out, quicksave)
        print('\n')
        return out
def _best_num_parallel(num_cores, num_queries):
    """Decide how many parallel processes to run.

    The idea, more or less, is to balance the load when possible.

    Args:
        num_cores: number of cores available.
        num_queries: number of jobs that have to be run.

    Returns:
        The number of worker processes to start.
    """
    import math

    if num_queries <= num_cores:
        return num_queries
    # Floor division reproduces Python 2 integer semantics on every version.
    if num_queries // num_cores == num_cores:
        return int(num_cores)
    if num_queries % num_cores == 0:
        candidates = [
            num_queries // n
            for n in range(2, num_cores)
            if num_queries // n <= num_cores
        ]
        # With few cores (e.g. 2 cores, 6 queries) no divisor qualifies;
        # calling max() on the empty list used to raise ValueError.
        return max(candidates) if candidates else num_cores
    square_root = math.sqrt(num_queries)
    if square_root.is_integer() and square_root <= num_queries // num_cores:
        return int(square_root)
    return num_cores


def pmultiquery(
    path,
    option="c",
    query="any",
    sort_by="total",
    quicksave=False,
    num_proc="default",
    function_filter=False,
    just_speakers=False,
    root=False,
    note=False,
    print_info=True,
    **kwargs
):
    """Parallel process multiple queries or corpora.

    This function is used by interrogator if:

        a) path is a list of paths
        b) query is a dict of named queries
        c) function_filter is iterable
        d) just speakers == 'each'

    This function needs joblib 0.8.4 or above in order to run properly.

    Args:
        path: a corpus path, or a non-string iterable of paths (one job is
            run per path).
        option: forwarded to ``interrogator``. Options starting with ``"c"``
            are concatenated and passed through ``editor``; options starting
            with ``"k"`` produce records without a ``totals`` field.
        query: a single query, or a dict mapping names to queries (one job
            per entry).
        sort_by: forwarded to ``editor`` for ``"c"`` options.
        quicksave: name to save the output under; ``True`` is rejected.
        num_proc: ``"default"`` to choose the process count automatically,
            otherwise the number of jobs handed to joblib.
        function_filter: ``False``, or a dict mapping names to filters (one
            job per entry).
        just_speakers: ``False``, ``"each"`` (speaker names are read from the
            corpus), or a collection of speaker names (one job per speaker).
        root: when truthy the jobs are run one after another in this process
            instead of through joblib. The value itself is forwarded to
            ``interrogator``.
        note: forwarded to ``interrogator``.
        print_info: accepted for interface compatibility; not used here.
        **kwargs: forwarded unchanged to every ``interrogator`` call.

    Returns:
        For options not starting with ``"c"``: a dict mapping each result
        name to an ``interrogation`` namedtuple. Otherwise the object
        returned by ``editor``. ``None`` when no speaker names were found.

    Raises:
        ValueError: if ``quicksave`` is ``True`` rather than a name.
        ImportError: if joblib is needed but cannot be imported.
    """
    import collections
    import multiprocessing
    import os
    import shutil
    from time import strftime, localtime

    import pandas as pd

    from interrogator import interrogator
    from editor import editor
    from other import save_result

    try:
        from joblib import Parallel, delayed
    except ImportError:
        # joblib is only required for the parallel branch; fail there with a
        # clear message rather than here.
        Parallel = delayed = None

    # Names that differ between Python 2 and Python 3.
    try:
        text_types = (basestring,)  # noqa: F821
        read_line = raw_input  # noqa: F821
    except NameError:
        text_types = (str, bytes)
        read_line = input

    def is_collection(obj):
        # Python 2 strings have no __iter__, Python 3 strings do; excluding
        # them explicitly keeps the dispatch identical on both.
        return hasattr(obj, "__iter__") and not isinstance(obj, text_types)

    def timestamp():
        return strftime("%H:%M:%S", localtime())

    def sorted_if_possible(results):
        # Best effort only: results that cannot be ordered are left as is.
        try:
            return sorted(results)
        except Exception:
            return results

    num_cores = multiprocessing.cpu_count()

    # are we processing multiple queries or corpora?
    # find out optimal number of cores to use.
    multiple_option = False
    multiple_queries = False
    multiple_speakers = False
    multiple_corpora = False

    denom = 1
    if is_collection(path):
        multiple_corpora = True
        denom = len(path)
        num_cores = _best_num_parallel(num_cores, denom)
    elif is_collection(query):
        multiple_queries = True
        denom = len(query)
        num_cores = _best_num_parallel(num_cores, denom)
    elif is_collection(function_filter):
        multiple_option = True
        denom = len(function_filter.keys())
        num_cores = _best_num_parallel(num_cores, denom)
    elif just_speakers:
        from corpkit.build import get_speaker_names_from_xml_corpus

        multiple_speakers = True
        if just_speakers == "each":
            just_speakers = get_speaker_names_from_xml_corpus(path)
        if len(just_speakers) == 0:
            print("No speaker name data found.")
            return
        denom = len(just_speakers)
        num_cores = _best_num_parallel(num_cores, denom)

    if num_proc != "default":
        num_cores = num_proc

    # make sure quicksaves are right type
    if quicksave is True:
        raise ValueError("quicksave must be string when using pmultiquery.")

    # the options that don't change
    base = {
        "option": option,
        "function": "interrogator",
        "root": root,
        "note": note,
        "denominator": denom,
    }
    # add kwargs to query
    base.update(kwargs)

    def make_job(index, outname, **overrides):
        # One keyword dict per interrogator call; `overrides` carries the
        # value that is unique to this job.
        job = dict(base)
        job["path"] = path
        job["query"] = query
        job["outname"] = outname
        job["just_speakers"] = just_speakers
        job["paralleling"] = index
        job["printstatus"] = False
        job.update(overrides)
        return job

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    ds = []
    if multiple_corpora:
        path = sorted(path)
        for index, p in enumerate(path):
            ds.append(make_job(index, os.path.basename(p), path=p))
    elif multiple_queries:
        for index, (name, q) in enumerate(query.items()):
            ds.append(make_job(index, name, query=q))
    elif multiple_option:
        for index, (name, q) in enumerate(function_filter.items()):
            ds.append(make_job(index, name, function_filter=q))
    elif multiple_speakers:
        for index, name in enumerate(just_speakers):
            ds.append(
                make_job(
                    index,
                    name,
                    just_speakers=[name],
                    function_filter=function_filter,
                )
            )

    now = timestamp()
    if multiple_corpora and not multiple_option:
        print(
            "\n%s: Beginning %d parallel corpus interrogations:\n %s"
            "\n\n Query: '%s'"
            "\n Interrogating corpus ... \n" % (now, num_cores, "\n ".join(path), query)
        )
    elif multiple_queries:
        print(
            "\n%s: Beginning %d parallel corpus interrogations: %s"
            "\n Queries: '%s'"
            "\n Interrogating corpus ... \n"
            % (now, num_cores, os.path.basename(path), "', '".join(query.values()))
        )
    elif multiple_option:
        print(
            "\n%s: Beginning %d parallel corpus interrogations (multiple options): %s"
            "\n\n Query: '%s'"
            "\n Interrogating corpus ... \n" % (now, num_cores, os.path.basename(path), query)
        )
    elif multiple_speakers:
        print(
            "\n%s: Beginning %d parallel corpus interrogations: %s"
            "\n\n Query: '%s'"
            "\n Interrogating corpus ... \n" % (now, num_cores, os.path.basename(path), query)
        )

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    if not root:
        from blessings import Terminal

        terminal = Terminal()
        # Reserve one terminal line per job for its progress bar.
        print("\n" * (len(ds) - 2))
        for job in ds:
            linenum = job["paralleling"]
            with terminal.location(0, terminal.height - (linenum + 1)):
                # this is a really bad idea.
                thetime = timestamp()
                print("%s: [ 0%% (%s) ]" % (thetime, job["outname"]))

        try:
            if Parallel is None:
                raise ImportError(
                    "joblib, the module used for multiprocessing, cannot be found. "
                    "Install with:\n\n pip install joblib"
                )
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
            print("\n\n\n")
        except Exception:
            # A failure here has always been propagated to the caller; there
            # is no serial fallback.
            print("Multiprocessing failed.")
            raise
    else:
        res = []
        for index, job in enumerate(ds):
            job["startnum"] = (100 // denom) * index
            res.append(interrogator(**job))
    res = sorted_if_possible(res)

    # turn list into dict of results, make query and total branches,
    # save and return
    if not option.startswith("c"):
        with_totals = not option.startswith("k")
        fields = ["query", "results", "totals"] if with_totals else ["query", "results"]
        record = collections.namedtuple("interrogation", fields)

        out = {}
        # NOTE(review): `res` may have been re-ordered by the sort above while
        # `ds` keeps submission order, so a result could be paired with the
        # settings of a different job -- confirm against what interrogator
        # returns before relying on the `query` field.
        for (name, data), job in zip(res, ds):
            for unpicklable in ("note", "root"):
                job.pop(unpicklable, None)
            if with_totals:
                try:
                    stotal = data.sum(axis=1)
                    stotal.name = u"Total"
                except ValueError:
                    stotal = data.sum()
                out[name] = record(job, data, stotal)
            else:
                out[name] = record(job, data)

        # could be wrong for unstructured corpora?
        if quicksave:
            fullpath = os.path.join("saved_interrogations", quicksave)
            while os.path.isdir(fullpath):
                selection = read_line(
                    "\nSave error: %s already exists in %s.\n\nType 'o' to overwrite, or enter a new name: "
                    % (quicksave, "saved_interrogations")
                )
                if selection == "o" or selection == "O":
                    shutil.rmtree(fullpath)
                else:
                    fullpath = os.path.join("saved_interrogations", selection)

            for k, v in out.items():
                save_result(v, k, savedir=fullpath, print_info=False)

            now = timestamp()
            print("\n%s: %d files saved to %s" % (now, len(out.keys()), fullpath))

        now = timestamp()
        print(
            "\n\n%s: Finished! Output is a dictionary with keys:\n\n '%s'\n"
            % (now, "'\n '".join(sorted(out.keys())))
        )
        return out

    # make query and total branch, save, return
    out = pd.concat(res, axis=1)
    out = editor(out, sort_by=sort_by, print_info=False, keep_stats=False)
    now = timestamp()
    print(
        "\n\n%s: Finished! %d unique results, %d total."
        % (now, len(out.results.columns), out.totals.sum())
    )
    if quicksave:
        save_result(out, quicksave)
    return out