# Imports assumed by this excerpt:
import joblib
import numpy as np
from pathos.multiprocessing import ProcessingPool as Pool


def make_cooc_mat(num_procs=4):
    cv = joblib.load(VOCAB_PATH)
    if num_procs > 1:
        pool = Pool(num_procs)
    analyzer = cv.build_analyzer()
    vocab = cv.vocabulary_
    dim = len(cv.vocabulary_)
    shared_params = dim, analyzer, vocab
    print('vocab size:', dim)
    cooc_mat = np.zeros((dim, dim), dtype=np.uint)
    sub_lines = []
    sub_blocks = []
    for line in bz2_line_gen(TEXT_PATH):
        if len(sub_lines) < 10000:
            sub_lines.append(line)
        elif num_procs == 1:
            # Serial path: process the full batch, then start a new batch
            # with the current line (the original silently dropped it).
            cooc_mat += mp((0, shared_params, sub_lines))
            sub_lines = [line]
        elif len(sub_blocks) < num_procs:
            # Queue the full batch as a worker block.
            sub_blocks.append((len(sub_blocks), shared_params, sub_lines))
            sub_lines = [line]
        else:
            # All worker blocks are full: process them in parallel and merge.
            for cooc in pool.uimap(mp, sub_blocks):
                cooc_mat += cooc
            sub_blocks = [(0, shared_params, sub_lines)]
            sub_lines = [line]
            print('block')
    # Flush the leftovers serially. The blocks in sub_blocks are already
    # (index, params, lines) tuples, so pass them to mp() as-is.
    if sub_lines:
        cooc_mat += mp((0, shared_params, sub_lines))
    for block in sub_blocks:
        cooc_mat += mp(block)
    print(np.sum(cooc_mat))
    joblib.dump(cooc_mat, COOC_MAT_PATH)
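# The worker `mp` is not shown in this excerpt. Below is a minimal sketch of
# what it presumably computes; the window size, tokenization, and symmetric
# counting are assumptions, not taken from the original source.
def mp(block):
    """Hypothetical worker: co-occurrence submatrix for one block of lines."""
    _, (dim, analyzer, vocab), lines = block
    window = 5  # assumed context window
    cooc = np.zeros((dim, dim), dtype=np.uint)
    for line in lines:
        ids = [vocab[tok] for tok in analyzer(line) if tok in vocab]
        for i, w in enumerate(ids):
            # Count each pair within the window, in both directions.
            for c in ids[i + 1:i + 1 + window]:
                cooc[w, c] += 1
                cooc[c, w] += 1
    return cooc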
def run():
    """Run the previously declared experiments.

    Call this exactly once at the end of the file. If ``dry_run`` is given
    as a command line parameter, the runs are not executed; the commands
    are printed to ``stdout`` instead.
    """
    if _state.run_was_called:
        _print_warning("run() was called more than once")
    _state.run_was_called = True
    _print_runs()
    if _is_selected("dry_run"):
        _run_dry()
        return
    _print_section("\nrunning the experiments:")
    for name, runs in _state.runs_by_name.items():
        if len(runs) == 0:
            continue
        # Run in parallel. SIGINT is ignored while the pool is created so
        # that the workers inherit SIG_IGN; the original handler is then
        # restored so that Ctrl-C reaches only the parent.
        orig_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        pool = ProcessingPool(nodes=_cores)
        signal.signal(signal.SIGINT, orig_sigint_handler)
        try:
            for _ in tqdm.tqdm(
                pool.uimap(_run_run, runs),
                desc=name.ljust(_max_name_len()),
                total=len(runs),
            ):
                pass
        except KeyboardInterrupt:
            _print_warning("aborted during experiment " + name)
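# A standalone sketch of the SIGINT trick used above, assuming only pathos and
# the standard library: the parent ignores SIGINT while the pool is created so
# the workers inherit SIG_IGN, then restores its handler so that Ctrl-C
# interrupts only the parent loop.
import signal
from pathos.multiprocessing import ProcessingPool


def _square(x):
    return x * x


orig_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)  # workers inherit
pool = ProcessingPool(nodes=2)
signal.signal(signal.SIGINT, orig_handler)  # parent handles Ctrl-C again
try:
    for _ in pool.uimap(_square, range(100)):
        pass
except KeyboardInterrupt:
    print("aborted")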
filenames = [
    x for x in clean_filenames(filenames, json.load(open(AU_EMOTE_DICT_LOC)))
    if (clean_base(x) not in already_done_dirs
        and (x not in already_done_dict or already_done_dict[x] > 0))
]
out_q = m.Queue()
Thread(target=listener, args=(ALREADY_DONE_FILE, out_q)).start()
f = functools.partial(find_filename_data, AU_EMOTE_DICT_LOC, CLASSIFIER_LOC,
                      REAL_TIME_FILE_LOC, OUT_FILE_PATH, out_q)
num_processes = 5
# Serial fallback for debugging:
# for filename in tqdm(filenames):
#     find_filename_data(AU_EMOTE_DICT_LOC, CLASSIFIER_LOC,
#                        REAL_TIME_FILE_LOC, OUT_FILE_PATH, out_q,
#                        (0, filename))
with tqdm(total=len(filenames)) as pbar:
    p = Pool(num_processes)
    for iteration, _ in enumerate(p.uimap(f, enumerate(filenames))):
        pbar.update()
out_q.put('kill')  # tell the listener thread to stop
p.close()
p.join()
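# The `listener` thread target above is not shown. A plausible sketch, under
# the assumption that it appends each finished item to ALREADY_DONE_FILE until
# the 'kill' sentinel arrives; the file format and flushing are assumptions.
def listener(out_path, queue):
    with open(out_path, 'a') as out_file:
        while True:
            msg = queue.get()
            if msg == 'kill':
                break
            out_file.write(str(msg) + '\n')
            out_file.flush()  # persist progress immediately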
def download(modename, polarity, year, full, test=False, mc=None, njobs=1):
    import root_pandas
    log.info('Getting data for {} {} {}'.format(modename, polarity, year))
    mode = get_mode(polarity, year, modename, mc)
    # I accidentally forgot the p in Dstp. Got to rename everything now for
    # this one exception. Hack incoming.
    if modename == 'WS' and year == 2016:
        # As this is the start, hack the name of the particle in the mode.
        mode.Dstp.name = 'Dst'
    sel = get_root_preselection.get(mode)

    # Always download the entire MC.
    if full != 1 and mc is None:
        ctr = int(1. / float(full))
        sel = '({} % {} == 0) && '.format(evt_num(), ctr) + sel
        log.info('Using ({} % {} == 0)'.format(evt_num(), ctr))

    input_files = mode.get_file_list()
    if test:
        input_files = input_files[:4]
    chunked = list(helpers.chunks(input_files, 25))
    length = len(chunked)

    # While the code is in development, just get any variables we can access.
    for part in mode.head.all_mothers() + mode.head.all_daughters():
        for func in variables.__all__:
            try:
                getattr(variables, func)(part)
            except variables.AccessorUsage:
                pass

    # Make some sorted variables. Saves the hassle when later training BDTs.
    arg_sorted_ip = '{},{},{},{}'.format(
        *[ipchi2(p) for p in mode.D0.all_daughters()])
    arg_sorted_pt = '{},{},{},{}'.format(
        *[pt(p) for p in mode.D0.all_daughters()])
    add_vars = {
        'delta_m': '{} - {}'.format(m(mode.Dstp), m(mode.D0)),
        'delta_m_dtf': '{} - {}'.format(dtf_m(mode.Dstp), dtf_m(mode.D0)),
        'ltime_ratio': '{} / {}'.format(ltime(mode.D0), config.Dz_ltime),
        'ipchi2_1': 'ROOTex::Leading({})'.format(arg_sorted_ip),
        'ipchi2_2': 'ROOTex::SecondLeading({})'.format(arg_sorted_ip),
        'ipchi2_3': 'ROOTex::ThirdLeading({})'.format(arg_sorted_ip),
        'ipchi2_4': 'ROOTex::FourthLeading({})'.format(arg_sorted_ip),
        'pt_1': 'ROOTex::Leading({})'.format(arg_sorted_pt),
        'pt_2': 'ROOTex::SecondLeading({})'.format(arg_sorted_pt),
        'pt_3': 'ROOTex::ThirdLeading({})'.format(arg_sorted_pt),
        'pt_4': 'ROOTex::FourthLeading({})'.format(arg_sorted_pt),
    }

    variables_needed = list(variables.all_ever_used)
    if mc == 'mc':
        variables_needed.append('Dstp_BKGCAT')

    def run_splitter(fns):
        temp_file = tempfile.mktemp('.root')
        treesplitter(files=fns, treename=mode.get_tree_name(),
                     output=temp_file, variables=variables_needed,
                     selection=sel, addvariables=add_vars)
        return temp_file

    pool = ProcessingPool(njobs)
    temp_files = []
    for r in tqdm.tqdm(pool.uimap(run_splitter, chunked),
                       leave=True, total=length, smoothing=0):
        temp_files.append(r)
    log.info('Created {} temporary files.'.format(len(temp_files)))

    bcolz_folder = config.bcolz_locations.format(mode.get_store_name())
    try:
        log.info('Removing already existing data at {}'.format(bcolz_folder))
        shutil.rmtree(bcolz_folder)
    except OSError:
        log.info('No previous data found. Nothing to delete.')

    df_gen = root_pandas.read_root(temp_files, mode.get_tree_name(),
                                   chunksize=[500000, 100][test])

    # New storage using bcolz because it performs better.
    ctuple = None
    for df in df_gen:
        log.info('Adding {} events of {} to store {}.'.format(
            len(df), mode.get_tree_name(), bcolz_folder))
        if modename == 'WS' and year == 2016:
            new_names = {
                old: old.replace('Dst', 'Dstp')
                for old in df.columns if 'Dst' in old
            }
            df = df.rename(index=str, columns=new_names)
        if ctuple is None:
            ctuple = bcolz.ctable.fromdataframe(df, rootdir=bcolz_folder)
        else:
            ctuple.append(df.to_records(index=False))

    for f in temp_files:
        os.remove(f)
    # Delete everything in the datastore that needs to be re-cached.
    remove_buffer_for_mode(mode.mode)
    if modename == 'WS' and year == 2016:
        # Undo the renaming hack from the start.
        mode.Dstp.name = 'Dstp'
# from multiprocessing import Pool
from pathos.multiprocessing import ProcessingPool as Pool


def f(x):
    return x * x


p = Pool()
for i in p.uimap(f, range(10)):
    print(i)
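# Why the stdlib import above is left commented out: pathos serializes with
# dill rather than pickle, so uimap also accepts lambdas and closures that
# multiprocessing.Pool.imap_unordered would reject. Note that uimap yields
# results in completion order, not input order.
from pathos.multiprocessing import ProcessingPool as Pool

pool = Pool(nodes=2)
for result in pool.uimap(lambda x: x ** 2, range(10)):
    print(result)  # order may differ from the input order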
def main():
    """Hardcoded parameters for Ramstake."""
    rs_codeword_length = 255 * 8
    modulus_bitsize = 756839
    q = 2**modulus_bitsize - 1

    # Parse arguments
    parser = argparse.ArgumentParser(
        description="Secret Estimator. Compute either estimator parameters "
                    "or estimate secrets based on a set of failures "
                    "(requires precomputed parameters).")
    parser.add_argument("-p", "--params", nargs=2, dest="params",
                        metavar=("samplesfail", "samplessuccess"),
                        help="Compute estimator parameters using random "
                             "decryption failures (filename) and successes "
                             "(filename).")
    parser.add_argument("-s", "--secrets", nargs=1, dest="secrets",
                        metavar="estimator", required=True,
                        help="Compute estimates of the secrets using a set "
                             "of decryption failures (filename).")
    args = parser.parse_args()

    print("---------------------------------")
    print(" modulus_bitsize: " + str(modulus_bitsize))
    print("---------------------------------")

    if args.params:
        print(" Calculating estimations from samples...")
        est = calculateestimator(args.params[0], args.params[1],
                                 modulus_bitsize, q, rs_codeword_length)
        np.save('estimated_params', est)

    # If --params is not given, 'estimated_params.npy' must already exist in
    # the same directory.
    if not os.path.exists('estimated_params.npy'):
        print('Parameters from sampling not yet estimated. '
              'Please specify -p samples_fail samples_success')
        return 0
    est = np.load('estimated_params.npy')

    # Estimate the secrets: aorb == 1 selects b, aorb == 0 selects a.
    if args.secrets:
        for aorb in [1, 0]:
            samples = getsamples(args.secrets[0], modulus_bitsize)
            # Skip the secret: save it for later comparison, then delete it.
            secret = next(samples)
            np.save('secret-a', secret[1])
            np.save('secret-b', secret[2])
            del secret
            # Start from a uniform probability estimate of the zeros and ones.
            prob = np.ones(modulus_bitsize)
            # Loop over all samples and accumulate the estimated corrections.
            from pathos.multiprocessing import ProcessingPool as Pool
            pool = Pool(nodes=4)
            f = lambda x: getestimate(x[aorb], est, modulus_bitsize, q,
                                      rs_codeword_length)
            print('start')
            for i in tqdm(pool.uimap(f, samples)):
                prob = prob * i
                np.save('estimate-tmp', prob)
            tmp = 'b' if aorb == 1 else 'a'
            np.save('estimate-' + tmp, prob)
from pathos.multiprocessing import ProcessingPool as Pool
from tqdm import tqdm


def do_parallel(function, items, ncores=28):
    # Map `function` over `items` on `ncores` workers; the progress bar
    # advances as results arrive in completion order.
    p = Pool(ncores)
    for _ in tqdm(p.uimap(function, items), total=len(items)):
        pass
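# A hypothetical usage of the helper above; the worker function and the item
# count are made up for illustration.
def square(x):
    return x * x


do_parallel(square, list(range(1000)), ncores=4)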