# -*- coding: utf-8 -*-
"""
Created on Thu Apr 25 18:28:37 2019

@author: Guanglin Kuang
"""

from multiprocessing import Pool
import time


def doubler(number):
    return number * 2


if __name__ == '__main__':
    numbers = range(50000000)

    start_time = time.time()
    pool = Pool(processes=8)
    result = pool.map(doubler, numbers)
    pool.close()
    pool.join()
    end_time = time.time()
    print("Parallel time: {:.2f} seconds".format(end_time - start_time))
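
# For a function this cheap, pickling millions of ints between processes usually
# dominates, so the pool can lose to a plain loop. A hedged sketch comparing a
# serial baseline with a map using an explicit chunksize (fewer, larger messages);
# actual timings depend on the machine.
from multiprocessing import Pool
import time

def doubler(number):
    return number * 2

if __name__ == '__main__':
    numbers = range(5_000_000)

    start_time = time.time()
    serial = [doubler(n) for n in numbers]
    print("Serial time:   {:.2f} seconds".format(time.time() - start_time))

    start_time = time.time()
    with Pool(processes=8) as pool:
        # Larger chunks mean fewer, bigger messages between processes.
        parallel = pool.map(doubler, numbers, chunksize=100_000)
    print("Parallel time: {:.2f} seconds".format(time.time() - start_time))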
# Fragment: assumes star imports from numpy (loadtxt, array, zeros, savetxt),
# `from time import time`, and objects defined earlier (data_path,
# train_list_song_co_occur, cf_predict_evaluation_slim, max_n_predictions).
pid_array = loadtxt(data_path + 'samples_100_order_pids.txt.gz')
pid_array = pid_array.flatten()
pid_array = pid_array.astype(np.int64)
test_list_pid = pid_array * 1
gc.collect()

I_song = array(train_list_song_co_occur.sum(axis=1)).flatten()
I_list = array(train_list_song_co_occur.sum(axis=0)).flatten()

alpha = 0.9
beta = 0.9

print("Predictions begin:")
start = time()
pool = Pool(20)
results = pool.map(cf_predict_evaluation_slim, list(test_list_pid))
pool.close()
pool.join()
print('Time taken:', time() - start)

prediction_result = zeros((len(test_list_pid), max_n_predictions + 1))
prediction_result[:, 0] = test_list_pid
for i in range(len(results)):
    # The original hardcoded columns 1..500, which assumed max_n_predictions == 500.
    prediction_result[i, 1:] = results[i]

savetxt('../submit/cf2submit_v9.txt', prediction_result, delimiter=',')
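
# pool.map above only ships the playlist ids; the worker presumably reads the big
# co-occurrence matrix as a module-level global, which children inherit cheaply on
# fork-based platforms. A self-contained sketch of that pattern (names hypothetical):
from multiprocessing import Pool

import numpy as np

# Large read-only data defined before the Pool is created: on Linux (fork start
# method) workers inherit it without re-pickling it for every task.
CO_OCCUR = np.random.rand(1000, 1000)

def predict_one(pid):
    # Workers read the inherited global; only `pid` and the returned
    # row travel between processes.
    return CO_OCCUR[pid].argsort()[-10:]

if __name__ == '__main__':
    with Pool(4) as pool:
        top_items = pool.map(predict_one, range(100))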
# os.listdir() returns bare names, so the directory check must use the joined path.
tasks = (d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d)))
for i1, task in enumerate(tasks):
    models_path = os.path.join(base_path, task)
    make_title(i1, task)
    i2 = 0
    for model in os.listdir(models_path):
        frameworks_path = os.path.join(models_path, model)
        if os.path.isdir(frameworks_path):
            for script_name in os.listdir(frameworks_path):
                download_script = os.path.join(frameworks_path, script_name)
                if download_script.endswith('download_dataset.sh'):
                    if args.sequential:
                        run_script(download_script)
                    else:
                        scripts.append(download_script)

if not args.sequential:
    from multiprocessing import Pool, cpu_count
    pool = Pool(cpu_count())
    pool.map(run_script, scripts)
    pool.close()
    pool.join()

data = f'Total Time {time.time() - start_all}'
print(data)
experiment.write(data)
experiment.close()
import argparse
import os
import pickle
from multiprocessing import Pool

# `process` (which tensorizes one molecule pair) is defined elsewhere.

parser = argparse.ArgumentParser()
parser.add_argument('--data_file', required=True)
parser.add_argument('--save_dir', required=True)
parser.add_argument('--mode', type=str, default='train')  # 'train'
parser.add_argument('--ncpu', type=int, default=8)
parser.add_argument('--split_len', type=int, default=10000)  # the surplus will be evenly allocated to each split
args = parser.parse_args()

if not os.path.isdir(args.save_dir):
    os.makedirs(args.save_dir)

if args.mode == 'train':
    # dataset contains molecule pairs
    with open(args.data_file) as f:
        data = f.readlines()[1:]

    with Pool(args.ncpu) as pool:
        data = pool.map(process, data)
    # data = [process(item) for item in data]

    num_splits = len(data) // args.split_len
    num_splits = 1 if num_splits == 0 else num_splits
    le = (len(data) + num_splits - 1) // num_splits
    for split_id in range(num_splits):
        st = split_id * le
        sub_data = data[st: st + le]
        with open(os.path.join(args.save_dir, f'tensors-{split_id}.pkl'), 'wb') as f:
            pickle.dump(sub_data, f, pickle.HIGHEST_PROTOCOL)
def validator(proxy, q):
    """Validate a proxy by fetching Baidu through it."""
    url = 'https://www.baidu.com'
    proxies = {
        'http': proxy,
        'https': proxy,
    }
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        if r.status_code == requests.codes.ok:
            print('valid proxy: ' + proxy)
            q.append(proxy)
    except Exception as e:
        print('error:', proxy, e)
    time.sleep(0.5)


if __name__ == '__main__':
    num = int(input('High-anonymity proxies, 100 per page. Enter the number of pages to crawl: '))
    getXiCi(num)
    m = Manager()
    q = m.list()
    p = Pool(len(proxyList) // 2)
    for proxy in proxyList:
        p.apply_async(validator, args=(proxy, q))
    p.close()
    p.join()
    print('Available proxies:')
    print(q)
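
# A Manager().list() works, but every append is routed through the manager process.
# A hedged alternative sketch: have the worker return the proxy (or None) and let
# the parent filter, so no shared state is needed. check_proxy is a hypothetical
# stand-in; the real validator above does an HTTP request.
from multiprocessing import Pool

def check_proxy(proxy):
    # Hypothetical check in place of the HTTP round trip.
    ok = proxy.endswith(':8080')
    return proxy if ok else None

if __name__ == '__main__':
    candidates = ['1.2.3.4:8080', '5.6.7.8:3128', '9.9.9.9:8080']
    with Pool(4) as p:
        valid = [r for r in p.imap_unordered(check_proxy, candidates) if r]
    print('Available proxies:', valid)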
def squad_questions_to_metas(self, pretense, file):
    p = Pool(40)
    queries = pickle.load(open(file, 'rb'))
    queries_meta = p.starmap(Metatext, zip([len(q.question.split()) for q in queries],
                                           [q.question for q in queries],
                                           [q.paragraph_num for q in queries]))
    p.close()
    p.join()
    pickle.dump([queries_meta], open('../../pickled_squad/' + pretense + 'queries_meta.pickle', 'wb'))
    return queries_meta
results = []
fullmeasuredPeriod = []
fullPeriod = []
fullPower = []
fullSigLevel = []
fullMag = []

MagRangearray = np.linspace(17, 24, maglength)
MagRange = [x for x in MagRangearray]
maglist = []
for x in range(len(MagRange)):
    maglist.append([MagRange[x]] * 7)

newlist = Magnitudes.mag1929

pool = Pool(processors)
for h in range(startnumber, endnumber):
    print(newlist[h])
    results.append(pool.map(partial(lombScargle, objectmag=newlist[h]), FrangeLoop))

# Transpose the per-run result tuples into six flat columns.
twoDlist = [[], [], [], [], [], []]
for X in range(len(results)):
    for Y in range(len(results[X])):
        for k in range(6):
            twoDlist[k].append(results[X][Y][k])

with open(inFile, 'r') as istr:
    with open(outFile, 'w') as ostr:
assert len(val_idx) + len(test_idx) + len(train_idx) == 30000
assert len(val_idx) == 2993
assert len(test_idx) == 2824
assert len(train_idx) == 24183


def process(item):
    count, (d, idx, x) = item
    copyfile(os.path.join(s_label, str(idx) + '.png'),
             os.path.join(d, str(count) + '.png'))
    resize_and_write(idx, d, count)


with Pool(processes=os.cpu_count() // 2) as pool:
    for i in tqdm.tqdm(pool.imap_unordered(process, val_idx), 'val', total=len(val_idx)):
        pass
    for i in tqdm.tqdm(pool.imap_unordered(process, test_idx), 'test', total=len(test_idx)):
        pass
    for i in tqdm.tqdm(pool.imap_unordered(process, train_idx), 'train', total=len(train_idx)):
        pass

# def process_one_line(idx, x):
#     x = int(x)
# read all MPdata
MPdata_all = pd.read_csv("./MPdata_all/MPdata_all.csv", sep=';', header=0, index_col=None)

# show statistics of original data
if False:
    print('show statistics of original data')
    check_properties(data=MPdata_all)
else:
    print('size of original data:', MPdata_all.shape[0])

# check crystal symmetry
if False:
    print('\nchecking crystal symmetry match on all {} data'.format(MPdata_all.shape[0]))
    sym_thresh = 0.1
    nworkers = multiprocessing.cpu_count()
    pool_Xsys = Pool(processes=nworkers)
    df_split = np.array_split(MPdata_all, nworkers)
    args = [(data, sym_thresh) for data in df_split]
    MPdata_all = pd.concat(pool_Xsys.starmap(check_crystal_system, args), axis=0)
    pool_Xsys.close()
    pool_Xsys.join()
    print('size of data with matched crystal symmetry:', MPdata_all.shape[0])
else:
    print('\nskip checking..')

drop_ids = ['mp-18828', 'mp-12843', 'mp-20811']
MPdata_all = MPdata_all[~MPdata_all['material_id'].isin(drop_ids)]
print('size of data with matched crystal system:', MPdata_all.shape[0])

# make dataset directory
root_dir = './datasets/'
if not os.path.exists(root_dir):
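
# The disabled branch above uses a common split/starmap/concat recipe for
# parallelizing a per-row check over a DataFrame. A self-contained sketch of the
# same pattern, with a toy check_rows in place of check_crystal_system:
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd

def check_rows(df, thresh):
    # Toy stand-in: keep rows whose value exceeds the threshold.
    return df[df['value'] > thresh]

if __name__ == '__main__':
    data = pd.DataFrame({'value': np.random.rand(10_000)})
    nworkers = cpu_count()
    splits = np.array_split(data, nworkers)
    with Pool(nworkers) as pool:
        data = pd.concat(pool.starmap(check_rows, [(s, 0.5) for s in splits]), axis=0)
    print(len(data))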
def main():
    print('\n******************************')
    print(' MultiFit v.' + defPar.version)
    print('******************************')

    try:
        opts, args = getopt.getopt(sys.argv[1:], "bftmitph:",
                                   ["batch", "file", "type", "map", "input-par", "test", "plot", "help"])
    except getopt.GetoptError:
        # print help information and exit:
        usage()
        sys.exit(2)

    # If parameter file not present, make one
    if not exists(defPar.inputParFile):
        print('\n Init parameter not found. Generating a new one...')
        genInitPar()

    # If summary file is not present, make it and fill header
    makeHeaderSummary()

    if defPar.multiproc:
        print('\n Multiprocessing enabled: ' + str(defPar.numProc) + '/' + str(mp.cpu_count()) + ' CPUs\n')
    else:
        print('\n Multiprocessing disabled\n')

    for o, a in opts:
        if o in ("-b", "--batch"):
            try:
                type = sys.argv[2]
            except:
                usage()
                sys.exit(2)
            type = int(sys.argv[2])
            i = 0
            if defPar.multiproc:
                p = Pool(defPar.numProc)
                for f in glob.glob('*.txt'):
                    if f != 'summary.txt':
                        rs = readSingleSpectra(f)
                        p.apply_async(calculate, args=(rs.x, rs.y, '0', '0', f, type, False, False, i))
                        i += 1
                p.close()
                p.join()
            else:
                for f in glob.glob('*.txt'):
                    if f != 'summary.txt':
                        rs = readSingleSpectra(f)
                        calculate(rs.x, rs.y, '0', '0', f, type, False, False, i)
                        i += 1
            addBlankLine(defPar.summary)

        elif o in ("-f", "--file"):
            try:
                type = sys.argv[3]
            except:
                usage()
                sys.exit(2)
            file = str(sys.argv[2])
            type = int(sys.argv[3])
            rs = readSingleSpectra(file)
            calculate(rs.x, rs.y, '0', '0', file, type, False, True, '')

        elif o in ("-p", "--plot"):
            if len(sys.argv) < 3:
                if defPar.multiproc:
                    p = Pool(defPar.numProc)
                    for f in glob.glob('*.txt'):
                        if f != 'summary.txt':
                            rs = readSingleSpectra(f)
                            print("Saving plot for: " + f)
                            p.apply_async(plotData, args=(rs.x, rs.y, f, False))
                    p.close()
                    p.join()
                else:
                    for f in glob.glob('*.txt'):
                        if f != 'summary.txt':
                            rs = readSingleSpectra(f)
                            print("Saving plot for: " + f)
                            plotData(rs.x, rs.y, f, False)
            else:
                file = str(sys.argv[2])
                rs = readSingleSpectra(file)
                plotData(rs.x, rs.y, file, True)

        elif o in ("-m", "--map"):
            try:
                type = sys.argv[3]
            except:
                usage()
                sys.exit(2)
            file = str(sys.argv[2])
            type = int(sys.argv[3])
            rm = readMap(file)
            map = Map()
            i = 0
            if defPar.multiproc:
                p = Pool(defPar.numProc)
                for i in range(1, rm.num_lines):
                    p.apply_async(calculate, args=(rm.x, rm.y[i], rm.x1[i], rm.y1[i], file, type, True, False, i))
                p.close()
                p.join()
                #map.draw(os.path.splitext(file)[0] + '_map.txt', True)
            else:
                for i in range(1, rm.num_lines):
                    calculate(rm.x, rm.y[i], rm.x1[i], rm.y1[i], file, type, True, False, i)
                #map.draw(os.path.splitext(file)[0] + '_map.txt', True)

        elif o in ("-t", "--test"):
            file = str(sys.argv[2])
            map = Map()
            #map.readCoord(os.path.splitext(file)[0] + '_map.txt')
            map.draw(os.path.splitext(file)[0] + '_map.txt', True)

        elif o in ("-i", "--input-par"):
            genInitPar()

        else:
            usage()
            sys.exit(2)
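
# The batch and map branches above fire apply_async without keeping the
# AsyncResult, so an exception inside calculate would vanish silently. A hedged
# sketch of one way to surface such failures using error_callback (available on
# apply_async since Python 3.2); calculate_one is a hypothetical worker.
from multiprocessing import Pool

def calculate_one(i):
    # Hypothetical worker; raises for one input to show error handling.
    if i == 3:
        raise ValueError('bad spectrum')
    return i * i

if __name__ == '__main__':
    failures = []
    with Pool(4) as p:
        for i in range(6):
            p.apply_async(calculate_one, (i,), error_callback=failures.append)
        p.close()
        p.join()
    print('failures:', failures)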
from multiprocessing import Pool
import time


def sayHi(num):
    num += 1
    print("process %d start" % num)
    time.sleep(10 / num)
    print("process %d end" % num)
    return num * num


if __name__ == '__main__':
    p = Pool(processes=5)
    result_list = []
    for i in range(30):
        result_list.append(p.apply_async(sayHi, [i]))

    # get() blocks until each result is ready, so close()/join() are not
    # strictly required here; they are left commented as in the original.
    # p.close()
    # p.join()

    id = 0
    for res in result_list:
        print("wait %d" % id)
        print(res.get())
        id += 1
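
# Calling get() in submission order means a slow early task delays printing later
# ones that may already be done. A hedged sketch using a callback to collect
# results in completion order instead:
from multiprocessing import Pool
import random
import time

def work(num):
    time.sleep(random.random())  # finish in random order
    return num * num

if __name__ == '__main__':
    done = []
    with Pool(processes=5) as p:
        for i in range(10):
            # The callback runs in the parent as each result arrives.
            p.apply_async(work, (i,), callback=done.append)
        p.close()
        p.join()
    print(done)  # completion order, not submission order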
# =========================================
# Filtering II - Position wise filtering
# =========================================

pars_toprint = '-m{}-d{}-b{}-c{}-p{}'.format(int(args.m), int(args.d), int(args.b), int(args.c), float(args.p))

filt_folder = args.projdir + '/filtered' + pars_toprint + '/'
if not os.path.exists(filt_folder):
    os.makedirs(filt_folder)
    os.makedirs(filt_folder + '/pop/')
else:
    shutil.rmtree(filt_folder)
    os.makedirs(filt_folder)
    os.makedirs(filt_folder + '/pop/')

p = Pool(processes=args.n_threads)
partial_Div = partial(filter_two, args=args,
                      snp_files=glob.glob(args.projdir + '/snpCaller/called*'),
                      outdir=filt_folder + '/pop')
p.map(partial_Div, samples_of_interest.keys())
p.close()
p.join()

if args.ind:
    if not os.path.exists(filt_folder + '/ind/'):
        os.makedirs(filt_folder + '/ind/')
    p = Pool(processes=args.n_threads)
    partial_Div = partial(filter_two, args=args,
                          snp_files=glob.glob(args.projdir + '/snpCaller/indiv*'),
async def get_poi_information():
    """Retrieve POI information for an array of ids"""
    ids = await request.get_json()

    if len(ids) > 100:
        abort(400, description='You can send at most 100 ids at once.')

    pool = Pool(processes=math.ceil(len(ids) / 3))
    results = list()

    def parse_result(r):
        data = r['data'][6]

        name = get_nested_value(data, 11)
        place_id = get_nested_value(data, 78)
        lat = round(get_nested_value(data, 9, 2), 7)  # 7 digits equals a precision of 1 cm
        lng = round(get_nested_value(data, 9, 3), 7)  # 7 digits equals a precision of 1 cm
        # noinspection PyUnresolvedReferences
        h3_index = h3.geo_to_h3(lat, lng, POI_RESOLUTION)
        address = get_nested_value(data, 2)
        timezone = get_nested_value(data, 30)
        categories = [t[0] for t in (get_nested_value(data, 76) or [])]
        opening_hours = parse_opening_hours(get_nested_value(data, 34, 1))
        permanently_closed = get_nested_value(data, 88, 0) == 'CLOSED'
        temporarily_closed = get_nested_value(data, 96, 5, 0, 2) == 'Reopen this place' and not permanently_closed
        inside_of = get_nested_value(data, 93, 0, 0, 0, 1)
        phone = get_nested_value(data, 178, 0, 3)
        website = get_nested_value(data, 7, 0)
        rating_stars = get_nested_value(data, 4, 7)
        rating_number_of_reviews = get_nested_value(data, 4, 8)
        price_level = get_nested_value(data, 4, 2)
        popularity_data = get_nested_value(data, 84, 0)
        spending_time = parse_spending_time_data(get_nested_value(data, 117, 0))

        popularity, waiting_time = None, None
        if popularity_data:
            popularity, waiting_time = parse_popularity_data(popularity_data, timezone)

        return dict(
            id=r['id'],
            data=dict(
                name=name,
                placeID=place_id,
                location=dict(lat=lat, lng=lng),
                h3Index=h3_index,
                address=address,
                timezone=timezone,
                categories=categories,
                temporarilyClosed=temporarily_closed,
                permanentlyClosed=permanently_closed,
                insideOf=inside_of,
                contact=dict(phone=phone, website=website),
                openingHours=opening_hours,
                rating=dict(stars=rating_stars, numberOfReviews=rating_number_of_reviews),
                priceLevel=len(price_level) if price_level else None,
                popularity=popularity,
                waitingTime=waiting_time,
                spendingTime=spending_time
            )
        )

    for result in pool.imap(google.get_by_id, ids):
        results.append(parse_result(result))
    pool.close()
    pool.join()

    return jsonify({'success': True, 'data': results})
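
# Spinning up a fresh Pool for every request pays process-startup cost each time,
# and draining imap blocks the event loop. A hedged sketch of a common alternative,
# a long-lived executor created once at startup and driven via run_in_executor
# (framework wiring omitted; fetch_by_id is a hypothetical stand-in for
# google.get_by_id):
from concurrent.futures import ProcessPoolExecutor
import asyncio

EXECUTOR = ProcessPoolExecutor(max_workers=8)  # created once at startup

def fetch_by_id(poi_id):
    # Hypothetical stand-in for the real lookup.
    return {'id': poi_id, 'data': None}

async def lookup_many(ids):
    loop = asyncio.get_running_loop()
    # run_in_executor keeps the event loop free while workers fetch.
    tasks = [loop.run_in_executor(EXECUTOR, fetch_by_id, i) for i in ids]
    return await asyncio.gather(*tasks)

if __name__ == '__main__':
    print(asyncio.run(lookup_many([1, 2, 3])))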
def update(self, corpus, chunks_as_numpy=False):
    """Train the model with new documents, by EM-iterating over `corpus` until the topics converge
    (or until the maximum number of allowed iterations is reached). `corpus` must be an iterable.

    The E step is distributed into the several processes.

    Notes
    -----
    This update also supports updating an already trained model (`self`) with new documents from `corpus`;
    the two models are then merged in proportion to the number of old vs. new documents.
    This feature is still experimental for non-stationary input streams.

    For stationary input (no topic drift in new documents), on the other hand, this equals the
    online update of Hoffman et al. and is guaranteed to converge for any `decay` in (0.5, 1.0].

    Parameters
    ----------
    corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional
        Stream of document vectors or sparse matrix of shape (`num_terms`, `num_documents`) used to
        update the model.
    chunks_as_numpy : bool
        Whether each chunk passed to the inference step should be a np.ndarray or not. Numpy can in some settings
        turn the term IDs into floats, these will be converted back into integers in inference, which incurs a
        performance hit. For distributed computing it may be desirable to keep the chunks as `numpy.ndarray`.

    """
    try:
        lencorpus = len(corpus)
    except TypeError:
        logger.warning("input corpus stream has no len(); counting documents")
        lencorpus = sum(1 for _ in corpus)
    if lencorpus == 0:
        logger.warning("LdaMulticore.update() called with an empty corpus")
        return

    self.state.numdocs += lencorpus

    if not self.batch:
        updatetype = "online"
        updateafter = self.chunksize * self.workers
    else:
        updatetype = "batch"
        updateafter = lencorpus
    evalafter = min(lencorpus, (self.eval_every or 0) * updateafter)

    updates_per_pass = max(1, lencorpus / updateafter)
    logger.info(
        "running %s LDA training, %s topics, %i passes over the supplied corpus of %i documents, "
        "updating every %i documents, evaluating every ~%i documents, "
        "iterating %ix with a convergence threshold of %f",
        updatetype, self.num_topics, self.passes, lencorpus, updateafter,
        evalafter, self.iterations, self.gamma_threshold
    )

    if updates_per_pass * self.passes < 10:
        logger.warning(
            "too few updates, training might not converge; "
            "consider increasing the number of passes or iterations to improve accuracy"
        )

    job_queue = Queue(maxsize=2 * self.workers)
    result_queue = Queue()

    # rho is the "speed" of updating; TODO try other fncs
    # pass_ + num_updates handles increasing the starting t for each pass,
    # while allowing it to "reset" on the first pass of each update
    def rho():
        return pow(self.offset + pass_ + (self.num_updates / self.chunksize), -self.decay)

    logger.info("training LDA model using %i processes", self.workers)
    pool = Pool(self.workers, worker_e_step, (job_queue, result_queue,))
    for pass_ in xrange(self.passes):
        queue_size, reallen = [0], 0
        other = LdaState(self.eta, self.state.sstats.shape)

        def process_result_queue(force=False):
            """Clear the result queue, merging all intermediate results, and update the
            LDA model if necessary.

            """
            merged_new = False
            while not result_queue.empty():
                other.merge(result_queue.get())
                queue_size[0] -= 1
                merged_new = True
            if (force and merged_new and queue_size[0] == 0) or (not self.batch and (other.numdocs >= updateafter)):
                self.do_mstep(rho(), other, pass_ > 0)
                other.reset()
                if self.eval_every is not None and \
                        ((force and queue_size[0] == 0) or
                         (self.eval_every != 0 and (self.num_updates / updateafter) % self.eval_every == 0)):
                    self.log_perplexity(chunk, total_docs=lencorpus)

        chunk_stream = utils.grouper(corpus, self.chunksize, as_numpy=chunks_as_numpy)
        for chunk_no, chunk in enumerate(chunk_stream):
            reallen += len(chunk)  # keep track of how many documents we've processed so far

            # put the chunk into the workers' input job queue
            chunk_put = False
            while not chunk_put:
                try:
                    job_queue.put((chunk_no, chunk, self), block=False, timeout=0.1)
                    chunk_put = True
                    queue_size[0] += 1
                    logger.info(
                        "PROGRESS: pass %i, dispatched chunk #%i = documents up to #%i/%i, "
                        "outstanding queue size %i",
                        pass_, chunk_no, chunk_no * self.chunksize + len(chunk), lencorpus, queue_size[0]
                    )
                except queue.Full:
                    # in case the input job queue is full, keep clearing the
                    # result queue, to make sure we don't deadlock
                    process_result_queue()

            process_result_queue()
        # endfor single corpus pass

        # wait for all outstanding jobs to finish
        while queue_size[0] > 0:
            process_result_queue(force=True)

        if reallen != lencorpus:
            raise RuntimeError("input corpus size changed during training (don't use generators as input)")
    # endfor entire update

    pool.terminate()
def current(alfa, k, suma, xmin, xmax, nx, ymin, ymax, ny):
    # The original used time.clock(), which was removed in Python 3.8.
    tic = time.perf_counter()

    mypath = 'Results/' + 'alfa=' + alfa + '/' + 'k=' + k
    if not os.path.isdir(mypath):
        os.makedirs(mypath)

    path_x = mypath + '/' + 'x_coordinate_current.txt'  # array of x coordinates saved in a file
    path_y = mypath + '/' + 'y_coordinate_current.txt'  # array of y coordinates saved in a file
    path_current_x = mypath + '/' + 'current_x.txt'  # x component of current for given coordinates (x,y)
    path_current_y = mypath + '/' + 'current_y.txt'  # y component of current for given coordinates (x,y)

    # Initializing arrays of current with nx rows and ny columns, filled with 0's
    current_x = np.zeros((nx, ny), dtype=np.float64)
    current_y = np.zeros((nx, ny), dtype=np.float64)

    x = np.linspace(xmin, xmax, nx)
    y = np.linspace(ymin, ymax, ny)

    # Initializing list which will become a stack for tasks to be done
    stack = []

    # Setting number of processes used in parallel computing
    nproc = 8

    # This condition is required on Windows
    if __name__ == '__main__':
        p = Pool(nproc)
        for h in range(0, nx):
            for j in range(0, ny):
                # Appending tasks to stack list
                stack.append((h, j))

        while len(stack):
            # Takes the last nproc elements from the stack list
            temps = stack[-nproc:]
            # Creates an nproc-element list of parameter tuples
            xys = [(x[i], y[j], float(alfa), float(k), int(suma)) for i, j in temps]
            # Results of last nproc tasks evaluated in parallel
            results_x = p.map(current_x_formula, xys)
            results_y = p.map(current_y_formula, xys)
            for proc in range(len(temps)):
                # Storing last nproc results in the 2D arrays
                current_x[temps[proc][1], temps[proc][0]] = results_x[proc]
                current_y[temps[proc][1], temps[proc][0]] = results_y[proc]
            # Deleting last nproc tasks from stack
            del stack[-nproc:]

        # Saving x coordinates, y coordinates, current matrices in txt files
        np.savetxt(path_x, x, delimiter=',')
        np.savetxt(path_y, y, delimiter=',')
        np.savetxt(path_current_x, current_x, delimiter=',')
        np.savetxt(path_current_y, current_y, delimiter=',')

        outfile = open(mypath + '/' + 'info_current.txt', 'w')
        outfile.write('alfa = ' + alfa + '\n')
        outfile.write('k = ' + k + '\n')
        outfile.write('Number of elements = ' + str(2 * int(suma) + 1) + '\n')
        outfile.write('xmin = ' + str(xmin) + '\n')
        outfile.write('xmax = ' + str(xmax) + '\n')
        outfile.write('Density of points on x axis = ' + str(nx) + '\n')
        outfile.write('ymin = ' + str(ymin) + '\n')
        outfile.write('ymax = ' + str(ymax) + '\n')
        outfile.write('Density of points on y axis = ' + str(ny) + '\n')

        toc = time.perf_counter()
        outfile.write('Evaluating time = ' + str(toc - tic) + '\n')
        outfile.close()

        # Printing time elapsed on evaluating
        print(toc - tic)

        # Creates streamplot for current
        normalising_factor = np.sqrt(current_x**2 + current_y**2)
        plt.figure()
        plt.streamplot(x, y, current_x, current_y, color=normalising_factor, cmap=cm.jet,
                       linewidth=2, arrowstyle='->', arrowsize=1.5)
        plt.colorbar()
        # Saves that plot
        plt.savefig(mypath + '/' + 'current.png')
        # And shows it
        plt.show()
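
# Dispatching each batch twice above (once per component) serializes the same xys
# tuples twice. A hedged sketch of the usual fix: one worker that returns both
# components, mapped over all points in a single call. current_xy_formula is a
# hypothetical toy integrand standing in for the real formulas.
from multiprocessing import Pool

import numpy as np

def current_xy_formula(params):
    # Hypothetical stand-in returning both components at once.
    px, py = params
    return np.cos(px) * py, np.sin(py) * px

if __name__ == '__main__':
    nx, ny = 64, 64
    xs, ys = np.linspace(0, 1, nx), np.linspace(0, 1, ny)
    points = [(xs[h], ys[j]) for h in range(nx) for j in range(ny)]
    with Pool(8) as p:
        both = p.map(current_xy_formula, points)  # one dispatch per point
    jx = np.array([b[0] for b in both]).reshape(nx, ny)
    jy = np.array([b[1] for b in both]).reshape(nx, ny)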
print "==== pretending to run %s (%d entries, %s) ====" % (name, nev, fout) return (name,(nev,0)) print "==== %s starting (%d entries) ====" % (name, nev) booker = Booker(fout) modulesToRun = MODULES if options.modules != []: toRun = {} for m,v in MODULES: for pat in options.modules: if re.match(pat,m): toRun[m] = True modulesToRun = [ (m,v) for (m,v) in MODULES if m in toRun ] el = EventLoop([ VariableProducer(options.treeDir,booker,options.region,sample_nevt,short,modulesToRun), ]) el.loop([tb], eventRange=range) booker.done() fb.Close() time = timer.RealTime() print "=== %s done (%d entries, %.0f s, %.0f e/s) ====" % ( name, nev, time,(nev/time) ) return (name,(nev,time)) if options.jobs > 0: from multiprocessing import Pool pool = Pool(options.jobs) ret = dict(pool.map(_runIt, jobs)) if options.jobs > 0 else dict([_runIt(j) for j in jobs]) else: ret = dict(map(_runIt, jobs)) fulltime = maintimer.RealTime() totev = sum([ev for (ev,time) in ret.itervalues()]) tottime = sum([time for (ev,time) in ret.itervalues()]) print "Done %d tasks in %.1f min (%d entries, %.1f min)" % (len(jobs),fulltime/60.,totev,tottime/60.)
        reduce_contacts_by,
        prob_has_trace_app,
        backwards_trace,
        probable_infections_need_test
    ]
    return parameters + simulation.inf_counts

param_names = [
    "hazard_rate_scale",
    "infection_reporting_prob",
    "contact_tracing_success_prob",
    "contact_trace_delay_par",
    "global_contact_reduction",
    "prob_has_trace_app",
    "backwards_trace",
    "probable_infections_require_test",
    "backwards_tracing_time_limit"
]

col_names = param_names + [str(i) for i in range(days_to_simulate)]
col_names_dict = {}
for i, name in enumerate(col_names):
    col_names_dict[i] = name

if __name__ == '__main__':
    with Pool(10) as p:
        results = p.map(run_simulation, range(repeats))
        results = pd.DataFrame(results)
        results = results.rename(columns=col_names_dict)
        results.to_excel("Data/Simulation Results/UK Model/no_time_lim.xlsx")
from multiprocessing import Pool
import os
import time


def task(task_id):
    start_time = time.time()
    print('task %s in process %s start on %s' % (task_id, os.getpid(), start_time))
    time.sleep(3)
    end_time = time.time()
    print('task %s in process %s end on %s' % (task_id, os.getpid(), end_time))


if __name__ == '__main__':
    pool = Pool(10)
    for i in range(10):
        # apply() blocks until the task finishes, so these ten tasks
        # run one after another despite the ten worker processes.
        pool.apply(task, args=('task' + str(i + 1),))
    pool.close()
    pool.join()
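
# Because apply() blocks, the script above needs roughly 10 x 3 seconds. A hedged
# sketch of the asynchronous variant: apply_async() queues all tasks up front so
# the workers overlap their sleeps (same task function, repeated here to keep the
# sketch self-contained).
from multiprocessing import Pool
import os
import time

def task(task_id):
    print('task %s in process %s start' % (task_id, os.getpid()))
    time.sleep(3)
    print('task %s in process %s end' % (task_id, os.getpid()))

if __name__ == '__main__':
    pool = Pool(10)
    for i in range(10):
        pool.apply_async(task, args=('task' + str(i + 1),))
    pool.close()
    pool.join()  # all ten sleeps overlap: total ~3 seconds instead of ~30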
    w = w.split(":")[0]
    all_read_ids.add(v)
    all_read_ids.add(w)

for v, w in h_asm_G.sg_edges:
    if h_asm_G.sg_edges[(v, w)][-1] != "G":
        continue
    v = v.split(":")[0]
    w = w.split(":")[0]
    all_read_ids.add(v)
    all_read_ids.add(w)

seqs = load_sg_seq(all_read_ids, fasta_fn)

if ctg_id == "all":
    ctg_id_list = p_asm_G.ctg_data.keys()
else:
    ctg_id_list = [ctg_id]

exe_list = []
for ctg_id in ctg_id_list:
    if ctg_id[-1] != "F":
        continue
    if ctg_id not in all_rid_to_phase:
        continue
    exe_list.append((ctg_id, os.path.join(".", ctg_id)))

exec_pool = Pool(24)
exec_pool.map(generate_haplotigs_for_ctg, exe_list)
#map(generate_haplotigs_for_ctg, exe_list)
"""
apply_async
"""
from multiprocessing import Pool, TimeoutError
import time
import os


def f(x):
    return x * x


if __name__ == '__main__':
    # start 4 worker processes
    with Pool(processes=4) as pool:
        # prints "[0, 1, 4, ..., 81]"
        print(pool.map(f, range(10)))

        # prints the same numbers in arbitrary order
        for i in pool.imap_unordered(f, range(10)):
            print(i)

        # evaluate "f(20)" asynchronously
        res = pool.apply_async(f, (20,))  # runs in only one process
        print(res.get(timeout=1))  # prints "400"

        # evaluate "os.getpid()" asynchronously
        res = pool.apply_async(os.getpid, ())  # runs in only one process
        print(res.get(timeout=1))  # prints the PID of that process
#%% construct word_dict with multi-process to speed up
# run this before executing the code in the IPython console so multiprocessing can start
if __name__ == '__main__':
    __spec__ = "ModuleSpec(name='builtins', loader=<class '_frozen_importlib.BuiltinImporter'>)"
    DATA = pd.read_csv('train.csv')
    n_workers = 4
    chunks = [
        ' '.join(DATA['Sentences'][i:i + len(DATA['Sentences']) // n_workers])
        for i in range(0, len(DATA['Sentences']), len(DATA['Sentences']) // n_workers)
    ]
    pool = Pool(processes=n_workers)
    result = pool.map_async(word_tokenize, chunks)  # tokenize using nltk.word_tokenize
    words = set(sum(result.get(), []))
    pool.close()
    pool.join()

    word_dict = {'<pad>': 0}
    for word in words:
        word_dict[word] = len(word_dict)

#%% Train w2v
TRAIN_corpus = DATA['TOKEN'].values

# settings
vector_dim = 64
window_size = 5
min_count = 1
training_iter = 20
#print(input_, alg, output_path, dataset, procs)
df_input = pd.read_csv(input_, sep='|')
#df_input = df_input.sample(100)
#print(df_input)

data = df_input['bow_preproc'].apply(lambda x: np.str_(x))
labels = df_input['classes']
filenames = df_input['file_name']
#print("Database: ", np.shape(X), np.unique(y, return_counts=True))

# Non-random K-fold
#kf = KFold(n_splits=10, shuffle=False)

# Stratified random K-fold
#kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

# Non-stratified random K-fold
kf = KFold(n_splits=10, shuffle=True, random_state=1234)

p = Pool(processes=procs)

if alg == 0:
    for variant in ("tf", "tfidf"):
        for a in (1, 2, 3, 4):
            start_job(a, variant, dataset, pos_label, neg_label)
else:
    start_job(alg, "tf", dataset, pos_label, neg_label)
    start_job(alg, "tfidf", dataset, pos_label, neg_label)
def main(argv):
    g_resultNameBase = argv[0]
    new_command = argv[1]  # apply to all
    g_group_command_16nodes = argv[2]
    g_group_command_36nodes = argv[3]
    g_group_command_100nodes = argv[4]

    def options(sim_type):
        return {
            # --resultType=3 --noOfRuns=10 --ctrlNoOfRuns=1
            # 0: Grid, ndad 2
            0: g_common_commands + g_resultNameBase + "-grid-ndad2 " + '--topology=0 --nDADcount=2 ',
            # 1: Cross-Grid, ndad3
            1: g_common_commands + g_resultNameBase + "-crossGrid-ndad3 " + '--topology=1 --nDADcount=3 ',
            # 2: Uniform-Disc, ndad2
            2: g_common_commands + g_resultNameBase + "-uDisc-ndad2 " + '--topology=3 --nDADcount=2 ',
            # 3: Grid, ndad3
            3: g_common_commands + g_resultNameBase + "-grid-ndad3 " + '--topology=0 --nDADcount=3 ',
            # 4: Cross-Grid, ndad4
            4: g_common_commands + g_resultNameBase + "-crossGrid-ndad4 " + '--topology=1 --nDADcount=4 ',
            # 5: Uniform-Disc, ndad3
            5: g_common_commands + g_resultNameBase + "-uDisc-ndad3 " + '--topology=3 --nDADcount=3 ',
        }.get(sim_type, str('ERROR'))  # "ERROR" is the default if sim_type is not found

    log_folder = g_pfp + "simul_logs/" + g_resultNameBase + "/"
    os.system("mkdir -p " + log_folder)

    # This is the list with the parameters which will change in your simulations; it is split by the
    # multiprocessing library and fed to the Pool, which runs the simulations on as many worker
    # processes as you want.
    sim_command_list = []
    for x in range(0, 6):
        # touch the dump file where the log will be written
        log_file = log_folder + "log_" + g_resultNameBase + "_" + str(x) + ".out"
        os.system("touch " + log_file)
        if x < 3:
            temp_command = (options(x) + new_command + " " + "--resultGroupName=" + g_resultNameBase
                            + " " + g_group_command_16nodes + ' " > ' + log_file + " 2>&1")
        elif x < 6:
            temp_command = (options(x) + new_command + " " + "--resultGroupName=" + g_resultNameBase
                            + " " + g_group_command_36nodes + ' " > ' + log_file + " 2>&1")
        else:  # unreachable for range(0, 6); kept for the 100-node variant
            temp_command = (options(x) + new_command + " " + "--resultGroupName=" + g_resultNameBase
                            + " " + g_group_command_100nodes + ' " > ' + log_file + " 2>&1")
        sim_command_list.append([temp_command, str(x)])

    # Specify here how many worker processes you want
    pool = Pool(processes=6)
    # Give here the name of your function which can run one simulation with a given parameter,
    # and a list of parameters which will be split among the workers.
    pool.map(run_single_sim, sim_command_list)
def prepareTilemap():
    global mapData
    global heatFractal
    global moistureFractal
    global r_offset
    global heat_random
    global moisture_random

    r_offset = random.random() * 1234
    heat_random = random.random() * 1234
    moisture_random = random.random() * 1234

    mapData = MapData(g.mapx, g.mapy)
    heatFractal = [[0 for _ in range(g.mapy)] for _ in range(g.mapx)]
    moistureFractal = [[0 for _ in range(g.mapy)] for _ in range(g.mapx)]

    tile_list = []
    for x in range(g.mapx):
        for y in range(g.mapy):
            tile_list.append((r_offset, heat_random, moisture_random, x, y))

    with Pool() as p:
        squares = p.starmap(setMapDataXY, tile_list)

    for N, sq in enumerate(squares):
        # tile_list was filled row-major over (x, y), so recover the indices from
        # g.mapy; the original hardcoded 512, which assumed g.mapy == 512.
        x = N // g.mapy
        y = N % g.mapy
        mapData.data[x][y] = sq[0]
        heatFractal[x][y] = sq[1]
        moistureFractal[x][y] = sq[2]
        if sq[0] > mapData.maxValue:
            mapData.maxValue = sq[0]
        if sq[0] < mapData.minValue:
            mapData.minValue = sq[0]
    timepunch("Initial map data: ")

    tile_list = []
    for x in range(g.mapx):
        for y in range(g.mapy):
            hval = (mapData.data[x][y] - mapData.minValue) / (mapData.maxValue - mapData.minValue)
            tile_list.append((hval, heatFractal[x][y], moistureFractal[x][y], x, y))

    with Pool() as p:
        tiles = p.starmap(setTile, tile_list)

    for N, tile in enumerate(tiles):
        x = N // g.mapy
        y = N % g.mapy
        g.tiles[x][y] = tile
    updateNeighbours()
    timepunch("Tile stuff: ")

    if g.have_savefile:
        f = get_tar_data('lands.dat')
        json_str = f.decode('utf-8')
        json_lands = json.loads(json_str)
        for land in json_lands:
            our_land = TileGroup()
            our_land.tiles = land['tiles']
            our_land.area = float(land['area'])
            g.lands.append(our_land)
        maxmin = json.loads(get_tar_data('map.dat').decode('utf-8'))
        mapData.maxValue = maxmin['max']
        mapData.minValue = maxmin['min']
        #updateBitmasks()
    else:
        #updateBitmasks()
        floodFill()
        timepunch("Flood filling: ")
        json_lands = []
        for land in g.lands:
            our_dict = {}
            our_dict['tiles'] = land.tiles
            our_dict['area'] = land.area
            json_lands.append(our_dict)
        json_str = json.dumps(json_lands)
        json_bytes = json_str.encode('utf-8')
        add_to_tarfile((json_bytes, "lands.dat"))
        json_str = json.dumps({'max': mapData.maxValue, 'min': mapData.minValue})
        json_bytes = json_str.encode('utf-8')
        add_to_tarfile((json_bytes, "map.dat"))
from multiprocessing import Pool, TimeoutError
import time
import os


def f(x):
    return x * x


if __name__ == '__main__':
    pool = Pool(processes=4)  # start 4 worker processes

    # print "[0, 1, 4, ..., 81]"
    print(pool.map(f, range(10)))

    # print same numbers in arbitrary order
    for i in pool.imap_unordered(f, range(10)):
        print(i, end=' ')
    print()

    # evaluate "f(20)" asynchronously
    res = pool.apply_async(f, (20,))  # runs in *only* one process
    print(res.get(timeout=1))  # prints "400"

    # evaluate "os.getpid()" asynchronously
    res = pool.apply_async(os.getpid, ())  # runs in *only* one process
    print(res.get(timeout=1))  # prints the PID of that process

    # launching multiple evaluations asynchronously *may* use more processes
    multiple_results = [pool.apply_async(os.getpid, ()) for i in range(4)]
    print([res.get(timeout=1) for res in multiple_results])

    # make a single worker sleep for 10 secs
    # plt.show()
    plt.close()
    return X, y, count


def get_features(args):
    i = args[0]
    data = args[1]
    img = ds.get_image(data[i][1])
    label = data[i][0]
    sift = cv2.xfeatures2d.SIFT_create(500)
    kp, des = sift.detectAndCompute(img, None)
    return [des, label]


pool = Pool(os.cpu_count())

print("Training")
features = pool.map(get_features, [(i, data_train) for i in range(len(data_train))])
X, y, count = vector_quantization_train(features)
print("Number of features: ", count)

clf = LinearSVC()
clf.fit(X, y)

print("Testing")
features = pool.map(get_features, [(i, data_test) for i in range(len(data_test))])
X, y, count = vector_quantization(features)
print("Number of features: ", count)

predictions = clf.predict(X)
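
# Each task tuple above carries the full data_train list, so the pool pickles the
# whole dataset into every chunk it sends. A hedged sketch of the leaner pattern:
# pass only the index and let workers read the dataset from a module-level global
# (toy dataset here; real code would load images and compute descriptors).
from multiprocessing import Pool

# Module-level dataset, built before the Pool is created: fork-based workers
# inherit it instead of receiving a pickled copy with every task.
DATA_TRAIN = [('label%d' % i, 'image%d.png' % i) for i in range(1000)]

def get_features_by_index(i):
    label, path = DATA_TRAIN[i]
    # Real code would load `path` and compute SIFT descriptors here.
    return [len(path), label]

if __name__ == '__main__':
    with Pool() as pool:
        features = pool.map(get_features_by_index, range(len(DATA_TRAIN)))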
def main():
    pool = Pool(processes=3)
    for i in range(30):
        # apply() blocks until f(i) returns, so the 30 calls run serially;
        # apply_async() would overlap them across the 3 workers.
        pool.apply(f, (i,))
    pool.close()
    pool.join()
def make_graph(env, sampler, connection_radius, num_vertices, lazy=False, saveto='graph.pkl'):
    """
    Returns a graph on the passed environment.
    All vertices in the graph must be collision-free.

    Graph should have node attribute "config" which keeps a configuration in tuple.
    E.g., for adding vertex "0" with configuration np.array(0, 1),
    G.add_node(0, config=tuple(config))

    To add edges to the graph, call
    G.add_weighted_edges_from([edges])
    where edges is a list of tuples (node_i, node_j, weight),
    where weight is the distance between the two nodes.

    @param env: Map Environment for graph to be made on
    @param sampler: Sampler to sample configurations in the environment
    @param connection_radius: Maximum distance to connect vertices
    @param num_vertices: Minimum number of vertices in the graph.
    @param lazy: If true, edges are made without checking collision.
    @param saveto: File to save graph and the configurations

    @returns an undirected weighted graph G where each node is a tuple (x, y)
    and the edge data is the distance between nodes.
    """
    print('dubins graph maker')
    G = nx.DiGraph()
    numberOfThreads = 1
    pool = Pool(processes=numberOfThreads)

    # Implement here
    # TODO: Code needs to be restructured, maybe vectorized?
    # 1. Sample vertices
    vertices = sampler.sample(num_vertices)
    edges = []

    # 2. Connect them with edges
    start_time = time.time()
    for vid in tqdm(range(len(vertices))):
        vertex = tuple(vertices[vid])
        G.add_node(vertex)
        #distances = env.compute_distances(vertices[vid], vertices)
        distances = compute_distance_parallel(numberOfThreads, pool, vertices[vid], vertices)
        for vid2 in range(len(vertices)):
            if vid != vid2:
                dist = distances[vid2]
                if (dist < connection_radius) and (
                        lazy or env.edge_validity_checker(vertices[vid], vertices[vid2])[0]):
                    edges.append((vertex, tuple(vertices[vid2]), dist))
        if vid % 100 == 0:
            print('cost time', time.time() - start_time)

    G.add_weighted_edges_from(edges)

    # Check for connectivity.
    #num_connected_components = len(list(nx.connected_components(G)))
    #if not num_connected_components == 1:
    #    print("warning, Graph has {} components, not connected".format(num_connected_components))

    # Save the graph to reuse.
    if saveto is not None:
        data = dict(G=G)
        pickle.dump(data, open(saveto, 'wb'))
        print('Saved the graph to {}'.format(saveto))
    return G
def predict(ds, **kwargs):
    global db
    print(os.getcwd())
    data = []
    ls = []
    pred = False
    if pred:
        for doc in col.distinct('ad_creative_body', {'marked': 0}):
            if len(data) < 2000:
                data.append({'ad_creative_body': doc})
            else:
                break
    if len(data) > 0:
        print(len(data))
        df = pd.DataFrame(data)
        df['label'] = 0
        dev_df_bert = pd.DataFrame({
            'id': range(len(df)),
            'label': df['label'],
            'alpha': ['a'] * df.shape[0],
            'text': df['ad_creative_body'].replace(r'\n', ' ', regex=True)
        })
        dev_df_bert.to_csv('./home/jay/airflow/dags/data/dev.tsv', sep='\t', index=False, header=False)

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # This is where BERT will look for pre-trained models to load parameters from.
        CACHE_DIR = './home/jay/airflow/dags/cache/'
        # The maximum total input sequence length after WordPiece tokenization.
        # Sequences longer than this will be truncated, and sequences shorter than this will be padded.
        MAX_SEQ_LENGTH = 128
        TRAIN_BATCH_SIZE = 24
        EVAL_BATCH_SIZE = 8
        LEARNING_RATE = 2e-5
        RANDOM_SEED = 42
        GRADIENT_ACCUMULATION_STEPS = 1
        WARMUP_PROPORTION = 0.1
        OUTPUT_MODE = 'classification'
        NUM_TRAIN_EPOCHS = 1
        CONFIG_NAME = "config.json"
        WEIGHTS_NAME = "pytorch_model.bin"
        Data = 'FB16'
        DATA_DIR = "./home/jay/airflow/dags/data/"
        categories = ["Attack", "Advocacy", "CTA", "Issue", "Image"]
        # categories = ["Attack", "Advocacy", "Ceremonial", "CTA", "CI", "Image", "Issue"]
        for Category in categories:
            print(Category)
            TASK_NAME = Data + Category
            BERT_MODEL = TASK_NAME + '.tar.gz'
            # The output directory where the fine-tuned model and checkpoints will be written.
            OUTPUT_DIR = './home/jay/airflow/dags/outputs/' + TASK_NAME + '/'
            tokenizer = BertTokenizer.from_pretrained(OUTPUT_DIR + 'vocab.txt', do_lower_case=False)
            processor = BinaryClassificationProcessor()
            eval_examples = processor.get_dev_examples(DATA_DIR)
            label_list = processor.get_labels()  # [0, 1] for binary classification
            num_labels = len(label_list)
            eval_examples_len = len(eval_examples)
            label_map = {label: i for i, label in enumerate(label_list)}
            eval_examples_for_processing = [(example, label_map, MAX_SEQ_LENGTH, tokenizer, OUTPUT_MODE)
                                            for example in eval_examples]
            process_count = cpu_count() - 1
            # if __name__ == '__main__':
            #     print(f'Preparing to convert {eval_examples_len} examples..')
            #     print(f'Spawning {process_count} processes..')
            with Pool(process_count) as p:
                eval_features = list(p.imap(convert_example_to_feature, eval_examples_for_processing))

            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=EVAL_BATCH_SIZE)

            # Load pre-trained model (weights)
            model = BertForSequenceClassification.from_pretrained(CACHE_DIR + BERT_MODEL,
                                                                  cache_dir=CACHE_DIR,
                                                                  num_labels=len(label_list))
            model.to(device)
            model.eval()

            eval_loss = 0
            nb_eval_steps = 0
            preds = []
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(input_ids, segment_ids, input_mask, labels=None)

                # create eval loss and other metric required by the task
                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                else:
                    preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)

            eval_loss = eval_loss / nb_eval_steps
            preds = preds[0]
            preds = np.argmax(preds, axis=1)
            df[Category] = preds

        del df['label']
        dc = df.to_dict('records')
        for doc in dc:
            doc['class'] = []
            for c in categories:
                if doc[c] == 1:
                    doc['class'].append(c)
                del doc[c]
        print(len(dc))
        print(dc[0])

        print("Pushing into DB")
        for doc in dc:
            for x in col.find({"ad_creative_body": doc['ad_creative_body'], 'marked': 0}):
                x['marked'] = 1
                x['class'] = doc['class']
                col.update_one({'_id': x['_id']}, {"$set": x}, True)
    return "Done"
def main():
    with Pool(POOL_SIZE) as pool:
        results = pool.map(fetch_rates, BASES)

    for result in results:
        present_result(*result)