def main():
    targets = []
    proc = int(sys.argv[1])
    start_time = time.time()
    result = {'joomla': [], 'wordpress': []}
    p = Pool(proc)
    with open('file.txt', 'r') as f:
        lines = f.readlines()
    for line in lines:
        url = str(line.replace('\n', ''))
        targets.append(url)
    # 'all' shadows the built-in; use a descriptive name instead
    scan_results = p.map(search_for, targets)
    for res in scan_results:
        if res.get("joomla") != []:
            result["joomla"].append(res.get("joomla"))
        elif res.get("wordpress") != []:
            result["wordpress"].append(res.get("wordpress"))
    print("#################################################")
    print(" Number of loaded urls : " + str(len(lines)))
    print("#################################################")
    print(" Number of joomla urls : " + str(len(result["joomla"])))
    print(" Number of wordpress urls : " + str(len(result["wordpress"])))
    print(" Finished in : " + str(int(time.time() - start_time)) + "s")
    print("#################################################")
    print("Joomla links")
    print("#################################################")
    for item in result["joomla"]:
        print(item)
    print("#################################################")
    print("Wordpress links")
    print("#################################################")
    for item in result["wordpress"]:
        print(item)
def get_array_chans_multi_proc(lambs, K, sep, params_list, n_processes, norm):
    '''
    Helper function to speed up generation of channels based on parameters.
    Multiple processes are used and their results are combined.
    '''
    p = Pool(processes=n_processes)
    num_chans = len(params_list)
    num_chans_per_proc = num_chans // n_processes
    results = []
    for i in range(n_processes):
        start = i * num_chans_per_proc
        # the last chunk picks up any remainder so no channels are dropped
        end = num_chans if i == n_processes - 1 else start + num_chans_per_proc
        sub_params_list = params_list[start:end]
        results.append(
            p.apply_async(get_array_chans,
                          args=(lambs, K, sep, sub_params_list, norm)))
    p.close()
    p.join()
    output = [res.get() for res in results]
    X = None
    for i, x in enumerate(output):
        if i == 0:
            X = x
        else:
            X = np.vstack([X, x])
    del results, output
    return X
def main(): ''' Main program starts here ''' global opub, odown, orep, info # somefile is false starting turns to true if at elast one file found somefile = False # read inputs and assign constraints assign_constraint() fdown = outfile + '_to_download.csv' frep = outfile + '_replica.csv' fpub = outfile + '_not_published.csv' # test reading inputs print var0 print exp0 print mod0 print fdown print frep print fpub # if one of the output files exists issue a warning an exit if opath.isfile(fdown) or opath.isfile(frep) or opath.isfile(fpub): print "Warning: one of the output files exists, exit to not overwrite!" sys.exit() info = {} # loop through experiments, 1st create a wget request for exp, then parse_file for exp in exp0: wgetfile = "wget_" + exp + ".out" result = parse_file(wgetfile, var0, mod0, exp) # if found any files matching constraints, process them one by one # using multiprocessing Pool to parallelise process_file if result: async_results = Pool(1).map_async(process_file, result) for dinfo in async_results.get(): info.update(dinfo) somefile = True print "Finished checksum for existing files" # if it couldn't find any file for any experiment then exit if not somefile: sys.exit("No files found for any of the experiments, exiting!") # open not published file opub = open(fpub, "w") opub.write("var_mip-table, model, experiment\n") # build all requested combinations and compare to files found nopub_set = compare_query(var0, mod0, exp0) # write replica and download output files # open output files and write header odown = open(fdown, "w") odown.write( "var, mip_table, model, experiment, ensemble, version, file url\n") orep = open(frep, "w") orep.write( "var, mip_table, model, experiment, ensemble, version, filepath\n") write_file() # close all the output files odown.close() orep.close() opub.close() print "Finished to write output files" # if table option create/open spreadsheet # if table option write summary table in csv file if table: write_table(nopub_set)
def shortest_path_4_one_node(start_node, g, out_folder): node_idx = g.vs.select(name_eq=int(start_node)) if node_idx.__len__() == 0: print('error: start_node nis not in graph!') return idx = [v.index for v in node_idx][0] sh_path = g.get_all_shortest_paths(v=idx, weights=g.es['weight'], mode=OUT) print(sh_path[0:10]) """ then calculate the weights sum of each path and sort them write to file for each node """ # sum_path_list = worker(sh_path, start_node, g) size = len(sh_path) P_NUM = 10 p = Pool(P_NUM) rest_list = [] if size % P_NUM != 0: tail = size % P_NUM tail_list = sh_path[-(tail + 1):-1] rest_list = worker( tail_list, start_node, g, ) print('tail end') main_list = [ p.apply_async(worker, args=( sh_path[size // P_NUM * i:size // P_NUM * (i + 1)], start_node, g, )) for i in range(P_NUM) ] p.close() p.join() output = [p.get() for p in main_list] flat_list = [item for sublist in output for item in sublist] sum_path_list = flat_list + rest_list sum_path_list = sorted(sum_path_list, key=lambda x: x[2]) print(sum_path_list[0:10]) """ write start node to all the nodes' shortest path sum to one csv file """ df_out = pd.DataFrame(sum_path_list, columns=[ 'start_index', 'destination_index', 'shortest_path_distance', 'shortest_path_seq', 'weight_list' ]) df_out.to_csv(out_folder + 'path_dist_index_' + str(start_node) + '.csv', sep=',', index=False)
def test1(): t = time.time() p = Pool(4) kk = 10 results = [] for x in range(1, 7): results.append(p.apply_async(f, args=(x, kk))) output = [p.get() for p in results] print output print time.time() - t
def main(): process_list = [] p = Pool(3) for r in r_list: result = p.apply_async(circle_area, args=(r, )) process_list.append(result) for p in process_list: print(p.get())
def fit(self, data_indices=None): """Uses .fit() method on each model operates on models in parallel""" p = Pool(self.processes) p.map_async( lambda x, kwargs: x.fit(self.df[self.vars_of_interest], df[[self.y]], **kwargs), zip(self.models, self.fit_kwarg_dicts), ) out = p.get() self.fitted = True p.close() return out
def get_area_multi_processes(): process_list = [] logging.debug('multi process....') p = Pool(3) start = time.time() for r in r_list: result = p.apply_async(circle_area, args=(r, )) process_list.append(result) for p in process_list: print(p.get()) print('multi processes time:', (time.time() - start) * 1000)
def main(): args = get_args() start = time() with open(args.output, 'w') as f: wids = [line.strip() for line in open(args.input).readlines()[:args.number]] mapped = Pool(processes=8).map_async(identify_worker, wids) mapped.wait() print >> f, '\n'.join([x.encode('utf-8') for x in mapped.get()]) end = time() total = end - start print '%d seconds elapsed' % total
def filter_seqs(seqs, good_cols, ncpus): nseqs, lseqs = len(seqs), len(seqs[0]) # each job should not exceed 50M character to avoid transferring too much data diff = _best_chunk_size(nseqs, lseqs, ncpus) p = Pool(ncpus) procs = [] for i in xrange(0, len(seqs), diff): procs.append( p.apply_async(_sub_filter_seqs, (seqs[i:i + diff], good_cols))) p.close() p.join() return [s for p in procs for s in p.get()]
def updateDb():
    initDB()
    start_time = time.time()
    course_numbers = sorted(getNumberOfCoursesList())
    course_numbers_length = len(course_numbers)
    # a single pool is enough; the original created two (Pool(8) and Pool(processes=4))
    pool = Pool(processes=4)
    results = [
        pool.apply_async(getCourseInfo, args=(course_number, ))
        for course_number in course_numbers
    ]
    output = [res.get() for res in results]
    for course in output:
        print(course.number)
        dbAddCourse(course)
    print("--- %.2f seconds ---" % (time.time() - start_time))
def parallel_to_corpus(dat, worker_num=NCPU, partition_num=100, index_to_token=INDEX_TO_TOKENS, nlp_sep=NLP_SEP): sub_data_list = chunkIt(dat, num=partition_num) p = Pool(processes=worker_num) data = [ p.apply_async(func=to_corpus, args=(x, index_to_token, nlp_sep)) for x in sub_data_list ] p.close() flat_data = [p.get() for p in data] seq_len = [size for chunk, word_list in flat_data for size in word_list] flat_data = [_ for chunk, word_list in flat_data for _ in chunk] return flat_data, seq_len
def complexity_filtering(seqs, chars, chi2_cut, ncpus): nseqs, lseqs = len(seqs), len(seqs[0]) # each job should not exceed 50M character to avoid transferring too much data diff = _best_chunk_size(nseqs, lseqs, ncpus) p = Pool(ncpus) procs = [] for i in xrange(0, nseqs, diff): procs.append( p.apply_async(_sub_complexity_filtering, (seqs[i:i + diff], chars))) p.close() p.join() counts = procs.pop(0).get() for p in procs: counts = [[counts[i][j] + v for j, v in enumerate(l)] for i, l in enumerate(p.get())] expected = {} total = float(sum(sum(c) for c in counts)) for i, c in enumerate(chars): expected[c] = sum(c[i] for c in counts) / total printime(' * proportion of each site type: ' + ', '.join(['%s: %.4f' % (c.upper(), expected[c]) for c in chars])) good_cols = [] mean = 1. / len(expected) sums = [] prop = [] stds = [] chi2 = [] for i in xrange(lseqs): count = counts[i] total = float(sum(count)) or 1 # in case total is equal to 0 std = sum((c / total - mean)**2 for c in count) stds.append(std**0.5) sums.append(total) av = [total * expected[c] for c in chars] chi2.append(sum((c - av[i])**2 / av[i] for i, c in enumerate(count))) if chi2[-1] > chi2_cut: good_cols.append(i) prop.append([c / total for c in count]) return sums, prop, stds, chi2, good_cols, expected
def to_adops_xls(args, wid_to_topics): my_workbook = xlwt.Workbook() ids_worksheet = my_workbook.add_sheet("Wikis to Topics") ids_worksheet.write(0, 0, 'Wiki') ids_worksheet.write(0, 1, 'URL') ids_worksheet.write(0, 2, 'Topic') ids_worksheet.write(0, 3, 'Rank') ids = wid_to_topics.keys() r = Pool(processes=16).map_async(wiki_data_for_ids, [ids[i:i+20] for i in range(0, len(ids), 20)]) wiki_data = {} map(wiki_data.update, r.get()) row = 1 for wid, topics in wid_to_topics.items(): top_five = sorted(wid_to_topics[wid].keys(), key=lambda x: wid_to_topics[wid][x], reverse=True)[:5] for counter, topic in enumerate(top_five): ids_worksheet.write(row, 0, wid) ids_worksheet.write(row, 1, wiki_data.get(wid, {}).get('url', '?')) ids_worksheet.write(row, 2, int(topic)+1) ids_worksheet.write(row, 3, counter) row += 1 urls_worksheet = my_workbook.add_sheet("Topic Data") urls_worksheet.write(0, 0, 'Topic') urls_worksheet.write(0, 1, 'Phrase') urls_worksheet.write(0, 2, 'Weight') urls_worksheet.write(0, 3, 'Rank') row = 1 for topic, line in enumerate(args.features_file): words = line.decode('utf8').split(u' + ') for rank, word_data in enumerate(words): weight, word = word_data.split('*') urls_worksheet.write(row, 0, topic+1) urls_worksheet.write(row, 1, word) urls_worksheet.write(row, 2, weight) urls_worksheet.write(row, 3, rank+1) row += 1 my_workbook.save(args.topics_file.name.replace('.csv', '-adops-report.xls')) print args.topics_file.name.replace('.csv', '-adops-report.xls')
def get_denser_columns(seqs, cutoff, ncpus): nseqs, lseqs = len(seqs), len(seqs[0]) col_cutoff = nseqs - cutoff ## search for dense columns p = Pool(ncpus) # each job should not exceed 50M character to avoid transferring too much data diff = _best_chunk_size(nseqs, lseqs, ncpus) procs = [] for i in xrange(0, len(seqs), diff): procs.append(p.apply_async(_sub_get_dense_cols, (seqs[i:i + diff], ))) p.close() p.join() good_cols = procs.pop(0).get() for p in procs: good_cols = [good_cols[i] + v for i, v in enumerate(p.get())] return [i for i, col in enumerate(good_cols) if col < col_cutoff]
calibration_cmd = "python ../../tools/modifiedRouteSampler.py -r ./route_files/ai15_run%s.rou.xml --edgedata-files ./calibration/weekday_survey_2019.xml --optimize 20 --optimize-input -a 'type=\"nhbCar\"' --write-route-ids --mismatch-output ./logs/ai15_run%s_calibration_logs.txt -o ./route_files/ai15_run%s_weekday_survey_calibrated.rou.xml" % (run_number, run_number, run_number) subprocess.call(calibration_cmd, shell=False) if __name__ == '__main__': od_sets = ['hbnwauto15', 'hbwauto15', 'nhbauto15', 'truck15', # 'hbnwauto25', # do not include RISM25 demand since RISM15 is used for base calibration # 'hbwauto25', # 'nhbauto25', # 'truck25', ] random_seed = 6789 number_of_runs = 10 i = 0 p = Pool(4) for od_name in od_sets: random_seed += i print(od_name) results = [p.apply_async(generate_trips, args=(od_name, random_seed, run)) for run in range(0, number_of_runs)] out = [p.get() for p in results] i += 1 results = [p.apply_async(generate_routes, args=(run,)) for run in range(0, number_of_runs)] out = [p.get() for p in results]
if __name__ == "__main__": start = time.time() p = Pool() size_of_game_batch = number_of_repeats / CPU_COUNT all_parallel_games = [int(size_of_game_batch) for i in range(CPU_COUNT)] output = [ p.apply_async(games, args=(x, word_counter_container)) for x in all_parallel_games ] p.close() p.join() history = [p.get()[0] for p in output] history = np.reshape(np.ravel(history, order='A'), (number_of_repeats, number_of_rounds)) mean_number_of_different_words = np.mean( [p.get()[1]['different words of all games'] for p in output], axis=0, dtype=int)[0] mean_number_of_total_words = np.mean( [p.get()[1]['total number of words of all games'] for p in output], axis=0, dtype=int)[0] end = time.time() print('Total time in seconds: ' + str(end - start)) Plot = Plot()
def main(): # collect args command = ' '.join(sys.argv[0:]) myparser = argparse.ArgumentParser() myparser.add_argument('-i', '--input_csv', required=True) myparser.add_argument('-b', '--bias', required=True, type=float) myparser.add_argument('-o', '--output_dir', required=True) myparser.add_argument('-cov', '--covariate_dir', required=False, default=None) myparser.add_argument('-boot', '--bootstrap_dir', required=False, default=None) myparser.add_argument('-cut', '--bootstrap_cutoff', required=False, type=float, default=0.95) myparser.add_argument('-do_att', '--do_att', required=False, default=False, type=bool) myparser.add_argument('-multi', '--multi', required=False, default=1, type=int) args = myparser.parse_args() # plot single sensitivity without bootstrap main_plot_coords, main_variable_coords = do_single_sensitivity( args.input_csv, args.bias, args.output_dir, args.covariate_dir, args.do_att, command) # if bootstrap values provided if args.bootstrap_dir is not None: # extract names of input_csv, covariate dir and output dir input_name = os.path.basename(os.path.normpath(args.input_csv)) if args.covariate_dir is not None: covariate_dir_name = os.path.basename( os.path.normpath(args.covariate_dir)) else: covariate_dir_name = None output_dir_name = os.path.basename(os.path.normpath(args.output_dir)) # create paths of input subdirectories subdir_list = [ os.path.join(args.bootstrap_dir, subdir) for subdir in os.listdir(args.bootstrap_dir) ] print('Calculating outputs for individual bootstrapped datasets') # Create pool for multiprocessing pool = Pool(args.multi) # Create progress bar pbar = tqdm(total=len(subdir_list)) # Calculate bootstrap co-ordinates for all provided bootstrap values and combine them into a list of dataframes res = [ pool.apply_async(do_bootstrap_sensitivity, args=(n, subdir_list), kwds={ 'input_name': input_name, 'covariate_dir_name': covariate_dir_name, 'output_dir_name': output_dir_name, 'command': command, 'args': args }, callback=lambda _: pbar.update(1)) for n in range(len(subdir_list)) ] boot_plot_coords, boot_variable_coords = list( zip(*[pool.get() for pool in res])) pbar.close() boot_plot_coords = list(boot_plot_coords) boot_variable_coords = list(boot_variable_coords) # which variables to plot? if args.covariate_dir is None: plot_variables = 'none' else: if has_none(boot_variable_coords): plot_variables = 'main' else: plot_variables = 'both' print('Plotting combined bootstrap graph') p, plot_coords_out, variable_coords_out = plot_bootstrap_sensitivity_graph( main_plot_coords, main_variable_coords, boot_plot_coords, boot_variable_coords, plot_variables, args.bootstrap_cutoff, args.bias) # Save bootstrap output files p.save(os.path.join(args.output_dir, 'austen_plot_bootstrap.png'), dpi=500, verbose=False) with open(os.path.join(args.output_dir, 'austen_plot_coordinates_bootstrap.csv'), 'w+', newline='\n') as file: file.write(f'#{command}\n') plot_coords_out.to_csv(file, index=False) if variable_coords_out is not None: with open(os.path.join(args.output_dir, 'variable_importances_bootstrap.csv'), 'w+', newline='\n') as file: file.write(f'#{command}\n') variable_coords_out.to_csv(file, index=False) return None
pr.start()
## wait for pr to finish:
pr.join()
print
print "all data write and read done"
print
print "Process Pool example...\n"
## Note: a plain Queue object cannot be shared between the parent process and Pool
## worker processes; to use a queue with a process pool, create it through
## multiprocessing's Manager class, as follows:
manager = multiprocessing.Manager()
# the parent process creates the Queue and passes it to each child process:
q = manager.Queue()
lock = manager.Lock()  ## a lock for the queue, so only one process operates on it at a time
p = Pool(processes=5)  ## at most 5 processes run at the same time
p_list = []
# submit 20 tasks at once
for i in range(20):
    pw = p.apply_async(write, args=(i, q, lock))
    p_list.append(pw)
for pw in p_list:
    pw.get()

p = Pool()
time.sleep(0.5)
pr = p.apply_async(read, args=(q,))
p.close()  # close() must be called before get(); after close() no new tasks can be added to the pool
p.join()
print
print 'all data write and read done'
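# As the note above says, a plain multiprocessing.Queue cannot be handed to
# Pool.apply_async (the pool pickles its arguments, and a raw Queue raises a
# RuntimeError when pickled), while a Manager().Queue() proxy can. A minimal,
# self-contained sketch of that Manager-based pattern; the write() worker and the
# variable names here are illustrative, not taken from the snippet above.
import multiprocessing
from multiprocessing import Pool

def write(i, q, lock):
    # each worker appends one value to the shared, manager-backed queue
    with lock:
        q.put('item %d' % i)
    return i

if __name__ == '__main__':
    manager = multiprocessing.Manager()
    q = manager.Queue()    # proxy object: safe to pass to Pool workers
    lock = manager.Lock()  # proxy lock guarding the queue
    pool = Pool(processes=5)
    jobs = [pool.apply_async(write, args=(i, q, lock)) for i in range(20)]
    pool.close()
    pool.join()
    results = [job.get() for job in jobs]
    print('wrote %d items, queue size %d' % (len(results), q.qsize()))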
# for more info look at pyesgf module documentation esgfargs=constraints if 'mip' in constraints.keys(): esgfargs['cmor_table']=esgfargs.pop('mip') if 'exp0' in locals(): esgfargs['query']=exp0+"%" esgfargs['replica']=False esgf.search_node(**esgfargs) print("Found ",esgf.ds_count(),"simulations for constraints") # loop returned DatasetResult objects # using multiprocessing Pool to parallelise process_file # using 8 here as it is the number ov VCPU on VDI if esgf.ds_count()>=1: results=esgf.get_ds() async_results = Pool(1).map_async(retrieve_ds, results) for ds_info in async_results.get(): esgf_results.append(ds_info) # append to results list of version dictionaries containing useful info # NB search should return only one latest, not replica version if any # compare local to remote info print("Finished to retrieve remote data") if esgf_results==[]: if db_results!=[]: print("Found local version but none is currently available on ESGF nodes for constraints:\n",constraints) else: print("Nothing currently available on ESGF nodes and no local version exists for constraints:\n",constraints) else: print(esgf.ds_count(),"instances were found on ESGF and ",outputs.count()," on the local database") if sys.version_info < ( 3, 0 ):
initial_word_transitions,reward) aggregated_history.append(history) return aggregated_history, word_frequencies if __name__ == "__main__": start = time.time() p = Pool() size_of_game_batch = number_of_repeats / CPU_COUNT all_parallel_games = [int(size_of_game_batch) for i in range(CPU_COUNT)] output = [p.apply_async(games, args=(x, word_frequencies)) for x in all_parallel_games] p.close() p.join() history = [p.get()[0] for p in output] history = np.reshape(np.ravel(history, order='A'), (number_of_repeats, number_of_rounds)) word_frequencies = np.sum([p.get()[1]['word frequencies'] for p in output], axis=0) word_associations = [p.get()[1]['word associations'] for p in output] word_game_counts = [p.get()[1]['word game counts'] for p in output] word_game_counts = np.reshape(word_game_counts, (number_of_repeats, len(initial_word_memory))) end = time.time() print('Required time, in seconds: '+ str(end - start)) Plot = Plot() Plot.plot_sigmoid(history) Plot.plot_box_word_probabilities([i[0] for i in initial_word_memory], word_game_counts) Plot.plot_word_frequency([i[0] for i in initial_word_memory], word_frequencies)
def build_graph( students, num_partitions=3, friendship_weight=30, classmate_weight=3, schoolmate_weight=1, teacher_multiplier=100, node_weights_to_ubvec=None, ): G = nx.Graph() G.graph['edge_weight_attr'] = 'weight' G.graph['node_weight_attr'] = list(node_weights_to_ubvec.keys()) # helper vars name_to_index = {} class_to_indices = {} school_to_indices = {} i = 0 # add students as nodes for row in students: name = row['Name'] if not name: continue origin_class = row['School'] + ' ' + row['Class'] origin_school = row['School'] gender = row.get('Gender') G.add_node( i, label=name + "\n" + origin_class, origin_school=origin_school, origin_class=origin_class, preferences=row['Preferences'], # styling shape='hexagon' if gender == 'm' else 'ellipse', penwidth=3, # node attributes total=1, # used to balance total number of students gender=1 if gender == 'f' else 0, **{ attribute: int(row[attribute.capitalize()]) for attribute in node_weights_to_ubvec.keys() if attribute not in ('gender', 'total') }, ) # fill helper vars for preference parsing later on name_to_index[name] = i if origin_class not in class_to_indices: class_to_indices[origin_class] = set() class_to_indices[origin_class].add(i) if origin_school not in school_to_indices: school_to_indices[origin_school] = set() school_to_indices[origin_school].add(i) i += 1 # add preferences as edges for student, node in G.nodes.items(): items = node['preferences'].split(',') for item in items: item = item.strip() # remove whitespace if not item: continue # skip empty items # parse negation symbol multiplier = 1 if item.startswith('!'): multiplier = -1 item = item[1:] # parse teacher symbol if item.startswith('*'): multiplier = multiplier * teacher_multiplier item = item[1:] # parse filter (filter_attr, filter_value) = (None, None) if item.startswith('['): item_parts = item[1:].split(']') filter_parts = item_parts[0].split('=') assert len(filter_parts) == 2, 'Invalid filter: ' + item (filter_attr, filter_value) = filter_parts item = item_parts[1] schoolmates = school_to_indices.get(item, None) classmates = class_to_indices.get(item, None) friend = name_to_index.get(item, None) # set targets and base weight if schoolmates is not None: targets = schoolmates base_weight = schoolmate_weight elif classmates is not None: targets = classmates base_weight = classmate_weight elif friend is not None: targets = [friend] base_weight = friendship_weight else: raise ValueError('Could not parse preference: ' + item) # apply filter if filter_attr: targets = [ i for i, node in G.nodes.items() if i in targets and node[filter_attr] == filter_value ] # actually add edges for target in targets: if student != target: add_edge(G, student, target, weight=base_weight * multiplier) # partition graph in async process because METIS will sometimes throw # a segmentation fault which we want to raise properly try: promise = Pool().apply_async( metis.part_graph, args=(), kwds={ 'graph': G, 'nparts': num_partitions, 'tpwgts': [ tuple(1 / num_partitions for i in range(0, len(node_weights_to_ubvec))) for i in range(0, num_partitions) ], 'ubvec': list(node_weights_to_ubvec.values()), 'objtype': 'cut', # 'ctype': 'shem', 'seed': 111, }) (total_volume, parts) = promise.get(timeout=3) except TimeoutError as e: raise ValueError( 'METIS failed to partition graph. Try fewer edges or partitions.' 
) from e # assign colors according to assigned partition colors = get_color_list(num_partitions) for i, p in enumerate(parts): G.node[i]['color'] = colors[p] # remove low-weight and negative edges to clean up the painted graph edges = [e for e in G.edges.data()] for edge_obj in edges: edge_from, edge_to, edge_data = edge_obj if edge_data['weight'] < 30: G.remove_edge(edge_from, edge_to) # convert to pydot P = nx.drawing.nx_pydot.to_pydot(G) # create subgraphs to cluster students in same class together for color in colors: subgraph = pydot.Cluster(color, label='test 1', nodesep=7, ranksep=4) for node in P.get_node_list(): if node.get_attributes()['color'] == color: subgraph.add_node(node) num_students = len(subgraph.get_node_list()) subgraph.set_label(str(num_students) + ' students') P.add_subgraph(subgraph) return total_volume, P
fftCepstre = fft2(cepstre) fftCepstre = np.exp(fftCepstre) return fftCepstre ################################# ################################# ################################# ################################# start = time.time() resultH = Pool(6).map_async(repFreqConduitVocal, hann(x)) resultE = Pool(2).map_async(fftCourtTerm, hann(e)) H = resultH.get() E = resultE.get() print time.time() - start ################################# ################################# ################################# ################################# V = np.ndarray(shape=(nbFenetre, N, 2), dtype=np.complex128) for m in range(nbFenetre): V[m] = E[m] * H[m] ################################# #################################
p = Pool(cpu_count()) print(f'Num Core: {cpu_count()}') param_dict = { 'task1': list(range(10, 30000000)), 'task2': list(range(30000000, 60000000)), 'task3': list(range(60000000, 90000000)), 'task4': list(range(90000000, 120000000)), 'task5': list(range(120000000, 150000000)), 'task6': list(range(150000000, 180000000)), 'task7': list(range(180000000, 210000000)), 'task8': list(range(210000000, 240000000)) } mg = Manager() managed_locker = mg.Lock() managed_dict = mg.dict() results = [] for name, param in param_dict.items(): results.append( p.apply_async(train_on_parameter, args=(name, param, managed_dict, managed_locker))) # results = [p.apply_async(train_on_parameter, args=(name, param, managed_dict, managed_locker)) for name, param in param_dict.items()] results = [p.get() for p in results] print(managed_dict) end1 = datetime.datetime.now() print(f'Duration: {end1 - start1}') ## REF: https://zhuanlan.zhihu.com/p/93305921
def run_cohort(cohort, created_cohorts, mutation_input_files, mutations_cohorts_dir, motif_name_index, f_score_index, motif_breaking_score_index, chromatin_cat_index, background_window, background_window_size, filter_on_qval, sig_category, sig_thresh, sim_sig_thresh, sim_output_extension, filter_cond, operation_on_unify, output_extension, distance_to_merge, merged_mut_sig_threshold, local_domain_window, chr_lengths_file, sig_elements_output_file, sig_tfs_file, sig_tfpos_file, tmp_dir, n_cores_fscore, p_value_on_score, active_driver_script_dir, active_driver_min_mut, n_cores): "get the cohort name to use for output file names" cohort_full_name = created_cohorts[cohort][0].split('_')[0] if '/' in created_cohorts[cohort][0]: cohort_full_name = '/'.join( created_cohorts[cohort][0].split('/')[0:-1] ) + "/" + created_cohorts[cohort][0].split('/')[-1].split('_')[0] print('Processing: ', cohort_full_name) '''Calculate std, nummotifs and mean of the scores per TF-motif in the simulation sets The first file in each created_cohorts[cohort] is the observed set, so skip it ''' #dict_simulated_mean_sd_per_TF_motif_output_file = cohort_full_name + "_meansdrand{}sets.dict".format(len(mutation_input_files)-1) #print(dict_simulated_mean_sd_per_TF_motif_output_file) '''As background consider simulated mutations in provided bacground window size around mutation ''' # if background_window: # #print(background_window_size) # dict_type_mean_std_scores = Utilities.get_simulated_mean_sd_per_TF_motif_background_window( # cohort_full_name = cohort_full_name, # annotated_input_file = created_cohorts[cohort][0], # simulated_annotated_input_files=created_cohorts[cohort][1:], # mutations_cohorts_dir = mutations_cohorts_dir, # cohort_mean_sd_per_tf_overall_output_dict_file= dict_simulated_mean_sd_per_TF_motif_output_file, # chr_lengths_file = chr_lengths_file, # background_window_size = background_window_size, # motif_name_index = motif_name_index, f_score_index = f_score_index, # motif_breaking_score_index = motif_breaking_score_index, # chromatin_cat_index = chromatin_cat_index, tmp_dir = tmp_dir, n_cores_fscore=n_cores_fscore) # else: # '''As background consider whole genome # ''' # dict_type_mean_std_scores = Utilities.get_simulated_mean_sd_per_TF_motif( # simulated_annotated_input_files=created_cohorts[cohort][1:], # cohort_mean_sd_per_tf_overall_output_dict_file= dict_simulated_mean_sd_per_TF_motif_output_file, # motif_name_index = motif_name_index, f_score_index = f_score_index, # motif_breaking_score_index = motif_breaking_score_index) # # '''For each mutation in the observed set created_cohorts[cohort][0] # calculate pval and qval by comparing its score to the std and mean # scores in the corresponding TF motif. 
# Filter out mutations that don't have a sig score or dont' pass other filters # ''' # muts_sig_per_TF_file = Utilities.get_muts_sig_per_TF( # annoted_input_file=created_cohorts[cohort][0], # dict_type_mean_std_scores=dict_type_mean_std_scores, # annoted_output_file_extension="_rand{}setsTF".format(len(mutation_input_files)-1), # annoted_output_file_extension_onlysig="_rand{}setsTFsigQval{}".format( # len(mutation_input_files)-1, sig_thresh), # background_window = background_window, # motif_name_index = motif_name_index, f_score_index = f_score_index, # motif_breaking_score_index = motif_breaking_score_index, # filter_on_qval=filter_on_qval, sig_cat=sig_category, # sig_thresh=sig_thresh, # filter_on_signal = True, dnase_index = 24, fantom_index = 25, num_other_tfs_index = 27) # sig_muts_per_tf_mutation_input_files = [muts_sig_per_TF_file] # # '''repeat the same process to keep only sig muts from the simulated, but use sim_sig_thresh # if sim_sig_thresh >=1.0 no sig is performed and all muts are written to the output''' # for mutations_input_file in created_cohorts[cohort][1:]: # muts_sig_per_TF_file = Utilities.get_muts_sig_per_TF( # annoted_input_file=mutations_input_file, # dict_type_mean_std_scores=dict_type_mean_std_scores, # annoted_output_file_extension="_rand{}setsTF".format(len(mutation_input_files)-1), # annoted_output_file_extension_onlysig=sim_output_extension, # background_window = background_window, # motif_name_index = motif_name_index, f_score_index = f_score_index, # motif_breaking_score_index = motif_breaking_score_index, # filter_on_qval=filter_on_qval, sig_cat=sig_category, # sig_thresh=sim_sig_thresh # ) # sig_muts_per_tf_mutation_input_files.append(muts_sig_per_TF_file) # '''Based on the mutations that have a significant score (specify if qval should be used) Count number of mutations per TF motif and per TF motif position For each calcualate a pvalue and qvalue based on number of mutations in the correponding motif in the simulated sets ''' #sig_tfs_file, sig_tfpos_file = Utilities.get_tf_pval( # cohort, sig_muts_per_tf_mutation_input_files, p_value_on_score, motif_name_index, # f_score_index, motif_breaking_score_index, # filter_cond, fsep='\t', sig_tfs_file=sig_tfs_file, # sig_tfpos_file=sig_tfpos_file, # filter_on_signal = True, dnase_index = 24, fantom_index = 25, # num_other_tfs_index = 27) # '''replace an observed regulatory mutation file with an all annotated observed mutation file ''' #muts_per_tf_mutation_input_files = [created_cohorts[cohort][0]+"_rand{}setsTF".format(len(mutation_input_files)-1)] #muts_per_tf_mutation_input_files.extend(sig_muts_per_tf_mutation_input_files[1:]) #print(muts_per_tf_mutation_input_files) #print(len(muts_per_tf_mutation_input_files)) '''Combine nearby mutations accross the cohort into one element''' '''Unify the mutations that have significant scores accross the cohorts Make one record for mutations that overlap multiple motifs ''' unified_muts_files_obj = [] p = Pool(15) for mutations_input_file in created_cohorts[cohort]: unified_muts_file = p.apply_async( Utilities.unify_muts, args=(mutations_input_file, output_extension, True, filter_cond, operation_on_unify)) unified_muts_files_obj.append(unified_muts_file) p.close() p.join() unified_muts_files = [p.get() for p in unified_muts_files_obj] unified_mutation_input_files_obj = [] p = Pool(15) for unified_muts_file in unified_muts_files: unified_muts_file_wihtmotifinfo = p.apply_async( Utilities.get_max_motif_in_grouped_muts, args=(unified_muts_file, )) 
unified_mutation_input_files_obj.append( unified_muts_file_wihtmotifinfo) p.close() p.join() unified_mutation_input_files = [ p.get() for p in unified_mutation_input_files_obj ] #for unified_muts_file in unified_muts_files: # if os.path.exists(unified_muts_file): # os.remove(unified_muts_file) # unified_mutation_input_files = [] # for mutations_input_file in created_cohorts[cohort]: # unified_muts_file = mutations_input_file + output_extension + "_groupedbymut" # unified_muts_file_wihtmotifinfo = unified_muts_file+"withmotifinfo" # if not os.path.exists(unified_muts_file_wihtmotifinfo): # print("Unifying: ", mutations_input_file) # Utilities.unify_muts(mutations_input_file, unified_muts_file_ext, # filter_mut_motifs=True, filter_cond=filter_cond, # operation_on_unify=operation_on_unify) # Utilities.get_max_motif_in_grouped_muts( # annotated_mutations_grouped_file=unified_muts_file, # annotated_mutations_grouped_output_file=unified_muts_file_wihtmotifinfo) # os.remove(unified_muts_file) # unified_mutation_input_files.append(unified_muts_file_wihtmotifinfo) # print('Unified mutations input files: ', unified_mutation_input_files) ''' Evaluate the significance of each element based on: - the element score (sum of the score of its mutations) - number of mutations in the element ''' get_sig_merged_elements( unified_mutation_input_files, cohort_full_name, output_extension, #sim_output_extension+output_extension, distance_to_merge, merged_mut_sig_threshold, local_domain_window, chr_lengths_file, sig_elements_output_file, sim_sig_thresh, p_value_on_score=p_value_on_score) #ActiveDriverWGS active_driver_results = sig_elements_output_file + '_ActiveDriver_results' active_driver_results_sig = active_driver_results + '_sig' active_driver_output_file = sig_elements_output_file + '_ActiveDriver' active_driver_output_file_sig = active_driver_output_file + '_sig' if not os.path.exists(active_driver_output_file_sig): print([ 'Rscript', active_driver_script_dir, sig_elements_output_file, created_cohorts[cohort][0], active_driver_min_mut, active_driver_output_file, active_driver_output_file_sig, active_driver_results, n_cores ]) try: subprocess.call([ 'Rscript', active_driver_script_dir, sig_elements_output_file, created_cohorts[cohort][0], str(active_driver_min_mut), active_driver_output_file, active_driver_output_file_sig, active_driver_results, str(n_cores) ]) except KeyError: open(active_driver_output_file_sig, 'a').close() sig_muts_file = created_cohorts[cohort][0] + "_sig" sig_muts_file = Utilities.get_sig_muts( elements_input_file=active_driver_output_file_sig, mutations_input_file=created_cohorts[cohort][0], sig_muts_file=sig_muts_file, motif_breaking_score_index=motif_breaking_score_index, tf_binding_index=30, dnase_index=24) sig_muts_per_tf_mutation_input_files = [] sig_muts_per_tf_mutation_input_files = [sig_muts_file] for mutations_input_file in created_cohorts[cohort][1:]: sig_muts_per_tf_mutation_input_files.append(mutations_input_file) sig_tfs_file, sig_tfpos_file = Utilities.get_tf_pval( cohort, sig_muts_per_tf_mutation_input_files, p_value_on_score, motif_name_index, f_score_index, motif_breaking_score_index, filter_cond, fsep='\t', sig_tfs_file=sig_tfs_file, sig_tfpos_file=sig_tfpos_file, filter_on_signal=True, dnase_index=24, fantom_index=25, num_other_tfs_index=27) return sig_elements_output_file, active_driver_output_file_sig, sig_tfs_file, sig_tfpos_file
url = details.get('url') lang = details.get('lang') #print url #doc_ids = ListDocIdsService().get_value(wid) doc_ids = map(lambda x: x.split('_')[1], filter(lambda y: '_' in y, #ListDocIdsService().get_value(wid)))[:100] ListDocIdsService().get_value(wid))) #pprint(doc_ids); sys.exit(0) #for n in range(0, len(doc_ids), step): ##for n in range(0, 20, step): # print 'n = %d' % n # doc_ids_subset = doc_ids[n:n+step] r = Pool(processes=8).map_async(get_fields, chunks(doc_ids, step)) r.wait() pprint(r.get()) print '*'*80 #for k in r.get(): # DEBUG # print k fields = [] m = map(lambda x: fields.extend(x), r.get()) #pprint(fields) indexed = dict(fields) pprint(indexed) # DEBUG #for doc_id in doc_ids_to_heads: # entity_response = doc_ids_to_entities.get( # doc_id, {'titles': [], 'redirects': {}}) # doc_ids_combined[doc_id] = map(preprocess, # indexed.get(doc_id, []) +
import numpy as np from graspy.inference import SemiparametricTest from graspy.embed import AdjacencySpectralEmbed, select_dimension import warnings # from graspy.simulations import sbm from graspy.utils import symmetrize import time import sys import getopt import pickle from multiprocessing import Pool, cpu_count def junk(seed, ): np.random.seed(seed) msg = np.random.normal(0, 1) return msg # outputs = [p.get() for p in tests] # epsilon_outputs.append(outputs) if __name__ == '__main__': p = Pool() n_sims = 100 seeds = [np.random.randint(1, 100000000) for _ in range(n_sims)] print(seeds) jobs = [p.apply_async(junk, args=(seeds[i], )) for i in range(n_sims)] outputs = [p.get() for p in jobs] print(outputs)