def call_cv_train_parallel(train_func, args_iterator=None):
    if args_iterator is None:
        args_iterator = get_kfold_ids()
    from multiprocessing import Pool
    pool = Pool(get_core_count())
    retval = unlist_dataframe(pool.starmap(train_func, args_iterator))
    pool.terminate()
    return retval
def run_all_games(self, multi_thread=True):
    if multi_thread:
        pool = Pool(3)
        games = pool.starmap(self.run_game,
                             [(Bandit(self._n_arms),
                               self._agent_cls(self._n_arms, **self._cls_args),
                               self._n_plays) for i in range(0, self._n_games)])
        self._games = games
    else:
        games = []
        for i in range(0, self._n_games):
            games.append(self.run_game(Bandit(self._n_arms),
                                       self._agent_cls(self._n_arms, **self._cls_args),
                                       self._n_plays))
        self._games = games
def process_image_list(l):
    db.close_old_connections()
    pool = Pool()
    pool.starmap(migrate_image_resize, l)
    pool.close()
    pool.join()
def main():
    pool = Pool()
    a_args = [1, 2, 3]
    second_arg = 1
    argzip = zip(a_args, itertools.repeat(second_arg))
    # pool.map(func, argzip)
    pool.starmap(func, [(1, 2), (2, 3)])
def call_func_parallel(func, args_iterator, workers=-1):
    from multiprocessing import Pool
    if workers == -1:
        workers = get_core_count()
    pool = Pool(workers)
    pool.starmap(func, args_iterator)
    pool.terminate()
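# A minimal alternative sketch (not part of the original module, which is assumed to
# provide get_core_count): the same helper written with Pool as a context manager,
# so the workers are cleaned up even if func raises, and the results are returned.
def call_func_parallel_ctx(func, args_iterator, workers=-1):
    from multiprocessing import Pool
    if workers == -1:
        workers = get_core_count()
    with Pool(workers) as pool:
        return pool.starmap(func, args_iterator)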
def run_sub_folders2(MotherFolder, DaisyFileName, DaisyExecutabl,
                     NumberOfProcesses=6, recursive=False):
    """Runs all the Daisy simulations found below the MotherFolder."""
    pp = Pool(NumberOfProcesses)
    input = []
    for i in range(NumberOfProcesses):
        input.append((MotherFolder, DaisyFileName, DaisyExecutabl, i, recursive, None))
    pp.starmap(run_single2, input)
    pp.terminate()
def main():
    num_process = 4
    pool = Pool(num_process)
    num_post = post_detail_infos.find({'segmented': {'$ne': 1}}).count()
    print('Number to segment: ' + str(num_post))
    batch_size = int(num_post / num_process)
    print('Batch size: ' + str(batch_size))
    args = [(idx * batch_size, (idx + 1) * batch_size) for idx in range(num_process)]
    pool.starmap(segmentation, args)
    pool.close()
    pool.join()
def repeat_databases(source_path, database_count, exploration_count=0, exploitation_count=0,
                     random_count=0, processes=7, add_new=False, exploit_method='furthest',
                     record_errors=True, start_number=0):
    # Check source path
    if not os.path.isdir(source_path):
        print('Unable to find source:' + source_path)
        return

    # Generate threading lists
    if add_new:
        items = os.listdir(source_path)
        databases = []
        for item in items:
            try:
                databases.append(int(item))
            except ValueError:
                continue
        if len(databases) != 0:
            start_number = max(databases) + 1
    end_number = start_number + database_count
    paths = PathNaming(os.name, database_path=source_path)
    database_paths = [source_path + str(i) + paths.slash for i in range(start_number, end_number)]
    explorations = [exploration_count for i in range(database_count)]
    exploitations = [exploitation_count for i in range(database_count)]
    randoms = [random_count for i in range(database_count)]
    exploit_method = [exploit_method for i in range(database_count)]
    record_errors = [record_errors for i in range(database_count)]

    # Make a new folder for each database and place the input files in it
    for i in range(database_count):
        if not os.path.isdir(database_paths[i]):
            os.mkdir(database_paths[i])
        shutil.copy(source_path + paths.base_input, database_paths[i])
        shutil.copy(source_path + paths.dbase_input, database_paths[i])

    # If there's existing libraries to begin with, copy them as well
    if os.path.isdir(source_path + paths.slash + paths.FR_Input_folder + paths.slash):
        source_dir = source_path + paths.slash + paths.FR_Input_folder + paths.slash
        for i in range(database_count):
            copy_tree(source_dir, database_paths[i] + paths.slash + paths.FR_Input_folder)
    if os.path.isdir(source_path + paths.slash + paths.FR_Output_folder + paths.slash):
        source_dir = source_path + paths.slash + paths.FR_Output_folder + paths.slash
        for i in range(database_count):
            copy_tree(source_dir, database_paths[i] + paths.slash + paths.FR_Output_folder)

    # Run databases
    pool = Pool(processes=processes)
    pool.starmap(database_thread, zip(database_paths, explorations, exploitations, randoms,
                                      exploit_method, record_errors))
    return
def multi_main(proc, FILENAME, FUN, **kargs):
    pool = Pool(proc)
    multiargs = []
    # for FPGrowth_LERS
    if FUN == updateConfidenceSupport:
        min_sup_range = kargs['min_sup_range']
        for iter1, iter2, min_sup in product(range(1, 2), range(1, 11), min_sup_range):
            multiargs.append((FILENAME, iter1, iter2, min_sup))
        print(multiargs)
        pool.starmap(FUN, multiargs)
    else:
        print("I don't know the function.")
def write_cost_matrix(self):
    begin = timeit.default_timer()
    pool = Pool(processes=cpu_count())
    iterable = []
    for i in range(self.n):
        for j in range(i + 1, self.n):
            iterable.append((i, j))
    pool.starmap(self.set_total_costs_matrix, iterable)
    self.total_costs_matrix.dump(os.getcwd() + '/memoria/outputs/cost_matrix')
    end = timeit.default_timer()
    print(end - begin)
def pool_decode(self, data, callback):
    """
    Decode mz and i values in parallel.

    Args:
        data (): ...

    Keyword Args:
        callback (:obj:`func`): Callback function to call if decoding is finished.
            Should be :py:meth:`~pymzml.spec.Spectrum._register`.
    """
    ZE_POOL = Pool(processes=2)
    ZE_POOL.starmap(_decode, data)
def parse_wsj(processes=8):
    ptb = LazyCorpusLoader(  # Penn Treebank v3: WSJ portions
        'ptb', CategorizedBracketParseCorpusReader, r'wsj/\d\d/wsj_\d\d\d\d.mrg',
        cat_file='allcats.txt', tagset='wsj')
    fileids = ptb.fileids()
    params = []
    for f in fileids:
        corpus = zip(ptb.parsed_sents(f), ptb.tagged_sents(f))
        for i, (parsed, tagged) in enumerate(corpus):
            params.append((f, i, parsed, tagged))
    p = Pool(processes)
    p.starmap(get_best_parse, sorted(params, key=lambda x: (x[0], x[1])))
def main():
    # user_statistic(0, 50)
    num_process = 4
    pool = Pool(num_process)
    num_post = post_detail_infos.find({'crawled_user_info': {'$ne': 1}}).count()
    print('Number to get user infos: ' + str(num_post))
    batch_size = int(num_post / num_process)
    print('Batch size: ' + str(batch_size))
    args = [(idx * batch_size, (idx + 1) * batch_size) for idx in range(num_process)]
    pool.starmap(user_statistic, args)
    pool.close()
    pool.join()
def main(processes=1):
    ptb = Constants().ptb
    fileids = list(ptb.fileids())
    params = []
    for fileid in fileids[:10]:
        for sent_num, parse_tree in enumerate(ptb.parsed_sents(fileid)):
            params.append((fileid, sent_num, parse_tree))
    if processes > 1:
        p = Pool(processes)
        p.starmap(score, sorted(params, key=lambda x: (x[0], x[1])))
    else:
        for param in params:
            score(*param)
def main():
    if len(sys.argv) != 4:
        print("Usage: python3 tweetTokenize.py <tweets_folder> <dest_folder> <num_process>")
        sys.exit(-1)
    tweets_folder = sys.argv[1]
    dest_folder = sys.argv[2]
    num_process = int(sys.argv[3])
    tweets_filenames = glob.glob(os.path.join(tweets_folder, '*'))
    tweets_filenames = [(f, dest_folder) for f in tweets_filenames]
    if num_process == 1:
        for f, dest_folder in tweets_filenames:
            tokenize_tweets(f, dest_folder)
    else:
        pool = Pool(num_process)
        pool.starmap(tokenize_tweets, tweets_filenames)
def ctag(inputfiles, output_file, remove_stop_words, language, min_freq, min_len,
         pdf_output, debug, cpu_count):
    startTime = time.time()
    lInfo("process {} files".format(len(inputfiles)))
    pool = Pool(processes=cpu_count)
    params = [(inputfile, remove_stop_words, language, min_len) for inputfile in inputfiles]
    results = pool.starmap(build_word_histogram, params)
    global_histogram = {}
    for histogram in results:
        global_histogram = add_dict(global_histogram, histogram)
    # filter out words with a frequency that is not >= min_freq
    global_histogram = {t: global_histogram[t] for t in global_histogram
                        if global_histogram[t] >= min_freq}
    if debug:
        histogram_file = output_file.replace(os.path.splitext(output_file)[1], ".debug.json")
        lInfo("for debugging write out intermediate histogram to: {}".format(histogram_file))
        with open(histogram_file, "w") as hist:
            hist.write(json.dumps(global_histogram, indent=4, sort_keys=True))
    with open(output_file, "w") as outfile:
        outfile.write(svg_cloud(global_histogram))
    if pdf_output:
        pdf_file_name = output_file.replace(os.path.splitext(output_file)[1], ".pdf")
        lInfo("create pdf graphic: {}".format(pdf_file_name))
        if shutil.which("inkscape") is None:
            lError("inkscape is not installed, therefore no pdf export is available.")
        else:
            os.system("""inkscape --without-gui --export-pdf="{pdffile}" {svgfile}""".format(
                svgfile=output_file, pdffile=pdf_file_name))
    lInfo("done: {} s".format(time.time() - startTime))
def gekko_search(**parameters):
    parallel = settings['parallel']
    num_rounds = settings['num_rounds']
    # remake CS & HS variability
    candleSize = settings['candleSize']
    historySize = settings['historySize']
    if parallel:
        p = Pool(mp.cpu_count())
        param_list = list([(Strategy, parameters)] * num_rounds)
        scores = p.starmap(Evaluate, param_list)
        p.close()
        p.join()
    else:
        scores = [Evaluate(Strategy, parameters) for n in range(num_rounds)]
    series = pd.Series(scores)
    mean = series.mean()
    stats.append([series.count(), mean, series.std(), series.min()] +
                 [series.quantile(x) for x in percentiles] + [series.max()])
    all_val.append(mean)
    write_evolution_logs(len(all_val), stats[-1])
    return mean
def gene_lcs(genomes, base_dir):
    # create substring folder, check if substrings have previously been calculated
    if not os.path.exists(base_dir + 'substrings/'):
        os.makedirs(base_dir + 'substrings/')
    os.chdir(base_dir + 'substrings/')
    # import previous substring file if it exists
    substring_file = glob.glob('*.csv')
    orgstring = []
    if len(substring_file) == 1:
        with open('substrings.csv', newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='|')
            for row in reader:
                orgstring.append(row[0])
    if len(orgstring) == len(genomes):
        print('Organism substrings already calculated')
    else:
        print('Finding common substrings...')
        pool = ThreadPool(len(genomes))
        orgstring = pool.starmap(extract, zip(genomes, repeat(base_dir)))
        pool.close()
        pool.join()
        # write orgstring file
        os.chdir(base_dir + 'substrings/')
        with open('substrings.csv', 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
            for line in orgstring:
                writer.writerow([line])
    return orgstring
def applyGrid(_geo_crimes, _n, _grid, _column):
    if _n > 128:
        _n = 128
        print("n was too big. Set to 128.")
    print("splitting crimes into smaller frames to leverage parallelization")
    _l = len(_geo_crimes.index)
    _crimes_args = []
    _covered = 0
    for _i in range(_n - 1):
        _a, _b = int(round(_i * (_l / _n))), int(round((_i + 1) * (_l / _n)))
        _crimes_args.append(_geo_crimes[_a:_b])
        _covered = _covered + (_b - _a)
    _crimes_args.append(_geo_crimes[_covered:len(_geo_crimes.index)])
    print("{} data-chunks created.".format(len(_crimes_args)))
    print("Trying to start {} parallel processes.".format(_n))
    _pool = Pool(processes=_n)
    print("{} parallel processes started.".format(_n))
    _result = _pool.starmap(_para_crimes_in_cell,
                            zip(_crimes_args, repeat(_grid), repeat(_column)))
    _pool.terminate()
    print("Process terminated.")
    _df = _result.pop(0)
    for _frame in _result:
        _df = _df.append(_frame)
    print("{} crimes were spatialised to their cell.".format(len(_df.index)))
    return _df
def reviews_to_sentences(self, review_list):
    """For a list of reviews, clean each review, and transform each review to a list
    of sentences (each sentence is a list of words). Finally add all the lists to a
    single list; each element of the list is a sentence.

    :param review_list: a list of reviews
    :returns: the list of all sentences from all reviews
    :rtype: a list of list of words (string)
    """
    if self.use_pool:
        pool = Pool(self.pool_size)
        sentences_tmp = pool.starmap(
            self.review_to_sentences_static,
            zip(review_list, repeat(self.tokenizer), repeat(self.clean_method),
                repeat(self.remove_numbers), repeat(self.remove_punct),
                repeat(self.remove_stopwords)))
        pool.close()
        sentences = []
        for sentence in sentences_tmp:
            sentences.extend(sentence)
    else:
        sentences = []
        for review in review_list:
            sentences += self.review_to_sentences(review)
    return sentences
def getAllMovesLegalConcurrent(self, side):
    p = Pool(8)
    unfilteredMovesWithBoard = [(move, copy.deepcopy(self.board))
                                for move in self.board.getAllMovesUnfiltered(side)]
    legalMoves = p.starmap(self.returnMoveIfLegal, unfilteredMovesWithBoard)
    p.close()
    p.join()
    return list(filter(None, legalMoves))
def error_rate(method, level, offset):
    pool = Pool()
    error_counts = pool.starmap(
        error_count,
        zip(refs, repeat(method), repeat(level), repeat(offset)))
    pool.close()
    n_values_total = sum(error_count_[0] for error_count_ in error_counts)
    n_errors_total = sum(error_count_[1] for error_count_ in error_counts)
    return n_errors_total / n_values_total
def fit_all_ase(ase, func, xs, colnames=None, pool=None, progress=False, median_in=None): if colnames is None: colnames = inspect.getargspec(func).args colnames.remove('x') if pool is not None: if isinstance(pool, int): pool = Pool(processes=pool) if progress: results = [pool.apply_async(fit_func, (func, i, ase, xs), {'median_in': median_in},) for i in ase.index] res = [] pbar = ProgressBar(maxval=len(results), widgets=[ "|", Percentage(), "|", SimpleProgress(), Bar(), FileTransferSpeed(), "|", Timer(), "|", AdaptiveETA(), ]) for i, r in zip(ase.index, pbar(results)): res.append(r.get()) else: res = list(pool.starmap( fit_func, zip( it.repeat(func), ase.index, it.repeat(ase), it.repeat(xs), it.repeat(None), # p0 it.repeat(median_in), ), )) else: res = list(it.starmap( fit_func, zip( it.repeat(func), ase.index, it.repeat(ase), it.repeat(xs), it.repeat(None), # p0 it.repeat(median_in), ) )) return pd.DataFrame( index=ase.index, columns=colnames, data=res )
def run_bteq_parallel(bteq_in_file, bteq_log_file, database_name, user_name, user_password, \ bteq_ignore_errors, num_of_jobs=1, copy_if_fail=None, copy_to=None): """ Run the same Teradata Bteq script with multiple clones and check for errors. Args: bteq_input_file (str): Bteq script with queries bteq_log_file (str): Name of file that save the output database_name (str): Teradata database that Bteq will connect to user_name (str): User that have permission to run queries from bteq_input_file user_password (str): Password for user_name bteq_ignore_errors (Optional(list([int]))): List of errors that can be ignore num_of_jobs (int): Number of clone jobs that Teradata Bteq will run in parallel copy_if_fail (Optional[str]): The directory that bteq_log_file copy to in case it failed copy_to (Optional[str]): The directory that bteq_log_file copy to after it completed successful Returns: False if errors found not in bteq_ignore_errors, True if successful. Raises: Raising exception """ def bteq_log_name(i): """Creates a unique logfile name with 'i' sequence number.""" if num_of_jobs==1: # if running a single process don't change the name: return bteq_log_file else: return str(i).join(os.path.splitext(bteq_log_file)) #return bteq_log_file + str(i) pool = Pool(processes=num_of_jobs) results = pool.starmap(run_bteq, [(bteq_in_file, bteq_log_name(i), database_name, user_name, \ user_password, bteq_ignore_errors) for i in range(1, num_of_jobs+1)]) bteq_pass = bteq_fail = 0 #https://docs.python.org/2.3/whatsnew/section-enumerate.html for i, (bteq_run_result, bteq_errors_that_not_ignore) in enumerate(results, 1): if bteq_run_result: bteq_pass = bteq_pass + 1 if copy_to is not None: copy_file(bteq_log_name(i), copy_to) else: logging.error("Bteq found that can not be ignored: %s, please check out this log file: %s" \ % (bteq_errors_that_not_ignore, bteq_log_name(i))) bteq_fail = bteq_fail + 1 if copy_if_fail is not None: copy_file(bteq_log_name(i), copy_if_fail) if bteq_fail > 0: return False else: return True
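# Hypothetical usage sketch for run_bteq_parallel above; the script, log, database and
# credential values are illustrative only, and 3807 is just an example of an ignorable
# Teradata error code.
if __name__ == '__main__':
    ok = run_bteq_parallel('load_stage.bteq', 'load_stage.log',
                           database_name='DWH', user_name='etl_user',
                           user_password='secret', bteq_ignore_errors=[3807],
                           num_of_jobs=4, copy_to='logs/passed', copy_if_fail='logs/failed')
    if not ok:
        raise SystemExit(1)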
def predict_interval(alpha, Xs, likelihood, basis, m, C, lparams, bparams, nsamples=100, multiproc=True): """ Predictive percentile interval (upper and lower quantiles) for a Bayesian GLM. Parameters ---------- alpha: float The percentile confidence interval (e.g. 95%) to return. Xs: ndarray (Ns,d) array query input dataset (Ns samples, D dimensions). likelihood: Object A likelihood object, see the likelihoods module. basis: Basis A basis object, see the basis_functions module. m: ndarray (D,) array of regression weights (posterior). C: ndarray (D,) or (D, D) array of regression weight covariances (posterior). lparams: sequence a sequence of parameters for the likelihood object, e.g. the likelihoods.Gaussian object takes a variance parameter, so this should be :code:`[var]`. bparams: sequence A sequence of hyperparameters of the basis object. nsamples: int, optional The number of samples to draw from the posterior in order to approximate the predictive mean and variance. multiproc: bool, optional Use multiprocessing to paralellise this prediction computation. Returns ------- a: ndarray The lower end point of the interval with shape (Ns,) b: ndarray The upper end point of the interval with shape (Ns,) """ f = _sample_func(Xs, basis, m, C, bparams, nsamples) work = ((fn, likelihood, lparams, alpha) for fn in f) if multiproc: pool = Pool() res = pool.starmap(_rootfinding, work) pool.close() pool.join() else: res = [_rootfinding(*w) for w in work] ql, qu = zip(*res) return np.array(ql), np.array(qu)
def test(self, test_paragraphs, subgraph_strategy='greedy', order_strategy='greedy', processes=1):
    print('subgraph_strategy: %s' % subgraph_strategy)
    print('order_strategy: %s' % order_strategy)
    if processes > 1:
        p = Pool(processes)
        graphs, kendall_taus = zip(*p.starmap(
            self.evaluate,
            [(t, subgraph_strategy, order_strategy) for t in test_paragraphs]))
    else:
        graphs, kendall_taus = zip(*[self.evaluate(test_paragraph, subgraph_strategy, order_strategy)
                                     for test_paragraph in test_paragraphs])
    print('mean, std_dev, min, max kendall tau: ', summary(kendall_taus))
    print(kendall_taus)
    print()
    return graphs
def main(graph, nbk, delta_max, mu, temp, alpha, max_eval, iter, move_operator, logsPath): logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) fh = logging.FileHandler(logsPath + "/mproc_simulated-annealing.log") fh.setLevel(logging.INFO) frmt = logging.Formatter('%(message)s') fh.setFormatter(frmt) log.addHandler(fh) all_num_evaluations = [] all_best_score = [] all_time = [] all_temp = [] nb_proc = cpu_count() pool = Pool(processes=nb_proc) log.info("-------MULTI_PROC SIMULATED ANNEALING-------") startWork = timeit.default_timer() results = pool.starmap(doWork, zip(range(iter), repeat(graph), repeat(move_operator), repeat(max_eval), repeat(delta_max), repeat(mu), repeat(temp), repeat(alpha), repeat(nbk))) stopWork = timeit.default_timer() timeWork = (stopWork - startWork) actual_best_score = sys.maxsize for result in results: num_evaluations, best_score, best, temp, time = result if best_score < actual_best_score: actual_best = best actual_best_score = best_score all_temp.append(temp) all_num_evaluations.append(num_evaluations) all_best_score.append(best_score) all_time.append(time) log.info("Running on %d proc" % nb_proc) log.info("nbS = %d; nbK = %d; delta_max = %d; mu = %r; alpha = %r; move_operator= %s" % (graph.get_nbVertices(), nbk, delta_max, mu, alpha, move_operator.__name__)) log.info("for %d iteration with %d max_evaluations each, " "\n best score found is %d," "\n total time in sec : %r" "\n mean time in sec : %r," "\n mean best_score : %r, EcT : %r" "\n mean num_eval : %r," "\n mean end temperature : %r" % (iter, max_eval, min(score for score in all_best_score), timeWork, statistics.mean(all_time), statistics.mean(all_best_score), statistics.stdev(all_best_score), statistics.mean(all_num_evaluations), statistics.mean(all_temp)))
class PatternFinder(object):
    def __init__(self):
        """Initializes the object and creates a pool of worker processes (one per CPU core)."""
        super(PatternFinder, self).__init__()
        self.pool = Pool(processes=cpu_count())

    def find(self, pattern, files):
        """
        For each file, calls a function that searches for the pattern in that file.
        Each file is handled by a different process. To give 2 parameters to the
        function, a list of tuples is passed; each tuple contains the pattern and
        a file. Finally, the list of lists of ids is flattened and returned.
        """
        res = self.pool.starmap(self.find_in_file, zip([pattern] * len(files), files))
        return list(itertools.chain.from_iterable(res))

    def find_in_file(self, pattern, fileName):
        """
        Searches for the pattern in a file, line by line. If it finds it, it saves
        the id. Returns an array of ids.
        """
        ids = []
        file = open(fileName, "r")
        lines = file.readlines()
        for line in lines:
            res = re.search(pattern, line, re.DOTALL)
            if res is not None:
                ids.append(int(re.search(r'^(\d+)', line[:res.start()]).group(0)))
        file.close()
        return ids

    def __getstate__(self):
        """
        Removes the pool object from the instance before pickling since Pool objects
        cannot be pickled.
        """
        self_dict = self.__dict__.copy()
        del self_dict['pool']
        return self_dict

    def __setstate__(self, state):
        """Called upon unpickling."""
        self.__dict__.update(state)
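# Hypothetical usage sketch for PatternFinder (file names are illustrative): each input
# line is expected to start with a numeric id, which find() returns for every line that
# matches the pattern.
if __name__ == '__main__':
    finder = PatternFinder()
    ids = finder.find(r'ERROR', ['log_a.txt', 'log_b.txt'])
    print(sorted(ids))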
def computeNormals(PointsXYZ):
    p = Pool(4)
    f1, f2 = PointsXYZ[:int(len(PointsXYZ) / 2)], PointsXYZ[int(len(PointsXYZ) / 2):]
    one, two = f1[:int(len(f1) / 2)], f1[int(len(f1) / 2):]
    three, four = f2[:int(len(f2) / 2)], f2[int(len(f2) / 2):]
    jobs = [[one, PointsXYZ], [two, PointsXYZ], [three, PointsXYZ], [four, PointsXYZ]]
    L1, L2, L3, L4 = p.starmap(Normals, jobs)
    PointsNormal = L1 + L2 + L3 + L4
    return PointsNormal
def evaluate(self):
    _pool = Pool(self.n_jobs)
    # iterator over the multiprocess objective-function parameters
    _iterable = it.product([self.parametros],
                           np.int32(self.sample[:, :self.dimensions]),
                           [self.estimator],
                           [self.score_cache],
                           [self.resultados])
    print(_iterable)
    self.sample[:, -1] = _pool.starmap(self.objective_function, _iterable)
    _pool.close()
    _pool.join()
    if self.debug:
        print("evaluate")
        print(self.sample)
        print("\n")
def main(options, args): time0 = time.time() image_dir = args[0] geojson_list = io_function.get_file_list_by_ext('.geojson', image_dir, bsub_folder=False) # remove some scenes, or maybe we should set bsub_folder=False # geojson_list = [item for item in geojson_list if 'incomplete_scenes' not in item ] # remove those in "incomplete_scenes" # geojson_list = [item for item in geojson_list if 'scenes_high_cloud_cover' not in item ] # remove those in "scenes_high_cloud_cover" if len(geojson_list) < 1: raise ValueError('There is no geojson files in %s' % image_dir) basic.outputlogMessage('Image Dir: %s' % image_dir) basic.outputlogMessage("Number of geojson files: %d" % len(geojson_list)) grid_polygon_shp = args[ 1] # the polygon should be in projection Cartesian coordinate system (e.g., UTM ) basic.outputlogMessage('Image grid polygon shapefile: %s' % grid_polygon_shp) process_num = options.process_num basic.outputlogMessage( 'The number of processes for creating the mosaic is: %d' % process_num) # read grid polygons grid_polygons = vector_gpd.read_polygons_gpd(grid_polygon_shp) grid_ids = vector_gpd.read_attribute_values_list(grid_polygon_shp, 'id') if grid_ids is None: basic.outputlogMessage( 'Warning, field: id is not in %s, will create default ID for each grid' % grid_polygon_shp) grid_ids = [id + 1 for id in range(len(grid_polygons))] shp_prj = map_projection.get_raster_or_vector_srs_info_proj4( grid_polygon_shp).strip() # print(shp_prj) grid_polygons_latlon = grid_polygons if shp_prj != '+proj=longlat +datum=WGS84 +no_defs': # read polygons and reproject to 4326 projection grid_polygons_latlon = vector_gpd.read_shape_gpd_to_NewPrj( grid_polygon_shp, 'EPSG:4326') # else: # raise ValueError(' %s should be in projection of Cartesian coordinate system'%grid_polygon_shp) shp_prj_wkt = map_projection.get_raster_or_vector_srs_info_wkt( grid_polygon_shp) max_sr = options.max_sr min_sr = options.min_sr original_img_copy_dir = options.original_img_copy_dir b_to_rgb_8bit = options.to_rgb basic.outputlogMessage('Convert to 8bit RGB images: %s' % str(b_to_rgb_8bit)) # group planet image based on acquisition date b_group_date = options.group_date basic.outputlogMessage('Group Planet image based on acquisition date: %s' % str(b_group_date)) if b_group_date: # diff_days as 0, group images acquired at the same date geojson_groups = group_planet_images_date(geojson_list, diff_days=0) # sort based on yeardate in accending order : operator.itemgetter(0) geojson_groups = dict( sorted(geojson_groups.items(), key=operator.itemgetter(0))) save_group_txt = 'geojson_groups_input_folder.txt' basic.outputlogMessage( 'images are divided into %d groups, save to %s' % (len(geojson_groups.keys()), save_group_txt)) io_function.save_dict_to_txt_json(save_group_txt, geojson_groups) else: geojson_groups = {'all': geojson_list} # create mosaic of each grid cloud_cover_thr = options.cloud_cover cloud_cover_thr = cloud_cover_thr * 100 # for Planet image, it is percentage out_res = options.out_res cur_dir = os.getcwd() resampling_method = options.merged_method for key in geojson_groups.keys(): # # test # if key != '20200701': # continue geojson_list = geojson_groups[key] save_dir = os.path.basename(cur_dir) + '_mosaic_' + str( out_res) + '_' + key # print(save_dir) if process_num == 1: for id, polygon, poly_latlon in zip(grid_ids, grid_polygons, grid_polygons_latlon): # if id != 34: # continue create_moasic_of_each_grid_polygon( id, polygon, poly_latlon, out_res, cloud_cover_thr, geojson_list, save_dir, new_prj_wkt=shp_prj_wkt, 
new_prj_proj4=shp_prj, sr_min=min_sr, sr_max=max_sr, to_rgb=b_to_rgb_8bit, save_org_dir=original_img_copy_dir, resampling_method=resampling_method) elif process_num > 1: theadPool = Pool(process_num) # multi processes parameters_list = [ (id, polygon, poly_latlon, out_res, cloud_cover_thr, geojson_list, save_dir, shp_prj_wkt, shp_prj, min_sr, max_sr, b_to_rgb_8bit, 0, original_img_copy_dir) for id, polygon, poly_latlon in zip(grid_ids, grid_polygons, grid_polygons_latlon) ] results = theadPool.starmap(create_moasic_of_each_grid_polygon, parameters_list) # need python3 theadPool.close() else: raise ValueError('incorrect process number: %d' % process_num) cost_time_sec = time.time() - time0 basic.outputlogMessage( 'Done, total time cost %.2f seconds (%.2f minutes or %.2f hours)' % (cost_time_sec, cost_time_sec / 60, cost_time_sec / 3600)) pass
def initialize_variables(input_v, activate_parallel, num_cores): from evaluate_objective import evaluate_objective from random import uniform import numpy as np from multiprocessing import Pool, cpu_count N = input_v['population'] M = input_v['M'] V = input_v['V'] mini = input_v['min_range'] maxi = input_v['max_range'] K = M + V my_list = [] ##Initialize each chromosome f_obj = np.zeros((N, M)) ##To only contain objective function values f_x = np.zeros((N, V)) ##To contain only variables if True: #print("in parallel") for i in range(0, N): ##Initialize the decision variables based on the minimum and the maximum possible values. ##V is the number of decision variables. A random number is picked between the minimum and ##maximum possible values for each decision variable. for j in range(0, V): f_x[i, j] = mini[j] + ((maxi[j] - mini[j]) * uniform(0, 1)) my_list.append((f_x[i, :], input_v)) ##For ease of computation and handling data the chromosome also has the value of the objective ##function concatenated at the end. The elements V + 1 to K has the objective function values. ##The function evaluate_objective takes one chromosome at a time, infact only the decision variables ##are passed to the function along with information about the number of objective functions which are ##processed and returns the value for the objective functions. These values are now stored at the end ##of the chromosome itself. #print(*my_list) if (num_cores < cpu_count()) and (num_cores >= 1): p = Pool(num_cores) else: p = Pool() results = p.starmap(evaluate_objective, my_list) p.close() p.join() for i in range(0, N): f_obj[i, :] = results[i] else: for i in range(0, N): ##Initialize the decision variables based on the minimum and the maximum possible values. ##V is the number of decision variables. A random number is picked between the minimum and ##maximum possible values for each decision variable. for j in range(0, V): f_x[i, j] = mini[j] + ((maxi[j] - mini[j]) * uniform(0, 1)) ##For ease of computation and handling data the chromosome also has the value of the objective ##function concatenated at the end. The elements V + 1 to K has the objective function values. ##The function evaluate_objective takes one chromosome at a time, infact only the decision variables ##are passed to the function along with information about the number of objective functions which are ##processed and returns the value for the objective functions. These values are now stored at the end ##of the chromosome itself. f_obj[i, :] = evaluate_objective(f_x[i, :], input_v) number = i + 1 display_message = 'Initializing population ' + str( number) + ' of ' + str(N) print(display_message) input_v['objAll'] = f_obj input_v['xAll'] = f_x f_all = np.concatenate((f_x, f_obj), axis=1) return f_all, input_v ##Copyright (c) 2009, Aravind Seshadri ##All rights reserved. ##Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following ##conditions are met: ## * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
## * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer ## in the documentation and/or other materials provided with the distribution ## ##THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT ##NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ##THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ##(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ##HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ##ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        with open(os.path.join(output_path, title + '.json'), 'w') as f:
            json.dump(f0_labels, f)
    except:
        print(title)
        pass


if __name__ == '__main__':
    input1_path = sys.argv[1]
    input2_path = sys.argv[2]
    output_path = sys.argv[3]
    # input1_path = '../build/15_augmented_f0'
    # input2_path = '../build/16_f0_timepoints'
    # output_path = '../build/18_f0_labels'

    # This parameter is the frequency resolution, i.e. vertical sampling rate
    s = 24

    titles1 = load_titles(input1_path, '.f0')
    titles2 = load_titles(input2_path, '.json')
    titles = list(set(titles1).intersection(titles2))

    p = Pool()
    p.starmap(extract_f0_labels,
              [(title, input1_path, input2_path, output_path, s) for title in titles])
nb_all.append(bcut + n_cmin_buckets) rshape = (len(args.space_list), len(b_cutoffs), len(args.n_hashes_list)) n_cm_all = np.array(nb_all) - np.array(bcut_all) if args.lookup_data: min_scut = np.min(scut_all) # no need to store elements that are smaller x_train = np.asarray(x_train) x_train_hh = x_train[y_train > min_scut] y_train_hh = y_train[y_train > min_scut] lookup_dict = dict(zip(x_train_hh, y_train_hh)) start_t = time.time() pool = Pool(args.n_workers) if args.perfect_order: y_sorted = np.sort(y_valid)[::-1] results = pool.starmap(run_ccm, zip(repeat(y_sorted), bcut_all, nh_all, nb_all, repeat(name), repeat(sketch_type))) elif args.lookup_data: results = pool.starmap(run_ccm_lookup, zip(repeat(x_valid), repeat(y_valid), nh_all, n_cm_all, repeat(lookup_dict), scut_all, repeat(name), repeat(sketch_type))) else: results = pool.starmap(run_ccm, zip(repeat(y_valid_ordered), bcut_all, nh_all, nb_all, repeat(name), repeat(sketch_type))) pool.close() pool.join() valid_results, space_actual = zip(*results) valid_results = np.reshape(valid_results, rshape) space_actual = np.reshape(space_actual, rshape) bcut_all = np.reshape(bcut_all, rshape) scut_all = np.reshape(scut_all, rshape) nh_all = np.reshape(nh_all, rshape) nb_all = np.reshape(nb_all, rshape) log_str += '==== valid_results ====\n'
def fft2_calc(self,line,zone,start_time,mean_time,coef_time=1,fft_type="fft",norm=None,resamp=None,crop=None,val_sum=None,min_lim=None,max_lim=None,val_type=None,data_filter=None,pp_stack=None,pp_fftt=None,pp_ffts=None): """ Calcul de la transformée 2D pos_start,pos_end: position intiale et finale de calcul start_time: Temps à partir duquel la FFT commence mean_time: Durée sur laquelle la FFT est appliquée coef_time: Nombre de set consécutif de donnée de durée mean_time à concaténer fft_type: fft,rfft,ifft,irfft norm: None or ortho dist_corr: Relation between fiber length and pipe length resamp: [lsampling,timesampling] crop: [kinf,ksup,finf,fsup] val_sum: Intensity are summed or averaged (default averaged) min_lim: Set to min_lim[1] intensity less than min_lim[0] max_lim: Set to max_lim[1] intensity more than max_lim[0] val_type Output data "real" or "abs" pp_stack Functions and args to apply before data stacking after the FFTT pp_fftt Functions and args to apply after data stacking pp_ffts Functions and args to apply after the FFTS """ c = 0 start_data = int(start_time*self.fsamp) dt = int(mean_time*self.fsamp) duration = int(mean_time*coef_time*self.fsamp) if fft_type == "fft" or fft_type == "ifft" or fft_type == "fft2": res = n.zeros((int(self.lines[line][zone][1]-self.lines[line][zone][0]),dt),dtype=complex) if fft_type == "rfft" or fft_type == "irfft" or fft_type == "rfft2": res = n.zeros((int(self.lines[line][zone][1]-self.lines[line][zone][0]),int(dt/2)+1),dtype=complex) # if fft_type == "rfft2": # res = n.zeros((int((self.lines[line][zone][1]-self.lines[line][zone][0])/2)+1,dt),dtype=complex) k = cpu_count() print("FFT time") for i in range(start_data,int(start_data+duration),k*dt): argsdat = list() print(i) for l in range(k): if int(i+(l+1)*dt) <= int(start_data+duration): if fft_type == "rfft2" or fft_type == "fft2": argsdat.append((self.get_array(self.lines[line][zone][0],self.lines[line][zone][1],i,int(i+dt)),None,(-2,-1),norm)) else: argsdat.append((self.get_array(self.lines[line][zone][0],self.lines[line][zone][1],i,int(i+dt)),None,1,norm)) pool = Pool(k) t=time.time() if fft_type == "fft": restemp = pool.starmap(n.fft.fft,argsdat) if fft_type == "rfft": restemp = pool.starmap(n.fft.rfft,argsdat) if fft_type == "ifft": restemp = pool.starmap(n.fft.ifft,argsdat) if fft_type == "irfft": restemp = pool.starmap(n.fft.irfft,argsdat) pool.close() pool.join() print("FFT time finished in ",time.time()-t,"s") del pool del argsdat c+= len(restemp) restemp = n.array(restemp) for j in range(len(restemp)): aa = vt.set_to_lim(restemp[j,:,:],min_lim,max_lim) if pp_stack is not None: for i,j in pp_stack.items(): if j["args"] is not None: aa = j["func"](aa,**j["args"]) else: aa = j["func"](aa) res += aa del restemp # X and Y vec calc if fft_type == "fft" or fft_type == "ifft" or fft_type == "fft2": self.fvec = n.fft.fftfreq(n.size(res,1),1/self.fsamp) if fft_type == "rfft" or fft_type == "irfft" or fft_type == "rfft2": self.fvec = n.fft.rfftfreq((n.size(res,1)-1)*2+1,1/self.fsamp) res,[k,self.fvec] = vt.sort_vecs(res,[None,self.fvec]) dx = self.spatial_res/self.lines[line]["dist_ratio"] self.kvec = n.fft.fftfreq(n.size(res,0),dx) # Post fft time processing if pp_fftt is not None: for i,j in pp_fftt.items(): if j["args"] is not None: res = j["func"](res,**j["args"]) else: res = j["func"](res) print("FFT time calc in progress") t=time.time() # if fft_type != "fft2" and fft_type != "rfft2": res = n.fft.fft(res,None,0,norm) print("FFT space calc finished in",time.time()-t,"s") # Post fft space 
processing if pp_ffts is not None: for i,j in pp_ffts.items(): if j["args"] is not None: res = j["func"](res,**j["args"]) else: res = j["func"](res) # Post processing if val_type == "real": res = n.real(res) if val_type == "abs": res = n.abs(res) if val_sum is not None: res = res/c if crop is not None: res,[self.kvec,self.fvec] = vt.crop_vecs(res,[self.kvec,self.fvec],crop) if resamp is not None: if resamp[0] == "auto": resamp[0] = len(self.fvec) if resamp[1] == "auto": resamp[1] = len(self.kvec) res,[self.kvec,self.fvec] = vt.resamp_vecs(res,[self.kvec,self.fvec],[resamp[0],resamp[1]],False) res,[self.kvec,self.fvec] = vt.sort_vecs(res,[self.kvec,self.fvec]) return res,self.kvec,self.fvec
def test_calc_events(): test_data_dir = '/u/casey/scratch/work/microlens/galaxia_test/OGLE672/' hdf5_file = test_data_dir + 'OGLE672.h5' output_root2 = 'OGLE672_TEST' obs_time = 1000 n_obs = 101 radius_cut = 2 theta_frac = 2 blend_rad = 0.8 microlens_path = '/u/casey/scratch/code/PopSyCLE' n_proc = 1 overwrite = False t0 = time.time() # Initialize events_tmp and blends_tmp. events_tmp = None blends_tmp = None # Get the l and b from the HDF5 file. hf = h5py.File(hdf5_file, 'r') l_array = np.array(hf['long_bin_edges']) b_array = np.array(hf['lat_bin_edges']) hf.close() # Converts radius_cut from arcseconds into milliarcseconds radius_cut *= 1000.0 # Set up the multiprocessing pool = Pool(n_proc) # Set up inputs to be able to be read by pool.map # Only do a few of the bins. nll = 7 nbb = 7 llbb = itertools.product(range(nll), range(nbb)) reps = nll * nbb hd = itertools.repeat(hdf5_file, reps) ot = itertools.repeat(obs_time, reps) no = itertools.repeat(n_obs, reps) rc = itertools.repeat(radius_cut, reps) tf = itertools.repeat(theta_frac, reps) br = itertools.repeat(blend_rad, reps) inputs = zip(llbb, hd, ot, no, rc, tf, br) ########## # Loop through galactic latitude and longitude bins. For each bin vertex, take # the nearest 4 bin samples and calculate microlensing events. We do this # to properly handle bin edges (i.e. a sliding window analysis of 2x2 bins). # Duplicate events are removed. ########## # Should I use starmap_async? results = pool.starmap(synthetic._calc_event_time_loop, inputs) pool.close() pool.join() # Remove all the None values # (occurs for patches with less than 10 objects) results = [i for i in results if i is not None] # Remove all the None values # (occurs for patches with less than 10 objects) results = [i for i in results if i is not None] results_ev = [] results_bl = [] for ii in range(len(results)): if results[ii] is not None: if results[ii][0] is not None: results_ev.append(results[ii][0]) if results[ii][1] is not None: results_bl.append(results[ii][1]) events_tmp = np.concatenate(results_ev, axis=1) blends_tmp = np.concatenate(results_bl, axis=1) # Convert the events numpy array into an Astropy Table for easier consumption. 
# The dimensions of events_final_table are 52 x Nevents if events_tmp is not None: events_tmp = synthetic.unique_events(events_tmp) events_final = Table( events_tmp.T, names=('zams_mass_L', 'rem_id_L', 'mass_L', 'px_L', 'py_L', 'pz_L', 'vx_L', 'vy_L', 'vz_L', 'rad_L', 'glat_L', 'glon_L', 'vr_L', 'mu_b_L', 'mu_lcosb_L', 'age_L', 'popid_L', 'ubv_k_L', 'ubv_i_L', 'exbv_L', 'obj_id_L', 'ubv_j_L', 'ubv_u_L', 'ubv_r_L', 'ubv_b_L', 'ubv_h_L', 'ubv_v_L', 'zams_mass_S', 'rem_id_S', 'mass_S', 'px_S', 'py_S', 'pz_S', 'vx_S', 'vy_S', 'vz_S', 'rad_S', 'glat_S', 'glon_S', 'vr_S', 'mu_b_S', 'mu_lcosb_S', 'age_S', 'popid_S', 'ubv_k_S', 'ubv_i_S', 'exbv_S', 'obj_id_S', 'ubv_j_S', 'ubv_u_S', 'ubv_r_S', 'ubv_b_S', 'ubv_h_S', 'ubv_v_S', 'theta_E', 'u0', 'mu_rel', 't0')) if blends_tmp is not None: blends_tmp = synthetic.unique_blends(blends_tmp) blends_final = Table( blends_tmp.T, names=('obj_id_L', 'obj_id_S', 'zams_mass_N', 'rem_id_N', 'mass_N', 'px_N', 'py_N', 'pz_N', 'vx_N', 'vy_N', 'vz_N', 'rad_N', 'glat_N', 'glon_N', 'vr_N', 'mu_b_N', 'mu_lcosb_N', 'age_N', 'popid_N', 'ubv_k_N', 'ubv_i_N', 'exbv_N', 'obj_id_N', 'ubv_j_N', 'ubv_u_N', 'ubv_r_N', 'ubv_b_N', 'ubv_h_N', 'ubv_v_N', 'sep_LN')) if events_tmp is None: print('No events!') return t1 = time.time() # Save out file events_final.write(output_root2 + '_events.fits', overwrite=overwrite) blends_final.write(output_root2 + '_blends.fits', overwrite=overwrite) print('Total runtime: {0:f} s'.format(t1 - t0)) return
def analyze_recon(sbml_folder, output_stat_file, padmet_folder=None, padmet_bool=None, nb_cpu=1): """Analyze the sbml and/or the padmet files after metabolic network reconstruction. And write the result in a file. Args: sbml_folder (str): directory of SBML files output_stat_file (str): path to output stat file padmet_folder (str): directory of PADMET files padmet_bool (bool): use or not the padmet files nb_cpu (int): number of CPU to use """ analyze_pool = Pool(processes=nb_cpu) if padmet_bool and padmet_folder: genes = {} reactions = {} gene_associated_reactions = {} compounds = {} pathways = {} multiprocessing_data = [] if os.listdir(padmet_folder) == 0: logger.critical("No padmet in " + padmet_folder) sys.exit(1) for padmet in os.listdir(padmet_folder): padmet_file = os.path.join(padmet_folder, padmet) species_name = padmet.replace('.padmet', '') multiprocessing_data.append((species_name, padmet_file)) recon_stats = analyze_pool.starmap(create_padmet_stat, multiprocessing_data) with open(output_stat_file, 'w') as micro_file: csvwriter = csv.writer(micro_file, delimiter='\t') csvwriter.writerow(['species', 'nb_reactions', 'nb_reactions_with_genes', 'nb_genes', 'nb_compounds', 'nb_pathways']) for recon_stat in recon_stats: species_name = recon_stat[0] genes[species_name] = recon_stat[1] reactions[species_name] = recon_stat[2] gene_associated_reactions[species_name] = recon_stat[3] compounds[species_name] = recon_stat[4] pathways[species_name] = recon_stat[5] csvwriter.writerow([species_name, len(reactions[species_name]), len(gene_associated_reactions[species_name]), len(genes[species_name]), len(compounds[species_name]), len(pathways[species_name])]) else: genes = {} reactions = {} compounds = {} pathways = None gene_associated_reactions = {} multiprocessing_data = [] if os.listdir(sbml_folder) == 0: logger.critical("No sbml in " + sbml_folder) sys.exit(1) for sbml in os.listdir(sbml_folder): species_name = sbml.replace('.sbml','') sbml_file = os.path.join(sbml_folder, sbml) multiprocessing_data.append((species_name, sbml_file)) sbml_stats = analyze_pool.starmap(create_sbml_stat, multiprocessing_data) with open(output_stat_file, 'w') as micro_file: csvwriter = csv.writer(micro_file, delimiter='\t') csvwriter.writerow(['species', 'nb_reactions', 'nb_reactions_with_genes', 'nb_genes', 'nb_compounds']) for sbml_stat in sbml_stats: species_name = sbml_stat[0] genes[species_name] = set(sbml_stat[1]) reactions[species_name] = set(sbml_stat[2]) gene_associated_reactions[species_name] = set(sbml_stat[3]) compounds[species_name] = set(sbml_stat[4]) csvwriter.writerow([species_name, len(reactions[species_name]), len(gene_associated_reactions[species_name]), len(genes[species_name]), len(compounds[species_name])]) analyze_pool.close() analyze_pool.join() logger.info("######### Stats GSMN reconstruction #########") if len(genes) == len(reactions) and len(genes) == len(compounds) and len(reactions) == len(compounds): logger.info("Number of genomes: " + str(len(genes))) dataset_all_reactions = set([reaction for species_name in reactions for reaction in reactions[species_name]]) logger.info("Number of reactions in all GSMN: " + str(len(dataset_all_reactions))) dataset_all_compounds = set([compound for species_name in compounds for compound in compounds[species_name]]) logger.info("Number of compounds in all GSMN: " + str(len(dataset_all_compounds))) species_reactions = [len(reactions[species_name]) for species_name in reactions] if len(species_reactions) > 1: mean_species_reactions, sd_species_reactions 
= mean_sd_data(species_reactions) if mean_species_reactions and sd_species_reactions: logger.info("Average reactions per GSMN: " + mean_species_reactions + sd_species_reactions) else: logger.info("Number of reactions in GSMN: " + str(species_reactions[0])) species_compounds = [len(compounds[species_name]) for species_name in compounds] if len(species_compounds) > 1: mean_species_compounds, sd_species_compounds = mean_sd_data(species_compounds) if mean_species_compounds and sd_species_compounds: logger.info("Average compounds per GSMN: " + mean_species_compounds + sd_species_compounds) else: logger.info("Number of compounds in GSMN: " + str(species_compounds[0])) species_genes = [len(genes[species_name]) for species_name in genes] if len(species_genes) > 1: mean_species_genes, sd_species_genes = mean_sd_data(species_genes) if mean_species_genes and sd_species_genes: logger.info("Average genes per GSMN: " + mean_species_genes + sd_species_genes) else: logger.info("Number of genes in GSMN: " + str(species_genes[0])) if pathways: species_pathways = [len(pathways[species_name]) for species_name in pathways] if len(species_pathways) > 1: mean_species_pathways, sd_species_pathways = mean_sd_data(species_pathways) if mean_species_pathways and sd_species_pathways: logger.info("Average pathways per GSMN: " + mean_species_pathways + sd_species_pathways) else: logger.info("Number of pathways in GSMN: " + str(species_pathways[0])) gene_reactions_assoc_percentages = [] for species_name in reactions: if len(reactions[species_name]) > 0: gene_reactions_assoc_percentages.append(((len(gene_associated_reactions[species_name]) / len(reactions[species_name]))*100)) else: gene_reactions_assoc_percentages.append(0) logger.info('Warning: ' + species_name + ' metabolic network contains 0 reactions.') if len(gene_reactions_assoc_percentages) > 1: mean_gene_reactions_assoc_percentages, sd_gene_reactions_assoc_percentages = mean_sd_data(gene_reactions_assoc_percentages) if mean_gene_reactions_assoc_percentages and sd_gene_reactions_assoc_percentages: logger.info('Percentage of reactions associated with genes: ' + mean_gene_reactions_assoc_percentages + sd_gene_reactions_assoc_percentages) else: logger.info('Percentage of reactions associated with genes: ' + str(gene_reactions_assoc_percentages[0]))
def main(): parser = ArgumentParser() parser.add_argument('--seed', type=int, default=0) parser.add_argument('--train_corpus', type=Path, required=True) parser.add_argument("--output_dir", type=Path, required=True) parser.add_argument("--bert_model", type=str, required=True, choices=[ "bert-base-uncased", "bert-large-uncased", "bert-base-cased", "bert-base-multilingual-uncased", "bert-base-chinese", "bert-base-multilingual-cased" ]) parser.add_argument("--do_lower_case", action="store_true") parser.add_argument( "--do_whole_word_mask", action="store_true", help= "Whether to use whole word masking rather than per-WordPiece masking.") parser.add_argument( "--reduce_memory", action="store_true", help= "Reduce memory usage for large datasets by keeping data on disc rather than in memory" ) parser.add_argument("--num_workers", type=int, default=1, help="The number of workers to use to write the files") parser.add_argument("--epochs_to_generate", type=int, default=3, help="Number of epochs of data to pregenerate") parser.add_argument("--max_seq_len", type=int, default=128) parser.add_argument( "--short_seq_prob", type=float, default=0.1, help="Probability of making a short sentence as a training example") parser.add_argument( "--masked_lm_prob", type=float, default=0.15, help="Probability of masking each token for the LM task") parser.add_argument( "--max_predictions_per_seq", type=int, default=20, help="Maximum number of tokens to mask in each sequence") args = parser.parse_args() print('Using seed', args.seed) random.seed(args.seed) if args.num_workers > 1 and args.reduce_memory: raise ValueError("Cannot use multiple workers while reducing memory") tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) vocab_list = list(tokenizer.vocab.keys()) with DocumentDatabase(reduce_memory=args.reduce_memory) as docs: with args.train_corpus.open() as f: doc = [] for line in tqdm(f, desc="Loading Dataset", unit=" lines"): line = line.strip() if line == "": docs.add_document(doc) doc = [] else: tokens = tokenizer.tokenize(line) if tokens: # Sometimes the stuff is really weird and tokenization will fail doc.append(tokens) if doc: docs.add_document( doc ) # If the last doc didn't end on a newline, make sure it still gets added if len(docs) <= 1: exit( "ERROR: No document breaks were found in the input file! These are necessary to allow the script to " "ensure that random NextSentences are not sampled from the same document. Please add blank lines to " "indicate breaks between documents in your input file. If your dataset does not contain multiple " "documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, " "sections or paragraphs.") args.output_dir.mkdir(exist_ok=True) if args.num_workers > 1: writer_workers = Pool( min(args.num_workers, args.epochs_to_generate)) arguments = [(docs, vocab_list, args, idx) for idx in range(args.epochs_to_generate)] writer_workers.starmap(create_training_file, arguments) else: for epoch in trange(args.epochs_to_generate, desc="Epoch"): create_training_file(docs, vocab_list, args, epoch)
train_titles = sorted(set(titles['train']).intersection(merged_labels_titles))
test_titles = sorted(set(titles['test']).intersection(merged_labels_titles))
valid_titles = sorted(set(titles['valid']).intersection(merged_labels_titles))
print(train_titles)

unary_batch = True
if unary_batch == True:
    p = Pool()
    p.starmap(unary_batching,
              [(title, input1_path, train_output_path) for title in train_titles])
    p.starmap(unary_batching,
              [(title, input1_path, test_output_path) for title in test_titles])
    p.starmap(unary_batching,
              [(title, input1_path, valid_output_path) for title in valid_titles])
else:
    # this option puts all the files with the same length in the same batch.
    # this makes training faster, but it also produces lower quality models
    # This is a limit on how long/large a batch will be
    # A batch is n_time_steps*n_files,
    # where all the files have the same number of time steps
    size_limit = 10000
def run_overlay_resources_score_motifs(motif_sites_dir, all_chromatin_makrs_all_cells_combined_dir_path, motifs_overlapping_tracks_output_dir, run_in_parallel_param, number_processes_to_run_in_parallel, normal_expression_per_tissue_origin_per_TF, matching_cell_name_representative_dict, motifTFName_TFNames_matches_dict, cells_assays_dict, cell_tfs, tf_cells, assay_cells_datatypes, header): """pairs matching chromosomes in motif_sites_input_dir and all_chromatin_makrs_all_cells_input_dir and calls overlay_resources_score_motifs Input: moitf instances input dir (one file per chr) chromatin data collection dir (one file per chr, bed4 format; track pos, track cell#assaytype#value or cell#TFname in case of chip-seq) Return: a list of motif_overlapping_track files Precondition: files in motif_sites_input_dir and chromatin_tracks_input_dir should have the same names Recommended: name files in both dirs as chrNumber, chrX or chrY (where number is between 1-22) """ motif_files = [] if not os.path.isdir(motif_sites_dir) and os.path.isfile(motif_sites_dir): motif_files = [motif_sites_dir] motif_sites_dir = "." else: motif_files = os.listdir(motif_sites_dir) motif_files_full_path = [motif_sites_dir+'/' + s for s in motif_files] print(motif_files_full_path ) chromatin_tracks_files = os.listdir(all_chromatin_makrs_all_cells_combined_dir_path) if not os.path.exists(motifs_overlapping_tracks_output_dir): os.makedirs(motifs_overlapping_tracks_output_dir) #scored_motifs_chromatin_tracks_output_file = '.'.join(motifs_overlapping_tracks_file.split('.')[0:-1]) + '_scored.bed10' #if not (os.path.exists(motifs_overlapping_tracks_file) and os.path.exists(scored_motifs_chromatin_tracks_output_file)): if run_in_parallel_param and len(motif_files)>1: p = Pool(int(number_processes_to_run_in_parallel)) motifs_overlapping_tracks_files = p.starmap(overlay_resources_score_motifs, product(motif_files_full_path , [motifs_overlapping_tracks_output_dir], [all_chromatin_makrs_all_cells_combined_dir_path], [chromatin_tracks_files])) p.close() p.join() else: motifs_overlapping_tracks_files = overlay_resources_score_motifs(motif_files_full_path, motifs_overlapping_tracks_output_dir, all_chromatin_makrs_all_cells_combined_dir_path, chromatin_tracks_files) #score intersected track files print(motifs_overlapping_tracks_files) scored_motifs_overlapping_tracks_files =[] for motifs_overlapping_tracks_file in motifs_overlapping_tracks_files: scored_motifs_chromatin_tracks_output_file = '.'.join(motifs_overlapping_tracks_file.split('.')[0:-1]) + '_scored.bed10' with open(motifs_overlapping_tracks_file) as f: count = sum(1 for _ in f) if not os.path.exists(scored_motifs_chromatin_tracks_output_file):#score each motif-track_overlapping file file print("computing scores to: " + scored_motifs_chromatin_tracks_output_file) index_track_names=7 index_motif_name=3 with open(scored_motifs_chromatin_tracks_output_file, 'w') as scored_motifs_writefile: if header: header_line = ['posrange', 'chr', 'motifstart', 'motifend', 'name', 'score', 'pval','strand'] for cell in sorted(cells_assays_dict.keys()): for assay in sorted(cells_assays_dict[cell].keys()): if cell[0].isdigit(): #if cell=='22Rv1' or cell=='8988T': cell='a'+cell #if cell=="Ammon's horn": # cell="Ammons horn" #if cell=="Peyer's patch": # cell="Peyers patch" cell_name ='_'.join(((cell + "___" + assay).replace('(','').replace(')','') .replace('-','__').replace('.','').replace("'","")).split()) header_line.append('"'+cell_name+'"') #print(header_line) 
scored_motifs_writefile.write('\t'.join(header_line) + '\n') if (run_in_parallel_param): os.system( """split -l 200000 {} {}""" .format(motifs_overlapping_tracks_file,motifs_overlapping_tracks_file+'_tmp')) motifs_overlapping_tracks_file_splitted = glob.glob(motifs_overlapping_tracks_file+'_tmp*') p = Pool(int(number_processes_to_run_in_parallel)) p.starmap(score_motifs_per_cell, product(motifs_overlapping_tracks_file_splitted, [normal_expression_per_tissue_origin_per_TF], [matching_cell_name_representative_dict], [motifTFName_TFNames_matches_dict], [cells_assays_dict], [cell_tfs], [tf_cells], [assay_cells_datatypes], [index_track_names], [index_motif_name])) p.close() p.join() #remove tmp splitted files with open(scored_motifs_chromatin_tracks_output_file, 'a') as scored_motifs_writefile: for f in motifs_overlapping_tracks_file_splitted: print(f+'_scored') with open(f+'_scored', 'r') as f_score_ifile: l = f_score_ifile.readline() while l: scored_motifs_writefile.write(l) l = f_score_ifile.readline() f_score_ifile.close() os.remove(f) os.remove(f+'_scored') scored_motifs_writefile.close() else: score_motifs_per_cell(motifs_overlapping_tracks_file, normal_expression_per_tissue_origin_per_TF, matching_cell_name_representative_dict, motifTFName_TFNames_matches_dict, cells_assays_dict, cell_tfs, tf_cells, assay_cells_datatypes, index_track_names, index_motif_name) scored_motifs_overlapping_tracks_files.append(scored_motifs_chromatin_tracks_output_file) print(scored_motifs_overlapping_tracks_files) return motifs_overlapping_tracks_files, scored_motifs_overlapping_tracks_files
# Establish iterators
it1 = batch_iterator(FastqGeneralIterator(f1), n)
it2 = batch_iterator(FastqGeneralIterator(f2), n)
it3 = batch_iterator(FastqGeneralIterator(f3), n)

# iterate over batches of length n
for i, batch1 in enumerate(it1):
    batch2 = it2.__next__()
    batch3 = it3.__next__()
    output = o

    # parallel process the barcode processing and accounting of failures.
    pool = Pool(processes=cpu)
    pm = pool.map(debarcode_trio, zip(batch1, batch2, batch3))
    pool.close()

    # Aggregate output
    fastq1 = [item[0] for item in pm]
    fastq2 = [item[1] for item in pm]

    # Export one chunk in parallel
    filename1 = output + '_R1.fastq.gz'
    filename2 = output + '_R2.fastq.gz'
    pool = Pool(processes=2)
    toke = pool.starmap(chunk_writer_gzip, [(filename1, fastq1), (filename2, fastq2)])
    pool.close()
def main(magneticum_snap_directories, bahamas_snap_directories, multi_processing=False, addition=False, check_clusters=False, number_of_virtual_nodes=500, number_of_projections=26, exposure_time=5000., redshift=0.20, plotting=0, cores=16, number_of_neighbours=6, move_to_front=None): yt.funcs.mylog.setLevel(40) # Suppresses yt status output. soxs.utils.soxsLogger.setLevel(40) # Suppresses soxs status output. pyxsim.utils.pyxsimLogger.setLevel(40) # Suppresses pyxsim status output. # Define the directories containing the data if os.getcwd().split('/')[2] == 's2675544': base_data_dir = '/home/s2675544/data' print('Running on ALICE') else: base_data_dir = '/home/matthijs/Documents/Studie/Master_Astronomy/1st_Research_Project/Data' print('Running at home') my_magneticum_data_dir = os.path.join(base_data_dir, 'Magneticum/Box2_hr') my_bahamas_data_dir = os.path.join(base_data_dir, 'Bahamas') my_tf_records_dir = os.path.join(base_data_dir, 'tf_records') magneticum_snap_paths = [ os.path.join(my_magneticum_data_dir, snap_dir) for snap_dir in magneticum_snap_directories ] bahamas_snap_paths = [ os.path.join(my_bahamas_data_dir, snap_dir) for snap_dir in bahamas_snap_directories ] defect_clusters = { 'snap_128': { 'split': [109, 16, 72, 48], 'photon_max': [53, 78], 'too_small': [] }, 'snap_132': { 'split': [75, 50, 110, 18], 'photon_max': [8, 52, 55, 93, 139, 289], 'too_small': [] }, 'snap_136': { 'split': [75, 107, 52, 15], 'photon_max': [96, 137, 51, 315, 216, 55, 102, 101, 20, 3], 'too_small': [] }, 'AGN_TUNED_nu0_L100N256_WMAP9': { 'split': [], 'photon_max': [3], 'too_small': [4, 10] + list(set(np.arange(20, 200)) - {20, 21, 22, 28}) }, 'AGN_TUNED_nu0_L400N1024_WMAP9': { 'split': [62, 89, 108, 125, 130, 191], 'photon_max': [], 'too_small': [] } } for snap_idx, snap_path in enumerate(magneticum_snap_paths + bahamas_snap_paths): print(f'Snapshot path : {snap_path}') snap_dir = os.path.basename(snap_path) if snap_dir[0:3] == 'AGN': cluster_dirs = glob.glob(os.path.join(snap_path, '*')) snapnum = 32 gdata = g.Gadget(os.path.join(base_data_dir, snap_dir), 'subh', snapnum, sim='BAHAMAS') subhalo_ids = [ int(id) for id in gdata.read_var('FOF/FirstSubhaloID', verbose=False) ] centers = gdata.read_var('Subhalo/CentreOfPotential', verbose=False) centers = centers[subhalo_ids[:-1]] # Convert to codelength by going from cm to Mpc and from Mpc to codelength centers /= gdata.cm_per_mpc / 0.7 else: cluster_dirs = glob.glob(os.path.join(snap_path, '*/*/*')) centers = [] tfrecord_dir = os.path.join(my_tf_records_dir, snap_dir + '_tf_records') print(f'Tensorflow records will be saved in : {tfrecord_dir}') print(f'Number of clusters : {len(cluster_dirs)}') bad_cluster_idx = defect_clusters[snap_dir]['split'] + \ defect_clusters[snap_dir]['too_small'] + \ defect_clusters[snap_dir]['photon_max'] print( f'Number of viable clusters : {len(cluster_dirs) - len(bad_cluster_idx)}' ) if check_clusters: if os.path.isdir(tfrecord_dir): tfrecords = glob.glob(os.path.join(tfrecord_dir, '*')) if os.path.isdir(tfrecords[0]): tfrecords = glob.glob(os.path.join(tfrecord_dir, 'train', '*')) + \ glob.glob(os.path.join(tfrecord_dir, 'test', '*')) processed_cluster_idxs = [] # Why does the downsample function not run if this is executed AND multiprocessing is on???? 
existing_cluster_datasets = tf.data.TFRecordDataset( tfrecords).map( lambda record_bytes: existing_clusters(record_bytes)) os.makedirs(os.path.join(base_data_dir, 'images', snap_dir), exist_ok=True) for cluster_idx, projection_idx, image in tqdm( iter(existing_cluster_datasets)): processed_cluster_idxs.append(cluster_idx.numpy()) if projection_idx.numpy() < 3: fig, ax = plt.subplots(1, 1, figsize=(6, 6)) ax.imshow(image.numpy()[:, :, 0]) plt.savefig( os.path.join( base_data_dir, 'images', snap_dir, 'xray_' + '{:03d}'.format(cluster_idx.numpy()) + '_' + '{:02d}'.format(projection_idx.numpy()) + '.png')) plt.close(fig=fig) bad_cluster_dirs = [] for cluster_dir in cluster_dirs: if get_index(cluster_dir) in bad_cluster_idx: bad_cluster_dirs.append(cluster_dir) for bad_cluster_dir in bad_cluster_dirs: cluster_dirs.remove(bad_cluster_dir) for cluster_dir in sorted(cluster_dirs): if processed_cluster_idxs.count( get_index(cluster_dir)) != 26: print( 'Unfinished cluster ', cluster_dir, ' : ', processed_cluster_idxs.count( get_index(cluster_dir)), ' images') print(f'Number of bad clusters: {len(bad_cluster_dirs)}') print( f'Already processed : {len(set(processed_cluster_idxs))}') print( f'Remaining cluster indices : {list(set([get_index(cluster) for cluster in cluster_dirs]) - set(processed_cluster_idxs))}' ) else: print( 'Can not check clusters because the cluster directory does not yet exist!' ) else: bad_cluster_dirs = [] # print(f'All cluster indices : {[get_index(cluster) for cluster in cluster_dirs]}') if move_to_front is not None: cluster_dirs.insert( 0, cluster_dirs.pop([ get_index(cluster) for cluster in cluster_dirs ].index(move_to_front))) if addition: # Remove clusters which are already processed, lie on a periodic boundary # or will take too long to process # good_cluster_idxs = [53, 78] # snap_128 # good_cluster_idxs = [139, 289, 1] # snap_132 # good_cluster_idxs = [0, 7, 28, 59, 2, 53, 46, 152] # snap_132 # AGN 400 remaining big clusters [0, 1, 2, 3, 4, 40] good_cluster_idxs = [0, 1, 2, 3, 4, 40] for cluster_dir in cluster_dirs: if get_index(cluster_dir) not in good_cluster_idxs: bad_cluster_dirs.append(cluster_dir) else: for cluster_dir in cluster_dirs: if get_index(cluster_dir) in bad_cluster_idx: bad_cluster_dirs.append(cluster_dir) for bad_cluster_dir in bad_cluster_dirs: cluster_dirs.remove(bad_cluster_dir) # print(f'Remaining cluster indices : {[get_index(cluster) for cluster in cluster_dirs]}') if multi_processing: params = [(cluster, tfrecord_dir, base_data_dir, cluster_dirs, snap_dir, centers, number_of_projections, exposure_time, redshift, number_of_virtual_nodes, number_of_neighbours, plotting) for cluster in cluster_dirs] pool = Pool(cores) pool.starmap(generate_data, params) else: for cluster in cluster_dirs: generate_data( cluster=cluster, tfrecord_dir=tfrecord_dir, base_data_dir=base_data_dir, cluster_dirs=cluster_dirs, snap_dir=snap_dir, centers=centers, number_of_projections=number_of_projections, exp_time=exposure_time, redshift=redshift, number_of_virtual_nodes=number_of_virtual_nodes, number_of_neighbours=number_of_neighbours, plotting=plotting)
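Because generate_data is called positionally through starmap, each tuple in params has to match its signature exactly; the sequential branch is the easier place to debug that. A condensed sketch of the same multi_processing toggle, with a hypothetical generate worker and toy cluster names:

from multiprocessing import Pool

def generate(cluster, out_dir, n_projections):
    # placeholder for the per-cluster export step
    return '{}/{}_{}.tfrecord'.format(out_dir, cluster, n_projections)

def run(clusters, out_dir, n_projections=26, multi_processing=True, cores=4):
    params = [(c, out_dir, n_projections) for c in clusters]
    if multi_processing:
        with Pool(cores) as pool:
            return pool.starmap(generate, params)
    # the sequential fallback is handy when debugging errors raised inside workers
    return [generate(*p) for p in params]

if __name__ == '__main__':
    print(run(['c0', 'c1', 'c2'], 'records', multi_processing=True, cores=2))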
from multiprocessing import Pool

def el(row, col):
    # dot product of one row of m1 with one column of m2
    return sum(i * j for i, j in zip(row, col))

m1 = [[1, 1], [2, 2]]
m2 = [[3, 4], [3, 4]]
m22 = list(zip(*m2))  # columns of m2

if __name__ == '__main__':
    pool = Pool(3)
    # one starmap call covers every (row, column) pair of the product m1 @ m2;
    # the original nested loops over i and j only repeated this same call
    res = pool.starmap(el, [(row, col) for row in m1 for col in m22])
    pool.close()
    pool.join()
    print(res)
def getVersions(config): p = Pool(len(config['targets'])) configs = [config for t in config['targets']] p.starmap(getVersion, zip(configs, config['targets']))
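getVersions materialises a list of identical configs just to pair one with each target; itertools.repeat expresses the same pairing lazily. A small sketch under that assumption, with a hypothetical get_version worker in place of getVersion:

from itertools import repeat
from multiprocessing import Pool

def get_version(config, target):
    # placeholder for a per-target query
    return '{}: {}'.format(target, config['api'])

if __name__ == '__main__':
    config = {'api': 'v2', 'targets': ['host-a', 'host-b', 'host-c']}
    with Pool(len(config['targets'])) as p:
        # repeat(config) lazily pairs the same config with every target
        versions = p.starmap(get_version, zip(repeat(config), config['targets']))
    print(versions)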
from itertools import repeat
from multiprocessing import Pool
import pandas as pd

nodes = search_algos.parse_graph()

# PARAMS
# iterative experiments
MAX_ITER = 10000
RATE = 0.004  # to be determined
N_CORES = 12

# GLS++ iterative
pool = Pool(N_CORES)
max_iters = [MAX_ITER] * N_CORES
# copies = repeat(nodes.copy())
args = list(zip(repeat(nodes, N_CORES), repeat(MAX_ITER, N_CORES)))
MLS_iter_res12 = pool.starmap(search_algos.GLS_own_iter, args)
pool.close()

pool = Pool(13)
args = list(zip(repeat(nodes, 13), repeat(MAX_ITER, 13)))
MLS_iter_res13 = pool.starmap(search_algos.GLS_own_iter, args)
pool.close()

total = MLS_iter_res12 + MLS_iter_res13
results = []
for i, dict_ in enumerate(total):
    # use a plain list (not itertools.repeat) so pandas stores a real column
    n_rows = len(dict_[list(dict_.keys())[0]])
    dict_['iteration'] = [i] * n_rows
    results.append(pd.DataFrame(dict_))

results_total = pd.concat(results)
results_total.to_csv('GLS_plusplus.csv')
del results_total, MLS_iter_res12, MLS_iter_res13, total, results
def getOEMUpdateStatuses(config): p = Pool(len(config['targets'])) configs = [config for t in config['targets']] p.starmap(getOEMUpdateStatus, zip(configs, config['targets']))
class Tangle: def __init__(self, transactions, genesis): self.transactions = transactions self.genesis = genesis if current_process().name == 'MainProcess': os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' self.process_pool = Pool(10) def add_transaction(self, tip): self.transactions[tip.name()] = tip def run_nodes(self, train_fn, clients, rnd, num_epochs=1, batch_size=10, malicious_clients=None, poison_type=PoisonType.NONE): norm_this_round = [] new_transactions = [] sys_metrics = { c.id: {BYTES_WRITTEN_KEY: 0, BYTES_READ_KEY: 0, LOCAL_COMPUTATIONS_KEY: 0} for c in clients} train_params = [[client.id, client.group, client.model.flops, random.randint(0, 4294967295), client.train_data, client.eval_data, self.name, (client.id in malicious_clients), poison_type] for client in clients] results = self.process_pool.starmap(train_fn, train_params) for tx, metrics, client_id, client_sys_metrics in results: if tx is None: continue sys_metrics[client_id][BYTES_READ_KEY] += client_sys_metrics[BYTES_READ_KEY] sys_metrics[client_id][BYTES_WRITTEN_KEY] += client_sys_metrics[BYTES_WRITTEN_KEY] sys_metrics[client_id][LOCAL_COMPUTATIONS_KEY] = client_sys_metrics[LOCAL_COMPUTATIONS_KEY] tx.tag = rnd new_transactions.append(tx) for tx in new_transactions: self.add_transaction(tx) return sys_metrics def test_model(self, test_fn, clients_to_test, set_to_use='test'): metrics = {} test_params = [[client.id, client.group, client.model.flops, random.randint(0, 4294967295), client.train_data, client.eval_data, self.name, set_to_use] for client in clients_to_test] results = self.process_pool.starmap(test_fn, test_params) for client, c_metrics in results: metrics[client] = c_metrics return metrics def save(self, tangle_name, global_loss, global_accuracy, norm): n = [{'name': t.name(), 'time': t.tag, 'malicious': t.malicious, 'parents': list(t.parents)} for _, t in self.transactions.items()] with open(f'tangle_data/tangle_{tangle_name}.json', 'w') as outfile: json.dump({'nodes': n, 'genesis': self.genesis, 'global_loss': global_loss, 'global_accuracy': global_accuracy, 'norm': norm}, outfile) self.name = tangle_name @classmethod def fromfile(cls, tangle_name): with open(f'tangle_data/tangle_{tangle_name}.json', 'r') as tanglefile: t = json.load(tanglefile) transactions = {n['name']: Transaction(None, set(n['parents']), n['name'], n['time'], n['malicious'] if 'malicious' in n else False) for n in t['nodes']} tangle = cls(transactions, t['genesis']) tangle.name = tangle_name return tangle
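Tangle keeps its Pool as an instance attribute and guards creation with a MainProcess check so forked workers never build pools of their own. A stripped-down sketch of that idea; Runner, square_pair and the worker count are illustrative, not part of the original code:

from multiprocessing import Pool, current_process

def square_pair(a, b):
    # toy per-client work item
    return a * a + b * b

class Runner:
    """Owns a worker pool that is reused across rounds (a minimal sketch)."""

    def __init__(self, workers=4):
        # only the parent process should ever create the pool
        if current_process().name == 'MainProcess':
            self.pool = Pool(workers)

    def run_round(self, fn, params):
        return self.pool.starmap(fn, params)

    def shutdown(self):
        self.pool.close()
        self.pool.join()

if __name__ == '__main__':
    runner = Runner(workers=2)
    print(runner.run_round(square_pair, [(1, 2), (3, 4)]))
    runner.shutdown()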
def updateBIOSfirmwares(config): p = Pool(len(config['targets'])) configs = [config for t in config['targets']] p.starmap(updateBIOSfirmware, zip(configs, config['targets']))
def parallel_simu_help(mode, num_obs, p, generating_mode, individual_level=True,
                       num_iterations=50, noise_type='G'):
    assert generating_mode in ['ma', 'var']
    print("now doing simulation with setting p = {}, mode = {}".format(p, mode))
    print("================")
    weights = fetch_weights(p, mode, generating_mode)
    stdev = 1
    span = fetch_span(num_obs, generating_mode)
    model_info = {}
    model_info['model'] = generating_mode
    model_info['weights'] = weights
    model_info['span'] = span
    model_info['stdev'] = stdev
    if generating_mode == 'ma':
        ts = generate_ma(weights, num_obs=num_obs, stdev=stdev, noise_type=noise_type)
    elif generating_mode == 'var':
        ts = generate_mvar(weights, num_obs=num_obs, stdev=stdev, noise_type=noise_type)
    arguments = list(zip(cycle([num_obs]), [model_info for _ in range(num_iterations)]))
    # print(arguments)
    iteration_pool = Pool(10)
    res = iteration_pool.starmap(evaluate_iteration, arguments)
    iteration_pool.close()
    errs_dict_al, errs_dict_th, errs_dict_so, errs_dict_sh, errs_dict_sm, \
        precision_th, recall_th, F1_th, \
        precision_so, recall_so, F1_so, \
        precision_al, recall_al, F1_al, true_spectral_norm_square = list(zip(*res))
    true_spectral_norm_square = true_spectral_norm_square[0]
    result = {}
    result['raw_error'] = {
        'al': errs_dict_al,
        'th': errs_dict_th,
        'so': errs_dict_so,
        'sh': errs_dict_sh,
        'sm': errs_dict_sm,
        'true': true_spectral_norm_square
    }
    result['precision'] = {
        'so': (np.mean(precision_so), np.std(precision_so)),
        'al': (np.mean(precision_al), np.std(precision_al)),
        # use the 'th' run's own precision values, not the 'al' ones
        'th': (np.mean(precision_th), np.std(precision_th))
    }
    result['recall'] = {
        'so': (np.mean(recall_so), np.std(recall_so)),
        'al': (np.mean(recall_al), np.std(recall_al)),
        'th': (np.mean(recall_th), np.std(recall_th))
    }
    result['F1'] = {
        'so': (np.mean(F1_so), np.std(F1_so)),
        'al': (np.mean(F1_al), np.std(F1_al)),
        'th': (np.mean(F1_th), np.std(F1_th))
    }
    append_relative_err(result)
    return result, simu_setting_2_str(p, mode)
def main(args=None): if args is None: # Top level arg parser parser = argparse.ArgumentParser() # Allow adding of subparsers - one for each model type subparsers = parser.add_subparsers(help='first argument should be the model to use e.g. BIMPM') # Add the parsers for the different model types ms = [i for i in dir(m_args) if '__' not in i] ms = [i for i in ms if i != 'str2bool'] for m in ms: p = subparsers.add_parser(m.split('_args')[0]) p = getattr(m_args, m)(p) args = parser.parse_args() kwargs = {} # Match problems to the loss fns they'll be trained with prob_losses = probs_to_losses(args.problems, args.loss_fn, args.loss_weights) vocab = None # Prep all the datasets prob_iterators = {} # Temporarily store the preprocessed data iterators temp_store = './data/data_temp/' + args.save_id + '/' if os.path.exists(temp_store): shutil.rmtree(temp_store) os.makedirs(temp_store) v_load = True for prob, dd in prob_losses.items(): if problems[prob]['preload']: prob_temp = temp_store + prob + '/' # Read in the datasets - run in separate process so can free up memory when done print('Reading datasets and building vocab for {} problem...'.format(prob)) p = Pool(processes=1) v_load = p.starmap(prep_data, [(args, prob, dd, vocab, False, prob_temp, temp_store)]) p.close() v_load = v_load[0] # Load the vocab object as too large to pass back from multiprocess if v_load: vocab = joblib.load(temp_store + '/vocab') # Read the val iterator into memory and pass list of train iterator names fs = os.listdir(prob_temp) prob_iterators[prob] = {} prob_iterators[prob]['train'] = [prob_temp + t for t in fs if 'train' in t] if 'val' in fs: prob_iterators[prob]['val'] = joblib.load(prob_temp + 'val') else: base = './data/problems/' + prob + '/data/' prob_iterators[prob] = {'train': [base + f for f in os.listdir(base)]} if dd['ent']: setattr(args, 'num_ent_types', problems[prob]['ent_types']) if v_load: args.vocab_size = len(vocab.word_to_index) # Save the args and the vocab object if args.save: save_path = './saved_models/' + '_'.join(args.problems) + '/' + args.model + '/' + args.save_id + '/' os.makedirs(save_path, exist_ok=True) args.save_path = save_path # Save the params pickle.dump(args, open(save_path + 'args.p', 'wb')) # Save the vocab if v_load: pickle.dump(vocab, open(save_path + 'vocab.p', 'wb')) # Create the model model_class = getattr(models, args.model) print('Building model...') model = model_class(args, vocab, **kwargs) if args.gpu: model.cuda() # Train the model print('Training...') r = trainer(model, prob_iterators, args, vocab, prob_losses) return r
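prep_data is pushed into a one-worker Pool mainly so that its large intermediates are freed when the child process exits. A minimal sketch of that trick, with a hypothetical heavy_prep function standing in for prep_data:

from multiprocessing import Pool

def heavy_prep(path, limit):
    # placeholder: build a large intermediate object and return only a summary
    data = list(range(limit))
    return {'path': path, 'rows': len(data)}

if __name__ == '__main__':
    # a single-worker pool lets the big intermediates die with the child process
    with Pool(processes=1) as p:
        summary = p.starmap(heavy_prep, [('corpus.txt', 1000000)])[0]
    print(summary)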
class S2Ranker: """A class to encapsulate the Semantic Scholar search ranker. Arguments: data_dir {str} -- where the language models and lightgbm model live. use_posthoc_correction {bool} -- whether to use posthoc correction """ def __init__(self, data_dir, use_posthoc_correction=True): self.use_posthoc_correction = use_posthoc_correction self.data_dir = data_dir with open(os.path.join(data_dir, 'lightgbm_model.pickle'), 'rb') as f: self.model = pickle.load(f) self.pool = Pool(cpu_count()) def score(self, query, papers): """Score each pair of (query, paper) for all papers Arguments: query {str} -- plain text search query papers {list of dicts} -- A list of candidate papers, each of which is a dictionary. Returns: scores {np.array} -- an array of scores, one per paper in papers """ query = str(query) #presults = [self.prepare_result(paper) for paper in papers] presults = self.pool.map(self.prepare_result, papers) args = [(query, pr) for pr in presults] feats = self.pool.starmap(make_features, args) X = np.array(feats) scores = self.model.predict(X) if self.use_posthoc_correction: scores = posthoc_score_adjust(scores, X, query) return scores @classmethod def prepare_result(cls, paper): """Prepare the raw text result for featurization Arguments: paper {dict} -- A dictionary that has the required paper fields: 'title', 'abstract', 'authors', 'venues', 'year', 'n_citations', 'n_key_citations' Returns: out {dict} -- A dictionary where the paper fields have been pre-processed. """ out = {'paper_year': paper.get('year', np.nan)} out['n_citations'] = paper.get('n_citations', 0) # if n_key_citations aren't available, we can get a quick estimate of what they are from the n_citations out['n_key_citations'] = paper.get( 'n_key_citations', int(-1.4 + np.log1p(out['n_citations']))) if out['n_key_citations'] < 0: out['n_key_citations'] = 0 out['paper_title_cleaned'] = fix_text(paper.get('title', '')) out['paper_abstract_cleaned'] = fix_text(paper.get('abstract', '')) out['paper_venue_cleaned'] = fix_text(paper.get('venue', '')) out['author_name'] = [ fix_author_text(i) for i in paper.get('authors', []) ] return out
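S2Ranker mixes pool.map (one argument per call) with pool.starmap (one argument tuple per call). A tiny sketch contrasting the two, using made-up clean and featurize helpers rather than the ranker's real featurizer:

from multiprocessing import Pool

def clean(paper):
    # map: exactly one argument per call
    return paper.strip().lower()

def featurize(query, paper):
    # starmap: each input tuple is unpacked into (query, paper)
    return len(query) + len(paper)

if __name__ == '__main__':
    papers = ['  Paper One ', ' Paper Two  ']
    with Pool(2) as pool:
        cleaned = pool.map(clean, papers)
        feats = pool.starmap(featurize, [('q', p) for p in cleaned])
    print(cleaned, feats)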
cv2.RETR_TREE, \ cv2.CHAIN_APPROX_SIMPLE) mask_shape_lv_ = img.shape print(mask_shape_lv_) tissue_mask_lv_ = make_tumor_mask(mask_shape_lv_, contours_tissue_lv_) print('at save') cv2.imwrite(save_location, tissue_mask_lv_) if __name__ == '__main__': tiff_path = './data/svs/' print('start') t1 = time.time() save_location = './data/tissue_mask/' print('start') list_file_name = [f for f in listdir(tiff_path)] opt_list = [] for i, file_name in enumerate(list_file_name): if file_name.split('.')[-1] == 'svs': cur_name = file_name.split('.svs')[0] file_path = tiff_path + file_name img = choose_level(file_path) save_path = save_location + cur_name + '_tissue_mask_' + '64.png' opt_list.append((img, save_path)) pool = Pool(5) pool.starmap(make_mask, opt_list) pool.close() pool.join() t2 = time.time() print((t2 - t1) / 60)
def extract(a_range, a_dirs, a_tensor, size): for i in a_range: a_data = np.load(DATASET_POSITIVE_CHIRPS_PATH + a_dirs[i] + '/X.npy').astype(np.float32) a_data = F.interpolate(torch.tensor(a_data), size=[224, 224]) try: a_tensor[i % size] = a_data except: continue if __name__ == '__main__': MAX_CORES = os.cpu_count() PATH_TO_BUCKET = '../../data-step/processed_data/' p = Pool(MAX_CORES) p_args = [] b = size * int(sys.argv[1]) for i in range(MAX_CORES): s = b + i * (size // MAX_CORES) e = b + (i + 1) * (size // MAX_CORES) p_args.append((np.arange(s, e), dirs, tensor, size)) p.starmap(extract, p_args) p.close() torch.save(tensor, PATH_TO_BUCKET + f'positive/chirps/chirps{b}.pt')
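extract writes into a tensor that was passed in as an argument, but Pool workers normally receive copies of their arguments, so such in-place writes are not guaranteed to reach the parent. A safer sketch, with toy shapes and a hypothetical load_range loader, has each worker return its slice and lets the parent assemble the result:

import numpy as np
from multiprocessing import Pool

def load_range(indices, size):
    # placeholder loader: return this worker's slice as a plain array
    return indices, np.ones((len(indices), 4), dtype=np.float32)

if __name__ == '__main__':
    size, workers = 16, 4
    ranges = [np.arange(i * size // workers, (i + 1) * size // workers) for i in range(workers)]
    out = np.zeros((size, 4), dtype=np.float32)
    with Pool(workers) as p:
        for idx, chunk in p.starmap(load_range, [(r, size) for r in ranges]):
            out[idx] = chunk  # the parent assembles, so no cross-process mutation is needed
    print(out.shape, out.sum())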
def test_calc_event_time_loop(n_proc): t0 = time.time() test_data_dir = '/u/casey/scratch/work/microlens/galaxia_test/OGLE672/' hdf5_file = test_data_dir + 'OGLE672.h5' obs_time = 1000 n_obs = 101 radius_cut = 2 theta_frac = 2 blend_rad = 0.8 # Initialize events_tmp and blends_tmp. events_tmp = None blends_tmp = None # Get the l and b from the HDF5 file. hf = h5py.File(hdf5_file, 'r') l_array = np.array(hf['long_bin_edges']) b_array = np.array(hf['lat_bin_edges']) hf.close() # Converts radius_cut from arcseconds into milliarcseconds radius_cut *= 1000.0 # Set up the multiprocessing pool = Pool(n_proc) # # Set up inputs to be able to be read by pool.map # # Only do a few of the bins. nll = range(10, 18, 1) nbb = range(10, 11, 1) llbb = itertools.product(nll, nbb) reps = len(nll) * len(nbb) hd = itertools.repeat(hdf5_file, reps) ot = itertools.repeat(obs_time, reps) no = itertools.repeat(n_obs, reps) rc = itertools.repeat(radius_cut, reps) tf = itertools.repeat(theta_frac, reps) br = itertools.repeat(blend_rad, reps) inputs = zip(llbb, hd, ot, no, rc, tf, br) ########## # Loop through galactic latitude and longitude bins. For each bin vertex, take # the nearest 4 bin samples and calculate microlensing events. We do this # to properly handle bin edges (i.e. a sliding window analysis of 2x2 bins). # Duplicate events are removed. ########## # map_async? Make it wait till there are all done. results = pool.starmap(synthetic._calc_event_time_loop, inputs) pool.close() pool.join() results = [i for i in results if i is not None] # Is there a way for this to NOT involve a loop????? if len(results) > 0: for ii in range(len(results)): if events_tmp is not None: events_tmp = np.concatenate((events_tmp, results[ii][0]), axis=1) else: events_tmp = results[ii][0] for ii in range(len(results)): if blends_tmp is not None: blends_tmp = np.concatenate((blends_tmp, results[ii][1]), axis=1) else: blends_tmp = results[ii][1] t1 = time.time() runtime = t1 - t0 return runtime
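The test builds its starmap inputs by zipping an itertools.product of bin indices with itertools.repeat for every constant argument. A compact sketch of that construction, with a hypothetical work function and toy parameters:

import itertools
from multiprocessing import Pool

def work(cell, fname, obs_time):
    # placeholder per-bin computation
    l, b = cell
    return (l, b, fname, obs_time)

if __name__ == '__main__':
    cells = itertools.product(range(3), range(2))  # every (l, b) bin pair
    reps = 3 * 2
    inputs = zip(cells,
                 itertools.repeat('catalog.h5', reps),
                 itertools.repeat(1000, reps))
    with Pool(2) as pool:
        results = pool.starmap(work, inputs)
    print(results)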
def main(): config = parseArgs() print(f'config >{config}<') check_config(config) nr_of_cpus = psutil.cpu_count(logical=True) print(f'We got nr_of_cpus >{nr_of_cpus}<') ## build return type dict-file and max-seq-length-file and vocabulary pickle_files = common_stuff_lib.get_all_filenames_of_type( config['save_dir'], '.pickle') #print(f'pickle-files we use to build >{pickle_files}<') pickle_lib.print_X_pickle_filenames(pickle_files, 5) print(f'Building return-type dict, vocabulary and max-squenece-length') p = Pool(nr_of_cpus) pickle_files = [config['save_dir'] + "/" + f for f in pickle_files] star_list = zip(pickle_files, repeat(config)) all_ret_types = p.starmap(proc_build, star_list) p.close() p.join() ret_set = set() vocab = set() seq_length = 0 ##put all stuff together for ret_set1, vocab1, seq_length1 in all_ret_types: ret_set.update(ret_set1) vocab.update(vocab1) if seq_length1 > seq_length: seq_length = seq_length1 # ret_set = set() # vocab = set() # seq_length = 0 # counter = 1 # pickle_count = len(pickle_files) # # for file in pickle_files: # print(f'File >{file}< >{counter}/{pickle_count}<', end='\r') # counter += 1 # cont = pickle_lib.get_pickle_file_content(config['save_dir'] + file) # for item in cont: # #print(f'item-1 >{item[1]}<') # ## build ret-type-dict # ret_set.add(item[1]) # # ##build max-seq-length # if len(item[0]) > seq_length: # seq_length = len(item[0]) # # ## build vocabulary # for word in item[0].split(): # vocab.add(word) print( f"Build return-type dict from set and save it to >{config['return_type_dict_file']}<" ) ## build ret-type-dict and save ret_type_dict = dict() counter = 0 ret_set_list = sorted(ret_set) for elem in ret_set_list: ret_type_dict[elem] = counter counter += 1 print(f"ret-type-dict :") for key in ret_type_dict: print(f"nr-of-args >{key}< label >{ret_type_dict[key]}<") pickle_lib.save_to_pickle_file(ret_type_dict, config['return_type_dict_file']) print(f"Saving vocabulary to >{config['vocabulary_file']}<") ## build vocabulary list from set and save vocab_list = list(vocab) pickle_lib.save_to_pickle_file(vocab_list, config['vocabulary_file']) ## save max-seq-length print(f"Saving max-sequence-length to >{config['max_seq_length_file']}<") pickle_lib.save_to_pickle_file(seq_length, config['max_seq_length_file']) print("Done. Run build_balanced_dataset.py next")
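After the starmap, the per-file sets and lengths are merged in the parent, which is a simple map-then-reduce. A small sketch of that shape, with a hypothetical scan_file worker instead of proc_build:

from itertools import repeat
from multiprocessing import Pool

def scan_file(path, config):
    # placeholder: return (labels, vocabulary, longest sequence) for one file
    words = path.replace('.', ' ').split()
    return {path}, set(words), len(words)

if __name__ == '__main__':
    files = ['a.pickle', 'b.pickle', 'c.pickle']
    config = {'save_dir': '.'}
    with Pool(3) as p:
        parts = p.starmap(scan_file, zip(files, repeat(config)))
    labels, vocab, seq_len = set(), set(), 0
    for lab, voc, length in parts:  # merge the per-file partial results
        labels |= lab
        vocab |= voc
        seq_len = max(seq_len, length)
    print(len(labels), len(vocab), seq_len)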
global_list_minus_dif = manager.list() global_list_plus_difc = manager.list() global_list_minus_difc = manager.list() global_list_ezk_plus_sim = manager.list() global_list_ezk_minus_sim = manager.list() global_list_ezk_plus_simc = manager.list() global_list_ezk_minus_simc = manager.list() global_list_ezk_plus_dif = manager.list() global_list_ezk_minus_dif = manager.list() global_list_ezk_plus_difc = manager.list() global_list_ezk_minus_difc = manager.list() # main function for hypotheses generation: p = Pool(32) p.starmap(extensions_apply, arguments) p.close() p.join() # now we need only 8 processes: no extensions (4x), and we only process lists of each sign (8): arguments2 = [(x, y) for x in signs for y in strategies] p = Pool(8) p.starmap(lists_analysis, arguments2) p.close() p.join() # now to exclude overlapping hypotheses (only in strategies without counter-examples): p = Pool(2) async_eliminate_overap = p.map_async(eliminate_overap, ['sim', 'dif']) p.close()
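The manager.list() objects above are proxies that pooled workers can append to directly. A minimal sketch of passing a Manager-backed list through starmap; collect and its toy arguments are illustrative:

from multiprocessing import Manager, Pool

def collect(tag, value, shared):
    # each worker appends into the manager-backed list proxy
    shared.append((tag, value))

if __name__ == '__main__':
    manager = Manager()
    shared = manager.list()
    args = [(sign, v, shared) for sign in ('plus', 'minus') for v in range(3)]
    with Pool(4) as p:
        p.starmap(collect, args)
    print(list(shared))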
def eval_mot(results, annotations, logger=None, classes=None, iou_thr=0.5, ignore_iof_thr=0.5, ignore_by_classes=False, nproc=4): """Evaluation CLEAR MOT metrics. Args: results (list[list[list[ndarray]]]): The first list indicates videos, The second list indicates images. The third list indicates categories. The ndarray indicates the tracking results. annotations (list[list[dict]]): The first list indicates videos, The second list indicates images. The third list indicates the annotations of each video. Keys of annotations are - `bboxes`: numpy array of shape (n, 4) - `labels`: numpy array of shape (n, ) - `instance_ids`: numpy array of shape (n, ) - `bboxes_ignore` (optional): numpy array of shape (k, 4) - `labels_ignore` (optional): numpy array of shape (k, ) logger (logging.Logger | str | None, optional): The way to print the evaluation results. Defaults to None. classes (list, optional): Classes in the dataset. Defaults to None. iou_thr (float, optional): IoU threshold for evaluation. Defaults to 0.5. ignore_iof_thr (float, optional): Iof threshold to ignore results. Defaults to 0.5. ignore_by_classes (bool, optional): Whether ignore the results by classes or not. Defaults to False. nproc (int, optional): Number of the processes. Defaults to 4. Returns: dict[str, float]: Evaluation results. """ print_log('---CLEAR MOT Evaluation---', logger) t = time.time() gts = annotations.copy() if classes is None: classes = [i + 1 for i in range(len(results[0]))] assert len(results) == len(gts) metrics = METRIC_MAPS.keys() print_log('Accumulating...', logger) pool = Pool(nproc) accs = pool.starmap( acc_single_video, zip(results, gts, [iou_thr for _ in range(len(gts))], [ignore_iof_thr for _ in range(len(gts))], [ignore_by_classes for _ in range(len(gts))])) names, accs, items = aggregate_accs(accs, classes) print_log('Evaluating...', logger) eval_results = pd.DataFrame(columns=metrics) summaries = pool.starmap(eval_single_class, zip(names, accs)) pool.close() # category and overall results for i, item in enumerate(items): eval_results.loc[item] = summaries[i] dtypes = {m: type(d) for m, d in zip(metrics, summaries[0])} # average results avg_results = [] for i, m in enumerate(metrics): v = np.array([s[i] for s in summaries[:len(classes)]]) v = np.nan_to_num(v, nan=0) if dtypes[m] == int: avg_results.append(int(v.sum())) elif dtypes[m] == float: avg_results.append(float(v.mean())) else: raise TypeError() eval_results.loc['AVERAGE'] = avg_results eval_results = eval_results.astype(dtypes) print_log('Rendering...', logger) strsummary = mm.io.render_summary( eval_results, formatters=mm.metrics.create().formatters, namemap=METRIC_MAPS) print_log('\n' + strsummary, logger) print_log(f'Evaluation finishes with {(time.time() - t):.2f} s.', logger) eval_results = eval_results.to_dict() out = {METRIC_MAPS[k]: v['OVERALL'] for k, v in eval_results.items()} for k, v in out.items(): out[k] = float(f'{(v):.3f}') if isinstance(v, float) else int(f'{v}') for m in ['OVERALL', 'AVERAGE']: out[f'track_{m}_copypaste'] = '' for k in METRIC_MAPS.keys(): v = eval_results[k][m] v = f'{(v):.3f} ' if isinstance(v, float) else f'{v} ' out[f'track_{m}_copypaste'] += v return out
def eval_map(det_results, annotations, scale_ranges=None, iou_thr=0.5, dataset=None, logger=None, nproc=4): """Evaluate mAP of a dataset. Args: det_results (list[list]): [[cls1_det, cls2_det, ...], ...]. The outer list indicates images, and the inner list indicates per-class detected bboxes. annotations (list[dict]): Ground truth annotations where each item of the list indicates an image. Keys of annotations are: - `bboxes`: numpy array of shape (n, 4) - `labels`: numpy array of shape (n, ) - `bboxes_ignore` (optional): numpy array of shape (k, 4) - `labels_ignore` (optional): numpy array of shape (k, ) scale_ranges (list[tuple] | None): Range of scales to be evaluated, in the format [(min1, max1), (min2, max2), ...]. A range of (32, 64) means the area range between (32**2, 64**2). Default: None. iou_thr (float): IoU threshold to be considered as matched. Default: 0.5. dataset (list[str] | str | None): Dataset name or dataset classes, there are minor differences in metrics for different datsets, e.g. "voc07", "imagenet_det", etc. Default: None. logger (logging.Logger | str | None): The way to print the mAP summary. See `mmdet.utils.print_log()` for details. Default: None. nproc (int): Processes used for computing TP and FP. Default: 4. Returns: tuple: (mAP, [dict, dict, ...]) """ assert len(det_results) == len(annotations) num_imgs = len(det_results) num_scales = len(scale_ranges) if scale_ranges is not None else 1 num_classes = len(det_results[0]) # positive class num area_ranges = ([(rg[0]**2, rg[1]**2) for rg in scale_ranges] if scale_ranges is not None else None) pool = Pool(nproc) eval_results = [] for i in range(num_classes): # get gt and det bboxes of this class cls_dets, cls_gts, cls_gts_ignore = get_cls_results( det_results, annotations, i) # choose proper function according to datasets to compute tp and fp if dataset in ['det', 'vid']: tpfp_func = tpfp_imagenet else: tpfp_func = tpfp_default # compute tp and fp for each image with multiple processes tpfp = pool.starmap( tpfp_func, zip(cls_dets, cls_gts, cls_gts_ignore, [iou_thr for _ in range(num_imgs)], [area_ranges for _ in range(num_imgs)])) tp, fp = tuple(zip(*tpfp)) # calculate gt number of each scale # ignored gts or gts beyond the specific scale are not counted num_gts = np.zeros(num_scales, dtype=int) for j, bbox in enumerate(cls_gts): if area_ranges is None: num_gts[0] += bbox.shape[0] else: gt_areas = (bbox[:, 2] - bbox[:, 0]) * (bbox[:, 3] - bbox[:, 1]) for k, (min_area, max_area) in enumerate(area_ranges): num_gts[k] += np.sum((gt_areas >= min_area) & (gt_areas < max_area)) # sort all det bboxes by score, also sort tp and fp cls_dets = np.vstack(cls_dets) num_dets = cls_dets.shape[0] sort_inds = np.argsort(-cls_dets[:, -1]) tp = np.hstack(tp)[:, sort_inds] fp = np.hstack(fp)[:, sort_inds] # calculate recall and precision with tp and fp tp = np.cumsum(tp, axis=1) fp = np.cumsum(fp, axis=1) eps = np.finfo(np.float32).eps recalls = tp / np.maximum(num_gts[:, np.newaxis], eps) precisions = tp / np.maximum((tp + fp), eps) # calculate AP if scale_ranges is None: recalls = recalls[0, :] precisions = precisions[0, :] num_gts = num_gts.item() mode = 'area' if dataset != 'voc07' else '11points' ap = average_precision(recalls, precisions, mode) eval_results.append({ 'num_gts': num_gts, 'num_dets': num_dets, 'recall': recalls, 'precision': precisions, 'ap': ap }) pool.close() if scale_ranges is not None: # shape (num_classes, num_scales) all_ap = np.vstack([cls_result['ap'] for cls_result in eval_results]) all_num_gts = 
np.vstack( [cls_result['num_gts'] for cls_result in eval_results]) mean_ap = [] for i in range(num_scales): if np.any(all_num_gts[:, i] > 0): mean_ap.append(all_ap[all_num_gts[:, i] > 0, i].mean()) else: mean_ap.append(0.0) else: aps = [] for cls_result in eval_results: if cls_result['num_gts'] > 0: aps.append(cls_result['ap']) mean_ap = np.array(aps).mean().item() if aps else 0.0 print_map_summary(mean_ap, eval_results, dataset, area_ranges, logger=logger) return mean_ap, eval_results
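eval_map fans each class's per-image TP/FP computation out with starmap and then stacks the results. A toy sketch of that per-image fan-out with an explicit chunksize; tp_fp here only counts lengths, whereas the real tpfp functions do IoU matching:

from multiprocessing import Pool

def tp_fp(dets, gts, iou_thr):
    # placeholder matcher: count overlaps by length instead of real IoU
    hits = min(len(dets), len(gts))
    return hits, len(dets) - hits

if __name__ == '__main__':
    per_image_dets = [[1, 2], [3], [4, 5, 6]]
    per_image_gts = [[1], [3], [4, 5]]
    n = len(per_image_dets)
    with Pool(2) as pool:
        # chunksize batches several small tasks into one worker dispatch
        tpfp = pool.starmap(tp_fp,
                            zip(per_image_dets, per_image_gts, [0.5] * n),
                            chunksize=2)
    tps, fps = zip(*tpfp)
    print(sum(tps), sum(fps))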
def validate(dataset, strategy, ranker, reader, params=None): if params is not None: global args args = params def dist_forward_ranker_step(input_ids): print("This function is tracing") def step_fn(input_ids): input_ids = tf.reshape(input_ids, [-1, args.max_sequence_length]) attention_mask = tf.cast(input_ids > 0, dtype=tf.int32) rank_logits = ranker(input_ids=input_ids, attention_mask=attention_mask, training=False) return tf.reshape(rank_logits, [args.batch_size, -1]) per_replica_logits = strategy.run(step_fn, args=(input_ids, )) return per_replica_logits def dist_forward_reader_step(input_ids): print("This function is tracing") def step_fn(input_ids): attention_mask = tf.cast(input_ids > 0, dtype=tf.int32) start_logits, end_logits = reader(input_ids=input_ids, attention_mask=attention_mask, training=False) return start_logits, end_logits per_replica_results = strategy.run(step_fn, args=(input_ids, )) return per_replica_results if not args.disable_tf_function: dist_forward_ranker_step = tf.function(dist_forward_ranker_step) dist_forward_reader_step = tf.function(dist_forward_reader_step) def value_fn_template(ctx, pool_tensors): return pool_tensors[ctx.replica_id_in_sync_group] processes = ProcessPool(processes=os.cpu_count()) tokenizer = get_tokenizer(model_name=args.pretrained_model, prefix=args.prefix) get_best_span_partial = partial(get_best_span, max_answer_length=args.max_answer_length, tokenizer=tokenizer) iterator = iter(dataset) em_hits = [] match_stats = [] for element in tqdm(iterator): answers_serialized = element['answers'] question = element['question'] input_ids = element[ 'passages/sequence_ids'] # bsz x num_passages x max_sequence_length passage_offsets = element[ 'passages/passage_offset'] # bsz x num_passages reduced_input_ids = tf.concat(input_ids.values, axis=0) per_replica_passage_offsets = strategy.experimental_local_results( passage_offsets) global_batch_size = reduced_input_ids.shape[0] if global_batch_size < args.batch_size * strategy.num_replicas_in_sync: # TODO: add code in case batch is not divisible aggregated_input_ids = tf.concat(input_ids.values, axis=0) padded_size = args.batch_size * strategy.num_replicas_in_sync - global_batch_size padded_input_ids = tf.zeros( [padded_size, args.max_passages, args.max_sequence_length], dtype=tf.int32) input_ids = tf.concat([aggregated_input_ids, padded_input_ids], axis=0) pool_input_ids = tf.split( input_ids, num_or_size_splits=strategy.num_replicas_in_sync, axis=0) value_fn_for_input_ids = partial(value_fn_template, pool_tensors=pool_input_ids) input_ids = strategy.experimental_distribute_values_from_function( value_fn_for_input_ids) aggregated_per_replica_passage_offsets = tf.concat( per_replica_passage_offsets, axis=0) lack_size = args.batch_size * strategy.num_replicas_in_sync - aggregated_per_replica_passage_offsets.shape[ 0] padded_per_replica_passage_offsets = tf.zeros( [lack_size, args.max_passages], dtype=tf.int32) per_replica_passage_offsets = tf.concat([ aggregated_per_replica_passage_offsets, padded_per_replica_passage_offsets ], axis=0) per_replica_passage_offsets = tf.split( per_replica_passage_offsets, num_or_size_splits=strategy.num_replicas_in_sync) rank_logits = dist_forward_ranker_step(input_ids) rank_logits = strategy.experimental_local_results(rank_logits) selected_passage_idxs = [ tf.cast(tf.argmax(logits, axis=-1), dtype=tf.int32) for logits in rank_logits ] # num_replicas x batch_size selected_passage_offsets = [] per_replica_input_ids = strategy.experimental_local_results( input_ids ) # 
num_replicas x batch_sizse x max_passages x max_sequence_length selected_input_ids = [] for sequence_ids, psg_offsets, passage_idxs in zip( per_replica_input_ids, per_replica_passage_offsets, selected_passage_idxs): range_idxs = tf.range(sequence_ids.shape[0], dtype=tf.int32) indices = tf.concat([ tf.expand_dims(range_idxs, axis=1), tf.expand_dims(passage_idxs, axis=1) ], axis=1) selected_passage_offsets.append(tf.gather_nd(psg_offsets, indices)) selected_input_ids.append(tf.gather_nd(sequence_ids, indices)) value_fn = partial(value_fn_template, pool_tensors=selected_input_ids) dist_selected_input_ids = strategy.experimental_distribute_values_from_function( value_fn) start_logits, end_logits = dist_forward_reader_step( input_ids=dist_selected_input_ids) sentence_ids = tf.concat(dist_selected_input_ids.values, axis=0) sentence_ids = tf.RaggedTensor.from_tensor( sentence_ids, padding=tokenizer.pad_token_id) sentence_ids = sentence_ids.to_list() sentence_ids = sentence_ids[:global_batch_size] selected_passage_offsets = tf.concat(selected_passage_offsets, axis=0) selected_passage_offsets = selected_passage_offsets[:global_batch_size] ctx_ids = [ ids[offset:] for ids, offset in zip(sentence_ids, selected_passage_offsets) ] start_logits = tf.concat(start_logits.values, axis=0) start_logits = start_logits.numpy().tolist() start_logits = start_logits[:global_batch_size] start_logits = [ logits[offset:offset + len(ctx)] for logits, offset, ctx in zip( start_logits, selected_passage_offsets, ctx_ids) ] end_logits = tf.concat(end_logits.values, axis=0) end_logits = end_logits.numpy().tolist() end_logits = end_logits[:global_batch_size] end_logits = [ logits[offset:offset + len(ctx)] for logits, offset, ctx in zip( end_logits, selected_passage_offsets, ctx_ids) ] best_spans = processes.starmap(get_best_span_partial, zip(start_logits, end_logits, ctx_ids)) answers_serialized = tf.concat(answers_serialized.values, axis=0) question = tf.concat(question.values, axis=0) answers = [] for ans in answers_serialized: ans_sparse = tf.io.parse_tensor(ans, out_type=tf.string) ans_values = tf.io.parse_tensor(ans_sparse[1], out_type=tf.string) ans_values = [answer.numpy().decode() for answer in ans_values] answers.append(ans_values) question = question.numpy().tolist() question = [q.decode() for q in question] hits = processes.starmap(compare_spans, zip(answers, best_spans)) passages = [tokenizer.decode(ids) for ids in ctx_ids] selected_passage_idxs = tf.concat(selected_passage_idxs, axis=0) selected_passage_idxs = selected_passage_idxs.numpy().tolist() stats = [{ "question": q, "answers": ans, "passage": psg, "predicted": span, "retriever_rank": idx + 1, "hit": hit } for q, ans, span, idx, psg, hit in zip( question, answers, best_spans, selected_passage_idxs, passages, hits)] match_stats.extend(stats) em_hits.extend(hits) print("done") print("-----------------------------------------------------------") return em_hits, match_stats
def main(gt_folder: Path, prediction_folder: Path, original_images: str, output_path: Path,
         no_visualization: bool, eval_tool: Path, processes: int):
    # Select the number of worker processes
    if processes == 0:
        pool = Pool(processes=cpu_count())
    else:
        pool = Pool(processes=processes)

    # Get the paths for all gt files
    gt_files_path = get_file_list(gt_folder, EXTENSIONS_PATTERNS)
    # Get the paths for all prediction images
    prediction_files_path = get_file_list(prediction_folder, EXTENSIONS_PATTERNS)

    # Check if we have the same amount of gt and prediction files
    assert len(gt_files_path) == len(prediction_files_path), \
        "Amount of gt files and prediction files differ."

    # Get the paths of original images if set
    if original_images is not None:
        original_images = Path(original_images)
        original_files_path = get_file_list(Path(original_images), EXTENSIONS_PATTERNS)
    else:
        original_files_path = [None] * len(gt_files_path)

    # Timer
    tic = time.time()

    # Debugging purposes only!
    # input_images = [input_images[1]]
    # gt_xml = [gt_xml[1]]
    # gt_pxl = [gt_pxl[1]]
    # input_xml = input_xml[0:3]
    # gt_xml = gt_xml[0:3]
    # gt_pxl = gt_pxl[0:3]

    # For each file run
    results = list(
        pool.starmap(
            evaluate,
            zip(prediction_files_path, gt_files_path, itertools.repeat(output_path),
                itertools.repeat(eval_tool), original_files_path,
                itertools.repeat(no_visualization))))
    pool.close()
    print("Pool closed")

    scores = []
    errors = []
    for item in results:
        if item[0] is not None:
            scores.append(item[0])
        else:
            errors.append(item)

    if scores:
        score = np.mean(scores)
    else:
        score = -1

    # np.save(os.path.join(output_path, 'results.npy'), results)
    write_stats(results, errors, score)
    print('Total time taken: {:.2f}, avg_miou={}, nb_errors={}'.format(
        time.time() - tic, score, len(errors)))
    return score
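A condensed sketch of the same evaluate fan-out: size the pool from cpu_count, repeat the constant arguments, and filter out failed items afterwards; evaluate_pair and the file names are placeholders:

import itertools
from multiprocessing import Pool, cpu_count

def evaluate_pair(pred, gt, out_dir):
    # placeholder scorer: 1.0 when file stems match, None to mark a failed item
    return 1.0 if pred.lstrip('p') == gt.lstrip('g') else None

if __name__ == '__main__':
    preds = ['p1.png', 'p22.png', 'p3.png']
    gts = ['g1.png', 'g2.png', 'g33.png']
    with Pool(processes=cpu_count()) as pool:
        results = pool.starmap(evaluate_pair,
                               zip(preds, gts, itertools.repeat('out/')))
    scores = [s for s in results if s is not None]
    print(sum(scores) / len(scores) if scores else -1)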