def main():
    logger.info("Starting on %s with %d CPU's.",
                socket.gethostname(),
                multiprocessing.cpu_count())
    args = parse_args()
    if args.link_run is not None:
        run_json = link_json(args.link_run, args.data_path)
        run_json.has_runinfo = True
    else:
        json_path = os.path.join(args.data_path, 'input', 'AppSession.json')
        try:
            with open(json_path, 'r') as json_file:
                run_json = parse_json(json_file)
        except:
            if os.path.exists(json_path):
                # copy the input file to the output dir for postmortem analysis
                logger.error("Error occurred while parsing '%s'" % json_path)
                with open(json_path, 'r') as json_file:
                    file_cont = json_file.read()
                out_path = os.path.join(args.data_path, 'logs', 'AppSession.json')
                with open(out_path, 'w') as json_file:
                    json_file.write(file_cont)
            else:
                logger.error("Error: no such file as '%s'" % json_path)
            raise

        # Do we have run_ids for all sample_ids ?
        if run_json.run_id is None:
            run_json.has_runinfo = False
        else:
            bs = BSrequest()
            sample_id_set = bs.check_run_sample_ids(
                [run_json.run_id], [s["Id"] for s in run_json.samples])
            run_json.has_runinfo = (len(sample_id_set) == len(run_json.samples))
        logger.info("setting json.has_run_info to %s" % run_json.has_runinfo)

    pssm = Pssm()
    scratch_path = os.path.join(args.data_path, 'scratch')
    makedirs(scratch_path)
    for filename in os.listdir(scratch_path):
        filepath = os.path.join(scratch_path, filename)
        if os.path.isdir(filepath):
            shutil.rmtree(filepath)
        else:
            os.remove(filepath)

    args.g2p_path = args.qc_path = create_app_result(args.data_path,
                                                     run_json,
                                                     suffix='results')

    if run_json.run_id is None:
        run_summary = None
    else:
        logger.info('Summarizing run.')
        run_summary = summarize_run(args, run_json)

    pool = Pool()
    pool.map(
        functools.partial(try_sample, run_info=run_json, args=args, pssm=pssm),
        range(len(run_json.samples)))

    pool.close()
    pool.join()

    collate_samples(args, run_json)
    if run_json.run_id is not None:
        summarize_samples(args, run_json, run_summary)
    logger.info('Done.')
def quality_over_theta():
    number_dataset = 1
    data, target, enable_i = datasets[number_dataset]

    pool = Pool(processes=5)

    # if we want to average
    nb_launched = 5

    theta = 0.1
    data_final = {'WRAcc': [], 'theta': [], 'Algorithm': []}

    for i in range(10):
        print('Iteration: {}'.format(i))
        for i in range(nb_launched):
            results_misere = pool.apply_async(misere, (data, target), {
                'time_budget': TIME_BUDGET_XP,
                'theta': theta
            })
            results_beam = pool.apply_async(
                beam_search, (data, target), {
                    'enable_i': enable_i,
                    'time_budget': TIME_BUDGET_XP,
                    'theta': theta
                })
            result_ucb_opti = pool.apply_async(
                seq_scout, (data, target), {
                    'enable_i': enable_i,
                    'time_budget': TIME_BUDGET_XP,
                    'theta': theta
                })

            results_misere = results_misere.get()
            results_beam = results_beam.get()
            result_ucb_opti = result_ucb_opti.get()

            if len(results_beam) < TOP_K:
                print("Too few beam: {}".format(len(results_beam)))
            if len(result_ucb_opti) < TOP_K:
                print("Too few seqscout: {}".format(len(result_ucb_opti)))
            if len(results_misere) < TOP_K:
                print("Too few misere: {}".format(len(results_misere)))

            data_add_generic(data_final,
                             WRAcc=max(0, average_results(results_misere)),
                             theta=theta,
                             Algorithm='misere')
            data_add_generic(data_final,
                             WRAcc=max(0, average_results(results_beam)),
                             theta=theta,
                             Algorithm='beam')
            data_add_generic(data_final,
                             WRAcc=max(0, average_results(result_ucb_opti)),
                             theta=theta,
                             Algorithm='seqscout')

        theta += 0.1

    df = pd.DataFrame(data=data_final)
    sns.set(rc={'figure.figsize': (8, 6.5)})
    plt.clf()
    ax = sns.lineplot(data=df, x='theta', y='WRAcc', hue='Algorithm')
    plt.savefig('./theta/over_theta.png')
    df.to_pickle('./theta/result')

    if SHOW:
        plt.show()
def boxplot_dataset_iterations():
    pool = Pool(processes=5)
    xp_repeat = 5

    data_final = {'WRAcc': [], 'dataset': [], 'Algorithm': []}

    for i, (data, target, enable_i) in enumerate(datasets):
        print("Dataset {}".format(datasets_names[i]))
        for j in range(xp_repeat):
            results_misere = pool.apply_async(misere, (data, target),
                                              {'time_budget': TIME_BUDGET_XP})
            results_beam = pool.apply_async(beam_search, (data, target), {
                'enable_i': enable_i,
                'time_budget': TIME_BUDGET_XP
            })
            result_ucb_opti = pool.apply_async(seq_scout, (data, target), {
                'enable_i': enable_i,
                'time_budget': TIME_BUDGET_XP
            })

            results_misere = results_misere.get()
            results_beam = results_beam.get()
            result_ucb_opti = result_ucb_opti.get()

            if len(results_misere) < TOP_K:
                print("Too few example on misere on dataset {}: {} results".format(
                    datasets_names[i], len(results_misere)))
            if len(results_beam) < TOP_K:
                print("Too few example on beam_search on dataset {}: {} results".format(
                    datasets_names[i], len(results_beam)))
            if len(result_ucb_opti) < TOP_K:
                print("Too few example on seqscout on dataset {}: {} results".format(
                    datasets_names[i], len(result_ucb_opti)))

            data_add_generic(data_final,
                             WRAcc=max(0, average_results(results_misere)),
                             dataset=datasets_names[i],
                             Algorithm='misere')
            data_add_generic(data_final,
                             WRAcc=max(0, average_results(results_beam)),
                             dataset=datasets_names[i],
                             Algorithm='beam')
            data_add_generic(data_final,
                             WRAcc=max(0, average_results(result_ucb_opti)),
                             dataset=datasets_names[i],
                             Algorithm='seqscout')

    df = pd.DataFrame(data=data_final)
    sns.set(rc={'figure.figsize': (8, 6.5)})
    plt.clf()
    ax = sns.barplot(x='dataset', y='WRAcc', hue='Algorithm', data=df)
    plt.savefig('./wracc_datasets/iterations_boxplot.png')
    df.to_pickle('./wracc_datasets/result')

    if SHOW:
        plt.show()
def _read_obs(self, stns_ids=None):
    # Saw extreme decreased performance due to garbage collection when
    # pandas ran checks for a chained assignment. Turn off this check
    # temporarily.
    opt_val = pd.get_option('mode.chained_assignment')
    pd.set_option('mode.chained_assignment', None)

    try:
        if stns_ids is None:
            stns_obs = self.stns
        else:
            stns_obs = self.stns.loc[stns_ids]

        nstns = len(stns_obs.station_id)
        nprocs = self.nprocs if nstns >= self.nprocs else nstns

        if self.has_start_end_dates:
            start_end = (self.start_date, self.end_date)
        else:
            start_end = None

        if nprocs > 1:
            # http://stackoverflow.com/questions/24171725/
            # scikit-learn-multicore-attributeerror-stdin-instance-
            # has-no-attribute-close
            if not hasattr(sys.stdin, 'close'):
                def dummy_close():
                    pass
                sys.stdin.close = dummy_close

            iter_stns = [(None, a_id, self.elems, start_end)
                         for a_id in stns_obs.station_id]

            pool = Pool(processes=nprocs)
            obs = pool.map(_parse_ghcnd_dly_star_remote, iter_stns)

            pool.close()
            pool.join()
        else:
            obs = []
            for a_id in stns_obs.station_id:
                abuf = open_remote_file('https://www1.ncdc.noaa.gov/'
                                        'pub/data/ghcn/daily/all/%s.dly' % a_id)
                obs_stn = _parse_ghcnd_dly(abuf, a_id, self.elems, start_end)
                obs.append(obs_stn)

        df_obs = pd.concat(obs, ignore_index=True)

    finally:
        pd.set_option('mode.chained_assignment', opt_val)

    df_obs = df_obs.set_index(['station_id', 'elem', 'time'])
    df_obs = df_obs.sortlevel(0, sort_remaining=True)

    return df_obs
def show_quality_over_iterations_ucb(number_dataset):
    data, target, enable_i = datasets[number_dataset]

    # if we want to average
    nb_launched = 5

    pool = Pool(processes=3)

    iterations_limit = 50
    iterations_step = 1000
    data_final = {'WRAcc': [], 'iterations': [], 'Algorithm': []}

    for i in range(12):
        print('Iteration: {}'.format(i))
        for i in range(nb_launched):
            results_misere = pool.apply_async(
                misere, (data, target), {
                    'time_budget': TIME_BUDGET_XP,
                    'iterations_limit': iterations_limit
                })
            results_beam = pool.apply_async(
                beam_search, (data, target), {
                    'enable_i': enable_i,
                    'time_budget': TIME_BUDGET_XP,
                    'iterations_limit': iterations_limit
                })
            result_ucb_opti = pool.apply_async(
                seq_scout, (data, target), {
                    'enable_i': enable_i,
                    'time_budget': TIME_BUDGET_XP,
                    'iterations_limit': iterations_limit
                })

            data_add_generic(data_final,
                             WRAcc=max(0, average_results(results_misere.get())),
                             iterations=iterations_limit,
                             Algorithm='misere')
            data_add_generic(data_final,
                             WRAcc=max(0, average_results(results_beam.get())),
                             iterations=iterations_limit,
                             Algorithm='beam')
            data_add_generic(data_final,
                             WRAcc=max(0, average_results(result_ucb_opti.get())),
                             iterations=iterations_limit,
                             Algorithm='seqscout')

        iterations_limit += iterations_step

    df = pd.DataFrame(data=data_final)
    sns.set(rc={'figure.figsize': (8, 6.5)})
    plt.clf()
    ax = sns.lineplot(data=df,
                      x='iterations',
                      y='WRAcc',
                      hue='Algorithm',
                      markers=True)
    plt.savefig('./iterations_ucb/over_iterations{}.png'.format(
        datasets_names[number_dataset]))
    df.to_pickle('./iterations_ucb/result{}'.format(
        datasets_names[number_dataset]))

    if SHOW:
        plt.show()
def get_data_for_linker( catalog: str, entity: str, qids: Set[str], url_pids: Set[str], ext_id_pids_to_urls: Dict, qids_and_tids: Dict, fileout: TextIO, ) -> None: """Collect relevant data for linking Wikidata to a given catalog. Dump the result to a given output stream. This function uses multithreaded parallel processing. :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``. A supported catalog :param entity: ``{'actor', 'band', 'director', 'musician', 'producer', 'writer', 'audiovisual_work', 'musical_work'}``. A supported entity :param qids: a set of QIDs :param url_pids: a set of PIDs holding URL values. Returned by :py:func:`soweego.wikidata.sparql_queries.url_pids` :param ext_id_pids_to_urls: a ``{PID: {formatter_URL: (id_regex, url_regex,)} }`` dict. Returned by :py:func:`soweego.wikidata.sparql_queries.external_id_pids_and_urls` :param fileout: a file stream open for writing :param qids_and_tids: a ``{QID: {'tid': {catalog_ID_set} }`` dict. Populated by :py:func:`soweego.commons.data_gathering.gather_target_ids` """ qid_buckets, request_params = _prepare_request( qids, 'labels|aliases|descriptions|sitelinks|claims') # Catalog-specific data needs if catalog in constants.REQUIRE_OCCUPATION.keys(): needs_occupation = entity in constants.REQUIRE_OCCUPATION[catalog] else: needs_occupation = False needs_genre = entity in constants.REQUIRE_GENRE needs_publication_date = entity in constants.REQUIRE_PUBLICATION_DATE # Initialize 7 counters to 0 # Indices legend: # 0 = claims # 1 = labels # 2 = aliases # 3 = descriptions # 4 = sitelinks # 5 = third-party URLs # 6 = third-party IDs counters = [0] * 7 # Create a partial function where all parameters # but the data bucket are passed to `_process_bucket`, # so that we only pass the data bucket # when we call `pool_function`. # In this way, it becomes trivial to use # `multiprocessing.Pool` map functions, like `imap_unordered` pool_function = partial( _process_bucket, request_params=request_params, url_pids=url_pids, ext_id_pids_to_urls=ext_id_pids_to_urls, qids_and_tids=qids_and_tids, needs=(needs_occupation, needs_genre, needs_publication_date), counters=counters, ) # Create a pool of threads and map the list of buckets via `pool_function` with Pool() as pool: # `processed_bucket` will be a list of dicts, where each dict # is a processed entity from the bucket for processed_bucket in pool.imap_unordered( pool_function, tqdm(qid_buckets, total=len(qid_buckets))): # Join results into a string so that we can write them to # the dump file to_write = ''.join( json.dumps(result, ensure_ascii=False) + '\n' for result in processed_bucket) fileout.write(to_write) fileout.flush() LOGGER.info( 'QIDs: got %d with no expected claims, %d with no labels, ' '%d with no aliases, %d with no descriptions, %d with no sitelinks, ' '%d with no third-party links, %d with no external ID links', *counters)
def _build_tobs_hdfs(path_out, fpaths_yrly, elems, nprocs=1): fpaths_yrly = np.array(fpaths_yrly) nprocs = nprocs if fpaths_yrly.size >= nprocs else fpaths_yrly.size stn_nums = pd.DataFrame([(np.nan, np.nan)], columns=['station_id', 'station_num']) num_inc = 0 first_append = {elem:True for elem in elems} # assume ~1.5 millions rows per year to estimate expected number of rows erows = 1500000 * len(fpaths_yrly) def write_data(df_tobs, num_inc, stn_nums): hdfs = {elem:pd.HDFStore(os.path.join(path_out, '%s.hdf' % elem), 'a') for elem in elems} df_tobs.set_index('station_id', inplace=True) df_tobs['obs_value'] = df_tobs.obs_value.astype(np.int16) uids = pd.DataFrame(df_tobs.index.unique(), columns=['station_id']) uids = uids.merge(stn_nums, how='left', on='station_id') mask_nonum = uids.station_num.isnull() if mask_nonum.any(): nums = np.arange(num_inc, (num_inc + mask_nonum.sum())) uids.loc[mask_nonum, 'station_num'] = nums num_inc = nums[-1] + 1 stn_nums = pd.concat([stn_nums, uids[mask_nonum]], ignore_index=True) uids.set_index('station_id', inplace=True) uids['station_num'] = uids.station_num.astype(np.int) df_tobs = df_tobs.join(uids, how='left').set_index('station_num') grped = df_tobs.groupby('elem') for elem in elems: try: grp = grped.get_group(elem)[['time', 'obs_value']].copy() except KeyError: # no observation for element continue if first_append[elem]: hdfs[elem].append('df_tobs', grp, data_columns=['time'], expectedrows=erows, index=False) first_append[elem] = False else: hdfs[elem].append('df_tobs', grp, data_columns=['time'], index=False) for store in hdfs.values(): store.close() return num_inc, stn_nums # Initialize output hdfs hdfs = [pd.HDFStore(os.path.join(path_out, '%s.hdf' % elem), 'w') for elem in elems] for store in hdfs: store.close() if nprocs > 1: # http://stackoverflow.com/questions/24171725/ # scikit-learn-multicore-attributeerror-stdin-instance- # has-no-attribute-close if not hasattr(sys.stdin, 'close'): def dummy_close(): pass sys.stdin.close = dummy_close for i in np.arange(fpaths_yrly.size, step=nprocs): fpaths = fpaths_yrly[i:(i + nprocs)] gc.collect() pool = Pool(processes=nprocs) iter_files = [(fpath, elems) for fpath in fpaths] ls_tobs = pool.map(_parse_ghcnd_yrly_star, iter_files, chunksize=1) pool.close() pool.join() for df_tobs in ls_tobs: num_inc, stn_nums = write_data(df_tobs, num_inc, stn_nums) del df_tobs del ls_tobs else: for fpath in fpaths_yrly: df_tobs = _parse_ghcnd_yrly(fpath, elems) num_inc, stn_nums = write_data(df_tobs, num_inc, stn_nums) stn_nums = stn_nums.dropna() store_stnnums = pd.HDFStore(os.path.join(path_out, 'stn_nums.hdf'), 'w') store_stnnums.put('df_stnnums', stn_nums) store_stnnums.close() # Create indexess for elem in elems: with pd.HDFStore(os.path.join(path_out, '%s.hdf' % elem)) as store: store.create_table_index('df_tobs', optlevel=9, kind='full')
    response = client.rpc_add(1, 2)
    return response.data.result


def long_time_task():
    for ii in range(1000):
        # response = client.rpc_add(1, 3)
        client = get_client2()
        res = main2(client)
        # client = get_client()
        # res = main(client)
        print(ii)


if __name__ == '__main__':
    t = time.time()
    from multiprocessing.pool import Pool

    p = Pool()
    for i in range(4):
        p.apply_async(long_time_task, args=())
    p.close()
    p.join()
    print(time.time() - t)
    time.sleep(2)
    tracer.close()
def predict(self, prediction_object): threshold = self.prediction_threshold predictions = list() if isinstance(prediction_object, Commit): # Predict open_issues = [ i for i in self.repository_obj.issues if # (len(i.states) == 0 or i.states[-1].to_ == IssueStates.open) # or (min([ abs(entity.timestamp - prediction_object.timestamp ) if hasattr(entity, 'timestamp') and entity.timestamp else timedelta(days=self.net_size_in_days, seconds=1) for entity in [i.original_post] + i.states + i.actions ]) <= timedelta(days=self.net_size_in_days)) ] open_issues += [null_issue] prediction_data = list() if len(open_issues) > 128: with Pool(processes=os.cpu_count() - 1) as wp: for point in wp.map(func=Issue_Closure( prediction_object, self.feature_generator), iterable=open_issues, chunksize=128): prediction_data.append(point) else: for issue in open_issues: prediction_data.append( self.feature_generator.generate_features_commit( issue, prediction_object, False)) for point in prediction_data: probabilities = self.clf.predict_proba( np.array( tuple([ v for k, v in point.items() if k not in ['linked', 'issue', 'commit'] ])).reshape(1, -1)) if point['issue'] == 'null_issue': threshold = max(threshold, probabilities[0][1]) else: prediction = (point['issue'], float(probabilities[0][1])) predictions.append(prediction) predictions = sorted([p for p in predictions if p[1] >= threshold], key=lambda p: (p[1], p[0]), reverse=True) response = prediction_object.c_hash, predictions elif isinstance(prediction_object, Issue): # Predict candidates = [ c for c in self.repository_obj.commits if (min([ abs(entity.timestamp - c.timestamp ) if hasattr(entity, 'timestamp') and entity.timestamp else timedelta(days=self.net_size_in_days, seconds=1) for entity in [prediction_object.original_post] + prediction_object.states + prediction_object.actions ]) <= timedelta(days=self.net_size_in_days)) ] candidates += [null_commit] prediction_data = list() if len(candidates) > 128: with Pool(processes=os.cpu_count() - 1) as wp: for point in wp.map(func=Commit_Closure( prediction_object, self.feature_generator), iterable=candidates, chunksize=128): prediction_data.append(point) else: for commit in candidates: prediction_data.append( self.feature_generator.generate_features_commit( prediction_object, commit, False)) for point in prediction_data: probabilities = self.clf.predict_proba( np.array( tuple([ v for k, v in point.items() if k not in ['linked', 'issue', 'commit'] ])).reshape(1, -1)) if point['commit'] == 'null_commit': threshold = max(threshold, probabilities[0][1]) else: prediction = (point['commit'], float(probabilities[0][1])) predictions.append(prediction) predictions = sorted([p for p in predictions if p[1] >= threshold], key=lambda p: (p[1], p[0]), reverse=True) response = prediction_object.id_, predictions if self.use_sim_cs or self.use_sim_j or self.use_sim_d or self.use_file: if self.predictions_from_last_tf_idf_update < self.predictions_between_updates: self.predictions_from_last_tf_idf_update += 1 else: self.predictions_from_last_tf_idf_update = 0 temporal_config = None self.model, self.dictionary, new_cache = generate_tfidf_commit( self.repository_obj, self.stopwords, self.min_tok_len, cache=self.feature_generator.text_cache) similarity_config = { 'dict': self.dictionary, 'model': self.model, 'min_len': self.min_tok_len, 'stopwords': self.stopwords, } if self.use_temporal: self.fingerprint = None temporal_config = { 'fingerprint': self.fingerprint, 'net_size_in_days': self.net_size_in_days, } self.feature_generator = 
FeatureGenerator( use_file=self.use_file, use_sim_cs=self.use_sim_cs, use_sim_j=self.use_sim_j, use_sim_d=self.use_sim_d, use_social=self.use_social, use_temporal=self.use_temporal, use_pr_only=self.use_pr_only, use_issue_only=self.use_issue_only, similarity_config=similarity_config, temporal_config=temporal_config, text_cache=new_cache, selected=self.features, ) return response
def execute(self): yesterday = (datetime.datetime.now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d 00:00:00") today = datetime.datetime.now().strftime("%Y-%m-%d 00:00:00") tomorrow = (datetime.datetime.now() + datetime.timedelta(days=1)).strftime("%Y-%m-%d 00:00:00") # self.etl() sql = ( "select * from {} where SBMT_TMSTMP >= to_timestamp('{}', 'yyyy-mm-dd hh24:mi:ss') " "and SBMT_TMSTMP < to_timestamp('{}', 'yyyy-mm-dd hh24:mi:ss') ") # KC21 empi batch_rows = self.batch_rows(sql.format(KC21, yesterday, today)) while True: try: batch = next(batch_rows) m = Manager() share_ls = m.list() share_ls.extend(batch) del batch task = Pool(self.__process_count) for row in share_ls: task.apply_async(self.empi, (row, "KC21")) task.close() task.join() except StopIteration: break batch_rows = self.batch_rows(sql.format(N041, yesterday, today)) while True: try: batch = next(batch_rows) m = Manager() share_ls = m.list() share_ls.extend(batch) del batch task = Pool(self.__process_count) for row in share_ls: task.apply_async(self.process, (row, "N041")) task.close() task.join() except StopIteration: break # pairing check # N041 batch_rows = self.batch_rows(sql.format(N041, yesterday, today)) while True: try: batch = next(batch_rows) m = Manager() share_ls = m.list() share_ls.extend(batch) del batch task = Pool(self.__process_count) for row in share_ls: task.apply_async(self.pairing_check, (row, "N041")) task.close() task.join() except StopIteration: break # KC21 batch_rows = self.batch_rows(sql.format(KC21, yesterday, today)) while True: try: batch = next(batch_rows) m = Manager() share_ls = m.list() share_ls.extend(batch) del batch task = Pool(self.__process_count) for row in share_ls: task.apply_async(self.pairing_check, (row, "KC21")) task.close() task.join() except StopIteration: break # KC24 batch_rows = self.batch_rows(sql.format(KC24, yesterday, today)) while True: try: batch = next(batch_rows) m = Manager() share_ls = m.list() share_ls.extend(batch) del batch task = Pool(self.__process_count) for row in share_ls: task.apply_async(self.pairing_check, (row, "KC24")) task.close() task.join() except StopIteration: break sql = ( "select * from {} where etl_date >= to_date('{}', 'yyyy-mm-dd hh24:mi:ss') " "and etl_date < to_date('{}', 'yyyy-mm-dd hh24:mi:ss')").format( ETL, today, tomorrow) row = self.row(sql) row["RUN"] = "True" self.insert_or_update(ETL, row, key="RUN")
def getting_start(self): sql = ("select * from {}") # KC21 EMPI print("Start processing...") batch_rows = self.batch_rows(sql.format(KC21)) while True: try: batch = next(batch_rows) m = Manager() share_ls = m.list() share_ls.extend(batch) del batch task = Pool(self.__process_count) for row in share_ls: task.apply_async(self.empi, (row, "KC21")) task.close() task.join() except StopIteration: break batch_rows = self.batch_rows(sql.format(N041)) while True: try: batch = next(batch_rows) m = Manager() share_ls = m.list() share_ls.extend(batch) del batch task = Pool(self.__process_count) for row in share_ls: task.apply_async(self.process, (row, "N041")) task.close() task.join() except StopIteration: break # pairing check # N041 batch_rows = self.batch_rows(sql.format(N041)) while True: try: batch = next(batch_rows) m = Manager() share_ls = m.list() share_ls.extend(batch) del batch task = Pool(self.__process_count) for row in share_ls: task.apply_async(self.pairing_check, (row, "N041")) task.close() task.join() except StopIteration: break # KC21 batch_rows = self.batch_rows(sql.format(KC21)) while True: try: batch = next(batch_rows) m = Manager() share_ls = m.list() share_ls.extend(batch) del batch task = Pool(self.__process_count) for row in share_ls: task.apply_async(self.pairing_check, (row, "KC21")) task.close() task.join() except StopIteration: break # KC24 batch_rows = self.batch_rows(sql.format(KC24)) while True: try: batch = next(batch_rows) m = Manager() share_ls = m.list() share_ls.extend(batch) del batch task = Pool(self.__process_count) for row in share_ls: task.apply_async(self.pairing_check, (row, "KC24")) task.close() task.join() except StopIteration: break
    meanexp = getMean(prediction_test)
    meanofdiff = getMeanofDiffs(ground_test, prediction_test)
    pvarfe = getPvar(ground_test, meanfe)
    pvarexp = getPvar(prediction_test, meanexp)
    ccc_test = getCCC(pvarfe, pvarexp, meanofdiff, meanfe, meanexp)

    print(f"CV = {i}, Test >>> gamma = {best_gamma}, C = {best_c}, RMSE. ={rmse_test}, Spearman = {spearman_test}, CCC = {ccc_test}")
    logger.info(
        f"CV = {i}, Test >>> gamma = {best_gamma}, C = {best_c}, RMSE. ={rmse_test}, Spearman = {spearman_test}, CCC = {ccc_test}")

    # Save
    df = pd.DataFrame(data={"vggish_prediction_D": prediction_test,
                            "vggish_groundtruth_D": test_y.values.tolist()})
    df.to_csv(f"./Prediction_202106_Ratio631/CV{i}_vggish_Dominance_0621.csv")
    print("save success!")

    print(f">>>>>>> CV = {i}/10, Over Training >>>>>>>\n")
    logger.info(f">>>>>>> CV = {i}/10, Over Training >>>>>>>")
    return [rmse_test, spearman_values_test, ccc_test]


if __name__ == '__main__':
    pool = Pool(int(os.getenv('N_PROC', os.cpu_count())))
    futures = [pool.apply_async(func=svr, args=[i]) for i in range(1, 11)]
    pool.close()  # close the pool so it stops accepting new tasks (from the main process)

    average_rmse_test, average_pearson_test, average_ccc_test = [], [], []
    for item in futures:
        result = item.get()
        average_rmse_test.append(result[0])
        average_pearson_test.append(result[1])
        average_ccc_test.append(result[2])

    print(f"Vggish Regression Average Results of Dominance: RMSE.= {mean(average_rmse_test)}, Spearman = {mean(average_pearson_test)}, CCC = {mean(average_ccc_test)}")
    logger.info(
        f"\n\n\n Vggish Regression Average Results of Dominance: RMSE.= {mean(average_rmse_test)}, Spearman = {mean(average_pearson_test)}, CCC = {mean(average_ccc_test)}")
    pool.join()
def validate(self, do_mirroring: bool = True, use_sliding_window: bool = True, step_size: float = 0.5, save_softmax: bool = True, use_gaussian: bool = True, overwrite: bool = True, validation_folder_name: str = 'validation_raw', debug: bool = False, all_in_gpu: bool = False, segmentation_export_kwargs: dict = None): current_mode = self.network.training self.network.eval() assert self.was_initialized, "must initialize, ideally with checkpoint (or train first)" if self.dataset_val is None: self.load_dataset() self.do_split() if segmentation_export_kwargs is None: if 'segmentation_export_params' in self.plans.keys(): force_separate_z = self.plans['segmentation_export_params'][ 'force_separate_z'] interpolation_order = self.plans['segmentation_export_params'][ 'interpolation_order'] interpolation_order_z = self.plans[ 'segmentation_export_params']['interpolation_order_z'] else: force_separate_z = None interpolation_order = 1 interpolation_order_z = 0 else: force_separate_z = segmentation_export_kwargs['force_separate_z'] interpolation_order = segmentation_export_kwargs[ 'interpolation_order'] interpolation_order_z = segmentation_export_kwargs[ 'interpolation_order_z'] output_folder = join(self.output_folder, validation_folder_name) maybe_mkdir_p(output_folder) if do_mirroring: mirror_axes = self.data_aug_params['mirror_axes'] else: mirror_axes = () pred_gt_tuples = [] export_pool = Pool(2) results = [] transpose_backward = self.plans.get('transpose_backward') for k in self.dataset_val.keys(): properties = load_pickle(self.dataset[k]['properties_file']) data = np.load(self.dataset[k]['data_file'])['data'] # concat segmentation of previous step seg_from_prev_stage = np.load( join(self.folder_with_segs_from_prev_stage, k + "_segFromPrevStage.npz"))['data'][None] print(data.shape) data[-1][data[-1] == -1] = 0 data_for_net = np.concatenate( (data[:-1], to_one_hot(seg_from_prev_stage[0], range(1, self.num_classes)))) softmax_pred = self.predict_preprocessed_data_return_seg_and_softmax( data_for_net, do_mirroring=do_mirroring, mirror_axes=mirror_axes, use_sliding_window=use_sliding_window, step_size=step_size, use_gaussian=use_gaussian, all_in_gpu=all_in_gpu, mixed_precision=self.fp16)[1] if transpose_backward is not None: transpose_backward = self.plans.get('transpose_backward') softmax_pred = softmax_pred.transpose( [0] + [i + 1 for i in transpose_backward]) fname = properties['list_of_data_files'][0].split("/")[-1][:-12] if save_softmax: softmax_fname = join(output_folder, fname + ".npz") else: softmax_fname = None """There is a problem with python process communication that prevents us from communicating obejcts larger than 2 GB between processes (basically when the length of the pickle string that will be sent is communicated by the multiprocessing.Pipe object then the placeholder (\%i I think) does not allow for long enough strings (lol). This could be fixed by changing i to l (for long) but that would require manually patching system python code. We circumvent that problem here by saving softmax_pred to a npy file that will then be read (and finally deleted) by the Process. 
save_segmentation_nifti_from_softmax can take either filename or np.ndarray and will handle this automatically""" if np.prod(softmax_pred.shape) > (2e9 / 4 * 0.85): # *0.85 just to be save np.save(fname + ".npy", softmax_pred) softmax_pred = fname + ".npy" results.append( export_pool.starmap_async( save_segmentation_nifti_from_softmax, ((softmax_pred, join(output_folder, fname + ".nii.gz"), properties, interpolation_order, self.regions_class_order, None, None, softmax_fname, None, force_separate_z, interpolation_order_z), ))) pred_gt_tuples.append([ join(output_folder, fname + ".nii.gz"), join(self.gt_niftis_folder, fname + ".nii.gz") ]) _ = [i.get() for i in results] task = self.dataset_directory.split("/")[-1] job_name = self.experiment_name _ = aggregate_scores(pred_gt_tuples, labels=list(range(self.num_classes)), json_output_file=join(output_folder, "summary.json"), json_name=job_name, json_author="Fabian", json_description="", json_task=task) # in the old nnunet we would stop here. Now we add a postprocessing. This postprocessing can remove everything # except the largest connected component for each class. To see if this improves results, we do this for all # classes and then rerun the evaluation. Those classes for which this resulted in an improved dice score will # have this applied during inference as well self.print_to_log_file("determining postprocessing") determine_postprocessing(self.output_folder, self.gt_niftis_folder, validation_folder_name, final_subf_name=validation_folder_name + "_postprocessed", debug=debug) # after this the final predictions for the vlaidation set can be found in validation_folder_name_base + "_postprocessed" # They are always in that folder, even if no postprocessing as applied! # detemining postprocesing on a per-fold basis may be OK for this fold but what if another fold finds another # postprocesing to be better? In this case we need to consolidate. At the time the consolidation is going to be # done we won't know what self.gt_niftis_folder was, so now we copy all the niftis into a separate folder to # be used later gt_nifti_folder = join(self.output_folder_base, "gt_niftis") maybe_mkdir_p(gt_nifti_folder) for f in subfiles(self.gt_niftis_folder, suffix=".nii.gz"): success = False attempts = 0 while not success and attempts < 10: try: shutil.copy(f, gt_nifti_folder) success = True except OSError: attempts += 1 sleep(1) self.network.train(current_mode) export_pool.close() export_pool.join()
def run(self):
    # Count the number of files to process
    deb = time.time()
    self.fenetre.setpourcent(0)
    if len(os.listdir(self.data_entree)) == 0:
        self.fenetre.setInstruction("The \"./results\" folder is empty.")
        return
    pas = 100. / len(os.listdir(self.data_entree))
    self.fenetre.setInstruction("Starting the file summary.")

    # for each file in ./results/
    MultiListe = []
    for file in sorted(os.listdir(self.data_entree)):
        if file.endswith('.csv'):
            MultiListe.append([file, self.frequency])

    pool = Pool(self.nbproc)
    Values = pool.map(description, MultiListe)

    for resultat in Values:
        self.summary.append({
            'name': (resultat[0])[:(resultat[0]).rfind('-')],
            'id': (resultat[0])[(resultat[0]).rfind('-') + 1:(resultat[0]).rfind('.')],
            'category': "\(O_O)/",
            'measurement_tools': self.measurement_tools,
            'measurement_mode': self.measurement_mode,
            'measurement_system': self.measurement_system,
            'measurement_method': self.measurement_method,
            'measurement_protocol': self.measurement_protocol,
            'min_value': resultat[1],
            'number_of_samples': resultat[11],
            'max_value': resultat[2],
            'mean': resultat[3],
            'stdev': resultat[4],
            'median_value': resultat[5],
            'sum_square': resultat[6],
            'sum_square/frequency': resultat[7],
            'percentile_25': resultat[8],
            'percentile_50': resultat[9],
            'percentile_75': resultat[10]
        })
        self.fenetre.setpourcent(self.fenetre.getpourcent() + pas)

    # sorted() returns a new list, so keep the result
    self.summary = sorted(self.summary, key=lambda k: (k['name'], k['id']))

    summary = pd.DataFrame(
        self.summary,
        columns=[
            'name', 'id', 'category', 'measurement_tools',
            'measurement_system', 'measurement_mode', 'measurement_method',
            'measurement_protocol', 'number_of_samples', 'min_value',
            'max_value', 'mean', 'stdev', 'median_value', 'sum_square',
            'sum_square/frequency', 'percentile_25', 'percentile_50',
            'percentile_75'
        ])
    summary.to_csv('./synthese/{}_summary.csv'.format(self.data_sortie),
                   index=False,
                   sep=';')
    self.fenetre.setInstruction(
        "Work finished.\nTotal time taken for the summary: " +
        str(round((time.time() - deb), 2)) + " secs")
def Pool(processes=None, initializer=None, initargs=()):
    '''
    Returns a process pool object
    '''
    from multiprocessing.pool import Pool
    return Pool(processes, initializer, initargs)
def run(name):
    print("%s child process started, PID: %d" % (name, os.getpid()))
    start = time()
    # Sleeping a random amount guarantees each task starts at a slightly
    # different time; this can be useful e.g. for crawlers.
    sleep(random.choice([1, 2, 3, 4]))
    end = time()
    print("%s child process finished, PID: %d. Elapsed: %0.2f" % (name, os.getpid(), end - start))


if __name__ == "__main__":
    print("Parent process started")
    # Create a pool of worker processes; the argument is how many may run
    # concurrently. The default is the number of CPU cores.
    # Typically you query the number of logical CPUs and size the pool from that.
    p = Pool(4)
    for i in range(10):
        # Submit a task; the pool manages the worker processes.
        # apply_async runs each call to run() asynchronously on a pool-managed process.
        p.apply_async(run, args=(i, ))
        # Tasks can also be submitted synchronously, in which case each call
        # blocks until the previous one has finished.
        # p.apply(run, args=(i,))
    # With a pool, close() must be called before join(),
    # and no new tasks can be added to the pool after close().
    p.close()
    # join() waits for every child process in the pool to finish
    # before the parent process continues.
    p.join()
    print("Parent process finished.")
    p.terminate()
def train(self, dump_reader, parallel=True,
          pool_size=multiprocessing.cpu_count(), chunk_size=100):
    self._word_counter = multiprocessing.Value(c_uint64, 0)
    self._word_alpha = multiprocessing.RawValue(
        c_float, self._word_initial_alpha
    )
    self._entity_alpha = multiprocessing.RawValue(
        c_float, self._entity_initial_alpha
    )

    logger.info('Initializing weights...')

    syn0_shared = multiprocessing.RawArray(
        c_float, len(self.dictionary) * self._size
    )
    syn0 = np.frombuffer(syn0_shared, dtype=REAL)
    syn0 = syn0.reshape(len(self.dictionary), self._size)

    for w in self.dictionary:
        if isinstance(w, Word):
            np.random.seed(np.uint32(hash(w.text)))
        elif isinstance(w, Entity):
            np.random.seed(np.uint32(hash(w.title)))
        else:
            # the exception was previously created but never raised
            raise RuntimeError('Unknown type')

        syn0[w.index] = (np.random.rand(self._size) - 0.5) / self._size

    syn1_shared = multiprocessing.RawArray(
        c_float, len(self.dictionary) * self._size
    )
    syn1 = np.frombuffer(syn1_shared, dtype=REAL)
    syn1 = syn1.reshape(len(self.dictionary), self._size)
    syn1.fill(0)

    self._total_words = int(sum(
        w.count for w in self.dictionary.words()
    ))
    self._total_words *= self._iteration
    logger.info('Total number of words: %d', self._total_words)

    word_neg_table = self._build_word_neg_table()
    entity_neg_table = self._build_entity_neg_table()

    logger.info('Starting to train a model...')

    def iter_dump_reader():
        for n in range(self._iteration):
            logger.info('Iteration: %d', n)
            for page in dump_reader:
                yield page

    init_args = (
        self, syn0_shared, syn1_shared, word_neg_table, entity_neg_table
    )

    if parallel:
        pool = Pool(pool_size, initializer=init_worker, initargs=init_args)
        imap_func = partial(pool.imap_unordered, chunksize=chunk_size)
    else:
        init_worker(*init_args)
        imap_func = imap

    for (n, _) in enumerate(imap_func(train_page, iter_dump_reader())):
        if n % 10000 == 0:
            prog = float(self._word_counter.value) / self._total_words
            logger.info(
                'Processing page #%d progress: %.1f%% '
                'word alpha: %.3f entity alpha: %.3f',
                n, prog * 100, self._word_alpha.value,
                self._entity_alpha.value
            )

    if parallel:
        pool.close()

    self.syn0 = syn0
    self.syn1 = syn1
    self._word_neg_table = word_neg_table
    self._entity_neg_table = entity_neg_table
initClosure(closurePath)
initAnnotations(annotationsFilePath)

# cafaFiles= ["/mnt/home/hampt/workspace/doctorProject/src/CAFA/sprot_go.fasta.test.outM1.txt","/mnt/home/hampt/workspace/doctorProject/src/CAFA/sprot_go.fasta.test.outM2.txt","/mnt/home/hampt/workspace/doctorProject/src/CAFA/sprot_go.fasta.test.outR.txt","/mnt/home/hampt/workspace/doctorProject/src/CAFA/sprot_go.fasta.test.outX.txt", "/mnt/home/hampt/workspace/doctorProject/src/CAFA/sprot_go.fasta.test.outY.txt", "/mnt/home/hampt/workspace/doctorProject/src/CAFA/sprot_go.fasta.test.outZ.txt", "/mnt/home/hampt/workspace/doctorProject/src/CAFA/sprot_go.fasta.test.outM2R.txt" ]
cafaFiles = ["/mnt/home/hampt/workspace/doctorProject/src/CAFA/sprot_go.fasta.test.outX.txt"]

methodToTargetToTermToScore = {}

inputs = []
for filePath in cafaFiles:
    inputs.append((filePath))
print inputs

pool = Pool(processes=10)
resultMaps = pool.map(fillData, inputs, chunksize=1)

for resultMap in resultMaps:
    for key, val in resultMap.iteritems():
        methodToTargetToTermToScore[key] = val

targetToTermToMethodToScore = collections.defaultdict(dict)
for method, methodDict in methodToTargetToTermToScore.iteritems():
    for target, targetDict in methodDict.iteritems():
        for term, score in targetDict.iteritems():
            targetToTermToMethodToScore[target].setdefault(term, {})[method] = score

# outMeta = open("/mnt/home/hampt/workspace/doctorProject/src/CAFA/meta2.out",'w')
# i=0
# for target, targetDict in targetToTermToMethodToScore.iteritems():
#     for term, termDict in targetDict.iteritems():
import subprocess
from multiprocessing.pool import Pool

# mocking your code
modules = ["mod1.onnx", "mod2.onnx"]


def run(args):
    # pool.map will call run() with a tuple, deconstruct it
    idx, mod = args
    with open("output.txt", "w") as fp:
        subprocess.run(["./SOME_PROGRAM", str(idx)], stdout=fp)
    print("module", mod, "has finished")


# use the maximum number of worker processes by default
with Pool() as pool:
    pool.map(run, enumerate(modules))
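# Note: in the sketch above every worker opens the same "output.txt", so parallel
# runs would overwrite each other's output. A minimal, hedged variation (still
# assuming the same hypothetical ./SOME_PROGRAM binary) gives each module its own
# output file instead; `run_separate_outputs` is illustrative only.
def run_separate_outputs(args):
    idx, mod = args
    # one output file per module, e.g. output_0_mod1.onnx.txt
    out_name = "output_{}_{}.txt".format(idx, mod)
    with open(out_name, "w") as fp:
        subprocess.run(["./SOME_PROGRAM", str(idx)], stdout=fp)
    print("module", mod, "written to", out_name)


# with Pool() as pool:
#     pool.map(run_separate_outputs, enumerate(modules))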
    wer_inst = decoder.wer(transcript, reference)
    cer_inst = decoder.cer(transcript, reference)
    total_cer += cer_inst
    total_wer += wer_inst
    num_tokens += len(reference.split())
    num_chars += len(reference)

    wer = float(total_wer) / num_tokens
    cer = float(total_cer) / num_chars

    return [lm_alpha, lm_beta, wer * 100, cer * 100]


if __name__ == '__main__':
    p = Pool(args.num_workers, init,
             [args.beam_width, model.labels.index('_'), args.lm_path])

    cand_alphas = np.linspace(args.lm_alpha_from, args.lm_alpha_to,
                              args.lm_num_alphas)
    cand_betas = np.linspace(args.lm_beta_from, args.lm_beta_to,
                             args.lm_num_betas)
    params_grid = [(float(alpha), float(beta)) for alpha in cand_alphas
                   for beta in cand_betas]

    scores = []
    for params in tqdm(p.imap(decode_dataset, params_grid),
                       total=len(params_grid)):
        scores.append(list(params))

    print("Saving tuning results to: {}".format(args.output_path))
    with open(args.output_path, "w") as fh:
def _read_obs(self, stns_ids=None): # Saw extreme decreased performance due to garbage collection when # pandas ran checks for a chained assignment. Turn off this check # temporarily. opt_val = pd.get_option('mode.chained_assignment') pd.set_option('mode.chained_assignment', None) try: if stns_ids is None: stns_obs = self.stns else: stns_obs = self.stns.loc[stns_ids] nstns = len(stns_obs.station_id) nprocs = self.nprocs if nstns >= self.nprocs else nstns if self.has_start_end_dates: start_end = (self.start_date, self.end_date) else: start_end = None if nprocs > 1: # http://stackoverflow.com/questions/24171725/ # scikit-learn-multicore-attributeerror-stdin-instance- # has-no-attribute-close if not hasattr(sys.stdin, 'close'): def dummy_close(): pass sys.stdin.close = dummy_close iter_stns = [(os.path.join(self.path_ghcnd_data, 'ghcnd_all', '%s.dly' % a_id), a_id, self._elems, start_end) for a_id in stns_obs.station_id] pool = Pool(processes=nprocs) obs = pool.map(_parse_ghcnd_dly_star, iter_stns) pool.close() pool.join() else: obs = [] for a_id in stns_obs.station_id: fpath = os.path.join(self.path_ghcnd_data, 'ghcnd_all', '%s.dly' % a_id) obs_stn = _parse_ghcnd_dly(fpath, a_id, self._elems, start_end) obs.append(obs_stn) df_obs = pd.concat(obs, ignore_index=True) if self._has_tobs: stnnums = stns_obs.join(self._df_tobs_stnnums).dropna(subset=['station_num']) if not stnnums.empty: stnnums = stnnums.reset_index(drop=True).set_index('station_num') select_str = "index = a_num" df_tobs = [] path_yrly = os.path.join(self.path_ghcnd_data, 'by_year') for elem in self._elems_tobs: store = pd.HDFStore(os.path.join(path_yrly, '%s.hdf' % elem)) # Perform separate read for each station. # Had this in a single call using "index in stnnums" # but memory usage was too high for a_num in stnnums.index: elem_tobs = store.select('df_tobs', select_str).reset_index() elem_tobs['elem'] = elem elem_tobs['station_id'] = stnnums.station_id.loc[a_num] df_tobs.append(elem_tobs[['time', 'elem', 'obs_value', 'station_id']]) store.close() del store gc.collect() df_tobs = pd.concat(df_tobs, ignore_index=True) if self.has_start_end_dates: df_tobs = df_tobs[(df_tobs.time >= self.start_date) & (df_tobs.time <= self.end_date)] df_obs = pd.concat([df_obs, df_tobs], ignore_index=True) finally: pd.set_option('mode.chained_assignment', opt_val) df_obs = df_obs.set_index(['station_id', 'elem', 'time']) df_obs = df_obs.sortlevel(0, sort_remaining=True) return df_obs
def run_cv(f, n_proc):
    p = Pool(n_proc)
    p.map(f, range(len(configs)))
    p.close()  # no more tasks
    p.join()  # wrap up current tasks
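# A minimal usage sketch for run_cv above, under the assumption that `configs`
# is a module-level list the worker indexes into; `train_one` and the example
# configuration values are hypothetical. The worker must be a top-level function
# so it can be pickled and sent to the pool's processes.
configs = [{'lr': 0.1}, {'lr': 0.01}, {'lr': 0.001}]


def train_one(idx):
    # placeholder worker: look up the configuration and run one CV fold
    cfg = configs[idx]
    print("running config", idx, cfg)


if __name__ == '__main__':
    run_cv(train_one, n_proc=2)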
    try:
        local_image_url = item.get('image')
        new_image_url = local_image_url.replace('list', 'large')
        response = requests.get('http:' + new_image_url)
        if response.status_code == 200:
            file_path = '{0}/{1}.{2}'.format(item.get('title'),
                                             md5(response.content).hexdigest(),
                                             'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(response.content)
            else:
                print('Already Downloaded', file_path)
    except requests.ConnectionError:
        print('Failed to save image')


def main(offset):
    json = get_page(offset)
    for item in get_images(json):
        print(item)
        save_image(item)


if __name__ == '__main__':
    pool = Pool()
    groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
    pool.map(main, groups)
    pool.close()
    pool.join()
def Pool(processes=None, initializer=None, initargs=(), maxtasksperchild=None):
    '''
    Returns a process pool object
    '''
    from multiprocessing.pool import Pool
    return Pool(processes, initializer, initargs, maxtasksperchild)
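# A small usage sketch for the factory above (it mirrors the standard-library
# multiprocessing.Pool signature). maxtasksperchild=10 recycles each worker
# process after ten tasks, which bounds memory growth from leaky workers.
# The `work` function is illustrative only.
def work(x):
    return x * x


if __name__ == '__main__':
    pool = Pool(processes=4, maxtasksperchild=10)
    print(pool.map(work, range(100)))
    pool.close()
    pool.join()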
def other_measures():
    pool = Pool(processes=5)
    xp_repeat = 5
    nb_iterations = 1000

    for i, (data, target, enable_i) in enumerate(datasets):
        print("Dataset {}".format(datasets_names[i]))
        for measure in ['Informedness', 'F1']:
            mean_misere = 0
            mean_beam = 0
            mean_seqscout = 0

            for j in range(xp_repeat):
                results_misere = pool.apply_async(
                    misere, (data, target), {
                        'time_budget': TIME_BUDGET_XP,
                        'quality_measure': measure,
                        'iterations_limit': nb_iterations
                    })
                results_beam = pool.apply_async(
                    beam_search, (data, target), {
                        'enable_i': enable_i,
                        'time_budget': TIME_BUDGET_XP,
                        'quality_measure': measure,
                        'iterations_limit': nb_iterations
                    })
                result_ucb_opti = pool.apply_async(
                    seq_scout, (data, target), {
                        'enable_i': enable_i,
                        'time_budget': TIME_BUDGET_XP,
                        'quality_measure': measure,
                        'iterations_limit': nb_iterations
                    })

                results_misere = results_misere.get()
                results_beam = results_beam.get()
                result_ucb_opti = result_ucb_opti.get()

                if len(results_misere) < TOP_K:
                    print("Too few example on misere on dataset {}: {} results".format(
                        datasets_names[i], len(results_misere)))
                if len(results_beam) < TOP_K:
                    print("Too few example on beam_search on dataset {}: {} results".format(
                        datasets_names[i], len(results_beam)))
                if len(result_ucb_opti) < TOP_K:
                    print("Too few example on seqscout on dataset {}: {} results".format(
                        datasets_names[i], len(result_ucb_opti)))

                mean_misere += average_results(results_misere)
                mean_beam += average_results(results_beam)
                mean_seqscout += average_results(result_ucb_opti)

            mean_misere = mean_misere / xp_repeat
            mean_beam = mean_beam / xp_repeat
            mean_seqscout = mean_seqscout / xp_repeat

            print('For datasets {}, measure {}, algorithm misere the means score is: {}'
                  .format(datasets_names[i], measure, mean_misere))
            print('For datasets {}, measure {}, algorithm beam_search the means score is: {}'
                  .format(datasets_names[i], measure, mean_beam))
            print('For datasets {}, measure {}, algorithm seqscout the means score is: {}'
                  .format(datasets_names[i], measure, mean_seqscout))
import subprocess
import pickle
import sys
import ntpath
from threading import Lock
from multiprocessing.pool import ThreadPool as Pool
import xml.etree.ElementTree as ET
import urllib2

checksums_dict = {}
checksums_dict_lock = Lock()
file_path_dict = {}
file_path_dict_lock = Lock()

pool_size = 5  # your "parallelness"
pool = Pool(pool_size)


class FileAttributes:
    def __init__(self):
        self.file_path = None
        self.file_name = None
        self.checksum = None
        self.itunes_key = -1
        self.itunes_file_path = None


def path_leaf(path):
    '''Taken from https://stackoverflow.com/questions/8384737/extract-file-name-from-path-no-matter-what-the-os-path-format'''
    head, tail = ntpath.split(path)
    return tail or ntpath.basename(head)
def quality_over_size():
    number_dataset = 6
    data_origin, target, enable_i = datasets[number_dataset]

    pool = Pool(processes=3)

    # if we want to average
    nb_launched = 5

    size = 15
    size_step = 4
    data_final = {'WRAcc': [], 'size': [], 'Algorithm': []}

    for i in range(10):
        print('Iteration: {}'.format(i))
        data = reduce_k_length(size, data_origin)

        for i in range(nb_launched):
            results_misere = pool.apply_async(misere, (data, target),
                                              {'time_budget': TIME_BUDGET_XP})
            results_beam = pool.apply_async(beam_search, (data, target), {
                'enable_i': enable_i,
                'time_budget': TIME_BUDGET_XP
            })
            result_ucb_opti = pool.apply_async(seq_scout, (data, target), {
                'enable_i': enable_i,
                'time_budget': TIME_BUDGET_XP
            })

            results_misere = results_misere.get()
            results_beam = results_beam.get()
            result_ucb_opti = result_ucb_opti.get()

            if len(results_beam) < TOP_K:
                print("Too few beam: {}".format(len(results_beam)))
            if len(result_ucb_opti) < TOP_K:
                print("Too few seqscout: {}".format(len(result_ucb_opti)))
            if len(results_misere) < TOP_K:
                print("Too few misere: {}".format(len(results_misere)))

            data_add_generic(data_final,
                             WRAcc=max(0, average_results(results_misere)),
                             size=size,
                             Algorithm='misere')
            data_add_generic(data_final,
                             WRAcc=max(0, average_results(results_beam)),
                             size=size,
                             Algorithm='beam')
            data_add_generic(data_final,
                             WRAcc=max(0, average_results(result_ucb_opti)),
                             size=size,
                             Algorithm='seqscout')

        size += size_step

    df = pd.DataFrame(data=data_final)
    sns.set(rc={'figure.figsize': (8, 6.5)})
    plt.clf()
    ax = sns.lineplot(data=df, x='size', y='WRAcc', hue='Algorithm')
    ax.set(xlabel='Length max', ylabel='WRAcc')
    # ax.set(xlabel='Time(s)', ylabel='Average WRAcc top-10 patterns')
    plt.savefig('./space_size/over_size.png')
    df.to_pickle('./space_size/result')

    if SHOW:
        plt.show()
def validate(self, do_mirroring: bool = True, use_train_mode: bool = False, tiled: bool = True, step: int = 2, save_softmax: bool = True, use_gaussian: bool = True, overwrite: bool = True, validation_folder_name: str = 'validation_raw', debug: bool = False, all_in_gpu: bool = False, force_separate_z: bool = None, interpolation_order: int = 3, interpolation_order_z=0): assert self.was_initialized, "must initialize, ideally with checkpoint (or train first)" # save whether network is in deep supervision mode or not ds = self.network.do_ds # disable deep supervision self.network.do_ds = False if self.dataset_val is None: self.load_dataset() self.do_split() output_folder = join(self.output_folder, validation_folder_name) maybe_mkdir_p(output_folder) # this is for debug purposes my_input_args = { 'do_mirroring': do_mirroring, 'use_train_mode': use_train_mode, 'tiled': tiled, 'step': step, 'save_softmax': save_softmax, 'use_gaussian': use_gaussian, 'overwrite': overwrite, 'validation_folder_name': validation_folder_name, 'debug': debug, 'all_in_gpu': all_in_gpu, 'force_separate_z': force_separate_z, 'interpolation_order': interpolation_order, 'interpolation_order_z': interpolation_order_z, } save_json(my_input_args, join(output_folder, "validation_args.json")) if do_mirroring: if not self.data_aug_params['do_mirror']: raise RuntimeError( "We did not train with mirroring so you cannot do inference with mirroring enabled" ) mirror_axes = self.data_aug_params['mirror_axes'] else: mirror_axes = () pred_gt_tuples = [] export_pool = Pool(default_num_threads) results = [] for k in self.dataset_val.keys(): properties = self.dataset[k]['properties'] fname = properties['list_of_data_files'][0].split("/")[-1][:-12] if overwrite or (not isfile(join(output_folder, fname + ".nii.gz"))) or \ (save_softmax and not isfile(join(output_folder, fname + ".npz"))): data = np.load(self.dataset[k]['data_file'])['data'] # concat segmentation of previous step seg_from_prev_stage = np.load( join(self.folder_with_segs_from_prev_stage, k + "_segFromPrevStage.npz"))['data'][None] print(k, data.shape) data[-1][data[-1] == -1] = 0 data_for_net = np.concatenate( (data[:-1], to_one_hot(seg_from_prev_stage[0], range(1, self.num_classes)))) softmax_pred = self.predict_preprocessed_data_return_softmax( data_for_net, do_mirroring, 1, use_train_mode, 1, mirror_axes, tiled, True, step, self.patch_size, use_gaussian=use_gaussian, all_in_gpu=all_in_gpu) softmax_pred = softmax_pred.transpose( [0] + [i + 1 for i in self.transpose_backward]) if save_softmax: softmax_fname = join(output_folder, fname + ".npz") else: softmax_fname = None """There is a problem with python process communication that prevents us from communicating obejcts larger than 2 GB between processes (basically when the length of the pickle string that will be sent is communicated by the multiprocessing.Pipe object then the placeholder (\%i I think) does not allow for long enough strings (lol). This could be fixed by changing i to l (for long) but that would require manually patching system python code. We circumvent that problem here by saving softmax_pred to a npy file that will then be read (and finally deleted) by the Process. 
save_segmentation_nifti_from_softmax can take either filename or np.ndarray and will handle this automatically""" if np.prod(softmax_pred.shape) > ( 2e9 / 4 * 0.85): # *0.85 just to be save np.save(join(output_folder, fname + ".npy"), softmax_pred) softmax_pred = join(output_folder, fname + ".npy") results.append( export_pool.starmap_async( save_segmentation_nifti_from_softmax, ((softmax_pred, join(output_folder, fname + ".nii.gz"), properties, interpolation_order, None, None, None, softmax_fname, force_separate_z, interpolation_order_z), ))) pred_gt_tuples.append([ join(output_folder, fname + ".nii.gz"), join(self.gt_niftis_folder, fname + ".nii.gz") ]) _ = [i.get() for i in results] self.print_to_log_file("finished prediction") # evaluate raw predictions self.print_to_log_file("evaluation of raw predictions") task = self.dataset_directory.split("/")[-1] job_name = self.experiment_name _ = aggregate_scores( pred_gt_tuples, labels=list(range(self.num_classes)), json_output_file=join(output_folder, "summary.json"), json_name=job_name + " val tiled %s" % (str(tiled)), json_author="Fabian", json_task=task, num_threads=default_num_threads) # in the old nnunet we would stop here. Now we add a postprocessing. This postprocessing can remove everything # except the largest connected component for each class. To see if this improves results, we do this for all # classes and then rerun the evaluation. Those classes for which this resulted in an improved dice score will # have this applied during inference as well self.print_to_log_file("determining postprocessing") determine_postprocessing(self.output_folder, self.gt_niftis_folder, validation_folder_name, final_subf_name=validation_folder_name + "_postprocessed", debug=debug) # after this the final predictions for the vlaidation set can be found in validation_folder_name_base + "_postprocessed" # They are always in that folder, even if no postprocessing as applied! # detemining postprocesing on a per-fold basis may be OK for this fold but what if another fold finds another # postprocesing to be better? In this case we need to consolidate. At the time the consolidation is going to be # done we won't know what self.gt_niftis_folder was, so now we copy all the niftis into a separate folder to # be used later gt_nifti_folder = join(self.output_folder_base, "gt_niftis") maybe_mkdir_p(gt_nifti_folder) for f in subfiles(self.gt_niftis_folder, suffix=".nii.gz"): success = False attempts = 0 e = None while not success and attempts < 10: try: shutil.copy(f, gt_nifti_folder) success = True except OSError as e: attempts += 1 sleep(1) if not success: print("Could not copy gt nifti file %s into folder %s" % (f, gt_nifti_folder)) if e is not None: raise e # restore network deep supervision mode self.network.do_ds = ds
def mmap_(fn: Callable[[A], B], iter: Iterable[A]) -> List[B]:
    return Pool().map(fn, iter)
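# A minimal usage sketch for mmap_ above. It assumes the surrounding module
# already imports Pool, Callable, Iterable, List and the A/B type variables;
# `double` is illustrative only. Note that the mapped function must be a
# picklable top-level function, since Pool() spawns worker processes.
def double(x: int) -> int:
    return 2 * x


if __name__ == '__main__':
    print(mmap_(double, range(5)))  # -> [0, 2, 4, 6, 8]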
def post(self):
    try:
        # set up post request parameters
        parser.add_argument('gre')
        parser.add_argument('toefl')
        parser.add_argument('grade')
        parser.add_argument('email')
        parser.add_argument('uid')
        parser.add_argument('work_ex')
        parser.add_argument('lor1')
        parser.add_argument('lor2')
        parser.add_argument('lor3')
        parser.add_argument('lor4')

        # Setup google storage for access
        storage_client = storage.Client.from_service_account_json('key1.json')
        bucket = storage_client.bucket('mf-frontend.appspot.com')

        # Retrieve post request values
        args = parser.parse_args()
        gre = args['gre']
        toefl = args['toefl']
        grade = args['grade']
        email = args['email']
        uid = args['uid']
        work_ex = args['work_ex']
        lor1 = args['lor1']
        lor2 = args['lor2']
        lor3 = args['lor3']
        lor4 = args['lor4']

        # Retrieve resume and convert to image
        blob = bucket.blob('resume/' + uid + '/' + uid + '_resume.pdf')
        blob.download_to_filename('/tmp/resume.pdf')
        doc = fitz.open('/tmp/resume.pdf')
        mat = fitz.Matrix(fitz.Identity)
        resume = doc[0].getPixmap(alpha=False, matrix=mat)
        resume.writePNG("/tmp/resume.png")
        resume = np.array(Image.open('/tmp/resume.png'))
        logging.info("Loaded resume")

        # Retrieve sop and extract text
        blob = bucket.blob('sop/' + uid + '/' + uid + '_sop.pdf')
        blob.download_to_filename('/tmp/sop.pdf')
        doc = fitz.open('/tmp/sop.pdf')
        pages = len(doc)
        sop = ""
        for page in range(pages):
            sop += doc[page].getText()
        logging.info("Loaded sop")

        # Make calls to second backend for all universities
        data = {
            "gre": gre,
            "toefl": toefl,
            "grade": grade,
            "email": email,
            "uid": uid,
            "work_ex": work_ex,
            "lor1": lor1,
            "lor2": lor2,
            "lor3": lor3,
            "lor4": lor4
        }
        data["resume"] = resume.tolist()
        data["sop"] = sop
        universities = ["mit", "neu", "ncsu", "utd", "usc"]

        # Parallel requests
        pool = Pool(len(universities))
        async_result = [
            pool.apply_async(self.send_requests, (
                data,
                univ,
            )) for univ in universities
        ]
        pool.close()
        pool.join()
        return_val = sorted([ar.get() for ar in async_result],
                            key=itemgetter('score'),
                            reverse=True)
        print(return_val)

        resp = {}
        for d in return_val:
            univ = d['univ']
            nd = copy.deepcopy(data)
            nd['score'] = d['score']
            resp[univ] = nd

        # rets = [self.send_requests(data,x,) for x in universities]
        # resp = {}
        # for i,u in enumerate(universities):
        #     resp[u] = rets[i]

        return resp
    except Exception as e:
        print(e)
        return {}