def main(datadir, convert_dir, crop_size):
    try:
        os.mkdir(convert_dir)
    except OSError:
        pass
    filenames = data_util.get_image_files(datadir)
    print('Resizing images in {} to {}'.format(datadir, convert_dir))

    n = len(filenames)
    batch_size = 500
    batches = n // batch_size + 1

    p = Pool()

    args = []
    for f in filenames:
        args.append((convert_size, (datadir, convert_dir, f, crop_size)))

    for i in range(batches):
        print('batch {:>2} / {}'.format(i + 1, batches))
        p.map(convert, args[i * batch_size: (i + 1) * batch_size])

    p.close()
    p.join()
    print('Done')

def main_mh():
    samples_dir_p = Path("/RECH2/huziy/BC-MH/bc_mh_044deg/Samples")
    out_dir_root = Path("/RECH2/huziy/MH_streamflows/")

    if samples_dir_p.name.lower() == "samples":
        out_folder_name = samples_dir_p.parent.name
    else:
        out_folder_name = samples_dir_p.name

    varnames = ["STFA", ]

    # ======================================
    out_dir_p = out_dir_root.joinpath(out_folder_name)
    if not out_dir_p.is_dir():
        out_dir_p.mkdir(parents=True)

    inputs = []
    for y in range(1981, 2010):
        inputs.append(dict(year=y, varnames=varnames, samples_dir=samples_dir_p,
                           out_dir=out_dir_p, target_freq_hours=24))

    # Extract the data for each year in parallel
    pool = Pool(processes=3)
    pool.map(extract_data_for_year_in_parallel, inputs)

def run_parallel(num_processes, experiment_names, methods, sparsity_factors, run_ids):
    """
    Run multiple experiments in parallel.

    Parameters
    ----------
    num_processes : int
        The maximum number of processes that can run concurrently.
    experiment_names : list of str
        The names of experiments to run.
    methods : list of str
        The methods to run the experiments under (mix1, mix2, or full).
    sparsity_factors : list of float
        The sparsity of inducing points to run the experiments at.
    run_ids : list of int
        The ids of the configurations under which to run the experiments.
    """
    # Setup an array of individual experiment configurations.
    experiment_configs = []
    for experiment in experiment_names:
        for method in methods:
            for sparsity_factor in sparsity_factors:
                for run_id in run_ids:
                    experiment_configs.append({'experiment_name': experiment,
                                               'method': method,
                                               'sparsity_factor': sparsity_factor,
                                               'run_id': run_id})

    # Now run the experiments.
    pool = Pool(num_processes)
    pool.map(run_config, experiment_configs)

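# A minimal usage sketch, not part of the original source: assuming run_config
# accepts one of the config dicts built above, the full grid of experiments
# could be launched as below. All argument values here are hypothetical
# placeholders.
if __name__ == '__main__':
    run_parallel(num_processes=4,
                 experiment_names=['experiment_a', 'experiment_b'],
                 methods=['mix1', 'mix2', 'full'],
                 sparsity_factors=[0.1, 0.5],
                 run_ids=[1, 2, 3])
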
class Pool(object):
    '''
    Thin pool wrapper: uses an MPI pool when available and falls back to a
    multiprocessing pool otherwise.
    '''

    def __init__(self, **pool_kwargs):
        try:
            kw = KwargsCheck(MPIPool, pool_kwargs)
            self._pool = MPIPool(**kw)
            self.MPI = True
        except (ImportError, ValueError):
            kw = KwargsCheck(MultiPool, pool_kwargs)
            self._pool = MultiPool(**kw)
            self.MPI = False

        if self.MPI:
            if not self._pool.is_master():
                self._pool.wait()
                sys.exit(0)

    def map(self, f, x, args=(), kwargs={}):
        '''
        Map ``f`` over the iterable ``x``, optionally wrapping ``f`` with extra
        positional and keyword arguments first.
        '''
        if len(args) or len(kwargs):
            w = wrap(f, *args, **kwargs)
            return self._pool.map(w, x)
        else:
            return self._pool.map(f, x)

    def close(self):
        self._pool.close()

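# A hedged usage sketch, not from the original source: it assumes the helpers
# referenced above (KwargsCheck, MPIPool, MultiPool, wrap) are importable and
# that MultiPool accepts a `processes` keyword. `square` is a hypothetical
# worker function used only for illustration.
def square(x):
    return x * x

if __name__ == '__main__':
    pool = Pool(processes=4)  # falls back to multiprocessing if MPI is unavailable
    print(pool.map(square, range(10)))
    pool.close()
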
def main_canesm2_rcp85():
    samples_dir_p = Path("/RECH/data/Simulations/CRCM5/North_America/NorthAmerica_0.44deg_CanRCP85_B1/Samples")
    out_dir_root = Path("/RECH2/huziy/BenAlaya/")

    if samples_dir_p.name.lower() == "samples":
        out_folder_name = samples_dir_p.parent.name
    else:
        out_folder_name = samples_dir_p.name

    # Total precipitation m/s; integrated ice, liquid water and vapor (in kg/m**2) averaged over last MOYHR
    varnames = ["PR", integrated_wv_RPN_name]

    # ======================================
    out_dir_p = out_dir_root.joinpath(out_folder_name)
    if not out_dir_p.is_dir():
        out_dir_p.mkdir()

    inputs = []
    for y in range(2006, 2101):
        inputs.append(dict(year=y, varnames=varnames, samples_dir=samples_dir_p,
                           out_dir=out_dir_p, target_freq_hours=6, calendar_str="365_day"))

    # Extract the data for each year in parallel
    pool = Pool(processes=3)
    pool.map(extract_data_for_year_in_parallel, inputs)

def run_parallel(n_process):
    """
    Creates a process for each element in the array returned by ``get_configs()``
    and runs the experiment corresponding to each element. The maximum number of
    processes to run in parallel is determined by ``n_process``.
    """
    p = Pool(n_process)
    p.map(run_config, ExperimentRunner.get_configs())

def main():
    # update_item_list(SQL_USER, SQL_PASS, SQL_DATABASE)
    engine = create_engine('mysql+mysqlconnector://%s:%s@localhost/%s'
                           % (SQL_USER, SQL_PASS, SQL_DATABASE))
    region_id = 10000002
    item_id_list = [int(index) for (index, row)
                    in pd.read_sql_table('items', engine, index_col='item_id').iterrows()]

    data_write = partial(update_price_data, region_id)

    p = Pool(initializer=init_function, initargs=(SQL_USER, SQL_PASS, SQL_DATABASE))
    p.map(data_write, item_id_list)

def main_crcm5_nemo():
    label = "CRCM5_NEMO"

    period = Period(
        datetime(1980, 1, 1),
        datetime(2015, 12, 31)
    )

    pool = Pool(processes=10)

    input_params = []
    for month_start in period.range("months"):
        month_end = month_start.add(months=1).subtract(seconds=1)
        current_month_period = Period(month_start, month_end)
        current_month_period.months_of_interest = [month_start.month, ]

        vname_to_level_erai = {
            T_AIR_2M: VerticalLevel(1, level_kinds.HYBRID),
            U_WE: VerticalLevel(1, level_kinds.HYBRID),
            V_SN: VerticalLevel(1, level_kinds.HYBRID),
        }

        vname_map = {}
        vname_map.update(vname_map_CRCM5)
        vname_map.update({
            default_varname_mappings.SNOWFALL_RATE: "SN"
        })

        label_to_config = OrderedDict([(
            label, {
                DataManager.SP_BASE_FOLDER: "/snow3/huziy/NEI/GL/erai0.75deg_driven/GL_with_NEMO_dtN_1h_and_30min/Samples",
                DataManager.SP_DATASOURCE_TYPE: data_source_types.SAMPLES_FOLDER_FROM_CRCM_OUTPUT,
                DataManager.SP_INTERNAL_TO_INPUT_VNAME_MAPPING: vname_map,
                DataManager.SP_LEVEL_MAPPING: vname_to_level_erai,
                DataManager.SP_OFFSET_MAPPING: vname_to_offset_CRCM5,
                DataManager.SP_MULTIPLIER_MAPPING: vname_to_multiplier_CRCM5,
                DataManager.SP_VARNAME_TO_FILENAME_PREFIX_MAPPING: default_varname_mappings.vname_to_fname_prefix_CRCM5,
                "out_folder": "lake_effect_analysis_{}_{}-{}_monthly".format(label, period.start.year, period.end.year)
            }
        )])

        kwargs = dict(
            label_to_config=label_to_config,
            period=current_month_period,
            months_of_interest=current_month_period.months_of_interest,
            nprocs_to_use=1
        )

        print(current_month_period.months_of_interest)
        input_params.append(kwargs)

    # execute in parallel
    pool.map(monthly_func, input_params)

def main_crcm5_hl():
    label = "CRCM5_HL"

    period = Period(
        datetime(1980, 1, 1),
        datetime(2009, 12, 31)
    )

    pool = Pool(processes=12)

    input_params = []
    for month_start in period.range("months"):
        month_end = month_start.add(months=1).subtract(seconds=1)
        current_month_period = Period(month_start, month_end)
        current_month_period.months_of_interest = [month_start.month, ]

        vname_to_level_erai = {
            T_AIR_2M: VerticalLevel(1, level_kinds.HYBRID),
            U_WE: VerticalLevel(1, level_kinds.HYBRID),
            V_SN: VerticalLevel(1, level_kinds.HYBRID),
        }

        vname_map = {}
        vname_map.update(vname_map_CRCM5)
        vname_map.update({
            default_varname_mappings.SNOWFALL_RATE: "U3"
        })

        label_to_config = OrderedDict([(
            label, {
                DataManager.SP_BASE_FOLDER: "/RECH2/huziy/coupling/GL_440x260_0.1deg_GL_with_Hostetler/Samples_selected",
                DataManager.SP_DATASOURCE_TYPE: data_source_types.SAMPLES_FOLDER_FROM_CRCM_OUTPUT_VNAME_IN_FNAME,
                DataManager.SP_INTERNAL_TO_INPUT_VNAME_MAPPING: vname_map,
                DataManager.SP_LEVEL_MAPPING: vname_to_level_erai,
                DataManager.SP_OFFSET_MAPPING: vname_to_offset_CRCM5,
                DataManager.SP_MULTIPLIER_MAPPING: vname_to_multiplier_CRCM5,
                "out_folder": "lake_effect_analysis_{}_{}-{}_monthly".format(label, period.start.year, period.end.year)
            }
        )])

        kwargs = dict(
            label_to_config=label_to_config,
            period=current_month_period,
            months_of_interest=current_month_period.months_of_interest,
            nprocs_to_use=1
        )

        print(current_month_period.months_of_interest)
        input_params.append(kwargs)

    # execute in parallel
    pool.map(monthly_func, input_params)

def main_future(nprocs=20):
    period = Period(
        datetime(2079, 1, 1),
        datetime(2100, 12, 31)
    )

    label = "CRCM5_NEMO_fix_TT_PR_CanESM2_RCP85_{}-{}_monthly".format(period.start.year, period.end.year)

    vname_to_level_erai = {
        T_AIR_2M: VerticalLevel(1, level_kinds.HYBRID),
        U_WE: VerticalLevel(1, level_kinds.HYBRID),
        V_SN: VerticalLevel(1, level_kinds.HYBRID),
    }

    base_folder = "/scratch/huziy/Output/GL_CC_CanESM2_RCP85/coupled-GL-future_CanESM2/Samples"

    vname_map = {}
    vname_map.update(vname_map_CRCM5)
    # vname_map[default_varname_mappings.SNOWFALL_RATE] = "SN"
    vname_map[default_varname_mappings.SNOWFALL_RATE] = "XXX"

    pool = Pool(processes=nprocs)

    input_params = []
    for month_start in period.range("months"):
        month_end = month_start.add(months=1).subtract(seconds=1)
        current_month_period = Period(month_start, month_end)
        current_month_period.months_of_interest = [month_start.month, ]

        label_to_config = OrderedDict([(
            label, {
                # "base_folder": "/HOME/huziy/skynet3_rech1/CRCM5_outputs/cc_canesm2_rcp85_gl/coupled-GL-future_CanESM2/Samples",
                DataManager.SP_BASE_FOLDER: base_folder,
                DataManager.SP_DATASOURCE_TYPE: data_source_types.SAMPLES_FOLDER_FROM_CRCM_OUTPUT,
                DataManager.SP_INTERNAL_TO_INPUT_VNAME_MAPPING: vname_map,
                DataManager.SP_LEVEL_MAPPING: vname_to_level_erai,
                DataManager.SP_OFFSET_MAPPING: vname_to_offset_CRCM5,
                DataManager.SP_MULTIPLIER_MAPPING: vname_to_multiplier_CRCM5,
                DataManager.SP_VARNAME_TO_FILENAME_PREFIX_MAPPING: vname_to_fname_prefix_CRCM5,
                "out_folder": "lake_effect_analysis_{}_{}-{}".format(label, period.start.year, period.end.year)
            }
        )])

        kwargs = dict(
            label_to_config=label_to_config,
            period=current_month_period,
            months_of_interest=current_month_period.months_of_interest,
            nprocs_to_use=1
        )

        print(current_month_period.months_of_interest)
        input_params.append(kwargs)

    # execute in parallel
    pool.map(monthly_func, input_params)

def launchCMAESForAllTargetSizesMulti():
    '''
    Launch the CMA-ES optimization for each target size in parallel
    (on different processors).
    '''
    # initializes setup variables
    rs = ReadSetupFile()
    # initializes a pool of workers, i.e. multiprocessing
    p = Pool()
    # run cmaes on each target size on a separate processor
    p.map(launchCMAESForSpecificTargetSize, rs.sizeOfTarget, "theta")

def get_word():
    domains = open('dic/newwords').readlines()
    try:
        pool = Pool(processes=2)
        pool.map(check_domain, domains)
        pool.close()
        pool.join()
    except Exception as e:
        print e
        pass

def run(self, test_name=None, db_adapter=None):
    if db_adapter is None:
        db_adapter = DEFAULT_DATABASE_ADAPTER
    if test_name is None:
        test_name = '_'.join([db_adapter, datetime.datetime.now().strftime("%Y-%m-%d %H:%M")])

    print ''.join(['Running "', test_name, '" test'])
    print 'Prepare database'
    adapter = adapter_factory(db_adapter)
    adapter.prepare_db()
    test_id = adapter.create_new_test(test_name)
    print ''

    print 'Create user documents'
    pool = Pool(processes=10)
    params = [{'user_id': i, 'docs_per_user': DOCS_PER_USER, 'db_adapter': db_adapter}
              for i in range(1, USERS_COUNT + 1)]
    start = time.time()
    try:
        pool.map(create_users, params)
        print 'Full time:', time.time() - start
    finally:
        pool.terminate()
        del pool
    print 'OK! Users were created!'
    print ''

    for i in range(1, MAX_PROCESSES + 1):
        print 'Run test with %d processes' % i
        pool = Pool(processes=i)
        params = [{'user_id': j, 'db_adapter': db_adapter} for j in range(1, USERS_COUNT + 1)]
        start = time.time()
        try:
            res = pool.map(update_users, params)
            full_time = time.time() - start
        finally:
            pool.terminate()
            del pool
        print 'Test is finished! Save results'
        print ''
        adapter.save_results(test_id, res, i)
        print 'Full time:', full_time
        print ''

    print 'Finish!'

def main():
    ts = time()
    client_id = os.getenv('IMGUR_CLIENT_ID')
    if not client_id:
        raise Exception("Couldn't find IMGUR_CLIENT_ID environment variable!")
    download_dir = setup_download_dir()
    links = [l for l in get_links(client_id) if l.endswith('.jpg')]
    download = partial(download_link, download_dir)
    p = Pool(8)
    p.map(download, links)
    print('Took {}s'.format(time() - ts))

def validate_series(yaml_file, sequence_dictionary):
    """
    :param yaml_file: The mdl yaml file.
    :param sequence_dictionary: Dictionary of sequences
    :return: Runs a large number of sequence tests on the series to make sure
        the sequences for each protein match the given sequence and the series itself
    """
    yaml_file = load_yaml_file(yaml_file)
    p = Pool(cpu_count())
    jobs = [(yaml_file, protein, sequence_dictionary) for protein in yaml_file["protein_list"]]
    p.map(_validate_protein, jobs)
    return

def main_obs():
    label = "Obs_monthly_icefix_test2_1proc_speedtest_3"

    period = Period(
        datetime(1980, 1, 1),
        datetime(2010, 12, 31)
    )

    pool = Pool(processes=20)

    input_params = []
    for month_start in period.range("months"):
        month_end = month_start.add(months=1).subtract(seconds=1)
        current_month_period = Period(month_start, month_end)
        current_month_period.months_of_interest = [month_start.month, ]

        vname_to_level_erai = {
            T_AIR_2M: VerticalLevel(1, level_kinds.HYBRID),
            U_WE: VerticalLevel(1, level_kinds.HYBRID),
            V_SN: VerticalLevel(1, level_kinds.HYBRID),
        }

        vname_map = {}
        vname_map.update(vname_map_CRCM5)

        label_to_config = OrderedDict([(
            label, {
                DataManager.SP_BASE_FOLDER: "/HOME/huziy/skynet3_rech1/obs_data_for_HLES/interploated_to_the_same_grid/GL_0.1_452x260_icefix",
                DataManager.SP_DATASOURCE_TYPE: data_source_types.ALL_VARS_IN_A_FOLDER_IN_NETCDF_FILES_OPEN_EACH_FILE_SEPARATELY,
                DataManager.SP_INTERNAL_TO_INPUT_VNAME_MAPPING: vname_map,
                DataManager.SP_LEVEL_MAPPING: vname_to_level_erai,
                DataManager.SP_OFFSET_MAPPING: vname_to_offset_CRCM5,
                DataManager.SP_MULTIPLIER_MAPPING: vname_to_multiplier_CRCM5,
                DataManager.SP_VARNAME_TO_FILENAME_PREFIX_MAPPING: vname_to_fname_prefix_CRCM5,
                "out_folder": "lake_effect_analysis_daily_{}_{}-{}".format(label, period.start.year, period.end.year)
            }
        )])

        kwargs = dict(
            label_to_config=label_to_config,
            period=current_month_period,
            months_of_interest=current_month_period.months_of_interest,
            nprocs_to_use=1
        )

        print(current_month_period.months_of_interest)
        input_params.append(kwargs)

    # execute in parallel
    pool.map(monthly_func, input_params)

def main():
    # catch parameters
    segmentation_base_string = sys.argv[1]
    ground_truth_base_string = sys.argv[2]
    mask_file_base_string = sys.argv[3]
    cases = sys.argv[4:]

    # evaluate each case and collect the scores
    hds = []
    assds = []
    precisions = []
    recalls = []
    dcs = []

    # load images and apply mask to segmentation and ground truth (to remove ground truth fg outside of brain mask)
    splush = [load(segmentation_base_string.format(case)) for case in cases]
    tplush = [load(ground_truth_base_string.format(case)) for case in cases]
    masks = [load(mask_file_base_string.format(case))[0].astype(numpy.bool) for case in cases]
    s = [s.astype(numpy.bool) & m for (s, _), m in zip(splush, masks)]
    t = [t.astype(numpy.bool) & m for (t, _), m in zip(tplush, masks)]
    hs = [h for _, h in splush]
    ht = [h for _, h in tplush]

    # compute and append metrics (Pool-processing)
    pool = Pool(n_jobs)
    dcs = pool.map(wdc, zip(t, s))
    precisions = pool.map(wprecision, zip(s, t))
    recalls = pool.map(wrecall, zip(s, t))
    hds = pool.map(whd, zip(t, s, [header.get_pixel_spacing(h) for h in ht]))
    assds = pool.map(wassd, zip(t, s, [header.get_pixel_spacing(h) for h in ht]))

    # print case-wise results
    print 'Metrics:'
    print 'Case\tDC[0,1]\tHD(mm)\tP2C(mm)\tprec.\trecall'
    for case, _dc, _hd, _assd, _pr, _rc in zip(cases, dcs, hds, assds, precisions, recalls):
        print '{}\t{:>3,.3f}\t{:>4,.3f}\t{:>4,.3f}\t{:>3,.3f}\t{:>3,.3f}'.format(case, _dc, _hd, _assd, _pr, _rc)

    # check for nan/inf values of failed cases and signal warning
    mask = numpy.isfinite(hds)
    if not numpy.all(mask):
        print 'WARNING: Average values only computed on {} of {} cases!'.format(numpy.count_nonzero(mask), mask.size)

    print 'DM  average\t{} +/- {} (Median: {})'.format(numpy.asarray(dcs)[mask].mean(), numpy.asarray(dcs)[mask].std(), numpy.median(numpy.asarray(dcs)[mask]))
    print 'HD  average\t{} +/- {} (Median: {})'.format(numpy.asarray(hds)[mask].mean(), numpy.asarray(hds)[mask].std(), numpy.median(numpy.asarray(hds)[mask]))
    print 'ASSD average\t{} +/- {} (Median: {})'.format(numpy.asarray(assds)[mask].mean(), numpy.asarray(assds)[mask].std(), numpy.median(numpy.asarray(assds)[mask]))
    print 'Prec. average\t{} +/- {} (Median: {})'.format(numpy.asarray(precisions)[mask].mean(), numpy.asarray(precisions)[mask].std(), numpy.median(numpy.asarray(precisions)[mask]))
    print 'Rec. average\t{} +/- {} (Median: {})'.format(numpy.asarray(recalls)[mask].mean(), numpy.asarray(recalls)[mask].std(), numpy.median(numpy.asarray(recalls)[mask]))

def main():
    parser = argparse.ArgumentParser(description='portScan.py')
    parser.add_argument("-ip", dest="ip", help="ip to scan")
    parser.add_argument("-i", dest="iniPort", help="initial port")
    parser.add_argument("-e", dest="endPort", help="end port")
    params = parser.parse_args()

    ip = str(params.ip)
    print('[+] Scanned IP -> ' + ip)
    print('\t' + str(params.iniPort) + ' - ' + str(params.endPort))

    p = Pool(50)
    p.map(scan, range(int(params.iniPort), int(params.endPort)))

def runoff_to_netcdf_parallel(indir, outdir):
    if not os.path.isdir(outdir):
        os.mkdir(outdir)

    in_names = [x for x in os.listdir(indir) if x.startswith("pm") and x.endswith("p")]
    in_paths = [os.path.join(indir, name) for name in in_names]
    out_paths = [os.path.join(outdir, inName + ".nc") for inName in in_names]

    ppool = Pool(processes=10)

    print("The paths below go to: ")
    print(in_paths[0])
    print("Go into: {}".format(out_paths[0]))

    ppool.map(extract_runoff_to_nc_process, list(zip(in_paths, out_paths)))

def main():
    opts, args = parse_options()
    datasets = build_datasets(opts, args[0])
    sources = glob(args[1])

    stime = time()
    pool = Pool(opts.jobs)
    pool.map(partial(job, opts, datasets), sources)
    cputime = clock()
    duration = time() - stime

    opts.verbose and log("Processed in {0:0.2f}s using {1:0.2f}s of CPU time.", duration, cputime)

def update(self, export='csv'):
    """
    Update the historical data that has already been downloaded.

    :param export: export format for the historical data; currently only csv is supported
    :return:
    """
    stock_codes = []
    for file in os.listdir(self.raw_path):
        if not file.endswith('.json'):
            continue
        stock_code = file[:6]
        stock_codes.append(stock_code)

    pool = Pool(10)
    func = partial(self.update_single_code)
    if export.lower() in ['csv']:
        pool.map(func, stock_codes)

def get_correlation_parallel(s1, s2):
    """
    params s1 - series 1
    params s2 - series 2
    NOTE: series are numbered 1 to 25 when given as arguments
    returns the correlation between the two series
    """
    start = time.time()
    offsets = []  # this will be the arguments to all the parallel jobs
    instances = (MAX_ROWS / BATCH_SIZE)
    mean, std = calculate_mean_std_parallel()
    stripped_mean, stripped_std = calculate_stripped_mean_std_parallel(mean, std)
    processes = Pool(processes=instances)
    for i in range(instances):
        offsets.append((s1, s2, mean, std, stripped_mean, stripped_std, i * BATCH_SIZE))
    results = processes.map(get_correlation, offsets)
    processes.close()
    processes.join()

    pearson_corr = 0
    total = 0
    for result in results:
        pearson_corr += result[0] * result[1]
        total += result[1]
    pearson_corr = 1.0 * pearson_corr / total

    t_value = abs(pearson_corr * math.sqrt(1.0 * (total - 2) / (1 - (pearson_corr * pearson_corr))))
    p_value = t.sf(t_value, total - 2)
    print "\n ######### CORRELATION BETWEEN SERIES ", s1, " AND SERIES ", s2, " is ", pearson_corr, "t value is ", t_value, " and p value is ", p_value, "######### \n"
    end = time.time()
    print "EXECUTION TIME : ", end - start, " sec"
    return pearson_corr

class Runner(DatabaseRunner):
    """ Class for running algorithms against test images """

    def __init__(self, algorithm, domain, arguments=None):
        """ The domain dictates which images to use as sources. The limit is an
            optional maximum number of images to use as sources. If random,
            images will be pulled in random order, up to limit; otherwise,
            images will be pulled in sequential order. Tags are used to control
            the image selection further.
        """
        DatabaseRunner.__init__(self, algorithm, arguments)
        self._domain = domain
        self._image_id = None
        if kMaxWorkers > 1:
            self._pool = Pool(int(config.get('global', 'max_workers')))

    def run(self):
        """ Runs the algorithm on the images matching the supplied arguments """
        self._logger.debug('Fetching image IDs from database')
        if self._arguments.image_id:
            self._image_id = self._arguments.image_id
            images = (self._database_mapper.get_image_for_analysis(self._domain, self._image_id), )
        else:
            images = self._database_mapper.get_images_for_analysis(self._domain,
                                                                   self._arguments.limit,
                                                                   self._arguments.random,
                                                                   self._arguments.tags_require,
                                                                   self._arguments.tags_exclude)
        self._logger.debug('Processing {0} images'.format(len(images)))
        if kMaxWorkers > 1:
            return self.evaluate(self._pool.map(self._algorithm.apply, images))
        else:
            return self.evaluate(map(self._algorithm.apply, images))

def _get_data(data_folder="data/crcm4_data", v_name="pcp",
              member_list=None, year_range=None, months=None):
    """
    returns seasonal means of each year for all members in the list
    Note!: uses caching
    """
    year_range = list(year_range)
    cache_file = "_".join(member_list) + "_" + "_".join(map(str, months)) + \
                 "_{0}_from_{1}_to_{2}_cache.bin".format(v_name, year_range[0], year_range[-1])

    if os.path.isfile(cache_file):
        return pickle.load(open(cache_file))

    p = Pool(processes=len(member_list))

    # prepare input for the parallel processes
    m_folders = map(lambda x: os.path.join(data_folder, "{0}_p1{1}".format(x, v_name)), member_list)
    year_ranges = [year_range] * len(member_list)
    months_for_p = [months] * len(member_list)

    # calculate means
    result = p.map(_get_annual_means_for_year_range_p, zip(m_folders, year_ranges, months_for_p))
    result = np.concatenate(result, axis=0)  # shape = (n_members * len(year_range)) x nx x ny
    print result.shape
    pickle.dump(result, open(cache_file, "w"))
    return result

def check(self, artdict):
    print("Checking for infobox existence")
    pool = Pool(processes=100)
    revs = []
    for a in artdict:
        rev = artdict[a]["Revision"].split('oldid=')[1].strip()
        revs.append((a, rev))
    texts = dict(pool.map(self.get_text, revs))
    for a in artdict:
        text = texts[a]
        if text is None:
            artdict[a]["MultiInfobox"] = 0
            artdict[a]["Infobox programming language"] = -1
            artdict[a]["Infobox software"] = -1
            artdict[a]["Infobox file format"] = -1
        else:
            if 'infobox programming language' in text.lower():
                artdict[a]["Infobox programming language"] = text.lower().index('infobox programming language')
            else:
                artdict[a]["Infobox programming language"] = -1
            if 'infobox software' in text.lower():
                artdict[a]["Infobox software"] = text.lower().index('infobox software')
            else:
                artdict[a]["Infobox software"] = -1
            if 'infobox file format' in text.lower():
                artdict[a]["Infobox file format"] = text.lower().index('infobox file format')
            else:
                artdict[a]["Infobox file format"] = -1
            artdict[a]["MultiInfobox"] = text.lower().count("{{infobox")
    return artdict

def sum_lines(self, SPEEDUP=True):
    filesname = []
    for item_dir in self.dirlist.keys():
        for item_file in self.dirlist[item_dir][1]:
            filesname.append(item_dir + '/' + item_file)

    if SPEEDUP:
        # when the python version is less than 3.3, multiprocessing.pool.Pool
        # doesn't support the context management protocol
        if sys.version_info.major == 3 and sys.version_info.minor >= 3:
            with Pool(self.MAX_RES) as res_pool:
                return reduce(self._adder, res_pool.map(self._count_filelines, filesname))
        else:
            # in python2.x (maybe python3.[0-2]),
            # multiprocessing must pickle things to sling them among processes,
            # and bound methods are not picklable.
            # the workaround (whether you consider it "easy" or not ;-) is to
            # add the infrastructure to your program to allow such methods to be pickled,
            # registering it with the copy_reg standard library method.
            # the following workaround makes it work in python2.x:
            # use a module-level function instead of the bound method.
            res_pool = Pool(processes=self.MAX_RES)
            retval = res_pool.map(_filecounter, filesname)
            return reduce(self._adder, retval)
    else:
        for filename in filesname:
            with open(filename, 'rb') as filebuf:
                self.filesline += len(filebuf.readlines())
        return self.filesline

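# A hedged sketch, not from the original source: the module-level helper
# `_filecounter` referenced above would have to look roughly like this so that
# it can be pickled and shipped to worker processes under Python 2.x. It
# mirrors the sequential branch of sum_lines().
def _filecounter(filename):
    # count the lines of a single file
    with open(filename, 'rb') as filebuf:
        return len(filebuf.readlines())
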
def parallel_main():
    recs = sys.stdin.readlines()
    vals = [int(rec) for rec in recs]
    p = Pool()
    results = p.map(solve, vals)
    for v1, v2 in results:
        print("{} {}".format(v1, v2))

def test_stemming():
    with open("tests.txt") as file:
        pool = Pool(4)
        results = pool.map(validate, file)
        for result in results:
            if result:
                yield assert_output, result[0], result[1]

def parallel_main():
    recs = iter(sys.stdin.readlines())
    cuts_list = []
    cuts_list_append = cuts_list.append
    cuts = []
    cuts_extend = cuts.extend
    cuts_append = cuts.append
    cuts_clear = cuts.clear

    while True:
        # length of stick
        L = int(next(recs))
        if L == 0:
            break
        # number of cuts
        n_cut = int(next(recs))
        # cutting points
        cuts_clear()
        cuts_append(0)
        cuts_extend(list(map(int, next(recs).split())))
        cuts_append(L)
        cuts_list_append(cuts[:])

    p = Pool(4)
    results = p.map(min_cut, cuts_list)
    for res in results:
        print(res)

def get_urls1():
    f2 = open('app_links.txt', 'r')
    nprocs = 100  # nprocs is the number of processes to run
    ParsePool = Pool(nprocs)
    # ParsePool.map(btl_test,url)
    ParsedURLS = ParsePool.map(urlsDeatilsExtract, f2)

gain_above_dict = {float(key): value for key, value in gain_above_dict.iteritems()}
gain_below_dict = {float(key): value for key, value in gain_below_dict.iteritems()}

# This configuration of the multiprocessing call is necessary for passing multiple arguments to the main function
# It is based on the example here: http://spencerimp.blogspot.com/2015/12/python-multiprocess-with-multiple.html
# Ran with 16 processors on r4.16xlarge
num_of_processes = 16
pool = Pool(num_of_processes)
pool.map(partial(annual_gain_rate_mangrove.annual_gain_rate,
                 gain_above_dict=gain_above_dict,
                 gain_below_dict=gain_below_dict),
         mangrove_ecozone_list)
pool.close()
pool.join()

# # For single processor use
# for tile in mangrove_ecozone_list:
#
#     annual_gain_rate_mangrove.annual_gain_rate(tile, gain_table_dict)

print "Tiles processed. Uploading to s3 now..."
uu.upload_final_set(cn.annual_gain_AGB_mangrove_dir, cn.pattern_annual_gain_AGB_mangrove)
uu.upload_final_set(cn.annual_gain_BGB_mangrove_dir, cn.pattern_annual_gain_BGB_mangrove)

def ensemble(training_output_folder1, training_output_folder2, output_folder, task, validation_folder, folds):
    print("\nEnsembling folders\n", training_output_folder1, "\n", training_output_folder2)

    output_folder_base = output_folder
    output_folder = join(output_folder_base, "ensembled_raw")

    # only_keep_largest_connected_component is the same for all stages
    dataset_directory = join(preprocessing_output_dir, task)
    plans = load_pickle(join(training_output_folder1, "plans.pkl"))  # we need this only for the labels

    files1 = []
    files2 = []
    property_files = []
    out_files = []
    gt_segmentations = []

    folder_with_gt_segs = join(dataset_directory, "gt_segmentations")

    for f in folds:
        validation_folder_net1 = join(training_output_folder1, "fold_%d" % f, validation_folder)
        validation_folder_net2 = join(training_output_folder2, "fold_%d" % f, validation_folder)
        patient_identifiers1 = subfiles(validation_folder_net1, False, None, 'npz', True)
        patient_identifiers2 = subfiles(validation_folder_net2, False, None, 'npz', True)

        # we don't do postprocessing anymore so there should not be any of that noPostProcess
        patient_identifiers1_nii = [
            i for i in subfiles(validation_folder_net1, False, None, suffix='nii.gz', sort=True)
            if not i.endswith("noPostProcess.nii.gz") and not i.endswith('_postprocessed.nii.gz')
        ]
        patient_identifiers2_nii = [
            i for i in subfiles(validation_folder_net2, False, None, suffix='nii.gz', sort=True)
            if not i.endswith("noPostProcess.nii.gz") and not i.endswith('_postprocessed.nii.gz')
        ]
        assert len(patient_identifiers1) == len(patient_identifiers1_nii), "npz seem to be missing. run validation with --npz"
        assert len(patient_identifiers2) == len(patient_identifiers2_nii), "npz seem to be missing. run validation with --npz"
        assert all([i[:-4] == j[:-7] for i, j in zip(patient_identifiers1, patient_identifiers1_nii)]), "npz seem to be missing. run validation with --npz"
        assert all([i[:-4] == j[:-7] for i, j in zip(patient_identifiers2, patient_identifiers2_nii)]), "npz seem to be missing. run validation with --npz"

        all_patient_identifiers = patient_identifiers1
        for p in patient_identifiers2:
            if p not in all_patient_identifiers:
                all_patient_identifiers.append(p)

        # assert these patients exist for both methods
        assert all([isfile(join(validation_folder_net1, i)) for i in all_patient_identifiers])
        assert all([isfile(join(validation_folder_net2, i)) for i in all_patient_identifiers])

        maybe_mkdir_p(output_folder)

        for p in all_patient_identifiers:
            files1.append(join(validation_folder_net1, p))
            files2.append(join(validation_folder_net2, p))
            property_files.append(join(validation_folder_net1, p)[:-3] + "pkl")
            out_files.append(join(output_folder, p[:-4] + ".nii.gz"))
            gt_segmentations.append(join(folder_with_gt_segs, p[:-4] + ".nii.gz"))

    p = Pool(default_num_threads)
    p.map(merge, zip(files1, files2, property_files, out_files))
    p.close()
    p.join()

    if not isfile(join(output_folder, "summary.json")) and len(out_files) > 0:
        aggregate_scores(tuple(zip(out_files, gt_segmentations)),
                         labels=plans['all_classes'],
                         json_output_file=join(output_folder, "summary.json"),
                         json_task=task,
                         json_name=task + "__" + output_folder_base.split("/")[-1],
                         num_threads=default_num_threads)

    if not isfile(join(output_folder_base, "postprocessing.json")):
        determine_postprocessing(output_folder_base, folder_with_gt_segs,
                                 "ensembled_raw", "temp",
                                 "ensembled_postprocessed",
                                 default_num_threads, dice_threshold=0)

        out_dir_all_json = join(network_training_output_dir, "summary_jsons")
        json_out = load_json(join(output_folder_base, "ensembled_postprocessed", "summary.json"))

        json_out["experiment_name"] = output_folder_base.split("/")[-1]
        save_json(json_out, join(output_folder_base, "ensembled_postprocessed", "summary.json"))

        maybe_mkdir_p(out_dir_all_json)
        shutil.copy(
            join(output_folder_base, "ensembled_postprocessed", "summary.json"),
            join(out_dir_all_json, "%s__%s.json" % (task, output_folder_base.split("/")[-1])))

    pages = soups.find('div', attrs={'class': 'pagenavi'}).find_all('span')[-2].getText()
    dirname = u'[{}P] {}'.format(int(pages), title)
    if not os.path.exists(dirname):
        os.mkdir(dirname)
    for page in range(1, int(pages) + 1):
        each_pic = detail_link + '/' + str(page)
        picture = requests.get(each_pic, headers=headers).content
        pic_html = BeautifulSoup(picture, 'lxml')
        img = pic_html.find('div', attrs={'class': 'main-image'}).find('img')['src']
        filename = '%s/%s/%s.jpg' % (os.path.abspath('.'), dirname, n)
        print(u'Downloading image: %s, picture %s' % (dirname, n))
        try:
            with open(filename, 'wb+') as jpg:
                jpg.write(requests.get(img, headers=headers).content)
            n += 1
            time.sleep(1)
        except:
            pass


if __name__ == '__main__':
    pool = Pool(10)
    page = [x for x in range(1, 154)]
    pool.map(get_pic, page)
    pool.close()
    pool.join()

    except Exception:
        print('Failed to save to MongoDB', result)


# get the URLs of all detail pages for a given brand
def get_all_urls(brand):
    detail_urls = []
    total, urls = search(brand)
    total = int(total)
    detail_urls.extend(urls)
    for i in range(2, 2 + 1):
        result = next_page(i)
        detail_urls.extend(result)
        # break
    return detail_urls


def main(brand):
    urls = get_all_urls(brand)
    for url in set(urls):
        url = 'http:' + url
        get_products(url)


if __name__ == '__main__':
    brands = []
    pool = Pool(processes=3)  # set the number of processes in the pool
    pool.map(main, brands)    # map each brand in the list to the worker function
    pool.close()              # close the pool; join() waits for the workers to finish
    pool.join()
    # main()

def parallel_pair_e_it(self, chunk_it):
    p = Pool(4)
    for chunk in chunk_it:
        for pair_e in p.map(self.mod_pairs, chunk):
            yield pair_e
    p.close()

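# A minimal sketch, not part of the original source, of how a chunk iterator
# for parallel_pair_e_it might be built: it slices any iterable into lists of
# `chunk_size` items so each Pool.map call receives a bounded amount of work.
# The name `chunked` is hypothetical.
from itertools import islice

def chunked(iterable, chunk_size=1000):
    it = iter(iterable)
    while True:
        chunk = list(islice(it, chunk_size))
        if not chunk:
            break
        yield chunk
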
        json_latest = self.get_latest_page(offset)
        print(len(json_latest))
        json_byte = self.get_byte_page(offset)
        latest_records = self.get_futurism_infos(json_latest)
        self.save_info_Mysql(latest_records)
        # print(latest_records)
        print('==========' * 10)
        byte_records = self.get_futurism_infos(json_byte)
        self.save_info_Mysql(byte_records)
        # print(byte_records)

    def run(self, offset):
        self.start(offset)


# download in parallel with a process pool
if __name__ == '__main__':
    # main(1)
    fs = Futurism_Spider()
    # fs.run(1)
    pool = Pool()
    groups = ([x * fs.OFF_SET for x in range(fs.GROUP_START, fs.GROUP_END + 1)])
    pool.map(fs.run, groups)
    pool.close()
    pool.join()

    luckey_count = 0
    for i in range(*interval):
        if sum_ok(i, int(n / 2)):
            luckey_count += 1
    return luckey_count


if __name__ == "__main__":
    import time
    from multiprocessing.pool import Pool

    parts = 50
    workers = 8

    # start of calculation
    start = time.time()

    part = 10**n / parts
    incremet_sum = 0
    intervals = []
    for i in range(parts):
        intervals.append((incremet_sum, incremet_sum + part))
        incremet_sum += part

    pool = Pool(workers)
    luckey_count = pool.map(count_on_interval, intervals)

    print luckey_count
    print 'Entire job took:', time.time() - start, 'probability:', (1.0 * sum(luckey_count)) / 10**n

def run_cv(f, n_proc):
    p = Pool(n_proc)
    p.map(f, range(len(configs)))
    p.close()  # no more tasks
    p.join()   # wrap up current tasks

def check_words(self, is_sorted=False, is_reversed=False):
    # Split the long string into separate strings, and make some IDs.
    words = list([w for w in LONG_TEXT[:100].split(' ') if w])

    print("Adding words: {}".format(words))

    # Avoid adding the same string twice (or a prefix of a previous string).
    #  - because it's a current problem unless we append string IDs, which makes things too slow
    # words = set(words)
    # words = [w for w in words if 0 != sum([x.startswith(w) for x in words if x != w])]

    assert words

    # Make a string ID for each string.
    strings = {}
    for string in words:
        string_id = uuid.uuid4().hex
        strings[string_id] = string

    # Create a new suffix tree.
    self.app = SuffixTreeApplicationWithCassandra()
    st = self.app.register_new_suffix_tree()
    assert st.id in self.app.suffix_tree_repo

    # Close the app, so the pool doesn't inherit it.
    self.app.close()

    # Start the pool.
    pool = Pool(initializer=pool_initializer, processes=1)

    words = [[s, sid, st.id] for sid, s in strings.items() if s]
    if is_sorted:
        words = sorted(words)
    if is_reversed:
        words = reversed(words)
    results = pool.map(add_string_to_suffix_tree, words)
    for result in results:
        if isinstance(result, Exception):
            print(result.args[0][1])
            raise result

    # Create the app again.
    self.app = SuffixTreeApplicationWithCassandra()

    errors = []

    # Check the suffix tree returns string ID for all substrings of string.
    for string_id, string in strings.items():
        # Check all prefixes and suffixes.
        substrings = sorted(list(get_all_substrings(string)))
        print("")
        print("Checking for all substrings of string '{}': {}".format(
            repr(string), " ".join([repr(s) for s in substrings])))
        for substring in substrings:
            results = self.app.find_string_ids(substring, st.id)
            if string_id not in results:
                msg = "Not found: substring '{}' from string '{}'".format(
                    repr(substring), repr(string))
                print(msg)
                errors.append(msg)

    # Check for errors.
    self.assertFalse(errors, "\n".join(errors))

        dst_image_path = os.path.join('./data/image', name)
        dst_mask_path = os.path.join('./data/mask', name)
        try:
            img = imread(image_path)
            img, mask = preprocess(img)
            img = cv.resize(img, dsize)
            mask = cv.resize(mask, dsize)
            imwrite(dst_image_path, img)
            imwrite(dst_mask_path, mask)
        except:
            print(image_path)
            continue


if __name__ == "__main__":
    image_list = glob.glob(os.path.join('./data/sample', '*.jpeg'))
    patches = 16
    patch_len = int(len(image_list) / patches)
    filesPatchList = []
    for i in range(patches - 1):
        fileList = image_list[i * patch_len:(i + 1) * patch_len]
        filesPatchList.append(fileList)
    filesPatchList.append(image_list[(patches - 1) * patch_len:])

    # multiple processes
    pool = Pool(patches)
    pool.map(process, filesPatchList)
    pool.close()

            with open('{}\\{}.jpg'.format(save_path, pic_name), 'ab+') as pic_write:
                pic_write.write(resp.content)
    except requests.ConnectionError:
        print("{}\n{} could not be fetched, saving the image failed!".format(file_title, url))


def main(offset):
    json = get_json_data(offset)
    for info in get_pic_url(json):
        if info is not None:
            save_pic(info['title'], info['url'])
        else:
            print('offset={}, all street-snap images have been fetched'.format(offset))
            break


if __name__ == '__main__':
    start_time = time.time()
    INIT_PAGE = 50
    offset_list = ([x * 20 for x in range(0, INIT_PAGE)])
    pool = Pool()
    pool.map(main, offset_list)
    pool.close()
    pool.join()
    used_time = time.time() - start_time
    print("Images saved to:", os.getcwd())
    print('Time taken:', used_time)
    # without multiprocessing: 181.3823745250702
    # with multiprocessing: 59.359395027160645

        detail = dl.xpath('.//text()')
        detail = str(''.join(detail)).replace('\xa0', '').strip()
        infos.append(detail)
        # print(detail)
    save('\n'.join(infos))


def get_pages(url):
    # home page
    response = requests.get(url, headers=headers)
    # print(response.text)
    selector = etree.HTML(response.text)
    items = selector.xpath('//div[@class="city_spots_list"]/ul//li')
    for item in items:
        # get the detail page url
        href = item.xpath('./a/@href')[0]
        # print(href)
        res = get_detail(href)
        paser_pages(res)


if __name__ == '__main__':
    # crawl in parallel
    page_href = [
        'https://yancheng.cncn.com/jingdian/1-{}-0-0.html'.format(str(i))
        for i in range(1, 6)
    ]
    pool = Pool()
    result = pool.map(get_pages, page_href)
    pool.close()
    pool.join()

print(f"downloading {centre.upper()} {var_name} hindcasts for year {year}") c.retrieve( 'seasonal-monthly-single-levels', { 'originating_centre':centre, 'system':system, 'variable':var_dict[var_name], 'product_type':'monthly_mean', 'year':str(year), 'month':[ '01','02','03', '04','05','06', '07','08','09', '10','11','12' ], 'leadtime_month':[ '2','3','4', '5','6' ], 'format':'grib' }, f'{str(dpath)}/{centre.upper()}_system_{system}_{var_name}_{year}.grib') # initialise a Pool with the p = Pool(workers) p.map(fetch_hindcast, list(range(workers)))
    site = 'http://www.mmjpg.com/mm/'
    mm_url = site + str(num)
    print(mm_url)
    count = get_mmurl_count(mm_url)
    print(count)
    title = get_mmurl_title(mm_url)
    print(title)
    mmurl_t = get_mmurl_t(mm_url)
    # print(mmurl_t)
    path = 'D:\爬虫\图片下载\\' + title
    if os.path.isdir(path):
        pass
    else:
        os.mkdir(path)
    for i in range(1, count + 1):
        imgs = mmurl_t + str(i) + '.jpg'
        # print(imgs)
        with open(path + '\\' + str((1000 + i)) + '.jpg', 'wb') as f:
            f.write(requests.get(imgs, headers=header2).content)


if __name__ == '__main__':
    s = time.time()
    # for i in range(1242,1256):
    #     main(i)
    # main(1242,1256)
    pool = Pool()
    pool.map(main, [i for i in range(1242, 1256)])
    e = time.time()
    print(e - s)

def processResultPath(new_base, resultAndPath):
    result, path = resultAndPath
    mean, stderr, count = result

    # sampled_mean = windowAverage(mean)
    # sampled_stderr = windowAverage(stderr)

    sampled_mean = everyN(mean, EVERY)
    sampled_stderr = everyN(stderr, EVERY)

    sampled = [sampled_mean, sampled_stderr, count]

    new_path = new_base + '/' + rest(path)
    os.makedirs(up(new_path), exist_ok=True)
    np.save(new_path, sampled)


if __name__ == '__main__':
    pool = Pool()

    new_base = sys.argv[1]
    exp_paths = sys.argv[2:]
    for exp_path in exp_paths:
        exp = loadExperiment(exp_path)
        result_paths = listResultsPaths(exp)
        results = map(loadResults, result_paths)
        pool.map(partial(processResultPath, new_base), results)

        os.makedirs('result\\' + item.get('title'))
    try:
        response = requests.get(item.get('image_url'))
        if response.status_code == 200:
            file_path = 'result\\{0}/{1}.{2}'.format(item.get('title'),
                                                     md5(response.content).hexdigest(), 'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(response.content)
            else:
                print('Already Downloaded', file_path)
    except requests.ConnectionError:
        print('Failed to save image!')


from multiprocessing.pool import Pool


def main(offset):
    json = get_one_page(offset)
    for item in get_images(json):
        print(item)
        save_image(item)


GROUP_START = 1
GROUP_END = 5

if __name__ == '__main__':
    pool = Pool()
    groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
    pool.map(main, groups)
    pool.close()
    pool.join()

    doc = pq(html)
    items = doc('.excerpt-c5').items()
    for item in items:
        title = item.find('h2').text()
        photo_url = item.find('.thumbnail > img').attr('data-src')
        photo_page_url = item.find('h2 > a').attr('href')
        print(title, photo_url)


if __name__ == '__main__':
    start_time = time.time()
    num_cpus = multiprocessing.cpu_count()
    print('Number of processes to launch:', num_cpus)
    pool = Pool(num_cpus)
    url_list = []
    for i in range(1, 10):
        url = 'https://www.lovemmtu.net/page/{}'.format(i)
        url_list.append(url)
        # spider(url)
    # print(get_text('https://www.lovemmtu.net/page/4'))
    pool.map(spider, url_list)
    pool.close()
    pool.join()
    end_time = time.time()
    print('Took {}s'.format((end_time - start_time)))

flag1 = 0
para_space = []
while flag1 <= S_0.shape[0] - 1:
    flag2 = 0
    while flag2 <= S_index.shape[0] - 1:
        flag3 = 0
        while flag3 <= B_0.shape[0] - 1:
            flag4 = 0
            while flag4 <= B_index.shape[0] - 1:
                flag5 = 0
                while flag5 <= M_vir_0.shape[0] - 1:
                    para_arr = np.array([
                        S_0[flag1], S_index[flag2], B_0[flag3],
                        B_index[flag4], M_vir_0[flag5]
                    ])
                    para_space.append(para_arr)
                    flag5 += 1
                flag4 += 1
            flag3 += 1
        flag2 += 1
    flag1 += 1

Time2 = time.time()
print("Sequential execution time:", Time2 - Time1)

pool = Pool(50)
hh = pool.map(Func, para_space)
pool.close()
pool.join()
print(hh)

Time3 = time.time()
print("Parallel execution time:", Time3 - Time2)
print("The total time:", Time3 - Time1)

def add_travel_time_dir(graph_dir, mask_dir, conv_dict, graph_dir_out,
                        min_z=128, dx=4, dy=4, percentile=90,
                        max_speed_band=-2, use_weighted_mean=True,
                        variable_edge_speed=False, mask_prefix='',
                        save_shapefiles=True, n_threads=12, verbose=False):
    '''Update graph properties to include travel time for entire directory'''
    t0 = time.time()
    pickle_protocol = 4  # 4 is most recent, python 2.7 can't read 4

    logger1.info("Updating graph properties to include travel time")
    logger1.info("  Writing to: " + str(graph_dir_out))
    os.makedirs(graph_dir_out, exist_ok=True)

    image_names = sorted([z for z in os.listdir(mask_dir) if z.endswith('.tif')])
    nfiles = len(image_names)
    n_threads = min(n_threads, nfiles)

    params = []
    for i, image_name in enumerate(image_names):
        im_root = image_name.split('.')[0]
        if len(mask_prefix) > 0:
            im_root = im_root.split(mask_prefix)[-1]
        out_file = os.path.join(graph_dir_out, im_root + '.gpickle')

        if (i % 1) == 0:
            logger1.info("\n" + str(i + 1) + " / " + str(len(image_names))
                         + " " + image_name + " " + im_root)
        mask_path = os.path.join(mask_dir, image_name)
        graph_path = os.path.join(graph_dir, im_root + '.gpickle')

        if not os.path.exists(graph_path):
            # print("  ", str(i), "DNE, skipping: " + str(graph_path))
            logger1.info("  " + str(i) + " DNE, skipping: " + str(graph_path))
            # return
            continue

        if verbose:
            logger1.info("mask_path: " + mask_path)
            logger1.info("graph_path: " + graph_path)

        mask = skimage.io.imread(mask_path)
        G_raw = nx.read_gpickle(graph_path)

        # see if it's empty
        if len(G_raw.nodes()) == 0:
            nx.write_gpickle(G_raw, out_file, protocol=pickle_protocol)
            continue

        params.append((G_raw, mask, conv_dict, min_z, dx, dy,
                       percentile,
                       max_speed_band, use_weighted_mean,
                       variable_edge_speed,
                       verbose,
                       out_file, save_shapefiles, im_root, graph_dir_out))

    # execute
    if n_threads > 1:
        pool = Pool(n_threads)
        pool.map(infer_travel_time, params)
    else:
        infer_travel_time(params[0])

    tf = time.time()
    print("Time to infer speed:", tf - t0, "seconds")
    return

        author = item.find('.author')
        showContent = item.find('.show-content')
        content = str(htmlTitle + author + showContent)
        save_to_cvs(title, content)
        return None
    except RequestException:
        print('Pagination request failed', link)
        return parse_page(link)
    except TimeoutError:
        return parse_page(link)


def save_to_cvs(title, content):
    file_path = '{0}/{1}.{2}'.format('D:\Text1', title, 'html')
    if not os.path.exists(file_path):
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
            print(title, '- download finished')


def main(i):
    pages = get_links(str(i))
    for page in pages:
        parse_page(page)


if __name__ == '__main__':
    total = int(1416 / 9 + 2)
    pool = Pool()
    pool.map(main, [i for i in range(total)])

        for i in range(len(header)):
            d[header[i]] = data[i]
        d['cn_name'] = sohu_seeds_collect.get_stock_info(code=code)
        d['code'] = code
        data_list.append(SoHuStock.create(**d, should_save=False))

    logger.info('BATCH DONE')
    return data_list


if __name__ == '__main__':
    batches = []
    batch_size = 100
    batch = []
    codes = sohu_seeds_collect.data().keys()
    for code in codes:
        batch.append(code)
        if len(batch) == batch_size:
            batches.append(batch)
            batch = []

    p = Pool(32)
    data_list = p.map(task, batches)

    all = []
    for data in data_list:
        all.extend(data)

    logger.info('Start writing {} data to disk ...'.format(len(all)))
    SoHuStock.save_many(_deduplicate(all))
    logger.info('DONE')

parser = argparse.ArgumentParser()
parser.add_argument(
    "--graph_dir", type=str,
    help="input graph dir",
    default="data/graphs/vecroad_4/graphs_junc/"
)
parser.add_argument(
    "--save_dir", type=str,
    help="save wkt dir",
    default="data/graphs/vecroad_4/graphs_junc_wkt/"
)
args = parser.parse_args()

os.makedirs(args.save_dir, exist_ok=True)


def worker(f):
    print(f)
    name = f.split('.')[0]
    g = graph_helper.read_graph(os.path.join(args.graph_dir, f))
    g = g.clear_self()
    wkt = g.convert_rs_to_wkt()
    all_data = []
    for linestring in wkt:
        all_data.append(("AOI_0_{}_img0".format(name), linestring))
    df = pd.DataFrame(all_data, columns=['ImageId', 'WKT_Pix'])
    df.to_csv(os.path.join(args.save_dir, name + '.csv'), index=False)


files = os.listdir(args.graph_dir)
pool = Pool()
pool.map(worker, files)
pool.close()
pool.join()

def parallel_score_it(chunk_it, score_f, ncpus=2):
    p = Pool(ncpus)
    for chunk in chunk_it:
        for score in p.map(score_f, chunk):
            yield score
    p.close()

def daily_photo(request):
    # crawler
    class Glob:
        def __init__(self):
            self.i = 1

    GL = Glob()

    headers = {
        'Host': 'wall.alphacoders.com',
        'Referer': 'https://wall.alphacoders.com/by_favorites.php?quickload=807801&page=1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }

    def get_page(page):
        params = {
            # 'search': 'fantasy+tiger',
            'quickload': '807801',
            'page': page,
        }
        base_url = 'https://wall.alphacoders.com/featured.php?'
        url = base_url + urlencode(params)
        try:
            resp = requests.get(url, headers=headers)
            print(url)
            # print('ok')
            if resp.status_code == 200:
                return resp.text
        except requests.ConnectionError:
            print('no1')
            return None

    def get_imageurl(html):
        result = re.findall('[a-zA-z]+://images+[^\s]+jpg', html)
        return result

    def main(page):
        GL.i = 0
        html = get_page(page)
        result = get_imageurl(html)
        file_name = 'D:\\Project\\Myblog\\mysite\\blog\\static\\img\\' + os.path.sep + str(page) + '页'
        if not os.path.exists(file_name):
            os.makedirs(file_name)
        # if not os.path.exists(file_path):
        #     os.makedirs(file_path)
        for url in result:
            with open(file_name + os.path.sep + str(page) + '-' + str(GL.i) + '.jpg', 'wb') as f:
                content = re.sub('-+[3-4]\d{2}-', '-1920-', url)
                print(content)
                image = requests.get(content)
                f.write(image.content)
                # time.sleep(1)
            GL.i += 1

    if __name__ == '__main__':
        pool = Pool()
        Scope = ([i for i in range(1, 31)])
        pool.map(main, Scope)
        pool.close()
        pool.join()

    return render(request, 'daily_photo.html')

    global s_kp, s_des, pw_kp, pw_des
    matches = []
    print('s_idx', s_idx)
    matches = [match(s_kp[s_idx], s_des[s_idx], pw_kp[pw_idx], pw_des[pw_idx])
               for pw_idx in range(5)]


def test():
    s_idx = 0
    return [match_v3(s_kp[s_idx], s_des[s_idx], pw_kp[pw_idx], pw_des[pw_idx])
            for pw_idx in range(pw_kp.shape[0])]


def perform_match_v3(s_idx):
    global s_kp, s_des, pw_kp, pw_des
    print('s_idx', s_idx)
    # matches = (match(s_kp[s_idx], s_des[s_idx], pw_kp[pw_idx], pw_des[pw_idx]) for pw_idx in range(5))
    # matches = (match_v2(s_kp[s_idx], s_des[s_idx], pw_kp[pw_idx], pw_des[pw_idx]) for pw_idx in range(5))
    matches = (match_v3(s_kp[s_idx], s_des[s_idx], pw_kp[pw_idx], pw_des[pw_idx])
               for pw_idx in range(pw_kp.shape[0]))
    # np.savez_compressed(str(s_idx) + '-M', m=np.asarray(matches2))


if __name__ == '__main__':
    print('Begin pool work')
    pool = Pool()
    # s_idx = range(2)
    s_idx = range(s_kp.shape[0])

    time_start = timer()
    # pool.map(perform_match, s_idx)
    # pool.map(perform_match_v2, s_idx)
    pool.map(perform_match_v3, s_idx)
    time_end = timer()

    pool.close()
    pool.join()

    duration = time_end - time_start
    print("Program took %.3fs" % duration)

def write_to_elastic(all_images):
    p = Pool(processes=10)
    result = p.map(store_image, all_images)
    p.close()
    p.join()

        if not os.path.exists(file):
            with open(file, 'wb') as f:
                f.write(response.content)
        else:
            return ('Already Download', file)
    except requests.ConnectionError:
        print('Failed to save image')


def main(offset):
    json = get_page(offset)
    for item in get_image(json):
        save_image(item)
    return 'Page {} Save Successful'.format(offset / 20)


if __name__ == '__main__':
    print('Program Start')
    # offset = 10
    # main(offset)
    GROUP_START = 1
    GROUP_END = 5
    pool = Pool()
    groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
    results = pool.map(main, groups)
    for result in results:
        print(result)
    pool.close()
    pool.join()
    print('Program End')

def _read_obs(self, stns_ids=None):
    # Saw extreme decreased performance due to garbage collection when
    # pandas ran checks for a chained assignment. Turn off this check
    # temporarily.
    opt_val = pd.get_option('mode.chained_assignment')
    pd.set_option('mode.chained_assignment', None)

    try:
        if stns_ids is None:
            stns_obs = self.stns
        else:
            stns_obs = self.stns.loc[stns_ids]

        nstns = len(stns_obs.station_id)
        nprocs = self.nprocs if nstns >= self.nprocs else nstns

        if self.has_start_end_dates:
            start_date = self.start_date
            end_date = self.end_date
        else:
            start_date = None
            end_date = None

        iter_stns = [(row[1], start_date, end_date, self.elems,
                      self.min_hrly_for_dly) for row in stns_obs.iterrows()]

        if nprocs > 1:
            # http://stackoverflow.com/questions/24171725/
            # scikit-learn-multicore-attributeerror-stdin-instance-
            # has-no-attribute-close
            if not hasattr(sys.stdin, 'close'):
                def dummy_close():
                    pass
                sys.stdin.close = dummy_close

            pool = Pool(processes=nprocs, initializer=_init_worker,
                        initargs=[_download_obs])
            obs_all = pool.map(_download_obs, iter_stns, chunksize=1)
            pool.close()
            pool.join()
        else:
            obs_all = []
            _init_worker(_download_obs)

            for a_stn in iter_stns:
                obs_stn = _download_obs(a_stn)
                obs_all.append(obs_stn)

            _download_obs.ftp.close()

        try:
            obs_all = pd.concat(obs_all, ignore_index=True)
        except ValueError:
            # No valid observations
            obs_all = pd.DataFrame({
                'station_id': [],
                'elem': [],
                'time': [],
                'obs_value': []
            })
    finally:
        pd.set_option('mode.chained_assignment', opt_val)

    obs_all = obs_all.set_index(['station_id', 'elem', 'time'])
    obs_all = obs_all.sortlevel(0, sort_remaining=True)

    return obs_all

                                              md5(r.content).hexdigest(), 'jpg')
        # this is the file path itself, not a directory to create
        if not os.path.exists(file_path):
            with open(file_path, 'wb') as f:
                f.write(r.content)
        else:
            print('Already Download', file_path)
    except requests.ConnectionError as e:
        print('Failed to download', e)


def main(offset):
    json = get_page(offset)
    if json == None:
        return None
    for item in get_image(json):  # generator
        save_images(item)


# crawl 20 pages of content
GROUP_START = 1
GROUP_END = 20

if __name__ == '__main__':
    # use a process pool for fast crawling
    pool = Pool()
    groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])  # the argument is a tuple
    pool.map(main, groups)  # add the jobs to the process pool
    pool.close()
    pool.join()
    print('all download!')

    for i in range(2, int(num**0.5 + 1)):
        if num % i == 0:
            return False
    return True


def is_concatenate(num1, num2):
    if is_prime(int(str(num1) + str(num2))) and is_prime(int(str(num2) + str(num1))):
        return 1
    return 0


p = Pool(processes=16)
num_range = range(2, 10**6)
prime_list = p.map(cal_prime, num_range)
p.close()
p.join()

prime_list_clear = [x for x in prime_list if x is not None]
print(prime_list_clear)

check_list = []
for i in prime_list_clear:
    print(i)
    check_list.append([i])
    for x in check_list:
        check_s = 0
        for j in range(len(x)):
            check_s += is_concatenate(i, x[j])
        if check_s == len(x):