def crawl(dataset_path, scenes, subsequence_length, num_workers=1):
    pool = Pool(num_workers)
    manager = Manager()
    count = len(scenes)
    progress = manager.Value('i', 0)

    samples = []
    if subsequence_length == 2:
        for scene_samples in pool.imap_unordered(
                partial(crawl_subprocess_short,
                        dataset_path=dataset_path,
                        count=count,
                        progress=progress),
                scenes):
            samples.extend(scene_samples)
    else:
        for scene_samples in pool.imap_unordered(
                partial(crawl_subprocess_long,
                        dataset_path=dataset_path,
                        count=count,
                        progress=progress,
                        subsequence_length=subsequence_length),
                scenes):
            samples.extend(scene_samples)

    random.shuffle(samples)
    return samples
def t2():
    from gevent.pool import Pool  # noqa: E402
    p = Pool(10)
    run1 = [v for v in p.imap_unordered(echo, range(10))]
    run2 = [v for v in p.imap_unordered(echo, range(10))]
    run3 = [v for v in p.imap_unordered(echo, range(10))]
    run4 = [v for v in p.imap_unordered(echo, range(10))]
    print(run1 == run2 == run3 == run4)
    print(run1)
    print(run2)
def t1():
    from multiprocessing.pool import Pool  # noqa: E402
    p = Pool(10)
    run1 = [v for v in p.imap_unordered(echo, range(10))]
    run2 = [v for v in p.imap_unordered(echo, range(10))]
    run3 = [v for v in p.imap_unordered(echo, range(10))]
    run4 = [v for v in p.imap_unordered(echo, range(10))]
    print(run1 == run2 == run3 == run4)
    print(run1)
    print(run2)
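# `t1()` and `t2()` above call an `echo` helper that is not defined in either
# snippet. A minimal sketch, consistent with the standalone scripts further
# down (sleep briefly, then return the argument):
import time

def echo(i):
    time.sleep(0.001)  # simulate a little work so completion order can vary
    return i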
def test_concurrent_processes(get_dict):
    pool = Pool(16)
    with get_dict() as storage:
        filename = storage.filename
    for _ in pool.imap_unordered(partial(insert_range, filename=filename),
                                 split_seq(range(10000), 1000)):
        pass
    for _ in pool.imap_unordered(partial(remove_range, filename=filename),
                                 split_seq(range(10000), 1000)):
        pass
    assert sorted(iter(get_dict())) == list()
def SummaryMode(corpus, context_token_limit):
    for dataset in datasets:
        print 'Generating summaries for the %s set:' % dataset

        urls_filename = '%s/wayback_%s_urls.txt' % (corpus, dataset)
        urls = ReadUrls(urls_filename)

        p = Pool()
        story_lists = p.imap_unordered(
            GenerateMapper,
            izip(urls, repeat(corpus), repeat(context_token_limit)))

        progress_bar = ProgressBar(len(urls))
        for story in story_lists:
            if story is None:
                continue

            url_hash = Hashhex(story.url)

            with open('%s/summary/%s/%s.sent' % (corpus, dataset, url_hash),
                      'w') as f:
                f.write(story.content)

            with open('%s/summary/%s/%s.summ' % (corpus, dataset, url_hash),
                      'w') as f:
                f.write(''.join(
                    [highlight + ".\n" for highlight in story.highlights]))

            progress_bar.Increment()
def cmd_null_minidump(pn):
    global DUMP

    print('Processing minidump:', pn)
    DUMP = Minidump(pn)

    # prepare args, i.e., mstr
    args = list()
    for mid in range(DUMP.mapping_cnt):
        m = DUMP.mappings[mid]
        args.append((mid, [], MapType.NONHEAP, m.is_writable))

    # multiprocessing for each mapping
    workers = Pool(NCPU)
    for (mid, s) in workers.imap_unordered(_null_mapping, args):
        m = DUMP.mappings[mid]
        if not s:
            continue
        for (va, sz) in list(DUMP.payloads):
            if va in m:
                s = s[:va - m.start] + DUMP.payloads[
                    (va, sz)] + s[va - m.start + sz:]
                del DUMP.payloads[(va, sz)]
        DUMP.write(m.stack_addr, s)

    # patch sparsely
    DUMP.save(pn, sparse=True)
    print('Finished:', pn)
def index_owl(owl_file_paths, output_properties, dist):
    maximum_lines_per_file = 50000
    prefix, temp_files, temp_dir = separate_large_owl(owl_file_paths,
                                                      maximum_lines_per_file)
    base_dir = os.path.join(os.getcwd(), dist)
    if os.path.exists(base_dir):
        rmtree(base_dir)
    for output_property in output_properties.values():
        os.mkdir(os.path.join(temp_dir, output_property))
    os.mkdir(base_dir)
    print(i18n_t('cmd.build_index.info_collecting_info'))
    try:
        p = Pool()
        with tqdm(total=len(temp_files)) as pbar:
            for _ in p.imap_unordered(
                    output_process,
                    ((prefix, temp_file, output_properties, temp_dir)
                     for temp_file in temp_files)):
                pbar.update(1)
        for op in output_properties.values():
            join_process((base_dir, temp_dir, op))
        with open(os.path.join(base_dir, 'prefix.ttl'), 'w') as fp:
            fp.write(prefix)
    finally:
        rmtree(temp_dir)
    return base_dir
def parse(document, pages, parse_refs=True,
          progress_monitor=NullProgressMonitor(),
          pool_size=DEFAULT_POOL_SIZE):
    progress_monitor.start('Parsing Pages', pool_size + 1)

    # Prepare input
    pages = [(page.local_url, page.url) for page in
             pages.values() if page.local_url is not None]
    pages_chunks = chunk_it(pages, pool_size)
    inputs = []
    for pages_chunk in pages_chunks:
        inputs.append((document.parser, document.pk, parse_refs, pages_chunk))

    # Close connection to allow the new processes to create their own.
    connection.close()

    # Split work
    progress_monitor.info('Sending {0} chunks to worker pool'
                          .format(len(inputs)))
    pool = Pool(pool_size)
    for result in pool.imap_unordered(sub_process_parse, inputs, 1):
        progress_monitor.work('Parsed 1/{0} of the pages'
                              .format(pool_size), 1)

    # Word Count
    word_count = 0
    for page in document.pages.all():
        word_count += page.word_count
    document.word_count = word_count
    document.save()
    progress_monitor.work('Counted Total Words', 1)

    pool.close()
    progress_monitor.done()
def main():
    input_folder = Path("/home/ardaduz/HDD/Downloads/tum-rgbd-raw")
    output_folder = Path("/media/ardaduz/T5/test/tumrgbd")

    input_directories = [
        input_folder / "rgbd_dataset_freiburg1_desk",
        input_folder / "rgbd_dataset_freiburg1_plant",
        input_folder / "rgbd_dataset_freiburg1_room",
        input_folder / "rgbd_dataset_freiburg1_teddy",
        input_folder / "rgbd_dataset_freiburg1_xyz",
        input_folder / "rgbd_dataset_freiburg2_desk",
        input_folder / "rgbd_dataset_freiburg2_metallic_sphere2",
        input_folder / "rgbd_dataset_freiburg2_xyz",
        input_folder / "rgbd_dataset_freiburg3_cabinet",
        input_folder / "rgbd_dataset_freiburg3_long_office_household",
        input_folder / "rgbd_dataset_freiburg3_nostructure_notexture_far",
        input_folder / "rgbd_dataset_freiburg3_nostructure_texture_far",
        input_folder / "rgbd_dataset_freiburg3_structure_notexture_far",
        input_folder / "rgbd_dataset_freiburg3_structure_texture_far",
        input_folder / "rgbd_dataset_freiburg3_teddy"]

    pool = Pool(6)
    for finished_scene in pool.imap_unordered(
            partial(process_scene, output_folder=output_folder),
            input_directories):
        print("finished", finished_scene)

    # close() must come before join(); joining a running pool raises ValueError
    pool.close()
    pool.join()
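# Note on the shutdown order above: since Python 3.3, Pool is also a context
# manager, which makes the cleanup explicit. A minimal sketch with a
# hypothetical `work` function standing in for `process_scene`:
from multiprocessing import Pool

def work(x):
    return x * x  # hypothetical stand-in

if __name__ == '__main__':
    with Pool(6) as pool:
        # results must be consumed inside the block: __exit__ calls
        # pool.terminate()
        for result in pool.imap_unordered(work, range(10)):
            print(result)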
def create_vocab(inputs_path, top_k=1000000000, at_least=1,
                 pad="_PAD_", unk="_UNK_", processes=32):
    word_counts = {}
    pool = Pool(processes)
    for wc in pool.imap_unordered(_process_file, inputs_path.glob("*.json")):
        for k, v in wc.items():
            word_counts[k] = word_counts.get(k, 0) + v

    tokens_counts = sorted(word_counts.items(), key=lambda x: x[1],
                           reverse=True)
    logging.info(" # Unique Words: {}".format(len(word_counts)))

    sorted_tokens_counts = [tc for tc in tokens_counts
                            if tc[1] >= at_least][:top_k]

    index2tokens = []
    if pad is not None:
        index2tokens.append(pad)
    if unk is not None:
        index2tokens.append(unk)
    index2tokens.extend([t for t, c in sorted_tokens_counts])

    tokens2index = {t: i for i, t in enumerate(index2tokens)}
    logging.info(" After filtering, # Unique Words: {}".format(
        len(tokens2index)))
    return Vocab(index2tokens, tokens2index, pad=pad, unk=unk)
def query_all_tweets(query, content_collection, search, year=2017, month=1):
    """
    Queries *all* tweets in the history of twitter for the given query.
    This will run in parallel for each ~10 days.

    :param query: A twitter advanced search query.
    :return: A list of tweets.
    """
    limits = []
    while date(year=year, month=month, day=1) < date.today():
        nextmonth = month + 1 if month < 12 else 1
        nextyear = year + 1 if nextmonth == 1 else year

        for i in range(1, 26, 2):
            limits.append((date(year=year, month=month, day=i),
                           date(year=year, month=month, day=i + 2)))
        limits.append((date(year=year, month=month, day=28),
                       date(year=nextyear, month=nextmonth, day=1)))
        year, month = nextyear, nextmonth

    queries = ['{} since:{} until:{}'.format(query, since, until)
               for since, until in reversed(limits)]

    all_tweets = 0
    pool = Pool(20)
    try:
        for new_tweets in pool.imap_unordered(query_tweets_once, queries):
            all_tweets += len(new_tweets)
            insert_tweets(content_collection, query, search, new_tweets)
            if len(new_tweets) > 0:
                print("Got {} tweets ({} new) for {}.".format(
                    all_tweets, len(new_tweets), new_tweets[0].timestamp))
            else:
                print("Got {} tweets ({} new).".format(
                    all_tweets, len(new_tweets)))
    except KeyboardInterrupt:
        print("Program interrupted by user. Returning all tweets "
              "gathered so far.")
def query_tweets(query, limit=None, begindate=dt.date(2006, 3, 21),
                 enddate=dt.date.today(), poolsize=20, lang=''):
    no_days = (enddate - begindate).days

    if poolsize > no_days:
        # Since we are assigning each pool a range of dates to query,
        # the number of pools should not exceed the number of dates.
        poolsize = no_days
    dateranges = [begindate + dt.timedelta(days=elem)
                  for elem in linspace(0, no_days, poolsize + 1)]

    if limit:
        limit_per_pool = (limit // poolsize) + 1
    else:
        limit_per_pool = None

    queries = ['{} since:{} until:{}'.format(query, since, until)
               for since, until in zip(dateranges[:-1], dateranges[1:])]

    all_tweets = []
    try:
        pool = Pool(poolsize)
        logger.info('queries: {}'.format(queries))
        try:
            for new_tweets in pool.imap_unordered(
                    partial(query_tweets_once, limit=limit_per_pool,
                            lang=lang), queries):
                all_tweets.extend(new_tweets)
                logger.info('Got {} tweets ({} new).'.format(
                    len(all_tweets), len(new_tweets)))
        except KeyboardInterrupt:
            logger.info('Program interrupted by user. Returning all tweets '
                        'gathered so far.')
    finally:
        pool.close()
        pool.join()

    return all_tweets
def query_all_tweets(query, start_date, end_date):
    """
    Queries *all* tweets in the history of twitter for the given query.
    This will run in parallel for each ~30 days.

    :param query: A twitter advanced search query.
    :param start_date: Crawl start date, e.g. 20170101
    :param end_date: Crawl end date, e.g. 20171010
    :return: A list of tweets.
    """
    queries = get_all_query(query, start_date, end_date)
    pool = Pool(10)
    all_tweets = []
    try:
        for new_tweets in pool.imap_unordered(partial(query_tweets_once),
                                              queries):
            for new_tweet in new_tweets:
                all_tweets.append(new_tweet)
    except KeyboardInterrupt:
        logging.info("Program interrupted by user. Returning all tweets "
                     "gathered so far.")
    return sorted(all_tweets, reverse=True)
def _run_with_multiprocessing(process, total_tiles, zoom_levels, multi,
                              quiet, debug):
    LOGGER.debug("run with multiprocessing")
    num_processed = 0
    LOGGER.info("run process using %s workers", multi)
    f = partial(_process_worker, process)
    with tqdm.tqdm(total=total_tiles, unit="tiles",
                   disable=(quiet or debug)) as pbar:
        for zoom in zoom_levels:
            process_tiles = process.get_process_tiles(zoom)
            pool = Pool(multi)
            try:
                for output in pool.imap_unordered(f, process_tiles,
                                                  chunksize=1):
                    if output:
                        _write_worker(process, output)
                    pbar.update()
                    num_processed += 1
            except KeyboardInterrupt:
                LOGGER.info("Caught KeyboardInterrupt, terminating workers")
                pool.terminate()
                break
            except Exception:
                pool.terminate()
                raise
            finally:
                pool.close()
                pool.join()
                process_tiles = None
    LOGGER.info("%s tile(s) iterated", (str(num_processed)))
def save_auroc(mode="bayes"):
    X_train, y_train, X_test, y_test = get_digits(0.5)
    cores = 4
    pool = Pool(processes=cores)
    estimates = {}
    for k in [20, 30]:
        estimates[k] = {}
        for digit_estimated in [0, 1, 2, 3]:
            condition = (y_train == digit_estimated)
            selected_digits = X_train[np.where(condition)]
            durations = []
            local_estimates = []
            if mode == "bayes":
                est_fun = threaded_estimates
            else:
                est_fun = threaded_estimates_EM
            for estimate, duration in pool.imap_unordered(
                    est_fun, get_data(selected_digits, n=10, K=k)):
                local_estimates.append(estimate)
            estimates[k][digit_estimated] = average_of_estimates(
                local_estimates, 10)
            print(f"Done k={k}, digit #{digit_estimated}")
    with open(f"{mode}_auroc_allk.bin", "wb") as f:
        pickle.dump(estimates, f)
def query_tweets(query, limit=None, begindate=dt.date(2006, 3, 21),
                 enddate=dt.date.today(), poolsize=20, lang=''):
    no_days = (enddate - begindate).days

    if poolsize > no_days:
        # Since we are assigning each pool a range of dates to query,
        # the number of pools should not exceed the number of dates.
        poolsize = no_days
    dateranges = [begindate + dt.timedelta(days=elem)
                  for elem in linspace(0, no_days, poolsize + 1)]

    if limit:
        limit_per_pool = (limit // poolsize) + 1
    else:
        limit_per_pool = None

    queries = ['{} since:{} until:{}'.format(query, since, until)
               for since, until in zip(dateranges[:-1], dateranges[1:])]

    all_tweets = []
    try:
        pool = Pool(poolsize)
        try:
            for new_tweets in pool.imap_unordered(
                    partial(query_tweets_once, limit=limit_per_pool,
                            lang=lang), queries):
                all_tweets.extend(new_tweets)
                logging.info("Got {} tweets ({} new).".format(
                    len(all_tweets), len(new_tweets)))
        except KeyboardInterrupt:
            logging.info("Program interrupted by user. Returning all tweets "
                         "gathered so far.")
    finally:
        pool.close()
        pool.join()

    return all_tweets
def _run_with_multiprocessing(process, zoom_levels, multi, max_chunksize):
    logger.debug("run with multiprocessing")
    num_processed = 0
    total_tiles = process.count_tiles(min(zoom_levels), max(zoom_levels))
    logger.debug("run process on %s tiles using %s workers", total_tiles,
                 multi)
    f = partial(_process_worker, process)
    for zoom in zoom_levels:
        pool = Pool(multi, _worker_sigint_handler)
        try:
            for tile, message in pool.imap_unordered(
                    f,
                    process.get_process_tiles(zoom),
                    # set chunksize to between 1 and max_chunksize
                    chunksize=max_chunksize):
                num_processed += 1
                logger.debug("tile %s/%s finished", num_processed,
                             total_tiles)
                yield dict(process_tile=tile, **message)
        except KeyboardInterrupt:
            logger.error("Caught KeyboardInterrupt, terminating workers")
            pool.terminate()
            raise
        except Exception:
            pool.terminate()
            raise
        finally:
            pool.close()
            pool.join()
    logger.debug("%s tile(s) iterated", (str(num_processed)))
def sampled_choice_sets_agreement(choice_sets, num_threads, model, epsilon):
    """
    Optimize agreement for 500 randomly sampled choice sets.
    :param choice_sets: the choice sets to sample from
    :param num_threads: number of threads to use
    :param model: a fitted DiscreteChoiceModel with two agents
    :param epsilon: approximation parameter
    """
    filtered_choice_sets = [
        x for x in choice_sets if 1 < np.count_nonzero(x) <= 5
    ]
    choice_set_indices = np.random.choice(range(len(filtered_choice_sets)),
                                          500, replace=False)
    sampled_choice_sets = [
        tuple(np.nonzero(filtered_choice_sets[i])[0])
        for i in choice_set_indices
    ]

    pool = Pool(num_threads)
    helper_partial = partial(agreement_helper, model=model, epsilon=epsilon)
    results = []
    for result in tqdm(pool.imap_unordered(helper_partial,
                                           sampled_choice_sets),
                       total=len(sampled_choice_sets)):
        results.append(result)
    pool.close()
    pool.join()
    return results
def create_masks(data_root_path='/data/SN7_buildings/train/',
                 result_path='/wdata/train_masks/'):
    if os.path.exists(result_path):
        shutil.rmtree(result_path)
    os.mkdir(result_path)

    ids = os.listdir(data_root_path)
    all_params = []
    for _id in tqdm(ids[:]):
        id_path = os.path.join(data_root_path, _id)
        if not os.path.isdir(id_path):
            continue
        sub_res_path = os.path.join(result_path, _id)
        os.mkdir(sub_res_path)
        labels_path = os.path.join(id_path, 'labels_match_pix')
        rasters_path = os.path.join(id_path, 'images')
        files = sorted(os.listdir(labels_path))
        files = [el for el in files if 'UDM' not in el]
        files = ['_'.join(el.split('.')[0].split('_')[:-1]) for el in files]
        params = [(el, labels_path, rasters_path, sub_res_path)
                  for el in files]
        all_params += params

    n_cpus = cpu_count()
    pool = Pool(n_cpus)
    for _ in tqdm(pool.imap_unordered(mask_fro_id, all_params),
                  total=len(all_params)):
        pass
def work_with_database(cursor: sqlite3.Cursor, args: Namespace,
                       pool: Pool, dims: int) -> None:
    """
    a function-helper

    :param cursor: a cursor for a database to work with
    :param args: additional arguments
    :param pool: a multiprocessing pool
    :param dims: semigroups cardinality
    :returns:
    """
    try:
        create_table_if_not_exists(
            cursor,
            TABLE_NAME,
            ["output STRING", "errors STRING"],
        )
        with tqdm(total=args.number_of_tasks) as progress_bar:
            for output, errors in pool.imap_unordered(
                    partial(
                        table_completion,
                        dims,
                        args.mace_timeout,
                        args.mace_memory_mb,
                    ),
                    range(args.number_of_tasks),
            ):
                insert_values_into_table(cursor, TABLE_NAME,
                                         (output, errors))
                progress_bar.update()
    finally:
        pool.close()
        pool.join()
def index_owl(owl_file_paths, output_properties, dist):
    prefix, temp_files, temp_dir = separate_large_owl(owl_file_paths)
    base_dir = os.path.join(os.getcwd(), dist)
    if os.path.exists(base_dir):
        rmtree(base_dir)
    for output_property in output_properties.values():
        os.mkdir(os.path.join(temp_dir, output_property))
    os.mkdir(base_dir)
    print('Collecting information from the split files...')
    try:
        p = Pool()
        with tqdm(total=len(temp_files)) as pbar:
            for _ in p.imap_unordered(
                    output_process,
                    ((prefix, temp_file, output_properties, temp_dir)
                     for temp_file in temp_files)):
                pbar.update(1)
        for op in output_properties.values():
            join_process((base_dir, temp_dir, op))
        with open(os.path.join(base_dir, 'prefix.ttl'), 'w') as fp:
            fp.write(prefix)
    finally:
        rmtree(temp_dir)
    return base_dir
def parallel_create_audio_data(audio_files: List[str],
                               sample_rate: int,
                               outfile: str,
                               mono=True,
                               max_seconds: Optional[int] = None):
    fn_args = []
    batch_size = 100
    for i in range(0, len(audio_files), batch_size):
        batch = audio_files[i:i + batch_size]
        if len(batch) == 0:
            continue
        batch_outfile = str(
            Path(outfile).parent / f"{Path(outfile).stem}.chunk.{i:05}.npz")
        if os.path.exists(batch_outfile):
            continue
        fn_args.append(
            (batch, sample_rate, batch_outfile, mono, max_seconds, False))

    pool = Pool()
    for _ in tqdm(pool.imap_unordered(wrapped_create_audio_data, fn_args),
                  total=len(fn_args)):
        pass
def download_using_parallel_processing(concepts, processors):
    pool = Pool(processors)
    counter = 0
    for result in pool.imap_unordered(download_concept, concepts):
        counter += 1
        if counter % 100 == 0:
            print_log(" Count: " + str(counter))
def query_profile(self, profiles, poolsize=20):
    '''
    profiles: List
        Unique profiles to scrape from
    poolsize: int
        Size of pool. The bigger it is, the more browser instances are opened
    logger (logger): Made this mandatory here because of issues
    '''
    url = "https://twitter.com/{}"
    no_profiles = len(profiles)
    if poolsize > no_profiles:
        poolsize = no_profiles
    urls = [url.format(x) for x in profiles]

    all_profile = []
    pool = Pool(poolsize)
    try:
        for profile_data in pool.imap_unordered(
                partial(self.query_single_profile), urls):
            all_profile.append(profile_data)
            self.logger.info("Got {} profiles (1 new).".format(
                len(all_profile)))
    finally:
        pool.close()
        pool.join()
    return all_profile
def query_profile(self, profiles, poolsize=20):
    '''
    profiles: List
        Unique profiles to scrape from
    poolsize: int
        Size of pool. The bigger it is, the more browser instances are opened
    '''
    url = "https://twitter.com/{}"
    no_profiles = len(profiles)
    if poolsize > no_profiles:
        poolsize = no_profiles
    urls = [url.format(x) for x in profiles]

    all_profiles = []
    pool = Pool(poolsize)
    profile_received = 0
    try:
        for profile in pool.imap_unordered(
                partial(self.query_single_profile), urls):
            profile_received = profile_received + 1
            all_profiles.append(profile)
            print("Got {} profiles (1 new).".format(profile_received))
    finally:
        pool.close()
        pool.join()
    return all_profiles
def query_tweets(query, limit=None, begindate=dt.date(2017, 1, 1),
                 enddate=dt.date.today(), poolsize=20, lang=''):
    no_days = (enddate - begindate).days
    stepsize = roundup(no_days, poolsize)
    dateranges = [begindate + dt.timedelta(days=elem)
                  for elem in range(0, no_days, stepsize)]
    dateranges.append(enddate)

    if limit:
        limit_per_pool = roundup(limit, poolsize)
    else:
        limit_per_pool = None

    queries = ['{} since:{} until:{}'.format(query, since, until)
               for since, until in zip(dateranges[:-1], dateranges[1:])]

    all_tweets = []
    try:
        pool = Pool(poolsize)
        try:
            for new_tweets in pool.imap_unordered(
                    partial(query_tweets_once, limit=limit_per_pool,
                            lang=lang), queries):
                all_tweets.extend(new_tweets)
                logging.info("Got {} tweets ({} new).".format(
                    len(all_tweets), len(new_tweets)))
        except KeyboardInterrupt:
            logging.info("Program interrupted by user. Returning all tweets "
                         "gathered so far.")
    finally:
        pool.close()
        pool.join()

    return all_tweets
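# `roundup` is not defined in this snippet. Given that `stepsize` must cover
# `no_days` in `poolsize` steps, it is presumably ceiling division; an
# assumed implementation:
def roundup(x, y):
    return -(-x // y)  # smallest integer >= x / y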
def validation_loss_grid_search(datasets, methods, update=False):
    lrs = [0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]
    wds = [0, 0.0001, 0.0005, 0.001, 0.005, 0.01]

    params = {(dataset, method, lr, wd)
              for dataset in datasets for lr in lrs
              for method in methods for wd in wds}

    results = dict()
    pool = Pool(THREADS)
    for args, losses in tqdm(pool.imap_unordered(
            validation_loss_grid_search_helper, params), total=len(params)):
        results[args] = losses
    pool.close()
    pool.join()

    filename = f'{CONFIG_DIR}/validation_loss_lr_wd_settings.pickle'
    if update:
        with open(filename, 'rb') as f:
            old_results, old_datasets, old_methods, old_lrs, old_wds = \
                pickle.load(f)
        old_results.update(results)
        results = old_results
        datasets = list(set(old_datasets).union(datasets))
        lrs = sorted(set(old_lrs).union(lrs))
        wds = sorted(set(old_wds).union(wds))

    with open(filename, 'wb') as f:
        pickle.dump((results, datasets, methods, lrs, wds), f)
def learning_rate_grid_search(datasets, methods, update=False):
    lrs = [0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]
    params = {(dataset, method, lr)
              for dataset in datasets for lr in lrs for method in methods}

    results = dict()
    pool = Pool(THREADS)
    for args, loss in tqdm(pool.imap_unordered(
            learning_rate_grid_search_helper, params), total=len(params)):
        results[args] = loss
    pool.close()
    pool.join()

    filename = f'{CONFIG_DIR}/learning_rate_settings.pickle'
    if update:
        with open(filename, 'rb') as f:
            old_results, old_lrs = pickle.load(f)
        old_results.update(results)
        results = old_results
        lrs = sorted(set(old_lrs).union(lrs))

    with open(filename, 'wb') as f:
        pickle.dump((results, lrs), f)
def do_multiprocess(images, measure, num_processes,
                    is_rotation_invariant=False):
    rotations = None
    if is_rotation_invariant:
        rotations = dict()
        for path, img in images:
            rotations[path] = (rotate(img, 90), rotate(img, 180),
                               rotate(img, 270))

    pool = Pool(num_processes)
    doer = Doer(measure, is_rotation_invariant, rotations)

    n_images = len(images)
    # integer division keeps the combination count exact
    n_combinations = factorial(n_images) // (factorial(2) *
                                             factorial(n_images - 2))
    records = []
    with tqdm(total=n_combinations) as pbar:
        # the redundant inner tqdm wrapper is dropped; pbar already tracks
        # progress
        for record in pool.imap_unordered(
                doer.do, list(itertools.combinations(images, 2)),
                chunksize=50):
            records.append(record)
            pbar.update()
    return records
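# The `n_combinations` formula above is just "n choose 2". On Python 3.8+ the
# same count is available directly as an integer:
import math

n_images = 10  # example value
assert math.comb(n_images, 2) == math.factorial(n_images) // (
    math.factorial(2) * math.factorial(n_images - 2))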
def main():
    mp = Mpool(10)
    run1 = [a for a in mp.imap_unordered(echo, xrange(10))]
    run2 = [a for a in mp.imap_unordered(echo, xrange(10))]
    run3 = [a for a in mp.imap_unordered(echo, xrange(10))]
    run4 = [a for a in mp.imap_unordered(echo, xrange(10))]
    print(run1 == run2 == run3 == run4)

    gp = Gpool(10)
    run1 = [a for a in gp.imap_unordered(echo, xrange(10))]
    run2 = [a for a in gp.imap_unordered(echo, xrange(10))]
    run3 = [a for a in gp.imap_unordered(echo, xrange(10))]
    run4 = [a for a in gp.imap_unordered(echo, xrange(10))]
    print(run1 == run2 == run3 == run4)
def multiproc_eval(func, inputs):  # `inputs` assumed: the original signature
                                   # was truncated, and the body reads
                                   # inputs.shape[0]
    count = inputs.shape[0]
    global pool
    if pool is None:
        pool = Pool(processes=20)
    return list(
        tqdm(pool.imap_unordered(partial(eval, func), range(count)),
             total=count))
def download_list(api_k, hash_list):
    global api_key
    if api_k:
        api_key = api_k
    files = json.load(open(hash_list))
    pool = Pool(os.cpu_count())
    for _ in tqdm.tqdm(pool.imap_unordered(download_file_by_hash, files),
                       total=len(files)):
        pass
def main():
    # non-deterministic process pool
    from multiprocessing.pool import Pool
    p = Pool(10)
    run1 = [a for a in p.imap_unordered(echo, xrange(10))]
    run2 = [a for a in p.imap_unordered(echo, xrange(10))]
    run3 = [a for a in p.imap_unordered(echo, xrange(10))]
    run4 = [a for a in p.imap_unordered(echo, xrange(10))]
    print(run1, run2, run3, run4)
    print(run1 == run2 == run3 == run4)

    # deterministic gevent pool
    from gevent.pool import Pool
    p = Pool(10)
    run1 = [a for a in p.imap_unordered(echo, xrange(10))]
    run2 = [a for a in p.imap_unordered(echo, xrange(10))]
    run3 = [a for a in p.imap_unordered(echo, xrange(10))]
    run4 = [a for a in p.imap_unordered(echo, xrange(10))]
    print(run1, run2, run3, run4)
    print(run1 == run2 == run3 == run4)
def extract_all_plaintext(filenames, out_folder=PLAINTEXT_FOLDER):
    print "EXTRACTING PLAINTEXT FROM {0} FILES INTO {1}".format(
        len(filenames), out_folder)
    # Zip the filename input with the output folder
    tuple_input = zip(filenames, [out_folder] * len(filenames))
    pool = Pool(processes=util.CPU_COUNT)
    #pool = Pool(processes=1)
    num_tasks = len(filenames)
    for i, _ in enumerate(
            pool.imap_unordered(__extract_plaintext_as_tuple, tuple_input),
            1):
        # float() guards against Python 2 integer division
        sys.stderr.write('\rdone {0:%}'.format(i / float(num_tasks)))
    pool.close()
    print "\nDONE"
def StoreMode(corpus):
    for dataset in datasets:
        print "Storing news stories for the %s set:" % dataset

        urls_filename = "%s/wayback_%s_urls.txt" % (corpus, dataset)
        urls = ReadUrls(urls_filename)

        p = Pool()
        stories = p.imap_unordered(StoreMapper, izip(urls, repeat(corpus)))

        progress_bar = ProgressBar(len(urls))
        for story in stories:
            if story:
                WriteStory(story, corpus)
            progress_bar.Increment()
def run(config_uri, app_name=None, username=None, types=(), batch_size=500,
        processes=None):
    # multiprocessing.get_context is Python 3 only.
    from multiprocessing import get_context
    from multiprocessing.pool import Pool

    # Loading the app will have configured from the config file.
    # Reconfigure here:
    logging.getLogger('snovault').setLevel(logging.DEBUG)

    testapp = internal_app(config_uri, app_name, username)
    connection = testapp.app.registry[CONNECTION]
    uuids = [str(uuid) for uuid in connection.__iter__(*types)]
    transaction.abort()
    logger.info('Total items: %d' % len(uuids))

    pool = Pool(
        processes=processes,
        initializer=initializer,
        initargs=(config_uri, app_name, username),
        context=get_context('forkserver'),
    )

    all_results = []
    try:
        for result in pool.imap_unordered(worker, batched(uuids, batch_size),
                                          chunksize=1):
            results = result['results']
            errors = sum(error for item_type, path, update, error in results)
            updated = sum(update for item_type, path, update, error in results)
            logger.info('Batch: Updated %d of %d (errors %d)' %
                        (updated, len(results), errors))
            all_results.extend(results)
    finally:
        pool.terminate()
        pool.join()

    def result_item_type(result):
        # Ensure we always return a string
        return result[0] or ''

    for item_type, results in itertools.groupby(
            sorted(all_results, key=result_item_type), key=result_item_type):
        results = list(results)
        errors = sum(error for item_type, path, update, error in results)
        updated = sum(update for item_type, path, update, error in results)
        logger.info('Collection %s: Updated %d of %d (errors %d)' %
                    (item_type, updated, len(results), errors))
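# `batched` is not shown in the snippet above. A minimal sketch consistent
# with how it is used (yield successive lists of at most `size` uuids):
from itertools import islice

def batched(iterable, size):
    it = iter(iterable)
    while True:
        batch = list(islice(it, size))
        if not batch:
            break
        yield batch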
def GenerateMode(corpus, context_token_limit):
    for dataset in datasets:
        print 'Generating questions for the %s set:' % dataset

        urls_filename = '%s/wayback_%s_urls.txt' % (corpus, dataset)
        urls = ReadUrls(urls_filename)

        p = Pool()
        question_context_lists = p.imap_unordered(
            GenerateMapper,
            izip(urls, repeat(corpus), repeat(context_token_limit)))

        progress_bar = ProgressBar(len(urls))
        for question_context_list in question_context_lists:
            if question_context_list:
                for question_context in question_context_list:
                    WriteQuestionContext(question_context, corpus, dataset)
            progress_bar.Increment()
label_indeces = load_labels()
raw_features = load_raw_features()
print "Loaded {0} features".format(len(raw_features))

print "Grouping prevectors by base_url"
sites = {}
site_labels = {}
for dp in data_points:
    if dp['base_url'] not in sites:
        sites[dp['base_url']] = {}
        site_labels[dp['base_url']] = dp['label']
    sites[dp['base_url']][dp['offset']] = {
        "code": dp['code'],
        "content_ssdeep": dp['content_ssdeep']}

print "Vectorizing {0} base urls".format(len(sites))
labels = []
names = []
vectors = []
pool = Pool(processes=cpu_count(), initializer=preload_process,
            initargs=(sites,))
for vector, site in pool.imap_unordered(compute_vectors, sites.keys()):
    if site_labels[site] in labels_to_ignore:
        continue
    vectors.append(vector)
    labels.append(site_labels[site])
    names.append(site)
    print "Vector for {0} completed".format(site)

with open("raw_feature_vectors.json", "w") as f:
    json.dump({"labels": labels, "names": names, "vectors": vectors}, f)
def main(force_reanalyze=False, include_hidden=False,
         dry_run=False, gain_type='auto',
         jobs=default_job_count(), quiet=False, verbose=False,
         *music_directories):
    """Add replaygain tags to your music files."""
    if quiet:
        logging.basicConfig(level=logging.WARN)
    elif verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    # Some pesky functions used below will catch KeyboardInterrupts
    # inappropriately, so install an alternate handler that bypasses
    # KeyboardInterrupt instead.
    def signal_handler(sig, frame):
        print "Canceled."
        os.kill(os.getpid(), signal.SIGTERM)
    original_handler = signal.signal(signal.SIGINT, signal_handler)

    track_class = RGTrack
    if dry_run:
        logging.warn('This script is running in "dry run" mode, '
                     'so no files will actually be modified.')
        track_class = RGTrackDryRun

    if len(music_directories) == 0:
        logging.error("You did not specify any music directories or files. "
                      "Exiting.")
        sys.exit(1)

    logging.info("Searching for music files in the following directories:\n%s",
                 "\n".join(music_directories))
    tracks = [track_class(f) for f in
              get_all_music_files(music_directories,
                                  ignore_hidden=(not include_hidden))]

    # Filter out tracks for which we can't get the length
    for t in tracks[:]:
        try:
            len(t)
        except Exception:
            logging.error("Track %s appears to be invalid. Skipping.",
                          t.filename)
            tracks.remove(t)

    if len(tracks) == 0:
        logging.error("Failed to find any tracks in the directories you "
                      "specified. Exiting.")
        sys.exit(1)

    track_sets = RGTrackSet.MakeTrackSets(tracks)

    # Remove the earlier bypass of KeyboardInterrupt
    signal.signal(signal.SIGINT, original_handler)

    logging.info("Beginning analysis")
    handler = TrackSetHandler(force=force_reanalyze, gain_type=gain_type)

    # For display purposes, calculate how much granularity is required
    # to show visible progress at each update
    total_length = sum(len(ts) for ts in track_sets)
    min_step = min(len(ts) for ts in track_sets)
    places_past_decimal = max(
        0, int(math.ceil(-math.log10(min_step * 100.0 / total_length))))
    update_string = '%.' + str(places_past_decimal) + 'f%% done'

    import gst
    pool = None
    try:
        if jobs == 1:
            # Sequential
            handled_track_sets = imap(handler, track_sets)
        else:
            # Parallel
            pool = Pool(jobs)
            handled_track_sets = pool.imap_unordered(handler, track_sets)
        processed_length = 0
        percent_done = 0
        for ts in handled_track_sets:
            processed_length = processed_length + len(ts)
            percent_done = 100.0 * processed_length / total_length
            logging.info(update_string, percent_done)
        logging.info("Analysis complete.")
    except KeyboardInterrupt:
        if pool is not None:
            logging.debug("Terminating process pool")
            pool.terminate()
            pool = None
        raise
    finally:
        if pool is not None:
            logging.debug("Closing transcode process pool")
            pool.close()
    if dry_run:
        logging.warn('This script ran in "dry run" mode, '
                     'so no files were actually modified.')
def imap_unordered(self, func, iterable, chunksize=1):
    """
    Override the multiprocessing.Pool.imap_unordered() method such that it
    logs the full exception stack trace from the child process.
    """
    return Pool.imap_unordered(self, LogExceptions(func), iterable, chunksize)
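# `LogExceptions` is referenced but not defined here. A common implementation
# of this pattern (a sketch, not necessarily the original class) wraps the
# callable, logs the traceback while still inside the worker process, and
# re-raises:
import logging
import traceback

class LogExceptions(object):
    def __init__(self, func):
        self.func = func

    def __call__(self, *args, **kwargs):
        try:
            return self.func(*args, **kwargs)
        except Exception:
            logging.error(traceback.format_exc())
            raise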
import time

def echo(i):
    time.sleep(0.001)
    return i

from multiprocessing.pool import Pool
p = Pool(10)
print [a for a in p.imap_unordered(echo, xrange(10))]
print [a for a in p.imap_unordered(echo, xrange(10))]
print [a for a in p.imap_unordered(echo, xrange(10))]
print [a for a in p.imap_unordered(echo, xrange(10))]
# ^ Is this distribution random?

from gevent.pool import Pool
p = Pool(10)
print [a for a in p.imap_unordered(echo, xrange(10))]
print [a for a in p.imap_unordered(echo, xrange(10))]
print [a for a in p.imap_unordered(echo, xrange(10))]
print [a for a in p.imap_unordered(echo, xrange(10))]
import inspect
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from collections import defaultdict, Counter
from multiprocessing.pool import Pool
from itertools import cycle

completed = []
container = defaultdict(list)
pool = Pool()
sim_list = [func for name, func in
            inspect.getmembers(simulations, inspect.isfunction)
            if name.startswith('sim_')]
try:
    print('press CTRL-C to stop generating samples')
    it = pool.imap_unordered(f, cycle(sim_list))
    while 1:
        sim, result = it.next(timeout=SIMULATION_TIMEOUT)
        completed.append(sim)
        sys.stdout.write('.')
        for p, wins in result.items():
            container[p].append((sim, wins))
except KeyboardInterrupt:
    pool.close()
    print('stopping all simulations...')
finally:
    pool.terminate()
    pool.join()
from operator import attrgetter
from collections import namedtuple
from itertools import chain  # needed for chain.from_iterable below
from multiprocessing.pool import Pool

from jinja2 import Environment, FileSystemLoader
import requests
import feedparser

import config

SearchResult = namedtuple('SearchResult', ['title', 'url'])

pool = Pool(5)

if __name__ == '__main__':
    feeds = pool.imap_unordered(feedparser.parse, config.SEARCH_FEEDS)
    entries = chain.from_iterable(map(attrgetter('entries'), feeds))
    unique_entries = dict((v['link'], v) for v in entries).values()
    results = [SearchResult(entry.title, entry.link)
               for entry in unique_entries]
    if results:
        env = Environment(autoescape=True,
                          loader=FileSystemLoader('templates'))
        template = env.get_template('notification.html')
        email_msg = template.render(title=config.EMAIL_SUBJECT,
                                    results=results)
        requests.post(config.MAILGUN_URL,
                      auth=("api", config.MAILGUN_KEY),
                      data={
                          "from": config.MAILGUN_EMAIL_SENDER,
                          "to": config.SEND_NOTIFICATIONS_TO,
                          "subject": config.EMAIL_SUBJECT,
                          # the original snippet was truncated here; sending
                          # the rendered template as the body is an assumption
                          "html": email_msg})
import os
import time

def echo(i):
    time.sleep(0.001)
    print os.getpid()
    return i

# Non-deterministic process pool
from multiprocessing.pool import Pool
p = Pool(10)
run1 = [a for a in p.imap_unordered(echo, xrange(10))]
run2 = [a for a in p.imap_unordered(echo, xrange(10))]
run3 = [a for a in p.imap_unordered(echo, xrange(10))]
run4 = [a for a in p.imap_unordered(echo, xrange(10))]
print(run1 == run2 == run3 == run4)
print
print

# Deterministic gevent pool
from gevent.pool import Pool
p = Pool(10)
run1 = [a for a in p.imap_unordered(echo, xrange(10))]
run2 = [a for a in p.imap_unordered(echo, xrange(10))]
run3 = [a for a in p.imap_unordered(echo, xrange(10))]
run4 = [a for a in p.imap_unordered(echo, xrange(10))]
def task(pid):
    print('Starting task %d' % (pid,))
    time.sleep(random.randint(0, 5))
    print('Finished task %d' % (pid,))
    return pid ** 2

p = Pool(processes=5)

#result = p.apply(task, [1])

#async_result = p.apply_async(task, [1])
#print async_result.ready()
#result = async_result.get()
#print result

#mapresult = p.map(task, xrange(0, 10))
#print mapresult

#async_mapresult = p.map_async(task, xrange(0, 10))
#print async_mapresult.ready()
#result = async_mapresult.get()
#print result

#imapresult = p.imap(task, xrange(0, 10))
#for result in imapresult:
#    print result

imapresult_unordered = p.imap_unordered(task, xrange(0, 10))
for result in imapresult_unordered:
    print result
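# For contrast with the commented-out alternatives above, a self-contained
# Python 3 run showing that imap() preserves input order while
# imap_unordered() yields results in completion order:
import random
import time
from multiprocessing import Pool

def work(i):
    time.sleep(random.random() / 10)
    return i

if __name__ == '__main__':
    with Pool(5) as p:
        print(list(p.imap(work, range(10))))            # always [0, 1, ..., 9]
        print(list(p.imap_unordered(work, range(10))))  # completion order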