def run_worker(data, versions, channels):
    t_pool = ThreadPool()
    t_pool.imap_unordered(partial(generate_statistics, versions=versions, channels=channels), data)
    t_pool.close()
    t_pool.join()
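# Hedged note (not part of the original function): imap_unordered returns a lazy
# iterator, and exceptions raised inside generate_statistics are only re-raised when
# that iterator is consumed. A variant that surfaces worker errors might look like
# this sketch, assuming the same ThreadPool/partial/generate_statistics as above.
def run_worker_checked(data, versions, channels):
    t_pool = ThreadPool()
    try:
        worker = partial(generate_statistics, versions=versions, channels=channels)
        for _ in t_pool.imap_unordered(worker, data):
            pass  # consuming results re-raises any worker exception here
    finally:
        t_pool.close()
        t_pool.join()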
def find_process_files(root_dir): lock = Lock() try: num_proc = int(os.environ.get('SCIPY_NUM_CYTHONIZE_JOBS', '')) pool = Pool(processes=num_proc) except ValueError: pool = Pool() hash_db = load_hashes(HASH_FILE) # Keep changed pxi/pxd hashes in a separate dict until the end # because if we update hash_db and multiple files include the same # .pxi file the changes won't be detected. dep_hashes = {} # Run any _generate_pyx.py scripts jobs = [] for cur_dir, dirs, files in os.walk(root_dir): generate_pyx = os.path.join(cur_dir, '_generate_pyx.py') if os.path.exists(generate_pyx): jobs.append(generate_pyx) for result in pool.imap_unordered(lambda fn: process_generate_pyx(fn, lock), jobs): pass # Process pyx files jobs = [] for cur_dir, dirs, files in os.walk(root_dir): for filename in files: in_file = os.path.join(cur_dir, filename + ".in") if filename.endswith('.pyx') and os.path.isfile(in_file): continue for fromext, function in rules.items(): if filename.endswith(fromext): toext = ".c" with open(os.path.join(cur_dir, filename), 'rb') as f: data = f.read() m = re.search(br"^\s*#\s*distutils:\s*language\s*=\s*c\+\+\s*$", data, re.I|re.M) if m: toext = ".cxx" fromfile = filename tofile = filename[:-len(fromext)] + toext jobs.append((cur_dir, fromfile, tofile, function, hash_db, dep_hashes, lock)) for result in pool.imap_unordered(lambda args: process(*args), jobs): pass hash_db.update(dep_hashes) save_hashes(hash_db, HASH_FILE)
def SerializeHtmlTraces(results):
    """Creates html trace files for each story run, if necessary.

    For each story run, takes all trace files from individual trace agents
    and runs trace2html on them. This is done only once, subsequent calls to
    this function will not do anything.

    TODO(crbug.com/981349): Remove this function entirely when trace
    serialization has been handed over to results processor.
    """
    assert not results.current_story_run, 'Cannot serialize traces while running.'

    def _GetCpuCount():
        try:
            return multiprocessing.cpu_count()
        except NotImplementedError:
            # Some platforms can raise a NotImplementedError from cpu_count()
            logging.warning('cpu_count() not implemented.')
            return 8

    available_runs = list(run for run in results.IterRunsWithTraces())
    if not available_runs:
        return

    # Note that this is speculatively halved as an attempt to fix
    # crbug.com/953365. Integer division keeps the thread count an int.
    threads_count = min(_GetCpuCount() // 2 or 1, len(available_runs))
    pool = ThreadPool(threads_count)
    try:
        for _ in pool.imap_unordered(_SerializeHtmlTraceInPool, available_runs):
            pass
    finally:
        pool.terminate()
        pool.join()
def load(cls, docs, ignore_errors=False):
    """Force load the provided docs to read from file system."""
    if not docs:
        return
    pod = docs[0].pod

    def load_func(doc):
        """Force the doc to read the source file."""
        try:
            # pylint: disable=pointless-statement
            doc.has_serving_path()  # Using doc fields forces file read.
        except document_front_matter.BadFormatError:
            if not ignore_errors:
                raise

    with pod.profile.timer('DocsLoader.load'):
        if ThreadPool is None or len(docs) < cls.MIN_POOL_COUNT:
            for doc in docs:
                load_func(doc)
            return
        pool_size = min(cls.MAX_POOL_SIZE, len(docs) * cls.POOL_RATIO)
        pool_size = int(round(pool_size))
        thread_pool = ThreadPool(pool_size)
        results = thread_pool.imap_unordered(load_func, docs)
        # Loop results to make sure that the threads are all processed.
        for _ in results:
            pass
        thread_pool.close()
        thread_pool.join()
def _download_all(items):
    """Async download of the files.

    Example: [(url, quality, file_path)]
    """
    global WORKERS
    # Don't start more workers than 1:1
    if WORKERS < len(items):
        WORKERS = len(items)

    pool = ThreadPool(WORKERS)
    chunks = 1  # TODO
    # 1 ffmpeg is normally 10x- 20x * 2500kbits ish
    # so depending on how many items you download and
    # your bandwidth you might need to tweak chunk

    results = pool.imap_unordered(dl, items, chunks)
    try:
        for j in tqdm.tqdm(results, total=len(items)):
            pass
    finally:
        pool.close()
        pool.join()
def _maybe_convert_set(extracted_dir, source_csv, target_csv): print() if path.exists(target_csv): print('Found CSV file "%s" - not importing "%s".' % (target_csv, source_csv)) return print('No CSV file "%s" - importing "%s"...' % (target_csv, source_csv)) samples = [] with open(source_csv) as source_csv_file: reader = csv.DictReader(source_csv_file) for row in reader: samples.append((row['filename'], row['text'])) # Mutable counters for the concurrent embedded routine counter = { 'all': 0, 'too_short': 0, 'too_long': 0 } lock = RLock() num_samples = len(samples) rows = [] def one_sample(sample): mp3_filename = path.join(*(sample[0].split('/'))) mp3_filename = path.join(extracted_dir, mp3_filename) # Storing wav files next to the mp3 ones - just with a different suffix wav_filename = path.splitext(mp3_filename)[0] + ".wav" _maybe_convert_wav(mp3_filename, wav_filename) frames = int(subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT)) file_size = path.getsize(wav_filename) with lock: if int(frames/SAMPLE_RATE*1000/10/2) < len(str(sample[1])): # Excluding samples that are too short to fit the transcript counter['too_short'] += 1 elif frames/SAMPLE_RATE > MAX_SECS: # Excluding very long samples to keep a reasonable batch-size counter['too_long'] += 1 else: # This one is good - keep it for the target CSV rows.append((wav_filename, file_size, sample[1])) counter['all'] += 1 print('Importing mp3 files...') pool = Pool(cpu_count()) bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR) for i, _ in enumerate(pool.imap_unordered(one_sample, samples), start=1): bar.update(i) bar.update(num_samples) pool.close() pool.join() print('Writing "%s"...' % target_csv) with open(target_csv, 'w') as target_csv_file: writer = csv.DictWriter(target_csv_file, fieldnames=FIELDNAMES) writer.writeheader() bar = progressbar.ProgressBar(max_value=len(rows), widgets=SIMPLE_BAR) for filename, file_size, transcript in bar(rows): writer.writerow({ 'wav_filename': filename, 'wav_filesize': file_size, 'transcript': transcript }) print('Imported %d samples.' % (counter['all'] - counter['too_short'] - counter['too_long'])) if counter['too_short'] > 0: print('Skipped %d samples that were too short to match the transcript.' % counter['too_short']) if counter['too_long'] > 0: print('Skipped %d samples that were longer than %d seconds.' % (counter['too_long'], MAX_SECS))
def upfront_scan_hosts(self, hosts, command_label):
    logger.verbose("scan_hosts() - command label : " + command_label)
    pool = ThreadPool(self.args.threadPool)
    self.phase_commands = []
    nmap_path = os.path.join(self.args.outputFolder, __nmap_folder__)
    # Check folder for existing service
    if not os.path.exists(nmap_path):
        os.makedirs(nmap_path)
    for host in hosts:
        command_keys = {
            'output': os.path.join(nmap_path, command_label.replace(" ", "_") + "_" + host.strip().replace(".", "_")),
            'target': host.strip()}
        command = self.prepare_command(command_label, command_keys)
        base, filename = os.path.split(command_keys['output'])
        # Resume file already exists
        if not self.args.noResume and len(self.find_files(base, filename + ".*")) > 0:
            logger.verbose("scan_hosts() - RESUME - output file already exists: " + command_keys['output'])
        else:
            self.phase_commands.append(command)
            logger.debug("scan_hosts() - command : " + command)
    # results = pool.map(self.execute_scan, self.phase_commands)
    for _ in bar(pool.imap_unordered(self.execute_command, self.phase_commands), expected_size=len(self.phase_commands)):
        pass
    pool.close()
    pool.join()
def check_vm_connectivity(env, os_conn, vm_keypair=None, timeout=4 * 60):
    """Check that all vms can ping each other and public ip"""
    ping_plan = {}
    exc = []

    def check(args):
        server, ips_to_ping = args
        try:
            check_ping_from_vm(env, os_conn, server, vm_keypair, ips_to_ping, timeout=timeout)
        except AssertionError as e:
            return e

    servers = os_conn.get_servers()
    for server1 in servers:
        ips_to_ping = [settings.PUBLIC_TEST_IP]
        for server2 in servers:
            if server1 == server2:
                continue
            ips_to_ping += os_conn.get_nova_instance_ips(server2).values()
        ping_plan[server1] = ips_to_ping
    p = Pool(len(ping_plan))
    for result in p.imap_unordered(check, ping_plan.items()):
        if result is not None:
            exc.append(result)
    if len(exc) > 0:
        raise MultipleAssertionErrors(exc)
def get_all_packages(save_to, batch_size=1000):
    num_exceptions = 0
    collected_data = []
    pool = ThreadPool(multiprocessing.cpu_count())
    file_batches = _batch_list(batch_size, all_packages)

    print('Getting all packages...')
    for batch_results, n_exceptions in tqdm(
            pool.imap_unordered(_get_packages_worker, file_batches),
            total=len(all_packages) // batch_size + 1):
        # Add to global list
        collected_data.extend(batch_results)
        num_exceptions += n_exceptions
    pool.close()
    pool.join()

    # Log and save at completion
    print(f'There were {num_exceptions} exceptions out of {len(all_packages)} requests.')
    print(f'Saving data to /data...')
    with open(save_to, 'w', encoding='utf-8') as f:
        ujson.dump(
            {
                "data": collected_data,
                "timestamp": time.time(),
                "pypi_api_url": api_url + "/<PACKAGE_NAME>/json",
            }, f)
    print(f'Saved data to: {save_to}')
def _initialize_len(self):
    """ """
    print("Initializing Stream")
    if self.jobs == 1:
        lengths = list(
            tqdm(map(self._get_len, enumerate(self.filenames)),
                 total=len(self.filenames),
                 file=sys.stdout,
                 desc="Post Counter"))
    else:
        mp = Threads(self.jobs)
        lengths = list(
            tqdm(mp.imap_unordered(self._get_len, enumerate(self.filenames)),
                 total=len(self.filenames),
                 file=sys.stdout,
                 desc="Post Counter"))
        _ = mp.close()
    if self.kind == "post":
        self.len_ = sum([i[1] for i in lengths])
    else:
        self.len_ = len([i for i in lengths if i[1] > 0])
    self.filenames = [self.filenames[i[0]] for i in lengths if i[1] > 0]
def add_items_concurrently(users):
    print("Sending add item requests...")
    p = Pool(len(users))
    start = time.time()
    for response in p.imap_unordered(add_item, users):
        print("{} (Time elapsed: {}s)".format(response, int(time.time() - start)))
def login_concurrently(users):
    print("Sending login requests...")
    p = Pool(len(users))
    start = time.time()
    for response in p.imap_unordered(login, users):
        print("{} (Time elapsed: {}s)".format(response, int(time.time() - start)))
def processJobs(jobs, concurrentTasks, sortOutput=False):
    job_count = len(jobs)
    logging.info("Processing {} job(s) with a concurrency of {}".format(
        job_count, concurrentTasks))
    if RANDOMIZE_JOBS:
        shuffle(jobs)
    pool = Pool(concurrentTasks)
    try:
        job_progress = 0
        for x in tqdm(pool.imap_unordered(worker, jobs), total=len(jobs)):
            job_progress += 1
            logging.info("{} out of {} staged jobs remaining".format(
                job_count - job_progress, job_count))
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        printAndLog(
            "\nReceived keyboard interrupt. Cleaning up and exiting...")
        pool.terminate()
        cleanup()
        sys.exit(1)
    except SystemExit:
        pool.terminate()
        sys.exit(1)
    if sortOutput:
        cleanup()
    print("\n")
def apply_tokenizer(filenames, cache_dir, min_n=1, max_n=1, min_date=None, max_date=None, remove_retweets=False, jobs=4):
    """ """
    ## Tokenizer
    helper = partial(load_and_tokenize,
                     min_n=min_n,
                     max_n=max_n,
                     min_date=min_date,
                     max_date=max_date,
                     remove_retweets=remove_retweets,
                     cache_dir=cache_dir,
                     pretokenized=False)
    ## Initialize Pool
    mp = Pool(jobs)
    filenames = list(
        tqdm(mp.imap_unordered(helper, filenames),
             desc="Tokenizer",
             total=len(filenames),
             file=sys.stdout))
    _ = mp.close()
    ## Filename Map
    filenames = dict((y, x) for x, y in filenames)
    ## Return Filenames
    return filenames
def scrape(array, function, threads): # Define the number of threads pool = ThreadPool(threads) # Tell the user what is happening print( f"Scraping {len(array)} items using {function} on {threads} threads.") # Calls function() and adds the filesize returned each call to an array called filesizes result = (pool.imap_unordered(function, array)) pool.close() # Display progress as the scraper runs its processes while (len(array) > 1): completed = result._index # Break out of the loop if all tasks are done or if there is only one task if (completed == len(array)): sys.stdout.flush() sys.stdout.write('\r' + "") sys.stdout.flush() break # Avoid a ZeroDivisionError if completed > 0: sys.stdout.flush() sys.stdout.write( '\r' + f"{completed/len(array)*100:.0f}% done. {len(array)-completed} left. " ) sys.stdout.flush() sys.stdout.flush() pool.join() return list(result)
def load(cls, docs):
    """Force load the provided docs to read from file system."""
    if not docs:
        return
    pod = docs[0].pod

    def load_func(doc):
        """Force the doc to read the source file."""
        # pylint: disable=pointless-statement
        doc.has_serving_path()  # Using doc fields forces file read.

    with pod.profile.timer('DocsLoader.load'):
        if ThreadPool is None or len(docs) < cls.MIN_POOL_COUNT:
            for doc in docs:
                load_func(doc)
            return
        pool_size = min(cls.MAX_POOL_SIZE, len(docs) * cls.POOL_RATIO)
        pool_size = int(round(pool_size))
        thread_pool = ThreadPool(pool_size)
        results = thread_pool.imap_unordered(load_func, docs)
        # Loop results to make sure that the threads are all processed.
        for _ in results:
            pass
        thread_pool.close()
        thread_pool.join()
def main(args): print(args) pool = Pool() protein_name = os.path.splitext(os.path.basename(args.file))[0] with open(args.settings, 'rb') as f: reader = csv.reader(f, delimiter=';') header = reader.next() col_idx = dict(itertools.izip(header, xrange(len(header)))) # Now we can get a column index by name: `col_idx['Age']` settings_list = [row for row in reader] commands = list() for row in settings_list: dab_shift = int(row[col_idx['DAB shift']]) hem_shift = int(row[col_idx['HEM shift']]) fileout = os.path.join(args.out, protein_name + "_d%d-h%d.csv" % (dab_shift, hem_shift)) shstr = "python2 cli_hpa.py %s %f --dab-shift %d --hem-shift %d --mp-disable --quiet --out %s" % ( args.file, args.scale, dab_shift, hem_shift, fileout) commands.append(shstr) print(commands) # quit() for i, returncode in enumerate(pool.imap_unordered(partial(subprocess.call, shell=True), commands)): print("Let's play! %d" % i) if returncode != 0: print("%d command failed: %d" % (i, returncode))
def _maybe_convert_set(target_csv): def one_sample(sample): if is_audio_file(sample): sample = os.path.join(target_csv, sample) y, sr = librosa.load(sample, sr=16000) # Trim the beginning and ending silence yt, index = librosa.effects.trim(y) # pylint: disable=unused-variable duration = librosa.get_duration(yt, sr) if duration > MAX_SECS or duration < MIN_SECS: os.remove(sample) else: librosa.output.write_wav(sample, yt, sr) samples = sorted(os.listdir(target_csv)) num_samples = len(samples) print(f"Converting wav files to {SAMPLE_RATE}hz...") pool = Pool(cpu_count()) bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR) for i, _ in enumerate(pool.imap_unordered(one_sample, samples), start=1): bar.update(i) bar.update(num_samples) pool.close() pool.join()
def main():
    start = 0
    end = 5000
    problems = download_tasks_with_tags()
    problems_batch = problems['result']['problems'][start:end]
    thread_pool = ThreadPool(10)
    list(thread_pool.imap_unordered(process_one_problem, problems_batch))
class Attack():
    def __init__(self, template, start, end, payload=""):
        self.template = template
        self.start = start
        self.end = end
        self.payload = payload
        self.range = int(end.bit_length() / 4)
        self.pool = Pool(5)

    def hex(self, num):
        num = f"{num:x}"
        return str(num).zfill(self.range)

    def f(self, l):
        t = self.template.format(self.hex(l))
        x = gzdeflate(unhexlify(t))
        return (x, t)

    def attack(self):
        for result in self.pool.imap_unordered(self.f, range(self.start, self.end + 1)):
            if self.payload.encode() in result[0]:
                print("{},{}".format(self.payload, result[1]))
                return result[1]
def download_all(self, threads=32) -> None: '''Handles multiprocessing using ThreadPool; sends items from a list to a function and gets the results as a list''' pool = ThreadPool(threads) lst = self.get_urls() print( f"Downloading {len(lst)} items using {self.download_zip} in {threads} processes." ) result = (pool.imap_unordered(self.download_zip, lst)) pool.close() # Display progress as the scraper runs its processes while (len(lst) > 1): completed = result._index # Break out of the loop if all tasks are done or if there is only one task if (completed == len(lst)): sys.stdout.flush() sys.stdout.write('\r' + "") sys.stdout.flush() break # Avoid a ZeroDivisionError if completed > 0: sys.stdout.flush() sys.stdout.write( '\r' + f"{completed/len(lst)*100:.0f}% done. {len(lst)-completed} left. " ) sys.stdout.flush() sys.stdout.flush() pool.join() return list(result)
def render_images(self, annotations, predictions, images_dir): """Runs render script to render images and store them into images_dir Args: annotations (list of tuples: (formula, file_idx, folder_path)): Ground-truth formula predictions (list of tuples: (formula, file_idx, folder_path)): Predicted formula """ out_path_gold = os.path.join(images_dir, 'images_gold') out_path_pred = os.path.join(images_dir, 'images_pred') for dir_ in [out_path_gold, out_path_pred]: if not os.path.exists(dir_): os.makedirs(dir_) annotations = [(elem[0], elem[1], out_path_gold) for elem in annotations] predictions = [(elem[0], elem[1], out_path_pred) for elem in predictions] lines = annotations + predictions print('Creating render pool with {} threads'.format(self.num_threads)) pool = ThreadPool(self.num_threads) print('Jobs running...') pairs_images_rendered = 0 for num, _ in enumerate(pool.imap_unordered(render_routine, lines)): if num % (PRINT_FREQ * 2) == 0 and num != 0: pairs_images_rendered += PRINT_FREQ # 2x PRINT_FREQ because images are rendered by pairs (original + predicted) print('{} / {} images rendered'.format(pairs_images_rendered, len(lines) // 2)) print('All images rendered') pool.close() pool.join() return out_path_gold, out_path_pred
def ComputeTimelineBasedMetrics(self):
    assert not self._current_page_run, 'Cannot compute metrics while running.'

    def _GetCpuCount():
        try:
            return multiprocessing.cpu_count()
        except NotImplementedError:
            # Some platforms can raise a NotImplementedError from cpu_count()
            logging.warning('cpu_count() not implemented.')
            return 8

    runs_and_values = self._FindRunsAndValuesWithTimelineBasedMetrics()
    if not runs_and_values:
        return

    # Note that this is speculatively halved as an attempt to fix
    # crbug.com/953365. Integer division keeps the thread count an int.
    threads_count = min(_GetCpuCount() // 2 or 1, len(runs_and_values))
    pool = ThreadPool(threads_count)
    try:
        for result in pool.imap_unordered(_ComputeMetricsInPool, runs_and_values):
            self._AddPageResults(result)
    finally:
        pool.terminate()
        pool.join()
def _http_requests_pool(self, urls, workers=10, chunk=None):
    """Generator function to request urls in chunks"""
    # From cpython
    if chunk is None:
        chunk, extra = divmod(len(urls), workers * 4)
        if extra:
            chunk += 1
        if len(urls) == 0:
            chunk = 0
    if len(urls) == 1:
        yield self._http_requests_single(urls[0])
    else:
        pool = ThreadPool(workers)
        try:
            for work in pool.imap_unordered(self._http_requests_single, urls, chunk):
                yield work
        except Exception as e:
            if not self._silent:
                logger.error("Failed to yield request: %s" % e)
        finally:
            pool.close()
            pool.join()
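# Illustrative sketch (not part of the original source): the "From cpython" chunk
# heuristic above mirrors Pool._map_async, aiming for roughly four chunks per worker.
# This hypothetical helper isolates the same arithmetic so the resulting chunksize
# can be checked on its own.
def _suggest_chunksize(n_urls, workers=10):
    chunk, extra = divmod(n_urls, workers * 4)
    if extra:
        chunk += 1
    if n_urls == 0:
        chunk = 0
    return chunk

# Example: 100 urls with 10 workers -> divmod(100, 40) == (2, 20), so chunksize 3.
assert _suggest_chunksize(100, workers=10) == 3
assert _suggest_chunksize(0) == 0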
def render_images(self, annotations, predictions, images_dir): """Runs render script to render images and store them into images_dir Args: annotations (str): Ground-truth formula predictions (str): Predicted formula """ out_path_gold = os.path.join(images_dir, 'images_gold') out_path_pred = os.path.join(images_dir, 'images_pred') for dir_ in [out_path_gold, out_path_pred]: if not os.path.exists(dir_): os.makedirs(dir_) lines_gold = [(ann.label, ann.identifier, out_path_gold) for ann in annotations] lines_pred = [(pred.label, pred.identifier, out_path_pred) for pred in predictions] lines = lines_gold + lines_pred logging.info('Creating render pool with %s threads', self.num_threads) pool = ThreadPool(self.num_threads) logging.info('Jobs running...') pairs_images_rendered = 0 for num, _ in enumerate(pool.imap_unordered(render_routine, lines)): if num % (PRINT_FREQ * 2) == 0 and num != 0: pairs_images_rendered += PRINT_FREQ # 2x PRINT_FREQ because images are rendered by pairs (original + predicted) print_info("{} / {} images rendered".format( pairs_images_rendered, len(lines) // 2)) print_info("All images rendered") pool.close() pool.join()
def run(self):
    mkdir_p(self.intermediate_folder)
    mkdir_p(self.output_folder)
    color_values = self._extract_colors()
    self.logger.debug('Found {} unique colors: {}'.format(
        len(color_values), color_values))
    manifest = {}

    def render_color(color):
        file_name = self._export_stl(color)
        manifest[file_name] = ColoredStlExporter.parse_openscad_color(color)

    pool = Pool()
    for _ in pool.imap_unordered(render_color, color_values):
        # Consume results as they occur so any exception is rethrown
        pass
    pool.close()
    pool.join()

    with open(os.path.join(self.output_folder, 'manifest.json'), 'wb') as f:
        f.write(json.dumps(manifest, indent=4))
def get_durations(paths, print_detail=True):
    duration_all = 0
    duration_book = defaultdict(list)

    pool = Pool()
    iterator = pool.imap_unordered(get_duration, paths)
    for dataset, duration in tqdm(iterator, total=len(paths)):
        duration_all += duration
        duration_book[dataset].append(duration)

    total_count = 0
    for book, duration in duration_book.items():
        if book:
            time = second_to_hour(sum(duration))
            file_count = len(duration)
            total_count += file_count
            if print_detail:
                print(" [*] Duration of {}: {} (file #: {})".format(
                    book, time, file_count))

    print(" [*] Total Duration : {} (file #: {})".format(
        second_to_hour(duration_all), total_count))
    print()
    return duration_all
def get_continue_cut_multiprocessing(self, data, multiprocessing_type=1 ): #默认1为多进程,其他为多线程 logging.info('多进程版-连续变量最优分组进行中。。。') self.save_data(data) if multiprocessing_type == 1: logging.info('已启用多进程,最优分箱进行中。。。') pool = Pool(multiprocessing.cpu_count()) # 设置进程数一般为cpu数量 else: logging.info('已启用多线程,最优分箱进行中。。。') pool = ThreadPool(multiprocessing.cpu_count() * 2) # 设置线程数一般为cpu的2倍 cols = [col for col, col_type in self.col_type if col_type == 1] # pool.imap_unordered(self.get_cut_all_not_null_multiprocessing, cols) for i in tqdm(pool.imap_unordered( self.get_cut_all_not_null_multiprocessing, cols), total=len(cols), leave=False): pass pool.close() pool.join() self.transform_cut_points_list() self.del_data() logging.info('多进程版-连续变量最优分组完成!')
def create_accounts_concurrently(users):
    print("Sending create account requests...")
    p = Pool(min(len(users), 5 * multiprocessing.cpu_count()))
    start = time.time()
    for response in p.imap_unordered(create_account, users):
        print("{} (Time elapsed: {}s)".format(response, int(time.time() - start)))
def vectorize_files(filenames, min_date=None, max_date=None, min_n=1, max_n=1, remove_retweets=False, jobs=4, pretokenized=False):
    """ """
    ## Initialize Helper
    vectorizer = partial(_vectorize_file,
                         min_date=min_date,
                         max_date=max_date,
                         min_n=min_n,
                         max_n=max_n,
                         remove_retweets=remove_retweets,
                         pretokenized=pretokenized)
    ## Vectorize using Multiprocessing
    mp = Pool(jobs)
    results = list(
        tqdm(mp.imap_unordered(vectorizer, filenames),
             total=len(filenames),
             desc="Vectorizing Files",
             file=sys.stdout))
    _ = mp.close()
    ## Parse Results
    filenames = [r[0] for r in results]
    X = vstack(r[1] for r in results)
    return filenames, X
def getoutput_ManyJobs(self, listOfJobids):
    """Waits for a job to complete and then returns its standard output
    and standard error data if the files were given default names.
    """
    pool = Pool()
    for (jobid, hasFinished) in pool.imap_unordered(
            self.waitUntilSignalOfEnd,
            tuple(jobid for jobid in listOfJobids)):
        if hasFinished:
            print('return logs of job', jobid, file=sys.stderr)
        else:
            print('job', jobid, 'has not finished, over max allowed running time', file=sys.stderr)
        signalOfEndFileName = self.signalOfEndFileName % str(jobid)
        try:
            os.remove(signalOfEndFileName)
        except:
            pass
        outFileName = self.outFileName % str(jobid)
        errFileName = self.errFileName % str(jobid)
        yield jobid, outFileName, errFileName
    os.remove(self.wrapperExecFileName)
def ComputeTimelineBasedMetrics(results):
    """Compute TBMv2 metrics on all story runs in parallel."""
    assert not results.current_story_run, 'Cannot compute metrics while running.'

    def _GetCpuCount():
        try:
            return multiprocessing.cpu_count()
        except NotImplementedError:
            # Some platforms can raise a NotImplementedError from cpu_count()
            logging.warning('cpu_count() not implemented.')
            return 8

    available_runs = list(run for run in results.IterRunsWithTraces()
                          if run.tbm_metrics)
    if not available_runs:
        return

    # Note that this is speculatively halved as an attempt to fix
    # crbug.com/953365. Integer division keeps the thread count an int.
    threads_count = min(_GetCpuCount() // 2 or 1, len(available_runs))
    pool = ThreadPool(threads_count)
    metrics_runner = lambda run: _ComputeMetricsInPool(
        run, results.label, results.upload_bucket)
    try:
        for result in pool.imap_unordered(metrics_runner, available_runs):
            results.AddMetricPageResults(result)
    finally:
        pool.terminate()
        pool.join()
def main():
    urls = [
        'http://www.python.org',
        'https://stackoverflow.com/',
        'https://css-tricks.com/',
        'https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference',
        'https://dev.twitter.com/',
        'https://d3js.org/',
        'https://www.heroku.com/',
        'https://docs.pytest.org/en/latest/',
        'https://www.djangoproject.com/',
        'https://pudding.cool/',
        'https://caniuse.com/',
        'http://svgpocketguide.com/book/',
        'https://www.w3.org/TR/SVG/intro.html',
    ]
    pool = Pool()
    start = time.time()
    for x, y in pool.imap_unordered(url_name, urls):
        index = urls.index(y)
        log.info("{}s (sleep: {}) (#{} in array) for {})".format(
            int(time.time() - start), x, index, y))
    pool.close()
    pool.join()
def parse_page(self, html): """ Parse the log-in page and extract links. Args: html: log-in page content """ parser = LinkParser() parser.feed(html) if not len(LINKS): print '\n no links extracted from log-in page\n' sys.exit(1) print '\n %i links found in first page ...' % len(LINKS) start = timer() pool = ThreadPool(NUMBER_THREADS) results = pool.imap_unordered(self.parse_page_links, LINKS) pool.close() pool.join() print '\n %i links found in spidered pages' % len(LINKS) print '\n link searches: %s secs\n' % str.format( '{0:.3f}', (timer() - start))
def download_external_resources(container, urls, timeout=60, progress_report=lambda url, done, total: None): failures = {} replacements = {} data_uri_map = {} with TemporaryDirectory('editor-download') as tdir: pool = Pool(10) with closing(pool): for ok, result in pool.imap_unordered( partial(download_one, tdir, timeout, progress_report, data_uri_map), urls): if ok: url, suggested_filename, downloaded_file, mt = result with lopen(downloaded_file, 'rb') as src: name = container.add_file(suggested_filename, src, mt, modify_name_if_needed=True) replacements[url] = name else: url, err = result failures[url] = err return replacements, failures
def _general_processor(cls: Type[Artwork],
                       item_ids: List[int]) -> Tuple[List[Artwork], List[int]]:
    util.log(texts.ARTWORK_ID_PROCESSING, start=os.linesep, inform=True)
    total = len(item_ids)
    successes = []
    fails = []
    pool = Pool()

    def process_item(item_id_):
        try:
            successes.append(cls(item_id_))
        except ArtworkError:
            fails.append(item_id_)

    for index, item_id in enumerate(
            pool.imap_unordered(process_item, item_ids), 1):
        util.print_progress(index, total, msg=texts.GUI_ID_PROCESSING_HEADING)
    msg = texts.ARTWORK_ID_PROCESS_RESULT.format(total=total,
                                                 successes=len(successes),
                                                 fails=len(fails))
    util.print_done(msg)
    return successes, fails
def _http_requests_pool(self, urls, workers=10, chunk=None): """Generator function to request urls in chunks""" # From cpython if chunk is None: chunk, extra = divmod(len(urls), workers * 4) if extra: chunk += 1 if len(urls) == 0: chunk = 0 if self.ssl_verify: session = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where()) else: urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) session = urllib3.PoolManager() part = partial(self._http_requests_urllib3, session=session) if len(urls) == 1: yield part(urls[0]) else: pool = ThreadPool(workers) try: for work in pool.imap_unordered(part, urls, chunk): yield work except Exception as e: logger.error(u"Failed to yield request: %s" % e) finally: pool.close() pool.join()
def process(lst: list, function, processes: int):
    """Handles multiprocessing using ThreadPool;
    sends items from a list to a function and gets the results as a list"""
    # Define the number of processes, use less than or equal to the defined value
    count_threads = min(processes, len(lst))
    if count_threads == 0:
        return []
    pool = ThreadPool(count_threads)
    # Tell the user what is happening
    print(f"Copying {len(lst)} items using {function} in {count_threads} processes.")
    # Calls function() and returns True for success and False for fail each call to a lst
    result = (pool.imap_unordered(function, lst))
    pool.close()
    # Display progress as the scraper runs its processes
    while (len(lst) > 1):
        completed = result._index
        # Break out of the loop if all tasks are done or if there is only one task
        if (completed == len(lst)):
            sys.stdout.flush()
            sys.stdout.write('\r' + "")
            sys.stdout.flush()
            break
        # Avoid a ZeroDivisionError
        if completed > 0:
            sys.stdout.flush()
            sys.stdout.write('\r' + f"{completed/len(lst)*100:.0f}% done. {len(lst)-completed} left. ")
            sys.stdout.flush()
    sys.stdout.flush()
    pool.join()
    return list(result)
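# Hedged usage sketch (not from the original project): `copy_item` and the job list
# are hypothetical; the worker only needs to accept one element of `lst` and return a
# truthy/falsy success flag, which `process` collects (in completion order) into a list.
import shutil

def copy_item(pair):
    src, dst = pair
    try:
        shutil.copy2(src, dst)
        return True
    except OSError:
        return False

if __name__ == '__main__':
    jobs = [('a.txt', 'backup/a.txt'), ('b.txt', 'backup/b.txt')]
    outcomes = process(jobs, copy_item, processes=4)
    print(outcomes)  # e.g. [True, False], order follows task completion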
def img_rescaler(dir_in, extension_in, threads=1): """ Import an image, rescale it to normal UBYTE (0-255, 8 bit) range, and re-save it. """ dir_out = os.path.join(dir_in, "rescaled") total_files = 0 for path, folder, filename in os.walk(dir_in): if dir_out not in path: for f in filename: if f.endswith(extension_in): total_files += 1 print("\nYou have {} images to analyze".format(total_files)) for path, folder, filename in os.walk(dir_in): if dir_out not in path: # Don't run in the output directory. # Make directory for saving objects subpath = path[len(dir_in)+1:] if not os.path.exists(os.path.join(dir_out, subpath)): os.mkdir(os.path.join(dir_out, subpath)) # What we'll do: global _core_fn # bad form for Pool.map() compatibility def _core_fn(filename): if filename.endswith(extension_in): # count progress. path_in = os.path.join(path, filename) subpath_in = os.path.join(subpath, filename) # for printing purposes path_out = os.path.join(dir_out, subpath, filename) if os.path.exists(path_out): #skip print("\nALREADY ANALYZED: {}. Skipping...\n".format(subpath_in)) else: #(try to) do it try: img = io.imread(path_in) # load image img = img_as_ubyte(img / np.max(img)) io.imsave(path_out, img) except: print("Couldn't analyze {}".format(subpath_in)) return() # run it sleep(1) # to give everything time to load thread_pool = Pool(threads) # Work on _core_fn (and give progressbar) tqdm.tqdm(thread_pool.imap_unordered(_core_fn, filename, chunksize=1), total=total_files) # finish thread_pool.close() thread_pool.join() return()
def calc_factorials(max_int=100, pool_size=8, threads=True, chunk_size=10):
    if threads:
        pool = ThreadPool(pool_size)
    else:
        pool = ProcessPool(pool_size)
    results = pool.imap_unordered(factorial_calc, range(max_int), chunk_size)
    return results
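# Hedged usage sketch (assumptions: `factorial_calc` is a one-argument worker like the
# stand-in below, ThreadPool is multiprocessing.dummy.Pool and ProcessPool is
# multiprocessing.Pool). imap_unordered is lazy, so the caller must iterate the
# returned iterator to actually drive the work; results arrive in completion order.
import math
from multiprocessing import Pool as ProcessPool
from multiprocessing.dummy import Pool as ThreadPool

def factorial_calc(n):
    return n, math.factorial(n)

if __name__ == '__main__':
    for n, value in calc_factorials(max_int=10, pool_size=4):
        print(n, value)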
def main(): input_filename, output_dir, n_threads = parse_args() if not os.path.isdir(output_dir): print("Output directory {} does not exist".format(output_dir)) sys.exit() with open(input_filename) as input_file: reader = csv.reader(input_file) header_row = next(reader) rows = list(reader) try: row_idx_image_id = header_row.index('ImageId') row_idx_url = header_row.index('URL') row_idx_x1 = header_row.index('x1') row_idx_y1 = header_row.index('y1') row_idx_x2 = header_row.index('x2') row_idx_y2 = header_row.index('y2') except ValueError as e: print('One of the columns was not found in the source file: ', e.message) rows = [(row[row_idx_image_id], row[row_idx_url], float(row[row_idx_x1]), float(row[row_idx_y1]), float(row[row_idx_x2]), float(row[row_idx_y2])) for row in rows] if n_threads > 1: pool = ThreadPool(n_threads) partial_get_images = partial(get_image, output_dir=output_dir) for i, _ in enumerate(pool.imap_unordered(partial_get_images, rows), 1): sys.stderr.write('\rDownloaded {0} images'.format(i + 1)) pool.close() pool.join() else: failed_to_download = set() for idx in range(len(rows)): row = rows[idx] if not download_image(image_id=row[0], url=row[1], x1=float(row[2]), y1=float(row[3]), x2=float(row[4]), y2=float(row[5]), output_dir=output_dir): failed_to_download.add(row[row_idx_image_id]) sys.stdout.write('\rDownloaded {0} images'.format(idx + 1)) sys.stdout.flush() print() if failed_to_download: print('\nUnable to download images with the following IDs:') for image_id in failed_to_download: print(image_id)
def run_task_multi_thread(action_function, files, action_label, nb_threads=2, offset=0):
    """Run given action on every file using a threading pool.

    It uses a progress bar instead of a usual verbose log.
    """
    pool = Pool(processes=nb_threads)
    items = [(file, action_function) for file in files[offset:]]
    pool_iterable = pool.imap_unordered(run_single_task, items)
    progress_bar_items = tqdm(total=len(items),
                              iterable=pool_iterable,
                              unit='images',
                              desc='{0: <30}'.format(action_label))
    for item in progress_bar_items:
        pass
def match(bot, opponent):
    # List of match's results
    results = []
    # List of matches to perform
    matches = [[bot, opponent]] * args.count
    # Threads the matches and collect results
    pool = Pool(args.threads)
    for match in pool.imap_unordered(perform, matches):
        results.append(match)
    return stat_create(results)
class ScannerPool:
    # @classmethod
    # def getPool(cls):
    #     if "pool" not in cls.__dict__ or cls.pool is None:
    #         logger.info("Threads pool created with %d threads" % THREAD_NUMBER)
    #         cls.pool = Pool(THREAD_NUMBER)
    #     return cls.pool

    def __init__(self):
        self.pool = Pool(THREAD_NUMBER)

    def map(self, *args, **kwargs):
        return self.pool.imap_unordered(*args, **kwargs)
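# Hedged usage sketch (not from the original codebase): THREAD_NUMBER and `scan_host`
# are hypothetical stand-ins for whatever the surrounding module defines.
# ScannerPool.map forwards to imap_unordered, so it yields results lazily and in
# completion order rather than input order.
THREAD_NUMBER = 8

def scan_host(host):
    # Placeholder worker; a real scanner would probe the host here.
    return host, 'ok'

if __name__ == '__main__':
    scanner = ScannerPool()
    for host, status in scanner.map(scan_host, ['10.0.0.1', '10.0.0.2']):
        print(host, status)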
def build_common(out_name='common.a', build_dir='temp_build/temp_build', num_parallel=1): compiler = os.environ.get('CXX', 'g++') ar = os.environ.get('AR', 'ar') libtool = os.environ.get('LIBTOOL', 'libtool') cflags = os.environ.get('CFLAGS', '') + os.environ.get('CXXFLAGS', '') for file in COMMON_FILES: outfile = os.path.join(build_dir, os.path.splitext(file)[0] + '.o') outdir = os.path.dirname(outfile) if not os.path.exists(outdir): print('mkdir', outdir) os.makedirs(outdir) def build_one(file): outfile = os.path.join(build_dir, os.path.splitext(file)[0] + '.o') if os.path.exists(outfile): return cmd = '{cc} -fPIC -c {cflags} {args} {includes} {infile} -o {outfile}'.format( cc=compiler, cflags=cflags, args=' '.join(ARGS), includes=' '.join('-I' + i for i in INCLUDES), infile=file, outfile=outfile, ) print(cmd) subprocess.check_call(shlex.split(cmd)) return outfile pool = Pool(num_parallel) obj_files = list(pool.imap_unordered(build_one, COMMON_FILES)) if sys.platform.startswith('darwin'): cmd = '{libtool} -static -o {outfile} {infiles}'.format( libtool=libtool, outfile=out_name, infiles=' '.join(obj_files), ) print(cmd) subprocess.check_call(shlex.split(cmd)) else: cmd = '{ar} rcs {outfile} {infiles}'.format( ar=ar, outfile=out_name, infiles=' '.join(obj_files) ) print(cmd) subprocess.check_call(shlex.split(cmd))
def download_external_resources(container, urls, timeout=60, progress_report=lambda url, done, total: None):
    failures = {}
    replacements = {}
    with TemporaryDirectory('editor-download') as tdir:
        pool = Pool(10)
        with closing(pool):
            for ok, result in pool.imap_unordered(partial(download_one, tdir, timeout, progress_report), urls):
                if ok:
                    url, suggested_filename, downloaded_file, mt = result
                    with lopen(downloaded_file, 'rb') as src:
                        name = container.add_file(suggested_filename, src, mt, modify_name_if_needed=True)
                    replacements[url] = name
                else:
                    url, err = result
                    failures[url] = err
    return replacements, failures
def test_threaded(self): # add three more short subchains for threads to test on for ident in 'ghijklmno': obj = make_mock_relationship('test_db', 'schema', ident) self.cache.add(make_relation('dbt', 'schema', ident)) self.cache.add_link(make_relation('dbt', 'schema', 'a'), make_relation('dbt', 'schema', 'g')) self.cache.add_link(make_relation('dbt', 'schema', 'g'), make_relation('dbt', 'schema', 'h')) self.cache.add_link(make_relation('dbt', 'schema', 'h'), make_relation('dbt', 'schema', 'i')) self.cache.add_link(make_relation('dbt', 'schema', 'a'), make_relation('dbt', 'schema', 'j')) self.cache.add_link(make_relation('dbt', 'schema', 'j'), make_relation('dbt', 'schema', 'k')) self.cache.add_link(make_relation('dbt', 'schema', 'k'), make_relation('dbt', 'schema', 'l')) self.cache.add_link(make_relation('dbt', 'schema', 'a'), make_relation('dbt', 'schema', 'm')) self.cache.add_link(make_relation('dbt', 'schema', 'm'), make_relation('dbt', 'schema', 'n')) self.cache.add_link(make_relation('dbt', 'schema', 'n'), make_relation('dbt', 'schema', 'o')) pool = ThreadPool(4) results = list(pool.imap_unordered(self._target, ('b', 'g', 'j', 'm'))) pool.close() pool.join() # at a minimum, we expect each table to "see" itself, its parent ('a'), # and the unrelated table ('a') min_expect = { 'b': {'a', 'b', 'e'}, 'g': {'a', 'g', 'e'}, 'j': {'a', 'j', 'e'}, 'm': {'a', 'm', 'e'}, } for ident, relations in results: seen = set(r.identifier for r in relations) self.assertTrue(min_expect[ident].issubset(seen)) self.assert_has_relations(set('abgjme'))
def render_rotation(output_folder, num_frames, start_frame, variables):
    def render_frame(i):
        angle = 135 + i * 360 / num_frames
        openscad.run(
            'splitflap.scad',
            os.path.join(output_folder, 'frame_%05d.png' % (start_frame + i)),
            output_size=[320, 240],
            camera_translation=[0, 0, 0],
            camera_rotation=[60, 0, angle],
            camera_distance=600,
            variables=variables,
            colorscheme='Nature',
        )

    pool = Pool()
    for _ in pool.imap_unordered(render_frame, range(num_frames)):
        # Consume results as they occur so any exception is rethrown
        pass
    pool.close()
    pool.join()
def getoutput_ManyJobs(self, listOfJobids): """Waits for a job to complete and then returns its standard output and standard error data if the files were given default names. """ pool = Pool() for (jobid, hasFinished) in pool.imap_unordered(self.waitUntilSignalOfEnd, tuple(jobid for jobid in listOfJobids)): if hasFinished: print >> sys.stderr, 'return logs of job', jobid else: print >> sys.stderr, 'job', jobid, 'has not finished, over max allowed running time' signalOfEndFileName = self.signalOfEndFileName % str(jobid) try: os.remove(signalOfEndFileName) except: pass outFileName = self.outFileName % str(jobid) errFileName = self.errFileName % str(jobid) yield jobid, outFileName, errFileName os.remove(self.wrapperExecFileName)
def run(self): mkdir_p(self.intermediate_folder) mkdir_p(self.output_folder) color_values = self._extract_colors() self.logger.debug('Found {} unique colors: {}'.format(len(color_values), color_values)) manifest = {} def render_color(color): file_name = self._export_stl(color) manifest[file_name] = ColoredStlExporter.parse_openscad_color(color) pool = Pool() for _ in pool.imap_unordered(render_color, color_values): # Consume results as they occur so any exception is rethrown pass pool.close() pool.join() with open(os.path.join(self.output_folder, 'manifest.json'), 'wb') as f: f.write(json.dumps(manifest, indent=4))
# http://stackoverflow.com/questions/16675803/learning-python-and-threading-i-think-my-code-runs-infinitely-help-me-find-bug
import json
import urllib2
from collections import Counter
from multiprocessing.dummy import Pool  # use threads
import time

def get_name(url):
    try:
        return json.load(urllib2.urlopen(url))['gender']
    except Exception:
        return None  # error

start = time.time()
urls = ('http://graph.facebook.com/%d' % i for i in xrange(200))
p = Pool(5)  # 5 concurrent connections
first_names = Counter(p.imap_unordered(get_name, urls))
print first_names.most_common()
print 'It took %s s' % (time.time() - start)
for d in [1,2]: if int(run_log['hv dia%d'%d]) < 0: bias+='M' else: bias+='P' conf = ' -c %s/conf/converter%s.conf'%(pwd,bias) out = ' -o /data/psi_2015_05/root/run%d.root'%run cmd += inp + conf + out print cmd commands.append((run,cmd)) exit pool = Pool(nProcesses) it = pool.imap_unordered(partial(call, shell=True), [c[1] for c in commands]) failures = [] complete = [] for i, returncode in enumerate(it): # print multiprocessing.active_children() if returncode != 0: print("Command '%s' failed: %d" % (commands[i], returncode)) failures.append(commands[i][0]) else: complete.append(commands[i][0]) print("Command '%s' completed: %d" % (commands[i], returncode)) print 'completed:',complete print 'Failures:',failures
# Example function that takes a record and returns some components of the genbank accessions
def get_record(record):
    try:
        handle = Entrez.efetch(db="protein", id=record, retmode="xml")
        record = Entrez.read(handle)
        organism = record[0]["GBSeq_source"]
        taxon = record[0]["GBSeq_taxonomy"]
    except:
        return record, 'error'
    return organism, taxon

# For counting iterations
z = 0
total = len(recordList)

# Pool(n) will return n separate threads.
pool = Pool(20)  # at most 20 concurrent downloads

# Open a file for writing:
with open("/Users/jimbo/Desktop/example.txt", "wb") as f:
    # Call imap_unordered on the pool of processors you opened, and pass a function(x) and a list of x's
    for org, tax in pool.imap_unordered(get_record, recordList):
        # Write output line by line as results come in from pool.
        f.write(org + "\t" + tax + "\n")
        # Increment the counter so the status message below stays accurate
        z += 1
        # Interactive output to check status
        if z % 1000 == 0:
            print '{0} down, {1} to go'.format(z, total - z)
def _maybe_convert_set(audio_dir, input_tsv): output_csv = path.join(audio_dir,os.path.split(input_tsv)[-1].replace('tsv', 'csv')) print("Saving new DeepSpeech-formatted CSV file to: ", output_csv) # Get audiofile path and transcript for each sentence in tsv samples = [] with open(input_tsv) as input_tsv_file: reader = csv.DictReader(input_tsv_file, delimiter='\t') for row in reader: samples.append((row['path'], row['sentence'])) # Keep track of how many samples are good vs. problematic counter = { 'all': 0, 'too_short': 0, 'too_long': 0 } lock = RLock() num_samples = len(samples) rows = [] def one_sample(sample): """ Take a audio file, and optionally convert it to 16kHz WAV """ mp3_filename = path.join(audio_dir, sample[0]) if not path.splitext(mp3_filename.lower())[1] == '.mp3': mp3_filename += ".mp3" # Storing wav files next to the mp3 ones - just with a different suffix wav_filename = path.splitext(mp3_filename)[0] + ".wav" _maybe_convert_wav(mp3_filename, wav_filename) frames = int(subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT)) file_size = path.getsize(wav_filename) with lock: if int(frames/SAMPLE_RATE*1000/10/2) < len(str(sample[1])): # Excluding samples that are too short to fit the transcript counter['too_short'] += 1 elif frames/SAMPLE_RATE > MAX_SECS: # Excluding very long samples to keep a reasonable batch-size counter['too_long'] += 1 else: # This one is good - keep it for the target CSV rows.append((wav_filename, file_size, sample[1])) counter['all'] += 1 print("Importing mp3 files...") pool = Pool(cpu_count()) bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR) for i, _ in enumerate(pool.imap_unordered(one_sample, samples), start=1): bar.update(i) bar.update(num_samples) pool.close() pool.join() with open(output_csv, 'w') as output_csv_file: print('Writing CSV file for DeepSpeech.py as: ', output_csv) writer = csv.DictWriter(output_csv_file, fieldnames=FIELDNAMES) writer.writeheader() bar = progressbar.ProgressBar(max_value=len(rows), widgets=SIMPLE_BAR) for filename, file_size, transcript in bar(rows): writer.writerow({ 'wav_filename': filename, 'wav_filesize': file_size, 'transcript': transcript }) print('Imported %d samples.' % (counter['all'] - counter['too_short'] - counter['too_long'])) if counter['too_short'] > 0: print('Skipped %d samples that were too short to match the transcript.' % counter['too_short']) if counter['too_long'] > 0: print('Skipped %d samples that were longer than %d seconds.' % (counter['too_long'], MAX_SECS))
def main(dataset_dir, tests_file, only, host, port, user, password, client_opt='api', server_log=None, interval=30, force=False, concurrent=1, aggregate=False, verbose=0, just_print=False): setup_logging(verbose) if just_print: logging.warning( "This is a simulation," " no command will actually be performed!") # remove trailing `/` if present, otherwise # `basename(dataset_dir)` returns the empty string if dataset_dir.endswith('/'): dataset_dir = dataset_dir[:-1] # collect datasets to be tested test_params = load_test_params(dataset_dir, tests_file) datasets_to_test = collect_datasets(dataset_dir, test_params, only) if not datasets_to_test: abort("No datasets to test!") logging.info("Will test datasets: %r", datasets_to_test) existing_experiments = list_experiment_names(host, port, user, password) # actual code to run the tests def do_test_dataset(dataset_path): name, params = get_experiment_data(dataset_path, test_params) experiment_name = params.pop('name') params['client'] = client_opt client = make_client(client_opt, host, port, user, password, experiment_name) # check if an experiment with this name already exists if experiment_name in existing_experiments: if force: with timing( "deleting old experiment `{}` ..." .format(experiment_name)): if not just_print: delete_experiment(host, port, user, password, experiment_name) else: logging.error( "Experiment `%s` already exists." " Remove it before re-running this test.", experiment_name) return # start actual testing with _Testsuite(name, **params) as suite: run = Runner(suite, just_print, server_log) try: # create experiment run(client.create_experiment( params['workflow_type'], params['microscope_type'], params['plate_format'], params['plate_acquisition_mode'])) for plate in params['plates']: # create plate(s) run(client.create_plate(plate)) # create and upload acquisition(s) for acquisition in params['acquisitions'][plate]: acquisition_dir = join( dataset_path, 'plates', plate, 'acquisitions', acquisition) run(client.create_acquisition(plate, acquisition)) run(client.upload_microscope_files(plate, acquisition, acquisition_dir)) workflow_description_path = join(dataset_path, params.get('workflow_description_path', 'workflow_description.yaml')) run(client.upload_workflow_description_file(workflow_description_path)) jterator_project_path = join(dataset_path, params.get('jterator_project_path', 'jterator')) run(client.upload_jterator_project_files(jterator_project_path)) with suite.new_test_case("Running workflow") as case: if not just_print: run_workflow( case, host, port, user, password, experiment_name, server_log, interval) except Runner.Abort as err: logging.warning("%s", err) return suite # do (possibly) parallel processing report = Report(basename(dataset_dir)) proc = Pool(processes=(concurrent or None)) suites = proc.imap_unordered(do_test_dataset, datasets_to_test) errors = False for suite in suites: if not suite: # `do_test_dataset` errored out continue report.add_test_suite(suite) if not aggregate: if not just_print: write_junit_xml(report, tests_file, dataset_dir, suite.name) report.print_terminal_output() if report.errored > 0 or report.failed > 0: errors = True report.reset() if aggregate: if not just_print: write_junit_xml(report, tests_file, dataset_dir) report.print_terminal_output() if report.errored > 0 or report.failed > 0: errors = True report.reset() if errors: return 2 else: return 0
def mergeFilesByRegion(filesByRegion, grid, outputDir): # Merge a set of files by region into the specified dir # Key is up/down/nominal etc N = 0 filesToWrite = {} for r in filesByRegion: for key in filesByRegion[r]: if filesByRegion[r][key] == []: if key == "Nominal": print ("WARNING: no input files for region {0} key {1}".format(r, key)) continue filePrefix = "%s_%s" % (r, grid) filename = os.path.join(outputDir, "%s.root" % (filePrefix)) if os.path.exists(filename): print ("Output file {0} exists - skipping".format(os.path.basename(filename))) continue filesToWrite[filename] = {"region": r, "files": filesByRegion[r][key]} N += 1 # Got anything? if filesToWrite == {}: return # build the pool arguments args = [] for filename in filesToWrite: N -= 1 args.append((filename, filesToWrite[filename]["files"], False, filesToWrite[filename]["region"], N)) pool = ThreadPool(8, init_worker) try: # results = pool.map(mergeFiles, args) results = pool.imap_unordered(mergeFiles, args) pool.close() pool.join() except KeyboardInterrupt: print "Caught KeyboardInterrupt, terminating workers" pool.terminate() pool.join() return # Below is to be removed legacy code relying on hadd for r in filesByRegion: for key in filesByRegion[r]: if filesByRegion[r][key] == []: continue N -= 1 # Merge the files in chunks of 50, and then merge these chunks # The whole idea behind this exercise is to avoid exceeding the maximum length of # of a command allowed in bash. filePrefix = "%s_%s" % (r, grid) filename = os.path.join(outputDir, "%s.root" % (filePrefix)) if os.path.exists(filename): print ("Output file {0} exists - skipping".format(os.path.basename(filename))) continue mergeFiles(filename, filesByRegion[r][key]) # fileMerger = ROOT.TFileMerger() # fileMerger.OutputFile(filename) # for f in filesByRegion[r][key]: # fileMerger.AddFile(f) # fileMerger.Merge() # i=1 # print("Attempting to make file {0}".format(filename)) # for subset in chunks(filesByRegion[r][key], 50): # print("Merging subset {0:d}...".format(i)) # filename = os.path.join(outputDir, "%s_%03d.root" % (filePrefix, i) ) # outputFiles.append(filename) # # if len(subset) == 1: # shutil.copy(subset[0], filename) # else: # cmd = "hadd -f %s %s" % (filename, " ".join(subset)) # subprocess.call(cmd, shell=True) # # i+=1 # print("Merging all subsets") # filename = os.path.join(outputDir, "%s.root" % (filePrefix) ) # if len(outputFiles) == 1: # # only 1 file, so just rename it # os.rename(outputFiles[0], filename) # else: # cmd = "hadd -f %s %s" % (filename, " ".join(outputFiles)) # subprocess.call(cmd, shell=True) # print("Done merging subsets; removing temporary files") # for f in outputFiles: # if not os.path.exists(f): continue # os.remove(f) print ("=> Created file for {0}; {1} files remaining".format(r, N))
class RenderLocaleBatch(object): """Handles the rendering and threading of the controllers.""" BATCH_DEFAULT_SIZE = 300 # Default number of documents in a batch. def __init__(self, jinja_env, profile, tick=None, batch_size=None): self.batch_size = batch_size or self.BATCH_DEFAULT_SIZE self.jinja_env = jinja_env self.profile = profile self.tick = tick self.batches = [[]] self._is_loading = False self._is_rendering = False self._results = None self._thread_pool = None def __len__(self): count = 0 for batch in self.batches: count = count + len(batch) return count def _get_batch(self): # Ensure that batch is not over the max size. batch = self.batches[len(self.batches) - 1] if len(batch) >= self.batch_size: self.batches.append([]) batch = self.batches[len(self.batches) - 1] return batch def add(self, controller, *args, **kwargs): """Add an item to be rendered to the batch.""" batch = self._get_batch() batch.append({ 'controller': controller, 'jinja_env': self.jinja_env, 'args': args, 'kwargs': kwargs, }) def load_start(self, source_dir): """Start the batches loading.""" self._thread_pool = ThreadPool(len(self.batches)) self._results = self._thread_pool.imap_unordered( load_func, self.batches, source_dir) self._is_loading = True def load_finish(self): """Finish in progress batches loading.""" if not self._is_loading: raise RenderNotStartedError('Rendering was never started') load_errors = [] loaded_docs = [] for batch_result in self._results: load_errors = load_errors + batch_result.load_errors loaded_docs = loaded_docs + batch_result.loaded_docs if self.tick: for _ in batch_result.load_errors: self.tick() for _ in batch_result.loaded_docs: self.tick() for result in batch_result.loaded_docs: self.profile.add_timer(result.load_timer) self._thread_pool.close() self._thread_pool.join() self._is_loading = False return loaded_docs, load_errors def load_sync(self, source_dir): """Syncronous loading for non-threaded loading.""" load_errors = [] loaded_docs = [] for batch in self.batches: batch_result = load_func(batch, source_dir, tick=self.tick) load_errors = load_errors + batch_result.load_errors loaded_docs = loaded_docs + batch_result.loaded_docs return loaded_docs, load_errors def render_start(self): """Start the batches rendering.""" self._thread_pool = ThreadPool(len(self.batches)) self._results = self._thread_pool.imap_unordered( render_func, self.batches) self._is_rendering = True def render_finish(self): """Finish in progress batches rendering.""" if not self._is_rendering: raise RenderNotStartedError('Rendering was never started') render_errors = [] rendered_docs = [] for batch_result in self._results: render_errors = render_errors + batch_result.render_errors rendered_docs = rendered_docs + batch_result.rendered_docs if self.tick: for _ in batch_result.render_errors: self.tick() for _ in batch_result.rendered_docs: self.tick() for result in batch_result.rendered_docs: self.profile.add_timer(result.render_timer) self._thread_pool.close() self._thread_pool.join() self._is_rendering = False return rendered_docs, render_errors def render_sync(self): """Syncronous rendering for non-threaded rendering.""" render_errors = [] rendered_docs = [] for batch in self.batches: batch_result = render_func(batch, tick=self.tick) render_errors = render_errors + batch_result.render_errors rendered_docs = rendered_docs + batch_result.rendered_docs return rendered_docs, render_errors
sigscan = "std_T2tt" if len(sys.argv) > 2: sigscan = sys.argv[2] print "Doing limits from cards in ", carddir ext_cards = os.listdir(carddir) ext_cards = filter(lambda x : sigscan in x and "bin1.txt" in x, ext_cards) sigs = [ x[9:-9] for x in ext_cards] sigs = [ s for s in sigs if int(s.split('_')[2]) < 1500] pool = ThreadPool(40) cards = [] for combined in pool.imap_unordered(combine_cards, sigs): cards.append(combined) if not cards: cards = os.listdir(combineddir) cards = filter(lambda x : '.txt' in x and '.log' not in x, cards) cards = [combineddir+'/'+c for c in cards] # print cards os.system('mkdir -p '+limitdir) limits = [] for result in pool.imap_unordered(run_asymptotic, cards): limits.append(result) print limits
def _maybe_convert_sets(target_dir, extracted_data): extracted_dir = path.join(target_dir, extracted_data) # override existing CSV with normalized one target_csv_template = os.path.join(target_dir, ARCHIVE_DIR_NAME + '_' + ARCHIVE_NAME.replace('.zip', '_{}.csv')) if os.path.isfile(target_csv_template): return ogg_root_dir = os.path.join(extracted_dir, ARCHIVE_NAME.replace('.zip', '')) # Get audiofile path and transcript for each sentence in tsv samples = [] glob_dir = os.path.join(ogg_root_dir, '**/*.ogg') for record in glob(glob_dir, recursive=True): record_file = record.replace(ogg_root_dir + os.path.sep, '') samples.append((record_file, os.path.splitext(os.path.basename(record_file))[0])) # Keep track of how many samples are good vs. problematic counter = {'all': 0, 'failed': 0, 'invalid_label': 0, 'too_short': 0, 'too_long': 0} lock = RLock() num_samples = len(samples) rows = [] def one_sample(sample): """ Take a audio file, and optionally convert it to 16kHz WAV """ ogg_filename = path.join(ogg_root_dir, sample[0]) # Storing wav files next to the ogg ones - just with a different suffix wav_filename = path.splitext(ogg_filename)[0] + ".wav" _maybe_convert_wav(ogg_filename, wav_filename) file_size = -1 if path.exists(wav_filename): file_size = path.getsize(wav_filename) frames = int(subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT)) label = label_filter(sample[1]) with lock: if file_size == -1: # Excluding samples that failed upon conversion counter['failed'] += 1 elif label is None: # Excluding samples that failed on label validation counter['invalid_label'] += 1 elif int(frames/SAMPLE_RATE*1000/10/2) < len(str(label)): # Excluding samples that are too short to fit the transcript counter['too_short'] += 1 elif frames/SAMPLE_RATE > MAX_SECS: # Excluding very long samples to keep a reasonable batch-size counter['too_long'] += 1 else: # This one is good - keep it for the target CSV rows.append((wav_filename, file_size, label)) counter['all'] += 1 print("Importing ogg files...") pool = Pool(cpu_count()) bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR) for i, _ in enumerate(pool.imap_unordered(one_sample, samples), start=1): bar.update(i) bar.update(num_samples) pool.close() pool.join() with open(target_csv_template.format('train'), 'w') as train_csv_file: # 80% with open(target_csv_template.format('dev'), 'w') as dev_csv_file: # 10% with open(target_csv_template.format('test'), 'w') as test_csv_file: # 10% train_writer = csv.DictWriter(train_csv_file, fieldnames=FIELDNAMES) train_writer.writeheader() dev_writer = csv.DictWriter(dev_csv_file, fieldnames=FIELDNAMES) dev_writer.writeheader() test_writer = csv.DictWriter(test_csv_file, fieldnames=FIELDNAMES) test_writer.writeheader() for i, item in enumerate(rows): transcript = validate_label(item[2]) if not transcript: continue wav_filename = os.path.join(ogg_root_dir, item[0].replace('.ogg', '.wav')) i_mod = i % 10 if i_mod == 0: writer = test_writer elif i_mod == 1: writer = dev_writer else: writer = train_writer writer.writerow(dict( wav_filename=wav_filename, wav_filesize=os.path.getsize(wav_filename), transcript=transcript, )) print('Imported %d samples.' % (counter['all'] - counter['failed'] - counter['too_short'] - counter['too_long'])) if counter['failed'] > 0: print('Skipped %d samples that failed upon conversion.' % counter['failed']) if counter['invalid_label'] > 0: print('Skipped %d samples that failed on transcript validation.' 
% counter['invalid_label']) if counter['too_short'] > 0: print('Skipped %d samples that were too short to match the transcript.' % counter['too_short']) if counter['too_long'] > 0: print('Skipped %d samples that were longer than %d seconds.' % (counter['too_long'], MAX_SECS))