class _Worker(object):
    def __init__(self, protocol=None):
        self.protocol = protocol
        self.pool = ProcessPoolExecutor(max_workers=1)
        self.pool.submit(id, 42).result()  # start the worker process

    def run(self, func, *args, **kwargs):
        """Synchronous remote function call"""
        input_payload = dumps((func, args, kwargs), protocol=self.protocol)
        result_payload = self.pool.submit(
            call_func, input_payload, self.protocol).result()
        result = loads(result_payload)
        if isinstance(result, BaseException):
            raise result
        return result

    def memsize(self):
        workers_pids = [p.pid if hasattr(p, "pid") else p
                        for p in list(self.pool._processes)]
        num_workers = len(workers_pids)
        if num_workers == 0:
            return 0
        elif num_workers > 1:
            raise RuntimeError("Unexpected number of workers: %d" % num_workers)
        return psutil.Process(workers_pids[0]).memory_info().rss

    def close(self):
        self.pool.shutdown(wait=True)
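A minimal usage sketch for the _Worker class above (an assumption, not part of the original source): it presumes `dumps`/`loads` are pickle-style serializers and that `call_func` and `psutil` are importable in the same module, as the method bodies imply.

import math

worker = _Worker(protocol=2)
try:
    # The callable and its arguments are pickled, executed in the single
    # worker process, and the (possibly exceptional) result is shipped back.
    print(worker.run(math.factorial, 10))   # 3628800
    print(worker.memsize())                 # resident set size of the worker, in bytes
finally:
    worker.close()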
def parallel(self, parallel):
    # shutdown any previous executor if we are managing it
    if getattr(self, '_managing_executor', False):
        self._executor.shutdown()

    self._parallel = parallel
    self._managing_executor = False

    if parallel is False:
        self._executor = None
        return

    if parallel is True:
        from concurrent.futures import ProcessPoolExecutor
        self._executor = ProcessPoolExecutor()
        self._managing_executor = True
        return

    if isinstance(parallel, numbers.Number):
        from concurrent.futures import ProcessPoolExecutor
        self._executor = ProcessPoolExecutor(parallel)
        self._managing_executor = True
        return

    # assume a pool-executor has been supplied
    self._executor = parallel
def build_from_path(hparams, input_dirs, mel_dir, linear_dir, wav_dir, n_jobs=12, tqdm=lambda x: x):
    """
    Preprocesses the speech dataset from a given input path to given output directories

    Args:
        - hparams: hyper parameters
        - input_dirs: input directories that contain the files to preprocess
        - mel_dir: output directory of the preprocessed speech mel-spectrogram dataset
        - linear_dir: output directory of the preprocessed speech linear-spectrogram dataset
        - wav_dir: output directory of the preprocessed speech audio dataset
        - n_jobs: Optional, number of worker processes to parallelize across
        - tqdm: Optional, provides a nice progress bar

    Returns:
        - A list of tuples describing the train examples. This should be written to train.txt
    """
    # We use ProcessPoolExecutor to parallelize across processes; this is just for
    # optimization purposes and it can be omitted.
    executor = ProcessPoolExecutor(max_workers=n_jobs)
    futures = []
    index = 1
    for input_dir in input_dirs:
        with open(os.path.join(input_dir, 'metadata.csv'), encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split('|')
                wav_path = os.path.join(input_dir, 'wavs', '{}.wav'.format(parts[0]))
                text = parts[2]
                futures.append(executor.submit(partial(
                    _process_utterance, mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams)))
                index += 1

    return [future.result() for future in tqdm(futures) if future.result() is not None]
def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
    '''Preprocesses the LJ Speech dataset from a given input path into a given output directory.

    Args:
        in_dir: The directory where you have downloaded the LJ Speech dataset
        out_dir: The directory to write the output into
        num_workers: Optional number of worker processes to parallelize across
        tqdm: You can optionally pass tqdm to get a nice progress bar

    Returns:
        A list of tuples describing the training examples. This should be written to train.txt
    '''
    # We use ProcessPoolExecutor to parallelize across processes. This is just an optimization and you
    # can omit it and just call _process_utterance on each input if you want.
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []
    index = 1
    with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('|')
            wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
            text = parts[2]
            futures.append(executor.submit(partial(_process_utterance, out_dir, index, wav_path, text)))
            index += 1
    return [future.result() for future in tqdm(futures)]
def __call__(self, workflow, input_artifact_filepaths, parameter_references, output_artifact_filepaths):
    input_artifact_abs_filepaths = \
        {k: os.path.abspath(v) for k, v in input_artifact_filepaths.items()}
    output_artifact_abs_filepaths = \
        {k: os.path.abspath(v) for k, v in output_artifact_filepaths.items()}

    job = workflow.to_script(input_artifact_abs_filepaths,
                             parameter_references,
                             output_artifact_abs_filepaths)

    temp_dir = tempfile.mkdtemp()
    pool = ProcessPoolExecutor(max_workers=1)
    py_filename = os.path.join(temp_dir, 'job.py')
    with open(py_filename, 'w') as py_file:
        py_file.write(job.code)

    # TODO: handle subprocess exceptions
    future = pool.submit(subprocess.run,
                         [self._python_executable, py_filename],
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    # TODO: handle callback exceptions
    # TODO: make sure that tempdir is cleaned up even if there is an
    # exception in pool.submit or the callback
    future.add_done_callback(lambda _: shutil.rmtree(temp_dir))
    return future
def main():
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--filter", action="store_true",
                       help="act as a filter")
    group.add_argument("--transform", metavar="MAPPING",
                       type=argparse.FileType("r"),
                       help="transform all files given in the mapping file")
    parser.add_argument("--srcprefix", metavar="PREFIX", default="",
                        help="when transforming data files prepend this PREFIX to source paths")
    parser.add_argument("--dstprefix", metavar="PREFIX", default="",
                        help="when transforming data files prepend this PREFIX to destination paths")
    args = parser.parse_args()

    if args.filter:
        check_stream(sys.stdin, sys.stdout)
    else:
        exe = Executor()
        res = []
        for lineno, line in enumerate(args.transform):
            line = line.split('#', 1)[0]  # comment
            line = line.rstrip()          # trailing space or newline
            match = re.match(r'^(\S+):\s*(\S+)$', line)
            if not match:
                raise ValueError("syntax error on line %d" % (lineno + 1))
            destination, source = match.groups()
            source = os.path.join(args.srcprefix, source)
            destination = os.path.join(args.dstprefix, destination)
            res.append(exe.submit(transform, source, destination))
        while res:
            res.pop(0).result()  # propagate exceptions
def __init__(self, apiurl, apiversion, charmworldurl=None, io_loop=None):
    """Initialize the deployer.

    The apiurl argument is the URL of the juju-core WebSocket server.
    The apiversion argument is the Juju API version (e.g. "go").
    """
    self._apiurl = apiurl
    self._apiversion = apiversion
    if charmworldurl is not None and not charmworldurl.endswith('/'):
        charmworldurl = charmworldurl + '/'
    self._charmworldurl = charmworldurl
    if io_loop is None:
        io_loop = IOLoop.current()
    self._io_loop = io_loop
    # Deployment validation and importing executors.
    self._validate_executor = ProcessPoolExecutor(1)
    self._run_executor = ProcessPoolExecutor(1)
    # An observer instance is used to watch the deployments progress.
    self._observer = utils.Observer()
    # Queue stores the deployment identifiers corresponding to the
    # currently started/queued jobs.
    self._queue = []
    # The futures attribute maps deployment identifiers to Futures.
    self._futures = {}
class ThreadPool(object):
    '''Thread pool implementation'''

    def __init__(self, thread_num=1, process_num=1, q_size=2000, daemon=True):
        self.thread_pool = _ThreadPoolExecutor(thread_num, daemon)
        self.process_pool = ProcessPoolExecutor(process_num)
        self.result_queue = Queue(q_size)

    def wait(self, threads=[]):
        thread_wait(threads)

    def add_thread(self, target, args=()):
        result = self.thread_pool.submit(target, *args)
        return result

    def add_process(self, target, args=()):
        result = self.process_pool.submit(target, *args)
        return result

    def thread_map(self, target, args=[]):
        return [self.thread_pool.submit(target, arg) for arg in args]

    def process_map(self, target, args=[]):
        return self.process_pool.map(target, args)

    def map(self, target, args=[]):
        return self.process_map(target, args)
def main(): """ Makes banner requests with a ThreadPoolExecutor. """ arg_parser = ArgumentParser() arg_parser.add_argument("--ip", help="IP address", required=True) arg_parser.add_argument("--pool", help="Executor pool type", choices=("thread", "process"), required=True) arg_parser.add_argument( "--workers", help="Number of executor workers", type=int, choices=range(1, 9), required=True ) args = arg_parser.parse_args() ip = args.ip pool = args.pool workers = args.workers if pool == "process": executor = ProcessPoolExecutor(max_workers=workers) elif pool == "thread": executor = ThreadPoolExecutor(max_workers=workers) for i in range(1, 256): for port in get_ports(): executor.submit(banner_request, "{0}.{1}".format(ip, i), port) print("[!] Finished spawning banner requests")
def on_message(self, message):
    print len(message)
    result = yield tornado.gen.Task(self.process_message, message)
    return
    # Note: everything below the bare return is unreachable; the original
    # author apparently disabled the process-pool path this way.
    pool = ProcessPoolExecutor()
    fut = pool.submit(call_process, message)
    ret = yield fut
    pool.shutdown()
def main(chunk):
    nums = range(1, 1000)
    pool = ProcessPoolExecutor()
    count = 0
    returned_iterator = pool.map(is_prime, nums, timeout=None, chunksize=chunk)
    for result in returned_iterator:
        if result:
            count += 1
    return count
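main() above relies on an is_prime predicate defined at module level so that the process pool can pickle it for the worker processes; a minimal sketch of that assumed helper and a call site (names and values here are assumptions):

def is_prime(n):
    # Simple trial division; any picklable, module-level predicate would do.
    if n < 2:
        return False
    for d in range(2, int(n ** 0.5) + 1):
        if n % d == 0:
            return False
    return True

if __name__ == '__main__':
    print(main(chunk=50))  # number of primes below 1000 (168)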
def run_simulation(datasets, workers_num):
    workers = [TroiaWebDemoUser(get_troia_client(), "TES_TROJ_JID_" + str(i))
               for i in xrange(workers_num)]
    for worker in workers:
        worker.set_datasets(datasets)
    executor = ProcessPoolExecutor(workers_num)
    # maap = map
    maap = lambda *args, **kwargs: list(executor.map(*args, **kwargs))
    maap(exec_fun, workers, repeat(ITERATIONS, workers_num))
def splice_gmaps(threadpool, tilefolder, tempfiles, name): processpool = ProcessPoolExecutor() caption = "Rendering Zoom Layers {}".format(name) loadingbar = Bar(caption=caption) loadingbar.set_progress(0, caption) pygame.display.update() side = 1600 zoom_levels = 4 factor = 2 ** (zoom_levels - 1) masterside = side * factor plates = generate_plate_coords(factor, tempfiles) master_surface = pygame.Surface((masterside, masterside)) done = 0 total = len(tempfiles) + len(plates) * sum((4 ** x for x in range(zoom_levels))) fraction = 100 / total def render_base_to_master(task): imgdata, size, location = task.result() tempsurf = pygame.image.frombuffer(imgdata, size, "RGB") master_surface.blit(tempsurf, location) tasks = [] for masterpos, pieces in plates.items(): master_surface.fill((132, 170, 248)) for x, y in pieces: task = processpool.submit(unpack, tempfiles, x, y, ((x % factor) * side, (y % factor) * side)) tasks.append(threadpool.submit(render_base_to_master, task)) tasks.append(task) current_area = masterside for task in tasks: task.result() done += 0.5 loadingbar.set_progress(done * fraction, caption + " %4d of %4d" % (done, total)) for z in range(zoom_levels): tasks = [] pieces = masterside // current_area x_off = masterpos[0] * pieces y_off = masterpos[1] * pieces for xp in range(pieces): for yp in range(pieces): temp = pygame.Surface.subsurface(master_surface, (xp * current_area, yp * current_area, current_area, current_area)) filename = "screen_{}_{}_{}.png".format(z + 1, x_off + xp, y_off + yp) data = pygame.image.tostring(temp, "RGB") tasks.append(processpool.submit(render_plate, data, tilefolder, temp.get_size(), side, filename)) for task in tasks: task.result() done += 1 loadingbar.set_progress(done * fraction, caption + " %4d of %4d" % (done, total)) current_area //= 2 processpool.shutdown()
def _run(self, instance_id: str, service_id: str, plan_id: str,
         accepts_incomplete: bool, func: Callable, *func_args) -> Any:
    # The _match_synchronicity call must come first because it may raise an exception
    sync = self._match_synchronicity(service_id, plan_id, accepts_incomplete)
    executor = ProcessPoolExecutor(max_workers=1)
    future = executor.submit(func, *func_args)
    if sync:
        return future.result(timeout=59)
    else:
        self.async_ops[instance_id] = future
        raise ProvisioningAsynchronously
def post(self):
    file = self.request.files['file'][0]
    hark.client.login()
    hark.client.createSession(default_hark_config)

    log.info("Uploading asynchronously")
    pool = ProcessPoolExecutor(max_workers=2)
    future = pool.submit(async_upload, file)
    yield future
    pool.shutdown()

    log.info("Rendering visualization page")
    self.render('visualize.html')
def compute_pi(nr_tries=10000, pool_size=None, constructor=None):
    if not constructor:
        executor = ProcessPoolExecutor(max_workers=pool_size)
    else:
        executor = constructor(max_workers=pool_size)
    # Resolve the default pool size before it is used to split the work;
    # otherwise nr_tries//pool_size fails when pool_size is None.
    if not pool_size:
        pool_size = multiprocessing.cpu_count()
    args = [(nr_tries // pool_size, ) for _ in range(pool_size)]
    results = executor.map(partial_pi, args)
    return sum(results) / pool_size
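compute_pi assumes a module-level partial_pi helper that Monte-Carlo-estimates pi from a share of the samples; a sketch of that assumption (the name and the one-tuple argument convention follow the map call above, the body is hypothetical):

import random

def partial_pi(args):
    # Receives the (nr_tries,) tuple that compute_pi passes via executor.map.
    nr_tries, = args
    hits = 0
    for _ in range(nr_tries):
        x, y = random.random(), random.random()
        if x * x + y * y <= 1.0:
            hits += 1
    return 4.0 * hits / nr_tries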
def main():
    numbers = [
        (1963309, 2265973), (2030677, 3814172),
        (1551645, 2229620), (2039045, 2020802)
    ]
    start = time()
    pool = ProcessPoolExecutor(max_workers=2)
    results = list(pool.map(gcd, numbers))
    end = time()
    print('Took %.3f seconds' % (end - start))
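The map call above hands each pair to the worker as a single tuple, so the gcd helper it assumes must unpack the pair itself; a hypothetical sketch of such a helper (the brute-force body is an assumption, only the calling convention comes from the snippet):

def gcd(pair):
    # Unpack the (a, b) tuple passed by pool.map and brute-force the
    # greatest common divisor, which keeps the worker CPU-bound.
    a, b = pair
    low = min(a, b)
    for i in range(low, 0, -1):
        if a % i == 0 and b % i == 0:
            return i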
def generate_stocks(freq=pd.Timedelta(seconds=60), directory=None):
    from concurrent.futures import ProcessPoolExecutor, wait
    e = ProcessPoolExecutor()
    if os.path.exists(os.path.join('data', 'daily')):
        glob_path = os.path.join('data', 'daily', '*')
    else:
        glob_path = os.path.join(daily_dir, '*')
    filenames = sorted(glob(glob_path))

    futures = [e.submit(generate_stock, fn, directory=directory, freq=freq)
               for fn in filenames]
    wait(futures)
def build_from_path(in_dir, out_dir, num_workers=1):
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []
    index = 1
    with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('|')
            wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
            text = parts[2]
            futures.append(executor.submit(
                partial(_process_utterance, out_dir, index, wav_path, text)))
            index += 1
    return [future.result() for future in futures]
class ConcurrentDownloader(BaseDownloader, ConcurrentMixin):
    """Concurrent ProcessPoolExecutor downloader

    :param pool_size: size of the ProcessPoolExecutor
    :param timeout: request timeout in seconds
    """
    def __init__(
            self, worker_class,
            worker_kwargs=None, pool_size=5, middlewares=None,):

        # configure executor
        self.pool_size = pool_size
        self.executor = ProcessPoolExecutor(max_workers=self.pool_size)

        # prepare worker params
        self.worker_params = {
            'worker_class': worker_class,
            'worker_kwargs': worker_kwargs or {},
        }

        # ctrl-c support for python2.x
        # trap sigint
        signal.signal(signal.SIGINT, lambda s, f: s)

        super(ConcurrentDownloader, self).__init__(
            middlewares=middlewares
        )

    def get(self, requests):
        for request in requests:
            # delegate request processing to the executor
            future = self.executor.submit(
                _run_download_worker, self.worker_params, request,
            )

            # build Planned object
            done_future = Planned()

            # when the executor finishes the request, fire done_future
            future.add_done_callback(
                partial(self._done, request, done_future)
            )

            yield done_future

    def get_workers_count(self):
        return self.pool_size

    def stop(self):
        self.executor.shutdown()
def precompute_to_stream(self, stream, logger): """ File format: int64: nnz in total padding to 128 bytes double[ni]: x_squared double[(lmax + 1) * ni]: Lambda_0 double[(lmax + 1) * ni]: Lambda_1 ushort[(lmax + 1)**2]: i_stops Format of i_stops is m-major ordering, but with, additionally, even coefficents all coming before the odd ones. """ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor executor = ProcessPoolExecutor(max_workers=8) start_pos = stream.tell() for i in range(2 * (self.lmax + 1)): write_int64(stream, 0) write_array(stream, self.x_squared) futures = [] for m in range(self.lmax + 1): for odd in [0, 1]: futures.append(executor.submit(precompute_single, self.thetas, self.lmax, self.epsilon_legendre, m, odd)) nnz_total = 0 Lambda_1_list = [] i_stops_list = [] nnz_list = [] it = iter(futures) for m in range(self.lmax + 1): for odd in [0, 1]: Lambda_0, Lambda_1, i_stops, nnz = it.next().result() logger.info('Got %s m=%d' % (['even', 'odd'][odd], m)) write_array(stream, Lambda_0) Lambda_1_list.append(Lambda_1) i_stops_list.append(i_stops) nnz_list.append(nnz) nnz_total += nnz for arr in Lambda_1_list: write_array(stream, arr) for arr in i_stops_list: write_array(stream, arr) end_pos = stream.tell() stream.seek(start_pos) for nnz in nnz_list: write_int64(stream, nnz) stream.seek(end_pos) return nnz_total
def __init__(self, project_dir=None, max_training_processes=1, response_log=None, emulation_mode=None, remote_storage=None, component_builder=None, model_server=None, wait_time_between_pulls=None): self._training_processes = max(max_training_processes, 1) self._current_training_processes = 0 self.responses = self._create_query_logger(response_log) self.project_dir = config.make_path_absolute(project_dir) self.emulator = self._create_emulator(emulation_mode) self.remote_storage = remote_storage self.model_server = model_server self.wait_time_between_pulls = wait_time_between_pulls if component_builder: self.component_builder = component_builder else: self.component_builder = ComponentBuilder(use_cache=True) self.project_store = self._create_project_store(project_dir) # tensorflow sessions are not fork-safe, # and training processes have to be spawned instead of forked. See # https://github.com/tensorflow/tensorflow/issues/5448#issuecomment # -258934405 multiprocessing.set_start_method('spawn', force=True) self.pool = ProcessPool(self._training_processes)
def spark_submit(exec_string, log_file, driver_path):
    """
    asynchronously run the pyspark/sparktk submitted script while writing the logs
    to the log_file for the app
    :param exec_string: the command that is going to be run
    :param log_file: the file containing command(script) logs while running
    :param driver_path: the path to the main sparktk/pyspark script within the uploads folder
    :return: None
    """
    print "Entering spark_submit"
    mark_submitted(driver_path)
    pool = Pool(max_workers=1)
    cmd_string = "%s >>%s 2>&1" % (exec_string, log_file)
    print "CMD string is %s" % (cmd_string)
    future = pool.submit(subprocess.call, cmd_string, shell=True)
    future.driver_path = driver_path
    future.add_done_callback(mark_completed)
def __init__(self, scoring_model, extractor, cpu_workers=None,
             io_workers=None, batch_size=50):
    self.scoring_model = scoring_model
    self.extractor = extractor
    self.cpu_workers = \
        int(cpu_workers) if cpu_workers is not None else cpu_count()
    self.batch_size = int(batch_size)

    if io_workers is not None:
        self.io_workers = int(io_workers)
    else:
        self.io_workers = max(self.MIN_IO_WORKERS,
                              min(self.MAX_IO_WORKERS,
                                  int(self.cpu_workers *
                                      self.IO_WORKER_MULTIPLIER)))

    logger.info("Starting up IO thread pool with {0} workers"
                .format(self.io_workers))
    self.scores_ex = ThreadPoolExecutor(max_workers=self.io_workers)

    logger.info("Starting up CPU process pool with {0} workers"
                .format(self.cpu_workers))
    self.process_ex = ProcessPoolExecutor(max_workers=self.cpu_workers)

    roots = dependencies.dig(self.scoring_model.features)
    self.root_datasources = [d for d in roots if isinstance(d, Datasource)]
def initialize(self, io_loop=None, keep_alive_milliseconds=37000, # how often to check for unused sessions check_unused_sessions_milliseconds=17000, # how long unused sessions last unused_session_lifetime_milliseconds=15000, # how often to log stats stats_log_frequency_milliseconds=15000, **kw): if io_loop is None: io_loop = IOLoop.current() self._loop = io_loop for app_context in self._applications.values(): app_context._loop = self._loop self._clients = set() self._executor = ProcessPoolExecutor(max_workers=4) self._loop.add_callback(self._start_async) self._stats_job = PeriodicCallback(self.log_stats, stats_log_frequency_milliseconds, io_loop=self._loop) self._unused_session_linger_milliseconds = unused_session_lifetime_milliseconds self._cleanup_job = PeriodicCallback(self.cleanup_sessions, check_unused_sessions_milliseconds, io_loop=self._loop) if keep_alive_milliseconds > 0: self._ping_job = PeriodicCallback(self.keep_alive, keep_alive_milliseconds, io_loop=self._loop) else: self._ping_job = None
def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []
    index = 1
    for book in books:
        with open(os.path.join(in_dir, book, 'sentence_index.txt')) as f:
            for line in f:
                parts = line.strip().split('\t')
                # Compare with != rather than "is not": identity comparison
                # against a string literal is unreliable.
                if line[0] != '#' and len(parts) == 8 and float(parts[3]) > _min_confidence:
                    wav_path = os.path.join(in_dir, book, 'wav', '%s.wav' % parts[0])
                    labels_path = os.path.join(in_dir, book, 'lab', '%s.lab' % parts[0])
                    text = parts[5]
                    task = partial(_process_utterance, out_dir, index, wav_path, labels_path, text)
                    futures.append(executor.submit(task))
                    index += 1
    results = [future.result() for future in tqdm(futures)]
    return [r for r in results if r is not None]
def make_arch_db():
    executor = ProcessPoolExecutor(max_workers=8)
    by = 10000
    m = 60000000
    #by = 2000
    #m = 10000
    e = executor.map(process_range, zip(range(0, m, by), range(by, m + by, by)))
    executor.shutdown()
    print('done calculating architectures')
    pfam_sets = merge(e)
    print(len(pfam_sets))
    gsave(pfam_sets, 'pfam_sets.pkl.gz')

    # mongodb
    db = MongoClient('wl-cmadmin', 27017).ArchDB_Pfam_071414.ArchDB_Pfam_071414
    db.insert(map(lambda item: {'_id': min(item[1]), 'pID': list(item[1]), 'Pfam': item[0]},
                  pfam_sets.items()))
    db.ensure_index('pID')
    db.ensure_index('Pfam')
def __init__(self, applications, io_loop=None, extra_patterns=None, # heroku, nginx default to 60s timeout, so well less than that keep_alive_milliseconds=37000): if io_loop is None: io_loop = IOLoop.current() self._loop = io_loop if keep_alive_milliseconds < 0: # 0 means "disable" raise ValueError("keep_alive_milliseconds must be >= 0") self._resources = {} # Wrap applications in ApplicationContext self._applications = dict() for k,v in applications.items(): self._applications[k] = ApplicationContext(v, self._loop) extra_patterns = extra_patterns or [] relative_patterns = [] for key in applications: app_patterns = [] for p in per_app_patterns: if key == "/": route = p[0] else: route = key + p[0] app_patterns.append((route, p[1], { "application_context" : self._applications[key] })) websocket_path = None for r in app_patterns: if r[0].endswith("/ws"): websocket_path = r[0] if not websocket_path: raise RuntimeError("Couldn't find websocket path") for r in app_patterns: r[2]["bokeh_websocket_path"] = websocket_path relative_patterns.extend(app_patterns) all_patterns = extra_patterns + relative_patterns + toplevel_patterns log.debug("Patterns are: %r", all_patterns) super(BokehTornado, self).__init__(all_patterns, **settings) self._clients = set() self._executor = ProcessPoolExecutor(max_workers=4) self._loop.add_callback(self._start_async) self._stats_job = PeriodicCallback(self.log_stats, 15.0 * 1000, io_loop=self._loop) self._unused_session_linger_seconds = 60*30 self._cleanup_job = PeriodicCallback(self.cleanup_sessions, 17.0 * 1000, io_loop=self._loop) if keep_alive_milliseconds > 0: self._ping_job = PeriodicCallback(self.keep_alive, keep_alive_milliseconds, io_loop=self._loop) else: self._ping_job = None
def __init__(self, config, component_builder):
    self._training_processes = (config['max_training_processes']
                                if config['max_training_processes'] > 0 else 1)
    self.config = config
    self.responses = self._create_query_logger(config)
    self.model_dir = config['path']
    self.emulator = self._create_emulator()
    self.component_builder = component_builder if component_builder else ComponentBuilder(use_cache=True)
    self.project_store = self._create_project_store()
    self.pool = ProcessPool(self._training_processes)
def __init__(self, pop_size, problem):
    self.problem = problem
    self.pop = [Network.random_network() for i in range(pop_size)]
    self.fitness_cache = {}
    self.best = None
    self.nt = NetTester(problem)
    self.pp = ProcessPoolExecutor(max_workers=4)
    self.ntf = NetworkTesterFactory(problem)
    self.pop_size = pop_size
# -*- coding: utf-8 -*- import requests import hashlib import base64 import re import json import time from functools import lru_cache from concurrent.futures import ProcessPoolExecutor from requests import Session from requests_futures.sessions import FuturesSession session = FuturesSession(executor=ProcessPoolExecutor(max_workers=10), session=Session()) def netease_hymn(): return """ player's Game Over, u can abandon. u get pissed, get pissed, Hallelujah my King! errr oh! f**k ohhh!!!! """ def encrypted_id(dfsId): x = [ord(i[0]) for i in netease_hymn().split()] y = ''.join([chr(i - 61) if i > 96 else chr(i + 32) for i in x]) byte1 = bytearray(y, encoding='ascii') byte2 = bytearray(str(dfsId), encoding='ascii')
def parse_cmds_in_parallel(self, cmds, unwrap, total_cmds=None): if os.environ.get("CLADE_DEBUG"): if total_cmds: self.log("Parsing {} commands".format(total_cmds)) for cmd in cmds: unwrap(self, cmd) return if self.conf.get("cpu_count"): max_workers = self.conf.get("cpu_count") else: max_workers = os.cpu_count() # cmds is eather list, tuple or generator if type(cmds) is list or type(cmds) is tuple: total_cmds = len(cmds) # Print progress only of we know total number of commands if total_cmds: self.log("Parsing {} commands".format(total_cmds)) with ProcessPoolExecutor(max_workers=max_workers) as p: chunk_size = 2000 futures = [] finished_cmds = 0 # Submit commands to executor in chunks for cmd_chunk in self.__get_cmd_chunk(cmds, chunk_size=chunk_size): chunk_futures = [] for cmd in cmd_chunk: f = p.submit(unwrap, self, cmd) chunk_futures.append(f) futures.append(f) while True: if not futures: break done_futures = [x for x in futures if x.done()] # Remove all futures that are already completed # to reduce memory usage futures = [x for x in futures if not x.done()] # Track progress (only if stdout is not redirected) if total_cmds and sys.stdout.isatty( ) and self.conf["log_level"] in ["INFO", "DEBUG"]: finished_cmds += len(done_futures) msg = "\t [{:.0f}%] {} of {} commands are parsed".format( finished_cmds / total_cmds * 100, finished_cmds, total_cmds, ) print(msg, end="\r") # Check return value of all finished futures for f in done_futures: try: f.result() except Exception as e: raise RuntimeError( "Something happened in the child process: {}". format(e)) # Submit next chunk if the current one is almost processed finished_chunk_cmds = len( [x for x in chunk_futures if x.done()]) if finished_chunk_cmds > (chunk_size - chunk_size // 10): break # Save a little bit of CPU time # skip sleep only for very small projects time.sleep(0.1) if total_cmds and sys.stdout.isatty( ) and self.conf["log_level"] in ["INFO", "DEBUG"]: print(" " * 79, end="\r")
def Main():
    pool = ProcessPoolExecutor(numThreads)
    result = pool.map(ffmpegConvert, fileList)
error2: error type2 file empty error3: error type3 do not have needed data """ import rsa, json, requests, os, redis, zipfile, shutil, time, re import numpy as np # from retry import retry import pandas as pd # import Crypto.PublicKey.RSA import base64, pymysql from datetime import datetime from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor from .mongo_con import MongoDb THREAD_POOL = ThreadPoolExecutor(6) PROCESS_POOL = ProcessPoolExecutor(2) redis_pool_6 = redis.ConnectionPool(host='127.0.0.1', port=6379, db=6, password='******', decode_responses=True) redis_pool_7 = redis.ConnectionPool(host='127.0.0.1', port=6379, db=7, password='******', decode_responses=True) red = redis.StrictRedis(connection_pool=redis_pool_7) red_station_status = redis.StrictRedis(connection_pool=redis_pool_6) exchange_rate = { 'CA': 0.7519,
def main(): # command-line options define("debug", default=False, help="run in debug mode", type=bool) define("no_cache", default=False, help="Do not cache results", type=bool) define( "localfiles", default="", help= "Allow to serve local files under /localfile/* this can be a security risk", type=str) define("port", default=5000, help="run on the given port", type=int) define("cache_expiry_min", default=10 * 60, help="minimum cache expiry (seconds)", type=int) define("cache_expiry_max", default=2 * 60 * 60, help="maximum cache expiry (seconds)", type=int) define("mc_threads", default=1, help="number of threads to use for Async Memcache", type=int) define("threads", default=1, help="number of threads to use for rendering", type=int) define("processes", default=0, help="use processes instead of threads for rendering", type=int) define("frontpage", default=FRONTPAGE_JSON, help="path to json file containing frontpage content", type=str) tornado.options.parse_command_line() # NBConvert config config = Config() config.HTMLExporter.template_file = 'basic' config.NbconvertApp.fileext = 'html' config.CSSHTMLHeaderTransformer.enabled = False # don't strip the files prefix - we use it for redirects # config.Exporter.filters = {'strip_files_prefix': lambda s: s} # DEBUG env implies both autoreload and log-level if os.environ.get("DEBUG"): options.debug = True logging.getLogger().setLevel(logging.DEBUG) # setup memcache mc_pool = ThreadPoolExecutor(options.mc_threads) if options.processes: # can't pickle exporter instances, exporter = HTMLExporter pool = ProcessPoolExecutor(options.processes) else: exporter = HTMLExporter(config=config, log=log.app_log) pool = ThreadPoolExecutor(options.threads) memcache_urls = os.environ.get('MEMCACHIER_SERVERS', os.environ.get('MEMCACHE_SERVERS')) if options.no_cache: log.app_log.info("Not using cache") cache = MockCache() elif pylibmc and memcache_urls: kwargs = dict(pool=mc_pool) username = os.environ.get('MEMCACHIER_USERNAME', '') password = os.environ.get('MEMCACHIER_PASSWORD', '') if username and password: kwargs['binary'] = True kwargs['username'] = username kwargs['password'] = password log.app_log.info("Using SASL memcache") else: log.app_log.info("Using plain memecache") cache = AsyncMultipartMemcache(memcache_urls.split(','), **kwargs) else: log.app_log.info("Using in-memory cache") cache = DummyAsyncCache() # setup tornado handlers and settings template_path = pjoin(here, 'templates') static_path = pjoin(here, 'static') env = Environment(loader=FileSystemLoader(template_path)) env.filters['markdown'] = markdown.markdown try: git_data = git_info(here) except Exception as e: app_log.error("Failed to get git info: %s", e) git_data = {} else: git_data['msg'] = escape(git_data['msg']) if options.no_cache: # force jinja to recompile template every time env.globals.update(cache_size=0) env.globals.update( nrhead=nrhead, nrfoot=nrfoot, git_data=git_data, ipython_info=ipython_info(), len=len, ) AsyncHTTPClient.configure(HTTPClientClass) client = AsyncHTTPClient() github_client = AsyncGitHubClient(client) # load frontpage sections with io.open(options.frontpage, 'r') as f: frontpage_sections = json.load(f) # cache frontpage links for the maximum allowed time max_cache_uris = {''} for section in frontpage_sections: for link in section['links']: max_cache_uris.add('/' + link['target']) settings = dict( log_function=log_request, jinja2_env=env, static_path=static_path, client=client, github_client=github_client, exporter=exporter, config=config, cache=cache, 
cache_expiry_min=options.cache_expiry_min, cache_expiry_max=options.cache_expiry_max, max_cache_uris=max_cache_uris, frontpage_sections=frontpage_sections, pool=pool, gzip=True, render_timeout=20, localfile_path=os.path.abspath(options.localfiles), fetch_kwargs=dict(connect_timeout=10, ), ) # create and start the app if options.localfiles: log.app_log.warning( "Serving local notebooks in %s, this can be a security risk", options.localfiles) # use absolute or relative paths: handlers.insert(0, (r'/localfile/(.*)', LocalFileHandler)) app = web.Application(handlers, debug=options.debug, **settings) http_server = httpserver.HTTPServer(app, xheaders=True) log.app_log.info("Listening on port %i", options.port) http_server.listen(options.port) ioloop.IOLoop.instance().start()
from PIL import Image as PILImage from nio.crypto import AsyncDataT as File from nio.crypto import async_generator_from_data if sys.version_info >= (3, 7): from contextlib import asynccontextmanager else: from async_generator import asynccontextmanager AsyncOpenFile = Union[AsyncTextIOWrapper, AsyncBufferedReader] Size = Tuple[int, int] BytesOrPIL = Union[bytes, PILImage.Image] auto = autostr COMPRESSION_POOL = ProcessPoolExecutor() class AutoStrEnum(Enum): """An Enum where auto() assigns the member's name instead of an integer. Example: >>> class Fruits(AutoStrEnum): apple = auto() >>> Fruits.apple.value "apple" """ @staticmethod def _generate_next_value_(name, *_): return name
from concurrent.futures import ProcessPoolExecutor

import requests


def fetch_async(url):
    response = requests.get(url)
    return response


url_list = ['http://www.github.com', 'http://www.bing.com']
pool = ProcessPoolExecutor(5)

for url in url_list:
    pool.submit(fetch_async, url)
pool.shutdown(wait=True)
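A small variant of the snippet above (an assumption, not part of the original) that keeps the submitted futures so the responses can actually be read back, and guards the submission for spawn-based platforms:

from concurrent.futures import ProcessPoolExecutor

import requests


def fetch_async(url):
    return requests.get(url)


if __name__ == '__main__':
    url_list = ['http://www.github.com', 'http://www.bing.com']
    with ProcessPoolExecutor(5) as pool:
        futures = [pool.submit(fetch_async, url) for url in url_list]
    # Leaving the with-block waits for completion, so the results are ready.
    for future in futures:
        print(future.result().status_code)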
layer_dir=layer_dir, resolution=resolution, atlas_name=atlas_name.lower()) elif step == 'step1': print("step 1") vol = CloudVolume(f'file://{layer_dir}') if atlas_name == 'paxinos': done_files = set([ int(y) for y in os.listdir(progress_dir) ]) all_files = set(range(vol.bounds.minpt.y, vol.bounds.maxpt.y)) to_upload = [ int(y) for y in list(all_files.difference(done_files)) ] done_files = set([ int(z) for z in os.listdir(progress_dir)]) all_files = set(range(vol.bounds.minpt.z,vol.bounds.maxpt.z)) to_upload = [ int(z) for z in list(all_files.difference(done_files)) ] to_upload.sort() print("Have {len(to_upload)} slices remaining to upload",to_upload) if atlas_name == 'paxinos': with ProcessPoolExecutor(max_workers=4) as executor: for job in executor.map(process_paxinos_slice,to_upload): try: print(job) except Exception as exc: print(f'generated an exception: {exc}') else: with ProcessPoolExecutor(max_workers=4) as executor: for job in executor.map(process_slice,to_upload): try: print(job) except Exception as exc: print(f'generated an exception: {exc}')
def run(self, train_base_models=True): start_time = time.time() from concurrent.futures import ProcessPoolExecutor pool = ProcessPoolExecutor(max_workers=self.n_workers) X, y = [], [] c = [] inc = 1. X_l, y_l = [], [] weight = np.array([1 / self.K] * (self.K + 1)) config_evaluated = [] config_space = get_benchmark_configspace('covtype_svm') # Initialize config L. config_L = sample_configurations(config_space, self.num_L_init) if train_base_models: func_configs = list() for iter_t in range(self.K): print('Build mid fidelity model', iter_t) func_configs.append(True) func_configs.append(False) training_data = self.run_parallel_async(pool, self.mini_smac, func_configs) with open('data/xgb/base_%s_data.pkl' % self.method_name, 'wb') as f: pickle.dump(training_data, f) else: with open('data/xgb/base_tse_data_%d.pkl' % 10, 'rb') as f: training_data = pickle.load(f) print('Load training data for M evaluations!') # Create base models. base_models = list() config_space = get_benchmark_configspace('covtype_svm') types, bounds = get_types(config_space) for iter_t in range(self.K + 1): config_x, config_y = training_data[iter_t] model = RandomForestWithInstances(types=types, bounds=bounds) model.train(config_x, config_y) base_models.append(model) low_fidelity_model = base_models[self.K] X_l.extend(training_data[self.K][0].tolist()) y_l.extend(training_data[self.K][1].tolist()) print('Base model building finished!') # The framework of TSE. for iter_t in range(self.iter_H): print('Iteration in TSE', iter_t) # Sample a batch of configurations according to tse model. configs = sample_configurations(config_space, self.iter_L * 10) config_arrays = convert_configurations_to_array(configs) perfs, _ = low_fidelity_model.predict(config_arrays) perfs = perfs[:, 0] if len(y) > 3: preds = [] for i in range(self.K): m, _ = base_models[i].predict(config_arrays) preds.append(m[:, 0].tolist()) preds = np.array(preds).T preds = np.mat(np.hstack((preds, np.ones((len(configs), 1))))) # Add the delta. delta = preds * np.mat(weight.reshape(-1, 1)) perfs += delta.getA()[:, 0] configs_candidate = [] indexes = np.argsort(perfs)[:self.iter_L] for index in indexes: configs_candidate.append(configs[index]) # Evaluate the low-fidelity configurations. print('=' * 10 + 'Evaluating the low-fidelity configurations') config_params = [] for config in configs_candidate: config_params.append((config.get_dictionary(), self.s_min)) result_perf = self.run_parallel_async(pool, self.objective_function, config_params) for index, item in enumerate(result_perf): X_l.append(configs_candidate[index].get_array().tolist()) y_l.append(item[0]) print(np.array(X_l).shape, np.array(y_l, dtype=np.float64).shape) # Update f_L. print('=' * 10 + 'Retrain the f_L') low_fidelity_model.train(np.array(X_l), np.array(y_l, dtype=np.float64)) config_L.extend(configs_candidate) configs_input = [] for config in config_L: if config not in config_evaluated: configs_input.append(config) # Choose the next configuration. config_arrays = convert_configurations_to_array(configs_input) perfs, _ = low_fidelity_model.predict(config_arrays) perfs = perfs[:, 0] if len(y) > 3: preds = [] for i in range(self.K): m, _ = base_models[i].predict(config_arrays) preds.append(m[:, 0].tolist()) preds = np.array(preds).T preds = np.mat( np.hstack((preds, np.ones((len(configs_input), 1))))) # Add the delta. delta = preds * np.mat(weight.reshape(-1, 1)) perfs += delta.getA()[:, 0] next_config = configs_input[np.argmin(perfs)] # Evaluate this config with a high-fidelity setting. 
print('=' * 10 + 'Evaluate the high-fidelity configuration') perf, _ = self.objective_function( (next_config.get_dictionary(), self.s_max)) X.append(next_config) y.append(perf) if perf < inc: inc = perf c.append([time.time() - start_time, inc]) print('Current inc', inc) if len(y) < 3: continue # Learn the weight in TSE. Z = [] for i in range(self.K): m, v = base_models[i].predict( convert_configurations_to_array(X)) Z.append(m[:, 0].tolist()) Z = np.mat(np.hstack((np.array(Z).T, np.ones((len(y), 1))))) f = np.mat(np.array(y).reshape((-1, 1))) # Compute the weight. try: ZtZ_inv = np.linalg.inv(Z.T * Z) weight = (ZtZ_inv * Z.T * f)[:, 0] print('The weight updated is', weight) except np.linalg.LinAlgError as err: if 'Singular matrix' in str(err): print( 'Singular matrix encountered, and do not update the weight!' ) else: raise ValueError('Unexpected error!') # Save the result. np.save(self.file_path, np.transpose(np.array(c))) plt.plot(np.array(c)[:, 0], np.array(c)[:, 1]) plt.xlabel('time_elapsed (s)') plt.ylabel('validation error') plt.savefig("data/xgb/%s.png" % self.method_name) if time.time() - start_time > self.runtime_limit: print('Runtime budget meets!') break pool.shutdown(wait=True)
def main(): print("Main method called") files = glob.glob('root/zipfiles/*') for f in files: os.remove(f) shutil.rmtree('data/books') if not os.path.exists('data/books'): os.makedirs('data/books') tar = tarfile.open(name="root/archive.tar") tar.extractall() dir_name = "root/zipfiles" extension = ".zip" for item in os.listdir(dir_name): if item.endswith(extension): file_name = dir_name + "/" + item zip_ref = zipfile.ZipFile(file_name) # create zipfile object try: zip_ref.extractall("data/books") # extract file to dir print(file_name) except NotImplementedError: print("Could not unzip: " + file_name + " - continuing") zip_ref.close() dir_name = "data/books" extension = ".txt" cities_csv = pd.read_csv('data/cities/cities15000.csv', header=0, sep=';', usecols=['englishName', 'latitude', 'longitude']) books = list() authors = set() complete_author_list = list() cities = set() th_ex = ProcessPoolExecutor(max_workers=6) futures = [] count = 0 for root, directories, files in os.walk(dir_name): for file in files: if file.endswith(extension): count = count + 1 future = th_ex.submit(get_results, root, file, cities_csv, count) futures.append(future) pp.pprint(count) th_ex.shutdown(wait=True) print("Done with everything. Unwrapping results...") broken_count = 0 for th in futures: try: result = th.result() books.append(result[0]) for author in result[1]: authors.add(author) for city in result[2]: cities.add(city) except concurrent.futures.process.BrokenProcessPool: broken_count = broken_count + 1 print("BrokenProcessPool: #%d" % broken_count) for idx, val in enumerate(authors): complete_author_list.append((idx, val)) with open('data/csv/neo-cities.csv', 'w+', newline='', encoding='utf-8') as csv_file: writer = csv.writer(csv_file, delimiter='|', escapechar='\\') writer.writerow(['name', 'location:ID(Location-ID)']) for city in cities: writer.writerow(city) with open('data/csv/neo-books.csv', 'w+', newline='', encoding='utf-8') as csv_file: writer = csv.writer(csv_file, delimiter='|', escapechar='\\') writer.writerow(['bookId:ID(Book-ID)', 'title']) for value in books: title = value['title'] title = title.rstrip() title = " ".join(title.split()) writer.writerow([value['id'], title]) with open('data/csv/neo-authors.csv', 'w+', newline='', encoding='utf-8') as csv_file: writer = csv.writer(csv_file, delimiter='|', escapechar='\\') writer.writerow(['authorId:ID(Author-ID)', 'name']) for author in complete_author_list: auth = " ".join(author[1].split()) if auth or auth is not None: writer.writerow([author[0], auth]) with open('data/csv/neo-books-cities.csv', 'w+', newline='') as csv_file: writer = csv.writer(csv_file, delimiter='|') writer.writerow([':START_ID(Book-ID)', ':END_ID(Location-ID)']) for book in books: for city in book['cities']: writer.writerow([book['id'], city[1]]) with open('data/csv/neo-books-authors.csv', 'w+', newline='') as csv_file: writer = csv.writer(csv_file, delimiter='|') writer.writerow([':START_ID(Book-ID)', ':END_ID(Author-ID)']) for book in books: for author in book['authors']: auth = [ item for item in complete_author_list if item[1] == author ] writer.writerow([book['id'], auth[0][0]]) with open('data/csv/postgres-cities.csv', 'w+', newline='', encoding='utf-8') as csv_file: writer = csv.writer(csv_file, delimiter=';', escapechar='\\') writer.writerow(['name', 'location']) for city in cities: writer.writerow(city) with open('data/csv/postgres-books.csv', 'w+', newline='', encoding='utf-8') as csv_file: writer = csv.writer(csv_file, delimiter='|', escapechar='\\') 
writer.writerow(['book_id', 'title']) for value in books: title = value['title'] title = title.rstrip() title = " ".join(title.split()) writer.writerow([value['id'], title]) with open('data/csv/postgres-authors.csv', 'w+', newline='', encoding='utf-8') as csv_file: writer = csv.writer(csv_file, delimiter='|', escapechar='\\') writer.writerow(['author_id', 'name']) for author in complete_author_list: auth = " ".join(author[1].split()) if auth or auth is not None: writer.writerow([author[0], auth]) with open('data/csv/postgres-books-cities.csv', 'w+', newline='') as csv_file: writer = csv.writer(csv_file, delimiter='|') writer.writerow(['book_id', 'location']) for book in books: for city in book['cities']: writer.writerow([book['id'], city[1]]) with open('data/csv/postgres-books-authors.csv', 'w+', newline='') as csv_file: writer = csv.writer(csv_file, delimiter='|') writer.writerow(['book_id', 'author_id']) for book in books: for author in book['authors']: auth = [ item for item in complete_author_list if item[1] == author ] writer.writerow([book['id'], auth[0][0]])
def __init__(self, max_concurrent=5):
    self.pool = ProcessPoolExecutor(max_workers=max_concurrent)
    self.queue = []
executor=executor) async def benchmark_signature_validation_parallel(n, executor): tasks = [ verify_signature_async(hash, signature, public_key, executor=executor) for i in range(n) ] await asyncio.gather(*tasks) if __name__ == "__main__": loop = asyncio.get_event_loop() te = ThreadPoolExecutor(max_workers=1) pe = ProcessPoolExecutor(max_workers=1) n = 1000 # Blake2b serial start = time.time() sync_benchmark_blake2b(n) print(f"blake2b, n={n} sync done in {(time.time()-start)*1000} ms") start = time.time() fut: asyncio.Future = asyncio.ensure_future(benchmark_blake2b(n, te)) loop.run_until_complete(fut) print(f"blake2b, n={n} thread done in {(time.time()-start)*1000} ms") start = time.time() fut: asyncio.Future = asyncio.ensure_future(benchmark_blake2b(n, pe)) loop.run_until_complete(fut)
resource_url = "https://www.dy2018.com" + href.attrs['href'] getInfo(title, resource_url) def getInfo(title, resource_url): response2 = requests.get(resource_url, verify=False, headers=header) soup2 = BeautifulSoup( response2.text.encode(response2.encoding).decode('gbk'), "html.parser") for links in soup2.find_all("td", attrs={"style": "WORD-WRAP: break-word"}): print("{0}++{1}".format(title, links.text)) conn = pymysql.connect(host='127.0.0.1', port=3333, user='******', passwd='Xiaoxian0910', db='airasia', charset='utf8') cursor = conn.cursor() sql = "INSERT INTO dy2018.`bikan` (name,link) VALUES(%s,%s)" cursor.execute(sql, (title, links.text)) conn.commit() cursor.close() conn.close() # links = re.findall(r'magnet:\?xt=urn:btih:(?:[A-Z]|[0-9])*',response2) # for link in links: # print(link) if __name__ == "__main__": with ProcessPoolExecutor(max_workers=10) as pool: pool.map(Gen_url, urls)
# pool.submit(fetch_request, url)
# pool.shutdown(True)

# Process pool + callback function
from concurrent.futures import ProcessPoolExecutor
import requests


def fetch_async(url):
    response = requests.get(url)
    return response


def callback(future):
    print(future.result().text)


url_list = [
    'http://www.baidu.com', 'http://www.bing.com', 'http://www.cnblogs.com/'
]

pool = ProcessPoolExecutor(5)

if __name__ == '__main__':
    for url in url_list:
        v = pool.submit(fetch_async, url)
        v.add_done_callback(callback)
    pool.shutdown()
def closest_point_of_approach( traffic: Traffic, lateral_separation: float, vertical_separation: float, projection: Union[pyproj.Proj, crs.Projection, None] = None, round_t: str = "d", max_workers: int = 4, ) -> CPA: """ Computes a CPA dataframe for all pairs of trajectories candidates for being separated by less than lateral_separation in vertical_separation. In order to be computed efficiently, the method needs the following parameters: - projection: a first filtering is applied on the bounding boxes of trajectories, expressed in meters. You need to provide a decent projection able to approximate distances by Euclide formula. By default, EuroPP() projection is considered, but a non explicit argument will raise a warning. - round_t: an additional column will be added in the DataFrame to group trajectories by relevant time frames. Distance computations will be considered only between trajectories flown in the same time frame. By default, the 'd' pandas freq parameter is considered, to group trajectories by day, but other ways of splitting ('h') may be more relevant and impact performance. - max_workers: distance computations are spread over a given number of processors. """ if projection is None: logging.warn("Defaulting to projection EuroPP()") projection = crs.EuroPP() if isinstance(projection, crs.Projection): projection = pyproj.Proj(projection.proj4_init) def yield_pairs(t_chunk: Traffic): """ This function yields all pairs of possible candidates for a CPA calculation. """ # combinations types Iterator[Tuple[T, ...]] for first, second in cast(Iterator[Tuple[Flight, Flight]], combinations(t_chunk, 2)): # cast are necessary because of the lru_cache × property bug if (cast(pd.Timestamp, first.start) > cast( pd.Timestamp, second.stop)) or (cast( pd.Timestamp, second.start) > cast( pd.Timestamp, first.stop)): # Flights must fly at the same time continue if (first.min("altitude") > second.max("altitude") + vertical_separation): # Bounding boxes in altitude must cross continue if (second.min("altitude") > first.max("altitude") + vertical_separation): # Bounding boxes in altitude must cross continue if first.min("x") > second.max("x") + lateral_separation: # Bounding boxes in x must cross continue if second.min("x") > first.max("x") + lateral_separation: # Bounding boxes in x must cross continue if first.min("y") > second.max("y") + lateral_separation: # Bounding boxes in y must cross continue if second.min("y") > first.max("y") + lateral_separation: # Bounding boxes in y must cross continue # Next step is to check the 2D footprint of the trajectories # intersect. Before computing the intersection we bufferize the # trajectories by half the requested separation. first_shape = first.project_shape(projection) second_shape = second.project_shape(projection) if first_shape is None or second_shape is None: continue first_shape = first_shape.simplify(1e3).buffer(lateral_separation / 2) second_shape = first_shape.simplify(1e3).buffer( lateral_separation / 2) if first_shape.intersects(second_shape): yield first, second t_xyt = (traffic.airborne().compute_xy(projection).assign( round_t=lambda df: df.timestamp.dt.round(round_t))) cumul = list() # Multiprocessing is implemented on each timerange slot only. # TODO: it would probably be more efficient to multiprocess over each # t_chunk rather than multiprocess the distance computation. 
for _, t_chunk in tqdm(t_xyt.groupby("round_t"), total=len(set(t_xyt.data.round_t))): with ProcessPoolExecutor(max_workers=max_workers) as executor: tasks = { # TODO submit(Flight.distance, first, second) executor.submit(first.distance, second): ( first.flight_id, second.flight_id, ) for (first, second) in yield_pairs(Traffic(t_chunk)) } for future in as_completed(tasks): cumul.append(future.result()) return CPA(pd.concat(cumul, sort=False))
yearAry = ["15", "16"] monthAry = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] base_url1 = "http://agora.ex.nii.ac.jp/digital-typhoon/globe/color/20{}/8192x8192/MTS1{}00.globe.0.jpg" base_url2 = "http://agora.ex.nii.ac.jp/digital-typhoon/globe/color/20{}/8192x8192/MTS2{}00.globe.0.jpg" base_url3 = "http://agora.ex.nii.ac.jp/digital-typhoon/globe/color/20{}/8192x8192/HMW8{}00.globe.0.jpg" path = "/Users/iii/Desktop/imgs/" queue = Queue() for s_year in yearAry: for idx, month_num in enumerate(monthAry): if idx + 1 < 7: continue month = idx + 1 s_month = "0" + str(idx + 1) if idx + 1 < 10 else str(idx + 1) for date in range(month_num): s_date = "0" + str(date + 1) if date + 1 < 10 else str(date + 1) s_total = s_year + "," + s_year + s_month + s_date queue.put(s_total) # threads = map(lambda i: Thread(target=worker), xrange(NUM_THREADS)) # map(lambda th: th.start(), threads) # map(lambda th: th.join(), threads) cpus = multiprocessing.cpu_count() print cpus with ProcessPoolExecutor(max_workers=cpus) as executor: while not queue.empty(): page = queue.get() executor.submit(crawler, page)
def __init__( self, config: Config, loop, *, orm_base=None, using_box: Box = None, ) -> None: """Initialize""" logging.config.dictConfig(config.LOGGING) logger = logging.getLogger(f'{__name__}.Bot.__init__') logger.info('start') Namespace._bot = self self.process_pool_executor = ProcessPoolExecutor() self.thread_pool_executor = ThreadPoolExecutor() logger.info('connect to DB') config.DATABASE_ENGINE = get_database_engine(config) logger.info('connect to memcache') self.mc = aiomcache.Client( host=config.CACHE['HOST'], port=config.CACHE['PORT'], ) self.cache = Cache(self.mc, config.CACHE.get('PREFIX', 'YUI_')) logger.info('import apps') for app_name in config.APPS: logger.debug('import apps: %s', app_name) importlib.import_module(app_name) self.config = config self.loop = loop self.loop.set_debug(self.config.DEBUG) self.orm_base = orm_base or Base self.box = using_box or box self.queue: asyncio.Queue = asyncio.Queue() self.api = SlackAPI(self) self.channels: list[PublicChannel] = [] self.ims: list[DirectMessageChannel] = [] self.groups: list[PrivateChannel] = [] self.users: list[User] = [] self.restart = False self.is_ready = False self.method_last_call: defaultdict[str, datetime] = defaultdict(now) self.method_queue: defaultdict[str, list] = defaultdict(list) self.config.check( self.box.config_required, self.box.channel_required, self.box.channels_required, self.box.user_required, self.box.users_required, ) if self.config.REGISTER_CRONTAB: logger.info('register crontab') self.register_tasks()
def _run_tests(all_tests, log_name_base, extra_args): global stop, executor, futures, system_compiler xmlname = log_name_base + '.xml' junit_root = ET.Element('testsuites') conf_time = 0 build_time = 0 test_time = 0 passing_tests = 0 failing_tests = 0 skipped_tests = 0 commands = (compile_commands, clean_commands, install_commands, uninstall_commands) try: # This fails in some CI environments for unknown reasons. num_workers = multiprocessing.cpu_count() except Exception as e: print( 'Could not determine number of CPUs due to the following reason:' + str(e)) print('Defaulting to using only one process') num_workers = 1 # Due to Ninja deficiency, almost 50% of build time # is spent waiting. Do something useful instead. # # Remove this once the following issue has been resolved: # https://github.com/mesonbuild/meson/pull/2082 num_workers *= 2 executor = ProcessPoolExecutor(max_workers=num_workers) for name, test_cases, skipped in all_tests: current_suite = ET.SubElement(junit_root, 'testsuite', { 'name': name, 'tests': str(len(test_cases)) }) print() if skipped: print(bold('Not running %s tests.' % name)) else: print(bold('Running %s tests.' % name)) print() futures = [] for t in test_cases: # Jenkins screws us over by automatically sorting test cases by name # and getting it wrong by not doing logical number sorting. (testnum, testbase) = os.path.split(t)[-1].split(' ', 1) testname = '%.3d %s' % (int(testnum), testbase) should_fail = False if name.startswith('failing'): should_fail = name.split('failing-')[1] result = executor.submit(run_test, skipped, t, extra_args, system_compiler, backend, backend_flags, commands, should_fail) futures.append((testname, t, result)) for (testname, t, result) in futures: sys.stdout.flush() result = result.result() if result is None or 'MESON_SKIP_TEST' in result.stdo: print(yellow('Skipping:'), t) current_test = ET.SubElement(current_suite, 'testcase', { 'name': testname, 'classname': name }) ET.SubElement(current_test, 'skipped', {}) skipped_tests += 1 else: without_install = "" if len( install_commands) > 0 else " (without install)" if result.msg != '': print( red('Failed test{} during {}: {!r}'.format( without_install, result.step.name, t))) print('Reason:', result.msg) failing_tests += 1 if result.step == BuildStep.configure and result.mlog != no_meson_log_msg: # For configure failures, instead of printing stdout, # print the meson log if available since it's a superset # of stdout and often has very useful information. failing_logs.append(result.mlog) else: failing_logs.append(result.stdo) failing_logs.append(result.stde) else: print('Succeeded test%s: %s' % (without_install, t)) passing_tests += 1 conf_time += result.conftime build_time += result.buildtime test_time += result.testtime total_time = conf_time + build_time + test_time log_text_file(logfile, t, result.stdo, result.stde) current_test = ET.SubElement( current_suite, 'testcase', { 'name': testname, 'classname': name, 'time': '%.3f' % total_time }) if result.msg != '': ET.SubElement(current_test, 'failure', {'message': result.msg}) stdoel = ET.SubElement(current_test, 'system-out') stdoel.text = result.stdo stdeel = ET.SubElement(current_test, 'system-err') stdeel.text = result.stde print("\nTotal configuration time: %.2fs" % conf_time) print("Total build time: %.2fs" % build_time) print("Total test time: %.2fs" % test_time) ET.ElementTree(element=junit_root).write(xmlname, xml_declaration=True, encoding='UTF-8') return passing_tests, failing_tests, skipped_tests
def main(): # Make repository parser = get_parser() args = parser.parse_args() out_reg, out_autoreg = make_repo_from_parser(args) # Prepare model labels if (not args.with_init) and (args.with_forcing): label_add = "(no init)" elif (args.with_init) and (not args.with_forcing): label_add = "(no forcing)" elif (args.with_init) and (args.with_forcing): label_add = "" elif (not args.with_init) and (not args.with_forcing): label_add = "(no init, no forcing)" # Initialize result dicts reg_results = {"label": "lin reg " + label_add, "scores": []} shuffled_results = {"word_freqs": [], "word_lengths": []} autoreg_results = {"label": "lin autoreg " + label_add, "scores": []} # Loop over subjects (in parallel) with ProcessPoolExecutor(args.n_workers) as pool: pendings = [] for sub in range(args.n_subjects): pendings.append( pool.submit( eval_lin_models, sub, args.data, out_reg, out_autoreg, with_forcing=args.with_forcing, with_init=args.with_init, shuffle=args.shuffle)) for pending in tqdm.tqdm(pendings): (score_linreg, score_linautoreg, shuffled) = pending.result() # stack results in lists reg_results["scores"].append(score_linreg) autoreg_results["scores"].append(score_linautoreg) for key, score_shuffled in shuffled.items(): shuffled_results[key].append(score_shuffled) # Making numpy arrays from lists reg_results["scores"] = np.array(reg_results["scores"]) autoreg_results["scores"] = np.array(autoreg_results["scores"]) for key in shuffled_results.keys(): shuffled_results[key] = np.array(shuffled_results[key]) # # Converting to torch arrays # reg_results["scores"] = torch.from_numpy(reg_results["scores"]) # autoreg_results["scores"] = torch.from_numpy(autoreg_results["scores"]) # for key in shuffled_results.keys(): # shuffled_results[key] = torch.from_numpy(shuffled_results[key]) # Save torch.save(reg_results, out_reg / "reference_metrics.th") torch.save(autoreg_results, out_autoreg / "reference_metrics.th") if args.shuffle: for key, value in shuffled_results.items(): torch.save({'scores': value, 'label': 'lin reg ' + label_add}, out_reg / f"shuffled_{key}_metrics.th")
total_articles = len(article_urls)
processed_articles = get_processed_articles_list()
total_processed = len(processed_articles)
print()
print(f'Urls loaded: {total_articles}')
print(f'Processed: {total_processed}')
print()
urls_to_process = [art['url'] for art in article_urls]
print(f'Urls to process: {len(urls_to_process)}')
for done_art in processed_articles:
    if done_art['url'] in urls_to_process:
        urls_to_process.remove(done_art['url'])
print('Starting to read articles...')
futures = []  # futures for the submitted url batches
with ProcessPoolExecutor(4) as executor:
    for article_url_batch in batch(urls_to_process, 20):
        urls = list(article_url_batch)
        futures.append(
            executor.submit(run_get_page_articles_process_batch, urls))
    for future in as_completed(futures):
        result = future.result()
        if result is None:
            print('Article process result is None')
        else:
            print(f'Writing {len(result)} articles to articles file')
            total_processed = total_processed + len(result)
            write_processed_articles_to_file(result)
            print(
                f'Processed {total_processed}/{total_articles} articles...')
def test_no_connection_sharing_among_processes(s3): executor = ProcessPoolExecutor() conn_id = executor.submit(_get_s3_id, s3).result() assert id(s3.connect()) != conn_id, \ "Processes should not share S3 connections."
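# Minimal stdlib-only sketch (an assumption, not taken from the test suite above)
# of the same idea: objects shipped to a ProcessPoolExecutor worker are pickled and
# rebuilt in the child, so their id() there differs from the parent's.
import os
from concurrent.futures import ProcessPoolExecutor


def _describe(obj):
    # Runs inside the worker: report the worker pid and the rebuilt object's id.
    return os.getpid(), id(obj)


if __name__ == "__main__":
    payload = {"endpoint": "https://example.invalid"}
    with ProcessPoolExecutor(max_workers=1) as pool:
        worker_pid, remote_id = pool.submit(_describe, payload).result()
    assert worker_pid != os.getpid()
    assert remote_id != id(payload)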
def _fit_model(RV, df_data, keys_d=None, kwrgs_pp={}, stat_model=tuple,
               lags_i=list, verbosity=0):
    #%%
    #    stat_model = fc.stat_model_l[0]
    #    RV = fc.TV
    #    lags_i = [1]
    #    kwrgs_pp={}
    #    keys_d=None
    #    df_data = fc.df_data
    #
    # do forecasting across lags
    splits = df_data.index.levels[0]
    y_pred_all = []
    y_pred_c = []
    models = []

    # store target variable (continuous and binary in y_ts dict)
    if hasattr(RV, 'RV_bin_fit'):
        y_ts = {'cont': RV.RV_ts_fit, 'bin': RV.RV_bin_fit}
    else:
        y_ts = {'cont': RV.RV_ts_fit}

    from time import time
    t0 = time()
    futures = {}
    with ProcessPoolExecutor(max_workers=max_cpu) as pool:
        for lag in lags_i:
            for split in splits:
                fitkey = f'{lag}_{split}'
                futures[fitkey] = pool.submit(fit, y_ts, df_data, lag, split,
                                              stat_model=stat_model,
                                              keys_d=keys_d,
                                              kwrgs_pp=kwrgs_pp,
                                              verbosity=verbosity)
        results = {key: future.result() for key, future in futures.items()}

    # unpack results
    models = dict()
    for lag in lags_i:
        y_pred_l = []
        model_lag = dict()
        for split in splits:
            prediction, model = results[f'{lag}_{split}']
            # store model
            model_lag[f'split_{split}'] = model

            # retrieve original input data
            df_norm = model.X
            TestRV = (df_norm['TrainIsTrue'] == False)[df_norm['y_pred']]
            y_pred_l.append(prediction[TestRV.values])

            if lag == lags_i[0]:
                # ensure that RV timeseries matches y_pred
                TrainRV = (df_norm['TrainIsTrue'])[df_norm['y_pred']]
                RV_bin = RV.RV_bin.loc[TrainRV.index]

                # predicting RV might not be possible
                # determining climatological prevalence in training data
                y_c_mask = np.logical_and(TrainRV, RV_bin.squeeze() == 1)
                y_clim_val = RV_bin[y_c_mask].size / RV_bin.size
                # filling test years with clim of training data
                y_clim = RV_bin[TestRV == True].copy()
                y_clim[:] = y_clim_val
                y_pred_c.append(y_clim)

        models[f'lag_{lag}'] = model_lag

        y_pred_l = pd.concat(y_pred_l)
        y_pred_l = y_pred_l.sort_index()

        if lag == lags_i[0]:
            y_pred_c = pd.concat(y_pred_c)
            y_pred_c = y_pred_c.sort_index()

        y_pred_all.append(y_pred_l)
    y_pred_all = pd.concat(y_pred_all, axis=1)
    print("\n")
    print(time() - t0)
    print(f'{stat_model} ')
    #%%
    return y_pred_all, y_pred_c, models
def submit_qca_optimization_dataset(
        dataset_name=None,
        metadata=None,
        compute_spec=None,
        input_molecules=None,
        server="from_file",
        threads=None,
        compute_tag=None,
        priority="normal",
        skip_compute=False,
):
    """
    Create or update an optimization dataset.

    Parameters
    ----------
    dataset_name : str
        The name of the dataset. This is needed if the dataset already exists and
        no metadata is supplied. Useful when e.g. adding computes or molecules to
        an existing dataset.
    metadata : str
        A filename specifying the metadata needed to create a new dataset, in JSON
        format. An example metadata has the following format:
        {
            "submitter": "trevorgokey",
            "creation_date": "2020-09-18",
            "collection_type": "OptimizationDataset",
            "dataset_name": "OpenFF Sandbox CHO PhAlkEthOH v1.0",
            "short_description": "A diverse set of CHO molecules",
            "long_description_url": "https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2020-09-18-OpenFF-Sandbox-CHO-PhAlkEthOH",
            "long_description": "This dataset contains an expanded set of the AlkEthOH and PhEthOH datasets, which were used in the original derivation of the smirnoff99Frosst parameters.",
            "elements": ["C", "H", "O"],
            "change_log": [
                {"author": "trevorgokey",
                 "date": "2020-09-18",
                 "version": "1.0",
                 "description": "A diverse set of CHO molecules. The molecules in this set were generated to include all stereoisomers if chirality was ambiguous from the SMILES input. Conformations were generated which had an RMSD of at least 4 Angstroms from all other conformers"}
            ]
        }
    compute_spec : str
        A filename specifying the compute specifications for the dataset, in JSON
        format.
    input_molecules : str
        A filename specifying the molecules to load into the dataset as entries,
        in JSON format.
    server : str
        The server URI to connect to. The special value 'from_file' will read from
        the default server connection config file for e.g. authentication.
    threads : int
        The number of threads to use when contacting the server.
    compute_tag : str
        The compute tag used to match computations with compute managers. For
        OpenFF calculations, this should be "openff".
    priority : str
        The priority of new calculations to submit. This must be either "low",
        "normal", or "high".
skip_compute : bool Do not submit the tasks after the molecules and compute specifications have been added Returns ------- None """ ds_type = "OptimizationDataset" ds_name = dataset_name if server == "from_file": # Connect to a server that needs authentication client = ptl.FractalClient().from_file() elif server is not None: # Use a custom server, possibly a local, private server client = ptl.FractalClient(server, verify=False) else: # Use the default public MOLSSI server client = ptl.FractalClient() try: ds = client.get_collection(ds_type, ds_name) logger.info("\nDataset loaded with the following metadata:") logger.info(pformat(ds.data.metadata)) except KeyError: assert metadata is not None metadata = json.load(open(metadata)) metadata["collection_type"] = ds_type if ds_name is not None: metadata["dataset_name"] = ds_name else: ds_name = metadata["dataset_name"] ds = getattr(ptl.collections, ds_type)( ds_name, client=client, metadata=metadata, description=metadata["short_description"], tags=["openff"], tagline=metadata["short_description"], ) logger.info("\nDataset created with the following metadata:") logger.info(pformat(metadata)) if compute_spec is not None: specs = json.load(open(compute_spec)) add_compute_specs(ds, specs) if input_molecules is not None: pool = ProcessPoolExecutor(max_workers=threads) new_mols = 0 new_calcs = 0 total_calcs = 0 logger.info("\nLoading {} into QCArchive...".format(input_molecules)) if input_molecules.endswith("lzma") or input_molecules.endswith("xz"): input_ds = json.load(lzma.open(input_molecules, "rt")) elif input_molecules.endswith("bz2"): input_ds = json.load(bz2.open(input_molecules, "rt")) else: input_ds = json.load(open(input_molecules)) logger.info("Number of unique molecules: {}".format(len(input_ds))) work = [] for j, index in enumerate(input_ds): for i, mol in enumerate(input_ds[index], 1): work_unit = pool.submit(submit, *(ds, index, mol, i)) work.append(work_unit) ds.save() ids = [] new_entries = 0 iterable = enumerate(as_completed(work)) if logger.getEffectiveLevel() >= logging.INFO: iterable = tqdm.tqdm(iterable, total=len(work), ncols=80, desc="Entries") for j, unit in iterable: unique_id, success = unit.result() new_entries += int(success) ids.append(unique_id) new_mols += len(input_ds) new_calcs += new_entries total_calcs += len(ids) logger.info("\nNumber of new entries: {}/{}".format( new_entries, len(ids))) stride = 20 # Only submit tasks that were explicitly given as parameters if compute_spec is not None and not skip_compute: new_tasks = 0 for qc_spec_name in specs: out_str = ( "\nSubmitting calculations in batches of {} for specification {}" ) logger.info(out_str.format(stride, qc_spec_name)) work = [] args = (qc_spec_name, ) kwargs = dict(priority=priority, tag=compute_tag) for entry_list in chunk(ids, stride): kwargs["subset"] = entry_list work_unit = pool.submit(ds.compute, *args, **kwargs) work.append(work_unit) iterable = as_completed(work) if logger.getEffectiveLevel() >= logging.INFO: iterable = tqdm.tqdm(iterable, total=len(work), ncols=80, desc="Tasks") for unit in iterable: submitted = unit.result() new_tasks += submitted logger.info("\nNumber of new tasks: {}".format(new_tasks)) pool.shutdown(wait=True)
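# Hedged aside: the chunk() helper used above to batch entry ids is not shown in
# this excerpt; the following is one plausible, generic implementation (an
# assumption, not the project's own code).
def chunk(items, size):
    """Yield consecutive slices of at most `size` elements."""
    for start in range(0, len(items), size):
        yield items[start:start + size]


# e.g. list(chunk(list(range(7)), 3)) -> [[0, 1, 2], [3, 4, 5], [6]]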
def do_nothing(*args, **kwargs): pass def disable_stdout(): import os import sys f = open(os.devnull, 'w') sys.stdout = f sys.stdout.flush = do_nothing sys.stdout.write = do_nothing tx = TaskExecutor.load("conf/config.yaml", multi_process=False) proc_pool = ProcessPoolExecutor(max_workers=64) #proc_pool = ThreadPoolExecutor(max_workers=64) tick_buffer = BufferedDataProcessor(num_worker=4) history_buffer = BufferedDataProcessor(num_worker=8) history_index_buffer = BufferedDataProcessor(num_worker=1) def logtime(key): return lambda t: logging.debug("%s: %.3fs", key, t) @tick_buffer.on_combine @history_buffer.on_combine @history_index_buffer.on_combine def df_merge(a, b): return pd.concat([a, b])
def process_pool_executor_handler(executor: ProcessPoolExecutor, manager: DownloadProcess, file_maps: Dict[str, str], directory: str, progress_bar_queue) -> None: done_queue = JoinableQueue() def update_hook(future: Future): temp = future.result() if temp: for failed_links in temp: done_queue.put(failed_links) while manager.done_retries != manager.max_retries: print( f"Starting download {manager.get_total_links() - manager.get_total_downloaded_links_count()} links left" ) available_cpus = list(os.sched_getaffinity( os.getpid())) if platform.system() == "Linux" else [0, 1, 2, 3] print( f"available cpu's {available_cpus}, initializing {4 * manager.get_process_num()}" f" threads with {manager.get_thread_num()} links per " f"process") if len(manager.error_links): download_links = manager.error_links.copy() manager.error_links = [] else: download_links = manager.get_download_links().copy() process_futures: List[Future] = [] start = 0 for temp_num in range(len(download_links)): end = start + manager.get_thread_num() if end > len(download_links): end = len(download_links) cpu_num = available_cpus[temp_num % len(available_cpus)] process_futures.append( executor.submit(start_threads, download_links[start:end], file_maps, manager.get_session(), directory, manager.http2, progress_bar_queue, manager.debug, cpu_num)) process_futures[-1].add_done_callback(update_hook) start = end if end >= len(download_links): break wait(process_futures) while not done_queue.empty(): link = done_queue.get() manager.error_links.append(link) manager.set_total_downloaded_links_count(manager.get_total_links() - len(manager.error_links)) if manager.debug: print( f"Total downloaded links {manager.get_total_downloaded_links_count()}" ) print(f"Error links generated {len(manager.error_links)}") if len(manager.error_links): manager.set_thread_num( int( ceil((manager.get_total_links() - manager.get_total_downloaded_links_count()) / manager.get_process_num()))) print( f"\n{manager.get_total_links()} was expected but " f"{manager.get_total_downloaded_links_count()} was downloaded." ) manager.done_retries += 1 print(f"Trying retry {manager.done_retries}") else: break
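# Hedged sketch (much simplified from the handler above): Future.add_done_callback
# fires in the parent process (in an executor helper thread) once a worker finishes,
# so a thread-safe queue.Queue in the parent is enough to collect failed items for a
# retry pass. All names below are illustrative, not taken from the source.
import queue
from concurrent.futures import ProcessPoolExecutor, wait


def _download(url):
    # Stand-in for a real download; pretend odd-length URLs fail and are returned.
    return [] if len(url) % 2 == 0 else [url]


if __name__ == "__main__":
    failed = queue.Queue()

    def on_done(future):
        for url in future.result():
            failed.put(url)

    urls = ["http://a", "http://bb", "http://ccc"]
    with ProcessPoolExecutor(max_workers=2) as pool:
        futures = [pool.submit(_download, u) for u in urls]
        for f in futures:
            f.add_done_callback(on_done)
        wait(futures)
    retry = [failed.get() for _ in range(failed.qsize())]
    print(retry)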
def setup_routes(app): app.router.add_get('/data', get_all_data) app.router.add_get('/data/{mac}', get_data) if __name__ == '__main__': tags = { 'F4:A5:74:89:16:57': 'kitchen', 'CC:2C:6A:1E:59:3D': 'bedroom', 'BB:2C:6A:1E:59:3D': 'livingroom' } m = Manager() q = m.Queue() # Start background process executor = ProcessPoolExecutor(1) executor.submit(run_get_data_background, list(tags.keys()), q) loop = asyncio.get_event_loop() # Start data updater loop.create_task(data_update(q)) # Setup and start web application app = web.Application(loop=loop) setup_routes(app) web.run_app(app, host='0.0.0.0', port=5000)
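# Hedged sketch (assumed behaviour; data_update above is not shown): one way to
# consume a multiprocessing.Manager queue from asyncio without blocking the event
# loop is to poll it with get_nowait() inside a periodic task, which is presumably
# what the data_update task does with the queue filled by the background process.
import asyncio
import queue


async def drain_queue(q, store, interval=1.0):
    """Periodically move (mac, reading) pairs from a Manager queue into a dict."""
    while True:
        try:
            while True:
                mac, reading = q.get_nowait()
                store[mac] = reading
        except queue.Empty:
            pass
        await asyncio.sleep(interval)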
imgs_list = soup.find_all('img') if len(imgs_list) != 0: img_url = 'https:' + imgs_list[-1]['src'] else: img_url = 'error_img' return url + '\t' + img_url # imgs['url'] = imgs.img.apply(crawl) imgs = pd.read_csv(os.path.join(path, 'train_all_json/train_img.txt'), header=None, names=['img']) # img count 305613 for j in range(0, 3050): print('save:', j * 100, (j + 1) * 100) f = open(os.path.join(path, 'img_url.txt'), 'a+') with ProcessPoolExecutor(8) as pool: if j == 3049: p = pool.map(crawl, imgs.img[j * 100:]) else: p = pool.map(crawl, imgs.img[j * 100:(j + 1) * 100]) for i in p: f.write(i + '\n') f.close() # download image def get_image(img, url): usr = img.split('/')[-2] file_suffix = img.split('/')[-1] + '.jpg' file_path = os.path.join(path, 'train', usr) try:
print('Available pages:\n')
for page in os.listdir('/opt/snare/pages/'):
    print('\t- {}'.format(page))
print('\nuse with --page-dir {page_name}\n\n')
exit()
if not os.path.exists('/opt/snare/pages/' + args.page_dir):
    print("--page-dir: {0} does not exist".format(args.page_dir))
    exit()
if not os.path.exists('/opt/snare/pages/' + args.page_dir + "/" +
                      args.index_page):
    print("can't create meta tag")
else:
    add_meta_tag(args.page_dir, args.index_page)
loop = asyncio.get_event_loop()
loop.run_until_complete(check_tanner())
pool = ProcessPoolExecutor(max_workers=multiprocessing.cpu_count())
compare_version_fut = None
if args.auto_update is True:
    timeout = parse_timeout(args.update_timeout)
    compare_version_fut = loop.run_in_executor(pool, compare_version_info,
                                               timeout)
if args.host_ip == 'localhost' and args.interface:
    host_ip = ni.ifaddresses(args.interface)[2][0]['addr']
else:
    host_ip = args.host_ip
future = loop.create_server(
    lambda: HttpRequestHandler(args, debug=args.debug, keep_alive=75),
    args.interface, int(args.port))
srv = loop.run_until_complete(future)
drop_privileges()
def __iter__(self): from concurrent.futures import ProcessPoolExecutor with ProcessPoolExecutor() as executor: return _coconut.iter( _coconut.list(executor.map(self.func, *self.iters)))
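# Hedged aside: when the mapped function is cheap and the iterables are long,
# ProcessPoolExecutor.map accepts a chunksize argument (since Python 3.5) that
# batches items per pickle round-trip and can cut inter-process overhead
# substantially. A minimal sketch, not related to the class above:
from concurrent.futures import ProcessPoolExecutor


def _double(x):
    return 2 * x


if __name__ == "__main__":
    with ProcessPoolExecutor() as executor:
        results = list(executor.map(_double, range(10_000), chunksize=256))
    print(results[:5])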
def main(): args = parse_arguments() if args.use_env and 'LOCAL_RANK' in os.environ: args.local_rank = int(os.environ['LOCAL_RANK']) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) device, args = setup_training(args) dllogger.log(step="PARAMETER", data={"Config": [str(args)]}) # Prepare optimizer model, optimizer, lr_scheduler, checkpoint, global_step = prepare_model_and_optimizer( args, device) if is_main_process(): dllogger.log(step="PARAMETER", data={"SEED": args.seed}) raw_train_start = time.time() if args.do_train: if is_main_process(): dllogger.log(step="PARAMETER", data={"train_start": True}) dllogger.log(step="PARAMETER", data={"batch_size_per_gpu": args.train_batch_size}) dllogger.log(step="PARAMETER", data={"learning_rate": args.learning_rate}) model.train() most_recent_ckpts_paths = [] average_loss = 0.0 # averaged loss every args.log_freq steps epoch = 0 training_steps = 0 pool = ProcessPoolExecutor(1) # Note: We loop infinitely over epochs, termination is handled via iteration count while True: thread = None if not args.resume_from_checkpoint or epoch > 0 or ( args.phase2 and global_step < 1) or args.init_checkpoint: files = [ os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if os.path.isfile(os.path.join(args.input_dir, f)) and 'training' in f ] files.sort() num_files = len(files) random.shuffle(files) f_start_id = 0 else: f_start_id = checkpoint['files'][0] files = checkpoint['files'][1:] args.resume_from_checkpoint = False num_files = len(files) shared_file_list = {} if torch.distributed.is_initialized( ) and torch.distributed.get_world_size() > num_files: remainder = torch.distributed.get_world_size() % num_files data_file = files[ (f_start_id * torch.distributed.get_world_size() + torch.distributed.get_rank() + remainder * f_start_id) % num_files] else: data_file = files[ (f_start_id * torch.distributed.get_world_size() + torch.distributed.get_rank()) % num_files] previous_file = data_file train_data = pretraining_dataset(data_file, args.max_predictions_per_seq) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size * args.n_gpu, num_workers=4, pin_memory=True) # shared_file_list["0"] = (train_dataloader, data_file) overflow_buf = None if args.allreduce_post_accumulation: overflow_buf = torch.cuda.IntTensor([0]) if len(files) == 1: f_start_id = -1 for f_id in range(f_start_id + 1, len(files)): if torch.distributed.get_world_size() > num_files: data_file = files[ (f_id * torch.distributed.get_world_size() + torch.distributed.get_rank() + remainder * f_id) % num_files] else: data_file = files[ (f_id * torch.distributed.get_world_size() + torch.distributed.get_rank()) % num_files] previous_file = data_file dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, shared_file_list, args) train_iter = tqdm(train_dataloader, desc="Iteration" ) if is_main_process() else train_dataloader for step, batch in enumerate(train_iter): training_steps += 1 batch = [t.to(device) for t in batch] input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch loss = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, masked_lm_labels=masked_lm_labels, next_sentence_label=next_sentence_labels, checkpoint_activations=args.checkpoint_activations) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
divisor = args.gradient_accumulation_steps if args.gradient_accumulation_steps > 1: if not args.allreduce_post_accumulation: # this division was merged into predivision loss = loss / args.gradient_accumulation_steps divisor = 1.0 if args.fp16: with amp.scale_loss( loss, optimizer, delay_overflow_check=args. allreduce_post_accumulation) as scaled_loss: scaled_loss.backward() else: loss.backward() average_loss += loss.item() if training_steps % args.gradient_accumulation_steps == 0: lr_scheduler.step() # learning rate warmup global_step = take_optimizer_step( args, optimizer, model, overflow_buf, global_step) if global_step >= args.max_steps: train_time_raw = time.time() - raw_train_start last_num_steps = int( training_steps / args.gradient_accumulation_steps) % args.log_freq last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps average_loss = torch.tensor( average_loss, dtype=torch.float32).cuda() average_loss = average_loss / (last_num_steps * divisor) if (torch.distributed.is_initialized()): average_loss /= torch.distributed.get_world_size() torch.distributed.all_reduce(average_loss) final_loss = average_loss.item() if is_main_process(): dllogger.log(step=( epoch, training_steps / args.gradient_accumulation_steps, ), data={"final_loss": final_loss}) elif training_steps % ( args.log_freq * args.gradient_accumulation_steps) == 0: if is_main_process(): dllogger.log( step=( epoch, global_step, ), data={ "average_loss": average_loss / (args.log_freq * divisor), "step_loss": loss.item() * args.gradient_accumulation_steps / divisor, "learning_rate": optimizer.param_groups[0]['lr'] }) average_loss = 0 if global_step >= args.max_steps or training_steps % ( args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0: if is_main_process() and not args.skip_checkpoint: # Save a trained model dllogger.log(step="PARAMETER", data={"checkpoint_step": global_step}) model_to_save = model.module if hasattr( model, 'module' ) else model # Only save the model it-self if args.resume_step < 0 or not args.phase2: output_save_file = os.path.join( args.output_dir, "ckpt_{}.pt".format(global_step)) else: output_save_file = os.path.join( args.output_dir, "ckpt_{}.pt".format(global_step + args.phase1_end_step)) if args.do_train: torch.save( { 'model': model_to_save.state_dict(), 'optimizer': optimizer.state_dict(), 'master params': list(amp.master_params(optimizer)), 'files': [f_id] + files }, output_save_file) most_recent_ckpts_paths.append( output_save_file) if len(most_recent_ckpts_paths) > 3: ckpt_to_be_removed = most_recent_ckpts_paths.pop( 0) os.remove(ckpt_to_be_removed) if global_step >= args.max_steps: del train_dataloader # thread.join() return args, final_loss, train_time_raw del train_dataloader # thread.join() # Make sure pool has finished and switch train_dataloader # NOTE: Will block until complete train_dataloader, data_file = dataset_future.result( timeout=None) epoch += 1
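# Hedged sketch of the prefetch pattern used in the training loop above, with
# generic names and no torch dependency (an illustration, not the original code):
# a single-worker ProcessPoolExecutor loads the next data shard while the current
# shard is being consumed, and result() swaps them in at the shard boundary.
from concurrent.futures import ProcessPoolExecutor


def load_shard(path):
    # Stand-in for create_pretraining_dataset: pretend a shard is a list of ints.
    return [hash((path, i)) % 100 for i in range(4)]


def consume(shard):
    # Stand-in for iterating the DataLoader and running training steps.
    return sum(shard)


if __name__ == "__main__":
    paths = [f"shard_{i}.hdf5" for i in range(3)]
    with ProcessPoolExecutor(max_workers=1) as pool:
        current = load_shard(paths[0])
        for next_path in paths[1:] + [None]:
            prefetch = pool.submit(load_shard, next_path) if next_path else None
            consume(current)                 # work on the shard already in memory
            if prefetch is not None:
                current = prefetch.result()  # block only at the shard boundary
    print("done")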