def sygusCompetition(checkpoints, tasks):
    from pathos.multiprocessing import Pool
    import datetime

    # map from task to list of search times, one for each checkpoint.
    # search time will be None if it is not solved
    searchTimes = {t: [] for t in tasks}

    CPUs = int(8 / len(checkpoints))
    maxWorkers = int(numberOfCPUs() / CPUs)
    workers = Pool(maxWorkers)
    eprint(
        f"You gave me {len(checkpoints)} checkpoints to ensemble. Each checkpoint will get {CPUs} CPUs. Creating a pool of {maxWorkers} worker processes."
    )
    timeout = 3600

    promises = []
    for t in tasks:
        for checkpoint in checkpoints:
            promise = workers.apply_async(competeOnOneTask, (checkpoint, t),
                                          {"CPUs": CPUs, "timeout": timeout})
            promises.append(promise)
    eprint(f"Queued {len(promises)} jobs.")

    for promise in promises:
        dt, task = promise.get()
        if dt is not None:
            searchTimes[task].append(dt)

    searchTimes = {
        t: min(ts) if len(ts) > 0 else None
        for t, ts in searchTimes.items()
    }

    fn = "experimentOutputs/text_competition_%s.p" % (
        datetime.datetime.now().isoformat())
    with open(fn, "wb") as handle:
        pickle.dump(searchTimes, handle)

    eprint()
    hits = sum(t is not None for t in searchTimes.values())
    total = len(searchTimes)
    percentage = 100 * hits / total
    eprint("Hits %d/%d = %f\n" % (hits, total, percentage))
    eprint()
    eprint("Exported competition results to", fn)

def backgroundHelmholtzEnumeration(tasks, g, timeout, _=None,
                                   special=None, evaluationTimeout=None):
    from pathos.multiprocessing import Pool

    requests = list({t.request for t in tasks})
    inputs = {
        r: list({
            tuplify(xs)
            for t in tasks if t.request == r
            for xs, y in t.examples
        })
        for r in requests
    }
    workers = Pool(len(requests))
    promises = [
        workers.apply_async(helmholtzEnumeration,
                            args=(g, r, inputs[r], float(timeout)),
                            kwds={
                                'special': special,
                                'evaluationTimeout': evaluationTimeout
                            }) for r in requests
    ]

    def get():
        results = [p.get() for p in promises]
        frontiers = []
        with timing("(Helmholtz enumeration) Decoded json into frontiers"):
            for request, result in zip(requests, results):
                response = json.loads(result.decode("utf-8"))
                for b, entry in enumerate(response):
                    frontiers.append(
                        Frontier([
                            FrontierEntry(program=Program.parse(p),
                                          logPrior=entry["ll"],
                                          logLikelihood=0.)
                            for p in entry["programs"]
                        ], task=Task(str(b), request, [])))
        eprint("Total number of Helmholtz frontiers:", len(frontiers))
        return frontiers

    return get

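# A minimal, self-contained sketch of the deferred-collection pattern used by
# backgroundHelmholtzEnumeration above: apply_async jobs are queued immediately,
# and a closure is returned so the caller only blocks on .get() when the results
# are actually needed. The names here (slow_square, background_squares) are
# illustrative, not part of the original code.
def background_squares(xs):
    from pathos.multiprocessing import Pool

    def slow_square(x):
        import time
        time.sleep(0.1)  # stand-in for expensive enumeration work
        return x * x

    pool = Pool(4)
    promises = [pool.apply_async(slow_square, (x,)) for x in xs]

    def collect():
        # Blocks only here, once the caller finally asks for the results.
        out = [p.get() for p in promises]
        pool.close()
        pool.join()
        return out

    return collect

# Usage sketch: start the work, do other things, then collect.
# get_squares = background_squares(range(8))
# ... other work ...
# print(get_squares())
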
def run(self, integrator):
    if self.n_cores == 1:
        # run single-threaded
        self.out_img = integrator.run(self.rows_pool[0], self.cols_pool[0],
                                      self.camera.get_ray,
                                      self.world)[0].reshape(
                                          (self.height, self.width, 3))
    else:
        # run with multiple worker processes
        pool = Pool(processes=self.n_cores)  # create pool of worker processes
        results = [
            pool.apply_async(integrator.run,
                             args=(self.rows_pool[core_idx],
                                   self.cols_pool[core_idx],
                                   self.camera.get_ray, self.world))
            for core_idx in range(self.n_cores)
        ]
        output = [p.get() for p in results]  # get results

        # map results to the resulting image
        for out in output:
            self.out_img[out[1], out[2], :] = out[0]

def estimate_param_scan(estimator, X, param_sets, evaluate=None,
                        evaluate_args=None, failfast=True,
                        return_estimators=False, n_jobs=1,
                        progress_reporter=None, show_progress=True,
                        return_exceptions=False):
    """ Runs multiple estimations using a list of parameter settings

    Parameters
    ----------
    estimator : Estimator object or class
        An estimator object that provides an estimate(X, **params) function.
        If only a class is provided here, the Estimator objects will be
        constructed with default parameter settings, and the parameter
        settings from param_sets for each estimation. If you want to specify
        other parameter settings for those parameters not specified in
        param_sets, construct an Estimator before and pass the object.

    param_sets : iterable over dictionaries
        An iterable that provides parameter settings. Each element defines a
        parameter set, for which an estimation will be run using these
        parameters in estimate(X, **params). All other parameter settings
        will be taken from the default settings in the estimator object.

    evaluate : str or list of str, optional
        The given methods or properties will be called on the estimated
        models, and their results will be returned instead of the full
        models. This may be useful for reducing memory overhead.

    evaluate_args : iterable of iterable, optional
        Arguments to be passed to the evaluated methods. Note that its size
        has to match the size of evaluate.

    failfast : bool
        If True, will raise an exception when an estimation failed with an
        exception or when trying to call a method that doesn't exist. If
        False, will simply return None in these cases.

    return_estimators : bool
        If True, return a list of estimators in addition to the models.

    show_progress : bool
        If the given estimator supports the show_progress interface, we set
        the flag prior to doing estimations.

    return_exceptions : bool, default=False
        If failfast is False while this setting is True, returns the
        exception thrown at the actual grid element, instead of None.

    Returns
    -------
    models : list of model objects or evaluated function values
        A list of estimated models in the same order as param_sets. If
        evaluate is given, each element will contain the results from these
        method evaluations.

    estimators (optional) : list of estimator objects. These are returned
        only if return_estimators=True

    Examples
    --------

    Estimate a maximum likelihood Markov model at lag times 1, 2, 3.

    >>> from pyemma.msm.estimators import MaximumLikelihoodMSM, BayesianMSM
    >>>
    >>> dtraj = [0,0,1,2,1,0,1,0,1,2,2,0,0,0,1,1,2,1,0,0,1,2,1,0,0,0,1,1,0,1,2] # mini-trajectory
    >>> param_sets=param_grid({'lag': [1,2,3]})
    >>>
    >>> estimate_param_scan(MaximumLikelihoodMSM, dtraj, param_sets, evaluate='timescales')
    [array([ 1.24113168, 0.77454377]), array([ 2.65266698, 1.42909842]), array([ 5.34810405, 1.14784446])]

    Now we also want to get samples of the timescales using the BayesianMSM.

    >>> estimate_param_scan(MaximumLikelihoodMSM, dtraj, param_sets, failfast=False,
    ...     evaluate=['timescales', 'timescales_samples']) # doctest: +SKIP
    [[array([ 1.24113168, 0.77454377]), None], [array([ 2.48226337, 1.54908754]), None],
     [array([ 3.72339505, 2.32363131]), None]]

    We get Nones because the MaximumLikelihoodMSM estimator doesn't provide
    timescales_samples. Use for example a Bayesian estimator for that.

    >>> estimate_param_scan(BayesianMSM, dtraj, param_sets, show_progress=False,
    ...     evaluate=['timescales', 'sample_f'], evaluate_args=((), ('timescales', ))) # doctest: +SKIP
    [[array([ 1.24357685, 0.77609028]), [array([ 1.5963252 , 0.73877883]),
      array([ 1.29915847, 0.49004912]), array([ 0.90058583, 0.73841786]), ... ]]

    """
    # make sure we have an estimator object
    estimator = get_estimator(estimator)
    if hasattr(estimator, 'show_progress'):
        estimator.show_progress = show_progress

    # if we want to return estimators, make clones. Otherwise just copy references.
    # For parallel processing we always need clones.
    # Also if the Estimator is its own Model, we have to clone.
    from pyemma._base.model import Model
    if (return_estimators or n_jobs > 1 or n_jobs is None
            or isinstance(estimator, Model)):
        estimators = [clone_estimator(estimator) for _ in param_sets]
    else:
        estimators = [estimator for _ in param_sets]

    # if we evaluate, make sure we have a list of functions to evaluate
    if _types.is_string(evaluate):
        evaluate = [evaluate]
    if _types.is_string(evaluate_args):
        evaluate_args = [evaluate_args]

    if evaluate is not None and evaluate_args is not None \
            and len(evaluate) != len(evaluate_args):
        raise ValueError(
            "length mismatch: evaluate ({}) and evaluate_args ({})".format(
                len(evaluate), len(evaluate_args)))

    show_progress = progress_reporter is not None and show_progress
    if show_progress:
        progress_reporter._progress_register(
            len(estimators), stage=0,
            description="estimating %s" % str(estimator.__class__.__name__))

    if n_jobs > 1 and os.name == 'posix':
        if hasattr(estimators[0], 'logger'):
            estimators[0].logger.debug('estimating %s with n_jobs=%s',
                                       estimator, n_jobs)
        # iterate over parameter settings
        task_iter = ((estimator, param_set, X, evaluate, evaluate_args,
                      failfast, return_exceptions)
                     for estimator, param_set in zip(estimators, param_sets))

        from pathos.multiprocessing import Pool as Parallel
        pool = Parallel(processes=n_jobs)
        args = list(task_iter)
        if show_progress:
            from pyemma._base.model import SampledModel
            for a in args:
                if isinstance(a[0], SampledModel):
                    a[0].show_progress = False

            def callback(_):
                progress_reporter._progress_update(1, stage=0)
        else:
            callback = None

        import six
        if six.PY3:
            def error_callback(*args, **kw):
                if failfast:
                    raise Exception('something failed')

            with pool:
                res_async = [
                    pool.apply_async(_estimate_param_scan_worker, a,
                                     callback=callback,
                                     error_callback=error_callback)
                    for a in args
                ]
                res = [x.get() for x in res_async]
        else:
            try:
                res_async = [
                    pool.apply_async(_estimate_param_scan_worker, a,
                                     callback=callback) for a in args
                ]
                res = [x.get() for x in res_async]
            finally:
                pool.close()

    # if n_jobs=1 don't invoke the pool, but directly dispatch the iterator
    else:
        if hasattr(estimators[0], 'logger'):
            estimators[0].logger.debug(
                'estimating %s with n_jobs=1 because of the setting or '
                'because you do not have a POSIX system', estimator)
        res = []
        if show_progress:
            from pyemma._base.model import SampledModel
            if isinstance(estimator, SampledModel):
                for e in estimators:
                    e.show_progress = False

        for estimator, param_set in zip(estimators, param_sets):
            res.append(
                _estimate_param_scan_worker(estimator, param_set, X, evaluate,
                                            evaluate_args, failfast,
                                            return_exceptions))
            if show_progress:
                progress_reporter._progress_update(1, stage=0)

    if show_progress:
        progress_reporter._progress_force_finish(0)

    # done
    if return_estimators:
        return res, estimators
    else:
        return res

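# Minimal sketch of the apply_async + callback pattern that estimate_param_scan
# relies on above: each finished job bumps a progress counter in the parent
# process, and a failing job either aborts the run (failfast) or yields None.
# The names worker and _scan_sketch are illustrative, not part of pyemma.
def _scan_sketch(param_sets, failfast=True):
    from pathos.multiprocessing import Pool

    def worker(params):
        # stand-in for _estimate_param_scan_worker: "estimate" = sum the params
        return sum(params.values())

    progress = {'done': 0}

    def callback(_result):
        # runs in the parent process once per finished job
        progress['done'] += 1

    pool = Pool(processes=2)
    with pool:
        res_async = [pool.apply_async(worker, (p,), callback=callback)
                     for p in param_sets]
        results = []
        for r in res_async:
            try:
                results.append(r.get())
            except Exception:
                if failfast:
                    raise
                results.append(None)  # mirror the failfast=False behaviour
    return results, progress['done']

# Example: _scan_sketch([{'lag': 1}, {'lag': 2}, {'lag': 3}])
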
def learnMulti(self, gn, top_n=5):
    """ Learn using multiprocessing
        gn: initial grammar
        top_n: save the best top_n grammars
    """
    # settings
    max_process = cpu_count()  # processes to run in parallel; or use cpu_count()
    pool = Pool(max_process)
    P_LOCK = Manager().Lock()  # mutex lock for printing
    gidCounter = 0
    results = []
    gList = [gn]
    gnBest = [deepcopy(gn)]
    bestmdl_last = gn.mdl  # best MDL on the upper level nodes
    # uncomment this for debugging
    #history_pri_lik = set()  # history of prior & likelihood values

    while len(gList) > 0:
        gn = gList.pop(0)  # grammar node to be expanded
        self.printMsg(
            1, '>> gList size: %d, bestMDL: %.3f (#%d)' %
            (len(gList), gnBest[0].mdl, gnBest[0].gid))

        # substitute if possible
        ntlist = self.getFirstDLThack(gn.dlt)
        if len(ntlist) > 0:
            self.printMsg(2, '>> Possible substitutions on #%d:' % (gn.gid),
                          ntlist)
            while len(ntlist) > 0:
                gidCounter += 1
                argList = [gn, ntlist.pop(), gidCounter, P_LOCK]
                results.append(
                    pool.apply_async(self.substituteMulti, [argList]))
        else:
            self.printMsg(2, '>> No more SUBSTITUTE possible on #%d\n' % gn.gid)

        # merge if possible
        ntlist = self.mergeSet(gn.g)
        if len(ntlist) > 0:
            self.printMsg(2, '>> Possible merges on #%d:' % (gn.gid), ntlist)
            while len(ntlist) > 0:
                gidCounter += 1
                argList = [gn, ntlist.pop(), gidCounter, P_LOCK]
                results.append(pool.apply_async(self.mergeMulti, [argList]))
        else:
            self.printMsg(2, '>> No more MERGE possible on #%d.\n' % gn.gid)
        del gn

        delList = []
        bestmdl = bestmdl_last

        # search next level in the search tree
        while len(results) >= 1:
            time.sleep(0.001)  # avoid wasting resource
            for r in range(len(results)):
                if results[r].ready():
                    delList.append(r)
                    gn_new = results[r].get()
                    # uncomment for debugging
                    #if gn_new.lik < self.max_mdl:
                    #    history_pri_lik.add((gn_new.pri, gn_new.lik))

                    # save the best-N grammars
                    if len(gnBest) < top_n:
                        gnBest.append(gn_new)
                        gnBest = sorted(gnBest, key=lambda gn: gn.mdl)
                    elif gn_new.mdl < gnBest[-1].mdl:
                        del gnBest[-1]
                        gnBest.append(deepcopy(gn_new))
                        gnBest = sorted(gnBest, key=lambda gn: gn.mdl)

                    # save this level's best mdl
                    if gn_new.mdl < bestmdl:
                        bestmdl = gn_new.mdl

                    # beam search: compare with the best mdl on the upper level
                    if gn_new.mdl >= bestmdl_last:
                        gn_new.worse += 1
                    if gn_new.worse < BEAMSIZE:
                        gList.append(gn_new)
                    else:
                        del gn_new
            delList.sort()
            for d in range(len(delList)):
                del results[delList.pop()]
        bestmdl_last = bestmdl

    pool.close()
    pool.join()
    return gnBest

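# Minimal sketch of the polling pattern used in learnMulti above: rather than
# blocking on each AsyncResult in submission order, the loop sleeps briefly,
# checks .ready() on every outstanding job, consumes the finished ones, and
# queues follow-up work for them. expand and frontier are illustrative names,
# not taken from the original code.
import time
from pathos.multiprocessing import Pool

def expand(n):
    return [n * 2, n * 2 + 1] if n < 8 else []

if __name__ == '__main__':
    pool = Pool(4)
    results = [pool.apply_async(expand, (1,))]
    frontier = []
    while results:
        time.sleep(0.001)  # avoid busy-waiting
        still_pending = []
        for r in results:
            if r.ready():
                children = r.get()
                frontier.extend(children)
                # queue follow-up jobs for the children, as the beam search does
                still_pending.extend(
                    pool.apply_async(expand, (c,)) for c in children)
            else:
                still_pending.append(r)
        results = still_pending
    pool.close()
    pool.join()
    print(sorted(frontier))
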
def generate_output(args):
    """
    Main application driver
        1. Partition filenames into smaller chunks/arrays of image filenames
        2. Generate worker processes
        3. Pass the chunks to the workers
        4. Each worker deduplicates its set of image files
        5. Merge the results from each worker into one python dictionary
        6. OPTIONAL -- Output the deduplicated image files to a directory
    """
    # Partition the list of filenames
    num_chunks = args.num_jobs

    # Create a pool of worker processes
    # Each worker will deduplicate a set of images
    filenames = []
    metadata = None
    end_str = ""
    if args.json_metadata is not None:
        metadata, filenames = process_json_file(args.json_metadata)
        end_str = "from metadata file: %s" % args.json_metadata
    else:
        # Find all image files in the dump directory
        filenames = find_all_images(args.dump_dir)
        end_str = "from directory: %s" % args.dump_dir

    file_chunks = partition_filenames(filenames, num_chunks)

    print("Found {} images {}".format(len(filenames), end_str))

    """
    metadata_results = []
    file_chunk_list = list(file_chunks)
    num_proc = len(file_chunk_list)
    print >> sys.stderr, "Printing file chunks"
    print >> sys.stderr, file_chunks
    pool2 = Pool(processes = num_proc)
    with open(args.json_metadata) as json_metadata_file:
        metadata_results = [pool2.map(process_json_line, json_metadata_file, chunk)
                            for index, chunk in enumerate(file_chunk_list)]
        #objs = [p.get() for p in results]
    metadata = merge_exact_duplicates(metadata_results)
    """

    pool = Pool(processes=num_chunks)

    # Pass the partitions to each worker
    results = []
    final_dictionary = {}

    if not args.near_duplicates:
        if args.num_jobs == 1:
            # If we're only using one worker, don't incur the overhead of starting a process
            result = exact_deduplicate_images(filenames)
            dictionaries = [result]
        else:
            # Get the results from each worker
            results = [
                pool.apply_async(exact_deduplicate_images,
                                 args=(index, chunk,))
                for index, chunk in enumerate(file_chunks)
            ]
            dictionaries = [p.get() for p in results]
        # Merge the results into one dictionary
        final_dictionary = merge_exact_duplicates(dictionaries)
    else:
        if args.num_jobs == 1:
            # If we're only using one worker, don't incur the overhead of starting a process
            result = near_deduplicate_images(filenames, args.bit_distance,
                                             metadata=metadata)
            near_duplicate_objects = [result]
        else:
            # Get the results from each near duplicate worker
            if metadata is not None:
                results = [
                    pool.apply_async(near_deduplicate_images,
                                     (chunk, args.bit_distance,),
                                     dict(metadata=metadata))
                    for chunk in file_chunks
                ]
            else:
                results = [
                    pool.apply_async(near_deduplicate_images,
                                     (chunk, args.bit_distance,))
                    for chunk in file_chunks
                ]
            # create an array of near duplicate objects
            near_duplicate_objects = [p.get() for p in results]
        # Merge the dictionaries together using the info from their corresponding indexes
        final_dictionary = merge_near_duplicates(near_duplicate_objects)

    print("Number of images prior to deduplication: {}".format(len(filenames)),
          file=sys.stderr)
    print("Number of images after deduplication: {}".format(
        len(final_dictionary)), file=sys.stderr)

    # Write the image locations to an output file
    if args.output_json is not None:
        # TODO
        # For now, just do this with exact duplicates
        # Dumping the simhash class to JSON doesn't work because the object isn't
        # JSON serializable
        outfile_name = args.output_json
        print("Writing image dictionary to file: {}".format(outfile_name))
        with open(outfile_name, 'w') as outfile:
            json.dump(final_dictionary, outfile, indent=4, skipkeys=True,
                      default=str)

    # Copy the images to an output directory
    create_output_image_directory(args, final_dictionary)

    return len(final_dictionary), len(filenames) - len(final_dictionary)

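# A compact sketch of the chunk / apply_async / merge shape that generate_output
# follows: split the filename list into num_jobs chunks, hand each chunk to a
# worker, then fold the per-worker dictionaries into one. partition and
# dedup_chunk are stand-ins for partition_filenames / exact_deduplicate_images,
# not the real implementations.
from pathos.multiprocessing import Pool

def partition(items, n):
    return [items[i::n] for i in range(n)]

def dedup_chunk(index, chunk):
    # index mirrors the (index, chunk) call signature used above;
    # pretend the lower-cased name is the "hash" and keep one path per hash
    return {name.lower(): name for name in chunk}

def dedup(filenames, num_jobs=4):
    pool = Pool(processes=num_jobs)
    results = [pool.apply_async(dedup_chunk, (i, chunk))
               for i, chunk in enumerate(partition(filenames, num_jobs))]
    merged = {}
    for r in results:
        merged.update(r.get())  # later chunks overwrite duplicate keys
    pool.close()
    pool.join()
    return merged

# Example: dedup(['A.jpg', 'a.jpg', 'b.jpg', 'B.JPG', 'c.jpg']) -> 3 unique keys
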
reg_loss = 0
nonzeros = 0
for batch in range(1, 2500000):
    if batch % 10 == 0:
        args_to_save = {}
        auxs_to_save = {}
        for k in net.arg_dict:
            args_to_save[k] = net.arg_dict[k].copyto(mx.cpu())
        for k in net.aux_dict:
            auxs_to_save[k] = net.aux_dict[k].copyto(mx.cpu())
        mx.nd.save('args_reg37ss.nd', args_to_save)  # avoid device ordinal problem
        mx.nd.save('auxs_reg37ss.nd', auxs_to_save)
    if batch % 20 == 0:
        optimizer.lr /= 2

    anno[:] = anno_np
    data = [pool.apply_async(get_image, ()) for i in range(batch_size)]
    # get_data(batch_size, imgout, anno_np, reg_anno_np)
    for bb in range(batch_size):
        d = data[bb].get()
        imgout[0] = d[0]
        anno[0] = d[1]
        reg_anno[0] = d[2]

    net.forward(is_train=True)
    cls_grad = net.outputs[0] - anno
    cls_pred_np = net.outputs[0].asnumpy()
    cls_truth_np = anno.asnumpy()
    # print net.outputs[1].asnumpy()[0,0,50:60,50:60]
    # print reg_anno.asnumpy()[0,0,50:60,50:60]
    precision += np.mean(
        np.argmax(cls_pred_np, axis=1) == np.argmax(cls_truth_np, axis=1))
    # for i in range(batch_size):

def __init__(self):
    grid_size = 16
    # HEURISTICS: radius = (1/3)*2^(ENCODING_SIZE)
    # where ENCODING_SIZE is the bit size of every pattern element (8 bits for us)
    radius = 24
    # Calculate pattern size based on grid_size and the size of a nibble (4)
    pattern_size = pow(grid_size, 2) / 4
    # Set neural network data size
    RbfNetwork.PATTERN_SIZE = pattern_size
    # Set neural network default radius
    RbfNetwork.DEFAULT_RADIUS = radius
    # Set pattern size in RBF knowledge
    RbfKnowledge.PATTERN_SIZE = pattern_size

    # If there are no persistent memory related files, create them
    if not os.path.isfile("persistent_memory/sight_snb.p"):
        self.erase_all_knowledge()

    # 3.2.1.1 TODO: use detected processor number, and equation logic 3.1.3.
    # Detect system and determine number of threads to use
    detect_system = DetectSystem()
    # Init thread pool with the determined number of threads
    pool = Pool(detect_system.thread_number(12))

    # SNB
    #self.snb = SensoryNeuralBlock("persistent_memory/sight_snb.p", "persistent_memory/hearing_snb.p")
    self.snb = pool.apply_async(
        lambda x: SensoryNeuralBlock("persistent_memory/sight_snb.p",
                                     "persistent_memory/hearing_snb.p"),
        [None]).get()
    # Relational Neural Block
    self.rnb = pool.apply_async(
        lambda x: RelNetwork.deserialize("persistent_memory/rnb.p"),
        [None]).get()
    # Analytical neuron
    self.analytical_n = pool.apply_async(lambda x: AnalyticalNeuron(),
                                         [None]).get()
    # Addition by memory network
    self.am_net = pool.apply_async(
        lambda x: CulturalNetwork.deserialize("persistent_memory/am_net.p"),
        [None]).get()
    # Geometric Neural Block
    self.gnb = pool.apply_async(
        lambda x: GeometricNeuralBlock.deserialize("persistent_memory/gnb.p"),
        [None]).get()
    # Syllables net
    self.syllables_net = pool.apply_async(
        lambda x: CulturalNetwork.deserialize("persistent_memory/syllables_net.p"),
        [None]).get()
    # Words net
    self.words_net = pool.apply_async(
        lambda x: CulturalNetwork.deserialize("persistent_memory/words_net.p"),
        [None]).get()
    # Sight-Syllables rel network
    self.ss_rnb = pool.apply_async(
        lambda x: RelNetwork.deserialize("persistent_memory/ss_rnb.p"),
        [None]).get()

    # ################### INTENTIONS MODULES ######################################
    self.episodic_memory = pool.apply_async(
        lambda x: EpisodicMemoriesBlock.deserialize("persistent_memory/episodic_memory.p"),
        [None]).get()
    self.decisions_block = pool.apply_async(
        lambda x: DecisionsBlock.deserialize("persistent_memory/decisions_block.p"),
        [None]).get()
    self.internal_state = pool.apply_async(
        lambda x: InternalState.deserialize("persistent_memory/internal_state.p"),
        [None]).get()
    self.desired_state = pool.apply_async(
        lambda x: InternalState.deserialize("persistent_memory/desired_state.p"),
        [None]).get()

    # Internal state "ports" (three-component real-valued vector)
    self._internal_state_in = None
    # Memory that stores short term bip inputs for making a decision
    self._intentions_short_term_memory = []
    self._output_memory = None
    # ##############################################################################

    # _bbcc_words
    self._learning_words = False
    self._learning_syllables = False
    self._enable_bbcc = False

    # Output "ports" (related to senses)
    self.s_knowledge_out = None
    self.h_knowledge_out = None
    # Input "ports" (senses)
    self.s_knowledge_in = None
    self.h_knowledge_in = None

    self._working_domain = "ADDITION"
    self.state = "MISS"

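# Note on the pattern above: handing lambdas to apply_async only works because
# pathos serializes callables with dill; the standard library multiprocessing
# Pool cannot pickle lambdas. A minimal sketch of the same "load in a worker"
# idea with a named helper instead (load_pickle is a stand-in, not part of the
# original code; the commented-out paths are the ones used above).
import pickle
from pathos.multiprocessing import Pool

def load_pickle(path):
    with open(path, 'rb') as handle:
        return pickle.load(handle)

# pool = Pool(4)
# rnb_async = pool.apply_async(load_pickle, ("persistent_memory/rnb.p",))
# gnb_async = pool.apply_async(load_pickle, ("persistent_memory/gnb.p",))
# rnb, gnb = rnb_async.get(), gnb_async.get()
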
def main():
    parser = OptionParser()
    parser.add_option('-n', '--name', dest='name', type=str, action='store',
                      help='Name of the movie file to get.')
    parser.add_option('-y', '--year', dest='year', type=int, action='store',
                      help='Year to look for the movie file to get.')
    parser.add_option(
        '--maxnum', dest='maxnum', type=int, action='store', default=10,
        help='Maximum number of torrents to look through. Default is 10.')
    parser.add_option(
        '--timeout', dest='timeout', type=int, action='store', default=60,
        help='Timeout on when to quit searching for torrents (in seconds). '
        'Default is 60 seconds.')
    #parser.add_option('--any', dest='do_any', action='store_true', default = False,
    #                  help = 'If chosen, make no filter on movie format.')
    parser.add_option('-f', '--filename', dest='filename', action='store',
                      type=str, help='If defined, put option into filename.')
    parser.add_option('--bypass', dest='do_bypass', action='store_true',
                      default=False, help='If chosen, bypass YTS.AG.')
    parser.add_option('--nozooq', dest='do_nozooq', action='store_true',
                      default=False, help='If chosen, bypass ZOOQLE.')
    #parser.add_option('--torrentz', dest='do_torrentz', action='store_true', default=False,
    #                  help = 'If chosen, also look through TORRENTZ to get magnet link.')
    parser.add_option('--info', dest='do_info', action='store_true',
                      default=False, help='If chosen, run in info mode.')
    parser.add_option(
        '--add', dest='do_add', action='store_true', default=False,
        help='If chosen, push the magnet link or torrent file into the deluge server.')
    parser.add_option('--noverify', dest='do_verify', action='store_false',
                      default=True,
                      help='If chosen, do not verify SSL connections.')
    parser.add_option(
        '--timing', dest='do_timing', action='store_true', default=False,
        help='If chosen, show timing information (how long to get movie torrents).')
    parser.add_option(
        '--doRaw', dest='do_raw', action='store_true', default=False,
        help='If chosen, do not use IMDB matching for Jackett torrents.')
    opts, args = parser.parse_args()
    assert (opts.timeout >= 10)
    assert (opts.name is not None)
    if opts.do_info:
        logging.basicConfig(level=logging.INFO)
    #
    num_both = 0
    if opts.filename is not None:
        num_both += 1
    if opts.do_add:
        num_both += 1
    assert (num_both != 2), \
        "error, at most either one of --f or --add must be set, NOT both."
    #
    time0 = time.time()
    tmdb_id = None
    if opts.year is not None:
        tmdb_id = plextmdb.get_movie_tmdbids(opts.name, year=opts.year)
    if not opts.do_bypass:
        try:
            get_movie_yts(opts.name, verify=opts.do_verify, raiseError=True,
                          to_torrent=opts.do_add)
            logging.info('search for YTS torrents took %0.3f seconds.' %
                         (time.time() - time0))
            return
        except ValueError:
            pass

    pool = Pool(processes=4)
    if not opts.do_nozooq:
        jobs = [
            pool.apply_async(get_items_zooqle, args=(opts.name, opts.maxnum))
        ]
    else:
        jobs = []
    #
    ## check for jackett
    if get_jackett_credentials() is None:
        jobs += list(
            map(
                lambda func: pool.apply_async(func,
                                              args=(opts.name, opts.maxnum)),
                (get_items_rarbg, get_items_tpb)))
        #if opts.do_torrentz:
        #    jobs.append( pool.apply_async( get_items_torrentz, args = ( opts.name, opts.maxnum ) ) )
    else:
        jobs.append(
            pool.apply_async(get_items_jackett,
                             args=(opts.name, tmdb_id, opts.maxnum,
                                   opts.do_verify, opts.do_raw)))
    jobs.append(
        pool.apply_async(get_items_eztv_io,
                         args=(opts.name, tmdb_id, opts.maxnum,
                               opts.do_verify)))
    items_lists = []
    for job in jobs:
        try:
            items = job.get(opts.timeout)  # 60 second timeout on process
            if items is None:
                continue
            items_lists.append(items)
        except:
            pass
    items = list(chain.from_iterable(items_lists))
    if opts.do_timing:
        print('search for %d torrents took %0.3f seconds.' %
              (len(items), time.time() - time0))
    if len(items) != 0:
        #
        ## sort from most seeders + leechers to least
        items_sorted = sorted(
            items,
            key=lambda tup: -tup['seeders'] - tup['leechers'])[:opts.maxnum]
        get_movie_torrent_items(items_sorted, filename=opts.filename,
                                to_torrent=opts.do_add)

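# Minimal sketch of the per-job timeout handling used in main() above: each
# AsyncResult.get(timeout) raises the pool backend's TimeoutError if a search
# source is too slow, and that source is simply dropped. The provider
# functions here (fast_source, slow_source) are made up for the demo.
import time
from pathos.multiprocessing import Pool

def fast_source(query):
    return ['%s-result' % query]

def slow_source(query):
    time.sleep(5)
    return ['never-collected']

if __name__ == '__main__':
    pool = Pool(processes=2)
    jobs = [pool.apply_async(f, ('movie',)) for f in (fast_source, slow_source)]
    items_lists = []
    for job in jobs:
        try:
            items = job.get(1)   # 1 second timeout per job
            if items is None:
                continue
            items_lists.append(items)
        except Exception:        # a too-slow source just gets dropped
            pass
    pool.terminate()             # stop any still-running workers
    print(items_lists)           # -> [['movie-result']]
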
        *main_client.get_array_nd_int32('simulation-time'))
    print(f"DA Time:{current_time}")
    temp_intp = thetao_assimilator.time_interpolate(current_time)
    salt_intp = so_assimilator.time_interpolate(current_time)

    for rank in rank_list:
        rank.running = False
        main_client.put_scalar_int32(f'{rank.id_str}_sent-inc', 0)

    da_todo = [rank for rank in rank_list if not rank.running]
    async_list = []
    # Loop over ranks have been processed and add them to the async queue
    da_rank = 0
    while (da_todo):
        for rank in da_todo:
            stime = time()
            if (main_client.poll_key_and_check_scalar_int32(
                    f'{rank.id_str}_sent-prior', 1, 10, 1)):
                rank.running = True
                async_list.append(
                    pool.apply_async(rank.run_da,
                                     (temp_intp, salt_intp, da_rank)))
                da_rank = (da_rank + 1) % NUM_THREADS
                # rank.run_da( temp_intp, salt_intp )
            print(time() - stime)
        da_todo = [rank for rank in rank_list if not rank.running]
        print(f"Remaining number of ranks: {len(da_todo)}")

    pool.close()
    pool.join()

assert (os.path.basename(opts.filename).endswith('.srt'))
logger = logging.getLogger()
if opts.do_info:
    logger.setLevel(logging.INFO)
keywords_set = {}
if opts.keywords is not None:
    keywords_set = set(
        map(
            lambda tok: tok.lower(),
            filter(lambda tok: len(tok.strip()) != 0,
                   opts.keywords.strip().split(','))))
#
## now calculation with multiprocessing
time0 = time.time()
pool = Pool(processes=3)
if not opts.do_bypass:
    jobs = [pool.apply_async(get_items_yts, args=(opts.name, opts.maxnum))]
else:
    jobs = []
jobs += list(
    map(
        lambda func: pool.apply_async(func,
                                      args=(opts.name, opts.maxnum,
                                            keywords_set)),
        (get_items_subscene, get_items_opensubtitles)))
items_lists = []
for job in jobs:
    try:
        items = job.get()
        if items is None:
            continue
        items_lists.append(items)
    except:
        pass
