def multi_mode(cli_parsed): dbm = db_manager.DB_Manager(cli_parsed.d + '/ew.db') dbm.open_connection() if not cli_parsed.resume: dbm.initialize_db() dbm.save_options(cli_parsed) m = Manager() targets = m.Queue() lock = m.Lock() multi_counter = m.Value('i', 0) display = None def exitsig(*args): dbm.close() if current_process().name == 'MainProcess': print '' print 'Resume using ./EyeWitness.py --resume {0}'.format( cli_parsed.d + '/ew.db') os._exit(1) signal.signal(signal.SIGINT, exitsig) if cli_parsed.resume: pass else: url_list, rdp_list, vnc_list = target_creator(cli_parsed) if cli_parsed.web: for url in url_list: dbm.create_http_object(url, cli_parsed) for rdp in rdp_list: dbm.create_vnc_rdp_object('rdp', rdp, cli_parsed) for vnc in vnc_list: dbm.create_vnc_rdp_object('vnc', vnc, cli_parsed) if cli_parsed.web: if cli_parsed.web and not cli_parsed.show_selenium: display = Display(visible=0, size=(1920, 1080)) display.start() multi_total = dbm.get_incomplete_http(targets) if multi_total > 0: if cli_parsed.resume: print 'Resuming Web Scan ({0} Hosts Remaining)'.format( str(multi_total)) else: print 'Starting Web Requests ({0} Hosts)'.format( str(multi_total)) if multi_total < cli_parsed.threads: num_threads = multi_total else: num_threads = cli_parsed.threads for i in xrange(num_threads): targets.put(None) try: workers = [ Process(target=worker_thread, args=(cli_parsed, targets, lock, (multi_counter, multi_total))) for i in xrange(num_threads) ] for w in workers: w.start() for w in workers: w.join() except Exception as e: print str(e) # Set up UA table here if cli_parsed.cycle is not None: ua_dict = get_ua_values(cli_parsed.cycle) if not cli_parsed.ua_init: dbm.clear_table("ua") completed = dbm.get_complete_http() completed[:] = [x for x in completed if x.error_state is None] for item in completed: for browser, ua in ua_dict.iteritems(): dbm.create_ua_object(item, browser, ua) cli_parsed.ua_init = True dbm.clear_table("opts") dbm.save_options(cli_parsed) for browser, ua in ua_dict.iteritems(): targets = m.Queue() multi_counter.value = 0 multi_total = dbm.get_incomplete_ua(targets, browser) if multi_total > 0: print( "[*] Starting requests for User Agent {0}" " ({1} Hosts)").format(browser, str(multi_total)) if multi_total < cli_parsed.threads: num_threads = multi_total else: num_threads = cli_parsed.threads for i in xrange(num_threads): targets.put(None) workers = [ Process(target=worker_thread, args=(cli_parsed, targets, lock, (multi_counter, multi_total), (browser, ua))) for i in xrange(num_threads) ] for w in workers: w.start() for w in workers: w.join() if any((cli_parsed.vnc, cli_parsed.rdp)): log._LOG_LEVEL = log.Level.ERROR multi_total, targets = dbm.get_incomplete_vnc_rdp() if multi_total > 0: print '' print 'Starting VNC/RDP Requests ({0} Hosts)'.format( str(multi_total)) app = QtGui.QApplication(sys.argv) timer = QTimer() timer.start(10) timer.timeout.connect(lambda: None) # add qt4 reactor import qt4reactor qt4reactor.install() from twisted.internet import reactor for target in targets: if os.path.dirname(cli_parsed.d) != os.path.dirname( target.screenshot_path): target.set_paths(cli_parsed.d) tdbm = db_manager.DB_Manager(cli_parsed.d + '/ew.db') if target.proto == 'vnc': reactor.connectTCP( target.remote_system, target.port, vnc_module.RFBScreenShotFactory( target.screenshot_path, reactor, app, target, tdbm)) else: reactor.connectTCP( target.remote_system, int(target.port), rdp_module.RDPScreenShotFactory( reactor, app, 1200, 800, target.screenshot_path, cli_parsed.timeout, 
target, tdbm)) reactor.runReturn() app.exec_() if display is not None: display.stop() results = dbm.get_complete_http() vnc_rdp = dbm.get_complete_vnc_rdp() dbm.close() m.shutdown() write_vnc_rdp_data(cli_parsed, vnc_rdp) sort_data_and_write(cli_parsed, results) if cli_parsed.ocr: for target in targets: try: rdp_module.parse_screenshot(cli_parsed.d, target) except IOError: pass
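# --- Illustrative sketch (not part of EyeWitness): the multi_mode() function above
# feeds targets through a Manager().Queue() and stops each worker by enqueueing one
# None sentinel per process. A minimal, self-contained version of that pattern,
# with hypothetical names (worker, scan_target), looks roughly like this.
from multiprocessing import Manager, Process


def worker(targets, counter, lock):
    while True:
        target = targets.get()
        if target is None:          # sentinel: one per worker, tells it to exit
            break
        # scan_target(target) would do the real work here
        with lock:
            counter.value += 1


if __name__ == '__main__':
    m = Manager()
    targets = m.Queue()
    lock = m.Lock()
    counter = m.Value('i', 0)
    for t in ['host1', 'host2', 'host3']:
        targets.put(t)
    num_workers = 2
    for _ in range(num_workers):
        targets.put(None)           # one sentinel per worker
    procs = [Process(target=worker, args=(targets, counter, lock))
             for _ in range(num_workers)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print('processed:', counter.value)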
    s = SessionWrapper.new(init=True)
    res = s.query(PullRequest.slug).distinct()
    for r in res:
        seen.add(r.slug)
    return seen


if __name__ == '__main__':
    pr_file = 'tmp_pullrequests.csv'
    # comment_file = 'tmp_comments.csv'
    logger = logging_config.get_logger('pr_extractor')
    try:
        tokens = Tokens()
        tokens_iter = tokens.iterator()
        manager = Manager()
        tokens_queue = manager.Queue()
        for token in tokens_iter:
            tokens_queue.put(token)
        tokens_map = manager.dict()

        extractor = PrAndCommentExtractor(tokens, tokens_queue, tokens_map)
        print("Retrieving the list of cloned GitHub projects")
        slugs = get_github_slugs(sys.argv[1])
        print("%s" % len(slugs))
        print("Retrieving the list of projects already analyzed")
        extractor.seen = get_already_parsed_projects()
        print("%s" % len(extractor.seen))
        print("Beginning data extraction")
        extractor.start(slugs, pr_file)
        print("Storing data into db")
        extractor.add_to_db(pr_file)
class MultiprocessingManager: """The facade class for the Holmes library used in a multiprocessing environment. This class is threadsafe. Parameters: model -- the name of the spaCy model, e.g. *en_core_web_lg* overall_similarity_threshold -- the overall similarity threshold for embedding-based matching. Defaults to *1.0*, which deactivates embedding-based matching. embedding_based_matching_on_root_words -- determines whether or not embedding-based matching should be attempted on root (parent) tokens, which has a considerable performance hit. Defaults to *False*. ontology -- an *Ontology* object. Defaults to *None* (no ontology). analyze_derivational_morphology -- *True* if matching should be attempted between different words from the same word family. Defaults to *True*. perform_coreference_resolution -- *True*, *False* or *None* if coreference resolution should be performed depending on whether the model supports it. Defaults to *None*. debug -- a boolean value specifying whether debug representations should be outputted for parsed sentences. Defaults to *False*. verbose -- a boolean value specifying whether status messages should be outputted to the console. Defaults to *True* number_of_workers -- the number of worker processes to use, or *None* if the number of worker processes should depend on the number of available cores. Defaults to *None* """ def __init__(self, model, *, overall_similarity_threshold=1.0, embedding_based_matching_on_root_words=False, ontology=None, analyze_derivational_morphology=True, perform_coreference_resolution=None, debug=False, verbose=True, number_of_workers=None): self.semantic_analyzer = SemanticAnalyzerFactory().semantic_analyzer( model=model, perform_coreference_resolution=perform_coreference_resolution, debug=debug) if perform_coreference_resolution is None: perform_coreference_resolution = \ self.semantic_analyzer.model_supports_coreference_resolution() validate_options(self.semantic_analyzer, overall_similarity_threshold, embedding_based_matching_on_root_words, perform_coreference_resolution) self.structural_matcher = StructuralMatcher( self.semantic_analyzer, ontology, overall_similarity_threshold, embedding_based_matching_on_root_words, analyze_derivational_morphology, perform_coreference_resolution) self._perform_coreference_resolution = perform_coreference_resolution self._verbose = verbose self._document_labels = [] self._input_queues = [] if number_of_workers is None: number_of_workers = cpu_count() self._number_of_workers = number_of_workers self._next_worker_to_use = 0 self._multiprocessor_manager = Multiprocessing_manager() self._worker = Worker( ) # will be copied to worker processes by value (Windows) or # by reference (Linux) self._workers = [] for counter in range(0, self._number_of_workers): input_queue = Queue() self._input_queues.append(input_queue) worker_label = ' '.join(('Worker', str(counter))) this_worker = Process(target=self._worker.listen, args=(self.semantic_analyzer, self.structural_matcher, input_queue, worker_label), daemon=True) self._workers.append(this_worker) this_worker.start() self._lock = Lock() def _add_document_label(self, label): with self._lock: if label in self._document_labels: raise DuplicateDocumentError(label) else: self._document_labels.append(label) def _handle_reply(self, worker_label, return_value): """ If 'return_value' is an exception, return it, otherwise return 'None'. 
""" if isinstance(return_value, Exception): return return_value elif self._verbose: if not isinstance(return_value, list): with self._lock: print(': '.join((worker_label, return_value))) return None def _internal_register_documents(self, dictionary, worker_method): reply_queue = self._multiprocessor_manager.Queue() for label, value in dictionary.items(): self._add_document_label(label) with self._lock: self._input_queues[self._next_worker_to_use].put( (worker_method, (value, label), reply_queue)) self._next_worker_to_use += 1 if self._next_worker_to_use == self._number_of_workers: self._next_worker_to_use = 0 recorded_exception = None for _ in range(0, len(dictionary)): possible_exception = self._handle_reply(*reply_queue.get()) if possible_exception is not None and recorded_exception is None: recorded_exception = possible_exception if recorded_exception is not None: with self._lock: print('ERROR: not all documents were registered successfully. Please examine the '\ ' above output from the worker processes to identify the problem.') def parse_and_register_documents(self, document_dictionary): """Parameters: document_dictionary -- a dictionary from unique document labels to raw document texts. """ self._internal_register_documents( document_dictionary, self._worker.worker_parse_and_register_document) def deserialize_and_register_documents(self, serialized_document_dictionary): """Parameters: serialized_document_dictionary -- a dictionary from unique document labels to documents serialized using the *Manager.serialize_document()* method. """ if self._perform_coreference_resolution: raise SerializationNotSupportedError(self.semantic_analyzer.model) self._internal_register_documents( serialized_document_dictionary, self._worker.worker_deserialize_and_register_document) def document_labels(self): with self._lock: document_labels = self._document_labels return sorted(document_labels) def topic_match_documents_returning_dictionaries_against( self, text_to_match, *, maximum_activation_distance=75, relation_score=30, reverse_only_relation_score=20, single_word_score=5, single_word_any_tag_score=2, overlapping_relation_multiplier=1.5, embedding_penalty=0.6, ontology_penalty=0.9, maximum_number_of_single_word_matches_for_relation_matching=500, maximum_number_of_single_word_matches_for_embedding_matching=100, sideways_match_extent=100, only_one_result_per_document=False, number_of_results=10, document_label_filter=None, tied_result_quotient=0.9): """Returns the results of a topic match between an entered text and the loaded documents. Properties: text_to_match -- the text to match against the loaded documents. maximum_activation_distance -- the number of words it takes for a previous phraselet activation to reduce to zero when the library is reading through a document. relation_score -- the activation score added when a normal two-word relation is matched. reverse_only_relation_score -- the activation score added when a two-word relation is matched using a search phrase that can only be reverse-matched. single_word_score -- the activation score added when a normal single word is matched. single_word_any_tag_score -- the activation score added when a single word is matched whose tag did not correspond to the template specification. overlapping_relation_multiplier -- the value by which the activation score is multiplied when two relations were matched and the matches involved a common document word. 
embedding_penalty -- a value between 0 and 1 with which scores are multiplied when the match involved an embedding. The result is additionally multiplied by the overall similarity measure of the match. ontology_penalty -- a value between 0 and 1 with which scores are multiplied for each word match within a match that involved the ontology. For each such word match, the score is multiplied by the value (abs(depth) + 1) times, so that the penalty is higher for hyponyms and hypernyms than for synonyms and increases with the depth distance. maximum_number_of_single_word_matches_for_relation_matching -- the maximum number of single word matches that are used as the basis for matching relations. If more document words than this value correspond to each of the two words within a relation phraselet, matching on the phraselet is not attempted. maximum_number_of_single_word_matches_for_embedding_matching = the maximum number of single word matches that are used as the basis for reverse matching with embeddings at the parent word. If more than this value exist, reverse matching with embeddings is not attempted because the performance hit would be too great. sideways_match_extent -- the maximum number of words that may be incorporated into a topic match either side of the word where the activation peaked. only_one_result_per_document -- if 'True', prevents multiple results from being returned for the same document. number_of_results -- the number of topic match objects to return. document_label_filter -- optionally, a string with which document labels must start to be considered for inclusion in the results. tied_result_quotient -- the quotient between a result and following results above which the results are interpreted as tied. """ if maximum_number_of_single_word_matches_for_embedding_matching > \ maximum_number_of_single_word_matches_for_relation_matching: raise EmbeddingThresholdGreaterThanRelationThresholdError(' '.join(( 'embedding', str(maximum_number_of_single_word_matches_for_embedding_matching ), 'relation', str(maximum_number_of_single_word_matches_for_relation_matching )))) reply_queue = self._multiprocessor_manager.Queue() for counter in range(0, self._number_of_workers): self._input_queues[counter].put( (self._worker. worker_topic_match_documents_returning_dictionaries_against, (text_to_match, maximum_activation_distance, relation_score, reverse_only_relation_score, single_word_score, single_word_any_tag_score, overlapping_relation_multiplier, embedding_penalty, ontology_penalty, maximum_number_of_single_word_matches_for_relation_matching, maximum_number_of_single_word_matches_for_embedding_matching, sideways_match_extent, only_one_result_per_document, number_of_results, document_label_filter, tied_result_quotient), reply_queue)) topic_match_dicts = [] recorded_exception = None for _ in range(0, self._number_of_workers): worker_label, worker_topic_match_dicts = reply_queue.get() if recorded_exception is None: recorded_exception = self._handle_reply( worker_label, worker_topic_match_dicts) if not isinstance(worker_topic_match_dicts, Exception): topic_match_dicts.extend(worker_topic_match_dicts) if recorded_exception is not None: with self._lock: print('ERROR: not all workers returned results. 
Please examine the above output '\ ' from the worker processes to identify the problem.') return TopicMatchDictionaryOrderer().order(topic_match_dicts, number_of_results, tied_result_quotient) def start_topic_matching_search_mode_console( self, only_one_result_per_document=False, maximum_number_of_single_word_matches_for_relation_matching=500, maximum_number_of_single_word_matches_for_embedding_matching=100): """Starts a topic matching search mode console enabling the matching of pre-registered documents to search texts entered ad-hoc by the user. Parameters: only_one_result_per_document -- if 'True', prevents multiple topic match results from being returned for the same document. maximum_number_of_single_word_matches_for_relation_matching -- the maximum number of single word matches that are used as the basis for matching relations. If more document words than this value correspond to each of the two words within a relation phraselet, matching on the phraselet is not attempted. maximum_number_of_single_word_matches_for_embedding_matching = the maximum number of single word matches that are used as the basis for matching with embeddings at the other word. If more than this value exist, matching with embeddings is not attempted because the performance hit would be too great. """ holmes_consoles = HolmesConsoles(self) holmes_consoles.start_topic_matching_search_mode( only_one_result_per_document, maximum_number_of_single_word_matches_for_relation_matching= maximum_number_of_single_word_matches_for_relation_matching, maximum_number_of_single_word_matches_for_embedding_matching= maximum_number_of_single_word_matches_for_embedding_matching) def close(self): for worker in self._workers: worker.terminate()
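# --- Illustrative sketch (not Holmes itself): the MultiprocessingManager class above
# gives every worker process its own input queue, round-robins work across those
# queues, and collects replies on one shared Manager().Queue(). A stripped-down
# version of that fan-out/fan-in scheme, with hypothetical names, might look like this.
from multiprocessing import Manager, Process, Queue


def listen(input_queue, worker_label):
    while True:
        method, args, reply_queue = input_queue.get()
        try:
            reply_queue.put((worker_label, method(*args)))
        except Exception as err:   # report failures instead of crashing the worker
            reply_queue.put((worker_label, err))


if __name__ == '__main__':
    number_of_workers = 3
    manager = Manager()
    input_queues = [Queue() for _ in range(number_of_workers)]
    workers = [Process(target=listen, args=(q, 'Worker %d' % i), daemon=True)
               for i, q in enumerate(input_queues)]
    for w in workers:
        w.start()

    reply_queue = manager.Queue()
    documents = {'doc%d' % i: 'text %d' % i for i in range(7)}
    next_worker = 0
    for label, text in documents.items():
        # round-robin dispatch, as in _internal_register_documents above
        input_queues[next_worker].put((len, (text,), reply_queue))
        next_worker = (next_worker + 1) % number_of_workers
    for _ in documents:
        print(reply_queue.get())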
def main(input_file, output_file, temp_dir=None, parallel=True):
    """
    This module filters RFs according to input options and then computes some quality
    metrics on each RF. This enables different downstream approaches to selecting and
    filtering for good quality RFs. The stats attribute of each RF is populated with
    these quality metrics. In addition, a new root group is added to the hdf5 file
    containing a Pandas DataFrame that tabulates the attributes of each trace to allow
    easy event filtering in the downstream workflow.

    Available methods:
    1. rf_group_by_similarity - grouping method based on calculation of euclidean
       distances and clustering by similarity (aka machine learning approach)
    2. TODO: coherence - finding the coherent signals (in frequency domain) relative
       to median. Consequently, moveout should be applied to use this technique
    3. TODO: knife - analysing the change of RMS relative to median. Noisy stations
       will give higher input. Moveout should be applied to use this technique
    4. S/N ratio
    5. Spectral entropy
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    similarity_eps = 0.05

    # Set up asynchronous buffered writing of results to file
    mgr = Manager()
    with h5py.File(input_file, mode='r') as h5f:
        config_str = h5f.attrs['metadata'] if 'metadata' in h5f.attrs else ''
    write_queue = mgr.Queue()
    output_thread = Process(target=async_write,
                            args=(write_queue, output_file, 20, config_str))
    output_thread.daemon = True
    output_thread.start()

    logger.info("Processing source file {}".format(input_file))
    if parallel:
        logger.info("Parallel processing")
        Parallel(n_jobs=-3, verbose=5, max_nbytes='16M', temp_folder=temp_dir) \
            (delayed(rf_quality_metrics_queue)(write_queue, station_id, station_stream3c,
                                               similarity_eps)
             for station_id, station_stream3c in IterRfH5StationEvents(input_file))
    else:
        logger.info("Serial processing")
        for station_id, station_stream3c in IterRfH5StationEvents(input_file):
            try:
                rf_quality_metrics_queue(write_queue, station_id, station_stream3c,
                                         similarity_eps)
            except (ValueError, AssertionError) as e:
                traceback.print_exc()
                logger.error(
                    "Unhandled exception occurred in rf_quality_metrics_queue for station {}. "
                    "Data will be omitted for this station!\nError:\n{}".format(
                        station_id, str(e)))
            # end try
        # end for
    # end if

    # Signal completion
    logger.info("Finishing...")
    write_queue.put(None)
    write_queue.join()
    logger.info("rf_quality_filter SUCCESS!")
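# --- Illustrative sketch (hypothetical writer, not the real async_write): main()
# above streams results to a dedicated writer process through a Manager().Queue(),
# then signals completion by putting None and calling join() on the queue. That works
# because the manager queue proxies queue.Queue and so supports task_done()/join().
from multiprocessing import Manager, Process


def buffered_writer(write_queue, output_file):
    with open(output_file, 'w') as f:
        while True:
            item = write_queue.get()
            try:
                if item is None:         # completion sentinel
                    break
                f.write(str(item) + '\n')
            finally:
                write_queue.task_done()  # lets the producer's join() return


if __name__ == '__main__':
    mgr = Manager()
    write_queue = mgr.Queue()
    writer = Process(target=buffered_writer, args=(write_queue, 'results.txt'))
    writer.daemon = True
    writer.start()
    for result in range(5):
        write_queue.put(result)
    write_queue.put(None)
    write_queue.join()                   # blocks until every put() has been task_done()
    writer.join()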
                               abnormal_fraction)

# Collections of all valid algorithms.
__ALGO_NAMES__ = [
    '{}-{}'.format(algo, p)
    for algo in ('cae', 'cae-iforest', 'drae', 'rdae', 'dagmm',
                 'ssd-iforest', 'e3outlier')
    for p in (0.05, 0.1, 0.15, 0.2, 0.25)
]

if __name__ == '__main__':
    n_run = 5
    N_GPUS = 1  # deprecated, use one gpu only
    man = Manager()
    q = man.Queue(N_GPUS)
    for g in range(N_GPUS):
        q.put(str(g))
    experiments_list = [
        (load_mnist_with_outliers, 'mnist', 10),
        (load_fashion_mnist_with_outliers, 'fashion-mnist', 10),
        (load_cifar10_with_outliers, 'cifar10', 10),
        (load_cifar100_with_outliers, 'cifar100', 20),
        (load_svhn_with_outliers, 'svhn', 10),
    ]
    p_list = [0.05, 0.1, 0.15, 0.2, 0.25]
    for i in range(n_run):
        for data_load_fn, dataset_name, n_classes in experiments_list:
            for p in p_list:
def evaluate_conv_net(storm_norm_data, storm_meta, hail_labels, sampling_config, param_combos, out_path, num_gpus=8): """ Args: storm_norm_data: storm_meta: hail_labels: sampling_config: param_combos: out_path: num_gpus: Returns: """ unique_dates = np.unique(storm_meta["run_dates"]) np.random.seed(sampling_config["random_seed"]) storm_sampler = train_split_generator(unique_dates, sampling_config["train_split"], sampling_config["num_samples"]) best_param_combos = [] sample_scores = pd.DataFrame( index=np.arange(sampling_config["num_samples"]), columns=[ "Brier Score", "Brier Score Climo", "Brier Skill Score", "AUC" ], dtype=float) for n in range(sampling_config["num_samples"]): environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7" train_dates, test_dates = next(storm_sampler) print(train_dates, test_dates) train_indices = np.where(np.in1d(storm_meta["run_dates"], train_dates))[0] test_indices = np.where(np.in1d(storm_meta["run_dates"], test_dates))[0] all_members = np.unique(storm_meta.loc[train_indices, "members"]) np.random.shuffle(all_members) member_split = int( np.round(all_members.size * sampling_config["member_split"])) train_members = all_members[:member_split] val_members = all_members[member_split:] print(train_members, val_members) train_member_indices = np.where( np.in1d(storm_meta.loc[train_indices, "members"], train_members))[0] val_member_indices = np.where( np.in1d(storm_meta.loc[train_indices, "members"], val_members))[0] param_scores = pd.DataFrame(index=np.arange(param_combos.shape[0]), columns=["Brier Skill Score", "AUC"], dtype=float) score_outputs = [] param_train_data = storm_norm_data[train_indices][train_member_indices] param_train_labels = hail_labels[train_indices][train_member_indices] param_val_data = storm_norm_data[train_indices][val_member_indices] param_val_labels = hail_labels[train_indices][val_member_indices] print("Saving training data") np.save(join(out_path, "param_train_data.npy"), param_train_data) np.save(join(out_path, "param_train_labels.npy"), param_train_labels) np.save(join(out_path, "param_val_data.npy"), param_val_data) np.save(join(out_path, "param_val_labels.npy"), param_val_labels) gpu_manager = Manager() gpu_queue = gpu_manager.Queue() n_pool = Pool(num_gpus, maxtasksperchild=1) for g in range(num_gpus): gpu_queue.put(g) for c in param_combos.index.values: print(c) score_outputs.append( n_pool.apply_async( train_single_conv_net, (c, gpu_queue, param_combos.loc[c].to_dict(), out_path))) n_pool.close() n_pool.join() #for c in param_combos.index.values: # score_outputs.append(train_single_conv_net(c, gpu_queue, param_combos.loc[c].to_dict(), out_path)) for async_out in score_outputs: out = async_out.get() param_scores.loc[out[1]] = out[0] del n_pool del gpu_queue del gpu_manager best_config = param_scores["Brier Skill Score"].idxmax() best_combo = param_combos.loc[best_config].to_dict() param_scores.to_csv(join( out_path, "conv_net_param_scores_sample_{0:03d}.csv".format(n)), index_label="Param Combo") best_param_combos.append(best_config) print("Best Config") print(param_combos.loc[best_config]) pool = Pool(1) np.save(join(out_path, "best_train_data.npy"), storm_norm_data[train_indices]) np.save(join(out_path, "best_test_data.npy"), storm_norm_data[test_indices]) sample_scores = pool.apply( train_best_conv_net, (best_combo, n, hail_labels[train_indices], storm_meta.loc[test_indices], hail_labels[test_indices], sample_scores, out_path)) pool.close() pool.join() del pool sample_scores.to_csv(join(out_path, "conv_net_sample_scores.csv"), 
index_label="Sample") best_config_frame = param_combos.loc[best_param_combos] best_config_frame = best_config_frame.reset_index() best_config_frame.to_csv(join(out_path, "conv_net_best_params.csv"), index_label="Sample") return
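# --- Illustrative sketch (hypothetical train function): evaluate_conv_net above
# preloads GPU ids into a Manager().Queue() shared with a Pool so that each task
# checks out a free device, does its work, and hands the device back. The core of
# that resource-pool pattern, reduced to its essentials, is roughly:
import os
from multiprocessing import Manager, Pool


def train_one_config(config_id, gpu_queue):
    gpu = gpu_queue.get()                       # block until a GPU is free
    try:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)
        # ... build and fit the model for this hyperparameter combo ...
        return config_id, 'trained on GPU %s' % gpu
    finally:
        gpu_queue.put(gpu)                      # hand the device back


if __name__ == '__main__':
    num_gpus = 2
    manager = Manager()
    gpu_queue = manager.Queue()
    for g in range(num_gpus):
        gpu_queue.put(g)
    with Pool(num_gpus, maxtasksperchild=1) as pool:
        outputs = [pool.apply_async(train_one_config, (c, gpu_queue))
                   for c in range(6)]
        for out in outputs:
            print(out.get())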
        all.append(price.text)
    else:
        all.append('R$0')
    if rating is not None:
        all.append(rating.text)
    else:
        all.append('-1')
    q.put(all)
    # print("---------------------------------------------------------------")


results = []

if __name__ == "__main__":
    m = Manager()
    q = m.Queue()  # use this manager Queue instead of multiprocessing Queue as that causes an error
    p = {}
    if sys.argv[1] in ['t', 'p']:  # user decides which method to invoke: thread, process or pool
        for i in range(1, no_pages):
            if sys.argv[1] in ['t']:
                print("starting thread: ", i)
                p[i] = threading.Thread(target=get_data, args=(i, q))
                p[i].start()
            elif sys.argv[1] in ['p']:
                print("starting process: ", i)
                p[i] = Process(target=get_data, args=(i, q))
                p[i].start()
        # join should be done in a separate for loop:
        # if we joined inside the previous loop, the join on p1 would block before the others start
    def __init__(self, cache_fname='.sparql_cache'):
        self.cache_fname = cache_fname
        self.cache_has_changed = False
        manager = Manager()
        self.cache_queue = manager.Queue()
        self.cache = TryLoad(self.cache_fname)
assert sum(args.split) == 1 and not any(
    [i < 0 or i > 1 for i in args.split]), "Split must be valid distrib"

traj_files = []
for s in args.paths.split(':'):
    if 'traj_group' in s:
        traj_files = traj_files + glob.glob('{}/traj*'.format(s))
    else:
        for t_group in glob.glob('{}/traj_group*'.format(s)):
            traj_files = traj_files + glob.glob('{}/traj*'.format(t_group))
random.shuffle(traj_files)
print('Saving {} trajectories...'.format(len(traj_files)))

m = Manager()
record_queue = m.Queue()
save_dir, T = args.save_dir, args.T
seperate_good, traj_per_file = args.seperate_good, args.traj_per_file
record_saver_proc = Process(target=record_worker,
                            args=(record_queue, save_dir, T, seperate_good,
                                  traj_per_file, args.offset, tuple(args.split)))
record_saver_proc.start()

if args.nworkers > 1:
    confs = []
    split = len(traj_files) // args.nworkers
    for w in range(args.nworkers):
        start, end = w * split, (w + 1) * split
        if w == args.nworkers - 1:
            end = len(traj_files)
LOGS_PATH = f'{os.getcwd()}/'
logger = get_neat_logger(path=LOGS_PATH)

# N_SAMPLES = 10
N_PROCESSES = 16
N_GENOMES = 5

genomes = []
for i in range(N_GENOMES):
    genome = Genome(key=i)
    genome.create_random_genome()
    genomes.append(genome)

manager = Manager()
task_queue = manager.Queue()
exit_queue = manager.Queue()
exception_queue = manager.Queue()
results_queue = manager.Queue()

workers = []
for i in range(N_PROCESSES):
    worker = Worker(task_queue=task_queue,
                    exit_queue=exit_queue,
                    exception_queue=exception_queue,
                    results_queue=results_queue)
    worker.start()
    workers.append(worker)

for genome in genomes:
lock = Lock()
total_lines = args.num_lines[0]
num_processes = args.num_processes[0]
group_size = num_processes
total_groups = total_lines // group_size

process_labels = []
process_queues = []
process = []
pipes = []
for i in range(0, num_processes):
    label = "process_ " + str(i) + " : "
    process_labels.append(label)
    process_queues.append(the_man.Queue())
    send_end, recv_end = Pipe()
    pipes.append(recv_end)
    process.append(
        Process(target=pool_process_paragraph,
                args=(
                    process_queues[i],
                    t1,
                    process_labels[i],
                    args.intermediate_text_address[0],
                    total_groups,
                    send_end,
                    lock,
                    num_processes,
                    i,
                )))
    if not result['OK']:
        print("Failed queueing %s" % path)
    else:
        print("Task failed: %s" % result['Message'])
        if 'Path' in result:
            random.shuffle(lfcHosts)
            print("Requeueing task for directory %s, lfc %s" %
                  (result['Path'], lfcHosts[0]))


#########################################################################

pPool = ProcessPool(30, 40, 0)
manager = Manager()
writerQueue = manager.Queue()
stopFlag = Value('i', 0)
# pPool.daemonize()

# lfcHosts = ['lfc-lhcb-ro.cern.ch',
#             'lfc-lhcb-ro.cr.cnaf.infn.it',
#             'lhcb-lfc-fzk.gridka.de',
#             'lfc-lhcb-ro.in2p3.fr',
#             'lfc-lhcb.grid.sara.nl',
#             'lfclhcb.pic.es',
#             'lhcb-lfc.gridpp.rl.ac.uk']
lfcHosts = ['prod-lfc-lhcb-ro.cern.ch']

# path = "/lhcb/LHCb"
path = '/lhcb/user/c/chaen'
    except EmptyException:
        pass  # Resume normal execution.


def sleep_process(secs, queue):
    """Sleep for a certain amount of time, and update state when finished.

    Arguments:
        secs {int} -- number of seconds to sleep for.
        queue {Queue} -- shared Queue used for interprocess communication with run_apps.
    """
    sleep(secs)
    queue.put(True, timeout=0.001)  # Low timeout to prevent blocking.


if __name__ == '__main__':
    app_list = []
    for app_name in APPS:
        module = import_module(APPS.get(app_name))
        app = getattr(module, app_name)
        app_list.append(app)
    print(app_list)

    manager = Manager()
    queue = manager.Queue()
    pool = Pool(processes=2)
    pool.apply(run_apps, args=(app_list, ARGS, KWARGS, queue))
    pool.apply_async(sleep_process, args=(5, queue))
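# --- Illustrative sketch (hypothetical producer/consumer, unrelated to the APPS
# registry above): the reason for Manager().Queue() in the snippet above is that the
# same queue proxy can be handed to several Pool tasks, letting one task signal
# another. Note that Pool.apply() blocks until its task returns, while apply_async()
# does not.
from multiprocessing import Manager, Pool
from time import sleep


def producer(secs, queue):
    sleep(secs)
    queue.put(True)                      # signal that the wait is over


def consumer(queue):
    return queue.get()                   # blocks until the producer has put a value


if __name__ == '__main__':
    manager = Manager()
    queue = manager.Queue()
    with Pool(processes=2) as pool:
        waiter = pool.apply_async(consumer, args=(queue,))
        pool.apply_async(producer, args=(1, queue))
        print('consumer received:', waiter.get())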
    try:
        v[seqshuffle(pop)[0]] += 1
    except KeyboardInterrupt:
        pass
    now = time.time()
    q.put((n, pop, trials, v, now - start))


POP = int(sys.argv[1]) if len(sys.argv) > 1 else 50
RUNS = range(8, 24 + 1)

print 'Usage: python %s <popsize>' % (sys.argv[0], )
print 'popsize:', POP
print 'n: %u..%u' % (RUNS[0], RUNS[-1])

man = Manager()
q = man.Queue()
p = Pool()

# blame http://zachseward.com/sparktweets/
# http://en.wikipedia.org/wiki/List_of_Unicode_characters#Block_elements
BLOCKS = u' _▁▂▃▄▅▆▇█'

popsize = POP
prevtrials = 0
prevv = [0 for _ in range(popsize + 1)]

try:
    for n in RUNS:
        # scatter: calculate the amount of work (trials), split it into a bunch of jobs and run
def _executors_repro( self, executors: dict, jobs: Optional[int] = 1) -> Mapping[str, Mapping[str, str]]: """Run dvc repro for the specified BaseExecutors in parallel. Returns: dict mapping stash revs to the successfully executed experiments for each stash rev. """ result: Dict[str, Dict[str, str]] = defaultdict(dict) manager = Manager() pid_q = manager.Queue() rel_cwd = relpath(os.getcwd(), self.repo.root_dir) with ProcessPoolExecutor(max_workers=jobs) as workers: futures = {} for rev, executor in executors.items(): future = workers.submit( executor.reproduce, executor.dvc_dir, rev, queue=pid_q, name=executor.name, rel_cwd=rel_cwd, log_level=logger.getEffectiveLevel(), ) futures[future] = (rev, executor) try: wait(futures) except KeyboardInterrupt: # forward SIGINT to any running executor processes and # cancel any remaining futures pids = {} while not pid_q.empty(): rev, pid = pid_q.get() pids[rev] = pid for future, (rev, _) in futures.items(): if future.running(): os.kill(pids[rev], signal.SIGINT) elif not future.done(): future.cancel() for future, (rev, executor) in futures.items(): rev, executor = futures[future] exc = future.exception() try: if exc is None: exec_result = future.result() result[rev].update( self._collect_executor(executor, exec_result)) else: # Checkpoint errors have already been logged if not isinstance(exc, CheckpointKilledError): logger.exception( "Failed to reproduce experiment '%s'", rev[:7], exc_info=exc, ) except CancelledError: logger.error( "Cancelled before attempting to reproduce experiment " "'%s'", rev[:7], ) finally: executor.cleanup() return result
def run_tests_parallel(tests, prefix, options): # This queue will contain the results of the various tests run. # We could make this queue a global variable instead of using # a manager to share, but this will not work on Windows. queue_manager = Manager() async_test_result_queue = queue_manager.Queue() # This queue will be used by the result process to indicate # that it has received a result and we can start a new process # on our end. The advantage is that we don't have to sleep and # check for worker completion ourselves regularly. notify_queue = queue_manager.Queue() # This queue will contain the return value of the function # processing the test results. result_process_return_queue = queue_manager.Queue() result_process = Process(target=process_test_results_parallel, args=(async_test_result_queue, result_process_return_queue, notify_queue, len(tests), options)) result_process.start() # Ensure that a SIGTERM is handled the same way as SIGINT # to terminate all child processes. sigint_handler = signal.getsignal(signal.SIGINT) signal.signal(signal.SIGTERM, sigint_handler) worker_processes = [] def remove_completed_workers(workers): new_workers = [] for worker in workers: if worker.is_alive(): new_workers.append(worker) else: worker.join() return new_workers try: testcnt = 0 # Initially start as many jobs as allowed to run parallel for i in range(min(options.max_jobs, len(tests))): notify_queue.put(True) # For every item in the notify queue, start one new worker. # Every completed worker adds a new item to this queue. while notify_queue.get(): if (testcnt < len(tests)): # Start one new worker worker_process = Process(target=wrap_parallel_run_test, args=(tests[testcnt], prefix, async_test_result_queue, options)) worker_processes.append(worker_process) worker_process.start() testcnt += 1 # Collect completed workers worker_processes = remove_completed_workers(worker_processes) else: break # Wait for all processes to terminate while len(worker_processes) > 0: worker_processes = remove_completed_workers(worker_processes) # Signal completion to result processor, then wait for it to complete on its own async_test_result_queue.put(None) result_process.join() # Return what the result process has returned to us return result_process_return_queue.get() except (Exception, KeyboardInterrupt) as e: # Print the exception if it's not an interrupt, # might point to a bug or other faulty condition if not isinstance(e, KeyboardInterrupt): traceback.print_exc() for worker in worker_processes: try: worker.terminate() except: pass result_process.terminate() return False
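# --- Illustrative sketch (hypothetical test runner): run_tests_parallel above caps
# concurrency by seeding a "notify" queue with one token per allowed job and starting
# one new worker per token read; every finished job feeds a token back. In the real
# code the result-processing process returns the token; here, for brevity, the worker
# does it itself.
from multiprocessing import Manager, Process


def run_one(test, result_queue, notify_queue):
    result_queue.put('ran %s' % test)
    notify_queue.put(True)               # free the slot for the next worker


if __name__ == '__main__':
    tests = ['t%d' % i for i in range(10)]
    max_jobs = 3
    manager = Manager()
    result_queue = manager.Queue()
    notify_queue = manager.Queue()
    for _ in range(min(max_jobs, len(tests))):
        notify_queue.put(True)           # initial tokens: at most max_jobs in flight

    started, workers = 0, []
    while started < len(tests):
        notify_queue.get()               # wait for a free slot
        w = Process(target=run_one,
                    args=(tests[started], result_queue, notify_queue))
        w.start()
        workers.append(w)
        started += 1
    for w in workers:
        w.join()
    while not result_queue.empty():
        print(result_queue.get())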
class MultiprocessRunner(SystemRunner): def __init__(self, system: System, pipeline_ids=(DEFAULT_PIPELINE_ID,), poll_interval=None, setup_tables=False, sleep_for_setup_tables=0, *args, **kwargs): super(MultiprocessRunner, self).__init__(system=system, *args, **kwargs) self.pipeline_ids = pipeline_ids self.poll_interval = poll_interval or DEFAULT_POLL_INTERVAL assert isinstance(system, System) self.os_processes = None self.setup_tables = setup_tables or system.setup_tables self.sleep_for_setup_tables = sleep_for_setup_tables def start(self): assert self.os_processes is None, "Already started" self.os_processes = [] self.manager = Manager() self.inboxes = {} self.outboxes = {} # Setup queues. for pipeline_id in self.pipeline_ids: for process_name, upstream_names in self.system.followings.items(): inbox_id = (pipeline_id, process_name.lower()) if inbox_id not in self.inboxes: self.inboxes[inbox_id] = self.manager.Queue() for upstream_class_name in upstream_names: outbox_id = (pipeline_id, upstream_class_name.lower()) if outbox_id not in self.outboxes: self.outboxes[outbox_id] = PromptOutbox() if inbox_id not in self.outboxes[outbox_id].downstream_inboxes: self.outboxes[outbox_id].downstream_inboxes[inbox_id] = self.inboxes[inbox_id] # Subscribe to broadcast prompts published by a process # application in the parent operating system process. subscribe(handler=self.broadcast_prompt, predicate=self.is_prompt) # Start operating system process. for pipeline_id in self.pipeline_ids: for process_name, upstream_names in self.system.followings.items(): process_class = self.system.process_classes[process_name] inbox = self.inboxes[(pipeline_id, process_name.lower())] outbox = self.outboxes.get((pipeline_id, process_name.lower())) os_process = OperatingSystemProcess( application_process_class=process_class, infrastructure_class=self.infrastructure_class, upstream_names=upstream_names, poll_interval=self.poll_interval, pipeline_id=pipeline_id, setup_tables=self.setup_tables, inbox=inbox, outbox=outbox, ) os_process.daemon = True os_process.start() self.os_processes.append(os_process) if self.setup_tables: # Avoid conflicts when creating tables. sleep(self.sleep_for_setup_tables) def broadcast_prompt(self, prompt): outbox_id = (prompt.pipeline_id, prompt.process_name) outbox = self.outboxes.get(outbox_id) if outbox: outbox.put(prompt) @staticmethod def is_prompt(event): return isinstance(event, Prompt) def close(self): super(MultiprocessRunner, self).close() unsubscribe(handler=self.broadcast_prompt, predicate=self.is_prompt) for os_process in self.os_processes: os_process.inbox.put('QUIT') for os_process in self.os_processes: os_process.join(timeout=10) for os_process in self.os_processes: os_process.is_alive() and os_process.terminate() self.os_processes = None self.manager = None
def custom_search(request):
    manager = Manager()
    q = manager.Queue()
    print q.qsize()
    start_time = timeit.default_timer()
    return_dict = manager.dict()
    if request.method == 'POST':
        data = json.loads(request.body)
        query = data['query']

        p1 = Process(target=search_google,
                     args=(query, return_dict, q, "google process"))
        p1.daemon = True
        p1.name = "google process"
        q.put(p1.name)
        print p1

        p2 = Process(target=search_twitter,
                     args=(query, return_dict, q, "twitter process"))
        p2.daemon = True
        p2.name = "twitter process"
        q.put(p2.name)
        print p2

        p3 = Process(target=search_duckgo,
                     args=(query, return_dict, q, "duckduckgo process"))
        p3.daemon = True
        p3.name = "duckduckgo process"
        q.put(p3.name)
        print p3

        print q.qsize()
        p1.start()
        p2.start()
        p3.start()

        print "before p1 join"
        p1.join(1)
        print "after p1 join"
        p2.join(1)
        print "after p2 join"
        p3.join(1)
        print "after p3 join"
        # q.join()
        print "after q.join()"

        # Mark any search process still alive after its 1-second join as timed out,
        # keyed by that process's own name.
        if p1.is_alive():
            print "timed out1"
            p1.terminate()
            p1.join()
            return_dict[p1.name] = "timed out"
        if p2.is_alive():
            print "timed out2"
            p2.terminate()
            p2.join()
            return_dict[p2.name] = "timed out"
        if p3.is_alive():
            print "timed out3"
            p3.terminate()
            p3.join()
            return_dict[p3.name] = "timed out"
        print "after time outs"

        end_time = timeit.default_timer()
        time_taken = end_time - start_time
        print "time taken: " + str(start_time) + " and " + str(end_time) + " : " + str(time_taken)
        print return_dict.values()

        if time_taken > 1.0:
            return HttpResponse(json.dumps(
                {"query": query,
                 "results": "error: time taken is greater than 1 sec"}))
        else:
            google_data = return_dict["google process"]
            twitter_data = return_dict["twitter process"]
            duckgo_data = return_dict["duckduckgo process"]
            return HttpResponse(json.dumps(
                {"query": query,
                 "results": {
                     "google": google_data,
                     "twitter": twitter_data,
                     "duckduckgo": duckgo_data
                 }}))
def evaluate_sklearn_model(model_name, model_obj, storm_data, storm_meta, hail_labels, sampling_config, param_combos, out_path, num_gpus=8): unique_dates = np.unique(storm_meta["run_dates"]) np.random.seed(sampling_config["random_seed"]) storm_sampler = train_split_generator(unique_dates, sampling_config["train_split"], sampling_config["num_samples"]) best_param_combos = [] sample_scores = pd.DataFrame( index=np.arange(sampling_config["num_samples"]), columns=[ "Brier Score", "Brier Score Climo", "Brier Skill Score", "AUC" ], dtype=float) for n in range(sampling_config["num_samples"]): train_dates, test_dates = next(storm_sampler) train_indices = np.where(np.in1d(storm_meta["run_dates"], train_dates))[0] test_indices = np.where(np.in1d(storm_meta["run_dates"], test_dates))[0] all_members = np.unique(storm_meta.loc[train_indices, "members"]) np.random.shuffle(all_members) member_split = int( np.round(all_members.size * sampling_config["member_split"])) train_members = all_members[:member_split] val_members = all_members[member_split:] train_member_indices = np.where( np.in1d(storm_meta.loc[train_indices, "members"], train_members))[0] val_member_indices = np.where( np.in1d(storm_meta.loc[train_indices, "members"], val_members))[0] param_scores = pd.DataFrame(index=np.arange(param_combos.shape[0]), columns=["Brier Skill Score", "AUC"], dtype=float) score_outputs = [] param_train_data = storm_data[train_indices][train_member_indices] param_train_labels = hail_labels[train_indices][train_member_indices] param_val_data = storm_data[train_indices][val_member_indices] param_val_labels = hail_labels[train_indices][val_member_indices] print("Saving training data") np.save(join(out_path, "param_train_data.npy"), param_train_data) np.save(join(out_path, "param_train_labels.npy"), param_train_labels) np.save(join(out_path, "param_val_data.npy"), param_val_data) np.save(join(out_path, "param_val_labels.npy"), param_val_labels) gpu_manager = Manager() gpu_queue = gpu_manager.Queue() n_pool = Pool(num_gpus, maxtasksperchild=1) for g in range(num_gpus): gpu_queue.put(g) for c in param_combos.index.values: print(c) score_outputs.append( n_pool.apply_async(train_single_sklearn_model, (model_name, model_obj, c, param_combos.loc[c].to_dict(), out_path), dict(device_queue=gpu_queue))) n_pool.close() n_pool.join() for async_out in score_outputs: out = async_out.get() param_scores.loc[out[1]] = out[0] del n_pool del gpu_queue del gpu_manager #for c in param_combos.index: # print(param_combos.loc[c]) # model_inst = model_obj(**param_combos.loc[c].to_dict()) # model_inst.fit(storm_data[train_indices][train_member_indices], # hail_labels[train_indices][train_member_indices]) # val_preds = model_inst.predict_proba(storm_data[train_indices][val_member_indices])[:, 1] # param_scores.loc[c, "Brier Skill Score"] = brier_skill_score(hail_labels[train_indices][val_member_indices], # val_preds) # param_scores.loc[c, "AUC"] = roc_auc_score(hail_labels[train_indices][val_member_indices], # val_preds) # if param_scores.loc[c, "Brier Skill Score"] > best_score: # best_config = c # best_score = param_scores.loc[c, "Brier Skill Score"] # del model_inst param_scores.to_csv(join( out_path, "{0}_param_scores_sample_{1:03d}.csv".format(model_name, n)), index_label="Param Combo") best_config = param_scores["Brier Skill Score"].idxmax() best_combo = param_combos.loc[best_config].to_dict() best_param_combos.append(best_config) print("Best Config") print(param_combos.loc[best_config]) pool = Pool(1) np.save(join(out_path, 
"best_train_data.npy"), storm_data[train_indices]) np.save(join(out_path, "best_test_data.npy"), storm_data[test_indices]) sample_scores = pool.apply( train_best_sklearn_model, (model_name, model_obj, best_combo, n, hail_labels[train_indices], storm_meta.loc[test_indices], hail_labels[test_indices], sample_scores, out_path)) pool.close() pool.join() del pool sample_scores.to_csv(join(out_path, "{0}_sample_scores.csv".format(model_name)), index_label="Sample") #print("Train Best " + model_name) #model_inst = model_obj(**param_combos.loc[best_config].to_dict()) #model_inst.fit(storm_data[train_indices], # hail_labels[train_indices]) #print("Scoring " + model_name) #test_pred_frame = storm_meta.loc[test_indices] #test_pred_frame[model_name] = model_inst.predict_proba(storm_data[test_indices])[:, 1] #test_pred_frame["label"] = hail_labels[test_indices] #test_preds = test_pred_frame[model_name].values #test_pred_frame = pd.DataFrame({"indices": test_indices, # "lon": storm_centers[test_indices, 0], # "lat": storm_centers[test_indices, 1], # "run_dates": storm_run_dates[test_indices], # "valid_dates": storm_valid_dates[test_indices], # "members": storm_members[test_indices], # model_name: test_preds, # "label": hail_labels[test_indices]}, #columns=["indices", "lon", "lat", "dates", "members", "conv_net", "label"]) #test_pred_frame.to_csv(join(out_path, "predictions_{0}_sample_{1:03d}.csv".format(model_name, n)), index_label="Index") #sample_scores.loc[n, "Brier Score"] = brier_score(hail_labels[test_indices], test_preds) #sample_scores.loc[n, "Brier Score Climo"] = brier_score(hail_labels[test_indices], # hail_labels[test_indices].mean()) #sample_scores.loc[n, "Brier Skill Score"] = brier_skill_score(hail_labels[test_indices], test_preds) #sample_scores.loc[n, "AUC"] = roc_auc_score(hail_labels[test_indices], test_preds) # #del model_inst #sample_scores.to_csv(join(out_path, "{0}_sample_scores.csv".format(model_name)), index_label="Sample") best_config_frame = param_combos.loc[best_param_combos] best_config_frame = best_config_frame.reset_index() best_config_frame.to_csv(join(out_path, "{0}_best_params.csv".format(model_name)), index_label="Sample") return
def calcProb(routes): i = 1 for routeid, ti, tj, Dij, Dji in routes: print "Route %d/%d - ti: %d, tj: %d, Dij: %d, Dji: %d" % ( i, len(routes), ti, tj, Dij, Dji) i += 1 Dij = min(Dij, Dji) if Dij <= 0: Dij = 1 # Edges ophalen con = Verbinding() try: sql = 'SELECT edge_id, db, df FROM prismedges WHERE route_id=%d' % routeid edges = con.selectAll(sql) # Route constants vmean = 1 / ((tj - ti) * 1.0) vmax = 1 / (Dij * 1.0) # XY uitrekenen edgevars = {} print 'Calculating XY...' manager = Manager() tq = manager.Queue() rq = manager.Queue() for edgeid, Db, Df in edges: if Db + Df == 0: Df = 1 tq.put((edgeid, Db, Df)) edgevars[edgeid] = EdgeVar(edgeid, Db=Db, Df=Df) worker1 = ParabolaWorker(tq, rq, Dij, ti, tj) worker2 = ParabolaWorker(tq, rq, Dij, ti, tj) worker3 = ParabolaWorker(tq, rq, Dij, ti, tj) worker1.start() worker2.start() worker3.start() tq.put(None) tq.put(None) tq.put(None) worker1.join() worker2.join() worker3.join() while not rq.empty(): edge = rq.get() k = edge[0] edgevars[k].tb, edgevars[k].tf, edgevars[k].x, edgevars[ k].y = edge[1:] worker1.terminate() worker2.terminate() worker3.terminate() #Iterate over time print 'Iterating over time...' edges = [(k, v.tb, v.tf) for k, v in edgevars.iteritems()] prism = Prism(edges) tq = manager.Queue() rq = manager.Queue() sq = manager.Queue(1) for t, edgelist in prism.iteratePrism(stepsize=30): tq.put((t, edgelist)) worker1 = IteratorWorker(tq, rq, ti, tj, vmax, vmean, edgevars) worker2 = IteratorWorker(tq, rq, ti, tj, vmax, vmean, edgevars) worker3 = IteratorWorker(tq, rq, ti, tj, vmax, vmean, edgevars) summator = SumWorker(rq, sq, edgevars) worker1.start() worker2.start() worker3.start() summator.start() tq.put(None) tq.put(None) tq.put(None) worker1.join() worker2.join() worker3.join() rq.put(None) summator.join(10) edgevars = sq.get() summator.terminate() worker1.terminate() worker2.terminate() worker3.terminate() #Copy edge to table routestr = '' for edge in edgevars.itervalues(): if edge.P > 1.0: edge.P = 1 routestr += '%d\t%d\t%d\t%d\t%f\t%f\t%f\t%f\n' % ( routeid, edge.edgeid, edge.Db, edge.Df, edge.x, edge.y, 1.0 - exp(edge.P), edge.E) f = StringIO(routestr) con.copyfrom(f, 'probedges', columns=('route_id', 'edge_id', 'db', 'df', 'x', 'y', 'P', 'E')) con.commit() finally: con.sluit()
        combined_advert = Advertiser(
            name=duplicate_adverts.all()[0].name,
            count=total_count,
            broker_id=cur_broker.id)
        s.add(combined_advert)
        for advert in duplicate_adverts.all():
            s.delete(advert)
        s.commit()


if __name__ == "__main__":
    session = Session()
    parse_file = 'events.ra.csv'
    num_workers = 4
    manager = Manager()
    work = manager.Queue(num_workers)

    # start for workers
    pool = []
    for i in range(num_workers):
        print("spawning process")
        p = Process(target=do_work, args=(work, ))
        p.start()
        pool.append(p)

    # produce data
    with open(parse_file, 'rt') as csvfile:
        check_loop = 10000
        timing = []
        reader = csv.reader(csvfile)
        parse = False
def iasi_level2_runner(): """Listens and triggers processing""" LOG.info( "*** Start the extraction and conversion of ears iasi level2 profiles") pool = Pool(processes=6, maxtasksperchild=1) manager = Manager() listener_q = manager.Queue() publisher_q = manager.Queue() pub_thread = FilePublisher(publisher_q) pub_thread.start() listen_thread = FileListener(listener_q) listen_thread.start() jobs_dict = {} while True: try: msg = listener_q.get() except Empty: LOG.debug("Empty listener queue...") continue LOG.debug("Number of threads currently alive: " + str(threading.active_count())) if 'start_time' in msg.data: start_time = msg.data['start_time'] elif 'nominal_time' in msg.data: start_time = msg.data['nominal_time'] else: LOG.warning("Neither start_time nor nominal_time in message!") start_time = None if 'end_time' in msg.data: end_time = msg.data['end_time'] else: LOG.warning("No end_time in message!") if start_time: end_time = start_time + timedelta(seconds=60 * 15) else: end_time = None if not start_time or not end_time: LOG.warning("Missing either start_time or end_time or both!") LOG.warning("Ignore message and continue...") continue sensor = str(msg.data['sensor']) platform_name = msg.data['platform_name'] keyname = (str(platform_name) + '_' + str(start_time.strftime('%Y%m%d%H%M'))) jobs_dict[keyname] = datetime.utcnow() urlobj = urlparse(msg.data['uri']) path, fname = os.path.split(urlobj.path) LOG.debug("path " + str(path) + " filename = " + str(fname)) scene = { 'platform_name': platform_name, 'starttime': start_time, 'endtime': end_time, 'sensor': sensor, 'filename': urlobj.path } # if keyname not in jobs_dict: # LOG.warning("Scene-run seems unregistered! Forget it...") # continue pool.apply_async(format_conversion, (msg.data, scene, jobs_dict[keyname], publisher_q)) # Block any future run on this scene for x minutes from now # x = 5 thread_job_registry = threading.Timer(5 * 60.0, reset_job_registry, args=(jobs_dict, keyname)) thread_job_registry.start() pool.close() pool.join() pub_thread.stop() listen_thread.stop()
class LocalExecutor(BaseExecutor): """ LocalExecutor executes tasks locally in parallel. It uses the multiprocessing Python library and queues to parallelize the execution of tasks. :param parallelism: how many parallel processes are run in the executor """ def __init__(self, parallelism: int = PARALLELISM): super().__init__(parallelism=parallelism) self.manager: Optional[SyncManager] = None self.result_queue: Optional['Queue[TaskInstanceStateType]'] = None self.workers: List[QueuedLocalWorker] = [] self.workers_used: int = 0 self.workers_active: int = 0 self.impl: Optional[Union['LocalExecutor.UnlimitedParallelism', 'LocalExecutor.LimitedParallelism']] = None class UnlimitedParallelism: """ Implements LocalExecutor with unlimited parallelism, starting one process per each command to execute. :param executor: the executor instance to implement. """ def __init__(self, executor: 'LocalExecutor'): self.executor: 'LocalExecutor' = executor def start(self) -> None: """Starts the executor.""" self.executor.workers_used = 0 self.executor.workers_active = 0 # pylint: disable=unused-argument # pragma: no cover def execute_async( self, key: TaskInstanceKey, command: CommandType, queue: Optional[str] = None, executor_config: Optional[Any] = None, ) -> None: """ Executes task asynchronously. :param key: the key to identify the task instance :param command: the command to execute :param queue: Name of the queue :param executor_config: configuration for the executor """ if not self.executor.result_queue: raise AirflowException(NOT_STARTED_MESSAGE) local_worker = LocalWorker(self.executor.result_queue, key=key, command=command) self.executor.workers_used += 1 self.executor.workers_active += 1 local_worker.start() # pylint: enable=unused-argument # pragma: no cover def sync(self) -> None: """Sync will get called periodically by the heartbeat method.""" if not self.executor.result_queue: raise AirflowException("Executor should be started first") while not self.executor.result_queue.empty(): results = self.executor.result_queue.get() self.executor.change_state(*results) self.executor.workers_active -= 1 def end(self) -> None: """ This method is called when the caller is done submitting job and wants to wait synchronously for the job submitted previously to be all done. """ while self.executor.workers_active > 0: self.executor.sync() class LimitedParallelism: """ Implements LocalExecutor with limited parallelism using a task queue to coordinate work distribution. :param executor: the executor instance to implement. """ def __init__(self, executor: 'LocalExecutor'): self.executor: 'LocalExecutor' = executor self.queue: Optional['Queue[ExecutorWorkType]'] = None def start(self) -> None: """Starts limited parallelism implementation.""" if not self.executor.manager: raise AirflowException(NOT_STARTED_MESSAGE) self.queue = self.executor.manager.Queue() if not self.executor.result_queue: raise AirflowException(NOT_STARTED_MESSAGE) self.executor.workers = [ QueuedLocalWorker(self.queue, self.executor.result_queue) for _ in range(self.executor.parallelism) ] self.executor.workers_used = len(self.executor.workers) for worker in self.executor.workers: worker.start() def execute_async( self, key: TaskInstanceKey, command: CommandType, queue: Optional[str] = None, # pylint: disable=unused-argument executor_config: Optional[Any] = None, # pylint: disable=unused-argument ) -> None: """ Executes task asynchronously. 
:param key: the key to identify the task instance :param command: the command to execute :param queue: name of the queue :param executor_config: configuration for the executor """ if not self.queue: raise AirflowException(NOT_STARTED_MESSAGE) self.queue.put((key, command)) def sync(self): """Sync will get called periodically by the heartbeat method.""" while True: try: results = self.executor.result_queue.get_nowait() try: self.executor.change_state(*results) finally: self.executor.result_queue.task_done() except Empty: break def end(self): """Ends the executor. Sends the poison pill to all workers.""" for _ in self.executor.workers: self.queue.put((None, None)) # Wait for commands to finish self.queue.join() self.executor.sync() def start(self) -> None: """Starts the executor""" self.manager = Manager() self.result_queue = self.manager.Queue() self.workers = [] self.workers_used = 0 self.workers_active = 0 self.impl = (LocalExecutor.UnlimitedParallelism(self) if self.parallelism == 0 else LocalExecutor.LimitedParallelism(self)) self.impl.start() def execute_async( self, key: TaskInstanceKey, command: CommandType, queue: Optional[str] = None, executor_config: Optional[Any] = None, ) -> None: """Execute asynchronously.""" if not self.impl: raise AirflowException(NOT_STARTED_MESSAGE) self.validate_command(command) self.impl.execute_async(key=key, command=command, queue=queue, executor_config=executor_config) def sync(self) -> None: """Sync will get called periodically by the heartbeat method.""" if not self.impl: raise AirflowException(NOT_STARTED_MESSAGE) self.impl.sync() def end(self) -> None: """ Ends the executor. :return: """ if not self.impl: raise AirflowException(NOT_STARTED_MESSAGE) if not self.manager: raise AirflowException(NOT_STARTED_MESSAGE) self.impl.end() self.manager.shutdown() def terminate(self): """Terminate the executor is not doing anything."""
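# --- Illustrative sketch (not Airflow's QueuedLocalWorker): LimitedParallelism.end()
# above relies on the poison-pill contract -- one (None, None) per worker, plus a
# task_done() for every item consumed so that task_queue.join() can return. A minimal
# worker honoring that contract looks roughly like this.
from multiprocessing import Manager, Process


def queued_worker(task_queue, result_queue):
    while True:
        key, command = task_queue.get()
        try:
            if key is None:                       # poison pill: stop this worker
                break
            result_queue.put((key, 'ran: %s' % command))
        finally:
            task_queue.task_done()                # required for task_queue.join()


if __name__ == '__main__':
    manager = Manager()
    task_queue = manager.Queue()
    result_queue = manager.Queue()
    parallelism = 2
    workers = [Process(target=queued_worker, args=(task_queue, result_queue))
               for _ in range(parallelism)]
    for w in workers:
        w.start()
    for i in range(5):
        task_queue.put((i, 'echo %d' % i))
    for _ in workers:
        task_queue.put((None, None))              # one pill per worker
    task_queue.join()                             # all items (and pills) consumed
    while not result_queue.empty():
        print(result_queue.get())
    for w in workers:
        w.join()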
def execute_tasks(process_count: int, input_tasks: List[Tuple[Any, Any]], shared: Dict[str, Any] = {}, callback: Any = None) -> List[Any]: """ Creates 'process_count' processes that will together execute the provided tasks. """ manager = Manager() results = [None] * len(input_tasks) if len(input_tasks) == 1: process_count = 0 if process_count == 0: output = manager.Queue() for i, task in enumerate(input_tasks): context = MainContext(i, output) results[i] = task[0](context, *task[1], **shared) callback("complete", i) while not output.empty(): command = output.get(block=True) callback(command[0], *command[1]) return results input = manager.Queue() output = manager.Queue() timeout = 5 * 60 # if one single task takes more then 5 minutes, something is wrong # instead of copying state for each task, shared state is written to a file which is loaded once per process. shared_file = None temp_file = None if len(shared) > 0: context = MainContext(0, None) with TimeCode(context, "create_shared") as tc: temp_file = tempfile.NamedTemporaryFile("wb", suffix='.dump', prefix="mp_shared", delete=True) shared_file = temp_file.name debug(f"shared file: '{temp_file.name}'") pickle_data = pickle.dumps(shared) temp_file.write(pickle_data) temp_file.flush() # add tasks to the task queue for i, task in enumerate(input_tasks): try: input.put((i, task)) except: get_console().print_exception() error(i) error(task) fatal_exit() # create the processes processors = [ Process(target=_process_entrypoint, args=(input, output, shared_file)) for i in range(process_count) ] # start the processes for process in processors: process.start() # receive messages waiting = len(input_tasks) while waiting > 0: try: command = output.get(block=True, timeout=timeout) processing = True if callback: processing = callback(command[0], *command[1]) if processing: if command[0] == 'debug': debug(*command[1]) elif command[0] == 'warning': warning(*command[1]) elif command[0] == 'error': error(*command[1]) elif command[0] == 'info': info(*command[1]) elif command[0] == 'complete': results[command[1][0]] = command[1][1] waiting -= 1 elif command[0] == 'exception': waiting -= 1 print(command[1][1]) elif command[0] == 'exit': sys.exit(1) else: warning(f"unknown command: {command}") except Empty: error(f"task took to long to complete (+{timeout} seconds)") fatal_exit() # wait for all processes to finish for process in processors: process.join() # TODO: Maybe we don't need to clear the queue while not output.empty(): command = output.get(block=False) warning(f"skipped command: {command}") if temp_file: temp_file.close() return results
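# --- Illustrative sketch (hypothetical task, simplified protocol): execute_tasks
# above multiplexes logging, results, and failures over a single output queue as
# (command, args) tuples that the parent dispatches on. The essence of that message
# protocol:
from multiprocessing import Manager, Process
from queue import Empty


def process_entrypoint(input_queue, output_queue):
    while True:
        try:
            index, value = input_queue.get_nowait()
        except Empty:
            return
        output_queue.put(('debug', ('working on task %d' % index,)))
        try:
            output_queue.put(('complete', (index, value * value)))
        except Exception as err:
            output_queue.put(('exception', (index, repr(err))))


if __name__ == '__main__':
    manager = Manager()
    input_queue, output_queue = manager.Queue(), manager.Queue()
    tasks = [3, 5, 7]
    for i, t in enumerate(tasks):
        input_queue.put((i, t))
    workers = [Process(target=process_entrypoint, args=(input_queue, output_queue))
               for _ in range(2)]
    for w in workers:
        w.start()
    results, waiting = [None] * len(tasks), len(tasks)
    while waiting:
        command, args = output_queue.get()
        if command == 'debug':
            print(*args)
        elif command in ('complete', 'exception'):
            results[args[0]] = args[1]
            waiting -= 1
    for w in workers:
        w.join()
    print(results)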
    od = ''
    sfile = '{}Summary_{}{}.csv'.format(od, ref[0:4], str(len(ref)))
    inffile = '{}/PDB_info.txt'.format(odir)
    data = []
    dheader = ['PDB ID', 'Segment', 'Match length', 'Alignment', 'Cropped']
    data.append(dheader)
    data.append(['Ref', 'Ref', len(ref), ref, ref])
    info = []
    iheader = ['File', 'Protein segment', 'Ligand segment', 'Ligand name', 'Complex']
    info.append(iheader)

    p = Pool(cores)
    m = Manager()
    q = m.Queue()
    args = []
    inps = ilist
    blist = slice_list(ilist, cores)
    for ifiles in blist:
        args.append((ifiles, ref, odir, l, ph, q))
    result = p.map_async(crop_handler, args)

    start = time.time()
    prcprev = 0
    while True:
        if result.ready():
# if m.im_self is None:
#     return getattr, (m.im_class, m.im_func.func_name)
# else:
#     return getattr, (m.im_self, m.im_func.func_name)

static_reg = re.compile(
    r'\.html$|\.htm$|\.shtml$|\.css$|\.png$|\.js$|\.dpg$|\.jpg$|\.svg$|\.jpeg$|'
    r'\.gif$|\.webp$|\.ico$|\.woff$|\.ttf$|css\?|js\?|jpg\?|png\?|woff\?v='
    r'|woff2\?v=|ttf\?|woff\?|woff2$|html\?v=|ico$')

burp_traffic = []
manager = Manager()
case_list = manager.list()
openner_result = manager.list()
# for deduplication
# api_list = manager.list()
# filtered = manager.list()
traffic_queue = manager.Queue()
# for saving to a local file
traffic_list = manager.list()
# save reflections for analyzing
reflect_list = manager.list()
# filter
api_list = manager.list()


class Traffic_generator(Process):
    DEFAULT_HEADER = {
        'User-Agent': 'Mozilla/2.0 (X11; Linux x86_64) AppleWebKit/237.36 (KHTML, like Gecko) Chrome/62.0.3322.146 Safari/237.36',
    }

    def __init__(self, id, url_list, coroutine):
        Process.__init__(self)
        self.id = id
def write(q):
    print 'Write...(%s)' % os.getpid()
    for v in ['A', 'B', 'C']:
        print 'Put %s to queue...' % v
        q.put(v)
        time.sleep(random.random())


def read(q):
    print 'Read...(%s)' % os.getpid()
    while True:
        if not q.empty():
            v = q.get(True)
            print 'Get %s from queue...' % v
            time.sleep(random.random())
        else:
            break


if __name__ == '__main__':
    manager = Manager()
    q = manager.Queue()
    p = Pool()
    p.apply_async(write, args=(q, ))
    time.sleep(0.5)
    p.apply_async(read, args=(q, ))
    p.close()
    p.join()
    print 'All data written and read.'
    def _create_queues(self):
        # Need to use multiprocessing.Manager, the Queue() is buggy and causes
        # deadlock
        manager = Manager()
        self._chunk_queue = manager.Queue()
        self._result_queue = manager.Queue()
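# --- Illustrative note (an assumption about the comment above): a plain
# multiprocessing.Queue() generally cannot be passed as a task argument to pool/child
# processes by pickling -- it is meant to be shared by inheritance -- which typically
# surfaces as "RuntimeError: Queue objects should only be shared between processes
# through inheritance" or as a silent hang. A Manager().Queue() returns a picklable
# proxy, so it can be handed around freely, e.g.:
from multiprocessing import Manager, Pool


def produce(q, item):
    q.put(item)                      # the proxy works fine inside a Pool worker


if __name__ == '__main__':
    manager = Manager()
    chunk_queue = manager.Queue()    # proxy object: safe to pass to Pool tasks
    with Pool(2) as pool:
        pool.starmap(produce, [(chunk_queue, i) for i in range(4)])
    while not chunk_queue.empty():
        print(chunk_queue.get())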
def calculateRINsFromPdbList(pdbs, fromScratch=True, forceCentrality=True, remove_tmp_files=True, n_proc=32): pdbs = set([x.lower() for x in pdbs]) lim = 100 * 1024 * 1024 * 1024 resource.setrlimit(resource.RLIMIT_AS, (lim, lim)) if not os.path.isfile(settings.REDUCE_HET_DICT): print("%s not found" % settings.REDUCE_HET_DICT) sys.exit(1) os.environ["REDUCE_HET_DICT"] = settings.REDUCE_HET_DICT num_of_proc = n_proc manager = Manager() lock = manager.Lock() in_queue = manager.Queue() bio_pdbs = set() total_structures = 0 subfolders = os.listdir(bio_assembly_path) # BUG: undefined variable for subfolder in subfolders: sub_path = "%s/%s" % (bio_assembly_path, subfolder) files = os.listdir(sub_path) if not os.path.exists("%s/%s" % (base_path, subfolder)): # BUG: undefined variable os.mkdir("%s/%s" % (base_path, subfolder)) for fn in files: if fn.count('.pdb1.gz') == 1: pdbgz_path = "%s/%s" % (sub_path, fn) if os.path.getsize(pdbgz_path) > 50 * 1024 * 1024: continue pdb_id = fn.replace('.pdb1.gz', '') if pdb_id not in pdbs: continue bio_pdbs.add(pdb_id) in_queue.put((pdbgz_path, pdb_id)) total_structures += 1 subfolders = os.listdir(AU_path) # BUG: undefined variable for subfolder in subfolders: sub_path = "%s/%s" % (AU_path, subfolder) files = os.listdir(sub_path) if not os.path.exists("%s/%s" % (base_path, subfolder)): os.mkdir("%s/%s" % (base_path, subfolder)) for fn in files: if fn.count('.ent.gz') == 1: pdbgz_path = "%s/%s" % (sub_path, fn) if os.path.getsize(pdbgz_path) > 50 * 1024 * 1024: continue pdb_id = fn[3:7] if not '%s_au' % pdb_id in pdbs: continue if pdb_id in bio_pdbs: continue in_queue.put((pdbgz_path, pdb_id)) total_structures += 1 print('Amount of structures for RINerator: ', total_structures) processes = {} for i in range(1, num_of_proc + 1): p = Process(target=createRinProc, args=(in_queue, lock, fromScratch, i, forceCentrality, remove_tmp_files, base_path, rinerator_path, errorlog)) # BUG: undefined variable processes[i] = p print('Start RINerator Process: ', i) p.start() for i in processes: processes[i].join()
class BaseProcessor: _logger = None _file_handler = None _process_list = [] _manager = None _event_queue = None _task_queue = None _process_count = 0 _use_verbose_logging = False def __init__(self, file_handler, process_count, use_verbose_logging): self._file_handler = file_handler self._process_count = process_count self._use_verbose_logging = use_verbose_logging self._logger = Logger() self._manager = Manager() self._event_queue = self._manager.Queue() self._task_queue = self._manager.Queue() def _get_process(self, process_id): raise AttributeError("not supported") def _run_processes(self, items_to_process, event_handler_func, event_handler_args): total_to_process = len(items_to_process) processes = self._initialize_processes() self._fill_task_queue(items_to_process) self._process_events(total_to_process, event_handler_func, event_handler_args) self._stop_processes(processes) def _initialize_processes(self): processes = [] for i in range(self._process_count): process = self._get_process(i) processes.append(process) process.start() return processes def _fill_task_queue(self, items): for item in items: self._task_queue.put(item) def _process_events(self, total_to_process, event_handler_func, event_handler_args): num_processed = 0 num_processed_by_process_list = [0] * self._process_count while True: self._write_progress_to_console(num_processed, total_to_process, num_processed_by_process_list) event = None try: event = self._event_queue.get(True, 1) except: pass if event is not None: args_to_use = (event, num_processed_by_process_list, num_processed, total_to_process) args_to_use += event_handler_args num_processed = event_handler_func(*args_to_use) if num_processed >= total_to_process: break def _stop_processes(self, processes): for i in range(self._process_count): self._task_queue.put(-1) for process in processes: process.join() def _write_progress_to_console(self, num_processed, total_to_process, num_processed_by_process_list): output_str = "Progress: " + str(num_processed) + "/" + str( total_to_process) + " " for i in range(len(num_processed_by_process_list)): output_str += ("P" + str(i) + ": " + str(num_processed_by_process_list[i]) + " ") sys.stdout.write(output_str + "\r") sys.stdout.flush() def _log_process_message(self, process_id, message): if self._use_verbose_logging: self._logger.print_log("[process: " + str(process_id) + "] " + message)
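# --- Illustrative sketch (hypothetical worker, not the real _get_process() result):
# BaseProcessor above stops its workers by putting one -1 sentinel per process on the
# task queue and tracks progress through events posted to a second Manager queue.
# A worker loop compatible with that contract might look like this.
from multiprocessing import Manager, Process


def worker_loop(process_id, task_queue, event_queue):
    while True:
        item = task_queue.get()
        if item == -1:                              # sentinel from _stop_processes()
            break
        # ... process `item` here ...
        event_queue.put((process_id, item))         # one event per finished item


if __name__ == '__main__':
    manager = Manager()
    task_queue = manager.Queue()
    event_queue = manager.Queue()
    process_count = 2
    items = list(range(6))
    for item in items:
        task_queue.put(item)
    for _ in range(process_count):
        task_queue.put(-1)
    procs = [Process(target=worker_loop, args=(i, task_queue, event_queue))
             for i in range(process_count)]
    for p in procs:
        p.start()
    done = 0
    while done < len(items):
        pid, item = event_queue.get()
        done += 1
        print("P%d finished item %s (%d/%d)" % (pid, item, done, len(items)))
    for p in procs:
        p.join()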