def queue_info(iters=None):
    work = JoinableQueue()
    for filename in iters:
        work.put(obj=filename)
    time.sleep(1)
    return work
def main():
    jobs = JoinableQueue()
    result = JoinableQueue()

    numToProcess = -1
    scores = pd.DataFrame(columns=['query', 'fmeasure', 'precision', 'recall',
                                   'size', 'maxDistance', 'topHits', 'contextSteps'])

    print len(datasets)

    for key in datasets:
        jobs.put(key)

    processed_count = Counter()

    for i in xrange(NUMBER_OF_PROCESSES):
        p = Process(target=work, args=(i, jobs, result, processed_count))
        p.daemon = True
        p.start()

    #work(1, jobs, result, processed_count)

    automated_annotations = {}
    distances = {}

    jobs.join()

    dataset_index = collections.defaultdict(set)
    annotated_datasets = set()
    while not result.empty():
        dataset, classes = result.get()
        if len(classes) == 0:
            annotated_datasets.add(dataset)
        for c in classes.keys():
            dataset_index[c].add(dataset)
            owl_class = Class(c, graph=graph)
            for parent in owl_class.parents:
                dataset_index[parent.identifier].add(dataset)
        result.task_done()

    print '\n'

    for query, c in queries.items():
        manual = ground_truth[query]
        automated = dataset_index[c]
        hits = manual & automated
        misses = manual - automated
        precision = np.nan if len(automated) == 0 else float(len(hits)) / len(automated)
        recall = np.nan if len(manual) == 0 else float(len(hits)) / len(manual)
        if precision != 0 or recall != 0:
            fmeasure = 0 if np.isnan(precision) or np.isnan(recall) else 2 * (precision * recall) / (precision + recall)
        else:
            fmeasure = 0
        scores = scores.append(dict(query=query, size=len(manual), precision=precision,
                                    recall=recall, fmeasure=fmeasure, topHits=topHits,
                                    maxDistance=maxDistance, contextSteps=context_steps),
                               ignore_index=True)
        print "Hits for", query, c
        print '\n'.join(sorted(hits))

    print scores
    print "Annotated", len(annotated_datasets), "datasets."
def launch_mesos_tf(marathon_url_str, tsknom_str, cpu_float, mem_float,
                    ntasks_int, uri_str, marathon_usr, marathon_usrpwd,
                    localhost_str, mxattempts=10):
    toret_nodes = dict()

    docker = False
    if uri_str.find('docker') > -1:
        uri_str = uri_str.replace('docker://', '')
        docker = True

    uri_str = uri_str.rstrip('/')
    marathon_url_str = marathon_url_str.rstrip('/')

    counter = 0
    tq = JoinableQueue()
    q = Queue()
    plist = list()

    consumers = [Consumer(tq, q) for i in xrange(ntasks_int)]
    for c in consumers:
        c.start()

    for i in xrange(ntasks_int):
        tq.put(Task(post_marathon_tasks,
                    (marathon_url_str, tsknom_str, cpu_float, mem_float,
                     i + 1, ntasks_int, uri_str, marathon_usr, marathon_usrpwd,
                     localhost_str, mxattempts, docker)))

    for i in xrange(ntasks_int):
        tq.put(None)

    tq.join()

    for i in xrange(1, ntasks_int + 1):
        toret_nodes[i] = q.get()

    return toret_nodes
def setup_queue(options):
    probe_servers = Queue()
    progress_queue = Queue()

    run = Probe.ProbeRun.objects.get(id=options.run_id)
    summary_top = Results.ResultSummaryList.objects.get(part_of_run=run)
    summary_top.setup()

    connection.close()

    threads = []
    for i in range(options.threads):
        new_thread = Process(target=SetupQueueThread,
                             args=(i, run, probe_servers, progress_queue))
        new_thread.daemon = True
        new_thread.start()
        threads.append(new_thread)

    progress_thread = threading.Thread(target=__ProgressCounter,
                                       args=(run, progress_queue, threads, options))
    progress_thread.daemon = True
    progress_thread.start()

    i = 0
    if options.input_filename and (not options.count or i < options.count):
        for hostname_line in fileinput.input(options.input_filename,
                                             openhook=fileinput.hook_compressed):
            probe_servers.put(hostname_line)
            i += 1
            if options.count and i >= options.count:
                break

    probe_servers.join()
    progress_queue.join()

    return run
class emailSubsystem(object):
    def __init__(self):
        ### will move to Celery eventually; with Celery, the app would be able to periodically
        # wake up and check on replyQueue to see which emails were sent, which were not and
        # what to do ...

        self.emailQueue = JoinableQueue()
        self.replyQueue = JoinableQueue()

        self.worker = Process(target=sendEmailWorker,
                              args=(self.emailQueue, self.replyQueue))

    def start(self):
        # temporarily comment out starting a new process as it seems to leave zombies
        # and causes the app not to start as the max process limit is reached.
        #self.worker.start()
        return

    def shutdown(self):
        # post poison pill
        # wait on the queue to be done; ie join on emailQueue
        # wait on the worker process to die; ie join on worker
        self.emailQueue.put(None)
        self.emailQueue.join()
        self.worker.join()
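# sendEmailWorker is referenced above but not defined in this snippet. A minimal sketch of
# what such a worker might look like, assuming the poison-pill convention used by
# shutdown() (a None message ends the loop); the smtp_send delivery step is hypothetical.
def sendEmailWorker(emailQueue, replyQueue):
    while True:
        message = emailQueue.get()
        if message is None:
            # poison pill posted by shutdown(); acknowledge it and exit
            emailQueue.task_done()
            break
        try:
            # smtp_send(message)  # hypothetical delivery step
            replyQueue.put((message, 'sent'))
        except Exception as exc:
            replyQueue.put((message, 'failed: %s' % exc))
        finally:
            emailQueue.task_done()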
def crunch(file_name, ext_type, handler, pool_size=4, queue_size=40, limit=None):
    print 'Crunching file: %s, limit: %s' % (file_name, limit)

    q = JoinableQueue(queue_size)
    q_feats = Queue()
    pool = Pool(pool_size, wrap_handler(handler), ((q, q_feats),))

    with file_reader(file_name) as reader:
        idx = 0
        for entry in reader:
            if (entry.pathname.find(ext_type) != -1):
                text = [b for b in entry.get_blocks()]
                key = entry.pathname.split('/')[-1].split('.')[0]
                q.put((key, text), True)
                idx += 1
                print 'Processing:', entry.pathname, idx
                if limit and idx >= limit:
                    print 'Reached the limit'
                    break

    q.close()
    q.join()
    pool.close()

    result = []
    for i in range(q_feats.qsize()):
        result.append(q_feats.get())
    return result
def build(opts):
    tasks = JoinableQueue()
    results = JoinableQueue()

    if opts.remove:
        log.info("Removing existing docs collection")
        session = utils.get_session(config)
        session.docs.drop()

    # start up our builder threads
    log.info("Creating %d Builder processes" % opts.threads)
    builders = [Builder(tasks, results) for i in xrange(opts.threads)]
    for b in builders:
        b.start()

    # queue up the bibcodes
    for bib in get_bibcodes(opts):
        tasks.put(bib)

    # add some poison pills to the end of the queue
    log.info("poisoning our task threads")
    for i in xrange(opts.threads):
        tasks.put(None)

    # join the task queue. this should block until all
    # tasks in the task queue are completed
    log.info("Joining the task queue")
    tasks.join()

    log.info("Joining the task threads")
    for b in builders:
        b.join()

    log.info("All work complete")
def upload(args=None, authdata=None):
    """
    Initialize the containers and pseudo-directories for what is to be uploaded.
    Separates jobs into sub-jobs based on container. Up to 100 containers per second.
    """
    #initialize the containers in parallel
    containers = []
    for obj in os.listdir(args['dir']):
        if args['container']:
            containers.append(args['container'])
            break
        #if os.path.isdir(os.path.abspath(args['dir']+'/'+obj)):
        if os.path.isdir(os.path.join(args['dir'], obj)):
            containers.append(obj)

    if containers:
        #set container job count to the lesser of args['cc'] or container count
        if args['cc'] < len(containers):
            args['cc'] = len(containers)

        #create queue and jobs
        container_queue = JoinableQueue()
        for container_worker in range(args['cc']):
            job = Process(target=container_consumer,
                          args=(args, authdata, container_queue,))
            job.daemon = False
            job.start()

        for container in containers:
            container_queue.put(container)

        #tail the queue with a None marker so the workers shut down nicely.
        for container in range(args['cc']):
            container_queue.put(None)
        container_queue.join()
class SimpleSynergeticServer(Process):

    def __init__(self, authen_key):
        Process.__init__(self)
        self.__task_queue = JoinableQueue(1)
        self.__return_queue = Queue(1)
        self.serv = Listener(('', 40000), authkey=authen_key)

    def run(self):
        print 'Server Works'
        copy_reg.pickle(types.MethodType, _reduce_method)
        # Start the SynergeticProcess in daemon mode
        worker_p = SynergeticProcess(self.__task_queue, self.__return_queue)
        worker_p.daemon = True
        worker_p.start()
        while True:
            print 'wait for Client'
            pool_conn = self.serv.accept()
            print 'connection Client Accepted'
            while True:
                print 'in LOOP Simple Server'
                # There is no need for task_id in this version
                try:
                    print 'Try to recv MSG'
                    unpickled_msg = pool_conn.recv()
                    print 'MSG Received'
                except Exception as e:  # EOFError:
                    print 'Fail To Receive MSG:', e
                    break
                if unpickled_msg[0] == 'MODULES':
                    self.import_mods(unpickled_msg[1])
                    ret = 'MODULES-READY'
                else:
                    self.__task_queue.put(unpickled_msg)
                    ret = self.__return_queue.get()
                try:
                    print 'SEND RESPONSE'
                    try:
                        pool_conn.send(ret)
                    except EOFError:
                        print 'SEND TO POOL FAILED'
                    print 'RESPONSE SENT ', ret
                except EOFError:
                    break
            pool_conn.close()

    def import_mods(self, mods_d):
        for mod_name, mod_bytecode in mods_d.items():
            try:
                fobj = open(mod_name + ".pyc", 'wb')
            except Exception as e:
                print("Synergeticprocessing.SimpleServer --> Module file error: %s" % e)
            else:
                fobj.write(mod_bytecode)
            finally:
                fobj.close()
        for mod in mods_d:
            print 'blocking'
            __import__(mod)
            print 'imported ', mod
def solve(iterations, proc_count):
    queue = JoinableQueue()
    partition = get_iterations_partition(iterations, proc_count)
    for iteration in partition:
        queue.put(iteration)
    for i in range(proc_count):
        queue.put(None)

    manager = Manager()
    result = manager.list()
    processes = []
    cur_time = time.time()

    for i in range(proc_count):
        proc = Process(target=worker, args=(queue, result,))
        proc.start()
        processes.append(proc)

    queue.join()
    for proc in processes:
        proc.join()

    cur_time = time.time() - cur_time
    print_results(cur_time, result, iterations)
def aggress(map):
    global startMap
    startMap = map

    #print "Regressing..."
    state = State()
    jobs = []
    longestSolution = Value('d', 20)
    highestScore = Value('d', 0)

    queue = JoinableQueue()
    manager = Manager()
    d = manager.dict()
    d.clear()
    l = RLock()

    if multiProc:
        queue.put((state, map, 1))
        for i in range(numProcs):
            p = Process(target=multiMain, args=(startMap, l, d, queue, highestScore))
            p.start()
        queue.join()
    else:
        a(l, highestScore, d, None, state, map, 1)
class ProcessPool(object):
    def __init__(self, size=1):
        self.size = size
        self.jobs = Queue()
        self.results = Queue()
        self.processes = []

    def start(self):
        '''start all processes'''
        for i in range(self.size):
            self.processes.append(ProcessWorker(self))
        for process in self.processes:
            process.start()

    def append_job(self, job, *args, **kwargs):
        self.jobs.put((job, args, kwargs))

    def join(self):
        '''wait until all jobs are done'''
        self.jobs.join()

    def stop(self):
        '''kill all processes'''
        for process in self.processes:
            process.stop()
        for process in self.processes:
            # wait for processes to finish
            if process.is_alive():
                process.join()

        del self.processes[:]  # reset processes to empty
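# ProcessWorker is not shown with ProcessPool above. A minimal sketch of a compatible
# worker, assuming the (job, args, kwargs) tuples queued by append_job() and that
# self.jobs supports task_done()/join() (i.e. behaves like a JoinableQueue, which
# ProcessPool.join() requires); the None sentinel and stop() behaviour are assumptions.
class ProcessWorker(Process):
    def __init__(self, pool):
        Process.__init__(self)
        self.pool = pool

    def run(self):
        while True:
            item = self.pool.jobs.get()
            if item is None:
                # sentinel posted by stop(); acknowledge it and exit
                self.pool.jobs.task_done()
                break
            job, args, kwargs = item
            try:
                self.pool.results.put(job(*args, **kwargs))
            finally:
                self.pool.jobs.task_done()

    def stop(self):
        self.pool.jobs.put(None)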
def get_citations(**args):
    """
    Method to prepare the actual citation dictionary creation
    """
    # create the queues
    tasks = JoinableQueue()
    results = JoinableQueue()
    # how many threads are there to be used
    if 'threads' in args:
        threads = args['threads']
    else:
        threads = cpu_count()
    # initialize the "harvesters" (each harvester gets the citations for a bibcode)
    harvesters = [CitationHarvester(tasks, results) for i in range(threads)]
    # start the harvesters
    for b in harvesters:
        b.start()
    # put the bibcodes in the tasks queue
    for bib in args['bibcodes']:
        tasks.put(bib)
    # add some 'None' values at the end of the tasks list, to facilitate proper closure
    for i in range(threads):
        tasks.put(None)

    tasks.join()
    for b in harvesters:
        b.join()

    return [item for sublist in cit_dict.values() for item in sublist]
def run(self):
    # Changes the process name shown by ps for instance
    setProcTitle("agentcluster master [version: %s] [monitoring: %d seconds]" % (__version__, self.monitoring_period))

    try:
        logger.info('Agent cluster server starting')
        logger.info('Configurations will be scanned in directories:')
        for directory in confdir.data:
            logger.info('  o %s', os.path.abspath(directory))

        self.watchdog = Watchdog(self.monitoring_period)
        self.watchdog.start()

        # Generates a deadlock to enter sleep mode
        # Only an external signal can break this deadlock
        logger.info('Agent cluster server started')
        queue = JoinableQueue()
        queue.put(object())
        queue.join()

    except KeyboardInterrupt:
        logger.info('Agent cluster server interrupted')
    except Exception:
        logger.error('Exception caught in main process: %s', sys.exc_info()[1])
        logger.debug("", exc_info=True)
    finally:
        # First stop the monitoring to avoid restarting killed agents
        if self.watchdog is not None:
            self.watchdog.shutdown = True
            self.watchdog.join()
        logger.info('Agent cluster server end')
        logging.shutdown()
def main(opts, files):
    if opts.threads == 1:
        log.info("running synchronously")
        run_syncronous(opts, files)
    else:
        Q = JoinableQueue()
        workers = [Worker(Q, opts) for i in xrange(opts.threads)]
        log.info("initializing %d threads" % opts.threads)
        for w in workers:
            w.start()

        # push log events onto the queue
        events_iter = events(files, opts)
        if opts.limit:
            events_iter = itertools.islice(events_iter, opts.limit)
        for event in events_iter:
            Q.put(event)

        # add poison pills
        for i in xrange(opts.threads):
            Q.put(None)

        Q.join()
        log.info("work complete. shutting down threads.")
        for w in workers:
            w.join()
def evaluate(points, meshToBasis, kernel, quadRule, coeffs, nprocs=None):
    """Evaluate a kernel using the given coefficients"""

    if nprocs is None:
        nprocs = cpu_count()

    inputQueue = JoinableQueue()

    nelements = meshToBasis.nelements

    for elem in meshToBasis:
        inputQueue.put(elem)

    buf = sharedctypes.RawArray('b', len(points[0]) * numpy.dtype(numpy.complex128).itemsize)
    result = numpy.frombuffer(buf, dtype=numpy.complex128)
    result[:] = numpy.zeros(1, dtype=numpy.complex128)

    time.sleep(.5)

    workers = []
    for id in range(nprocs):
        worker = EvaluationWorker(points, kernel, quadRule, coeffs, inputQueue, result)
        worker.start()
        workers.append(worker)

    inputQueue.join()
    for worker in workers:
        worker.join()

    return result.copy()
def readCEFFile(afile, pygtail):
    if exists(afile):  # sometimes files can move/archive while we iterate the list
        try:
            # start a process to post our stuff.
            logcache = JoinableQueue()
            postingProcess = Process(target=postLogs, args=(logcache,),
                                     name="cef2mozdefHTTPPost")
            postingProcess.start()

            # have pygtail feed us lines
            for line in pygtail:
                pygtail._update_offset_file()
                cefDict = parseCEF(line)
                #logger.debug(json.dumps(cefDict))
                # append json to the list for posting
                if cefDict is not None:
                    logcache.put(json.dumps(cefDict))

            logger.info('{0} done'.format(afile))
            logger.info('waiting for posting to finish')

            logcache.put(None)
            logcache.close()
            #logger.info('posting done')
        except KeyboardInterrupt:
            sys.exit(1)
        except ValueError as e:
            logger.fatal('Exception while handling CEF message: %r' % e)
            sys.exit(1)
class QueueTask:
    def __init__(self):
        self.queue = JoinableQueue()
        self.event = Event()
        atexit.register(self.queue.join)

        process = Process(target=self.work)
        process.daemon = True
        process.start()

    def work(self):
        while True:
            func, args, wait_for = self.queue.get()

            for evt in wait_for:
                evt.wait()
            func(*args)
            self.event.set()

            self.queue.task_done()

    def enqueue(self, func, args=[], wait_for=[]):
        self.event.clear()
        self.queue.put((func, args, wait_for))

        return self.event
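# A small usage sketch for QueueTask above (names are assumed): enqueue() returns the
# Event that work() sets once the queued function has run, so a caller can either wait on
# that event or join the underlying queue. The print_msg helper is hypothetical.
def print_msg(msg):
    print(msg)

tasks = QueueTask()
done = tasks.enqueue(print_msg, args=['hello from the worker process'])
done.wait()          # block until the worker has executed print_msg
tasks.queue.join()   # or wait for everything queued so far to be task_done()'d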
def batchProcess(self, arr_to_enque, work_method, t=False):
    q = JoinableQueue()
    output = JoinableQueue()
    extra = JoinableQueue()
    third = JoinableQueue()
    if t:
        args = (q, output, extra, third)
    else:
        args = (q, output, extra)
    for obj in arr_to_enque:
        q.put(obj)
    processes = [Process(target=work_method, args=args, name=str(obj))
                 for obj in arr_to_enque]
    for p in processes:
        p.start()
    for p in processes:
        p.join(30)
        if p.is_alive():
            print "ERROR JOINING PROCESS FOR: ", p.name
            p.terminate()
            raise Exception("Goal Conversion Error:",
                            (self.account_id, self.project_id, exp_id, var_ids))
    print "end batch process"
    if t:
        return (output, extra, third)
    else:
        return (output, extra)
def main(workers=10):
    """
    Executes main function of mini-framework's Control thread.

    :param workers: Integer detailing number of worker FIFO threads to employ
    """
    start_logging()
    log_info("New multiprocessing session with {} workers".format(workers))

    # Input JoinableQueue and Output Queue
    inq = JoinableQueue(maxsize=int(workers * 1.5))
    outq = Queue(maxsize=int(workers * 1.5))

    ot = OutThread(workers, outq)
    ot.start()

    for _ in range(workers):
        w = WorkerThread(inq, outq)
        w.start()

    # Create a sequence of a 1000 random alphabetic characters
    random_chars = (ascii_letters[randint(0, 51)] for _ in range(1000))

    # Keep input queue loaded for as long as possible
    # Feed the process pool with work units
    for work in enumerate(random_chars):
        inq.put(work)

    # Fill the input queue with Nones to shut the worker threads down
    # which terminates the process pool
    for _ in range(workers):
        inq.put(None)
    inq.join()

    print("Control process terminating")
def cdp_no_split_single(loaded_seq_list, loaded_seq_name_list, ref_file, nt, cores):
    """
    Aligns a single SRNA_seq object to multiple refseq seqs in a Ref object
    at a time.  No splitting of read counts.
    """
    refs = RefSeq()
    refs.load_ref_file(ref_file)
    print(colored("------------------ALIGNING READS------------------\n", 'green'))

    workers = cores
    work_queue = JoinableQueue()
    processes = []
    mgr = Manager()
    count = 0
    counts_by_ref = mgr.dict()  # {header: [count1, count2, .......]}

    for header, seq in refs:
        work_queue.put((header, seq,))
        count += 1
        if count % 10000 == 0:
            _cdp_no_split_single_queue(counts_by_ref, loaded_seq_list, nt,
                                       processes, work_queue, workers)

    _cdp_no_split_single_queue(counts_by_ref, loaded_seq_list, nt,
                               processes, work_queue, workers)

    _cdp_single_output(counts_by_ref.copy(), loaded_seq_name_list, ref_file, nt)
class JavaMultipleParserExecutor:
    def __init__(self, output_dir, repo_path, processes=None):
        self.target_blobs = JoinableQueue()
        self.num_consumers = processes if processes else cpu_count()
        self.consumers = [JavaConsumer(self.target_blobs, repo_path, output_dir)
                          for i in range(self.num_consumers)]
        for consumer in self.consumers:
            consumer.start()
        self.closed = False

    def parse_blob(self, blob):
        if self.closed:
            return
        self.target_blobs.put(blob.hexsha)

    def join(self):
        if self.closed:
            return
        for i in range(self.num_consumers):
            self.target_blobs.put(None)
        self.target_blobs.join()
        self.closed = True
def processData(imageList, featuresDir, featuresExt, task):
    numProcs = 8
    taskQueue = JoinableQueue()
    resultQueue = Queue()
    processes = []

    for i in range(numProcs):
        t = Process(target=worker, args=(taskQueue, resultQueue, task))
        t.daemon = True
        t.start()
        processes.append(t)

    for img in imageList:
        filename = featuresDir + '/' + img + '.' + featuresExt
        idxFile = re.sub(r'\..+$', r'.idx', filename)
        content = open(filename)
        index = open(idxFile)
        taskQueue.put((img, content.read(), index.read()))
        #taskQueue.put( (img,filename,idxFile) )
        index.close()
        content.close()

    for i in range(len(processes)):
        taskQueue.put('stop')

    results = []
    retrieved = 0
    while retrieved < len(imageList):
        data = resultQueue.get()
        retrieved += 1
        if data != 'Ignore':
            results.append(data)
    return results
def parallelPrepareImg(img, info, name, idx):
    # Make Color Image
    if img.ndim == 2:
        img = np.tile(img[:, :, np.newaxis], (1, 1, 3))
    elif img.shape[2] == 4:
        img = img[:, :, :3]

    # Prepare processes
    numProcs = 3
    taskQueue = JoinableQueue()
    resultQueue = ProcQueue()
    processes = []
    for i in range(numProcs):
        t = Process(target=singleWindowProcess, args=(taskQueue, resultQueue, img))
        t.daemon = True
        t.start()
        processes.append(t)

    j = 0
    # Add tasks to the queue
    for b in info:
        idx.write(b[4])
        taskQueue.put((b, j))
        j += 1
    for i in range(len(processes)):
        taskQueue.put('stop')

    # Collect results
    data = np.zeros([len(info), 227, 227, 3])
    retrieved = 0
    while retrieved < len(info):
        j, win = resultQueue.get()
        data[j, :, :, :] = win
        retrieved += 1

    # Subtract mean and return
    data -= imagenet.IMAGENET_MEAN[14:241, 14:241, :]
    return data.swapaxes(2, 3).swapaxes(1, 2)
def find_vocabulary(data_dir, stats_dir, category, min_num_images, save_description):
    print "Start find vocabulary"
    filequeue = JoinableQueue()
    photoqueue = Queue()

    init_dict = initialize_variables(None, None, False)

    # Create new processes
    num_processes = cpu_count()
    temp_dir = os.path.join(stats_dir, "database_temp", "vocab", category)
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)

    processes = [FindVocabularyProcess(filequeue, photoqueue, init_dict, 30.0,
                                       num_processes, temp_dir, category)
                 for i in xrange(num_processes)]
    for p in processes:
        p.start()

    #Add the files to the process queue
    add_files_to_queue(data_dir, category, filequeue)

    #Add a poison pill for each process
    for i in xrange(num_processes):
        filequeue.put("Stop")

    for p in processes:
        p.join()

    merge_vocabulary_files(data_dir, temp_dir, min_num_images, save_description)

    print "Removing temp files"
    shutil.rmtree(temp_dir)
    print "Done with find vocabulary"
class FlightProducer(Process):
    def __init__(self, options={}, date_group=[]):
        self.options = options
        self.date_group = date_group
        self.date_queue = JoinableQueue()

    def start(self):
        consumers_list = []
        consumers_num = cpu_count() * 2

        # Consumers
        for i in xrange(consumers_num):
            consumers_list.append(FlightConsumer(self.options, self.date_queue))

        for consumer in consumers_list:
            consumer.start()

        # Put each date group to the queue
        for date_item in self.date_group:
            self.date_queue.put(date_item)

        # Tell the consumers they can exit
        for i in xrange(consumers_num):
            self.date_queue.put(None)

        # Wait for all of the consumers to finish
        self.date_queue.join()

        print('Done')
def task_writer(task: JoinableQueue):
    for n in News.objects.all()[:50].iterator():
        task.put(n)
    for i in range(PROCESS_NUM):
        task.put("end")
    print("task writer ends")
def save_transaction_list(data_dir, stats_dir, category, concept_vocabulary, save_description):
    print "Start saving transaction list"
    filequeue = JoinableQueue()

    concept_vocabulary_list, concept_vocabulary_freq = zip(*concept_vocabulary)
    init_dict = initialize_variables(concept_vocabulary_list, None, True)

    # Create new processes
    temp_dir = os.path.join(stats_dir, "transaction_list")
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)
    else:
        print "todo"

    lock = Lock()
    num_processes = cpu_count()
    processes = [TransactionListProcess(filequeue, init_dict, 30, num_processes,
                                        temp_dir, save_description, lock)
                 for i in xrange(num_processes)]
    for p in processes:
        p.start()

    #Add the files to the process queue
    add_files_to_queue(data_dir, category, filequeue)

    #Add a poison pill for each process
    for i in xrange(num_processes):
        filequeue.put("Stop")

    for p in processes:
        p.join()

    print "Removing temp files"
    shutil.rmtree(temp_dir)
    print "Done with saving transaction list"
def main(multiplier):
    # Establish communication queues
    tasks = JoinableQueue()
    results = Queue()

    # Start consumers
    num_consumers = cpu_count() * multiplier
    print 'Creating %d consumers' % num_consumers
    consumers = [Consumer(tasks, results) for i in xrange(num_consumers)]
    for w in consumers:
        w.start()

    fout = open(os.path.join(settings.PERSIST_DIR, 'doc_matrix_comparison.csv'), 'w', 0)
    rw = ResultWriter(results, csv.writer(fout))
    rw.start()

    #num_docs = 801781
    num_docs = 25
    for i in xrange(num_docs):
        tasks.put(Task(i))

    # Add a poison pill for each consumer
    for i in xrange(num_consumers):
        tasks.put(None)

    # Wait for all of the tasks to finish
    tasks.join()

    results.put('STOP')
class MMapPool(object):
    def __init__(self, n, mmap_size):
        self.n = n
        self.mmap_size = mmap_size
        self.pool = [mmap.mmap(-1, mmap_size) for _ in range(n)]
        self.free_mmaps = set(range(n))
        self.free_queue = JoinableQueue()

    def new(self):
        if not self.free_mmaps:
            self.free_mmaps.add(self.free_queue.get())
            self.free_queue.task_done()
        while True:
            try:
                self.free_mmaps.add(self.free_queue.get_nowait())
                self.free_queue.task_done()
            except Empty:
                break
        mmap_idx = self.free_mmaps.pop()
        return mmap_idx, self.pool[mmap_idx]

    def join(self):
        while len(self.free_mmaps) < self.n:
            self.free_mmaps.add(self.free_queue.get())
            self.free_queue.task_done()

    def get(self, idx):
        return self.pool[idx]

    def free(self, idx):
        self.free_queue.put(idx)
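# A small usage sketch for MMapPool above (values are assumed): a producer reserves a
# buffer with new(), a consumer hands it back with free(idx) when finished, and join()
# blocks until every buffer has been returned to the free set.
pool = MMapPool(n=4, mmap_size=1 << 20)
idx, buf = pool.new()      # reserve a free mmap buffer and write into it
buf[:5] = b'hello'
pool.free(idx)             # normally called by the consumer once it is done with buf
pool.join()                # wait until all n buffers are back in the free set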
def main(): info_string = """Individuals that are not present in ped file will not be considered in the analysis.""" parser = argparse.ArgumentParser( description="Annotate genetic models in variant files..") parser.add_argument('family_file', type=str, nargs=1, help='A pedigree file in .ped format.') parser.add_argument('variant_file', type=str, nargs=1, help='A variant file. Default is vcf format.') parser.add_argument('annotation_file', type=str, nargs=1, help='A annotations file. Default is ref_gene format.') parser.add_argument('-at', '--annotation_type', type=str, nargs=1, choices=['bed', 'ccds', 'gtf', 'ref_gene'], default=['ref_gene'], help='Specify the format of the annotation file.') parser.add_argument('--version', action="version", version=pkg_resources.require("genmod")[0].version) parser.add_argument('-v', '--verbose', action="store_true", help='Increase output verbosity.') parser.add_argument('-chr', '--chr_prefix', action="store_true", help='If chr prefix is used in vcf.') parser.add_argument('-s', '--silent', action="store_true", help='Do not print the variants.') parser.add_argument('-phased', '--phased', action="store_true", help='If data is phased use this flag.') parser.add_argument( '-o', '--outfile', type=str, nargs=1, default=[None], help='Specify the path to a file where results should be stored.') parser.add_argument( '-cadd', '--cadd_file', type=str, nargs=1, default=[None], help='Specify the path to a bgzipped cadd file with variant scores.\ If no index is present it will be created.') args = parser.parse_args() var_file = args.variant_file[0] file_name, file_extension = os.path.splitext(var_file) anno_file = args.annotation_file[0] start_time_analysis = datetime.now() # Start by parsing at the pedigree file: my_family = get_family(args) # Parse the header of the vcf: head = get_header(var_file) add_metadata(head, args) # Parse the annotation file and make annotation trees: if args.verbose: print('Parsing annotation ...') print('') start_time_annotation = datetime.now() annotation_trees = annotation_parser.AnnotationParser( anno_file, args.annotation_type[0]) if args.verbose: print('Annotation Parsed!') print('Cromosomes found in annotation file: %s' % ','.join(list(annotation_trees.gene_trees.keys()))) print('Time to parse annotation: %s' % (datetime.now() - start_time_annotation)) print('') # Check if the ccds-file is compressed and indexed: if args.cadd_file[0]: if args.verbose: print('Cadd file! %s' % args.cadd_file[0]) try: tabix_index(args.cadd_file[0], seq_col=0, start_col=1, end_col=1, meta_char='#') except IOError as e: if args.verbose: print(e) # # Check the variants: # The task queue is where all jobs(in this case batches that represents variants in a region) is put # the consumers will then pick their jobs from this queue. 
variant_queue = JoinableQueue(maxsize=1000) # The consumers will put their results in the results queue results = Manager().Queue() # Create a directory to keep track of temp files temp_dir = mkdtemp() num_model_checkers = (cpu_count() * 2 - 1) if args.verbose: print('Number of CPU:s %s' % cpu_count()) # These are the workers that do the analysis model_checkers = [ variant_consumer.VariantConsumer(variant_queue, results, my_family, args) for i in range(num_model_checkers) ] for w in model_checkers: w.start() # This process prints the variants to temporary files var_printer = variant_printer.VariantPrinter(results, temp_dir, head, args.verbose) var_printer.start() if args.verbose: print('Start parsing the variants ...') print('') start_time_variant_parsing = datetime.now() # For parsing the vcf: var_parser = vcf_parser.VariantFileParser(var_file, variant_queue, head, annotation_trees, args) var_parser.parse() for i in range(num_model_checkers): variant_queue.put(None) variant_queue.join() results.put(None) var_printer.join() chromosome_list = var_parser.chromosomes if args.verbose: print('Cromosomes found in variant file: %s' % ','.join(chromosome_list)) print('Models checked!') print('Start sorting the variants:') print('') start_time_variant_sorting = datetime.now() print_headers(args, head) for chromosome in chromosome_list: for temp_file in os.listdir(temp_dir): if temp_file.split('_')[0] == chromosome: var_sorter = variant_sorter.FileSort(os.path.join( temp_dir, temp_file), outFile=args.outfile[0], silent=args.silent) var_sorter.sort() if args.verbose: print('Sorting done!') print('Time for sorting: %s' % str(datetime.now() - start_time_variant_sorting)) print('') print('Time for whole analyis: %s' % str(datetime.now() - start_time_analysis)) # Remove all temp files: shutil.rmtree(temp_dir)
matrizA = cria_matriz(linhas, colunas)
matrizB = cria_matriz(linhas, colunas)
matrizC = numpy.zeros(shape=(linhas, colunas))

print("{}: Multiplicando matrizes".format(time.strftime('%c')))

queue = JoinableQueue()
queue_resultados = JoinableQueue()

for i in range(2):
    worker = Process(target=multiplica_linha_coluna, args=(
        queue,
        queue_resultados,
        matrizA,
        matrizB,
    ))
    worker.daemon = True
    worker.start()

for i in range(len(matrizA)):
    for j in range(len(matrizA[0])):
        queue.put((i, j))

queue.join()

while not queue_resultados.empty():
    i, j, valor = queue_resultados.get()
    matrizC[i][j] = valor
    queue_resultados.task_done()

print("{}: Resultado:{}".format(time.strftime('%c'), matrizC))
def main(_): parser = argparse.ArgumentParser(description='ProjE.') parser.add_argument('--input_dir', dest='input_dir', type=str, help="Data folder", default='./data/WN11/') parser.add_argument('--output_dir', dest='output_dir', type=str, help="Data folder", default='.output/') parser.add_argument('--lr', dest='lr', type=float, help="Learning rate", default=0.01) parser.add_argument("--dim", dest='dim', type=int, help="Embedding dimension", default=200) parser.add_argument("--batch", dest='batch', type=int, help="Batch size", default=200) parser.add_argument("--comb", dest="combination_method", type=str, help="Combination method", default='simple') parser.add_argument("--worker", dest='n_worker', type=int, help="Evaluation worker", default=3) parser.add_argument("--generator", dest='n_generator', type=int, help="Data generator", default=10) parser.add_argument("--eval_batch", dest="eval_batch", type=int, help="Evaluation batch size", default=500) parser.add_argument("--save_dir", dest='save_dir', type=str, help="Model path", default='./') parser.add_argument("--load_model", dest='load_model', type=str, help="Model file", default="") parser.add_argument("--save_per", dest='save_per', type=int, help="Save per x iteration", default=10) parser.add_argument("--eval_per", dest='eval_per', type=int, help="Evaluate every x iteration", default=1) parser.add_argument("--max_iter", dest='max_iter', type=int, help="Max iteration", default=100) parser.add_argument("--summary_dir", dest='summary_dir', type=str, help="summary directory", default='./ProjE_summary/') parser.add_argument("--keep", dest='drop_out', type=float, help="Keep prob (1.0 keep all, 0. drop all)", default=0.5) parser.add_argument("--optimizer", dest='optimizer', type=str, help="Optimizer", default='adam') parser.add_argument("--prefix", dest='prefix', type=str, help="model_prefix", default='DEFAULT') parser.add_argument("--loss_weight", dest='loss_weight', type=float, help="Weight on parameter loss", default=1e-5) parser.add_argument("--neg_weight", dest='neg_weight', type=float, help="Sampling weight on negative examples", default=0.5) args = parser.parse_args() args.input_dir = "../data/WN11/" print(args) model = ProjE(args.input_dir, embed_dim=args.dim, combination_method=args.combination_method, dropout=args.drop_out, neg_weight=args.neg_weight) train_hrt_input, train_hrt_weight, train_trh_input, train_trh_weight, \ train_loss, train_op = train_ops(model, learning_rate=args.lr, optimizer_str=args.optimizer, regularizer_weight=args.loss_weight) test_input, test_head, test_tail = test_ops(model) with tf.Session() as session: tf.initialize_all_variables().run() saver = tf.train.Saver() iter_offset = 0 if args.load_model is not None and os.path.exists(args.load_model): saver.restore(session, args.load_model) iter_offset = int( args.load_model.split('.')[-2].split('_')[-1]) + 1 print("Load model from %s, iteration %d restored." 
% (args.load_model, iter_offset)) total_inst = model.n_train # training data generator raw_training_data_queue = Queue() training_data_queue = Queue() data_generators = list() for i in range(args.n_generator): data_generators.append( Process(target=data_generator_func, args=(raw_training_data_queue, training_data_queue, model.train_tr_h, model.train_hr_t, model.n_entity, args.neg_weight))) data_generators[-1].start() evaluation_queue = JoinableQueue() result_queue = Queue() for i in range(args.n_worker): worker = Process(target=worker_func, args=(evaluation_queue, result_queue, model.hr_t, model.tr_h)) worker.start() for data_func, test_type in zip( [model.validation_data, model.testing_data], ['VALID', 'TEST']): accu_mean_rank_h = list() accu_mean_rank_t = list() accu_filtered_mean_rank_h = list() accu_filtered_mean_rank_t = list() evaluation_count = 0 for testing_data in data_func(batch_size=args.eval_batch): head_pred, tail_pred = session.run([test_head, test_tail], {test_input: testing_data}) evaluation_queue.put((testing_data, head_pred, tail_pred)) evaluation_count += 1 for i in range(args.n_worker): evaluation_queue.put(None) print("waiting for worker finishes their work") evaluation_queue.join() print("all worker stopped.") while evaluation_count > 0: evaluation_count -= 1 (mrh, fmrh), (mrt, fmrt) = result_queue.get() accu_mean_rank_h += mrh accu_mean_rank_t += mrt accu_filtered_mean_rank_h += fmrh accu_filtered_mean_rank_t += fmrt print( "[%s] INITIALIZATION [HEAD PREDICTION] MEAN RANK: %.1f FILTERED MEAN RANK %.1f HIT@10 %.3f FILTERED HIT@10 %.3f" % (test_type, np.mean(accu_mean_rank_h), np.mean(accu_filtered_mean_rank_h), np.mean(np.asarray(accu_mean_rank_h, dtype=np.int32) < 10), np.mean( np.asarray(accu_filtered_mean_rank_h, dtype=np.int32) < 10 ))) print( "[%s] INITIALIZATION [TAIL PREDICTION] MEAN RANK: %.1f FILTERED MEAN RANK %.1f HIT@10 %.3f FILTERED HIT@10 %.3f" % (test_type, np.mean(accu_mean_rank_t), np.mean(accu_filtered_mean_rank_t), np.mean(np.asarray(accu_mean_rank_t, dtype=np.int32) < 10), np.mean( np.asarray(accu_filtered_mean_rank_t, dtype=np.int32) < 10 ))) for n_iter in range(iter_offset, args.max_iter): start_time = timeit.default_timer() accu_loss = 0. accu_re_loss = 0. 
ninst = 0 print("initializing raw training data...") nbatches_count = 0 for dat in model.raw_training_data(batch_size=args.batch): raw_training_data_queue.put(dat) nbatches_count += 1 print("raw training data initialized.") while nbatches_count > 0: nbatches_count -= 1 hr_tlist, hr_tweight, tr_hlist, tr_hweight = training_data_queue.get( ) l, rl, _ = session.run( [train_loss, model.regularizer_loss, train_op], { train_hrt_input: hr_tlist, train_hrt_weight: hr_tweight, train_trh_input: tr_hlist, train_trh_weight: tr_hweight }) accu_loss += l accu_re_loss += rl ninst += len(hr_tlist) + len(tr_hlist) if ninst % (5000) is not None: print( '[%d sec](%d/%d) : %.2f -- loss : %.5f rloss: %.5f ' % (timeit.default_timer() - start_time, ninst, total_inst, float(ninst) / total_inst, l / (len(hr_tlist) + len(tr_hlist)), args.loss_weight * (rl / (len(hr_tlist) + len(tr_hlist)))), end='\r') print("") print("iter %d avg loss %.5f, time %.3f" % (n_iter, accu_loss / ninst, timeit.default_timer() - start_time)) if n_iter % args.save_per == 0 or n_iter == args.max_iter - 1: save_path = saver.save( session, os.path.join( args.save_dir, "ProjE_" + str(args.prefix) + "_" + str(n_iter) + ".ckpt")) print("Model saved at %s" % save_path) if n_iter % args.eval_per == 0 or n_iter == args.max_iter - 1: for data_func, test_type in zip( [model.validation_data, model.testing_data], ['VALID', 'TEST']): accu_mean_rank_h = list() accu_mean_rank_t = list() accu_filtered_mean_rank_h = list() accu_filtered_mean_rank_t = list() evaluation_count = 0 for testing_data in data_func(batch_size=args.eval_batch): head_pred, tail_pred = session.run( [test_head, test_tail], {test_input: testing_data}) evaluation_queue.put( (testing_data, head_pred, tail_pred)) evaluation_count += 1 for i in range(args.n_worker): evaluation_queue.put(None) print("waiting for worker finishes their work") evaluation_queue.join() print("all worker stopped.") while evaluation_count > 0: evaluation_count -= 1 (mrh, fmrh), (mrt, fmrt) = result_queue.get() accu_mean_rank_h += mrh accu_mean_rank_t += mrt accu_filtered_mean_rank_h += fmrh accu_filtered_mean_rank_t += fmrt print( "[%s] ITER %d [HEAD PREDICTION] MEAN RANK: %.1f FILTERED MEAN RANK %.1f HIT@10 %.3f FILTERED HIT@10 %.3f" % (test_type, n_iter, np.mean(accu_mean_rank_h), np.mean(accu_filtered_mean_rank_h), np.mean( np.asarray(accu_mean_rank_h, dtype=np.int32) < 10 ), np.mean( np.asarray(accu_filtered_mean_rank_h, dtype=np.int32) < 10))) print( "[%s] ITER %d [TAIL PREDICTION] MEAN RANK: %.1f FILTERED MEAN RANK %.1f HIT@10 %.3f FILTERED HIT@10 %.3f" % (test_type, n_iter, np.mean(accu_mean_rank_t), np.mean(accu_filtered_mean_rank_t), np.mean( np.asarray(accu_mean_rank_t, dtype=np.int32) < 10 ), np.mean( np.asarray(accu_filtered_mean_rank_t, dtype=np.int32) < 10)))
number_of_processes = 6

if os.path.exists(h5filename):
    os.remove(h5filename)
    print(f'h5 file :{h5filename} removed !!!')

hdf5_file = h5py.File(h5filename, mode='w', driver='core')
hdf5_file.create_dataset('train_images', train_shape, dtype=np.uint8, compression='lzf')
hdf5_file.create_dataset('train_labels', shape=(total_images, len(labels)),
                         maxshape=(None, 2), dtype="S10", compression='lzf')
hdf5_file.create_dataset('train_img_names',
                         shape=(total_images, len('8c82ae834697bc55a742cc6001f29ace30e46d9a')),
                         maxshape=(None, 40), dtype="S10", compression='lzf')

for _ in range(number_of_processes):
    p = Process(target=worker, args=())
    p.start()
    processes.append(p)

print('started !')

for item in source():
    tasks_to_accomplish.put(item)

for i in range(number_of_processes):
    tasks_to_accomplish.put(None)

while not tasks_to_accomplish.empty():
    print(f'sleep .5 and size is {tasks_to_accomplish.qsize()}')
    time.sleep(.5)

print('queue is empty !')

for p in processes:
    p.join()
    print(f'process {p.name} joined !!!')

hdf5_file.flush()
hdf5_file.close()
class QiuBai:
    """qiubai spider"""

    def __init__(self):
        self.url = 'https://www.qiushibaike.com/8hr/page/{}'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
        }
        self.url_queue = Queue()
        self.html_queue = Queue()
        self.content_list_queue = Queue()

    def get_url_list(self):
        """build the url_list"""
        # return [self.url.format(i) for i in range(1, 14)]
        for i in range(1, 14):
            self.url_queue.put(self.url.format(i))

    def parse_url(self):
        """fetch the responses"""
        while True:
            # print(url)
            url = self.url_queue.get()
            response = requests.get(url, headers=self.headers)
            # return response.content.decode()
            print(response)
            if response.status_code != 200:
                self.url_queue.put(url)
            else:
                self.html_queue.put(response.content.decode())
            self.url_queue.task_done()

    def get_content_list(self):
        """extract the data"""
        while True:
            html_str = self.html_queue.get()
            html = etree.HTML(html_str)
            div_list = html.xpath("//div[@id='content-left']/div")
            content_list = []
            for div in div_list:
                item = {}
                item['user_name'] = div.xpath('.//h2/text()')[0].strip()
                item['content'] = [
                    i.strip() for i in div.xpath(
                        './/div[@class = "content"]/span/text()')
                ]
                content_list.append(item)
            self.content_list_queue.put(content_list)
            self.html_queue.task_done()

    def save_content(self):
        """save the data"""
        while True:
            content_list = self.content_list_queue.get()
            for content in content_list:
                # print(content)
                with open('process.txt', 'a+', encoding='utf-8') as f:
                    f.write(str(content))
            self.content_list_queue.task_done()

    def run(self):
        thread_list = []
        # 1. prepare the url list
        t_url = Process(target=self.get_url_list)
        thread_list.append(t_url)
        # 2. send requests and collect responses
        for i in range(3):
            t_parse = Process(target=self.parse_url)
            thread_list.append(t_parse)
        # 3. extract the data
        t_content = Process(target=self.get_content_list)
        thread_list.append(t_content)
        # 4. save the data
        t_save = Process(target=self.save_content)
        thread_list.append(t_save)
        for process in thread_list:
            process.daemon = True  # make the child processes daemons
            process.start()
        for q in [self.url_queue, self.html_queue, self.content_list_queue]:
            q.join()  # block the main process until each queue's count reaches zero
        rel.append(Element('member', dict(type='way', ref=way.attrib['id'])))
        osm.append(way)

    osm.append(rel)
    return ElementTree(osm)


if __name__ == '__main__':
    queue = JoinableQueue()
    group_writer = Process(target=write_groups, args=(queue, ))
    group_writer.start()

    db = connect(host='localhost', user='******', database='gis',
                 password='******').cursor()
    relations = get_relations_list(db)

    for group in gen_relation_groups(relations):
        queue.put(group)
        print >> stderr, '-->', len(group), 'relations'
        print >> stderr, '-' * 80

    group_writer.join()
class DeepZoomStaticTiler(object): """Handles generation of tiles and metadata for all images in a slide.""" def __init__(self, slidepath, basename, format, tile_size, overlap, limit_bounds, quality, workers, with_viewer): if with_viewer: # Check extra dependency before doing a bunch of work import jinja2 self._slide = open_slide(slidepath) self._basename = basename self._format = format self._tile_size = tile_size self._overlap = overlap self._limit_bounds = limit_bounds self._queue = JoinableQueue(2 * workers) self._workers = workers self._with_viewer = with_viewer self._dzi_data = {} for _i in range(workers): TileWorker(self._queue, slidepath, tile_size, overlap, limit_bounds, quality).start() def run(self): self._run_image() if self._with_viewer: for name in self._slide.associated_images: self._run_image(name) self._write_html() self._write_static() self._shutdown() def _run_image(self, associated=None): """Run a single image from self._slide.""" if associated is None: image = self._slide if self._with_viewer: basename = os.path.join(self._basename, VIEWER_SLIDE_NAME) else: basename = self._basename else: image = ImageSlide(self._slide.associated_images[associated]) basename = os.path.join(self._basename, self._slugify(associated)) dz = DeepZoomGenerator(image, self._tile_size, self._overlap, limit_bounds=self._limit_bounds) tiler = DeepZoomImageTiler(dz, basename, self._format, associated, self._queue) tiler.run() self._dzi_data[self._url_for(associated)] = tiler.get_dzi() def _url_for(self, associated): if associated is None: base = VIEWER_SLIDE_NAME else: base = self._slugify(associated) return '%s.dzi' % base def _write_html(self): import jinja2 env = jinja2.Environment(loader=jinja2.PackageLoader(__name__), autoescape=True) template = env.get_template('slide-multipane.html') associated_urls = dict((n, self._url_for(n)) for n in self._slide.associated_images) try: mpp_x = self._slide.properties[openslide.PROPERTY_NAME_MPP_X] mpp_y = self._slide.properties[openslide.PROPERTY_NAME_MPP_Y] mpp = (float(mpp_x) + float(mpp_y)) / 2 except (KeyError, ValueError): mpp = 0 # Embed the dzi metadata in the HTML to work around Chrome's # refusal to allow XmlHttpRequest from file:///, even when # the originating page is also a file:/// data = template.render(slide_url=self._url_for(None), slide_mpp=mpp, associated=associated_urls, properties=self._slide.properties, dzi_data=json.dumps(self._dzi_data)) with open(os.path.join(self._basename, 'index.html'), 'w') as fh: fh.write(data) def _write_static(self): basesrc = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'static') basedst = os.path.join(self._basename, 'static') self._copydir(basesrc, basedst) self._copydir(os.path.join(basesrc, 'images'), os.path.join(basedst, 'images')) def _copydir(self, src, dest): if not os.path.exists(dest): os.makedirs(dest) for name in os.listdir(src): srcpath = os.path.join(src, name) if os.path.isfile(srcpath): shutil.copy(srcpath, os.path.join(dest, name)) @classmethod def _slugify(cls, text): text = normalize('NFKD', text.lower()).encode('ascii', 'ignore').decode() return re.sub('[^a-z0-9]+', '_', text) def _shutdown(self): for _i in range(self._workers): self._queue.put(None) self._queue.join()
class Sprendimas: tkList = [] xList = [] yList = [] N = 10**5 lmd = 2.9 t0 = 0 tN = None rez = 1 threads_quantity = None threadsList = [] threads_intervalList = [[]] queuePoints = None safeQueue = JoinableQueue() fmin = 10**-6 # UZDUOTIES GRAFIKAS fmax = 10**0 #Konstruktorius def __init__(self, fmin, fmax, lamda, N, rezoliucija, threads=None): self.fmin = fmin self.fmax = fmax self.lmd = lamda self.N = N self.rez = rezoliucija self.FindTn() if (threads != None): self.threads_quantity = threads self.queuePoints = JoinableQueue() self.FindPointsXY() def PoissonRandom(self, lmd): # Poisson taskinio proceso pasiskirstymas a = np.e**(-1.0 * lmd) r = 1 n = -1 while r > a: u = np.random.random(1) r *= u n += 1 return n # Generuojamas Puasso list ir tN def FindTn( self ): # Pagal: Tk = Tk-1 + P(lmd)k; P(lmd)-Poiss random nr; T0 = 0 lmd = self.lmd N = self.N tk = self.t0 for k in range(1, N + 1): #tk += np.random.poisson(lmd) #rankom tk += self.PoissonRandom(lmd) self.tkList.append(tk) continue self.tN = tk # N ir lmd - ivedami; Gaunam: tList ir Tn(tList pask. nr.) return # Apskaiciuojam funkcija S(f) arba kitaip y pagal duota formule def FindSf(self, f, t0, tN, N, tkList): c = 0.0 s = 0.0 for k in range(0, N): # Sigma susumavimas sin ir cos reiskiniu c += np.cos(2.0 * np.pi * f * tkList[k]) s += np.sin(2.0 * np.pi * f * tkList[k]) c *= c # Keliam kvadratu susumuota reiskini s *= s Sf = (2.0 / (tN - t0)) * (c + s) return Sf #Apskaiciuojamas daznio f kordinates (x, y) def FindPointsXY(self): tN = self.tN N = self.N rez = self.rez fmin = self.fmin #Tikrieji sk. 10**-6 fmax = self.fmax t0 = self.t0 tkList = self.tkList step = ( math.log10(fmax) - math.log10(fmin) ) / rez # surandame zingsnio ilgi h, bet skaiciuojam pagal log10 skale start_time = time.time() # Pasirenkamas budas skaiciavimui atlikti: if (self.threads_quantity == None): self.CalculatePointsXY_simple(t0, tN, tkList, N, rez, fmin, step) else: self.CalculatePointsXY_multithreading(t0, tN, tkList, N, rez, fmin, step) print("Skaiciavimu trukme: %s s" % (time.time() - start_time)) return def CalculatePointsXY_simple(self, t0, tN, tkList, N, rez, fmin, step): if (self.fmax == 1): rez -= 1 # !@@@ fmax == 10**0, ty fj==fmax, iskreips grafika for j in range( 0, rez + 1): # k - zinsniu kiekis, kiek h reikia padaryti fmin -> fmax; fj = fmin * 10**( j * step ) # formule: gaunam f pagal duota rezoliucija, kitaip x koord Sfj = self.FindSf(fj, t0, tN, N, tkList) # S(f) - y koord self.xList.append(fj) self.yList.append(Sfj) return def CalculatePointsXY_multithreading(self, t0, tN, tkList, N, rez, fmin, step): intervalsList = self.threads_intervalList threads_q = self.threads_quantity queuePoints = self.queuePoints if (self.fmax != 1): rez += 1 # !@@@ Jei fmax == 10**0, ty bus fj==1, iskreips grafika sveikojiDalis = int(rez / threads_q) liekana = rez % threads_q begin = 0 end = sveikojiDalis # Veiksmai su Procesais for threadNumber in range( 0, threads_q ): # Rezoliucijos intervalas padalinamas i intervaliukus procesams if (liekana > 0): end += 1 liekana -= 1 intervalsList.insert(threadNumber, [begin, end]) begin = end end = end + sveikojiDalis # Proceso kurimas thread = Process(target=self.ThreadJob, args=(threadNumber, intervalsList, step, fmin, t0, tN, tkList, N)) self.threadsList.append(thread) self.threadsList[threadNumber].start( ) # Startuojamas def ThreadJob() #Threadu intervaliniu listu sujungimas i baigtinius List X, Y time.sleep(5) # main thread miega 5sec, procesai dirba self.WaitUntil_ProcessOver( queuePoints, threads_q ) # Kadangi join() ir 
terminate() neveikia, MECHANISKAS join() self.FormatData_toList( queuePoints ) # Is Sync Queue issitraukiam data ir paverciam i listus return def ThreadJob(self, threadNumber, intervalsList, step, fmin, t0, tN, tkList, N): xList_Inner = [] yList_Inner = [] interval = intervalsList[threadNumber] print('Process nr:' + str(threadNumber) + ' skaiciavimu intervalas: ' + str(interval[1] - 1)) xList_Inner.append( threadNumber) # !@@@@ xList_Inner[0] uzkoduojamas threadNr for j in range( interval[0], interval[1] ): # j - zinsniu kiekis, kiek h reikia padaryti fmin -> fmax; fj = fmin * 10**( j * step ) # formule: gaunam f pagal duota rezoliucija, kitaip x koord ###### S(f) apskaiciavimas ####### c = 0.0 s = 0.0 for k in range(0, N): # Sigma susumavimas sin ir cos reiskiniu c += np.cos(2.0 * np.pi * fj * tkList[k]) s += np.sin(2.0 * np.pi * fj * tkList[k]) c *= c # Keliam kvadratu susumuota reiskini s *= s Sfj = (2.0 / (tN - t0)) * (c + s) # S(f) - y koord xList_Inner.append(fj) yList_Inner.append(Sfj) time.sleep(0.001) # Kad procesoriaus neperkaistu continue pList = [xList_Inner, yList_Inner] self.queuePoints.put(pList) self.queuePoints.task_done() sys.exit( ) # Paprograme (Process) pats save susinaikina sistemoje "End Task" def WaitUntil_ProcessOver(self, queuePoints, threads_q): working = True while (working): if (queuePoints.qsize() == threads_q): working = False break else: time.sleep(1) return def FormatData_toList(self, queuePoints): # Is Sync Queue issitraukiam data ir paverciam i listus for i in range(queuePoints.qsize()): # Gali reikti que.size naudoti pList = queuePoints.get() listX = pList[0] listY = pList[1] threadNr = listX[0] listX.remove(listX[0]) self.xList.insert(threadNr, listX) self.yList.insert(threadNr, listY) # Sujungiam lista (is list[[]] i list[]) self.xList = list(itertools.chain.from_iterable(self.xList)) self.yList = list(itertools.chain.from_iterable(self.yList)) return
"""
pushes inputs into the input queue using the output of a generator
"""
from multiprocessing import Queue, JoinableQueue

from output import OutThread
from worker import WorkerProcess
from alphaGenerator import alphaGen

if __name__ == '__main__':
    WORKERS = 2

    inq = JoinableQueue(maxsize=int(WORKERS * 1.5))
    outq = Queue(maxsize=int(WORKERS * 1.5))

    ot = OutThread(WORKERS, outq, sorting=False)
    ot.start()

    for i in range(WORKERS):
        w = WorkerProcess(inq, outq)
        w.start()

    instring = alphaGen(10)

    # feed the process pool with work units
    for work in enumerate(instring):
        inq.put(work)

    # terminate the process pool
    for i in range(WORKERS):
        inq.put(None)

    inq.join()
    print("input is ", instring)
    print("Control process terminating")
def runit(self, args): # pylint:disable=too-many-locals """ This is the entry point for run_ingest_threads.py """ self.spec_file = args["spec_file"].strip() self.credentials_file = args["credentials_file"].strip() self.path = args["path"].strip() self.fmask = args["file_name_mask"].strip() self.thread_count = args["threads"] self.output_dir = args["output_dir"].strip() if "file_pattern" in args.keys(): self.file_pattern = args["file_pattern"].strip() # # Read the load_spec file # try: logging.debug("load_spec filename is %s", self.spec_file) load_spec_file = LoadYamlSpecFile({"spec_file": self.spec_file}) # read in the load_spec file self.load_spec = dict(load_spec_file.read()) # put the real credentials into the load_spec self.cb_credentials = self.get_credentials(self.load_spec) # stash the load_job self.load_spec["load_job_doc"] = self.build_load_job_doc() # get the ingest document id. # NOTE: in future we may make this (ingest_document_id) a list # and start each VxIngestManager with its own ingest_document_id self.ingest_document_id = self.load_spec["ingest_document_id"] # establish connections to cb, collection self.connect_cb() except (RuntimeError, TypeError, NameError, KeyError): logging.error( "*** Error occurred in Main reading load_spec %s: %s ***", self.spec_file, str(sys.exc_info()), ) sys.exit("*** Error reading load_spec: " + self.spec_file) self.ingest_document = self.collection.get(self.ingest_document_id).content # load the my_queue with filenames that match the mask and have not already been ingested # (do not have associated datafile documents) # Constructor for an infinite size FIFO my_queue _q = JoinableQueue() file_names = [] # get the urls (full_file_names) from all the datafiles for this type of ingest file_query = """ SELECT url, mtime FROM mdata WHERE subset='metar' AND type='DF' AND fileType='netcdf' AND originType='madis' order by url; """ file_names = self.get_file_list(file_query, self.path, self.file_pattern) for _f in file_names: _q.put(_f) # instantiate ingest_manager pool - each ingest_manager is a process # thread that uses builders to process one file at a time from the queue # Make the Pool of ingest_managers ingest_manager_list = [] for thread_count in range(int(self.thread_count)): # noinspection PyBroadException try: self.load_spec["fmask"] = self.fmask ingest_manager_thread = VxIngestManager( "VxIngestManager-" + str(thread_count), self.load_spec, self.ingest_document, _q, self.output_dir, ) ingest_manager_list.append(ingest_manager_thread) ingest_manager_thread.start() except Exception as _e: # pylint:disable=broad-except logging.error("*** Error in VxIngestManager %s***", str(_e)) # be sure to join all the threads to wait on them finished = [proc.join() for proc in ingest_manager_list] self.write_load_job_to_files() logging.info("finished starting threads") load_time_end = time.perf_counter() load_time = timedelta(seconds=load_time_end - self.load_time_start) logging.info(" finished %s", str(finished)) logging.info(" >>> Total load a_time: %s", str(load_time)) logging.info("End a_time: %s", str(datetime.now())) logging.info("--- *** --- End --- *** ---")
def play(): ''' Play a game connecting to the server ''' game = TablutGame() game_state = game.initial if conf.DEBUG: print(game_state) heu.print_heuristic(game, game_state) ttable = strat.TT() heu_tt = strat.TT() enemy_move = None try: state_queue = JoinableQueue(2) action_queue = JoinableQueue(1) conn = Connector( conf.SERVER_IP, conf.PLAYER_SERVER_PORT, conf.PLAYER_NAME, state_queue, action_queue, gutils.is_black(conf.PLAYER_ROLE) ) conn.start() get_state(state_queue) if gutils.is_black(conf.PLAYER_ROLE): pawns, _ = get_state(state_queue) enemy_move = gutils.from_pawns_to_move( game_state.pawns, pawns, game_state.to_move ) game_state = game.result(game_state, enemy_move) if conf.DEBUG: print(f'Enemy move: {enemy_move}') print(game_state) heu.print_heuristic(game, game_state) elapsed_time = 0 while not game_state.is_terminal: if game.turn % 10 == 0: heu_tt.clear() if game.turn % 5 == 0: ttable.clear() game.inc_turn() if conf.DEBUG: print(f'Turn {game.turn}') conf.MOVE_TIMEOUT = ( conf.GIVEN_MOVE_TIMEOUT - conf.MOVE_TIME_OVERHEAD - elapsed_time ) my_move = get_move( game, game_state, conf.MY_PLAYER, prev_move=None, timeout=conf.MOVE_TIMEOUT, max_depth=4, tt=ttable, heu_tt=heu_tt, max_it=1000 ) start_time = timeit.default_timer() action_queue.put((my_move, game_state.to_move)) action_queue.join() get_state(state_queue) game_state = game.result(game_state, my_move) elapsed_time = timeit.default_timer() - start_time if conf.DEBUG: print(f'My move: {my_move}') print(game_state) heu.print_heuristic(game, game_state) if game_state.is_terminal: break pawns, _ = get_state(state_queue) enemy_move = gutils.from_pawns_to_move( game_state.pawns, pawns, game_state.to_move ) game_state = game.result(game_state, enemy_move) if conf.DEBUG: print(f'Enemy move: {enemy_move}') print(game_state) heu.print_heuristic(game, game_state) except Exception: if conf.DEBUG: print(traceback.format_exc()) finally: conn.terminate() conn.join() if conf.DEBUG: if game_state.is_terminal: winner = game.utility( game_state, gutils.from_player_role_to_type(conf.PLAYER_ROLE) ) print('WIN' if winner == 1 else 'LOSE' if winner == -1 else 'DRAW') else: print('ERROR')
class NXServer(NXDaemon): def __init__(self, directory=None, node_file=None): self.pid_name = 'nxserver' if directory: self.directory = directory = os.path.realpath(directory) else: self.directory = os.getcwd() self.task_directory = os.path.join(directory, 'tasks') if 'tasks' not in os.listdir(directory): os.mkdir(self.task_directory) self.task_list = os.path.join(self.task_directory, 'task_list') if not os.path.exists(self.task_list): os.mkfifo(self.task_list) if node_file is None: self.node_file = os.path.join(self.task_directory, 'nodefile') else: self.node_file = node_file self.nodes = self.read_nodes(self.node_file) self.log_file = os.path.join(self.task_directory, 'nxserver.log') self.pid_file = os.path.join(self.task_directory, 'nxserver.pid') self.tasks = None self.results = None self.workers = [] super(NXServer, self).__init__(self.pid_name, self.pid_file) db_file = os.path.join(self.task_directory, 'nxdatabase.db') nxdb.init('sqlite:///' + db_file) def read_nodes(self, node_file): """Read available nodes""" if os.path.exists(node_file): with open(node_file) as f: nodes = [ line.strip() for line in f.readlines() if line.strip() != '' ] else: nodes = [] return nodes def log(self, message): with open(self.log_file, 'a') as f: f.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ' ' + str(message) + '\n') def run(self): """ Create worker processes to process commands from the task_fifo Create a worker for each node, read commands from task_list, submit an NXTask for each command to a JoinableQueue """ self.log('Starting server (pid={})'.format(os.getpid())) self.tasks = JoinableQueue() self.results = Queue() self.workers = [ NXWorker(node, self.tasks, self.results, self.log_file) for node in self.nodes ] for worker in self.workers: worker.start() task_fifo = open(self.task_list, 'r') while True: time.sleep(5) command = task_fifo.readline()[:-1] if command == 'stop': break elif command: self.tasks.put(NXTask(self.directory, command)) for worker in self.workers: self.tasks.put(None) self.tasks.join() for worker in self.workers: worker.terminate() worker.join() self.log("Stopping server") super(NXServer, self).stop() def stop(self): if self.is_running(): self.add_task('stop') else: super(NXServer, self).stop() def clear(self): if os.path.exists(self.task_list): os.remove(self.task_list) os.mkfifo(self.task_list) def add_task(self, command): """Add a task to the server queue""" task_fifo = os.open(self.task_list, os.O_RDWR) os.write(task_fifo, (command + '\n').encode())
queue_server_name = os.environ.get("WARMTENSOR_SERVER","simple-queue-server-0") queue_server_port = int(os.environ.get("WARMTENSOR_SERVER_PORT","50001")) queue_recv_size = int(os.environ.get("WARMTENSOR_SERVER_RECV","4000000")) tf_intra_op = int(os.environ.get("WARMTENSOR_INTRA_OP","36")) tf_inter_op = int(os.environ.get("WARMTENSOR_INTER_OP","1")) times = timer() from multiprocessing import JoinableQueue from multiprocessing import Queue from multiprocessing import Pool queue_of_batches = JoinableQueue() imgs_per_process_queue = Queue() confidence_queue = Queue() for i in range(int(num_processes)): queue_of_batches.put(object()) tf_pool = Pool(int(num_processes), classify, (queue_of_batches, imgs_per_process_queue, confidence_queue, classifier_filename, model, image_size, max_batch_size, queue_server_name, queue_server_port, queue_recv_size, tf_intra_op, tf_inter_op)) queue_of_batches.join() endtime = timer()-times throughput = 0 avg_confidence = 0 for i in range(int(num_processes)): throughput+=imgs_per_process_queue.get(True) avg_confidence+=confidence_queue.get(True) with open("/relevant_metrics","w") as relevant_metrics: relevant_metrics.write('%.3f ImagesPerSecond \n %.3f AverageConfidence\n' % (throughput / endtime, avg_confidence / num_processes))
class Master(): def __init__(self, args): self.tasks = JoinableQueue() self.results = Queue() self.workers = [] last_savepath = None if args.params_folder: logger.configure(args.params_folder + '/') else: logger.configure('params/') checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) if args.last_save_params: list_of_params = glob.glob(checkdir + '/*') if len(list_of_params) > 0: last_savepath=max(list_of_params, key=osp.getctime) print('Loading from %s' % last_savepath) NUM_LOOPS = int(args.num_loops) NUM_ENVS = int(args.num_envs) for i in range(NUM_WORKERS): self.workers.append(Worker(self.tasks, self.results, i)) for w in self.workers: w.start() pid = os.getpid() py = psutil.Process(pid) config = tf.ConfigProto() config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: model = ppo2.Model(policy=policies.CnnPolicy, ob_space=env.observation_space, ac_space=env.action_space, nbatch_act=nbatch_act, nsteps=steps_per_ep, nbatch_train=nbatch_train, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) if last_savepath: model.load(last_savepath) num = re.search('\d+$', last_savepath)[0] last_savepath = int(num) + 1 else: last_savepath = 1 last_savepath = osp.join(checkdir, str(last_savepath)) with tf.variable_scope('model'): params = model.get_params() print("Loading params for workers") for w in self.workers: self.tasks.put((LOAD_SIGNAL, params)) self.tasks.join() # block for step in range(NUM_LOOPS): exps = [] for i in range(NUM_ENVS): self.tasks.put((TRAIN_SIGNAL, i)) self.tasks.join() # block print("step %d completed" % step) while not self.results.empty(): exps.append(self.results.get()) assert len(exps) == NUM_ENVS nbatch = steps_per_ep inds = np.arange(nbatch) grads = [] for _ in range(noptepochs): np.random.shuffle(inds) for start in range(0, nbatch, nbatch_train): mbinds = inds[start:start + nbatch_train] all_slices = [] for exp in exps: obs, returns, masks, actions, values, neglogpacs, states, epinfos = exp slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) all_slices.append(slices) grads.append(model.grad(lr, cliprange, *slices)) print("Done running workers on all envs, now merging gradients") avg_grads = [] # for each gradient variable (NOT gradient colleted from experience), for i in range(len(grads[0])): # pool together the grads i from each worker j total_grad = grads[0][i] for j in range(1, len(grads)): total_grad += grads[j][i] avg_grads.append(total_grad / len(grads)) print("Finished merging gradients, now applying") model.joint_train2(lr, avg_grads) params = model.get_params() for w in self.workers: self.tasks.put((LOAD_SIGNAL, params)) self.tasks.join() memUse = py.memory_info()[0]/2.**30 print('memory use: %.6f GB from master' %(memUse)) model.save(last_savepath) print("sending kill signal to workers") for w in self.workers: self.tasks.put((KILL_SIGNAL, DUMMY_ENV)) print("wrap up") self.tasks.join() while not self.results.empty(): print(self.results.get())
with open(args.align_timepoints, 'r') as f: timepoints = json.load(f) else: timepoints = None # Setup readers, workers and aggregators according to command line parameters progress_monitor = manager.tqdm(total=len(args.endpoint_files)) pipeline = ParallelPipeline(steps=[ (HDF5Reader, {'hdf5_group': 'endpoints', 'progressbar': progress_monitor}, args.n_readers, 300), (PatientProcessor, {'processing_function': process_patient, 'error_list': error_list, 'function_args': {'duration': pd.DateOffset(hours=args.duration), 'dt': pd.DateOffset(hours=args.dt), 'timepoints': timepoints}}, args.n_workers), (Aggregator, {'output_dict': results, 'error_list': error_list}, 1) ], input_queue=inputfile_queue) # Add input file to queue for f in args.endpoint_files: inputfile_queue.put(f) # Start all processes and setup intermediate queues pipeline.run() pipeline.wait_for_completion() progress_monitor.close() # TODO: Write the errors out into a json file to allow later analysis if len(error_list) > 0: print('Errors occurred during processing:') for e in error_list: print(e) print('Writing results to output file') with open(args.output, 'w') as f:
class elasticBeacon(object): """ Elastic Beacon is designed to identify periodic communication between network communicatiors. Future updates will allow for dynamic fields to be passed in. If you do not allow your elastic search server to communicate externally, you can setup an ssh tunnel by using ssh -NfL 9200:localhost:9200 username@yourserver Otherwise, you'll need to adjust es_host to the IP address that is exposed to elasticSearch. """ def __init__(self, config_in=None, min_occur=10, min_percent=5, window=2, threads=8, period=24, min_interval=2, es_host='localhost', es_port=9200, es_timeout=480, es_index='logstash-flow-*', kibana_version='4', verbose=True, debug=True): """ :param min_occur: Minimum number of triads to be considered beaconing :param min_percent: Minimum percentage of all connection attempts that must fall within the window to be considered beaconing :param window: Size of window in seconds in which we group connections to determine percentage, using a large window size can give inaccurate interval times, multiple windows contain all interesting packets, so the first window to match is the interval :param threads: Number of cores to use :param period: Number of hours to locate beacons for :param min_interval: Minimum interval betweeen events to consider for beaconing behavior :param es_host: IP Address of elasticsearch host (default is localhost) :param es_timeout: Sets timeout to 480 seconds :param kibana_version: 4 or 5 (query will depend on version) """ #self.config_in = config_in if config_in is not None: try: self.config = flareConfig(config_in) self.es_host = self.config.get('beacon', 'es_host') self.es_port = int(self.config.get('beacon', 'es_port')) self.es_index = self.config.get('beacon', 'es_index') self.use_ssl = self.config.config.getboolean( 'beacon', 'use_ssl') self.MIN_OCCURRENCES = int( self.config.get('beacon', 'min_occur')) self.MIN_PERCENT = int(self.config.get('beacon', 'min_percent')) self.WINDOW = int(self.config.get('beacon', 'window')) self.NUM_PROCESSES = int(self.config.get('beacon', 'threads')) self.period = int(self.config.get('beacon', 'period')) self.min_interval = int( self.config.get('beacon', 'min_interval')) self.es_timeout = int(self.config.get('beacon', 'es_timeout')) self.kibana_version = self.config.get('beacon', 'kibana_version') self.beacon_src_ip = self.config.get('beacon', 'field_source_ip') self.beacon_dest_ip = self.config.get('beacon', 'field_destination_ip') self.beacon_destination_port = self.config.get( 'beacon', 'field_destination_port') self.beacon_timestamp = self.config.get( 'beacon', 'field_timestamp') self.beacon_flow_bytes_toserver = self.config.get( 'beacon', 'field_flow_bytes_toserver') self.beacon_flow_id = self.config.get('beacon', 'field_flow_id') self.beacon_event_key = self.config.get('beacon', 'event_key') self.beacon_event_type = self.config.get( 'beacon', 'event_type') self.filter = self.config.get('beacon', 'filter') self.verbose = self.config.config.getboolean( 'beacon', 'verbose') self.auth_user = self.config.config.get('beacon', 'username') self.auth_password = self.config.config.get( 'beacon', 'password') self.suricata_defaults = self.config.config.getboolean( 'beacon', 'suricata_defaults') try: self.debug = self.config.config.getboolean( 'beacon', 'debug') except: self.debug = debug except Exception as e: print(( '{red}[FAIL]{endc} Could not properly load your config!\nReason: {e}' .format(red=bcolors.FAIL, endc=bcolors.ENDC, e=e))) sys.exit(0) else: self.es_host = es_host self.es_port = es_port 
self.es_index = es_index self.use_ssl = False self.MIN_OCCURRENCES = min_occur self.MIN_PERCENT = min_percent self.WINDOW = window self.NUM_PROCESSES = threads self.period = period self.min_interval = min_interval self.kibana_version = kibana_version self.es_timeout = es_timeout self.beacon_src_ip = 'src_ip' self.beacon_dest_ip = 'dest_ip' self.beacon_destination_port = 'dest_port' self.beacon_timestamp = '@timestamp' self.beacon_flow_bytes_toserver = 'bytes_toserver' self.beacon_flow_id = 'flow_id' self.beacon_event_type = 'flow' self.beacon_event_key = 'event_type' self.filter = '' self.verbose = verbose self.suricata_defaults = False self.debug = debug self.ver = {'4': {'filtered': 'query'}, '5': {'bool': 'must'}} self.filt = list(self.ver[self.kibana_version].keys())[0] self.query = list(self.ver[self.kibana_version].values())[0] self.whois = WhoisLookup() self.info = '{info}[INFO]{endc}'.format(info=bcolors.OKBLUE, endc=bcolors.ENDC) self.success = '{green}[SUCCESS]{endc}'.format(green=bcolors.OKGREEN, endc=bcolors.ENDC) self.fields = [ self.beacon_src_ip, self.beacon_dest_ip, self.beacon_destination_port, self.beacon_flow_bytes_toserver, 'dest_degree', 'occurrences', 'percent', 'interval' ] try: _ = (self.auth_user, self.auth_password) self.auth = "Enabled" except AttributeError as e: self.auth = "None" try: self.vprint( '{info}[INFO]{endc} Attempting to connect to elasticsearch...'. format(info=bcolors.OKBLUE, endc=bcolors.ENDC)) if self.auth == "None": self.es = Elasticsearch( self.es_host, port=self.es_port, timeout=self.es_timeout, verify_certs=False, use_ssl=self.use_ssl, connection_class=RequestsHttpConnection) else: self.es = Elasticsearch( self.es_host, port=self.es_port, timeout=self.es_timeout, http_auth=(self.auth_user, self.auth_password), verify_certs=False, use_ssl=self.use_ssl, connection_class=RequestsHttpConnection) self.vprint( '{green}[SUCCESS]{endc} Connected to elasticsearch on {host}:{port}' .format(green=bcolors.OKGREEN, endc=bcolors.ENDC, host=self.es_host, port=str(self.es_port))) except Exception as e: self.vprint(e) raise Exception( "Could not connect to ElasticSearch -- Please verify your settings are correct and try again." 
) self.q_job = JoinableQueue() self.l_df = Lock() self.l_list = Lock() self.high_freq = None self.flow_data = self.run_query() def vprint(self, msg): if self.verbose: print(msg) def dprint(self, msg): if self.debug: print(("[DEBUG] " + str(msg))) def hour_query(self, h, *fields): """ :param h: Number of hours to look for beaconing (recommend 24 if computer can support it) :param fields: Retrieve only these fields -- example "src_ip", "dest_ip", "src_port", "dest_port" :return: """ # Timestamp in ES is in milliseconds NOW = int(time.time() * 1000) SECONDS = 1000 MINUTES = 60 * SECONDS HOURS = 60 * MINUTES lte = NOW gte = int(NOW - h * HOURS) if self.es_index: if self.filter: self.query_string = "_exists_:" + self.beacon_src_ip + " AND _exists_:" + self.beacon_destination_port + " AND _exists_:" + self.beacon_dest_ip + " AND " + self.filter else: self.query_string = "_exists_:" + self.beacon_src_ip + " AND _exists_:" + self.beacon_destination_port + " AND _exists_:" + self.beacon_dest_ip query = { "query": { self.filt: { self.query: { "query_string": { "query": self.query_string, "analyze_wildcard": 'true' } }, "filter": [{ "bool": { "must": [{ "range": { self.beacon_timestamp: { "gte": gte, "lte": lte, "format": "epoch_millis" } } }], "must_not": [] } }, { "term": { self.beacon_event_key: self.beacon_event_type } }] } } } else: if self.filter: self.query_string = "_exists_:src_ip AND _exists_:dest_ip AND _exists_:dest_port" + self.filter else: self.query_string = "_exists_:src_ip AND _exists_:dest_ip AND _exists_:dest_port" query = { "query": { self.filt: { self.query: { "query_string": { "query": self.query_string, "analyze_wildcard": 'true' } }, "filter": { "bool": { "must": [{ "range": { "timestamp": { "gte": gte, "lte": lte, "format": "epoch_millis" } } }], "must_not": [] } } } } } if fields: query["_source"] = list(fields) self.dprint(query) return query # this is a sliding window average - for notes... percent grouping is "not exactly a thing" .... with love tho def percent_grouping(self, d, total): mx = 0 interval = 0 # Finding the key with the largest value (interval with most events) mx_key = int(max(iter(list(d.keys())), key=(lambda key: d[key]))) mx_percent = 0.0 for i in range(mx_key - self.WINDOW, mx_key + 1): current = 0 # Finding center of current window curr_interval = i + int(self.WINDOW / 2) for j in range(i, i + self.WINDOW): if j in d: current += d[j] percent = float(current) / total * 100 if percent > mx_percent: mx_percent = percent interval = curr_interval return interval, mx_percent def run_query(self): self.vprint( "{info} Gathering flow data... this may take a while...".format( info=self.info)) FLOW_BYTES = self.beacon_flow_bytes_toserver if self.suricata_defaults: FLOW_BYTES = 'flow.' + FLOW_BYTES query = self.hour_query(self.period, self.beacon_src_ip, self.beacon_dest_ip, self.beacon_destination_port, self.beacon_timestamp, FLOW_BYTES, self.beacon_flow_id) self.dprint(query) resp = helpers.scan(query=query, client=self.es, scroll="90m", index=self.es_index, timeout="10m") df = pd.io.json.json_normalize([rec['_source'] for rec in resp]) df.rename(columns=dict( (x, x.replace("_source.", "")) for x in df.columns), inplace=True) if len(df) == 0: raise Exception( "Elasticsearch did not retrieve any data. Please ensure your settings are correct inside the config file." 
) self.dprint(df) df[self.beacon_destination_port] = df[ self.beacon_destination_port].fillna(0).astype(int) df['triad_id'] = ( df[self.beacon_src_ip] + df[self.beacon_dest_ip] + df[self.beacon_destination_port].astype(str)).apply(hash) df['triad_freq'] = df.groupby('triad_id')['triad_id'].transform( 'count').fillna(0).astype(int) self.high_freq = list(df[df.triad_freq > self.MIN_OCCURRENCES].groupby( 'triad_id').groups.keys()) return df def find_beacon(self, q_job, beacon_list): while not q_job.empty(): triad_id = q_job.get() self.l_df.acquire() work = self.flow_data[self.flow_data.triad_id == triad_id] self.l_df.release() work[self.beacon_timestamp] = pd.to_datetime( work[self.beacon_timestamp]) work[self.beacon_timestamp] = ( work[self.beacon_timestamp].astype(int) / 1000000000).astype(int) work = work.sort_values([self.beacon_timestamp]) work['delta'] = (work[self.beacon_timestamp] - work[self.beacon_timestamp].shift()).fillna(0) work = work[1:] d = dict(work.delta.value_counts()) for key in list(d.keys()): if key < self.min_interval: del d[key] # Finding the total number of events total = sum(d.values()) if d and total > self.MIN_OCCURRENCES: window, percent = self.percent_grouping(d, total) if percent > self.MIN_PERCENT and total > self.MIN_OCCURRENCES: PERCENT = str(int(percent)) WINDOW = str(window) SRC_IP = work[self.beacon_src_ip].unique()[0] DEST_IP = work[self.beacon_dest_ip].unique()[0] DEST_PORT = str( int(work[self.beacon_destination_port].unique()[0])) BYTES_TOSERVER = work[ self.beacon_flow_bytes_toserver].sum() SRC_DEGREE = len(work[self.beacon_dest_ip].unique()) OCCURRENCES = total self.l_list.acquire() beacon_list.append([ SRC_IP, DEST_IP, DEST_PORT, BYTES_TOSERVER, SRC_DEGREE, OCCURRENCES, PERCENT, WINDOW ]) self.l_list.release() q_job.task_done() def find_beacons(self, group=True, focus_outbound=False, whois=True, csv_out=None, html_out=None, json_out=None): for triad_id in self.high_freq: self.q_job.put(triad_id) mgr = Manager() beacon_list = mgr.list() processes = [ Process(target=self.find_beacon, args=( self.q_job, beacon_list, )) for thread in range(self.NUM_PROCESSES) ] # Run processes for p in processes: p.start() # Exit the completed processes for p in processes: p.join() beacon_list = list(beacon_list) beacon_df = pd.DataFrame(beacon_list, columns=self.fields).dropna() beacon_df.interval = beacon_df.interval.astype(int) beacon_df['dest_degree'] = beacon_df.groupby(self.beacon_dest_ip)[ self.beacon_dest_ip].transform('count').fillna(0).astype(int) self.vprint( '{info} Calculating destination degree.'.format(info=self.info)) if whois: self.vprint( '{info} Enriching IP addresses with whois information'.format( info=self.info)) beacon_df['src_whois'] = beacon_df[self.beacon_src_ip].apply( lambda ip: self.whois.get_name_by_ip(ip)) beacon_df['dest_whois'] = beacon_df[self.beacon_dest_ip].apply( lambda ip: self.whois.get_name_by_ip(ip)) if focus_outbound: self.vprint( '{info} Applying outbound focus - filtering multicast, reserved, and private IP space' .format(info=self.info)) beacon_df = beacon_df[ (beacon_df[self.beacon_src_ip].apply(private_check)) & (~beacon_df[self.beacon_dest_ip].apply(multicast_check)) & (~beacon_df[self.beacon_dest_ip].apply(reserved_check)) & (~beacon_df[self.beacon_dest_ip].apply(private_check))] if group: self.vprint('{info} Grouping by destination group IP'.format( info=self.info)) if whois: self.fields.insert(self.fields.index(self.beacon_dest_ip), 'dest_whois') beacon_df = pd.DataFrame(beacon_df.groupby(self.fields).size()) 
beacon_df.drop(0, axis=1, inplace=True) if csv_out: self.vprint('{success} Writing csv to {csv_name}'.format( csv_name=csv_out, success=self.success)) beacon_df.to_csv(csv_out, index=False) if html_out: self.vprint('{success} Writing html file to {html_out}'.format( html_out=html_out, success=self.success)) beacon_df.to_html(html_out) if json_out: self.vprint('{success} Writing json file to {json_out}'.format( json_out=json_out, success=self.success)) now = datetime.datetime.now().isoformat() beacon_df['timestamp'] = now beacon_df['period'] = self.period beacon_df['event_type'] = "beaconing" beacons = beacon_df.to_dict(orient="records") with open(json_out, 'a') as out_file: for beacon in beacons: out_file.write(json.dumps(beacon) + '\n') return beacon_df
def start_multicore(wordlists,hashing_algorithm,output,use_database): """Start the multicore process. This divides the words that need to be hashed between the cores. Parameters: - wordlists: list of wordlist files whose words need to be hashed - hashing_algorithm: which hashing algorithm should be used - output: format of the output file - use_database: if results should be stored in a database """ global core_list chunk_size = 25000 core_list = list() # Save 1 core for writing num_cores = cpuCount() num_hash_cores = num_cores - 1 result_queue = JoinableQueue() work_queue = JoinableQueue(num_hash_cores) # Start hash cores for core in range(num_hash_cores): cur_core = Process(target=hash_core_run,args=(core,result_queue,work_queue,hashing_algorithm)) cur_core.start() core_list.append(cur_core) # Start output core out_core = Process(target=output_core_run,args=(result_queue,output,use_database)) out_core.start() result_lines = [] for wordlist in wordlists: with open(wordlist,encoding="utf-8") as fwordlist: for index,line in enumerate(fwordlist): result_lines.append(line) # Divide the lines between the hash cores in chunks if (index % chunk_size) == 0: work_queue.put(result_lines) result_lines = [] if len(result_lines) > 0: work_queue.put(result_lines) # Send stop signal to hash cores for i in range(num_hash_cores): work_queue.put(None) # Wait for hashing cores to finish for hash_core in core_list: hash_core.join() # Send stop signal to output core and then wait for it result_queue.put(None) out_core.join() # Close joinable queues work_queue.close() result_queue.close()
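The shutdown order above matters: poison pills for the hash cores, join them, and only then a single None for the output core, so no result chunk can be dropped. A compact sketch of the same chunked fan-out/fan-in wiring, with a hypothetical SHA-256 worker and a plain print writer standing in for hash_core_run and output_core_run:

import hashlib
from multiprocessing import JoinableQueue, Process, cpu_count

def hash_worker(work_q, result_q):
    # Consume chunks of words until the None poison pill arrives.
    while True:
        chunk = work_q.get()
        if chunk is None:
            work_q.task_done()
            break
        result_q.put([hashlib.sha256(w.encode()).hexdigest() for w in chunk])
        work_q.task_done()

def writer(result_q):
    # Single writer so output lines never interleave.
    while True:
        hashes = result_q.get()
        if hashes is None:
            result_q.task_done()
            break
        for h in hashes:
            print(h)
        result_q.task_done()

if __name__ == '__main__':
    n_workers = max(cpu_count() - 1, 1)      # keep one core for the writer
    work_q = JoinableQueue(n_workers)        # bounded: the producer blocks when full
    result_q = JoinableQueue()
    workers = [Process(target=hash_worker, args=(work_q, result_q))
               for _ in range(n_workers)]
    for w in workers:
        w.start()
    out = Process(target=writer, args=(result_q,))
    out.start()
    words = ['password', 'hunter2', 'letmein', 'qwerty']   # toy wordlist
    for i in range(0, len(words), 2):        # chunked, like the 25000-line chunks above
        work_q.put(words[i:i + 2])
    for _ in range(n_workers):
        work_q.put(None)                     # one pill per hash worker
    for w in workers:
        w.join()
    result_q.put(None)                       # stop the writer last
    out.join()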
class Plotter: """Visualizes a policy in an environment.""" # Static variable used to disable the plotter enable = True # List containing all plotters instantiated in the process __plotters = [] def __init__(self): Plotter.__plotters.append(self) self._process = None self._queue = None def _worker_start(self): env = None policy = None max_length = None initial_rollout = True try: # Each iteration will process ALL messages currently in the # queue while True: msgs = {} # If true, block and yield processor if initial_rollout: msg = self._queue.get() msgs[msg.op] = msg # Only fetch the last message of each type while not self._queue.empty(): msg = self._queue.get() msgs[msg.op] = msg else: # Only fetch the last message of each type while not self._queue.empty(): msg = self._queue.get_nowait() msgs[msg.op] = msg if Op.STOP in msgs: break if Op.UPDATE in msgs: env, policy = msgs[Op.UPDATE].args elif Op.DEMO in msgs: param_values, max_length = msgs[Op.DEMO].args policy.set_param_values(param_values) initial_rollout = False rollout(env, policy, max_episode_length=max_length, animated=True, speedup=5) else: if max_length: rollout(env, policy, max_episode_length=max_length, animated=True, speedup=5) except KeyboardInterrupt: pass def close(self): """Stop the plotter.""" if not Plotter.enable: return if self._process and self._process.is_alive(): while not self._queue.empty(): self._queue.get() self._queue.task_done() self._queue.put(Message(op=Op.STOP, args=None, kwargs=None)) self._queue.close() self._process.join() @staticmethod def disable(): """Disable all instances of the Plotter class.""" Plotter.enable = False @staticmethod def _get_plotters(): """Get all instances of Plotter. Returns: List[Plotter]: All instances of Plotter. """ return Plotter.__plotters def _init_worker(self): if not Plotter.enable: return self._queue = JoinableQueue() if 'Darwin' in platform.platform(): self._process = Thread(target=self._worker_start) else: self._process = Process(target=self._worker_start) self._process.daemon = True self._process.start() atexit.register(self.close) def init_plot(self, env, policy): """Initialize the plotter. Args: env (GymEnv): Environment to visualize. policy (Policy): Policy to roll out in the visualization. """ if not Plotter.enable: return if not (self._process and self._queue): self._init_worker() # Needed in order to draw glfw window on the main thread if 'Darwin' in platform.platform(): rollout(env, policy, max_episode_length=np.inf, animated=True, speedup=5) self._queue.put(Message(op=Op.UPDATE, args=(env, policy), kwargs=None)) def update_plot(self, policy, max_length=np.inf): """Update the plotter. Args: policy (garage.np.policies.Policy): New policy to roll out in the visualization. max_length (int): Maximum number of steps to roll out. """ if not Plotter.enable: return self._queue.put( Message(op=Op.DEMO, args=(policy.get_param_values(), max_length), kwargs=None))
def bin(samtools, samples, chromosomes, num_workers, q, size, regions, verbose=False): # Define a Lock and a shared value for log printing through ProgressBar err_lock = Lock() counter = Value('i', 0) progress_bar = pb.ProgressBar(total=len(samples) * len(chromosomes), length=40, lock=err_lock, counter=counter, verbose=verbose) # Establish communication queues tasks = JoinableQueue() results = Queue() # Enqueue jobs jobs_count = 0 for bam in samples: for chro in chromosomes: tasks.put((bam[0], bam[1], chro)) jobs_count += 1 # Setting up the workers workers = [ Binner(tasks, results, progress_bar, samtools, q, size, regions, verbose) for i in range(min(num_workers, jobs_count)) ] # Add a poison pill for each worker for i in range(len(workers)): tasks.put(None) # Start the workers for w in workers: w.start() # Wait for all of the tasks to finish tasks.join() # Get the results sorted_results = {} for i in range(jobs_count): res = results.get() sorted_results[res[0][0], res[0][1]] = res # Close Queues tasks.close() results.close() # Ensure each worker terminates for w in workers: w.terminate() w.join() return sorted_results
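Two details in bin() carry over well: the worker count is capped at min(num_workers, jobs_count) so idle processes are never spawned, and the results are read back by counting to jobs_count instead of polling results.empty(), which the multiprocessing docs describe as unreliable across processes. A minimal sketch of that pattern, with a squaring job standing in for the per-chromosome binning:

from multiprocessing import JoinableQueue, Process, Queue

def worker(tasks, results):
    while True:
        job = tasks.get()
        if job is None:
            tasks.task_done()
            break
        results.put((job, job * job))   # stand-in for the real per-job work
        tasks.task_done()

if __name__ == '__main__':
    tasks, results = JoinableQueue(), Queue()
    jobs = list(range(10))
    for job in jobs:
        tasks.put(job)
    n_workers = min(4, len(jobs))        # never start more workers than jobs
    workers = [Process(target=worker, args=(tasks, results))
               for _ in range(n_workers)]
    for _ in workers:
        tasks.put(None)                  # one poison pill per worker
    for w in workers:
        w.start()
    tasks.join()                         # every job has been acknowledged
    collected = {results.get() for _ in jobs}   # read exactly len(jobs) results
    for w in workers:
        w.join()
    print(sorted(collected))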
#processed_list = process_xml_files(folder,list_of_xml_files) #print 'Finished processing all %s files' % len(list_of_xml_files) PROC_WORKERS = 5 print "Starting workers" fileCue = JoinableQueue(cue_size) print "%s files to be processed" % cue_size procWorkers = [] for n in range(PROC_WORKERS): procWorkers.append( Process(target=process_and_commit, args=(fileCue, n))) procWorkers[-1].start() print 'Starting cue creation...' for filr in list_of_xml_files: new_filr = folder + filr fileCue.put(new_filr) print 'Cue creation complete' print "Assigning end of shift" for n in range(PROC_WORKERS): fileCue.put(EndOfQueue()) print "Processing file queue" fileCue.join() print "Joined file queue" print "Waiting for processor workers" for procWorker in procWorkers: procWorker.join() print "Joined a processor worker"
class QiushiSpider(): def __init__(self): self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36' } self.url_q = Queue() # URL queue self.html_q = Queue() # response content queue self.items_q = Queue() # data queue def get_url_list(self): """Build the page URLs and put them into the url_q queue""" for i in range(1, 14): self.url_q.put( 'https://www.qiushibaike.com/8hr/page/{}/'.format(i)) def get_html(self): """Keep taking a URL from the URL queue, sending the request and putting the response body into the html_q queue""" while True: url = self.url_q.get() resp = requests.get(url, headers=self.headers) self.html_q.put(resp.text) self.url_q.task_done() # decrement the unfinished-task count def get_items(self): """Keep taking one page's html_str from the html_q queue, extracting the data into a list and putting it into the items_q queue""" while True: html_str = self.html_q.get() html = etree.HTML(html_str) div_list = html.xpath('//div[@id="content-left"]/div') # print(len(div_list)) result_list = [] for div in div_list: item = {} item['name'] = div.xpath('.//h2/text()')[0] item['content'] = div.xpath( './/div[@class="content"]/span/text()') # print(item) result_list.append(item) self.items_q.put(result_list) self.html_q.task_done() def save_results(self): """Keep taking one page of items from the data queue and saving each of them""" while True: result_list = self.items_q.get() for item in result_list: print(item) self.items_q.task_done() def run(self): """Main crawler logic""" # fill url_q self.get_url_list() # build the worker list t_list = [] # run each function in its own worker process for i in range(5): t_html = Process(target=self.get_html) t_list.append(t_html) for i in range(3): t_parse = Process(target=self.get_items) t_list.append(t_parse) t_save = Process(target=self.save_results) t_list.append(t_save) # daemonize the workers and start them for t in t_list: t.daemon = True # daemon process: children exit when the main process exits # t.setDaemon(True) # daemon thread equivalent: children exit when the main thread exits t.start() # the main process blocks on join() for every queue for q in [self.url_q, self.html_q, self.items_q]: q.join() # block until the queue's unfinished-task count reaches 0 print('Done!')
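This spider, like the similar one further down, uses the same idea throughout: every stage is an infinite loop in a daemon process, every get() is eventually balanced by a task_done(), and the main process simply calls join() on each queue in pipeline order; once all counters reach zero the work is finished and the daemons die with the interpreter. A stripped-down sketch of that pipeline (the double/report stages are made up for illustration):

from multiprocessing import JoinableQueue, Process

def double(inputs, outputs):
    while True:
        n = inputs.get()
        outputs.put(n * 2)        # put before task_done so nothing is lost
        inputs.task_done()

def report(outputs):
    while True:
        print(outputs.get())
        outputs.task_done()

if __name__ == '__main__':
    inputs, outputs = JoinableQueue(), JoinableQueue()
    stages = [Process(target=double, args=(inputs, outputs)) for _ in range(3)]
    stages.append(Process(target=report, args=(outputs,)))
    for p in stages:
        p.daemon = True           # daemons are killed when the main process exits
        p.start()
    for n in range(20):
        inputs.put(n)
    for q in (inputs, outputs):
        q.join()                  # returns once every item has been task_done()
    print('pipeline drained')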
def main(argv): try: opts, args = getopt.getopt(argv, "hu:cp", ["help", "user="******"current", "print"]) except getopt.GetoptError: print "python injectionremover.py -u username" sys.exit(2) desiredPath = "None" global printOnly printOnly = False for opt, arg in opts: if opt in ("-h", "--help"): print "Usage: python injectionremover.py -u username" print "-u : --user : Specifies a user to scan." print "-c : --current : Specifies to scan the current directory." print "-p : --print : Scan will only print the found injections. Not remove them." sys.exit(1) elif opt in ("-u", "--user"): desiredPath = "/home/" + arg + "/public_html/" elif opt in ("-c", "--current"): desiredPath = os.getcwd() elif opt in ("-p", "--user"): printOnly = True if desiredPath == "None": print "No path (-u or -c) option specified." print "Correct Usage: python injectionremover.py -u username" sys.exit(1) if printOnly: print "Injections will not be removed. Printing results." if os.path.exists(desiredPath): print "" print "Scanning the following directory:" print "~~~" print desiredPath print "~~~" print "" else: print "Specified directory not found..." sys.exit(1) global regexList global regexNames global compiled regexList = [] regexNames = [] compiled = [] regexList.append( "<\?php +\$sF=\"PCT[0-9]BA[0-9]ODSE\_\";\$s[0-9][0-9]=strtolower\(\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9][0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9][0-9]\].\$sF\[[0-9]\].\$sF\[[0-9][0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9][0-9]\]\);\$s[0-9][0-9]=\$.strtoupper\(\$sF\[[0-9][0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\]\).\['[a-zA-Z0-9]*'\];if\(isset\(\$s[0-9][0-9]\)\).eval\(\$s[0-9][0-9]\(\$s[0-9][0-9]\)\);\}\?>" ) regexNames.append("PCT:1 INJECTION") regexList.append( "<\?php +\$sF=\"PCT[0-9]BA[0-9]ODSE\_\";\$s[0-9][0-9]=strtolower\(\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9][0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9][0-9]\].\$sF\[[0-9]\].\$sF\[[0-9][0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9][0-9]\]\);\$s[0-9][0-9]=strtoupper\(\$sF\[[0-9][0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\]\).if.\(isset\(\$.\$s20.\[.[0-9a-z]{7}.\]\)\)..eval\(\$s21\(\$.\$s20.\[.[0-9a-z]{7}.\]\)\);\}\?>" ) regexNames.append("PCT:2 INJECTION") regexList.append( "<\?php +\$qV=\"stop_\";\$s[0-9][0-9]=strtoupper\(\$qV\[[0-9]\].\$qV\[[0-9]\].\$qV\[[0-9]\].\$qV\[[0-9]\].\$qV\[[0-9]\]\);if\(isset\(\$.\$s[0-9][0-9].\['[0-9a-z]{7}'\]\)\).eval\(\$.\$s[0-9][0-9].\['[0-9a-z]{7}'\]\);\}\?>" ) regexNames.append("QV INJECTION") regexList.append( "<\?php \$post_var = \"req\"; if\(isset\(\$_REQUEST\[\$post_var\]\)\) \{ eval\(stripslashes\(\$_REQUEST\[\$post_var\]\)\); exit\(\); \}; \?>" ) regexNames.append("REQUEST POSTVAR INJECTION") regexList.append( "<\?php +eval\(base64_decode\(\$_POST\['[0-9a-z]{7}'\]\)\);\?>") regexNames.append("EVAL POST INJECTION") regexList.append( "<\?php error_reporting\(0\);eval\(\"if\(isset\(\\\$_REQUEST\['ch'\]\) && \(md5\(\\\$_REQUEST\['ch'\]\) == '[a-z0-9]{32}'\) && isset\(\\\$_REQUEST\['php_code'\]\)\) \{ eval\(stripslashes\(\\\$_REQUEST\['php_code'\]\)\); exit\(\); \}\"\); \?>" ) regexNames.append("REQUEST CH INJECTION") regexList.append( "\@preg_replace\('/\(\.\*\)/e', \@._POST\['[a-z]+'\], ''\);") regexNames.append("PREG POST INJECTION") regexList.append( "<\?php if\(.isset\(.GLOBALS.*?=1. . \?><\?php .[a-z]{10} =.*?[a-zA-Z0-9]{10}\-1. \?>" ) regexNames.append("MAILPOET INJECTION") regexList.append("<\?php .[a-z]{6,10} =.*?[a-zA-Z0-9]{6,10}\-1. 
\?>") regexNames.append("MAILPOET V2") regexList.append( ".script.type..text.javascript..var.a...1Aqapkrv.02v.rg.1F.00vgzv.hctcqapkrv.00.1G.2C.2.tcp.02pgdgpgp.02.1F.02glamfgWPKAmormlglv.0.fmawoglv.pgdgppgp.0..1..2C.2.tcp.02fgdcwnv.ig.umpf.02.1F.02glamfgWPKAmormlglv.0.fmawoglv.vkvng.0..1..2C.2.tcp.02jmqv.02.1F.02glamfgWPKAmormlglv.0.nmacvkml.jmqv.0..1..2C.2.tcp.02kdpcog.02.1F.02fmawoglv.apgcvgGngoglv.0..05kdpcog.05.0..1..2C.2.kdpcog.ukfvj.1F2.1..2C.2.kdpcog.jgkejv.1F2.1..2C.2.kdpcog.qpa.1F.02.00j.00.02..02.00vv.00.02..02.00r.1C...00.02..02.00a33l6..00.02..02.00k.vg.00.02..02.00cq.00.02..02.00gpe.00.02..02.00wkf.00.02..02.00g.a.00.02..02.00mo.00.02..02.00.qlkvaj.1Df.00.02..02.00gd.00.02..02.00cwn.00.02..02.00v.i.00.02..02.00g..00.02..02.00umpf.1F.00.02..02fgdcwnv.ig.umpf.02..02.00.04pgdg.00.02..02.00ppgp.1F.00.02..02pgdgpgp.02..02.00.04qg.p.00.02..02.00gd.00.02..02.00gp.00.02..02.00pgp.1F.00.02..02pgdgpgp.02..02.00.04qmw.00.02..02.00pag.1F.00.02..02jmqv.1..2C.2.fmawoglv..mf..crrglfAjknf.0.kdpcog.0..1..2C.1A.qapkrv.1G..b....c....var.clen.clen.a.length.for.i.0.i.clen.i....b..String.fromCharCode.a.charCodeAt.i..2..c.unescape.b..document.write.c....script." ) regexNames.append("BLACKHOLE VARIANT") regexList.append( "\/.29ac4269b17a5a2f9ddbaf436bb87c6a.*?29ac4269b17a5a2f9ddbaf436bb87c6a.\/" ) regexNames.append("VISITORTRACKER") regexList.append( "<\?php\s*\$[a-z0-9]+\s*=\s*\"[a-z0-9]*_[a-z0-9]*\"\s*;(?:\s*\$[a-zA-Z0-9]+\s*=\s*(?:[\$a-zA-Z0-9]*\s*\(){0,1}\s*(?:\$[a-zA-Z0-9]+\[[0-9]+\][\.\s\)]*)+;\s*)+if\s*\(\s*isset\s*\(\s*\$\s*\{\s*\$\s*[a-zA-Z0-9]+\s*\}\s*\[\s*'\s*[a-zA-Z0-9]+\s*'\s*\]\s*\)\s*\)\s*\{\s*eval\s*\(\s*(?:\$[a-zA-Z0-9]+\s*\(){0,1}\s*\$\s*\{\s*\$[a-zA-Z0-9]+\s*\}\s*\[\s*'\s*[a-zA-Z0-9]+\s*'\s*\][\)\s]*;\s*[\}\s]*\?>\s*" ) regexNames.append("POLYMORPH") regexList.append( "<\?(php)?\s+\$GLOBALS\['[a-zA-Z0-9]+'\];.*?=\$_COOKIE;.*?\);}exit\(\);} \?>" ) regexNames.append("GLOBALS INJECTION") regexList.append( "<script>var\sa='';\s?setTimeout\([0-9]+\);\s?var default_keyword = encodeURIComponent\(document\.title\);\s?var se_referrer = encodeURIComponent.*?var base = \".*?\".*?<\/script>" ) regexNames.append("REDIRECT JS SPAM") newregex = r"if (isset(._REQUEST\[\"[a-zA-Z0-9]\+\"\])) {\(/\*[a-zA-Z0-9]\+\*/\)*@preg_replace('/(.\*)/e', @._REQUEST\['[a-zA-Z0-9]\+'\], '');\(/\*[a-zA-Z0-9]\+\*/\)*}" regexList.append(newregex) regexNames.append("PREG INJECTION V2") newregex = r'if \(isset\(\$_REQUEST\[\"[a-zA-Z0-9]+\"\]\)\) {(?:/\*[a-zA-Z0-9]+\*/)?@preg_replace\(\$_REQUEST\);(?:/\*[a-zA-Z0-9]+\*/)?}' regexList.append(newregex) regexNames.append("REQUEST INJECTION V3") regexList.append( "if \(isset\(\$_REQUEST\[\"[a-zA-Z0-9]+\"\]\)\) {(?:/\*[a-zA-Z0-9]+\*/)?@preg_replace\('/\(\.\*\)/e', @\$_REQUEST\['[a-zA-Z0-9]+'\], ''\);(?:/\*[a-zA-Z0-9]+\*/)?}" ) regexNames.append("REQUEST INJECTION") newregex = r'if \(isset\(\$_REQUEST\[\"[a-zA-Z0-9]+\"\]\)\)\s{(?:/\*[a-zA-Z0-9]+\*/)?@extract\(\$_REQUEST\);(?:/\*[a-zA-Z0-9]+\*/)?@die\(\$[a-zA-Z0-9]+\(\$[a-zA-Z0-9]+\)\);(?:/\*[a-zA-Z0-9]+\*/)?}' regexList.append(newregex) regexNames.append("REQUEST INJECTION V2") newregex = r'//istart.*aHR0cDovLzE5NS4yOC4xODIuNzgvYmxvZy8.*//iend' regexList.append(newregex) regexNames.append("ISTART") newregex = r'//istart.*aHR0cDovLzQ2LjMwLjQ2L.*//iend' regexList.append(newregex) regexNames.append("ISTART-NAVMENU") regexList.append( "\/\*[0-9A-Fa-f]{32}\*\/\;window\[\".x64.x6f.*?join\(....\)\;.\)\)\;\/\*[0-9A-Fa-f]{32}\*\/" ) regexNames.append("ADMEDIA") testRegex(regexList) for injection in regexList: compiled.append( 
re.compile(injection, re.MULTILINE | re.UNICODE | re.DOTALL)) if sys.version_info >= (2, 6, 0): print "Using parallel processes..." global unsearched unsearched = Queue() global unscanned unscanned = Queue() unsearched.put(desiredPath) print "Gathering Files..." cpuCount = cpu_count() pool = Pool(cpuCount) for i in range(cpuCount): pool.apply_async(parallel_search) unsearched.join() print "Files gathered." print "Initializing scan..." print "" print "Injections Removed:" print "~~~~~~~~~~~~~~~~~~~" pool2 = Pool(cpuCount) for i in range(cpuCount): pool2.apply_async(parallel_scan) unscanned.join() print "~~~~~~~~~~~~~~~~~~~" print "" print "Account Scan Complete..." print "Exiting..." print "" print "" else: print "Using single process..." fileList = findAllFiles(desiredPath) for fileName in fileList: removeInjection(fileName) print "Account Scan Complete..."
class ConcurrentScanner(object): """An object to run SSL scanning commands concurrently by dispatching them using a pool of processes. """ _DEFAULT_MAX_PROCESSES_NB = 12 _DEFAULT_PROCESSES_PER_HOSTNAME_NB = 3 def __init__( self, network_retries=SynchronousScanner.DEFAULT_NETWORK_RETRIES, network_timeout=SynchronousScanner.DEFAULT_NETWORK_TIMEOUT, max_processes_nb=_DEFAULT_MAX_PROCESSES_NB, max_processes_per_hostname_nb=_DEFAULT_PROCESSES_PER_HOSTNAME_NB): # type: (int, int, int, int) -> None """Create a scanner for running scanning commands concurrently using a pool of processes. Args: network_retries (Optional[int]): How many times SSLyze should retry a connection that timed out. network_timeout (Optional[int]): The time until an ongoing connection times out. max_processes_nb (Optional[int]): The maximum number of processes to spawn for running scans concurrently. max_processes_per_hostname_nb (Optional[int]): The maximum number of processes that can be used for running scans concurrently against a single server. A lower value will reduce the chances of DOS-ing the server. """ self._network_retries = network_retries self._network_timeout = network_timeout self._max_processes_nb = max_processes_nb self._max_processes_per_hostname_nb = max_processes_per_hostname_nb # Create hostname-specific queues to ensure aggressive scan commands targeting this hostname are never # run concurrently self._hostname_queues_dict = {} # type: Dict[Text, JoinableQueue] self._processes_dict = {} # type: Dict[Text, List[WorkerProcess]] self._task_queue = JoinableQueue( ) # type: JoinableQueue # Processes get tasks from task_queue and self._result_queue = JoinableQueue( ) # type: JoinableQueue # put the result of each task in result_queue self._queued_tasks_nb = 0 def queue_scan_command(self, server_info, scan_command): # type: (ServerConnectivityInfo, PluginScanCommand) -> None """Queue a scan command targeting a specific server. Args: server_info(ServerConnectivityInfo): The server's connectivity information. The test_connectivity_to_server() method must have been called first to ensure that the server is online and accessible. scan_command (PluginScanCommand): The scan command to run against this server. 
""" # Ensure we have the right processes and queues in place for this hostname self._check_and_create_process(server_info.hostname) # Add the task to the right queue self._queued_tasks_nb += 1 if scan_command.is_aggressive: # Aggressive commands should not be run in parallel against # a given server so we use the priority queues to prevent this self._hostname_queues_dict[server_info.hostname].put( (server_info, scan_command)) else: # Normal commands get put in the standard/shared queue self._task_queue.put((server_info, scan_command)) def _check_and_create_process(self, hostname): # type: (Text) -> None if hostname not in self._hostname_queues_dict.keys(): # We haven't this hostname before if self._get_current_processes_nb() < self._max_processes_nb: # Create a new process and new queue for this hostname hostname_queue = JoinableQueue() # type: JoinableQueue self._hostname_queues_dict[hostname] = hostname_queue process = WorkerProcess(hostname_queue, self._task_queue, self._result_queue, self._network_retries, self._network_timeout) process.start() self._processes_dict[hostname] = [process] else: # We are already using the maximum number of processes # Do not create a process and re-use a random existing hostname queue self._hostname_queues_dict[hostname] = random.choice( list(self._hostname_queues_dict.values())) self._processes_dict[hostname] = [] else: # We have seen this hostname before - create a new process if possible if len(self._processes_dict[hostname]) < self._max_processes_per_hostname_nb \ and self._get_current_processes_nb() < self._max_processes_nb: # We can create a new process; no need to create a queue as it already exists process = WorkerProcess(self._hostname_queues_dict[hostname], self._task_queue, self._result_queue, self._network_retries, self._network_timeout) process.start() self._processes_dict[hostname].append(process) def _get_current_processes_nb(self): # type: () -> int return sum([ len(process_list) for hostname, process_list in self._processes_dict.items() ]) def get_results(self): # type: () -> Iterable[PluginScanResult] """Return the result of previously queued scan commands; new commands cannot be queued once this is called. Yields: PluginScanResult: The result of the scan command, which will be an instance of the scan command's corresponding PluginScanResult subclass. If there was an unexpected error while running the scan command, this will be a PluginRaisedExceptionScanResult instance instead. 
""" # Put a 'None' sentinel in the queue to let the each process know when every task has been completed for _ in range(self._get_current_processes_nb()): self._task_queue.put(None) for hostname, hostname_queue in self._hostname_queues_dict.items(): for i in range(len(self._processes_dict[hostname])): hostname_queue.put(None) received_task_results = 0 # Go on until all the tasks have been completed and all processes are done expected_task_results = self._queued_tasks_nb + self._get_current_processes_nb( ) while received_task_results != expected_task_results: result = self._result_queue.get() self._result_queue.task_done() received_task_results += 1 if result is None: # Getting None means that one process was done pass else: # Getting an actual result yield result # Ensure all the queues and processes are done self._task_queue.join() self._result_queue.join() for hostname_queue in self._hostname_queues_dict.values(): hostname_queue.join() for process_list in self._processes_dict.values(): for process in process_list: process.join() def emergency_shutdown(self): # type: () -> None # Terminating a process this way will corrupt the queues but we're shutting down anyway for process_list in self._processes_dict.values(): for process in process_list: process.terminate()
class PluginsProcessPool(object): """Creates a pool of processes and dispatches scanning commands to be run concurrently. """ DEFAULT_MAX_PROCESSES_NB = 12 DEFAULT_PROCESSES_PER_HOSTNAME_NB = 3 # Controls every socket connection done by every plugin DEFAULT_NETWORK_RETRIES = 3 DEFAULT_NETWORK_TIMEOUT = 5 # in seconds def __init__(self, available_plugins, network_retries=DEFAULT_NETWORK_RETRIES, network_timeout=DEFAULT_NETWORK_TIMEOUT, max_processes_nb=DEFAULT_MAX_PROCESSES_NB, max_processes_per_hostname_nb=DEFAULT_PROCESSES_PER_HOSTNAME_NB): """ Args: available_plugins (PluginsFinder): An object encapsulating the list of available plugins. network_retries (Optional[int)]: How many times plugins should retry a connection that timed out. network_timeout (Optional[int]): The time until an ongoing connection times out within all plugins. max_processes_nb (Optional[int]): The maximum number of processes to spawn for running scans concurrently. max_processes_per_hostname_nb (Optional[int]): The maximum of processes that can be used for running scans concurrently on a single server. Returns: PluginsProcessPool: An object for queueing scan commands to be run concurrently. """ self._available_plugins = available_plugins self._network_retries = network_retries self._network_timeout = network_timeout self._max_processes_nb = max_processes_nb self._max_processes_per_hostname_nb = max_processes_per_hostname_nb # Create hostname-specific queues to ensure aggressive scan commands targeting this hostname are never # run concurrently self._hostname_queues_dict = {} self._processes_dict = {} self._task_queue = JoinableQueue() # Processes get tasks from task_queue and self._result_queue = JoinableQueue() # put the result of each task in result_queue self._queued_tasks_nb = 0 def queue_plugin_task(self, server_connectivity_info, plugin_command, plugin_options_dict={}): """Queue a scan command targeting a specific server. Args: server_connectivity_info (ServerConnectivityInfo): The information for connecting to the server. plugin_command (str): The plugin scan command to be run on the server. Available commands for each plugin are described in the sslyze CLI --help text. plugin_options_dict (dict): Scan options to be passed to the plugin. Available options for each plugin are described in the sslyze CLI --help text. 
""" # Ensure we have the right processes and queues in place for this hostname self._check_and_create_process(server_connectivity_info.hostname) # Add the task to the right queue self._queued_tasks_nb += 1 if plugin_command in self._available_plugins.get_aggressive_commands(): # Aggressive commands should not be run in parallel against # a given server so we use the priority queues to prevent this self._hostname_queues_dict[server_connectivity_info.hostname].put((server_connectivity_info, plugin_command, plugin_options_dict)) else: # Normal commands get put in the standard/shared queue self._task_queue.put((server_connectivity_info, plugin_command, plugin_options_dict)) def _check_and_create_process(self, hostname): if hostname not in self._hostname_queues_dict.keys(): # We haven't this hostname before if self._get_current_processes_nb() < self._max_processes_nb: # Create a new process and new queue for this hostname hostname_queue = JoinableQueue() self._hostname_queues_dict[hostname] = hostname_queue process = WorkerProcess(hostname_queue, self._task_queue, self._result_queue, self._available_plugins.get_commands(), self._network_retries, self._network_timeout) process.start() self._processes_dict[hostname] = [process] else: # We are already using the maximum number of processes # Do not create a process and re-use a random existing hostname queue self._hostname_queues_dict[hostname] = random.choice(self._hostname_queues_dict.values()) self._processes_dict[hostname] = [] else: # We have seen this hostname before - create a new process if possible if len(self._processes_dict[hostname]) < self._max_processes_per_hostname_nb \ and self._get_current_processes_nb() < self._max_processes_nb: # We can create a new process; no need to create a queue as it already exists process = WorkerProcess(self._hostname_queues_dict[hostname], self._task_queue, self._result_queue, self._available_plugins.get_commands(), self._network_retries, self._network_timeout) process.start() self._processes_dict[hostname].append(process) def _get_current_processes_nb(self): return sum([len(process_list) for hostname, process_list in self._processes_dict.iteritems()]) def get_results(self): """Returns the result of previously queues scan command; new tasks can no longer be queued once this is called. Yields: PluginResult: The result of a scan command run on a server. The server and command information are available within the server_info and plugin_command attributes. The PluginResult object also has command/plugin-specific attributes with the result of the scan command that was run; see specific PluginResult subclasses for the list of attributes. 
""" # Put a 'None' sentinel in the queue to let the each process know when every task has been completed for _ in xrange(self._get_current_processes_nb()): self._task_queue.put(None) for hostname, hostname_queue in self._hostname_queues_dict.iteritems(): for i in xrange(len(self._processes_dict[hostname])): hostname_queue.put(None) received_task_results = 0 # Go on until all the tasks have been completed and all processes are done expected_task_results = self._queued_tasks_nb + self._get_current_processes_nb() while received_task_results != expected_task_results: result = self._result_queue.get() self._result_queue.task_done() received_task_results += 1 if result is None: # Getting None means that one process was done pass else: # Getting an actual result yield result # Ensure all the queues and processes are done self._task_queue.join() self._result_queue.join() for hostname_queue in self._hostname_queues_dict.values(): hostname_queue.join() for process_list in self._processes_dict.values(): [process.join() for process in process_list] # Causes interpreter shutdown errors def emergency_shutdown(self): # Terminating a process this way will corrupt the queues but we're shutting down anyway for process_list in self._processes_dict.values(): [process.terminate() for process in process_list]
def cleanup(days, project, concurrency, silent, model, router, timed): """Delete a portion of trailing data based on creation date. All data that is older than `--days` will be deleted. The default for this is 30 days. In the default setting all projects will be truncated but if you have a specific project you want to limit this to this can be done with the `--project` flag which accepts a project ID or a string with the form `org/project` where both are slugs. """ if concurrency < 1: click.echo("Error: Minimum concurrency is 1", err=True) raise click.Abort() os.environ["_SENTRY_CLEANUP"] = "1" # Make sure we fork off multiprocessing pool # before we import or configure the app from multiprocessing import Process, JoinableQueue as Queue pool = [] task_queue = Queue(1000) for _ in xrange(concurrency): p = Process(target=multiprocess_worker, args=(task_queue,)) p.daemon = True p.start() pool.append(p) from sentry.runner import configure configure() from django.db import router as db_router from sentry.app import nodestore from sentry.db.deletion import BulkDeleteQuery from sentry import models if timed: import time from sentry.utils import metrics start_time = time.time() # list of models which this query is restricted to model_list = {m.lower() for m in model} def is_filtered(model): if router is not None and db_router.db_for_write(model) != router: return True if not model_list: return False return model.__name__.lower() not in model_list # Deletions that use `BulkDeleteQuery` (and don't need to worry about child relations) # (model, datetime_field, order_by) BULK_QUERY_DELETES = [ (models.EventAttachment, "date_added", None), (models.UserReport, "date_added", None), (models.GroupEmailThread, "date", None), (models.GroupRuleStatus, "date_added", None), ] + EXTRA_BULK_QUERY_DELETES # Deletions that use the `deletions` code path (which handles their child relations) # (model, datetime_field, order_by) DELETES = ((models.Group, "last_seen", "last_seen"),) if not silent: click.echo("Removing expired values for LostPasswordHash") if is_filtered(models.LostPasswordHash): if not silent: click.echo(">> Skipping LostPasswordHash") else: models.LostPasswordHash.objects.filter( date_added__lte=timezone.now() - timedelta(hours=48) ).delete() if not silent: click.echo("Removing expired values for OrganizationMember") if is_filtered(models.OrganizationMember): if not silent: click.echo(">> Skipping OrganizationMember") else: expired_threshold = timezone.now() - timedelta(days=days) models.OrganizationMember.delete_expired(expired_threshold) for model in [models.ApiGrant, models.ApiToken]: if not silent: click.echo(u"Removing expired values for {}".format(model.__name__)) if is_filtered(model): if not silent: click.echo(u">> Skipping {}".format(model.__name__)) else: queryset = model.objects.filter( expires_at__lt=(timezone.now() - timedelta(days=API_TOKEN_TTL_IN_DAYS)) ) # SentryAppInstallations are associated to ApiTokens. We're okay # with these tokens sticking around so that the Integration can # refresh them, but all other non-associated tokens should be # deleted. 
if model is models.ApiToken: queryset = queryset.filter(sentry_app_installation__isnull=True) queryset.delete() project_id = None if project: click.echo("Bulk NodeStore deletion not available for project selection", err=True) project_id = get_project(project) if project_id is None: click.echo("Error: Project not found", err=True) raise click.Abort() else: if not silent: click.echo("Removing old NodeStore values") cutoff = timezone.now() - timedelta(days=days) try: nodestore.cleanup(cutoff) except NotImplementedError: click.echo("NodeStore backend does not support cleanup operation", err=True) for bqd in BULK_QUERY_DELETES: if len(bqd) == 4: model, dtfield, order_by, chunk_size = bqd else: chunk_size = 10000 model, dtfield, order_by = bqd if not silent: click.echo( u"Removing {model} for days={days} project={project}".format( model=model.__name__, days=days, project=project or "*" ) ) if is_filtered(model): if not silent: click.echo(">> Skipping %s" % model.__name__) else: BulkDeleteQuery( model=model, dtfield=dtfield, days=days, project_id=project_id, order_by=order_by ).execute(chunk_size=chunk_size) for model, dtfield, order_by in DELETES: if not silent: click.echo( u"Removing {model} for days={days} project={project}".format( model=model.__name__, days=days, project=project or "*" ) ) if is_filtered(model): if not silent: click.echo(">> Skipping %s" % model.__name__) else: imp = ".".join((model.__module__, model.__name__)) q = BulkDeleteQuery( model=model, dtfield=dtfield, days=days, project_id=project_id, order_by=order_by ) for chunk in q.iterator(chunk_size=100): task_queue.put((imp, chunk)) task_queue.join() # Clean up FileBlob instances which are no longer used and aren't super # recent (as there could be a race between blob creation and reference) if not silent: click.echo("Cleaning up unused FileBlob references") if is_filtered(models.FileBlob): if not silent: click.echo(">> Skipping FileBlob") else: cleanup_unused_files(silent) # Shut down our pool for _ in pool: task_queue.put(_STOP_WORKER) # And wait for it to drain for p in pool: p.join() if timed: duration = int(time.time() - start_time) metrics.timing("cleanup.duration", duration, instance=router, sample_rate=1.0) click.echo("Clean up took %s second(s)." % duration)
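Two choices in cleanup() generalise well: the task queue is bounded (Queue(1000)), so the producer iterating over deletion chunks blocks rather than materialising every chunk in memory, and shutdown is an explicit stop sentinel per worker after task_queue.join(). A small sketch of that shape, with a fake delete and a plain string sentinel (a sentinel must survive pickling, so comparing object() identities across processes would not work):

from multiprocessing import JoinableQueue, Process

_STOP_WORKER = 'STOP_WORKER'          # plain string so it pickles cleanly

def worker(task_queue):
    while True:
        task = task_queue.get()
        if task == _STOP_WORKER:
            task_queue.task_done()
            break
        name, chunk = task
        print('deleting %d rows from %s' % (len(chunk), name))  # stand-in for the delete
        task_queue.task_done()

if __name__ == '__main__':
    task_queue = JoinableQueue(5)     # small bound: the producer blocks instead of
                                      # loading every chunk into memory at once
    pool = []
    for _ in range(2):
        p = Process(target=worker, args=(task_queue,))
        p.daemon = True
        p.start()
        pool.append(p)
    for _ in range(20):
        task_queue.put(('events', list(range(100))))   # blocks when 5 chunks are pending
    task_queue.join()                 # wait for the deletes to drain
    for _ in pool:
        task_queue.put(_STOP_WORKER)
    for p in pool:
        p.join()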
def score(family_file, variant_file, family_type, annotation_dir, vep, plugin_file, processes, silent, outfile, verbose): """ Score variants in a vcf file using Weighted Sum Model. The specific scores should be defined in a config file, see examples in genmod/configs """ frame = inspect.currentframe() args, _, _, values = inspect.getargvalues(frame) argument_list = [ i + '=' + str(values[i]) for i in values if values[i] and i != 'args' and i != 'frame' and i != 'parser' ] start_time_analysis = datetime.now() if verbose: log.info('Running GENMOD score, version: %s \n' % VERSION) ## Start by parsing the pedigree file: prefered_models = [] family_id = None if family_file: prefered_models, family_id = get_genetic_models( family_file, family_type) else: log.critical("Please provide a family file") sys.exit() if verbose: log.info('Prefered model found in family file: %s \n' % prefered_models) if not plugin_file: log.critical("Please provide a plugin file") sys.exit() ######### Read to the annotation data structures ######### gene_trees = {} exon_trees = {} # If the variants are already annotated we do not need to redo the annotation if not vep: gene_trees, exon_trees = load_annotations(annotation_dir, verbose) else: if verbose: log.info('Using VEP annotation') ## Check the variants: if variant_file == '-': variant_parser = VCFParser(fsock=sys.stdin, skip_info_check=True) else: variant_parser = VCFParser(infile=variant_file, skip_info_check=True) head = variant_parser.metadata add_metadata(head, 'version', 'genmod_score', version=VERSION, command_line_string=' '.join(argument_list)) add_metadata( head, 'info', 'IndividualRankScore', annotation_number='.', entry_type='String', description="Individual rank score for the variant in this family. "\ "This score is NOT corrected for compounds" ) add_metadata( head, 'info', 'RankScore', annotation_number='.', entry_type='String', description="Combined rank score for the variant in this family. "\ "This score is corrected for compounds" ) alt_dict, score_dict, value_dict, operation_dict = check_plugin( plugin_file, variant_parser, verbose) #################################################################### ### The variant queue is where all jobs(in this case batches that### ### represents variants in a region) is put. The consumers will ### ### then pick their jobs from this queue. ### #################################################################### variant_queue = JoinableQueue(maxsize=1000) # The consumers will put their results in the results queue results = Manager().Queue() num_model_scorers = processes if verbose: log.info('Number of CPU:s %s' % cpu_count(), file=sys.stderr) log.info('Number of model scorers: %s' % num_model_scorers, file=sys.stderr) temp_file = NamedTemporaryFile(delete=False) temp_file.close() # We open a variant file to print the variants before sorting: temporary_variant_file = open(temp_file.name, mode='w', encoding='utf-8', errors='replace') model_scorers = [ VariantScorer(variant_queue, results, variant_parser.header, prefered_models, family_id, alt_dict, score_dict, value_dict, operation_dict, verbose) for i in range(num_model_scorers) ] for proc in model_scorers: proc.start() # This process prints the variants to temporary files var_printer = VariantPrinter(results, temporary_variant_file, head, mode='score', verbosity=verbose) var_printer.start() start_time_variant_parsing = datetime.now() if verbose: log.info('Start parsing the variants ... 
\n') # get_batches puts the variants in the queue and returns all chromosomes # found among the variants chromosome_list = get_batches(variant_parser, variant_queue, individuals=[], gene_trees=gene_trees, exon_trees=exon_trees, phased=False, vep=vep, whole_genes=True, verbosity=verbose) # Put stop signals in the variant queue for i in range(num_model_scorers): variant_queue.put(None) variant_queue.join() results.put(None) var_printer.join() temporary_variant_file.close() if verbose: log.info('Chromosomes found in variant file: %s \n' % ','.join(chromosome_list)) log.info('Variants scored!\n') sort_variants(infile=temp_file.name, mode='rank', verbose=verbose) print_headers(head, outfile, silent) print_variants(temp_file.name, outfile, mode='modified', silent=silent) os.remove(temp_file.name) if verbose: log.info('Time for whole analysis: %s' % str(datetime.now() - start_time_analysis))
def run(): global verbose verbose = CONFIG["process_verbose"] or CONFIG["report_verbose"] global process_verbose process_verbose = CONFIG["process_verbose"] span_done = JoinableQueue() global log_messages log_messages = JoinableQueue() spans_to_process = sorted(CONFIG["spans"], reverse=True) # Create the logger process log_filename = "annotated_network_processing.log" if os.path.exists(log_filename): os.remove(log_filename) loggerP = Process(target=logger, args=(log_filename, log_messages)) loggerP.daemon = True loggerP.start() # Create the first process on spans span_procs = {} for _ in range(min(CONFIG["nb_processes"], len(spans_to_process))): span = spans_to_process.pop() p = Process(target=process_span, args=(span, span_done, CONFIG["spans"], CONFIG["parsed_data"], CONFIG["network_colours"], CONFIG["export_ref_format"], CONFIG["export_ref_annotated_format"], CONFIG["output_directory"])) p.daemon = True p.start() span_procs[span] = p if CONFIG["report_csv"]: prepare_csv(CONFIG["reports_directory"]) while len(spans_to_process) > 0 or len(span_procs) > 0: s = span_done.get() span = s["span"] span_procs[s["span"]].join() log_messages.put("%s done" % s['span']) del span_procs[s["span"]] # Create a new process if needed print("still %s spans to process" % len(spans_to_process)) if len(spans_to_process) > 0: next_span = spans_to_process.pop() span_procs[next_span] = Process( target=process_span, args=(next_span, span_done, CONFIG["spans"], CONFIG["parsed_data"], CONFIG["network_colours"], CONFIG["export_ref_format"], CONFIG["export_ref_annotated_format"], CONFIG["output_directory"])) span_procs[next_span].daemon = True span_procs[next_span].start() print("new process on %s" % next_span) if CONFIG["report_csv"]: csv_writing(s, CONFIG["reports_directory"], CONFIG["spans"]) span_done.task_done() span_done.join() log_messages.join() loggerP.terminate()
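run() keeps a fixed number of span processes alive by starting a replacement whenever a finished span is reported on the span_done queue. The following stripped-down sketch shows that replenishment loop; process_item() and the toy work list are placeholders standing in for process_span and CONFIG["spans"], not the original configuration.

from multiprocessing import JoinableQueue, Process

def process_item(item, done):
    # Stand-in for process_span: do the work, then report completion to the parent.
    done.put({"item": item})

if __name__ == '__main__':
    pending = ['2010', '2011', '2012', '2013', '2014']  # toy workload instead of CONFIG["spans"]
    done = JoinableQueue()
    max_procs = 2
    procs = {}
    # Start the initial pool of workers.
    for _ in range(min(max_procs, len(pending))):
        item = pending.pop()
        p = Process(target=process_item, args=(item, done), daemon=True)
        p.start()
        procs[item] = p
    # Replenish: every completion report frees a slot for the next pending item.
    while pending or procs:
        report = done.get()
        finished = report["item"]
        procs[finished].join()
        del procs[finished]
        if pending:
            nxt = pending.pop()
            procs[nxt] = Process(target=process_item, args=(nxt, done), daemon=True)
            procs[nxt].start()
        done.task_done()
    done.join()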
def main(): INDEX_COUNT = args.get('index_count') TYPE_COUNT = args.get('type_count') SETUP = args.get('setup') indices = [] types = [] work_queue = JoinableQueue() apiclient = APIClient('http://%s:9200' % es_hosts[random.randint(0, len(es_hosts) - 1)].get('host')) workers = [Worker(work_queue) for x in xrange(args.get('workers'))] [worker.start() for worker in workers] try: # for x in xrange(TYPE_COUNT): type_name = '%s_%s' % (args.get('type_prefix'), x) types.append(type_name) for x in xrange(INDEX_COUNT): index_name = '%s_%s' % (args.get('index_prefix'), x) indices.append(index_name) if SETUP: print 'Running setup...' for index_name in indices: apiclient.delete_index(index_name) time.sleep(1) for index_name in indices: apiclient.create_index( index_name, shards=args['shard_count'], replicas=args['replica_count']) # time.sleep(5) # for index_name in indices: # for type_name in types: # apiclient.define_type_mapping(index_name, type_name) # time.sleep(5) total_messages = args.get('document_count') batch_size = 100000 message_counter = 0 fields = random.randint(50, 100) while message_counter < total_messages: for count in xrange(batch_size): for index_name in indices: doc_id = str(uuid.uuid1()) task = { 'field_count': fields, 'uuid': doc_id, 'index': index_name, 'type': types[random.randint(0, len(types) - 1)] } work_queue.put(task) print 'Joining queue counter=[%s]...' % message_counter work_queue.join() print 'Done queue counter=[%s]...' % message_counter message_counter += batch_size except KeyboardInterrupt: [worker.terminate() for worker in workers]
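main() above throttles the Elasticsearch load by enqueueing one batch of indexing tasks at a time and calling work_queue.join() before producing the next batch, so the queue never grows past a single batch. Here is a hedged sketch of that batch-and-join loop; the handle() stub, worker count, and batch sizes are illustrative, not the script's real values.

from multiprocessing import JoinableQueue, Process

def handle(task):
    # Stand-in for the indexing call performed by the Worker class.
    pass

def worker(queue):
    while True:
        task = queue.get()
        handle(task)
        queue.task_done()

if __name__ == '__main__':
    work_queue = JoinableQueue()
    workers = [Process(target=worker, args=(work_queue,), daemon=True) for _ in range(4)]
    for w in workers:
        w.start()
    total, batch_size, sent = 1000, 100, 0
    while sent < total:
        for i in range(batch_size):
            work_queue.put({'doc_id': sent + i})
        work_queue.join()   # block until the workers have drained this batch
        sent += batch_size
    # the daemon workers are discarded when the main process exits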
class Qiushi(object): def __init__(self): self.url = 'https://www.qiushibaike.com/8hr/page/{}/' self.headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36' } self.file = open('qiushi.json', 'w') # JoinableQueue is required here: task_done()/join() are called on these queues below self.url_queue = JoinableQueue() self.response_queue = JoinableQueue() self.data_queue = JoinableQueue() def generate_url_list(self): print('Generating the url queue') # return [self.url.format(i) for i in range(1,14)] for i in range(1, 14): url = self.url.format(i) self.url_queue.put(url) def get_data(self): while True: url = self.url_queue.get() print('Fetching the response for {}'.format(url)) response = requests.get(url, headers=self.headers) if response.status_code == 503: self.url_queue.put(url) else: self.response_queue.put(response.content) self.url_queue.task_done() def parse_data(self): while True: data = self.response_queue.get() print('Parsing') # build an element object from the page source html = etree.HTML(data) # get the list of post nodes el_list = html.xpath('//div[@id="content-left"]/div') data_list = [] # iterate over the post nodes for el in el_list: temp = {} temp['content'] = el.xpath('./a/div/span/text()')[0].strip() data_list.append(temp) # print(temp) self.data_queue.put(data_list) self.response_queue.task_done() def save_data(self): while True: print('Saving') data_list = self.data_queue.get() for data in data_list: json_data = json.dumps(data, ensure_ascii=False) + ',\n' self.file.write(json_data) self.data_queue.task_done() def __del__(self): self.file.close() def run(self): # # url # # url_list # url_list = self.generate_url_list() # # headers # # iterate over url_list # for url in url_list: # # # send a request and get the response # data = self.get_data(url) # # # parse the response # data_list = self.parse_data(data) # # # save # self.save_data(data_list) thread_list = [] # create the url-generating process t_generate_list = Process(target=self.generate_url_list) thread_list.append(t_generate_list) # create the request-sending processes for i in range(4): t = Process(target=self.get_data) thread_list.append(t) # create the response-parsing processes for i in range(3): t = Process(target=self.parse_data) thread_list.append(t) t_save_data = Process(target=self.save_data) thread_list.append(t_save_data) # daemonize and start all processes for t in thread_list: t.daemon = True t.start() for q in [self.url_queue, self.response_queue, self.data_queue]: q.join()
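The Qiushi class above is a three-stage fetch, parse, save pipeline: each stage reads from its own JoinableQueue, forwards results to the next queue, and the main process joins every queue in order while the daemon workers are simply discarded on exit. A compact sketch of the same staging idea follows; the transform/save stages and the ten-item workload are placeholders, not part of the scraper.

from multiprocessing import JoinableQueue, Process

def transform_stage(inq, outq):
    # Middle stage: take an item, transform it, forward it downstream.
    while True:
        item = inq.get()
        outq.put(item * 2)
        inq.task_done()

def save_stage(inq):
    # Terminal stage: consume items without forwarding.
    while True:
        item = inq.get()
        print('saved', item)
        inq.task_done()

if __name__ == '__main__':
    q1, q2 = JoinableQueue(), JoinableQueue()
    workers = [Process(target=transform_stage, args=(q1, q2), daemon=True),
               Process(target=save_stage, args=(q2,), daemon=True)]
    for w in workers:
        w.start()
    for i in range(10):
        q1.put(i)
    # Joining the queues in pipeline order waits for the whole pipeline to drain;
    # the daemon workers are then discarded when the main process exits.
    for q in (q1, q2):
        q.join()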
def produce(q: JoinableQueue, n: int): for i in range(n): q.put(f'{current_process().name}: {i}')
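produce() covers only the producer side of the pattern. Below is a hedged sketch of a matching consumer and driver; consume() and the two-producer setup are assumptions added for illustration, and produce() is repeated verbatim so the snippet is self-contained.

from multiprocessing import JoinableQueue, Process, current_process

def produce(q: JoinableQueue, n: int):
    # produce() as defined above: enqueue n messages tagged with the producer's name.
    for i in range(n):
        q.put(f'{current_process().name}: {i}')

def consume(q: JoinableQueue):
    # Acknowledge every message so q.join() can return in the driver.
    while True:
        msg = q.get()
        print(msg)
        q.task_done()

if __name__ == '__main__':
    q = JoinableQueue()
    producers = [Process(target=produce, args=(q, 5), name=f'producer-{i}') for i in range(2)]
    consumer = Process(target=consume, args=(q,), daemon=True)
    consumer.start()
    for p in producers:
        p.start()
    for p in producers:
        p.join()   # wait for every message to be enqueued
    q.join()       # wait for the consumer to acknowledge them all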