Example #1
def queue_info(iters=None):
    work = JoinableQueue()

    for filename in iters or []:   # tolerate the default of None
        work.put(obj=filename)
    time.sleep(1)
    return work
def main():
    jobs = JoinableQueue()
    result = JoinableQueue()


    numToProcess = -1
    scores = pd.DataFrame(columns=['query','fmeasure','precision','recall',
                                   'size','maxDistance','topHits',"contextSteps"])

    print len(datasets)

    for key in datasets:
        jobs.put(key)

    processed_count = Counter()
        
    for i in xrange(NUMBER_OF_PROCESSES):
        p = Process(target=work, args=(i, jobs, result, processed_count))
        p.daemon = True
        p.start()

    #work(1, jobs, result, processed_count)

    automated_annotations = {}
    distances = {}

    jobs.join()

    dataset_index = collections.defaultdict(set)
    annotated_datasets = set()
    while not result.empty():
        dataset, classes = result.get()
        if len(classes) == 0:
            annotated_datasets.add(dataset)
        for c in classes.keys():
            dataset_index[c].add(dataset)
            owl_class = Class(c, graph=graph)
            for parent in owl_class.parents:
                dataset_index[parent.identifier].add(dataset)
        result.task_done()

    print '\n'
    
    for query, c in queries.items():
        manual = ground_truth[query]
        automated = dataset_index[c]
        hits = manual & automated
        misses = manual - automated
        precision = np.nan if len(automated) == 0 else float(len(hits)) / len(automated)
        recall = np.nan if len(manual) == 0 else float(len(hits)) / len(manual)
        if precision != 0 or recall != 0:
            fmeasure = 0 if np.isnan(precision) or np.isnan(recall) else 2 * (precision * recall) / (precision + recall)
        else:
            fmeasure = 0
        scores = scores.append(dict(query=query, size=len(manual), precision=precision, recall=recall, fmeasure=fmeasure,topHits=topHits, maxDistance=maxDistance, contextSteps = context_steps),
                        ignore_index=True)
        print "Hits for", query, c
        print '\n'.join(sorted(hits))
    print scores
    print "Annotated", len(annotated_datasets), "datasets."
def launch_mesos_tf(marathon_url_str, tsknom_str, cpu_float, mem_float, ntasks_int, uri_str, marathon_usr, marathon_usrpwd, localhost_str, mxattempts=10):
   toret_nodes = dict()

   docker = False
   if uri_str.find('docker') > -1:
      uri_str = uri_str.replace('docker://', '')
      docker = True
 
   uri_str = uri_str.rstrip('/')
   marathon_url_str = marathon_url_str.rstrip('/') 

   counter = 0
   tq = JoinableQueue()
   q = Queue()
   plist = list()

   consumers = [ Consumer(tq, q) for i in xrange(ntasks_int) ]
   for c in consumers:
      c.start()

   for i in xrange(ntasks_int):
      tq.put(Task(post_marathon_tasks, (marathon_url_str, tsknom_str, cpu_float, mem_float, i+1, ntasks_int, uri_str, marathon_usr, marathon_usrpwd, localhost_str, mxattempts, docker)))

   for i in xrange(ntasks_int):
      tq.put(None)

   tq.join()

   for i in xrange(1, ntasks_int+1):
      toret_nodes[i] = q.get()

   return toret_nodes
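The Consumer and Task classes used above are not shown; a sketch that follows the standard JoinableQueue worker pattern and matches the call sites (the bodies are assumptions):

from multiprocessing import Process

class Task(object):
    """Wrap a callable and its arguments so the pair can travel through the task queue."""
    def __init__(self, func, args):
        self.func = func
        self.args = args

    def __call__(self):
        return self.func(*self.args)


class Consumer(Process):
    """Pull Task objects from tq until the None poison pill arrives, pushing results onto q."""
    def __init__(self, tq, q):
        Process.__init__(self)
        self.tq = tq
        self.q = q

    def run(self):
        while True:
            task = self.tq.get()
            if task is None:          # poison pill: acknowledge it and stop
                self.tq.task_done()
                break
            self.q.put(task())        # run the wrapped call and keep its result
            self.tq.task_done()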
Example #4
def setup_queue(options):	
	# note: both queues are join()ed below, so Queue here must refer to a JoinableQueue
	probe_servers = Queue()
	progress_queue = Queue()

	run = Probe.ProbeRun.objects.get(id = options.run_id)

	summary_top = Results.ResultSummaryList.objects.get(part_of_run=run)
	summary_top.setup()

	connection.close()
	
	threads = [] 
	for i in range(options.threads):
		new_thread = Process(target=SetupQueueThread, args=(i,run, probe_servers, progress_queue))
		new_thread.daemon = True
		new_thread.start()
		threads.append(new_thread)
		
	progress_thread = threading.Thread(target=__ProgressCounter, args=(run,  progress_queue, threads,options))
	progress_thread.daemon = True
	progress_thread.start()

	i = 0;
	if options.input_filename and (not options.count or i < options.count):
		for hostname_line in fileinput.input(options.input_filename, openhook=fileinput.hook_compressed):
			probe_servers.put(hostname_line)
			i+=1
			if options.count and i >= options.count:
				break;

	probe_servers.join()
	progress_queue.join()
	
	return run
Example #5
class emailSubsystem(object):
    def __init__(self):
        ### will move to Celery eventually; with Celery, the app would be able to periodically
        # wake up and check on replyQueue to see which emails were sent, which were not, and
        # what to do ...

        self.emailQueue = JoinableQueue()
        self.replyQueue = JoinableQueue()

        self.worker = Process(target=sendEmailWorker, args=(self.emailQueue, self.replyQueue))

    def start(self):
        # temporarily comment out starting a new process as it seems to leave zombies
        # and causes the app not to start once the max process limit is reached.
        #self.worker.start()
        return

    def shutdown(self):
        # post poison pill
        # wait on the queue to be done; ie join on emailQueue
        # wait on the worker process to die; ie join on worker

        self.emailQueue.put(None)
        self.emailQueue.join()
        self.worker.join()
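sendEmailWorker is not shown; a plausible shape (the deliver() call is hypothetical) that honours the poison pill posted by shutdown(): it must call task_done() for every item, including the final None, or emailQueue.join() would block forever.

def sendEmailWorker(emailQueue, replyQueue):
    while True:
        message = emailQueue.get()
        if message is None:             # poison pill posted by shutdown()
            emailQueue.task_done()
            break
        try:
            status = deliver(message)   # hypothetical SMTP/send helper
        except Exception as exc:
            status = ('error', exc)
        replyQueue.put((message, status))
        emailQueue.task_done()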
def crunch(file_name, ext_type, handler, pool_size=4, queue_size=40,
           limit=None):

    print 'Crunching file: %s, limit: %s' % (file_name, limit)

    q = JoinableQueue(queue_size)
    q_feats = Queue()

    pool = Pool(pool_size, wrap_handler(handler), ((q, q_feats),))

    with file_reader(file_name) as reader:
        idx = 0
        for entry in reader:

            if (entry.pathname.find(ext_type) != -1):
                text = [b for b in entry.get_blocks()]
                key = entry.pathname.split('/')[-1].split('.')[0]

                q.put((key, text), True)
                idx += 1

                print 'Processing:', entry.pathname, idx

                if limit and idx >= limit:
                    print 'Reached the limit'
                    break

        q.close()
        q.join()
        pool.close()

    result = []
    for i in range(q_feats.qsize()):
        result.append(q_feats.get())
    return result
Example #7
def build(opts):
    tasks = JoinableQueue()
    results = JoinableQueue()
    
    if opts.remove:
        log.info("Removing existing docs collection")
        session = utils.get_session(config)
        session.docs.drop()
        
    # start up our builder threads
    log.info("Creating %d Builder processes" % opts.threads)
    builders = [ Builder(tasks, results) for i in xrange(opts.threads)]
    for b in builders:
        b.start()
        
    # queue up the bibcodes
    for bib in get_bibcodes(opts):
        tasks.put(bib)
    
    # add some poison pills to the end of the queue
    log.info("poisoning our task threads")
    for i in xrange(opts.threads):
        tasks.put(None)
    
    # join the task queue. this should
    # block until all tasks in the task queue are completed
    log.info("Joining the task queue")
    tasks.join()
    log.info("Joining the task threads")
    for b in builders:
        b.join()
    
    log.info("All work complete")
def upload(args=None, authdata=None):
    """
    Initialize the containers and pseudo-directories for what is to be
    uploaded.  Separates jobs into sub-jobs based on container.
    Up to 100 containers per second.
    """
    #initialize the containers in parallel
    containers = []
    for obj in os.listdir(args['dir']):
        if args['container']:
            containers.append(args['container'])
            break
        #if os.path.isdir(os.path.abspath(args['dir']+'/'+obj)):
        if os.path.isdir(os.path.join(args['dir'], obj)):
            containers.append(obj)
    if containers:
        #set container job count to the lesser of args['cc'] or container count
        if args['cc'] > len(containers):
            args['cc'] = len(containers)
        #create queue and jobs
        container_queue = JoinableQueue()
        for container_worker in range(args['cc']):
            job = Process(target=container_consumer,
                          args=(args, authdata, container_queue,))
            job.daemon=False
            job.start()
        for container in containers:
            container_queue.put(container)
        #tail the queue with a None marker so the workers shut down nicely.
        for container in range(args['cc']):
            container_queue.put(None)
        container_queue.join()
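container_consumer is assumed to look roughly like this (the create_container call is hypothetical): each worker creates the containers it receives and exits on the trailing None marker.

def container_consumer(args, authdata, container_queue):
    while True:
        container = container_queue.get()
        if container is None:                             # trailing marker: shut down cleanly
            container_queue.task_done()
            break
        try:
            create_container(args, authdata, container)   # hypothetical API call
        finally:
            container_queue.task_done()                   # keeps container_queue.join() accurate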
class SimpleSynergeticServer(Process):
    
    def __init__(self, authen_key):
        Process.__init__(self)
        self.__task_queue = JoinableQueue(1)
        self.__return_queue = Queue(1)
        self.serv = Listener(('', 40000), authkey=authen_key)
    
    def run(self):
        print 'Server Works'
        copy_reg.pickle(types.MethodType, _reduce_method)
        #Start the SynergeticProcess in daemon mode
        worker_p = SynergeticProcess(self.__task_queue, self.__return_queue)
        worker_p.daemon = True
        worker_p.start()          
        while True:
            print 'wait for Client'
            pool_conn = self.serv.accept()
            print 'connection Client Accepted'
            while True:
                print 'in LOOP Simple Server'
                #There is no need for task_id in this version
                try:
                    print 'Try to recv MSG'
                    unpickled_msg = pool_conn.recv()
                    print 'MSG Received'
                except Exception as e: # EOFError:
                    print 'Fail To Receive MSG:', e
                    break 
                if unpickled_msg[0] == 'MODULES':
                    self.import_mods( unpickled_msg[1] )
                    ret = 'MODULES-READY'
                else:    
                    self.__task_queue.put(unpickled_msg)
                    ret = self.__return_queue.get()
                try:
                    print 'SEND RESPONSE'
                    try:
                        pool_conn.send( ret )
                    except EOFError:
                        print 'SEND TO POOL FAILED'
                    print 'RESPONSE SENT ', ret
                except EOFError:
                    break
            pool_conn.close()
    
    def import_mods(self, mods_d):
        for mod_name, mod_bytecode in mods_d.items():
            try:
                fobj = open(mod_name + ".pyc", 'wb')
            except Exception as e:
                print("Synergeticprocessing.SimpleServer --> Module file error: %s" % e)
            else:
                # only write/close if the file was actually opened
                try:
                    fobj.write( mod_bytecode )
                finally:
                    fobj.close()
        for mod in mods_d:
            print 'blocking'
            __import__( mod )
            print 'imported ', mod
Example #10
def solve(iterations, proc_count):

    queue = JoinableQueue()
    partition = get_iterations_partition(iterations, proc_count)
    for iteration in partition:
        queue.put(iteration)
    for i in range(proc_count):
        queue.put(None)

    manager = Manager()
    result = manager.list()
    processes = []

    cur_time = time.time()
    for i in range(proc_count):
        proc = Process(target=worker, args=(queue, result,))
        proc.start()
        processes.append(proc)

    queue.join()
    for proc in processes:
        proc.join()

    cur_time = time.time() - cur_time
    print_results(cur_time, result, iterations)
Example #11
def aggress(map):
    global startMap
    startMap = map

    #print "Regressing..."
    state = State()

    jobs = []

    longestSolution = Value('d', 20)
    highestScore = Value('d', 0)

    queue = JoinableQueue()

    manager = Manager()

    d = manager.dict()
    d.clear()

    l = RLock()

    if multiProc:
        queue.put((state, map, 1))

        for i in range(numProcs):
            p = Process(target=multiMain, args=(startMap, l, d, queue, highestScore))
            p.start()

        queue.join()
    else:
        a(l, highestScore, d, None, state, map, 1)
Example #12
class ProcessPool(object):

    def __init__(self, size=1):
        self.size = size
        self.jobs = Queue()        # note: join()/task_done() require this to be a JoinableQueue
        self.results = Queue()
        self.processes = []

    def start(self):
        '''start all processes'''

        for i in range(self.size):
            self.processes.append(ProcessWorker(self))

        for process in self.processes:
            process.start()

    def append_job(self, job, *args, **kwargs):
        self.jobs.put((job, args, kwargs))

    def join(self):
        '''waiting all jobs done'''
        self.jobs.join()

    def stop(self):
        '''kill all processes'''
        for process in self.processes:
            process.stop()

        for process in self.processes:  # waiting processes completing
            if process.is_alive():
                process.join()

        del self.processes[:]  # reset processes to empty
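Typical use of this pool, assuming jobs is a joinable queue (see the note above) and that ProcessWorker pulls (job, args, kwargs) tuples from pool.jobs, runs job(*args, **kwargs), puts the return value on pool.results, and marks each tuple with task_done():

def square(x):
    return x * x

pool = ProcessPool(size=4)
pool.start()
for n in range(10):
    pool.append_job(square, n)
pool.join()   # returns once every queued job has been marked done
pool.stop()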
Example #13
def get_citations(**args):
    """
    Method to prepare the actual citation dictionary creation
    """
    # create the queues
    tasks = JoinableQueue()
    results = JoinableQueue()
    # how many threads are there to be used
    if 'threads' in args:
        threads = args['threads']
    else:
        threads = cpu_count()
    # initialize the "harvesters" (each harvester get the citations for a bibcode)
    harvesters = [ CitationHarvester(tasks, results) for i in range(threads)]
    # start the harvesters
    for b in harvesters:
        b.start()
    # put the bibcodes in the tasks queue
    for bib in args['bibcodes']:
        tasks.put(bib)
    # add some 'None' values at the end of the tasks list, to facilitate proper closure
    for i in range(threads):
        tasks.put(None)

    tasks.join()
    for b in harvesters:
        b.join()

    return [item for sublist in cit_dict.values() for item in sublist]
Example #14
    def run(self):

        # Changes the process name shown by ps for instance
        setProcTitle ("agentcluster master [version: %s] [monitoring: %d seconds]" % (__version__,self.monitoring_period) );

        try:
            logger.info ( 'Agent cluster server starting' );

            logger.info ( 'Configurations will be scanned in directories:' );
            for directory in confdir.data:
                logger.info ( '  o %s', os.path.abspath(directory) );

            self.watchdog = Watchdog(self.monitoring_period)
            self.watchdog.start()

            # Generates a deadlock to enter in sleep mode
            # Only an external signal can break this deadlock
            logger.info ( 'Agent cluster server started' );
            queue = JoinableQueue()
            queue.put(object());
            queue.join();

        except KeyboardInterrupt:
            logger.info ( 'Agent cluster server interrupted' );
        except Exception:
            logger.error ( 'Exception caught in main process: %s', sys.exc_info()[1] );
            logger.debug ( "", exc_info=True );
        finally:
            # First stop the monitoring to avoid restarting killed agents
            if self.watchdog is not None:
                self.watchdog.shutdown = True
                self.watchdog.join()
            logger.info ( 'Agent cluster server end' );
            logging.shutdown()
Example #15
def main(opts, files):
    
    if opts.threads == 1:
        log.info("running synchronously")
        run_syncronous(opts, files)
    else:
        Q = JoinableQueue()
        workers = [Worker(Q, opts) for i in xrange(opts.threads)]
        
        log.info("initializing %d threads" % opts.threads)
        for w in workers:
            w.start()
            
        # push log events onto the queue
        events_iter = events(files, opts)
        if opts.limit:
            events_iter = itertools.islice(events_iter, opts.limit)
            
        for event in events_iter:
            Q.put(event)
        
        # add poison pills 
        for i in xrange(opts.threads):
            Q.put(None)
            
        Q.join()
        log.info("work complete. shutting down threads.")
        for w in workers:
            w.join()   
Example #16
def evaluate(points,meshToBasis,kernel,quadRule,coeffs,nprocs=None):
    """Evaluate a kernel using the given coefficients"""


    if nprocs is None: nprocs = cpu_count()

    inputQueue=JoinableQueue()

    nelements=meshToBasis.nelements

    for elem in meshToBasis: inputQueue.put(elem)

    buf=sharedctypes.RawArray('b',len(points[0])*numpy.dtype(numpy.complex128).itemsize)
    result=numpy.frombuffer(buf,dtype=numpy.complex128)
    result[:]=numpy.zeros(1,dtype=numpy.complex128)

    time.sleep(.5)
    workers=[]

    for id in range(nprocs):
        worker=EvaluationWorker(points,kernel,quadRule,coeffs,inputQueue,result)
        worker.start()
        workers.append(worker)


    inputQueue.join()
    for worker in workers: worker.join()

    return result.copy()
Example #17
def readCEFFile(afile,pygtail):
    if exists(afile): #sometimes files can move/archive while we iterate the list
        try:
            #start a process to post our stuff.
            logcache=JoinableQueue()
            postingProcess=Process(target=postLogs,args=(logcache,),name="cef2mozdefHTTPPost")
            postingProcess.start()            
            #have pygtail feed us lines 
            for line in pygtail:
                pygtail._update_offset_file()
                cefDict=parseCEF(line)
                #logger.debug(json.dumps(cefDict))
                #append json to the list for posting
                if cefDict is not None:
                    logcache.put(json.dumps(cefDict))        
            logger.info('{0} done'.format(afile))
            logger.info('waiting for posting to finish')
            logcache.put(None)
            logcache.close()
            #logger.info('posting done')
        except KeyboardInterrupt:
            sys.exit(1)
        except ValueError as e:
            logger.fatal('Exception while handling CEF message: %r'%e)
            sys.exit(1)    
class QueueTask:
    def __init__(self):
        self.queue = JoinableQueue()
        self.event = Event()
        atexit.register( self.queue.join )

        process = Process(target=self.work)
        process.daemon = True
        process.start()


    def work(self):
        while True:
            func, args, wait_for = self.queue.get()

            for evt in wait_for: 
                evt.wait()
            func(*args)
            self.event.set()

            self.queue.task_done()


    def enqueue(self, func, args=[], wait_for=[]):
        self.event.clear()
        self.queue.put( (func, args, wait_for) )

        return self.event 
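Example use of QueueTask: enqueue() hands (func, args, wait_for) to the background worker; the shared Event it returns is set after each task completes, so the caller can wait on it (a sketch; download and process are hypothetical):

def download(url):
    print('downloading %s' % url)

def process(path):
    print('processing %s' % path)

tasks = QueueTask()
done = tasks.enqueue(download, args=['http://example.com/data.csv'])
done.wait()                    # block until the worker has run download()
tasks.enqueue(process, args=['data.csv'])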
Example #19
	def batchProcess(self, arr_to_enque, work_method, t=False):
		q = JoinableQueue()
		output = JoinableQueue()
		extra = JoinableQueue()
		third = JoinableQueue()
		if t: 
			args = ((q, output, extra, third))
		else:
			args=(q, output, extra)
		for obj in arr_to_enque:
			q.put(obj)
		processes = [Process(target=work_method, args=args, name=str(obj)) for obj in arr_to_enque]
		for p in processes:
			p.start()
		for p in processes: 
			p.join(30)
			if p.is_alive():
				print "ERROR JOINING PROCESS FOR: ", p.name
				p.terminate()
				raise Exception("Goal Conversion Error:", (self.account_id, self.project_id, exp_id, var_ids))
		print "end batch process"
		if t:
			return (output, extra, third)
		else:
			return (output, extra)
Example #20
def main(workers=10):
    """
    Executes main function of mini-framework's Control thread.
    :param workers: Integer detailing number of worker FIFO threads to employ
    """
    start_logging()
    log_info("New multiprocessing session with {} workers".format(workers))
    
    # Input JoinableQueue and Output Queue
    inq = JoinableQueue(maxsize=int(workers*1.5))
    outq = Queue(maxsize=int(workers*1.5))
    
    ot = OutThread(workers, outq)
    ot.start()
    
    for _ in range(workers):
        w = WorkerThread(inq, outq)
        w.start()
    
    # Create a sequence of 1000 random alphabetic characters
    random_chars = (ascii_letters[randint(0, 51)] for _ in range(1000))
    
    # Keep input queue loaded for as long as possible
    # Feed the process pool with work units
    for work in enumerate(random_chars):
        inq.put(work)
    
    # Fill the input queue with Nones to shut the worker threads down
    # which terminates the process pool
    for _ in range(workers):
        inq.put(None)
        
    inq.join()
    print("Control process terminating")
Example #21
def cdp_no_split_single(loaded_seq_list, loaded_seq_name_list,
                        ref_file,
                        nt, cores):
    """
    Aligns a single SRNA_seq object to multiple refseq seqs in a Ref object
    at a time.  No splitting of read counts.
    """

    refs = RefSeq()
    refs.load_ref_file(ref_file)
    print(colored("------------------ALIGNING READS------------------\n", 'green'))

    workers = cores
    work_queue = JoinableQueue()
    processes = []
    mgr = Manager()
    count = 0
    counts_by_ref = mgr.dict()  # {header:[count1, count2,.......]}
    for header, seq in refs:
        work_queue.put((header, seq,))
        count += 1
        if count % 10000 == 0:
            _cdp_no_split_single_queue(counts_by_ref, loaded_seq_list, nt, processes, work_queue, workers)
    _cdp_no_split_single_queue(counts_by_ref, loaded_seq_list, nt, processes, work_queue, workers)

    _cdp_single_output(counts_by_ref.copy(), loaded_seq_name_list, ref_file, nt)
Example #22
class JavaMultipleParserExecutor:
    def __init__(self, output_dir, repo_path, processes=None):
        self.target_blobs = JoinableQueue()

        self.num_consumers = processes if processes else cpu_count()
        self.consumers = [JavaConsumer(self.target_blobs, repo_path, output_dir)
                          for i in range(self.num_consumers)]

        for consumer in self.consumers:
            consumer.start()

        self.closed = False

    def parse_blob(self, blob):
        if self.closed:
            return
        self.target_blobs.put(blob.hexsha)

    def join(self):
        if self.closed:
            return
        for i in range(self.num_consumers):
            self.target_blobs.put(None)

        self.target_blobs.join()
        self.closed = True
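Typical driving code for this executor (the blob iteration is an assumption; any object exposing a hexsha attribute works, e.g. GitPython blobs):

executor = JavaMultipleParserExecutor(output_dir='parsed/', repo_path='my-repo/')
for blob in java_blobs:          # hypothetical iterable of *.java blobs from a tree walk
    executor.parse_blob(blob)    # only blob.hexsha is sent to the consumers
executor.join()                  # poison pills + join; the executor cannot be reused afterwards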
Example #23
def processData(imageList,featuresDir,featuresExt,task):
  numProcs = 8
  taskQueue = JoinableQueue()
  resultQueue = Queue()
  processes = []
  for i in range(numProcs):
    t = Process(target=worker, args=(taskQueue, resultQueue, task))
    t.daemon = True
    t.start()
    processes.append(t)

  for img in imageList:
    filename = featuresDir+'/'+img+'.'+featuresExt
    idxFile = re.sub(r'\..+$',r'.idx',filename)
    content = open(filename)
    index = open(idxFile)
    taskQueue.put( (img,content.read(),index.read()) )
    #taskQueue.put( (img,filename,idxFile) )
    index.close()
    content.close()
  for i in range(len(processes)):
    taskQueue.put('stop')

  results = []
  retrieved = 0
  while retrieved < len(imageList):
    data = resultQueue.get()
    retrieved += 1
    if data != 'Ignore':
      results.append(data)
  return results
def parallelPrepareImg(img, info, name, idx):
  # Make Color Image
  if img.ndim == 2:
    img = np.tile(img[:, :, np.newaxis], (1, 1, 3))
  elif img.shape[2] == 4:
    img = img[:, :, :3]
  # Prepare processes
  numProcs = 3
  taskQueue = JoinableQueue()
  resultQueue = ProcQueue()
  processes = []
  for i in range(numProcs):
    t = Process(target=singleWindowProcess, args=(taskQueue, resultQueue, img))
    t.daemon = True
    t.start()
    processes.append(t)
  j = 0
  # Add tasks to the queue
  for b in info:
    idx.write(b[4])
    taskQueue.put( (b,j) )
    j += 1
  for i in range(len(processes)):
    taskQueue.put('stop')
  # Collect results
  data = np.zeros([len(info), 227, 227, 3])
  retrieved = 0
  while retrieved < len(info):
    j,win = resultQueue.get()
    data[j,:,:,:] = win
    retrieved += 1
  # Substract mean and return
  data -= imagenet.IMAGENET_MEAN[14:241,14:241,:]
  return data.swapaxes(2, 3).swapaxes(1, 2)
def find_vocabulary(data_dir, stats_dir, category, min_num_images, save_description):
    print "Start find vocabulary"
    filequeue = JoinableQueue()
    photoqueue = Queue()

    init_dict = initialize_variables(None, None, False)

    # Create new processes
    num_processes = cpu_count()
    temp_dir = os.path.join(stats_dir, "database_temp", "vocab", category)
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)
    processes = [FindVocabularyProcess(filequeue, photoqueue, init_dict, 30.0, num_processes, temp_dir, category) for i in xrange(num_processes)]
    for p in processes:
        p.start()

    #Add the files to the process queue
    add_files_to_queue(data_dir, category, filequeue)
    #Add a poison pill for each process
    for i in xrange(num_processes):
        filequeue.put("Stop")

    for p in processes:
        p.join()

    merge_vocabulary_files(data_dir, temp_dir, min_num_images, save_description)

    print "Removing temp files"
    shutil.rmtree(temp_dir)

    print "Done with find vocabulary"
Example #26
class FlightProducer(Process):

    def __init__(self, options={}, date_group=[]):
        Process.__init__(self)   # initialize the Process base class
        self.options    = options
        self.date_group = date_group
        self.date_queue = JoinableQueue()

    def start(self):
        consumers_list = []
        consumers_num  = cpu_count() * 2

        # Consumers
        for i in xrange(consumers_num):
            consumers_list.append(FlightConsumer(self.options, self.date_queue))

        for consumer in consumers_list:
            consumer.start()

        # Put each date group to queue
        for date_item in self.date_group:
            self.date_queue.put(date_item)

        # Tell consumers can exit
        for i in xrange(consumers_num):
            self.date_queue.put(None)

        # Wait for all of the consumers to finish
        self.date_queue.join()

        print('Done')
Example #27
def task_writer(task: JoinableQueue):
    for n in News.objects.all()[:50].iterator():
        task.put(n)

    for i in range(PROCESS_NUM):
        task.put("end")
    print("task writer ends")
def save_transaction_list(data_dir, stats_dir, category, concept_vocabulary, save_description):
    print "Start saving transaction list"
    filequeue = JoinableQueue()

    concept_vocabulary_list, concept_vocabulary_freq = zip(*concept_vocabulary)
    init_dict = initialize_variables(concept_vocabulary_list, None, True)

    # Create new processes
    temp_dir = os.path.join(stats_dir, "transaction_list")
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)
    else:
        print "todo"
    lock = Lock()
    num_processes = cpu_count()
    processes = [TransactionListProcess(filequeue, init_dict, 30, num_processes, temp_dir, save_description, lock) for i in xrange(num_processes)]
    for p in processes:
        p.start()

    #Add the files to the process queue
    add_files_to_queue(data_dir, category, filequeue)
    #Add a poison pill for each process
    for i in xrange(num_processes):
        filequeue.put("Stop")

    for p in processes:
        p.join()

    print "Removing temp files"
    shutil.rmtree(temp_dir)

    print "Done with saving transaction list"
def main(multiplier):
    # Establish communication queues
    tasks = JoinableQueue()
    results = Queue()

    # Start consumers
    num_consumers = cpu_count() * multiplier
    print 'Creating %d consumers' % num_consumers
    consumers = [Consumer(tasks, results) for i in xrange(num_consumers)]
    for w in consumers:
        w.start()
    
    fout = open(os.path.join(settings.PERSIST_DIR, 'doc_matrix_comparison.csv'), 'w', 0)
    rw = ResultWriter(results, csv.writer(fout))
    rw.start()

    #num_docs = 801781
    num_docs = 25
    for i in xrange(num_docs):
        tasks.put(Task(i))


    # Add a poison pill for each consumer
    for i in xrange(num_consumers):
        tasks.put(None)

    # Wait for all of the tasks to finish
    tasks.join()
    results.put('STOP')
Example #30
class MMapPool(object):
    def __init__(self, n, mmap_size):
        self.n = n
        self.mmap_size = mmap_size
        self.pool = [mmap.mmap(-1, mmap_size) for _ in range(n)]
        self.free_mmaps = set(range(n))
        self.free_queue = JoinableQueue()

    def new(self):
        if not self.free_mmaps:
            self.free_mmaps.add(self.free_queue.get())
            self.free_queue.task_done()
        while True:
            try:
                self.free_mmaps.add(self.free_queue.get_nowait())
                self.free_queue.task_done()
            except Empty:
                break
        mmap_idx = self.free_mmaps.pop()
        return mmap_idx, self.pool[mmap_idx]

    def join(self):
        while len(self.free_mmaps) < self.n:
            self.free_mmaps.add(self.free_queue.get())
            self.free_queue.task_done()

    def get(self, idx):
        return self.pool[idx]

    def free(self, idx):
        self.free_queue.put(idx)
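Intended usage of the pool (a sketch): a producer borrows a buffer with new(), hands the index to a consumer process, and the consumer returns it with free(idx); join() then blocks until every buffer has been handed back.

pool = MMapPool(n=4, mmap_size=1024 * 1024)

idx, buf = pool.new()        # borrow a free mmap buffer (index + the mmap itself)
buf.write(b'payload')        # producer fills the shared buffer

# ... a consumer reads pool.get(idx) and, when finished, gives the buffer back:
pool.free(idx)

pool.join()                  # all n buffers are free again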
Example #31
def main():

    info_string = """Individuals that are not present in ped file will not be considered in the analysis."""

    parser = argparse.ArgumentParser(
        description="Annotate genetic models in variant files..")

    parser.add_argument('family_file',
                        type=str,
                        nargs=1,
                        help='A pedigree file in .ped format.')
    parser.add_argument('variant_file',
                        type=str,
                        nargs=1,
                        help='A variant file. Default is vcf format.')

    parser.add_argument('annotation_file',
                        type=str,
                        nargs=1,
                        help='An annotation file. Default is ref_gene format.')

    parser.add_argument('-at',
                        '--annotation_type',
                        type=str,
                        nargs=1,
                        choices=['bed', 'ccds', 'gtf', 'ref_gene'],
                        default=['ref_gene'],
                        help='Specify the format of the annotation file.')

    parser.add_argument('--version',
                        action="version",
                        version=pkg_resources.require("genmod")[0].version)

    parser.add_argument('-v',
                        '--verbose',
                        action="store_true",
                        help='Increase output verbosity.')

    parser.add_argument('-chr',
                        '--chr_prefix',
                        action="store_true",
                        help='If chr prefix is used in vcf.')

    parser.add_argument('-s',
                        '--silent',
                        action="store_true",
                        help='Do not print the variants.')

    parser.add_argument('-phased',
                        '--phased',
                        action="store_true",
                        help='If data is phased use this flag.')

    parser.add_argument(
        '-o',
        '--outfile',
        type=str,
        nargs=1,
        default=[None],
        help='Specify the path to a file where results should be stored.')

    parser.add_argument(
        '-cadd',
        '--cadd_file',
        type=str,
        nargs=1,
        default=[None],
        help='Specify the path to a bgzipped cadd file with variant scores.\
            If no index is present it will be created.')

    args = parser.parse_args()
    var_file = args.variant_file[0]
    file_name, file_extension = os.path.splitext(var_file)
    anno_file = args.annotation_file[0]

    start_time_analysis = datetime.now()

    # Start by parsing at the pedigree file:

    my_family = get_family(args)

    # Parse the header of the vcf:

    head = get_header(var_file)
    add_metadata(head, args)
    # Parse the annotation file and make annotation trees:

    if args.verbose:
        print('Parsing annotation ...')
        print('')
        start_time_annotation = datetime.now()

    annotation_trees = annotation_parser.AnnotationParser(
        anno_file, args.annotation_type[0])

    if args.verbose:
        print('Annotation Parsed!')
        print('Chromosomes found in annotation file: %s' %
              ','.join(list(annotation_trees.gene_trees.keys())))
        print('Time to parse annotation: %s' %
              (datetime.now() - start_time_annotation))
        print('')

    # Check if the cadd file is compressed and indexed:

    if args.cadd_file[0]:
        if args.verbose:
            print('Cadd file! %s' % args.cadd_file[0])
        try:
            tabix_index(args.cadd_file[0],
                        seq_col=0,
                        start_col=1,
                        end_col=1,
                        meta_char='#')
        except IOError as e:
            if args.verbose:
                print(e)

    # # Check the variants:

    # The task queue is where all jobs (in this case batches that represent variants in a region) are put;
    # the consumers will then pick their jobs from this queue.
    variant_queue = JoinableQueue(maxsize=1000)
    # The consumers will put their results in the results queue
    results = Manager().Queue()

    # Create a directory to keep track of temp files
    temp_dir = mkdtemp()

    num_model_checkers = (cpu_count() * 2 - 1)

    if args.verbose:
        print('Number of CPU:s %s' % cpu_count())

    # These are the workers that do the analysis
    model_checkers = [
        variant_consumer.VariantConsumer(variant_queue, results, my_family,
                                         args)
        for i in range(num_model_checkers)
    ]

    for w in model_checkers:
        w.start()

    # This process prints the variants to temporary files
    var_printer = variant_printer.VariantPrinter(results, temp_dir, head,
                                                 args.verbose)
    var_printer.start()

    if args.verbose:
        print('Start parsing the variants ...')
        print('')
        start_time_variant_parsing = datetime.now()

    # For parsing the vcf:
    var_parser = vcf_parser.VariantFileParser(var_file, variant_queue, head,
                                              annotation_trees, args)
    var_parser.parse()

    for i in range(num_model_checkers):
        variant_queue.put(None)

    variant_queue.join()
    results.put(None)
    var_printer.join()

    chromosome_list = var_parser.chromosomes

    if args.verbose:
        print('Chromosomes found in variant file: %s' %
              ','.join(chromosome_list))
        print('Models checked!')
        print('Start sorting the variants:')
        print('')
        start_time_variant_sorting = datetime.now()

    print_headers(args, head)

    for chromosome in chromosome_list:
        for temp_file in os.listdir(temp_dir):
            if temp_file.split('_')[0] == chromosome:
                var_sorter = variant_sorter.FileSort(os.path.join(
                    temp_dir, temp_file),
                                                     outFile=args.outfile[0],
                                                     silent=args.silent)
                var_sorter.sort()

    if args.verbose:
        print('Sorting done!')
        print('Time for sorting: %s' %
              str(datetime.now() - start_time_variant_sorting))
        print('')
        print('Time for whole analysis: %s' %
              str(datetime.now() - start_time_analysis))

    # Remove all temp files:
    shutil.rmtree(temp_dir)
Example #32
    matrizA = cria_matriz(linhas, colunas)
    matrizB = cria_matriz(linhas, colunas)
    matrizC = numpy.zeros(shape=(linhas, colunas))

    print("{}: Multiplicando matrizes".format(time.strftime('%c')))
    queue = JoinableQueue()
    queue_resultados = JoinableQueue()
    for i in range(2):
        worker = Process(target=multiplica_linha_coluna,
                         args=(
                             queue,
                             queue_resultados,
                             matrizA,
                             matrizB,
                         ))
        worker.daemon = True
        worker.start()

    for i in range(len(matrizA)):
        for j in range(len(matrizA[0])):
            queue.put((i, j))

    queue.join()

    while not queue_resultados.empty():
        i, j, valor = queue_resultados.get()
        matrizC[i][j] = valor
        queue_resultados.task_done()

    print("{}: Resultado:{}".format(time.strftime('%c'), matrizC))
Example #33
def main(_):
    parser = argparse.ArgumentParser(description='ProjE.')
    parser.add_argument('--input_dir',
                        dest='input_dir',
                        type=str,
                        help="Data folder",
                        default='./data/WN11/')
    parser.add_argument('--output_dir',
                        dest='output_dir',
                        type=str,
                        help="Data folder",
                        default='.output/')
    parser.add_argument('--lr',
                        dest='lr',
                        type=float,
                        help="Learning rate",
                        default=0.01)
    parser.add_argument("--dim",
                        dest='dim',
                        type=int,
                        help="Embedding dimension",
                        default=200)
    parser.add_argument("--batch",
                        dest='batch',
                        type=int,
                        help="Batch size",
                        default=200)
    parser.add_argument("--comb",
                        dest="combination_method",
                        type=str,
                        help="Combination method",
                        default='simple')
    parser.add_argument("--worker",
                        dest='n_worker',
                        type=int,
                        help="Evaluation worker",
                        default=3)
    parser.add_argument("--generator",
                        dest='n_generator',
                        type=int,
                        help="Data generator",
                        default=10)
    parser.add_argument("--eval_batch",
                        dest="eval_batch",
                        type=int,
                        help="Evaluation batch size",
                        default=500)
    parser.add_argument("--save_dir",
                        dest='save_dir',
                        type=str,
                        help="Model path",
                        default='./')
    parser.add_argument("--load_model",
                        dest='load_model',
                        type=str,
                        help="Model file",
                        default="")
    parser.add_argument("--save_per",
                        dest='save_per',
                        type=int,
                        help="Save per x iteration",
                        default=10)
    parser.add_argument("--eval_per",
                        dest='eval_per',
                        type=int,
                        help="Evaluate every x iteration",
                        default=1)
    parser.add_argument("--max_iter",
                        dest='max_iter',
                        type=int,
                        help="Max iteration",
                        default=100)
    parser.add_argument("--summary_dir",
                        dest='summary_dir',
                        type=str,
                        help="summary directory",
                        default='./ProjE_summary/')
    parser.add_argument("--keep",
                        dest='drop_out',
                        type=float,
                        help="Keep prob (1.0 keep all, 0. drop all)",
                        default=0.5)
    parser.add_argument("--optimizer",
                        dest='optimizer',
                        type=str,
                        help="Optimizer",
                        default='adam')
    parser.add_argument("--prefix",
                        dest='prefix',
                        type=str,
                        help="model_prefix",
                        default='DEFAULT')
    parser.add_argument("--loss_weight",
                        dest='loss_weight',
                        type=float,
                        help="Weight on parameter loss",
                        default=1e-5)
    parser.add_argument("--neg_weight",
                        dest='neg_weight',
                        type=float,
                        help="Sampling weight on negative examples",
                        default=0.5)

    args = parser.parse_args()
    args.input_dir = "../data/WN11/"
    print(args)

    model = ProjE(args.input_dir,
                  embed_dim=args.dim,
                  combination_method=args.combination_method,
                  dropout=args.drop_out,
                  neg_weight=args.neg_weight)

    train_hrt_input, train_hrt_weight, train_trh_input, train_trh_weight, \
    train_loss, train_op = train_ops(model, learning_rate=args.lr,
                                     optimizer_str=args.optimizer,
                                     regularizer_weight=args.loss_weight)
    test_input, test_head, test_tail = test_ops(model)

    with tf.Session() as session:
        tf.initialize_all_variables().run()

        saver = tf.train.Saver()

        iter_offset = 0

        if args.load_model is not None and os.path.exists(args.load_model):
            saver.restore(session, args.load_model)
            iter_offset = int(
                args.load_model.split('.')[-2].split('_')[-1]) + 1
            print("Load model from %s, iteration %d restored." %
                  (args.load_model, iter_offset))

        total_inst = model.n_train

        # training data generator
        raw_training_data_queue = Queue()
        training_data_queue = Queue()
        data_generators = list()
        for i in range(args.n_generator):
            data_generators.append(
                Process(target=data_generator_func,
                        args=(raw_training_data_queue, training_data_queue,
                              model.train_tr_h, model.train_hr_t,
                              model.n_entity, args.neg_weight)))
            data_generators[-1].start()

        evaluation_queue = JoinableQueue()
        result_queue = Queue()
        for i in range(args.n_worker):
            worker = Process(target=worker_func,
                             args=(evaluation_queue, result_queue, model.hr_t,
                                   model.tr_h))
            worker.start()

        for data_func, test_type in zip(
            [model.validation_data, model.testing_data], ['VALID', 'TEST']):
            accu_mean_rank_h = list()
            accu_mean_rank_t = list()
            accu_filtered_mean_rank_h = list()
            accu_filtered_mean_rank_t = list()

            evaluation_count = 0

            for testing_data in data_func(batch_size=args.eval_batch):
                head_pred, tail_pred = session.run([test_head, test_tail],
                                                   {test_input: testing_data})

                evaluation_queue.put((testing_data, head_pred, tail_pred))
                evaluation_count += 1

            for i in range(args.n_worker):
                evaluation_queue.put(None)

            print("waiting for worker finishes their work")
            evaluation_queue.join()
            print("all worker stopped.")
            while evaluation_count > 0:
                evaluation_count -= 1

                (mrh, fmrh), (mrt, fmrt) = result_queue.get()
                accu_mean_rank_h += mrh
                accu_mean_rank_t += mrt
                accu_filtered_mean_rank_h += fmrh
                accu_filtered_mean_rank_t += fmrt

            print(
                "[%s] INITIALIZATION [HEAD PREDICTION] MEAN RANK: %.1f FILTERED MEAN RANK %.1f HIT@10 %.3f FILTERED HIT@10 %.3f"
                %
                (test_type, np.mean(accu_mean_rank_h),
                 np.mean(accu_filtered_mean_rank_h),
                 np.mean(np.asarray(accu_mean_rank_h, dtype=np.int32) < 10),
                 np.mean(
                     np.asarray(accu_filtered_mean_rank_h, dtype=np.int32) < 10
                 )))

            print(
                "[%s] INITIALIZATION [TAIL PREDICTION] MEAN RANK: %.1f FILTERED MEAN RANK %.1f HIT@10 %.3f FILTERED HIT@10 %.3f"
                %
                (test_type, np.mean(accu_mean_rank_t),
                 np.mean(accu_filtered_mean_rank_t),
                 np.mean(np.asarray(accu_mean_rank_t, dtype=np.int32) < 10),
                 np.mean(
                     np.asarray(accu_filtered_mean_rank_t, dtype=np.int32) < 10
                 )))

        for n_iter in range(iter_offset, args.max_iter):
            start_time = timeit.default_timer()
            accu_loss = 0.
            accu_re_loss = 0.
            ninst = 0

            print("initializing raw training data...")
            nbatches_count = 0
            for dat in model.raw_training_data(batch_size=args.batch):
                raw_training_data_queue.put(dat)
                nbatches_count += 1
            print("raw training data initialized.")

            while nbatches_count > 0:
                nbatches_count -= 1

                hr_tlist, hr_tweight, tr_hlist, tr_hweight = training_data_queue.get(
                )

                l, rl, _ = session.run(
                    [train_loss, model.regularizer_loss, train_op], {
                        train_hrt_input: hr_tlist,
                        train_hrt_weight: hr_tweight,
                        train_trh_input: tr_hlist,
                        train_trh_weight: tr_hweight
                    })

                accu_loss += l
                accu_re_loss += rl
                ninst += len(hr_tlist) + len(tr_hlist)

                if ninst % (5000) is not None:  # always true, so progress is printed after every batch
                    print(
                        '[%d sec](%d/%d) : %.2f -- loss : %.5f rloss: %.5f ' %
                        (timeit.default_timer() - start_time, ninst,
                         total_inst, float(ninst) / total_inst, l /
                         (len(hr_tlist) + len(tr_hlist)), args.loss_weight *
                         (rl / (len(hr_tlist) + len(tr_hlist)))),
                        end='\r')
            print("")
            print("iter %d avg loss %.5f, time %.3f" %
                  (n_iter, accu_loss / ninst,
                   timeit.default_timer() - start_time))

            if n_iter % args.save_per == 0 or n_iter == args.max_iter - 1:
                save_path = saver.save(
                    session,
                    os.path.join(
                        args.save_dir, "ProjE_" + str(args.prefix) + "_" +
                        str(n_iter) + ".ckpt"))
                print("Model saved at %s" % save_path)

            if n_iter % args.eval_per == 0 or n_iter == args.max_iter - 1:

                for data_func, test_type in zip(
                    [model.validation_data, model.testing_data],
                    ['VALID', 'TEST']):
                    accu_mean_rank_h = list()
                    accu_mean_rank_t = list()
                    accu_filtered_mean_rank_h = list()
                    accu_filtered_mean_rank_t = list()

                    evaluation_count = 0

                    for testing_data in data_func(batch_size=args.eval_batch):
                        head_pred, tail_pred = session.run(
                            [test_head, test_tail], {test_input: testing_data})

                        evaluation_queue.put(
                            (testing_data, head_pred, tail_pred))
                        evaluation_count += 1

                    for i in range(args.n_worker):
                        evaluation_queue.put(None)

                    print("waiting for worker finishes their work")
                    evaluation_queue.join()
                    print("all worker stopped.")
                    while evaluation_count > 0:
                        evaluation_count -= 1

                        (mrh, fmrh), (mrt, fmrt) = result_queue.get()
                        accu_mean_rank_h += mrh
                        accu_mean_rank_t += mrt
                        accu_filtered_mean_rank_h += fmrh
                        accu_filtered_mean_rank_t += fmrt

                    print(
                        "[%s] ITER %d [HEAD PREDICTION] MEAN RANK: %.1f FILTERED MEAN RANK %.1f HIT@10 %.3f FILTERED HIT@10 %.3f"
                        %
                        (test_type, n_iter, np.mean(accu_mean_rank_h),
                         np.mean(accu_filtered_mean_rank_h),
                         np.mean(
                             np.asarray(accu_mean_rank_h, dtype=np.int32) < 10
                         ),
                         np.mean(
                             np.asarray(accu_filtered_mean_rank_h,
                                        dtype=np.int32) < 10)))

                    print(
                        "[%s] ITER %d [TAIL PREDICTION] MEAN RANK: %.1f FILTERED MEAN RANK %.1f HIT@10 %.3f FILTERED HIT@10 %.3f"
                        %
                        (test_type, n_iter, np.mean(accu_mean_rank_t),
                         np.mean(accu_filtered_mean_rank_t),
                         np.mean(
                             np.asarray(accu_mean_rank_t, dtype=np.int32) < 10
                         ),
                         np.mean(
                             np.asarray(accu_filtered_mean_rank_t,
                                        dtype=np.int32) < 10)))
Example #34
number_of_processes = 6

if os.path.exists(h5filename):
    os.remove(h5filename)
    print(f'h5 file :{h5filename} removed !!!')
hdf5_file = h5py.File(h5filename, mode='w', driver='core')
hdf5_file.create_dataset('train_images', train_shape, dtype=np.uint8, compression='lzf')
hdf5_file.create_dataset('train_labels', shape=(total_images, len(labels)), maxshape=(None, 2), dtype="S10",
                         compression='lzf')
hdf5_file.create_dataset('train_img_names', shape=(total_images, len('8c82ae834697bc55a742cc6001f29ace30e46d9a')),
                         maxshape=(None, 40), dtype="S10",
                         compression='lzf')
for _ in range(number_of_processes):
    p = Process(target=worker, args=())
    p.start()
    processes.append(p)
print('started !')
for item in source():
    tasks_to_accomplish.put(item)
for i in range(number_of_processes):
    tasks_to_accomplish.put(None)
while not tasks_to_accomplish.empty():
    print(f'sleep.5 and size if {tasks_to_accomplish.qsize()}')
    time.sleep(.5)
print('queue is empty !')
for p in processes:
    p.join()
    print(f'process {p.name} joined !!!')
hdf5_file.flush()
hdf5_file.close()
Example #35
class QiuBai:
    """qiubaispider"""
    def __init__(self):
        self.url = 'https://www.qiushibaike.com/8hr/page/{}'
        self.headers = {
            "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
        }
        # these must be joinable queues: task_done()/join() are called on them below
        self.url_queue = Queue()
        self.html_queue = Queue()
        self.content_list_queue = Queue()

    def get_url_list(self):
        """获取url_list"""
        # return [self.url.format(i) for i in range(1, 14)]
        for i in range(1, 14):
            self.url_queue.put(self.url.format(i))

    def parse_url(self):
        """获取响应"""
        while True:
            # print(url)
            url = self.url_queue.get()
            response = requests.get(url, headers=self.headers)
            # return response.content.decode()
            print(response)
            if response.status_code != 200:
                self.url_queue.put(url)
            else:
                self.html_queue.put(response.content.decode())
            self.url_queue.task_done()

    def get_content_list(self):
        """提取数据"""
        while True:
            html_str = self.html_queue.get()
            html = etree.HTML(html_str)
            div_list = html.xpath("//div[@id='content-left']/div")
            content_list = []
            for div in div_list:
                item = {}
                item['user_name'] = div.xpath('.//h2/text()')[0].strip()
                item['content'] = [
                    i.strip() for i in div.xpath(
                        './/div[@class = "content"]/span/text()')
                ]
                content_list.append(item)
            self.content_list_queue.put(content_list)
            self.html_queue.task_done()

    def save_content(self):
        """保存数据"""
        while True:
            content_list = self.content_list_queue.get()
            for content in content_list:
                # print(content)
                with open('process.txt', 'a+', encoding='utf-8') as f:
                    f.write(str(content))
            self.content_list_queue.task_done()

    def run(self):
        thread_list = []
        # 1. Prepare the URL list
        t_url = Process(target=self.get_url_list)
        thread_list.append(t_url)
        # 2. Send requests and fetch the responses
        for i in range(3):
            t_parse = Process(target=self.parse_url)
            thread_list.append(t_parse)
        # 3. Extract the data
        t_content = Process(target=self.get_content_list)
        thread_list.append(t_content)
        # 4. Save the data
        t_save = Process(target=self.save_content)
        thread_list.append(t_save)

        for process in thread_list:
            process.daemon = True  # run the child processes as daemons
            process.start()
        for q in [self.url_queue, self.html_queue, self.content_list_queue]:
            q.join()  # block the main process until the queue's unfinished count reaches 0
Example #36
            rel.append(
                Element('member', dict(type='way', ref=way.attrib['id'])))

            osm.append(way)

        osm.append(rel)

    return ElementTree(osm)


if __name__ == '__main__':

    queue = JoinableQueue()

    group_writer = Process(target=write_groups, args=(queue, ))
    group_writer.start()

    db = connect(host='localhost', user='******', database='gis',
                 password='******').cursor()

    relations = get_relations_list(db)

    for group in gen_relation_groups(relations):
        queue.put(group)

        print >> stderr, '-->', len(group), 'relations'
        print >> stderr, '-' * 80

    group_writer.join()
Example #37
class DeepZoomStaticTiler(object):
    """Handles generation of tiles and metadata for all images in a slide."""

    def __init__(self, slidepath, basename, format, tile_size, overlap,
                limit_bounds, quality, workers, with_viewer):
        if with_viewer:
            # Check extra dependency before doing a bunch of work
            import jinja2
        self._slide = open_slide(slidepath)
        self._basename = basename
        self._format = format
        self._tile_size = tile_size
        self._overlap = overlap
        self._limit_bounds = limit_bounds
        self._queue = JoinableQueue(2 * workers)
        self._workers = workers
        self._with_viewer = with_viewer
        self._dzi_data = {}
        for _i in range(workers):
            TileWorker(self._queue, slidepath, tile_size, overlap,
                        limit_bounds, quality).start()

    def run(self):
        self._run_image()
        if self._with_viewer:
            for name in self._slide.associated_images:
                self._run_image(name)
            self._write_html()
            self._write_static()
        self._shutdown()

    def _run_image(self, associated=None):
        """Run a single image from self._slide."""
        if associated is None:
            image = self._slide
            if self._with_viewer:
                basename = os.path.join(self._basename, VIEWER_SLIDE_NAME)
            else:
                basename = self._basename
        else:
            image = ImageSlide(self._slide.associated_images[associated])
            basename = os.path.join(self._basename, self._slugify(associated))
        dz = DeepZoomGenerator(image, self._tile_size, self._overlap,
                    limit_bounds=self._limit_bounds)
        tiler = DeepZoomImageTiler(dz, basename, self._format, associated,
                    self._queue)
        tiler.run()
        self._dzi_data[self._url_for(associated)] = tiler.get_dzi()

    def _url_for(self, associated):
        if associated is None:
            base = VIEWER_SLIDE_NAME
        else:
            base = self._slugify(associated)
        return '%s.dzi' % base

    def _write_html(self):
        import jinja2
        env = jinja2.Environment(loader=jinja2.PackageLoader(__name__),
                    autoescape=True)
        template = env.get_template('slide-multipane.html')
        associated_urls = dict((n, self._url_for(n))
                    for n in self._slide.associated_images)
        try:
            mpp_x = self._slide.properties[openslide.PROPERTY_NAME_MPP_X]
            mpp_y = self._slide.properties[openslide.PROPERTY_NAME_MPP_Y]
            mpp = (float(mpp_x) + float(mpp_y)) / 2
        except (KeyError, ValueError):
            mpp = 0
        # Embed the dzi metadata in the HTML to work around Chrome's
        # refusal to allow XmlHttpRequest from file:///, even when
        # the originating page is also a file:///
        data = template.render(slide_url=self._url_for(None),
                    slide_mpp=mpp,
                    associated=associated_urls,
                    properties=self._slide.properties,
                    dzi_data=json.dumps(self._dzi_data))
        with open(os.path.join(self._basename, 'index.html'), 'w') as fh:
            fh.write(data)

    def _write_static(self):
        basesrc = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                    'static')
        basedst = os.path.join(self._basename, 'static')
        self._copydir(basesrc, basedst)
        self._copydir(os.path.join(basesrc, 'images'),
                    os.path.join(basedst, 'images'))

    def _copydir(self, src, dest):
        if not os.path.exists(dest):
            os.makedirs(dest)
        for name in os.listdir(src):
            srcpath = os.path.join(src, name)
            if os.path.isfile(srcpath):
                shutil.copy(srcpath, os.path.join(dest, name))

    @classmethod
    def _slugify(cls, text):
        text = normalize('NFKD', text.lower()).encode('ascii', 'ignore').decode()
        return re.sub('[^a-z0-9]+', '_', text)

    def _shutdown(self):
        for _i in range(self._workers):
            self._queue.put(None)
        self._queue.join()
Пример #38
0
class Sprendimas:

    tkList = []
    xList = []
    yList = []
    N = 10**5
    lmd = 2.9
    t0 = 0
    tN = None
    rez = 1
    threads_quantity = None
    threadsList = []
    threads_intervalList = [[]]
    queuePoints = None
    safeQueue = JoinableQueue()
    fmin = 10**-6  # frequency range for the task's plot
    fmax = 10**0

    # Constructor
    def __init__(self, fmin, fmax, lamda, N, rezoliucija, threads=None):
        self.fmin = fmin
        self.fmax = fmax
        self.lmd = lamda
        self.N = N
        self.rez = rezoliucija
        self.FindTn()
        if (threads != None):
            self.threads_quantity = threads
            self.queuePoints = JoinableQueue()
        self.FindPointsXY()

    def PoissonRandom(self, lmd):  # distribution of the Poisson point process
        a = np.e**(-1.0 * lmd)
        r = 1
        n = -1
        while r > a:
            u = np.random.random(1)
            r *= u
            n += 1
        return n

    # Generate the Poisson arrival list and tN
    def FindTn(self):
        # Tk = Tk-1 + P(lmd)k; P(lmd) is a Poisson random number; T0 = 0
        lmd = self.lmd
        N = self.N
        tk = self.t0

        for k in range(1, N + 1):
            #tk += np.random.poisson(lmd)  # library alternative; computed by hand instead
            tk += self.PoissonRandom(lmd)
            self.tkList.append(tk)
            continue
        self.tN = tk  # inputs: N and lmd; outputs: tkList and tN (last element of tkList)
        return

    # Compute the function S(f) (the y value) from the given formula
    def FindSf(self, f, t0, tN, N, tkList):
        c = 0.0
        s = 0.0
        for k in range(0, N):  # sum the sin and cos terms
            c += np.cos(2.0 * np.pi * f * tkList[k])
            s += np.sin(2.0 * np.pi * f * tkList[k])
        c *= c  # square the summed terms
        s *= s
        Sf = (2.0 / (tN - t0)) * (c + s)
        return Sf

    # Compute the (x, y) coordinates for each frequency f
    def FindPointsXY(self):
        tN = self.tN
        N = self.N
        rez = self.rez
        fmin = self.fmin  # actual values, e.g. 10**-6
        fmax = self.fmax
        t0 = self.t0
        tkList = self.tkList
        # step size h, computed on a log10 scale
        step = (math.log10(fmax) - math.log10(fmin)) / rez
        start_time = time.time()
        # Choose how to carry out the computation:
        if self.threads_quantity is None:
            self.CalculatePointsXY_simple(t0, tN, tkList, N, rez, fmin, step)
        else:
            self.CalculatePointsXY_multithreading(t0, tN, tkList, N, rez, fmin,
                                                  step)
        print("Computation time: %s s" % (time.time() - start_time))
        return

    def CalculatePointsXY_simple(self, t0, tN, tkList, N, rez, fmin, step):
        if self.fmax == 1:
            rez -= 1  # !@@@ if fmax == 10**0 then fj == fmax, which would distort the plot
        # j counts how many steps of size h it takes to go from fmin to fmax
        for j in range(0, rez + 1):
            fj = fmin * 10**(j * step)  # f at the given resolution (the x coordinate)
            Sfj = self.FindSf(fj, t0, tN, N, tkList)  # S(f) - the y coordinate
            self.xList.append(fj)
            self.yList.append(Sfj)
        return

    def CalculatePointsXY_multithreading(self, t0, tN, tkList, N, rez, fmin,
                                         step):
        intervalsList = self.threads_intervalList
        threads_q = self.threads_quantity
        queuePoints = self.queuePoints
        if self.fmax != 1:
            rez += 1  # !@@@ if fmax == 10**0 then fj == 1, which would distort the plot
        sveikojiDalis = int(rez / threads_q)  # integer part of the split
        liekana = rez % threads_q  # remainder
        begin = 0
        end = sveikojiDalis
        # Work with the processes:
        # split the resolution range into sub-intervals, one per process
        for threadNumber in range(0, threads_q):
            if liekana > 0:
                end += 1
                liekana -= 1
            intervalsList.insert(threadNumber, [begin, end])
            begin = end
            end = end + sveikojiDalis
            # Create the process
            thread = Process(target=self.ThreadJob,
                             args=(threadNumber, intervalsList, step, fmin, t0,
                                   tN, tkList, N))
            self.threadsList.append(thread)
            self.threadsList[threadNumber].start()  # starts ThreadJob()
        # Merge the per-process interval lists into the final X and Y lists
        time.sleep(5)  # the main process sleeps 5 s while the workers run
        # Since join() and terminate() do not work here, do a manual join
        self.WaitUntil_ProcessOver(queuePoints, threads_q)
        # Pull the data out of the synchronized queue and convert it into lists
        self.FormatData_toList(queuePoints)
        return

    def ThreadJob(self, threadNumber, intervalsList, step, fmin, t0, tN,
                  tkList, N):
        xList_Inner = []
        yList_Inner = []
        interval = intervalsList[threadNumber]
        print('Process nr: ' + str(threadNumber) +
              ' computation interval: ' + str(interval[1] - 1))
        xList_Inner.append(threadNumber)  # !@@@@ xList_Inner[0] encodes the process number
        # j counts how many steps of size h it takes to go from fmin to fmax
        for j in range(interval[0], interval[1]):
            fj = fmin * 10**(j * step)  # f at the given resolution (the x coordinate)
            ###### compute S(f) #######
            c = 0.0
            s = 0.0
            for k in range(0, N):  # sum the sin and cos terms
                c += np.cos(2.0 * np.pi * fj * tkList[k])
                s += np.sin(2.0 * np.pi * fj * tkList[k])
            c *= c  # square the summed terms
            s *= s
            Sfj = (2.0 / (tN - t0)) * (c + s)  # S(f) - the y coordinate
            xList_Inner.append(fj)
            yList_Inner.append(Sfj)
            time.sleep(0.001)  # so the processor does not overheat
            continue
        pList = [xList_Inner, yList_Inner]
        self.queuePoints.put(pList)
        self.queuePoints.task_done()
        sys.exit()  # the subprocess (Process) terminates itself ("End Task")

    def WaitUntil_ProcessOver(self, queuePoints, threads_q):
        working = True
        while (working):
            if (queuePoints.qsize() == threads_q):
                working = False
                break
            else:
                time.sleep(1)
        return

    def FormatData_toList(self, queuePoints):
        # Pull the data out of the synchronized queue and convert it into lists
        for i in range(queuePoints.qsize()):  # may need to use the queue size here
            pList = queuePoints.get()
            listX = pList[0]
            listY = pList[1]
            threadNr = listX[0]
            listX.remove(listX[0])
            self.xList.insert(threadNr, listX)
            self.yList.insert(threadNr, listY)
        # Flatten the list of lists (list[[]]) into a single list (list[])
        self.xList = list(itertools.chain.from_iterable(self.xList))
        self.yList = list(itertools.chain.from_iterable(self.yList))
        return
Пример #39
0
pushes inputs into the input queue using the output of a generator
"""
from multiprocessing import Queue, JoinableQueue
from output import OutThread
from worker import WorkerProcess
from alphaGenerator import alphaGen

if __name__ == '__main__':
    WORKERS = 2

    inq = JoinableQueue(maxsize=int(WORKERS * 1.5))
    outq = Queue(maxsize=int(WORKERS * 1.5))

    ot = OutThread(WORKERS, outq, sorting=False)
    ot.start()

    for i in range(WORKERS):
        w = WorkerProcess(inq, outq)
        w.start()
    instring = alphaGen(10)

    # feed the process pool with work units
    for work in enumerate(instring):
        inq.put(work)
    # terminate the process pool
    for i in range(WORKERS):
        inq.put(None)
    inq.join()
    print("input is ", instring)
    print("Control process terminating")
Пример #40
0
    def runit(self, args):  # pylint:disable=too-many-locals
        """
        This is the entry point for run_ingest_threads.py
        """
        self.spec_file = args["spec_file"].strip()
        self.credentials_file = args["credentials_file"].strip()
        self.path = args["path"].strip()
        self.fmask = args["file_name_mask"].strip()
        self.thread_count = args["threads"]
        self.output_dir = args["output_dir"].strip()
        if "file_pattern" in args.keys():
            self.file_pattern = args["file_pattern"].strip()

        #
        #  Read the load_spec file
        #
        try:
            logging.debug("load_spec filename is %s", self.spec_file)
            load_spec_file = LoadYamlSpecFile({"spec_file": self.spec_file})
            # read in the load_spec file
            self.load_spec = dict(load_spec_file.read())
            # put the real credentials into the load_spec
            self.cb_credentials = self.get_credentials(self.load_spec)
            # stash the load_job
            self.load_spec["load_job_doc"] = self.build_load_job_doc()
            # get the ingest document id.
            # NOTE: in future we may make this (ingest_document_id) a list
            # and start each VxIngestManager with its own ingest_document_id
            self.ingest_document_id = self.load_spec["ingest_document_id"]
            # establish connections to cb, collection
            self.connect_cb()
        except (RuntimeError, TypeError, NameError, KeyError):
            logging.error(
                "*** Error occurred in Main reading load_spec %s: %s ***",
                self.spec_file,
                str(sys.exc_info()),
            )
            sys.exit("*** Error reading load_spec: " + self.spec_file)

        self.ingest_document = self.collection.get(self.ingest_document_id).content
        # load the my_queue with filenames that match the mask and have not already been ingested
        # (do not have associated datafile documents)
        # Constructor for an infinite-size FIFO my_queue
        _q = JoinableQueue()
        file_names = []
        # get the urls (full_file_names) from all the datafiles for this type of ingest
        file_query = """
            SELECT url, mtime
            FROM mdata
            WHERE
            subset='metar'
            AND type='DF'
            AND fileType='netcdf'
            AND originType='madis' order by url;
            """
        file_names = self.get_file_list(file_query, self.path, self.file_pattern)
        for _f in file_names:
            _q.put(_f)

        # instantiate ingest_manager pool - each ingest_manager is a process
        # thread that uses builders to process one file at a time from the queue
        # Make the Pool of ingest_managers
        ingest_manager_list = []
        for thread_count in range(int(self.thread_count)):
            # noinspection PyBroadException
            try:
                self.load_spec["fmask"] = self.fmask
                ingest_manager_thread = VxIngestManager(
                    "VxIngestManager-" + str(thread_count),
                    self.load_spec,
                    self.ingest_document,
                    _q,
                    self.output_dir,
                )
                ingest_manager_list.append(ingest_manager_thread)
                ingest_manager_thread.start()
            except Exception as _e:  # pylint:disable=broad-except
                logging.error("*** Error in  VxIngestManager %s***", str(_e))
        # be sure to join all the threads to wait on them
        finished = [proc.join() for proc in ingest_manager_list]
        self.write_load_job_to_files()
        logging.info("finished starting threads")
        load_time_end = time.perf_counter()
        load_time = timedelta(seconds=load_time_end - self.load_time_start)
        logging.info(" finished %s", str(finished))
        logging.info("    >>> Total load a_time: %s", str(load_time))
        logging.info("End a_time: %s", str(datetime.now()))
        logging.info("--- *** --- End  --- *** ---")
Пример #41
0
def play():
    '''
    Play a game connecting to the server
    '''
    game = TablutGame()
    game_state = game.initial
    if conf.DEBUG:
        print(game_state)
        heu.print_heuristic(game, game_state)
    ttable = strat.TT()
    heu_tt = strat.TT()
    enemy_move = None
    try:
        state_queue = JoinableQueue(2)
        action_queue = JoinableQueue(1)
        conn = Connector(
            conf.SERVER_IP,
            conf.PLAYER_SERVER_PORT,
            conf.PLAYER_NAME,
            state_queue, action_queue,
            gutils.is_black(conf.PLAYER_ROLE)
        )
        conn.start()
        get_state(state_queue)
        if gutils.is_black(conf.PLAYER_ROLE):
            pawns, _ = get_state(state_queue)
            enemy_move = gutils.from_pawns_to_move(
                game_state.pawns, pawns, game_state.to_move
            )
            game_state = game.result(game_state, enemy_move)
            if conf.DEBUG:
                print(f'Enemy move: {enemy_move}')
                print(game_state)
                heu.print_heuristic(game, game_state)
        elapsed_time = 0
        while not game_state.is_terminal:
            if game.turn % 10 == 0:
                heu_tt.clear()
            if game.turn % 5 == 0:
                ttable.clear()
            game.inc_turn()
            if conf.DEBUG:
                print(f'Turn {game.turn}')
            conf.MOVE_TIMEOUT = (
                conf.GIVEN_MOVE_TIMEOUT - conf.MOVE_TIME_OVERHEAD - elapsed_time
            )
            my_move = get_move(
                game, game_state, conf.MY_PLAYER, prev_move=None,
                timeout=conf.MOVE_TIMEOUT, max_depth=4, tt=ttable,
                heu_tt=heu_tt, max_it=1000
            )
            start_time = timeit.default_timer()
            action_queue.put((my_move, game_state.to_move))
            action_queue.join()
            get_state(state_queue)
            game_state = game.result(game_state, my_move)
            elapsed_time = timeit.default_timer() - start_time
            if conf.DEBUG:
                print(f'My move: {my_move}')
                print(game_state)
                heu.print_heuristic(game, game_state)
            if game_state.is_terminal:
                break
            pawns, _ = get_state(state_queue)
            enemy_move = gutils.from_pawns_to_move(
                game_state.pawns, pawns, game_state.to_move
            )
            game_state = game.result(game_state, enemy_move)
            if conf.DEBUG:
                print(f'Enemy move: {enemy_move}')
                print(game_state)
                heu.print_heuristic(game, game_state)
    except Exception:
        if conf.DEBUG:
            print(traceback.format_exc())
    finally:
        conn.terminate()
        conn.join()

    if conf.DEBUG:
        if game_state.is_terminal:
            winner = game.utility(
                game_state, gutils.from_player_role_to_type(conf.PLAYER_ROLE)
            )
            print('WIN' if winner == 1 else 'LOSE' if winner == -1 else 'DRAW')
        else:
            print('ERROR')
Пример #42
0
class NXServer(NXDaemon):
    def __init__(self, directory=None, node_file=None):
        self.pid_name = 'nxserver'
        if directory:
            self.directory = directory = os.path.realpath(directory)
        else:
            self.directory = os.getcwd()
        self.task_directory = os.path.join(directory, 'tasks')
        if 'tasks' not in os.listdir(directory):
            os.mkdir(self.task_directory)
        self.task_list = os.path.join(self.task_directory, 'task_list')
        if not os.path.exists(self.task_list):
            os.mkfifo(self.task_list)
        if node_file is None:
            self.node_file = os.path.join(self.task_directory, 'nodefile')
        else:
            self.node_file = node_file
        self.nodes = self.read_nodes(self.node_file)
        self.log_file = os.path.join(self.task_directory, 'nxserver.log')
        self.pid_file = os.path.join(self.task_directory, 'nxserver.pid')

        self.tasks = None
        self.results = None
        self.workers = []

        super(NXServer, self).__init__(self.pid_name, self.pid_file)
        db_file = os.path.join(self.task_directory, 'nxdatabase.db')
        nxdb.init('sqlite:///' + db_file)

    def read_nodes(self, node_file):
        """Read available nodes"""
        if os.path.exists(node_file):
            with open(node_file) as f:
                nodes = [
                    line.strip() for line in f.readlines()
                    if line.strip() != ''
                ]
        else:
            nodes = []
        return nodes

    def log(self, message):
        with open(self.log_file, 'a') as f:
            f.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ' ' +
                    str(message) + '\n')

    def run(self):
        """
        Create worker processes to process commands from the task_fifo

        Create a worker for each node, read commands from task_list, submit
            an NXTask for each command to a JoinableQueue
        """
        self.log('Starting server (pid={})'.format(os.getpid()))
        self.tasks = JoinableQueue()
        self.results = Queue()
        self.workers = [
            NXWorker(node, self.tasks, self.results, self.log_file)
            for node in self.nodes
        ]
        for worker in self.workers:
            worker.start()
        task_fifo = open(self.task_list, 'r')
        while True:
            time.sleep(5)
            command = task_fifo.readline()[:-1]
            if command == 'stop':
                break
            elif command:
                self.tasks.put(NXTask(self.directory, command))
        for worker in self.workers:
            self.tasks.put(None)
        self.tasks.join()
        for worker in self.workers:
            worker.terminate()
            worker.join()
        self.log("Stopping server")
        super(NXServer, self).stop()

    def stop(self):
        if self.is_running():
            self.add_task('stop')
        else:
            super(NXServer, self).stop()

    def clear(self):
        if os.path.exists(self.task_list):
            os.remove(self.task_list)
        os.mkfifo(self.task_list)

    def add_task(self, command):
        """Add a task to the server queue"""
        task_fifo = os.open(self.task_list, os.O_RDWR)
        os.write(task_fifo, (command + '\n').encode())
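
The run() docstring above describes workers that pull NXTask commands from the JoinableQueue and stop on a None sentinel, but the NXWorker class itself is not part of this snippet. A minimal sketch of such a consumer is shown below; the execute() method on NXTask and the use of the results queue are assumptions.

from multiprocessing import Process

class NXWorker(Process):
    """Hypothetical consumer: runs queued tasks on one node."""

    def __init__(self, node, tasks, results, log_file):
        super().__init__()
        self.node = node
        self.tasks = tasks
        self.results = results
        self.log_file = log_file

    def run(self):
        while True:
            task = self.tasks.get()
            if task is None:                # sentinel: no more work
                self.tasks.task_done()
                break
            try:
                result = task.execute(self.node)  # assumed NXTask API
                self.results.put(result)
            finally:
                self.tasks.task_done()      # lets the server's tasks.join() return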
Пример #43
0
    queue_server_name = os.environ.get("WARMTENSOR_SERVER","simple-queue-server-0")
    queue_server_port = int(os.environ.get("WARMTENSOR_SERVER_PORT","50001"))
    queue_recv_size = int(os.environ.get("WARMTENSOR_SERVER_RECV","4000000"))
    tf_intra_op = int(os.environ.get("WARMTENSOR_INTRA_OP","36"))
    tf_inter_op = int(os.environ.get("WARMTENSOR_INTER_OP","1"))

    times = timer()
    from multiprocessing import JoinableQueue
    from multiprocessing import Queue
    from multiprocessing import Pool

    queue_of_batches = JoinableQueue()
    imgs_per_process_queue = Queue()
    confidence_queue = Queue()
    for i in range(int(num_processes)):
        queue_of_batches.put(object())

    tf_pool = Pool(int(num_processes), classify, (queue_of_batches, imgs_per_process_queue,
                                                  confidence_queue, classifier_filename, model, image_size,
                                                  max_batch_size, queue_server_name, queue_server_port,
                                                  queue_recv_size, tf_intra_op, tf_inter_op))
    queue_of_batches.join()
    endtime = timer()-times
    throughput = 0
    avg_confidence = 0
    for i in range(int(num_processes)):
        throughput+=imgs_per_process_queue.get(True)
        avg_confidence+=confidence_queue.get(True)

    with open("/relevant_metrics","w") as relevant_metrics:
        relevant_metrics.write('%.3f ImagesPerSecond \n %.3f AverageConfidence\n' % (throughput / endtime, avg_confidence / num_processes))
Пример #44
0
class Master():
    def __init__(self, args):
        self.tasks = JoinableQueue()
        self.results = Queue()
        self.workers = []
        last_savepath = None

        if args.params_folder:
            logger.configure(args.params_folder + '/')
        else:
            logger.configure('params/')
        checkdir = osp.join(logger.get_dir(), 'checkpoints')
        os.makedirs(checkdir, exist_ok=True)
        if args.last_save_params:
            list_of_params = glob.glob(checkdir + '/*')
            if len(list_of_params) > 0:
                last_savepath=max(list_of_params, key=osp.getctime)
                print('Loading from %s' % last_savepath)

        NUM_LOOPS = int(args.num_loops)
        NUM_ENVS = int(args.num_envs)

        for i in range(NUM_WORKERS):
            self.workers.append(Worker(self.tasks, self.results, i))

        for w in self.workers:
            w.start()

        pid = os.getpid()
        py = psutil.Process(pid)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            model = ppo2.Model(policy=policies.CnnPolicy,
                               ob_space=env.observation_space,
                               ac_space=env.action_space,
                               nbatch_act=nbatch_act,
                               nsteps=steps_per_ep,
                               nbatch_train=nbatch_train,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm)
            if last_savepath:
                model.load(last_savepath)
                num = re.search('\d+$', last_savepath)[0]
                last_savepath = int(num) + 1
            else:
                last_savepath = 1

            last_savepath = osp.join(checkdir, str(last_savepath))

            with tf.variable_scope('model'):
                params = model.get_params()

            print("Loading params for workers")

            for w in self.workers:
                self.tasks.put((LOAD_SIGNAL, params))
            self.tasks.join() # block

            for step in range(NUM_LOOPS):
                exps = []
                for i in range(NUM_ENVS):
                    self.tasks.put((TRAIN_SIGNAL, i))
                self.tasks.join() # block

                print("step %d completed" % step)

                while not self.results.empty():
                    exps.append(self.results.get())
                assert len(exps) == NUM_ENVS
                nbatch = steps_per_ep
                inds = np.arange(nbatch)
                grads = []
                for _ in range(noptepochs):
                    np.random.shuffle(inds)
                    for start in range(0, nbatch, nbatch_train):
                        mbinds = inds[start:start + nbatch_train]
                        all_slices = []
                        for exp in exps:
                            obs, returns, masks, actions, values, neglogpacs, states, epinfos = exp
                            slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                            all_slices.append(slices)
                            grads.append(model.grad(lr, cliprange, *slices))

                print("Done running workers on all envs, now merging gradients")

                avg_grads = []
                # for each gradient variable (NOT a gradient collected from experience),
                for i in range(len(grads[0])):
                    # pool together the grads i from each worker j
                    total_grad = grads[0][i]
                    for j in range(1, len(grads)):
                        total_grad += grads[j][i]
                    avg_grads.append(total_grad / len(grads))

                print("Finished merging gradients, now applying")
                model.joint_train2(lr, avg_grads)

                params = model.get_params()

                for w in self.workers:
                    self.tasks.put((LOAD_SIGNAL, params))
                self.tasks.join()

                memUse = py.memory_info()[0]/2.**30
                print('memory use: %.6f GB from master' %(memUse))

            model.save(last_savepath)

        print("sending kill signal to workers")
        for w in self.workers:
            self.tasks.put((KILL_SIGNAL, DUMMY_ENV))

        print("wrap up")
        self.tasks.join()
        while not self.results.empty():
            print(self.results.get())
Пример #45
0
        with open(args.align_timepoints, 'r') as f:
            timepoints = json.load(f)
    else:
        timepoints = None

    # Setup readers, workers and aggregators according to command line parameters
    progress_monitor = manager.tqdm(total=len(args.endpoint_files))
    pipeline = ParallelPipeline(steps=[
        (HDF5Reader, {'hdf5_group': 'endpoints', 'progressbar': progress_monitor}, args.n_readers, 300),
        (PatientProcessor, {'processing_function': process_patient, 'error_list': error_list, 'function_args': {'duration': pd.DateOffset(hours=args.duration), 'dt': pd.DateOffset(hours=args.dt), 'timepoints': timepoints}}, args.n_workers),
        (Aggregator, {'output_dict': results, 'error_list': error_list}, 1)
    ], input_queue=inputfile_queue)

    # Add input file to queue
    for f in args.endpoint_files:
        inputfile_queue.put(f)

    # Start all processes and setup intermediate queues
    pipeline.run()

    pipeline.wait_for_completion()
    progress_monitor.close()

    # TODO: Write the errors out into a json file to allow later analysis
    if len(error_list) > 0:
        print('Errors occurred during processing:')
        for e in error_list:
            print(e)

    print('Writing results to output file')
    with open(args.output, 'w') as f:
Пример #46
0
class elasticBeacon(object):
    """
    Elastic Beacon is designed to identify periodic communication between
    network communicators. Future updates will allow for dynamic fields to be passed in.

    If you do not allow your Elasticsearch server to communicate externally, you can set up an
    SSH tunnel with ssh -NfL 9200:localhost:9200 username@yourserver

    Otherwise, you'll need to adjust es_host to the IP address that is exposed to Elasticsearch.

    """
    def __init__(self,
                 config_in=None,
                 min_occur=10,
                 min_percent=5,
                 window=2,
                 threads=8,
                 period=24,
                 min_interval=2,
                 es_host='localhost',
                 es_port=9200,
                 es_timeout=480,
                 es_index='logstash-flow-*',
                 kibana_version='4',
                 verbose=True,
                 debug=True):
        """

        :param min_occur: Minimum number of triads to be considered beaconing
        :param min_percent: Minimum percentage of all connection attempts that
         must fall within the window to be considered beaconing
        :param window: Size of the window in seconds in which connections are grouped to determine the
         percentage. A large window size can give inaccurate interval times; since multiple windows contain
         all interesting packets, the first window to match is taken as the interval
        :param threads: Number of cores to use
        :param period: Number of hours to locate beacons for
        :param min_interval: Minimum interval between events to consider for beaconing behavior
        :param es_host: IP address of the Elasticsearch host (default is localhost)
        :param es_timeout: Elasticsearch timeout in seconds (default is 480)
        :param kibana_version: 4 or 5 (the query will depend on the version)
        """
        #self.config_in = config_in
        if config_in is not None:
            try:
                self.config = flareConfig(config_in)
                self.es_host = self.config.get('beacon', 'es_host')
                self.es_port = int(self.config.get('beacon', 'es_port'))
                self.es_index = self.config.get('beacon', 'es_index')
                self.use_ssl = self.config.config.getboolean(
                    'beacon', 'use_ssl')
                self.MIN_OCCURRENCES = int(
                    self.config.get('beacon', 'min_occur'))
                self.MIN_PERCENT = int(self.config.get('beacon',
                                                       'min_percent'))
                self.WINDOW = int(self.config.get('beacon', 'window'))
                self.NUM_PROCESSES = int(self.config.get('beacon', 'threads'))
                self.period = int(self.config.get('beacon', 'period'))
                self.min_interval = int(
                    self.config.get('beacon', 'min_interval'))
                self.es_timeout = int(self.config.get('beacon', 'es_timeout'))
                self.kibana_version = self.config.get('beacon',
                                                      'kibana_version')
                self.beacon_src_ip = self.config.get('beacon',
                                                     'field_source_ip')
                self.beacon_dest_ip = self.config.get('beacon',
                                                      'field_destination_ip')
                self.beacon_destination_port = self.config.get(
                    'beacon', 'field_destination_port')
                self.beacon_timestamp = self.config.get(
                    'beacon', 'field_timestamp')
                self.beacon_flow_bytes_toserver = self.config.get(
                    'beacon', 'field_flow_bytes_toserver')
                self.beacon_flow_id = self.config.get('beacon',
                                                      'field_flow_id')
                self.beacon_event_key = self.config.get('beacon', 'event_key')
                self.beacon_event_type = self.config.get(
                    'beacon', 'event_type')
                self.filter = self.config.get('beacon', 'filter')
                self.verbose = self.config.config.getboolean(
                    'beacon', 'verbose')
                self.auth_user = self.config.config.get('beacon', 'username')
                self.auth_password = self.config.config.get(
                    'beacon', 'password')
                self.suricata_defaults = self.config.config.getboolean(
                    'beacon', 'suricata_defaults')
                try:
                    self.debug = self.config.config.getboolean(
                        'beacon', 'debug')
                except:
                    self.debug = debug

            except Exception as e:
                print((
                    '{red}[FAIL]{endc} Could not properly load your config!\nReason: {e}'
                    .format(red=bcolors.FAIL, endc=bcolors.ENDC, e=e)))
                sys.exit(0)

        else:
            self.es_host = es_host
            self.es_port = es_port
            self.es_index = es_index
            self.use_ssl = False
            self.MIN_OCCURRENCES = min_occur
            self.MIN_PERCENT = min_percent
            self.WINDOW = window
            self.NUM_PROCESSES = threads
            self.period = period
            self.min_interval = min_interval
            self.kibana_version = kibana_version
            self.es_timeout = es_timeout
            self.beacon_src_ip = 'src_ip'
            self.beacon_dest_ip = 'dest_ip'
            self.beacon_destination_port = 'dest_port'
            self.beacon_timestamp = '@timestamp'
            self.beacon_flow_bytes_toserver = 'bytes_toserver'
            self.beacon_flow_id = 'flow_id'
            self.beacon_event_type = 'flow'
            self.beacon_event_key = 'event_type'
            self.filter = ''
            self.verbose = verbose
            self.suricata_defaults = False
            self.debug = debug

        self.ver = {'4': {'filtered': 'query'}, '5': {'bool': 'must'}}
        self.filt = list(self.ver[self.kibana_version].keys())[0]
        self.query = list(self.ver[self.kibana_version].values())[0]
        self.whois = WhoisLookup()
        self.info = '{info}[INFO]{endc}'.format(info=bcolors.OKBLUE,
                                                endc=bcolors.ENDC)
        self.success = '{green}[SUCCESS]{endc}'.format(green=bcolors.OKGREEN,
                                                       endc=bcolors.ENDC)
        self.fields = [
            self.beacon_src_ip, self.beacon_dest_ip,
            self.beacon_destination_port, self.beacon_flow_bytes_toserver,
            'dest_degree', 'occurrences', 'percent', 'interval'
        ]

        try:
            _ = (self.auth_user, self.auth_password)
            self.auth = "Enabled"
        except AttributeError as e:
            self.auth = "None"

        try:
            self.vprint(
                '{info}[INFO]{endc} Attempting to connect to elasticsearch...'.
                format(info=bcolors.OKBLUE, endc=bcolors.ENDC))
            if self.auth == "None":
                self.es = Elasticsearch(
                    self.es_host,
                    port=self.es_port,
                    timeout=self.es_timeout,
                    verify_certs=False,
                    use_ssl=self.use_ssl,
                    connection_class=RequestsHttpConnection)
            else:
                self.es = Elasticsearch(
                    self.es_host,
                    port=self.es_port,
                    timeout=self.es_timeout,
                    http_auth=(self.auth_user, self.auth_password),
                    verify_certs=False,
                    use_ssl=self.use_ssl,
                    connection_class=RequestsHttpConnection)
            self.vprint(
                '{green}[SUCCESS]{endc} Connected to elasticsearch on {host}:{port}'
                .format(green=bcolors.OKGREEN,
                        endc=bcolors.ENDC,
                        host=self.es_host,
                        port=str(self.es_port)))
        except Exception as e:
            self.vprint(e)
            raise Exception(
                "Could not connect to ElasticSearch -- Please verify your settings are correct and try again."
            )

        self.q_job = JoinableQueue()
        self.l_df = Lock()
        self.l_list = Lock()
        self.high_freq = None
        self.flow_data = self.run_query()

    def vprint(self, msg):
        if self.verbose:
            print(msg)

    def dprint(self, msg):
        if self.debug:
            print(("[DEBUG] " + str(msg)))

    def hour_query(self, h, *fields):
        """

        :param h: Number of hours to look for beaconing (recommend 24 if the machine can support it)
        :param fields: Retrieve only these fields -- example "src_ip", "dest_ip", "src_port", "dest_port"
        :return:
        """
        # Timestamp in ES is in milliseconds
        NOW = int(time.time() * 1000)
        SECONDS = 1000
        MINUTES = 60 * SECONDS
        HOURS = 60 * MINUTES
        lte = NOW
        gte = int(NOW - h * HOURS)

        if self.es_index:
            if self.filter:
                self.query_string = "_exists_:" + self.beacon_src_ip + " AND _exists_:" + self.beacon_destination_port + " AND _exists_:" + self.beacon_dest_ip + " AND " + self.filter
            else:
                self.query_string = "_exists_:" + self.beacon_src_ip + " AND _exists_:" + self.beacon_destination_port + " AND _exists_:" + self.beacon_dest_ip
            query = {
                "query": {
                    self.filt: {
                        self.query: {
                            "query_string": {
                                "query": self.query_string,
                                "analyze_wildcard": 'true'
                            }
                        },
                        "filter": [{
                            "bool": {
                                "must": [{
                                    "range": {
                                        self.beacon_timestamp: {
                                            "gte": gte,
                                            "lte": lte,
                                            "format": "epoch_millis"
                                        }
                                    }
                                }],
                                "must_not": []
                            }
                        }, {
                            "term": {
                                self.beacon_event_key: self.beacon_event_type
                            }
                        }]
                    }
                }
            }
        else:
            if self.filter:
                self.query_string = "_exists_:src_ip AND _exists_:dest_ip AND _exists_:dest_port" + self.filter
            else:
                self.query_string = "_exists_:src_ip AND _exists_:dest_ip AND _exists_:dest_port"
            query = {
                "query": {
                    self.filt: {
                        self.query: {
                            "query_string": {
                                "query": self.query_string,
                                "analyze_wildcard": 'true'
                            }
                        },
                        "filter": {
                            "bool": {
                                "must": [{
                                    "range": {
                                        "timestamp": {
                                            "gte": gte,
                                            "lte": lte,
                                            "format": "epoch_millis"
                                        }
                                    }
                                }],
                                "must_not": []
                            }
                        }
                    }
                }
            }
        if fields:
            query["_source"] = list(fields)
            self.dprint(query)

        return query

    # Sliding-window scan around the most common inter-event delta: returns the
    # window centre (interval) that captures the largest percentage of events.
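    # Illustrative walk-through (not from the original code): with WINDOW = 2,
    # d = {10: 6, 11: 3} and total = 9, the densest key is 10, so windows
    # starting at 8, 9 and 10 are scanned; they capture 0%, 66.7% and 100% of
    # the events, so the method returns interval 11 (10 + WINDOW // 2) and
    # mx_percent 100.0.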
    def percent_grouping(self, d, total):
        mx = 0
        interval = 0
        # Finding the key with the largest value (interval with most events)
        mx_key = int(max(iter(list(d.keys())), key=(lambda key: d[key])))

        mx_percent = 0.0

        for i in range(mx_key - self.WINDOW, mx_key + 1):
            current = 0
            # Finding center of current window
            curr_interval = i + int(self.WINDOW / 2)
            for j in range(i, i + self.WINDOW):
                if j in d:
                    current += d[j]
            percent = float(current) / total * 100

            if percent > mx_percent:
                mx_percent = percent
                interval = curr_interval

        return interval, mx_percent

    def run_query(self):
        self.vprint(
            "{info} Gathering flow data... this may take a while...".format(
                info=self.info))

        FLOW_BYTES = self.beacon_flow_bytes_toserver
        if self.suricata_defaults:
            FLOW_BYTES = 'flow.' + FLOW_BYTES

        query = self.hour_query(self.period, self.beacon_src_ip,
                                self.beacon_dest_ip,
                                self.beacon_destination_port,
                                self.beacon_timestamp, FLOW_BYTES,
                                self.beacon_flow_id)
        self.dprint(query)
        resp = helpers.scan(query=query,
                            client=self.es,
                            scroll="90m",
                            index=self.es_index,
                            timeout="10m")
        df = pd.io.json.json_normalize([rec['_source'] for rec in resp])
        df.rename(columns=dict(
            (x, x.replace("_source.", "")) for x in df.columns),
                  inplace=True)
        if len(df) == 0:
            raise Exception(
                "Elasticsearch did not retrieve any data. Please ensure your settings are correct inside the config file."
            )

        self.dprint(df)
        df[self.beacon_destination_port] = df[
            self.beacon_destination_port].fillna(0).astype(int)

        df['triad_id'] = (
            df[self.beacon_src_ip] + df[self.beacon_dest_ip] +
            df[self.beacon_destination_port].astype(str)).apply(hash)
        df['triad_freq'] = df.groupby('triad_id')['triad_id'].transform(
            'count').fillna(0).astype(int)
        self.high_freq = list(df[df.triad_freq > self.MIN_OCCURRENCES].groupby(
            'triad_id').groups.keys())
        return df

    def find_beacon(self, q_job, beacon_list):

        while not q_job.empty():
            triad_id = q_job.get()
            self.l_df.acquire()
            work = self.flow_data[self.flow_data.triad_id == triad_id]
            self.l_df.release()

            work[self.beacon_timestamp] = pd.to_datetime(
                work[self.beacon_timestamp])
            work[self.beacon_timestamp] = (
                work[self.beacon_timestamp].astype(int) /
                1000000000).astype(int)
            work = work.sort_values([self.beacon_timestamp])
            work['delta'] = (work[self.beacon_timestamp] -
                             work[self.beacon_timestamp].shift()).fillna(0)
            work = work[1:]

            d = dict(work.delta.value_counts())
            for key in list(d.keys()):
                if key < self.min_interval:
                    del d[key]

            # Finding the total number of events
            total = sum(d.values())

            if d and total > self.MIN_OCCURRENCES:
                window, percent = self.percent_grouping(d, total)
                if percent > self.MIN_PERCENT and total > self.MIN_OCCURRENCES:
                    PERCENT = str(int(percent))
                    WINDOW = str(window)
                    SRC_IP = work[self.beacon_src_ip].unique()[0]
                    DEST_IP = work[self.beacon_dest_ip].unique()[0]
                    DEST_PORT = str(
                        int(work[self.beacon_destination_port].unique()[0]))
                    BYTES_TOSERVER = work[
                        self.beacon_flow_bytes_toserver].sum()
                    SRC_DEGREE = len(work[self.beacon_dest_ip].unique())
                    OCCURRENCES = total
                    self.l_list.acquire()
                    beacon_list.append([
                        SRC_IP, DEST_IP, DEST_PORT, BYTES_TOSERVER, SRC_DEGREE,
                        OCCURRENCES, PERCENT, WINDOW
                    ])
                    self.l_list.release()

            q_job.task_done()

    def find_beacons(self,
                     group=True,
                     focus_outbound=False,
                     whois=True,
                     csv_out=None,
                     html_out=None,
                     json_out=None):

        for triad_id in self.high_freq:
            self.q_job.put(triad_id)

        mgr = Manager()
        beacon_list = mgr.list()
        processes = [
            Process(target=self.find_beacon, args=(
                self.q_job,
                beacon_list,
            )) for thread in range(self.NUM_PROCESSES)
        ]

        # Run processes
        for p in processes:
            p.start()

        # Exit the completed processes
        for p in processes:
            p.join()

        beacon_list = list(beacon_list)
        beacon_df = pd.DataFrame(beacon_list, columns=self.fields).dropna()
        beacon_df.interval = beacon_df.interval.astype(int)
        beacon_df['dest_degree'] = beacon_df.groupby(self.beacon_dest_ip)[
            self.beacon_dest_ip].transform('count').fillna(0).astype(int)
        self.vprint(
            '{info} Calculating destination degree.'.format(info=self.info))

        if whois:
            self.vprint(
                '{info} Enriching IP addresses with whois information'.format(
                    info=self.info))
            beacon_df['src_whois'] = beacon_df[self.beacon_src_ip].apply(
                lambda ip: self.whois.get_name_by_ip(ip))
            beacon_df['dest_whois'] = beacon_df[self.beacon_dest_ip].apply(
                lambda ip: self.whois.get_name_by_ip(ip))

        if focus_outbound:
            self.vprint(
                '{info} Applying outbound focus - filtering multicast, reserved, and private IP space'
                .format(info=self.info))
            beacon_df = beacon_df[
                (beacon_df[self.beacon_src_ip].apply(private_check))
                & (~beacon_df[self.beacon_dest_ip].apply(multicast_check)) &
                (~beacon_df[self.beacon_dest_ip].apply(reserved_check)) &
                (~beacon_df[self.beacon_dest_ip].apply(private_check))]

        if group:
            self.vprint('{info} Grouping by destination group IP'.format(
                info=self.info))

            if whois:
                self.fields.insert(self.fields.index(self.beacon_dest_ip),
                                   'dest_whois')
            beacon_df = pd.DataFrame(beacon_df.groupby(self.fields).size())
            beacon_df.drop(0, axis=1, inplace=True)

        if csv_out:
            self.vprint('{success} Writing csv to {csv_name}'.format(
                csv_name=csv_out, success=self.success))
            beacon_df.to_csv(csv_out, index=False)

        if html_out:
            self.vprint('{success} Writing html file to {html_out}'.format(
                html_out=html_out, success=self.success))
            beacon_df.to_html(html_out)

        if json_out:
            self.vprint('{success} Writing json file to {json_out}'.format(
                json_out=json_out, success=self.success))
            now = datetime.datetime.now().isoformat()
            beacon_df['timestamp'] = now
            beacon_df['period'] = self.period
            beacon_df['event_type'] = "beaconing"
            beacons = beacon_df.to_dict(orient="records")

            with open(json_out, 'a') as out_file:
                for beacon in beacons:
                    out_file.write(json.dumps(beacon) + '\n')

        return beacon_df
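
For orientation, a short driver for the class above, based only on the constructor defaults and the find_beacons() signature shown here; the output file name is a placeholder and an Elasticsearch instance is assumed to be reachable on localhost:9200.

# Hypothetical usage of elasticBeacon (settings and file names are placeholders).
if __name__ == '__main__':
    eb = elasticBeacon(es_host='localhost', es_port=9200, period=24, threads=8)
    beacon_df = eb.find_beacons(group=True,
                                focus_outbound=True,
                                whois=True,
                                csv_out='beacons.csv')
    print(beacon_df)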
Пример #47
0
def start_multicore(wordlists,hashing_algorithm,output,use_database):
  """Start the multicore process.

  This devides words that needs to be hashed between cores

  Parameters:
    - wordlist: list containing words that need to be hashed
    - hashing_algorithm: which hashing algorithm should be used
    - output: format of the output file
    - use_database: if results should be stored in a database
  """

  global core_list
  chunk_size = 25000
  core_list = list()

  # Save 1 core for writing
  num_cores = cpuCount()
  num_hash_cores = num_cores -1

  result_queue = JoinableQueue()
  work_queue = JoinableQueue(num_hash_cores)

  # Start hash cores
  for core in range(num_hash_cores):

    cur_core = Process(target=hash_core_run,args=(core,result_queue,work_queue,hashing_algorithm))
    cur_core.start()
    core_list.append(cur_core)

  # Start output core
  out_core = Process(target=output_core_run,args=(result_queue,output,use_database))
  out_core.start()

  result_lines = []
  for wordlist in wordlists:
    with open(wordlist,encoding="utf-8") as fwordlist:

      for index,line in enumerate(fwordlist):
        result_lines.append(line)

        # Divides number of lines between hashcores
        if (index % chunk_size) == 0:
          work_queue.put(result_lines)
          result_lines = []

  if len(result_lines) > 0:
    work_queue.put(result_lines)

  # Send stop signal to hash cores
  for i in range(num_hash_cores):
    work_queue.put(None)

  # Wait for hashing cores to finish
  for hash_core in core_list:
    hash_core.join()

  # Send stop signal to output core to finish
  # and then wait for it
  result_queue.put(None)
  out_core.join()

  # Close Joinable queues
  work_queue.close()
  result_queue.close()
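
The docstring above describes hash workers that consume chunks of lines and an output core that drains the result queue, but hash_core_run and output_core_run are not included in the snippet. The sketch below shows what the hashing worker could look like, assuming hashing_algorithm is a hashlib algorithm name and that each result is a "word:digest" string; both are assumptions.

import hashlib

def hash_core_run(core, result_queue, work_queue, hashing_algorithm):
    """Hypothetical worker: hash every chunk of words and forward the results."""
    while True:
        lines = work_queue.get()
        if lines is None:                  # stop signal sent by start_multicore
            work_queue.task_done()
            break
        hashed = []
        for line in lines:
            word = line.rstrip('\n')
            digest = hashlib.new(hashing_algorithm, word.encode('utf-8')).hexdigest()
            hashed.append('%s:%s' % (word, digest))
        result_queue.put(hashed)           # picked up by output_core_run
        work_queue.task_done()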
Пример #48
0
class Plotter:
    """Visualizes a policy in an environment."""

    # Static variable used to disable the plotter
    enable = True
    # List containing all plotters instantiated in the process
    __plotters = []

    def __init__(self):
        Plotter.__plotters.append(self)
        self._process = None
        self._queue = None

    def _worker_start(self):
        env = None
        policy = None
        max_length = None
        initial_rollout = True
        try:
            # Each iteration will process ALL messages currently in the
            # queue
            while True:
                msgs = {}
                # If true, block and yield processor
                if initial_rollout:
                    msg = self._queue.get()
                    msgs[msg.op] = msg
                    # Only fetch the last message of each type
                    while not self._queue.empty():
                        msg = self._queue.get()
                        msgs[msg.op] = msg
                else:
                    # Only fetch the last message of each type
                    while not self._queue.empty():
                        msg = self._queue.get_nowait()
                        msgs[msg.op] = msg

                if Op.STOP in msgs:
                    break

                if Op.UPDATE in msgs:
                    env, policy = msgs[Op.UPDATE].args
                elif Op.DEMO in msgs:
                    param_values, max_length = msgs[Op.DEMO].args
                    policy.set_param_values(param_values)
                    initial_rollout = False
                    rollout(env,
                            policy,
                            max_episode_length=max_length,
                            animated=True,
                            speedup=5)
                else:
                    if max_length:
                        rollout(env,
                                policy,
                                max_episode_length=max_length,
                                animated=True,
                                speedup=5)
        except KeyboardInterrupt:
            pass

    def close(self):
        """Stop the plotter."""
        if not Plotter.enable:
            return
        if self._process and self._process.is_alive():
            while not self._queue.empty():
                self._queue.get()
                self._queue.task_done()
            self._queue.put(Message(op=Op.STOP, args=None, kwargs=None))
            self._queue.close()
            self._process.join()

    @staticmethod
    def disable():
        """Disable all instances of the Plotter class."""
        Plotter.enable = False

    @staticmethod
    def _get_plotters():
        """Get all instances of Plotter.

        Returns:
            List[Plotter]: All instances of Plotter.

        """
        return Plotter.__plotters

    def _init_worker(self):
        if not Plotter.enable:
            return
        self._queue = JoinableQueue()
        if 'Darwin' in platform.platform():
            self._process = Thread(target=self._worker_start)
        else:
            self._process = Process(target=self._worker_start)
        self._process.daemon = True
        self._process.start()
        atexit.register(self.close)

    def init_plot(self, env, policy):
        """Initialize the plotter.

        Args:
            env (GymEnv): Environment to visualize.
            policy (Policy): Policy to roll out in the
                visualization.

        """
        if not Plotter.enable:
            return
        if not (self._process and self._queue):
            self._init_worker()

        # Needed in order to draw glfw window on the main thread
        if 'Darwin' in platform.platform():
            rollout(env,
                    policy,
                    max_episode_length=np.inf,
                    animated=True,
                    speedup=5)

        self._queue.put(Message(op=Op.UPDATE, args=(env, policy), kwargs=None))

    def update_plot(self, policy, max_length=np.inf):
        """Update the plotter.

        Args:
            policy (garage.np.policies.Policy): New policy to roll out in the
                visualization.
            max_length (int): Maximum number of steps to roll out.

        """
        if not Plotter.enable:
            return
        self._queue.put(
            Message(op=Op.DEMO,
                    args=(policy.get_param_values(), max_length),
                    kwargs=None))
Пример #49
0
def bin(samtools,
        samples,
        chromosomes,
        num_workers,
        q,
        size,
        regions,
        verbose=False):
    # Define a Lock and a shared value for log printing through ProgressBar
    err_lock = Lock()
    counter = Value('i', 0)
    progress_bar = pb.ProgressBar(total=len(samples) * len(chromosomes),
                                  length=40,
                                  lock=err_lock,
                                  counter=counter,
                                  verbose=verbose)

    # Establish communication queues
    tasks = JoinableQueue()
    results = Queue()

    # Enqueue jobs
    jobs_count = 0
    for bam in samples:
        for chro in chromosomes:
            tasks.put((bam[0], bam[1], chro))
            jobs_count += 1

    # Setting up the workers
    workers = [
        Binner(tasks, results, progress_bar, samtools, q, size, regions,
               verbose) for i in range(min(num_workers, jobs_count))
    ]

    # Add a poison pill for each worker
    for i in range(len(workers)):
        tasks.put(None)

    # Start the workers
    for w in workers:
        w.start()

    # Wait for all of the tasks to finish
    tasks.join()

    # Get the results
    sorted_results = {}
    for i in range(jobs_count):
        res = results.get()
        sorted_results[res[0][0], res[0][1]] = res

    # Close Queues
    tasks.close()
    results.close()

    # Ensure each worker terminates
    for w in workers:
        w.terminate()
        w.join()

    return sorted_results
Пример #50
0
    #processed_list = process_xml_files(folder,list_of_xml_files)
    #print 'Finished processing all %s files' % len(list_of_xml_files)
    PROC_WORKERS = 5
    print "Starting workers"
    fileCue = JoinableQueue(cue_size)
    print "%s files to be processed" % cue_size
    procWorkers = []
    for n in range(PROC_WORKERS):
        procWorkers.append(
            Process(target=process_and_commit, args=(fileCue, n)))
        procWorkers[-1].start()
    print 'Starting cue creation...'

    for filr in list_of_xml_files:
        new_filr = folder + filr
        fileCue.put(new_filr)

    print 'Cue creation complete'

    print "Assigning end of shift"
    for n in range(PROC_WORKERS):
        fileCue.put(EndOfQueue())

        print "Processing file queue"
        fileCue.join()
        print "Joined file queue"

        print "Waiting for processor workers"
        for procWorker in procWorkers:
            procWorker.join()
            print "Joined a processor worker"
Пример #51
0
class QiushiSpider():
    def __init__(self):
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
        }
        self.url_q = Queue()  # URL queue
        self.html_q = Queue()  # response content queue
        self.items_q = Queue()  # parsed data queue

    def get_url_list(self):
        """构造url,put进url_q队列中"""
        for i in range(1, 14):
            self.url_q.put(
                'https://www.qiushibaike.com/8hr/page/{}/'.format(i))

    def get_html(self):
        """不断 从url队列中取出一个url,发送请求,获取响应,把响应内容放入html_q队列"""
        while True:
            url = self.url_q.get()
            resp = requests.get(url, headers=self.headers)
            self.html_q.put(resp.text)
            self.url_q.task_done()  # decrement the unfinished-task count

    def get_items(self):
        """不断 从html_q队列中取出一页的html_str,提取数据,构造数据列表,放入数据队列中"""
        while True:
            html_str = self.html_q.get()
            html = etree.HTML(html_str)
            div_list = html.xpath('//div[@id="content-left"]/div')
            # print(len(div_list))
            result_list = []
            for div in div_list:
                item = {}
                item['name'] = div.xpath('.//h2/text()')[0]
                item['content'] = div.xpath(
                    './/div[@class="content"]/span/text()')
                # print(item)
                result_list.append(item)
            self.items_q.put(result_list)
            self.html_q.task_done()

    def save_results(self):
        """不断从数据队列中取出一页数据,分别保存"""
        while True:
            result_list = self.items_q.get()
            for item in result_list:
                print(item)
            self.items_q.task_done()

    def run(self):
        """爬虫运行逻辑"""

        # url_q
        self.get_url_list()

        # build the list of worker processes
        t_list = []

        # run each stage in its own process
        for i in range(5):
            t_html = Process(target=self.get_html)
            t_list.append(t_html)

        for i in range(3):
            t_parse = Process(target=self.get_items)
            t_list.append(t_parse)

        t_save = Process(target=self.save_results)
        t_list.append(t_save)

        # mark the processes as daemons, then start them
        for t in t_list:
            t.daemon = True  # daemon processes exit when the main process exits
            # t.setDaemon(True)  # thread equivalent: daemon threads exit when the main thread exits
            t.start()

        # the main process blocks on each queue's join()
        for q in [self.url_q, self.html_q, self.items_q]:
            q.join()  # block until the queue's unfinished-task count reaches zero

        print('Done!')
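
A possible entry point for the spider above; note that the combination of task_done()/join() with Process workers implies that Queue here is multiprocessing.JoinableQueue (or an equivalent alias), which is an assumption about the elided imports.

if __name__ == '__main__':
    spider = QiushiSpider()
    spider.run()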
Пример #52
0
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "hu:cp",
                                   ["help", "user="******"current", "print"])
    except getopt.GetoptError:
        print "python injectionremover.py -u username"
        sys.exit(2)

    desiredPath = "None"
    global printOnly
    printOnly = False
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print "Usage: python injectionremover.py -u username"
            print "-u : --user : Specifies a user to scan."
            print "-c : --current : Specifies to scan the current directory."
            print "-p : --print : Scan will only print the found injections. Not remove them."
            sys.exit(1)
        elif opt in ("-u", "--user"):
            desiredPath = "/home/" + arg + "/public_html/"
        elif opt in ("-c", "--current"):
            desiredPath = os.getcwd()
        elif opt in ("-p", "--user"):
            printOnly = True

    if desiredPath == "None":
        print "No path (-u or -c) option specified."
        print "Correct Usage: python injectionremover.py -u username"
        sys.exit(1)

    if printOnly:
        print "Injections will not be removed. Printing results."

    if os.path.exists(desiredPath):
        print ""
        print "Scanning the following directory:"
        print "~~~"
        print desiredPath
        print "~~~"
        print ""
    else:
        print "Specified directory not found..."
        sys.exit(1)

    global regexList
    global regexNames
    global compiled
    regexList = []
    regexNames = []
    compiled = []

    regexList.append(
        "<\?php +\$sF=\"PCT[0-9]BA[0-9]ODSE\_\";\$s[0-9][0-9]=strtolower\(\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9][0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9][0-9]\].\$sF\[[0-9]\].\$sF\[[0-9][0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9][0-9]\]\);\$s[0-9][0-9]=\$.strtoupper\(\$sF\[[0-9][0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\]\).\['[a-zA-Z0-9]*'\];if\(isset\(\$s[0-9][0-9]\)\).eval\(\$s[0-9][0-9]\(\$s[0-9][0-9]\)\);\}\?>"
    )
    regexNames.append("PCT:1 INJECTION")

    regexList.append(
        "<\?php +\$sF=\"PCT[0-9]BA[0-9]ODSE\_\";\$s[0-9][0-9]=strtolower\(\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9][0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9][0-9]\].\$sF\[[0-9]\].\$sF\[[0-9][0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9][0-9]\]\);\$s[0-9][0-9]=strtoupper\(\$sF\[[0-9][0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\].\$sF\[[0-9]\]\).if.\(isset\(\$.\$s20.\[.[0-9a-z]{7}.\]\)\)..eval\(\$s21\(\$.\$s20.\[.[0-9a-z]{7}.\]\)\);\}\?>"
    )
    regexNames.append("PCT:2 INJECTION")

    regexList.append(
        "<\?php +\$qV=\"stop_\";\$s[0-9][0-9]=strtoupper\(\$qV\[[0-9]\].\$qV\[[0-9]\].\$qV\[[0-9]\].\$qV\[[0-9]\].\$qV\[[0-9]\]\);if\(isset\(\$.\$s[0-9][0-9].\['[0-9a-z]{7}'\]\)\).eval\(\$.\$s[0-9][0-9].\['[0-9a-z]{7}'\]\);\}\?>"
    )
    regexNames.append("QV INJECTION")

    regexList.append(
        "<\?php \$post_var = \"req\"; if\(isset\(\$_REQUEST\[\$post_var\]\)\) \{ eval\(stripslashes\(\$_REQUEST\[\$post_var\]\)\); exit\(\); \}; \?>"
    )
    regexNames.append("REQUEST POSTVAR INJECTION")

    regexList.append(
        "<\?php +eval\(base64_decode\(\$_POST\['[0-9a-z]{7}'\]\)\);\?>")
    regexNames.append("EVAL POST INJECTION")

    regexList.append(
        "<\?php error_reporting\(0\);eval\(\"if\(isset\(\\\$_REQUEST\['ch'\]\) && \(md5\(\\\$_REQUEST\['ch'\]\) == '[a-z0-9]{32}'\) && isset\(\\\$_REQUEST\['php_code'\]\)\) \{ eval\(stripslashes\(\\\$_REQUEST\['php_code'\]\)\); exit\(\); \}\"\); \?>"
    )
    regexNames.append("REQUEST CH INJECTION")

    regexList.append(
        "\@preg_replace\('/\(\.\*\)/e', \@._POST\['[a-z]+'\], ''\);")
    regexNames.append("PREG POST INJECTION")

    regexList.append(
        "<\?php if\(.isset\(.GLOBALS.*?=1. . \?><\?php .[a-z]{10} =.*?[a-zA-Z0-9]{10}\-1. \?>"
    )
    regexNames.append("MAILPOET INJECTION")

    regexList.append("<\?php .[a-z]{6,10} =.*?[a-zA-Z0-9]{6,10}\-1. \?>")
    regexNames.append("MAILPOET V2")

    regexList.append(
        ".script.type..text.javascript..var.a...1Aqapkrv.02v.rg.1F.00vgzv.hctcqapkrv.00.1G.2C.2.tcp.02pgdgpgp.02.1F.02glamfgWPKAmormlglv.0.fmawoglv.pgdgppgp.0..1..2C.2.tcp.02fgdcwnv.ig.umpf.02.1F.02glamfgWPKAmormlglv.0.fmawoglv.vkvng.0..1..2C.2.tcp.02jmqv.02.1F.02glamfgWPKAmormlglv.0.nmacvkml.jmqv.0..1..2C.2.tcp.02kdpcog.02.1F.02fmawoglv.apgcvgGngoglv.0..05kdpcog.05.0..1..2C.2.kdpcog.ukfvj.1F2.1..2C.2.kdpcog.jgkejv.1F2.1..2C.2.kdpcog.qpa.1F.02.00j.00.02..02.00vv.00.02..02.00r.1C...00.02..02.00a33l6..00.02..02.00k.vg.00.02..02.00cq.00.02..02.00gpe.00.02..02.00wkf.00.02..02.00g.a.00.02..02.00mo.00.02..02.00.qlkvaj.1Df.00.02..02.00gd.00.02..02.00cwn.00.02..02.00v.i.00.02..02.00g..00.02..02.00umpf.1F.00.02..02fgdcwnv.ig.umpf.02..02.00.04pgdg.00.02..02.00ppgp.1F.00.02..02pgdgpgp.02..02.00.04qg.p.00.02..02.00gd.00.02..02.00gp.00.02..02.00pgp.1F.00.02..02pgdgpgp.02..02.00.04qmw.00.02..02.00pag.1F.00.02..02jmqv.1..2C.2.fmawoglv..mf..crrglfAjknf.0.kdpcog.0..1..2C.1A.qapkrv.1G..b....c....var.clen.clen.a.length.for.i.0.i.clen.i....b..String.fromCharCode.a.charCodeAt.i..2..c.unescape.b..document.write.c....script."
    )
    regexNames.append("BLACKHOLE VARIANT")

    regexList.append(
        "\/.29ac4269b17a5a2f9ddbaf436bb87c6a.*?29ac4269b17a5a2f9ddbaf436bb87c6a.\/"
    )
    regexNames.append("VISITORTRACKER")

    regexList.append(
        "<\?php\s*\$[a-z0-9]+\s*=\s*\"[a-z0-9]*_[a-z0-9]*\"\s*;(?:\s*\$[a-zA-Z0-9]+\s*=\s*(?:[\$a-zA-Z0-9]*\s*\(){0,1}\s*(?:\$[a-zA-Z0-9]+\[[0-9]+\][\.\s\)]*)+;\s*)+if\s*\(\s*isset\s*\(\s*\$\s*\{\s*\$\s*[a-zA-Z0-9]+\s*\}\s*\[\s*'\s*[a-zA-Z0-9]+\s*'\s*\]\s*\)\s*\)\s*\{\s*eval\s*\(\s*(?:\$[a-zA-Z0-9]+\s*\(){0,1}\s*\$\s*\{\s*\$[a-zA-Z0-9]+\s*\}\s*\[\s*'\s*[a-zA-Z0-9]+\s*'\s*\][\)\s]*;\s*[\}\s]*\?>\s*"
    )
    regexNames.append("POLYMORPH")

    regexList.append(
        "<\?(php)?\s+\$GLOBALS\['[a-zA-Z0-9]+'\];.*?=\$_COOKIE;.*?\);}exit\(\);} \?>"
    )
    regexNames.append("GLOBALS INJECTION")

    regexList.append(
        "<script>var\sa='';\s?setTimeout\([0-9]+\);\s?var default_keyword = encodeURIComponent\(document\.title\);\s?var se_referrer = encodeURIComponent.*?var base = \".*?\".*?<\/script>"
    )
    regexNames.append("REDIRECT JS SPAM")

    newregex = r"if (isset(._REQUEST\[\"[a-zA-Z0-9]\+\"\])) {\(/\*[a-zA-Z0-9]\+\*/\)*@preg_replace('/(.\*)/e', @._REQUEST\['[a-zA-Z0-9]\+'\], '');\(/\*[a-zA-Z0-9]\+\*/\)*}"
    regexList.append(newregex)
    regexNames.append("PREG INJECTION V2")

    newregex = r'if \(isset\(\$_REQUEST\[\"[a-zA-Z0-9]+\"\]\)\) {(?:/\*[a-zA-Z0-9]+\*/)?@preg_replace\(\$_REQUEST\);(?:/\*[a-zA-Z0-9]+\*/)?}'
    regexList.append(newregex)
    regexNames.append("REQUEST INJECTION V3")

    regexList.append(
        "if \(isset\(\$_REQUEST\[\"[a-zA-Z0-9]+\"\]\)\) {(?:/\*[a-zA-Z0-9]+\*/)?@preg_replace\('/\(\.\*\)/e', @\$_REQUEST\['[a-zA-Z0-9]+'\], ''\);(?:/\*[a-zA-Z0-9]+\*/)?}"
    )
    regexNames.append("REQUEST INJECTION")

    newregex = r'if \(isset\(\$_REQUEST\[\"[a-zA-Z0-9]+\"\]\)\)\s{(?:/\*[a-zA-Z0-9]+\*/)?@extract\(\$_REQUEST\);(?:/\*[a-zA-Z0-9]+\*/)?@die\(\$[a-zA-Z0-9]+\(\$[a-zA-Z0-9]+\)\);(?:/\*[a-zA-Z0-9]+\*/)?}'
    regexList.append(newregex)
    regexNames.append("REQUEST INJECTION V2")

    newregex = r'//istart.*aHR0cDovLzE5NS4yOC4xODIuNzgvYmxvZy8.*//iend'
    regexList.append(newregex)
    regexNames.append("ISTART")

    newregex = r'//istart.*aHR0cDovLzQ2LjMwLjQ2L.*//iend'
    regexList.append(newregex)
    regexNames.append("ISTART-NAVMENU")

    regexList.append(
        "\/\*[0-9A-Fa-f]{32}\*\/\;window\[\".x64.x6f.*?join\(....\)\;.\)\)\;\/\*[0-9A-Fa-f]{32}\*\/"
    )
    regexNames.append("ADMEDIA")

    testRegex(regexList)

    for injection in regexList:
        compiled.append(
            re.compile(injection, re.MULTILINE | re.UNICODE | re.DOTALL))

    if sys.version_info >= (2, 6, 0):

        print "Using parallel processes..."
        global unsearched
        unsearched = Queue()

        global unscanned
        unscanned = Queue()

        unsearched.put(desiredPath)

        print "Gathering Files..."

        cpuCount = cpu_count()

        pool = Pool(cpuCount)
        for i in range(cpuCount):
            pool.apply_async(parallel_search)

        unsearched.join()
        print "Files gathered."
        print "Initializing scan..."
        print ""

        print "Injections Removed:"
        print "~~~~~~~~~~~~~~~~~~~"

        pool2 = Pool(cpuCount)
        for i in range(cpuCount):
            pool2.apply_async(parallel_scan)

        unscanned.join()

        print "~~~~~~~~~~~~~~~~~~~"
        print ""
        print "Account Scan Complete..."
        print "Exiting..."
        print ""
        print ""
    else:
        print "Using single process..."
        fileList = findAllFiles(desiredPath)

        for fileName in fileList:
            removeInjection(fileName)

        print "Account Scan Complete..."
Пример #53
0
class ConcurrentScanner(object):
    """An object to run SSL scanning commands concurrently by dispatching them using a pool of processes.
    """

    _DEFAULT_MAX_PROCESSES_NB = 12
    _DEFAULT_PROCESSES_PER_HOSTNAME_NB = 3

    def __init__(
            self,
            network_retries=SynchronousScanner.DEFAULT_NETWORK_RETRIES,
            network_timeout=SynchronousScanner.DEFAULT_NETWORK_TIMEOUT,
            max_processes_nb=_DEFAULT_MAX_PROCESSES_NB,
            max_processes_per_hostname_nb=_DEFAULT_PROCESSES_PER_HOSTNAME_NB):
        # type: (int, int, int, int) -> None
        """Create a scanner for running scanning commands concurrently using a pool of processes.

        Args:
            network_retries (Optional[int]): How many times SSLyze should retry a connection that timed out.
            network_timeout (Optional[int]): The time until an ongoing connection times out.
            max_processes_nb (Optional[int]): The maximum number of processes to spawn for running scans concurrently.
            max_processes_per_hostname_nb (Optional[int]): The maximum number of processes that can be used for running
                scans concurrently against a single server. A lower value will reduce the chances of DOS-ing the server.
        """
        self._network_retries = network_retries
        self._network_timeout = network_timeout
        self._max_processes_nb = max_processes_nb
        self._max_processes_per_hostname_nb = max_processes_per_hostname_nb

        # Create hostname-specific queues to ensure aggressive scan commands targeting this hostname are never
        # run concurrently
        self._hostname_queues_dict = {}  # type: Dict[Text, JoinableQueue]
        self._processes_dict = {}  # type: Dict[Text, List[WorkerProcess]]

        # Processes get tasks from task_queue and put the result of each task in result_queue
        self._task_queue = JoinableQueue()  # type: JoinableQueue
        self._result_queue = JoinableQueue()  # type: JoinableQueue
        self._queued_tasks_nb = 0

    def queue_scan_command(self, server_info, scan_command):
        # type: (ServerConnectivityInfo, PluginScanCommand) -> None
        """Queue a scan command targeting a specific server.

        Args:
            server_info(ServerConnectivityInfo): The server's connectivity information. The
                test_connectivity_to_server() method must have been called first to ensure that the server is online
                and accessible.
            scan_command (PluginScanCommand): The scan command to run against this server.
        """
        # Ensure we have the right processes and queues in place for this hostname
        self._check_and_create_process(server_info.hostname)

        # Add the task to the right queue
        self._queued_tasks_nb += 1
        if scan_command.is_aggressive:
            # Aggressive commands should not be run in parallel against
            # a given server so we use the priority queues to prevent this
            self._hostname_queues_dict[server_info.hostname].put(
                (server_info, scan_command))
        else:
            # Normal commands get put in the standard/shared queue
            self._task_queue.put((server_info, scan_command))

    def _check_and_create_process(self, hostname):
        # type: (Text) -> None
        if hostname not in self._hostname_queues_dict.keys():
            # We haven't seen this hostname before
            if self._get_current_processes_nb() < self._max_processes_nb:
                # Create a new process and new queue for this hostname
                hostname_queue = JoinableQueue()  # type: JoinableQueue
                self._hostname_queues_dict[hostname] = hostname_queue

                process = WorkerProcess(hostname_queue, self._task_queue,
                                        self._result_queue,
                                        self._network_retries,
                                        self._network_timeout)
                process.start()
                self._processes_dict[hostname] = [process]
            else:
                # We are already using the maximum number of processes
                # Do not create a process and re-use a random existing hostname queue
                self._hostname_queues_dict[hostname] = random.choice(
                    list(self._hostname_queues_dict.values()))
                self._processes_dict[hostname] = []

        else:
            # We have seen this hostname before - create a new process if possible
            if len(self._processes_dict[hostname]) < self._max_processes_per_hostname_nb \
                    and self._get_current_processes_nb() < self._max_processes_nb:
                # We can create a new process; no need to create a queue as it already exists
                process = WorkerProcess(self._hostname_queues_dict[hostname],
                                        self._task_queue, self._result_queue,
                                        self._network_retries,
                                        self._network_timeout)
                process.start()
                self._processes_dict[hostname].append(process)

    def _get_current_processes_nb(self):
        # type: () -> int
        return sum([
            len(process_list)
            for hostname, process_list in self._processes_dict.items()
        ])

    def get_results(self):
        # type: () -> Iterable[PluginScanResult]
        """Return the result of previously queued scan commands; new commands cannot be queued once this is called.

        Yields:
            PluginScanResult: The result of the scan command, which will be an instance of the scan command's
            corresponding PluginScanResult subclass. If there was an unexpected error while running the scan command,
            this will be a PluginRaisedExceptionScanResult instance instead.
        """
        # Put a 'None' sentinel in the queue to let each process know when every task has been completed
        for _ in range(self._get_current_processes_nb()):
            self._task_queue.put(None)

        for hostname, hostname_queue in self._hostname_queues_dict.items():
            for i in range(len(self._processes_dict[hostname])):
                hostname_queue.put(None)

        received_task_results = 0
        # Go on until all the tasks have been completed and all processes are done
        expected_task_results = self._queued_tasks_nb + self._get_current_processes_nb()
        while received_task_results != expected_task_results:
            result = self._result_queue.get()
            self._result_queue.task_done()
            received_task_results += 1
            if result is None:
                # Getting None means that one process was done
                pass
            else:
                # Getting an actual result
                yield result

        # Ensure all the queues and processes are done
        self._task_queue.join()
        self._result_queue.join()
        for hostname_queue in self._hostname_queues_dict.values():
            hostname_queue.join()
        for process_list in self._processes_dict.values():
            for process in process_list:
                process.join()

    def emergency_shutdown(self):
        # type: () -> None
        # Terminating a process this way will corrupt the queues but we're shutting down anyway
        for process_list in self._processes_dict.values():
            for process in process_list:
                process.terminate()
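
A minimal usage sketch for the scanner above; `server_info` and `scan_commands` are placeholders that would normally come from SSLyze's server connectivity test and plugin scan command classes, which are not part of this excerpt.

scanner = ConcurrentScanner(max_processes_nb=6)
for scan_command in scan_commands:              # assumed iterable of PluginScanCommand objects
    scanner.queue_scan_command(server_info, scan_command)
for result in scanner.get_results():            # blocks until every queued command has finished
    print(result)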
Пример #54
0
class PluginsProcessPool(object):
    """Creates a pool of processes and dispatches scanning commands to be run concurrently.
    """

    DEFAULT_MAX_PROCESSES_NB = 12
    DEFAULT_PROCESSES_PER_HOSTNAME_NB = 3

    # Controls every socket connection done by every plugin
    DEFAULT_NETWORK_RETRIES = 3
    DEFAULT_NETWORK_TIMEOUT = 5  # in seconds

    def __init__(self, available_plugins, network_retries=DEFAULT_NETWORK_RETRIES,
                 network_timeout=DEFAULT_NETWORK_TIMEOUT,
                 max_processes_nb=DEFAULT_MAX_PROCESSES_NB,
                 max_processes_per_hostname_nb=DEFAULT_PROCESSES_PER_HOSTNAME_NB):
        """
        Args:
            available_plugins (PluginsFinder): An object encapsulating the list of available plugins.
            network_retries (Optional[int]): How many times plugins should retry a connection that timed out.
            network_timeout (Optional[int]): The time until an ongoing connection times out within all plugins.
            max_processes_nb (Optional[int]): The maximum number of processes to spawn for running scans concurrently.
            max_processes_per_hostname_nb (Optional[int]): The maximum of processes that can be used for running scans
                concurrently on a single server.

        Returns:
            PluginsProcessPool: An object for queueing scan commands to be run concurrently.

        """

        self._available_plugins = available_plugins
        self._network_retries = network_retries
        self._network_timeout = network_timeout
        self._max_processes_nb = max_processes_nb
        self._max_processes_per_hostname_nb = max_processes_per_hostname_nb

        # Create hostname-specific queues to ensure aggressive scan commands targeting this hostname are never
        # run concurrently
        self._hostname_queues_dict = {}
        self._processes_dict = {}

        self._task_queue = JoinableQueue()  # Processes get tasks from task_queue and
        self._result_queue = JoinableQueue()  # put the result of each task in result_queue
        self._queued_tasks_nb = 0


    def queue_plugin_task(self, server_connectivity_info, plugin_command, plugin_options_dict={}):
        """Queue a scan command targeting a specific server.

        Args:
            server_connectivity_info (ServerConnectivityInfo): The information for connecting to the server.
            plugin_command (str): The plugin scan command to be run on the server. Available commands for each plugin
                are described in the sslyze CLI --help text.
            plugin_options_dict (dict): Scan options to be passed to the plugin. Available options for each plugin are
                described in the sslyze CLI --help text.
        """
        # Ensure we have the right processes and queues in place for this hostname
        self._check_and_create_process(server_connectivity_info.hostname)

        # Add the task to the right queue
        self._queued_tasks_nb += 1
        if plugin_command in self._available_plugins.get_aggressive_commands():
            # Aggressive commands should not be run in parallel against
            # a given server so we use the priority queues to prevent this
            self._hostname_queues_dict[server_connectivity_info.hostname].put((server_connectivity_info, plugin_command,
                                                                               plugin_options_dict))
        else:
            # Normal commands get put in the standard/shared queue
            self._task_queue.put((server_connectivity_info, plugin_command, plugin_options_dict))


    def _check_and_create_process(self, hostname):
        if hostname not in self._hostname_queues_dict.keys():
            # We haven't seen this hostname before
            if self._get_current_processes_nb() < self._max_processes_nb:
                # Create a new process and new queue for this hostname
                hostname_queue = JoinableQueue()
                self._hostname_queues_dict[hostname] = hostname_queue

                process = WorkerProcess(hostname_queue, self._task_queue, self._result_queue,
                                        self._available_plugins.get_commands(), self._network_retries,
                                        self._network_timeout)
                process.start()
                self._processes_dict[hostname] = [process]
            else:
                # We are already using the maximum number of processes
                # Do not create a process and re-use a random existing hostname queue
                self._hostname_queues_dict[hostname] = random.choice(self._hostname_queues_dict.values())
                self._processes_dict[hostname] = []

        else:
            # We have seen this hostname before - create a new process if possible
            if len(self._processes_dict[hostname]) < self._max_processes_per_hostname_nb \
                    and self._get_current_processes_nb() < self._max_processes_nb:
                # We can create a new process; no need to create a queue as it already exists
                process = WorkerProcess(self._hostname_queues_dict[hostname], self._task_queue, self._result_queue,
                                        self._available_plugins.get_commands(), self._network_retries,
                                        self._network_timeout)
                process.start()
                self._processes_dict[hostname].append(process)


    def _get_current_processes_nb(self):
        return sum([len(process_list) for hostname, process_list in self._processes_dict.iteritems()])


    def get_results(self):
        """Returns the result of previously queues scan command; new tasks can no longer be queued once this is called.

        Yields:
            PluginResult: The result of a scan command run on a server. The server and command information are available
                within the server_info and plugin_command attributes. The PluginResult object also has
                command/plugin-specific attributes with the result of the scan command that was run; see
                specific PluginResult subclasses for the list of attributes.
        """
        # Put a 'None' sentinel in the queue to let each process know when every task has been completed
        for _ in xrange(self._get_current_processes_nb()):
            self._task_queue.put(None)

        for hostname, hostname_queue in self._hostname_queues_dict.iteritems():
            for i in xrange(len(self._processes_dict[hostname])):
                hostname_queue.put(None)

        received_task_results = 0
        # Go on until all the tasks have been completed and all processes are done
        expected_task_results = self._queued_tasks_nb + self._get_current_processes_nb()
        while received_task_results != expected_task_results:
            result = self._result_queue.get()
            self._result_queue.task_done()
            received_task_results += 1
            if result is None:
                # Getting None means that one process was done
                pass
            else:
                # Getting an actual result
                yield result

        # Ensure all the queues and processes are done
        self._task_queue.join()
        self._result_queue.join()
        for hostname_queue in self._hostname_queues_dict.values():
            hostname_queue.join()
        for process_list in self._processes_dict.values():
            [process.join() for process in process_list]  # Causes interpreter shutdown errors


    def emergency_shutdown(self):
        # Terminating a process this way will corrupt the queues but we're shutting down anyway
        for process_list in self._processes_dict.values():
            [process.terminate() for process in process_list]
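
A sketch of how this older pool API could be driven, including the emergency_shutdown() path; `plugins_finder`, `server_info`, and the 'certinfo_basic' command string are illustrative placeholders rather than values taken from this excerpt.

plugins_pool = PluginsProcessPool(plugins_finder)
plugins_pool.queue_plugin_task(server_info, 'certinfo_basic')
try:
    for plugin_result in plugins_pool.get_results():
        print(plugin_result)
except KeyboardInterrupt:
    plugins_pool.emergency_shutdown()
    raise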
Пример #55
0
def cleanup(days, project, concurrency, silent, model, router, timed):
    """Delete a portion of trailing data based on creation date.

    All data that is older than `--days` will be deleted.  The default for
    this is 30 days.  In the default setting all projects will be truncated,
    but if there is a specific project you want to limit this to, use the
    `--project` flag, which accepts a project ID or a string of the form
    `org/project` where both are slugs.
    """
    if concurrency < 1:
        click.echo("Error: Minimum concurrency is 1", err=True)
        raise click.Abort()

    os.environ["_SENTRY_CLEANUP"] = "1"

    # Make sure we fork off multiprocessing pool
    # before we import or configure the app
    from multiprocessing import Process, JoinableQueue as Queue

    pool = []
    task_queue = Queue(1000)
    for _ in xrange(concurrency):
        p = Process(target=multiprocess_worker, args=(task_queue,))
        p.daemon = True
        p.start()
        pool.append(p)

    from sentry.runner import configure

    configure()

    from django.db import router as db_router
    from sentry.app import nodestore
    from sentry.db.deletion import BulkDeleteQuery
    from sentry import models

    if timed:
        import time
        from sentry.utils import metrics

        start_time = time.time()

    # list of models which this query is restricted to
    model_list = {m.lower() for m in model}

    def is_filtered(model):
        if router is not None and db_router.db_for_write(model) != router:
            return True
        if not model_list:
            return False
        return model.__name__.lower() not in model_list

    # Deletions that use `BulkDeleteQuery` (and don't need to worry about child relations)
    # (model, datetime_field, order_by)
    BULK_QUERY_DELETES = [
        (models.EventAttachment, "date_added", None),
        (models.UserReport, "date_added", None),
        (models.GroupEmailThread, "date", None),
        (models.GroupRuleStatus, "date_added", None),
    ] + EXTRA_BULK_QUERY_DELETES

    # Deletions that use the `deletions` code path (which handles their child relations)
    # (model, datetime_field, order_by)
    DELETES = ((models.Group, "last_seen", "last_seen"),)

    if not silent:
        click.echo("Removing expired values for LostPasswordHash")

    if is_filtered(models.LostPasswordHash):
        if not silent:
            click.echo(">> Skipping LostPasswordHash")
    else:
        models.LostPasswordHash.objects.filter(
            date_added__lte=timezone.now() - timedelta(hours=48)
        ).delete()

    if not silent:
        click.echo("Removing expired values for OrganizationMember")

    if is_filtered(models.OrganizationMember):
        if not silent:
            click.echo(">> Skipping OrganizationMember")
    else:
        expired_threshold = timezone.now() - timedelta(days=days)
        models.OrganizationMember.delete_expired(expired_threshold)

    for model in [models.ApiGrant, models.ApiToken]:
        if not silent:
            click.echo(u"Removing expired values for {}".format(model.__name__))

        if is_filtered(model):
            if not silent:
                click.echo(u">> Skipping {}".format(model.__name__))
        else:
            queryset = model.objects.filter(
                expires_at__lt=(timezone.now() - timedelta(days=API_TOKEN_TTL_IN_DAYS))
            )

            # SentryAppInstallations are associated to ApiTokens. We're okay
            # with these tokens sticking around so that the Integration can
            # refresh them, but all other non-associated tokens should be
            # deleted.
            if model is models.ApiToken:
                queryset = queryset.filter(sentry_app_installation__isnull=True)

            queryset.delete()

    project_id = None
    if project:
        click.echo("Bulk NodeStore deletion not available for project selection", err=True)
        project_id = get_project(project)
        if project_id is None:
            click.echo("Error: Project not found", err=True)
            raise click.Abort()
    else:
        if not silent:
            click.echo("Removing old NodeStore values")

        cutoff = timezone.now() - timedelta(days=days)
        try:
            nodestore.cleanup(cutoff)
        except NotImplementedError:
            click.echo("NodeStore backend does not support cleanup operation", err=True)

    for bqd in BULK_QUERY_DELETES:
        if len(bqd) == 4:
            model, dtfield, order_by, chunk_size = bqd
        else:
            chunk_size = 10000
            model, dtfield, order_by = bqd

        if not silent:
            click.echo(
                u"Removing {model} for days={days} project={project}".format(
                    model=model.__name__, days=days, project=project or "*"
                )
            )
        if is_filtered(model):
            if not silent:
                click.echo(">> Skipping %s" % model.__name__)
        else:
            BulkDeleteQuery(
                model=model, dtfield=dtfield, days=days, project_id=project_id, order_by=order_by
            ).execute(chunk_size=chunk_size)

    for model, dtfield, order_by in DELETES:
        if not silent:
            click.echo(
                u"Removing {model} for days={days} project={project}".format(
                    model=model.__name__, days=days, project=project or "*"
                )
            )

        if is_filtered(model):
            if not silent:
                click.echo(">> Skipping %s" % model.__name__)
        else:
            imp = ".".join((model.__module__, model.__name__))

            q = BulkDeleteQuery(
                model=model, dtfield=dtfield, days=days, project_id=project_id, order_by=order_by
            )

            for chunk in q.iterator(chunk_size=100):
                task_queue.put((imp, chunk))

            task_queue.join()

    # Clean up FileBlob instances which are no longer used and aren't super
    # recent (as there could be a race between blob creation and reference)
    if not silent:
        click.echo("Cleaning up unused FileBlob references")
    if is_filtered(models.FileBlob):
        if not silent:
            click.echo(">> Skipping FileBlob")
    else:
        cleanup_unused_files(silent)

    # Shut down our pool
    for _ in pool:
        task_queue.put(_STOP_WORKER)

    # And wait for it to drain
    for p in pool:
        p.join()

    if timed:
        duration = int(time.time() - start_time)
        metrics.timing("cleanup.duration", duration, instance=router, sample_rate=1.0)
        click.echo("Clean up took %s second(s)." % duration)
Пример #56
0
def score(family_file, variant_file, family_type, annotation_dir, vep,
          plugin_file, processes, silent, outfile, verbose):
    """
    Score variants in a vcf file using Weighted Sum Model.
    The specific scores should be defined in a config file; see examples in
    genmod/configs.
    """

    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    argument_list = [
        i + '=' + str(values[i]) for i in values
        if values[i] and i != 'args' and i != 'frame' and i != 'parser'
    ]

    start_time_analysis = datetime.now()

    if verbose:
        log.info('Running GENMOD score, version: %s \n' % VERSION)

    ## Start by parsing the pedigree file:
    prefered_models = []
    family_id = None

    if family_file:
        prefered_models, family_id = get_genetic_models(
            family_file, family_type)
    else:
        log.critical("Please provide a family file")
        sys.exit()

    if verbose:
        log.info('Preferred models found in family file: %s \n' %
                 prefered_models)

    if not plugin_file:
        log.critical("Please provide a plugin file")
        sys.exit()

    ######### Read to the annotation data structures #########

    gene_trees = {}
    exon_trees = {}

    # If the variants are already annotated we do not need to redo the annotation
    if not vep:
        gene_trees, exon_trees = load_annotations(annotation_dir, verbose)
    else:
        if verbose:
            log.info('Using VEP annotation')

    ## Check the variants:

    if variant_file == '-':
        variant_parser = VCFParser(fsock=sys.stdin, skip_info_check=True)
    else:
        variant_parser = VCFParser(infile=variant_file, skip_info_check=True)

    head = variant_parser.metadata

    add_metadata(head,
                 'version',
                 'genmod_score',
                 version=VERSION,
                 command_line_string=' '.join(argument_list))

    add_metadata(
        head,
        'info',
        'IndividualRankScore',
        annotation_number='.',
        entry_type='String',
        description="Individual rank score for the variant in this family. "\
        "This score is NOT corrected for compounds"
    )

    add_metadata(
        head,
        'info',
        'RankScore',
        annotation_number='.',
        entry_type='String',
        description="Combined rank score for the variant in this family. "\
        "This score is corrected for compounds"
    )

    alt_dict, score_dict, value_dict, operation_dict = check_plugin(
        plugin_file, variant_parser, verbose)

    ####################################################################
    ### The variant queue is where all jobs (in this case batches    ###
    ### that represent variants in a region) are put. The consumers  ###
    ### will then pick their jobs from this queue.                   ###
    ####################################################################

    variant_queue = JoinableQueue(maxsize=1000)
    # The consumers will put their results in the results queue
    results = Manager().Queue()

    num_model_scorers = processes

    if verbose:
        log.info('Number of CPUs: %s' % cpu_count())
        log.info('Number of model scorers: %s' % num_model_scorers)

    temp_file = NamedTemporaryFile(delete=False)
    temp_file.close()

    # We open a variant file to print the variants before sorting:
    temporary_variant_file = open(temp_file.name,
                                  mode='w',
                                  encoding='utf-8',
                                  errors='replace')

    model_scorers = [
        VariantScorer(variant_queue, results, variant_parser.header,
                      prefered_models, family_id, alt_dict, score_dict,
                      value_dict, operation_dict, verbose)
        for i in range(num_model_scorers)
    ]

    for proc in model_scorers:
        proc.start()

    # This process prints the variants to temporary files
    var_printer = VariantPrinter(results,
                                 temporary_variant_file,
                                 head,
                                 mode='score',
                                 verbosity=verbose)

    var_printer.start()

    start_time_variant_parsing = datetime.now()

    if verbose:
        log.info('Start parsing the variants ... \n')

    # get_batches puts the variants in the queue and returns all chromosomes
    # found among the variants
    chromosome_list = get_batches(variant_parser,
                                  variant_queue,
                                  individuals=[],
                                  gene_trees=gene_trees,
                                  exon_trees=exon_trees,
                                  phased=False,
                                  vep=vep,
                                  whole_genes=True,
                                  verbosity=verbose)

    # Put stop signs in the variant queue
    for i in range(num_model_scorers):
        variant_queue.put(None)

    variant_queue.join()

    results.put(None)
    var_printer.join()

    temporary_variant_file.close()

    if verbose:
        log.info('Chromosomes found in variant file: %s \n' %
                 ','.join(chromosome_list))
        log.info('Variants scored!\n')

    sort_variants(infile=temp_file.name, mode='rank', verbose=verbose)

    print_headers(head, outfile, silent)

    print_variants(temp_file.name, outfile, mode='modified', silent=silent)

    os.remove(temp_file.name)

    if verbose:
        log.info('Time for whole analysis: %s' %
                 str(datetime.now() - start_time_analysis))
Пример #57
0
def run():
    global verbose
    verbose = CONFIG["process_verbose"] or CONFIG["report_verbose"]
    global process_verbose
    process_verbose = CONFIG["process_verbose"]
    span_done = JoinableQueue()

    global log_messages
    log_messages = JoinableQueue()

    spans_to_process = sorted(CONFIG["spans"], reverse=True)

    # Create the logger process
    log_filename = "annotated_network_processing.log"
    if os.path.exists(log_filename):
        os.remove(log_filename)
    loggerP = Process(target=logger, args=(log_filename, log_messages))
    loggerP.daemon = True
    loggerP.start()

    # Create the first process on spans
    span_procs = {}
    for _ in range(min(CONFIG["nb_processes"], len(spans_to_process))):
        span = spans_to_process.pop()
        p = Process(target=process_span,
                    args=(span, span_done, CONFIG["spans"],
                          CONFIG["parsed_data"], CONFIG["network_colours"],
                          CONFIG["export_ref_format"],
                          CONFIG["export_ref_annotated_format"],
                          CONFIG["output_directory"]))
        p.daemon = True
        p.start()
        span_procs[span] = p

    if CONFIG["report_csv"]:
        prepare_csv(CONFIG["reports_directory"])

    while len(spans_to_process) > 0 or len(span_procs) > 0:
        s = span_done.get()
        span = s["span"]
        span_procs[s["span"]].join()
        log_messages.put("%s done" % s['span'])
        del span_procs[s["span"]]

        # Create a new process if needed
        print("still %s spans to process" % len(spans_to_process))
        if len(spans_to_process) > 0:
            next_span = spans_to_process.pop()
            span_procs[next_span] = Process(
                target=process_span,
                args=(next_span, span_done, CONFIG["spans"],
                      CONFIG["parsed_data"], CONFIG["network_colours"],
                      CONFIG["export_ref_format"],
                      CONFIG["export_ref_annotated_format"],
                      CONFIG["output_directory"]))
            span_procs[next_span].daemon = True
            span_procs[next_span].start()
            print("new process on %s" % next_span)

        if CONFIG["report_csv"]:
            csv_writing(s, CONFIG["reports_directory"], CONFIG["spans"])
            span_done.task_done()

    span_done.join()
    log_messages.join()
    loggerP.terminate()
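
# logger() is referenced above but not defined in this excerpt. A minimal
# sketch consistent with how run() uses it: it drains log_messages, appends
# each message to the log file, and calls task_done() so that
# log_messages.join() can return; the process itself is terminated by run().
def logger(log_filename, log_messages):
    with open(log_filename, 'a') as log_file:
        while True:
            message = log_messages.get()
            log_file.write('%s\n' % message)
            log_file.flush()
            log_messages.task_done()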
def main():
    INDEX_COUNT = args.get('index_count')
    TYPE_COUNT = args.get('type_count')
    SETUP = args.get('setup')

    indices = []
    types = []
    work_queue = JoinableQueue()

    apiclient = APIClient('http://%s:9200' % es_hosts[random.randint(0, len(es_hosts) - 1)].get('host'))

    workers = [Worker(work_queue) for x in xrange(args.get('workers'))]
    [worker.start() for worker in workers]

    try:
        #
        for x in xrange(TYPE_COUNT):
            type_name = '%s_%s' % (args.get('type_prefix'), x)
            types.append(type_name)

        for x in xrange(INDEX_COUNT):
            index_name = '%s_%s' % (args.get('index_prefix'), x)
            indices.append(index_name)

        if SETUP:
            print 'Running setup...'

            for index_name in indices:
                apiclient.delete_index(index_name)

            time.sleep(1)

            for index_name in indices:
                apiclient.create_index(
                    index_name,
                    shards=args['shard_count'],
                    replicas=args['replica_count'])

                # time.sleep(5)

                # for index_name in indices:
                # for type_name in types:
                # apiclient.define_type_mapping(index_name, type_name)

                # time.sleep(5)

        total_messages = args.get('document_count')
        batch_size = 100000
        message_counter = 0
        fields = random.randint(50, 100)

        while message_counter < total_messages:

            for count in xrange(batch_size):

                for index_name in indices:
                    doc_id = str(uuid.uuid1())

                    task = {
                        'field_count': fields,
                        'uuid': doc_id,
                        'index': index_name,
                        'type': types[random.randint(0, len(types) - 1)]
                    }

                    work_queue.put(task)

            print 'Joining queue counter=[%s]...' % message_counter
            work_queue.join()
            print 'Done queue counter=[%s]...' % message_counter
            message_counter += batch_size

    except KeyboardInterrupt:
        [worker.terminate() for worker in workers]
Пример #59
0
class Qiushi(object):
    def __init__(self):
        self.url = 'https://www.qiushibaike.com/8hr/page/{}/'
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        }
        self.file = open('qiushi.json', 'w')
        self.url_queue = Queue()
        self.response_queue = Queue()
        self.data_queue = Queue()

    def generate_url_list(self):
        print('Generating the URL queue')
        # return [self.url.format(i) for i in range(1,14)]

        for i in range(1, 14):
            url = self.url.format(i)
            self.url_queue.put(url)

    def get_data(self):
        while True:
            url = self.url_queue.get()
            print('Fetching the response for {}'.format(url))
            response = requests.get(url, headers=self.headers)
            if response.status_code == 503:
                self.url_queue.put(url)
            else:
                self.response_queue.put(response.content)
            self.url_queue.task_done()

    def parse_data(self):
        while True:
            data = self.response_queue.get()
            print('Parsing response')
            # build an etree element object from the page source
            html = etree.HTML(data)

            # get the list of post nodes
            el_list = html.xpath('//div[@id="content-left"]/div')

            data_list = []
            # iterate over the post nodes
            for el in el_list:
                temp = {}
                temp['content'] = el.xpath('./a/div/span/text()')[0].strip()
                data_list.append(temp)
                # print(temp)
            self.data_queue.put(data_list)
            self.response_queue.task_done()

    def save_data(self):
        while True:
            print('Saving data')
            data_list = self.data_queue.get()
            for data in data_list:
                json_data = json.dumps(data, ensure_ascii=False) + ',\n'
                self.file.write(json_data)
            self.data_queue.task_done()

    def __del__(self):
        self.file.close()

    def run(self):
        # # url
        # # url_list
        # url_list = self.generate_url_list()
        # # headers
        # # iterate over url_list
        # for url in url_list:
        #
        #     # send the request and get the response
        #     data = self.get_data(url)
        #
        #     # parse the response
        #     data_list = self.parse_data(data)
        #
        #     # save the data
        #     self.save_data(data_list)

        thread_list = []

        # create the URL-producer process
        t_generate_list = Process(target=self.generate_url_list)
        thread_list.append(t_generate_list)

        # create the request-sending processes
        for i in range(4):
            t = Process(target=self.get_data)
            thread_list.append(t)

        # create the response-parsing processes
        for i in range(3):
            t = Process(target=self.parse_data)
            thread_list.append(t)

        t_save_data = Process(target=self.save_data)
        thread_list.append(t_save_data)

        # mark the processes as daemons and start them
        for t in thread_list:
            t.daemon = True
            t.start()

        for q in [self.url_queue, self.response_queue, self.data_queue]:
            q.join()
Пример #60
0
def produce(q: JoinableQueue, n: int):
    for i in range(n):
        q.put(f'{current_process().name}: {i}')