Example #1
def main():
    sniffer_count = min([cpu_count(), len(argv[1:])])
    sniffer_queue = JoinableQueue()
    db_queue = JoinableQueue()
    for filename in argv[1:]:
        if os.path.isfile(filename):
            print('Will be loading from file', filename)
        else:
            print('Will sniff from interface', filename)
        sniffer_queue.put(filename)
    if argv[1:] == []:
        sniffer_queue.put('*')
        if sniffer_count == 0:
            sniffer_count = 1

    sniffers = []
    for _ in range(sniffer_count):
        p = Process(target=sniffer, args=(sniffer_queue, db_queue))
        p.start()
        sniffers.append(p)
        sniffer_queue.put(None)
    db_proc = Process(target=db_worker, args=(sniffer_count, db_queue))
    db_proc.start()
    interfaces = []
    sniffer_queue.close()
    db_proc.join()
    for sniffer_proc in sniffers:
        sniffer_proc.join()
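Note: the sniffer and db_worker targets are not shown in this example. A minimal sketch of a compatible sniffer worker, assuming it drains interface names / pcap paths from sniffer_queue until it hits the None sentinel and then tells db_worker it is finished (the stub record and the final None marker are assumptions for illustration):

def sniffer(sniffer_queue, db_queue):
    # Pull interface names / pcap paths until the None sentinel arrives.
    for source in iter(sniffer_queue.get, None):
        # Real code would capture or parse packets here; a stub record is forwarded instead.
        db_queue.put({'source': source, 'packets': 0})
    # Assumption: db_worker is given sniffer_count and counts one None per sniffer to know when to stop.
    db_queue.put(None)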
Example #2
class multiproc_calculator():
    def __init__(self, num_processes, shape=(4, 4)):
        self.matrix_list = []
        self.task_queue = JoinableQueue()
        self.result_queue = Queue()
        self.result = np.zeros(shape)
        self.processes = [
            matmul_process(self.task_queue, self.result_queue)
            for i in range(num_processes)
        ]

    def get_result(self):
        self.task_queue.join()
        self.task_queue.close()
        while not self.result_queue.empty():
            np.add(self.result, self.result_queue.get(), out=self.result)

        return self.result

    def start_proc(self):
        for proc in self.processes:
            proc.daemon = True
            proc.start()

    def add_new_matrix(self, matrix):
        # generate every possible ordered pair with the matrices added so far
        for a in self.matrix_list:
            self.task_queue.put((a, matrix))
            self.task_queue.put((matrix, a))
        self.matrix_list.append(matrix)
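Note: matmul_process is not shown in this example. A minimal sketch of a compatible worker, assuming it multiplies each queued (a, b) pair and never exits on its own (the pool relies on daemon=True for shutdown):

import numpy as np
from multiprocessing import Process

class matmul_process(Process):
    def __init__(self, task_queue, result_queue):
        super().__init__()
        self.task_queue = task_queue
        self.result_queue = result_queue

    def run(self):
        while True:
            a, b = self.task_queue.get()
            self.result_queue.put(np.matmul(a, b))
            # task_done() is what lets get_result() return from task_queue.join().
            self.task_queue.task_done()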
Example #3
def annotate_gtf_parallel(input_gtf_file, output_gtf_file, gtf_sample_attr,
                          num_processors, tmp_dir):
    # create queue
    input_queue = JoinableQueue(maxsize=num_processors * 3)
    # start worker processes
    procs = []
    worker_gtf_files = []
    for i in xrange(num_processors):
        worker_gtf_file = os.path.join(tmp_dir,
                                       "annotate_worker%03d.gtf" % (i))
        worker_gtf_files.append(worker_gtf_file)
        args = (input_queue, worker_gtf_file, gtf_sample_attr)
        p = Process(target=annotate_gtf_worker, args=args)
        p.daemon = True
        p.start()
        procs.append(p)
    for lines in parse_loci(open(input_gtf_file)):
        input_queue.put(lines)
    # stop workers
    for p in procs:
        input_queue.put([])
    # close queue
    input_queue.join()
    input_queue.close()
    # join worker processes
    for p in procs:
        p.join()
    # merge/sort worker gtf files
    logging.debug("Merging %d worker GTF file(s)" % (num_processors))
    merge_sort_gtf_files(worker_gtf_files, output_gtf_file, tmp_dir=tmp_dir)
    # remove worker gtf files
    for filename in worker_gtf_files:
        if os.path.exists(filename):
            os.remove(filename)
Example #4
def main(argv):
    logging.basicConfig(filename=options.log_filename,
                        level=logging.INFO + 10 *
                        (options.quiet - options.verbose))
    log = logging.getLogger(os.path.basename(sys.argv[0]))
    FORMAT = '%(asctime)s|%(levelname)s|%(process)d|%(module)s.py|%(funcName)s|%(lineno)d|  %(message)s'
    if 1:
        handler = logging.StreamHandler(sys.stdout)
        handler.setLevel(logging.DEBUG)
        formatter = logging.Formatter(FORMAT, datefmt="%Y-%m-%d %H:%M:%S")
        handler.setFormatter(formatter)
        #pprint(dir(handler))
        log.addHandler(handler)
    #log.info('test')
    #e()
    file_object_cache = FileObjectCache()
    #key_name, value_kwargs = args
    #value = Value(file_object_cache, content=None, filename=None, md5=None, offset=None, path=None, size=None, bucket_name=None)
    start = time.time()
    if 1:
        put_queue = JoinableQueue(1024 * options.processes)
        stat_queue = JoinableQueue()
        walk = {'filesystem': walk_filesystem}[options.walk]
        args = [
            '/auto/fina-data/share/FARepository/prod/CIGActgS11/position/processing/Priority_2/PositionSide/122654_DESK_CDRG183872PositionSide.bcp.SSrvr'
        ]
        walker_process = Process(target=walker,
                                 args=(walk, put_queue, args, options))
        walker_process.start()

    if 1:
        put = {'update': put_update}[options.put]
        #print put
        #e()
        putter_processes = list(
            islice(
                repeatedly(Process,
                           target=putter,
                           args=(put, put_queue, stat_queue, options)),
                options.processes))
        for putter_process in putter_processes:
            #print putter_process
            putter_process.start()
    walker_process.join()
    if 1:
        statter_process = Process(target=statter,
                                  args=(stat_queue, start, options))
        statter_process.start()

    for putter_process in putter_processes:
        put_queue.put(None)
    put_queue.close()
    for putter_process in putter_processes:
        putter_process.join()

    stat_queue.put(None)
    stat_queue.close()
    statter_process.join()
    put_queue.join_thread()
    stat_queue.join_thread()
Example #5
def readCEFFile(afile,pygtail):
    if exists(afile): #sometimes files can move/archive while we iterate the list
        try:
            #start a process to post our stuff.
            logcache=JoinableQueue()
            postingProcess=Process(target=postLogs,args=(logcache,),name="cef2mozdefHTTPPost")
            postingProcess.start()            
            #have pygtail feed us lines 
            for line in pygtail:
                pygtail._update_offset_file()
                cefDict=parseCEF(line)
                #logger.debug(json.dumps(cefDict))
                #append json to the list for posting
                if cefDict is not None:
                    logcache.put(json.dumps(cefDict))        
            logger.info('{0} done'.format(afile))
            logger.info('waiting for posting to finish')
            logcache.put(None)
            logcache.close()
            #logger.info('posting done')
        except KeyboardInterrupt:
            sys.exit(1)
        except ValueError as e:
            logger.fatal('Exception while handling CEF message: %r'%e)
            sys.exit(1)    
Example #6
class GHDDIMultiProcessPool:

    def __init__(self, target, database=None):
        self._inputQueue = Queue()
        self._outputQueue = Queue()
        jobs = []
        for i in range(0, os.cpu_count()):
            jobs.append(GHDDIProcess(target, database, self._inputQueue, self._outputQueue))
        self._jobs = jobs

    def __del__(self):
        print('processPool del')
        self._inputQueue.join()
        self._outputQueue.join()

        self._inputQueue.close()
        self._outputQueue.close()
        for p in self._jobs:
            p.terminate()
            p.close()

    def startAll(self):
        for p in self._jobs:
            p.start()

    def finishAll(self):
        pass

    def putTask(self, taskArgs, block=True, timeout=None):
        self._inputQueue.put(taskArgs, block=block, timeout=timeout)

    def getTaskRet(self, block=True, timeout=None):
        return self._outputQueue.get(block=block, timeout=timeout)
Example #7
def search8(q, path):
    jobs = Queue()
    result = JoinableQueue()
    NUMBER_OF_PROCESSES = cpu_count()

    job_count = 0
    for f in os.scandir('data'):
        jobs.put(f.path)
        job_count = job_count + 1

    [
        Process(target=work, args=(i, q, jobs, result)).start()
        for i in range(NUMBER_OF_PROCESSES)
    ]

    matches = []
    for t in range(job_count):
        r = result.get()
        result.task_done()
        if r:
            matches.append(r)

    matches.sort()

    for w in range(NUMBER_OF_PROCESSES):
        jobs.put(None)

    result.join()
    jobs.close()
    result.close()

    return matches
Example #8
def crunch(file_name, ext_type, handler, pool_size=4, queue_size=40,
           limit=None):

    print 'Crunching file: %s, limit: %s' % (file_name, limit)

    q = JoinableQueue(queue_size)
    q_feats = Queue()

    pool = Pool(pool_size, wrap_handler(handler), ((q, q_feats),))

    with file_reader(file_name) as reader:
        idx = 0
        for entry in reader:

            if (entry.pathname.find(ext_type) != -1):
                text = [b for b in entry.get_blocks()]
                key = entry.pathname.split('/')[-1].split('.')[0]

                q.put((key, text), True)
                idx += 1

                print 'Processing:', entry.pathname, idx

                if limit and idx >= limit:
                    print 'Reached the limit'
                    break

        q.close()
        q.join()
        pool.close()

    result = []
    for i in range(q_feats.qsize()):
        result.append(q_feats.get())
    return result
Example #9
def readCEFFile(afile, pygtail):
    if exists(afile):  #sometimes files can move/archive while we iterate the list
        try:
            #start a process to post our stuff.
            logcache = JoinableQueue()
            postingProcess = Process(target=postLogs,
                                     args=(logcache, ),
                                     name="cef2mozdefHTTPPost")
            postingProcess.start()
            #have pygtail feed us lines
            for line in pygtail:
                pygtail._update_offset_file()
                cefDict = parseCEF(line)
                #logger.debug(json.dumps(cefDict))
                #append json to the list for posting
                if cefDict is not None:
                    logcache.put(json.dumps(cefDict))
            logger.info('{0} done'.format(afile))
            logger.info('waiting for posting to finish')
            logcache.put(None)
            logcache.close()
            #logger.info('posting done')
        except KeyboardInterrupt:
            sys.exit(1)
        except ValueError as e:
            logger.fatal('Exception while handling CEF message: %r' % e)
            sys.exit(1)
Example #10
def main():
    jobs = Queue()
    result = JoinableQueue()
    NUMBER_OF_PROCESSES = cpu_count()

    tasks = ["1", "2", "3", "4", "5"]

    for w in tasks:
        jobs.put(w)

    [
        Process(target=work, args=(i, jobs, result)).start()
        for i in range(NUMBER_OF_PROCESSES)
    ]

    print('starting workers')

    for t in range(len(tasks)):
        r = result.get()
        time.sleep(0.5)
        print(r)
        result.task_done()

    for w in range(NUMBER_OF_PROCESSES):
        jobs.put(None)

    result.join()
    jobs.close()
    result.close()
Example #11
class Queue:
    def __init__(self):
        self._queue = JoinableQueue()

    def put(self, element):
        if self._queue is not None:
            self._queue.put(element)

    def get(self):
        if self._queue is not None:
            try:
                return self._queue.get()
            except:
                return None

    def join(self):
        if self._queue is not None:
            self._queue.join()

    def task_done(self):
        if self._queue is not None:
            self._queue.task_done()

    def unblock_gets(self):
        if self._queue is not None:
            self._queue.close()
            self._queue = JoinableQueue()
Example #12
def queueManager(numProc, myList, function, *args):
	'''queueManager(numProc, myList, function, *args):
	generic function used to start worker processes via the multiprocessing Queue object
	numProc - number of processors to use
	myList - a list of objects to be iterated over
	function - target function
	*args - additional arguments to pass to function

	Return - a list of the results from myList, reordered to match the input order
	'''
	qIn = Queue()
	qOut = JoinableQueue()
	if args:
		arguments = (qIn, qOut,) + args
	else:
		arguments = (qIn, qOut,)
	results = []
	
	# reduce processer count if proc count > files
	
	i = 0
	for l in myList:
		qIn.put((i,l))
		i += 1

	for _ in range(numProc):
		Process(target=function, args=arguments).start()
	sys.stdout.write("Progress: {:>3}%".format(0))
	curProgress = 0
	lastProgress = 0
	while qOut.qsize() < len(myList):
		#sys.stdout.write("\b\b\b\b{:>3}%".format(int(ceil(100*qOut.qsize()/len(myList)))))
		curProgress = int(ceil(100*qOut.qsize()/len(myList)))
		if curProgress - lastProgress > 10:
			lastProgress += 10
			sys.stdout.write("\nProgress: {:>3}%".format(lastProgress))
			sys.stdout.flush()
	sys.stdout.write("\nProgress: {:>3}%".format(100))
	#sys.stdout.write("\b\b\b\b{:>3}%".format(100))
	sys.stdout.write("\n")
	for _ in range(len(myList)):
		# indicate done results processing
		results.append(qOut.get())
		qOut.task_done()
	#tell child processes to stop
	for _ in range(numProc):
		qIn.put('STOP')

	orderedRes = [None]*len(results)
	for i, res in results:
		orderedRes[i] = res

	qOut.join()

	qIn.close()
	qOut.close()
	return orderedRes
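Note: queueManager only wires up the queues; the target function has to follow the contract described in the docstring. A minimal sketch of a compatible worker and call (square_worker is illustrative, not part of the original code):

def square_worker(qIn, qOut):
	# Read (index, item) tuples until the 'STOP' sentinel, echoing the index back
	# so queueManager can restore the original order.
	for i, item in iter(qIn.get, 'STOP'):
		qOut.put((i, item * item))

if __name__ == '__main__':
	print(queueManager(4, [1, 2, 3, 4, 5], square_worker))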
Example #13
def main():
    print("______POPULATE FEATURE NAMES START__")
    populateFeatureNames(trainFile)
    print("___POPULATE FEATURE NAMES ENDS__")

    ################
    print("building Queue start")
    q = JoinableQueue(20)
    q_feats = Queue()
    print("building Queue end")
    print("building pool start")
    pool = Pool(16, populateFeatures, ((q, q_feats),))
    print("buiding pool ends")
    returnedList = []
    print("onlyfiles start")
    onlyfiles = [f for f in os.listdir(path) if ".asm" in f]
    print("onlyfiles")
    print(onlyfiles)
    print("onlyfiles ends")
    print("___FEATURE EXTRACTION STARTS FOR PATH__")
    for ffile in onlyfiles:
        q.put((ffile, path))
    start = time.asctime(time.localtime(time.time()))
    print("Start Time : " + start)
    q.close()
    print("Q closed")
    start = time.asctime(time.localtime(time.time()))
    print("Start Time : " + start)
    # time.sleep(100)
    q.join()
    print("Q joined")
    start = time.asctime(time.localtime(time.time()))
    print("Start Time : " + start)
    # time.sleep(100)
    pool.close()
    print("Pool closed")
    start = time.asctime(time.localtime(time.time()))
    print("Start Time : " + start)

    for i in range(q_feats.qsize()):
        returnedList.append(q_feats.get())
        # returnedList=p.map(functools.partial(populateFeatures, filePath=path), onlyfiles)
        # time.sleep(10)
        # p.close()
        # time.sleep(100)
        # p.join()
        # time.sleep(10)
    print("___ PROCESSING OUTPUT OF MAP FUNCTION FOR FEATURE_EXTRACTION STARTS___")
    # except:
    # 	print("Something went wrong")
    generateHeader()
    generateFeatures(returnedList)

    print("_____ PROCESSING OUTPUT OF MAP FUNCTION FOR FEATURE_EXTRACTION ENDS____")
    print("_____FEATURE EXTRACTION ENDS____")
Example #14
def tcount(samtools, samples, chromosomes, num_workers, q, verbose=False):
    # Define a Lock and a shared value for log printing through ProgressBar
    err_lock = Lock()
    counter = Value('i', 0)
    progress_bar = pb.ProgressBar(total=len(samples) * len(chromosomes),
                                  length=40,
                                  lock=err_lock,
                                  counter=counter,
                                  verbose=verbose)

    # Establish communication queues
    tasks = JoinableQueue()
    results = Queue()

    # Enqueue jobs
    jobs_count = 0
    for bam in samples:
        for chro in chromosomes:
            tasks.put((bam[0], bam[1], chro))
            jobs_count += 1

    # Setting up the workers
    workers = [
        TotalCounter(tasks, results, progress_bar, samtools, q, verbose)
        for i in range(min(num_workers, jobs_count))
    ]

    # Add a poison pill for each worker
    for i in range(len(workers)):
        tasks.put(None)

    # Start the workers
    for w in workers:
        w.start()

    # Wait for all of the tasks to finish
    tasks.join()

    # Get the results
    sorted_results = {}
    for i in range(jobs_count):
        res = results.get()
        sorted_results[res[0], res[1]] = res[2]

    # Close Queues
    tasks.close()
    results.close()

    # Ensure each worker terminates
    for w in workers:
        w.terminate()
        w.join()

    return sorted_results
Example #15
 def parallel(self):
     from multiprocessing import Process, Queue, JoinableQueue
     self.ntrajs = []
     for i in range(self.cpus):
         self.ntrajs.append(min(int(floor(float(self.ntraj)
             /self.cpus)),
             self.ntraj-sum(self.ntrajs)))
     cnt = sum(self.ntrajs)
     while cnt<self.ntraj:
         for i in range(self.cpus):
             self.ntrajs[i] += 1
             cnt+=1
             if (cnt>=self.ntraj):
                 break
     self.ntrajs = np.array(self.ntrajs)
     self.ntrajs = self.ntrajs[np.where(self.ntrajs>0)]
     self.nprocs = len(self.ntrajs)
     sols = []
     processes = []
     resq = JoinableQueue()
     print "Number of cpus:", self.cpus
     print "Trying to start", self.nprocs, "process(es)."
     print "Number of trajectories for each process:"
     print self.ntrajs
     for i in range(self.nprocs):
         p = Process(target=self.evolve_serial,
                 args=((resq,self.ntrajs[i],i,self.seed*(i+1)),))
         p.start()
         processes.append(p)
     resq.join()
     cnt = 0
     while True:
         try:
             sols.append(resq.get())
             resq.task_done()
             cnt += 1
             if (cnt >= self.nprocs): break
         except KeyboardInterrupt:
             break
         except:
             pass
     resq.join()
     for proc in processes:
         try:
             proc.join()
         except KeyboardInterrupt:
             print("Cancel thread on keyboard interrupt")
             proc.terminate()
             proc.join()
     resq.close()
     return sols
Example #16
class KnowledgeBase(Daemon):

    def __init__(self, config):
        set_logging(config)
        self.config = config
        self.pidfile = os.path.abspath(config['pidfile'])
        self.time_lock = Lock()
        self.teller_queue = JoinableQueue()
        self.session_factory = get_sasession(self.config)
        session = self.session_factory()

    def run(self):
        if int(self.config['instant_duration']):
            self.clock = Ticker(self.config, self.session_factory(),
                                self.time_lock, self.teller_queue)
            self.clock.start()

        host = self.config['kb_host']
        port = int(self.config['kb_port'])
        nproc = int(self.config['teller_processes'])
        for n in range(nproc):
            teller = Teller(self.config, self.session_factory, self.teller_queue)
            teller.daemon = True
            teller.start()
        self.socket = Listener((host, port))
        while True:
            try:
                client = self.socket.accept()
            except InterruptedError:
                return
            self.time_lock.acquire()
            self.teller_queue.put(client)
            self.time_lock.release()

    def cleanup(self, signum, frame):
        """cleanup tasks"""
        nproc = int(self.config['teller_processes'])
        for n in range(nproc):
            self.teller_queue.put(None)
        self.teller_queue.close()
        try:
            self.clock.ticking = False
        except AttributeError:
            pass
        self.teller_queue.join()
        try:
            self.clock.join()
        except AttributeError:
            pass
        logger.warn('bye from {n}, received signal {p}'.format(n=mp.current_process().name, p=str(signum)))
Example #17
    def __iter__(self):
        queue = JoinableQueue(maxsize=self.max_queue_size)

        n_batches, job_queue = self._start_producers(queue)

        # Run as consumer (read items from queue, in current thread)
        for x in xrange(n_batches):
            item = queue.get()
            #print queue.qsize(), "GET"
            yield item  # Yield the item to the consumer (user)
            queue.task_done()

        queue.close()
        job_queue.close()
Example #18
class Multiplexer(object):
    def __init__(self, worker, writer, threads=4):
        self.worker = worker
        self.writer = writer
        self.q = JoinableQueue()
        self.done = Value(c_bool, False)
        self.consumer = Process(target=self.consume)
        self.pool = Pool(threads, init_opener)

    def start(self):
        self.done.value = False
        self.consumer.start()

    def addjob(self, url, data=None):
        params = [url]
        if data: params.append(data)
        try:
            return self.pool.apply_async(self.worker,
                                         params,
                                         callback=self.q.put)
        except:
            logger.error('[!] failed to scrape ' + url)
            logger.error(traceback.format_exc())
            raise

    def finish(self):
        self.pool.close()
        logger.info('closed pool')
        self.pool.join()
        logger.info('joined pool')
        self.done.value = True
        self.q.close()
        logger.info('closed q')
        self.consumer.join()
        logger.info('joined consumer')
        #self.q.join()
        #logger.info('joined q')

    def consume(self):
        param = [0, 0]
        while True:
            job = None
            try:
                job = self.q.get(True, timeout=1)
            except Empty:
                if self.done.value == True: break
            if job:
                param = self.writer(job, param)
                self.q.task_done()
        logger.info('added/updated: %s' % param)
Example #19
class Mothership(object):

    """ Monitor of producer and consumers """

    def __init__(self, producer, consumers, graceful=False):
        self._queue = JoinableQueue()

        self._producer_proxy = ProducerProxy(self._queue, producer)
        self._consumer_pool = list(ConsumerProxy(self._queue, cons) for cons in consumers)
        self._graceful = graceful

    def start(self):
        try:
            """ Start working """
            logger.info('Starting Producers'.center(20, '='))
            self._producer_proxy.start()

            time.sleep(0.1)

            logger.info('Starting Consumers'.center(20, '='))
            for consumer in self._consumer_pool:
                consumer.start()

            self._producer_proxy.join()
            self._queue.join()
            for consumer in self._consumer_pool:
                consumer.join()

            self._queue.close()

        except KeyboardInterrupt:

            self._producer_proxy.stop()
            self._producer_proxy.join()

            if self._graceful:
                logger.info('Shutting Down gracefully...')
                self._queue.join()

            for consumer in self._consumer_pool:
                consumer.stop()
                consumer.join()

            self._queue.close()

    def __enter__(self):
        return self

    def __exit__(self, types, value, tb):
        return
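Note: ProducerProxy and ConsumerProxy are not shown here. Assuming they wrap ordinary producer and consumer objects, typical use of Mothership would look roughly like the sketch below (my_producer and my_consumers are hypothetical stand-ins):

with Mothership(my_producer, my_consumers, graceful=True) as mothership:
    # start() blocks until the producer finishes, the queue drains, and every consumer joins.
    mothership.start()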
Example #20
	def __iter__(self):
		queue = JoinableQueue(maxsize=self.max_queue_size)

		n_batches, job_queue = self._start_producers(queue)

		# Run as consumer (read items from queue, in current thread)
		for x in xrange(n_batches):
			item = queue.get()
			#print queue.qsize(), "GET"
			yield item # Yield the item to the consumer (user)
			queue.task_done()

		queue.close()
		job_queue.close()
Example #21
    def __iter__(self):
        queue = JoinableQueue(maxsize=params.N_PRODUCERS * 2)

        n_batches, job_queue = self.start_producers(queue)

        # Run as consumer (read items from queue, in current thread)
        for x in xrange(n_batches):
            item = queue.get()
            #print len(item[0]), queue.qsize(), "GET"
            yield item
            queue.task_done()

        #queue.join() #Lock until queue is fully done
        queue.close()
        job_queue.close()
Example #22
	def __iter__(self):
		queue = JoinableQueue(maxsize=params.N_PRODUCERS*2)

		n_batches, job_queue = self.start_producers(queue)

		# Run as consumer (read items from queue, in current thread)
		for x in xrange(n_batches):
			item = queue.get()
			#print len(item[0]), queue.qsize(), "GET"
			yield item
			queue.task_done()

		#queue.join() #Lock until queue is fully done
		queue.close()
		job_queue.close()
Example #23
def main():
    workers=cpu_count()
    line_queue=JoinableQueue(workers*2) # Keep at most 2*workers lines in flight
    input_file=open(sys.argv[1], 'rU')
    output_file=open(sys.argv[2], 'w')
    output_queue=Queue(workers*3)

    processes=[]
    for i in xrange(workers):
        this_process=Process(target=process_queue,
                             args=(line_queue, output_queue, LINES_AT_ONCE))
        this_process.start()
        processes.append(this_process)

    # Start the output processor
    output_processor=Process(target=retrieve_output, 
                             args=(output_queue, output_file, LINES_AT_ONCE))
    output_processor.start()

    small_queue=[]
    block_number=0
    for l in input_file:
        small_queue.append(l)
        if len(small_queue)>=LINES_AT_ONCE:
            line_queue.put((block_number, small_queue))
            block_number+=1
            small_queue=[]
    if len(small_queue)>0:
        line_queue.put((block_number, small_queue))
        
    for i in xrange(workers):
        line_queue.put('STOP')
    
    print "Waiting for all tasks to end."
    line_queue.close()
    line_queue.join()
    for p in processes:
        p.join()
    
    print "All tasks ended. Dumping the final output."
    output_queue.put(None)
    output_queue.close()
    output_processor.join()
    
    print "Done. Exiting."
    output_file.close()
    
    return
Example #24
class TaskControl:
    def __init__(self, cls_worker, count, *args, **kwargs):
        self.queue = JoinableQueue()
        self.stopped = Event()
        self.count_processed = Value('i', 0)

        self.processes = [cls_worker(self, *args) for _ in range(count)]
        map(Process.start, self.processes)

    def is_active(self):
        return not self.stopped.is_set()

    def is_alive(self):
        alive = filter(bool, map(Process.is_alive, self.processes))
        print '---- %d child processes are still alive' % len(alive)
        return alive

    def stop(self):
        self.stopped.set()
        self.queue.close()
        print '-- waiting for processes to finish'
        map(Process.join, self.processes)
        self.queue.cancel_join_thread()

    def send_chunk(self, items):
        map(self.queue.put, items)
        print '--- waiting for queue to complete'
        while self.get_stats()[1] and self.is_alive():
            time.sleep(1)

    def get(self):
        while self.is_active():
            try:
                yield self.queue.get(timeout=1)
            except Queue.Empty:
                pass

    def tick(self):
        self.queue.task_done()
        self.count_processed.value += 1
        if not self.count_processed.value % 20:
            print '%d items processed' % self.count_processed.value
        time.sleep(0.5)

    def get_stats(self):
        stats = self.count_processed.value, self.queue.qsize()
        print '--- %d items processed, %d queued' % stats
        return stats
Example #25
class Multiplexer(object):
    def __init__(self, worker, writer, threads=4):
        self.worker=worker
        self.writer=writer
        self.q=JoinableQueue()
        self.done = Value(c_bool,False)
        self.consumer=Process(target=self.consume)
        self.pool = Pool(threads, init_opener)

    def start(self):
        self.done.value=False
        self.consumer.start()

    def addjob(self, url, data=None):
        params=[url]
        if data: params.append(data)
        try:
           return self.pool.apply_async(self.worker,params,callback=self.q.put)
        except:
            logger.error('[!] failed to scrape '+ url)
            logger.error(traceback.format_exc())
            raise

    def finish(self):
        self.pool.close()
        logger.info('closed pool')
        self.pool.join()
        logger.info('joined pool')
        self.done.value=True
        self.q.close()
        logger.info('closed q')
        self.consumer.join()
        logger.info('joined consumer')
        #self.q.join()
        #logger.info('joined q')

    def consume(self):
        param=[0,0]
        while True:
            job=None
            try:
                job=self.q.get(True, timeout=1)
            except Empty:
                if self.done.value==True: break
            if job:
                param = self.writer(job, param)
                self.q.task_done()
        logger.info('added/updated: %s' % param)
Example #26
def qwork(command_file, nproc):
    """Queue up commands to run in parallel."""

    print("Queuing work using %d processes...\n" % nproc)
    queue = JoinableQueue()

    for command in command_file:
        queue.put(command.decode('utf8').rstrip('\n'))

    for ii in range(nproc):
        Runner(queue)

    queue.join()
    queue.close()

    print("\n...done!")
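Note: Runner is defined elsewhere. A sketch of the interface qwork() appears to rely on, assuming the constructor immediately starts a daemon process that keeps executing queued shell commands and marks each one done so queue.join() can return:

import subprocess
from multiprocessing import Process

def _run_commands(queue):
    # Keep executing queued shell commands; the daemon flag handles shutdown after qwork() returns.
    while True:
        command = queue.get()
        subprocess.call(command, shell=True)  # run one queued command
        queue.task_done()                     # lets queue.join() return once every command has run

class Runner:
    def __init__(self, queue):
        self._process = Process(target=_run_commands, args=(queue,), daemon=True)
        self._process.start()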
Example #27
def main():
    processes = cpu_count() * 2
    queue = JoinableQueue()
    get_links(queue)
    create_folder()

    for i in range(processes):
        # Process.start() returns None; it just launches the worker process.
        p = Process(target=save_image, args=(queue,))
        p.start()

    for i in range(processes):
        queue.put(None) ## Tell the processes to end

    queue.join()
    queue.close()
Example #28
	def insert_files(self, out,cfg, producer,return_dict, skip_header=0, rec_delim=os.linesep):
		self.opt.skip_header = skip_header
		self.opt.rec_delim = rec_delim
		log = logging.getLogger('cli')
		self.scfg, self.tcfg = cfg
		file_object_cache = FileObjectCache()
		start = time.time()
		
		stat_queue = JoinableQueue()
		
		if 1:
			put_queue = JoinableQueue(1024 * self.opt.processes)
			

			
		if 1:
			put = {'update': self.put_update}[self.opt.put]
			putter_processes = list(islice(repeatedly(Process, target=self.putter, args=(put, put_queue, stat_queue, return_dict)), self.opt.processes))
			for putter_process in putter_processes:
				putter_process.start()
		if 1:
			statter_process = Process(target=self.statter, args=(stat_queue, start))
			statter_process.start()
			
		out_names=[]
		#walk = {'filesystem': self.walk_filesystem}[self.opt.walk]
		for file in producer[0](*producer[1]):
			out_names.append(file)
			put_queue.put(file)	
			#time.sleep(3)
		out.dump_files=out_names

		for putter_process in putter_processes:
			put_queue.put(None)
		put_queue.close()
		for putter_process in putter_processes:
			putter_process.join()
			
		stat_queue.put(None)
		stat_queue.close()
		statter_process.join()
		put_queue.join_thread()
		stat_queue.join_thread()
		print 77777, counter.value()
		print 77777, self.total_ins
		print 7777, (return_dict.values())
Example #29
def clear_area_around_eye(size = 256, image_dir = 'I:/AI_for_an_eyes/test/test/', target_dir = 'I:/AI_for_an_eyes/test/test_zonder_meuk_256/'):
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    util.update_progress(0)


    tasks = glob.glob(image_dir+'*.jpeg')
    job_total = len(tasks)

    print 'Processing images matching ' + image_dir+ '*.jpeg'

    jobs = Queue()
    result = JoinableQueue()
    NUMBER_OF_PROCESSES = cpu_count()*2

    for im_name in tasks:
        jobs.put(im_name)

    for i in xrange(NUMBER_OF_PROCESSES):
        p = Thread(target=worker, args=(i, jobs, result, target_dir, size))
        p.daemon = True
        p.start()

    print 'Starting workers (', NUMBER_OF_PROCESSES, ')!'

    n_complete = 0
    for t in xrange(len(tasks)):
        r = result.get()
        n_complete += 1
        util.update_progress(n_complete/job_total)
        result.task_done()
        #print t, 'done'

    for w in xrange(NUMBER_OF_PROCESSES):
        jobs.put(None)

    util.update_progress(1)

    print 'Done!'
    time.sleep(1)
    result.join()
    jobs.close()
    result.close()
Example #30
def process_task(num_workers):
    logging.info("Started")

    task_queue = JoinableQueue()
    done_queue = Queue()

    def worker(name):
        """
        represents an 'expensive' task
        """
        logging.info("Started process : %s" % name)
        for task in iter(task_queue.get, 'Stop'):
            done_queue.put(task)
            time.sleep(1)
            task_queue.task_done()
        # This is for the poison pill task
        task_queue.task_done()
        logging.info("Done process : %s" % name)

    # First we start the workers, and give them a list that we can look at after
    for i in range(num_workers):
        Process(target=worker, args=("P-%s" % (i+1), )).start()

    # Now the main thread populates the Queue
    num_tasks = num_workers * 5
    for i in range(num_tasks):
        task_queue.put(i)

    # Now, administer the poison pill which tells processes that we are done populating the Q
    for i in range(num_workers):
        task_queue.put('Stop')

    # Now wait for workers to finish their work
    task_queue.close()
    task_queue.join()

    logging.info("Workers are done")

    # Now verify that all tasks are done by seeing them in the done queue
    done_queue.put('Stop')
    done_tasks = [task for task in iter(done_queue.get, 'Stop')]
    assert len(done_tasks) == num_tasks
    logging.info("Verified work - done!")
Example #31
    def __iter__(self):
        queue = JoinableQueue(maxsize=params.N_PRODUCERS * 8)

        n_batches, job_queue = self.start_producers(queue)

        # Run as consumer (read items from queue, in current thread)
        for x in xrange(n_batches):
            item = queue.get()
            yield item
            queue.task_done()

        queue.close()
        job_queue.close()
        if self.shuffle:
            shuffled_idx = np.random.permutation(len(self.X))
            X_new = []
            for i in range(len(self.X)):
                X_new += [self.X[shuffled_idx[i]]]
            self.X = X_new
Example #32
    def _process_test_q(self, source: JoinableQueue,
                        local_test_q: JoinableQueue):
        count = 0
        while count < self._num_processes:
            test_batch = source.get()
            source.task_done()
            if test_batch is not None \
                    and len(test_batch.test_ids) >1 \
                    and test_batch.restriction == TestExecutionConstraint.SINGLE_NODE:
                for test_id in test_batch.test_ids:
                    local_test_q.put(TestBatch([test_id]))
                    local_test_q.join()
            else:
                local_test_q.put(test_batch)
                local_test_q.join()
            if test_batch is None:
                count += 1

        local_test_q.close()
Example #33
	def insert_files(self, file_names, out,cfg, skip_header=0, rec_delim=os.linesep):
		self.opt.skip_header = skip_header
		self.opt.rec_delim = rec_delim
		log = logging.getLogger('cli')
		self.scfg, self.tcfg = cfg
		file_object_cache = FileObjectCache()
		start = time.time()
		if 1:
			put_queue = JoinableQueue(1024 * self.opt.processes)
			stat_queue = JoinableQueue()
			#walk = {'filesystem': self.walk_filesystem}[self.opt.walk]
			for file in file_names.file_names:
				put_queue.put(file)	
			
		if 1:
			put = {'update': self.put_update}[self.opt.put]
			putter_processes = list(islice(repeatedly(Process, target=self.putter, args=(put, put_queue, stat_queue)), self.opt.processes))
			for putter_process in putter_processes:
				putter_process.start()
		if 1:
			statter_process = Process(target=self.statter, args=(stat_queue, start))
			statter_process.start()
		
		for putter_process in putter_processes:
			put_queue.put(None)
		put_queue.close()
		for putter_process in putter_processes:
			putter_process.join()
			
		stat_queue.put(None)
		stat_queue.close()
		statter_process.join()
		put_queue.join_thread()
		stat_queue.join_thread()
	
		#print(3334,file_names.file_names )
		#e()
		out.file_names = ['%s.gz' % os.path.basename(x[0]) for x in file_names.file_names]
		#pp(out.file_names)
		#e()
		out.file_keys = ['%s.gz' % x[0] for x in file_names.file_names]
		out.file_location = os.path.dirname(file_names.file_names[0][0])
Example #34
def readCEFFile(afile):
    if exists(afile):  #sometimes files can move/archive while we iterate the list
        try:
            #start a process to post our stuff.
            logcache = JoinableQueue()
            postingProcess = Process(target=postLogs,
                                     args=(logcache, ),
                                     name="cef2mozdefHTTPPost")
            postingProcess.start()
            #tail a file to feed us lines
            #yielding a line on newline, buffering input in between
            fh = os.open(afile, os.O_RDONLY | os.O_NONBLOCK)
            os.lseek(fh, 0, os.SEEK_END)
            bufa = Buffer()
            bufb = Buffer()
            while True:
                time.sleep(0.001)  # Wait a little
                bufa.append(nonBlockRead(fh))
                if '\n' in ''.join(bufa):  #new line/end of log is found
                    for line in ''.join(bufa).splitlines(True):
                        if '\n' in line:
                            cefDict = parseCEF(line.strip())
                            #logger.debug(json.dumps(cefDict))
                            #append json to the list for posting
                            if cefDict is not None:
                                logcache.put(json.dumps(cefDict))
                        else:
                            bufb.append(line)
                    bufa.clear()
                    bufa.append(''.join(bufb))
                    bufb.clear()
            logger.info('{0} done'.format(afile))
            logger.info('waiting for posting to finish')
            logcache.put(None)
            logcache.close()
            #logger.info('posting done')
        except KeyboardInterrupt:
            sys.exit(1)
        except ValueError as e:
            logger.fatal('Exception while handling CEF message: %r' % e)
            sys.exit(1)
Example #35
def hist_eq(image_dir='test_hist/',
            target_dir='test_result_hist/',
            method='CLAHE'):

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    tasks = glob.glob(image_dir + '*.jpeg')
    job_total = len(tasks)

    print 'Processing images matching ' + image_dir + '*.jpeg'

    jobs = Queue()
    result = JoinableQueue()
    NUMBER_OF_PROCESSES = cpu_count() * 2

    for im_name in tasks:
        jobs.put(im_name)

    for i in xrange(NUMBER_OF_PROCESSES):
        p = Thread(target=worker, args=(i, jobs, result, target_dir, method))
        p.daemon = True
        p.start()

    print 'Starting workers (', NUMBER_OF_PROCESSES, ')!'

    n_complete = 0
    for t in xrange(len(tasks)):
        r = result.get()
        n_complete += 1
        util.update_progress(n_complete / job_total)
        result.task_done()
        #print t, 'done'

    for w in xrange(NUMBER_OF_PROCESSES):
        jobs.put(None)

    print 'Done!'
    result.join()
    jobs.close()
    result.close()
Example #36
def __run_chm_test_procs(mems, model, regions, ntasks, nthreads):
    """Starts ntasks processes running __run_chm_test_proc then calls __run_chm_test_parallel."""
    from multiprocessing import JoinableQueue, Process
    from time import sleep
    print("Running CHM test with %d task%s and %d thread%s per task" %
          (ntasks, 's' if ntasks > 1 else '', nthreads,
           's' if nthreads > 1 else ''))
    nthreads_full = ntasks * nthreads

    # Start the child processes
    q = JoinableQueue()
    args = (mems, model, nthreads, q)
    processes = [
        Process(target=__run_chm_test_proc, name="CHM-test-%d" % p, args=args)
        for p in xrange(ntasks)
    ]
    for p in processes:
        p.daemon = True
        p.start()
    sleep(0)

    # Run the CHM-test in parallel
    try:
        out = __run_chm_test_parallel(mems, model, regions, q, processes,
                                      nthreads_full)
    except:
        __clear_queue(q)
        __kill_processes(processes)
        raise

    # Tell all processes we are done and make sure they all actually terminate
    for _ in xrange(ntasks):
        q.put_nowait(None)
    q.close()
    q.join()
    q.join_thread()
    for p in processes:
        p.join()

    # Done! Return the output image
    return out
Example #37
def parallel_for(a, cls, args=[], kwargs={}, num_processes=None):
    from multiprocessing import Process, JoinableQueue, cpu_count, Pipe
    if num_processes is None:
        num_processes = cpu_count()
    # Note that JoinableQueue uses an integer for tracking locations in the queue.
    # Because it's using shared memory it's not terribly flexible and gives annoyingly
    # unclear errors if you go over the limit. We'd like the queue to be as large as
    # possible so that we can avoid contention, but without allocating a max possible
    # size queue unless we need it, thus the calculation below. 32767 is a hard limit.
    q = JoinableQueue(maxsize=min(len(a)+num_processes, 2**15 - 1))

    output_pipes = [Pipe(duplex=False) for _ in range(num_processes)]
    send_pipes = [p for _, p in output_pipes]
    recv_pipes = [p for p, _ in output_pipes]
    pool = [Process(target=_parallel_for, args=(q, cls, pipe) + tuple(args), kwargs=kwargs)
            for pipe in send_pipes]
    output_watcher = MultiPipeWatcher(recv_pipes)
    try:
        for p in pool:
            p.start()
        output_watcher.start()
        for x in a:
            q.put(x)
        for _ in range(num_processes):
            q.put(None) # End markers
        q.close()
        q.join_thread()
        q.join()
        for p in pool:
            p.join()
        output_watcher.flush()
        output_watcher.join()
        combined_output = output_watcher.merged
        return combined_output
    except KeyboardInterrupt:
        print "Interrupted -- terminating worker processes"
        for p in pool:
            p.terminate()
        for p in pool:
            p.join()
        raise
Example #38
def parallel_for(a, cls, args=[], kwargs={}, num_processes=None):
    from multiprocessing import Process, JoinableQueue, cpu_count, Pipe
    if num_processes is None:
        num_processes = cpu_count()
    # Note that JoinableQueue uses an integer for tracking locations in the queue.
    # Because it's using shared memory it's not terribly flexible and gives annoyingly
    # unclear errors if you go over the limit. We'd like the queue to be as large as
    # possible so that we can avoid contention, but without allocating a max possible
    # size queue unless we need it, thus the calculation below. 32767 is a hard limit.
    q = JoinableQueue(maxsize=min(len(a)+num_processes, 2**15 - 1))

    output_pipes = [Pipe(duplex=False) for _ in range(num_processes)]
    send_pipes = [p for _, p in output_pipes]
    recv_pipes = [p for p, _ in output_pipes]
    pool = [Process(target=_parallel_for, args=(q, cls, pipe) + tuple(args), kwargs=kwargs)
            for pipe in send_pipes]
    output_watcher = MultiPipeWatcher(recv_pipes)
    try:
        for p in pool:
            p.start()
        output_watcher.start()
        for x in a:
            q.put(x)
        for _ in range(num_processes):
            q.put(None) # End markers
        q.close()
        q.join_thread()
        q.join()
        for p in pool:
            p.join()
        output_watcher.flush()
        output_watcher.join()
        combined_output = output_watcher.merged
        return combined_output
    except KeyboardInterrupt:
        print("Interrupted -- terminating worker processes")
        for p in pool:
            p.terminate()
        for p in pool:
            p.join()
        raise
Example #39
def hist_eq(image_dir = 'test_hist/', target_dir = 'test_result_hist/', method = 'CLAHE'):

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)


    tasks = glob.glob(image_dir+'*.jpeg')
    job_total = len(tasks)

    print 'Processing images matching ' + image_dir+ '*.jpeg'

    jobs = Queue()
    result = JoinableQueue()
    NUMBER_OF_PROCESSES = cpu_count()*2

    for im_name in tasks:
        jobs.put(im_name)

    for i in xrange(NUMBER_OF_PROCESSES):
        p = Thread(target=worker, args=(i, jobs, result, target_dir, method))
        p.daemon = True
        p.start()

    print 'Starting workers (', NUMBER_OF_PROCESSES, ')!'

    n_complete = 0
    for t in xrange(len(tasks)):
        r = result.get()
        n_complete += 1
        util.update_progress(n_complete/job_total)
        result.task_done()
        #print t, 'done'

    for w in xrange(NUMBER_OF_PROCESSES):
        jobs.put(None)

    print 'Done!'
    result.join()
    jobs.close()
    result.close()
Example #40
def crunch(file_name,
           ext_type,
           handler,
           pool_size=4,
           queue_size=40,
           limit=None):

    print('Crunching file: %s, limit: %s' % (file_name, limit))

    q = JoinableQueue(queue_size)
    q_feats = Queue()

    pool = Pool(pool_size, wrap_handler(handler), ((q, q_feats), ))

    with file_reader(file_name) as reader:
        idx = 0
        for entry in reader:

            if (entry.pathname.find(ext_type) != -1):
                text = [b for b in entry.get_blocks()]
                key = entry.pathname.split('/')[-1].split('.')[0]

                q.put((key, text), True)
                idx += 1

                print('Processing:', entry.pathname, idx)

                if limit and idx >= limit:
                    print('Reached the limit')
                    break

        q.close()
        q.join()
        pool.close()

    result = []
    for i in range(q_feats.qsize()):
        result.append(q_feats.get())
    return result
Example #41
def readCEFFile(afile):
    if exists(afile): #sometimes files can move/archive while we iterate the list
        try:
            #start a process to post our stuff.
            logcache=JoinableQueue()
            postingProcess=Process(target=postLogs,args=(logcache,),name="cef2mozdefHTTPPost")
            postingProcess.start()
            #tail a file to feed us lines
            #yielding a line on newline, buffering input in between
            fh = os.open(afile, os.O_RDONLY | os.O_NONBLOCK)
            os.lseek(fh, 0, os.SEEK_END)
            bufa=Buffer()
            bufb=Buffer()
            while True:
                time.sleep(0.001) # Wait a little
                bufa.append(nonBlockRead(fh))
                if '\n' in ''.join(bufa):  #new line/end of log is found
                    for line in ''.join(bufa).splitlines(True):
                        if '\n' in line:
                            cefDict=parseCEF(line.strip())
                            #logger.debug(json.dumps(cefDict))
                            #append json to the list for posting
                            if cefDict is not None:
                                logcache.put(json.dumps(cefDict)) 
                        else:
                            bufb.append(line)
                    bufa.clear()
                    bufa.append(''.join(bufb))
                    bufb.clear()
            logger.info('{0} done'.format(afile))
            logger.info('waiting for posting to finish')
            logcache.put(None)
            logcache.close()
            #logger.info('posting done')
        except KeyboardInterrupt:
            sys.exit(1)
        except ValueError as e:
            logger.fatal('Exception while handling CEF message: %r'%e)
            sys.exit(1)    
Example #42
    def __iter__(self):
        queue = JoinableQueue(maxsize=params.N_PRODUCERS * 8)

        n_batches, job_queue = self.start_producers(queue)

        # Run as consumer (read items from queue, in current thread)
        for x in xrange(n_batches):
            item = queue.get()
            yield item
            queue.task_done()

        queue.close()
        job_queue.close()
        if self.shuffle:
            shuffled_idx = np.random.permutation(len(self.X))
            X_new = []
            y_new = []
            for i in range(len(self.X)):
                X_new += [self.X[shuffled_idx[i]]]
                y_new += [self.y[shuffled_idx[i]]]
            self.X = X_new
            self.y = y_new
Example #43
def main():
	print("______POPULATE FEATURE NAMES START__")
	populateFeatureNames(trainFile)
	print("___POPULATE FEATURE NAMES ENDS__")
	
	################
	q = JoinableQueue(20)
	q_feats = Queue()
	pool = Pool(6,populateFeatures, ((q, q_feats),))	
	returnedList=[]
	print("__onlyfile population starts__")
	onlyfiles = [ f for f in os.listdir(path) if ".asm" in f]
	print("___FEATURE EXTRACTION STARTS FOR PATH__")
	for ffile in onlyfiles:
		q.put((ffile,path),True)
	q.close()
	#time.sleep(100)
	q.join()
	#time.sleep(100)
	pool.close()
	
	for i in range(q_feats.qsize()):
		returnedList.append(q_feats.get())
	#returnedList=p.map(functools.partial(populateFeatures, filePath=path), onlyfiles)
	#time.sleep(10)
	#p.close()
	#time.sleep(100)
	#p.join()
	#time.sleep(10)
	print("___ PROCESSING OUTPUT OF MAP FUNCTION FOR FEATURE_EXTRACTION STARTS___")
	#except:
	#	print("Something went wrong")
	generateHeader()
	generateFeatures(returnedList)
	
	print("_____ PROCESSING OUTPUT OF MAP FUNCTION FOR FEATURE_EXTRACTION ENDS____")
	print("_____FEATURE EXTRACTION ENDS____")
Example #44
def parexec(signal, out, num_consumers, iterator):
    t = time.time()
    tasks = JoinableQueue()
    results = Queue()
    print 'starting consumers'
    consumers = [Consumer(tasks, results, [signal]) for _ in range(num_consumers)]
    for w in consumers:
        w.start()
    print 'adding tasks'
    for i in iterator:
        tasks.put(Task(i, signal))
    for i in range(num_consumers):
        tasks.put(None)
    print 'collecting'
    for _ in range(len(iterator)):
        out.append(results.get())
        if _%100000 == 0:
            print _
    tasks.close()
    tasks.join_thread()
    print 'closing'
    for w in consumers:
        w.join()
    print time.time() - t
Example #45
def annotate_gtf_parallel(input_gtf_file,
                          output_gtf_file, 
                          gtf_sample_attr, 
                          num_processors, 
                          tmp_dir):
    # create queue
    input_queue = JoinableQueue(maxsize=num_processors*3)
    # start worker processes
    procs = []
    worker_gtf_files = []
    for i in xrange(num_processors):
        worker_gtf_file = os.path.join(tmp_dir, "annotate_worker%03d.gtf" % (i))
        worker_gtf_files.append(worker_gtf_file)
        args = (input_queue, worker_gtf_file, gtf_sample_attr)
        p = Process(target=annotate_gtf_worker, args=args)
        p.daemon = True
        p.start()
        procs.append(p)
    for lines in parse_loci(open(input_gtf_file)):
        input_queue.put(lines)
    # stop workers
    for p in procs:
        input_queue.put([])
    # close queue
    input_queue.join()
    input_queue.close()
    # join worker processes
    for p in procs:
        p.join()
    # merge/sort worker gtf files
    logging.debug("Merging %d worker GTF file(s)" % (num_processors))
    merge_sort_gtf_files(worker_gtf_files, output_gtf_file, tmp_dir=tmp_dir)
    # remove worker gtf files
    for filename in worker_gtf_files:
        if os.path.exists(filename):
            os.remove(filename)
Example #46
def main():
    """Main loop"""
    start_time = time.time()
    motd()
    args = get_args()
    print 'Started: ', time.strftime("%a %b %d, %Y  %H:%M:%S", time.localtime(start_time))
    # build our configuration object w/ input params
    conf = ConfigParser.ConfigParser()
    conf.read(args.config)
    params = Parameters(conf)
    # create the db and tables, returning connection
    # and cursor
    if params.db.create:
        conn, cur = db.create_db_and_new_tables(params.db.name)
    # alert user we're dropping reads
    print "[WARN] Dropping all demultiplexed reads ≤ {0} bp long".format(params.quality.drop_len)
    # get num reads and split up work
    print "Splitting reads into work units..."
    num_reads, work = get_work(params, args.job_size)
    print "There are {:,d} reads".format(num_reads)
    # give some indication of progress for longer runs
    if num_reads > 999:
        sys.stdout.write('Running...\n')
    #pdb.set_trace()
    #r1out = open('r1-reads.fasta', 'w', 1)
    #r2out = open('r2-reads.fasta', 'w', 1)
    # MULTICORE
    if params.parallelism.cores > 1:
        jobs = Queue()
        results = JoinableQueue()
        # We're stacking groups of jobs on the work
        # Queue, conceivably to save the overhead of
        # placing them on there one-by-one.
        print "Adding jobs to work queue..."
        for unit in work:
            jobs.put(unit)
        print "There are {} jobs...".format(num_reads / args.job_size)
        # setup the processes for the jobs
        print "Starting {} workers...".format(params.parallelism.cores)
        # start the worker processes
        [Process(target=multiproc, args=(jobs, results, params)).start()
            for i in xrange(params.parallelism.cores)]
        # we're putting single results on the results Queue so
        # that the db can (in theory) consume them at
        # a rather consistent rate rather than in spurts
        #for unit in xrange(num_reads):
        count = 0
        for unit in xrange(num_reads):
            dmux = results.get()
            rowid = db.insert_record_to_db(cur, dmux)
            write_results_out(cur, rowid, params, dmux)
            results.task_done()
            progress(rowid, 10000, 100000)
            count += 1
        # make sure we put None at end of Queue
        # in an amount equiv. to num_procs
        for unit in xrange(params.parallelism.cores):
            jobs.put(None)
        # join the results, so that they can finish
        results.join()
        # close up our queues
        jobs.close()
        results.close()

    # SINGLECORE
    else:
        # fake a multiprocessing queue, so stacking and accessing results
        # is identical.
        fake_queue = ListQueue()
        results = singleproc(work, fake_queue, params)
        count = 0
        for dmux in results:
            rowid = db.insert_record_to_db(cur, dmux)
            write_results_out(cur, rowid, params, dmux)
            progress(rowid, 10000, 100000)
            count += 1
    params.storage.close()
    conn.commit()
    cur.close()
    conn.close()
    end_time = time.time()
    pretty_end_time = time.strftime("%a %b %d, %Y  %H:%M:%S", time.localtime(end_time))
    print "\nEnded: {} (run time {} minutes)".format(pretty_end_time,
            round((end_time - start_time)/60, 3))
Example #47
def run_parallel(config):
    """
    runs assembly in parallel and merges output from child processes 

    config: RunConfig object
    """
    # create temp directory
    tmp_dir = os.path.join(config.output_dir, "tmp")
    if not os.path.exists(tmp_dir):
        logging.debug("Creating tmp directory '%s'" % (tmp_dir))
        os.makedirs(tmp_dir)
    # create queue
    input_queue = JoinableQueue(maxsize=config.num_processors*3)
    # shared memory values
    locus_id_value_obj = LockValue(1)
    gene_id_value_obj = LockValue(1)
    tss_id_value_obj = LockValue(1)
    t_id_value_obj = LockValue(1)
    # start worker processes
    procs = []
    worker_prefixes = []
    for i in xrange(config.num_processors):
        worker_prefix = os.path.join(tmp_dir, "worker%03d" % (i))
        worker_prefixes.append(worker_prefix)
        args = (input_queue, 
                locus_id_value_obj,
                gene_id_value_obj,
                tss_id_value_obj,
                t_id_value_obj,
                worker_prefix,
                config)
        p = Process(target=assembly_worker, args=args)
        p.daemon = True
        p.start()
        procs.append(p)
    # parse gtf file                
    for lines in parse_loci(open(config.gtf_input_file)):
        input_queue.put(lines)
    # stop workers
    for p in procs:
        input_queue.put([])
    # close queue
    input_queue.join()
    input_queue.close()
    # join worker processes
    for p in procs:
        p.join()
    # merge gtf files
    if config.create_gtf:
        logging.info("Merging %d worker GTF files" % 
                     (config.num_processors))
        worker_gtf_files = [prefix + ".gtf" for prefix in worker_prefixes]
        output_gtf_file = os.path.join(config.output_dir, "assembly.gtf")
        merge_sort_gtf_files(worker_gtf_files, output_gtf_file, 
                             tmp_dir=tmp_dir)
        # remove worker gtf files
        for filename in worker_gtf_files:
            if os.path.exists(filename):
                os.remove(filename)
    # merge bed files
    if config.create_bed:
        logging.info("Merging %d worker BED files" % 
                     (config.num_processors))
        worker_bed_files = [p + ".bed" for p in worker_prefixes]
        output_bed_file = os.path.join(config.output_dir, "assembly.bed")
        merge_sort_files(worker_bed_files, output_bed_file, 
                         sort_func=sort_bed, 
                         tmp_dir=tmp_dir)
        # write bed file track description line
        track_name = os.path.basename(config.output_dir)
        track_line = ' '.join(['track name="%s"' % (track_name),
                               'description="%s"' % (track_name),
                               'visibility=pack',
                               'useScore=1'])
        track_file = os.path.join(config.output_dir, 
                                  "assembly.bed.ucsc_track")
        fileh = open(track_file, "w")
        print >>fileh, track_line
        fileh.close()
    # merge bedgraph files
    if config.create_bedgraph:
        logging.info("Merging %d worker bedGraph files" % 
                     (config.num_processors))
        for strand in xrange(0,3):
            strand_name = STRAND_NAMES[strand]
            bgfiles = ['%s_%s.bedgraph' % (p, strand_name)
                       for p in worker_prefixes]
            output_file = os.path.join(config.output_dir, 
                                       "assembly_%s.bedgraph" % strand_name)
            merge_sort_files(bgfiles, output_file, 
                             sort_func=sort_bed, 
                             tmp_dir=tmp_dir)
            track_name = '%s_%s' % (os.path.basename(config.output_dir), 
                                    strand_name)
            track_line = ' '.join(['track type=bedGraph',
                                   'name="%s"' % (track_name),
                                   'description="%s"' % (track_name),
                                   'visibility=full',
                                   'color=%s' % (STRAND_COLORS[strand]),
                                   'autoScale=on',
                                   'alwaysZero=on',
                                   'maxHeightPixels=64:64:11'])
            track_file = os.path.join(config.output_dir, 
                                      "assembly_%s.bedgraph.ucsc_track" % strand_name)
            fileh = open(track_file, "w")
            print >>fileh, track_line
            fileh.close()
    # cleanup
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    logging.info("Done")
    return 0
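
# `assembly_worker` itself is not shown here.  run_parallel signals shutdown
# by putting one empty list per worker, so the worker's main loop presumably
# has the shape sketched below (the body is a placeholder, not the real
# assembler; only the sentinel handling matters):
def assembly_worker_sketch(input_queue, worker_prefix):
    with open(worker_prefix + '.gtf', 'w') as fileh:
        while True:
            lines = input_queue.get()
            if len(lines) == 0:         # empty list is the stop sentinel
                input_queue.task_done()
                break
            fileh.writelines(lines)     # placeholder for the real locus work
            input_queue.task_done()
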
Example #48
0
def assemble_parallel(args, results, num_samples):
    '''
    args: object containing parameters to configure the assembly process
    results: Results object containing input and output filenames
    num_samples: number of samples in assembly

    Args
    ====
    - guided_strand
    - guided_ends
    - guided_assembly
    - change_point
    - change_point_pvalue
    - change_point_fold_change
    - change_point_trim
    - path_graph_kmax
    - path_frac
    - max_paths
    - isoform_frac
    - max_isoforms
    - assemble_unstranded

    Results
    =======
    Input file attributes:
    - locus_index_file
    - transfrags_bed_file

    Output file attributes:
    - bedgraph_files
    - splice_bed_file
    - splice_graph_gtf_file
    - change_point_gtf_file
    - path_graph_stats_file
    - assembly_gtf_file
    - assembly_bed_file
    '''
    logging.info('Assembling in parallel using %d processes' %
                 (args.num_processes))
    # create queue
    input_queue = JoinableQueue(maxsize=args.num_processes * 2)
    bed_file = results.transfrags_bed_file
    global_ids = GlobalIds()
    # start worker processes
    procs = []
    worker_results = []
    for i in xrange(args.num_processes):
        worker_id = 'worker%03d' % i
        worker_dir = os.path.join(results.tmp_dir, worker_id)
        if not os.path.exists(worker_dir):
            logging.debug("\tcreating worker directory '%s'" % (worker_dir))
            os.makedirs(worker_dir)
        worker_results.append(Results(worker_dir))
        worker_state = WorkerState(bed_file, input_queue, global_ids,
                                   args, num_samples, worker_dir)
        p = Process(target=assemble_worker, args=(worker_state,))
        p.start()
        procs.append(p)
    # parse locus file
    for locus in parse_locus_index(results.locus_index_file):
        input_queue.put(locus)
    for p in procs:
        input_queue.put(None)
    # close input queue
    input_queue.join()
    input_queue.close()
    # join worker processes
    for p in procs:
        p.join()

    # merge output files
    def merge(input_files, output_file, key, header=None):
        fhs = [open(f, 'rb', 64*1024) for f in input_files]
        with open(output_file, 'wb', 64*1024) as output:
            if header is not None:
                output.write(header)
            iterator = batch_merge(key, *fhs)
            output.writelines(iterator)
        for fh in fhs:
            fh.close()

    logging.info('Merging output files')
    logging.debug('\tmerging bedgraph files')
    for i, output_file in enumerate(results.bedgraph_files):
        input_files = [r.bedgraph_files[i] for r in worker_results]
        merge(input_files, output_file, sort_key_bed)
    logging.debug('\tmerging splice bed file')
    header = ('track name=junctions description="Splice Junctions" '
              'graphType=junctions\n')
    merge(input_files=[r.splice_bed_file for r in worker_results],
          output_file=results.splice_bed_file,
          key=sort_key_bed,
          header=header)
    logging.debug('\tmerging splice graph gtf file')
    merge(input_files=[r.splice_graph_gtf_file for r in worker_results],
          output_file=results.splice_graph_gtf_file,
          key=sort_key_gtf)
    logging.debug('\tmerging change point gtf file')
    merge(input_files=[r.change_point_gtf_file for r in worker_results],
          output_file=results.change_point_gtf_file,
          key=sort_key_gtf)
    logging.debug('\tmerging path graph stats file')
    header = ['chrom', 'start', 'end', 'strand', 'k', 'kmax', 'transfrags',
              'short_transfrags', 'short_expr', 'lost_short', 'lost_short_expr',
              'kmers', 'lost_kmers', 'tot_expr', 'graph_expr', 'expr_frac',
              'valid', 'opt', 'is_opt\n']
    header = '\t'.join(header)
    merge(input_files=[r.path_graph_stats_file for r in worker_results],
          output_file=results.path_graph_stats_file,
          key=sort_key_bed,
          header=header)
    logging.debug('\tmerging assembly bed file')
    merge(input_files=[r.assembly_bed_file for r in worker_results],
          output_file=results.assembly_bed_file,
          key=sort_key_bed)
    logging.debug('\tmerging assembly gtf file')
    merge(input_files=[r.assembly_gtf_file for r in worker_results],
          output_file=results.assembly_gtf_file,
          key=sort_key_gtf)
    # cleanup worker data
    logging.info('Removing temporary files')
    def shutil_error_callback(func, path, excinfo):
        logging.error('Error removing tmp files path=%s message=%s' %
                      (path, excinfo))
    for r in worker_results:
        shutil.rmtree(r.output_dir, onerror=shutil_error_callback)
    logging.info('Done')
    return 0
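
# `batch_merge` above is a project helper.  For pre-sorted worker files, the
# same k-way merge can be written with the standard library (a sketch; the
# key= argument of heapq.merge requires Python 3.5+):
import heapq


def merge_sorted_files(input_files, output_file, key, header=None):
    fhs = [open(f, 'r') for f in input_files]
    try:
        with open(output_file, 'w') as out:
            if header is not None:
                out.write(header)
            # each file handle is an iterator over already-sorted lines
            out.writelines(heapq.merge(*fhs, key=key))
    finally:
        for fh in fhs:
            fh.close()
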
# Reconstructed head (an assumption): the original snippet starts mid-worker.
def get_features(args):
    q_in, q_out = args              # queues passed via the Pool initializer below
    while True:
        name, lines = q_in.get()
        counts = count_bytes(lines)
        q_out.put([name, counts])
        q_in.task_done()

q = JoinableQueue(20)
q_feats = Queue()

pool = Pool(6, get_features, ((q, q_feats),))


with libarchive.public.file_reader(TRAIN_PATH) as archive:
    for entry in archive:

        # Use only .bytes
        if (entry.pathname.find('.bytes') != -1):
            text = []
            for b in entry.get_blocks():
                text.append(b)

            q.put((entry.pathname, text), True)

    q.close()
    q.join()
    pool.close()

# Now you can collect the computed features into a list like this:
feats = []
for i in range(q_feats.qsize()):
    feats.append(q_feats.get())
Example #51
0
    def parallelbuild(self, numworkers):
        class Worker(Process):
            def __init__(self, task_queue, results, chromosomes, reference,
                         lengths):
                Process.__init__(self)
                self.task_queue = task_queue
                self.result_queue = results
                self.chromosomes = chromosomes
                self.reference = reference
                self.lengths = lengths

            def run(self):
                while True:
                    next_task = self.task_queue.get()
                    if next_task is None:
                        # Poison pill means shutdown
                        self.task_queue.task_done()
                        break
                    self.buildHaplotype(next_task)
                    self.task_queue.task_done()
                    self.result_queue.put(next_task[-1])
                return

            def buildChromosome(self, sequence, haplotype, reference):
                referencesequence = {
                    bi: sequence[bi[0]:bi[1]]
                    for bi in reference
                }
                result = ''
                for bi in haplotype:
                    result += ''.join(referencesequence[bi])
                return result

            def buildHaplotype(self, task):
                originalfa, haplotypes, haplengths, output = task
                with open(originalfa, 'r') as fa:
                    with open(output, 'w') as out:
                        name = ''
                        sequence = []
                        for line in fa:
                            if line != '':
                                if line[0] == '>':
                                    if (name != '' and name in self.chromosomes
                                            and haplengths[name] > 0):
                                        out.write(">{}\n".format(name))
                                        assert len(sequence) == self.lengths[name]
                                        out.write("{}\n".format(
                                            self.buildChromosome(
                                                sequence, haplotypes[name],
                                                self.reference[name])))
                                    name = line.strip()[1:]
                                    sequence = []
                                else:
                                    if name in self.chromosomes and haplengths[name] > 0:
                                        sequence += list(line.strip())
                        if len(sequence) > 0:
                            if name != '' and name in self.chromosomes:
                                out.write(">{}\n".format(name))
                                assert (len(sequence) == self.lengths[name])
                                out.write("{}\n".format(
                                    self.buildChromosome(
                                        sequence, haplotypes[name],
                                        self.reference[name])))

        # Establish communication queues
        tasks = JoinableQueue()
        results = Queue()

        # Enqueue jobs
        c = copy.deepcopy
        jobs_count = 0
        for clone in self.tumor.clones:
            originalfa = c(clone.humanGenome.maternalfa)
            haplotypes = {
                c(chro): c(clone.genome[chro].maternalHaplotype)
                for chro in clone.humanGenome.chromosomes
            }
            haplengths = {
                c(chro): c(clone.genome[chro].maternalHaplotypeLength)
                for chro in clone.humanGenome.chromosomes
            }
            output = os.path.join(self.xdir,
                                  '{}.maternal.fa'.format(c(clone.label)))
            tasks.put((originalfa, haplotypes, haplengths, output))
            jobs_count += 1

            originalfa = c(clone.humanGenome.paternalfa)
            haplotypes = {
                c(chro): c(clone.genome[chro].paternalHaplotype)
                for chro in clone.humanGenome.chromosomes
            }
            haplengths = {
                c(chro): c(clone.genome[chro].paternalHaplotypeLength)
                for chro in clone.humanGenome.chromosomes
            }
            output = os.path.join(self.xdir,
                                  '{}.paternal.fa'.format(c(clone.label)))
            tasks.put((originalfa, haplotypes, haplengths, output))
            jobs_count += 1

        # Setting up the workers
        workers = [
            Worker(task_queue=tasks,
                   results=results,
                   chromosomes=c(self.tumor.human.chromosomes),
                   reference={
                       c(chro): c(self.tumor.root.genome[chro].reference)
                       for chro in self.tumor.human.chromosomes
                   },
                   lengths={
                       c(chro): c(self.tumor.root.genome[chro].length)
                       for chro in self.tumor.human.chromosomes
                   }) for i in range(min(numworkers, jobs_count))
        ]

        # Add a poison pill for each worker
        for i in range(len(workers)):
            tasks.put(None)

        # Start the workers
        for w in workers:
            w.start()

        # Wait for all of the tasks to finish
        tasks.join()

        # Collect results
        collect = []
        for i in range(jobs_count):
            collect.append(results.get())

        # Close Queues
        tasks.close()
        results.close()

        # Ensure each worker terminates
        for w in workers:
            w.terminate()
            w.join()

        return collect
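
# Note on ordering: tasks.join() above is called before any result is read.
# Because each Worker calls result_queue.put() after task_done(), a worker
# that blocked on a full results pipe would never fetch (and acknowledge) its
# next task, and the join would deadlock.  The results here are just short
# clone labels, so in practice the pipe will not fill, but a defensive
# variant (sketch) drains the known number of results before joining:
def drain_then_join(tasks, results, jobs_count):
    collect = [results.get() for _ in range(jobs_count)]
    tasks.join()
    return collect
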
Example #52
0
def aggregate_parallel(samples, args, results):
    '''
    Process and aggregate GTF input files

    samples: list of Sample objects
    args: from Argparse module. command-line arguments to configure the
          assembly process
    results: Results object containing input and output filenames
    '''
    logging.info('Aggregating in parallel using %d processes' %
                 (args.num_processes))

    if args.filter_splice_juncs and args.ref_genome_fasta_file:
        # test opening FastaFile
        logging.info('Indexing reference genome fasta file (if necessary)')
        fasta_fh = FastaFile(args.ref_genome_fasta_file)
        fasta_fh.close()

    # create queue
    input_queue = JoinableQueue(maxsize=args.num_processes * 2)
    # start worker processes
    procs = []
    worker_results = []
    for i in xrange(args.num_processes):
        worker_id = 'aggregate_worker%03d' % i
        worker_dir = os.path.join(results.tmp_dir, worker_id)
        if not os.path.exists(worker_dir):
            os.makedirs(worker_dir)
        worker_results.append(Results(worker_dir))
        p = Process(target=aggregate_worker,
                    args=(input_queue, args, worker_dir))
        p.start()
        procs.append(p)

    # reference gtf
    if args.ref_gtf_file is not None:
        logging.debug('Reference: %s' % args.ref_gtf_file)
        input_queue.put(Sample(args.ref_gtf_file, Sample.REF_ID))
    # parse samples
    for sample in samples:
        input_queue.put(sample)
    for p in procs:
        input_queue.put(None)
    # close input queue
    input_queue.join()
    input_queue.close()
    # join worker processes
    for p in procs:
        p.join()

    # merge output files
    logging.info('Merging aggregated files')
    logging.debug('\tmerging bed files')
    retcode = merge_bed(input_files=[r.transfrags_bed_file for r in worker_results],
                        output_file=results.transfrags_bed_file,
                        num_processes=args.num_processes,
                        tmp_dir=results.tmp_dir)
    if retcode != 0:
        raise TacoError('Error running linux merge')

    logging.debug('\tmerging filtered bed files')
    retcode = merge_bed(input_files=[r.transfrags_filtered_bed_file for r in worker_results],
                        output_file=results.transfrags_filtered_bed_file,
                        num_processes=args.num_processes,
                        tmp_dir=results.tmp_dir)
    if retcode != 0:
        raise TacoError('Error running linux merge')

    logging.debug('\tmerging sample stats')
    def sort_key_field0(line):
        fields = line.split('\t', 1)
        return fields[0]
    stats_header = ['sample_id', 'num_transfrags', 'filtered_length',
                    'filtered_expr', 'filtered_splice\n']
    stats_header = '\t'.join(stats_header)
    merge_files(input_files=[r.sample_stats_file for r in worker_results],
                output_file=results.sample_stats_file,
                key=sort_key_field0,
                header=stats_header)
    # cleanup worker data
    logging.info('Removing temporary files')
    def shutil_error_callback(func, path, excinfo):
        logging.error('Error removing tmp files path=%s message=%s' %
                      (path, excinfo))
    for r in worker_results:
        shutil.rmtree(r.output_dir, onerror=shutil_error_callback)
    logging.info('Aggregate done')
    return 0
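
# `merge_bed` is a project helper; judging from the "linux merge" error
# message it presumably shells out to GNU sort.  A rough sketch of such a call
# (an assumption; it expects the worker BED files to be pre-sorted, so
# `sort -m` only merges them):
import subprocess


def merge_bed_sketch(input_files, output_file, num_processes, tmp_dir):
    cmd = ['sort', '-m', '-k1,1', '-k2,2n',
           '-T', tmp_dir, '--parallel=%d' % num_processes,
           '-o', output_file] + list(input_files)
    return subprocess.call(cmd)
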
Example #53
0
def main(argv):
    parser = OptionParser()
    group = OptionGroup(parser, 'S3 options')
    group.add_option('--bucket', metavar='BUCKET',
            help='set bucket')
    group.add_option('--insecure', action='store_false', dest='secure',
            help='use insecure connection')
    group.add_option('--secure', action='store_true', default=True, dest='secure',
            help='use secure connection')
    parser.add_option_group(group)
    group = OptionGroup(parser, 'Source options')
    group.add_option('--walk', choices=('filesystem', 'tar'), default='filesystem', metavar='MODE',
            help='set walk mode (filesystem or tar)')
    parser.add_option_group(group)
    group = OptionGroup(parser, 'Put options')
    group.add_option('--content-type', metavar='CONTENT-TYPE',
            help='set content type')
    group.add_option('--gzip', action='store_true',
            help='gzip values and set content encoding')
    group.add_option('--put', choices=('add', 'stupid', 'update'), default='update', metavar='MODE',
            help='set put mode (add, stupid, or update)')
    group.add_option('--prefix', default='', metavar='PREFIX',
            help='set key prefix')
    group.add_option('--resume', action='append', default=[], metavar='FILENAME',
            help='resume from log file')
    group.add_option('--grant', metavar='GRANT', default=None, choices=CannedACLStrings,
            help='A canned ACL policy to be applied to each file uploaded.\nChoices: %s' %
            ', '.join(CannedACLStrings))
    parser.add_option_group(group)
    group = OptionGroup(parser, 'Logging options')
    group.add_option('--log-filename', metavar='FILENAME',
            help='set log filename')
    group.add_option('--quiet', '-q', action='count', default=0,
            help='less output')
    group.add_option('--verbose', '-v', action='count', default=0,
            help='more output')
    parser.add_option_group(group)
    group = OptionGroup(parser, 'Debug and performance tuning options')
    group.add_option('--dry-run', action='store_true',
            help='don\'t write to S3')
    group.add_option('--limit', metavar='N', type=int,
            help='set maximum number of keys to put')
    group.add_option('--processes', default=8, metavar='PROCESSES', type=int,
            help='set number of putter processes')
    parser.add_option_group(group)
    options, args = parser.parse_args(argv[1:])
    logging.basicConfig(filename=options.log_filename, level=logging.INFO + 10 * (options.quiet - options.verbose))
    logger = logging.getLogger(os.path.basename(sys.argv[0]))
    if len(args) < 1:
        logger.error('missing source operand')
        return 1
    if not options.bucket:
        logger.error('missing bucket')
        return 1
    connection = S3Connection(is_secure=options.secure)
    bucket = connection.get_bucket(options.bucket)
    del bucket
    del connection
    start = time.time()
    put_queue = JoinableQueue(1024 * options.processes)
    stat_queue = JoinableQueue()
    walk = {'filesystem': walk_filesystem, 'tar': walk_tar}[options.walk]
    walker_process = Process(target=walker, args=(walk, put_queue, args, options))
    walker_process.start()
    put = {'add': put_add, 'stupid': put_stupid, 'update': put_update}[options.put]
    putter_processes = list(islice(repeatedly(Process, target=putter, args=(put, put_queue, stat_queue, options)), options.processes))
    for putter_process in putter_processes:
        putter_process.start()
    statter_process = Process(target=statter, args=(stat_queue, start, options))
    statter_process.start()
    walker_process.join()
    for putter_process in putter_processes:
        put_queue.put(None)
    put_queue.close()
    for putter_process in putter_processes:
        putter_process.join()
    stat_queue.put(None)
    stat_queue.close()
    statter_process.join()
    put_queue.join_thread()
    stat_queue.join_thread()
Example #54
0
class PartonicRunner2:
    def __init__(self, m2, q2, Delta, nlf, f, fp, Neta, nProcesses=cpu_count()):
        # parameters
        self.m2 = m2
        self.q2 = q2
        self.Delta = Delta
        self.nlf = nlf
        self.f = f
        self.fp = fp
        self.Neta = Neta
        self.nProcesses = nProcesses
        # vars
        self.__qIn = JoinableQueue()
        self.__qOut = Queue()
        self.__js = []
        self.__etas = []
        self.__ks = []
        self.__q2s = []
        self.__data = {}
        self.__processes = []

    # set up the default grid
    def _getGrid(self):
        self.__js = range(self.Neta)
        self.__etas = [10.0 ** (-3.0 + 6.0 / (self.Neta - 1) * j) for j in self.__js]
        g = []
        for proj in ["G", "L", "P"]:
            for j in self.__js:
                g.append({"proj": proj, "j": j, "eta": self.__etas[j], "f": self.f, "res": np.nan})
        return g

    # set up Marco's grid
    def _getGridMarco(self):
        self.__etas = [j1 / 2.0 * 10.0 ** (j2) for j1 in xrange(2, 19) for j2 in [-3, -2, -1, 0, 1, 2]]
        self.__etas.append(1e3)
        self.__etas.sort()
        self.__js = range(len(self.__etas))
        self.__q2s = [-1e3]  # [-1e-2,-1e0,-1e1,-1e2,-1e3]
        self.__ks = range(len(self.__q2s))
        g = []
        for proj in ["G", "L"]:
            for k in self.__ks:
                for j in self.__js:
                    g.append(
                        {
                            "proj": proj,
                            "j": j,
                            "eta": self.__etas[j],
                            "k": k,
                            "q2": self.__q2s[k],
                            "f": self.f,
                            "res": np.nan,
                        }
                    )
        return g

    # start processes
    def _compute(self, g):
        # start processes
        oArgs = {
            "G": (self.m2, self.q2, self.Delta, ElProduction.projT.G, self.nlf),
            "L": (self.m2, self.q2, self.Delta, ElProduction.projT.L, self.nlf),
            "P": (self.m2, self.q2, self.Delta, ElProduction.projT.P, self.nlf),
        }
        lenParams = len(g)
        processes = []
        for j in xrange(self.nProcesses):
            processes.append(Process(target=_threadWorker, args=(self.__qIn, self.__qOut, oArgs, lenParams)))
        [p.start() for p in processes]
        # fill
        for e in g:
            self.__qIn.put(e)
        # add EOF
        for n in xrange(self.nProcesses):
            self.__qIn.put(None)
        # run
        try:
            self.__qIn.join()
        except KeyboardInterrupt:
            [p.terminate() for p in processes]
            self.__qIn.close()
        sys.stdout.write("\n")

    # reorder
    def _reorder(self):
        self.__data = {}
        self.__data["G"] = [np.nan for j in self.__js]
        self.__data["L"] = [np.nan for j in self.__js]
        self.__data["P"] = [np.nan for j in self.__js]
        while not self.__qOut.empty():
            p = self.__qOut.get()
            self.__data[p["proj"]][p["j"]] = p["res"]

    # reorder Marco's data
    def _reorderMarco(self):
        self.__data = {}
        self.__data["G"] = [[np.nan for j in self.__js] for k in self.__ks]
        self.__data["L"] = [[np.nan for j in self.__js] for k in self.__ks]
        while not self.__qOut.empty():
            p = self.__qOut.get()
            self.__data[p["proj"]][p["k"]][p["j"]] = p["res"]

    # write data
    def _write(self):
        with open(self.fp, "w") as f:
            for j in self.__js:
                dataT = self.__data["G"][j] + self.__data["L"][j] / 2.0
                f.write("%e\t%e\t%e\t%e\n" % (self.__etas[j], dataT, self.__data["L"][j], self.__data["P"][j]))

    # write Marco's data
    def _writeMarco(self):
        with open(self.fp, "w") as f:
            for k in self.__ks:
                for j in self.__js:
                    vs = []
                    vs.append(self.__etas[j])
                    vs.append(-self.__q2s[k])
                    vs.append(self.__data["L"][k][j])
                    dataT = self.__data["G"][k][j] + self.__data["L"][k][j] / 2.0
                    vs.append(dataT)
                    # vs.append(self.__data["G"][k][j])
                    f.write(("\t").join("%e" % v for v in vs) + "\n")

    # run program
    def run(self):
        self._compute(self._getGrid())
        self._reorder()
        self._write()

    # run program to compare to Marco
    def runMarco(self):
        self._compute(self._getGridMarco())
        self._reorderMarco()
        self._writeMarco()
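
# The default grid built in _getGrid spaces eta logarithmically from 1e-3 to
# 1e+3 with Neta points; an equivalent construction with numpy is simply:
import numpy as np


def default_eta_grid(Neta):
    # identical to [10.0 ** (-3.0 + 6.0 / (Neta - 1) * j) for j in range(Neta)]
    return np.logspace(-3.0, 3.0, num=Neta).tolist()
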
Example #55
0
class Factory:
    """Like a Pool, but workers must work, not swim!
    """

    def __init__(self, size=None, autostart=True, max_queue_size=None):
        self.size = size or cpu_count()
        if max_queue_size is None:
            max_queue_size = self.size * 3
        self.max_queue_size = max_queue_size
        self._task_id_counter = _counter()

        if autostart:
            self.start()

    def start(self):
        """Start the factory, making it possible to run tasks.
        """

        if getattr(self, '_running', False):
            return
        self._running = True

        logger.info('Starting factory')
        self.queue = JoinableQueue(self.max_queue_size)
        self.workers = []
        for x in range(self.size):
            proc = Process(target=self._worker_process, args=(x, self.queue,),
                           daemon=True)
            self.workers.append(proc)
            proc.start()

    def _worker_process(self, idx, queue):
        # SIGINT is handled by controller process
        signal.signal(signal.SIGINT, signal.SIG_IGN)

        logger.info('[Worker %s] Entering main loop', idx)
        while True:
            if queue.empty():
                logger.info('[Worker %s] idling', idx)
            task = queue.get(block=True)
            logger.info('[Worker %s] Accepted new task: %s', idx, task)
            logger.debug('Queue size is now %s', queue.qsize())

            try:
                task.function(*task.args, **task.kwargs)
            except:
                logger.exception(
                    '[Worker %s] Exception raised while running task', idx)
            else:
                logger.info('[Worker %s] Task complete: %s', idx, str(task))
            finally:
                queue.task_done()

    def run(self, func, *args, **kwargs):
        """Runs a function, asynchronously in the pool

        Args:
            func: the function name
            *args: arguments to the function to be called
            **kwargs: keyword arguments to called function

        Return:
            int: the task id
        """
        task_id = self._get_next_task_id()
        task = Task(task_id, func, args, kwargs)
        logger.info('Scheduling task: %s', str(task))
        self.queue.put(task)
        return task_id

    def shutdown(self):
        """Shutdown the factory.

        Will wait until all the queued processes have been completed,
        then shuts down worker processes.
        """
        logger.info('Shutting down (waiting for tasks to complete)')
        self.queue.close()
        self.queue.join()
        logger.info('Processing complete. Shutting down workers')
        self.terminate()

    def terminate(self):
        """Immediately terminate the factory.

        Will send a SIGTERM to all worker processes; running tasks
        will be interrupted, queued ones will be lost.
        """
        for idx, proc in enumerate(self.workers):
            logger.info('Terminating worker %s (pid %s)', idx, proc.pid)
            proc.terminate()
            proc.join()
        self._running = False

    def _get_next_task_id(self):
        return next(self._task_id_counter)
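
# A usage sketch for the Factory above; `crunch` and the sizes below are
# illustrative, not part of the original module:
import time


def crunch(n):
    time.sleep(0.1)
    print('crunched', n)


if __name__ == '__main__':
    factory = Factory(size=4)
    for n in range(20):
        factory.run(crunch, n)
    # shutdown() blocks until the queue is drained, then terminates workers
    factory.shutdown()
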
Example #56
0
def MCMC(n,
         theta_0,
         priors_dict,
         beta,
         rho,
         chains,
         burn_rate=0.1,
         down_sample=1,
         max_attempts=6,
         pflag=True,
         cpu=None,
         randomize=True):
    # Check input parameters
    mcmcChecks(n, theta_0, beta, rho, chains, burn_rate, down_sample,
               max_attempts)
    print("Performing MCMC Analysis")
    # Selecting optimal temperature
    hyper_theta, beta = hyperparameter_fitting(theta_0, priors_dict, beta, rho,
                                               max_attempts)
    if pflag:
        check_proposals(hyper_theta, 50)
    # Overdisperse chains
    if randomize:
        print("Dispersing chains")
        if chains > 1:
            chains_list = disperse_chains(hyper_theta, priors_dict, chains)
        else:
            chains_list = [hyper_theta]
    else:
        chains_list = [hyper_theta for i in range(chains)]
    # Sample using MCMC
    print("Sampling from posterior distribution")
    if chains >= cpu_count():
        NUMBER_OF_PROCESSES = cpu_count() - 1
    else:
        NUMBER_OF_PROCESSES = chains
    if cpu is not None:
        NUMBER_OF_PROCESSES = cpu  # manual override of the core count
    print("Using {} worker processes".format(NUMBER_OF_PROCESSES))
    with open(results_dir + 'progress.txt',
              'w') as f:  # clear previous progress report
        f.write('')
    jobs = Queue()  # put jobs on queue
    result = JoinableQueue()
    countQ = JoinableQueue()
    if NUMBER_OF_PROCESSES == 1:
        jobs.put([chains_list[0], beta, rho, n, priors_dict])
        mh(0, jobs, result, countQ)
    else:
        for m in range(chains):
            jobs.put([chains_list[m], beta, rho, n, priors_dict])
        [
            Process(target=mh, args=(i, jobs, result, countQ)).start()
            for i in range(NUMBER_OF_PROCESSES)
        ]
    # pull in the results from each worker process
    pool_results = []
    chain_attempts = []
    for m in range(chains):
        r = result.get()
        pool_results.append(r)
        result.task_done()
        a = countQ.get()
        chain_attempts.append(a)
    # tell the workers there are no more jobs
    for w in range(NUMBER_OF_PROCESSES):
        jobs.put(None)
    # close the queues now that all chains are collected
    result.join()
    jobs.close()
    result.close()
    countQ.close()

    # Perform data analysis
    average_acceptance = np.mean([el[1] for el in chain_attempts])
    print("Average acceptance rate was {:.1f}%".format(average_acceptance))
    samples = get_parameter_distributions(pool_results, burn_rate, down_sample)
    plot_parameter_autocorrelations(samples.drop('gamma', axis=1))
    get_summary_statistics(samples.drop('gamma', axis=1))
    with open(results_dir + 'simulation_summary.txt', 'w') as f:
        f.write('Temperature used was {}\n'.format(beta))
        f.write('Number of chains = {}\n'.format(chains))
        f.write(
            "Average acceptance rate was {:.1f}%\n".format(average_acceptance))
        f.write("Initial conditions were\n")
        for i in chains_list:
            f.write(str(i))
            f.write("\n")
Example #57
0
def continue_sampling(n,
                      n_old,
                      priors_dict,
                      rho,
                      burn_rate,
                      down_sample,
                      cpu=None):
    variableNames = []
    stdDevs = []
    type_of_dists = []
    extra_bounds = []

    with open(results_dir + 'simulation_summary.txt', 'r') as f:
        beta = float(f.readline()[21:-1])
        numChains = int(f.readline()[19:-1])
        f.readline()
        f.readline()
        distribution_descriptions = f.readline()[2:-3]
        distribution_descriptions = distribution_descriptions.split('], [')
        example_theta = []
        for variable in distribution_descriptions:
            var = variable.replace('\'', '').split(', ')
            example_theta.append(var)
        variableNames = [el[0] for el in example_theta]
        stdDevs = [float(el[2]) for el in example_theta]
        type_of_dists = [el[3] for el in example_theta]
        extra_bounds = [el[4] if len(el) == 5 else [] for el in example_theta]

    with open(results_dir + 'chain_lengths.txt', 'r') as f:
        chainLengths = list(map(int, f.readline().split(', ')[:]))
    chains = pd.read_csv(results_dir + 'complete_samples.csv', index_col=0)
    chainList = []
    end_points = []
    hyper_theta_valList = []
    readChainLengths = [0] + chainLengths + [-1]
    for i in range(numChains):
        chainList.append(chains.iloc[readChainLengths[i]:readChainLengths[i] +
                                     readChainLengths[i + 1] - 1])
        end_points.append(chainList[-1].iloc[-1])
        hyper_theta_valList.append(chainList[-1].iloc[0])
    hyper_theta = [[[
        variableNames[i], hyper_theta_valList[c][variableNames[i]], stdDevs[i],
        type_of_dists[i]
    ] + extra_bounds[i] for i in range(len(variableNames))]
                   for c in range(numChains)]
    restarting_points = [[[
        variableNames[i], end_points[c][variableNames[i]], stdDevs[i],
        type_of_dists[i]
    ] + extra_bounds[i] for i in range(len(variableNames))]
                         for c in range(numChains)]
    pre_pool_results = [[] for i in range(numChains)]
    for q in range(numChains):
        pre_pool_results[q] = [[[
            variableNames[i], chainList[q].iloc[j][variableNames[i]],
            stdDevs[i], type_of_dists[i]
        ] + extra_bounds[i] for i in range(len(variableNames))]
                               for j in range(len(chainList[q]))]

    print("Continuing MCMC Analysis")
    print("Sampling from posterior distribution")
    if numChains >= cpu_count():
        NUMBER_OF_PROCESSES = cpu_count() - 1
    else:
        NUMBER_OF_PROCESSES = numChains
    if cpu is not None:
        NUMBER_OF_PROCESSES = cpu  # manual override of the core count
    print("Using {} worker processes".format(NUMBER_OF_PROCESSES))
    f = open('progress.txt', 'w')
    f.close()
    jobs = Queue()
    result = JoinableQueue()
    for m in range(numChains):
        jobs.put([restarting_points[m], beta, rho, n, priors_dict])
    [
        Process(target=mh, args=(i, jobs, result)).start()
        for i in range(NUMBER_OF_PROCESSES)
    ]
    # pull in the results from each worker process
    pool_results = []
    for m in range(numChains):
        r = result.get()
        pool_results.append(r)
        result.task_done()
    # tell the workers there are no more jobs
    for w in range(NUMBER_OF_PROCESSES):
        jobs.put(None)
    # close the queues now that all chains are collected
    result.join()
    jobs.close()
    result.close()
    # Combine old results with new results
    for j in range(len(pool_results)):
        pool_results[j] = pre_pool_results[j] + pool_results[j]
    # Perform data analysis
    total_samples = sum([len(i) for i in pool_results])
    print("Average acceptance rate was {:.1f}%".format(
        total_samples * 100 / ((n + n_old) * numChains)))
    samples = get_parameter_distributions(pool_results, burn_rate, down_sample)
    plot_parameter_autocorrelations(samples)
    get_summary_statistics(samples)
    with open(results_dir + 'simulation_summary.txt', 'w') as f:
        f.write('Temperature used was {}\n'.format(beta))
        f.write('Number of chains = {}\n'.format(numChains))
        f.write("Average acceptance rate was {:.1f}%\n".format(
            total_samples * 100 / ((n + n_old) * numChains)))
        f.write("Initial conditions were\n")
        for i in hyper_theta:
            f.write(str(i))
            f.write("\n")
Example #58
0
def main(course_file='courses.txt', clear_db=True):
    """Main method/entrypoint
    """

    # Courses
    work_queue = JoinableQueue()
    skipped_queue = Queue(0)

    with open(course_file, "r") as f:
        for line in f:
            work_queue.put(line.strip())

    # For holding the database info
    db_queue = Queue()
    db_lock = Lock()

    # Create the threads
    process_list = []
    for i in range(multiprocessing.cpu_count()):
        p = multiprocessing.Process(target=process_data,
                                    args=(work_queue, skipped_queue, db_queue, db_lock))
        process_list.append(p)
        p.start()


    work_queue.join()
    work_queue.close()

    db_lock.acquire()
    print('Done work. Got {0} courses, skipped {1}'.format(db_queue.qsize(), skipped_queue.qsize()))
    db_lock.release()

    print()
    # Announce skipped courses
    with open('skippedCourses.txt', 'w') as f:
        if not skipped_queue.empty():
            print('These courses were skipped: ')
            while not skipped_queue.empty():
                skipped_course = skipped_queue.get()
                print('  {0}'.format(skipped_course))
                to_file = skipped_course.split(',', 1)[0]
                f.write(u'{0}\n'.format(to_file).encode('utf8'))
        print()

    db_courses = Queue(0)
    db_sections = Queue(0)
    db_activities = Queue(0)

    while not db_queue.empty():
        course = db_queue.get()
        # course name
        db_courses.put(course[0])
        # sections
        for section in course[1]:
            db_sections.put(section)
        # activities
        for activity in course[2]:
            db_activities.put(activity)



    # Print total count of all items
    print('Courses: {0}'.format(db_courses.qsize()))
    print('Sections: {0}'.format(db_sections.qsize()))
    print('Activities: {0}'.format(db_activities.qsize()))

    # Write courses to files
    with open('db_courses.csv', 'w' if clear_db else 'a') as f:
        while not db_courses.empty():
            f.write(u'{0}\n'.format(db_courses.get()).encode('utf8'))

    # Write sections to files
    with open('db_sections.csv', 'w' if clear_db else 'a') as f:
        while not db_sections.empty():
            f.write(u'{0}\n'.format(db_sections.get()).encode('utf8'))

    # Write activities to files
    with open('db_activities.csv', 'w' if clear_db else 'a') as f:
        while not db_activities.empty():
            f.write(u'{0}\n'.format(db_activities.get()).encode('utf8'))
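
# `process_data` is not shown; for work_queue.join() above to return, every
# worker must call task_done() once per course it takes.  A rough sketch of
# the assumed shape (the scraping itself is a placeholder, and the
# timeout-based exit and the use of db_lock are guesses):
def process_data_sketch(work_queue, skipped_queue, db_queue, db_lock):
    try:
        from queue import Empty      # Python 3
    except ImportError:
        from Queue import Empty      # Python 2
    while True:
        try:
            course = work_queue.get(timeout=5)
        except Empty:
            break                    # queue drained, nothing left to fetch
        try:
            record = (course, [], [])   # placeholder: (name, sections, activities)
            with db_lock:
                db_queue.put(record)
        except Exception:
            skipped_queue.put(course)
        finally:
            work_queue.task_done()
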
Example #59
0
    def parallel(self):
        from multiprocessing import Process, Queue, JoinableQueue

        if debug:
            print(inspect.stack()[0][3])
    
        self.ntrajs = []
        for i in range(self.cpus):
            self.ntrajs.append(min(int(np.floor(float(self.ntraj)
                                             / self.cpus)),
                                   self.ntraj - sum(self.ntrajs)))
        cnt = sum(self.ntrajs)
        while cnt < self.ntraj:
            for i in range(self.cpus):
                self.ntrajs[i] += 1
                cnt += 1
                if (cnt >= self.ntraj):
                    break
        self.ntrajs = np.array(self.ntrajs)
        self.ntrajs = self.ntrajs[np.where(self.ntrajs > 0)]
        self.nprocs = len(self.ntrajs)
        sols = []
        processes = []
        resq = JoinableQueue()
        resq.join()

        if debug:
            print("Number of cpus: " + str(self.cpus))
            print("Trying to start " + str(self.nprocs) + " process(es).")
            print("Number of trajectories for each process: " + str(self.ntrajs))

        for i in range(self.nprocs):
            p = Process(target=self.evolve_serial,
                        args=((resq, self.ntrajs[i], i, self.seed * (i + 1)),))
            p.start()
            processes.append(p)
        cnt = 0

        while True:
            try:
                sols.append(resq.get())
                resq.task_done()
                cnt += 1
                if (cnt >= self.nprocs):
                    break
            except KeyboardInterrupt:
                break
            except:
                pass

        resq.join()
        for proc in processes:
            try:
                proc.join()
            except KeyboardInterrupt:
                if debug:
                    print("Cancel thread on keyboard interrupt")
                proc.terminate()
                proc.join()
        resq.close()
        return sols
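
# The ntrajs bookkeeping above just splits self.ntraj trajectories as evenly
# as possible over self.cpus processes; the same split can be written
# directly with divmod (sketch):
def split_trajectories(ntraj, cpus):
    base, extra = divmod(ntraj, cpus)
    counts = [base + 1 if i < extra else base for i in range(cpus)]
    return [c for c in counts if c > 0]
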