Example #1
class TaskManager:
    # noinspection PyPep8Naming
    def __init__(self,
                 jobs_queue_capacity: int,
                 workers_num: int,
                 WorkerClass: Worker.__class__ = Worker):
        # empty job queue
        self._queue = JoinableQueue(maxsize=jobs_queue_capacity)
        logger.info(
            f'Queue size set to accept at most {jobs_queue_capacity} jobs before pausing job assignment.'
        )
        self.WorkerClass = WorkerClass
        self.workers_num = max_number_of_workers(workers_num)

    _workers = []

    def wake_up_workers(self):
        self._workers: List[Worker] = [
            self.WorkerClass(self._queue) for _ in range(self.workers_num)
        ]
        for worker in self._workers:
            worker.start()

    def assign_task(self, job: Task):
        self._queue.put(job)

    def stop_workers(self):
        logger.info('waiting for all workers to finish')
        # usual termination condition is to put None on the queue. Queues are FIFO but from Python 3.8 docs:
        # https://docs.python.org/3.8/library/multiprocessing.html#pipes-and-queues
        # "If multiple processes are enqueuing objects, it is possible for the objects to be received at the other
        # end out-of-order. However, objects enqueued by the same process will always be in the expected order
        # with respect to each other.". So, when there's a single producer, that's not an issue; when there are many
        # producers it may happen that even if Nones are enqueued at the end of the queue, consumers pick 'em
        # before other items in the queue (breaking the FIFO assumption). In this case the workers would leave
        # before the queue is empty. To avoid this, before sending Nones, it's better to wait for the queue to be
        # consumed.

        while not self._queue.empty():  # not bullet-proof as empty() and qsize() return approx. values, but it helps
            print(f"jobs waiting to be assigned: {self._queue.qsize()}")
            sleep(1)
        for _ in self._workers:
            self._queue.put(None, block=True, timeout=None)
        self._queue.join()
        logger.info('all processes finished')

    def discard_waiting_tasks(self):
        while not self._queue.empty():
            try:
                self._queue.get(False)
            except Empty:
                continue
            self._queue.task_done()

    def number_of_waiting_tasks(self):
        return self._queue.qsize()
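A minimal usage sketch for the TaskManager above, assuming Worker is a multiprocessing.Process subclass that pulls items from the shared JoinableQueue, calls task_done() for each one, and exits on the None sentinel. SquareWorker and make_tasks are hypothetical names, not part of the original snippet.

# Hypothetical driver for the TaskManager above. SquareWorker is an assumed
# Worker subclass; make_tasks() is an assumed helper that yields Task objects.
if __name__ == '__main__':
    manager = TaskManager(jobs_queue_capacity=100,
                          workers_num=4,
                          WorkerClass=SquareWorker)
    manager.wake_up_workers()
    for task in make_tasks():
        manager.assign_task(task)
    print(f'{manager.number_of_waiting_tasks()} tasks still queued')
    manager.stop_workers()  # waits for the queue to empty, sends one None per worker, then joins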
Example #2
class ImageCrawler:

    NUM_PER_FETCH = 100
    NUM_PROCESSES = 10

    def __init__(self, database_config_path):
        self.queue = JoinableQueue()
        self.logger = Logger("image_crawler")
        self.adapter = ImageStoreAdapter(database_config_path, self.logger)

    def produce(self):
        while True:
            if self.queue.empty():
                for image_id, link in self.adapter.load_undownloaded_images(
                        self.NUM_PER_FETCH):
                    self.logger.log("Producer: add new image to crawl:" +
                                    image_id + " " + link)
                    self.queue.put((image_id, link))
            time.sleep(10)

    def consume(self, process_id):
        while True:
            self.logger.log("Consumer process:" + str(process_id) +
                            " fetch new image from queue")
            if not self.queue.empty():
                image_id, link = self.queue.get()
                self.logger.log("Consumer process:" + str(process_id) +
                                " start crawling " + str(link))
                image = common_utils.page_crawl(link)
                if image is not None:
                    self.logger.log(link + " crawled successfully")
                    self.adapter.store_image(image_id, image)
                else:
                    self.logger.log(link + " failed at crawling")
                    self.adapter.update_image_status(
                        image_id, ImageIndexStatus.DOWNLOAD_FAILED)
                self.queue.task_done()
                time.sleep(1)
            else:
                self.logger.log("Queue empty")
                time.sleep(10)

    def run(self):
        producer = Process(target=self.produce)
        producer.start()
        consumers = []
        for i in range(self.NUM_PROCESSES):
            consumer = Process(target=self.consume, args=(i, ))
            consumers.append(consumer)
            consumer.start()

        for consumer in consumers:
            consumer.join()
        producer.join()
        self.queue.join()
Example #3
class ImageCrawler:
    
    NUM_PER_FETCH = 100
    NUM_PROCESSES = 10
    def __init__(self, database_config_path):
        self.queue = JoinableQueue()
        self.logger = Logger("image_crawler")
        self.adapter = ImageStoreAdapter(database_config_path, self.logger)
        
    def produce(self):
        while True:
            if self.queue.empty():
                for image_id, link in self.adapter.load_undownloaded_images(self.NUM_PER_FETCH):
                    self.logger.log("Producer: add new image to crawl:" + image_id + " " + link)
                    self.queue.put((image_id, link))
            time.sleep(10)
            
    def consume(self, process_id):
        while True:
            self.logger.log("Consumer process:" + str(process_id) + " fetch new image from queue")
            if not self.queue.empty():
                image_id, link = self.queue.get()
                self.logger.log("Consumer process:"+ str(process_id) + " start crawling " + str(link))
                image = common_utils.page_crawl(link)
                if image is not None:
                    self.logger.log(link + " crawled successfully")
                    self.adapter.store_image(image_id, image)
                else:
                    self.logger.log(link + " failed at crawling")
                    self.adapter.update_image_status(image_id, ImageIndexStatus.DOWNLOAD_FAILED)
                self.queue.task_done()
                time.sleep(1)
            else:
                self.logger.log("Queue empty")
                time.sleep(10)
    
    def run(self):
        producer = Process(target=self.produce)
        producer.start()
        consumers = []
        for i in range(self.NUM_PROCESSES):
            consumer = Process(target=self.consume, args=(i,))
            consumers.append(consumer)
            consumer.start()
        
        for consumer in consumers:
            consumer.join()
        producer.join()
        self.queue.join()
    def test_hyperband_executor_basic(self):
        # Create executor
        inputs_queue = JoinableQueue()
        results_queue = JoinableQueue()
        executor = executors.HyperbandExecutor.HyperbandExecutor(
            inputs_queue, results_queue, hyperband_epochs_budget=100)
        executor.get_data_loading_pipelines = get_data_loading_pipelines_override

        # Load sample data
        data_uri = utils.utils.get_git_root(
            os.path.dirname(
                os.path.abspath(__file__))) + "/test_data/185_baseball"
        assert (os.path.exists(data_uri))
        problem_doc, dataset = utils.utils.load_data_from_dir(data_uri,
                                                              mode="train")

        # Process item
        inputs_queue.put((problem_doc, dataset))
        executor.start()
        inputs_queue.join()

        # Gather results
        results = []
        while not results_queue.empty():
            print("Gathering...")
            results.append(
                results_queue.get(True, executors.Executor.QUEUE_TIMEOUT))

        executor.terminate()
Example #5
def calculate_set(num_processes):
    todo_queue = JoinableQueue()
    results_queue = JoinableQueue()

    # setup and launch workers
    # we'll make them daemon processes so they shut down automatically when this process exits, but
    # we'll also shut them down ourselves when we finish
    workers = [
        Process(target=worker, args=(todo_queue, results_queue))
        for i in xrange(num_processes)
    ]
    for individual in workers:
        individual.daemon = True
        individual.start()

    result = numpy.zeros([ny, nx])
    for i in xrange(ny):
        y = i * dy + ylo
        for j in xrange(nx):
            x = j * dx + xlo
            todo_queue.put((x, y, i, j))
    todo_queue.join()

    while not results_queue.empty():
        i, j, val = results_queue.get()
        result[i, j] = val
        results_queue.task_done()

    # shutdown the compute processes
    for individual in workers:
        individual.terminate()

    return result
def main():
    jobs = JoinableQueue()
    result = JoinableQueue()


    numToProcess = -1
    scores = pd.DataFrame(columns=['query','fmeasure','precision','recall',
                                   'size','maxDistance','topHits',"contextSteps"])

    print len(datasets)

    for key in datasets:
        jobs.put(key)

    processed_count = Counter()
        
    for i in xrange(NUMBER_OF_PROCESSES):
        p = Process(target=work, args=(i, jobs, result, processed_count))
        p.daemon = True
        p.start()

    #work(1, jobs, result, processed_count)

    automated_annotations = {}
    distances = {}

    jobs.join()

    dataset_index = collections.defaultdict(set)
    annotated_datasets = set()
    while not result.empty():
        dataset, classes = result.get()
        if len(classes) == 0:
            annotated_datasets.add(dataset)
        for c in classes.keys():
            dataset_index[c].add(dataset)
            owl_class = Class(c, graph=graph)
            for parent in owl_class.parents:
                dataset_index[parent.identifier].add(dataset)
        result.task_done()

    print '\n'
    
    for query, c in queries.items():
        manual = ground_truth[query]
        automated = dataset_index[c]
        hits = manual & automated
        misses = manual - automated
        precision = np.nan if len(automated) == 0 else float(len(hits)) / len(automated)
        recall = np.nan if len(manual) == 0 else float(len(hits)) / len(manual)
        if precision != 0 or recall != 0:
            fmeasure = 0 if np.isnan(precision) or np.isnan(recall) else 2 * (precision * recall) / (precision + recall)
        else:
            fmeasure = 0
        scores = scores.append(dict(query=query, size=len(manual), precision=precision, recall=recall, fmeasure=fmeasure,topHits=topHits, maxDistance=maxDistance, contextSteps = context_steps),
                        ignore_index=True)
        print "Hits for", query, c
        print '\n'.join(sorted(hits))
    print scores
    print "Annotated", len(annotated_datasets), "datasets."
Example #7
def progdev_all(boffile, gain):
    """ Initialize all roach boards with boffile and gain settings """
    roachlist = ['rofl%i'%i for i in range(1,16+1)]
    n_roach = len(roachlist)
    
    print "Programming all roaches with %s"%boffile
    print "Gain value: %ix"%gain
    print "Please wait..."
    # Create processes and message queue
    procs = []
    q     = JoinableQueue()
    for i in range(n_roach):
        p = Process(target=progdev_adc16, args=(roachlist[i], q, boffile, gain))
        procs.append(p)
    # Start processes
    for p in procs:
        p.start()
    # Join processes
    for p in procs:
        p.join()
    
    # Print messages
    while q.empty() is False:
        print q.get()
    print "OK"
Example #8
def waiter(queue: multiprocessing.JoinableQueue) -> tuple:
    longest_word = None
    while True:
        results = queue.get()

        if results[2] is True:
            longest_word = results
        queue.task_done()

        if queue.empty():
            print("Final queue is empty, waiting 1 second to be sure it's REALLY empty.")
            time.sleep(1)
            if queue.empty():
                break

    return longest_word
class ScheduleContainer(object):
    def __init__(self):
        self.schedule_container = JoinableQueue(maxsize=0)
        self.scheduler = schedule
        self._run = True
        worker = Thread(target=self.work)
        worker.start()

    def append(self, request_form):
        self.schedule_container.put(request_form)

    @staticmethod
    def task(temp):
        def inner():
            t.change_temp(temp)

        return inner

    def work(self):
        lock = Lock()

        while self._run:
            lock.acquire()
            if not self.schedule_container.empty():
                schedule_obj = self.schedule_container.get()
                job = self.scheduler.every()
                job.start_day = str(schedule_obj.day)
                job.unit = 'weeks'
                job.at(str(schedule_obj.time)).do(self.task(schedule_obj.temp))
                print('schedule made into job')
                schedule_obj.save()
                self.schedule_container.task_done()
            lock.release()
            schedule.run_pending()
            time.sleep(1)
Example #10
    def _parallel(self, reps, keys, treatments, num_threads):
        jobs = JoinableQueue()
        results = JoinableQueue()

        for t in treatments:
            kwargs = dict(zip(keys, t)) if keys else {}
            for r in range(reps):
                jobs.put((str(t), kwargs))

        def op(jobs, results):
            while True:
                name, kwargs = jobs.get()
                res = self.sim.run(self.builder(**kwargs))
                results.put((name, res))
                jobs.task_done()

        for th in range(num_threads):
            process = Process(target=op, name=str(th), args=[jobs, results])
            process.start()

        jobs.join()

        formatted_results = {}
        while not results.empty():
            n, r = results.get()
            if n in formatted_results:
                formatted_results[n].append(r)
            else:
                formatted_results[n] = [r]
        if len(formatted_results) == 1:
            return formatted_results[list(formatted_results.keys())[0]]
        return formatted_results
Example #11
    def _drain_and_join_queue(q: mp.JoinableQueue, join: bool = True) -> None:
        """
        Drains a queue completely, such that it is joinable

        :param q: Queue to join
        :param join: Whether to join the queue or not
        """
        # Do nothing when it's not set
        if q is None:
            return

        # Call task done up to the point where we get a ValueError. We need to do this when child processes already
        # started processing on some tasks and got terminated half-way.
        n = 0
        try:
            while True:
                q.task_done()
                n += 1
        except ValueError:
            pass

        try:
            while not q.empty() or n != 0:
                q.get(block=True, timeout=1.0)
                n -= 1
        except (queue.Empty, EOFError):
            pass

        # Join
        if join:
            q.join()
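A minimal sketch of how a drain helper like the one above can be used, assuming it is reachable as a plain function (in the snippet it sits inside a class) and that workers is a list of worker processes that may have been terminated mid-task; both names are illustrative.

import multiprocessing as mp

# Hypothetical shutdown path: stop the workers first, then drain the task queue
# so that a later join() cannot block on items whose task_done() was never called.
def shutdown(workers, task_queue: mp.JoinableQueue) -> None:
    for w in workers:
        w.terminate()
        w.join()
    _drain_and_join_queue(task_queue, join=True)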
Example #12
class BLEScanner:
    def __init__(self, devices_to_check_for, scan_delegate_class):

        self.manager = Manager()
        self.devices_to_check_for = self.manager.dict()

        for key in devices_to_check_for:
            self.devices_to_check_for[key] = devices_to_check_for[key]

        self.to_connect_queue = JoinableQueue()

        self.scanner = Scanner().withDelegate(scan_delegate_class(self))
        self.stopped = False

    def start(self):
        self.stop_event = Event()
        self.stop_event.clear()

        self.process_connection_event = Event()
        self.process_connection_event.clear()

        # Need to use multiprocessing to start this in a new process
        # This is because the Scanner code in bluepy-helper sets the state of currently connected devices to "disconnected"
        # Thus, if the connected device code is in another thread waiting for notifications, this will
        # cause it to raise a BTLEDisconnectedError even though the device is actually still connected.

        # The workaround is to start the scanning in a new _process_ instead. This will create a whole new copy
        # of bluepy-helper, which allows the scanner to do whatever it wants to the device state in its copy of
        # bluepy-helper and not have to worry about screwing up the device state in the connected BLE object
        self.process = Process(target=self.scan, args=())
        self.process.start()

        print("Started Scan")

        return self

    def scan(self):
        while True:

            if self.stop_event.is_set():
                return

            # Delay if still connecting (There are devices on the queue that are not done yet)
            self.to_connect_queue.join()

            # Done waiting: clear the process_connection_event bit
            self.process_connection_event.clear()

            self.devices = self.scanner.scan(5, passive=True)

            # If there are devices that we need to connect to, signal the main process by setting the event below.
            if not self.to_connect_queue.empty():
                self.process_connection_event.set()

    def stop(self):
        # self.stopped = True
        self.stop_event.set()
Example #13
def multi_write_selected_pfam_genes(options, useful_pfam, annot_genes_all):
    '''
    Run "write_selected_pfam_genes" on multiple processes.
    '''
    global q
    q = JoinableQueue() 
    for fam in useful_pfam:
        q.put(fam)
    for i in range(options.threads):
        p = Process(target = write_selected_pfam_genes, name = '%i' % (i+1), 
                    args = (options, annot_genes_all))
        p.start()
    sleep(options.threads*0.05)
    q.join()
    sleep(options.threads*0.05)
    if p.is_alive() and q.empty():
        sleep(options.threads*0.2)
        if p.is_alive() and q.empty():
            p.terminate()
Example #14
def main():
    jobs = JoinableQueue()
    result = JoinableQueue()

    print len(datasets)
    numToProcess = 10
    scores = pd.DataFrame(columns=[
        'precision', 'recall', 'fmeasure', 'numResult', 'minScore', 'topHits',
        'contentWeight', 'relationWeight'
    ])
    manual_annotations = get_manual_annotations(numToProcess)
    manual_tuples = get_ir_tuples(manual_annotations)

    for key in manual_annotations.keys():
        jobs.put(key)

    processed_count = Counter()

    for i in xrange(NUMBER_OF_PROCESSES):
        p = Process(target=work, args=(i, jobs, result, processed_count))
        p.daemon = True
        p.start()

    #work(1, jobs, result, processed_count)

    automated_annotations = {}

    jobs.join()

    while not result.empty():
        dataset, classes = result.get()
        automated_annotations[dataset] = classes
        result.task_done()

    automated_tuples = get_ir_tuples(automated_annotations)
    hits = manual_tuples & automated_tuples
    misses = manual_tuples - automated_tuples

    precision = float(len(hits)) / len(automated_tuples)
    recall = float(len(hits)) / len(manual_tuples)
    fmeasure = 2 * (precision * recall) / (precision + recall)
    # print '\t'.join([str(x) for x in [precision, recall, fmeasure,
    #                              numResult, minScore, topHits]])
    scores = scores.append(dict(precision=precision,
                                recall=recall,
                                fmeasure=fmeasure,
                                numResult=numResult,
                                minScore=minScore,
                                topHits=topHits,
                                contentWeight=contentWeight,
                                relationWeight=relationWeight),
                           ignore_index=True)

    print scores
Example #15
class findFile(object):
    def __init__(self):
        self.input_Q = JoinableQueue()
        self.out_Q = JoinableQueue()

    def run(self):
        # pass the bound methods themselves as targets; calling them here would
        # run them in the parent and hand Process a None target
        p1 = Process(target=self.funcA)
        p2 = Process(target=self.funcB)
        p3 = Process(target=self.funcB)
        p1.start()
        p2.start()
        p3.start()
        p1.join()
        p2.join()
        p3.join()
        self.input_Q.join()
        l = []
        while self.out_Q.empty() is False:
            l.append(self.out_Q.get())
        print('l = ', l)
        print("p1.is_alive() = ", p1.is_alive())
        print("p2.is_alive() = ", p2.is_alive())
        print("p3.is_alive() = ", p3.is_alive())

    def funcA(self):
        while True:
            find_str = input('Enter the string to search for: ')
            if 'exit' == find_str.lower():
                break
            self.input_Q.put(find_str)

    def funcB(self):
        while True:
            if self.input_Q.empty():
                break
            find_str = self.input_Q.get()
            self.out_Q.put(find_str)
            print('find_str = ', find_str)
            self.input_Q.task_done()
def main():
    jobs = JoinableQueue()
    result = JoinableQueue()


    print len(datasets)
    numToProcess = 10
    scores = pd.DataFrame(columns=['precision','recall','fmeasure',
                                   'numResult','minScore','topHits',
                                   'contentWeight','relationWeight'])
    manual_annotations = get_manual_annotations(numToProcess)
    manual_tuples = get_ir_tuples(manual_annotations)

    for key in manual_annotations.keys():
        jobs.put(key)

    processed_count = Counter()
        
    for i in xrange(NUMBER_OF_PROCESSES):
        p = Process(target=work, args=(i, jobs, result, processed_count))
        p.daemon = True
        p.start()

    #work(1, jobs, result, processed_count)

    automated_annotations = {}

    jobs.join()

    while not result.empty():
        dataset, classes = result.get()
        automated_annotations[dataset] = classes
        result.task_done()

    automated_tuples = get_ir_tuples(automated_annotations)
    hits = manual_tuples & automated_tuples
    misses = manual_tuples - automated_tuples
    
    precision = float(len(hits)) / len(automated_tuples)
    recall = float(len(hits)) / len(manual_tuples)
    fmeasure = 2 * (precision * recall) / (precision + recall)
    # print '\t'.join([str(x) for x in [precision, recall, fmeasure,
    #                              numResult, minScore, topHits]])
    scores = scores.append(dict(precision=precision, recall=recall, fmeasure=fmeasure,
                                numResult=numResult, minScore=minScore, topHits=topHits,
                                contentWeight=contentWeight, relationWeight=relationWeight),
                        ignore_index=True)

    print scores
Example #17
def test_basic():
    in_queue = JoinableQueue()

    mysql_reader = Mysqlio('localhost','3600','test','root','') 
    mysql_reader.scan_and_queue(in_queue,"SELECT * FROM swallow")

    assert in_queue.qsize() == 3

    res = []
    while not in_queue.empty():
        res.append(in_queue.get())

    expected_res = [{'id':1,'libelle':'test'},{'id':2,'libelle':'john'},{'id':3,'libelle':'woo'}]

    assert res == expected_res
Example #18
    def _make_concept_pair(self):
        q = JoinableQueue()
        done_q = Queue()

        threads = []
        for i in range(THREAD):
            t = Process(target=_evaluate_pair_word,
                        args=(self.model, self.w_ij, q, done_q, self.sim,
                              self.threshold))
            t.start()
            threads.append(t)

        t = Process(target=_print_progress, args=(q, ))
        t.start()
        threads.append(t)

        for cluster in self._clusters:
            lc_0 = set()
            lc_1 = set()
            for sen in cluster:
                tup = self.d_sentence(id(sen))
                if tup[0] == 0:
                    lc_0.update(self.s_ik[tup[0]][tup[1]])
                else:
                    lc_1.update(self.s_ik[tup[0]][tup[1]])
            for c_0 in lc_0:
                for c_1 in lc_1:
                    tup = ((self.d_concept[c_0][1], c_0),
                           (self.d_concept[c_1][1], c_1))
                    q.put(tup)

        logger.info('Queuing complete')
        while not q.empty() or not done_q.empty():
            try:
                tup = done_q.get()
                j = tup[0][0]
                k = tup[0][1]
                self.u_jk[(j, k)] = tup[1]
            except queue.Empty:
                pass
        # block until all tasks are done
        q.join()
        # stop workers
        for t in threads:
            t.terminate()

        print('Nb pair : ' + str(len(self.u_jk)))
Example #19
    def process_modules_worker(cls, queue: multiprocessing.JoinableQueue) -> None:
        
        while True:
            if queue.empty():
                continue

            print(f"{os.getpid()}")
            q = queue.get_nowait()
            
            irc: IrcClient = q[0]
            message:str = q[1]
            
            if not message:
                continue
                
            irc.process_private_message(irc, message)
            queue.task_done()
Example #20
class RealTimePlotSimulation:
  def __init__(self, SimulationClass, remoteplotter):
    self.remoteplotter = remoteplotter
    self.SimulationClass = SimulationClass
    self.plotter_conn, worker_conn = Pipe()
    self.queue = JoinableQueue(maxsize=1)
    self.axesextent = remoteplotter.axesExtent()
    self.plotter_conn.send(('axes',self.axesextent))
    phantomprocess = Process(target=self.work, args=(self.queue, worker_conn, SimulationClass, ))
    phantomprocess.start()
    anim = animation.FuncAnimation(remoteplotter.fig, self.update_plot, interval=100)
    show()
    # The phantom process is properly stopped
    print 'Killing the job...'
    self.plotter_conn.send(('gokillyourself',))
    phantomprocess.join()
    print 'Done.'

  def update_plot(self, num):
    extent = self.remoteplotter.axesExtent()
    if extent != self.axesextent:
      self.plotter_conn.send(('axes',extent))
      self.axesextent = extent
    if self.queue.empty(): return
    data = self.queue.get()
    self.remoteplotter.plotData(data)
    self.queue.task_done()

  def work(self, queue, worker_conn, SimulationClass):
    s = SimulationClass()
    # Main loop
    for time in s.simulation:
      s.inmainloop()
      # Check for orders
      if worker_conn.poll():
        message = worker_conn.recv()
        if message[0] == 'gokillyourself':
          print 'Suicide order received. Exiting main loop.'
          break
        if message[0] == 'axes':
          extent = message[1]
      if queue.empty():
        # Retrieve data
        queue.put(self.remoteplotter.pickableData(s.simulation, extent))
    queue.cancel_join_thread()
Example #21
    def test_sklearn_stacked_lstm_executor(self):
        git_root_path = utils.utils.get_git_root(
            os.path.dirname(os.path.abspath(__file__)))

        # Create executor
        inputs_queue = JoinableQueue()
        results_queue = JoinableQueue()
        lstm_path = git_root_path + "/Stanford-D3M-Full/experimental/lstm_predictor/saved_models/20190130/"
        assert (os.path.exists(lstm_path))

        executor = executors.SklearnStackedLSTMExecutor.SklearnStackedLSTMExecutor(
            inputs_queue,
            results_queue,
            override_sklearn_primitives_set=[
                "d3m.primitives.regression.linear_svr.SKlearn",
                "d3m.primitives.regression.gaussian_process.SKlearn"
            ],
            lstm_path=lstm_path)
        executor.get_data_loading_pipelines = get_data_loading_pipelines_override

        # Load sample data
        data_uri = git_root_path + "/test_data/185_baseball"
        assert (os.path.exists(data_uri))
        problem_doc, dataset = utils.utils.load_data_from_dir(data_uri,
                                                              mode="train")

        # Process item
        inputs_queue.put((problem_doc, dataset))
        inputs_queue.put((problem_doc, dataset))
        inputs_queue.put((problem_doc, dataset))
        executor.start()
        inputs_queue.join()

        # Gather results
        results = []
        while not results_queue.empty():
            print("Gathering...")
            results.append(
                results_queue.get(True, executors.Executor.QUEUE_TIMEOUT))

        executor.terminate()

        for result in results:
            assert None not in result
Example #22
def test(tasks_queue: mp.JoinableQueue, result_queue: mp.Queue):
    while not tasks_queue.empty():
        result = list()
        task = tasks_queue.get()

        # modified code from readme.md
        apk_info = APKInfo(task)
        for field in sorted([
                getattr(apk_info, m)
                for m in dir(apk_info) if not m.startswith("_")
        ],
                            key=lambda x: callable(x)):
            result.append(
                f"{field.__name__:25}: {field()}" if callable(field) else
                str(field) if not isinstance(field, APKOpener) else
                os.path.basename(task))  # substitute the package name for the first field

        result_queue.put(result)
        tasks_queue.task_done()
Example #23
def z_from_u_worker(q: mp.JoinableQueue, function, grid, u_values, z_values):
    """
    z_from_u unit function in case of multiprocessing
    :param q:
    :param function:
    :param grid:
    :param u_values:
    :param z_values:
    :return:
    """
    while not q.empty():
        i = q.get()
        a_loc = grid.loc[grid['u_values'] <= u_values[i]]
        a_loc = a_loc.iloc[len(a_loc) - 1:len(a_loc)].index[0]
        b_loc = grid.loc[grid['u_values'] >= u_values[i]].index[0]
        z_values[i] = brentq(f=lambda x: function(x, u_values[i]),
                             a=grid.iloc[a_loc, 0],
                             b=grid.iloc[b_loc, 0])
        q.task_done()
Example #24
class SimpleTaskWorkerManager(object):  # os.cpu_count() also for default?
    """
    this will use a queue to keep track of all the total tasks and assign a fixed amount of
    workers to finish all of the tasks.
    """
    def __init__(self, tasks=None, max_workers=8):
        self.workers = []
        if tasks is None:
            self.tasks = JoinableQueue()
        else:
            self.tasks = tasks
        self.total_tasks = 0
        self.num_workers = max_workers

    def __nonzero__(self):
        if self.tasks.empty():
            return True
        return False

    def add_task_to_queue(self, task):
        try:
            self.tasks.put(task)
            self.total_tasks += 1
        except Exception as e:
            print 'unable to add worker to queue. error code: %s' % str(e)

    def start_working(self):
        if self.num_workers > self.total_tasks:
            self.num_workers = self.total_tasks
        print 'assigning %d workers to help you.' % self.num_workers
        for _ in range(self.num_workers):
            p = SimpleTaskWorker(args=(self.tasks, ))
            p.start()
            self.workers.append(p)
        self.tasks.join()

    def wait_all(self):
        for worker in self.workers:
            self.tasks.put(None)
        for worker in self.workers:
            worker.join()
Example #25
    def test_simple_random_sklearn_executor(self):

        # Create executor
        inputs_queue = JoinableQueue()
        results_queue = JoinableQueue()
        executor = executors.SimpleRandomSklearnExecutor.SimpleRandomSklearnExecutor(
            inputs_queue,
            results_queue,
            override_sklearn_primitives_set=[
                "d3m.primitives.regression.linear_svr.SKlearn"
            ])
        executor.get_data_loading_pipelines = get_data_loading_pipelines_override

        # Load sample data
        data_uri = utils.utils.get_git_root(
            os.path.dirname(
                os.path.abspath(__file__))) + "/test_data/185_baseball"
        assert (os.path.exists(data_uri))
        problem_doc, dataset = utils.utils.load_data_from_dir(data_uri,
                                                              mode="train")

        # Process item
        inputs_queue.put((problem_doc, dataset))
        inputs_queue.put((problem_doc, dataset))
        inputs_queue.put((problem_doc, dataset))
        executor.start()
        inputs_queue.join()

        # Gather results
        results = []
        while not results_queue.empty():
            print("Gathering...")
            results.append(
                results_queue.get(True, executors.Executor.QUEUE_TIMEOUT))

        executor.terminate()

        assert (len(results) == 3)
        for result in results:
            assert None not in result
Example #26
class FileReader(Process):
    def __init__(self, filename, buffer_size=1000):
        super(FileReader, self).__init__()
        self.filename = filename
        self.que = JoinableQueue(buffer_size)
        self.event = Event()
        self.event.set()
        self.started = Event()
        self.started.clear()

    # It's crucial to call task_done on the queue after the item was processed
    def get_queue(self):
        return self.que

    def get_event(self):
        return self.event

    def is_done(self):
        return not self.event.is_set() and self.que.empty()

    def run(self):
        self.started.set()
        self.proc()
        self.event.clear()

    def proc(self):
        with open_gz(self.filename, encoding='utf-8') as file:
            for line in file:
                self.que.put(line)

    def __iter__(self):
        self.start()
        self.started.wait()
        while not self.is_done():
            try:
                text = self.que.get(timeout=0.1)
                yield text
                self.que.task_done()
            except Empty:
                pass
Example #27
def boss(ebs, worker, iterable):
    """
    Boss Process

    :type ebs: EBSSnapshot
    :type worker: Callable
    :param iterable:
    :return:
    """
    logger = getLogger('ebssnapshot.boss')
    jobqueue = JoinableQueue(ebs.workers)
    procs = []
    for i in range(1, ebs.workers + 1):
        proc = Process(target=worker,
                       args=[
                           i, jobqueue, ebs.region, ebs.description, ebs.uuid,
                           ebs.role,
                           ebs.session()
                       ])
        proc.daemon = True
        proc.start()
        procs.append(proc)

    signal.signal(signal.SIGINT, terminate)
    signal.signal(signal.SIGTERM, terminate)

    for job in iterable:
        while True:
            running = any(p.is_alive() for p in procs)
            if not running:
                logger.fatal('No children are alive: Exiting')
                sys.exit(-1)

            if jobqueue.empty():
                jobqueue.put(job, block=True, timeout=60)
                break

    jobqueue.join()
Example #28
def test_basic():
    in_queue = JoinableQueue()

    mysql_reader = Mysqlio('localhost', '3600', 'test', 'root', '')
    mysql_reader.scan_and_queue(in_queue, "SELECT * FROM swallow")

    assert in_queue.qsize() == 3

    res = []
    while not in_queue.empty():
        res.append(in_queue.get())

    expected_res = [{
        'id': 1,
        'libelle': 'test'
    }, {
        'id': 2,
        'libelle': 'john'
    }, {
        'id': 3,
        'libelle': 'woo'
    }]

    assert res == expected_res
Example #29
def main():
    global L2C_BEGIN, L2C_END, L2C_DELTA, L2G_BEGIN, L2G_END, L2G_DELTA
    global NFOLDS, ADD_ARGS, SVM_TRAIN, TRAIN_DATA, N_PER_SSH
    
    parser = OptionParser(
        usage="usage: %prog [options] <dataset> <gridscore-file>")
    parser.add_option("--log2c", dest="log2c", metavar="BEGIN END STEP",
        type='float', nargs=3, default=(L2C_BEGIN, L2C_END, L2C_DELTA),
        help="log2 of C SVM constraint [default: %default]")
    parser.add_option("--log2g", dest="log2g", metavar="BEGIN END STEP",
        type='float', nargs=3, default=(L2G_BEGIN, L2G_END, L2G_DELTA),
        help="log2 of G SVM constraint [default: %default]")
    parser.add_option("-v", "--fold", dest="fold", metavar="FOLD",
        type='int', default=NFOLDS,
        help="number of cross validation folds [default: %default]")
    parser.add_option("-a", "--args", dest="args", metavar="ARGS",
        type='string', default=ADD_ARGS,
        help="additional arguments to the SVM trainer [default: %default]")
    parser.add_option("--svm-train", dest="svm_train", metavar="PATHNAME",
        type='string', default=SVM_TRAIN,
        help="path of SVM trainer [default: %default]")
    (options, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_usage(file=sys.stderr)
        return 1

    L2C_BEGIN, L2C_END, L2C_DELTA = options.log2c
    L2G_BEGIN, L2G_END, L2G_DELTA = options.log2g
    NFOLDS = options.fold
    ADD_ARGS = options.args
    SVM_TRAIN = options.svm_train
    TRAIN_DATA, outfile = args
    
    job_queue = Queue()
    result_queue = Queue()

    for log2c, log2g in product(
            frange(L2C_BEGIN, L2C_END, L2C_DELTA),
            frange(L2G_BEGIN, L2G_END, L2G_DELTA)):
        job_queue.put((log2c, log2g))

    for i in range(LOCAL_WORKERS):
        LocalWorker('local-%d' % i, job_queue, result_queue).start()
    
    for i, host in enumerate(SSH_WORKERS):
        for j in range(N_PER_SSH):
            SSHWorker('ssh-%d/%d' % (i, j), 
                host, job_queue, result_queue).start()

    #block until all jobs are done
    job_queue.join()
    
    result = []
    while not result_queue.empty():
        result.append(result_queue.get())
    result = sorted(result, key=op.itemgetter(3,1,2), reverse=True)

    _, best_log2c, best_log2g, best_score = max(result, key=op.itemgetter(3,1,2))

    with open(outfile, 'w') as ofp:
        ofp.write("#best result: log2c=%f, log2g=%f, score=%f\n" % \
            (best_log2c, best_log2g, best_score))
        ofp.write("#log2(c)\tlog2(g)\tscore\n")
        for (name, log2c, log2g, score) in result:
            ofp.write("%f\t%f\t%f\n" % (log2c, log2g, score))

    return 0
class _PrPipe(object):
    """Custom pipe manager to capture the output of processes and store them in
       dedicated thread-safe queues.

       Clients register their own queues.
    """
    def __init__(self, pipeHandle):
        """
        Args:
            pipeHandle (pipe): Pipe to monitor for records
        """
        self.id = ''.join(
            [random.choice('0123456789ABCDEF') for x in range(6)])

        self.queue = JoinableQueue(MAX_QUEUE_LENGTH)

        self.process = Process(target=self.enqueue_output,
                               kwargs={
                                   "out": pipeHandle,
                                   "queue": self.queue
                               })
        self.process.daemon = True
        self.process.start()

        self.clientQueuesLock = Lock()
        self.clientQueues = dict()
        self.lastClientId = 0

    # Class contains Locks and Queues which cannot be pickled
    def __getstate__(self):
        """Prevent _PrPipe from being pickled across Processes

        Raises:
            Exception
        """
        raise Exception("Don't pickle me!")

    def enqueue_output(self, out, queue):
        """Copy lines from a given pipe handle into a local threading.Queue

        Runs in a separate process, started by __init__

        Args:
            out (pipe): Pipe to read from
            queue (Queue): Queue to write to
        """
        for line in iter(out.readline, b''):
            queue.put(line.decode('utf-8'))
        out.close()

    def publish(self):
        """Push messages from the main queue to all client queues

        Must be triggered by an external mechanism
        Typically triggered by getLine or wait

        """
        try:
            while not self.queue.empty():

                with self.clientQueuesLock:
                    line = self.queue.get_nowait()
                    for q in list(self.clientQueues.values()):
                        q.put(line)

                self.queue.task_done()

        except Empty:
            pass

    def getQueue(self, clientId):
        """Retrieve a client's Queue proxy object

        Args:
            clientId (string): ID of the client

        Returns:
            QueueProxy
        """
        return self.clientQueues[text(clientId)]

    def isEmpty(self, clientId=None):
        """Checks whether the primary Queue or any clients' Queues are empty

        Returns True ONLY if ALL queues are empty if clientId is None
        Returns True ONLY if both the main queue and the specified client queue are empty
        when clientId is provided

        Args:
            clientId (string): ID of the client

        Returns:
            bool
        """
        if clientId is not None:
            return self.queue.empty() \
                and self.getQueue(clientId).empty()

        else:
            empty = self.queue.empty()

            with self.clientQueuesLock:
                for q in list(self.clientQueues.values()):
                    empty = empty and q.empty()

            return empty

    def is_alive(self):
        """Check whether the thread managing the pipe > Queue movement
        is still active

        Returns:
            bool
        """
        return self.process.is_alive()

    def getLine(self, clientId):
        """Retrieve a line from a given client's Queue

        Args:
            clientId (string): ID of the client

        Returns:
            <element from Queue>

        Raises:
            Empty
        """
        # Pull any newer lines
        self.publish()

        # Throws Empty
        q = self.getQueue(clientId)
        line = q.get_nowait()
        q.task_done()

        return line

    def registerClientQueue(self, queueProxy):
        """Attach an additional Queue proxy to this _PrPipe

        All elements published() from now on will also be added to this Queue
        Returns the clientId for the new client, which must be used in all
        future interaction with this _PrPipe

        Args:
            queueProxy (QueueProxy): Proxy object to a Queue we should populate

        Returns:
            string. The client's ID for access to this queue

        """
        # Make sure we don't re-use a clientId
        clientId = self.lastClientId + 1
        self.lastClientId = clientId

        with self.clientQueuesLock:
            self.clientQueues[text(clientId)] = queueProxy

        return text(clientId)

    def unRegisterClientQueue(self, clientId):
        """Detach a Queue proxy from this _PrPipe

        Returns the clientId that was removed

        Args:
            clientId (string): ID of the client

        Returns:
            string. ID of the client queue

        """
        with self.clientQueuesLock:
            self.clientQueues.pop(clientId)

        return text(clientId)

    def destructiveAudit(self):
        """Print a line from each client Queue attached to this _PrPipe

        This is a destructive operation, as it *removes* a line from each Queue
        """
        with self.clientQueuesLock:
            for clientId in list(self.clientQueues):
                try:
                    print("clientId " + text(clientId) + ": " +
                          self.getLine(clientId))
                except:
                    print("clientId " + text(clientId) + " is empty")
Example #31
def main():
    global L2C_BEGIN, L2C_END, L2C_DELTA, L2G_BEGIN, L2G_END, L2G_DELTA
    global NFOLDS, ADD_ARGS, SVM_TRAIN, TRAIN_DATA, N_PER_SSH

    parser = OptionParser(
        usage="usage: %prog [options] <dataset> <gridscore-file>")
    parser.add_option("--log2c",
                      dest="log2c",
                      metavar="BEGIN END STEP",
                      type='float',
                      nargs=3,
                      default=(L2C_BEGIN, L2C_END, L2C_DELTA),
                      help="log2 of C SVM constraint [default: %default]")
    parser.add_option("--log2g",
                      dest="log2g",
                      metavar="BEGIN END STEP",
                      type='float',
                      nargs=3,
                      default=(L2G_BEGIN, L2G_END, L2G_DELTA),
                      help="log2 of G SVM constraint [default: %default]")
    parser.add_option(
        "-v",
        "--fold",
        dest="fold",
        metavar="FOLD",
        type='int',
        default=NFOLDS,
        help="number of cross validation folds [default: %default]")
    parser.add_option(
        "-a",
        "--args",
        dest="args",
        metavar="ARGS",
        type='string',
        default=ADD_ARGS,
        help="additional arguments to the SVM trainer [default: %default]")
    parser.add_option("--svm-train",
                      dest="svm_train",
                      metavar="PATHNAME",
                      type='string',
                      default=SVM_TRAIN,
                      help="path of SVM trainer [default: %default]")
    (options, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_usage(file=sys.stderr)
        return 1

    L2C_BEGIN, L2C_END, L2C_DELTA = options.log2c
    L2G_BEGIN, L2G_END, L2G_DELTA = options.log2g
    NFOLDS = options.fold
    ADD_ARGS = options.args
    SVM_TRAIN = options.svm_train
    TRAIN_DATA, outfile = args

    job_queue = Queue()
    result_queue = Queue()

    for log2c, log2g in product(frange(L2C_BEGIN, L2C_END, L2C_DELTA),
                                frange(L2G_BEGIN, L2G_END, L2G_DELTA)):
        job_queue.put((log2c, log2g))

    for i in range(LOCAL_WORKERS):
        LocalWorker('local-%d' % i, job_queue, result_queue).start()

    for i, host in enumerate(SSH_WORKERS):
        for j in range(N_PER_SSH):
            SSHWorker('ssh-%d/%d' % (i, j), host, job_queue,
                      result_queue).start()

    #block until all jobs are done
    job_queue.join()

    result = []
    while not result_queue.empty():
        result.append(result_queue.get())
    result = sorted(result, key=op.itemgetter(3, 1, 2), reverse=True)

    _, best_log2c, best_log2g, best_score = max(result,
                                                key=op.itemgetter(3, 1, 2))

    with open(outfile, 'w') as ofp:
        ofp.write("#best result: log2c=%f, log2g=%f, score=%f\n" % \
            (best_log2c, best_log2g, best_score))
        ofp.write("#log2(c)\tlog2(g)\tscore\n")
        for (name, log2c, log2g, score) in result:
            ofp.write("%f\t%f\t%f\n" % (log2c, log2g, score))

    return 0
Example #32
    return [*file_hashes]


if __name__ == '__main__':
    freeze_support()
    print(perf_counter())
    with EXECUTOR(max_workers=MAX_WORKERS) as executor:
        in_dirs = 0
        out_dirs = 1
        in_list = list()
        out_list = [
            basepath,
        ]
        hash_count = 0
        sleeps = 0
        while not q.empty() and q.qsize():
            item = q.get()
            print(f'{in_dirs}: {item}')
            in_list.append(item)
            in_dirs += 1
            future_dirs = executor.submit(dir_worker, item)
            dirs = future_dirs.result()
            q_dirs = [*map(q.put, dirs)]
            #print(dirs)
            out_dirs += len(dirs)
            out_list.extend(dirs)
            future_hashes = executor.submit(hash_worker, item)
            q_hashes = [*map(hash_q.put, dirs)]
            hashes = future_hashes.result()
            print(hashes)
            hash_count += len(hashes)
def main():
    jobs = JoinableQueue()
    result = JoinableQueue()


    numToProcess = -1
    scores = pd.DataFrame(columns=['fmeasure','precision','recall',
                                   'numResult','maxDistance','topHits',
                                   'contentWeight','relationWeight', 'hits', "contextSteps"])
    manual_annotations = get_manual_annotations(numToProcess)
    manual_tuples = get_ir_tuples(manual_annotations)

    print len(manual_annotations)
    for i in range(weighted_kmeans_clustering_passes):
        print "Training pass",i+1
        train_kmeans(manual_annotations.keys(), target_class_subtree)
        print "Complete."

    print "Training LSA..."
    global useLSA
    global idf, lsa_model, target_classes, targets, target_class_subtree
    lsa_model = train_lsa(manual_annotations)
    useLSA = True
    target_classes, idf, lsa_model = vectorize_ontology(graph, idf, lsa_model)
    subtree = set(graph.transitive_subjects(RDFS.subClassOf, oboe.MeasurementType))
    target_class_subtree = [x for x in target_classes if x.identifier in subtree and x.identifier != oboe.MeasurementType]
    targets = dict([(x.identifier, x) for x in target_class_subtree])
    print "Done."
    
    for key in manual_annotations.keys():
        jobs.put(key)

    processed_count = Counter()
        
    #for i in xrange(NUMBER_OF_PROCESSES):
    #    p = Process(target=work, args=(i, jobs, result, processed_count))
    #    p.daemon = True
    #    p.start()

    work(1, jobs, result, processed_count)

    automated_annotations = {}
    distances = {}

    jobs.join()

    while not result.empty():
        dataset, classes = result.get()
        automated_annotations[dataset] = set(classes.keys())
        distances[dataset] = classes
        result.task_done()

    automated_tuples = get_ir_tuples(automated_annotations)
    hits = manual_tuples & automated_tuples
    misses = manual_tuples - automated_tuples
    precision = float(len(hits)) / len(automated_tuples)
    recall = float(len(hits)) / len(manual_tuples)
    fmeasure = 2 * (precision * recall) / (precision + recall)
    # print '\t'.join([str(x) for x in [precision, recall, fmeasure,
    #                              numResult, minScore, topHits]])
    scores = scores.append(dict(precision=precision, recall=recall, fmeasure=fmeasure, hits=len(manual_tuples),topHits=topHits, maxDistance=maxDistance, contextSteps = context_steps),
                        ignore_index=True)
    print '\n'
    print scores
    results_file = 'results.csv'
    if len(sys.argv) > 1:
        results_file = sys.argv[1]
    hit_curves = csv.writer(open(results_file,'wb'),delimiter=",")
    hit_curves.writerow(['dataset','class','distance','hit'])
    
    for dataset, c in automated_tuples:
        distance = round(distances[dataset][c],3)
        hit = 1 if (dataset,c) in manual_tuples else 0
        hit_curves.writerow([dataset,c,distance,hit])
Example #34
class FindText(BaseWorkerCustomer):
    NUM_WORKING_PROCESSES = 2

    def __init__(self, params, *args, **kwargs):
        super(FindText, self).__init__(*args, **kwargs)

        self.path = params.get("path", "/")
        self.text = params.get("text", "")

        self.params = params

        # file queue to be processed by many threads
        self.file_queue = JoinableQueue(maxsize=0)
        self.result_queue = Queue(maxsize=0)
        self.result = []

        self.is_alive = {"status": True}

        self.re_text = re.compile(".*" + fnmatch.translate(self.text)[:-7] + ".*", re.UNICODE | re.IGNORECASE)
        # remove \Z(?ms) from end of result expression

    def run(self):
        try:
            self.preload()
        except Exception as e:
            result = {"error": True, "message": str(e), "traceback": traceback.format_exc()}

            self.on_error(self.status_id, result, pid=self.pid, pname=self.name)
            return

        def worker(re_text, file_queue, result_queue, logger, timeout):
            while int(time.time()) < timeout:
                if file_queue.empty() is not True:
                    f_path = file_queue.get()
                    try:
                        if not is_binary(f_path):
                            mime = mimetypes.guess_type(f_path)[0]

                            # exclude certain mime types from the search
                            if mime not in ["application/pdf", "application/rar"]:
                                with open(f_path, "rb") as fp:
                                    for line in fp:
                                        try:
                                            line = as_unicode(line)
                                        except UnicodeDecodeError:
                                            charset = chardet.detect(line)
                                            if charset.get("encoding") in ["MacCyrillic"]:
                                                detected = "windows-1251"
                                            else:
                                                detected = charset.get("encoding")

                                            if detected is None:
                                                break
                                            try:
                                                line = str(line, detected, "replace")
                                            except LookupError:
                                                pass

                                        if re_text.match(line) is not None:
                                            result_queue.put(f_path)
                                            # logger.debug("matched file = %s " % f_path)
                                            break

                    except UnicodeDecodeError as unicode_e:
                        logger.error("UnicodeDecodeError %s, %s" % (str(unicode_e), traceback.format_exc()))

                    except IOError as io_e:
                        logger.error("IOError %s, %s" % (str(io_e), traceback.format_exc()))

                    except Exception as other_e:
                        logger.error("Exception %s, %s" % (str(other_e), traceback.format_exc()))
                    finally:
                        file_queue.task_done()
                else:
                    time.sleep(REQUEST_DELAY)

        try:
            self.logger.debug("findText started with timeout = %s" % TIMEOUT_LIMIT)
            time_limit = int(time.time()) + TIMEOUT_LIMIT
            # Launches a number of worker processes to perform operations using the queue of inputs
            for i in range(self.NUM_WORKING_PROCESSES):
                p = Process(
                    target=worker, args=(self.re_text, self.file_queue, self.result_queue, self.logger, time_limit)
                )
                p.start()
                proc = psutil.Process(p.pid)
                proc.ionice(psutil.IOPRIO_CLASS_IDLE)
                proc.nice(20)
                self.logger.debug(
                    "Search worker #%s, set ionice = idle and nice = 20 for pid %s" % (str(i), str(p.pid))
                )
                self.processes.append(p)

            abs_path = self.get_abs_path(self.path)
            self.logger.debug("FM FindText worker run(), abs_path = %s" % abs_path)

            if not os.path.exists(abs_path):
                raise Exception("Provided path does not exist")

            self.on_running(self.status_id, pid=self.pid, pname=self.name)
            for current, dirs, files in os.walk(abs_path):
                for f in files:
                    try:
                        file_path = os.path.join(current, f)
                        self.file_queue.put(file_path)

                    except UnicodeDecodeError as e:
                        self.logger.error("UnicodeDecodeError %s, %s" % (str(e), traceback.format_exc()))

                    except IOError as e:
                        self.logger.error("IOError %s, %s" % (str(e), traceback.format_exc()))

                    except Exception as e:
                        self.logger.error("Exception %s, %s" % (str(e), traceback.format_exc()))

            while int(time.time()) <= time_limit:
                self.logger.debug(
                    "file_queue size = %s , empty = %s (timeout: %s/%s)"
                    % (self.file_queue.qsize(), self.file_queue.empty(), str(int(time.time())), time_limit)
                )
                if self.file_queue.empty():
                    self.logger.debug("join() file_queue until workers done jobs")
                    self.file_queue.join()
                    break
                else:
                    time.sleep(REQUEST_DELAY)

            if int(time.time()) > time_limit:
                self.is_alive["status"] = False

            for p in self.processes:
                try:
                    self.logger.debug("FM FindText terminate worker process, pid = %s" % p.pid)
                    kill(p.pid, signal.SIGKILL, self.logger)
                except OSError:
                    self.logger.error("FindText unable to terminate worker process, pid = %s" % p.pid)

            if self.is_alive["status"] is True:
                while not self.result_queue.empty():
                    file_path = self.result_queue.get()
                    self.result.append(self._make_file_info(file_path))

                self.on_success(self.status_id, data=self.result, pid=self.pid, pname=self.name)
            else:
                result = {"error": True, "message": "Operation timeout exceeded", "traceback": ""}
                self.on_error(self.status_id, result, pid=self.pid, pname=self.name)

        except Exception as e:
            result = {"error": True, "message": str(e), "traceback": traceback.format_exc()}

            self.on_error(self.status_id, result, pid=self.pid, pname=self.name)
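
The run() method above drains its file queue under a deadline before calling join(), and each worker acknowledges every item in a finally block so a failing file cannot stall the join. Below is a minimal, hedged sketch of that pattern in isolation; the worker body, the queue contents and the 30-second budget are illustrative, not taken from the module above.

# Minimal sketch (not the original class): bounded wait on a JoinableQueue,
# then join() so the workers can acknowledge their in-flight items.
import time
from multiprocessing import JoinableQueue, Process


def worker(queue: JoinableQueue, deadline: float) -> None:
    while time.time() < deadline:
        if not queue.empty():               # empty() is approximate, as noted above
            item = queue.get()
            try:
                pass                        # process the item here
            finally:
                queue.task_done()           # always acknowledge, even on error
        else:
            time.sleep(0.1)


if __name__ == "__main__":
    deadline = time.time() + 30             # overall budget, like TIMEOUT_LIMIT above
    queue = JoinableQueue()
    procs = [Process(target=worker, args=(queue, deadline)) for _ in range(2)]
    for p in procs:
        p.start()
    for item in range(100):
        queue.put(item)

    # Wait for the queue to drain, but never past the deadline.
    while time.time() <= deadline and not queue.empty():
        time.sleep(0.1)
    if queue.empty():
        queue.join()                        # every get() has a matching task_done()
    for p in procs:
        p.terminate()
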
Пример #35
0
    queue_log = JoinableQueue()
    test_commprocess = CommProcess(
        port=8080,
        address='127.0.0.1',
        events={'enable_comms': event_enable_comms, 'client_disconnect': event_client_disconnect},
        queues={'tx_msg': queue_tx, 'rx_msg': queue_rx, 'log': queue_log},
        debug_log=debug_log_path,
    )

    retcode = 0
    
    try:
        
        event_enable_comms.set()
        test_commprocess.start()

        while True:

            if not queue_log.empty():
                print(pop_queue(queue_log))

            if not queue_rx.empty():
                msg = pop_queue(queue_rx)
                print(msg)
                queue_tx.put("echoing %s\n" % msg)

            ## check if client disconnected :
            if event_client_disconnect.wait(PROCESS_EVENT_CLIENT_DISCONNECT_TIMEOUT_S):
                

                ## check if the commprocess is still alive :
                if not test_commprocess.is_alive():
                    ## what happened ?
                    commprocess_retcode = test_commprocess.exitcode
def main(fileName):

    # load test config and default values
    with open(fileName, 'r') as f:
        tests = json.load(f)
    prepare_tests_settings(tests)
    default = tests['default']

    jobQueue = JoinableQueue()
    resultQueue = JoinableQueue()

    # NOTE: some parameters are obsolete as they are overruled by the parameters in individual tests
    if default['browser'].lower() == 'chrome':
        # use producer-consumer mode for chrome
        # this mode helps isolating individual failures
        # as well as supporting parallel browsers
        workers = start_parallel_instances(default, jobQueue, resultQueue)
        dispatch_parallel_tests(tests, jobQueue)

        def terminate_jobs(_, __):
            logging.warning("SIGINT: terminating all the intances ")
            for worker in workers:
                # SIGTERM will trigger teardown function of the workers
                # so that they could nicely kill the processes (chrome, Xvfb) they started
                os.kill(worker.pid, signal.SIGTERM)
                time.sleep(0.5)
            sys.exit(-1)
        # SIGINT is for nice teardown
        # NOTE: if this process is killed with SIGKILL, orphan processes may be left behind and block new tests
        # one must manually kill them if that happens
        signal.signal(signal.SIGINT, terminate_jobs)
        #loader = ChromeLoader(disable_quic=default['disable_quic'], disable_spdy=default['disable_spdy'],
        #                      check_protocol_availability=False, save_packet_capture=True,
        #                      log_ssl_keys=default['log_ssl_keys'], save_har=True, disable_local_cache=False,
        #                      headless=default['headless'], ignore_certificate_errors=default['ignore_certificate_errors'])
        #loader.load_pages(tests)
        #pprint.pprint(dict(loader.load_results))

        # then wait for the queue to be empty
        jobQueue.join()

        while not resultQueue.empty():
            # print all the test reports
            result = resultQueue.get(False)
            print(result)
            resultQueue.task_done()
        # send teardown message then wait
        teardown_parallel_instances(default, jobQueue)
        jobQueue.join()

    elif default['browser'].lower() == 'firefox':
        # simpler single-thread mode for firefox
        loader = FirefoxLoader(disable_quic=default['disable_quic'], disable_spdy=default['disable_spdy'],
                               check_protocol_availability=False, save_packet_capture=True,
                               log_ssl_keys=default['log_ssl_keys'], save_har=True, disable_local_cache=False,
                               headless=default['headless'], ignore_certificate_errors=default['ignore_certificate_errors'])
        loader.load_pages(tests)
        pprint.pprint(dict(loader.load_results))
    else:
        logging.critical('Unknown browser %s', default['browser'].lower())
        sys.exit(-1)
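
The Chrome branch above runs in producer-consumer mode: dispatch the tests, join the job queue, drain the results, then send a teardown message per worker and join again. Here is a minimal hedged sketch of that two-phase join using an explicit sentinel; browser_worker and the "TEARDOWN" sentinel are illustrative stand-ins for start_parallel_instances / teardown_parallel_instances, not the original helpers.

# Minimal sketch: dispatch jobs, join, then send one sentinel per worker and join again.
# browser_worker and the "TEARDOWN" sentinel are illustrative placeholders.
from multiprocessing import JoinableQueue, Process


def browser_worker(jobs: JoinableQueue, results: JoinableQueue) -> None:
    while True:
        job = jobs.get()
        try:
            if job == "TEARDOWN":
                break                        # sentinel: stop consuming
            results.put("finished %s" % job)
        finally:
            jobs.task_done()                 # acknowledge real jobs and the sentinel alike


if __name__ == "__main__":
    jobs, results = JoinableQueue(), JoinableQueue()
    workers = [Process(target=browser_worker, args=(jobs, results)) for _ in range(3)]
    for w in workers:
        w.start()

    for url in ("a.example", "b.example", "c.example"):
        jobs.put(url)
    jobs.join()                              # phase 1: every dispatched job acknowledged

    while not results.empty():               # empty() is approximate, as in the example above
        print(results.get(False))
        results.task_done()

    for _ in workers:
        jobs.put("TEARDOWN")                 # phase 2: one teardown message per worker
    jobs.join()
    for w in workers:
        w.join()
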
Пример #37
0
class FindText(BaseWorkerCustomer):
    NUM_WORKING_PROCESSES = 2

    def __init__(self, params, session, *args, **kwargs):
        super(FindText, self).__init__(*args, **kwargs)

        self.path = params.get('path', '/')
        self.session = session
        self.text = params.get('text', '')

        self.params = params

        # file queue to be processed by many threads
        self.file_queue = JoinableQueue(maxsize=0)
        self.result_queue = Queue(maxsize=0)
        self.result = []

        self.is_alive = {
            "status": True
        }

        self.re_text = re.compile('.*' + fnmatch.translate(self.text)[:-7] + '.*',
                                  re.UNICODE | re.IGNORECASE)
        # fnmatch.translate() appends a "\Z(?ms)" anchor; the [:-7] slice strips it so the
        # pattern can match anywhere in a line (see the sketch after this example)

    def run(self):
        try:
            self.preload()
            sftp = self.get_sftp_connection(self.session)

            self.logger.debug("findText started with timeout = %s" % TIMEOUT_LIMIT)
            time_limit = int(time.time()) + TIMEOUT_LIMIT
            # Launches a number of worker threads to perform operations using the queue of inputs
            sftp_managers = []
            for i in range(self.NUM_WORKING_PROCESSES):
                p = Process(target=self.worker,
                            args=(self.re_text, self.file_queue, self.result_queue, time_limit))
                p.start()
                proc = psutil.Process(p.pid)
                proc.ionice(psutil.IOPRIO_CLASS_IDLE)
                proc.nice(20)
                self.logger.debug(
                    "Search worker #%s, set ionice = idle and nice = 20 for pid %s" % (
                        str(i), str(p.pid)))
                self.processes.append(p)

            abs_path = self.path
            self.logger.debug("FM FindText worker run(), abs_path = %s" % abs_path)

            if not sftp.exists(abs_path):
                raise Exception("Provided path not exist")

            self.on_running(self.status_id, pid=self.pid, pname=self.name)
            for current, dirs, files in sftp.walk(abs_path):
                for f in files:
                    try:
                        file_path = os.path.join(current, f)
                        self.file_queue.put(file_path)

                    except UnicodeDecodeError as e:
                        self.logger.error(
                            "UnicodeDecodeError %s, %s" % (str(e), traceback.format_exc()))

                    except IOError as e:
                        self.logger.error("IOError %s, %s" % (str(e), traceback.format_exc()))

                    except Exception as e:
                        self.logger.error(
                            "Exception %s, %s" % (str(e), traceback.format_exc()))

            while int(time.time()) <= time_limit:
                self.logger.debug("file_queue size = %s , empty = %s (timeout: %s/%s)" % (
                    self.file_queue.qsize(), self.file_queue.empty(), str(int(time.time())), time_limit))
                if self.file_queue.empty():
                    self.logger.debug("join() file_queue until workers done jobs")
                    self.file_queue.join()
                    break
                else:
                    time.sleep(REQUEST_DELAY)

            if int(time.time()) > time_limit:
                self.is_alive['status'] = False

            for sftp in sftp_managers:
                sftp.conn.close()

            for p in self.processes:
                try:
                    self.logger.debug("FM FindText terminate worker process, pid = %s" % p.pid)
                    kill(p.pid, signal.SIGKILL, self.logger)
                except OSError:
                    self.logger.error(
                        "FindText unable to terminate worker process, pid = %s" % p.pid)

            if self.is_alive['status'] is True:
                while not self.result_queue.empty():
                    file_path = self.result_queue.get()
                    self.result.append(sftp.make_file_info(file_path))

                self.on_success(self.status_id, data=self.result, pid=self.pid, pname=self.name)
            else:
                result = {
                    "error": True,
                    "message": "Operation timeout exceeded",
                    "traceback": ""
                }
                self.on_error(self.status_id, result, pid=self.pid, pname=self.name)

        except Exception as e:
            result = {
                "error": True,
                "message": str(e),
                "traceback": traceback.format_exc()
            }

            self.on_error(self.status_id, result, pid=self.pid, pname=self.name)

    def worker(self, re_text, file_queue, result_queue, timeout):
        try:
            worker_sftp = self.get_sftp_connection(self.session)
            while int(time.time()) < timeout:
                if file_queue.empty() is not True:
                    f_path = file_queue.get()
                    try:
                        if not worker_sftp.is_binary(f_path):
                            mime = mimetypes.guess_type(f_path)[0]

                            # exclude certain mime types from the search
                            if mime not in ['application/pdf', 'application/rar']:
                                with worker_sftp.open(f_path, 'rb') as fp:
                                    for line in fp:
                                        try:
                                            line = as_unicode(line)
                                        except UnicodeDecodeError:
                                            charset = chardet.detect(line)
                                            if charset.get('encoding') in ['MacCyrillic']:
                                                detected = 'windows-1251'
                                            else:
                                                detected = charset.get('encoding')

                                            if detected is None:
                                                break
                                            try:
                                                line = str(line, detected, "replace")
                                            except LookupError:
                                                pass

                                        if re_text.match(line) is not None:
                                            result_queue.put(f_path)
                                            self.logger.debug("matched file = %s " % f_path)
                                            break

                    except UnicodeDecodeError as unicode_e:
                        self.logger.error(
                            "UnicodeDecodeError %s, %s" % (str(unicode_e), traceback.format_exc()))

                    except IOError as io_e:
                        self.logger.error("IOError %s, %s" % (str(io_e), traceback.format_exc()))

                    except Exception as other_e:
                        self.logger.error("Exception %s, %s" % (str(other_e), traceback.format_exc()))
                    finally:
                        file_queue.task_done()
                else:
                    time.sleep(REQUEST_DELAY)
            worker_sftp.close()

        except Exception as e:
            result = {
                "error": True,
                "message": str(e),
                "traceback": traceback.format_exc()
            }

            self.logger.error('SFTP FindText Worker Exception {}'.format(result))
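
Both FindText variants build their search pattern with fnmatch.translate() and slice off the trailing anchor so the text can match anywhere in a line. The [:-7] slice assumes the older output format that ends with \Z(?ms); on current Python 3 the translated pattern looks like (?s:...)\Z instead, so the hedged illustration below strips the anchor with a regex rather than a fixed offset. The sample strings are mine, not from the example.

# Illustration only: turning a shell-style wildcard into a "contains" regex.
# The [:-7] slice above assumes fnmatch.translate() ends with r"\Z(?ms)" (older
# Pythons); stripping the anchor with a regex works across versions.
import fnmatch
import re

text = "report*2023"
translated = fnmatch.translate(text)                      # e.g. r"(?s:report.*2023)\Z"
unanchored = re.sub(r"\\Z(\(\?ms\))?$", "", translated)   # drop the trailing anchor
re_text = re.compile('.*' + unanchored + '.*', re.UNICODE | re.IGNORECASE)

print(re_text.match("quarterly REPORT for 2023, final"))  # matches (case-insensitive)
print(re_text.match("nothing to see here"))               # None
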
Пример #38
0
def process_pool_executor_handler(executor: ProcessPoolExecutor,
                                  manager: DownloadProcess,
                                  file_maps: Dict[str, str],
                                  directory: str) -> None:
    done_queue = JoinableQueue()

    def update_hook(future: Future):
        temp = future.result()
        if temp:
            for failed_links in temp:
                done_queue.put(failed_links)

    while manager.done_retries != manager.max_retries:
        print(
            f"Starting download {manager.get_total_links() - manager.get_total_downloaded_links_count()} links left"
        )
        available_cpus = [0, 1, 2, 3
                          ] if platform.system() == "Windows" else list(
                              os.sched_getaffinity(os.getpid()))
        print(
            f"available cpu's {available_cpus}, initializing {4 * manager.get_process_num()}"
            f" threads with {manager.get_thread_num()} links per "
            f"process")

        if len(manager.error_links):
            download_links = manager.error_links.copy()
            manager.error_links = []
        else:
            download_links = manager.get_download_links().copy()

        process_futures: List[Future] = []

        start = 0
        for temp_num in range(len(download_links)):
            end = start + manager.get_thread_num()

            if end > len(download_links):
                end = len(download_links)

            cpu_num = available_cpus[temp_num % len(available_cpus)]
            process_futures.append(
                executor.submit(start_threads, download_links[start:end],
                                file_maps, manager.get_session(), directory,
                                manager.http2, manager.debug, cpu_num))
            process_futures[-1].add_done_callback(update_hook)
            start = end

            if end >= len(download_links):
                break

        wait(process_futures)

        while not done_queue.empty():
            link = done_queue.get()
            manager.error_links.append(link)

        manager.set_total_downloaded_links_count(manager.get_total_links() -
                                                 len(manager.error_links))

        if manager.debug:
            print(
                f"Total downloaded links {manager.get_total_downloaded_links_count()}"
            )
            print(f"Error links generated {len(manager.error_links)}")

        if len(manager.error_links):
            manager.set_thread_num(
                int(
                    ceil((manager.get_total_links() -
                          manager.get_total_downloaded_links_count()) /
                         manager.get_process_num())))
            print(
                f"{manager.get_total_links()} was expected but "
                f"{manager.get_total_downloaded_links_count()} was downloaded."
            )
            manager.done_retries += 1
            print(f"Trying retry {manager.done_retries}")
        else:
            break
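
process_pool_executor_handler above retries failed links by letting each future's add_done_callback push failures onto a queue that is drained after wait(). Below is a minimal hedged sketch of that collect-failures-via-callback shape; download_batch and its failure rule are illustrative stand-ins, not the project's downloader.

# Minimal sketch: futures report failed items through a done-callback and the
# parent drains them after wait(). download_batch is an illustrative stand-in.
from concurrent.futures import Future, ProcessPoolExecutor, wait
from queue import Queue
from typing import List


def download_batch(links: List[str]) -> List[str]:
    # pretend every odd-length link fails and must be retried
    return [link for link in links if len(link) % 2 == 1]


def run_round(links: List[str]) -> List[str]:
    failed: Queue = Queue()                  # callbacks run in the parent process,
                                             # so a thread-safe Queue is enough here

    def update_hook(future: Future) -> None:
        for link in future.result():         # result() is already available in a done-callback
            failed.put(link)

    with ProcessPoolExecutor(max_workers=2) as executor:
        futures = []
        for start in range(0, len(links), 3):            # three links per submitted batch
            f = executor.submit(download_batch, links[start:start + 3])
            f.add_done_callback(update_hook)
            futures.append(f)
        wait(futures)

    retry = []
    while not failed.empty():
        retry.append(failed.get())
    return retry


if __name__ == "__main__":
    print(run_round(["a", "bb", "ccc", "dddd", "eeeee"]))
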
Пример #39
0
class VideoProcessor:
    def __init__(self, config: VideoProcessorConfig):

        self.config = config

        self.ballsQueue = Queue()
        self.cueQueue = Queue()

        self.ballThrottle = JoinableQueue()
        self.cueThrottle = JoinableQueue()

        self.eventQueueVP = Queue()
        self.eventQueueBP = Queue()
        self.eventQueueCP = Queue()

        self.event = None

        self.vcap = None
        self.vrec = None

        self.frameReadLock = Lock()

        self.ballProcess = None
        self.cueProcess = None
        self.outputModuleProcess = None

        self.initFrameProcessing: InitialFrameProcessing = None

        self.classificator = None

    def capture(self):

        self.initFrameProcessing = InitialFrameProcessing(self.config)

        frameWidth = Value('i', 1)
        frameHeight = Value('i', 1)

        ballProcessorConfig = BallProcessorConfig(
            self.config.width,
            self.config.height,
            frameWidth,
            frameHeight,
            8,
            12,
            22,
            1.0,
            24,
            90,
            10,
        )

        ballProcessorConfig.genDataSet = self.config.genDataSet
        ballProcessorConfig.genDataSetFolder = self.config.genDataSetFolder

        cueProcessorConfig = CueProcessorConfig(self.config.width,
                                                self.config.height, frameWidth,
                                                frameHeight)

        cueProcessorConfig.genDataSet = self.config.genDataSet
        cueProcessorConfig.genDataSetFolder = self.config.genDataSetFolder

        sharedFrame = RawArray(np.ctypeslib.as_ctypes_type(np.uint8),
                               self.config.get_flat_shape())
        sharedAvgFrame = RawArray(np.ctypeslib.as_ctypes_type(np.uint8),
                                  self.config.get_flat_shape())
        numpyFrame = np.frombuffer(sharedFrame, dtype=np.uint8).reshape(
            self.config.get_flat_shape())
        numpyAvgFrame = np.frombuffer(sharedAvgFrame, dtype=np.uint8).reshape(
            self.config.get_flat_shape())

        self.ballProcess = BallProcessor(self.ballsQueue, self.ballThrottle,
                                         sharedFrame, sharedAvgFrame,
                                         self.frameReadLock,
                                         ballProcessorConfig,
                                         self.eventQueueBP)

        self.cueProcess = CueProcessor(self.cueQueue, self.cueThrottle,
                                       sharedFrame, sharedAvgFrame,
                                       self.frameReadLock, cueProcessorConfig,
                                       self.eventQueueCP)

        self.outputModuleProcess = OutputModule(self.ballsQueue, self.cueQueue,
                                                self.eventQueueVP,
                                                self.eventQueueBP,
                                                self.eventQueueCP,
                                                self.config.webPort)

        self.ballProcess.start()
        # Until there is an implementation, it must not run idle
        self.cueProcess.start()
        self.outputModuleProcess.start()

        try:
            self.vcap = cv2.VideoCapture(
                "udp://0.0.0.0:" + str(self.config.udpPort) +
                "?overrun_nonfatal=1", cv2.CAP_FFMPEG)
            self.vcap.set(cv2.CAP_PROP_BUFFERSIZE, 0)

            while (1):

                self.eventHandling()
                ret, frame = self.vcap.read()

                if ret:
                    cv2.imshow('VP: ORIGINAL', frame)

                    self.initFrameProcessing.on_frame(frame)
                    #self.initFrameProcessing.display_components()

                    w, h = self.initFrameProcessing.get_pool_size()

                    frame = self.initFrameProcessing.get_warped_frame(
                    ).flatten()
                    frame = np.resize(frame, self.config.get_flat_shape())

                    avg_frame = self.initFrameProcessing.get_avg_frame(
                    ).flatten()
                    avg_frame = np.resize(avg_frame,
                                          self.config.get_flat_shape())

                    # temporary: it crashes when the table-detection phase hits a part of the recording without the table
                    if w < 500 or h < 500:
                        continue
                    with self.frameReadLock:
                        frameWidth.value = w
                        frameHeight.value = h
                        np.copyto(numpyFrame, frame)
                        np.copyto(numpyAvgFrame, avg_frame)

                    if not self.ballThrottle.empty():
                        self.ballThrottle.get()
                        self.ballThrottle.task_done()

                    if not self.cueThrottle.empty():
                        self.cueThrottle.get()
                        self.cueThrottle.task_done()

                # Main wait to refresh windows
                c = cv2.waitKey(1)

        except (KeyboardInterrupt, SystemExit):
            print("VP: Interrupt")
            self.cleanup()
            self.terminate()
        print("VP: Exit")
        sys.exit(0)

    def record(self):
        try:
            self.vrec = cv2.VideoWriter(
                self.config.recordingPath, cv2.VideoWriter_fourcc(*'MP4V'),
                self.config.recordingFps,
                (self.config.width, self.config.height))

            self.vcap = cv2.VideoCapture(
                "udp://0.0.0.0:" + str(self.config.udpPort), cv2.CAP_FFMPEG)

            while (1):
                ret, frame = self.vcap.read()
                if frame is not None:
                    self.vrec.write(frame)
                    cv2.imshow('VIDEO', frame)
                    c = cv2.waitKey(1)
                    if c & 0xFF == ord('q'):
                        self.cleanup()
                        break

        except (KeyboardInterrupt, SystemExit):
            self.cleanup()
            sys.exit(0)

    def eventHandling(self):
        while not self.eventQueueVP.empty():
            event = self.eventQueueVP.get_nowait()
            print("VP: ", event.eventType)
            if isinstance(event, RerunInitRequestEvent):
                self.initFrameProcessing.reset_avg()
            elif isinstance(event, InitDurationChangeEvent):
                self.config.initDuration = int(event.initDuration)
            elif isinstance(event, PoolColorsChangeEvent):
                self.config.pool_color_range = event.pool_color_range

    def cleanup(self):
        if self.vrec is not None:
            self.vrec.release()
        if self.vcap is not None:
            self.vcap.release()

    def terminate(self):
        self.ballProcess.terminate()
        self.ballProcess.join()

        self.cueProcess.terminate()
        self.cueProcess.join()

        self.outputModuleProcess.kill()
def main():
    jobs = JoinableQueue()
    result = JoinableQueue()


    numToProcess = -1
    scores = pd.DataFrame(columns=['fmeasure','precision','recall',
                                   'numResult','maxDistance','topHits', 'hits', "contextSteps"])
    manual_annotations = get_manual_annotations(numToProcess)
    manual_tuples = get_ir_tuples(manual_annotations)

    print(len(manual_annotations))
    for i in range(weighted_kmeans_clustering_passes):
        print("Training pass", i + 1)
        train_kmeans(manual_annotations.keys(), target_class_subtree)
        print("Complete.")

    for key in manual_annotations.keys():
        jobs.put(key)

    processed_count = Counter()
        
    for i in range(NUMBER_OF_PROCESSES):
        p = Process(target=work, args=(i, jobs, result, processed_count))
        p.daemon = True
        p.start()

    #work(1, jobs, result, processed_count)

    automated_annotations = {}
    distances = {}

    jobs.join()

    while not result.empty():
        dataset, classes = result.get()
        automated_annotations[dataset] = set(classes.keys())
        distances[dataset] = classes
        result.task_done()

    automated_tuples = get_ir_tuples(automated_annotations)
    hits = manual_tuples & automated_tuples
    misses = manual_tuples - automated_tuples
    precision = float(len(hits)) / len(automated_tuples)
    recall = float(len(hits)) / len(manual_tuples)
    fmeasure = 2 * (precision * recall) / (precision + recall)
    # print '\t'.join([str(x) for x in [precision, recall, fmeasure,
    #                              numResult, minScore, topHits]])
    scores = scores.append(dict(precision=precision, recall=recall, fmeasure=fmeasure, hits=len(manual_tuples),topHits=topHits, maxDistance=maxDistance, contextSteps = context_steps),
                        ignore_index=True)
    print('\n')
    print(scores)
    results_file = 'results.csv'
    if len(sys.argv) > 1:
        results_file = sys.argv[1]
    hit_curves = csv.writer(open(results_file, 'w', newline=''), delimiter=",")
    hit_curves.writerow(['dataset','class','distance','hit'])
    
    for dataset, c in automated_tuples:
        distance = round(distances[dataset][c],3)
        hit = 1 if (dataset,c) in manual_tuples else 0
        hit_curves.writerow([dataset,c,distance,hit])
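
The VideoProcessor.capture() method further above publishes each frame to the ball and cue processors through a shared RawArray guarded by a Lock, with NumPy views on both sides and JoinableQueues used only as a throttle. Below is a minimal hedged sketch of that shared-frame handoff; the frame shape, the iteration count and the consumer's "processing" are illustrative.

# Minimal sketch: the parent writes frames into a shared RawArray under a Lock
# and a child reads them through a NumPy view. Shape and workload are illustrative.
import numpy as np
from multiprocessing import Lock, Process
from multiprocessing.sharedctypes import RawArray

SHAPE = (480, 640, 3)                       # illustrative frame shape
FLAT = SHAPE[0] * SHAPE[1] * SHAPE[2]


def consumer(shared, lock) -> None:
    view = np.frombuffer(shared, dtype=np.uint8).reshape(SHAPE)
    for _ in range(5):
        with lock:                          # read a consistent snapshot
            frame = view.copy()
        print("consumer saw mean pixel value", frame.mean())


if __name__ == "__main__":
    shared = RawArray(np.ctypeslib.as_ctypes_type(np.uint8), FLAT)
    lock = Lock()
    view = np.frombuffer(shared, dtype=np.uint8).reshape(SHAPE)

    p = Process(target=consumer, args=(shared, lock))
    p.start()
    for i in range(5):
        frame = np.full(SHAPE, i * 10, dtype=np.uint8)   # stand-in for a captured frame
        with lock:
            np.copyto(view, frame)          # publish the frame in place
    p.join()
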
Пример #41
0
class Task:
    def __init__(self, name, opts, resdir, log):
        self.__name = name
        self.__opts = opts
        self.__resdir = resdir
        self.__log = log
        self.__proc = None
        self.__is_finished = False
        self.__results = None
        self.__refs = {}

        task_mod = __import__(name)

        installed_version = int(task_mod.Task.version)
        actual_version = int(self.__opts['version'])

        if installed_version < actual_version: # update installed task
            raise Exception("Version {0} for task {1} is too old, task must be updated to {2}".format(installed_version, name, actual_version))

        f = io.StringIO()
        #with redirect_stdout(f):
        self.__cls = task_mod.Task()
        #print('Got stdout: "{0}"'.format(f.getvalue()))


    def __collect_argrefs(self, name2task):
        if not hasattr(self.__cls, 'refs'):
            return dict()
        inrefs = self.__cls.refs
        if not isinstance(inrefs, dict):
            raise Exception("Refs are not dict")

        logger.debug('refs found: {0}'.format(inrefs))

        outrefs = {}
        for name, ref in inrefs.items():
            rtask_name, rtask_retval_name = ref.split('.')
            rtask_res = name2task[rtask_name].get_result()
            if rtask_retval_name not in rtask_res:
                raise Exception('Task {0} does not return val {1} referenced by another task'.format(rtask_name, rtask_retval_name))
            outrefs[name] = rtask_res[rtask_retval_name]
        return outrefs


    @staticmethod
    def __run_wrapper(functor, args, refs, resdir, log, q, exc_q):
        try:
            res = functor(args, refs, resdir, log)
        except Exception as e:
            traceback.print_exc()
            exc_q.put({'Exited by exception': repr(e)})
            exc_q.task_done()
        else:
            if not isinstance(res, dict):
                res = {'result': res}
            q.put(res)
            q.task_done()


    def run(self, name2task):
        refs = self.__collect_argrefs(name2task)
        self.__q = JoinableQueue()
        self.__exc_q = JoinableQueue()
        self.__proc = Process(target=Task.__run_wrapper, args=(self.__cls, self.__opts['args'], refs, self.__resdir, self.__log, self.__q, self.__exc_q))
        self.__proc.start()


    def is_alive(self):
        return self.__proc is not None and self.__proc.is_alive()


    def probe(self):
        if self.__proc is None or self.__proc.is_alive():
            return

        self.__q.join()
        self.__exc_q.join()

        if not self.__exc_q.empty():
            self.__results = self.__exc_q.get()
        elif not self.__q.empty():
            self.__results = self.__q.get()
        else:
            self.__results = {}
        
        self.__proc.join()
        self.__proc = None
        self.__is_finished = True
    
    
    def is_finished(self):
        return self.__is_finished


    def get_result(self):
        return self.__results


    def get_name(self):
        return self.__name


    def get_log(self):
        with open(self.__log) as f:
            data = f.read()
        return data
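
Task.__run_wrapper() above reports the child's return value, or its exception, over a pair of JoinableQueues, and probe() joins both queues before reading whichever one is non-empty. Here is a minimal hedged sketch of that handoff; the divide() task is an illustrative stand-in for a real task module.

# Minimal sketch: a child process reports either a result or an exception through
# separate JoinableQueues; the parent joins, then reads. divide() is illustrative.
import traceback
from multiprocessing import JoinableQueue, Process


def run_wrapper(functor, args, result_q: JoinableQueue, exc_q: JoinableQueue) -> None:
    try:
        res = functor(*args)
    except Exception as e:                   # report the failure instead of dying silently
        exc_q.put({"Exited by exception": repr(e), "traceback": traceback.format_exc()})
        exc_q.task_done()
    else:
        result_q.put(res if isinstance(res, dict) else {"result": res})
        result_q.task_done()


def divide(a, b):
    return a / b                             # illustrative task body


if __name__ == "__main__":
    for args in ((10, 2), (1, 0)):           # the second call raises ZeroDivisionError
        result_q, exc_q = JoinableQueue(), JoinableQueue()
        p = Process(target=run_wrapper, args=(divide, args, result_q, exc_q))
        p.start()
        p.join()                             # a single small item cannot fill the pipe,
                                             # so joining the child first is safe here
        result_q.join()                      # both joins return at once: the child already
        exc_q.join()                         # matched its put() with task_done()
        print(exc_q.get() if not exc_q.empty() else result_q.get())
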
class MultiThreadedFlickrCrawler:
    ###########################################################################
    # System parameters and initializations
    ###########################################################################
    def __init__(self, cfg, category, max_num_images, communication_q, rate_limit):
        self.cfg = cfg
        self.category = category
        argv = self.cfg.vars
        self.communication_q = communication_q
        self.do_exit = False
        self.rate_limit = rate_limit
        self.rate_q = Queue()

        # flickr auth information: change these to your flickr api keys and secret
        self.flickrAPIkeys = argv["flickrAPIkeys"].split(', ')  # API key
        self.flickrAPIsecrets = argv["flickrAPIsecrets"].split(', ')  # shared "secret"
        self.queryFileName = argv["queryFileName"]  #'query_terms.txt'
        self.homeDir = argv["homeDir"]
        self.imagesPerDir = int(argv["imagesPerDir"])
        self.flickrerrors = 0

        # Crawler parameters
        self.resultsPerPage = int(argv["resultsPerPage"])
        self.downloadsPerQuery = int(argv["downloadsPerQuery"])
        self.numberOfThreads = int(argv["numberOfThreads"])
        self.startTime = int(argv["crawlerBeginTime"])  #1072915200 # 01/01/2004
        self.finalTime = int(time.time())
        self.singleDay = 86400  # 24hr*60min*60sec = 1day
        self.max_num_images = max_num_images
        self.database = argv["databaseName"]

        # Structures Initialization
        self.dbdir = DBDirectories(self.homeDir, argv["sysdir"], category)
        self.indexOfUniqueImages = self.dbdir.inf + 'imageIndex.txt'
        self.indexOfUniqueUsers = self.dbdir.inf + 'usersIndex.txt'
        self.recentUsers = dict()
        self.queryTerms = []

        # Multithreaded downloading of images
        self.queue = JoinableQueue()
        self.out_queue = JoinableQueue()
        self.threadsList = []
        for i in range(self.numberOfThreads):
            t = DownloadImageThread(self.queue, self.out_queue, self.dbdir.img, self.dbdir.txt, self.category,
                                    self.database)
            t.setDaemon(True)
            t.start()
            self.threadsList.append(t)

        print(("{} initialized".format(self.category)))

    ###########################################################################
    # Method to load query terms
    ###########################################################################
    def loadQueries(self):
        # Each term is a category
        self.queryTerms = [self.category]
        print(('positive queries:', self.queryTerms))
        list(map(lambda t: t.setValidTags(self.queryTerms), self.threadsList))
        return len(self.queryTerms)

    ###########################################################################
    # Method to load index of image names
    ###########################################################################
    def loadImageNamesIndex(self):
        print('Loading index of images')
        if os.path.exists(self.indexOfUniqueImages):
            self.allImageNames = dict(
                [(img.replace('\n', ''), True) for img in open(self.indexOfUniqueImages).readlines()])
            print(('Index with', len(self.allImageNames), 'names is ready to use'))
        else:
            self.allImageNames = dict()
            print(('No previous index found at {}'.format(self.indexOfUniqueImages)))
        print('Loading index of users')
        if os.path.exists(self.indexOfUniqueUsers):
            self.recentUsers = dict([(usr.replace('\n', ''), 1) for usr in open(self.indexOfUniqueUsers).readlines()])
            print(('Index with', len(self.recentUsers), 'users is ready to use'))
        else:
            self.recentUsers = dict()
            print(('No previous user index found at {}'.format(self.indexOfUniqueUsers)))

    ###########################################################################
    # Find out if an image is a duplicate or of a user already visited
    ###########################################################################
    def isDuplicateImage(self, flickrResult):
        b = flickrResult
        owner_date = b['owner'] + '_' + simpleDateFormat(b['datetaken'])
        imgName = b['server'] + '_' + b['id'] + '_' + b['secret'] + '_' + owner_date + '.jpg'
        alreadyIndexed = False
        userPhotos = 0

        if imgName in self.allImageNames:
            alreadyIndexed = self.allImageNames[imgName]
        else:
            self.allImageNames[imgName] = False

        if owner_date in self.recentUsers:
            userPhotos = self.recentUsers[owner_date]
        else:
            self.recentUsers[owner_date] = 0

        if (not alreadyIndexed) and userPhotos < 1:
            self.recentUsers[owner_date] += 1
            self.allImageNames[imgName] = True
            return False
        else:
            return True

    ###########################################################################
    #Find out if medium format of photo exists for download
    ###########################################################################
    def get_url(self, flickrResult, fapi, size):
        url = "https://farm{}.staticflickr.com/{}/{}_{}.jpg".format(flickrResult['farm'], flickrResult['server'], flickrResult['id'], flickrResult['secret'])
        return True, url

        #TODO find way to speed up actual url retrieval
        # image_id = flickrResult['id']
        # success = False
        # try:
        #     rsp = fapi.photos_getSizes(api_key=self.flickrAPIKey, photo_id=image_id)
        #     fapi.testFailure(rsp)
        # except:
        #      print sys.exc_info()[0]
        #      print ('Exception encountered while querying for urls\n')
        # else:
        #     if getattr(rsp, 'sizes', None):
        #         if int(rsp.sizes[0]['candownload']) == 1:
        #             if getattr(rsp.sizes[0], 'size', None):
        #                 for image_size in rsp.sizes[0].size:
        #                     if image_size['label'] == size:
        #                         return True, image_size['source']
        #
        # return False, ""

    ###########################################################################
    # Update index of unique image names
    ###########################################################################
    def updateImageNamesIndex(self, newImages):
        with open(self.indexOfUniqueImages, 'a') as indexFile:
            for img in newImages:
                indexFile.write(img + '\n')
        self.allImageNames = []

    ###########################################################################
    # Main Method. This runs the crawler in an infinite loop
    ###########################################################################
    def start(self):
        socket.setdefaulttimeout(30)  #30 second time out on sockets before they throw
        self.cfg.log(self.homeDir, "CRAWLER STARTED")
        while not self.do_exit:
            try:
                command = self.communication_q.get(False)
            except Empty as e:
                #Randomly choose flickrAPIkeys and flickrAPIsecrets
                currentKey = int(math.floor(random.random()*len(self.flickrAPIkeys)))
                # make a new FlickrAPI instance
                fapi = FlickrAPI(self.flickrAPIkeys[currentKey], self.flickrAPIsecrets[currentKey])
                num_queries = self.loadQueries()
                if num_queries == 0:
                    break
                newImages = []
                # Set time variables
                self.finalTime = int(time.time())
                currentTimeWindow = self.finalTime - self.startTime
                mintime = self.startTime + random.randint(0, currentTimeWindow)
                maxtime = mintime + 3 * self.singleDay
                print(('Since:', datetime.fromtimestamp(mintime)))
                print(('Until:', datetime.fromtimestamp(maxtime)))
                print(('Previous Users:', len(self.recentUsers)))
                self.loadImageNamesIndex()
                if len(self.allImageNames) > self.max_num_images:
                    print("Max Images reached")
                    break
                # Search Images using the query terms
                for current_tag in range(0, num_queries):
                    dirNumName = self.dbdir.uploadCurrentDirAndGetNext(self.imagesPerDir, self.queryTerms)
                    print(("Current Directory Number: ", dirNumName))
                    #form the query string.
                    query_string = self.queryTerms[current_tag]
                    print(('\n\nquery_string is ' + query_string))
                    #only visit 8 pages max, to try and avoid the dreaded duplicate bug.
                    #8 pages * 250 images  = 2000 images, should be duplicate safe.  Most interesting pictures will be taken.
                    num_visit_pages = 16
                    pagenum = 1
                    while ( pagenum <= num_visit_pages ):
                        if (self.rate_q.qsize()>self.rate_limit):
                            #Age out time stamps older than one hour
                            found_all = False
                            while(not found_all):
                                next_stamp = self.rate_q.get()
                                if time.time() - next_stamp < 3600:
                                    found_all = True
                                    self.rate_q.put(next_stamp)

                            #Wait to age out time stamps if exceeded rate limit
                            if (self.rate_q.qsize()>self.rate_limit):
                                next_stamp = self.rate_q.get()
                                remaining_time = 3600 - (time.time() - next_stamp)
                                time.sleep(remaining_time)
                        self.rate_q.put(time.time()+60)
                        try:
                            rsp = fapi.photos_search(api_key=self.flickrAPIkeys[currentKey], ispublic="1", media="photos",
                                                     per_page=str(self.resultsPerPage), page=str(pagenum),
                                                     sort="interestingness-desc", text=query_string,
                                                     extras="tags, original_format, license, geo, date_taken, date_upload, o_dims, views, description",
                                                     min_upload_date=str(mintime),
                                                     max_upload_date=str(maxtime))
                            fapi.testFailure(rsp)
                        except KeyboardInterrupt:
                            print('Keyboard exception while querying for images, exiting\n')
                            raise
                        except (IOError, SSLError) as e:
                            print(('Error on Flickr photo request:{}\n'.format(e.strerror)))
                        except FlickrExpatError as e:
                            print(('Exception encountered while querying for images: {}\n'.format(e.message)))
                            print(('{}: {} to {} page {}\n'.format(query_string, mintime, maxtime, pagenum)))
                            print((e.xmlstr))

                            #I've identified two possible causes of this error: (1)Bad Gateway and (2)bad unicode characters in xml
                            time.sleep(5) #Waiting is best cure for bad gateway
                            pagenum = pagenum + 1 #Skipping to next page is best cure for bad character

                            #Just in case it has some connection to the rate limit, change the key
                            #Randomly choose flickrAPIkeys and flickrAPIsecrets
                            currentKey = int(math.floor(random.random()*len(self.flickrAPIkeys)))
                            # make a new FlickrAPI instance
                            fapi = FlickrAPI(self.flickrAPIkeys[currentKey], self.flickrAPIsecrets[currentKey])

                            self.flickrerrors += 1
                            if self.flickrerrors > 5:
                                print(("Too many Flickr Expat Errors in {}: Exiting".format(self.category)))
                                exit(1)
                        except Exception as e:
                            print((sys.exc_info()[0]))
                            print('Exception encountered while querying for images\n')
                        else:
                            # Process results
                            if getattr(rsp, 'photos', None):
                                if getattr(rsp.photos[0], 'photo', None):
                                    random.shuffle(rsp.photos[0].photo)
                                    for k in range(0, min(self.downloadsPerQuery, len(rsp.photos[0].photo))):
                                        b = rsp.photos[0].photo[k]
                                        if not self.isDuplicateImage(b):
                                            isDownloadable, url = self.get_url(b, fapi, "Medium 640")
                                            if isDownloadable:
                                                b["url"] = url
                                                self.queue.put((b, dirNumName))
                                    print('Waiting threads')
                                    self.queue.join()
                                    while not self.out_queue.empty():
                                        newImages.append(self.out_queue.get())
                                    print((len(newImages), ' downloaded images'))
                            pagenum = pagenum + 1  #this is in the else exception block.  It won't increment for a failure.
                            num_visit_pages = min(4, int(rsp.photos[0]['pages']))
                            # End While of Pages
                # BEGIN: PROCESS DOWNLOADED IMAGES
                self.updateImageNamesIndex(newImages)
            else:
                if command == "exit":
                    self.do_exit = True
                    print(("Wait for safe exit {}".format(self.category)))

        print('End')
        self.cfg.log(self.homeDir, "CRAWLER STOPPED")
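
The crawler above throttles its Flickr API calls with a queue of request timestamps: entries older than an hour are aged out, and when the hourly budget is exhausted it sleeps until the oldest entry expires. Below is a minimal hedged sketch of that sliding-window limiter; the window length, the limit and the fake requests are illustrative values, not the crawler's configuration.

# Minimal sketch: sliding-window rate limiter backed by a queue of timestamps.
# Window length, limit and the fake requests are illustrative values.
import time
from queue import Queue

WINDOW_S = 3600          # keep timestamps for one hour, as the crawler above does
RATE_LIMIT = 3500        # requests allowed inside the window


def throttle(stamps: Queue) -> None:
    """Block until issuing one more request stays within the rate limit."""
    if stamps.qsize() > RATE_LIMIT:
        while True:                          # age out timestamps that left the window
            oldest = stamps.get()
            if time.time() - oldest < WINDOW_S:
                stamps.put(oldest)           # still inside the window, keep it
                break
        if stamps.qsize() > RATE_LIMIT:      # window still full: sleep until it is not
            oldest = stamps.get()
            time.sleep(max(0.0, WINDOW_S - (time.time() - oldest)))
    stamps.put(time.time())                  # record this request


if __name__ == "__main__":
    stamps: Queue = Queue()
    for i in range(5):                       # pretend to make a few API calls
        throttle(stamps)
        print("request", i, "allowed at", time.strftime("%H:%M:%S"))
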
Пример #43
0
class WebDav:
    NUM_WORKING_PROCESSES = 5

    def __init__(self, host, user, passwd, timeout=-999, logger=None):
        self.fp = dict()

        webdav_host = host

        self.webdav_host = webdav_host
        self.host = host
        self.user = user
        self.passwd = passwd
        self.processes = []
        self.file_queue = JoinableQueue(maxsize=0)
        self.result_queue = Queue(maxsize=0)

        self.is_alive = {
            "status": True
        }

        options = {
            'webdav_hostname': self.webdav_host,
            'webdav_login': self.user,
            'webdav_password': self.passwd
        }

        self.webdavClient = wc.Client(options)

        self.logger = logger
        self._tzinfo = TimeZoneMSK()

    def parent(self, path):
        return urn.Urn(path).parent()

    def path(self, path):
        return urn.Urn(path).path()

    def generate_file_info(self, file_path):
        info = self.webdavClient.info(file_path)

        is_dir = self.webdavClient.is_dir(file_path)
        is_link = False

        file_name = urn.Urn(file_path).filename().replace("/", "")
        file_dir = urn.Urn(file_path).parent()

        ext = ''
        divide = file_name.split('.')
        if len(divide) > 1:
            ext = divide[-1].lower()

        mtime = info['modified']

        file_info = {
            "is_dir": is_dir,
            "is_link": is_link,
            "name": file_name,
            "ext": ext,
            "path": file_dir,
            "owner": self.user,
            "mode": "600",
            "size": info['size'] if not is_dir else 0,
            "mtime": mtime,
            'mtime_str': str(mtime),
        }
        return file_info

    def _make_file_info(self, file_queue, result_queue, logger, timeout):
        while int(time.time()) < timeout:
            if file_queue.empty() is not True:
                file_path = file_queue.get()
                try:
                    file_info = self.generate_file_info(file_path)
                    result_queue.put(file_info)
                except UnicodeDecodeError as unicode_e:
                    logger.error(
                        "UnicodeDecodeError %s, %s" % (str(unicode_e), traceback.format_exc()))

                except IOError as io_e:
                    logger.error("IOError %s, %s" % (str(io_e), traceback.format_exc()))

                except Exception as other_e:
                    logger.error("Exception %s, %s" % (str(other_e), traceback.format_exc()))
                finally:
                    file_queue.task_done()
            else:
                time.sleep(REQUEST_DELAY)

    @staticmethod
    def to_byte(value):
        if isinstance(value, str):
            try:
                value = value.encode("utf-8")
            except UnicodeEncodeError:
                value = value.encode("ISO-8859-1")
        return value

    def size(self, path):
        try:
            return self.webdavClient.info(path)['size']
        except Exception as e:
            self.logger.error("Error in WebDav size(): %s, traceback = %s" % (str(e), traceback.format_exc()))
            return 0

    def info(self, path):
        return self.webdavClient.info(self.to_byte(path))

    def exists(self, path):
        return self.webdavClient.check(path)

    def isdir(self, path):
        return self.webdavClient.is_dir(path)

    def isfile(self, path):
        return not self.webdavClient.is_dir(self.to_byte(path))

    def list(self, path):
        flist = {
            "path": path,
            "items": []
        }

        try:
            self.webdavClient.check('/')
        except Exception:
            raise Exception("Error during establishing webdav connection")

        listdir = self.webdavClient.list(self.to_byte(path))
        self.logger.info("listdir=%s", listdir)

        time_limit = int(time.time()) + TIMEOUT_LIMIT

        self.file_queue = JoinableQueue(maxsize=0)
        self.result_queue = Queue(maxsize=0)

        for i in range(self.NUM_WORKING_PROCESSES):
            p = Process(target=self._make_file_info, args=(self.file_queue, self.result_queue, self.logger, time_limit))
            p.start()
            proc = psutil.Process(p.pid)
            proc.ionice(psutil.IOPRIO_CLASS_IDLE)
            proc.nice(20)
            self.logger.debug(
                    "ListDir worker #%s, set ionice = idle and nice = 20 for pid %s" % (
                        str(i), str(p.pid)))
            self.processes.append(p)

        for name in listdir:
            try:
                item_path = '{0}/{1}'.format(path, name)
                self.file_queue.put(item_path)
            except UnicodeDecodeError as e:
                self.logger.error(
                    "UnicodeDecodeError %s, %s" % (str(e), traceback.format_exc()))

            except IOError as e:
                self.logger.error("IOError %s, %s" % (str(e), traceback.format_exc()))

            except Exception as e:
                self.logger.error(
                    "Exception %s, %s" % (str(e), traceback.format_exc()))

        while not self.file_queue.empty():
            self.logger.debug("file_queue size = %s , empty = %s (timeout: %s/%s)" % (
                self.file_queue.qsize(), self.file_queue.empty(), str(int(time.time())), time_limit))
            time.sleep(REQUEST_DELAY)

        if self.file_queue.empty():
            self.logger.debug("join() file_queue until workers done jobs")
            self.file_queue.join()

        for p in self.processes:
            try:
                self.logger.debug("WebDav ListDir terminate worker process, pid = %s" % p.pid)
                kill(p.pid, signal.SIGKILL, self.logger)
            except OSError:
                self.logger.error(
                    "ListDir unable to terminate worker process, pid = %s" % p.pid)

        if self.is_alive['status'] is True:
            while not self.result_queue.empty():
                file_info = self.result_queue.get()
                flist["items"].append(file_info)

        return flist

    def listdir(self, path):
        listdir = self.webdavClient.list(path)

        listing = []
        for name in listdir:
            item_path = '{0}/{1}'.format(path, name)
            listing.append(item_path)
        return listing

    def remove(self, target):
        try:
            self.logger.debug("Removing target=%s" % target)
            if self.isdir(target):
                target += '/'
            self.webdavClient.unpublish(target)
            self.webdavClient.clean(target)
        except Exception as e:
            self.logger.error("Error in WebDav dir remove(): %s, traceback = %s" % (str(e), traceback.format_exc()))
            raise Exception

    def mkdir(self, path):
        self.logger.debug("Creating directory=%s" % path)
        return self.webdavClient.mkdir(path)

    def upload(self, source, target, overwrite=False, rename=None, operation_progress=None):
        result = {}
        file_list = {}

        succeed = []
        failed = []

        try:
            if rename is not None:
                target_path = os.path.join(target, rename)
            else:
                target_path = os.path.join(target, source)

            if not overwrite and self.exists(target_path):
                failed.append(source)
                raise Exception("File '%s' already exists and overwrite not permitted" % target_path)

            try:
                self.logger.debug("Uploading target_path=%s, source=%s" % (target_path, source))
                self.webdavClient.upload(target_path, source, operation_progress)
            except Exception as e:
                failed.append(source)
                self.logger.error("Error in WebDav upload(): %s, traceback = %s" % (str(e), traceback.format_exc()))
                raise Exception("Error during file uploading %s" % traceback.format_exc())

            succeed.append(source)

            file_list['succeed'] = succeed
            file_list['failed'] = failed

            result['success'] = True
            result['error'] = None
            result['file_list'] = file_list

            return result

        except Exception as e:
            self.logger.error("Error in WebDav upload(): %s, traceback = %s" % (str(e), traceback.format_exc()))

            file_list['succeed'] = succeed
            file_list['failed'] = failed

            result['success'] = False
            result['error'] = e
            result['file_list'] = file_list

            return result

    def download(self, source, target, operation_progress=None):
        result = {}
        file_list = {}

        succeed = []
        failed = []

        try:
            target_path = os.path.join(target, os.path.basename(source))

            try:
                self.logger.debug("Downloading source=%s, target_path=%s" % (source, target_path))
                self.webdavClient.download(source, target_path, operation_progress)
            except Exception as e:
                failed.append(source)
                self.logger.error("Error in WebDav download(): %s, traceback = %s" % (str(e), traceback.format_exc()))
                raise Exception("Error during file download")

            succeed.append(source)

            file_list['succeed'] = succeed
            file_list['failed'] = failed

            result['success'] = True
            result['error'] = None
            result['file_list'] = file_list

            return result

        except Exception as e:
            self.logger.error("Error in WebDav download(): %s, traceback = %s" % (str(e), traceback.format_exc()))

            file_list['succeed'] = succeed
            file_list['failed'] = failed

            result['success'] = False
            result['error'] = e
            result['file_list'] = file_list

            return result

    def copy_file(self, source, target, overwrite=False):
        result = {}
        file_list = {}

        succeed = []
        failed = []

        try:
            if not overwrite and self.exists(target):
                failed.append(source)
                raise Exception('file exists and cannot be overwritten')

            try:
                self.logger.debug("Copying file source=%s, target=%s" % (source, target))
                self.webdavClient.copy(source, target)

            except Exception as e:
                failed.append(source)
                raise Exception('Cannot copy file %s' % (e,))

            succeed.append(source)

            file_list['succeed'] = succeed
            file_list['failed'] = failed

            result['success'] = True
            result['error'] = None
            result['file_list'] = file_list

            return result

        except Exception as e:
            file_list['succeed'] = succeed
            file_list['failed'] = failed

            result['success'] = False
            result['error'] = e
            result['file_list'] = file_list

            return result

    def move_file(self, source, target, overwrite=False):
        result = {}
        file_list = {}

        succeed = []
        failed = []

        try:
            if not overwrite and self.exists(target):
                failed.append(source)
                raise Exception('file exists and cannot be overwritten')

            try:
                self.logger.debug("Moving file source=%s, target=%s" % (source, target))
                self.webdavClient.move(source, target)

            except Exception as e:
                failed.append(source)
                raise Exception('Cannot move file %s' % (e,))

            succeed.append(source)

            file_list['succeed'] = succeed
            file_list['failed'] = failed

            result['success'] = True
            result['error'] = None
            result['file_list'] = file_list

            return result

        except Exception as e:
            file_list['succeed'] = succeed
            file_list['failed'] = failed

            result['success'] = False
            result['error'] = e
            result['file_list'] = file_list

            return result

    def make_destination_dir(self, destination, overwrite):
        self.logger.info("making destination %s" % destination)
        if not self.exists(destination):
            self.mkdir(destination)
        elif overwrite and self.exists(destination) and not self.isdir(destination):
            self.remove(destination)
            self.mkdir(destination)
        elif not overwrite and self.exists(destination) and not self.isdir(destination):
            raise Exception("destination is not a dir")
        else:
            pass
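
Both the FindText and WebDav.list() examples drop each worker's scheduling priority with psutil right after start(), so a long scan does not starve the rest of the machine. Below is a minimal hedged sketch of just that step; note that ionice(psutil.IOPRIO_CLASS_IDLE) is Linux-specific, and on Linux the kernel clamps nice values at 19, which is why the sketch asks for 19 rather than the 20 requested above.

# Minimal sketch: start a worker process, then lower its CPU and I/O priority
# with psutil. IOPRIO_CLASS_IDLE is Linux-only; Linux clamps nice values at 19.
import time
from multiprocessing import Process

import psutil


def worker() -> None:
    time.sleep(2)                            # stand-in for a long, I/O-heavy scan


if __name__ == "__main__":
    p = Process(target=worker)
    p.start()

    proc = psutil.Process(p.pid)
    proc.nice(19)                            # lowest CPU priority
    if hasattr(psutil, "IOPRIO_CLASS_IDLE"):   # the constant only exists on Linux
        proc.ionice(psutil.IOPRIO_CLASS_IDLE)  # do I/O only when the disk is otherwise idle

    print("worker nice value:", proc.nice())
    p.join()
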
Пример #44
0
def main():
    jobs = JoinableQueue()
    result = JoinableQueue()

    numToProcess = -1
    scores = pd.DataFrame(columns=[
        'fmeasure', 'precision', 'recall', 'numResult', 'maxDistance',
        'topHits', 'contentWeight', 'relationWeight', 'hits', "contextSteps"
    ])
    manual_annotations = get_manual_annotations(numToProcess)
    manual_tuples = get_ir_tuples(manual_annotations)

    print(len(manual_annotations))
    for i in range(weighted_kmeans_clustering_passes):
        print("Training pass", i + 1)
        train_kmeans(manual_annotations.keys(), target_class_subtree)
        print("Complete.")

    for key in manual_annotations.keys():
        jobs.put(key)

    processed_count = Counter()

    for i in range(NUMBER_OF_PROCESSES):
        p = Process(target=work, args=(i, jobs, result, processed_count))
        p.daemon = True
        p.start()

    #work(1, jobs, result, processed_count)

    automated_annotations = {}
    distances = {}

    jobs.join()

    while not result.empty():
        dataset, classes = result.get()
        automated_annotations[dataset] = set(classes.keys())
        distances[dataset] = classes
        result.task_done()

    automated_tuples = get_ir_tuples(automated_annotations)
    hits = manual_tuples & automated_tuples
    misses = manual_tuples - automated_tuples
    precision = float(len(hits)) / len(automated_tuples)
    recall = float(len(hits)) / len(manual_tuples)
    fmeasure = 2 * (precision * recall) / (precision + recall)
    # print '\t'.join([str(x) for x in [precision, recall, fmeasure,
    #                              numResult, minScore, topHits]])
    scores = scores.append(dict(precision=precision,
                                recall=recall,
                                fmeasure=fmeasure,
                                hits=len(manual_tuples),
                                topHits=topHits,
                                maxDistance=maxDistance,
                                contextSteps=context_steps),
                           ignore_index=True)
    print '\n'
    print scores
    results_file = 'results.csv'
    if len(sys.argv) > 1:
        results_file = sys.argv[1]
    hit_curves = csv.writer(open(results_file, 'wb'), delimiter=",")
    hit_curves.writerow(['dataset', 'class', 'distance', 'hit'])

    for dataset, c in automated_tuples:
        distance = round(distances[dataset][c], 3)
        hit = 1 if (dataset, c) in manual_tuples else 0
        hit_curves.writerow([dataset, c, distance, hit])
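# The consumer function `work` used by main() above is not part of this
# snippet. The sketch below is a hypothetical outline of the shape main()
# expects: drain the `jobs` JoinableQueue, push (dataset, classes) pairs onto
# `result`, and always call task_done() so that jobs.join() can return.
# annotate() is a placeholder, and processed_count is left untouched here
# because its API is not shown.
from Queue import Empty      # Python 2, as in the code above; on Python 3: from queue import Empty

def annotate(key):
    # placeholder for the real annotation step returning {class: distance}
    return {}

def work(process_id, jobs, result, processed_count):
    while True:
        try:
            # the queue is fully populated before the workers start,
            # so an empty queue means there is nothing left to do
            key = jobs.get(True, 1)
        except Empty:
            break
        try:
            result.put((key, annotate(key)))
        finally:
            jobs.task_done()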
Example #45
0
class FileIO(BaseObj):
    """FileIO object

       Usage:
           from nfstest.file_io import FileIO

           # Instantiate FileIO object given top level directory
           x = FileIO(datadir="/tmp/data")

           # Run workload creating the top level directory if necessary
           x.run()
    """
    def __init__(self, **kwargs):
        """Constructor

           Initialize object's private data

           datadir:
               Top level directory where files will be created,
               it will be created if it does not exist
           seed:
               Seed to initialize the random number generator
               [default: automatically generated]
           nprocs:
               Number of processes to use [default: 1]
           runtime:
               Run time [default: 0 (indefinitely)]
           verbose:
               Verbose level: none|info|debug|dbg1-7|all [default: 'none']
           exiterr:
               Exit on first error [default: False]
           read:
               Read file percentage [default: 40]
           write:
               Write file percentage [default: 40]
           rdwr:
               Read/write file percentage [default: 20]
           randio:
               Random file access percentage [default: 50]
           iodelay:
               Seconds to delay I/O operations [default: 0.0]
           direct:
               Use direct I/O [default: False]
           rdwronly:
               Use read and write only, no rename, remove, etc. [default: False]
           create:
               Create file percentage [default: 5]
           odgrade:
               Open downgrade percentage [default: 10]
           osync:
               Open file with O_SYNC percentage [default: 20]
           fsync:
               Percentage of fsync after write [default: 5]
           rename:
               Rename file percentage [default: 5]
           remove:
               Remove file percentage [default: 5]
           trunc:
               Truncate file percentage [default: 5]
           ftrunc:
               Truncate opened file percentage [default: 5]
           link:
               Create hard link percentage [default: 2]
           slink:
               Create symbolic link percentage [default: 1]
           readdir:
               List contents of directory percentage [default: 1]
           lock:
               Lock file percentage [default: 20]
           unlock:
               Unlock file percentage [default: 80]
           tlock:
               Lock test percentage [default: 50]
           lockfull:
               Lock full file percentage [default: 50]
           minfiles:
               Minimum number of files to create before any file operation
               is executed [default: 10]
           fsizeavg:
               File size average [default: 1m]
           fsizedev:
               File size standard deviation [default: 256k]
           rsize:
               Read block size [default: 64k]
           rsizedev:
               Read block size standard deviation [default: 8k]
           wsize:
               Write block size [default: 64k]
           wsizedev:
               Write block size standard deviation [default: 8k]
           sizemult:
               Size multiplier [default: 1]
           createlog:
               Create log file [default: False]
           createlogs:
               Create a log file for each process [default: False]
           logdir:
               Log directory [default: '/tmp']
        """
        self.progname   = os.path.basename(sys.argv[0])
        self.datadir    = kwargs.pop("datadir",    None)
        self.seed       = kwargs.pop("seed",       P_SEED)
        self.nprocs     = kwargs.pop("nprocs",     P_NPROCS)
        self.runtime    = kwargs.pop("runtime",    P_RUNTIME)
        self.verbose    = kwargs.pop("verbose",    P_VERBOSE)
        self.createlog  = kwargs.pop("createlog",  P_CREATELOG)
        self.createlogs = kwargs.pop("createlogs", P_CREATELOGS)
        self.create     = kwargs.pop("create",     P_CREATE)
        self.osync      = kwargs.pop("osync",      P_OSYNC)
        self.fsync      = kwargs.pop("fsync",      P_FSYNC)
        self.read       = kwargs.pop("read",       None)
        self.write      = kwargs.pop("write",      None)
        self.rdwr       = kwargs.pop("rdwr",       None)
        self.odgrade    = kwargs.pop("odgrade",    P_ODGRADE)
        self.randio     = kwargs.pop("randio",     P_RANDIO)
        self.rdwronly   = kwargs.pop("rdwronly",   P_RDWRONLY)
        self.iodelay    = kwargs.pop("iodelay",    P_IODELAY)
        self.direct     = kwargs.pop("direct",     P_DIRECT)
        self.logdir     = kwargs.pop("logdir",     P_TMPDIR)
        self.exiterr    = kwargs.pop("exiterr",    False)
        self.minfiles   = kwargs.pop("minfiles",   str(MIN_FILES))

        if self.datadir is None:
            print "Error: datadir is required"
            sys.exit(2)

        data = [int(x) for x in self.minfiles.split(",")]
        if len(data) == 1:
            self.up_minfiles = -1
            self.top_minfiles  = data[0]
            self.bot_minfiles  = data[0]
        elif len(data) > 1:
            self.up_minfiles = 0
            self.top_minfiles  = max(data)
            self.bot_minfiles  = min(data)
        else:
            print "Error: option minfiles must be an integer or two integers separated by a ',': %s" % self.minfiles
            sys.exit(2)
        self.minfiles = self.top_minfiles

        if self.rdwronly:
            # When rdwronly option is given, set all options for manipulating
            # files to zero if not explicitly given
            self.rename   = kwargs.pop("rename",   0)
            self.remove   = kwargs.pop("remove",   0)
            self.trunc    = kwargs.pop("trunc",    0)
            self.ftrunc   = kwargs.pop("ftrunc",   0)
            self.link     = kwargs.pop("link",     0)
            self.slink    = kwargs.pop("slink",    0)
            self.readdir  = kwargs.pop("readdir",  0)
            self.lock     = kwargs.pop("lock",     0)
            self.unlock   = kwargs.pop("unlock",   0)
            self.tlock    = kwargs.pop("tlock",    0)
            self.lockfull = kwargs.pop("lockfull", 0)
        else:
            self.rename   = kwargs.pop("rename",   P_RENAME)
            self.remove   = kwargs.pop("remove",   P_REMOVE)
            self.trunc    = kwargs.pop("trunc",    P_TRUNC)
            self.ftrunc   = kwargs.pop("ftrunc",   P_FTRUNC)
            self.link     = kwargs.pop("link",     P_LINK)
            self.slink    = kwargs.pop("slink",    P_SLINK)
            self.readdir  = kwargs.pop("readdir",  P_READDIR)
            self.lock     = kwargs.pop("lock",     P_LOCK)
            self.unlock   = kwargs.pop("unlock",   P_UNLOCK)
            self.tlock    = kwargs.pop("tlock",    P_TLOCK)
            self.lockfull = kwargs.pop("lockfull", P_LOCKFULL)

        # Get size multiplier
        self.sizemult  = convert_str(kwargs.pop("sizemult", P_SIZEMULT))
        # Convert sizes and apply multiplier
        self.fsizeavg  = int(self.sizemult * convert_str(kwargs.pop("fsizeavg", P_FILESIZE)))
        self.fsizedev  = int(self.sizemult * convert_str(kwargs.pop("fsizedev", P_FSIZEDEV)))
        self.rsize     = int(self.sizemult * convert_str(kwargs.pop("rsize",    P_RSIZE)))
        self.wsize     = int(self.sizemult * convert_str(kwargs.pop("wsize",    P_WSIZE)))
        self.rsizedev  = int(self.sizemult * convert_str(kwargs.pop("rsizedev", P_RSIZEDEV)))
        self.wsizedev  = int(self.sizemult * convert_str(kwargs.pop("wsizedev", P_WSIZEDEV)))

        if self.direct:
            # When using direct I/O, use fixed read/write block sizes
            self.rsizedev = 0
            self.wsizedev = 0

        # Initialize counters
        self.rbytes   = 0
        self.wbytes   = 0
        self.nopen    = 0
        self.nopendgr = 0
        self.nosync   = 0
        self.nclose   = 0
        self.nread    = 0
        self.nwrite   = 0
        self.nfsync   = 0
        self.nrename  = 0
        self.nremove  = 0
        self.ntrunc   = 0
        self.nftrunc  = 0
        self.nlink    = 0
        self.nslink   = 0
        self.nreaddir = 0
        self.nlock    = 0
        self.nunlock  = 0
        self.ntlock   = 0
        self.stime    = 0

        # Set read and write option percentages
        total = 100
        if self.rdwr is None:
            if self.read is None and self.write is None:
                # All read and write options are not given, use defaults
                self.read  = P_READ
                self.write = P_WRITE
                self.rdwr  = P_RDWR
            elif self.read is None or self.write is None:
                # If only read or write is given, don't use rdwr
                self.rdwr = 0
            else:
                # If both read and write are given, set rdwr to add up to 100
                self.rdwr = max(0, total - self.read - self.write)
        else:
            # Option rdwr is given, calculate remainder left for read and write
            total -= self.rdwr

        if self.read is None and self.write is None:
            # Only rdwr is given, distribute remainder equally
            # between read and write
            self.read = int(total/2)
            self.write = total - self.read
        elif self.read is None and self.write is not None:
            # Option rdwr and write are given, set read percentage
            self.read = total - self.write
        elif self.read is not None and self.write is None:
            # Option rdwr and read are given, set write percentage
            self.write = total - self.read

        # Verify read and write options add up to 100 percent
        total = abs(self.read) + abs(self.write) + abs(self.rdwr)
        if total != 100:
            print "Total for read, write and rdwr must be == 100"
            sys.exit(2)

        # Set verbose level mask
        self.debug_level(self.verbose)

        # Set timestamp format to include the date and time
        self.tstamp(fmt="{0:date:%Y-%m-%d %H:%M:%S.%q}  ")

        self.logbase = None
        if self.createlog or self.createlogs:
            # Create main log file
            datetimestr = self.timestamp("{0:date:%Y%m%d%H%M%S_%q}")
            logname = "%s_%s" % (self.progname, datetimestr)
            self.logbase = os.path.join(self.logdir, logname)
            self.logfile = self.logbase + ".log"
            self.open_log(self.logfile)

        # Multiprocessing
        self.tid   = 0
        self.queue = None

        # Memory buffers
        self.fbuffers = []
        self.PAGESIZE = os.sysconf(os.sysconf_names['SC_PAGESIZE'])

        # Load shared library for calling C library functions
        try:
            # Linux
            self.libc = ctypes.CDLL('libc.so.6', use_errno=True)
        except:
            # MacOS
            self.libc = ctypes.CDLL('libc.dylib', use_errno=True)
        self.libc.malloc.argtypes = [ctypes.c_long]
        self.libc.malloc.restype = ctypes.c_void_p
        self.libc.posix_memalign.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_long, ctypes.c_long]
        self.libc.posix_memalign.restype = ctypes.c_int
        self.libc.read.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_long]
        self.libc.read.restype = ctypes.c_int
        self.libc.write.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_long]
        self.libc.write.restype = ctypes.c_int
        self.libc.lseek.argtypes = [ctypes.c_int, ctypes.c_long, ctypes.c_int]
        self.libc.lseek.restype = ctypes.c_long
        self.libc.memcpy.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_long]
        self.libc.memcpy.restype = ctypes.c_void_p

    def __del__(self):
        """Destructor"""
        if getattr(self, 'logfile', None):
            print "\nLogfile: %s" % self.logfile

    def _dprint(self, level, msg):
        """Local dprint function, if called from a subprocess send the
           message to the main process, otherwise use dprint on message
        """
        if self.queue and not self.createlogs:
            # Send message to main process
            self.queue.put([level,msg])
        else:
            # Display message and send it to the log file
            self.dprint(level, msg)

    def _get_tree(self):
        """Read top level directory for existing files to populate database
           This is used so it can be run in the same top level directory
           multiple times
        """
        for entry in os.listdir(self.datadir):
            # Must match file names given by _newname
            if not re.search(r'^f[\dA-F]+$', entry):
                continue
            # Get tid from file name
            tid = int(entry[1:self.bidx], 16)
            if self.tid != tid:
                continue
            # Get index from file name and set it
            index = int(entry[self.bidx:], 16)
            if self.n_index <= index:
                self.n_index = index + 1

            # Get file size and append it to database
            absfile = os.path.join(self.datadir, entry)
            try:
                fst = os.stat(absfile)
                size = fst.st_size
            except:
                size = 0
            fileobj = FileObj(name=entry, size=size)
            fileobj.debug_repr(1)
            if os.path.islink(absfile):
                fileobj.srcname = os.path.basename(os.readlink(absfile))
            self.n_files.append(fileobj)

    def _newname(self):
        """Create new file name"""
        name = "%s%06X" % (self.basename, self.n_index)
        self.n_index += 1
        return name

    def _percent(self, pvalue):
        """Test percent value"""
        if pvalue >= 100:
            return True
        elif pvalue <= 0:
            return False
        return self.random.randint(0,99) < pvalue

    def _get_fileobj(self):
        """Get a random file object"""
        # Number of files available
        nlen = len(self.n_files)
        self.findex = self.random.randint(0, nlen-1)
        return self.n_files[self.findex]

    def _getiolist(self, size, iswrite):
        """Return list of I/O blocks to read/write"""
        iolist = []
        if iswrite:
            bsize = self.wsize
            bdev  = self.wsizedev
        else:
            bsize = self.rsize
            bdev  = self.rsizedev

        tsize = 0
        offset = 0
        while tsize < size:
            block = {}
            if self.direct:
                # Direct I/O uses same block size for all blocks
                blocksize = bsize
            else:
                # Buffered I/O uses different block sizes
                blocksize = int(abs(self.random.gauss(bsize, bdev)))
            if tsize + blocksize > size:
                # Use remaining bytes for last block
                blocksize = size - tsize
            iolist.append({'offset':offset, 'write':iswrite, 'size':blocksize})
            offset += blocksize
            tsize += blocksize

        return iolist

    def _mem_alloc(self, size, aligned=False):
        """Allocate memory for use in C library functions"""
        dbuffer = None
        if aligned:
            # Allocate aligned buffer
            dbuffer = ctypes.c_void_p()
            self.libc.posix_memalign(ctypes.byref(dbuffer), self.PAGESIZE, size)
        else:
            # Allocate regular buffer
            dbuffer = self.libc.malloc(size)
        # Add allocated buffer so it can be freed
        self.fbuffers.append(dbuffer)
        return dbuffer

    def _getlock(self, name, fd, lock_type=None, offset=0, length=0, lock=None, tlock=False):
        """Get byte range lock on file given by file descriptor"""
        n = self.random.randint(0,99)
        stype = fcntl.F_SETLK
        if lock_type == fcntl.F_UNLCK:
            lstr = "UNLOCK"
            if not lock or n >= self.unlock:
                # Do not unlock file
                return
            self.nunlock += 1
        else:
            if tlock:
                # Just do TLOCK
                lstr = "TLOCK "
                stype = fcntl.F_GETLK
                if n >= self.tlock:
                    # No lock, so no tlock
                    return
                self.ntlock += 1
            else:
                lstr = "LOCK  "
                if n >= self.lock:
                    # No lock
                    return
                self.nlock += 1
            if lock_type is None:
                # Choose lock: read or write
                if self._percent(50):
                    lock_type = fcntl.F_RDLCK
                else:
                    lock_type = fcntl.F_WRLCK
            if not tlock:
                # LOCK is requested, but do TLOCK before actual lock
                self._getlock(name, fd, lock_type=lock_type, offset=offset, length=length, lock=lock, tlock=True)
        fstr = ""
        if offset == 0 and length == 0 and lstr == "LOCK  ":
            fstr = " full file"
        self._dprint("DBG4", "%s  %s %d @ %d (%s)%s" % (lstr, name, length, offset, LOCKMAP[lock_type], fstr))
        lockdata = struct.pack('hhllhh', lock_type, 0, offset, length, 0, 0)
        return fcntl.fcntl(fd, stype, lockdata)

    def _do_io(self, **kwargs):
        """Read or write to the given file descriptor"""
        fd       = kwargs.pop("fd", None)
        write    = kwargs.pop("write", False)
        offset   = kwargs.pop("offset", 0)
        size     = kwargs.pop("size", 0)
        fileobj  = kwargs.pop("fileobj", None)
        lockfull = kwargs.pop("lockfull", True)
        lockout  = None

        if self.iodelay > 0.0:
            time.sleep(self.iodelay)

        # Set file offset to read/write
        os.lseek(fd, offset, os.SEEK_SET)

        if write:
            if self.random and not lockfull:
                # Lock file segment
                lockout = self._getlock(fileobj.name, fd, lock_type=fcntl.F_WRLCK, offset=offset, length=size)
            data = 'x' * size
            self._dprint("DBG5", "WRITE   %s %d @ %d" % (fileobj.name, size, offset))

            if self.direct:
                # Direct I/O -- use native write function
                count = self.libc.write(fd, self.wbuffer, size)
            else:
                # Buffered I/O
                count = os.write(fd, data)
                if self._percent(self.fsync):
                    self._dprint("DBG4", "FSYNC   %s" % fileobj.name)
                    self.nfsync += 1
                    os.fsync(fd)

            self.nwrite += 1
            self.wbytes += count
            fsize = offset + count
            if fileobj.size < fsize:
                fileobj.size = fsize
        else:
            if self.random and not lockfull:
                # Lock file segment
                lockout = self._getlock(fileobj.name, fd, lock_type=fcntl.F_RDLCK, offset=offset, length=size)
            self._dprint("DBG5", "READ    %s %d @ %d" % (fileobj.name, size, offset))

            if self.direct:
                # Direct I/O -- use native read function
                count = self.libc.read(fd, self.rbuffer, size)
            else:
                # Buffered I/O
                data = os.read(fd, size)
                count = len(data)
            self.rbytes += count
            self.nread += 1

        if self.random and not lockfull:
            # Unlock file segment
            self._getlock(fileobj.name, fd, lock_type=fcntl.F_UNLCK, offset=offset, length=size, lock=lockout)
        return count

    def _do_file(self):
        """Operate on a file, create, read, truncate, etc."""
        self.absfile = ""
        # Number of files available
        nlen = len(self.n_files)
        if self.up_minfiles == 0 and nlen > self.minfiles:
            self.minfiles = self.bot_minfiles
            self.up_minfiles = 1
        if self.up_minfiles > 0 and nlen < self.minfiles:
            self.minfiles = self.top_minfiles
            self.up_minfiles = 0

        if nlen > self.minfiles and self._percent(self.trunc):
            # Truncate file using the file name
            fileobj = self._get_fileobj()
            self.absfile = os.path.join(self.datadir, fileobj.name)
            # Choose new size at random
            nsize = self.random.randint(0, fileobj.size + self.wsizedev)
            self._dprint("DBG2", "TRUNC   %s %d -> %d" % (fileobj.name, fileobj.size, nsize))
            out = self.libc.truncate(self.absfile, nsize)
            if out == -1:
                err = ctypes.get_errno()
                if hasattr(fileobj, 'srcname') and err == errno.ENOENT:
                    # Make sure not to fail if it is a broken symbolic link
                    self._dprint("DBG2", "TRUNC   %s: broken symbolic link" % fileobj.name)
                    return
                raise OSError(err, os.strerror(err), fileobj.name)
            else:
                self.ntrunc += 1
                fileobj.size = nsize
            return

        if nlen > self.minfiles and self._percent(self.rename):
            # Rename file
            fileobj = self._get_fileobj()
            name = self._newname()
            self.absfile = os.path.join(self.datadir, fileobj.name)
            newfile = os.path.join(self.datadir, name)
            self._dprint("DBG2", "RENAME  %s -> %s" % (fileobj.name, name))
            os.rename(self.absfile, newfile)
            self.nrename += 1
            fileobj.name = name
            return

        if nlen > self.minfiles and self._percent(self.remove):
            # Remove file
            fileobj = self._get_fileobj()
            self.absfile = os.path.join(self.datadir, fileobj.name)
            self._dprint("DBG2", "REMOVE  %s" % fileobj.name)
            os.unlink(self.absfile)
            self.nremove += 1
            self.n_files.pop(self.findex)
            return

        if nlen > self.minfiles and self._percent(self.link):
            # Create hard link
            name = self._newname()
            self.absfile = os.path.join(self.datadir, name)
            index = 0
            while True:
                index += 1
                fileobj = self._get_fileobj()
                if not hasattr(fileobj, 'srcname'):
                    # This file is not a symbolic link, use it
                    break
                if index >= 10:
                    self.absfile = os.path.join(self.datadir, fileobj.name)
                    raise Exception("Unable to find a valid source file for hard link")
            srcfile = os.path.join(self.datadir, fileobj.name)
            self._dprint("DBG2", "LINK    %s -> %s" % (name, fileobj.name))
            os.link(srcfile, self.absfile)
            self.nlink += 1
            linkobj = FileObj(name=name, size=fileobj.size)
            self.n_files.append(linkobj)
            return

        if nlen > self.minfiles and self._percent(self.slink):
            # Create symbolic link
            name = self._newname()
            self.absfile = os.path.join(self.datadir, name)
            index = 0
            while True:
                index += 1
                fileobj = self._get_fileobj()
                if not hasattr(fileobj, 'srcname'):
                    # This file is not a symbolic link, use it
                    break
                if index >= 10:
                    self.absfile = os.path.join(self.datadir, fileobj.name)
                    raise Exception("Unable to find a valid source file for symbolic link")
            self._dprint("DBG2", "SLINK   %s -> %s" % (name, fileobj.name))
            os.symlink(fileobj.name, self.absfile)
            self.nslink += 1
            slinkobj = FileObj(name=name, size=fileobj.size, srcname=fileobj.name)
            self.n_files.append(slinkobj)
            return

        if nlen > self.minfiles and self._percent(self.readdir):
            # Read directory
            count = self.random.randint(1,99)
            self._dprint("DBG2", "READDIR %s maxentries: %d" % (self.datadir, count))
            self.absfile = self.datadir
            fd = self.libc.opendir(self.datadir)
            index = 0
            while True:
                dirent = self.libc.readdir(fd)
                if dirent == 0 or index >= count:
                    break
                index += 1
            out = self.libc.closedir(fd)
            self.nreaddir += 1
            return

        # Select type of open: read, write or rdwr
        total = self.read + self.write
        rn = self.random.randint(0,99)
        if rn < self.read:
            oflags = os.O_RDONLY
            oflist = ["O_RDONLY"]
        elif rn < total:
            oflags = os.O_WRONLY
            oflist = ["O_WRONLY"]
        else:
            oflags = os.O_RDWR
            oflist = ["O_RDWR"]

        # Set create file flag
        if nlen < self.minfiles:
            # Create at least self.minfiles before any other operation
            cflag = True
        else:
            cflag = self._percent(self.create)

        if cflag:
            # Create new name
            name = self._newname()
            fileobj = FileObj(name=name, size=0)
            self.n_files.append(fileobj)
            if oflags == os.O_RDONLY:
                # Creating file, must be able to write
                oflags = os.O_WRONLY
                oflist = ["O_WRONLY"]
            oflags |= os.O_CREAT
            oflist.append("O_CREAT")
        else:
            # Use name chosen at random
            fileobj = self._get_fileobj()

        if "O_RDONLY" not in oflist and self._percent(self.osync):
            # Add O_SYNC flag when opening file for writing
            oflags |= os.O_SYNC
            oflist.append("O_SYNC")
            self.nosync += 1

        if self.direct:
            # Open file for direct I/O
            oflags |= os.O_DIRECT
            oflist.append("O_DIRECT")

        # Select random or sequential I/O
        sstr = "sequen"
        if self._percent(self.randio):
            sstr = "random"

        ostr = "|".join(oflist)

        fd = None
        index = 0
        is_symlink = False
        while fd is None:
            try:
                index += 1
                if hasattr(fileobj, 'srcname'):
                    is_symlink = True
                self.absfile = os.path.join(self.datadir, fileobj.name)
                self._dprint("DBG2", "OPEN    %s %s %s" % (fileobj.name, sstr, ostr))
                fd = os.open(self.absfile, oflags)
                st = os.fstat(fd)
                if is_symlink:
                    self._dprint("DBG6", "OPEN    %s inode:%d symlink" % (fileobj.name, st.st_ino))
                    absfile = os.path.join(self.datadir, fileobj.srcname)
                    st = os.stat(absfile)
                    self._dprint("DBG6", "OPEN    %s inode:%d src:%s" % (fileobj.name, st.st_ino, fileobj.srcname))
                else:
                    self._dprint("DBG6", "OPEN    %s inode:%d" % (fileobj.name, st.st_ino))
            except OSError as openerr:
                if is_symlink and openerr.errno == errno.ENOENT:
                    self._dprint("DBG2", "OPEN    %s: broken symbolic link" % fileobj.name)
                    if index >= 10:
                        # Do not exit execution, just return to select another operation
                        return
                    # Choose a new name at random
                    fileobj = self._get_fileobj()
                    is_symlink = False
                else:
                    # Unknown error
                    raise
        self.nopen += 1

        # Get file size for writing
        size = int(abs(self.random.gauss(self.fsizeavg, self.fsizedev)))

        odgrade = False
        if oflags & os.O_WRONLY == os.O_WRONLY:
            lock_type = fcntl.F_WRLCK
            iolist = self._getiolist(size, True)
        elif oflags & os.O_RDWR == os.O_RDWR:
            lock_type = None
            iolist  = self._getiolist(size, True)
            iolist += self._getiolist(size, False)
            if self._percent(self.odgrade):
                odgrade = True
        else:
            lock_type = fcntl.F_RDLCK
            size = fileobj.size
            if size == 0:
                # File does not have any data, at least try to read one block
                size = self.rsize
            iolist = self._getiolist(size, False)

        if sstr == "random":
            # Shuffle I/O list for random access
            self.random.shuffle(iolist)

        # Lock full file if necessary
        lockfull = False
        if self._percent(self.lockfull):
            lockfull = True
            lockfout = self._getlock(fileobj.name, fd, lock_type=lock_type, offset=0, length=0)

        if nlen > self.minfiles and "O_RDONLY" not in oflist and self._percent(self.ftrunc):
            # Truncate file using the file descriptor
            # Choose new size at random
            nsize = self.random.randint(0, fileobj.size + self.wsizedev)
            self._dprint("DBG2", "FTRUNC  %s %d -> %d" % (fileobj.name, fileobj.size, nsize))
            os.ftruncate(fd, nsize)
            self.nftrunc += 1
            fileobj.size = nsize

        # Read or write the file
        for item in iolist:
            if self.runtime > 0 and time.time() >= self.s_time + self.runtime:
                # Runtime has been reached
                break
            self._do_io(**dict(fd=fd, fileobj=fileobj, lockfull=lockfull, **item))

        if lockfull:
            # Unlock full file
            self._getlock(fileobj.name, fd, lock_type=fcntl.F_UNLCK, offset=0, length=0, lock=lockfout)

        fdr = None
        fdroffset = 0
        if odgrade:
            # Open downgrade sequence: the file has already been opened
            # for read and write, so open it again for reading only and
            # then close the read/write file descriptor
            self._dprint("DBG2", "OPENDGR %s" % fileobj.name)
            fdr = os.open(self.absfile, os.O_RDONLY)
            self.nopendgr += 1
            count = self._do_io(fd=fdr, offset=fdroffset, size=self.rsize, fileobj=fileobj)
            fdroffset += count

        # Close main file descriptor
        self._dprint("DBG3", "CLOSE   %s" % fileobj.name)
        os.close(fd)
        self.nclose += 1

        if odgrade:
            for i in xrange(10):
                count = self._do_io(fd=fdr, offset=fdroffset, size=self.rsize, fileobj=fileobj)
                fdroffset += count
            self._dprint("DBG3", "CLOSE   %s" % fileobj.name)
            os.close(fdr)
            self.nclose += 1

        return

    def run_process(self, tid=0):
        """Main loop for each process"""
        ret = 0
        stime = time.time()
        self.tid = tid
        self.n_index = 1
        self.n_files = []
        self.s_time  = stime

        # Setup signal handler to gracefully terminate process
        signal.signal(signal.SIGTERM, stop_handler)

        # Set file base name according to the number processes
        self.bidx = 1 + max(2, len("{0:x}".format(max(0,self.nprocs-1))))
        self.basename = "f{0:0{width}X}".format(self.tid, width=self.bidx-1)

        if self.createlogs:
            # Open a log file for each process
            if self.nprocs <= 10:
                self.logfile = self.logbase + "_%d.log" % self.tid
            elif self.nprocs <= 100:
                self.logfile = self.logbase + "_%02d.log" % self.tid
            elif self.nprocs <= 1000:
                self.logfile = self.logbase + "_%03d.log" % self.tid
            else:
                self.logfile = self.logbase + "_%04d.log" % self.tid
            self.open_log(self.logfile)

        # Read top level directory and populate file database when
        # a previous instance was run on the same top level directory
        self._get_tree()

        # Create random object and initialize its seed for this process
        self.random = Random()
        self.random.seed(self.seed + tid)

        if self.direct:
            # Round up to nearest PAGESIZE boundary
            rsize = self.rsize + (self.PAGESIZE - self.rsize)%self.PAGESIZE
            wsize = self.wsize + (self.PAGESIZE - self.wsize)%self.PAGESIZE
            self._dprint("DBG7", "Allocating aligned read buffer of size %d" % rsize)
            self.rbuffer = self._mem_alloc(rsize, aligned=True)
            self._dprint("DBG7", "Allocating aligned write buffer of size %d" % wsize)
            self.wbuffer = self._mem_alloc(wsize, aligned=True)
            pdata = ctypes.create_string_buffer('x' * wsize)
            self.libc.memcpy(self.wbuffer, pdata, wsize)

        count = 0
        while True:
            try:
                self._do_file()
            except TermSignal:
                # SIGTERM has been raised, so stop running and send stats
                break
            except Exception:
                errstr = "ERROR on file object %s (process #%d)\n" % (self.absfile, self.tid)
                errstr += "Directory i-node: %d\n" % self.datadir_st.st_ino
                ioerror = traceback.format_exc()
                self._dprint("INFO", errstr+ioerror)
                ret = 1
                break
            ctime = time.time()
            if self.runtime > 0 and ctime >= stime + self.runtime:
                # Runtime has been reached
                break
            count += 1
        if self.queue:
            # Send all counts to main process
            self.queue.put(["RBYTES",   self.rbytes])
            self.queue.put(["WBYTES",   self.wbytes])
            self.queue.put(["NOPEN",    self.nopen])
            self.queue.put(["NOPENDGR", self.nopendgr])
            self.queue.put(["NOSYNC",   self.nosync])
            self.queue.put(["NCLOSE",   self.nclose])
            self.queue.put(["NREAD",    self.nread])
            self.queue.put(["NWRITE",   self.nwrite])
            self.queue.put(["NFSYNC",   self.nfsync])
            self.queue.put(["NRENAME",  self.nrename])
            self.queue.put(["NREMOVE",  self.nremove])
            self.queue.put(["NTRUNC",   self.ntrunc])
            self.queue.put(["NFTRUNC",  self.nftrunc])
            self.queue.put(["NLINK",    self.nlink])
            self.queue.put(["NSLINK",   self.nslink])
            self.queue.put(["NREADDIR", self.nreaddir])
            self.queue.put(["NLOCK",    self.nlock])
            self.queue.put(["NTLOCK",   self.ntlock])
            self.queue.put(["NUNLOCK",  self.nunlock])
            self.queue.put(["RETVALUE", ret])

        if self.direct:
            self._dprint("DBG7", "Free data buffers")
            for dbuffer in self.fbuffers:
                self.libc.free(dbuffer)
        self.close_log()
        return ret

    def run(self):
        """Main function where all processes are started"""
        errors = 0
        if self.seed is None:
            # Create random seed
            self.seed = int(1000.0*time.time())

        # Main seed so run can be reproduced
        self.dprint("INFO", "SEED = %d" % self.seed)
        # Flush log file descriptor to make sure above info is not written
        # to all log files when using multiple logs for each subprocess
        self.flush_log()
        stime = time.time()

        if not os.path.exists(self.datadir):
            # Create top level directory if it does not exist
            os.mkdir(self.datadir, 0777)
        self.datadir_st = os.stat(self.datadir)

        if self.nprocs > 1:
            # setup interprocess queue
            self.queue = JoinableQueue()
            processes = []
            for i in xrange(self.nprocs):
                # Run each subprocess with its own process id (tid)
                # The process id is used to set the random number generator
                # and also to have each process work with different files
                process = Process(target=self.run_process, kwargs={'tid':self.tid})
                processes.append(process)
                process.start()
                self.tid += 1
            done = False
            while not done:
                # Wait for a short time so main process does not hog the CPU
                # by checking the queue continuously
                time.sleep(0.1)
                while not self.queue.empty():
                    # Get any pending messages from any of the processes
                    level, msg = self.queue.get()
                    # Check if message is a valid count first
                    if level == "RBYTES":
                        self.rbytes += msg
                    elif level == "WBYTES":
                        self.wbytes += msg
                    elif level == "NOPEN":
                        self.nopen += msg
                    elif level == "NOPENDGR":
                        self.nopendgr += msg
                    elif level == "NOSYNC":
                        self.nosync += msg
                    elif level == "NCLOSE":
                        self.nclose += msg
                    elif level == "NREAD":
                        self.nread += msg
                    elif level == "NWRITE":
                        self.nwrite += msg
                    elif level == "NFSYNC":
                        self.nfsync += msg
                    elif level == "NRENAME":
                        self.nrename += msg
                    elif level == "NREMOVE":
                        self.nremove += msg
                    elif level == "NTRUNC":
                        self.ntrunc += msg
                    elif level == "NFTRUNC":
                        self.nftrunc += msg
                    elif level == "NLINK":
                        self.nlink += msg
                    elif level == "NSLINK":
                        self.nslink += msg
                    elif level == "NREADDIR":
                        self.nreaddir += msg
                    elif level == "NLOCK":
                        self.nlock += msg
                    elif level == "NTLOCK":
                        self.ntlock += msg
                    elif level == "NUNLOCK":
                        self.nunlock += msg
                    elif level == "RETVALUE":
                        if msg != 0:
                            errors += 1
                            if self.exiterr:
                                # Exit on first error
                                for process in list(processes):
                                    process.terminate()
                                break
                    else:
                        # Message is not any of the valid counts,
                        # so treat it as a debug message
                        self.dprint(level, msg)
                # Check if any process has finished
                for process in list(processes):
                    if not process.is_alive():
                        process.join()
                        if not self.exiterr and abs(process.exitcode):
                            errors += 1
                        processes.remove(process)
                        if len(processes) == 0:
                            done = True
                            break
        else:
            # Only one process to run, just run the function
            out = self.run_process(tid=self.tid)
            if out != 0:
                errors += 1
        # Bump the seed so that calling this function again runs a different
        # set of operations
        self.seed += self.nprocs
        delta = time.time() - stime

        # Display stats
        self.dprint("INFO", "==================STATS===================")
        self.dprint("INFO", "OPEN:    % 7d" % self.nopen)
        self.dprint("INFO", "OPENDGR: % 7d" % self.nopendgr)
        self.dprint("INFO", "CLOSE:   % 7d" % self.nclose)
        self.dprint("INFO", "OSYNC:   % 7d" % self.nosync)
        self.dprint("INFO", "READ:    % 7d, % 10s, % 10s/s" % (self.nread,  convert_uint(self.rbytes), convert_uint(self.rbytes/delta)))
        self.dprint("INFO", "WRITE:   % 7d, % 10s, % 10s/s" % (self.nwrite, convert_uint(self.wbytes), convert_uint(self.wbytes/delta)))
        self.dprint("INFO", "FSYNC:   % 7d" % self.nfsync)
        self.dprint("INFO", "RENAME:  % 7d" % self.nrename)
        self.dprint("INFO", "REMOVE:  % 7d" % self.nremove)
        self.dprint("INFO", "TRUNC:   % 7d" % self.ntrunc)
        self.dprint("INFO", "FTRUNC:  % 7d" % self.nftrunc)
        self.dprint("INFO", "LINK:    % 7d" % self.nlink)
        self.dprint("INFO", "SLINK:   % 7d" % self.nslink)
        self.dprint("INFO", "READDIR: % 7d" % self.nreaddir)
        self.dprint("INFO", "LOCK:    % 7d" % self.nlock)
        self.dprint("INFO", "TLOCK:   % 7d" % self.ntlock)
        self.dprint("INFO", "UNLOCK:  % 7d" % self.nunlock)
        if errors > 0:
            self.dprint("INFO", "ERRORS:  % 7d" % errors)
        self.dprint("INFO", "TIME:    % 7d secs" % delta)
Example #46
0
    tasks = [Task(q, out_queue) for i in range(NUM_WORKERS)]
    for w in tasks:
        w.start()

    logging.info("Items left in queue: {0}".format(q.qsize()))
    logging.debug("Joining q")
    # q.join()
    # qf.join()

    if False:
        # poll until every worker has exited (disabled; the join() loop below is used instead)
        processes_active = True
        while processes_active:
            for w in tasks:
                logging.debug(w.is_alive())
            processes_active = any(w.is_alive() for w in tasks)
            sleep(0.2)

    for y in tasks:
        y.join()

    logging.info("Elapsed time with {0} threads and {1} as maximum number: {2}".format(NUM_WORKERS,
                                                                                       MAX_PRIME_NUMBER,
                                                                                       datetime.now()-start_time))

    count = 0
    while not out_queue.empty():
        out_queue.get()
        out_queue.task_done()
        count += 1
    logging.info("Total primes found: {0}".format(count))
Example #47
0
def update_data(args, logger):
    start_date = datetime.datetime.strptime(args.start_date, '%Y-%m-%d')
    end_date = datetime.datetime.strptime(args.end_date, '%Y-%m-%d')
    if start_date > end_date:
        logger.warning("start_date[%s] greater than end_date[%s]" % (args.start_date, args.end_date))
        return True

    task_queue = JoinableQueue()
    result_queue = JoinableQueue()

    finished_date_list = []
    cur_date = end_date
    cnt = 0
    while cur_date >= start_date:
        cur_date_str = cur_date.strftime("%Y-%m-%d")
        task_queue.put(cur_date_str)
        cur_date = cur_date - datetime.timedelta(days=1)
        cnt += 1
    logger.info("run task in [%s] days" % cnt)

    process_list = []
    for i in range(args.parallel):
        process = Process(target=update_data_each_day, args=(task_queue, result_queue, i, logger))
        process_list.append(process)

    for process in process_list:
        process.daemon = True
        process.start()

    logger.info("run task in main process")
    success_dates = []
    failed_dates = []
    while 1:
        if len(finished_date_list) >= cnt:
            logger.info("finish all task with finished_data_list_len[%s], cnt[%s]" % (len(finished_date_list), cnt))
            break

        already_finish_flag = True
        if not result_queue.empty():
            finished_date_info_str = result_queue.get()
            finished_date_info = json.loads(finished_date_info_str)
            finished_date = finished_date_info["date"]
            finished_data_status = finished_date_info["status"]
            logger.info("finished_date[%s] get from result_queue" % finished_date)
            if not int(finished_date_info["already_update"]):
                already_finish_flag = False

            if finished_data_status == "fail":
                logger.warning("finished_date[%s] generate_pb_data failed, already_update_flag[%s]" % (
                    finished_date, already_finish_flag))
                failed_dates.append(finished_date)
                notice(finished_date, 0, "generate_pb_data_failed", already_finish_flag, logger)
            else:
                if finished_date in finished_date_list:
                    logger.error("finished_date[%s] already in finished_date_list, already_update_flag[%s]" % (
                        finished_date, already_finish_flag))
                    failed_dates.append(finished_date)
                    notice(finished_date, 0, "repeated_date_generation", already_finish_flag, logger)
                else:
                    logger.info("finished_date[%s] normal case add to finished_date_list" % finished_date)

                if delete_data(finished_date, args.delay_num, logger) < 0:
                    logger.error("finished_date[%s] delete old_data failed, already_update_flag[%s]" % (
                        finished_date, already_finish_flag))
                    failed_dates.append(finished_date)
                    notice(finished_date, 0, "delete_old_data_failed", already_finish_flag, logger)
                else:
                    logger.info("finished_date[%s] update_success, already_update_flag[%s]" % (
                        finished_date, already_finish_flag))
                    success_dates.append(finished_date)
                    notice(finished_date, 1, "update_pb_success", already_finish_flag, logger)

                latest_success_delay_date = get_latest_success_delay_date(success_dates, args.delay_num, logger)
                if latest_success_delay_date != "-1":
                    notice_latest(latest_success_delay_date, already_finish_flag, logger)
            finished_date_list.append(finished_date)

            with open("success_pb_dates", "w") as fp1:
                json.dump(success_dates, fp1)
            with open("failed_pb_dates", "w") as fp1:
                json.dump(failed_dates, fp1)

        time.sleep(2)

    logger.info("stop subprocess tasks")
    for _ in process_list:
        task_queue.put(None)
    task_queue.join()
    logger.info("run task in main process finish")
     histogram_merge_worker.start()

     if args.top:
          reader_procs = [ psutil.Process(reader.pid) for reader in readers ]
          worker_procs = [ psutil.Process(worker.pid) for worker in workers ]

     pair_buffer={}
     scaffold_count={}
#     while (not inq.empty()) or sum( [reader.is_alive() for reader in readers] )>0:
     while True:
          if args.debug: print("get")
          try:
               procid,scaffold,pairs = inq.get()
#               procid,scaffold,pairs = inq.get(True,10)
               #print("#got data:",procid,scaffold,len(pairs))
               print("#got data from inq:",procid,scaffold,len(pairs),inq.empty(),inq.qsize(),inq.full(),strftime("%Y-%m-%d %H:%M:%S"),sum( [reader.is_alive() for reader in readers] ),"q.size():",q.qsize(),file=sys.stderr,sep="\t")
               sys.stderr.flush()
               sys.stdout.flush()
          except Exception as e:
               print(e,file=sys.stderr)
               if args.top:
                    print("queue get timed out",[reader.cpu_percent() for reader in reader_procs],[worker.cpu_percent() for worker in worker_procs])
               #print("#timed out",inq.empty())
               print("#read from queue timed out:",inq.empty(),inq.qsize(),inq.full(),strftime("%Y-%m-%d %H:%M:%S"),sum( [reader.is_alive() for reader in readers] ),file=sys.stderr,sep="\t")
               sys.stderr.flush()
               continue
          if args.debug: print("got")
          if scaffold not in pair_buffer:
               pair_buffer[scaffold]=[]
          pair_buffer[scaffold] += pairs
          scaffold_count[scaffold] = scaffold_count.get(scaffold,0)+1