Exemplo n.º 1
0
class BaseAggregator(object):
    """Base class for the data aggregator interface. This class is used
    alongside the BaseListener class to spawn an aggregator process that
    combines data from multiple crawl processes. The BaseAggregator class
    manages the child listener process.

    Parameters
    ----------
    manager_params : dict
        TaskManager configuration parameters
    browser_params : list of dict
        List of browser configuration dictionaries"""
    # Python 2 metaclass hook; Python 3 treats this as a plain attribute.
    __metaclass__ = abc.ABCMeta

    def __init__(self, manager_params, browser_params):
        self.manager_params = manager_params
        self.browser_params = browser_params
        self.logger = loggingclient(*manager_params['logger_address'])
        self.listener_address = None
        self.listener_process = None
        # Created by launch(); defined here so that shutdown() can be called
        # safely on an aggregator that was never launched.
        self.status_queue = None

    @abc.abstractmethod
    def save_configuration(self, openwpm_version, browser_version):
        """Save configuration details to the database"""

    @abc.abstractmethod
    def get_next_visit_id(self):
        """Return a unique visit ID to be used as a key for a single page visit"""

    @abc.abstractmethod
    def get_next_crawl_id(self):
        """Return a unique crawl ID to be used as a key for a browser instance"""

    def launch(self, listener_process_runner):
        """Launch the aggregator listener process

        Parameters
        ----------
        listener_process_runner : callable
            Target of the child process; called with
            ``(manager_params, status_queue)``.
        """
        self.status_queue = Queue()
        self.listener_process = Process(
            target=listener_process_runner,
            args=(self.manager_params, self.status_queue))
        self.listener_process.daemon = True
        self.listener_process.start()
        # Blocks until the listener puts its first item on the queue, which
        # is stored as the listener address.
        self.listener_address = self.status_queue.get()

    def shutdown(self):
        """ Terminate the aggregator listener process

        Does nothing (beyond a debug log line) when no listener process has
        been launched, or when it has already been shut down.
        """
        if self.listener_process is None:
            self.logger.debug(
                "%s has no running listener process; nothing to shut down." %
                type(self).__name__
            )
            return
        self.logger.debug(
            "Sending the shutdown signal to the %s listener process..." %
            type(self).__name__
        )
        self.status_queue.put("SHUTDOWN")
        start_time = time.time()
        # Wait at most 300 seconds for the listener to exit.
        self.listener_process.join(300)
        self.logger.debug(
            "%s took %s seconds to close." % (
                type(self).__name__,
                str(time.time() - start_time)
            )
        )
        self.listener_address = None
        self.listener_process = None
Exemplo n.º 2
0
    def mp_process(self, nprocs, func, *args):
        """Run ``worker`` over overlapping chunks of images in ``nprocs`` processes.

        ``args[0]`` is the image sequence. Consecutive chunks overlap by one
        image so that every adjacent pair ``(images[k], images[k + 1])``
        falls inside exactly one chunk.

        Each worker receives ``(chunk, i, chunksize, out_q, func, *args)``
        and must put one dict on ``out_q``; the dicts are merged and read
        back under the integer keys ``0 .. len - 1``.
        # NOTE(review): presumably the keys are global pair indices
        # (i * chunksize + local index) -- confirm against ``worker``.

        Returns
        -------
        list
            The merged results ordered by key.
        """
        images = args[0]

        # Each process will get 'chunksize' pairs and a queue to put its
        # output dict into.
        out_q = Queue()
        chunksize = int(math.ceil((len(images) - 1) / float(nprocs)))
        procs = []
        print("Chunks of size:", chunksize)
        for i in range(nprocs):
            first = chunksize * i
            if i == nprocs - 1:
                # The last chunk must run to the very end of the sequence;
                # stopping at len(images) - 1 would drop the final image and
                # therefore the final pair.
                last = len(images)
            else:
                # One extra image so the pair straddling the chunk boundary
                # is handled by this chunk.
                last = chunksize * (i + 1) + 1
            p = Process(
                target=worker,
                args=(images[first:last], i, chunksize, out_q, func, *args))
            procs.append(p)
            p.start()
            self.loading.progress2['value'] += chunksize
            self.update()

        # Collect all results into a single result dict. We know how many
        # dicts with results to expect. This must happen before join(): a
        # child cannot exit until its queued data has been consumed.
        resultdict = {}
        for _ in range(nprocs):
            resultdict.update(out_q.get())

        # Wait for all worker processes to finish
        for p in procs:
            p.join()

        return [resultdict[j] for j in range(len(resultdict))]
Exemplo n.º 3
0
    def run_parallel(self,
                     test_suites,
                     test_runner,
                     result_type=None,
                     results_path=None):
        """Run every test suite in its own process and report the outcome.

        Each suite is started through ``self.execute_test`` with a fresh
        UUID and a manager-backed dict that collects the results. After all
        processes have been joined, ``self.dump_results`` is called and,
        when ``result_type`` is given, a report is generated.

        Parameters
        ----------
        test_suites : iterable
            Suites to execute, one process per suite.
        test_runner : object
            Passed through unchanged to ``self.execute_test``.
        result_type : optional
            When not ``None``, a report of this type is generated.
        results_path : optional
            Passed to ``Reporter.generate_report`` as ``path``.

        Returns
        -------
        int
            ``1`` if ``dump_results`` reported failures or errors, else ``0``.
        """
        exit_code = 0
        unittest.installHandler()
        processes = []
        manager = Manager()
        results = manager.dict()
        start = time.time()

        test_mapping = {}
        for test_suite in test_suites:
            # Give each test suite an uuid so it can be
            # matched to the correct test result
            test_id = str(uuid.uuid4())
            test_mapping[test_id] = test_suite

            proc = Process(target=self.execute_test,
                           args=(test_runner, test_id, test_suite, results))
            processes.append(proc)
            proc.start()

        for proc in processes:
            proc.join()

        finish = time.time()

        errors, failures, _ = self.dump_results(start, finish, results)

        if result_type is not None:
            all_results = []
            result_parser = None
            for test_id, result in list(results.items()):
                tests = test_mapping[test_id]
                result_parser = SummarizeResults(vars(result), tests,
                                                 (finish - start))
                all_results += result_parser.gather_results()

            # Without any results there is no parser to hand to the
            # reporter; previously this path raised NameError.
            if result_parser is not None:
                reporter = Reporter(result_parser=result_parser,
                                    all_results=all_results)
                reporter.generate_report(result_type=result_type,
                                         path=results_path)

        if failures or errors:
            exit_code = 1

        return exit_code
Exemplo n.º 4
0
def main(args):
    """Assign integer ids to the words of a tab-separated file.

    Reads ``args.FILE_PATH`` line by line. For every line, the first word is
    the part of the first tab field before ``/``, the second word is the
    part of the second tab field after the first ``/``, and the weight is
    the last tab field. Each previously unseen word gets the next integer id.

    Lines are handed to ``assignNumbers`` worker processes in batches of
    8,000,000, including the final partial batch. Once all workers have
    finished, both mappings are written via ``writeFileModule``.

    Parameters
    ----------
    args : object
        Must provide ``FILE_PATH`` and ``OUTPUT_FILE_PATH`` attributes.
    """
    batch_size = 8000000
    lines = []
    word2int = {}
    int2word = {}
    count = 0
    line_count = 0
    pid = 0
    processes = []
    start = time()
    print('Starting everything...')
    lock = mp.Lock()

    with open(args.FILE_PATH, 'r') as readFile, \
            open(args.OUTPUT_FILE_PATH, 'w') as writeFile:

        def dispatch(batch_id, batch):
            # Hand one batch of parsed lines to a worker process.
            p = Process(target=assignNumbers,
                        args=(batch_id, args, word2int, writeFile, lock,
                              batch, len(batch)))
            processes.append(p)
            p.start()

        for line in readFile:
            print('line count: {}'.format(line_count))
            fields = line.split('\n')[0].split('\t')
            word1 = fields[0].split('/')[0]
            word2 = fields[1].split('/')[1]
            weight = fields[-1]
            lines.append([word1, word2, weight])
            for word in [word1, word2]:
                if word not in word2int:
                    word2int[word] = count
                    int2word[count] = word
                    count += 1
            line_count += 1
            if line_count % batch_size == 0:
                # send the lines to be written in new file
                dispatch(pid, lines)
                pid += 1
                lines = []

        # The input rarely ends exactly on a batch boundary; without this
        # the trailing lines would never reach the output file.
        if lines:
            dispatch(pid, lines)
            pid += 1
            lines = []

        for process in processes:
            process.join()

    end = time()
    print('Total time for the whole process : {} seconds'.format(end - start))
    print('proceddings with writign mappings')
    P = Process(target=writeFileModule, args=(word2int, 'word2int.eng'))
    Q = Process(target=writeFileModule, args=(int2word, 'int2word.eng'))
    P.start()
    Q.start()
    P.join()
    Q.join()
Exemplo n.º 5
0
    def run_parallel(
            self, test_suites, test_runner, result_type=None,
            results_path=None):
        """Run every test suite in its own process and report the outcome.

        Each suite is started through ``self.execute_test`` with a fresh
        UUID and a manager-backed dict that collects the results. After all
        processes have been joined, ``self.dump_results`` is called and,
        when ``result_type`` is given, a report is generated.

        Parameters
        ----------
        test_suites : iterable
            Suites to execute, one process per suite.
        test_runner : object
            Passed through unchanged to ``self.execute_test``.
        result_type : optional
            When not ``None``, a report of this type is generated.
        results_path : optional
            Passed to ``Reporter.generate_report`` as ``path``.

        Returns
        -------
        int
            ``1`` if ``dump_results`` reported failures or errors, else ``0``.
        """
        exit_code = 0
        unittest.installHandler()
        processes = []
        manager = Manager()
        results = manager.dict()
        start = time.time()

        test_mapping = {}
        for test_suite in test_suites:
            # Give each test suite an uuid so it can be
            # matched to the correct test result
            test_id = str(uuid.uuid4())
            test_mapping[test_id] = test_suite

            proc = Process(
                target=self.execute_test,
                args=(test_runner, test_id, test_suite, results))
            processes.append(proc)
            proc.start()

        for proc in processes:
            proc.join()

        finish = time.time()

        errors, failures, _ = self.dump_results(start, finish, results)

        if result_type is not None:
            all_results = []
            result_parser = None
            for test_id, result in list(results.items()):
                tests = test_mapping[test_id]
                result_parser = SummarizeResults(
                    vars(result), tests, (finish - start))
                all_results += result_parser.gather_results()

            # Without any results there is no parser to hand to the
            # reporter; previously this path raised NameError.
            if result_parser is not None:
                reporter = Reporter(
                    result_parser=result_parser, all_results=all_results)
                reporter.generate_report(
                    result_type=result_type, path=results_path)

        if failures or errors:
            exit_code = 1

        return exit_code
    def mp_processTR(self, nprocs, func, *args):
        """Apply ``workerTR`` to chunks of images, matrices and file names.

        ``args[0]`` holds the images, ``args[1]`` the transformation
        matrices (an identity 2x3 matrix is stacked in front of them) and
        ``args[2]`` the file names; any further positional arguments are
        forwarded to every worker. The three sequences are cut at the same
        indices, each chunk is processed in its own process, and the dicts
        the workers put on the queue are merged and returned as a list
        ordered by integer key.
        """
        images = args[0]
        matrices = args[1]
        identity = np.array([[[1., 0., 0.], [0., 1., 0.]]])
        TrM = np.vstack((identity, matrices))
        fnames = args[2]
        print(len(images), len(TrM), len(fnames))

        out_q = Queue()
        chunksize = int(math.ceil((len(images) - 1) / float(nprocs)))
        workers = []
        print("Chunks of size:", chunksize)

        final_index = nprocs - 1
        for idx in range(nprocs):
            begin = chunksize * idx
            # The final chunk takes everything that is left.
            if idx == final_index:
                end = len(images)
            else:
                end = chunksize * (idx + 1)
            job = Process(target=workerTR,
                          args=(images[begin:end],
                                TrM[begin:end],
                                fnames[begin:end], idx,
                                chunksize, out_q, func, *args[3:]))
            workers.append(job)
            job.start()
            self.loading.progress2['value'] += chunksize
            self.update()

        # Exactly one result dict is expected per worker; merge them all
        # before waiting for the processes to exit.
        merged = {}
        for _ in range(nprocs):
            merged.update(out_q.get())

        for job in workers:
            job.join()

        return [merged[key] for key in range(len(merged))]
Exemplo n.º 7
0
def async_generator(generator, backlog_size=10):
    """Iterate ``generator`` in a child process and yield its items here.

    Items are handed over through a bounded queue, so the child runs at
    most ``backlog_size`` items ahead of the consumer.

    Parameters
    ----------
    generator : iterable
        Source of items; consumed inside the child process.
    backlog_size : int
        Maximum number of items buffered in the queue.

    Yields
    ------
    The items produced by ``generator``, in order. Iteration ends when the
    end-of-stream marker (the string ``'FO0B@R'``) arrives, so a source item
    equal to that string also ends the stream.
    """
    from multiprocess import Process, Queue
    pipe = Queue(maxsize=backlog_size)
    done_flag = 'FO0B@R'

    def gen_reader(pipe):
        try:
            for thing in generator:
                pipe.put(thing)  # And smoke it.
        finally:
            # Always signal end-of-stream, even if the source raised, so
            # the consumer does not block forever on pipe.get().
            pipe.put(done_flag)

    reader_proc = Process(target=gen_reader, args=(pipe, ))
    reader_proc.start()

    finished = False
    try:
        while True:
            next_thing = pipe.get()
            # Check the type first: `==` on array-like items is
            # element-wise and has no unambiguous truth value.
            if isinstance(next_thing, str) and next_thing == done_flag:
                finished = True
                break
            yield next_thing
    finally:
        # If the consumer stopped early the reader may be blocked on a full
        # queue; stop it instead of leaving an orphaned process behind.
        if not finished and reader_proc.is_alive():
            reader_proc.terminate()
        reader_proc.join()
Exemplo n.º 8
0
def process_message(m, local_message_i, max_run_time, run_dir, aws_region,
                    server_name, log_stream_prefix):
    """Run the job described by message ``m`` in a child process.

    The message body is decoded as JSON and passed to ``job_handler`` in a
    separate process. While the job runs, the message's visibility timeout
    is renewed periodically. If the job is still running after
    ``max_run_time`` seconds it is terminated. The message is deleted in
    every case.

    Parameters
    ----------
    m : message object
        Must provide ``body``, ``change_visibility`` and ``delete``.
    local_message_i, run_dir, aws_region, server_name, log_stream_prefix
        Passed through unchanged to ``job_handler``.
    max_run_time : number
        Maximum wall-clock seconds to wait for the job.
    """
    event = json.loads(m.body)

    # Run the handler in a child process so it can be terminated on timeout.
    p = Process(target=job_handler,
                args=(event, local_message_i, run_dir, aws_region, server_name,
                      log_stream_prefix))
    p.start()
    start_time = time.time()

    m.change_visibility(VisibilityTimeout=SQS_VISIBILITY_INCREMENT_SEC)

    run_time = time.time() - start_time
    last_visibility_update_time = time.time()
    while run_time < max_run_time:
        # Renew once 90% of the current visibility window has elapsed.
        if (time.time() - last_visibility_update_time) > (
                SQS_VISIBILITY_INCREMENT_SEC * 0.9):
            m.change_visibility(
                VisibilityTimeout=SQS_VISIBILITY_INCREMENT_SEC)
            last_visibility_update_time = time.time()
            logger.debug("incrementing visibility timeout by {} sec".format(
                SQS_VISIBILITY_INCREMENT_SEC))
        if p.exitcode is not None:
            logger.debug("attempting to join process")
            # FIXME will this join ever hang?
            p.join()
            break
        else:
            # Parenthesised form prints the same text on Python 2 and 3.
            print("sleeping")
            time.sleep(PROCESS_SLEEP_DUR_SEC)

        run_time = time.time() - start_time

    if p.exitcode is None:
        p.terminate()  # PRINT LOTS OF ERRORS HERE
        # Reap the terminated child; the timeout keeps this from blocking
        # indefinitely should the child not exit promptly.
        p.join(PROCESS_SLEEP_DUR_SEC)

    m.delete()
Exemplo n.º 9
0
    def _execute(self, tz, start, stop, function, period_in_seconds,
                 number_of_reattempts, reattempt_duration_in_seconds):
        """Run ``function`` in a child process if "now" is inside the window.

        The current time in timezone ``tz`` is rendered with ``ctime()``
        (minus the leading weekday) and compared, as a string, against
        ``str(start)`` and ``str(stop)``. A falsy ``start`` or ``stop`` is
        replaced by the current time string.

        Parameters
        ----------
        tz : str
            timezone name handed to ``timezone``
        start, stop : object
            window bounds; compared through their ``str()`` form
        function : callable function
            name of the function which needs to be scheduled
        period_in_seconds : int
        number_of_reattempts : int
            each event is tried these many number of times, but executed once
        reattempt_duration_in_seconds : int
            duration to wait (in seconds) after un-successful attempt

        Returns
        -------
        int
            ``1`` when the function was run or the window has not opened
            yet, ``0`` when the window has already closed.

        """
        now_text = datetime.now(timezone(tz)).ctime()[4:]
        stop = stop if stop else now_text
        start = start if start else now_text
        window_opens = str(start)
        window_closes = str(stop)

        # Too early: nothing to do yet.
        if now_text < window_opens:
            return 1
        # Too late: the window is over.
        if now_text > window_closes:
            return 0

        worker = Process(target=function)
        worker.start()
        self._workers.append(worker)
        deadline = time() + period_in_seconds
        for _ in range(number_of_reattempts + 1):
            pause = deadline - time() - reattempt_duration_in_seconds
            self._sleep(period_in_seconds=pause)
            not_successful = worker.exitcode != 0
            if number_of_reattempts > 1 and not_successful:
                # Wait for the current attempt to end, then start another.
                worker.join()
                worker = Process(target=function)
                worker.start()
                self._workers.append(worker)
        self._sleep(period_in_seconds=deadline - time())
        return 1
Exemplo n.º 10
0
    def run(self, func, arglist):
        """Evaluate ``func`` over every split of ``arglist`` in parallel.

        One process per split runs ``self._eval(idx, func, split, queue)``.
        The ``(idx, ys)`` items read back from the queue are placed at
        position ``idx`` and the per-split results are concatenated in split
        order.

        Returns
        -------
        list
            The flattened results of all splits.
        """
        # TODO: Generalise this a bit maybe?
        #       We know len(arglist) == self.workers in pretty much all cases

        pool = []
        queue = Queue()
        for idx, split in enumerate(arglist):
            proc = Process(target=self._eval, args=(idx, func, split, queue))
            pool.append(proc)
            proc.start()

        # Drain the queue BEFORE joining: a child that has queued data does
        # not exit until that data is consumed, so joining first can
        # deadlock, and Queue.empty() is not reliable across processes.
        # NOTE(review): assumes self._eval puts exactly one (idx, ys) item
        # per process -- confirm.
        yss = [[] for _ in pool]
        for _ in pool:
            idx, ys = queue.get()
            yss[idx] = ys

        for proc in pool:
            proc.join()

        return [y for ys in yss for y in ys]
Exemplo n.º 11
0
def spin_crawl_threads(state, classifiers, MAX_BIT_SIZE, MAX_DL_THREADS, image_path):
    """Start the location, download and classification processes and wait.

    The three stages communicate through manager-backed queues: locations
    go to the downloader (capacity 16) and images to the classifier
    (capacity 64). An ``atexit`` hook terminates every active child process
    when the interpreter exits. The call blocks until all three processes
    have finished.
    """
    print("Running threads...")
    manager = Manager()

    location_q = manager.Queue(maxsize=16)
    image_q = manager.Queue(maxsize=64)
    state_lock = manager.Lock()

    locator = Process(target=generate_location_thread,
                      args=(location_q, MAX_BIT_SIZE),
                      name="generate_location")
    classifier = Process(target=classification_thread,
                         args=(image_q, classifiers, image_path,
                               state, state_lock),
                         name="classification")
    downloader = Process(target=download_image_thread,
                         args=(location_q, image_q, MAX_DL_THREADS),
                         name="download_image")

    # Consumers are started before the producer.
    stages = (downloader, classifier, locator)
    for stage in stages:
        stage.start()

    def kill_threads():
        for child in active_children():
            child.terminate()

    atexit.register(kill_threads)

    for stage in stages:
        stage.join()
    def mp_process(self, nprocs, func, *args):
        """Run ``worker`` over overlapping image chunks in ``nprocs`` processes.

        ``args[0]`` is the image sequence. Every chunk except the last
        includes one image beyond its nominal end; the last chunk runs to
        the end of the sequence. The dicts the workers put on the queue are
        merged and returned as a list ordered by integer key.
        """
        images = args[0]
        out_q = Queue()
        chunksize = int(math.ceil((len(images) - 1) / float(nprocs)))
        workers = []
        print("Chunks of size:", chunksize)

        final_index = nprocs - 1
        for idx in range(nprocs):
            begin = chunksize * idx
            if idx == final_index:
                end = len(images)
            else:
                # Overlap by one image with the following chunk.
                end = chunksize * (idx + 1) + 1
            job = Process(target=worker,
                          args=(images[begin:end], idx,
                                chunksize, out_q, func, *args))
            workers.append(job)
            job.start()
            self.loading.progress2['value'] += chunksize
            self.update()

        # Exactly one result dict is expected per worker; merge them all
        # before waiting for the processes to exit.
        merged = {}
        for _ in range(nprocs):
            merged.update(out_q.get())

        for job in workers:
            job.join()

        return [merged[key] for key in range(len(merged))]
Exemplo n.º 13
0
def api_curtains_control(status):
    """Move both curtains towards ``status`` and report the resulting state.

    For ``"open"`` every curtain side currently ``"closed"`` is opened; for
    ``"closed"`` every side currently ``"open"`` is closed. The two sides
    are driven in parallel processes and both are awaited before the new
    overall status is returned.

    Returns
    -------
    The value of ``get_curtain_status_all()`` after a move, or a
    ``(payload, 400)`` tuple when ``status`` is not in ``STATES`` or the
    curtains are not in a state from which the request can be carried out.
    """
    # Read up front so the error payload can always report it; previously
    # an unknown status raised UnboundLocalError while building the payload.
    curtain_status_all = get_curtain_status_all()

    # requested status -> (state a side must be in to be moved, mover)
    transitions = {
        "open": ("closed", open_curtain),
        "closed": ("open", close_curtain),
    }

    if status in STATES and status in transitions:
        movable_state, mover = transitions[status]
        if curtain_status_all in (movable_state, "partlyopen"):
            workers = []
            for side in ("right", "left"):
                if get_curtain_status(side) == movable_state:
                    proc = Process(target=mover, args=(side, ))
                    proc.start()
                    workers.append(proc)
            for proc in workers:
                proc.join()
            return get_curtain_status_all()

    return {
        "error": 400,
        "curtain_status_all": curtain_status_all,
        "status": status
    }, 400
Exemplo n.º 14
0
class BaseManager(object):
    '''
    Base class for managers

    A manager either spawns its own server process (`start()`) or opens a
    client connection to an already configured address (`connect()`), and
    creates shared objects through the per-typeid methods that `register()`
    installs on the class.
    '''
    # Maps typeid -> (callable, exposed, method_to_typeid, proxytype);
    # populated by register().
    _registry = {}
    # Server class instantiated in the child process by _run_server().
    _Server = Server

    def __init__(self, address=None, authkey=None, serializer='pickle'):
        '''
        Record the connection settings; no process is started here.

        When `authkey` is None the current process's authkey is used.
        `serializer` selects the (Listener, Client) pair from
        `listener_client`.
        '''
        if authkey is None:
            authkey = current_process().authkey
        self._address = address     # XXX not final address if eg ('', 0)
        self._authkey = AuthenticationString(authkey)
        self._state = State()
        self._state.value = State.INITIAL
        self._serializer = serializer
        self._Listener, self._Client = listener_client[serializer]

    def get_server(self):
        '''
        Return server object with serve_forever() method and address attribute

        Only valid while the manager is still in the INITIAL state.
        '''
        assert self._state.value == State.INITIAL
        return Server(self._registry, self._address,
                      self._authkey, self._serializer)

    def connect(self):
        '''
        Connect manager object to the server process
        '''
        Listener, Client = listener_client[self._serializer]
        conn = Client(self._address, authkey=self._authkey)
        # NOTE(review): the 'dummy' request presumably just verifies that the
        # server answers -- confirm against the Server implementation.
        dispatch(conn, None, 'dummy')
        self._state.value = State.STARTED

    def start(self, initializer=None, initargs=()):
        '''
        Spawn a server process for this manager object

        `initializer(*initargs)`, when given, is called in the child process
        before the server is created. Raises TypeError if `initializer` is
        neither None nor callable.
        '''
        assert self._state.value == State.INITIAL

        if initializer is not None and not callable(initializer):
            raise TypeError('initializer must be a callable')

        # pipe over which we will retrieve address of server
        reader, writer = connection.Pipe(duplex=False)

        # spawn process which runs a server
        self._process = Process(
            target=type(self)._run_server,
            args=(self._registry, self._address, self._authkey,
                  self._serializer, writer, initializer, initargs),
            )
        # name the process after the manager class and its identity tuple
        ident = ':'.join(str(i) for i in self._process._identity)
        self._process.name = type(self).__name__  + '-' + ident
        self._process.start()

        # get address of server
        # (the parent closes its copy of the write end before reading)
        writer.close()
        self._address = reader.recv()
        reader.close()

        # register a finalizer
        # (calling self.shutdown() afterwards runs _finalize_manager)
        self._state.value = State.STARTED
        self.shutdown = util.Finalize(
            self, type(self)._finalize_manager,
            args=(self._process, self._address, self._authkey,
                  self._state, self._Client),
            exitpriority=0
            )

    @classmethod
    def _run_server(cls, registry, address, authkey, serializer, writer,
                    initializer=None, initargs=()):
        '''
        Create a server, report its address and run it

        This is the target of the child process spawned by start().
        '''
        if initializer is not None:
            initializer(*initargs)

        # create server
        server = cls._Server(registry, address, authkey, serializer)

        # inform parent process of the server's address
        writer.send(server.address)
        writer.close()

        # run the manager
        util.info('manager serving at %r', server.address)
        server.serve_forever()

    def _create(self, typeid, *args, **kwds):
        '''
        Create a new shared object; return the token and exposed tuple

        Requires the manager to be in the STARTED state.
        '''
        assert self._state.value == State.STARTED, 'server not yet started'
        conn = self._Client(self._address, authkey=self._authkey)
        try:
            id, exposed = dispatch(conn, None, 'create', (typeid,)+args, kwds)
        finally:
            conn.close()
        return Token(typeid, self._address, id), exposed

    def join(self, timeout=None):
        '''
        Join the manager process (if it has been spawned)
        '''
        # NOTE(review): self._process is only assigned in start(); calling
        # join() on a manager that was never started raises AttributeError.
        self._process.join(timeout)

    def _debug_info(self):
        '''
        Return some info about the servers shared objects and connections
        '''
        conn = self._Client(self._address, authkey=self._authkey)
        try:
            return dispatch(conn, None, 'debug_info')
        finally:
            conn.close()

    def _number_of_objects(self):
        '''
        Return the number of shared objects
        '''
        conn = self._Client(self._address, authkey=self._authkey)
        try:
            return dispatch(conn, None, 'number_of_objects')
        finally:
            conn.close()

    def __enter__(self):
        # Entering the context does not start the server; the caller is
        # expected to have called start() already.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # self.shutdown is only set by start().
        self.shutdown()

    @staticmethod
    def _finalize_manager(process, address, authkey, state, _Client):
        '''
        Shutdown the manager process; will be registered as a finalizer

        Sends a 'shutdown' request, waits briefly, and falls back to
        terminate() if the process is still alive.
        '''
        if process.is_alive():
            util.info('sending shutdown message to manager')
            try:
                conn = _Client(address, authkey=authkey)
                try:
                    dispatch(conn, None, 'shutdown')
                finally:
                    conn.close()
            except Exception:
                # best effort: failures while asking the server to shut
                # down are ignored; terminate() below is the fallback
                pass

            process.join(timeout=0.2)
            if process.is_alive():
                util.info('manager still alive')
                if hasattr(process, 'terminate'):
                    util.info('trying to `terminate()` manager process')
                    process.terminate()
                    process.join(timeout=0.1)
                    if process.is_alive():
                        util.info('manager still alive after terminate')

        state.value = State.SHUTDOWN
        # drop the entry kept for this address on BaseProxy, if any
        try:
            del BaseProxy._address_to_local[address]
        except KeyError:
            pass

    # read-only view of the manager's (possibly updated) address
    address = property(lambda self: self._address)

    @classmethod
    def register(cls, typeid, callable=None, proxytype=None, exposed=None,
                 method_to_typeid=None, create_method=True):
        '''
        Register a typeid with the manager type

        Stores (callable, exposed, method_to_typeid, proxytype) under
        `typeid` and, when `create_method` is true, installs a method named
        `typeid` on the class that creates the shared object and returns a
        proxy for it.
        '''
        # give this class its own registry instead of mutating an inherited one
        if '_registry' not in cls.__dict__:
            cls._registry = cls._registry.copy()

        if proxytype is None:
            proxytype = AutoProxy

        exposed = exposed or getattr(proxytype, '_exposed_', None)

        method_to_typeid = method_to_typeid or \
                           getattr(proxytype, '_method_to_typeid_', None)

        if method_to_typeid:
            for key, value in list(method_to_typeid.items()):
                assert type(key) is str, '%r is not a string' % key
                assert type(value) is str, '%r is not a string' % value

        cls._registry[typeid] = (
            callable, exposed, method_to_typeid, proxytype
            )

        if create_method:
            def temp(self, *args, **kwds):
                util.debug('requesting creation of a shared %r object', typeid)
                token, exp = self._create(typeid, *args, **kwds)
                proxy = proxytype(
                    token, self._serializer, manager=self,
                    authkey=self._authkey, exposed=exp
                    )
                # NOTE(review): presumably releases the reference taken by
                # 'create' now that the proxy exists -- confirm against
                # the Server implementation.
                conn = self._Client(token.address, authkey=self._authkey)
                dispatch(conn, None, 'decref', (token.id,))
                return proxy
            temp.__name__ = typeid
            setattr(cls, typeid, temp)
Exemplo n.º 15
0
def assign_multiprocess_ext(function, data, pool_args={}, **task_args):
    """Generator intended to fan tasks from ``data`` out to child processes.

    Items of ``data`` are pushed onto an input queue, a supervisor process
    (``multiprocessor``) is asked through a control queue to start one child
    per item, and whatever arrives on the output queue is yielded.

    NOTE(review): several things in the body do not line up and should be
    confirmed before relying on this function:
      * ``function`` and ``task_args`` are never referenced in the body;
      * ``returner_process`` takes three parameters but is started with two;
      * ``jobs`` holds ``Process`` objects, yet ``ready()``,
        ``successful()`` and ``get()`` are called on them;
      * the ``break`` on ``stop_signal`` only leaves the inner ``while``, so
        the final ``outpipe.put(stop_signal)`` looks unreachable.

    Parameters
    ----------
    function : callable
        Unused in the visible body (see note above).
    data : list or iterator
        Tasks; each is later unpacked as ``args, kwargs``.
    pool_args : dict
        Only the ``'processes'`` key is read (default ``cpu_count() - 1``).
        The default dict is shared between calls but is never mutated here.
    """
    from multiprocess import Queue, Process, cpu_count
    # `Queue` is the Python 2 name of this module (`queue` in Python 3).
    from Queue import Full, Empty
    from time import sleep
    process_count = pool_args.get('processes', cpu_count() - 1)
    # All three queues are bounded by the number of processes.
    input_pipe, output_pipe, control_pipe = (Queue(process_count),
                                             Queue(process_count),
                                             Queue(process_count))
    stop_signal = hash('OK STOP NAO.')

    def multiprocessor(inpipe, outpipe, controlpipe):
        # Supervisor loop run in its own process.
        def returner_process(inp, outp, task):
            # Takes one (args, kwargs) task from `inp` and puts the result on
            # the enclosing `outpipe` (the `outp` parameter is not used).
            args, kwargs = inp.get()
            outpipe.put(task(*args, **kwargs))
            return True

        jobs = []
        while True:
            done = [x for x in jobs if x.ready()]
            if done:
                jobs = [x for x in jobs
                        if x not in done]  # Avoids race condition!
            else:
                sleep(0.1)

            for thing in done:
                thing.successful()
                assert thing.get()
            while len(jobs) < process_count:
                cmd = controlpipe.get()
                if cmd == stop_signal:
                    # NOTE(review): leaves only this inner loop.
                    break
                elif cmd == True:
                    newjob = Process(target=returner_process,
                                     args=(inpipe, outpipe))
                    newjob.start()
                    jobs.append(newjob)
                    # I *think* the pipes have to be passed explicitly,
                    # but I haven't checked.
                else:
                    raise Exception
        outpipe.put(stop_signal)

    multiproc_proc = Process(target=multiprocessor,
                             args=(input_pipe, output_pipe, control_pipe))
    multiproc_proc.start()

    # Turn a list into an iterator so next() can be used uniformly.
    if isinstance(data, list):
        data = (x for x in data)
    # NOTE(review): raises StopIteration here when `data` is empty.
    nexttask = next(data)
    while True:
        try:
            # Queue the task and a start command; on Full, the same task is
            # retried on the next pass of this loop.
            input_pipe.put_nowait(nexttask)
            control_pipe.put_nowait(True)
            nexttask = next(data)
        except Full:
            pass
        except StopIteration:
            break
        try:
            yield output_pipe.get_nowait()
        except Empty:
            sleep(0.1)

    # All tasks queued: request shutdown and yield the remaining results
    # until the stop signal comes back on the output queue.
    control_pipe.put(stop_signal)
    while True:
        try:
            out = output_pipe.get()
            if out == stop_signal:
                break
            else:
                yield out
        except Empty:
            sleep(0.1)

    multiproc_proc.join()
Exemplo n.º 16
0
def write(q,l):
    """Put 'A'..'D' on queue ``q``, mirroring each value into list ``l``.

    After every value the list is printed and the function sleeps for a
    random fraction of a second.
    """
    print('Process to write: %s' % os.getpid())
    for value in ['A', 'B', 'C','D']:
        print('Put %s to queue...' % value)
        q.put(value)
        l.append(value)
        # Parenthesised form prints the same text on Python 2 and 3.
        print(l)
        time.sleep(random.random())

# Code executed by the reader process:
def read(q,l):
    """Print every value taken from queue ``q``, followed by list ``l``.

    Loops forever; the caller is expected to terminate the process.
    """
    print('Process to read: %s' % os.getpid())
    while True:
        value = q.get(True)
        print('Get %s from queue.' % value)
        # Parenthesised form prints the same text on Python 2 and 3.
        print(l)

if __name__ == '__main__':
    # The parent process creates the Queue and hands it to both children.
    shared_queue = Queue()
    items = []
    writer_proc = Process(target=write, args=(shared_queue, items))
    reader_proc = Process(target=read, args=(shared_queue, items))
    # Start the writer child first, then the reader child.
    writer_proc.start()
    reader_proc.start()
    # Wait for the writer to finish.
    writer_proc.join()
    # The reader loops forever, so it cannot be joined; terminate it instead.
    reader_proc.terminate()
Exemplo n.º 17
0
"""

from multiprocess import Process
import os
import time
import numpy as np


def hobby_motion(name):
    """Announce that ``name`` likes sports, report the PID, then nap 1-2 s."""
    pid = os.getpid()
    print('%s喜欢运动' % name)
    print('Child process with processId %s starts.' % pid)
    # randint's upper bound is exclusive, so this is 1 or 2 seconds.
    nap_seconds = np.random.randint(1, 3)
    time.sleep(nap_seconds)


def hobby_game(name):
    """Announce that ``name`` likes games, report the PID, then nap 1-2 s."""
    pid = os.getpid()
    print('%s喜欢游戏' % name)
    print('Child process with processId %s starts.' % pid)
    # randint's upper bound is exclusive, so this is 1 or 2 seconds.
    nap_seconds = np.random.randint(1, 3)
    time.sleep(nap_seconds)


if __name__ == "__main__":
    print('Parent processId is: %s.' % os.getpid())

    # One child per hobby; both are started before either is awaited so
    # that they run concurrently.
    children = [
        Process(target=hobby_motion, args=('付婷婷', )),
        Process(target=hobby_game, args=('kebi', )),
    ]
    for child in children:
        child.start()

    for child in children:
        child.join()
Exemplo n.º 18
0
class BaseAggregator(object):
    """Base class for the data aggregator interface. This class is used
    alongside the BaseListener class to spawn an aggregator process that
    combines data from multiple crawl processes. The BaseAggregator class
    manages the child listener process.

    Parameters
    ----------
    manager_params : dict
        TaskManager configuration parameters
    browser_params : list of dict
        List of browser configuration dictionaries"""
    __metaclass__ = abc.ABCMeta

    def __init__(self, manager_params, browser_params):
        self.manager_params = manager_params
        self.browser_params = browser_params
        self.logger = loggingclient(*manager_params['logger_address'])
        # Both are filled in by launch() and reset to None by shutdown()
        self.listener_address = None
        self.listener_process = None
        # Queues handed to the listener process by launch()
        self.status_queue = Queue()
        self.shutdown_queue = Queue()
        # Most recent status value and the time.time() at which it arrived
        self._last_status = None
        self._last_status_received = None

    @abc.abstractmethod
    def save_configuration(self, openwpm_version, browser_version):
        """Save configuration details to the database"""

    @abc.abstractmethod
    def get_next_visit_id(self):
        """Return a unique visit ID to be used as a key for a single visit"""

    @abc.abstractmethod
    def get_next_crawl_id(self):
        """Return a unique crawl ID used as a key for a browser instance"""

    def get_most_recent_status(self):
        """Return the most recent queue size sent from the listener process

        Raises
        ------
        RuntimeError
            If the newest status is older than STATUS_TIMEOUT seconds."""

        # Block until we receive the first status update
        if self._last_status is None:
            return self.get_status()

        # Drain status queue until we receive most recent update
        while not self.status_queue.empty():
            self._last_status = self.status_queue.get()
            self._last_status_received = time.time()

        # Check last status signal
        if (time.time() - self._last_status_received) > STATUS_TIMEOUT:
            raise RuntimeError(
                "No status update from DataAggregator listener process "
                "for %d seconds." % (time.time() - self._last_status_received))

        return self._last_status

    def get_status(self):
        """Get listener process status. If the status queue is empty, block.

        Raises
        ------
        RuntimeError
            If no status arrives within STATUS_TIMEOUT seconds."""
        wait_started = time.time()
        try:
            self._last_status = self.status_queue.get(block=True,
                                                      timeout=STATUS_TIMEOUT)
            self._last_status_received = time.time()
        except queue.Empty:
            # On the very first call no status has ever been received and
            # `_last_status_received` is still None; measure from the start
            # of this wait so the timeout is reported as a RuntimeError
            # rather than a TypeError from `float - None`.
            last_seen = self._last_status_received
            if last_seen is None:
                last_seen = wait_started
            raise RuntimeError(
                "No status update from DataAggregator listener process "
                "for %d seconds." % (time.time() - last_seen))
        return self._last_status

    def launch(self, listener_process_runner, *args):
        """Launch the aggregator listener process

        The runner is called with the manager params, the status queue and
        the shutdown queue, followed by any extra `args`."""
        args = (self.manager_params, self.status_queue,
                self.shutdown_queue) + args
        self.listener_process = Process(target=listener_process_runner,
                                        args=args)
        self.listener_process.daemon = True
        self.listener_process.start()
        # The first item read from the status queue is stored as the
        # listener address
        self.listener_address = self.status_queue.get()

    def shutdown(self):
        """ Terminate the aggregator listener process"""
        self.logger.debug(
            "Sending the shutdown signal to the %s listener process..." %
            type(self).__name__)
        self.shutdown_queue.put(SHUTDOWN_SIGNAL)
        start_time = time.time()
        # Wait at most 300 seconds for the listener to exit
        self.listener_process.join(300)
        self.logger.debug("%s took %s seconds to close." %
                          (type(self).__name__, str(time.time() - start_time)))
        self.listener_address = None
        self.listener_process = None
from multiprocess import Lock, Process


def f(lock, num):
    """Print ``'Hello world: <num>'`` while holding ``lock``.

    The lock is taken through its context manager so it is released even if
    printing raises.
    """
    with lock:
        # Call form prints the same text on Python 2 and is valid on Python 3.
        print('Hello world: {}'.format(num))


if __name__ == '__main__':
    print_lock = Lock()

    # Each child is joined right after it starts, so the ten greetings are
    # printed one after another in increasing order.
    for index in range(10):
        worker = Process(target=f, args=(print_lock, index))
        worker.start()
        worker.join()
Exemplo n.º 20
0
class BaseAggregator(object):
    """Base class for the data aggregator interface. This class is used
    alongside the BaseListener class to spawn an aggregator process that
    combines data from multiple crawl processes. The BaseAggregator class
    manages the child listener process.

    Parameters
    ----------
    manager_params : dict
        TaskManager configuration parameters
    browser_params : list of dict
        List of browser configuration dictionaries"""
    __metaclass__ = abc.ABCMeta

    def __init__(self, manager_params, browser_params):
        self.manager_params = manager_params
        self.browser_params = browser_params
        self.logger = loggingclient(*manager_params['logger_address'])
        # Both are filled in by launch() and reset to None by shutdown()
        self.listener_address = None
        self.listener_process = None
        # Queues handed to the listener process by launch()
        self.status_queue = Queue()
        self.shutdown_queue = Queue()
        # Most recent status value and the time.time() at which it arrived
        self._last_status = None
        self._last_status_received = None

    @abc.abstractmethod
    def save_configuration(self, openwpm_version, browser_version):
        """Save configuration details to the database"""

    @abc.abstractmethod
    def get_next_visit_id(self):
        """Return a unique visit ID to be used as a key for a single visit"""

    @abc.abstractmethod
    def get_next_crawl_id(self):
        """Return a unique crawl ID used as a key for a browser instance"""

    def get_most_recent_status(self):
        """Return the most recent queue size sent from the listener process

        Raises
        ------
        RuntimeError
            If the newest status is older than STATUS_TIMEOUT seconds."""

        # Block until we receive the first status update
        if self._last_status is None:
            return self.get_status()

        # Drain status queue until we receive most recent update
        while not self.status_queue.empty():
            self._last_status = self.status_queue.get()
            self._last_status_received = time.time()

        # Check last status signal
        if (time.time() - self._last_status_received) > STATUS_TIMEOUT:
            raise RuntimeError(
                "No status update from DataAggregator listener process "
                "for %d seconds." % (time.time() - self._last_status_received)
            )

        return self._last_status

    def get_status(self):
        """Get listener process status. If the status queue is empty, block.

        Raises
        ------
        RuntimeError
            If no status arrives within STATUS_TIMEOUT seconds."""
        wait_started = time.time()
        try:
            self._last_status = self.status_queue.get(
                block=True, timeout=STATUS_TIMEOUT)
            self._last_status_received = time.time()
        except queue.Empty:
            # On the very first call no status has ever been received and
            # `_last_status_received` is still None; measure from the start
            # of this wait so the timeout is reported as a RuntimeError
            # rather than a TypeError from `float - None`.
            last_seen = self._last_status_received
            if last_seen is None:
                last_seen = wait_started
            raise RuntimeError(
                "No status update from DataAggregator listener process "
                "for %d seconds." % (time.time() - last_seen)
            )
        return self._last_status

    def launch(self, listener_process_runner, *args):
        """Launch the aggregator listener process

        The runner is called with the manager params, the status queue and
        the shutdown queue, followed by any extra `args`."""
        args = (self.manager_params, self.status_queue,
                self.shutdown_queue) + args
        self.listener_process = Process(
            target=listener_process_runner,
            args=args
        )
        self.listener_process.daemon = True
        self.listener_process.start()
        # The first item read from the status queue is stored as the
        # listener address
        self.listener_address = self.status_queue.get()

    def shutdown(self):
        """ Terminate the aggregator listener process"""
        self.logger.debug(
            "Sending the shutdown signal to the %s listener process..." %
            type(self).__name__
        )
        self.shutdown_queue.put(SHUTDOWN_SIGNAL)
        start_time = time.time()
        # Wait at most 300 seconds for the listener to exit
        self.listener_process.join(300)
        self.logger.debug(
            "%s took %s seconds to close." % (
                type(self).__name__,
                str(time.time() - start_time)
            )
        )
        self.listener_address = None
        self.listener_process = None
Exemplo n.º 21
0
def _run_job(result_queue, index, func, func_args):
    """Run ``func(*func_args)`` and put ``(index, result)`` on the queue."""
    result_queue.put((index, func(*func_args)))


def execute_parallel(farg_pairs, num_procs=None, verbose=False):
    """Run each job in its own process, at most ``num_procs`` at a time.

    Parameters
    ----------
    farg_pairs : sequence
        One entry per job: ``(func, args)`` or just ``(func,)`` for a call
        without arguments. ``args`` is the positional-argument sequence.
    num_procs : int, optional
        Maximum number of concurrently running processes. Defaults to 75%
        of the CPU count (rounded up); values below 1 are raised to 1.
    verbose : bool
        Print progress information.

    Returns
    -------
    tuple
        The jobs' return values in the order of ``farg_pairs``; an empty
        tuple when there are no jobs.

    Notes
    -----
    A job that raises never reports a result, so the call then blocks while
    waiting on the result queue.
    """
    # see https://blog.ionelmc.ro/2014/12/21/compiling-python-extensions-on-windows/
    try:
        from multiprocess import Process, Queue, cpu_count
    except ImportError:
        # Same API from the standard library when `multiprocess` is absent.
        from multiprocessing import Process, Queue, cpu_count

    if num_procs is None:
        # leave 25%
        num_procs = int(math.ceil(cpu_count() * .75))
        print("using %d procs in execute parallel" % num_procs)
    # With no worker slot the throttling loop below would wait forever.
    num_procs = max(1, num_procs)

    jobs = list(farg_pairs)
    num_jobs = len(jobs)
    if verbose:
        print("execute_parallel num_procs=%d, num_jobs=%d" % (num_procs,
                                                              num_jobs))

    q = Queue()
    results = []   # (job index, return value) pairs, in completion order
    running = {}   # job index -> process that has not been joined yet

    for i, farg_pair in enumerate(jobs):
        if verbose:
            print("running job %s" % i)

        func_args = tuple(farg_pair[1]) if len(farg_pair) > 1 else ()
        # Index and callable travel as explicit arguments instead of being
        # captured from the loop by a closure.
        p = Process(target=_run_job, args=(q, i, farg_pair[0], func_args))
        p.start()
        running[i] = p

        # wait until we drop below num_procs
        while len(running) >= num_procs:
            index, value = q.get()
            results.append((index, value))
            # Reap exactly the process that produced this result; its item
            # has already been consumed, so joining it cannot stall on the
            # queue.
            running.pop(index).join()

    while len(results) < num_jobs:
        results.append(q.get())

    # join remaining processes before exiting
    for p in running.values():
        p.join()

    ordered = sorted(results, key=lambda pair: pair[0])
    return tuple(value for _, value in ordered)
Exemplo n.º 22
0
    def writeEventsToCsv(self, urls, processedUrlsFName, batchSize=20):
        """Scrape events for every listing url and append them to CSV.

        Urls already listed in ``processedUrlsFName`` are skipped, so an
        interrupted run picks up where it left off. Depending on
        ``self.eventMode`` the urls are scraped one by one ('series') or in
        batches of worker processes ('parallel'). A summary is logged at the
        end and ``self.pctUrlsWithEvents`` is set to the percentage of the
        originally supplied urls that yielded events (-999 when no urls were
        left to process).

        Parameters:
            urls: list of listing urls to scrape.
            processedUrlsFName: path of the file recording processed urls;
                read if it exists, then appended to.
            batchSize: number of urls per batch in 'parallel' mode.

        Raises:
            ValueError: if ``self.eventMode`` is neither 'parallel' nor
                'series'.

        NOTE: in 'parallel' mode every batch ends with a shell pipeline that
        sends ``kill -9`` to all processes whose ``ps aux`` line matches
        "chrome", not only the ones started here.
        """
        numUrls = len(urls)
        origNumUrls = numUrls
        urlsWithEvents = 0
        totalEvents = 0
        processedListings = 0
        numTimeouts = 0

        try:
            with open(processedUrlsFName, 'r') as pus:
                # A set keeps the membership test below O(1) per url.
                pUrls = set(pus.read().split('\r\n'))
            logging.info(
                'Already processed {0} of {1} urls. Picking up where we'
                ' left off.'.format(len(pUrls), numUrls))
            urls = [url for url in urls if url not in pUrls]
            numUrls = len(urls)
        except IOError:
            # No readable progress file: treat every url as unprocessed.
            pass

        with open(processedUrlsFName, 'a+') as pus:
            pUrls_writer = csv.writer(pus)
            with open(self.eventFile, 'a+') as f:
                writer = csv.writer(f)
                sttm = time.time()

                if self.eventMode == 'parallel':
                    batches = [
                        urls[x:x + batchSize]
                        for x in range(0, len(urls), batchSize)]
                    for b, batch in enumerate(batches):
                        logging.info('Starting batch {0} of  {1}'.format(
                            b + 1, len(batches)))
                        manager = Manager()
                        batchQueue = Queue()
                        batchTimeoutList = manager.list()
                        batchProcessedUrls = manager.list()
                        batchEventQueue = manager.Queue()
                        batchEventsSaved = manager.Value('i', 0)
                        jobs = []
                        # One work item and one worker process per url.
                        for url in batch:
                            batchQueue.put(
                                [self.eventMode, url, batchEventQueue,
                                 batchProcessedUrls, batchTimeoutList])
                        for _ in batch:
                            proc = Process(
                                target=self.eventWorker, args=(batchQueue,))
                            proc.start()
                            jobs.append(proc)
                        # A separate process receives the event queue and
                        # the saved-events counter.
                        writeProc = Process(
                            target=self.writeToCsvWorker, args=(
                                batchEventQueue, batchEventsSaved))
                        time.sleep(2)
                        writeProc.start()
                        for j, job in enumerate(jobs):
                            # 5 seconds per url for each process before timeout
                            job.join(max(60, 5 * len(batch)))
                            if job.is_alive():
                                job.terminate()
                                logging.info(
                                    'Subprocess {0} of {1} timed out'.format(
                                        j + 1, min(24, len(batch))))
                        writeProc.join(max(60, 8 * len(batch)))
                        totalEvents += batchEventsSaved.value
                        processedListings += len(batch)
                        for url in set(list(batchProcessedUrls)):
                            pUrls_writer.writerow([url])
                        urlsWithEvents += len(set(list(batchProcessedUrls)))
                        numTimeouts += len(set(list(batchTimeoutList)))
                        durMins, minsLeft = self.timeElapsedLeft(
                            sttm, b + 1, len(batches))
                        logging.info(
                            'Saved {0} new events from {1} of {2} listings. '
                            '\nEstimated time to '
                            'completion: ~{3} min.'.format(
                                batchEventsSaved.value,
                                len(batchProcessedUrls), len(batch), minsLeft))
                        # Kills every process matching "chrome" (see NOTE in
                        # the docstring).
                        os.system(
                            "ps aux | grep chrome | awk ' { print $2 } ' |"
                            " xargs kill -9")

                elif self.eventMode == 'series':
                    for i, url in enumerate(urls):
                        numEvents = 0
                        events = self.getEventsFromListingUrl(
                            self.eventMode, url, None, urls, [])
                        if events is None:
                            durMins, minsLeft = self.timeElapsedLeft(
                                sttm, i + 1, numUrls)
                            logging.info(
                                'No sales events scraped from listing'
                                ' {0} of {1}. Check url: {2}. {3} min.'
                                'elapsed. {4} min. remaining.'.format(
                                    i + 1, numUrls, url, durMins,
                                    minsLeft))
                            continue
                        for event in events:
                            totalEvents += 1
                            numEvents += 1
                            writer.writerow(event)
                        urlsWithEvents += 1
                        pUrls_writer.writerow([url])
                        # NOTE(review): passes `i` here but `i + 1` in the
                        # no-events branch above -- confirm which is meant.
                        durMins, minsLeft = self.timeElapsedLeft(
                            sttm, i, numUrls)
                        if (i + 1) % 1 == 0:
                            logging.info(
                                'Scraped {0} sales events from listing {1}'
                                ' of {2}. Scraped {3} total sales events in'
                                ' {4} min. Estimated time to completion:'
                                ' ~{5} min.'.format(
                                    numEvents, i + 1, numUrls, totalEvents,
                                    durMins, minsLeft))
                else:
                    raise ValueError(
                        'Must specify valid event scraping '
                        'mode: ["parallel", "series"]')
        if numUrls > 0:
            # Multiply by the float first: with two ints, `/` truncates on
            # Python 2 and the percentage collapses to 0 or 100.
            self.pctUrlsWithEvents = round(
                100.0 * urlsWithEvents / origNumUrls, 1)
        else:
            self.pctUrlsWithEvents = -999

        logging.info('#' * 100)
        logging.info('#' * 100)
        logging.info(
            'Scraped events from {0} of {1} ({2}%) urls.'.format(
                urlsWithEvents, numUrls, self.pctUrlsWithEvents).center(
                90, ' ').center(100, '#').upper())
        logging.info(
            ('{0} of {1} urls timed out while scraping events.'.format(
                numTimeouts, numUrls).upper().center(90, ' ').center(
                100, '#')))
        logging.info(
            ('Saved {0} events to {1}'.format(
                totalEvents, self.eventFile).upper().center(
                90, ' ').center(100, '#')))
        logging.info('#' * 100)
        logging.info('#' * 100)
Exemplo n.º 23
0
class TaskManager:
    """
    User-facing Class for interfacing with OpenWPM
    The TaskManager spawns several child processes to run the automation tasks.
        - DataAggregator to aggregate data in a SQLite database
        - MPLogger to aggregate logs across processes
        - BrowserManager processes to isolate Browsers in a separate process
    <manager_params> dict of TaskManager configuration parameters
    <browser_params> is a list of (or a single) dictionaries that specify preferences for browsers to instantiate
    <process_watchdog> will monitor firefox and Xvfb processes, killing any not indexed in TaskManager's browser list.
        NOTE: Only run this in isolated environments. It kills processes by name, indiscriminately.
    """
    def __init__(self, manager_params, browser_params, process_watchdog=False):
        """
        Set up the database, logging server, aggregators and browsers.
        <manager_params> is modified in place: paths are expanded and the
        logger/aggregator addresses are added to it.
        Raises an Exception if the number of <browser_params> dicts differs
        from manager_params['num_browsers'].
        """

        # Expand '~' in the directory paths and derive the file paths
        for path in ['data_directory', 'log_directory']:
            if manager_params[path] is not None:
                manager_params[path] = os.path.expanduser(manager_params[path])
        manager_params['database_name'] = os.path.join(
            manager_params['data_directory'], manager_params['database_name'])
        manager_params['log_file'] = os.path.join(
            manager_params['log_directory'], manager_params['log_file'])
        manager_params['screenshot_path'] = os.path.join(
            manager_params['data_directory'], 'screenshots')
        manager_params['source_dump_path'] = os.path.join(
            manager_params['data_directory'], 'sources')
        self.manager_params = manager_params

        # Create data directories if they do not exist
        if not os.path.exists(manager_params['screenshot_path']):
            os.makedirs(manager_params['screenshot_path'])
        if not os.path.exists(manager_params['source_dump_path']):
            os.makedirs(manager_params['source_dump_path'])

        # check size of parameter dictionary
        self.num_browsers = manager_params['num_browsers']
        if len(browser_params) != self.num_browsers:
            raise Exception(
                "Number of <browser_params> dicts is not the same as manager_params['num_browsers']"
            )

        # Flow control
        self.closing = False
        self.failure_status = None
        self.threadlock = threading.Lock()
        self.failurecount = 0
        if manager_params['failure_limit'] is not None:
            self.failure_limit = manager_params['failure_limit']
        else:
            self.failure_limit = self.num_browsers * 2 + 10

        self.process_watchdog = process_watchdog

        # sets up the crawl data database
        db_path = manager_params['database_name']
        if not os.path.exists(manager_params['data_directory']):
            os.mkdir(manager_params['data_directory'])
        self.db = sqlite3.connect(db_path)
        with open(os.path.join(os.path.dirname(__file__), 'schema.sql'),
                  'r') as f:
            self.db.executescript(f.read())
        self.db.commit()

        # sets up logging server + connect a client
        self.logging_status_queue = None
        self.loggingserver = self._launch_loggingserver()
        # socket location: (address, port)
        self.manager_params['logger_address'] = self.logging_status_queue.get()
        self.logger = MPLogger.loggingclient(
            *self.manager_params['logger_address'])

        # Mark if LDBAggregator is needed (if js is enabled on any browser)
        self.ldb_enabled = False
        for params in browser_params:
            if params['save_javascript'] or params['save_javascript_proxy']:
                self.ldb_enabled = True
                break

        # Initialize the data aggregators
        self._launch_aggregators()

        # open client socket
        self.sock = clientsocket(serialization='dill')
        self.sock.connect(*self.manager_params['aggregator_address'])
        self._save_configuration(browser_params)

        # read the last used site visit id
        cur = self.db.cursor()
        cur.execute("SELECT MAX(visit_id) from site_visits")
        last_visit_id = cur.fetchone()[0]
        if last_visit_id is None:
            last_visit_id = 0
        self.next_visit_id = last_visit_id + 1

        # sets up the BrowserManager(s) + associated queues
        self.browsers = self._initialize_browsers(
            browser_params)  # List of the Browser(s)
        self._launch_browsers()

        # start the manager watchdog
        thread = threading.Thread(target=self._manager_watchdog, args=())
        thread.daemon = True
        thread.start()

    def _save_configuration(self, browser_params):
        """
        Store the crawl configuration in the database and log it.
        Inserts one `task` row (its id becomes self.task_id) and one `crawl`
        row per browser; each browser's row id is written back into its
        params dict under 'crawl_id'.
        """
        cursor = self.db.cursor()

        # Version information for OpenWPM and the browser
        openwpm_v, browser_v = get_version()

        # Task row
        task_values = (json.dumps(self.manager_params), openwpm_v, browser_v)
        cursor.execute(("INSERT INTO task "
                        "(manager_params, openwpm_version, browser_version) "
                        "VALUES (?,?,?)"),
                       task_values)
        self.db.commit()
        self.task_id = cursor.lastrowid

        # One crawl row per browser, committed individually
        for index in xrange(self.num_browsers):
            params = browser_params[index]
            crawl_values = (self.task_id, json.dumps(params))
            cursor.execute(
                "INSERT INTO crawl (task_id, browser_params) VALUES (?,?)",
                crawl_values)
            self.db.commit()
            params['crawl_id'] = cursor.lastrowid

        # Log the full configuration
        summary = get_configuration_string(self.manager_params,
                                           browser_params,
                                           (openwpm_v, browser_v))
        self.logger.info(summary)

    def _initialize_browsers(self, browser_params):
        """ build one Browser per configured browser, each from its own params dict """
        return [
            Browser(self.manager_params, browser_params[index])
            for index in xrange(self.num_browsers)
        ]

    def _launch_browsers(self):
        print 8
        print self.browsers
        """ launch each browser manager process / browser """
        for browser in self.browsers:
            try:
                print 9
                success = browser.launch_browser_manager()
                print 6
            except:
                print 7
                self._cleanup_before_fail(during_init=True)
                raise

            if not success:
                self.logger.critical(
                    "Browser spawn failure during TaskManager initialization, exiting..."
                )
                self.close()
                break

            # Update our DB with the random browser settings
            # These are found within the scope of each instance of Browser in the browsers list
            screen_res = str(browser.browser_settings['screen_res'])
            ua_string = str(browser.browser_settings['ua_string'])
            self.sock.send(("UPDATE crawl SET screen_res = ?, ua_string = ? \
                             WHERE crawl_id = ?", (screen_res, ua_string,
                                                   browser.crawl_id)))

    def _manager_watchdog(self):
        """
        Periodically checks the following:
        - memory consumption of all browsers every 10 seconds
        - presence of processes that are no longer in use
        """
        while not self.closing:
            time.sleep(10)

            # Check browser memory usage
            for browser in self.browsers:
                try:
                    process = psutil.Process(browser.browser_pid)
                    mem = process.memory_info()[0] / float(2**20)
                    if mem > BROWSER_MEMORY_LIMIT:
                        self.logger.info(
                            "BROWSER %i: Memory usage: %iMB, exceeding limit of %iMB"
                            %
                            (browser.crawl_id, int(mem), BROWSER_MEMORY_LIMIT))
                        browser.restart_required = True
                except psutil.NoSuchProcess:
                    pass

            # Check for browsers or displays that were not closed correctly
            # Provide a 300 second buffer to avoid killing freshly launched browsers
            # TODO This buffer should correspond to the maximum browser spawn timeout
            if self.process_watchdog:
                browser_pids = set()
                display_pids = set()
                check_time = time.time()
                for browser in self.browsers:
                    if browser.browser_pid is not None:
                        browser_pids.add(browser.browser_pid)
                    if browser.display_pid is not None:
                        display_pids.add(browser.display_pid)
                for process in psutil.process_iter():
                    if (process.create_time() + 300 < check_time
                            and ((process.name() == 'firefox'
                                  and process.pid not in browser_pids) or
                                 (process.name() == 'Xvfb'
                                  and process.pid not in display_pids))):
                        self.logger.debug(
                            "Process: %s (pid: %i) with start time %s found running but not in browser process list. Killing."
                            % (process.name(), process.pid,
                               process.create_time()))
                        process.kill()

    def _launch_aggregators(self):
        """
        Starts the data aggregator processes, which serialize data from all processes.
        * DataAggregator - sqlite database for crawl data
        * LevelDBAggregator - leveldb database for javascript files
          (only when self.ldb_enabled is set)
        Each aggregator reports its socket location through its status queue;
        the location is stored in self.manager_params.
        """
        # DataAggregator
        self.aggregator_status_queue = Queue()
        data_args = (self.manager_params, self.aggregator_status_queue)
        self.data_aggregator = Process(target=DataAggregator.DataAggregator,
                                       args=data_args)
        self.data_aggregator.daemon = True
        self.data_aggregator.start()
        # socket location: (address, port)
        data_address = self.aggregator_status_queue.get()
        self.manager_params['aggregator_address'] = data_address

        # LevelDB Aggregator
        if not self.ldb_enabled:
            return
        self.ldb_status_queue = Queue()
        ldb_args = (self.manager_params, self.ldb_status_queue)
        self.ldb_aggregator = Process(
            target=LevelDBAggregator.LevelDBAggregator, args=ldb_args)
        self.ldb_aggregator.daemon = True
        self.ldb_aggregator.start()
        # socket location: (address, port)
        ldb_address = self.ldb_status_queue.get()
        self.manager_params['ldb_address'] = ldb_address

    def _kill_aggregators(self):
        """ Terminates the aggregators gracefully """
        # DataAggregator
        self.logger.debug("Telling the DataAggregator to shut down...")
        self.aggregator_status_queue.put("DIE")
        start_time = time.time()
        self.data_aggregator.join(300)
        self.logger.debug("DataAggregator took " +
                          str(time.time() - start_time) + " seconds to close")

        # LevelDB Aggregator
        if self.ldb_enabled:
            self.logger.debug("Telling the LevelDBAggregator to shut down...")
            self.ldb_status_queue.put("DIE")
            start_time = time.time()
            self.ldb_aggregator.join(300)
            self.logger.debug("LevelDBAggregator took " +
                              str(time.time() - start_time) +
                              " seconds to close")

    def _launch_loggingserver(self):
        """ starts the logging server as a daemon process and returns that process """
        status_queue = Queue()
        self.logging_status_queue = status_queue
        server_args = (self.manager_params['log_file'], status_queue)
        server = Process(target=MPLogger.loggingserver, args=server_args)
        server.daemon = True
        server.start()
        return server

    def _kill_loggingserver(self):
        """ terminates logging server gracefully """
        self.logging_status_queue.put("DIE")
        self.loggingserver.join(300)

    def _shutdown_manager(self, failure=False, during_init=False):
        """
        Wait for current commands to finish, close all child processes and
        threads
        <failure> flag to indicate manager failure (True) or end of crawl (False)
        <during_init> flag to indicator if this shutdown is occuring during the TaskManager initialization
        """
        self.closing = True

        for browser in self.browsers:
            browser.shutdown_browser(during_init)
            if failure:
                self.sock.send(
                    ("UPDATE crawl SET finished = -1 WHERE crawl_id = ?",
                     (browser.crawl_id, )))
            else:
                self.sock.send(
                    ("UPDATE crawl SET finished = 1 WHERE crawl_id = ?",
                     (browser.crawl_id, )))

        self.db.close()  # close db connection
        self.sock.close()  # close socket to data aggregator
        self._kill_aggregators()
        self._kill_loggingserver()

    def _cleanup_before_fail(self, during_init=False):
        """
        Execute shutdown commands before throwing an exception
        This should keep us from having a bunch of hanging processes
        and incomplete data.
        <during_init> flag to indicator if this shutdown is occuring during
                      the TaskManager initialization
        """
        self._shutdown_manager(failure=True, during_init=during_init)

    def _check_failure_status(self):
        """ Check the status of command failures. Raise exceptions as necessary

        The failure status property is used by the various asynchronous
        command execution threads which interface with the
        remote browser manager processes. If a failure status is found, the
        appropriate steps are taken to gracefully close the infrastructure
        """
        self.logger.debug("Checking command failure status indicator...")
        if self.failure_status:
            self.logger.debug(
                "TaskManager failure status set, halting command execution.")
            self._cleanup_before_fail()
            if self.failure_status['ErrorType'] == 'ExceedCommandFailureLimit':
                raise CommandExecutionError(
                    "TaskManager exceeded maximum consecutive command "
                    "execution failures.",
                    self.failure_status['CommandSequence'])
            elif self.failure_status[
                    'ErrorType'] == 'ExceedLaunchFailureLimit':
                raise CommandExecutionError(
                    "TaskManager failed to launch browser within allowable "
                    "failure limit.", self.failure_status['CommandSequence'])
            if self.failure_status['ErrorType'] == 'CriticalChildException':
                reraise(*cPickle.loads(self.failure_status['Exception']))

    # CRAWLER COMMAND CODE

    def _distribute_command(self, command_sequence, index=None):
        """
        parses command type and issues command(s) to the proper browser
        <index> specifies the type of command this is:
        = None  -> first come, first serve
        =  #    -> index of browser to send command to
        = *     -> sends command to all browsers
        = **    -> sends command to all browsers (synchronized)

        Any other <index> is logged and the command sequence is dropped.
        If the command sequence is blocking, this call waits for every
        command thread it started and then checks the failure status.
        """
        total_timeout = command_sequence.total_timeout
        # Results of _start_thread; an entry is None when the TaskManager
        # refused to start a thread because it is closing.
        threads = []

        if index is None:
            # send to the first browser available, polling until one is ready
            while not threads:
                for browser in self.browsers:
                    if browser.ready():
                        browser.current_timeout = total_timeout
                        threads.append(
                            self._start_thread(browser, command_sequence))
                        break
                else:
                    time.sleep(SLEEP_CONS)
        elif index in ('*', '**'):
            # send the command to all browsers; '**' additionally parks every
            # thread on a condition so that they all start together
            condition = threading.Condition() if index == '**' else None
            pending = list(self.browsers)
            while pending:
                not_ready = []
                for browser in pending:
                    if browser.ready():
                        browser.current_timeout = total_timeout
                        threads.append(self._start_thread(
                            browser, command_sequence, condition))
                    else:
                        not_ready.append(browser)
                pending = not_ready
                # Also sleeps after the final pass: for '**' this gives the
                # last thread time to reach condition.wait() before the
                # notification below is sent.
                time.sleep(SLEEP_CONS)
            if condition is not None:
                with condition:
                    condition.notify_all()  # All browsers loaded, start
        else:
            try:
                in_range = 0 <= index < len(self.browsers)
            except TypeError:
                # index is of a type that cannot be compared with an int
                in_range = False
            if not in_range:
                self.logger.info(
                    "Command index type is not supported or out of range")
                return
            # send the command to this specific browser once it is ready
            browser = self.browsers[index]
            while not browser.ready():
                time.sleep(SLEEP_CONS)
            browser.current_timeout = total_timeout
            threads.append(self._start_thread(browser, command_sequence))

        if command_sequence.blocking:
            started = [thread for thread in threads if thread is not None]
            for thread in started:
                thread.join()
            # Nothing was started when the manager is closing; the failure
            # status check (and its shutdown) is only useful after real work.
            if started:
                self._check_failure_status()

    def _start_thread(self, browser, command_sequence, condition=None):
        """  registers a new site visit and starts the command thread

        Returns the started thread, or None (after logging an error) when
        the TaskManager is already closing.
        """
        # No new work is accepted once shutdown has begun
        if self.closing:
            self.logger.error(
                "Attempted to execute command on a closed TaskManager")
            return None
        self._check_failure_status()

        # Hand the next visit id to the browser and record the visit
        visit_id = self.next_visit_id
        browser.set_visit_id(visit_id)
        visit_record = (
            "INSERT INTO site_visits (visit_id, crawl_id, site_url) VALUES (?,?,?)",
            (visit_id, browser.crawl_id, command_sequence.url))
        self.sock.send(visit_record)
        self.next_visit_id += 1

        # Run the command sequence in a daemon thread
        worker = threading.Thread(
            target=self._issue_command,
            args=(browser, command_sequence, condition))
        browser.command_thread = worker
        worker.daemon = True
        worker.start()
        return worker

    def _issue_command(self, browser, command_sequence, condition=None):
        """
        sends each command tuple of <command_sequence> to the BrowserManager
        of <browser> and waits for its reply

        <condition> optional condition object; when given, this thread waits
                    on it before sending anything (synchronized calls)

        Every command is put on browser.command_queue and its status is read
        from browser.status_queue, waiting at most the command's timeout.
        The outcome (1 success, 0 failure, -1 timeout) is sent over self.sock
        as a CrawlHistory row. The first unsuccessful command flags the
        browser for a restart and ends the sequence.

        The method returns early, after setting self.failure_status, when
        the browser reports a critical error, when the count of consecutive
        failures exceeds self.failure_limit, or when restarting the browser
        manager fails.
        """
        # since we are issuing a command, the BrowserManager is no longer a
        # fresh instance
        browser.is_fresh = False  # since we are issuing a command, the BrowserManager is no longer a fresh instance

        # if this is a synced call, block on condition
        # NOTE(review): a notification sent before this thread reaches
        # wait() would be missed -- confirm the caller's ordering guarantees
        if condition is not None:
            with condition:
                condition.wait()

        reset = command_sequence.reset
        start_time = None  # tracks when a site visit started, so that flash/profile
        # cookies can be properly tracked.
        for command_and_timeout in command_sequence.commands_with_timeout:
            command, timeout = command_and_timeout
            # Page-visiting commands get the visit id appended; cookie dumps
            # get the start time of the visit plus the visit id.
            if command[0] in ['GET', 'BROWSE']:
                start_time = time.time()
                command += (browser.curr_visit_id, )
            elif command[0] in ['DUMP_FLASH_COOKIES', 'DUMP_PROFILE_COOKIES']:
                command += (
                    start_time,
                    browser.curr_visit_id,
                )
            browser.current_timeout = timeout
            # passes off command and waits for a success (or failure signal)
            browser.command_queue.put(command)
            command_succeeded = 0  #1 success, 0 failure from error, -1 timeout
            command_arguments = command[1] if len(command) > 1 else None

            # received reply from BrowserManager, either success signal or failure notice
            try:
                status = browser.status_queue.get(True,
                                                  browser.current_timeout)
                if status == "OK":
                    command_succeeded = 1
                elif status[0] == "CRITICAL":
                    # Critical errors abort immediately; status[1] carries
                    # the exception payload stored for the main thread
                    self.logger.critical(
                        "BROWSER %i: Received critical error "
                        "from browser process while executing "
                        "command %s. Setting failure status." %
                        (browser.crawl_id, str(command)))
                    self.failure_status = {
                        'ErrorType': 'CriticalChildException',
                        'CommandSequence': command_sequence,
                        'Exception': status[1]
                    }
                    return
                else:
                    command_succeeded = 0
                    self.logger.info(
                        "BROWSER %i: Received failure status while"
                        " executing command: %s" %
                        (browser.crawl_id, command[0]))
            except EmptyQueue:
                # No reply within the timeout
                command_succeeded = -1
                self.logger.info(
                    "BROWSER %i: Timeout while executing command, "
                    "%s, killing browser manager" %
                    (browser.crawl_id, command[0]))

            # Record the outcome of this command
            self.sock.send((
                "INSERT INTO CrawlHistory (crawl_id, command, arguments, bool_success)"
                " VALUES (?,?,?,?)", (browser.crawl_id, command[0],
                                      command_arguments, command_succeeded)))

            if command_succeeded != 1:
                # The counter is shared between command threads; it is
                # incremented under the lock but compared outside of it
                with self.threadlock:
                    self.failurecount += 1
                if self.failurecount > self.failure_limit:
                    self.logger.critical(
                        "BROWSER %i: Command execution failure"
                        " pushes failure count above the allowable limit."
                        " Setting failure_status." % browser.crawl_id)
                    self.failure_status = {
                        'ErrorType': 'ExceedCommandFailureLimit',
                        'CommandSequence': command_sequence
                    }
                    return
                browser.restart_required = True
            else:
                # Any success resets the count of consecutive failures
                with self.threadlock:
                    self.failurecount = 0

            # Skip the remaining commands once a restart has been requested
            if browser.restart_required:
                break

        # Do not restart browsers while the TaskManager is shutting down
        if self.closing:
            return

        # <reset> also decides whether the profile is cleared on restart
        if browser.restart_required or reset:
            success = browser.restart_browser_manager(clear_profile=reset)
            if not success:
                self.logger.critical(
                    "BROWSER %i: Exceeded the maximum allowable "
                    "consecutive browser launch failures. "
                    "Setting failure_status." % browser.crawl_id)
                self.failure_status = {
                    'ErrorType': 'ExceedLaunchFailureLimit',
                    'CommandSequence': command_sequence
                }
                return
            browser.restart_required = False

    def execute_command_sequence(self, command_sequence, index=None):
        """ hands <command_sequence> to the browser(s) selected by <index>

        <index> is passed through unchanged to _distribute_command.
        """
        self._distribute_command(command_sequence, index=index)

    # DEFINITIONS OF HIGH LEVEL COMMANDS
    # NOTE: These wrappers are provided for convenience. To issue sequential
    # commands to the same browser in a single 'visit', use the CommandSequence
    # class directly.

    def get(self, url, index=None, timeout=60, sleep=0, reset=False):
        """ visits <url> on the browser(s) selected by <index>

        Builds a single-command CommandSequence: <timeout> and <sleep> are
        forwarded to its get command and <reset> is stored on the sequence
        before it is executed.
        """
        visit = CommandSequence.CommandSequence(url)
        visit.get(sleep=sleep, timeout=timeout)
        visit.reset = reset
        self.execute_command_sequence(visit, index=index)

    def browse(self,
               url,
               num_links=2,
               sleep=0,
               index=None,
               timeout=60,
               reset=False):
        """ browse a website and visit <num_links> links on the page

        Builds a single-command CommandSequence for <url>: <num_links>,
        <sleep> and <timeout> are forwarded to its browse command and <reset>
        is stored on the sequence before it is executed on the browser(s)
        selected by <index>.
        """
        command_sequence = CommandSequence.CommandSequence(url)
        # Issue a browse command, not a plain get, so that <num_links> is
        # actually honoured.
        command_sequence.browse(num_links=num_links,
                                sleep=sleep,
                                timeout=timeout)
        command_sequence.reset = reset
        self.execute_command_sequence(command_sequence, index=index)

    def close(self):
        """
        Shut the TaskManager down, unless it is already closing (in which
        case an error is logged and nothing else happens)
        """
        if not self.closing:
            self._shutdown_manager()
            return
        self.logger.error("TaskManager already closed")
Exemplo n.º 24
0
class TaskManager:
    """
    User-facing Class for interfacing with OpenWPM
    The TaskManager spawns several child processes to run the automation tasks.
        - DataAggregator to aggregate data in a SQLite database
        - MPLogger to aggregate logs across processes
        - BrowserManager processes to isolate Browsers in a separate process
    <manager_params> dict of TaskManager configuration parameters
    <browser_params> is a list of (or a single) dictionaries that specify
    preferences for browsers to instantiate
    <process_watchdog> will monitor firefox and Xvfb processes, killing
    any not indexed in TaskManager's browser list.
        NOTE: Only run this in isolated environments. It kills processes
        by name, indiscriminately.
    """
    def __init__(self, manager_params, browser_params, process_watchdog=False):
        """
        Set up the TaskManager and launch its child processes.

        <manager_params> dict of TaskManager configuration parameters. It is
            modified in place: directory paths are expanded, derived paths
            are added, and the logger / aggregator addresses are stored in it
        <browser_params> list of browser configuration dicts; its length
            must equal manager_params['num_browsers']
        <process_watchdog> stored and read by the manager watchdog thread

        Raises Exception when the number of <browser_params> dicts does not
        match manager_params['num_browsers'].
        """

        # Expand '~' in the directory paths of manager_params and derive the
        # database, log, screenshot and source dump locations from them
        for path in ['data_directory', 'log_directory']:
            if manager_params[path] is not None:
                manager_params[path] = os.path.expanduser(manager_params[path])
        manager_params['database_name'] = os.path.join(
            manager_params['data_directory'], manager_params['database_name'])
        manager_params['log_file'] = os.path.join(
            manager_params['log_directory'], manager_params['log_file'])
        manager_params['screenshot_path'] = os.path.join(
            manager_params['data_directory'], 'screenshots')
        manager_params['source_dump_path'] = os.path.join(
            manager_params['data_directory'], 'sources')
        self.manager_params = manager_params
        self.browser_params = browser_params

        # Create data directories if they do not exist
        if not os.path.exists(manager_params['screenshot_path']):
            os.makedirs(manager_params['screenshot_path'])
        if not os.path.exists(manager_params['source_dump_path']):
            os.makedirs(manager_params['source_dump_path'])

        # check size of parameter dictionary
        self.num_browsers = manager_params['num_browsers']
        if len(browser_params) != self.num_browsers:
            raise Exception("Number of <browser_params> dicts is not the same "
                            "as manager_params['num_browsers']")

        # Flow control
        self.closing = False
        self.failure_status = None
        self.threadlock = threading.Lock()
        self.failurecount = 0
        # Without an explicit limit, the default scales with the number of
        # browsers
        if manager_params['failure_limit'] is not None:
            self.failure_limit = manager_params['failure_limit']
        else:
            self.failure_limit = self.num_browsers * 2 + 10

        self.process_watchdog = process_watchdog

        # sets up logging server + connect a client
        self.logging_status_queue = None
        self.loggingserver = self._launch_loggingserver()
        # socket location: (address, port)
        # The first item on the status queue is used as the logger address
        self.manager_params['logger_address'] = self.logging_status_queue.get()
        self.logger = MPLogger.loggingclient(
            *self.manager_params['logger_address'])

        # Mark if LDBAggregator is needed
        # (if content saving is enabled on any browser)
        self.ldb_enabled = False
        for params in browser_params:
            if params['save_javascript'] or params['save_all_content']:
                self.ldb_enabled = True
                break

        # Initialize the data aggregators
        self._launch_aggregators()

        # sets up the BrowserManager(s) + associated queues
        self.browsers = self._initialize_browsers(browser_params)
        self._launch_browsers()

        # start the manager watchdog
        thread = threading.Thread(target=self._manager_watchdog, args=())
        thread.daemon = True
        thread.start()

        # Save crawl config information to database
        openwpm_v, browser_v = get_version()
        self.data_aggregator.save_configuration(openwpm_v, browser_v)
        self.logger.info(
            get_configuration_string(self.manager_params, browser_params,
                                     (openwpm_v, browser_v)))

    def _initialize_browsers(self, browser_params):
        """ initialize the browser classes, each its unique set of params """
        browsers = list()
        for i in range(self.num_browsers):
            browser_params[i][
                'crawl_id'] = self.data_aggregator.get_next_crawl_id()
            browsers.append(Browser(self.manager_params, browser_params[i]))

        return browsers

    def _launch_browsers(self):
        """ launch each browser manager process / browser """
        for browser in self.browsers:
            try:
                success = browser.launch_browser_manager()
            except Exception:
                self._cleanup_before_fail(during_init=True)
                raise

            if not success:
                self.logger.critical("Browser spawn failure during "
                                     "TaskManager initialization, exiting...")
                self.close()
                break

    def _manager_watchdog(self):
        """
        Periodically checks the following:
        - memory consumption of all browsers every 10 seconds
        - presence of processes that are no longer in use

        TODO: process watchdog needs to be updated since `psutil` won't
              kill browser processes started by Selenium 3 (with `subprocess`)
        """
        if self.process_watchdog:
            self.logger.error("BROWSER %i: Process watchdog is not currently "
                              "supported." % self.crawl_id)
        while not self.closing:
            time.sleep(10)

            # Check browser memory usage
            for browser in self.browsers:
                try:
                    process = psutil.Process(browser.browser_pid)
                    mem = process.memory_info()[0] / float(2**20)
                    if mem > BROWSER_MEMORY_LIMIT:
                        self.logger.info(
                            "BROWSER %i: Memory usage: %iMB"
                            ", exceeding limit of %iMB" %
                            (browser.crawl_id, int(mem), BROWSER_MEMORY_LIMIT))
                        browser.restart_required = True
                except psutil.NoSuchProcess:
                    pass

            # Check for browsers or displays that were not closed correctly
            # 300 second buffer to avoid killing freshly launched browsers
            # TODO This buffer should correspond to the maximum spawn timeout
            if self.process_watchdog:
                browser_pids = set()
                display_pids = set()
                check_time = time.time()
                for browser in self.browsers:
                    if browser.browser_pid is not None:
                        browser_pids.add(browser.browser_pid)
                    if browser.display_pid is not None:
                        display_pids.add(browser.display_pid)
                for process in psutil.process_iter():
                    if (process.create_time() + 300 < check_time
                            and ((process.name() == 'firefox'
                                  and process.pid not in browser_pids) or
                                 (process.name() == 'Xvfb'
                                  and process.pid not in display_pids))):
                        self.logger.debug("Process: %s (pid: %i) with start "
                                          "time %s found running but not in "
                                          "browser process list. Killing." %
                                          (process.name(), process.pid,
                                           process.create_time()))
                        process.kill()

    def _launch_aggregators(self):
        """Start the data aggregator(s) and connect to the SQLite one.

        The listener address of each launched aggregator is stored in
        self.manager_params ('aggregator_address', and 'ldb_address' when
        the LevelDB aggregator is enabled).
        """
        aggregator = SqliteAggregator.SqliteAggregator(
            self.manager_params, self.browser_params)
        self.data_aggregator = aggregator
        aggregator.launch()
        address = aggregator.listener_address
        self.manager_params['aggregator_address'] = address

        # socket used by the TaskManager itself for saving crawl details
        self.sock = clientsocket(serialization='dill')
        self.sock.connect(*self.manager_params['aggregator_address'])

        # TODO refactor ldb aggregator to use new base classes
        if not self.ldb_enabled:
            return
        self.ldb_status_queue = Queue()
        ldb_args = (self.manager_params, self.ldb_status_queue)
        self.ldb_aggregator = Process(
            target=LevelDBAggregator.LevelDBAggregator, args=ldb_args)
        self.ldb_aggregator.daemon = True
        self.ldb_aggregator.start()
        # socket location: (address, port)
        self.manager_params['ldb_address'] = self.ldb_status_queue.get()

    def _kill_aggregators(self):
        """Shutdown any currently running data aggregators"""
        self.data_aggregator.shutdown()

        # TODO refactor ldb aggregator to use new base classes
        if self.ldb_enabled:
            self.logger.debug("Telling the LevelDBAggregator to shut down...")
            self.ldb_status_queue.put("DIE")
            start_time = time.time()
            self.ldb_aggregator.join(300)
            self.logger.debug("LevelDBAggregator took %s seconds to close." %
                              (str(time.time() - start_time)))

    def _launch_loggingserver(self):
        """ starts the logging server in a daemon process and returns it

        A new status queue is stored in self.logging_status_queue and handed
        to the server together with the configured log file.
        """
        self.logging_status_queue = Queue()
        server_args = (
            self.manager_params['log_file'],
            self.logging_status_queue,
        )
        server = Process(target=MPLogger.loggingserver, args=server_args)
        server.daemon = True
        server.start()
        return server

    def _kill_loggingserver(self):
        """ terminates logging server gracefully """
        self.logging_status_queue.put("DIE")
        self.loggingserver.join(300)

    def _shutdown_manager(self, during_init=False):
        """
        Wait for current commands to finish, close all child processes and
        threads
        <during_init> flag to indicator if this shutdown is occuring during
                      the TaskManager initialization
        """
        self.closing = True

        for browser in self.browsers:
            browser.shutdown_browser(during_init)

        self.sock.close()  # close socket to data aggregator
        self._kill_aggregators()
        self._kill_loggingserver()

    def _cleanup_before_fail(self, during_init=False):
        """
        Execute shutdown commands before throwing an exception
        This should keep us from having a bunch of hanging processes
        and incomplete data.
        <during_init> flag to indicator if this shutdown is occuring during
                      the TaskManager initialization
        """
        self._shutdown_manager(during_init=during_init)

    def _check_failure_status(self):
        """ Check the status of command failures. Raise exceptions as necessary

        The failure status property is used by the various asynchronous
        command execution threads which interface with the
        remote browser manager processes. If a failure status is found, the
        appropriate steps are taken to gracefully close the infrastructure
        """
        self.logger.debug("Checking command failure status indicator...")
        if self.failure_status:
            self.logger.debug(
                "TaskManager failure status set, halting command execution.")
            self._cleanup_before_fail()
            if self.failure_status['ErrorType'] == 'ExceedCommandFailureLimit':
                raise CommandExecutionError(
                    "TaskManager exceeded maximum consecutive command "
                    "execution failures.",
                    self.failure_status['CommandSequence'])
            elif (self.failure_status['ErrorType'] == ("ExceedLaunch"
                                                       "FailureLimit")):
                raise CommandExecutionError(
                    "TaskManager failed to launch browser within allowable "
                    "failure limit.", self.failure_status['CommandSequence'])
            if self.failure_status['ErrorType'] == 'CriticalChildException':
                reraise(*pickle.loads(self.failure_status['Exception']))

    # CRAWLER COMMAND CODE

    def _distribute_command(self, command_seq, index=None):
        """
        parses command type and issues command(s) to the proper browser
        <index> specifies the type of command this is:
        = None  -> first come, first serve
        =  #    -> index of browser to send command to
        = *     -> sends command to all browsers
        = **    -> sends command to all browsers (synchronized)
        """
        if index is None:
            # send to first browser available
            command_executed = False
            while True:
                for browser in self.browsers:
                    if browser.ready():
                        browser.current_timeout = command_seq.total_timeout
                        thread = self._start_thread(browser, command_seq)
                        command_executed = True
                        break
                if command_executed:
                    break
                time.sleep(SLEEP_CONS)

        elif index == '*':
            # send the command to all browsers
            command_executed = [False] * len(self.browsers)
            while False in command_executed:
                for i in range(len(self.browsers)):
                    if self.browsers[i].ready() and not command_executed[i]:
                        self.browsers[
                            i].current_timeout = command_seq.total_timeout
                        thread = self._start_thread(self.browsers[i],
                                                    command_seq)
                        command_executed[i] = True
                time.sleep(SLEEP_CONS)
        elif index == '**':
            # send the command to all browsers and sync it
            condition = threading.Condition()  # block threads until ready
            command_executed = [False] * len(self.browsers)
            while False in command_executed:
                for i in range(len(self.browsers)):
                    if self.browsers[i].ready() and not command_executed[i]:
                        self.browsers[
                            i].current_timeout = command_seq.total_timeout
                        thread = self._start_thread(self.browsers[i],
                                                    command_seq, condition)
                        command_executed[i] = True
                time.sleep(SLEEP_CONS)
            with condition:
                condition.notifyAll()  # All browsers loaded, start
        elif 0 <= index < len(self.browsers):
            # send the command to this specific browser
            while True:
                if self.browsers[index].ready():
                    self.browsers[
                        index].current_timeout = command_seq.total_timeout
                    thread = self._start_thread(self.browsers[index],
                                                command_seq)
                    break
                time.sleep(SLEEP_CONS)
        else:
            self.logger.info(
                "Command index type is not supported or out of range")
            return

        if command_seq.blocking:
            thread.join()
            self._check_failure_status()

    def _start_thread(self, browser, command_sequence, condition=None):
        """  starts the command execution thread """

        # Check status flags before starting thread
        if self.closing:
            self.logger.error(
                "Attempted to execute command on a closed TaskManager")
            return
        self._check_failure_status()

        browser.set_visit_id(self.data_aggregator.get_next_visit_id())
        self.sock.send(("site_visits", {
            "visit_id": browser.curr_visit_id,
            "crawl_id": browser.crawl_id,
            "site_url": command_sequence.url
        }))

        # Start command execution thread
        args = (browser, command_sequence, condition)
        thread = threading.Thread(target=self._issue_command, args=args)
        browser.command_thread = thread
        thread.daemon = True
        thread.start()
        return thread

    def _issue_command(self, browser, command_sequence, condition=None):
        """
        sends each command tuple of <command_sequence> to the BrowserManager
        of <browser> and waits for its reply

        <condition> optional condition object; when given, this thread waits
                    on it before sending anything (synchronized calls)

        Every command is put on browser.command_queue and its status is read
        from browser.status_queue, waiting at most the command's timeout.
        The outcome (1 success, 0 error, -1 timeout) is sent over self.sock
        as a "crawl_history" record. The first unsuccessful command flags
        the browser for a restart and ends the sequence.

        The method returns early, after setting self.failure_status, when
        the browser reports a critical error, when the count of consecutive
        failures exceeds self.failure_limit, or when restarting the browser
        manager fails.
        """
        # a command is being issued, so the browser is no longer fresh
        browser.is_fresh = False

        # if this is a synced call, block on condition
        # NOTE(review): a notification sent before this thread reaches
        # wait() would be missed -- confirm the caller's ordering guarantees
        if condition is not None:
            with condition:
                condition.wait()

        reset = command_sequence.reset
        # time at which the most recent command of the first group below was
        # issued; passed along with the cookie dump commands
        start_time = None
        for command_and_timeout in command_sequence.commands_with_timeout:
            command, timeout = command_and_timeout
            # These commands get the visit id appended; cookie dumps get the
            # recorded start time plus the visit id.
            if command[0] in [
                    'GET', 'BROWSE', 'SAVE_SCREENSHOT', 'SCREENSHOT_FULL_PAGE',
                    'DUMP_PAGE_SOURCE', 'RECURSIVE_DUMP_PAGE_SOURCE'
            ]:
                start_time = time.time()
                command += (browser.curr_visit_id, )
            elif command[0] in ['DUMP_FLASH_COOKIES', 'DUMP_PROFILE_COOKIES']:
                command += (
                    start_time,
                    browser.curr_visit_id,
                )
            browser.current_timeout = timeout
            # passes off command and waits for a success (or failure signal)
            browser.command_queue.put(command)
            command_succeeded = 0  # 1 success, 0 error, -1 timeout
            command_arguments = command[1] if len(command) > 1 else None

            # received reply from BrowserManager, either success or failure
            try:
                status = browser.status_queue.get(True,
                                                  browser.current_timeout)
                if status == "OK":
                    command_succeeded = 1
                elif status[0] == "CRITICAL":
                    # Critical errors abort immediately; status[1] carries
                    # the exception payload stored for the main thread
                    self.logger.critical(
                        "BROWSER %i: Received critical error from browser "
                        "process while executing command %s. Setting failure "
                        "status." % (browser.crawl_id, str(command)))
                    self.failure_status = {
                        'ErrorType': 'CriticalChildException',
                        'CommandSequence': command_sequence,
                        'Exception': status[1]
                    }
                    return
                else:
                    command_succeeded = 0
                    self.logger.info(
                        "BROWSER %i: Received failure status while executing "
                        "command: %s" % (browser.crawl_id, command[0]))
            except EmptyQueue:
                # No reply within the timeout
                command_succeeded = -1
                self.logger.info(
                    "BROWSER %i: Timeout while executing command, %s, killing "
                    "browser manager" % (browser.crawl_id, command[0]))

            # Record the outcome of this command
            self.sock.send(("crawl_history", {
                "crawl_id": browser.crawl_id,
                "command": command[0],
                "arguments": command_arguments,
                "bool_success": command_succeeded
            }))

            if command_succeeded != 1:
                # The counter is shared between command threads; it is
                # incremented under the lock but compared outside of it
                with self.threadlock:
                    self.failurecount += 1
                if self.failurecount > self.failure_limit:
                    self.logger.critical(
                        "BROWSER %i: Command execution failure pushes failure "
                        "count above the allowable limit. Setting "
                        "failure_status." % browser.crawl_id)
                    self.failure_status = {
                        'ErrorType': 'ExceedCommandFailureLimit',
                        'CommandSequence': command_sequence
                    }
                    return
                browser.restart_required = True
                self.logger.debug("BROWSER %i: Browser restart required" %
                                  (browser.crawl_id))
            else:
                # Any success resets the count of consecutive failures
                with self.threadlock:
                    self.failurecount = 0

            # Skip the remaining commands once a restart has been requested
            if browser.restart_required:
                break

        # Sleep after executing CommandSequence to provide extra time for
        # internal buffers to drain. Stopgap in support of #135
        time.sleep(2)

        # Do not restart browsers while the TaskManager is shutting down
        if self.closing:
            return

        # <reset> also decides whether the profile is cleared on restart
        if browser.restart_required or reset:
            success = browser.restart_browser_manager(clear_profile=reset)
            if not success:
                self.logger.critical(
                    "BROWSER %i: Exceeded the maximum allowable consecutive "
                    "browser launch failures. Setting failure_status." %
                    (browser.crawl_id))
                self.failure_status = {
                    'ErrorType': 'ExceedLaunchFailureLimit',
                    'CommandSequence': command_sequence
                }
                return
            browser.restart_required = False

    def execute_command_sequence(self, command_sequence, index=None):
        self._distribute_command(command_sequence, index)

    # DEFINITIONS OF HIGH LEVEL COMMANDS
    # NOTE: These wrappers are provided for convenience. To issue sequential
    # commands to the same browser in a single 'visit', use the CommandSequence
    # class directly.

    def get(self, url, index=None, timeout=60, sleep=0, reset=False):
        """ visits <url> on the browser(s) selected by <index>

        Builds a single-command CommandSequence: <timeout> and <sleep> are
        forwarded to its get command and <reset> is stored on the sequence
        before it is executed.
        """
        visit = CommandSequence.CommandSequence(url)
        visit.get(sleep=sleep, timeout=timeout)
        visit.reset = reset
        self.execute_command_sequence(visit, index=index)

    def browse(self,
               url,
               num_links=2,
               sleep=0,
               index=None,
               timeout=60,
               reset=False):
        """ browses <url>, visiting <num_links> links found on the page

        Builds a single-command CommandSequence: <num_links>, <sleep> and
        <timeout> are forwarded to its browse command and <reset> is stored
        on the sequence before it is executed on the browser(s) selected by
        <index>.
        """
        browse_options = {
            'num_links': num_links,
            'sleep': sleep,
            'timeout': timeout,
        }
        visit = CommandSequence.CommandSequence(url)
        visit.browse(**browse_options)
        visit.reset = reset
        self.execute_command_sequence(visit, index=index)

    def close(self):
        """
        Execute shutdown procedure for TaskManager
        """
        if self.closing:
            self.logger.error("TaskManager already closed")
            return
        self._shutdown_manager()
Exemplo n.º 25
0
class AmpDecorator(Amplifier):
    """This class 'decorates' the Low-Level Amplifier classes with
    Network-Marker and Save-To-File functionality.

    You use it by decorating (not as in Python-Decorator, but in the GoF
    sense) the low level amplifier class you want to use::

        import libmushu
        from libmushu.ampdecorator import AmpDecorator
        from libmushu.driver.randomamp import RandomAmp

        amp = AmpDecorator(RandomAmp)

    Warning: The network marker timings on Windows have a resolution of
    10ms-15ms. On Linux the resolution is 1us. This is due to
    limitations of Python's time.time method, or rather a Windows
    specific issue.

    There exists currently no precise timer, providing times which are
    comparable between two running processes on Windows. The performance
    counter provided on Windows, has a much better resolution but is
    relative to the processes start time and it drifts (1s per 100s), so
    it is only precise for a relatively short amount of time.

    If a higher precision is needed one has to replace the time.time
    calls with something which provides a better precision. For example
    one could create a third process which provides times or regularly
    synchronize both processes with the clock synchronization algorithm
    as described here:

        http://en.wikipedia.org/wiki/Network_Time_Protocol

    Alternatively one could use `timeGetTime` from Windows' Multi Media
    library, which is tunable via `timeBeginPeriod` and provides a
    precision of 1-2ms. Apparently this is the way Chrome and many
    others do it.::

        from __future__ import division

        from ctypes import windll
        import time

        timeBeginPeriod = windll.winmm.timeBeginPeriod
        timeEndPeriod = windll.winmm.timeEndPeriod
        timeGetTime = windll.winmm.timeGetTime

        if __name__ == '__main__':
            # wrap the code that needs high precision in timeBegin- and
            # timeEndPeriod with the same parameter. The parameter is
            # the interval in ms you want as precision. Usually the
            # minimum value allowed is 1 (best).
            timeBeginPeriod(1)
            times = []
            t_start = time.time()
            while time.time() < (t_start + 1):
                times.append(timeGetTime())
            times = sorted(list(set(times)))
            print(1000 / len(times))
            timeEndPeriod(1)

    """
    def __init__(self, ampcls):
        """Wrap a new instance of ``ampcls``.

        Parameters
        ----------
        ampcls : class
            the low level amplifier class; it is instantiated here
            without arguments

        """
        self.amp = ampcls()
        self.write_to_file = False

    @property
    def presets(self):
        """The ``presets`` attribute of the wrapped amplifier."""
        return self.amp.presets

    def start(self, filename=None):
        """Start the marker server process and the wrapped amplifier.

        Parameters
        ----------
        filename : str, optional
            if given, the data returned by :meth:`get_data` is also
            written to ``filename + '.eeg'``, the markers to
            ``filename + '.marker'`` and a JSON description of the
            amplifier to ``filename + '.meta'``

        Raises
        ------
        Exception
            if one of the three target files already exists

        """
        # prepare files for writing
        self.write_to_file = False
        if filename is not None:
            filename_marker = filename + '.marker'
            filename_eeg = filename + '.eeg'
            filename_meta = filename + '.meta'
            # never overwrite an existing recording
            for path in filename_marker, filename_eeg, filename_meta:
                if os.path.exists(path):
                    msg = 'A file "%s" already exists, aborting.' % path
                    logger.error(msg)
                    raise Exception(msg)
            self.fh_eeg = open(filename_eeg, 'wb')
            self.fh_marker = open(filename_marker, 'w')
            self.fh_meta = open(filename_meta, 'w')
            # only flag file output once all three handles exist, so that
            # stop() never tries to close handles that were not created
            self.write_to_file = True
            # write meta data
            meta = {
                'Channels': self.amp.get_channels(),
                'Sampling Frequency': self.amp.get_sampling_frequency(),
                'Amp': str(self.amp)
            }
            json.dump(meta, self.fh_meta, indent=4)
            # the meta data is complete at this point, push it to disk now
            # instead of leaving it in the buffer until stop()
            self.fh_meta.flush()

        # start the marker server
        self.marker_queue = Queue()
        self.tcp_reader_running = Event()
        self.tcp_reader_running.set()
        tcp_reader_ready = Event()
        self.tcp_reader = Process(target=marker_reader,
                                  args=(self.marker_queue,
                                        self.tcp_reader_running,
                                        tcp_reader_ready))
        self.tcp_reader.start()
        logger.debug('Waiting for marker server to become ready...')
        tcp_reader_ready.wait()
        logger.debug('Marker server is ready.')
        # zero the sample counter
        self.received_samples = 0
        # start the amp
        self.amp.start()

    def stop(self):
        """Stop the amplifier and the marker server, then close the files.

        The marker server process is signalled by clearing the
        ``tcp_reader_running`` event and then joined without a timeout.

        """
        # stop the amp
        self.amp.stop()
        # stop the marker server
        self.tcp_reader_running.clear()
        logger.debug('Waiting for marker server process to stop...')
        self.tcp_reader.join()
        logger.debug('Marker server process stopped.')
        # close the files
        if self.write_to_file:
            logger.debug('Closing files.')
            for fh in self.fh_eeg, self.fh_marker, self.fh_meta:
                fh.close()

    def configure(self, **kwargs):
        """Forward all keyword arguments to the wrapped amplifier."""
        self.amp.configure(**kwargs)

    def get_data(self):
        """Get data from the amplifier.

        This method is supposed to get called as fast as possible (i.e
        hundreds of times per seconds) and returns the data and the
        markers.

        Returns
        -------
        data : 2darray
            a numpy array (time, channels) of the EEG data
        markers : list of (float, str)
            a list of markers. Each element is a tuple of timestamp and
            string. The timestamp is the time in ms relative to the
            onset of the block of data. Note that negative values are
            *allowed* as well as values bigger than the length of the
            block of data returned. That is to be interpreted as a
            marker from the last block and a marker for a future block
            respectively.

        """
        # get data and marker from underlying amp
        data, marker = self.amp.get_data()

        t = time.time()
        fs = self.amp.get_sampling_frequency()
        # length in sec of the new block according to #samples and fs
        block_duration = len(data) / fs
        # abs time of start of the block
        t0 = t - block_duration
        # duration of all blocks in ms except the current one
        duration = 1000 * self.received_samples / fs

        # merge markers
        tcp_marker = []
        while not self.marker_queue.empty():
            m = self.marker_queue.get()
            # queue markers carry an absolute timestamp; make it relative
            # to the start of this block, in ms
            m[0] = (m[0] - t0) * 1000
            tcp_marker.append(m)
        marker = sorted(marker + tcp_marker)
        # save data to files
        if self.write_to_file:
            for m in marker:
                # marker times in the file are relative to the recording
                # start, hence the offset by all previous blocks
                self.fh_marker.write("%f %s\n" % (duration + m[0], m[1]))
            self.fh_eeg.write(struct.pack("f" * data.size, *data.flatten()))
        self.received_samples += len(data)
        if len(data) == 0 and len(marker) > 0:
            logger.error(
                'Received marker but no data. This is an error, the amp should '
                'block on get_data until data is available. Marker timestamps '
                'will be unreliable.'
            )
        return data, marker

    def get_channels(self):
        """Return the channels reported by the wrapped amplifier."""
        return self.amp.get_channels()

    def get_sampling_frequency(self):
        """Return the sampling frequency reported by the wrapped amplifier."""
        return self.amp.get_sampling_frequency()
Exemplo n.º 26
0
class BaseManager(object):
    '''
    Base class for managers

    A manager owns (or connects to) a server process that holds shared
    objects. Types are made available with `register()`, which by default
    also adds a creator method named after the typeid to the class.
    '''
    # typeid -> (callable, exposed, method_to_typeid, proxytype)
    _registry = {}
    # server class; subclasses may override it
    _Server = Server

    def __init__(self, address=None, authkey=None, serializer='pickle'):
        '''
        Remember connection parameters; no process is started here.

        `authkey` defaults to the authkey of the current process.
        `serializer` selects the listener/client pair from `listener_client`.
        '''
        if authkey is None:
            authkey = current_process().authkey
        self._address = address     # XXX not final address if eg ('', 0)
        self._authkey = AuthenticationString(authkey)
        self._state = State()
        self._state.value = State.INITIAL
        self._serializer = serializer
        self._Listener, self._Client = listener_client[serializer]

    def __reduce__(self):
        # NOTE(review): relies on a `from_address` classmethod that is not
        # defined in this class - confirm that it exists before pickling.
        return type(self).from_address, \
               (self._address, self._authkey, self._serializer)

    def get_server(self):
        '''
        Return server object with serve_forever() method and address attribute
        '''
        assert self._state.value == State.INITIAL
        # go through the `_Server` hook, exactly as `_run_server` does, so a
        # subclass that overrides it gets the same server class on both paths
        return self._Server(self._registry, self._address,
                            self._authkey, self._serializer)

    def connect(self):
        '''
        Connect manager object to the server process
        '''
        conn = self._Client(self._address, authkey=self._authkey)
        try:
            # round trip that only proves the server is reachable
            dispatch(conn, None, 'dummy')
        finally:
            conn.close()
        self._state.value = State.STARTED

    def start(self, initializer=None, initargs=()):
        '''
        Spawn a server process for this manager object

        If given, `initializer(*initargs)` is called in the server process
        before the server is created. Raises TypeError if `initializer` is
        not callable.
        '''
        assert self._state.value == State.INITIAL

        if initializer is not None and not hasattr(initializer, '__call__'):
            raise TypeError('initializer must be a callable')

        # pipe over which we will retrieve address of server
        reader, writer = connection.Pipe(duplex=False)

        # spawn process which runs a server
        self._process = Process(
            target=type(self)._run_server,
            args=(self._registry, self._address, self._authkey,
                  self._serializer, writer, initializer, initargs),
            )
        ident = ':'.join(str(i) for i in self._process._identity)
        self._process.name = type(self).__name__  + '-' + ident
        self._process.start()

        # get address of server
        writer.close()
        self._address = reader.recv()
        reader.close()

        # register a finalizer
        self._state.value = State.STARTED
        self.shutdown = util.Finalize(
            self, type(self)._finalize_manager,
            args=(self._process, self._address, self._authkey,
                  self._state, self._Client),
            exitpriority=0
            )

    @classmethod
    def _run_server(cls, registry, address, authkey, serializer, writer,
                    initializer=None, initargs=()):
        '''
        Create a server, report its address and run it
        '''
        if initializer is not None:
            initializer(*initargs)

        # create server
        server = cls._Server(registry, address, authkey, serializer)

        # inform parent process of the server's address
        writer.send(server.address)
        writer.close()

        # run the manager
        util.info('manager serving at %r', server.address)
        server.serve_forever()

    def _create(self, typeid, *args, **kwds):
        '''
        Create a new shared object; return the token and exposed tuple
        '''
        assert self._state.value == State.STARTED, 'server not yet started'
        conn = self._Client(self._address, authkey=self._authkey)
        try:
            obj_id, exposed = dispatch(
                conn, None, 'create', (typeid,)+args, kwds)
        finally:
            conn.close()
        return Token(typeid, self._address, obj_id), exposed

    def join(self, timeout=None):
        '''
        Join the manager process (if it has been spawned)
        '''
        self._process.join(timeout)

    def _debug_info(self):
        '''
        Return some info about the servers shared objects and connections
        '''
        conn = self._Client(self._address, authkey=self._authkey)
        try:
            return dispatch(conn, None, 'debug_info')
        finally:
            conn.close()

    def _number_of_objects(self):
        '''
        Return the number of shared objects
        '''
        conn = self._Client(self._address, authkey=self._authkey)
        try:
            return dispatch(conn, None, 'number_of_objects')
        finally:
            conn.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # `shutdown` is only created by start()
        self.shutdown()

    @staticmethod
    def _finalize_manager(process, address, authkey, state, _Client):
        '''
        Shutdown the manager process; will be registered as a finalizer
        '''
        if process.is_alive():
            util.info('sending shutdown message to manager')
            try:
                conn = _Client(address, authkey=authkey)
                try:
                    dispatch(conn, None, 'shutdown')
                finally:
                    conn.close()
            except Exception:
                # best effort: fall through to join/terminate below
                pass

            process.join(timeout=0.2)
            if process.is_alive():
                util.info('manager still alive')
                if hasattr(process, 'terminate'):
                    util.info('trying to `terminate()` manager process')
                    process.terminate()
                    process.join(timeout=0.1)
                    if process.is_alive():
                        util.info('manager still alive after terminate')

        state.value = State.SHUTDOWN
        try:
            del BaseProxy._address_to_local[address]
        except KeyError:
            pass

    address = property(lambda self: self._address)

    @classmethod
    def register(cls, typeid, callable=None, proxytype=None, exposed=None,
                 method_to_typeid=None, create_method=True):
        '''
        Register a typeid with the manager type

        `proxytype` defaults to AutoProxy. `exposed` and `method_to_typeid`
        fall back to the `_exposed_` / `_method_to_typeid_` attributes of the
        proxy type. Unless `create_method` is false, a method named `typeid`
        that creates the shared object and returns a proxy for it is added
        to the class.
        '''
        # give this class its own registry the first time it registers
        # something, so that base classes are not affected
        if '_registry' not in cls.__dict__:
            cls._registry = cls._registry.copy()

        if proxytype is None:
            proxytype = AutoProxy

        exposed = exposed or getattr(proxytype, '_exposed_', None)

        method_to_typeid = method_to_typeid or \
                           getattr(proxytype, '_method_to_typeid_', None)

        if method_to_typeid:
            for key, value in method_to_typeid.items():
                assert type(key) is str, '%r is not a string' % key
                assert type(value) is str, '%r is not a string' % value

        cls._registry[typeid] = (
            callable, exposed, method_to_typeid, proxytype
            )

        if create_method:
            def temp(self, *args, **kwds):
                util.debug('requesting creation of a shared %r object', typeid)
                token, exp = self._create(typeid, *args, **kwds)
                proxy = proxytype(
                    token, self._serializer, manager=self,
                    authkey=self._authkey, exposed=exp
                    )
                # issue the decref only after the proxy has been built
                conn = self._Client(token.address, authkey=self._authkey)
                try:
                    dispatch(conn, None, 'decref', (token.id,))
                finally:
                    conn.close()
                return proxy
            temp.__name__ = typeid
            setattr(cls, typeid, temp)
Exemplo n.º 27
0
def jacobianos(Xint, Ns, XextList, params, hessianos=True):
    '''
    Compute the Jacobians (and, when `hessianos` is true, the Hessians) with
    respect to the intrinsic and the extrinsic variables. Every individual
    computation runs in its own process and hands its result back through a
    queue.

    Returns (jInt, hInt, jExt, hExt) when `hessianos` is true, otherwise
    (jInt, jExt).
    '''
    # queues that receive the derivatives w.r.t. the intrinsic parameters;
    # the Hessian worker, if requested, is launched first
    qJInt = Queue()
    if hessianos:
        qHInt = Queue()
        pHInt = Process(target=procHint,
                        args=(Xint, Ns, XextList, params, qHInt))
        pHInt.start()

    pJInt = Process(target=procJint, args=(Xint, Ns, XextList, params, qJInt))
    pJInt.start()

    # storage and queues for the extrinsic Jacobians / Hessians, one per
    # element of XextList
    n = len(XextList)
    jExt = np.zeros((n, 1, 6), dtype=float)
    qJext = [Queue() for _ in range(n)]
    if hessianos:
        hExt = np.zeros((n, 6, 6), dtype=float)
        qHext = [Queue() for _ in range(n)]

    # worker processes for the extrinsic part
    proJ = []
    proH = []
    for i, Xext in enumerate(XextList):
        pJ = Process(target=procJext,
                     args=(Xext, Xint, Ns, params, i, qJext[i]))
        proJ.append(pJ)

        if hessianos:
            pH = Process(target=procHext,
                         args=(Xext, Xint, Ns, params, i, qHext[i]))
            proH.append(pH)

        pJ.start()
        if hessianos:
            pH.start()

    # collect the results: intrinsic first, then extrinsic in index order
    jInt = qJInt.get()
    if hessianos:
        hInt = qHInt.get()

    for i in range(n):
        jExt[i] = qJext[i].get()
        if hessianos:
            hExt[i] = qHext[i].get()

    # wait for every process to finish
    pJInt.join()
    if hessianos:
        pHInt.join()

    for p in proJ:
        p.join()
    for p in proH:
        p.join()

    if not hessianos:
        return jInt, jExt
    return jInt, hInt, jExt, hExt
from multiprocess import Manager, Process


def fun(d, l):
    """Store three fixed entries in mapping ``d`` and reverse ``l`` in place.

    The entries are ``1 -> '1'``, ``2 -> 2`` and ``0.25 -> None``, assigned
    in that order. Nothing is returned.
    """
    for key, value in ((1, '1'), (2, 2), (0.25, None)):
        d[key] = value
    l.reverse()


if __name__ == '__main__':
    manager = Manager()

    # shared containers obtained from the manager
    d = manager.dict()
    l = manager.list(range(10))

    # let a child process modify both containers, then wait for it
    p = Process(target=fun, args=(d, l))
    p.start()
    p.join()

    # single-argument call form: same output on Python 2, and also valid
    # on Python 3, where the bare print statement is a syntax error
    print(d)
    print(l)
Exemplo n.º 29
0
class TaskManager:
    """
    User-facing Class for interfacing with OpenWPM
    The TaskManager spawns several child processes to run the automation tasks.
        - DataAggregator to aggregate data in a SQLite database
        - MPLogger to aggregate logs across processes
        - BrowserManager processes to isolate Browsers in a separate process
    <manager_params> dict of TaskManager configuration parameters
    <browser_params> is a list of (or a single) dictionaries that specify preferences for browsers to instantiate
    <process_watchdog> will monitor firefox and Xvfb processes, killing any not indexed in TaskManager's browser list.
        NOTE: Only run this in isolated environments. It kills processes by name, indiscriminately.
    """

    def __init__(self, manager_params, browser_params, process_watchdog=False):
        """Set up the crawl infrastructure and launch all child processes.

        In order: derives file paths inside ``manager_params`` (the dict is
        modified in place), creates the data directories, opens the SQLite
        database and loads ``schema.sql`` from this module's directory,
        launches the logging server, the aggregator(s), and the browsers, and
        finally starts the watchdog thread.

        manager_params -- dict of TaskManager configuration parameters
        browser_params -- list with one parameter dict per browser
        process_watchdog -- stored on the instance; read by ``_manager_watchdog``

        Raises Exception if ``len(browser_params)`` differs from
        ``manager_params['num_browsers']``.
        """

        # Expand a leading '~' in the directory settings, then derive the
        # database, log, screenshot and source dump paths from them
        for path in ['data_directory','log_directory']:
            if manager_params[path] is not None:
                manager_params[path] = os.path.expanduser(manager_params[path])
        manager_params['database_name'] = os.path.join(manager_params['data_directory'],manager_params['database_name'])
        manager_params['log_file'] = os.path.join(manager_params['log_directory'],manager_params['log_file'])
        manager_params['screenshot_path'] = os.path.join(manager_params['data_directory'], 'screenshots')
        manager_params['source_dump_path'] = os.path.join(manager_params['data_directory'], 'sources')
        self.manager_params = manager_params

        # Create data directories if they do not exist
        if not os.path.exists(manager_params['screenshot_path']):
            os.makedirs(manager_params['screenshot_path'])
        if not os.path.exists(manager_params['source_dump_path']):
            os.makedirs(manager_params['source_dump_path'])

        # check size of parameter dictionary
        self.num_browsers = manager_params['num_browsers']
        if len(browser_params) != self.num_browsers:
            raise Exception("Number of <browser_params> dicts is not the same as manager_params['num_browsers']")

        # Flow control
        self.closing = False
        self.failure_status = None
        self.threadlock = threading.Lock()
        self.failurecount = 0
        # fall back to a limit derived from the number of browsers when the
        # configuration does not provide one
        if manager_params['failure_limit'] is not None:
            self.failure_limit = manager_params['failure_limit']
        else:
            self.failure_limit = self.num_browsers * 2 + 10

        self.process_watchdog = process_watchdog

        # sets up the crawl data database
        db_path = manager_params['database_name']
        # NOTE(review): the os.makedirs calls above already create the data
        # directory as a parent of the screenshot/source directories, so this
        # check looks redundant - confirm before removing
        if not os.path.exists(manager_params['data_directory']):
            os.mkdir(manager_params['data_directory'])
        self.db = sqlite3.connect(db_path)
        with open(os.path.join(os.path.dirname(__file__), 'schema.sql'), 'r') as f:
            self.db.executescript(f.read())
        self.db.commit()

        # sets up logging server + connect a client
        self.logging_status_queue = None
        self.loggingserver = self._launch_loggingserver()
        # socket location: (address, port)
        # (blocks until the logging server puts its address on the queue)
        self.manager_params['logger_address'] = self.logging_status_queue.get()
        self.logger = MPLogger.loggingclient(*self.manager_params['logger_address'])

        # Mark if LDBAggregator is needed (if js is enabled on any browser)
        self.ldb_enabled = False
        for params in browser_params:
            if params['save_javascript'] or params['save_javascript_proxy']:
                self.ldb_enabled = True
                break

        # Initialize the data aggregators
        self._launch_aggregators()

        # open client socket
        self.sock = clientsocket(serialization='dill')
        self.sock.connect(*self.manager_params['aggregator_address'])

        self._save_configuration(browser_params)

        # read the last used site visit id
        cur = self.db.cursor()
        cur.execute("SELECT MAX(visit_id) from site_visits")
        last_visit_id = cur.fetchone()[0]
        # MAX() yields NULL on an empty table; start counting at 1 then
        if last_visit_id is None:
            last_visit_id = 0
        self.next_visit_id = last_visit_id + 1

        # sets up the BrowserManager(s) + associated queues
        self.browsers = self._initialize_browsers(browser_params)  # List of the Browser(s)
        self._launch_browsers()

        # start the manager watchdog
        # (daemon thread, so it does not keep the interpreter alive)
        thread = threading.Thread(target=self._manager_watchdog, args=())
        thread.daemon = True
        thread.start()

    def _save_configuration(self, browser_params):
        """Record the crawl configuration in the database and the log.

        One row is inserted into ``task`` (its id is kept in
        ``self.task_id``) and one row per browser into ``crawl``; the id of
        each ``crawl`` row is written back into the matching dict of
        ``browser_params`` under ``'crawl_id'``. Every insert is committed
        immediately. Finally the configuration string is logged.
        """
        cursor = self.db.cursor()

        # Version information of OpenWPM and the browser
        openwpm_v, browser_v = get_version()

        # Task row
        task_values = (json.dumps(self.manager_params), openwpm_v, browser_v)
        cursor.execute(
            "INSERT INTO task "
            "(manager_params, openwpm_version, browser_version) "
            "VALUES (?,?,?)",
            task_values)
        self.db.commit()
        self.task_id = cursor.lastrowid

        # One crawl row per browser
        crawl_insert = "INSERT INTO crawl (task_id, browser_params) VALUES (?,?)"
        for idx in xrange(self.num_browsers):
            params = browser_params[idx]
            cursor.execute(crawl_insert, (self.task_id, json.dumps(params)))
            self.db.commit()
            params['crawl_id'] = cursor.lastrowid

        # Log the configuration details
        summary = get_configuration_string(self.manager_params,
                                           browser_params,
                                           (openwpm_v, browser_v))
        self.logger.info(summary)

    def _initialize_browsers(self, browser_params):
        """Return a list of ``self.num_browsers`` Browser objects.

        The i-th Browser is constructed from ``self.manager_params`` and
        ``browser_params[i]``.
        """
        return [Browser(self.manager_params, browser_params[idx])
                for idx in xrange(self.num_browsers)]

    def _launch_browsers(self):
        """ launch each browser manager process / browser """
        for browser in self.browsers:
            try:
                success = browser.launch_browser_manager()
            except:
                self._cleanup_before_fail(during_init=True)
                raise

            if not success:
                self.logger.critical("Browser spawn failure during TaskManager initialization, exiting...")
                self.close()
                break

            # Update our DB with the random browser settings
            # These are found within the scope of each instance of Browser in the browsers list
            screen_res = str(browser.browser_settings['screen_res'])
            ua_string = str(browser.browser_settings['ua_string'])
            self.sock.send(("UPDATE crawl SET screen_res = ?, ua_string = ? \
                             WHERE crawl_id = ?", (screen_res, ua_string, browser.crawl_id)))

    def _manager_watchdog(self):
        """
        Periodically checks the following:
        - memory consumption of all browsers every 10 seconds
        - presence of processes that are no longer in use
        """
        while not self.closing:
            time.sleep(10)

            # Check browser memory usage
            for browser in self.browsers:
                try:
                    process = psutil.Process(browser.browser_pid)
                    mem = process.memory_info()[0] / float(2 ** 20)
                    if mem > BROWSER_MEMORY_LIMIT:
                        self.logger.info("BROWSER %i: Memory usage: %iMB, exceeding limit of %iMB"
                            % (browser.crawl_id, int(mem), BROWSER_MEMORY_LIMIT))
                        browser.restart_required = True
                except psutil.NoSuchProcess:
                    pass

            # Check for browsers or displays that were not closed correctly
            # Provide a 300 second buffer to avoid killing freshly launched browsers
            # TODO This buffer should correspond to the maximum browser spawn timeout
            if self.process_watchdog:
                browser_pids = set()
                display_pids = set()
                check_time = time.time()
                for browser in self.browsers:
                    if browser.browser_pid is not None:
                        browser_pids.add(browser.browser_pid)
                    if browser.display_pid is not None:
                        display_pids.add(browser.display_pid)
                for process in psutil.process_iter():
                    if (process.create_time() + 300 < check_time and
                            ((process.name() == 'firefox' and process.pid not in browser_pids) or
                            (process.name() == 'Xvfb' and process.pid not in display_pids))):
                        self.logger.debug("Process: %s (pid: %i) with start time %s found running but not in browser process list. Killing."
                                % (process.name(), process.pid, process.create_time()))
                        process.kill()

    def _launch_aggregators(self):
        """
        Start the aggregator processes that serialize data from all processes.
        * DataAggregator - sqlite database for crawl data (always started)
        * LevelDBAggregator - leveldb database for javascript files (only
          when ``self.ldb_enabled`` is set)

        Each aggregator runs as a daemon process and reports its socket
        location through its status queue; that location is stored in
        ``self.manager_params`` as 'aggregator_address' / 'ldb_address'.
        """
        params = self.manager_params

        # DataAggregator
        self.aggregator_status_queue = Queue()
        self.data_aggregator = Process(
            target=DataAggregator.DataAggregator,
            args=(params, self.aggregator_status_queue))
        self.data_aggregator.daemon = True
        self.data_aggregator.start()
        # socket location: (address, port)
        params['aggregator_address'] = self.aggregator_status_queue.get()

        if not self.ldb_enabled:
            return

        # LevelDB Aggregator
        self.ldb_status_queue = Queue()
        self.ldb_aggregator = Process(
            target=LevelDBAggregator.LevelDBAggregator,
            args=(params, self.ldb_status_queue))
        self.ldb_aggregator.daemon = True
        self.ldb_aggregator.start()
        # socket location: (address, port)
        params['ldb_address'] = self.ldb_status_queue.get()

    def _kill_aggregators(self):
        """ Terminates the aggregators gracefully """
        # DataAggregator
        self.logger.debug("Telling the DataAggregator to shut down...")
        self.aggregator_status_queue.put("DIE")
        start_time = time.time()
        self.data_aggregator.join(300)
        self.logger.debug("DataAggregator took " + str(time.time() - start_time) + " seconds to close")

        # LevelDB Aggregator
        if self.ldb_enabled:
            self.logger.debug("Telling the LevelDBAggregator to shut down...")
            self.ldb_status_queue.put("DIE")
            start_time = time.time()
            self.ldb_aggregator.join(300)
            self.logger.debug("LevelDBAggregator took " + str(time.time() - start_time) + " seconds to close")

    def _launch_loggingserver(self):
        """Start the logging server as a daemon process and return it.

        A new status queue is stored in ``self.logging_status_queue`` and
        handed to the server together with the configured log file path.
        """
        self.logging_status_queue = Queue()
        server_args = (self.manager_params['log_file'],
                       self.logging_status_queue)
        server = Process(target=MPLogger.loggingserver, args=server_args)
        server.daemon = True
        server.start()
        return server

    def _kill_loggingserver(self):
        """Ask the logging server to stop and wait for it to exit.

        Puts the string "DIE" on ``self.logging_status_queue`` and then joins
        ``self.loggingserver`` with a 300 second timeout. Nothing here checks
        whether the process actually exited within that time.
        """
        self.logging_status_queue.put("DIE")
        self.loggingserver.join(300)

    def _shutdown_manager(self, failure=False, during_init=False):
        """
        Wait for current commands to finish, close all child processes and
        threads
        <failure> flag to indicate manager failure (True) or end of crawl (False)
        <during_init> flag to indicator if this shutdown is occuring during the TaskManager initialization
        """
        self.closing = True

        for browser in self.browsers:
            browser.shutdown_browser(during_init)
            if failure:
                self.sock.send(("UPDATE crawl SET finished = -1 WHERE crawl_id = ?",
                                (browser.crawl_id,)))
            else:
                self.sock.send(("UPDATE crawl SET finished = 1 WHERE crawl_id = ?",
                                (browser.crawl_id,)))

        self.db.close()  # close db connection
        self.sock.close()  # close socket to data aggregator
        self._kill_aggregators()
        self._kill_loggingserver()

    def _cleanup_before_fail(self, during_init=False):
        """
        Execute shutdown commands before throwing an exception
        This should keep us from having a bunch of hanging processes
        and incomplete data.

        Delegates to ``_shutdown_manager`` with ``failure=True``.

        <during_init> flag to indicate whether this shutdown is occurring
                      during the TaskManager initialization; passed through
                      to ``_shutdown_manager``
        """
        self._shutdown_manager(failure=True, during_init=during_init)

    def _check_failure_status(self):
        """ Check the status of command failures. Raise exceptions as necessary

        The failure status property is used by the various asynchronous
        command execution threads which interface with the
        remote browser manager processes. If a failure status is found, the
        appropriate steps are taken to gracefully close the infrastructure
        """
        self.logger.debug("Checking command failure status indicator...")
        if self.failure_status:
            self.logger.debug("TaskManager failure status set, halting command execution.")
            self._cleanup_before_fail()
            if self.failure_status['ErrorType'] == 'ExceedCommandFailureLimit':
                raise CommandExecutionError(
                    "TaskManager exceeded maximum consecutive command "
                    "execution failures.", self.failure_status['CommandSequence']
                )
            elif self.failure_status['ErrorType'] == 'ExceedLaunchFailureLimit':
                raise CommandExecutionError(
                    "TaskManager failed to launch browser within allowable "
                    "failure limit.", self.failure_status['CommandSequence']
                )
            if self.failure_status['ErrorType'] == 'CriticalChildException':
                reraise(*cPickle.loads(self.failure_status['Exception']))

    # CRAWLER COMMAND CODE

    def _distribute_command(self, command_sequence, index=None):
        """
        parses command type and issues command(s) to the proper browser
        <index> specifies the type of command this is:
        = None  -> first come, first serve
        =  #    -> index of browser to send command to
        = *     -> sends command to all browsers
        = **    -> sends command to all browsers (synchronized)
        """
        if index is None:
            #send to first browser available
            command_executed = False
            while True:
                for browser in self.browsers:
                    if browser.ready():
                        browser.current_timeout = command_sequence.total_timeout
                        thread = self._start_thread(browser, command_sequence)
                        command_executed = True
                        break
                if command_executed:
                    break
                time.sleep(SLEEP_CONS)

        elif 0 <= index < len(self.browsers):
            #send the command to this specific browser
            while True:
                if self.browsers[index].ready():
                    self.browsers[index].current_timeout = command_sequence.total_timeout
                    thread = self._start_thread(self.browsers[index], command_sequence)
                    break
                time.sleep(SLEEP_CONS)
        elif index == '*':
            #send the command to all browsers
            command_executed = [False] * len(self.browsers)
            while False in command_executed:
                for i in xrange(len(self.browsers)):
                    if self.browsers[i].ready() and not command_executed[i]:
                        self.browsers[i].current_timeout = command_sequence.total_timeout
                        thread = self._start_thread(self.browsers[i], command_sequence)
                        command_executed[i] = True
                time.sleep(SLEEP_CONS)
        elif index == '**':
            #send the command to all browsers and sync it
            condition = threading.Condition()  # Used to block threads until ready
            command_executed = [False] * len(self.browsers)
            while False in command_executed:
                for i in xrange(len(self.browsers)):
                    if self.browsers[i].ready() and not command_executed[i]:
                        self.browsers[i].current_timeout = command_sequence.total_timeout
                        thread = self._start_thread(self.browsers[i], command_sequence, condition)
                        command_executed[i] = True
                time.sleep(SLEEP_CONS)
            with condition:
                condition.notifyAll()  # All browsers loaded, tell them to start
        else:
            self.logger.info("Command index type is not supported or out of range")
            return

        if command_sequence.blocking:
            thread.join()
            self._check_failure_status()

    def _start_thread(self, browser, command_sequence, condition=None):
        """  starts the command execution thread """

        # Check status flags before starting thread
        if self.closing:
            self.logger.error("Attempted to execute command on a closed TaskManager")
            return
        self._check_failure_status()

        browser.set_visit_id(self.next_visit_id)
        self.sock.send(("INSERT INTO site_visits (visit_id, crawl_id, site_url) VALUES (?,?,?)",
                        (self.next_visit_id, browser.crawl_id, command_sequence.url)))
        self.next_visit_id += 1

        # Start command execution thread
        args = (browser, command_sequence, condition)
        thread = threading.Thread(target=self._issue_command, args=args)
        browser.command_thread = thread
        thread.daemon = True
        thread.start()
        return thread

    def _issue_command(self, browser, command_sequence, condition=None):
        """
        Send every command of a command sequence to a browser's
        BrowserManager and wait for the result of each one.

        For each (command, timeout) pair in
        ``command_sequence.commands_with_timeout`` the command is put on
        ``browser.command_queue`` and a reply is awaited on
        ``browser.status_queue`` using that timeout. The outcome is sent over
        ``self.sock`` as an insert into CrawlHistory. A failed or timed-out
        command increments ``self.failurecount``, flags the browser for
        restart and ends the loop; a successful one resets the count.

        The method returns early, after setting ``self.failure_status``, when
        the browser reports a "CRITICAL" status, when the failure count
        exceeds ``self.failure_limit``, or when restarting the browser
        manager fails.

        Parameters
        ----------
        browser
            The browser object the commands are issued to.
        command_sequence
            Provides ``reset`` and ``commands_with_timeout``.
        condition
            Optional condition object. When given, the call waits on it
            before issuing any command (used for synchronized dispatch).
        """
        # since we are issuing a command, the BrowserManager is no longer a
        # fresh instance
        browser.is_fresh = False

        # if this is a synced call, block on condition until notified
        if condition is not None:
            with condition:
                condition.wait()

        reset = command_sequence.reset
        # tracks when a site visit started, so that flash/profile cookies can
        # be properly tracked. Stays None until a GET or BROWSE command runs.
        start_time = None
        for command_and_timeout in command_sequence.commands_with_timeout:
            command, timeout = command_and_timeout
            if command[0] in ['GET', 'BROWSE']:
                # page visits are tagged with the browser's current visit id
                start_time = time.time()
                command += (browser.curr_visit_id,)
            elif command[0] in ['DUMP_FLASH_COOKIES', 'DUMP_PROFILE_COOKIES']:
                # cookie dumps also receive the start time of the last visit
                command += (start_time, browser.curr_visit_id,)
            browser.current_timeout = timeout
            # passes off command and waits for a success (or failure signal)
            browser.command_queue.put(command)
            command_succeeded = 0  # 1 success, 0 failure from error, -1 timeout
            # first argument of the command (if any), recorded in CrawlHistory
            command_arguments = command[1] if len(command) > 1 else None

            # wait for the reply from the BrowserManager: either a success
            # signal or a failure notice; no reply within the timeout raises
            # EmptyQueue
            try:
                status = browser.status_queue.get(True, browser.current_timeout)
                if status == "OK":
                    command_succeeded = 1
                elif status[0] == "CRITICAL":
                    # status[1] carries the exception payload stored in the
                    # failure status; stop without recording the command
                    self.logger.critical("BROWSER %i: Received critical error "
                                         "from browser process while executing "
                                         "command %s. Setting failure status." % (
                                             browser.crawl_id, str(command)))
                    self.failure_status = {
                        'ErrorType': 'CriticalChildException',
                        'CommandSequence': command_sequence,
                        'Exception': status[1]
                    }
                    return
                else:
                    command_succeeded = 0
                    self.logger.info("BROWSER %i: Received failure status while"
                                     " executing command: %s" % (browser.crawl_id, command[0]))
            except EmptyQueue:
                command_succeeded = -1
                self.logger.info("BROWSER %i: Timeout while executing command, "
                                 "%s, killing browser manager" % (browser.crawl_id, command[0]))

            # record the outcome of this command
            self.sock.send(("INSERT INTO CrawlHistory (crawl_id, command, arguments, bool_success)"
                            " VALUES (?,?,?,?)",
                            (browser.crawl_id, command[0], command_arguments, command_succeeded)))

            if command_succeeded != 1:
                # failure or timeout: bump the shared failure counter
                with self.threadlock:
                    self.failurecount += 1
                if self.failurecount > self.failure_limit:
                    self.logger.critical("BROWSER %i: Command execution failure"
                                         " pushes failure count above the allowable limit."
                                         " Setting failure_status." % browser.crawl_id)
                    self.failure_status = {
                        'ErrorType': 'ExceedCommandFailureLimit',
                        'CommandSequence': command_sequence
                    }
                    return
                browser.restart_required = True
            else:
                # a success resets the failure counter
                with self.threadlock:
                    self.failurecount = 0

            # skip the remaining commands once the browser needs a restart
            if browser.restart_required:
                break

        # Sleep after executing CommandSequence to provide extra time for
        # internal buffers to drain. Stopgap in support of #135
        time.sleep(2)

        # no restart handling once the manager is closing
        if self.closing:
            return

        if browser.restart_required or reset:
            # the profile is cleared only when the sequence requested a reset
            success = browser.restart_browser_manager(clear_profile = reset)
            if not success:
                self.logger.critical("BROWSER %i: Exceeded the maximum allowable "
                                     "consecutive browser launch failures. "
                                     "Setting failure_status." % browser.crawl_id)
                self.failure_status = {
                    'ErrorType': 'ExceedLaunchFailureLimit',
                    'CommandSequence': command_sequence
                }
                return
            browser.restart_required = False

    def execute_command_sequence(self, command_sequence, index=None):
        """Execute ``command_sequence`` on the browser(s) selected by ``index``.

        Both arguments are handed unchanged to ``_distribute_command``, which
        documents the accepted ``index`` values.
        """
        self._distribute_command(command_sequence, index=index)

    # DEFINITIONS OF HIGH LEVEL COMMANDS
    # NOTE: These wrappers are provided for convenience. To issue sequential
    # commands to the same browser in a single 'visit', use the CommandSequence
    # class directly.

    def get(self, url, index=None, timeout=60, sleep=0, reset=False):
        """Visit ``url`` with a one-command sequence.

        Builds a CommandSequence for ``url``, adds a single ``get`` command
        with the given ``timeout`` and ``sleep``, sets the sequence's
        ``reset`` attribute and executes it on the browser(s) selected by
        ``index``.
        """
        visit = CommandSequence.CommandSequence(url)
        visit.get(timeout=timeout, sleep=sleep)
        visit.reset = reset
        self.execute_command_sequence(visit, index=index)

    def browse(self, url, num_links=2, sleep=0, index=None, timeout=60, reset=False):
        """Browse ``url`` and visit ``num_links`` links found on the page.

        Builds a CommandSequence for ``url``, adds a single ``browse`` command
        with the given ``num_links``, ``sleep`` and ``timeout``, sets the
        sequence's ``reset`` attribute and executes it on the browser(s)
        selected by ``index``.
        """
        visit = CommandSequence.CommandSequence(url)
        visit.browse(num_links=num_links, sleep=sleep, timeout=timeout)
        visit.reset = reset
        self.execute_command_sequence(visit, index=index)


    def close(self):
        """
        Execute shutdown procedure for TaskManager

        When ``self.closing`` is already set, an error is logged and the
        call returns without doing anything else; otherwise the shutdown is
        handed off to ``_shutdown_manager``.
        """
        # guard against closing twice
        if self.closing:
            self.logger.error("TaskManager already closed")
            return
        self._shutdown_manager()