Example #1
    def multiprocessor(inpipe, outpipe, controlpipe):
        def returner_process(inp, outp, task):
            args, kwargs = inp.get()
            outp.put(task(*args, **kwargs))
            return True

        jobs = []
        while True:
            done = [x for x in jobs if x.ready()]
            if done:
                jobs = [x for x in jobs
                        if x not in done]  # Avoids race condition!
            else:
                sleep(0.1)

            for thing in done:
                thing.successful()
                assert thing.get()
            while len(jobs) < process_count:
                cmd = controlpipe.get()
                if cmd == stop_signal:
                    break
                elif cmd == True:
                    newjob = Process(target=returner_process,
                                     args=(inpipe, outpipe))
                    newjob.start()
                    jobs.append(newjob)
                    # I *think* the pipes have to be passed explicitly,
                    # but I haven't checked.
                else:
                    raise Exception
        outpipe.put(stop_signal)
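The snippet above mixes Pool-style result methods (ready(), get(), successful()) with plain Process workers and never passes task into returner_process, so it does not run as written. Below is a minimal, self-contained sketch of the pattern it appears to be aiming for: feed (args, kwargs) work items through an input queue, run each in its own Process, and collect results on an output queue. The square task is a hypothetical stand-in.

from multiprocess import Process, Queue

def square(x):                                   # hypothetical stand-in task
    return x * x

def returner_process(inq, outq, task):
    args, kwargs = inq.get()                     # one work item per child
    outq.put(task(*args, **kwargs))

if __name__ == '__main__':
    inq, outq = Queue(), Queue()
    jobs = []
    for x in range(4):
        inq.put(((x,), {}))
        p = Process(target=returner_process, args=(inq, outq, square))
        p.start()
        jobs.append(p)
    results = [outq.get() for _ in jobs]
    for p in jobs:
        p.join()
    print(sorted(results))                       # [0, 1, 4, 9]

As the original comment suspects, passing the queues explicitly through args is the reliable way to hand them to the child processes.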
Example #2
    def _launch_aggregators(self):
        """Launch the necessary data aggregators"""
        if self.manager_params["output_format"] == "local":
            self.data_aggregator = SqliteAggregator.SqliteAggregator(
                self.manager_params, self.browser_params)
        elif self.manager_params["output_format"] == "s3":
            self.data_aggregator = S3Aggregator.S3Aggregator(
                self.manager_params, self.browser_params)
        else:
            raise Exception("Unrecognized output format: %s" %
                            self.manager_params["output_format"])
        self.data_aggregator.launch()
        self.manager_params[
            'aggregator_address'] = self.data_aggregator.listener_address

        # open connection to aggregator for saving crawl details
        self.sock = clientsocket(serialization='dill')
        self.sock.connect(*self.manager_params['aggregator_address'])

        # TODO refactor ldb aggregator to use new base classes
        if self.ldb_enabled:
            self.ldb_status_queue = Queue()
            self.ldb_aggregator = Process(
                target=LevelDBAggregator.LevelDBAggregator,
                args=(self.manager_params, self.ldb_status_queue))
            self.ldb_aggregator.daemon = True
            self.ldb_aggregator.start()
            # socket location: (address, port)
            self.manager_params['ldb_address'] = self.ldb_status_queue.get()
Example #3
    def start(self, initializer=None, initargs=()):
        '''
        Spawn a server process for this manager object
        '''
        assert self._state.value == State.INITIAL

        if initializer is not None and not hasattr(initializer, '__call__'):
            raise TypeError('initializer must be a callable')

        # pipe over which we will retrieve address of server
        reader, writer = connection.Pipe(duplex=False)

        # spawn process which runs a server
        self._process = Process(
            target=type(self)._run_server,
            args=(self._registry, self._address, self._authkey,
                  self._serializer, writer, initializer, initargs),
            )
        ident = ':'.join(str(i) for i in self._process._identity)
        self._process.name = type(self).__name__  + '-' + ident
        self._process.start()

        # get address of server
        writer.close()
        self._address = reader.recv()
        reader.close()

        # register a finalizer
        self._state.value = State.STARTED
        self.shutdown = util.Finalize(
            self, type(self)._finalize_manager,
            args=(self._process, self._address, self._authkey,
                  self._state, self._Client),
            exitpriority=0
            )
Example #4
    def _launch_aggregators(self):
        """
        Launches the various data aggregators, which serialize data from all processes.
        * DataAggregator - sqlite database for crawl data
        * LevelDBAggregator - leveldb database for javascript files
        """
        # DataAggregator
        self.aggregator_status_queue = Queue()
        self.data_aggregator = Process(target=DataAggregator.DataAggregator,
                                       args=(self.manager_params,
                                             self.aggregator_status_queue))
        self.data_aggregator.daemon = True
        self.data_aggregator.start()
        # socket location: (address, port)
        self.manager_params['aggregator_address'] = \
            self.aggregator_status_queue.get()

        # LevelDB Aggregator
        if self.ldb_enabled:
            self.ldb_status_queue = Queue()
            self.ldb_aggregator = Process(
                target=LevelDBAggregator.LevelDBAggregator,
                args=(self.manager_params, self.ldb_status_queue))
            self.ldb_aggregator.daemon = True
            self.ldb_aggregator.start()
            # socket location: (address, port)
            self.manager_params['ldb_address'] = self.ldb_status_queue.get()
Example #5
    def __init__(self):
        """Builds a new instance of the filament.

        Attributes:
        ----------

        filament_module: module
            module which contains the filament logic

        keventq: Queue
            queue where the main process pushes the kernel events
        """
        Process.__init__(self)
        self._filament_module = None
        self._keventq = None
        self._filters = []
        self._cols = []
        self._tabular = None
        self._limit = 10
        self._interval = 1
        self._sort_by = None
        self._sort_desc = True
        self._log_path = None
        self._logger = None
        self.scheduler = BackgroundScheduler()
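The class above subclasses Process and stores its configuration; the actual work happens in a run() override that is not shown. A minimal, generic sketch of that subclassing pattern (not taken from the Filament project):

from multiprocess import Process, Queue

class Consumer(Process):
    def __init__(self, keventq):
        Process.__init__(self)
        self._keventq = keventq            # queue the parent pushes events into

    def run(self):
        # executes in the child once start() is called; stop on a None sentinel
        for event in iter(self._keventq.get, None):
            print("got event:", event)

if __name__ == '__main__':
    q = Queue()
    consumer = Consumer(q)
    consumer.start()
    q.put("kernel event")
    q.put(None)                            # sentinel: tell the child to exit
    consumer.join()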
Example #6
class BaseAggregator(object):
    """Base class for the data aggregator interface. This class is used
    alongside the BaseListener class to spawn an aggregator process that
    combines data from multiple crawl processes. The BaseAggregator class
    manages the child listener process.

    Parameters
    ----------
    manager_params : dict
        TaskManager configuration parameters
    browser_params : list of dict
        List of browser configuration dictionaries"""
    __metaclass__ = abc.ABCMeta

    def __init__(self, manager_params, browser_params):
        self.manager_params = manager_params
        self.browser_params = browser_params
        self.logger = loggingclient(*manager_params['logger_address'])
        self.listener_address = None
        self.listener_process = None

    @abc.abstractmethod
    def save_configuration(self, openwpm_version, browser_version):
        """Save configuration details to the database"""

    @abc.abstractmethod
    def get_next_visit_id(self):
        """Return a unique visit ID to be used as a key for a single page visit"""

    @abc.abstractmethod
    def get_next_crawl_id(self):
        """Return a unique crawl ID to be used as a key for a browser instance"""

    def launch(self, listener_process_runner):
        """Launch the aggregator listener process"""
        self.status_queue = Queue()
        self.listener_process = Process(
            target=listener_process_runner,
            args=(self.manager_params, self.status_queue))
        self.listener_process.daemon = True
        self.listener_process.start()
        self.listener_address = self.status_queue.get()

    def shutdown(self):
        """ Terminate the aggregator listener process"""
        self.logger.debug(
            "Sending the shutdown signal to the %s listener process..." %
            type(self).__name__
        )
        self.status_queue.put("SHUTDOWN")
        start_time = time.time()
        self.listener_process.join(300)
        self.logger.debug(
            "%s took %s seconds to close." % (
                type(self).__name__,
                str(time.time() - start_time)
            )
        )
        self.listener_address = None
        self.listener_process = None
Example #7
 def _launch_loggingserver(self):
     """ sets up logging server """
     self.logging_status_queue = Queue()
     loggingserver = Process(target=MPLogger.loggingserver,
                          args=(self.manager_params['log_file'], self.logging_status_queue, ))
     loggingserver.daemon = True
     loggingserver.start()
     return loggingserver
Example #8
def start_httpd(handler_class=SimpleHTTPRequestHandler):
    clear_httpd_messages()

    httpd_process = Process(target=run_httpd_forever, args=(handler_class,))
    httpd_process.start()

    httpd_url = HTTPD_MESSAGE_QUEUE.get()
    return httpd_process, httpd_url
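run_httpd_forever and HTTPD_MESSAGE_QUEUE are defined elsewhere. A hedged sketch of what they plausibly look like: bind a server on a free port, report its URL over a module-level queue, then serve until the process is terminated. This relies on the child inheriting the queue (fork-style start); the names mirror the snippet but the body is an assumption.

from http.server import HTTPServer, SimpleHTTPRequestHandler
from multiprocess import Queue

HTTPD_MESSAGE_QUEUE = Queue()

def run_httpd_forever(handler_class=SimpleHTTPRequestHandler):
    httpd = HTTPServer(("127.0.0.1", 0), handler_class)   # port 0: pick a free port
    host, port = httpd.server_address
    HTTPD_MESSAGE_QUEUE.put("http://%s:%d/" % (host, port))
    httpd.serve_forever()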
Example #9
def test1():
    """
    Run the ventilator defined in this function.
    """
    Process(target=start_ventilator, args=()).start()
    #time.sleep(0.1)
    Process(target=worker_process, args=()).start()
    Process(target=sink_process, args=()).start()
Example #10
def test_worker_sink():
    """
    Launch planetary_imager and process its data
    """
    from multiprocess import Process
    import os
    Process(target=worker_process, args=()).start()
    Process(target=sink_process, args=()).start()
Example #11
def test2():
    """
    Launch planetary_imager and process its data
    """
    import os
    Process(target=worker_process, args=()).start()
    Process(target=sink_process, args=()).start()
    os.system('planetary_imager &')
Example #12
def test_client():
    """
    Launch planetary_imager and process its data
    """
    Process(target=start_ventilator, args=()).start()
    Process(target=worker_process, args=()).start()
    Process(target=sink_process, args=()).start()
    Process(target=client_process, args=()).start()
Example #13
 def _launch_loggingserver(self):
     """ sets up logging server """
     self.logging_status_queue = Queue()
     loggingserver = Process(target=MPLogger.loggingserver,
                          args=(self.manager_params['log_file'], self.logging_status_queue, ))
     loggingserver.daemon = True
     loggingserver.start()
     return loggingserver
Example #14
File: asy.py  Project: Brad-eki/schedula
 def submit(self, func, *args, **kwargs):
     # noinspection PyUnresolvedReferences
     from multiprocess import Process, Pipe
     from concurrent.futures import Future
     fut, (c0, c1) = Future(), Pipe(False)
     task = Process(target=self._target, args=(c1.send, func, args, kwargs))
     self.tasks[fut] = task
     task.start()
     return self._set_future(fut, c0.recv())
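self._target and self._set_future are not shown. A hypothetical sketch of the shape _target needs for this submit(): run the callable in the child and send either the result or the raised exception back through the pipe, so that _set_future can resolve the Future accordingly.

 @staticmethod
 def _target(send, func, args, kwargs):
     # runs in the child process; report success or failure back over the pipe
     try:
         send((False, func(*args, **kwargs)))   # (is_error, value)
     except Exception as ex:
         send((True, ex))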
Example #15
 def launch(self, listener_process_runner):
     """Launch the aggregator listener process"""
     self.status_queue = Queue()
     self.listener_process = Process(target=listener_process_runner,
                                     args=(self.manager_params,
                                           self.status_queue))
     self.listener_process.daemon = True
     self.listener_process.start()
     self.listener_address = self.status_queue.get()
Example #16
 def launch(self, listener_process_runner, *args):
     """Launch the aggregator listener process"""
     args = (self.manager_params, self.status_queue,
             self.shutdown_queue) + args
     self.listener_process = Process(target=listener_process_runner,
                                     args=args)
     self.listener_process.daemon = True
     self.listener_process.start()
     self.listener_address = self.status_queue.get()
Example #17
    def call_job_fn(self, key, job_fn, args):
        # pylint: disable-next=import-outside-toplevel,no-name-in-module,import-error
        from multiprocess import Process

        # pylint: disable-next=not-callable
        proc = Process(target=job_fn,
                       args=(key, self._make_progress_key(key), args))
        proc.start()
        return proc.pid
Example #18
 def submit(self, func, *args, **kwargs):
     # noinspection PyUnresolvedReferences
     from multiprocess import Process, Pipe
     from concurrent.futures import Future
     fut, (c0, c1) = Future(), Pipe(False)
     task = Process(target=self._target, args=(c1.send, func, args, kwargs))
     self.tasks[fut] = task
     task.start()
     return self._set_future(fut, c0.recv())
Example #19
    def __init__(self, frequency, ok_qs, cmds_q, tokens_q):
        Process.__init__(self)
        self._cmds_q = cmds_q
        self._tokens_q = tokens_q

        self._ok_qs = ok_qs
        self._tokens = {key: False for key in self._ok_qs.keys()}

        self._T = 1.0 / frequency if frequency > 0 else 0
        self._ok_start_time = None
        self._pause = False
Example #20
    def mp_process(self, nprocs, func, *args):
        images = args[0]
#        def worker(imgs,i,chunksize, out_q,func,*args):
#            """ The worker function, invoked in a process. 'images' is a
#                list of images to span the process upon. The results are placed in
#                a dictionary that's pushed to a queue.
#            """
#            outdict = {}
#            for imn in range(len(imgs)-1):
#                print(i*chunksize+imn)
#                outdict[i*chunksize+imn] = func(imgs[imn],imgs[imn+1],*args[1:],i*chunksize+imn)
#            out_q.put(outdict)
    
        # Each process will get 'chunksize' nums and a queue to put his out
        # dict into
        out_q = Queue()
        chunksize = int(math.ceil((len(images)-1) / float(nprocs)))
        procs = []
        print("Chunks of size:",chunksize)
        for i in range(nprocs):
            if i == nprocs-1:
                p = Process(
                        target=worker,
                        args=(images[chunksize * i:len(images)-1],i,chunksize,out_q,func,*args))
                procs.append(p)
                p.start()
                self.loading.progress2['value']+=chunksize
                self.update()
            else:                
                p = Process(
                        target=worker,
                        args=(images[chunksize * i:chunksize * (i + 1)+1],i,chunksize,out_q,func,*args))
                procs.append(p)
                p.start()
                self.loading.progress2['value']+=chunksize
                self.update()
    
        # Collect all results into a single result dict. We know how many dicts
        # with results to expect.
        resultdict = {}
        for i in range(nprocs):
            resultdict.update(out_q.get())
    
        # Wait for all worker processes to finish
        for p in procs:
            p.join()
            
        results=[]
        for j in range(len(resultdict)):
            results.append(resultdict[j])

        return results
Example #21
    def __init__(self, camera, cmd_q, res, codec, fps):
        Process.__init__(self)

        self.res = res
        self.fps = fps
        self.codec = codec

        self.camera = camera
        self.fourcc = cv2.VideoWriter_fourcc(*self.codec)

        self.cmd_q = cmd_q
        self.recording = False
        self.out = None
        self.data_buf = None
Example #22
    def run_parallel(
            self, test_suites, test_runner, result_type=None,
            results_path=None):

        exit_code = 0
        proc = None
        unittest.installHandler()
        processes = []
        manager = Manager()
        results = manager.dict()
        manager.dict()
        start = time.time()

        test_mapping = {}
        for test_suite in test_suites:
            # Give each test suite an uuid so it can be
            # matched to the correct test result
            test_id = str(uuid.uuid4())
            test_mapping[test_id] = test_suite

            proc = Process(
                target=self.execute_test,
                args=(test_runner, test_id, test_suite, results))
            processes.append(proc)
            proc.start()

        for proc in processes:
            proc.join()

        finish = time.time()

        errors, failures, _ = self.dump_results(start, finish, results)

        if result_type is not None:
            all_results = []
            for test_id, result in list(results.items()):
                tests = test_mapping[test_id]
                result_parser = SummarizeResults(
                    vars(result), tests, (finish - start))
                all_results += result_parser.gather_results()

            reporter = Reporter(
                result_parser=result_parser, all_results=all_results)
            reporter.generate_report(
                result_type=result_type, path=results_path)

        if failures or errors:
            exit_code = 1

        return exit_code
Example #23
 def _save_cache(self, i):
     if not self._save_path:  # guard against a missing cache path (assumed attribute)
         raise Exception(
             "Cannot save cache if no cache path was specified.")
     logging.debug("Saving cache for {0} block {1}".format(
         self.name, self._cur_data_segment))
     data = self._extract_q(i)
     p = Process(target=_dump_cache,
                 args=(data,
                       os.path.join(self._save_path, "{0}.jb".format(
                           self._cur_data_segment)), self.name,
                       self._cur_data_segment))
     p.start()
     self._saving_ps.append(p)
Example #24
    def _save_data(self, path, cb, concat):
        if not os.path.exists(path):
            os.makedirs(path)
        target_filename = os.path.join(path, "{0}.jb".format(self.name))
        if self._saving_cache:
            while True in [p.is_alive() for p in self._saving_ps]:
                time.sleep(1e-3)

            p = Process(
                target=_caches_to_file,
                args=(
                    self._save_path,
                    self._start_data_segment,
                    self._cur_data_segment,
                    target_filename,
                    cb,
                    concat,
                ),
            )
            p.start()
            self._start_data_segment = self._cur_data_segment
        else:
            data = self._extract_q(0)
            p = Process(target=_dump, args=(data, target_filename, cb))
            p.start()
Example #25
    def set_problem(self, problem):
        """
        Sets the problem object to use to calculate the fitness.

        Arguments
        ---------

        problem
            Problem object implementing the fitness method.
        """
        for _ in range(self.num_workers):
            p = Process(target=multiprocessor_process,
                        args=(problem, self.task_queue, self.result_queue))
            p.start()
            self.processes.append(p)
Example #26
    def mp_processTR(self, nprocs, func, *args):
        images = args[0]
        TrMat = args[1]
        TrMatNull = np.array([[[1., 0., 0.], [0., 1., 0.]]])
        TrM = np.vstack((TrMatNull, TrMat))
        fnames = args[2]
        print(len(images), len(TrM), len(fnames))
        out_q = Queue()
        chunksize = int(math.ceil((len(images) - 1) / float(nprocs)))
        procs = []
        print("Chunks of size:", chunksize)
        for i in range(nprocs):
            if i == nprocs - 1:
                p = Process(target=workerTR,
                            args=(images[chunksize * i:len(images)],
                                  TrM[chunksize * i:len(images)],
                                  fnames[chunksize * i:len(images)], i,
                                  chunksize, out_q, func, *args[3:]))
                procs.append(p)
                p.start()
                self.loading.progress2['value'] += chunksize
                self.update()
            else:
                p = Process(target=workerTR,
                            args=(images[chunksize * i:chunksize * (i + 1)],
                                  TrM[chunksize * i:chunksize * (i + 1)],
                                  fnames[chunksize * i:chunksize * (i + 1)], i,
                                  chunksize, out_q, func, *args[3:]))
                procs.append(p)
                p.start()
                self.loading.progress2['value'] += chunksize
                self.update()

        # Collect all results into a single result dict. We know how many dicts
        # with results to expect.
        resultdict = {}
        for i in range(nprocs):
            resultdict.update(out_q.get())

        # Wait for all worker processes to finish
        for p in procs:
            p.join()

        results = []
        for j in range(len(resultdict)):
            results.append(resultdict[j])

        return results
Example #27
    def fit(self, X, Y):
        assert not self.fit_done
        assert len(X) == len(Y)

        possible_labels = list(set(y_val for y in Y for y_val in y))
        job_labels = np.array_split(possible_labels, self.n_jobs)

        with Manager() as manager:
            X_proxy = manager.list(X)
            Y_proxy = manager.list(Y)
            output_queue = Queue()
            processes = [
                Process(target=sequential_execute,
                        args=(output_queue, get_binary_clf_from_multilabel,
                              [{'X': X_proxy, 'Y': Y_proxy, 'label': lbl,
                                'return_label': True} for lbl in job]))
                for job in job_labels
            ]
            [p.start() for p in processes]
            results = [output_queue.get()
                       for lbl in possible_labels]  # needs to be flattened
            [p.join() for p in processes]

        self.classifiers = dict(results)
        self.fit_done = True
Example #28
    def _solve(self, X, Y, batch_size):
        '''
        Solve the multi-objective problem by multiple scalarized single-objective solvers.
        '''
        # generate scalarization weights
        weights = np.random.random((batch_size, self.problem.n_obj))
        weights /= np.expand_dims(np.sum(weights, axis=1), 1)

        # initial solutions
        X = np.vstack([X, lhs(X.shape[1], batch_size)])
        F = self.problem.evaluate(X, return_values_of=['F'])

        # optimization
        xs, ys = [], []
        queue = Queue()
        n_active_process = 0
        for i in range(batch_size):
            x0 = X[np.argmin(augmented_tchebicheff(F, weights[i]))]
            Process(target=optimization,
                    args=(self.problem, x0, weights[i], queue)).start()
            n_active_process += 1
            if n_active_process >= self.n_process:
                x, y = queue.get()
                xs.append(x)
                ys.append(y)
                n_active_process -= 1

        # gather result
        for _ in range(n_active_process):
            x, y = queue.get()
            xs.append(x)
            ys.append(y)

        return np.array(xs), np.array(ys)
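optimization() and augmented_tchebicheff() are defined elsewhere in that project. A hedged sketch of both, assuming a pymoo-style problem.evaluate() and using scipy's Nelder-Mead as a stand-in for whatever local solver is actually used (the ideal point is taken as the origin for brevity):

import numpy as np
from scipy.optimize import minimize

def augmented_tchebicheff(F, weight, rho=1e-3):
    # augmented Tchebycheff scalarization of objective matrix F for one weight vector
    return np.max(weight * F, axis=1) + rho * np.sum(weight * F, axis=1)

def optimization(problem, x0, weight, queue):
    def scalarized(x):
        f = problem.evaluate(np.atleast_2d(x), return_values_of=['F'])
        return float(augmented_tchebicheff(f, weight)[0])

    res = minimize(scalarized, x0, method='Nelder-Mead')
    y = problem.evaluate(np.atleast_2d(res.x), return_values_of=['F'])[0]
    queue.put((res.x, y))   # matches the (x, y) pairs collected from the queue above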
Example #29
def test_replace(delay=0.):
    zwo.init_workers()
    zwo.init_sink()
    Process(target=start_ventilator, args=(delay, )).start()
    while True:
        time.sleep(10)
        zwo.replace_workers(zwo.dummy_worker)
Example #30
def test():
    NUMBER_OF_PROCESSES = 4
    TASKS1 = [(mul, (i, 7)) for i in range(20)]
    TASKS2 = [(plus, (i, 8)) for i in range(10)]

    # Create queues
    task_queue = Queue()
    done_queue = Queue()

    # Submit tasks
    list(map(task_queue.put, TASKS1))

    # Start worker processes
    for i in range(NUMBER_OF_PROCESSES):
        Process(target=worker, args=(task_queue, done_queue)).start()

    # Get and print results
    print('Unordered results:')
    for i in range(len(TASKS1)):
        print('\t', done_queue.get())

    # Add more tasks using `put()` instead of `putMany()`
    for task in TASKS2:
        task_queue.put(task)

    # Get and print some more results
    for i in range(len(TASKS2)):
        print('\t', done_queue.get())

    # Tell child processes to stop
    for i in range(NUMBER_OF_PROCESSES):
        task_queue.put('STOP')
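worker, mul and plus are not included in the snippet. The conventional shapes for this queue-of-(func, args) pattern, consistent with the 'STOP' sentinel used above, would be roughly:

def mul(a, b):
    return a * b

def plus(a, b):
    return a + b

def worker(input, output):
    # pull (func, args) tasks until the 'STOP' sentinel arrives
    for func, args in iter(input.get, 'STOP'):
        output.put('%s%s = %s' % (func.__name__, args, func(*args)))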
Example #31
    def make_csv(self, lock):
        file1 = open(self.out_csv1, "w")
        file1.write("id" + ',' + "level" + '\n')
        file2 = open(self.out_csv2, "w")
        file2.write("id" + ',' + "object_name" + '\n')
        file1.close()
        file2.close()

        if __name__ == '__main__':
            list_of_process = []
            list_of_queue1 = []
            list_of_queue2 = []
            for i in range(len(self.list_of_zips)):
                list_of_queue1.append(Queue())
                list_of_queue2.append(Queue())
                list_of_process.append(
                    Process(target=self.parse_Zip,
                            args=(i, lock, list_of_queue1[i],
                                  list_of_queue2[i])))
            for i in range(len(self.list_of_zips)):
                list_of_process[i].start()

            file1 = open(self.out_csv1, "a")
            for i in range(len(self.list_of_zips)):
                while not list_of_queue1[i].empty():
                    row = list_of_queue1[i].get()  # one (id, level) record per get()
                    file1.write(row[0] + ',' + row[1] + '\n')
            file1.close()
Example #32
    def start(self, initializer=None, initargs=()):
        '''
        Spawn a server process for this manager object
        '''
        assert self._state.value == State.INITIAL

        if initializer is not None and not callable(initializer):
            raise TypeError('initializer must be a callable')

        # pipe over which we will retrieve address of server
        reader, writer = connection.Pipe(duplex=False)

        # spawn process which runs a server
        self._process = Process(
            target=type(self)._run_server,
            args=(self._registry, self._address, self._authkey,
                  self._serializer, writer, initializer, initargs),
            )
        ident = ':'.join(str(i) for i in self._process._identity)
        self._process.name = type(self).__name__  + '-' + ident
        self._process.start()

        # get address of server
        writer.close()
        self._address = reader.recv()
        reader.close()

        # register a finalizer
        self._state.value = State.STARTED
        self.shutdown = util.Finalize(
            self, type(self)._finalize_manager,
            args=(self._process, self._address, self._authkey,
                  self._state, self._Client),
            exitpriority=0
            )
Example #33
def run_fuzzmanager():
    def run_fuzzmanager_forever():
        os.chdir('FuzzManager')
        proc = subprocess.Popen(['python', 'server/manage.py', 'runserver'],
                                stdout=subprocess.PIPE,
                                stdin=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                universal_newlines=True)

        while True:
            line = proc.stdout.readline()
            print(line, end='')

    fuzzmanager_process = Process(target=run_fuzzmanager_forever)
    fuzzmanager_process.start()

    return fuzzmanager_process
Example #34
def play_round(cpu_agent, test_agents, win_counts, num_matches):
    """Compare the test agents to the cpu agent in "fair" matches.

    "Fair" matches use random starting locations and force the agents to
    play as both first and second player to control for advantages resulting
    from choosing better opening moves or having first initiative to move.
    """
    timeout_count = 0
    forfeit_count = 0
    queue = JoinableQueue()

    for _ in range(num_matches):

        workers = []

        games = sum([[
            Board(cpu_agent.player, agent.player),
            Board(agent.player, cpu_agent.player)
        ] for agent in test_agents], [])

        # initialize all games with a random move and response
        for _ in range(2):
            move = random.choice(games[0].get_legal_moves())
            for game in games:
                game.apply_move(move)

        # play all games and tally the results
        for game in games:
            p = Process(target=game_play_process, args=(
                queue,
                game,
            ))
            workers.append(p)
            p.start()

        for _ in range(len(workers)):
            (winner, t_c, f_c) = queue.get()
            timeout_count += t_c
            forfeit_count += f_c
            for key, value in win_counts.items():
                if hash(key) == winner:
                    win_counts[key] += 1

        queue.join()

    return timeout_count, forfeit_count
Example #35
File: bot.py  Project: pschwede/AnchorBot
 def __run_processes(target, inqueue, outqueue=None):
     threads = max(1, (NUM_THREADS - 1))
     for n in range(threads):
         if outqueue:
             p = Process(target=target, args=(n, inqueue, outqueue))
         else:
             p = Process(target=target, args=(n, inqueue))
         p.daemon = True
         p.start()
     pp = Process(target=__progress, args=(-1, inqueue))
     pp.daemon = True
     pp.start()
     inqueue.close()
     inqueue.join()
Example #36
 def launch(self, listener_process_runner, *args):
     """Launch the aggregator listener process"""
     args = (self.manager_params, self.status_queue,
             self.shutdown_queue) + args
     self.listener_process = Process(
         target=listener_process_runner,
         args=args
     )
     self.listener_process.daemon = True
     self.listener_process.start()
     self.listener_address = self.status_queue.get()
Example #37
    def _launch_aggregators(self):
        """
        Launches the various data aggregators, which serialize data from all processes.
        * DataAggregator - sqlite database for crawl data
        * LevelDBAggregator - leveldb database for javascript files
        """
        # DataAggregator
        self.aggregator_status_queue = Queue()
        self.data_aggregator = Process(target=DataAggregator.DataAggregator,
                             args=(self.manager_params, self.aggregator_status_queue))
        self.data_aggregator.daemon = True
        self.data_aggregator.start()
        self.manager_params['aggregator_address'] = self.aggregator_status_queue.get()  # socket location: (address, port)

        # LevelDB Aggregator
        if self.ldb_enabled:
            self.ldb_status_queue = Queue()
            self.ldb_aggregator = Process(target=LevelDBAggregator.LevelDBAggregator,
                                 args=(self.manager_params, self.ldb_status_queue))
            self.ldb_aggregator.daemon = True
            self.ldb_aggregator.start()
            self.manager_params['ldb_address'] = self.ldb_status_queue.get()  # socket location: (address, port)
Example #38
    def schedule(self, debug=False):
        '''
        Schedule the queued operations
        '''
        if debug:
            print('~ Initiating reading process')

        reading_process = Process(target=self.storage.reader)
        reading_process.start()

        while True:
            if debug:
                print()
                print('/ High priority queue is ' + str(self.high_access))
                print('/ Normal priority queue is ' + str(self.normal_access))
                print()

            if self.high_access:
                self._pool.apply_async(self._run_queue,
                                       (self.high_access.pop(0),))
            elif self.normal_access:
                self._pool.apply_async(self._run_queue,
                                       (self.normal_access.pop(0),))
            else:
                time.sleep(0.5)
Example #39
class BaseManager(object):
    '''
    Base class for managers
    '''
    _registry = {}
    _Server = Server

    def __init__(self, address=None, authkey=None, serializer='pickle'):
        if authkey is None:
            authkey = current_process().authkey
        self._address = address     # XXX not final address if eg ('', 0)
        self._authkey = AuthenticationString(authkey)
        self._state = State()
        self._state.value = State.INITIAL
        self._serializer = serializer
        self._Listener, self._Client = listener_client[serializer]

    def get_server(self):
        '''
        Return server object with serve_forever() method and address attribute
        '''
        assert self._state.value == State.INITIAL
        return Server(self._registry, self._address,
                      self._authkey, self._serializer)

    def connect(self):
        '''
        Connect manager object to the server process
        '''
        Listener, Client = listener_client[self._serializer]
        conn = Client(self._address, authkey=self._authkey)
        dispatch(conn, None, 'dummy')
        self._state.value = State.STARTED

    def start(self, initializer=None, initargs=()):
        '''
        Spawn a server process for this manager object
        '''
        assert self._state.value == State.INITIAL

        if initializer is not None and not callable(initializer):
            raise TypeError('initializer must be a callable')

        # pipe over which we will retrieve address of server
        reader, writer = connection.Pipe(duplex=False)

        # spawn process which runs a server
        self._process = Process(
            target=type(self)._run_server,
            args=(self._registry, self._address, self._authkey,
                  self._serializer, writer, initializer, initargs),
            )
        ident = ':'.join(str(i) for i in self._process._identity)
        self._process.name = type(self).__name__  + '-' + ident
        self._process.start()

        # get address of server
        writer.close()
        self._address = reader.recv()
        reader.close()

        # register a finalizer
        self._state.value = State.STARTED
        self.shutdown = util.Finalize(
            self, type(self)._finalize_manager,
            args=(self._process, self._address, self._authkey,
                  self._state, self._Client),
            exitpriority=0
            )

    @classmethod
    def _run_server(cls, registry, address, authkey, serializer, writer,
                    initializer=None, initargs=()):
        '''
        Create a server, report its address and run it
        '''
        if initializer is not None:
            initializer(*initargs)

        # create server
        server = cls._Server(registry, address, authkey, serializer)

        # inform parent process of the server's address
        writer.send(server.address)
        writer.close()

        # run the manager
        util.info('manager serving at %r', server.address)
        server.serve_forever()

    def _create(self, typeid, *args, **kwds):
        '''
        Create a new shared object; return the token and exposed tuple
        '''
        assert self._state.value == State.STARTED, 'server not yet started'
        conn = self._Client(self._address, authkey=self._authkey)
        try:
            id, exposed = dispatch(conn, None, 'create', (typeid,)+args, kwds)
        finally:
            conn.close()
        return Token(typeid, self._address, id), exposed

    def join(self, timeout=None):
        '''
        Join the manager process (if it has been spawned)
        '''
        self._process.join(timeout)

    def _debug_info(self):
        '''
        Return some info about the servers shared objects and connections
        '''
        conn = self._Client(self._address, authkey=self._authkey)
        try:
            return dispatch(conn, None, 'debug_info')
        finally:
            conn.close()

    def _number_of_objects(self):
        '''
        Return the number of shared objects
        '''
        conn = self._Client(self._address, authkey=self._authkey)
        try:
            return dispatch(conn, None, 'number_of_objects')
        finally:
            conn.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.shutdown()

    @staticmethod
    def _finalize_manager(process, address, authkey, state, _Client):
        '''
        Shutdown the manager process; will be registered as a finalizer
        '''
        if process.is_alive():
            util.info('sending shutdown message to manager')
            try:
                conn = _Client(address, authkey=authkey)
                try:
                    dispatch(conn, None, 'shutdown')
                finally:
                    conn.close()
            except Exception:
                pass

            process.join(timeout=0.2)
            if process.is_alive():
                util.info('manager still alive')
                if hasattr(process, 'terminate'):
                    util.info('trying to `terminate()` manager process')
                    process.terminate()
                    process.join(timeout=0.1)
                    if process.is_alive():
                        util.info('manager still alive after terminate')

        state.value = State.SHUTDOWN
        try:
            del BaseProxy._address_to_local[address]
        except KeyError:
            pass

    address = property(lambda self: self._address)

    @classmethod
    def register(cls, typeid, callable=None, proxytype=None, exposed=None,
                 method_to_typeid=None, create_method=True):
        '''
        Register a typeid with the manager type
        '''
        if '_registry' not in cls.__dict__:
            cls._registry = cls._registry.copy()

        if proxytype is None:
            proxytype = AutoProxy

        exposed = exposed or getattr(proxytype, '_exposed_', None)

        method_to_typeid = method_to_typeid or \
                           getattr(proxytype, '_method_to_typeid_', None)

        if method_to_typeid:
            for key, value in list(method_to_typeid.items()):
                assert type(key) is str, '%r is not a string' % key
                assert type(value) is str, '%r is not a string' % value

        cls._registry[typeid] = (
            callable, exposed, method_to_typeid, proxytype
            )

        if create_method:
            def temp(self, *args, **kwds):
                util.debug('requesting creation of a shared %r object', typeid)
                token, exp = self._create(typeid, *args, **kwds)
                proxy = proxytype(
                    token, self._serializer, manager=self,
                    authkey=self._authkey, exposed=exp
                    )
                conn = self._Client(token.address, authkey=self._authkey)
                dispatch(conn, None, 'decref', (token.id,))
                return proxy
            temp.__name__ = typeid
            setattr(cls, typeid, temp)
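A short usage sketch of the register()/start() machinery above; the manager subclass and typeid here are illustrative, not from any of the listed projects:

from multiprocess.managers import BaseManager   # assumed import path for the class above

class MyManager(BaseManager):
    pass

# expose plain dicts through the manager's server process
MyManager.register('shared_dict', callable=dict)

if __name__ == '__main__':
    manager = MyManager(address=('127.0.0.1', 0), authkey=b'secret')
    manager.start()                   # spawns the server process via start() above
    d = manager.shared_dict()         # proxy built from the registered typeid
    d.update({'answer': 42})          # use the dict's public methods; item access
    print(d.get('answer'))            # is not auto-exposed by AutoProxy
    manager.shutdown()                # finalizer registered during start()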
Example #40
File: main.py  Project: skjoenberg/BDAE
        if not argument:
            print("* The dataset does not only contain digits")
            return
    print()
    print("* The dataset only contains digits")


def hue(scheduler, mapper1, reducer1, mapper2, reducer2):
    for i in range(100):
        scheduler.add_operation("DS1", Priority.normal, mapper1, reducer1)
        scheduler.add_operation("DS1", Priority.normal, mapper2, reducer2)
        scheduler.add_operation("DS2", Priority.high, mapper1, reducer1)
        scheduler.add_operation("DS2", Priority.low, mapper2, reducer2)


length_code = length.__code__
sum_code = sum.__code__
digit_code = digit.__code__
digits_code = digits.__code__
map1 = pickle.dumps(length_code)
rec1 = pickle.dumps(sum_code)
map2 = pickle.dumps(digit_code)
rec2 = pickle.dumps(digits_code)

lolle = Process(target=hue, args=(SCHEDULER, map1, rec1, map2, rec2))
lolle.start()
time.sleep(2)

print_section("Scheduling operations")
SCHEDULER.schedule()
Example #41
 def delay(self, *args, **kwargs):
     # return MockResult(Process(target=func, args=args, kwargs=kwargs))
     p = Process(target=func, args=args, kwargs=kwargs)  # MockResult()
     p.start()
     return True  # MockResult(func(*args, **kwargs))
Example #42
class BaseAggregator(object):
    """Base class for the data aggregator interface. This class is used
    alongside the BaseListener class to spawn an aggregator process that
    combines data from multiple crawl processes. The BaseAggregator class
    manages the child listener process.

    Parameters
    ----------
    manager_params : dict
        TaskManager configuration parameters
    browser_params : list of dict
        List of browser configuration dictionaries"""
    __metaclass__ = abc.ABCMeta

    def __init__(self, manager_params, browser_params):
        self.manager_params = manager_params
        self.browser_params = browser_params
        self.logger = loggingclient(*manager_params['logger_address'])
        self.listener_address = None
        self.listener_process = None
        self.status_queue = Queue()
        self.shutdown_queue = Queue()
        self._last_status = None
        self._last_status_received = None

    @abc.abstractmethod
    def save_configuration(self, openwpm_version, browser_version):
        """Save configuration details to the database"""

    @abc.abstractmethod
    def get_next_visit_id(self):
        """Return a unique visit ID to be used as a key for a single visit"""

    @abc.abstractmethod
    def get_next_crawl_id(self):
        """Return a unique crawl ID used as a key for a browser instance"""

    def get_most_recent_status(self):
        """Return the most recent queue size sent from the listener process"""

        # Block until we receive the first status update
        if self._last_status is None:
            return self.get_status()

        # Drain status queue until we receive most recent update
        while not self.status_queue.empty():
            self._last_status = self.status_queue.get()
            self._last_status_received = time.time()

        # Check last status signal
        if (time.time() - self._last_status_received) > STATUS_TIMEOUT:
            raise RuntimeError(
                "No status update from DataAggregator listener process "
                "for %d seconds." % (time.time() - self._last_status_received)
            )

        return self._last_status

    def get_status(self):
        """Get listener process status. If the status queue is empty, block."""
        try:
            self._last_status = self.status_queue.get(
                block=True, timeout=STATUS_TIMEOUT)
            self._last_status_received = time.time()
        except queue.Empty:
            raise RuntimeError(
                "No status update from DataAggregator listener process "
                "for %d seconds." % (time.time() - self._last_status_received)
            )
        return self._last_status

    def launch(self, listener_process_runner, *args):
        """Launch the aggregator listener process"""
        args = (self.manager_params, self.status_queue,
                self.shutdown_queue) + args
        self.listener_process = Process(
            target=listener_process_runner,
            args=args
        )
        self.listener_process.daemon = True
        self.listener_process.start()
        self.listener_address = self.status_queue.get()

    def shutdown(self):
        """ Terminate the aggregator listener process"""
        self.logger.debug(
            "Sending the shutdown signal to the %s listener process..." %
            type(self).__name__
        )
        self.shutdown_queue.put(SHUTDOWN_SIGNAL)
        start_time = time.time()
        self.listener_process.join(300)
        self.logger.debug(
            "%s took %s seconds to close." % (
                type(self).__name__,
                str(time.time() - start_time)
            )
        )
        self.listener_address = None
        self.listener_process = None
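The launch()/shutdown() protocol above expects a listener_process_runner of roughly this shape (a simplified, hedged sketch; real listeners bind a server socket here, persist records from the crawl processes, and also push periodic status updates onto status_queue for get_most_recent_status()):

import time

SHUTDOWN_SIGNAL = "SHUTDOWN"   # assumed value of the constant used by shutdown() above

def listener_process_runner(manager_params, status_queue, shutdown_queue):
    address = ('localhost', 0)                 # placeholder listener address
    status_queue.put(address)                  # unblocks BaseAggregator.launch()
    while True:
        if not shutdown_queue.empty() and shutdown_queue.get() == SHUTDOWN_SIGNAL:
            break                              # BaseAggregator.shutdown() was called
        # ... receive and persist records from the crawl processes here ...
        time.sleep(0.1)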
Example #43
processed_link = defaultdict(int)

def worker():
    while True:
        if not job_queue.is_empty():
            link = job_queue.pop()

            weblink_list, processing_result = process_webpage(link) # TODO

            for link in weblink_list:
                if processed_link[link] == 0:
                    job_queue.push(link)
                    processed_link[link] = 1

            result_queue.push(processing_result)

        else:
            time.sleep(0.1)

process_list = []

for i in range(0, 10):

    p = Process(target=worker)
    p.start()

    process_list.append(p)
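job_queue and result_queue are not defined in the excerpt, and push()/pop()/is_empty() are not the stdlib Queue interface. A hedged sketch of a thin wrapper that would satisfy it; note also that the module-level processed_link dict is copied into each forked worker, so deduplication would need something like a Manager().dict() to actually be shared:

from multiprocess import Queue

class LinkQueue(object):
    """Thin wrapper giving multiprocess.Queue the push/pop/is_empty API used above."""
    def __init__(self):
        self._q = Queue()

    def push(self, item):
        self._q.put(item)

    def pop(self):
        return self._q.get()

    def is_empty(self):
        return self._q.empty()

job_queue = LinkQueue()
result_queue = LinkQueue()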



Example #44
            module = getattr(__import__('shell_commands'), valid)
            try:
                module.action(connection, command[1])
            except IndexError:
                module.action(connection, '')

        time.sleep(0.2)
        


if __name__ == "__main__":

    server = raw_input('Enter server: ')
    port = raw_input('Enter port: ')
    bot_nick = raw_input('Enter bot nick: ')
    channel = raw_input('Enter channel: ')

    ircsock, server, port, bot_nick, channel = login_routine(server, port, bot_nick, channel)
   
    #information for inter-process communication
    address = ('localhost', 2424)
    listener = Listener(address)
    client = Client(address)

    #two processes, one for console one for the irc channel
    irc = Process(target=irc_loop, args=(ircsock, bot_nick, channel, client))
    irc.start()

    #launch into the shell loop for interactive commands
    shell_loop(ircsock, channel, listener)
Example #45
def spin_crawl_threads(state, classifiers, MAX_BIT_SIZE, MAX_DL_THREADS, image_path):
    print("Running threads...")
    manager = Manager()

    location_q = manager.Queue(maxsize=16)
    image_q = manager.Queue(maxsize=64)
    state_lock = manager.Lock()

    generate_location = Process(target=generate_location_thread,
                                args=(location_q, MAX_BIT_SIZE),
                                name="generate_location")
    classification = Process(target=classification_thread,
                             args=(image_q, classifiers, image_path,
                                   state, state_lock), name="classification")
    download_image_t = Process(target=download_image_thread,
                               args=(location_q, image_q, MAX_DL_THREADS),
                               name="download_image")

    download_image_t.start()
    classification.start()
    generate_location.start()

    def kill_threads():
        for thread in active_children():
            thread.terminate()

    atexit.register(kill_threads)

    download_image_t.join()
    classification.join()
    generate_location.join()
Example #46
 def __init__(self, to_worker, from_worker, verbose, failfast):
     Process.__init__(self)
     self.to_worker = to_worker
     self.from_worker = from_worker
     self.verbose = verbose
     self.failfast = failfast
Example #47
class TaskManager:
    """
    User-facing Class for interfacing with OpenWPM
    The TaskManager spawns several child processes to run the automation tasks.
        - DataAggregator to aggregate data in a SQLite database
        - MPLogger to aggregate logs across processes
        - BrowserManager processes to isolate Browsers in a separate process
    <manager_params> dict of TaskManager configuration parameters
    <browser_params> is a list of (or a single) dictionaries that specify preferences for browsers to instantiate
    <process_watchdog> will monitor firefox and Xvfb processes, killing any not indexed in TaskManager's browser list.
        NOTE: Only run this in isolated environments. It kills processes by name, indiscriminately.
    """

    def __init__(self, manager_params, browser_params, process_watchdog=False):

        # Make paths absolute in manager_params
        for path in ['data_directory','log_directory']:
            if manager_params[path] is not None:
                manager_params[path] = os.path.expanduser(manager_params[path])
        manager_params['database_name'] = os.path.join(manager_params['data_directory'],manager_params['database_name'])
        manager_params['log_file'] = os.path.join(manager_params['log_directory'],manager_params['log_file'])
        manager_params['screenshot_path'] = os.path.join(manager_params['data_directory'], 'screenshots')
        manager_params['source_dump_path'] = os.path.join(manager_params['data_directory'], 'sources')
        self.manager_params = manager_params

        # Create data directories if they do not exist
        if not os.path.exists(manager_params['screenshot_path']):
            os.makedirs(manager_params['screenshot_path'])
        if not os.path.exists(manager_params['source_dump_path']):
            os.makedirs(manager_params['source_dump_path'])

        # check size of parameter dictionary
        self.num_browsers = manager_params['num_browsers']
        if len(browser_params) != self.num_browsers:
            raise Exception("Number of <browser_params> dicts is not the same as manager_params['num_browsers']")

        # Flow control
        self.closing = False
        self.failure_status = None
        self.threadlock = threading.Lock()
        self.failurecount = 0
        if manager_params['failure_limit'] is not None:
            self.failure_limit = manager_params['failure_limit']
        else:
            self.failure_limit = self.num_browsers * 2 + 10

        self.process_watchdog = process_watchdog

        # sets up the crawl data database
        db_path = manager_params['database_name']
        if not os.path.exists(manager_params['data_directory']):
            os.mkdir(manager_params['data_directory'])
        self.db = sqlite3.connect(db_path)
        with open(os.path.join(os.path.dirname(__file__), 'schema.sql'), 'r') as f:
            self.db.executescript(f.read())
        self.db.commit()

        # sets up logging server + connect a client
        self.logging_status_queue = None
        self.loggingserver = self._launch_loggingserver()
        # socket location: (address, port)
        self.manager_params['logger_address'] = self.logging_status_queue.get()
        self.logger = MPLogger.loggingclient(*self.manager_params['logger_address'])

        # Mark if LDBAggregator is needed (if js is enabled on any browser)
        self.ldb_enabled = False
        for params in browser_params:
            if params['save_javascript'] or params['save_javascript_proxy']:
                self.ldb_enabled = True
                break

        # Initialize the data aggregators
        self._launch_aggregators()

        # open client socket
        self.sock = clientsocket(serialization='dill')
        self.sock.connect(*self.manager_params['aggregator_address'])

        self._save_configuration(browser_params)

        # read the last used site visit id
        cur = self.db.cursor()
        cur.execute("SELECT MAX(visit_id) from site_visits")
        last_visit_id = cur.fetchone()[0]
        if last_visit_id is None:
            last_visit_id = 0
        self.next_visit_id = last_visit_id + 1

        # sets up the BrowserManager(s) + associated queues
        self.browsers = self._initialize_browsers(browser_params)  # List of the Browser(s)
        self._launch_browsers()

        # start the manager watchdog
        thread = threading.Thread(target=self._manager_watchdog, args=())
        thread.daemon = True
        thread.start()

    def _save_configuration(self, browser_params):
        """ Saves crawl configuration details to db and logfile"""
        cur = self.db.cursor()

        # Get git version and commit information
        openwpm_v, browser_v = get_version()

        # Record task details
        cur.execute(("INSERT INTO task "
                     "(manager_params, openwpm_version, browser_version) "
                     "VALUES (?,?,?)"),
                (json.dumps(self.manager_params), openwpm_v, browser_v))
        self.db.commit()
        self.task_id = cur.lastrowid

        # Record browser details for each browser
        for i in xrange(self.num_browsers):
            cur.execute("INSERT INTO crawl (task_id, browser_params) VALUES (?,?)",
                        (self.task_id, json.dumps(browser_params[i])))
            self.db.commit()
            browser_params[i]['crawl_id'] = cur.lastrowid

        # Print the configuration details
        self.logger.info(get_configuration_string(self.manager_params,
                                                  browser_params,
                                                  (openwpm_v, browser_v)))

    def _initialize_browsers(self, browser_params):
        """ initialize the browser classes, each its unique set of parameters """
        browsers = list()
        for i in xrange(self.num_browsers):
            browsers.append(Browser(self.manager_params, browser_params[i]))

        return browsers

    def _launch_browsers(self):
        """ launch each browser manager process / browser """
        for browser in self.browsers:
            try:
                success = browser.launch_browser_manager()
            except:
                self._cleanup_before_fail(during_init=True)
                raise

            if not success:
                self.logger.critical("Browser spawn failure during TaskManager initialization, exiting...")
                self.close()
                break

            # Update our DB with the random browser settings
            # These are found within the scope of each instance of Browser in the browsers list
            screen_res = str(browser.browser_settings['screen_res'])
            ua_string = str(browser.browser_settings['ua_string'])
            self.sock.send(("UPDATE crawl SET screen_res = ?, ua_string = ? \
                             WHERE crawl_id = ?", (screen_res, ua_string, browser.crawl_id)))

    def _manager_watchdog(self):
        """
        Periodically checks the following:
        - memory consumption of all browsers every 10 seconds
        - presence of processes that are no longer in use
        """
        while not self.closing:
            time.sleep(10)

            # Check browser memory usage
            for browser in self.browsers:
                try:
                    process = psutil.Process(browser.browser_pid)
                    mem = process.memory_info()[0] / float(2 ** 20)
                    if mem > BROWSER_MEMORY_LIMIT:
                        self.logger.info("BROWSER %i: Memory usage: %iMB, exceeding limit of %iMB"
                            % (browser.crawl_id, int(mem), BROWSER_MEMORY_LIMIT))
                        browser.restart_required = True
                except psutil.NoSuchProcess:
                    pass

            # Check for browsers or displays that were not closed correctly
            # Provide a 300 second buffer to avoid killing freshly launched browsers
            # TODO This buffer should correspond to the maximum browser spawn timeout
            if self.process_watchdog:
                browser_pids = set()
                display_pids = set()
                check_time = time.time()
                for browser in self.browsers:
                    if browser.browser_pid is not None:
                        browser_pids.add(browser.browser_pid)
                    if browser.display_pid is not None:
                        display_pids.add(browser.display_pid)
                for process in psutil.process_iter():
                    if (process.create_time() + 300 < check_time and
                            ((process.name() == 'firefox' and process.pid not in browser_pids) or
                            (process.name() == 'Xvfb' and process.pid not in display_pids))):
                        self.logger.debug("Process: %s (pid: %i) with start time %s found running but not in browser process list. Killing."
                                % (process.name(), process.pid, process.create_time()))
                        process.kill()

    def _launch_aggregators(self):
        """
        Launches the various data aggregators, which serialize data from all processes.
        * DataAggregator - sqlite database for crawl data
        * LevelDBAggregator - leveldb database for javascript files
        """
        # DataAggregator
        self.aggregator_status_queue = Queue()
        self.data_aggregator = Process(target=DataAggregator.DataAggregator,
                             args=(self.manager_params, self.aggregator_status_queue))
        self.data_aggregator.daemon = True
        self.data_aggregator.start()
        self.manager_params['aggregator_address'] = self.aggregator_status_queue.get()  # socket location: (address, port)

        # LevelDB Aggregator
        if self.ldb_enabled:
            self.ldb_status_queue = Queue()
            self.ldb_aggregator = Process(target=LevelDBAggregator.LevelDBAggregator,
                                 args=(self.manager_params, self.ldb_status_queue))
            self.ldb_aggregator.daemon = True
            self.ldb_aggregator.start()
            self.manager_params['ldb_address'] = self.ldb_status_queue.get()  # socket location: (address, port)

    def _kill_aggregators(self):
        """ Terminates the aggregators gracefully """
        # DataAggregator
        self.logger.debug("Telling the DataAggregator to shut down...")
        self.aggregator_status_queue.put("DIE")
        start_time = time.time()
        self.data_aggregator.join(300)
        self.logger.debug("DataAggregator took " + str(time.time() - start_time) + " seconds to close")

        # LevelDB Aggregator
        if self.ldb_enabled:
            self.logger.debug("Telling the LevelDBAggregator to shut down...")
            self.ldb_status_queue.put("DIE")
            start_time = time.time()
            self.ldb_aggregator.join(300)
            self.logger.debug("LevelDBAggregator took " + str(time.time() - start_time) + " seconds to close")

    def _launch_loggingserver(self):
        """ sets up logging server """
        self.logging_status_queue = Queue()
        loggingserver = Process(target=MPLogger.loggingserver,
                             args=(self.manager_params['log_file'], self.logging_status_queue, ))
        loggingserver.daemon = True
        loggingserver.start()
        return loggingserver

    def _kill_loggingserver(self):
        """ terminates logging server gracefully """
        self.logging_status_queue.put("DIE")
        self.loggingserver.join(300)
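
The shutdown helpers above share one convention: put a "DIE" sentinel on the worker's status queue and join the process with a generous timeout. A self-contained sketch of a worker loop built around that convention (the sentinel string mirrors the one used above; everything else is illustrative):

from multiprocessing import Process, Queue

SHUTDOWN_SENTINEL = "DIE"


def worker_loop(status_queue):
    while True:
        msg = status_queue.get()  # block for the next control message
        if msg == SHUTDOWN_SENTINEL:
            break  # exit cleanly so the parent can join() us


if __name__ == '__main__':
    queue = Queue()
    worker = Process(target=worker_loop, args=(queue,))
    worker.start()
    queue.put(SHUTDOWN_SENTINEL)
    worker.join(300)  # same 300-second grace period the manager uses
    print("worker exited:", not worker.is_alive())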

    def _shutdown_manager(self, failure=False, during_init=False):
        """
        Wait for current commands to finish, close all child processes and
        threads
        <failure> flag to indicate manager failure (True) or end of crawl (False)
        <during_init> flag to indicate if this shutdown is occurring during TaskManager initialization
        """
        self.closing = True

        for browser in self.browsers:
            browser.shutdown_browser(during_init)
            if failure:
                self.sock.send(("UPDATE crawl SET finished = -1 WHERE crawl_id = ?",
                                (browser.crawl_id,)))
            else:
                self.sock.send(("UPDATE crawl SET finished = 1 WHERE crawl_id = ?",
                                (browser.crawl_id,)))

        self.db.close()  # close db connection
        self.sock.close()  # close socket to data aggregator
        self._kill_aggregators()
        self._kill_loggingserver()

    def _cleanup_before_fail(self, during_init=False):
        """
        Execute shutdown commands before throwing an exception
        This should keep us from having a bunch of hanging processes
        and incomplete data.
        <during_init> flag to indicate if this shutdown is occurring during
                      the TaskManager initialization
        """
        self._shutdown_manager(failure=True, during_init=during_init)

    def _check_failure_status(self):
        """ Check the status of command failures. Raise exceptions as necessary

        The failure status property is used by the various asynchronous
        command execution threads which interface with the
        remote browser manager processes. If a failure status is found, the
        appropriate steps are taken to gracefully close the infrastructure
        """
        self.logger.debug("Checking command failure status indicator...")
        if self.failure_status:
            self.logger.debug("TaskManager failure status set, halting command execution.")
            self._cleanup_before_fail()
            if self.failure_status['ErrorType'] == 'ExceedCommandFailureLimit':
                raise CommandExecutionError(
                    "TaskManager exceeded maximum consecutive command "
                    "execution failures.", self.failure_status['CommandSequence']
                )
            elif self.failure_status['ErrorType'] == 'ExceedLaunchFailureLimit':
                raise CommandExecutionError(
                    "TaskManager failed to launch browser within allowable "
                    "failure limit.", self.failure_status['CommandSequence']
                )
            elif self.failure_status['ErrorType'] == 'CriticalChildException':
                reraise(*cPickle.loads(self.failure_status['Exception']))
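
The 'CriticalChildException' branch re-raises an exception that a child process serialized together with its error type. A minimal sketch of that round trip, assuming the reraise helper comes from the six library (the import is not shown in the excerpt above) and keeping in mind that traceback objects themselves do not pickle:

import pickle
import sys

from six import reraise


def child_side():
    """Capture exception info in the child and serialize it for the parent."""
    try:
        raise ValueError("boom in the child")
    except ValueError:
        exc_type, exc_value, _ = sys.exc_info()
        # Ship only the type and value; the traceback is dropped here.
        return pickle.dumps((exc_type, exc_value, None))


def parent_side(payload):
    reraise(*pickle.loads(payload))  # surfaces the child's error in the parent


if __name__ == '__main__':
    try:
        parent_side(child_side())
    except ValueError as e:
        print("re-raised in parent:", e)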

    # CRAWLER COMMAND CODE

    def _distribute_command(self, command_sequence, index=None):
        """
        parses command type and issues command(s) to the proper browser
        <index> specifies the type of command this is:
        = None  -> first come, first serve
        =  #    -> index of browser to send command to
        = *     -> sends command to all browsers
        = **    -> sends command to all browsers (synchronized)
        """
        if index is None:
            #send to first browser available
            command_executed = False
            while True:
                for browser in self.browsers:
                    if browser.ready():
                        browser.current_timeout = command_sequence.total_timeout
                        thread = self._start_thread(browser, command_sequence)
                        command_executed = True
                        break
                if command_executed:
                    break
                time.sleep(SLEEP_CONS)

        elif 0 <= index < len(self.browsers):
            #send the command to this specific browser
            while True:
                if self.browsers[index].ready():
                    self.browsers[index].current_timeout = command_sequence.total_timeout
                    thread = self._start_thread(self.browsers[index], command_sequence)
                    break
                time.sleep(SLEEP_CONS)
        elif index == '*':
            #send the command to all browsers
            command_executed = [False] * len(self.browsers)
            while False in command_executed:
                for i in xrange(len(self.browsers)):
                    if self.browsers[i].ready() and not command_executed[i]:
                        self.browsers[i].current_timeout = command_sequence.total_timeout
                        thread = self._start_thread(self.browsers[i], command_sequence)
                        command_executed[i] = True
                time.sleep(SLEEP_CONS)
        elif index == '**':
            #send the command to all browsers and sync it
            condition = threading.Condition()  # Used to block threads until ready
            command_executed = [False] * len(self.browsers)
            while False in command_executed:
                for i in xrange(len(self.browsers)):
                    if self.browsers[i].ready() and not command_executed[i]:
                        self.browsers[i].current_timeout = command_sequence.total_timeout
                        thread = self._start_thread(self.browsers[i], command_sequence, condition)
                        command_executed[i] = True
                time.sleep(SLEEP_CONS)
            with condition:
                condition.notifyAll()  # All browsers loaded, tell them to start
        else:
            self.logger.info("Command index type is not supported or out of range")
            return

        if command_sequence.blocking:
            thread.join()
            self._check_failure_status()
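
The '**' branch synchronizes the browsers on a shared threading.Condition: each command thread blocks in wait() and the dispatcher releases them all at once with notifyAll(). A small sketch of that start-gate pattern with placeholder workers (the sleep is a simplification standing in for "all command threads have been issued"):

import threading
import time


def gated_worker(condition, worker_id):
    with condition:
        condition.wait()  # block until the dispatcher opens the gate
    print("worker %d started" % worker_id)


if __name__ == '__main__':
    condition = threading.Condition()
    workers = [threading.Thread(target=gated_worker, args=(condition, i))
               for i in range(3)]
    for worker in workers:
        worker.start()
    time.sleep(0.5)  # crude stand-in for "every worker reached wait()"
    with condition:
        condition.notify_all()  # release all workers simultaneously
    for worker in workers:
        worker.join()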

    def _start_thread(self, browser, command_sequence, condition=None):
        """  starts the command execution thread """

        # Check status flags before starting thread
        if self.closing:
            self.logger.error("Attempted to execute command on a closed TaskManager")
            return
        self._check_failure_status()

        browser.set_visit_id(self.next_visit_id)
        self.sock.send(("INSERT INTO site_visits (visit_id, crawl_id, site_url) VALUES (?,?,?)",
                        (self.next_visit_id, browser.crawl_id, command_sequence.url)))
        self.next_visit_id += 1

        # Start command execution thread
        args = (browser, command_sequence, condition)
        thread = threading.Thread(target=self._issue_command, args=args)
        browser.command_thread = thread
        thread.daemon = True
        thread.start()
        return thread
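
The tuples sent over self.sock are (parameterized SQL, parameters) pairs that the aggregator executes against the crawl database; parameterization keeps site URLs out of string formatting. A standalone sqlite3 sketch of the same statement shape (the in-memory database and table definition are illustrative):

import sqlite3

conn = sqlite3.connect(':memory:')  # throwaway DB standing in for the crawl DB
conn.execute("CREATE TABLE site_visits "
             "(visit_id INTEGER, crawl_id INTEGER, site_url TEXT)")

# Same (query, parameters) shape the manager ships to the aggregator.
query = ("INSERT INTO site_visits (visit_id, crawl_id, site_url) "
         "VALUES (?,?,?)",
         (1, 1, 'http://example.com'))
conn.execute(*query)
conn.commit()

print(conn.execute("SELECT * FROM site_visits").fetchall())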

    def _issue_command(self, browser, command_sequence, condition=None):
        """
        sends command tuple to the BrowserManager
        """
        browser.is_fresh = False  # since we are issuing a command, the BrowserManager is no longer a fresh instance

        # if this is a synced call, block on condition
        if condition is not None:
            with condition:
                condition.wait()

        reset = command_sequence.reset
        start_time = None  # tracks when a site visit started, so that flash/profile
                           # cookies can be properly tracked.
        for command_and_timeout in command_sequence.commands_with_timeout:
            command, timeout = command_and_timeout
            if command[0] in ['GET', 'BROWSE']:
                start_time = time.time()
                command += (browser.curr_visit_id,)
            elif command[0] in ['DUMP_FLASH_COOKIES', 'DUMP_PROFILE_COOKIES']:
                command += (start_time, browser.curr_visit_id,)
            browser.current_timeout = timeout
            # passes off command and waits for a success (or failure signal)
            browser.command_queue.put(command)
            command_succeeded = 0  # 1: success, 0: failure from error, -1: timeout
            command_arguments = command[1] if len(command) > 1 else None

            # received reply from BrowserManager, either success signal or failure notice
            try:
                status = browser.status_queue.get(True, browser.current_timeout)
                if status == "OK":
                    command_succeeded = 1
                elif status[0] == "CRITICAL":
                    self.logger.critical("BROWSER %i: Received critical error "
                                         "from browser process while executing "
                                         "command %s. Setting failure status." % (
                                             browser.crawl_id, str(command)))
                    self.failure_status = {
                        'ErrorType': 'CriticalChildException',
                        'CommandSequence': command_sequence,
                        'Exception': status[1]
                    }
                    return
                else:
                    command_succeeded = 0
                    self.logger.info("BROWSER %i: Received failure status while"
                                     " executing command: %s" % (browser.crawl_id, command[0]))
            except EmptyQueue:
                command_succeeded = -1
                self.logger.info("BROWSER %i: Timeout while executing command, "
                                 "%s, killing browser manager" % (browser.crawl_id, command[0]))

            self.sock.send(("INSERT INTO CrawlHistory (crawl_id, command, arguments, bool_success)"
                            " VALUES (?,?,?,?)",
                            (browser.crawl_id, command[0], command_arguments, command_succeeded)))

            if command_succeeded != 1:
                with self.threadlock:
                    self.failurecount += 1
                if self.failurecount > self.failure_limit:
                    self.logger.critical("BROWSER %i: Command execution failure"
                                         " pushes failure count above the allowable limit."
                                         " Setting failure_status." % browser.crawl_id)
                    self.failure_status = {
                        'ErrorType': 'ExceedCommandFailureLimit',
                        'CommandSequence': command_sequence
                    }
                    return
                browser.restart_required = True
            else:
                with self.threadlock:
                    self.failurecount = 0

            if browser.restart_required:
                break

        # Sleep after executing CommandSequence to provide extra time for
        # internal buffers to drain. Stopgap in support of #135
        time.sleep(2)

        if self.closing:
            return

        if browser.restart_required or reset:
            success = browser.restart_browser_manager(clear_profile = reset)
            if not success:
                self.logger.critical("BROWSER %i: Exceeded the maximum allowable "
                                     "consecutive browser launch failures. "
                                     "Setting failure_status." % browser.crawl_id)
                self.failure_status = {
                    'ErrorType': 'ExceedLaunchFailureLimit',
                    'CommandSequence': command_sequence
                }
                return
            browser.restart_required = False
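
At its core, _issue_command is a request/reply loop over a pair of multiprocessing queues: push the command tuple, then block on the status queue for at most the command's timeout and treat an empty queue as a timeout failure. A stripped-down, runnable sketch of that loop (the stub manager and status values are placeholders):

from multiprocessing import Process, Queue

try:
    from Queue import Empty as EmptyQueue  # Python 2, as in the code above
except ImportError:
    from queue import Empty as EmptyQueue  # Python 3


def browser_manager_stub(command_queue, status_queue):
    command = command_queue.get()
    # A real BrowserManager would execute the command; just acknowledge it.
    status_queue.put("OK")


if __name__ == '__main__':
    command_queue, status_queue = Queue(), Queue()
    manager = Process(target=browser_manager_stub,
                      args=(command_queue, status_queue))
    manager.start()
    command_queue.put(('GET', 'http://example.com'))
    try:
        status = status_queue.get(True, 60)  # block up to the command timeout
        command_succeeded = 1 if status == "OK" else 0
    except EmptyQueue:
        command_succeeded = -1  # timeout: the browser would be restarted
    print("command_succeeded =", command_succeeded)
    manager.join()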

    def execute_command_sequence(self, command_sequence, index=None):
        self._distribute_command(command_sequence, index)

    # DEFINITIONS OF HIGH LEVEL COMMANDS
    # NOTE: These wrappers are provided for convenience. To issue sequential
    # commands to the same browser in a single 'visit', use the CommandSequence
    # class directly.

    def get(self, url, index=None, timeout=60, sleep=0, reset=False):
        """ goes to a url """
        command_sequence = CommandSequence.CommandSequence(url)
        command_sequence.get(timeout=timeout, sleep=sleep)
        command_sequence.reset = reset
        self.execute_command_sequence(command_sequence, index=index)

    def browse(self, url, num_links=2, sleep=0, index=None, timeout=60, reset=False):
        """ browse a website and visit <num_links> links on the page """
        command_sequence = CommandSequence.CommandSequence(url)
        command_sequence.browse(num_links=num_links, sleep=sleep, timeout=timeout)
        command_sequence.reset = reset
        self.execute_command_sequence(command_sequence, index=index)
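
The get/browse wrappers are thin conveniences over CommandSequence. A hypothetical usage sketch, not runnable on its own since it assumes `manager` is an already-initialized TaskManager and CommandSequence is imported as in the code above:

sites = ['http://example.com', 'http://example.org']

# Convenience wrapper: one GET per site, handled by the first free browser.
for site in sites:
    manager.get(site, sleep=5, timeout=60)

# Equivalent, built explicitly so further commands could share the same visit:
command_sequence = CommandSequence.CommandSequence(sites[0])
command_sequence.get(sleep=5, timeout=60)
manager.execute_command_sequence(command_sequence, index=None)

manager.close()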


    def close(self):
        """
        Execute shutdown procedure for TaskManager
        """
        if self.closing:
            self.logger.error("TaskManager already closed")
            return
        self._shutdown_manager()
Example #48
    def launch_browser_manager(self):
        """
        sets up the BrowserManager and records its process id, the browser
        pid and, if applicable, the display pid. loads the associated user
        profile if necessary
        """
        # if this is restarting from a crash, update the tar location
        # to be a tar of the crashed browser's history
        if self.current_profile_path is not None:
            # tar contents of crashed profile to a temp dir
            tempdir = tempfile.mkdtemp(prefix="owpm_profile_archive_") + "/"
            profile_commands.dump_profile(
                self.current_profile_path,
                self.manager_params,
                self.browser_params,
                tempdir,
                close_webdriver=False,
                browser_settings=self.browser_settings
            )
            # make sure browser loads crashed profile
            self.browser_params['profile_tar'] = tempdir
            # don't re-randomize attributes
            self.browser_params['random_attributes'] = False
            crash_recovery = True
        else:
            tempdir = None
            crash_recovery = False
        self.is_fresh = not crash_recovery

        # Try to spawn the browser within the time limit
        unsuccessful_spawns = 0
        success = False

        def check_queue(launch_status):
            result = self.status_queue.get(True, self._SPAWN_TIMEOUT)
            if result[0] == 'STATUS':
                launch_status[result[1]] = True
                return result[2]
            elif result[0] == 'CRITICAL':
                reraise(*pickle.loads(result[1]))
            elif result[0] == 'FAILED':
                raise BrowserCrashError(
                    'Browser spawn returned failure status')

        while (not success and
                unsuccessful_spawns < self._UNSUCCESSFUL_SPAWN_LIMIT):
            self.logger.debug("BROWSER %i: Spawn attempt %i " % (
                self.crawl_id, unsuccessful_spawns))
            # Resets the command/status queues
            (self.command_queue, self.status_queue) = (Queue(), Queue())

            # builds and launches the browser_manager
            args = (self.command_queue, self.status_queue, self.browser_params,
                    self.manager_params, crash_recovery)
            self.browser_manager = Process(target=BrowserManager, args=args)
            self.browser_manager.daemon = True
            self.browser_manager.start()

            # Read success status of browser manager
            launch_status = dict()
            try:
                # 1. Selenium profile created
                spawned_profile_path = check_queue(launch_status)
                # 2. Profile tar loaded (if necessary)
                check_queue(launch_status)
                # 3. Display launched
                (self.display_pid, self.display_port) = check_queue(
                    launch_status)
                # 4. Browser launch attempted
                check_queue(launch_status)
                # 5. Browser launched
                (self.browser_pid, self.browser_settings) = check_queue(
                    launch_status)

                (driver_profile_path, ready) = check_queue(launch_status)
                if ready != 'READY':
                    self.logger.error(
                        "BROWSER %i: Mismatch of status queue return values, "
                        "trying again..." % self.crawl_id
                    )
                    unsuccessful_spawns += 1
                    continue
                success = True
            except (EmptyQueue, BrowserCrashError):
                unsuccessful_spawns += 1
                error_string = ''
                status_strings = [
                    'Proxy Ready', 'Profile Created', 'Profile Tar', 'Display',
                    'Launch Attempted', 'Browser Launched', 'Browser Ready']
                for string in status_strings:
                    error_string += " | %s: %s " % (
                        string, launch_status.get(string, False))
                self.logger.error(
                    "BROWSER %i: Spawn unsuccessful %s" % (self.crawl_id,
                                                           error_string))
                self.kill_browser_manager()
                if 'Profile Created' in launch_status:
                    shutil.rmtree(spawned_profile_path, ignore_errors=True)

        # If the browser spawned successfully, we should update the
        # current profile path class variable and clean up the tempdir
        # and previous profile path.
        if success:
            self.logger.debug(
                "BROWSER %i: Browser spawn sucessful!" % self.crawl_id)
            previous_profile_path = self.current_profile_path
            self.current_profile_path = driver_profile_path
            if driver_profile_path != spawned_profile_path:
                shutil.rmtree(spawned_profile_path, ignore_errors=True)
            if previous_profile_path is not None:
                shutil.rmtree(previous_profile_path, ignore_errors=True)
            if tempdir is not None:
                shutil.rmtree(tempdir, ignore_errors=True)

        return success
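
The launch handshake above drains a fixed sequence of ('STATUS', name, data) tuples from the child's status queue, bailing out if a 'CRITICAL' or 'FAILED' message arrives instead. A simplified, runnable sketch of that protocol with an inlined fake child (only 'STATUS' and 'FAILED' are handled, and all names and values are illustrative):

from multiprocessing import Process, Queue


class BrowserCrashError(Exception):
    """Stand-in for the exception class referenced above."""


def fake_browser_manager(status_queue):
    # The child reports each launch milestone as ('STATUS', name, data).
    status_queue.put(('STATUS', 'Profile Created', '/tmp/profile.stub'))
    status_queue.put(('STATUS', 'Browser Launched', (4242, {'ua_string': 'stub'})))
    status_queue.put(('STATUS', 'Browser Ready', ('/tmp/profile.stub', 'READY')))


def check_queue(status_queue, launch_status, timeout=120):
    result = status_queue.get(True, timeout)
    if result[0] == 'STATUS':
        launch_status[result[1]] = True
        return result[2]
    elif result[0] == 'FAILED':
        raise BrowserCrashError('Browser spawn returned failure status')


if __name__ == '__main__':
    status_queue = Queue()
    child = Process(target=fake_browser_manager, args=(status_queue,))
    child.start()
    launch_status = dict()
    profile_path = check_queue(status_queue, launch_status)
    browser_pid, settings = check_queue(status_queue, launch_status)
    driver_path, ready = check_queue(status_queue, launch_status)
    child.join()
    print(launch_status, ready == 'READY')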
Example #49
class Browser:
    """
     The Browser class is responsible for holding all of the
     configuration and status information for the BrowserManager process
     it corresponds to. It also includes a set of methods for managing
     the BrowserManager process and its child processes/threads.
     <manager_params> are the TaskManager configuration settings.
     <browser_params> are per-browser parameter settings (e.g. whether
                      this browser is headless, etc.)
     """

    def __init__(self, manager_params, browser_params):
        # Constants
        self._SPAWN_TIMEOUT = 120  # seconds
        self._UNSUCCESSFUL_SPAWN_LIMIT = 4

        # manager parameters
        self.current_profile_path = None
        self.db_socket_address = manager_params['aggregator_address']
        self.logger_address = manager_params['logger_address']
        self.crawl_id = browser_params['crawl_id']
        self.curr_visit_id = None
        self.browser_params = browser_params
        self.manager_params = manager_params

        # Queues and process IDs for BrowserManager

        # thread to run commands issued by the TaskManager
        self.command_thread = None
        # queue for passing command tuples to BrowserManager
        self.command_queue = None
        # queue for receiving command execution status from BrowserManager
        self.status_queue = None
        # pid for browser instance controlled by BrowserManager
        self.browser_pid = None
        # the pid of the display for the headless browser (if it exists)
        self.display_pid = None
        # the port of the display for the headless browser (if it exists)
        self.display_port = None

        # boolean that says if the BrowserManager is new (to optimize restarts)
        self.is_fresh = True
        # boolean indicating if the browser should be restarted
        self.restart_required = False

        self.current_timeout = None  # timeout of the current command
        # dict of additional browser profile settings (e.g. screen_res)
        self.browser_settings = None
        self.browser_manager = None  # process that controls browser
        self.logger = loggingclient(*self.logger_address)

    def ready(self):
        """ return if the browser is ready to accept a command """
        return (self.command_thread is None or
                not self.command_thread.is_alive())

    def set_visit_id(self, visit_id):
        self.curr_visit_id = visit_id

    def launch_browser_manager(self):
        """
        sets up the BrowserManager and records its process id, the browser
        pid and, if applicable, the display pid. loads the associated user
        profile if necessary
        """
        # if this is restarting from a crash, update the tar location
        # to be a tar of the crashed browser's history
        if self.current_profile_path is not None:
            # tar contents of crashed profile to a temp dir
            tempdir = tempfile.mkdtemp(prefix="owpm_profile_archive_") + "/"
            profile_commands.dump_profile(
                self.current_profile_path,
                self.manager_params,
                self.browser_params,
                tempdir,
                close_webdriver=False,
                browser_settings=self.browser_settings
            )
            # make sure browser loads crashed profile
            self.browser_params['profile_tar'] = tempdir
            # don't re-randomize attributes
            self.browser_params['random_attributes'] = False
            crash_recovery = True
        else:
            tempdir = None
            crash_recovery = False
        self.is_fresh = not crash_recovery

        # Try to spawn the browser within the time limit
        unsuccessful_spawns = 0
        success = False

        def check_queue(launch_status):
            result = self.status_queue.get(True, self._SPAWN_TIMEOUT)
            if result[0] == 'STATUS':
                launch_status[result[1]] = True
                return result[2]
            elif result[0] == 'CRITICAL':
                reraise(*pickle.loads(result[1]))
            elif result[0] == 'FAILED':
                raise BrowserCrashError(
                    'Browser spawn returned failure status')

        while (not success and
                unsuccessful_spawns < self._UNSUCCESSFUL_SPAWN_LIMIT):
            self.logger.debug("BROWSER %i: Spawn attempt %i " % (
                self.crawl_id, unsuccessful_spawns))
            # Resets the command/status queues
            (self.command_queue, self.status_queue) = (Queue(), Queue())

            # builds and launches the browser_manager
            args = (self.command_queue, self.status_queue, self.browser_params,
                    self.manager_params, crash_recovery)
            self.browser_manager = Process(target=BrowserManager, args=args)
            self.browser_manager.daemon = True
            self.browser_manager.start()

            # Read success status of browser manager
            launch_status = dict()
            try:
                # 1. Selenium profile created
                spawned_profile_path = check_queue(launch_status)
                # 2. Profile tar loaded (if necessary)
                check_queue(launch_status)
                # 3. Display launched
                (self.display_pid, self.display_port) = check_queue(
                    launch_status)
                # 4. Browser launch attempted
                check_queue(launch_status)
                # 5. Browser launched
                (self.browser_pid, self.browser_settings) = check_queue(
                    launch_status)

                (driver_profile_path, ready) = check_queue(launch_status)
                if ready != 'READY':
                    self.logger.error(
                        "BROWSER %i: Mismatch of status queue return values, "
                        "trying again..." % self.crawl_id
                    )
                    unsuccessful_spawns += 1
                    continue
                success = True
            except (EmptyQueue, BrowserCrashError):
                unsuccessful_spawns += 1
                error_string = ''
                status_strings = [
                    'Proxy Ready', 'Profile Created', 'Profile Tar', 'Display',
                    'Launch Attempted', 'Browser Launched', 'Browser Ready']
                for string in status_strings:
                    error_string += " | %s: %s " % (
                        string, launch_status.get(string, False))
                self.logger.error(
                    "BROWSER %i: Spawn unsuccessful %s" % (self.crawl_id,
                                                           error_string))
                self.kill_browser_manager()
                if 'Profile Created' in launch_status:
                    shutil.rmtree(spawned_profile_path, ignore_errors=True)

        # If the browser spawned successfully, we should update the
        # current profile path class variable and clean up the tempdir
        # and previous profile path.
        if success:
            self.logger.debug(
                "BROWSER %i: Browser spawn sucessful!" % self.crawl_id)
            previous_profile_path = self.current_profile_path
            self.current_profile_path = driver_profile_path
            if driver_profile_path != spawned_profile_path:
                shutil.rmtree(spawned_profile_path, ignore_errors=True)
            if previous_profile_path is not None:
                shutil.rmtree(previous_profile_path, ignore_errors=True)
            if tempdir is not None:
                shutil.rmtree(tempdir, ignore_errors=True)

        return success

    def restart_browser_manager(self, clear_profile=False):
        """
        kill and restart the two worker processes
        <clear_profile> marks whether we want to wipe the old profile
        """
        self.logger.info("BROWSER %i: BrowserManager restart initiated. "
                         "Clear profile? %s" % (self.crawl_id, clear_profile))
        if self.is_fresh:  # Return success if browser is fresh
            self.logger.info("BROWSER %i: Skipping restart since the browser "
                             "is a fresh instance already" % self.crawl_id)
            return True

        self.kill_browser_manager()

        # if crawl should be stateless we can clear profile
        if clear_profile and self.current_profile_path is not None:
            shutil.rmtree(self.current_profile_path, ignore_errors=True)
            self.current_profile_path = None
            self.browser_params['profile_tar'] = None

        return self.launch_browser_manager()

    def kill_browser_manager(self):
        """Kill the BrowserManager process and all of its children"""
        self.logger.debug(
            "BROWSER %i: Attempting to kill BrowserManager with pid %i. "
            "Display PID: %s | Display Port: %s | Browser PID: %s" % (
                self.crawl_id, self.browser_manager.pid, self.display_pid,
                self.display_port, self.browser_pid)
        )
        if (self.browser_manager is not None and
                self.browser_manager.pid is not None):
            try:
                os.kill(self.browser_manager.pid, signal.SIGKILL)
            except OSError:
                self.logger.debug("BROWSER %i: Browser manager process does "
                                  "not exist" % self.crawl_id)
                pass
        if self.display_pid is not None:
            try:
                os.kill(self.display_pid, signal.SIGKILL)
            except OSError:
                self.logger.debug("BROWSER %i: Display process does not "
                                  "exit" % self.crawl_id)
                pass
            except TypeError:
                self.logger.error("BROWSER %i: PID may not be the correct "
                                  "type %s" % (self.crawl_id,
                                               str(self.display_pid)))
        if self.display_port is not None:  # Xvfb display lock
            lockfile = "/tmp/.X%s-lock" % self.display_port
            try:
                os.remove(lockfile)
            except OSError:
                self.logger.debug("BROWSER %i: Screen lockfile (%s) already "
                                  "removed" % (self.crawl_id, lockfile))
                pass
        if self.browser_pid is not None:
            """`browser_pid` is the geckodriver process. We first kill
            the child processes (i.e. firefox) and then kill the geckodriver
            process."""
            try:
                geckodriver = psutil.Process(pid=self.browser_pid)
                for child in geckodriver.children():
                    try:
                        child.kill()
                    except psutil.NoSuchProcess:
                        self.logger.debug(
                            "BROWSER %i: Geckodriver child process already "
                            "killed (pid=%i)." % (self.crawl_id, child.pid))
                        pass
                geckodriver.kill()
                geckodriver.wait(timeout=20)
                for child in geckodriver.children():
                    child.wait(timeout=20)
            except psutil.NoSuchProcess:
                self.logger.debug("BROWSER %i: Geckodriver process already "
                                  "killed." % self.crawl_id)
                pass
            except psutil.TimeoutExpired:
                self.logger.debug("BROWSER %i: Timeout while waiting for "
                                  "geckodriver or browser process to close " %
                                  self.crawl_id)
                pass
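
The geckodriver cleanup above is a standard psutil process-tree teardown: kill the children, kill the parent, then wait so nothing is left as a zombie. A generic sketch of that pattern which spawns a throwaway child so it can be run as-is (assumes a POSIX system with a sleep binary):

import subprocess

import psutil


def kill_process_tree(pid, timeout=20):
    """Kill a process and its children, tolerating already-dead processes."""
    try:
        parent = psutil.Process(pid=pid)
        children = parent.children()
        for child in children:
            try:
                child.kill()
            except psutil.NoSuchProcess:
                pass  # the child exited on its own
        parent.kill()
        parent.wait(timeout=timeout)
        for child in children:
            child.wait(timeout=timeout)
    except psutil.NoSuchProcess:
        pass  # the parent was already gone
    except psutil.TimeoutExpired:
        print("timed out waiting for pid %i to exit" % pid)


if __name__ == '__main__':
    throwaway = subprocess.Popen(['sleep', '60'])
    kill_process_tree(throwaway.pid)
    throwaway.wait()  # reap the Popen handle itself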

    def shutdown_browser(self, during_init):
        """ Runs the closing tasks for this Browser/BrowserManager """
        # Join command thread
        if self.command_thread is not None:
            self.logger.debug(
                "BROWSER %i: Joining command thread" % self.crawl_id)
            start_time = time.time()
            if self.current_timeout is not None:
                self.command_thread.join(self.current_timeout + 10)
            else:
                self.command_thread.join(60)
            self.logger.debug(
                "BROWSER %i: %f seconds to join command thread" % (
                    self.crawl_id, time.time() - start_time))

        # Kill BrowserManager process and children
        self.logger.debug(
            "BROWSER %i: Killing browser manager..." % self.crawl_id)
        self.kill_browser_manager()

        # Archive browser profile (if requested)
        self.logger.debug(
            "BROWSER %i: during_init=%s | profile_archive_dir=%s" % (
                self.crawl_id, str(during_init),
                self.browser_params['profile_archive_dir'])
        )
        if (not during_init and
                self.browser_params['profile_archive_dir'] is not None):
            self.logger.debug(
                "BROWSER %i: Archiving browser profile directory to %s" % (
                    self.crawl_id, self.browser_params['profile_archive_dir']))
            profile_commands.dump_profile(
                self.current_profile_path,
                self.manager_params,
                self.browser_params,
                self.browser_params['profile_archive_dir'],
                close_webdriver=False,
                browser_settings=self.browser_settings,
                compress=True,
                save_flash=self.browser_params['disable_flash'] is False
            )

        # Clean up temporary files
        if self.current_profile_path is not None:
            shutil.rmtree(self.current_profile_path, ignore_errors=True)