Exemplo n.º 1
0
def test():
    """Demonstrate a queue-based worker pool.

    Two batches of ``(function, args)`` tasks are pushed through
    ``task_queue`` to a fixed number of worker processes, and whatever the
    workers place on ``done_queue`` is printed as it arrives. ``mul``,
    ``plus`` and ``worker`` are defined outside this function.
    """
    worker_count = 4
    first_batch = [(mul, (i, 7)) for i in range(20)]
    second_batch = [(plus, (i, 8)) for i in range(10)]

    # One queue carries work to the children, the other carries results back.
    task_queue = Queue()
    done_queue = Queue()

    # Queue the first batch before any worker exists.
    for job in first_batch:
        task_queue.put(job)

    # Spin up the worker processes.
    for _ in range(worker_count):
        child = Process(target=worker, args=(task_queue, done_queue))
        child.start()

    # Expect exactly one result per submitted task.
    print('Unordered results:')
    for _ in first_batch:
        print('\t', done_queue.get())

    # Feed the second batch one task at a time.
    for job in second_batch:
        task_queue.put(job)

    for _ in second_batch:
        print('\t', done_queue.get())

    # One 'STOP' marker per child tells the child processes to stop.
    for _ in range(worker_count):
        task_queue.put('STOP')
Exemplo n.º 2
0
class Robot():
    """Interactive controller that reads commands and queues them for the car.

    The constructor builds two ``speed.Speed`` objects and one ``move.Car``
    object. The numeric arguments are presumably GPIO pin numbers (confirm
    against the ``speed`` / ``move`` modules). Commands typed by the user are
    placed on ``self.movequeue``; the consumer of that queue is not part of
    this class.
    """

    def __init__(self):
        # Reset GPIO state before the helper objects are created.
        GPIO.cleanup()
        self.speed_left = speed.Speed(40, "speed-left")
        self.speed_right = speed.Speed(38, "speed-right")
        self.move = move.Car(31, 33, 35, 37)

        # Loop flag for run(); cleared by stop().
        self.robotrun = True
        self.movequeue = Queue()

    def run(self):
        """Start the helpers and queue user commands until stopped.

        The loop ends when ``self.robotrun`` becomes false or when the user
        presses Ctrl-C. ``stop()`` is then called exactly once.
        """
        self.speed_left.start()
        self.speed_right.start()
        self.move.start()
        # The flag lives on the instance; a bare ``robotrun`` name would
        # raise NameError on the first iteration.
        while self.robotrun:
            try:
                command = raw_input("Command: ")
                self.movequeue.put(command)
            except KeyboardInterrupt:
                # Leave the loop; the single stop() below does the cleanup.
                break
        self.stop()
        return

    def stop(self):
        """Stop the helpers, release GPIO and clear the run flag."""
        self.speed_left.stop()
        self.speed_right.stop()
        self.move.stop()
        GPIO.cleanup()
        # Must be set on the instance so that run() observes it.
        self.robotrun = False
        return
Exemplo n.º 3
0
def test():
    NUMBER_OF_PROCESSES = 4
    TASKS1 = [(mul, (i, 7)) for i in range(20)]
    TASKS2 = [(plus, (i, 8)) for i in range(10)]

    # Create queues
    task_queue = Queue()
    done_queue = Queue()

    # Submit tasks
    map(task_queue.put, TASKS1)

    # Start worker processes
    for i in range(NUMBER_OF_PROCESSES):
        Process(target=worker, args=(task_queue, done_queue)).start()

    # Get and print results
    print "Unordered results:"
    for i in range(len(TASKS1)):
        print "\t", done_queue.get()

    # Add more tasks using `put()` instead of `putMany()`
    for task in TASKS2:
        task_queue.put(task)

    # Get and print some more results
    for i in range(len(TASKS2)):
        print "\t", done_queue.get()

    # Tell child processes to stop
    for i in range(NUMBER_OF_PROCESSES):
        task_queue.put("STOP")
Exemplo n.º 4
0
class MemoryUnstructuredProvider(UnstructuredStorageProvider):
    """In-memory unstructured storage provider.

    Blobs are kept in ``self.storage``, a dict mapping filename to content,
    and every stored blob is also put on ``self.queue`` (which is handed to
    a ``MemoryProviderHandle``). Nothing is persisted, so this provider is
    meant for tests and for small crawls that need no persistence.
    """

    def __init__(self) -> None:
        self.storage: Dict[str, bytes] = {}
        self.queue = Queue()
        self.handle = MemoryProviderHandle(self.queue)

    async def init(self) -> None:
        """No asynchronous set-up is needed for the in-memory provider."""
        return None

    async def store_blob(
        self,
        filename: str,
        blob: bytes,
        compressed: bool = True,
        skip_if_exists: bool = True,
    ) -> None:
        """Store ``blob`` under ``filename`` and announce it on the queue.

        When ``skip_if_exists`` is true and the filename is already stored,
        nothing happens. When ``compressed`` is true the blob is first passed
        through ``self._compress`` and the resulting buffer's bytes are what
        get stored and queued.
        """
        if skip_if_exists and filename in self.storage:
            return
        payload = self._compress(blob).getvalue() if compressed else blob
        self.storage[filename] = payload
        self.queue.put((filename, payload))

    async def flush_cache(self) -> None:
        """Nothing is cached, so there is nothing to flush."""
        return None

    async def shutdown(self) -> None:
        """No resources need releasing."""
        return None
Exemplo n.º 5
0
class BaseAggregator(object):
    """Common base for data aggregator front ends.

    A BaseAggregator works together with a BaseListener: it spawns the
    listener as a child process that combines data from multiple crawl
    processes, and it owns that child's lifecycle.

    Parameters
    ----------
    manager_params : dict
        TaskManager configuration parameters
    browser_params : list of dict
        List of browser configuration dictionaries
    """
    # NOTE(review): ``__metaclass__`` is only honoured by Python 2.
    __metaclass__ = abc.ABCMeta

    def __init__(self, manager_params, browser_params):
        self.manager_params = manager_params
        self.browser_params = browser_params
        self.logger = loggingclient(*manager_params['logger_address'])
        # Both are filled in by launch() and cleared again by shutdown().
        self.listener_address = self.listener_process = None

    @abc.abstractmethod
    def save_configuration(self, openwpm_version, browser_version):
        """Persist the configuration details to the database."""

    @abc.abstractmethod
    def get_next_visit_id(self):
        """Return a unique visit ID that keys a single page visit."""

    @abc.abstractmethod
    def get_next_crawl_id(self):
        """Return a unique crawl ID that keys a browser instance."""

    def launch(self, listener_process_runner):
        """Start the listener as a daemon process and record its address.

        ``listener_process_runner`` is run in the child with the manager
        parameters and a fresh status queue. This call blocks until the
        child puts its first item on that queue, which is stored as
        ``self.listener_address``.
        """
        status_queue = Queue()
        self.status_queue = status_queue
        listener = Process(
            target=listener_process_runner,
            args=(self.manager_params, status_queue))
        self.listener_process = listener
        listener.daemon = True
        listener.start()
        self.listener_address = status_queue.get()

    def shutdown(self):
        """Ask the listener process to stop and wait for it to exit.

        The string "SHUTDOWN" is put on the status queue, then the listener
        is joined with a timeout of 300. The elapsed time is logged and the
        listener attributes are reset to None.
        """
        class_name = type(self).__name__
        self.logger.debug(
            "Sending the shutdown signal to the %s listener process..." %
            class_name
        )
        self.status_queue.put("SHUTDOWN")
        began = time.time()
        self.listener_process.join(300)
        elapsed = time.time() - began
        self.logger.debug(
            "%s took %s seconds to close." % (class_name, str(elapsed))
        )
        self.listener_address = None
        self.listener_process = None
Exemplo n.º 6
0
def _moda_dynamic_bayesian_inference(queue: Queue, signal1: TimeSeries,
                                     signal2: TimeSeries, params: ParamSet):
    """
    UNUSED.

    Runs Bayesian inference through the MATLAB-packaged ``full_bayesian``
    function and puts ``(signal1.name, *result)`` on ``queue``.

    Kept for reference only: it causes a serious error on Linux. Use the
    Python implementation of Bayesian inference (`bayesian.py`) instead.
    """
    import full_bayesian
    import matlab

    engine = full_bayesian.initialize()

    # Convert both signals to MATLAB doubles.
    first = matlab.double(signal1.signal.tolist())
    second = matlab.double(signal2.signal.tolist())

    # Argument order follows the packaged function's signature: the two
    # frequency ranges are flattened in place, then the scalar settings.
    call_args = (
        first,
        second,
        *params.freq_range1,
        *params.freq_range2,
        signal1.frequency,
        params.window,
        params.propagation_const,
        params.overlap,
        params.order,
        params.surr_count,
        params.confidence_level,
    )
    outcome = engine.full_bayesian(*call_args)

    queue.put((signal1.name, *outcome))
Exemplo n.º 7
0
class MemoryArrowProvider(ArrowProvider):
    """Arrow provider that forwards every table to an in-process queue.

    Tables passed to ``write_table`` are put on ``self.queue`` as
    ``(table_name, table)`` pairs; ``self.handle`` wraps the same queue in a
    ``MemoryProviderHandle``.
    """

    def __init__(self) -> None:
        super().__init__()
        self.queue = Queue()
        self.handle = MemoryProviderHandle(self.queue)

    async def write_table(self, table_name: TableName, table: Table) -> None:
        """Put the named table on the queue."""
        entry = (table_name, table)
        self.queue.put(entry)

    async def shutdown(self) -> None:
        """No resources need releasing."""
        return None
    def run(self):
        """Process loop: handle queued commands and sample data until stopped.

        After announcing itself with a ``("return", name)`` token, the loop
        repeatedly (1) handles at most one pending command from
        ``self._cmds_q`` and (2) while recording, consumes one timestamp from
        ``self._ok_q``, samples data and stores ``(timestamp, data)`` in the
        current data queue. A ``'stop'`` command ends the loop; a
        KeyboardInterrupt exits the process with status 0.
        """
        setproctitle('python.DataStreamRecorder.{0}'.format(self._name))
        try:
            logging.debug("Starting data recording on {0}".format(self.name))
            self._tokens_q.put(("return", self.name))
            while True:
                # --- command handling -------------------------------------
                if not self._cmds_q.empty():
                    cmd = self._cmds_q.get()
                    action = cmd[0]
                    if action == 'stop':
                        break
                    if action == 'pause':
                        self._recording = False
                        if self._saving_cache:
                            # Persist the current segment and open a new one.
                            self._save_cache(self._cur_data_segment)
                            self._cur_data_segment += 1
                            self._data_qs.append(Queue())
                    elif action == 'reset_data_segment':
                        self._start_data_segment = self._cur_data_segment
                    elif action == 'resume':
                        self._recording = True
                    elif action == 'save':
                        self._save_data(cmd[1], cmd[2], cmd[3])
                    elif action == 'params':
                        self._args = cmd[1]
                        self._kwargs = cmd[2]

                # --- sampling ---------------------------------------------
                if not self._recording or self._ok_q.empty():
                    continue

                timestamp = self._ok_q.get()
                self._tokens_q.put(("take", self.name))

                data = self._data_sampler_method(*self._args, **self._kwargs)

                segment_q = self._data_qs[self._cur_data_segment]
                segment_full = (self._saving_cache
                                and segment_q.qsize() == self._save_every)
                if segment_full:
                    # Flush the full segment to cache and start a fresh queue.
                    self._save_cache(self._cur_data_segment)
                    segment_q = Queue()
                    self._data_qs.append(segment_q)
                    self._cur_data_segment += 1
                segment_q.put((timestamp, data))

                self._tokens_q.put(("return", self.name))

        except KeyboardInterrupt:
            logging.debug("Shutting down data streamer on {0}".format(
                self.name))
            sys.exit(0)
Exemplo n.º 9
0
 def test_process_done_queue(self):
     """`_process_done_queue` collects one entry per repetition.

     The queue is pre-filled with one empty dict per repetition followed by
     one 'STOP' marker per worker before the method under test is called.
     """
     worker_count = 2
     finished = Queue()
     collected = []
     tournament = axelrod.Tournament(
         name=self.test_name,
         players=self.players,
         game=self.game,
         turns=200,
         repetitions=self.test_repetitions)
     for _ in range(self.test_repetitions):
         finished.put({})
     for _ in range(worker_count):
         finished.put('STOP')
     tournament._process_done_queue(worker_count, finished, collected)
     self.assertEqual(len(collected), self.test_repetitions)
Exemplo n.º 10
0
def main():
    """Crawl three seed URLs with two crawlers that share one output file.

    The seed tasks are placed on a shared ``tasks`` queue. Two ``Crawler``
    objects are created with the same queues, event and file handle, then
    started and joined. The output file "test.txt" is opened in append mode
    and is closed even if starting or joining a crawler raises.
    """
    seeds = [
        task("http://www.laurentluce.com/posts/python-threads-synchronization-locks-rlocks-semaphores-conditions-events-and-queues/"),
        task("http://stackoverflow.com/questions/15651128/in-this-semaphore-example-is-it-necessary-to-lock-for-refill-and-buy"),
        task("http://bbs.byr.cn/"),
    ]
    event = Event()
    tasks = Queue()
    pages = Queue()
    for seed in seeds:
        tasks.put(seed)

    # The context manager guarantees the handle is released on every path.
    with open("test.txt", 'a') as f:
        crawlers = [Crawler(tasks, pages, event, f) for _ in range(2)]
        for crawler in crawlers:
            crawler.start()
        # Wait for both crawlers before the file is closed.
        for crawler in crawlers:
            crawler.join()
Exemplo n.º 11
0
    def test_start_workers(self):
        """`_start_workers` yields one 'STOP' marker per worker.

        The work queue is loaded with one item per repetition; the done
        queue is then drained until as many 'STOP' markers as workers have
        been seen.
        """
        workers = 2
        work_queue = Queue()
        done_queue = Queue()
        for repetition in range(self.test_repetitions):
            work_queue.put(repetition)
        tournament = axelrod.Tournament(
            name=self.test_name,
            players=self.players,
            game=self.game,
            turns=200,
            repetitions=self.test_repetitions)
        tournament._start_workers(workers, work_queue, done_queue)

        # Anything that is not a 'STOP' marker is ignored here.
        stops = 0
        while stops < workers:
            if done_queue.get() == 'STOP':
                stops += 1
        self.assertEqual(stops, workers)
Exemplo n.º 12
0
 def _work(cls, input_queue: mp.Queue, transformer: Callable,
           output_queue: mp.Queue):
     """Pipe transformed items from ``input_queue`` to ``output_queue``.

     ``transformer`` receives ``iterate_until_none(input_queue.get)``
     (presumably an iterator that ends at the first ``None``; confirm
     against that helper) and each value it yields is put on
     ``output_queue``. Any exception, including ``BaseException``
     subclasses, is put on the output queue instead of being raised.
     A final ``None`` is put afterwards.
     """
     try:
         source = iterate_until_none(input_queue.get)
         for produced in transformer(source):
             output_queue.put(produced)
     except BaseException as error:
         # Hand the failure to the consumer rather than raising here.
         output_queue.put(error)
     output_queue.put(None)
def run_parallely_with_progress_bar(
    items,
    func,
    msgfunc,
    accumulator=None,
    title=''):
  """Apply ``func`` to every item in worker processes, with a progress bar.

  Five worker processes pull items from a task queue, call ``func(item)``
  and put ``(msgfunc(item), result)`` on a results queue. A separate
  updater process consumes exactly ``len(items)`` results, passes each
  result to ``accumulator.add``, advances the progress bar, then calls
  ``accumulator.finish()`` and sends one 'STOP' marker per worker.

  The function returns as soon as all items are queued; it does not join
  the processes it starts.

  Parameters
  ----------
  items : sized iterable
      Work items; ``len(items)`` must be available.
  func : callable
      Called with one item inside a worker process.
  msgfunc : callable
      Called with one item; its value is shown as the progress message.
  accumulator : object with ``add(result)`` and ``finish()``, optional
      Defaults to a new ``NoOpAcc()`` created on each call.
  title : str
      Passed to ``ProgressBar``.
  """
  # Build the default per call: a default written as ``NoOpAcc()`` in the
  # signature is evaluated once at definition time and shared by all calls.
  if accumulator is None:
    accumulator = NoOpAcc()

  PROC_COUNT = 5

  total = len(items)

  task_queue = Queue()
  done_queue = Queue()

  def pb_updater(inq, results_q):
    """Consume results, update the bar, then tell the workers to stop."""
    pb = ProgressBar(title)
    for i in range(total):
      msg, result = results_q.get()
      accumulator.add(result)
      pb.update(percent=((i+1)*100)/total, message=msg)
    pb.finish()
    accumulator.finish()

    # tell the workers to stop
    for i in range(PROC_COUNT):
      inq.put('STOP')

  def worker(inq, outq):
    """Process items until the 'STOP' marker is read."""
    for item in iter(inq.get, 'STOP'):
      result = func(item)
      outq.put((msgfunc(item), result))

  for i in range(PROC_COUNT):
    Process(target=worker, args=(task_queue, done_queue)).start()

  updater = Process(target=pb_updater, args=(task_queue, done_queue))
  updater.start()
  for item in items:
    task_queue.put(item)
Exemplo n.º 14
0
    def run(self):
        """Starts the run of the tests.

        Every suite is queued for the workers, followed by one ``None``
        marker per worker. One result is then read back per suite, logged
        through ``self.log_result`` and handed to ``self.compile_results``.

        Returns True when there were errors, failures, or no tests run.
        """
        results = []
        worker_list = []
        to_worker = Queue()
        from_worker = Queue()
        options = self.cl_args
        verbose = options.verbose
        failfast = options.failfast
        # One worker when not running in parallel, else the configured count.
        workers = int(not options.parallel) or options.workers

        for suite in self.suites:
            to_worker.put(suite)

        for _ in range(workers):
            to_worker.put(None)

        start = time.time()
        # A second try catch is needed here because queues can cause locking
        # when they go out of scope, especially when termination signals used
        try:
            for _ in range(workers):
                consumer = Consumer(to_worker, from_worker, verbose, failfast)
                worker_list.append(consumer)
                consumer.start()

            for _ in self.suites:
                outcome = from_worker.get()
                results.append(self.log_result(outcome))

            end = time.time()
            tests_run, errors, failures = self.compile_results(
                run_time=end - start,
                datagen_time=start - self.datagen_start,
                results=results)

        except KeyboardInterrupt:
            print_exception("Runner", "run", "Keyboard Interrupt, exiting...")
            # Signal 9 is sent to process group 0 (the caller's own group).
            os.killpg(0, 9)
        return bool(sum([errors, failures, not tests_run]))
Exemplo n.º 15
0
    def test_worker(self):
        """`_worker` emits one match dict per repetition, then 'STOP'.

        Each result is expected to hold 15 entries keyed by tuples with
        list values.
        """
        tournament = axelrod.Tournament(
            name=self.test_name,
            players=self.players,
            game=self.game,
            turns=200,
            repetitions=self.test_repetitions)

        # Load the work queue: one item per repetition, then the sentinel.
        work_queue = Queue()
        for repetition in range(self.test_repetitions):
            work_queue.put(repetition)
        work_queue.put('STOP')

        done_queue = Queue()
        tournament._worker(work_queue, done_queue)

        for _ in range(self.test_repetitions):
            new_matches = done_queue.get()
            self.assertEqual(len(new_matches), 15)
            for index_pair, match in new_matches.items():
                self.assertIsInstance(index_pair, tuple)
                self.assertIsInstance(match, list)

        # The last item the worker leaves behind is the sentinel itself.
        self.assertEqual(done_queue.get(), 'STOP')
Exemplo n.º 16
0
    def run(self):
        """Starts the run of the tests.

        Suites are queued for the workers together with one ``None`` marker
        per worker. One result per suite is collected, logged via
        ``self.log_result`` and summarised by ``self.compile_results``.

        Returns True when there were errors, failures, or no tests run.
        """
        results = []
        worker_list = []
        to_worker = Queue()
        from_worker = Queue()
        args = self.cl_args
        verbose = args.verbose
        failfast = args.failfast
        # Falls back to a single worker when parallel mode is off.
        workers = int(not args.parallel) or args.workers

        for suite in self.suites:
            to_worker.put(suite)

        for _ in range(workers):
            to_worker.put(None)

        start = time.time()
        # A second try catch is needed here because queues can cause locking
        # when they go out of scope, especially when termination signals used
        try:
            for _ in range(workers):
                proc = Consumer(to_worker, from_worker, verbose, failfast)
                worker_list.append(proc)
                proc.start()

            for _ in self.suites:
                raw_result = from_worker.get()
                results.append(self.log_result(raw_result))

            end = time.time()
            elapsed = end - start
            datagen_elapsed = start - self.datagen_start
            tests_run, errors, failures = self.compile_results(
                run_time=elapsed, datagen_time=datagen_elapsed,
                results=results)

        except KeyboardInterrupt:
            print_exception("Runner", "run", "Keyboard Interrupt, exiting...")
            # Signal 9 is sent to process group 0 (the caller's own group).
            os.killpg(0, 9)
        return bool(sum([errors, failures, not tests_run]))
class DataStreamSyncer:
    """Front end that drives a group of data stream recorders in lockstep.

    Commands are sent to a ``_DataStreamSyncer`` object through a command
    queue, and each call is also forwarded to every recorder.
    """

    def __init__(self, data_stream_recorders, frequency=0):
        """
        Instantiates a new DataStreamSyncer

        Parameters
        ----------
            data_stream_recorders : list of DataStreamRecorders to sync
            frequency : float, optional
                Frequency in hz used for ratelimiting. If set to 0
                or less, will not rate limit. Defaults to 0.

        Raises
        ------
            ValueError
                If two recorders share the same name.
        """
        self._cmds_q = Queue()
        self._tokens_q = Queue()
        self._data_stream_recorders = data_stream_recorders

        # One "ok" queue per recorder, keyed by the recorder's name.
        per_recorder_ok = {}
        for rec in self._data_stream_recorders:
            ok_q = Queue()
            name = rec.name
            if name in per_recorder_ok:
                raise ValueError(
                    "Data Stream Recorders must have unique names! "
                    f"{name} is a duplicate!")
            per_recorder_ok[name] = ok_q
            rec._set_qs(ok_q, self._tokens_q)

        self._syncer = _DataStreamSyncer(frequency, per_recorder_ok,
                                         self._cmds_q, self._tokens_q)
        self._syncer.start()

    def start(self):
        """Starts syncer operations"""
        for rec in self._data_stream_recorders:
            rec._start_recording()

    def stop(self):
        """Stops syncer operations. Destroys syncer process."""
        self._cmds_q.put(("stop", ))
        for rec in self._data_stream_recorders:
            rec._stop()
        # Best effort: any error while terminating the syncer is ignored.
        try:
            self._syncer.terminate()
        except Exception:
            pass

    def pause(self):
        """Queue a pause command and pause every recorder."""
        self._cmds_q.put(("pause", ))
        for rec in self._data_stream_recorders:
            rec._pause()

    def resume(self, reset_time=False):
        """Queue a resume command (with ``reset_time``) and resume recorders."""
        command = ("resume", reset_time)
        self._cmds_q.put(command)
        for rec in self._data_stream_recorders:
            rec._resume()

    def reset_time(self):
        """Queue a reset_time command for the syncer."""
        self._cmds_q.put(("reset_time", ))

    def flush(self):
        """Return a dict mapping each recorder's name to its flushed data."""
        collected = {}
        for rec in self._data_stream_recorders:
            flushed = rec._flush()
            collected[rec.name] = flushed
        return collected
Exemplo n.º 18
0
class BaseAggregator(object):
    """Base class for the data aggregator interface. This class is used
    alongside the BaseListener class to spawn an aggregator process that
    combines data from multiple crawl processes. The BaseAggregator class
    manages the child listener process.

    Parameters
    ----------
    manager_params : dict
        TaskManager configuration parameters
    browser_params : list of dict
        List of browser configuration dictionaries"""
    # NOTE(review): `__metaclass__` is only honoured by Python 2.
    __metaclass__ = abc.ABCMeta

    def __init__(self, manager_params, browser_params):
        self.manager_params = manager_params
        self.browser_params = browser_params
        self.listener_address = None
        self.listener_process = None
        self.status_queue = Queue()
        self.shutdown_queue = Queue()
        # Most recent status and the time.time() at which it was received.
        self._last_status = None
        self._last_status_received = None
        self.logger = logging.getLogger('openwpm')

    @abc.abstractmethod
    def save_configuration(self, openwpm_version, browser_version):
        """Save configuration details to the database"""

    @abc.abstractmethod
    def get_next_visit_id(self):
        """Return a unique visit ID to be used as a key for a single visit"""

    @abc.abstractmethod
    def get_next_crawl_id(self):
        """Return a unique crawl ID used as a key for a browser instance"""

    def get_most_recent_status(self):
        """Return the most recent queue size sent from the listener process

        Raises
        ------
        RuntimeError
            If the newest status is older than STATUS_TIMEOUT seconds, or if
            no first status arrives within STATUS_TIMEOUT seconds.
        """

        # Block until we receive the first status update
        if self._last_status is None:
            return self.get_status()

        # Drain status queue until we receive most recent update
        while not self.status_queue.empty():
            self._last_status = self.status_queue.get()
            self._last_status_received = time.time()

        # Check last status signal
        if (time.time() - self._last_status_received) > STATUS_TIMEOUT:
            raise RuntimeError(
                "No status update from DataAggregator listener process "
                "for %d seconds." % (time.time() - self._last_status_received))

        return self._last_status

    def get_status(self):
        """Get listener process status. If the status queue is empty, block.

        Raises
        ------
        RuntimeError
            If nothing arrives on the status queue within STATUS_TIMEOUT
            seconds.
        """
        try:
            self._last_status = self.status_queue.get(block=True,
                                                      timeout=STATUS_TIMEOUT)
            self._last_status_received = time.time()
        except queue.Empty:
            # No status has ever been received when the very first wait times
            # out, so there is no timestamp to subtract from; report the
            # length of the wait instead of failing with a TypeError.
            if self._last_status_received is None:
                silent_for = STATUS_TIMEOUT
            else:
                silent_for = time.time() - self._last_status_received
            raise RuntimeError(
                "No status update from DataAggregator listener process "
                "for %d seconds." % silent_for)
        return self._last_status

    def launch(self, listener_process_runner, *args):
        """Launch the aggregator listener process

        The runner receives the manager parameters, the status queue and the
        shutdown queue, followed by any extra ``args``. Blocks until the
        listener puts its first item on the status queue; that item is
        stored as ``self.listener_address``.
        """
        args = (self.manager_params, self.status_queue,
                self.shutdown_queue) + args
        self.listener_process = Process(target=listener_process_runner,
                                        args=args)
        self.listener_process.daemon = True
        self.listener_process.start()
        self.listener_address = self.status_queue.get()

    def shutdown(self):
        """ Terminate the aggregator listener process

        Puts SHUTDOWN_SIGNAL on the shutdown queue and joins the listener
        with a timeout of 300 before clearing the listener attributes.
        """
        self.logger.debug(
            "Sending the shutdown signal to the %s listener process..." %
            type(self).__name__)
        self.shutdown_queue.put(SHUTDOWN_SIGNAL)
        start_time = time.time()
        self.listener_process.join(300)
        self.logger.debug("%s took %s seconds to close." %
                          (type(self).__name__, str(time.time() - start_time)))
        self.listener_address = None
        self.listener_process = None
Exemplo n.º 19
0
class BaseAggregator(object):
    """Base class for the data aggregator interface. This class is used
    alongside the BaseListener class to spawn an aggregator process that
    combines data from multiple crawl processes. The BaseAggregator class
    manages the child listener process.

    Parameters
    ----------
    manager_params : dict
        TaskManager configuration parameters
    browser_params : list of dict
        List of browser configuration dictionaries"""
    # NOTE(review): `__metaclass__` is only honoured by Python 2.
    __metaclass__ = abc.ABCMeta

    def __init__(self, manager_params, browser_params):
        self.manager_params = manager_params
        self.browser_params = browser_params
        self.logger = loggingclient(*manager_params['logger_address'])
        self.listener_address = None
        self.listener_process = None
        self.status_queue = Queue()
        self.shutdown_queue = Queue()
        # Most recent status and the time.time() at which it was received.
        self._last_status = None
        self._last_status_received = None

    @abc.abstractmethod
    def save_configuration(self, openwpm_version, browser_version):
        """Save configuration details to the database"""

    @abc.abstractmethod
    def get_next_visit_id(self):
        """Return a unique visit ID to be used as a key for a single visit"""

    @abc.abstractmethod
    def get_next_crawl_id(self):
        """Return a unique crawl ID used as a key for a browser instance"""

    def get_most_recent_status(self):
        """Return the most recent queue size sent from the listener process

        Raises
        ------
        RuntimeError
            If the newest status is older than STATUS_TIMEOUT seconds, or if
            no first status arrives within STATUS_TIMEOUT seconds.
        """

        # Block until we receive the first status update
        if self._last_status is None:
            return self.get_status()

        # Drain status queue until we receive most recent update
        while not self.status_queue.empty():
            self._last_status = self.status_queue.get()
            self._last_status_received = time.time()

        # Check last status signal
        if (time.time() - self._last_status_received) > STATUS_TIMEOUT:
            raise RuntimeError(
                "No status update from DataAggregator listener process "
                "for %d seconds." % (time.time() - self._last_status_received)
            )

        return self._last_status

    def get_status(self):
        """Get listener process status. If the status queue is empty, block.

        Raises
        ------
        RuntimeError
            If nothing arrives on the status queue within STATUS_TIMEOUT
            seconds.
        """
        try:
            self._last_status = self.status_queue.get(
                block=True, timeout=STATUS_TIMEOUT)
            self._last_status_received = time.time()
        except queue.Empty:
            # No status has ever been received when the very first wait times
            # out, so there is no timestamp to subtract from; report the
            # length of the wait instead of failing with a TypeError.
            if self._last_status_received is None:
                silent_for = STATUS_TIMEOUT
            else:
                silent_for = time.time() - self._last_status_received
            raise RuntimeError(
                "No status update from DataAggregator listener process "
                "for %d seconds." % silent_for
            )
        return self._last_status

    def launch(self, listener_process_runner, *args):
        """Launch the aggregator listener process

        The runner receives the manager parameters, the status queue and the
        shutdown queue, followed by any extra ``args``. Blocks until the
        listener puts its first item on the status queue; that item is
        stored as ``self.listener_address``.
        """
        args = (self.manager_params, self.status_queue,
                self.shutdown_queue) + args
        self.listener_process = Process(
            target=listener_process_runner,
            args=args
        )
        self.listener_process.daemon = True
        self.listener_process.start()
        self.listener_address = self.status_queue.get()

    def shutdown(self):
        """ Terminate the aggregator listener process

        Puts SHUTDOWN_SIGNAL on the shutdown queue and joins the listener
        with a timeout of 300 before clearing the listener attributes.
        """
        self.logger.debug(
            "Sending the shutdown signal to the %s listener process..." %
            type(self).__name__
        )
        self.shutdown_queue.put(SHUTDOWN_SIGNAL)
        start_time = time.time()
        self.listener_process.join(300)
        self.logger.debug(
            "%s took %s seconds to close." % (
                type(self).__name__,
                str(time.time() - start_time)
            )
        )
        self.listener_address = None
        self.listener_process = None
Exemplo n.º 20
0
class VideoRecorder:
    """ Encapsulates video recording processes.

    Commands are sent to a ``_Camera`` object through ``self._cmd_q``.

    Parameters
    ----------
    camera : object
        camera object; its ``start()`` / ``stop()`` methods are called here
    device_id : int
        USB index of device (accepted but not used by this class)
    res : 2-tuple
        resolution of recording and saving. defaults to (640, 480)
    codec : :obj:`str`
        codec used for encoding video. default to XVID.
    fps : int
        frames per second of video captures. defaults to 30
    """

    def __init__(self,
                 camera,
                 device_id=0,
                 res=(640, 480),
                 codec='XVID',
                 fps=30):
        self._res, self._codec, self._fps = res, codec, fps
        self._cmd_q = Queue()
        self._actual_camera = camera
        # State flags exposed through the read-only properties below.
        self._recording = False
        self._started = False

    @property
    def is_recording(self):
        """True while a recording is in progress."""
        return self._recording

    @property
    def is_started(self):
        """True between ``start()`` and ``stop()``."""
        return self._started

    def start(self):
        """ Starts the camera recording process. """
        self._started = True
        self._actual_camera.start()
        camera_proc = _Camera(self._actual_camera, self._cmd_q, self._res,
                              self._codec, self._fps)
        self._camera = camera_proc
        camera_proc.start()

    def start_recording(self, output_file):
        """ Starts recording to a given output video file.

        Parameters
        ----------
        output_file : :obj:`str`
            filename to write video to

        Raises
        ------
        Exception
            If the recorder has not been started or is already recording.
        """
        if not self._started:
            raise Exception(
                "Must start the video recorder first by calling .start()!")
        if self._recording:
            raise Exception(
                "Cannot record a video while one is already recording!")
        self._recording = True
        command = ('start', output_file)
        self._cmd_q.put(command)

    def stop_recording(self):
        """ Stops writing video to file.

        Raises
        ------
        Exception
            If no recording is in progress.
        """
        if not self._recording:
            raise Exception(
                "Cannot stop a video recording when it's not recording!")
        self._cmd_q.put(('stop', ))
        self._recording = False

    def stop(self):
        """ Stop the camera process.

        Raises
        ------
        Exception
            If the recorder was never started.
        """
        if not self._started:
            raise Exception("Cannot stop a video recorder before starting it!")
        self._started = False
        self._actual_camera.stop()
        self._camera.terminate()
Exemplo n.º 21
0
class DataStreamRecorder(Process):
    """Process that samples a data source on request and buffers the samples.

    Commands reach the running process through an internal command queue.
    While recording, every timestamp received on the "ok" queue (installed
    with ``_set_qs``) triggers one call of the data sampler method; the
    (timestamp, data) pair is stored in the data queue of the current
    segment.  When a cache path is configured, segments are dumped to disk
    by helper processes.
    """

    def __init__(self,
                 name,
                 data_sampler_method,
                 cache_path=None,
                 save_every=50):
        """ Initializes a DataStreamRecorder
        Parameters
        ----------
            name : string
                    User-friendly identifier for this data stream
            data_sampler_method : function
                    Method to call to retrieve data
            cache_path : string, optional
                    Directory in which a sub-folder named after the stream
                    is created for cached segments. Caching is disabled
                    when this is None.
            save_every : int
                    Segment size (number of queued samples) at which the
                    current segment is written to the cache. Only used
                    when caching is enabled.
        """
        Process.__init__(self)
        self._data_sampler_method = data_sampler_method

        self._has_set_sampler_params = False
        self._recording = False

        self._name = name

        self._cmds_q = Queue()
        # One queue per data segment; emptied segments are replaced by None.
        self._data_qs = [Queue()]
        # Both are installed later through _set_qs().
        self._ok_q = None
        self._tokens_q = None

        self._save_every = save_every
        self._cache_path = cache_path
        self._saving_cache = cache_path is not None
        if self._saving_cache:
            self._save_path = os.path.join(cache_path, self.name)
            if not os.path.exists(self._save_path):
                os.makedirs(self._save_path)

        self._start_data_segment = 0
        self._cur_data_segment = 0
        # Helper processes started by _save_cache().
        self._saving_ps = []

    def run(self):
        """Serve queued commands and record samples until 'stop' arrives.

        Each loop iteration handles at most one pending command and, while
        recording, at most one pending timestamp from the ok queue.
        """
        setproctitle('python.DataStreamRecorder.{0}'.format(self._name))
        try:
            logging.debug("Starting data recording on {0}".format(self.name))
            self._tokens_q.put(("return", self.name))
            while True:
                if not self._cmds_q.empty():
                    cmd = self._cmds_q.get()
                    if cmd[0] == 'stop':
                        break
                    elif cmd[0] == 'pause':
                        self._recording = False
                        if self._saving_cache:
                            # Flush the partial segment and open a new one.
                            self._save_cache(self._cur_data_segment)
                            self._cur_data_segment += 1
                            self._data_qs.append(Queue())
                    elif cmd[0] == 'reset_data_segment':
                        self._start_data_segment = self._cur_data_segment
                    elif cmd[0] == 'resume':
                        self._recording = True
                    elif cmd[0] == 'save':
                        self._save_data(cmd[1], cmd[2], cmd[3])
                    elif cmd[0] == 'params':
                        self._args = cmd[1]
                        self._kwargs = cmd[2]

                if self._recording and not self._ok_q.empty():
                    timestamp = self._ok_q.get()
                    # A "take" token is posted before sampling and a
                    # "return" token once the sample has been stored.
                    self._tokens_q.put(("take", self.name))

                    data = self._data_sampler_method(*self._args,
                                                     **self._kwargs)

                    cur_data_q = self._data_qs[self._cur_data_segment]
                    if self._saving_cache and cur_data_q.qsize(
                    ) == self._save_every:
                        # Segment is full: dump it and continue in a new one.
                        self._save_cache(self._cur_data_segment)
                        cur_data_q = Queue()
                        self._data_qs.append(cur_data_q)
                        self._cur_data_segment += 1
                    cur_data_q.put((timestamp, data))

                    self._tokens_q.put(("return", self.name))

        except KeyboardInterrupt:
            logging.debug("Shutting down data streamer on {0}".format(
                self.name))
            sys.exit(0)

    def _extract_q(self, i):
        """Drain data queue ``i`` into a list and drop the queue.

        The slot in ``self._data_qs`` is set to None afterwards.
        """
        q = self._data_qs[i]
        vals = []
        while q.qsize() > 0:
            vals.append(q.get())
        self._data_qs[i] = None
        del q
        return vals

    def _save_data(self, path, cb, concat):
        """Write the recorded data to ``<path>/<name>.jb`` in a helper process.

        With caching enabled the cached segments from the start segment up
        to the current one are handed to ``_caches_to_file``; otherwise the
        contents of data queue 0 are handed to ``_dump``.  ``cb`` and
        ``concat`` are forwarded unchanged to those helpers.
        """
        if not os.path.exists(path):
            os.makedirs(path)
        target_filename = os.path.join(path, "{0}.jb".format(self.name))
        if self._saving_cache:
            # Wait until every cache-writer process has finished.
            while any(p.is_alive() for p in self._saving_ps):
                sleep(1e-3)

            p = Process(target=_caches_to_file,
                        args=(self._save_path, self._start_data_segment,
                              self._cur_data_segment, target_filename, cb,
                              concat))
            p.start()
            self._start_data_segment = self._cur_data_segment
        else:
            data = self._extract_q(0)
            p = Process(target=_dump, args=(data, target_filename, cb))
            p.start()

    def _save_cache(self, i):
        """Dump data queue ``i`` to the cache directory in a helper process.

        Raises
        ------
        Exception
            If no cache path was specified at construction.
        """
        # Test the configuration flag. The previous check used
        # ``self._save_cache`` (this bound method), which is always truthy,
        # so the guard could never fire.
        if not self._saving_cache:
            raise Exception(
                "Cannot save cache if no cache path was specified.")
        logging.debug("Saving cache for {0} block {1}".format(
            self.name, self._cur_data_segment))
        data = self._extract_q(i)
        p = Process(target=_dump_cache,
                    args=(data,
                          os.path.join(self._save_path, "{0}.jb".format(
                              self._cur_data_segment)), self.name,
                          self._cur_data_segment))
        p.start()
        self._saving_ps.append(p)

    def _start_recording(self, *args, **kwargs):
        """ Starts recording

        Discards anything still waiting in the command queue and in the
        current data queue, stores the sampler arguments and starts the
        process.

        Parameters
        ----------
            *args : any
                    Ordinary args used for calling the specified data sampler method
            **kwargs : any
                    Keyword args used for calling the specified data sampler method
        """
        while not self._cmds_q.empty():
            self._cmds_q.get_nowait()
        while not self._data_qs[self._cur_data_segment].empty():
            self._data_qs[self._cur_data_segment].get_nowait()

        self._args = args
        self._kwargs = kwargs

        self._recording = True
        self.start()

    @property
    def name(self):
        """User-friendly identifier given at construction."""
        return self._name

    def _set_qs(self, ok_q, tokens_q):
        """Install the timestamp ("ok") queue and the token queue used by run()."""
        self._ok_q = ok_q
        self._tokens_q = tokens_q

    def _flush(self):
        """ Returns a list of all current data

        With caching enabled nothing is returned; a 'reset_data_segment'
        command is queued instead.

        Raises
        ------
        Exception
            If called while recording.
        """
        if self._recording:
            raise Exception("Cannot flush data queue while recording!")
        if self._saving_cache:
            logging.warning(
                "Flush when using cache means unsaved data will be lost and not returned!"
            )
            self._cmds_q.put(("reset_data_segment", ))
        else:
            data = self._extract_q(0)
            return data

    def save_data(self, path, cb=_NULL, concat=True):
        """Queue a 'save' command that writes the recorded data under ``path``.

        Raises
        ------
        Exception
            If called while recording.
        """
        if self._recording:
            raise Exception("Cannot save data while recording!")
        self._cmds_q.put(("save", path, cb, concat))

    def _stop(self):
        """ Stops recording by queueing 'pause' and then 'stop'. Returns nothing."""
        self._pause()
        self._cmds_q.put(("stop", ))
        # NOTE(review): no ``_recorder`` attribute is assigned anywhere in
        # this class, so this call appears to always raise AttributeError,
        # which is swallowed below. Confirm whether ``self.terminate()`` was
        # intended before changing it.
        try:
            self._recorder.terminate()
        except Exception:
            pass
        self._recording = False

    def _pause(self):
        """ Pauses recording """
        self._cmds_q.put(("pause", ))
        self._recording = False

    def _resume(self):
        """ Resumes recording """
        self._cmds_q.put(("resume", ))
        self._recording = True

    def change_data_sampler_params(self, *args, **kwargs):
        """ Changes args and kwargs for data sampler method
        Parameters
        ----------
            *args : any
                    Ordinary args used for calling the specified data sampler method
            **kwargs : any
                    Keyword args used for calling the specified data sampler method
        """
        self._cmds_q.put(('params', args, kwargs))
Exemplo n.º 22
0
class Fibratus(object):
    """Fibratus entrypoint.

    Wires together the kernel trace controller, the kernel event
    stream collector, the event parsers and the configured outputs.
    Unless disabled, the system handles are enumerated while the
    instance is being constructed.

    """
    def __init__(self, filament, **kwargs):
        """Build every component needed to run a kernel trace.

        Parameters
        ----------

        filament:
            Optional filament. When truthy it is handed the kernel
            event queue, the log file path and the built outputs.
        kwargs:
            ``cswitch`` (default False) is forwarded to
            ``KTraceProps.enable_kflags``. ``enum_handles``
            (default True) decides whether handles are queried here.
        """
        self._start = datetime.now()

        cswitch_enabled = kwargs.pop('cswitch', False)
        handles_wanted = kwargs.pop('enum_handles', True)

        log_path = os.path.join(os.path.expanduser('~'), '.fibratus',
                                'fibratus.log')
        try:
            FileHandler(log_path, mode='w+').push_application()
            StreamHandler(sys.stdout).push_application()
        except PermissionError:
            panic(
                "ERROR - Unable to open log file for writing due to permission error"
            )

        self.logger = Logger(Fibratus.__name__)
        self._config = YamlConfig()
        self.logger.info('Starting...')

        # trace controller and its properties
        self.kcontroller = KTraceController()
        self.ktrace_props = KTraceProps()
        self.ktrace_props.enable_kflags(cswitch=cswitch_enabled)
        self.ktrace_props.logger_name = etw.KERNEL_LOGGER_NAME

        # handles are queried once, before
        # the kernel trace is started
        self.handle_repository = HandleRepository()
        self._handles = []
        if handles_wanted:
            self.logger.info('Enumerating system handles...')
            self._handles = self.handle_repository.query_handles()
            self.logger.info('%s handles found' % len(self._handles))
            self.handle_repository.free_buffers()
        self.thread_registry = ThreadRegistry(self.handle_repository,
                                              self._handles)

        self.kevt_streamc = KEventStreamCollector(
            etw.KERNEL_LOGGER_NAME.encode())
        skipped_images = self._config.image_skips
        if len(skipped_images) > 0:
            self.logger.info("Adding skips for images %s" % skipped_images)
            for image in skipped_images:
                self.kevt_streamc.add_skip(image)

        self.kevent = KEvent(self.thread_registry)
        self.keventq = Queue()

        self._output_classes = {
            'console': ConsoleOutput,
            'amqp': AmqpOutput,
            'smtp': SmtpOutput,
            'elasticsearch': ElasticsearchOutput,
        }
        self._outputs = self._construct_outputs()

        self._filament = filament
        if filament:
            filament.keventq = self.keventq
            filament.logger = log_path
            filament.setup_adapters(self._outputs)

        # parsers / registries which enrich the shared kernel event
        self.fsio = FsIO(self.kevent, self._handles)
        self.hive_parser = HiveParser(self.kevent, self.thread_registry)
        self.tcpip_parser = TcpIpParser(self.kevent)
        self.dll_repository = DllRepository(self.kevent)
        self.context_switch_registry = ContextSwitchRegistry(
            self.thread_registry, self.kevent)

        self.output_kevents = {}
        self.filters_count = 0

    def run(self):
        """Start the kernel trace and open the kernel event stream.

        An ``atexit`` hook which stops the trace is registered first.
        The filament, if any, is started before the stream is opened.
        """
        def _stop_on_exit():
            self.stop_ktrace()

        atexit.register(_stop_on_exit)

        self.kcontroller.start_ktrace(etw.KERNEL_LOGGER_NAME,
                                      self.ktrace_props)

        if self._filament:
            self._filament.start()

        def on_kstream_open():
            # the startup time is only
            # reported without a filament
            if self._filament is not None:
                return
            elapsed = datetime.now() - self._start
            self.logger.info('Started in %s' %
                             str(timedelta(seconds=elapsed.seconds)))

        self.kevt_streamc.set_kstream_open_callback(on_kstream_open)
        self._open_kstream()

    def _open_kstream(self):
        """Opens the kernel stream with `_on_next_kevent` as callback.

        A keyboard interrupt stops the trace; any other exception
        is logged.
        """
        try:
            self.kevt_streamc.open_kstream(self._on_next_kevent)
        except KeyboardInterrupt:
            self.stop_ktrace()
        except Exception as err:
            self.logger.error(err)

    def _construct_outputs(self):
        """Instantiates output classes.

        Every entry of the configured outputs is keyed by the output
        name. Entries whose name is a known output class are
        instantiated with the entry's configuration as keyword
        arguments.

        Returns
        -------

        dict
            Output instances indexed by output name.
        """
        instances = {}
        for entry in self._config.outputs:
            name = next(iter(entry.keys()), None)
            if not name or name not in self._output_classes:
                continue
            self.logger.info("Initializing %s output" % name)
            factory = self._output_classes[name]
            instances[name] = factory(**entry[name])
        return instances

    def stop_ktrace(self):
        """Closes the filament (if any), the trace and the stream."""
        self.logger.info('Stopping fibratus...')
        filament = self._filament
        if filament:
            filament.close()
        self.kcontroller.stop_ktrace(self.ktrace_props)
        self.kevt_streamc.close_kstream()

    def add_filters(self, kevent_filters, **kwargs):
        """Registers the pid filter and the kernel event filters.

        Parameters
        ----------

        kevent_filters: list
            Names of the kernel events requested by the user.
        kwargs:
            ``pid`` (default None) is passed to the pid filter.
        """
        streamc = self.kevt_streamc
        streamc.add_pid_filter(kwargs.pop('pid', None))
        if len(kevent_filters) == 0:
            return

        self.filters_count = len(kevent_filters)

        # include the basic filters
        # that are essential to the
        # rest of kernel events
        for ktuple in (ENUM_PROCESS, ENUM_THREAD, ENUM_IMAGE,
                       REG_CREATE_KCB, REG_DELETE_KCB):
            streamc.add_ktuple_filter(ktuple)

        # these kevents are necessary for consistent state
        # of the trace. If the user doesn't include them
        # in a filter list, then we do the job but set the
        # kernel event type as not eligible for rendering
        lifecycle_kevents = (
            (KEvents.CREATE_PROCESS, CREATE_PROCESS),
            (KEvents.CREATE_THREAD, CREATE_THREAD),
            (KEvents.TERMINATE_PROCESS, TERMINATE_PROCESS),
            (KEvents.TERMINATE_THREAD, TERMINATE_THREAD),
        )
        for kname, ktuple in lifecycle_kevents:
            requested = kname in kevent_filters
            if not requested:
                streamc.add_ktuple_filter(ktuple)
            self.output_kevents[ktuple] = requested

        for kevent_filter in kevent_filters:
            resolved = kname_to_tuple(kevent_filter)
            ktuples = resolved if isinstance(resolved, list) else [resolved]
            for ktuple in ktuples:
                streamc.add_ktuple_filter(ktuple)
                self.output_kevents.setdefault(ktuple, True)

    def _on_next_kevent(self, ktype, cpuid, ts, kparams):
        """Callback which fires when new kernel event arrives.

        The shared kernel event is updated, the event is routed to
        the parser / registry of its category and, when a filament
        is attached, a dictionary describing the event is pushed to
        the kernel event queue.

        Parameters
        ----------

        ktype: tuple
            Kernel event type.
        cpuid: int
            Identifies the CPU core where the event
            has been captured.
        ts: str
            Temporal reference of the kernel event.
        kparams: dict
            Kernel event's parameters.
        """
        kevent = self.kevent

        # initialize kernel event properties
        kevent.ts = ts
        kevent.cpuid = cpuid
        kevent.name = ktuple_to_name(ktype)
        kparams = ddict(kparams)

        # thread / process kernel events
        if ktype in (CREATE_PROCESS, CREATE_THREAD, ENUM_PROCESS,
                     ENUM_THREAD):
            self.thread_registry.add_thread(ktype, kparams)
            if ktype in (CREATE_PROCESS, CREATE_THREAD):
                self.thread_registry.init_thread_kevent(
                    kevent, ktype, kparams)
                self._aggregate(ktype)
        elif ktype in (TERMINATE_PROCESS, TERMINATE_THREAD):
            self.thread_registry.init_thread_kevent(kevent, ktype, kparams)
            self._aggregate(ktype)
            self.thread_registry.remove_thread(ktype, kparams)

        # file system/disk kernel events
        elif ktype in (CREATE_FILE, DELETE_FILE, CLOSE_FILE, READ_FILE,
                       WRITE_FILE):
            self.fsio.parse_fsio(ktype, kparams)
            self._aggregate(ktype)

        # dll kernel events
        elif ktype in (LOAD_IMAGE, ENUM_IMAGE):
            self.dll_repository.register_dll(kparams)
            if ktype == LOAD_IMAGE:
                self._aggregate(ktype)
        elif ktype == UNLOAD_IMAGE:
            self.dll_repository.unregister_dll(kparams)
            self._aggregate(ktype)

        # registry kernel events
        elif ktype == REG_CREATE_KCB:
            self.hive_parser.add_kcb(kparams)
        elif ktype == REG_DELETE_KCB:
            self.hive_parser.remove_kcb(kparams.key_handle)
        elif ktype in (REG_CREATE_KEY, REG_DELETE_KEY, REG_OPEN_KEY,
                       REG_QUERY_KEY, REG_SET_VALUE, REG_DELETE_VALUE,
                       REG_QUERY_VALUE):
            self.hive_parser.parse_hive(ktype, kparams)
            self._aggregate(ktype)

        # network kernel events
        elif ktype in (SEND_SOCKET_TCPV4, SEND_SOCKET_UDPV4,
                       RECV_SOCKET_TCPV4, RECV_SOCKET_UDPV4,
                       ACCEPT_SOCKET_TCPV4, CONNECT_SOCKET_TCPV4,
                       DISCONNECT_SOCKET_TCPV4, RECONNECT_SOCKET_TCPV4):
            self.tcpip_parser.parse_tcpip(ktype, kparams)
            self._aggregate(ktype)

        # context switch events
        elif ktype == CONTEXT_SWITCH:
            self.context_switch_registry.next_cswitch(cpuid, ts, kparams)
            self._aggregate(ktype)

        if not self._filament:
            return

        # enumeration / kcb events are never
        # forwarded to the filament
        if ktype in (ENUM_PROCESS, ENUM_THREAD, ENUM_IMAGE, REG_CREATE_KCB,
                     REG_DELETE_KCB):
            return

        eligible = self.output_kevents.get(ktype, False)
        if not (kevent.name and eligible):
            return

        # push the kernel event dict to the queue
        # from where the filament polls for events
        thread = kevent.thread
        self.keventq.put({
            'params': kevent.params,
            'name': kevent.name,
            'pid': kevent.pid,
            'tid': kevent.tid,
            'timestamp': kevent.ts,
            'cpuid': kevent.cpuid,
            'category': kevent.category,
            'thread': {
                'name': thread.name,
                'exe': thread.exe,
                'comm': thread.comm,
                'pid': thread.pid,
                'ppid': thread.ppid,
            },
        })

    def _aggregate(self, ktype):
        """Aggregates the kernel event to the output sink.

        Nothing happens when a filament is attached. Otherwise the
        event is emitted if its type is flagged in `output_kevents`,
        or, for unknown types, if no filters have been registered.

        Parameters
        ----------

        ktype: tuple
            Identifier of the kernel event
        """
        if self._filament:
            return

        if ktype in self.output_kevents:
            renderable = self.output_kevents[ktype]
        else:
            renderable = self.filters_count == 0

        if renderable:
            self.kevent.inc_kid()
            self._emit()

    def _emit(self):
        """Sends the current kernel event to every output.

        The console output receives the kernel event itself. The
        other outputs receive a dictionary built from it.
        """
        for output in self._outputs.values():
            if isinstance(output, ConsoleOutput):
                output.emit(self.kevent)
                continue
            pid, proc = self.kevent.get_thread()
            output.emit({
                'id': self.kevent.kid,
                'timestamp':
                self.kevent.ts.strftime('%Y-%m-%d %H:%M:%S.%f'),
                'cpuid': self.kevent.cpuid,
                'proc': proc,
                'pid': pid,
                'name': self.kevent.name,
                'category': self.kevent.category,
                'params': self.kevent.params
            })
class MultiprocessorFitnessCaller:
    """
    Fitness caller used for multiprocessor parallelism.

    Locations queued with ``add`` are sent to worker processes through a
    task queue by ``evaluate``, which collects one result per task from a
    result queue.  Usable as a context manager: leaving the ``with`` block
    calls ``finish``.

    Arguments
    ---------

    num_workers : int
        Number of worker nodes to create.

    """
    def __init__(self, num_workers):
        self.num_workers = num_workers
        self.problem = None

        # Running statistics updated by evaluate().
        self.total_tasks = 0
        self.total_groups = 0
        self.max_group_size = 0

        # Pending [location, userdata] pairs.
        self.tasks = []

        self.num_calls = 0
        self.max_nodes = 0
        self.min_num_calls = 0

        self.processes = []

        self.task_queue = Queue()
        self.result_queue = Queue()

    def __enter__(self):
        return self

    def set_problem(self, problem):
        """
        Sets the problem object to use to calculate the fitness.

        The problem is stored on the caller and ``num_workers`` worker
        processes running ``multiprocessor_process`` are started with it.

        Arguments
        ---------

        problem
            Problem object implementing the fitness method.
        """
        self.problem = problem
        for _ in range(self.num_workers):
            p = Process(target=multiprocessor_process,
                        args=(problem, self.task_queue, self.result_queue))
            p.start()
            self.processes.append(p)

    def add(self, location, userdata):
        """
        Add a location to be evaluated.

        Arguments
        ---------

        location : numpy array
            Location to be evaluated.

        userdata
            User data to be returned with the evaluation result.
        """
        self.tasks.append([location, userdata])

    def evaluate(self):
        """
        Evaluates all the locations.

        Blocks until one result per queued task has been received.
        Results are returned in the order the workers deliver them,
        which is not necessarily the order in which they were added.

        Returns
        -------

        list of (location, value, userdata) tuples
            Tuples containing the location, value and corresponding user data
        """
        tasks = self.tasks
        num_tasks = len(tasks)

        self.total_tasks += num_tasks
        self.total_groups += 1
        if num_tasks > self.max_group_size:
            self.max_group_size = num_tasks

        for index, task in enumerate(tasks):
            self.task_queue.put([index, task[0]])  # [index, loc]

        results = []
        for _ in range(num_tasks):
            result = self.result_queue.get()  # [index, y]
            index = result[0]
            y = result[1]
            results.append((
                tasks[index][0],
                y,
                tasks[index][1],
            ))

        self.tasks = []
        return results

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.finish()

    def finish(self):
        """
        Terminates the fitness caller.

        Queues one ``[-1, -1]`` task per worker process (presumably the
        shutdown sentinel understood by ``multiprocessor_process``), then
        joins every process.  The process list is cleared afterwards so
        that a second call, for example an explicit ``finish()`` followed
        by leaving the ``with`` block, does not queue further sentinels.
        """
        for _ in self.processes:
            self.task_queue.put([-1, -1])
        for p in self.processes:
            p.join()
        self.processes = []
Exemplo n.º 24
0
 def _collect(self, upstream: Iterable, input_queue: mp.Queue):
     """Put every upstream item on input_queue, then one None per process.

     ``self.processes`` gives the number of trailing ``None`` entries.
     """
     enqueue = input_queue.put
     for element in upstream:
         enqueue(element)
     for sentinel in [None] * self.processes:
         enqueue(sentinel)
Exemplo n.º 25
0
class TaskManager:
    """
    User-facing Class for interfacing with OpenWPM
    The TaskManager spawns several child processes to run the automation tasks.
        - DataAggregator to aggregate data in a SQLite database
        - MPLogger to aggregate logs across processes
        - BrowserManager processes to isolate Browsers in a separate process
    <manager_params> dict of TaskManager configuration parameters
    <browser_params> is a list of (or a single) dictionaries that specify preferences for browsers to instantiate
    <process_watchdog> will monitor firefox and Xvfb processes, killing any not indexed in TaskManager's browser list.
        NOTE: Only run this in isolated environments. It kills processes by name, indiscriminately.
    """

    def __init__(self, manager_params, browser_params, process_watchdog=False):
        """
        Prepare the crawl environment and launch all helper processes.

        <manager_params> is modified in place: directory entries get '~'
        expanded, file names are joined onto their directories and the
        logger / aggregator addresses are added.
        <browser_params> must contain exactly manager_params['num_browsers']
        entries, otherwise an Exception is raised.
        <process_watchdog> is stored and read by the manager watchdog thread.
        """

        # Expand '~' in the directory settings and derive the paths below them
        for path in ['data_directory','log_directory']:
            if manager_params[path] is not None:
                manager_params[path] = os.path.expanduser(manager_params[path])
        manager_params['database_name'] = os.path.join(manager_params['data_directory'],manager_params['database_name'])
        manager_params['log_file'] = os.path.join(manager_params['log_directory'],manager_params['log_file'])
        manager_params['screenshot_path'] = os.path.join(manager_params['data_directory'], 'screenshots')
        manager_params['source_dump_path'] = os.path.join(manager_params['data_directory'], 'sources')
        self.manager_params = manager_params

        # Create data directories if they do not exist
        if not os.path.exists(manager_params['screenshot_path']):
            os.makedirs(manager_params['screenshot_path'])
        if not os.path.exists(manager_params['source_dump_path']):
            os.makedirs(manager_params['source_dump_path'])

        # check size of parameter dictionary
        self.num_browsers = manager_params['num_browsers']
        if len(browser_params) != self.num_browsers:
            raise Exception("Number of <browser_params> dicts is not the same as manager_params['num_browsers']")

        # Flow control
        self.closing = False
        self.failure_status = None
        self.threadlock = threading.Lock()
        self.failurecount = 0
        # Default failure limit when none is configured: 2 per browser plus 10
        if manager_params['failure_limit'] is not None:
            self.failure_limit = manager_params['failure_limit']
        else:
            self.failure_limit = self.num_browsers * 2 + 10

        self.process_watchdog = process_watchdog

        # sets up the crawl data database
        db_path = manager_params['database_name']
        if not os.path.exists(manager_params['data_directory']):
            os.mkdir(manager_params['data_directory'])
        self.db = sqlite3.connect(db_path)
        # schema.sql is read from the directory containing this module
        with open(os.path.join(os.path.dirname(__file__), 'schema.sql'), 'r') as f:
            self.db.executescript(f.read())
        self.db.commit()

        # sets up logging server + connect a client
        # (logging_status_queue is presumably replaced by
        # _launch_loggingserver(), since .get() is called on it below)
        self.logging_status_queue = None
        self.loggingserver = self._launch_loggingserver()
        # socket location: (address, port)
        self.manager_params['logger_address'] = self.logging_status_queue.get()
        self.logger = MPLogger.loggingclient(*self.manager_params['logger_address'])

        # Mark if LDBAggregator is needed (if js is enabled on any browser)
        self.ldb_enabled = False
        for params in browser_params:
            if params['save_javascript'] or params['save_javascript_proxy']:
                self.ldb_enabled = True
                break

        # Initialize the data aggregators
        self._launch_aggregators()

        # open client socket
        self.sock = clientsocket(serialization='dill')
        self.sock.connect(*self.manager_params['aggregator_address'])

        self._save_configuration(browser_params)

        # read the last used site visit id
        cur = self.db.cursor()
        cur.execute("SELECT MAX(visit_id) from site_visits")
        last_visit_id = cur.fetchone()[0]
        if last_visit_id is None:
            last_visit_id = 0
        self.next_visit_id = last_visit_id + 1

        # sets up the BrowserManager(s) + associated queues
        self.browsers = self._initialize_browsers(browser_params)  # List of the Browser(s)
        self._launch_browsers()

        # start the manager watchdog (daemon thread running _manager_watchdog)
        thread = threading.Thread(target=self._manager_watchdog, args=())
        thread.daemon = True
        thread.start()

    def _save_configuration(self, browser_params):
        """ Saves crawl configuration details to db and logfile

        Inserts one `task` row for this manager and one `crawl` row per
        browser. The id of each inserted crawl row is written back into
        the matching <browser_params> dict under 'crawl_id'.
        """
        cur = self.db.cursor()

        # Get git version and commit information
        openwpm_v, browser_v = get_version()

        # Record task details
        cur.execute(("INSERT INTO task "
                     "(manager_params, openwpm_version, browser_version) "
                     "VALUES (?,?,?)"),
                (json.dumps(self.manager_params), openwpm_v, browser_v))
        self.db.commit()
        self.task_id = cur.lastrowid

        # Record browser details for each browser
        # (range instead of xrange so the loop runs on Python 2 and 3)
        for i in range(self.num_browsers):
            cur.execute("INSERT INTO crawl (task_id, browser_params) VALUES (?,?)",
                        (self.task_id, json.dumps(browser_params[i])))
            self.db.commit()
            browser_params[i]['crawl_id'] = cur.lastrowid

        # Print the configuration details
        self.logger.info(get_configuration_string(self.manager_params,
                                                  browser_params,
                                                  (openwpm_v, browser_v)))

    def _initialize_browsers(self, browser_params):
        """ initialize the browser classes, each its unique set of parameters

        Returns a list with one Browser per index below self.num_browsers,
        built from self.manager_params and the matching <browser_params> entry.
        """
        # range instead of xrange so this runs on Python 2 and 3
        return [Browser(self.manager_params, browser_params[i])
                for i in range(self.num_browsers)]

    def _launch_browsers(self):
        """
        Launch the browser manager of every browser in self.browsers.

        An exception raised by a launch triggers
        _cleanup_before_fail(during_init=True) and is re-raised. A launch
        that reports failure closes the TaskManager and stops the loop.
        After each successful launch the browser's screen resolution and
        user agent string are sent over self.sock as an UPDATE for its
        crawl row.
        """
        update_query = "UPDATE crawl SET screen_res = ?, ua_string = ? \
                             WHERE crawl_id = ?"

        for browser in self.browsers:
            try:
                launched = browser.launch_browser_manager()
            except:
                # clean up on any failure, then let the error propagate
                self._cleanup_before_fail(during_init=True)
                raise

            if not launched:
                self.logger.critical("Browser spawn failure during TaskManager initialization, exiting...")
                self.close()
                break

            # The settings live on each Browser instance; both values are
            # stored as strings
            settings = browser.browser_settings
            query_args = (str(settings['screen_res']),
                          str(settings['ua_string']),
                          browser.crawl_id)
            self.sock.send((update_query, query_args))

    def _manager_watchdog(self):
        """
        Loop until self.closing is set, sleeping 10 seconds per round.

        Every round:
        - flags each browser whose process memory exceeds
          BROWSER_MEMORY_LIMIT (in MB) for restart
        - if self.process_watchdog is set, kills 'firefox' / 'Xvfb'
          processes older than 300 seconds whose pid is not tracked in
          self.browsers
        """
        bytes_per_mb = float(2 ** 20)

        while not self.closing:
            time.sleep(10)

            # Check browser memory usage
            for browser in self.browsers:
                try:
                    usage = psutil.Process(browser.browser_pid).memory_info()
                    mem = usage[0] / bytes_per_mb
                    if mem > BROWSER_MEMORY_LIMIT:
                        self.logger.info("BROWSER %i: Memory usage: %iMB, exceeding limit of %iMB"
                            % (browser.crawl_id, int(mem), BROWSER_MEMORY_LIMIT))
                        browser.restart_required = True
                except psutil.NoSuchProcess:
                    # the browser process is gone; nothing to measure
                    pass

            if not self.process_watchdog:
                continue

            # Check for browsers or displays that were not closed correctly
            # Provide a 300 second buffer to avoid killing freshly launched browsers
            # TODO This buffer should correspond to the maximum browser spawn timeout
            check_time = time.time()
            browser_pids = set(b.browser_pid for b in self.browsers
                               if b.browser_pid is not None)
            display_pids = set(b.display_pid for b in self.browsers
                               if b.display_pid is not None)

            for process in psutil.process_iter():
                if not (process.create_time() + 300 < check_time):
                    continue
                untracked = (process.name() == 'firefox' and
                             process.pid not in browser_pids)
                if not untracked:
                    untracked = (process.name() == 'Xvfb' and
                                 process.pid not in display_pids)
                if not untracked:
                    continue
                self.logger.debug("Process: %s (pid: %i) with start time %s found running but not in browser process list. Killing."
                        % (process.name(), process.pid, process.create_time()))
                process.kill()

    def _launch_aggregators(self):
        """
        Start the data aggregator child processes, which serialize data from
        all processes, and record where they can be reached.

        * DataAggregator - sqlite database for crawl data (always started)
        * LevelDBAggregator - leveldb database for javascript files (only
          started when `self.ldb_enabled` is set)

        Each aggregator reports its socket location, (address, port), through
        its status queue; the value is stored in `self.manager_params`.
        """
        params = self.manager_params

        # DataAggregator
        data_status = Queue()
        self.aggregator_status_queue = data_status
        self.data_aggregator = Process(
            target=DataAggregator.DataAggregator,
            args=(params, data_status))
        self.data_aggregator.daemon = True
        self.data_aggregator.start()
        # Blocks until the aggregator announces its socket location
        params['aggregator_address'] = data_status.get()

        if not self.ldb_enabled:
            return

        # LevelDB Aggregator
        ldb_status = Queue()
        self.ldb_status_queue = ldb_status
        self.ldb_aggregator = Process(
            target=LevelDBAggregator.LevelDBAggregator,
            args=(params, ldb_status))
        self.ldb_aggregator.daemon = True
        self.ldb_aggregator.start()
        params['ldb_address'] = ldb_status.get()

    def _kill_aggregators(self):
        """ Terminates the aggregators gracefully """
        # DataAggregator
        self.logger.debug("Telling the DataAggregator to shut down...")
        self.aggregator_status_queue.put("DIE")
        start_time = time.time()
        self.data_aggregator.join(300)
        self.logger.debug("DataAggregator took " + str(time.time() - start_time) + " seconds to close")

        # LevelDB Aggregator
        if self.ldb_enabled:
            self.logger.debug("Telling the LevelDBAggregator to shut down...")
            self.ldb_status_queue.put("DIE")
            start_time = time.time()
            self.ldb_aggregator.join(300)
            self.logger.debug("LevelDBAggregator took " + str(time.time() - start_time) + " seconds to close")

    def _launch_loggingserver(self):
        """
        Sets up the logging server in a daemon child process.

        A fresh status queue is stored on `self.logging_status_queue` and
        handed to the server together with the configured log file path.
        Returns the started process.
        """
        status_queue = Queue()
        self.logging_status_queue = status_queue
        server_args = (self.manager_params['log_file'], status_queue, )
        server = Process(target=MPLogger.loggingserver, args=server_args)
        server.daemon = True
        server.start()
        return server

    def _kill_loggingserver(self):
        """ terminates logging server gracefully """
        self.logging_status_queue.put("DIE")
        self.loggingserver.join(300)

    def _shutdown_manager(self, failure=False, during_init=False):
        """
        Wait for current commands to finish, close all child processes and
        threads
        <failure> flag to indicate manager failure (True) or end of crawl (False)
        <during_init> flag to indicator if this shutdown is occuring during the TaskManager initialization
        """
        self.closing = True

        for browser in self.browsers:
            browser.shutdown_browser(during_init)
            if failure:
                self.sock.send(("UPDATE crawl SET finished = -1 WHERE crawl_id = ?",
                                (browser.crawl_id,)))
            else:
                self.sock.send(("UPDATE crawl SET finished = 1 WHERE crawl_id = ?",
                                (browser.crawl_id,)))

        self.db.close()  # close db connection
        self.sock.close()  # close socket to data aggregator
        self._kill_aggregators()
        self._kill_loggingserver()

    def _cleanup_before_fail(self, during_init=False):
        """
        Execute shutdown commands before throwing an exception
        This should keep us from having a bunch of hanging processes
        and incomplete data.
        <during_init> flag to indicator if this shutdown is occuring during
                      the TaskManager initialization
        """
        self._shutdown_manager(failure=True, during_init=during_init)

    def _check_failure_status(self):
        """ Check the status of command failures. Raise exceptions as necessary

        The failure status property is used by the various asynchronous
        command execution threads which interface with the
        remote browser manager processes. If a failure status is found, the
        appropriate steps are taken to gracefully close the infrastructure
        """
        self.logger.debug("Checking command failure status indicator...")
        if self.failure_status:
            self.logger.debug("TaskManager failure status set, halting command execution.")
            self._cleanup_before_fail()
            if self.failure_status['ErrorType'] == 'ExceedCommandFailureLimit':
                raise CommandExecutionError(
                    "TaskManager exceeded maximum consecutive command "
                    "execution failures.", self.failure_status['CommandSequence']
                )
            elif self.failure_status['ErrorType'] == 'ExceedLaunchFailureLimit':
                raise CommandExecutionError(
                    "TaskManager failed to launch browser within allowable "
                    "failure limit.", self.failure_status['CommandSequence']
                )
            if self.failure_status['ErrorType'] == 'CriticalChildException':
                reraise(*cPickle.loads(self.failure_status['Exception']))

    # CRAWLER COMMAND CODE

    def _distribute_command(self, command_sequence, index=None):
        """
        parses command type and issues command(s) to the proper browser
        <index> specifies the type of command this is:
        = None  -> first come, first serve
        =  #    -> index of browser to send command to
        = *     -> sends command to all browsers
        = **    -> sends command to all browsers (synchronized)
        """
        if index is None:
            #send to first browser available
            command_executed = False
            while True:
                for browser in self.browsers:
                    if browser.ready():
                        browser.current_timeout = command_sequence.total_timeout
                        thread = self._start_thread(browser, command_sequence)
                        command_executed = True
                        break
                if command_executed:
                    break
                time.sleep(SLEEP_CONS)

        elif 0 <= index < len(self.browsers):
            #send the command to this specific browser
            while True:
                if self.browsers[index].ready():
                    self.browsers[index].current_timeout = command_sequence.total_timeout
                    thread = self._start_thread(self.browsers[index], command_sequence)
                    break
                time.sleep(SLEEP_CONS)
        elif index == '*':
            #send the command to all browsers
            command_executed = [False] * len(self.browsers)
            while False in command_executed:
                for i in xrange(len(self.browsers)):
                    if self.browsers[i].ready() and not command_executed[i]:
                        self.browsers[i].current_timeout = command_sequence.total_timeout
                        thread = self._start_thread(self.browsers[i], command_sequence)
                        command_executed[i] = True
                time.sleep(SLEEP_CONS)
        elif index == '**':
            #send the command to all browsers and sync it
            condition = threading.Condition()  # Used to block threads until ready
            command_executed = [False] * len(self.browsers)
            while False in command_executed:
                for i in xrange(len(self.browsers)):
                    if self.browsers[i].ready() and not command_executed[i]:
                        self.browsers[i].current_timeout = command_sequence.total_timeout
                        thread = self._start_thread(self.browsers[i], command_sequence, condition)
                        command_executed[i] = True
                time.sleep(SLEEP_CONS)
            with condition:
                condition.notifyAll()  # All browsers loaded, tell them to start
        else:
            self.logger.info("Command index type is not supported or out of range")
            return

        if command_sequence.blocking:
            thread.join()
            self._check_failure_status()

    def _start_thread(self, browser, command_sequence, condition=None):
        """  starts the command execution thread """

        # Check status flags before starting thread
        if self.closing:
            self.logger.error("Attempted to execute command on a closed TaskManager")
            return
        self._check_failure_status()

        browser.set_visit_id(self.next_visit_id)
        self.sock.send(("INSERT INTO site_visits (visit_id, crawl_id, site_url) VALUES (?,?,?)",
                        (self.next_visit_id, browser.crawl_id, command_sequence.url)))
        self.next_visit_id += 1

        # Start command execution thread
        args = (browser, command_sequence, condition)
        thread = threading.Thread(target=self._issue_command, args=args)
        browser.command_thread = thread
        thread.daemon = True
        thread.start()
        return thread

    def _issue_command(self, browser, command_sequence, condition=None):
        """
        Sends each command tuple of <command_sequence> to the BrowserManager
        of <browser> and waits for its reply.

        <browser> the browser whose command/status queues are used
        <command_sequence> provides `commands_with_timeout` (pairs of command
                           tuple and timeout) and the `reset` flag
        <condition> optional condition; when given, execution waits on it
                    before the first command is sent

        Every command's outcome is written to the CrawlHistory table through
        the aggregator socket. A critical reply, too many consecutive failed
        commands, or a failed browser restart set `self.failure_status` and
        end the thread early. Otherwise the browser manager is restarted when
        a restart was flagged or the sequence asked for a reset.
        """
        browser.is_fresh = False  # since we are issuing a command, the BrowserManager is no longer a fresh instance

        # if this is a synced call, block on condition
        # NOTE(review): if the notification is sent before this thread
        # reaches wait(), it looks like the thread would keep waiting --
        # confirm against the caller
        if condition is not None:
            with condition:
                condition.wait()

        reset = command_sequence.reset
        start_time = None  # tracks when a site visit started, so that flash/profile
                           # cookies can be properly tracked.
        for command_and_timeout in command_sequence.commands_with_timeout:
            command, timeout = command_and_timeout
            # Extend the command tuple with the visit bookkeeping it needs
            if command[0] in ['GET', 'BROWSE']:
                start_time = time.time()
                command += (browser.curr_visit_id,)
            elif command[0] in ['DUMP_FLASH_COOKIES', 'DUMP_PROFILE_COOKIES']:
                command += (start_time, browser.curr_visit_id,)
            browser.current_timeout = timeout
            # passes off command and waits for a success (or failure signal)
            browser.command_queue.put(command)
            command_succeeded = 0 #1 success, 0 failure from error, -1 timeout
            command_arguments = command[1] if len(command) > 1 else None

            # received reply from BrowserManager, either success signal or failure notice
            try:
                # Blocks for at most browser.current_timeout seconds
                status = browser.status_queue.get(True, browser.current_timeout)
                if status == "OK":
                    command_succeeded = 1
                elif status[0] == "CRITICAL":
                    self.logger.critical("BROWSER %i: Received critical error "
                                         "from browser process while executing "
                                         "command %s. Setting failure status." % (
                                             browser.crawl_id, str(command)))
                    # status[1] carries the exception payload of the child
                    self.failure_status = {
                        'ErrorType': 'CriticalChildException',
                        'CommandSequence': command_sequence,
                        'Exception': status[1]
                    }
                    return
                else:
                    command_succeeded = 0
                    self.logger.info("BROWSER %i: Received failure status while"
                                     " executing command: %s" % (browser.crawl_id, command[0]))
            except EmptyQueue:
                # No reply arrived within the timeout
                command_succeeded = -1
                self.logger.info("BROWSER %i: Timeout while executing command, "
                                 "%s, killing browser manager" % (browser.crawl_id, command[0]))

            # Record the outcome of this command
            self.sock.send(("INSERT INTO CrawlHistory (crawl_id, command, arguments, bool_success)"
                            " VALUES (?,?,?,?)",
                            (browser.crawl_id, command[0], command_arguments, command_succeeded)))

            if command_succeeded != 1:
                # failurecount is shared between command threads
                with self.threadlock:
                    self.failurecount += 1
                if self.failurecount > self.failure_limit:
                    self.logger.critical("BROWSER %i: Command execution failure"
                                         " pushes failure count above the allowable limit."
                                         " Setting failure_status." % browser.crawl_id)
                    self.failure_status = {
                        'ErrorType': 'ExceedCommandFailureLimit',
                        'CommandSequence': command_sequence
                    }
                    return
                browser.restart_required = True
            else:
                # A success resets the failure counter
                with self.threadlock:
                    self.failurecount = 0

            # Skip the remaining commands once a restart has been flagged
            if browser.restart_required:
                break

        # Sleep after executing CommandSequence to provide extra time for
        # internal buffers to drain. Stopgap in support of #135
        time.sleep(2)

        # No restart while the manager is shutting down
        if self.closing:
            return

        if browser.restart_required or reset:
            success = browser.restart_browser_manager(clear_profile = reset)
            if not success:
                self.logger.critical("BROWSER %i: Exceeded the maximum allowable "
                                     "consecutive browser launch failures. "
                                     "Setting failure_status." % browser.crawl_id)
                self.failure_status = {
                    'ErrorType': 'ExceedLaunchFailureLimit',
                    'CommandSequence': command_sequence
                }
                return
            browser.restart_required = False

    def execute_command_sequence(self, command_sequence, index=None):
        """
        Hand <command_sequence> to the browser(s) selected by <index>.

        <index> is interpreted by `_distribute_command`.
        """
        self._distribute_command(command_sequence, index=index)

    # DEFINITIONS OF HIGH LEVEL COMMANDS
    # NOTE: These wrappers are provided for convenience. To issue sequential
    # commands to the same browser in a single 'visit', use the CommandSequence
    # class directly.

    def get(self, url, index=None, timeout=60, sleep=0, reset=False):
        """
        Goes to a url.

        Builds a CommandSequence for <url> holding a single get command
        (using <timeout> and <sleep>), sets its `reset` flag to <reset> and
        executes it on the browser(s) selected by <index>.
        """
        sequence = CommandSequence.CommandSequence(url)
        sequence.get(sleep=sleep, timeout=timeout)
        sequence.reset = reset
        self.execute_command_sequence(sequence, index=index)

    def browse(self, url, num_links=2, sleep=0, index=None, timeout=60, reset=False):
        """
        Browse a website and visit <num_links> links on the page.

        Builds a CommandSequence for <url> holding a single browse command
        (using <num_links>, <sleep> and <timeout>), sets its `reset` flag to
        <reset> and executes it on the browser(s) selected by <index>.
        """
        sequence = CommandSequence.CommandSequence(url)
        sequence.browse(timeout=timeout, sleep=sleep, num_links=num_links)
        sequence.reset = reset
        self.execute_command_sequence(sequence, index=index)


    def close(self):
        """
        Execute shutdown procedure for TaskManager.

        Calling it again after the manager is already closing only logs an
        error.
        """
        if not self.closing:
            self._shutdown_manager()
            return
        self.logger.error("TaskManager already closed")
Exemplo n.º 26
0
class EnvironmentPool:
    '''
    the Environment Pool - making parallelable agent-environment interaction
    possible

    After instantiation the service process is running, and the environments
    can **ONLY** be driven with messages put on `args_queue`; their answers
    arrive on `return_queue`. Accepted messages:

        - `'reset_all'`: calls :func:`env.reset` on every environment; each
          environment answers with `(env_id, observation)`
        - `(env_id, action)`: calls :func:`env.step` on that environment,
          which answers with `(env_id, (obs, rew, done, info))`
        - `'__stop_all__'`: stops every worker, then the service itself

    NOTE(review): the trajectory helpers read a module-level `device` that
    is not defined in this class -- confirm it exists where this is used.
    '''
    # position of the entropy entry in the tuple returned by
    # `concatenate_trajectories`
    _ENTROPY_INDEX = 4

    @staticmethod
    def _factory_env(difficulty=1):
        '''Build one wrapped, non-visualized environment.'''
        env = ProstheticsEnv(visualize=False, difficulty=difficulty)
        env = ForceDictObservation(env)
        env = DictToListFull(env)
        env = JSONable(env)
        return env

    def __init__(self,
                 difficulty=1,
                 pool_size=4,
                 queue_size=16
                 ):  # NOTE: `queue_size` must be larger than 2x `pool_size`!!!
        '''
        Create `pool_size` environments and start the service process.

        - `difficulty`: forwarded to every environment
        - `pool_size`: number of environments / worker processes
        - `queue_size`: capacity of both the argument and the return queue
        '''
        self._args_queue = Queue(queue_size)
        # pipes for data dispatching
        self._args_pipes = [Pipe() for _ in range(pool_size)]
        self._return_queue = Queue(queue_size)
        self._envs = [
            self._factory_env(difficulty=difficulty) for _ in range(pool_size)
        ]
        self._service_process = Process(target=self._service_job,
                                        args=(self._envs, self._args_pipes,
                                              self._args_queue,
                                              self._return_queue))
        self._service_process.start()
        print('[LOG] environment service started!')

    def __len__(self):
        '''Number of environments in the pool.'''
        return len(self._envs)

    @property
    def args_queue(self):
        '''Queue on which messages for the service are put.'''
        return self._args_queue

    @property
    def return_queue(self):
        '''Queue on which the environments put their answers.'''
        return self._return_queue

    @staticmethod
    def _service_job(_envs, _args_pipes, _args_queue, _return_queue):
        '''
        Body of the service process: starts one worker per environment and
        dispatches incoming messages to them.

        Queue In:
        - `__stop_all__`
        - `reset_all`
        - `(env_id, env_args)` (for refs on `env_args`, see :func:`_instance_job` doc)

        Queue Out:
        - `(env_id, observation)` after a reset
        - `(env_id, (obs, rew, done, info))` after a step
        '''
        def _instance_job(env, env_id, args_pipe, return_queue):
            '''
            Pipe In: (`env_args`)
            - `'__stop__'`
            - `'reset'`
            - anything else is passed to :func:`env.step` as the action

            Queue Out:
            see :func:`_service_job`
            '''
            while True:
                action = args_pipe.recv()
                if isinstance(action, str):
                    if action == 'reset':
                        # the collector waits for the first observation of
                        # every environment, so it has to be sent back
                        return_queue.put((env_id, env.reset()))
                    elif action == '__stop__':
                        args_pipe.close()
                        break
                else:
                    retval = env.step(action)
                    return_queue.put((env_id, retval))

        _pool = []

        # initialize workers
        for idx, (_, pipeout) in enumerate(_args_pipes):
            _pool.append(
                Process(target=_instance_job,
                        args=(_envs[idx], idx, pipeout, _return_queue)))
            _pool[-1].start()

        # start service loop
        # NOTE: This service loop produces data only!!!
        #       Make sure you consume data in `return_queue` fast enough, or
        #       you'll break the Queue!!!
        while True:
            data = _args_queue.get()
            if isinstance(data, str):
                if data == '__stop_all__':
                    for pipein, _ in _args_pipes:
                        pipein.send('__stop__')
                    for proc in _pool:
                        proc.join()
                    break
                elif data == 'reset_all':
                    for pipein, _ in _args_pipes:
                        pipein.send('reset')
                    # NOTE: data returned via return Queue
                else:
                    raise ValueError('unrecognized message:', data)
            elif isinstance(data, tuple):  # need for data dispatching
                target_env_id = data[0]
                target_pipein = _args_pipes[target_env_id][0]
                target_pipein.send(data[1])
                # NOTE: data returned via return Queue
            else:
                raise TypeError('data must be of type `str` or `tuple`')

    def _send_action(self, actionfn, env_id, obs):
        '''
        Sample an action for `obs` and send it to environment `env_id`.

        Return:
        - `act` (numpy array that was sent to the environment)
        - `val`
        - `log_prob`
        - `delta_entropy`
        - `dist` (reserved for future extensibility)

        Send:
        - `(env_id, act)` to `self._args_queue`
        '''
        dist, val = actionfn(obs)
        sampled = dist.sample().clamp_(0.0, 1.0)
        # the distribution works on tensors, so evaluate it before the
        # action is converted to numpy
        log_prob = dist.log_prob(sampled)
        delta_entropy = dist.entropy().mean()
        act = sampled.detach().cpu().numpy()[0]
        self._args_queue.put((env_id, act))
        return act, val, log_prob, delta_entropy, dist

    def get_trajectories_one_round(self, actionfn):
        '''
        Reset every environment and run each of them until it reports done.

        Return:
        Un-concatenated trajectories (one list per environment) followed by
        the value `actionfn` assigns to the last observation received
        '''
        target_rounds = len(self)
        dones_seen = [False for _ in range(target_rounds)]

        # returned via `_send_action`
        actions = [[] for _ in range(target_rounds)]
        values = [[] for _ in range(target_rounds)]
        states = [[] for _ in range(target_rounds)]  # one more
        log_probs = [[] for _ in range(target_rounds)]
        entropy = [0.0 for _ in range(target_rounds)]
        # returned via Queue
        rewards = [[] for _ in range(target_rounds)]
        masks = [[] for _ in range(target_rounds)]

        def _as_batch(obs):
            # a raw observation becomes a batch of one
            return torch.tensor([obs], dtype=torch.float32, device=device)

        def _send_and_save(env_id, obs):
            obs = _as_batch(obs)
            act, val, log_prob, dlt_ent, _ = \
                self._send_action(actionfn, env_id, obs)
            log_probs[env_id].append(log_prob)
            values[env_id].append(val)
            states[env_id].append(obs)
            actions[env_id].append(act)
            # plain floats, so that `np.mean` can average them later
            entropy[env_id] += float(dlt_ent)

        # run the very first step (sync-ed)
        self.args_queue.put('reset_all')
        initial_send_buffer = []
        for _ in range(target_rounds):
            env_id, first_obs = self.return_queue.get()
            initial_send_buffer.append((env_id, first_obs))
        for env_id, first_obs in initial_send_buffer:
            _send_and_save(env_id, first_obs)

        # receive and return (parallel)
        while not all(dones_seen):
            env_id, (next_obs, rew, done, _) = self.return_queue.get()
            # save returns: `rewards`, `masks`
            rewards[env_id].append(
                torch.tensor(rew, dtype=torch.float32, device=device))
            masks[env_id].append(
                torch.tensor(1 - done, dtype=torch.float32, device=device))
            if not done:
                # act on the observation that was just received
                _send_and_save(env_id, next_obs)
            else:
                dones_seen[env_id] = True

        _, next_val = actionfn(_as_batch(next_obs))

        return (actions, values, states, log_probs, entropy, rewards, masks,
                next_val)

    @staticmethod
    def concatenate_trajectories(actions, values, states, log_probs, entropy,
                                 rewards, masks):
        '''
        Flatten per-environment trajectories into single lists.

        Return:
        `(actions, values, states, log_probs, mean entropy, rewards, masks)`
        where every entry except the entropy is one flat list
        '''
        cat_actions = []
        cat_values = []
        cat_states = []
        cat_log_probs = []
        cat_rewards = []
        cat_masks = []
        for idx in range(len(actions)):
            cat_actions.extend(actions[idx])
            cat_values.extend(values[idx])
            cat_states.extend(states[idx])
            cat_log_probs.extend(log_probs[idx])
            cat_rewards.extend(rewards[idx])
            cat_masks.extend(masks[idx])
        return (cat_actions, cat_values, cat_states, cat_log_probs,
                np.mean(entropy), cat_rewards, cat_masks)

    def get_trajectories(self, actionfn, approx_min_batchsize=500):
        '''
        Collect rounds of trajectories until at least two thirds of
        `approx_min_batchsize` steps are available.

        Return:
        - concatenated trajectory data (see :func:`concatenate_trajectories`);
          the entropy entry is the mean over the collected rounds
        - the `next_value` of the last round
        '''
        print("[LOG] started collection trajectory...")
        *traj_data, next_value = self.get_trajectories_one_round(actionfn)
        merged = list(self.concatenate_trajectories(*traj_data))
        round_entropies = [merged[self._ENTROPY_INDEX]]
        while len(merged[0]) < approx_min_batchsize * (2 / 3):
            *new_traj_data, next_value = self.get_trajectories_one_round(
                actionfn)
            new_traj_data = self.concatenate_trajectories(*new_traj_data)
            for item_idx, new_item in enumerate(new_traj_data):
                if item_idx == self._ENTROPY_INDEX:
                    # the entropy is a scalar, not a list to extend
                    round_entropies.append(new_item)
                else:
                    merged[item_idx].extend(new_item)
        merged[self._ENTROPY_INDEX] = np.mean(round_entropies)
        return tuple(merged), next_value
Exemplo n.º 27
0
class TaskManager:
    """
    User-facing Class for interfacing with OpenWPM
    The TaskManager spawns several child processes to run the automation tasks.
        - DataAggregator to aggregate data in a SQLite database
        - MPLogger to aggregate logs across processes
        - BrowserManager processes to isolate Browsers in a separate process
    <manager_params> dict of TaskManager configuration parameters
    <browser_params> is a list of (or a single) dictionaries that specify preferences for browsers to instantiate
    <process_watchdog> will monitor firefox and Xvfb processes, killing any not indexed in TaskManager's browser list.
        NOTE: Only run this in isolated environments. It kills processes by name, indiscriminately.
    """
    def __init__(self, manager_params, browser_params, process_watchdog=False):
        """
        Prepare the crawl database, start the logging server and the data
        aggregators, launch the browsers and start the watchdog thread.

        <manager_params> dict of TaskManager configuration parameters; it is
                         updated in place (paths are expanded and joined,
                         the logger address is added)
        <browser_params> list of per-browser parameter dicts; its length
                         must equal manager_params['num_browsers']
        <process_watchdog> stored on the instance for the manager watchdog
        Raises Exception when the number of <browser_params> dicts does not
        match manager_params['num_browsers'].
        """

        # Expand '~' in the directories given in manager_params
        for path in ['data_directory', 'log_directory']:
            if manager_params[path] is not None:
                manager_params[path] = os.path.expanduser(manager_params[path])
        manager_params['database_name'] = os.path.join(
            manager_params['data_directory'], manager_params['database_name'])
        manager_params['log_file'] = os.path.join(
            manager_params['log_directory'], manager_params['log_file'])
        manager_params['screenshot_path'] = os.path.join(
            manager_params['data_directory'], 'screenshots')
        manager_params['source_dump_path'] = os.path.join(
            manager_params['data_directory'], 'sources')
        self.manager_params = manager_params

        # Create data directories if they do not exist
        if not os.path.exists(manager_params['screenshot_path']):
            os.makedirs(manager_params['screenshot_path'])
        if not os.path.exists(manager_params['source_dump_path']):
            os.makedirs(manager_params['source_dump_path'])

        # check size of parameter dictionary
        self.num_browsers = manager_params['num_browsers']
        if len(browser_params) != self.num_browsers:
            raise Exception(
                "Number of <browser_params> dicts is not the same as manager_params['num_browsers']"
            )

        # Flow control
        self.closing = False
        self.failure_status = None
        self.threadlock = threading.Lock()
        self.failurecount = 0
        if manager_params['failure_limit'] is not None:
            self.failure_limit = manager_params['failure_limit']
        else:
            # default limit scales with the number of browsers
            self.failure_limit = self.num_browsers * 2 + 10

        self.process_watchdog = process_watchdog

        # sets up the crawl data database
        db_path = manager_params['database_name']
        if not os.path.exists(manager_params['data_directory']):
            os.mkdir(manager_params['data_directory'])
        self.db = sqlite3.connect(db_path)
        with open(os.path.join(os.path.dirname(__file__), 'schema.sql'),
                  'r') as f:
            self.db.executescript(f.read())
        self.db.commit()

        # sets up logging server + connect a client
        self.logging_status_queue = None
        self.loggingserver = self._launch_loggingserver()
        # socket location: (address, port)
        self.manager_params['logger_address'] = self.logging_status_queue.get()
        self.logger = MPLogger.loggingclient(
            *self.manager_params['logger_address'])

        # Mark if LDBAggregator is needed (if js is enabled on any browser)
        self.ldb_enabled = False
        for params in browser_params:
            if params['save_javascript'] or params['save_javascript_proxy']:
                self.ldb_enabled = True
                break

        # Initialize the data aggregators
        self._launch_aggregators()

        # open client socket
        self.sock = clientsocket(serialization='dill')
        self.sock.connect(*self.manager_params['aggregator_address'])

        self._save_configuration(browser_params)

        # read the last used site visit id
        cur = self.db.cursor()
        cur.execute("SELECT MAX(visit_id) from site_visits")
        last_visit_id = cur.fetchone()[0]
        if last_visit_id is None:
            last_visit_id = 0
        self.next_visit_id = last_visit_id + 1

        # sets up the BrowserManager(s) + associated queues
        self.browsers = self._initialize_browsers(
            browser_params)  # List of the Browser(s)
        self._launch_browsers()

        # start the manager watchdog
        thread = threading.Thread(target=self._manager_watchdog, args=())
        thread.daemon = True
        thread.start()

    def _save_configuration(self, browser_params):
        """
        Saves crawl configuration details to db and logfile.

        Inserts one task row (stored as `self.task_id`) and one crawl row per
        browser; each browser's parameter dict receives the id of its crawl
        row under 'crawl_id'.
        """
        cursor = self.db.cursor()

        # Get git version and commit information
        versions = get_version()
        openwpm_v, browser_v = versions

        # Record task details
        task_insert = ("INSERT INTO task "
                       "(manager_params, openwpm_version, browser_version) "
                       "VALUES (?,?,?)")
        cursor.execute(
            task_insert,
            (json.dumps(self.manager_params), openwpm_v, browser_v))
        self.db.commit()
        self.task_id = cursor.lastrowid

        # Record browser details for each browser
        for idx in range(self.num_browsers):
            params = browser_params[idx]
            # serialized before 'crawl_id' is added to the dict
            cursor.execute(
                "INSERT INTO crawl (task_id, browser_params) VALUES (?,?)",
                (self.task_id, json.dumps(params)))
            self.db.commit()
            params['crawl_id'] = cursor.lastrowid

        # Print the configuration details
        summary = get_configuration_string(self.manager_params, browser_params,
                                           (openwpm_v, browser_v))
        self.logger.info(summary)

    def _initialize_browsers(self, browser_params):
        """
        Initialize the browser classes, each with its unique set of
        parameters.

        Returns a list holding one Browser per configured browser, in the
        order of <browser_params>.
        """
        return [
            Browser(self.manager_params, browser_params[idx])
            for idx in range(self.num_browsers)
        ]

    def _launch_browsers(self):
        """ launch each browser manager process / browser

        Browsers are launched one at a time, in list order.
        - If a launch raises, the manager is shut down through
          _cleanup_before_fail(during_init=True) and the exception propagates.
        - If a launch reports failure, a critical message is logged, the
          manager is closed and no further browsers are launched.
        - After each successful launch the browser's screen resolution and
          user agent string are sent over self.sock so that the matching
          crawl row can be updated.
        """
        for browser in self.browsers:
            try:
                success = browser.launch_browser_manager()
            except BaseException:
                # Explicit spelling of the former bare `except:`: clean up on
                # any failure (including KeyboardInterrupt), then re-raise.
                self._cleanup_before_fail(during_init=True)
                raise

            if not success:
                self.logger.critical(
                    "Browser spawn failure during TaskManager initialization, exiting..."
                )
                self.close()
                break

            # Update our DB with the random browser settings
            # These are found within the scope of each instance of Browser in the browsers list
            screen_res = str(browser.browser_settings['screen_res'])
            ua_string = str(browser.browser_settings['ua_string'])
            self.sock.send(("UPDATE crawl SET screen_res = ?, ua_string = ? \
                             WHERE crawl_id = ?", (screen_res, ua_string,
                                                   browser.crawl_id)))

    def _manager_watchdog(self):
        """
        Periodically checks the following:
        - memory consumption of all browsers every 10 seconds
        - presence of processes that are no longer in use

        Loops until self.closing is set. A browser whose process uses more
        than BROWSER_MEMORY_LIMIT (compared in MB) is flagged with
        restart_required. When self.process_watchdog is set, any 'firefox' or
        'Xvfb' process older than 300 seconds whose pid is not tracked by one
        of self.browsers is killed.
        """
        while not self.closing:
            time.sleep(10)

            # Check browser memory usage
            for browser in self.browsers:
                pid = browser.browser_pid
                # psutil.Process(None) inspects the *current* process, so a
                # browser without a pid is skipped instead of measured.
                if pid is None:
                    continue
                try:
                    process = psutil.Process(pid)
                    mem = process.memory_info()[0] / float(2**20)
                    if mem > BROWSER_MEMORY_LIMIT:
                        self.logger.info(
                            "BROWSER %i: Memory usage: %iMB, exceeding limit of %iMB"
                            %
                            (browser.crawl_id, int(mem), BROWSER_MEMORY_LIMIT))
                        browser.restart_required = True
                except psutil.NoSuchProcess:
                    # The browser exited between reading its pid and querying it
                    pass

            # Check for browsers or displays that were not closed correctly
            # Provide a 300 second buffer to avoid killing freshly launched browsers
            # TODO This buffer should correspond to the maximum browser spawn timeout
            if self.process_watchdog:
                browser_pids = set()
                display_pids = set()
                check_time = time.time()
                for browser in self.browsers:
                    if browser.browser_pid is not None:
                        browser_pids.add(browser.browser_pid)
                    if browser.display_pid is not None:
                        display_pids.add(browser.display_pid)
                for process in psutil.process_iter():
                    try:
                        name = process.name()
                        if name == 'firefox':
                            tracked = process.pid in browser_pids
                        elif name == 'Xvfb':
                            tracked = process.pid in display_pids
                        else:
                            continue
                        created = process.create_time()
                        if tracked or created + 300 >= check_time:
                            continue
                        self.logger.debug(
                            "Process: %s (pid: %i) with start time %s found running but not in browser process list. Killing."
                            % (name, process.pid, created))
                        process.kill()
                    except (psutil.NoSuchProcess, psutil.AccessDenied):
                        # A process may exit (or be off limits) between being
                        # listed and being queried; that must not stop the
                        # watchdog thread.
                        continue

    def _launch_aggregators(self):
        """
        Spawns the data aggregator processes, which serialize data from all
        processes, and records where each one is listening.
        * DataAggregator - sqlite database for crawl data
        * LevelDBAggregator - leveldb database for javascript files
          (only started when self.ldb_enabled is set)

        Each aggregator runs as a daemon process and reports its socket
        location through a status queue; the first item read from that queue
        is stored in self.manager_params.
        """
        # DataAggregator
        data_queue = Queue()
        self.aggregator_status_queue = data_queue
        data_process = Process(target=DataAggregator.DataAggregator,
                               args=(self.manager_params, data_queue))
        self.data_aggregator = data_process
        data_process.daemon = True
        data_process.start()
        # socket location: (address, port)
        self.manager_params['aggregator_address'] = data_queue.get()

        # LevelDB Aggregator
        if not self.ldb_enabled:
            return
        ldb_queue = Queue()
        self.ldb_status_queue = ldb_queue
        ldb_process = Process(target=LevelDBAggregator.LevelDBAggregator,
                              args=(self.manager_params, ldb_queue))
        self.ldb_aggregator = ldb_process
        ldb_process.daemon = True
        ldb_process.start()
        # socket location: (address, port)
        self.manager_params['ldb_address'] = ldb_queue.get()

    def _kill_aggregators(self):
        """ Asks each running aggregator to exit and waits for it

        "DIE" is placed on the aggregator's status queue, the process is
        joined with a 300 second timeout and the elapsed time is logged at
        debug level. The LevelDBAggregator is only handled when
        self.ldb_enabled is set.
        """
        # DataAggregator
        self.logger.debug("Telling the DataAggregator to shut down...")
        self.aggregator_status_queue.put("DIE")
        began = time.time()
        self.data_aggregator.join(300)
        elapsed = str(time.time() - began)
        self.logger.debug("DataAggregator took %s seconds to close" % elapsed)

        # LevelDB Aggregator
        if not self.ldb_enabled:
            return
        self.logger.debug("Telling the LevelDBAggregator to shut down...")
        self.ldb_status_queue.put("DIE")
        began = time.time()
        self.ldb_aggregator.join(300)
        elapsed = str(time.time() - began)
        self.logger.debug("LevelDBAggregator took %s seconds to close" %
                          elapsed)

    def _launch_loggingserver(self):
        """ starts the logging server process and returns it

        A new status queue is stored on self.logging_status_queue and handed
        to MPLogger.loggingserver together with the configured log file. The
        server is started as a daemon process.
        """
        status_queue = Queue()
        self.logging_status_queue = status_queue
        server_args = (self.manager_params['log_file'], status_queue)
        server = Process(target=MPLogger.loggingserver, args=server_args)
        server.daemon = True
        server.start()
        return server

    def _kill_loggingserver(self):
        """ asks the logging server to exit, then waits up to 300s for it """
        status_queue = self.logging_status_queue
        status_queue.put("DIE")
        server = self.loggingserver
        server.join(300)

    def _shutdown_manager(self, failure=False, during_init=False):
        """
        Marks the manager as closing, shuts down every browser and records
        the outcome of its crawl, then closes the database connection and
        aggregator socket and stops the aggregators and logging server.
        <failure> flag to indicate manager failure (True) or end of crawl (False)
        <during_init> flag to indicate that this shutdown is occurring during the TaskManager initialization
        """
        self.closing = True

        # finished = -1 marks a failed crawl, finished = 1 a completed one
        if failure:
            query = "UPDATE crawl SET finished = -1 WHERE crawl_id = ?"
        else:
            query = "UPDATE crawl SET finished = 1 WHERE crawl_id = ?"

        for browser in self.browsers:
            browser.shutdown_browser(during_init)
            self.sock.send((query, (browser.crawl_id, )))

        # close db connection
        self.db.close()
        # close socket to data aggregator
        self.sock.close()
        self._kill_aggregators()
        self._kill_loggingserver()

    def _cleanup_before_fail(self, during_init=False):
        """
        Shuts the manager down in failure mode before an exception is thrown,
        so that child processes are not left hanging and data is not left
        incomplete.
        <during_init> flag to indicate that this shutdown is occurring during
                      the TaskManager initialization
        """
        shutdown_options = dict(failure=True, during_init=during_init)
        self._shutdown_manager(**shutdown_options)

    def _check_failure_status(self):
        """ Inspect the failure status and raise if a failure was recorded

        self.failure_status is set by the asynchronous command execution
        threads that talk to the remote browser manager processes. When it is
        set, the manager is shut down through _cleanup_before_fail and then:
        - 'ExceedCommandFailureLimit' / 'ExceedLaunchFailureLimit' raise a
          CommandExecutionError carrying the offending command sequence
        - 'CriticalChildException' re-raises the pickled exception stored
          under 'Exception'
        Any other error type only triggers the shutdown.
        """
        self.logger.debug("Checking command failure status indicator...")
        if not self.failure_status:
            return

        self.logger.debug(
            "TaskManager failure status set, halting command execution.")
        self._cleanup_before_fail()

        status = self.failure_status
        error_type = status['ErrorType']
        if error_type == 'ExceedCommandFailureLimit':
            raise CommandExecutionError(
                "TaskManager exceeded maximum consecutive command "
                "execution failures.", status['CommandSequence'])
        if error_type == 'ExceedLaunchFailureLimit':
            raise CommandExecutionError(
                "TaskManager failed to launch browser within allowable "
                "failure limit.", status['CommandSequence'])
        if error_type == 'CriticalChildException':
            reraise(*cPickle.loads(status['Exception']))

    # CRAWLER COMMAND CODE

    def _distribute_command(self, command_sequence, index=None):
        """
        parses command type and issues command(s) to the proper browser
        <index> specifies the type of command this is:
        = None  -> first come, first serve
        =  #    -> index of browser to send command to
        = *     -> sends command to all browsers
        = **    -> sends command to all browsers (synchronized)

        The call polls (sleeping SLEEP_CONS between attempts) until the
        targeted browser(s) report ready() and a command thread has been
        started for each of them. An unsupported or out of range <index> is
        logged and ignored.

        When command_sequence.blocking is set, the most recently started
        command thread is joined and the failure status is checked afterwards.
        """
        # Stays None when _start_thread refuses to start a thread (it returns
        # None once the manager is closing).
        thread = None

        if index is None:
            # send to first browser available
            command_executed = False
            while True:
                for browser in self.browsers:
                    if browser.ready():
                        browser.current_timeout = command_sequence.total_timeout
                        thread = self._start_thread(browser, command_sequence)
                        command_executed = True
                        break
                if command_executed:
                    break
                time.sleep(SLEEP_CONS)

        elif 0 <= index < len(self.browsers):
            # send the command to this specific browser
            browser = self.browsers[index]
            while True:
                if browser.ready():
                    browser.current_timeout = command_sequence.total_timeout
                    thread = self._start_thread(browser, command_sequence)
                    break
                time.sleep(SLEEP_CONS)
        elif index == '*':
            # send the command to all browsers
            command_executed = [False] * len(self.browsers)
            while False in command_executed:
                for i, browser in enumerate(self.browsers):
                    if browser.ready() and not command_executed[i]:
                        browser.current_timeout = command_sequence.total_timeout
                        thread = self._start_thread(browser, command_sequence)
                        command_executed[i] = True
                time.sleep(SLEEP_CONS)
        elif index == '**':
            # send the command to all browsers and sync it
            # Used to block threads until ready
            condition = threading.Condition()
            command_executed = [False] * len(self.browsers)
            while False in command_executed:
                for i, browser in enumerate(self.browsers):
                    if browser.ready() and not command_executed[i]:
                        browser.current_timeout = command_sequence.total_timeout
                        thread = self._start_thread(browser, command_sequence,
                                                    condition)
                        command_executed[i] = True
                time.sleep(SLEEP_CONS)
            with condition:
                # All browsers loaded, tell them to start
                condition.notifyAll()
        else:
            self.logger.info(
                "Command index type is not supported or out of range")
            return

        # Nothing to wait for when no thread was started.
        if command_sequence.blocking and thread is not None:
            thread.join()
            self._check_failure_status()

    def _start_thread(self, browser, command_sequence, condition=None):
        """ spawns the daemon thread that executes <command_sequence>

        Nothing is started (and None is returned) when the manager is already
        closing. Otherwise the failure status is checked, the next visit id is
        assigned to <browser> and recorded in site_visits, and a thread
        running _issue_command is started and returned. The thread is also
        stored on browser.command_thread.
        """
        # Check status flags before starting thread
        if self.closing:
            self.logger.error(
                "Attempted to execute command on a closed TaskManager")
            return
        self._check_failure_status()

        visit_id = self.next_visit_id
        browser.set_visit_id(visit_id)
        visit_row = (visit_id, browser.crawl_id, command_sequence.url)
        self.sock.send((
            "INSERT INTO site_visits (visit_id, crawl_id, site_url) VALUES (?,?,?)",
            visit_row))
        self.next_visit_id += 1

        # Start command execution thread
        worker = threading.Thread(target=self._issue_command,
                                  args=(browser, command_sequence, condition))
        browser.command_thread = worker
        worker.daemon = True
        worker.start()
        return worker

    def _issue_command(self, browser, command_sequence, condition=None):
        """
        sends command tuple to the BrowserManager

        Runs every command of <command_sequence> on <browser>, one at a time:
        the command is put on browser.command_queue and the reply is awaited
        on browser.status_queue for at most the command's timeout. The outcome
        of each command is recorded in CrawlHistory through self.sock.

        <condition> if given, the thread first waits on it so that several
                    browsers can start in sync

        A failed or timed out command flags the browser for restart and ends
        the sequence early. self.failure_status is set (and the method
        returns) when a critical error is reported by the browser process,
        when the failure count exceeds self.failure_limit, or when the
        browser restart does not succeed.
        """
        browser.is_fresh = False  # since we are issuing a command, the BrowserManager is no longer a fresh instance

        # if this is a synced call, block on condition
        if condition is not None:
            with condition:
                condition.wait()

        reset = command_sequence.reset
        start_time = None  # tracks when a site visit started, so that flash/profile
        # cookies can be properly tracked.
        for command_and_timeout in command_sequence.commands_with_timeout:
            command, timeout = command_and_timeout
            # Page visits carry the visit id; cookie dumps additionally carry
            # the time at which the most recent visit started.
            if command[0] in ['GET', 'BROWSE']:
                start_time = time.time()
                command += (browser.curr_visit_id, )
            elif command[0] in ['DUMP_FLASH_COOKIES', 'DUMP_PROFILE_COOKIES']:
                command += (
                    start_time,
                    browser.curr_visit_id,
                )
            browser.current_timeout = timeout
            # passes off command and waits for a success (or failure signal)
            browser.command_queue.put(command)
            command_succeeded = 0  #1 success, 0 failure from error, -1 timeout
            command_arguments = command[1] if len(command) > 1 else None

            # received reply from BrowserManager, either success signal or failure notice
            try:
                status = browser.status_queue.get(True,
                                                  browser.current_timeout)
                if status == "OK":
                    command_succeeded = 1
                elif status[0] == "CRITICAL":
                    # Critical error: status[1] holds the exception payload
                    # that _check_failure_status later unpickles and re-raises.
                    self.logger.critical(
                        "BROWSER %i: Received critical error "
                        "from browser process while executing "
                        "command %s. Setting failure status." %
                        (browser.crawl_id, str(command)))
                    self.failure_status = {
                        'ErrorType': 'CriticalChildException',
                        'CommandSequence': command_sequence,
                        'Exception': status[1]
                    }
                    return
                else:
                    command_succeeded = 0
                    self.logger.info(
                        "BROWSER %i: Received failure status while"
                        " executing command: %s" %
                        (browser.crawl_id, command[0]))
            except EmptyQueue:
                # No reply arrived within browser.current_timeout seconds
                command_succeeded = -1
                self.logger.info(
                    "BROWSER %i: Timeout while executing command, "
                    "%s, killing browser manager" %
                    (browser.crawl_id, command[0]))

            # Record the outcome of this command
            self.sock.send((
                "INSERT INTO CrawlHistory (crawl_id, command, arguments, bool_success)"
                " VALUES (?,?,?,?)", (browser.crawl_id, command[0],
                                      command_arguments, command_succeeded)))

            if command_succeeded != 1:
                # The failure counter is shared by all command threads
                with self.threadlock:
                    self.failurecount += 1
                if self.failurecount > self.failure_limit:
                    self.logger.critical(
                        "BROWSER %i: Command execution failure"
                        " pushes failure count above the allowable limit."
                        " Setting failure_status." % browser.crawl_id)
                    self.failure_status = {
                        'ErrorType': 'ExceedCommandFailureLimit',
                        'CommandSequence': command_sequence
                    }
                    return
                browser.restart_required = True
            else:
                # A success resets the failure counter
                with self.threadlock:
                    self.failurecount = 0

            # Skip the remaining commands once a restart has been requested
            if browser.restart_required:
                break

        # No restart is attempted while the manager is shutting down
        if self.closing:
            return

        if browser.restart_required or reset:
            success = browser.restart_browser_manager(clear_profile=reset)
            if not success:
                self.logger.critical(
                    "BROWSER %i: Exceeded the maximum allowable "
                    "consecutive browser launch failures. "
                    "Setting failure_status." % browser.crawl_id)
                self.failure_status = {
                    'ErrorType': 'ExceedLaunchFailureLimit',
                    'CommandSequence': command_sequence
                }
                return
            browser.restart_required = False

    def execute_command_sequence(self, command_sequence, index=None):
        """ hands <command_sequence> to _distribute_command

        <index> selects the target browser(s); see _distribute_command for
        the accepted values.
        """
        self._distribute_command(command_sequence, index=index)

    # DEFINITIONS OF HIGH LEVEL COMMANDS
    # NOTE: These wrappers are provided for convenience. To issue sequential
    # commands to the same browser in a single 'visit', use the CommandSequence
    # class directly.

    def get(self, url, index=None, timeout=60, sleep=0, reset=False):
        """ goes to a url

        Builds a CommandSequence for <url> holding a single get command
        (with the given <timeout> and <sleep>), sets its reset flag to
        <reset> and executes it on the browser(s) selected by <index>.
        """
        sequence = CommandSequence.CommandSequence(url)
        sequence.get(sleep=sleep, timeout=timeout)
        sequence.reset = reset
        self.execute_command_sequence(sequence, index=index)

    def browse(self, url, num_links=2, sleep=0, index=None, timeout=60,
               reset=False):
        """ browse a website and visit <num_links> links on the page

        Builds a CommandSequence for <url>, sets its reset flag to <reset>
        and executes it on the browser(s) selected by <index>.
        """
        # NOTE(review): <num_links> is accepted but never forwarded; only a
        # plain get command is queued here. Confirm whether a browse command
        # on the CommandSequence was intended.
        sequence = CommandSequence.CommandSequence(url)
        sequence.get(timeout=timeout, sleep=sleep)
        sequence.reset = reset
        self.execute_command_sequence(sequence, index=index)

    def close(self):
        """
        Run the TaskManager shutdown procedure, unless it is already closing
        (in which case an error is logged and nothing else happens)
        """
        if not self.closing:
            self._shutdown_manager()
            return
        self.logger.error("TaskManager already closed")
Exemplo n.º 28
0
def deploy_firefox(
    status_queue: Queue,
    browser_params: BrowserParamsInternal,
    manager_params: ManagerParamsInternal,
    crash_recovery: bool,
) -> Tuple[webdriver.Firefox, str, Optional[Display]]:
    """
    launches a firefox instance with parameters set by the input dictionary

    Progress is reported on <status_queue> as ("STATUS", <step>, <payload>)
    tuples, in this order: "Profile Created", "Profile Tar", "Display",
    "Launch Attempted" and "Browser Launched".

    <status_queue> queue that receives the progress tuples
    <browser_params> settings of the browser to launch
    <manager_params> manager settings (addresses written to the extension
                     config, testing flag)
    <crash_recovery> when True, the seed profile is not loaded; a recovery
                     profile is loaded instead if one is configured

    Returns the webdriver, the value of its "moz:profile" capability and the
    Display object (None unless display_mode is "xvfb").

    Raises RuntimeError if Xvfb cannot be started or if the Firefox process
    id cannot be determined.
    """
    firefox_binary_path = get_firefox_binary_path()

    root_dir = os.path.dirname(__file__)  # directory of this file

    fp = FirefoxProfile()
    browser_profile_path = Path(fp.path)
    status_queue.put(("STATUS", "Profile Created", browser_profile_path))

    # Use Options instead of FirefoxProfile to set preferences since the
    # Options method has no "frozen"/restricted options.
    # https://github.com/SeleniumHQ/selenium/issues/2106#issuecomment-320238039
    fo = Options()
    assert browser_params.browser_id is not None
    # A seed profile is only loaded on a regular start; after a crash the
    # recovered profile (if any) is loaded instead.
    if browser_params.seed_tar and not crash_recovery:
        logger.info("BROWSER %i: Loading initial browser profile from: %s" %
                    (browser_params.browser_id, browser_params.seed_tar))
        load_profile(
            browser_profile_path,
            manager_params,
            browser_params,
            browser_params.seed_tar,
        )
    elif browser_params.recovery_tar:
        logger.debug("BROWSER %i: Loading recovered browser profile from: %s" %
                     (browser_params.browser_id, browser_params.recovery_tar))
        load_profile(
            browser_profile_path,
            manager_params,
            browser_params,
            browser_params.recovery_tar,
        )
    status_queue.put(("STATUS", "Profile Tar", None))

    display_mode = browser_params.display_mode
    display_pid = None
    display_port = None
    display = None
    if display_mode == "headless":
        fo.headless = True
        fo.add_argument("--width={}".format(DEFAULT_SCREEN_RES[0]))
        fo.add_argument("--height={}".format(DEFAULT_SCREEN_RES[1]))
    if display_mode == "xvfb":
        try:
            display = Display(visible=0, size=DEFAULT_SCREEN_RES)
            display.start()
            display_pid, display_port = display.pid, display.display
        except EasyProcessError:
            raise RuntimeError("Xvfb could not be started. \
                Please ensure it's on your path. \
                See www.X.org for full details. \
                Commonly solved on ubuntu with `sudo apt install xvfb`")
    # Must do this for all display modes,
    # because status_queue is read off no matter what.
    status_queue.put(("STATUS", "Display", (display_pid, display_port)))

    if browser_params.extension_enabled:
        # Write config file
        # The config is written as browser_params.json inside the profile
        # directory.
        extension_config: Dict[str, Any] = dict()
        extension_config.update(browser_params.to_dict())
        extension_config["logger_address"] = manager_params.logger_address
        extension_config[
            "storage_controller_address"] = manager_params.storage_controller_address
        extension_config["testing"] = manager_params.testing
        ext_config_file = browser_profile_path / "browser_params.json"
        with open(ext_config_file, "w") as f:
            json.dump(extension_config, f, cls=ConfigEncoder)
        logger.debug("BROWSER %i: Saved extension config file to: %s" %
                     (browser_params.browser_id, ext_config_file))

        # TODO restore detailed logging
        # fo.set_preference("*****@*****.**", "all")

    # Configure privacy settings
    configure_firefox.privacy(browser_params, fp, fo, root_dir,
                              browser_profile_path)

    # Set various prefs to improve speed and eliminate traffic to Mozilla
    configure_firefox.optimize_prefs(fo)

    # Intercept logging at the Selenium level and redirect it to the
    # main logger.  This will also inform us where the real profile
    # directory is hiding.
    interceptor = FirefoxLogInterceptor(browser_params.browser_id,
                                        browser_profile_path)
    interceptor.start()

    # Set custom prefs. These are set after all of the default prefs to allow
    # our defaults to be overwritten.
    for name, value in browser_params.prefs.items():
        logger.info("BROWSER %i: Setting custom preference: %s = %s" %
                    (browser_params.browser_id, name, value))
        fo.set_preference(name, value)

    # Launch the webdriver
    status_queue.put(("STATUS", "Launch Attempted", None))
    fb = FirefoxBinary(firefox_path=firefox_binary_path)
    driver = webdriver.Firefox(
        firefox_profile=fp,
        firefox_binary=fb,
        firefox_options=fo,
        log_path=interceptor.fifo,
    )

    # Add extension
    if browser_params.extension_enabled:

        # Install extension
        ext_loc = os.path.join(root_dir, "../Extension/firefox/openwpm.xpi")
        ext_loc = os.path.normpath(ext_loc)
        driver.install_addon(ext_loc, temporary=True)
        logger.debug("BROWSER %i: OpenWPM Firefox extension loaded" %
                     browser_params.browser_id)

    # set window size
    driver.set_window_size(*DEFAULT_SCREEN_RES)

    # Get browser process pid
    # The pid is read from driver.service.process when available, otherwise
    # from driver.binary.process.
    if hasattr(driver, "service") and hasattr(driver.service, "process"):
        pid = driver.service.process.pid
    elif hasattr(driver, "binary") and hasattr(driver.binary, "process"):
        pid = driver.binary.process.pid
    else:
        raise RuntimeError("Unable to identify Firefox process ID.")

    status_queue.put(("STATUS", "Browser Launched", int(pid)))

    return driver, driver.capabilities["moz:profile"], display
Exemplo n.º 29
0
class TaskManager:
    """
    User-facing Class for interfacing with OpenWPM
    The TaskManager spawns several child processes to run the automation tasks.
        - DataAggregator to aggregate data in a SQLite database
        - MPLogger to aggregate logs across processes
        - BrowserManager processes to isolate Browsers in a separate process
    <manager_params> dict of TaskManager configuration parameters
    <browser_params> is a list of (or a single) dictionaries that specify
    preferences for browsers to instantiate
    <process_watchdog> will monitor firefox and Xvfb processes, killing
    any not indexed in TaskManager's browser list.
        NOTE: Only run this in isolated environments. It kills processes
        by name, indiscriminately.
    """

    def __init__(self, manager_params, browser_params, process_watchdog=False):

        # Make paths absolute in manager_params
        for path in ['data_directory', 'log_directory']:
            if manager_params[path] is not None:
                manager_params[path] = os.path.expanduser(manager_params[path])
        manager_params['database_name'] = os.path.join(
            manager_params['data_directory'], manager_params['database_name'])
        manager_params['log_file'] = os.path.join(
            manager_params['log_directory'], manager_params['log_file'])
        manager_params['screenshot_path'] = os.path.join(
            manager_params['data_directory'], 'screenshots')
        manager_params['source_dump_path'] = os.path.join(
            manager_params['data_directory'], 'sources')
        self.manager_params = manager_params
        self.browser_params = browser_params

        # Create data directories if they do not exist
        if not os.path.exists(manager_params['screenshot_path']):
            os.makedirs(manager_params['screenshot_path'])
        if not os.path.exists(manager_params['source_dump_path']):
            os.makedirs(manager_params['source_dump_path'])

        # check size of parameter dictionary
        self.num_browsers = manager_params['num_browsers']
        if len(browser_params) != self.num_browsers:
            raise Exception("Number of <browser_params> dicts is not the same "
                            "as manager_params['num_browsers']")

        # Flow control
        self.closing = False
        self.failure_status = None
        self.threadlock = threading.Lock()
        self.failurecount = 0
        if manager_params['failure_limit'] is not None:
            self.failure_limit = manager_params['failure_limit']
        else:
            self.failure_limit = self.num_browsers * 2 + 10

        self.process_watchdog = process_watchdog

        # sets up logging server + connect a client
        self.logging_status_queue = None
        self.loggingserver = self._launch_loggingserver()
        # socket location: (address, port)
        self.manager_params['logger_address'] = self.logging_status_queue.get()
        self.logger = MPLogger.loggingclient(
            *self.manager_params['logger_address'])

        # Initialize the data aggregators
        self._launch_aggregators()

        # sets up the BrowserManager(s) + associated queues
        self.browsers = self._initialize_browsers(browser_params)
        self._launch_browsers()

        # start the manager watchdog
        thread = threading.Thread(target=self._manager_watchdog, args=())
        thread.daemon = True
        thread.start()

        # Save crawl config information to database
        openwpm_v, browser_v = get_version()
        self.data_aggregator.save_configuration(openwpm_v, browser_v)
        self.logger.info(
            get_configuration_string(
                self.manager_params, browser_params, (openwpm_v, browser_v)
            )
        )

    def _initialize_browsers(self, browser_params):
        """ initialize the browser classes, each its unique set of params """
        browsers = list()
        for i in range(self.num_browsers):
            browser_params[i][
                'crawl_id'] = self.data_aggregator.get_next_crawl_id()
            browsers.append(Browser(self.manager_params, browser_params[i]))

        return browsers

    def _launch_browsers(self):
        """ launch each browser manager process / browser """
        for browser in self.browsers:
            try:
                success = browser.launch_browser_manager()
            except Exception:
                self._cleanup_before_fail(during_init=True)
                raise

            if not success:
                self.logger.critical("Browser spawn failure during "
                                     "TaskManager initialization, exiting...")
                self.close()
                break

    def _manager_watchdog(self):
        """
        Periodically checks the following:
        - memory consumption of all browsers every 10 seconds
        - presence of processes that are no longer in use

        TODO: process watchdog needs to be updated since `psutil` won't
              kill browser processes started by Selenium 3 (with `subprocess`)
        """
        if self.process_watchdog:
            self.logger.error("BROWSER %i: Process watchdog is not currently "
                              "supported." % self.crawl_id)
        while not self.closing:
            time.sleep(10)

            # Check browser memory usage
            for browser in self.browsers:
                try:
                    process = psutil.Process(browser.browser_pid)
                    mem = process.memory_info()[0] / float(2 ** 20)
                    if mem > BROWSER_MEMORY_LIMIT:
                        self.logger.info("BROWSER %i: Memory usage: %iMB"
                                         ", exceeding limit of %iMB" % (
                                             browser.crawl_id, int(mem),
                                             BROWSER_MEMORY_LIMIT))
                        browser.restart_required = True
                except psutil.NoSuchProcess:
                    pass

            # Check for browsers or displays that were not closed correctly
            # 300 second buffer to avoid killing freshly launched browsers
            # TODO This buffer should correspond to the maximum spawn timeout
            if self.process_watchdog:
                browser_pids = set()
                display_pids = set()
                check_time = time.time()
                for browser in self.browsers:
                    if browser.browser_pid is not None:
                        browser_pids.add(browser.browser_pid)
                    if browser.display_pid is not None:
                        display_pids.add(browser.display_pid)
                for process in psutil.process_iter():
                    if (process.create_time() + 300 < check_time and (
                            (process.name() == 'firefox' and
                             process.pid not in browser_pids) or
                            (process.name() == 'Xvfb' and
                             process.pid not in display_pids))):
                        self.logger.debug("Process: %s (pid: %i) with start "
                                          "time %s found running but not in "
                                          "browser process list. Killing." % (
                                              process.name(), process.pid,
                                              process.create_time()))
                        process.kill()

    def _launch_aggregators(self):
        """Launch the necessary data aggregators"""
        if self.manager_params["output_format"] == "local":
            self.data_aggregator = LocalAggregator.LocalAggregator(
                self.manager_params, self.browser_params)
        elif self.manager_params["output_format"] == "s3":
            self.data_aggregator = S3Aggregator.S3Aggregator(
                self.manager_params, self.browser_params)
        else:
            raise Exception("Unrecognized output format: %s" %
                            self.manager_params["output_format"])
        self.data_aggregator.launch()
        self.manager_params[
            'aggregator_address'] = self.data_aggregator.listener_address

        # open connection to aggregator for saving crawl details
        self.sock = clientsocket(serialization='dill')
        self.sock.connect(*self.manager_params['aggregator_address'])

    def _kill_aggregators(self):
        """Shutdown any currently running data aggregators"""
        self.data_aggregator.shutdown()

    def _launch_loggingserver(self):
        """Start the logging server in a daemon process and return it.

        A new status queue is stored on ``self.logging_status_queue`` and
        passed to the server together with the configured log file.
        """
        status_queue = Queue()
        self.logging_status_queue = status_queue
        server_args = (self.manager_params['log_file'], status_queue)
        server_process = Process(target=MPLogger.loggingserver,
                                 args=server_args)
        server_process.daemon = True
        server_process.start()
        return server_process

    def _kill_loggingserver(self):
        """ terminates logging server gracefully """
        self.logging_status_queue.put("DIE")
        self.loggingserver.join(300)

    def _shutdown_manager(self, during_init=False):
        """
        Wait for current commands to finish, close all child processes and
        threads
        <during_init> flag to indicator if this shutdown is occuring during
                      the TaskManager initialization
        """
        self.closing = True

        for browser in self.browsers:
            browser.shutdown_browser(during_init)

        self.sock.close()  # close socket to data aggregator
        self._kill_aggregators()
        self._kill_loggingserver()

    def _cleanup_before_fail(self, during_init=False):
        """
        Execute shutdown commands before throwing an exception
        This should keep us from having a bunch of hanging processes
        and incomplete data.
        <during_init> flag to indicator if this shutdown is occuring during
                      the TaskManager initialization
        """
        self._shutdown_manager(during_init=during_init)

    def _check_failure_status(self):
        """ Check the status of command failures. Raise exceptions as necessary

        The failure status property is used by the various asynchronous
        command execution threads which interface with the
        remote browser manager processes. If a failure status is found, the
        appropriate steps are taken to gracefully close the infrastructure
        """
        self.logger.debug("Checking command failure status indicator...")
        if self.failure_status:
            self.logger.debug(
                "TaskManager failure status set, halting command execution.")
            self._cleanup_before_fail()
            if self.failure_status['ErrorType'] == 'ExceedCommandFailureLimit':
                raise CommandExecutionError(
                    "TaskManager exceeded maximum consecutive command "
                    "execution failures.",
                    self.failure_status['CommandSequence']
                )
            elif (self.failure_status['ErrorType'] == ("ExceedLaunch"
                                                       "FailureLimit")):
                raise CommandExecutionError(
                    "TaskManager failed to launch browser within allowable "
                    "failure limit.", self.failure_status['CommandSequence']
                )
            if self.failure_status['ErrorType'] == 'CriticalChildException':
                reraise(*pickle.loads(self.failure_status['Exception']))

    # CRAWLER COMMAND CODE

    def _distribute_command(self, command_seq, index=None):
        """
        Parse the command type and issue the command(s) to the proper
        browser(s).

        index specifies the type of command this is:
        = None  -- first come, first serve
        =  #    -- index of browser to send command to
        = *     -- sends command to all browsers
        = **    -- sends command to all browsers (synchronized)

        Any other index (out of range, or of a type that cannot be compared
        with an integer) is logged and ignored.

        Submission is delayed while the data aggregator reports a queue size
        at or above AGGREGATOR_QUEUE_LIMIT. If command_seq.blocking is set,
        this call waits for every command thread it started and then checks
        the failure status.
        """

        # Block if the aggregator queue is too large
        agg_queue_size = self.data_aggregator.get_most_recent_status()
        while agg_queue_size >= AGGREGATOR_QUEUE_LIMIT:
            self.logger.info(
                "Blocking command submission until the DataAggregator "
                "is below the max queue size of %d. Current queue "
                "length %d. " % (AGGREGATOR_QUEUE_LIMIT, agg_queue_size)
            )
            agg_queue_size = self.data_aggregator.get_status()

        # Every thread started by this call; joined below when blocking
        threads = []

        # Distribute command
        if index is None:
            # send to first browser available
            command_executed = False
            while not command_executed:
                for browser in self.browsers:
                    if browser.ready():
                        browser.current_timeout = command_seq.total_timeout
                        threads.append(
                            self._start_thread(browser, command_seq))
                        command_executed = True
                        break
                if not command_executed:
                    time.sleep(SLEEP_CONS)

        elif index == '*' or index == '**':
            # send the command to all browsers; for '**' the threads are
            # additionally held on a shared condition until every browser
            # has been handed the command
            condition = threading.Condition() if index == '**' else None
            command_executed = [False] * len(self.browsers)
            while False in command_executed:
                for i, browser in enumerate(self.browsers):
                    if browser.ready() and not command_executed[i]:
                        browser.current_timeout = command_seq.total_timeout
                        threads.append(self._start_thread(
                            browser, command_seq, condition))
                        command_executed[i] = True
                time.sleep(SLEEP_CONS)
            if condition is not None:
                with condition:
                    condition.notify_all()  # All browsers loaded, start

        else:
            # A non-numeric index cannot be ordered against an int on
            # Python 3; treat that the same as an out-of-range index
            try:
                in_range = 0 <= index < len(self.browsers)
            except TypeError:
                in_range = False
            if not in_range:
                self.logger.info(
                    "Command index type is not supported or out of range")
                return

            # send the command to this specific browser
            while not self.browsers[index].ready():
                time.sleep(SLEEP_CONS)
            browser = self.browsers[index]
            browser.current_timeout = command_seq.total_timeout
            threads.append(self._start_thread(browser, command_seq))

        if command_seq.blocking:
            for thread in threads:
                # _start_thread returns None instead of a thread when the
                # manager is already closing
                if thread is not None:
                    thread.join()
            self._check_failure_status()

    def _start_thread(self, browser, command_sequence, condition=None):
        """  starts the command execution thread """

        # Check status flags before starting thread
        if self.closing:
            self.logger.error(
                "Attempted to execute command on a closed TaskManager")
            return
        self._check_failure_status()

        browser.set_visit_id(self.data_aggregator.get_next_visit_id())
        self.sock.send(("site_visits", {
            "visit_id": browser.curr_visit_id,
            "crawl_id": browser.crawl_id,
            "site_url": command_sequence.url
        }))

        # Start command execution thread
        args = (browser, command_sequence, condition)
        thread = threading.Thread(target=self._issue_command, args=args)
        browser.command_thread = thread
        thread.daemon = True
        thread.start()
        return thread

    def _issue_command(self, browser, command_sequence, condition=None):
        """
        Send each command of a command sequence to the BrowserManager and
        record the outcome.

        For every (command, timeout) pair the command is put on the
        browser's command queue and a reply is awaited on its status queue
        for at most the command's timeout. The outcome is sent to the
        aggregator socket as a "crawl_history" record. A failed or timed out
        command increments the shared failure count, flags the browser for
        restart and ends the sequence; a success resets the count.

        ``self.failure_status`` is set (and the method returns early) when
        the browser reports a critical error, when the failure count exceeds
        ``self.failure_limit``, or when the browser restart fails.

        browser          -- browser the commands are issued to
        command_sequence -- provides `commands_with_timeout` and `reset`
        condition        -- optional condition to wait on before issuing
                            any command (used for synchronized calls)
        """
        browser.is_fresh = False

        # if this is a synced call, block on condition
        if condition is not None:
            with condition:
                condition.wait()

        reset = command_sequence.reset
        # Time of the most recent page-level command; handed to the cookie
        # dump commands below. Stays None if no such command ran before them.
        start_time = None
        for command_and_timeout in command_sequence.commands_with_timeout:
            command, timeout = command_and_timeout
            # Page-level commands get the current visit id appended
            if command[0] in ['GET', 'BROWSE',
                              'SAVE_SCREENSHOT',
                              'SCREENSHOT_FULL_PAGE',
                              'DUMP_PAGE_SOURCE',
                              'RECURSIVE_DUMP_PAGE_SOURCE']:
                start_time = time.time()
                command += (browser.curr_visit_id,)
            # Cookie dumps get the start time and the visit id appended
            elif command[0] in ['DUMP_FLASH_COOKIES', 'DUMP_PROFILE_COOKIES']:
                command += (start_time, browser.curr_visit_id,)
            browser.current_timeout = timeout
            # passes off command and waits for a success (or failure signal)
            browser.command_queue.put(command)
            command_succeeded = 0  # 1 success, 0 error, -1 timeout
            command_arguments = command[1] if len(command) > 1 else None

            # received reply from BrowserManager, either success or failure
            try:
                # Blocks for at most the command's timeout
                status = browser.status_queue.get(
                    True, browser.current_timeout)
                if status == "OK":
                    command_succeeded = 1
                elif status[0] == "CRITICAL":
                    # Critical errors abort immediately; no crawl_history
                    # record is written for this command
                    self.logger.critical(
                        "BROWSER %i: Received critical error from browser "
                        "process while executing command %s. Setting failure "
                        "status." % (browser.crawl_id, str(command)))
                    self.failure_status = {
                        'ErrorType': 'CriticalChildException',
                        'CommandSequence': command_sequence,
                        'Exception': status[1]
                    }
                    return
                else:
                    command_succeeded = 0
                    self.logger.info(
                        "BROWSER %i: Received failure status while executing "
                        "command: %s" % (browser.crawl_id, command[0]))
            except EmptyQueue:
                # No reply arrived within the timeout
                command_succeeded = -1
                self.logger.info(
                    "BROWSER %i: Timeout while executing command, %s, killing "
                    "browser manager" % (browser.crawl_id, command[0]))

            # Record the outcome of this command with the aggregator
            self.sock.send(("crawl_history", {
                "crawl_id": browser.crawl_id,
                "visit_id": browser.curr_visit_id,
                "command": command[0],
                "arguments": command_arguments,
                "bool_success": command_succeeded
            }))

            if command_succeeded != 1:
                with self.threadlock:
                    self.failurecount += 1
                # NOTE(review): the count is compared outside the lock that
                # guards its update -- confirm this is intended
                if self.failurecount > self.failure_limit:
                    self.logger.critical(
                        "BROWSER %i: Command execution failure pushes failure "
                        "count above the allowable limit. Setting "
                        "failure_status." % browser.crawl_id)
                    self.failure_status = {
                        'ErrorType': 'ExceedCommandFailureLimit',
                        'CommandSequence': command_sequence
                    }
                    return
                browser.restart_required = True
                self.logger.debug("BROWSER %i: Browser restart required" % (
                    browser.crawl_id))
            else:
                # A success resets the failure count
                with self.threadlock:
                    self.failurecount = 0

            # Skip the remaining commands once a restart is pending
            if browser.restart_required:
                break

        # Sleep after executing CommandSequence to provide extra time for
        # internal buffers to drain. Stopgap in support of #135
        time.sleep(2)

        # Do not restart browsers while the manager is shutting down
        if self.closing:
            return

        if browser.restart_required or reset:
            # The profile is cleared only when the sequence asked for a reset
            success = browser.restart_browser_manager(clear_profile=reset)
            if not success:
                self.logger.critical(
                    "BROWSER %i: Exceeded the maximum allowable consecutive "
                    "browser launch failures. Setting failure_status." % (
                        browser.crawl_id))
                self.failure_status = {
                    'ErrorType': 'ExceedLaunchFailureLimit',
                    'CommandSequence': command_sequence
                }
                return
            browser.restart_required = False

    def execute_command_sequence(self, command_sequence, index=None):
        """Hand a command sequence to the browser(s) selected by `index`.

        `index` is passed on unchanged to the command distribution step.
        """
        dispatch = self._distribute_command
        dispatch(command_sequence, index)

    # DEFINITIONS OF HIGH LEVEL COMMANDS
    # NOTE: These wrappers are provided for convenience. To issue sequential
    # commands to the same browser in a single 'visit', use the CommandSequence
    # class directly.

    def get(self, url, index=None, timeout=60, sleep=0, reset=False):
        """ Visit `url` using a command sequence with a single get step.

        `timeout` and `sleep` are passed to the get step, `reset` is stored
        on the sequence and `index` selects the browser(s) to use.
        """
        sequence = CommandSequence.CommandSequence(url)
        sequence.get(sleep=sleep, timeout=timeout)
        sequence.reset = reset
        self.execute_command_sequence(sequence, index=index)

    def browse(self, url, num_links=2, sleep=0, index=None,
               timeout=60, reset=False):
        """ Browse `url` using a command sequence with a single browse step.

        `num_links`, `sleep` and `timeout` are passed to the browse step,
        `reset` is stored on the sequence and `index` selects the browser(s)
        to use.
        """
        sequence = CommandSequence.CommandSequence(url)
        sequence.browse(timeout=timeout, sleep=sleep, num_links=num_links)
        sequence.reset = reset
        self.execute_command_sequence(sequence, index=index)

    def close(self):
        """
        Shut the TaskManager down.

        Runs the shutdown procedure once; a call on a manager that is
        already closing only logs an error.
        """
        if not self.closing:
            self._shutdown_manager()
            return
        self.logger.error("TaskManager already closed")
Exemplo n.º 30
0
def deploy_firefox(
    status_queue: Queue,
    browser_params: BrowserParamsInternal,
    manager_params: ManagerParamsInternal,
    crash_recovery: bool,
) -> Tuple[webdriver.Firefox, Path, Optional[Display]]:
    """Launch a Firefox instance configured from ``browser_params``.

    Progress is reported on ``status_queue`` as ``("STATUS", stage, data)``
    tuples, in this order: "Profile Created" (profile path), "Profile Tar"
    (None), "Display" (``(display_pid, display_port)``, both None unless the
    display mode is "xvfb"), "Launch Attempted" (None) and
    "Browser Launched" (process id taken from the driver).

    Parameters
    ----------
    status_queue : Queue
        Receives the status tuples described above.
    browser_params : BrowserParamsInternal
        Per-browser configuration; ``browser_id`` must not be None.
    manager_params : ManagerParamsInternal
        Manager configuration; its logger address, storage controller
        address and testing flag are written to the extension config file.
    crash_recovery : bool
        The seed profile (``seed_tar``) is loaded only when this is False.
        Otherwise, or when no seed profile is set, the recovery profile
        (``recovery_tar``) is loaded if one is set.

    Returns
    -------
    tuple
        The webdriver, the path of the temporary profile directory, and the
        virtual display (None unless the display mode is "xvfb").

    Raises
    ------
    RuntimeError
        If Xvfb could not be started, or if the Firefox process ID could not
        be determined from the driver.
    """
    firefox_binary_path = get_firefox_binary_path()

    root_dir = os.path.dirname(__file__)  # directory of this file

    browser_profile_path = Path(tempfile.mkdtemp(prefix="firefox_profile_"))
    status_queue.put(("STATUS", "Profile Created", browser_profile_path))

    # Use Options instead of FirefoxProfile to set preferences since the
    # Options method has no "frozen"/restricted options.
    # https://github.com/SeleniumHQ/selenium/issues/2106#issuecomment-320238039
    fo = Options()
    # Set a custom profile that is used in-place and is not deleted by geckodriver.
    # https://firefox-source-docs.mozilla.org/testing/geckodriver/CrashReports.html
    # Using FirefoxProfile breaks stateful crawling:
    # https://github.com/mozilla/OpenWPM/issues/423#issuecomment-521018093
    fo.add_argument("-profile")
    fo.add_argument(str(browser_profile_path))

    assert browser_params.browser_id is not None
    if browser_params.seed_tar and not crash_recovery:
        logger.info(
            "BROWSER %i: Loading initial browser profile from: %s",
            browser_params.browser_id,
            browser_params.seed_tar,
        )
        load_profile(
            browser_profile_path,
            manager_params,
            browser_params,
            browser_params.seed_tar,
        )
    elif browser_params.recovery_tar:
        logger.debug(
            "BROWSER %i: Loading recovered browser profile from: %s",
            browser_params.browser_id,
            browser_params.recovery_tar,
        )
        load_profile(
            browser_profile_path,
            manager_params,
            browser_params,
            browser_params.recovery_tar,
        )
    status_queue.put(("STATUS", "Profile Tar", None))

    display_mode = browser_params.display_mode
    display_pid = None
    display_port = None
    display = None
    if display_mode == "headless":
        fo.headless = True
        fo.add_argument("--width={}".format(DEFAULT_SCREEN_RES[0]))
        fo.add_argument("--height={}".format(DEFAULT_SCREEN_RES[1]))
    elif display_mode == "xvfb":
        try:
            display = Display(visible=0, size=DEFAULT_SCREEN_RES)
            display.start()
            display_pid, display_port = display.pid, display.display
        except EasyProcessError as e:
            # Adjacent literals keep source indentation out of the message;
            # the original error is chained for debugging.
            raise RuntimeError(
                "Xvfb could not be started. "
                "Please ensure it's on your path. "
                "See www.X.org for full details. "
                "Commonly solved on ubuntu with `sudo apt install xvfb`"
            ) from e
    # Must do this for all display modes,
    # because status_queue is read off no matter what.
    status_queue.put(("STATUS", "Display", (display_pid, display_port)))

    if browser_params.extension_enabled:
        # Write config file
        extension_config: Dict[str, Any] = dict()
        extension_config.update(browser_params.to_dict())
        extension_config["logger_address"] = manager_params.logger_address
        extension_config[
            "storage_controller_address"] = manager_params.storage_controller_address
        extension_config["testing"] = manager_params.testing
        ext_config_file = browser_profile_path / "browser_params.json"
        with open(ext_config_file, "w") as f:
            json.dump(extension_config, f, cls=ConfigEncoder)
        logger.debug(
            "BROWSER %i: Saved extension config file to: %s",
            browser_params.browser_id,
            ext_config_file,
        )

        # TODO restore detailed logging
        # fo.set_preference("*****@*****.**", "all")

    # Geckodriver currently places the user.js file in the wrong profile
    # directory, so we have to create it manually here.
    # TODO: See https://github.com/mozilla/OpenWPM/issues/867 for when
    # to remove this workaround.
    # Load existing preferences from the profile's user.js file
    prefs = configure_firefox.load_existing_prefs(browser_profile_path)
    # Load default geckodriver preferences
    prefs.update(configure_firefox.DEFAULT_GECKODRIVER_PREFS)
    # Pick an available port for Marionette (https://stackoverflow.com/a/2838309)
    # This has a race condition, as another process may get the port
    # before Marionette, but we don't expect it to happen often.
    # The context manager closes the probe socket even if bind() fails.
    with socket.socket() as s:
        s.bind(("", 0))
        marionette_port = s.getsockname()[1]
    prefs["marionette.port"] = marionette_port

    # Configure privacy settings
    configure_firefox.privacy(browser_params, prefs)

    # Set various prefs to improve speed and eliminate traffic to Mozilla
    configure_firefox.optimize_prefs(prefs)

    # Intercept logging at the Selenium level and redirect it to the
    # main logger.
    interceptor = FirefoxLogInterceptor(browser_params.browser_id)
    interceptor.start()

    # Set custom prefs. These are set after all of the default prefs to allow
    # our defaults to be overwritten.
    for name, value in browser_params.prefs.items():
        logger.info(
            "BROWSER %i: Setting custom preference: %s = %s",
            browser_params.browser_id,
            name,
            value,
        )
        prefs[name] = value

    # Write all preferences to the profile's user.js file
    configure_firefox.save_prefs_to_profile(prefs, browser_profile_path)

    # Launch the webdriver
    status_queue.put(("STATUS", "Launch Attempted", None))
    fb = FirefoxBinary(firefox_path=firefox_binary_path)
    driver = webdriver.Firefox(
        firefox_binary=fb,
        options=fo,
        log_path=interceptor.fifo,
        # TODO: See https://github.com/mozilla/OpenWPM/issues/867 for
        # when to remove this
        service_args=["--marionette-port",
                      str(marionette_port)],
    )

    # Add extension
    if browser_params.extension_enabled:

        # Install extension
        ext_loc = os.path.join(root_dir, "../Extension/firefox/openwpm.xpi")
        ext_loc = os.path.normpath(ext_loc)
        driver.install_addon(ext_loc, temporary=True)
        logger.debug(
            "BROWSER %i: OpenWPM Firefox extension loaded",
            browser_params.browser_id,
        )

    # set window size
    driver.set_window_size(*DEFAULT_SCREEN_RES)

    # Get browser process pid
    if hasattr(driver, "service") and hasattr(driver.service, "process"):
        pid = driver.service.process.pid
    elif hasattr(driver, "binary") and hasattr(driver.binary, "process"):
        pid = driver.binary.process.pid
    else:
        raise RuntimeError("Unable to identify Firefox process ID.")

    status_queue.put(("STATUS", "Browser Launched", int(pid)))

    return driver, browser_profile_path, display
Exemplo n.º 31
0
class BaseAggregator:
    """Base class for the data aggregator interface. This class is used
    alongside the BaseListener class to spawn an aggregator process that
    combines data from multiple crawl processes. The BaseAggregator class
    manages the child listener process.

    Parameters
    ----------
    manager_params : ManagerParamsInternal
        TaskManager configuration parameters
    browser_params : list of BrowserParamsInternal
        List of browser configuration class<BrowserParams>"""

    # NOTE(review): `__metaclass__` is the Python 2 spelling and is ignored
    # by Python 3, so the abstract methods below are not enforced there.
    # Left unchanged to avoid altering how subclasses can be instantiated.
    __metaclass__ = abc.ABCMeta

    def __init__(
        self,
        manager_params: ManagerParamsInternal,
        browser_params: List[BrowserParamsInternal],
    ):
        self.manager_params = manager_params
        self.browser_params = browser_params
        # Both are set by launch() and reset to None by shutdown()
        self.listener_address = None
        self.listener_process = None
        # Queues shared with the listener process
        self.status_queue = Queue()
        self.completion_queue = Queue()
        self.shutdown_queue = Queue()
        # Most recent status value and the time it was read
        self._last_status = None
        self._last_status_received = None
        self.logger = logging.getLogger("openwpm")

    @abc.abstractmethod
    def save_configuration(self, openwpm_version, browser_version):
        """Save configuration details to the database"""

    @abc.abstractmethod
    def get_next_visit_id(self):
        """Return a unique visit ID to be used as a key for a single visit"""

    @abc.abstractmethod
    def get_next_browser_id(self):
        """Return a unique crawl ID used as a key for a browser instance"""

    def get_most_recent_status(self):
        """Return the most recent queue size sent from the listener process

        Blocks (via get_status) until the first status update has been
        received. Raises RuntimeError if the newest update is older than
        STATUS_TIMEOUT seconds.
        """

        # Block until we receive the first status update
        if self._last_status is None:
            return self.get_status()

        # Drain status queue until we receive most recent update
        while not self.status_queue.empty():
            self._last_status = self.status_queue.get()
            self._last_status_received = time.time()

        # Check last status signal
        if (time.time() - self._last_status_received) > STATUS_TIMEOUT:
            raise RuntimeError(
                "No status update from DataAggregator listener process "
                "for %d seconds." % (time.time() - self._last_status_received))

        return self._last_status

    def get_status(self):
        """Get listener process status. If the status queue is empty, block.

        Waits at most STATUS_TIMEOUT seconds and raises RuntimeError if no
        update arrives in that time.
        """
        try:
            self._last_status = self.status_queue.get(block=True,
                                                      timeout=STATUS_TIMEOUT)
            self._last_status_received = time.time()
        except queue.Empty:
            if self._last_status_received is None:
                # No update was ever received, so there is no timestamp to
                # measure from; report the timeout that just elapsed
                waited = STATUS_TIMEOUT
            else:
                waited = time.time() - self._last_status_received
            raise RuntimeError(
                "No status update from DataAggregator listener process "
                "for %d seconds." % waited)
        return self._last_status

    def get_new_completed_visits(self) -> List[Tuple[int, bool]]:
        """
        Returns a list of all visit ids that have been processed since
        the last time the method was called and whether or not they
        have been interrupted.

        This method will return an empty list in case no visit ids have
        been processed since the last time this method was called
        """
        finished_visit_ids = list()
        while not self.completion_queue.empty():
            finished_visit_ids.append(self.completion_queue.get())
        return finished_visit_ids

    def launch(self, listener_process_runner, *args):
        """Launch the aggregator listener process

        The runner is started in a daemon process and receives the tuple
        (status_queue, completion_queue, shutdown_queue) as its first
        argument, followed by `args`. The first item the listener puts on
        the status queue is stored as the listener address.
        """
        args = ((self.status_queue, self.completion_queue,
                 self.shutdown_queue), ) + args
        self.listener_process = Process(target=listener_process_runner,
                                        args=args)
        self.listener_process.daemon = True
        self.listener_process.start()
        # Blocks until the listener reports its address
        self.listener_address = self.status_queue.get()

    def shutdown(self, relaxed: bool = True):
        """ Terminate the aggregator listener process

        Puts (SHUTDOWN_SIGNAL, relaxed) on the shutdown queue and waits up
        to 300 seconds for the listener process to exit.
        """
        self.logger.debug(
            "Sending the shutdown signal to the %s listener process..." %
            type(self).__name__)
        self.shutdown_queue.put((SHUTDOWN_SIGNAL, relaxed))
        start_time = time.time()
        self.listener_process.join(300)
        self.logger.debug("%s took %s seconds to close." %
                          (type(self).__name__, str(time.time() - start_time)))
        self.listener_address = None
        self.listener_process = None
Exemplo n.º 32
0
class StorageControllerHandle:
    """This class contains all methods relevant for the TaskManager
    to interact with the StorageController
    """
    def __init__(
        self,
        structured_storage: StructuredStorageProvider,
        unstructured_storage: Optional[UnstructuredStorageProvider],
    ) -> None:

        # Set by launch() from the first item on the status queue
        self.listener_address: Optional[Tuple[str, int]] = None
        self.listener_process: Optional[Process] = None
        # Queues shared with the storage controller
        self.status_queue = Queue()
        self.completion_queue = Queue()
        self.shutdown_queue = Queue()
        # Most recent status value and the time it was read
        self._last_status = None
        self._last_status_received: Optional[float] = None
        self.logger = logging.getLogger("openwpm")
        # NOTE: launch() replaces this attribute with the Process that runs
        # the controller
        self.storage_controller = StorageController(
            structured_storage,
            unstructured_storage,
            status_queue=self.status_queue,
            completion_queue=self.completion_queue,
            shutdown_queue=self.shutdown_queue,
        )

    def get_next_visit_id(self) -> VisitId:
        """Generate visit id as randomly generated positive integer less than 2^53.

        Parquet can support integers up to 64 bits, but Javascript can only
        represent integers up to 53 bits:
        https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Number/MAX_SAFE_INTEGER
        Thus, we cap these values at 53 bits.
        """
        return VisitId(random.getrandbits(53))

    def get_next_browser_id(self) -> BrowserId:
        """Generate crawl id as randomly generated positive 32bit integer

        Note: Parquet's partitioned dataset reader only supports integer
        partition columns up to 32 bits.
        """
        return BrowserId(random.getrandbits(32))

    def save_configuration(
        self,
        manager_params: ManagerParamsInternal,
        browser_params: List[BrowserParamsInternal],
        openwpm_version: str,
        browser_version: str,
    ) -> None:
        """Store the task configuration and one record per browser.

        Writes a "task" record (random 32 bit task id, manager parameters
        and the two version strings) and one "crawl" record per entry of
        `browser_params`, all under INVALID_VISIT_ID, which is finalized
        afterwards. Requires launch() to have set the listener address.
        """
        assert self.listener_address is not None
        sock = DataSocket(self.listener_address)
        task_id = random.getrandbits(32)
        sock.store_record(
            TableName("task"),
            INVALID_VISIT_ID,
            {
                "task_id": task_id,
                "manager_params": manager_params.to_json(),
                "openwpm_version": openwpm_version,
                "browser_version": browser_version,
            },
        )
        # Record browser details for each browser
        for browser_param in browser_params:
            sock.store_record(
                TableName("crawl"),
                INVALID_VISIT_ID,
                {
                    "browser_id": browser_param.browser_id,
                    "task_id": task_id,
                    "browser_params": browser_param.to_json(),
                },
            )
        sock.finalize_visit_id(INVALID_VISIT_ID, success=True)

    def launch(self) -> None:
        """Starts the storage controller

        The controller runs in a daemon process; the first item it puts on
        the status queue is stored as the listener address.
        """
        self.storage_controller = Process(
            name="StorageController",
            target=StorageController.run,
            args=(self.storage_controller, ),
        )
        self.storage_controller.daemon = True
        self.storage_controller.start()

        # Blocks until the controller reports its address
        self.listener_address = self.status_queue.get()

    def get_new_completed_visits(self) -> List[Tuple[int, bool]]:
        """
        Returns a list of all visit ids that have been processed since
        the last time the method was called and whether or not they
        ran successfully.

        This method will return an empty list in case no visit ids have
        been processed since the last time this method was called
        """
        finished_visit_ids = list()
        while not self.completion_queue.empty():
            finished_visit_ids.append(self.completion_queue.get())
        return finished_visit_ids

    def shutdown(self, relaxed: bool = True) -> None:
        """Terminate the storage controller process

        Puts (SHUTDOWN_SIGNAL, relaxed) on the shutdown queue and waits up
        to 300 seconds for the process to exit. Must be called after
        launch().
        """
        assert isinstance(self.storage_controller, Process)
        self.logger.debug(
            "Sending the shutdown signal to the Storage Controller...")
        self.shutdown_queue.put((SHUTDOWN_SIGNAL, relaxed))
        start_time = time.time()
        self.storage_controller.join(300)
        self.logger.debug("%s took %s seconds to close." %
                          (type(self).__name__, str(time.time() - start_time)))

    def get_most_recent_status(self) -> int:
        """Return the most recent queue size sent from the Storage Controller process

        Blocks (via get_status) until the first status update has been
        received. Raises RuntimeError if the newest update is older than
        STATUS_TIMEOUT seconds.
        """

        # Block until we receive the first status update
        if self._last_status is None:
            return self.get_status()

        # Drain status queue until we receive most recent update
        while not self.status_queue.empty():
            self._last_status = self.status_queue.get()
            self._last_status_received = time.time()

        # Check last status signal
        if (time.time() - self._last_status_received) > STATUS_TIMEOUT:
            raise RuntimeError(
                "No status update from the storage controller process "
                "for %d seconds." % (time.time() - self._last_status_received))

        return self._last_status

    def get_status(self) -> int:
        """Get listener process status. If the status queue is empty, block.

        Waits at most STATUS_TIMEOUT seconds and raises RuntimeError if no
        update arrives in that time.
        """
        try:
            self._last_status = self.status_queue.get(block=True,
                                                      timeout=STATUS_TIMEOUT)
            self._last_status_received = time.time()
        except queue.Empty:
            if self._last_status_received is None:
                # No update was ever received, so there is no timestamp to
                # measure from; report the timeout that just elapsed
                waited = float(STATUS_TIMEOUT)
            else:
                waited = time.time() - self._last_status_received
            raise RuntimeError(
                "No status update from the storage controller process "
                "for %d seconds." % waited)
        assert isinstance(self._last_status, int)
        return self._last_status
Exemplo n.º 33
0
    def writeEventsToCsv(self, urls, processedUrlsFName, batchSize=20):
        """Scrape events for ``urls`` and keep a progress file so runs can resume.

        URLs already recorded in ``processedUrlsFName`` are skipped. The
        remaining URLs are handled according to ``self.eventMode``:

        * ``'parallel'``: URLs are split into batches of ``batchSize``. For
          each batch one ``self.eventWorker`` process is started per URL plus
          one ``self.writeToCsvWorker`` process; workers still alive after the
          join timeout are terminated.
        * ``'series'``: ``self.getEventsFromListingUrl`` is called per URL and
          the returned rows are appended to ``self.eventFile``.

        Every URL that produced events is appended to ``processedUrlsFName``.

        Args:
            urls: listing URLs to scrape.
            processedUrlsFName: path of the progress file (one URL per CSV
                row). It is read if it exists and always appended to.
            batchSize: number of URLs per batch in ``'parallel'`` mode.

        Raises:
            ValueError: if ``self.eventMode`` is neither ``'parallel'`` nor
                ``'series'``.

        Side effects:
            Sets ``self.pctUrlsWithEvents`` (``-999`` when no URL was left to
            process) and logs a summary banner.
        """
        numUrls = len(urls)
        origNumUrls = numUrls
        urlsWithEvents = 0
        totalEvents = 0
        numTimeouts = 0

        try:
            with open(processedUrlsFName, 'r') as pus:
                # The progress file is produced by csv.writer below, so parse
                # it with csv.reader: this is independent of newline
                # translation and of CSV quoting. A set keeps the membership
                # test below O(1) per URL.
                pUrls = set(row[0] for row in csv.reader(pus) if row)
            logging.info(
                'Already processed {0} of {1} urls. Picking up where we'
                ' left off.'.format(len(pUrls), numUrls))
            urls = [url for url in urls if url not in pUrls]
            numUrls = len(urls)
        except IOError:
            # No readable progress file: start from the full URL list.
            pass

        with open(processedUrlsFName, 'a+') as pus:
            pUrls_writer = csv.writer(pus)
            with open(self.eventFile, 'a+') as f:
                writer = csv.writer(f)
                sttm = time.time()

                if self.eventMode == 'parallel':
                    # range() works on both Python 2 and 3 (xrange does not).
                    batches = [
                        urls[x:x + batchSize]
                        for x in range(0, len(urls), batchSize)]
                    for b, batch in enumerate(batches):
                        logging.info('Starting batch {0} of  {1}'.format(
                            b + 1, len(batches)))
                        manager = Manager()
                        batchQueue = Queue()
                        batchTimeoutList = manager.list()
                        batchProcessedUrls = manager.list()
                        batchEventQueue = manager.Queue()
                        batchEventsSaved = manager.Value('i', 0)
                        jobs = []
                        # One work item per URL ...
                        for url in batch:
                            batchQueue.put(
                                [self.eventMode, url, batchEventQueue,
                                 batchProcessedUrls, batchTimeoutList])
                        # ... and one worker process per URL.
                        for _ in batch:
                            proc = Process(
                                target=self.eventWorker, args=(batchQueue,))
                            proc.start()
                            jobs.append(proc)
                        writeProc = Process(
                            target=self.writeToCsvWorker, args=(
                                batchEventQueue, batchEventsSaved))
                        time.sleep(2)
                        writeProc.start()
                        for j, job in enumerate(jobs):
                            # Wait at least 60 s, or 5 s per URL in the batch,
                            # before giving up on a worker.
                            job.join(max(60, 5 * len(batch)))
                            if job.is_alive():
                                job.terminate()
                                logging.info(
                                    'Subprocess {0} of {1} timed out'.format(
                                        j + 1, min(24, len(batch))))
                        writeProc.join(max(60, 8 * len(batch)))
                        totalEvents += batchEventsSaved.value
                        # De-duplicate once and reuse for writing and counting.
                        doneUrls = set(list(batchProcessedUrls))
                        for url in doneUrls:
                            pUrls_writer.writerow([url])
                        urlsWithEvents += len(doneUrls)
                        numTimeouts += len(set(list(batchTimeoutList)))
                        durMins, minsLeft = self.timeElapsedLeft(
                            sttm, b + 1, len(batches))
                        logging.info(
                            'Saved {0} new events from {1} of {2} listings. '
                            '\nEstimated time to '
                            'completion: ~{3} min.'.format(
                                batchEventsSaved.value,
                                len(batchProcessedUrls), len(batch), minsLeft))
                        # NOTE(review): this kills every process whose `ps`
                        # line matches "chrome", not only ones started here.
                        os.system(
                            "ps aux | grep chrome | awk ' { print $2 } ' |"
                            " xargs kill -9")

                elif self.eventMode == 'series':
                    for i, url in enumerate(urls):
                        numEvents = 0
                        events = self.getEventsFromListingUrl(
                            self.eventMode, url, None, urls, [])
                        if events is None:
                            durMins, minsLeft = self.timeElapsedLeft(
                                sttm, i + 1, numUrls)
                            logging.info(
                                'No sales events scraped from listing'
                                ' {0} of {1}. Check url: {2}. {3} min.'
                                'elapsed. {4} min. remaining.'.format(
                                    i + 1, numUrls, url, durMins,
                                    minsLeft))
                            continue
                        for event in events:
                            totalEvents += 1
                            numEvents += 1
                            writer.writerow(event)
                        urlsWithEvents += 1
                        pUrls_writer.writerow([url])
                        # Pass the number of listings completed so far
                        # (1-based), matching the other call sites.
                        durMins, minsLeft = self.timeElapsedLeft(
                            sttm, i + 1, numUrls)
                        # Log interval knob: currently logs every listing.
                        if (i + 1) % 1 == 0:
                            logging.info(
                                'Scraped {0} sales events from listing {1}'
                                ' of {2}. Scraped {3} total sales events in'
                                ' {4} min. Estimated time to completion:'
                                ' ~{5} min.'.format(
                                    numEvents, i + 1, numUrls, totalEvents,
                                    durMins, minsLeft))
                else:
                    raise ValueError(
                        'Must specify valid event scraping '
                        'mode: ["parallel", "series"]')
        if numUrls > 0:
            # Multiply by the float first so the division is never an
            # integer (floor) division, even on Python 2.
            self.pctUrlsWithEvents = round(
                100.0 * urlsWithEvents / origNumUrls, 1)
        else:
            self.pctUrlsWithEvents = -999

        logging.info('#' * 100)
        logging.info('#' * 100)
        logging.info(
            'Scraped events from {0} of {1} ({2}%) urls.'.format(
                urlsWithEvents, numUrls, self.pctUrlsWithEvents).center(
                90, ' ').center(100, '#').upper())
        logging.info(
            ('{0} of {1} urls timed out while scraping events.'.format(
                numTimeouts, numUrls).upper().center(90, ' ').center(
                100, '#')))
        logging.info(
            ('Saved {0} events to {1}'.format(
                totalEvents, self.eventFile).upper().center(
                90, ' ').center(100, '#')))
        logging.info('#' * 100)
        logging.info('#' * 100)
Exemplo n.º 34
0
        message = HTML_INTERNAL_SERVER_ERROR.format(error_message=exc)
        self.wfile.write(message.encode("utf8"))

### Logging

if __name__ == '__main__':
    # Print the section heading only when this module is run as a script.
    print('\n### Logging')



from multiprocess import Queue

# Module-level message queue, created as a side effect of importing this module.
HTTPD_MESSAGE_QUEUE = Queue()

# Seed the queue with two sample messages at import time.
HTTPD_MESSAGE_QUEUE.put("I am another message")

HTTPD_MESSAGE_QUEUE.put("I am one more message")

from .bookutils import rich_output, terminal_escape

def display_httpd_message(message):
    """Show one HTTP server message to the user.

    When ``rich_output()`` is true the message is wrapped in a ``<pre>``
    element with a NavajoWhite background and passed to ``display(HTML(...))``;
    otherwise it is printed after going through ``terminal_escape()``.

    NOTE(review): in the rich branch ``message`` is concatenated into the
    markup verbatim (no HTML escaping) -- confirm the messages are trusted.
    """
    if not rich_output():
        # Plain-text fallback.
        print(terminal_escape(message))
        return

    markup = '<pre style="background: NavajoWhite;">' + message + "</pre>"
    display(HTML(markup))
Exemplo n.º 35
0
class MemoryStructuredProvider(StructuredStorageProvider):
    """
    This storage provider passes all its data to the MemoryProviderHandle in a
    process safe way.

    This makes it ideal for testing

    It also aims to only save out data as late as possible to ensure that storage_controller
    only relies on the guarantees given in the interface.
    """

    # Created in init(), not in __init__().
    lock: Lock

    def __init__(self) -> None:
        """Create the queue, its handle, the logger and the two empty caches."""
        super().__init__()
        self.queue = Queue()
        self.handle = MemoryProviderHandle(self.queue)
        self.logger = logging.getLogger("openwpm")
        # The cache for entries before they are finalized,
        # keyed by visit id and then by table.
        self.cache1: DefaultDict[
            VisitId, DefaultDict[TableName, List[Dict[str, Any]]]
        ] = defaultdict(lambda: defaultdict(list))
        # For all entries that have been finalized but not yet flushed out to
        # the queue, keyed by table.
        self.cache2: DefaultDict[TableName, List[Dict[str, Any]]] = defaultdict(list)
        # Events handed out by finalize_visit_id() that the next flush sets.
        self.signal_list: List[Event] = []

    async def init(self) -> None:
        """Create the asyncio lock that guards flushing and finalizing."""
        self.lock = asyncio.Lock()

    async def flush_cache(self) -> None:
        """Put every finalized record on the queue and release pending waiters.

        Each record in ``cache2`` is sent as a ``(table, record)`` tuple, the
        cache is emptied, and every event registered by
        ``finalize_visit_id()`` since the previous flush is set.
        """
        async with self.lock:
            self.logger.info("Flushing cache")

            for table, record_list in self.cache2.items():
                self.logger.info(f"Saving out {len(record_list)} entries for {table}")
                for record in record_list:
                    self.queue.put((table, record))
            self.cache2.clear()
            for ev in self.signal_list:
                ev.set()
            # The waiting tasks keep their own reference to their event, so
            # the list can be emptied here. Without this the list grew on
            # every finalized visit and every flush re-set all old events.
            self.signal_list.clear()

    async def store_record(
        self, table: TableName, visit_id: VisitId, record: Dict[str, Any]
    ) -> None:
        """Buffer ``record`` for ``table`` under ``visit_id`` until that visit is finalized."""
        self.logger.info(
            "Saving into table %s for visit_id %d record %r", table, visit_id, record
        )
        self.cache1[visit_id][table].append(record)

    async def finalize_visit_id(
        self, visit_id: VisitId, interrupted: bool = False
    ) -> Task[None]:
        """Move the records of ``visit_id`` from ``cache1`` into ``cache2``.

        Returns:
            A task that completes once the event registered here has been
            set, which ``flush_cache()`` does.
        """
        async with self.lock:
            self.logger.info(
                f"Finalizing visit_id {visit_id} which was {'' if interrupted else 'not'} interrupted"
            )
            for table, record_list in self.cache1[visit_id].items():
                self.cache2[table].extend(record_list)

            del self.cache1[visit_id]

            async def wait(signal: Event) -> None:
                # Thin wrapper so the task result is None rather than the
                # value returned by Event.wait().
                await signal.wait()

            ev = Event()
            self.signal_list.append(ev)
            return asyncio.create_task(wait(ev))

    async def shutdown(self) -> None:
        """Log an error if either cache still holds records."""
        if self.cache1 or self.cache2:
            self.logger.error("Shutting down with unsaved records")