예제 #1
0
def start() -> flask.Response:
    """API endpoint that starts the crawler with a given configuration.

    The configuration is taken from the ``config`` query parameter, parsed
    with ``config_service.ConfigParser`` and handed to the scheduler.
    A failure response is returned when the parameter is missing or when
    parsing raises ``config_service.ConfigParsingException``.

    Returns:
        flask.Response: REST response

    """
    raw_config = flask.request.args.get('config')
    if raw_config is None:
        missing = communication.Response(
            success=False,
            message='Provide config or filepath via ?config=<your-config>',
            command=communication.MANAGER_START,
        )
        return _get_response(missing)

    # The parser is built outside the try block so that only errors raised
    # by parse() itself are turned into a failure response.
    config_parser = config_service.ConfigParser(raw_config)
    try:
        parsed_config = config_parser.parse()
    except config_service.ConfigParsingException as error:
        failure = communication.Response(
            success=False,
            message=str(error),
            command=communication.MANAGER_START,
        )
        return _get_response(failure)

    return _get_response(scheduler.add_config(parsed_config))
예제 #2
0
def respond_config_deleted(identifier: str, success: bool) -> None:
    """Report the outcome of removing a configuration from the schedule.

    A response object is built whose success flag and message reflect
    whether the removal worked, and it is put on the scheduler output queue.

    Args:
        identifier (str): identifier of the configuration
        success (bool): deletion succeeded/failed

    """
    succeeded = bool(success)
    subject = f'Configuration with identifier {identifier} '
    if succeeded:
        outcome = 'was successfully deleted from the schedule.'
    else:
        outcome = 'wasn\'t deleted from the schedule (not present).'
    communication.scheduler_queue_output.put(
        communication.Response(
            success=succeeded,
            message=subject + outcome,
            command=communication.SCHEDULER_REMOVE_CONFIG))
예제 #3
0
def respond_interval_deleted(identifier: str, success: bool) -> None:
    """Respond whether the interval was deleted from the database.

    This function creates a corresponding response object and inserts it
    in the scheduler output queue.
    The response depends on whether the deletion succeeded or failed.

    Args:
        identifier (str): identifier of the interval
        success (bool): deletion succeeded/failed

    """
    if success:
        response = communication.Response(
            success=True,
            message=(f'Interval with identifier {identifier} '
                     f'was successfully deleted from the database.'),
            command=communication.SCHEDULER_REMOVE_INTERVAL)
    else:
        # The failure text previously read "it wasn' present" (missing "t").
        response = communication.Response(
            success=False,
            message=(
                f'Interval with identifier {identifier} '
                f'wasn\'t deleted from the database '
                f'because it wasn\'t present.'
            ),
            command=communication.SCHEDULER_REMOVE_INTERVAL)
    communication.scheduler_queue_output.put(response)
예제 #4
0
def respond_interval_inserted(identifier: str, success: bool) -> None:
    """Report the outcome of inserting an interval into the database.

    A response object is built whose success flag and message reflect
    whether the insertion worked, and it is put on the scheduler output
    queue.

    Args:
        identifier (str): identifier of the interval
        success (bool): insertion succeeded/failed

    """
    succeeded = bool(success)
    outcome = ('was successfully added.' if succeeded else
               'wasn\'t added to the database due to an internal error.')
    text = f'Interval with identifier {identifier} ' + outcome
    reply = communication.Response(
        success=succeeded,
        message=text,
        command=communication.SCHEDULER_ADD_INTERVAL)
    communication.scheduler_queue_output.put(reply)
예제 #5
0
    def _treewalk_pause(self, data: Any) -> communication.Response:
        """Put the current TreeWalk execution into the paused state.

        The state is switched to paused, a pause command is put on every
        worker's input queue and the crawl state is written to the database.
        If a ``treewalk.StateException`` is raised while doing so, a failure
        response carrying the error text is returned instead.

        Args:
            data (Any): ignored, but required due to callback signature

        Returns:
            communication.Response: response object

        """
        try:
            self._state.set_paused()
            pause_command = communication.Command(
                command=communication.WORKER_PAUSE, data=None)
            for control in self._workers:
                control.queue_input.put(pause_command)
            self._db_connection.set_crawl_state(
                tree_walk_id=self._tree_walk_id,
                status=communication.CRAWL_STATUS_PAUSED)
        except treewalk.StateException as err:
            return communication.Response(
                success=False,
                message=f'Attempted to pause. {err!s}',
                command=communication.MANAGER_PAUSE)
        return communication.Response(
            success=True,
            message=communication.MANAGER_OK,
            command=communication.MANAGER_PAUSE)
예제 #6
0
    def _treewalk_stop(self, data: Any) -> communication.Response:
        """Abort the current execution of the TreeWalk.

        When the state is already "ready" nothing is stopped and only an
        informative message is returned. Otherwise every worker is sent a
        stop command, the workers are joined, the crawl is marked as aborted
        in the database and the manager is reset.

        Args:
            data (Any): ignored, but required due to callback signature

        Returns:
            communication.Response: response object (always ``success=True``)

        """
        if self._state.is_ready():
            return communication.Response(
                success=True,
                message='Attempted to stop when TreeWalk was ready.',
                command=communication.MANAGER_STOP)

        stop_command = communication.Command(
            command=communication.WORKER_STOP, data=None)
        for control in self._workers:
            control.queue_input.put(stop_command)
            # Take one item from the worker's output queue and discard it.
            control.queue_output.get()
        # Set the exit event, join every worker process, then clear the
        # event again.
        self._workers_can_exit.set()
        for control in self._workers:
            control.me.join()
        self._workers_can_exit.clear()
        self._db_connection.set_crawl_state(
            tree_walk_id=self._tree_walk_id,
            status=communication.CRAWL_STATUS_ABORTED)
        self._reset()
        return communication.Response(
            success=True,
            message=communication.MANAGER_OK,
            command=communication.MANAGER_STOP)
예제 #7
0
def intervals_add() -> flask.Response:
    """API endpoint to add intervals for maximum resource consumption.

    Reads the ``start``, ``end`` and ``cpu`` query parameters, validates
    them in that order and, if all checks pass, hands a new
    ``TimeInterval`` to the scheduler.

    Returns:
        flask.Response: REST response

    """
    def reject(reason: str) -> flask.Response:
        """Build the REST response for a rejected request."""
        return _get_response(
            communication.Response(
                success=False,
                message=reason,
                command=communication.SCHEDULER_ADD_INTERVAL,
            ))

    query = flask.request.args
    start = query.get('start')
    end = query.get('end')
    cpu = query.get('cpu')

    if any(value is None for value in (start, end, cpu)):
        return reject('Please provide a start/end time and a cpu level.')

    times_valid = interval_pkg.TimeInterval.assert_valid(start_str=start,
                                                         end_str=end)
    if not times_valid:
        return reject('Invalid start/end times.')

    # The CPU level has to be an integer between 1 and 4 (inclusive).
    try:
        cpu_level = int(cpu)
    except ValueError:
        return reject('The CPU level must be 1, 2, 3 or 4.')
    if not 1 <= cpu_level <= 4:
        return reject('The CPU level must be 1, 2, 3 or 4.')

    new_interval = interval_pkg.TimeInterval(start_str=start,
                                             end_str=end,
                                             cpu_level=cpu_level)
    return _get_response(scheduler.add_interval(new_interval))
예제 #8
0
 def _clean_up(self) -> None:
     """Close the database connections and report the collected timings.

     After closing both connections a ``WORKER_FINISH`` response is put on
     the output queue. Its message is the tuple
     ``(exiftool time, hashing time, summed time of both connections)``.
     The method then waits on the "can exit" event before returning.
     """
     self.message('cleaning up before exiting.')
     self._db_connection_files.close()
     self._db_connection_metadata.close()
     exiftool_time = self._exiftool_time
     hashing_time = self._hashing_time
     database_time = (self._db_connection_files.get_time() +
                      self._db_connection_metadata.get_time())
     finished = communication.Response(
         success=True,
         message=(exiftool_time, hashing_time, database_time),
         command=communication.WORKER_FINISH)
     self._queue_output.put(finished)
     self._event_can_exit.wait()
예제 #9
0
def shutdown() -> flask.Response:
    """API endpoint to shut down the components and the Flask server.

    The manager, the database updater and the scheduler are shut down
    first. Afterwards the server is stopped through the
    ``werkzeug.server.shutdown`` callable found in the request environment.

    Returns:
        flask.Response: REST response; a failure response if the server
            shutdown callable is not available in the request environment.

    """
    manager.shutdown()
    db_updater.shutdown()
    scheduler.shutdown()
    shutdown_server = flask.request.environ.get('werkzeug.server.shutdown')
    if shutdown_server is None:
        # NOTE(review): newer Werkzeug releases no longer seem to provide
        # this hook - confirm against the pinned version.
        logging.critical('TWApi: Unable to shutdown Flask server!')
        # A view must not return None (Flask raises a TypeError for it),
        # so report the failure through a regular response instead.
        failure = communication.Response(
            success=False,
            message='Unable to shutdown Flask server!',
            command=communication.MANAGER_SHUTDOWN)
        return _get_response(failure)
    shutdown_server()
    response = communication.Response(success=True,
                                      message='Shutting down. Bye!',
                                      command=communication.MANAGER_SHUTDOWN)
    return _get_response(response)
예제 #10
0
def respond_schedule(schedule: dict) -> None:
    """Respond with the TreeWalk schedule.

    A response object is created and put on the scheduler output queue.
    If ``schedule`` is ``None`` the response reports a failure, otherwise
    the schedule itself is used as the response message.

    Args:
        schedule (dict): schedule

    """
    readable = schedule is not None
    reply = communication.Response(
        success=readable,
        message=schedule if readable else 'Unable to read schedule.',
        command=communication.SCHEDULER_GET_SCHEDULE)
    communication.scheduler_queue_output.put(reply)
예제 #11
0
def respond_intervals(intervals: dict) -> None:
    """Respond with the intervals.

    A response object is created and put on the scheduler output queue.
    If ``intervals`` is ``None`` the response reports a failure, otherwise
    the intervals themselves are used as the response message.

    Args:
        intervals (dict): intervals

    """
    if intervals is not None:
        succeeded, payload = True, intervals
    else:
        succeeded, payload = False, 'Unable to read intervals.'
    communication.scheduler_queue_output.put(
        communication.Response(
            success=succeeded,
            message=payload,
            command=communication.SCHEDULER_GET_INTERVALS))
예제 #12
0
def respond_interval_overlaps(identifier: str) -> None:
    """Respond that the interval overlaps with an already existing one.

    A failure response naming the interval is created and put on the
    scheduler output queue.

    Args:
        identifier (str): identifier of the interval

    """
    text = f'Interval with identifier {identifier} is overlapping.'
    communication.scheduler_queue_output.put(
        communication.Response(
            success=False,
            message=text,
            command=communication.SCHEDULER_ADD_INTERVAL))
예제 #13
0
def respond_config_already_present(identifier: str) -> None:
    """Respond that the config was not added because it already exists.

    A failure response naming the configuration is created and put on the
    scheduler output queue.

    Args:
        identifier (str): identifier of the configuration

    """
    text = (f'Configuration with identifier {identifier} '
            'is already present in the schedule, thus it was not added.')
    rejection = communication.Response(
        success=False,
        message=text,
        command=communication.SCHEDULER_ADD_CONFIG)
    communication.scheduler_queue_output.put(rejection)
예제 #14
0
    def info(self) -> communication.Response:
        """Return the current status.

        The response message is a dict with the keys ``status``, ``config``
        and ``processes``. While the state is not "ready" the config is
        reported via ``get_data(as_json=False)`` and a ``progress`` entry
        (formatted with two decimals) is added.

        Returns:
            communication.Response: current info

        """
        ready = self._is_ready()
        details = {
            'status': self._status,
            'config': (self._config if ready else
                       self._config.get_data(as_json=False)),
            'processes': self._running_workers,
        }
        if not ready:
            details['progress'] = f'{self._progress:.2f}'
        return communication.Response(success=True,
                                      message=details,
                                      command=communication.MANAGER_INFO)
예제 #15
0
    def _treewalk_start(self, config: Config) -> communication.Response:
        """Start the TreeWalk with given configuration.

        Starting is refused with a failure response while the TreeWalk is
        paused or running. Otherwise the state is set to preparing, the
        database record, worker count and work packages are prepared, one
        worker process per slot is created and started, and the manager's
        bookkeeping is updated before the state is switched to running.

        Args:
            config (Config): configuration of new TreeWalk

        Returns:
            communication.Response: response object

        """
        def prepare(config: Config) -> Tuple[int, int, list, list]:
            """Prepare the start of the TreeWalk.

            Initialize required data such as work packages or number of workers.

            Args:
                config (Config): config of the execution

            Returns:
                Tuple[int, int, list, list]:
                    (db_id, #workers, work_packages_single, work_packages_split)

            """
            # Prepare database
            self._db_connection = database.DatabaseConnection(
                db_info=self._connection_data, measure_time=self._measure_time)
            db_id = self._db_connection.insert_new_record_crawls(config)
            # Prepare number of workers
            requested_workers = treewalk.get_number_of_workers(
                config.get_cpu_level())
            number_of_workers = requested_workers
            max_cpu_level, max_num_workers = self._state.get_cpu_level(True)
            # NOTE(review): presumably a level <= 0 means that no interval
            # restriction is active - confirm against the state class.
            if max_cpu_level > 0:
                number_of_workers = min(max_num_workers, requested_workers)
                # Only report a reduction when the limit actually lowered
                # the requested number of workers.
                if number_of_workers < requested_workers:
                    logging.info(f'Reduced the number of workers by '
                                 f'{requested_workers - number_of_workers} '
                                 f'due to interval restriction.')
            # Prepare work packages
            work_packages, split = treewalk.create_work_packages(
                inputs=config.get_directories(),
                work_package_size=config.get_package_size(),
                number_of_workers=number_of_workers,
                already_processed=[])
            return (db_id, number_of_workers, work_packages, split)

        # Check if it is ok to run (preparing doesn't have to be checked
        # since it cannot be interrupted)
        if self._state.is_paused():
            return communication.Response(
                success=False,
                message='Attempted to start when TreeWalk was paused.',
                command=communication.MANAGER_START)
        if self._state.is_running():
            logging.info('TWManager: attempted to start when TW is running.')
            return communication.Response(
                success=False,
                message='Attempted to start when TreeWalk is running.',
                command=communication.MANAGER_START)

        self._state.set_preparing(config)
        # Prepare the data
        data = prepare(config)
        tree_walk_id, num_workers, work_packages, work_packages_split = data
        # Create the worker processes (each with its own input/output queue)
        # and start them once all of them have been created
        for _ in range(num_workers):
            queue_input = multiprocessing.Queue()
            queue_output = multiprocessing.Queue()
            worker = Worker(
                queue_input=queue_input,
                queue_output=queue_output,
                config=config,
                connection_data=self._connection_data,
                tree_walk_id=tree_walk_id,
                lock=self._worker_lock,
                counter=self._worker_counter,
                finished=self._workers_finished,
                num_workers=self._num_workers,
                measure_time=self._measure_time,
                event_can_exit=self._workers_can_exit,
                debug=environment.env.CRAWLER_LOGGING_LEVEL == 'DEBUG')
            worker_control = WorkerControl(
                worker=worker,
                queue_input=queue_input,
                queue_output=queue_output,
                event_finished=self._workers_can_exit)
            self._workers.append(worker_control)
        for worker_control in self._workers:
            worker_control.me.start()
        # Update the manager
        self._config = config
        self._roots = config.get_directories()
        self._num_workers.value = num_workers
        self._work_packages = work_packages
        self._work_packages_split = work_packages_split
        self._total = self._get_number_of_work_packages()
        self._tree_walk_id = tree_walk_id
        self._state.set_running(config)
        self._state.set_running_workers(self._num_workers.value)
        self._time_start = datetime.now()
        return communication.Response(success=True,
                                      message=communication.MANAGER_OK,
                                      command=communication.MANAGER_START)