Example #1
    def __init__(
        self,
        base_dir: str,
        structured_dir: bool = False,
        read_filter: Optional[io.ReadFilter] = None,
        debug: bool = False,
        pool: Optional[multiprocessing.pool.Pool] = None,
    ):
        """
        Initialize the ApiReader object

        :param base_dir: directory containing the files to read
        :param structured_dir: if True, base_dir contains a specific directory structure used by the respective
                                api formats.  If False, base_dir only has the data files.  Default False.
        :param read_filter: ReadFilter for the data files, if None, get everything.  Default None
        :param debug: if True, output program warnings/errors during function execution.  Default False.
        :param pool: optional multiprocessing pool to use; if None, one is created for this call and closed before returning.  Default None
        """
        _pool: multiprocessing.pool.Pool = (
            multiprocessing.Pool() if pool is None else pool
        )

        if read_filter:
            self.filter = read_filter
            if self.filter.station_ids:
                self.filter.station_ids = set(self.filter.station_ids)
        else:
            self.filter = io.ReadFilter()
        self.base_dir = base_dir
        self.structured_dir = structured_dir
        self.debug = debug
        self.errors = RedVoxExceptions("APIReader")
        self.files_index = self._get_all_files(_pool)
        self.index_summary = io.IndexSummary.from_index(
            self._flatten_files_index())
        mem_split_factor = 1
        if len(self.files_index) > 0:
            if settings.is_parallelism_enabled():
                mem_split_factor = len(self.files_index)
            self.chunk_limit = (
                psutil.virtual_memory().available * PERCENT_FREE_MEM_USE / mem_split_factor
            )
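            # illustrative numbers only: with 8 GB of free memory and a PERCENT_FREE_MEM_USE of 0.25,
            # four stations would each get a chunk_limit of roughly 0.5 GB for loading files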
            max_file_size = max(
                fe.file_size_bytes for fi in self.files_index for fe in fi.entries
            )
            if max_file_size > self.chunk_limit:
                raise MemoryError(
                    f"System requires {max_file_size} bytes of memory to process a file but only has "
                    f"{self.chunk_limit} available.  Please free or add more RAM."
                )
            if debug:
                print(
                    f"{mem_split_factor} stations each have {int(self.chunk_limit)} bytes for loading files in "
                    f"memory.")
        else:
            self.chunk_limit = 0

        if debug:
            self.errors.print()

        if pool is None:
            _pool.close()
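A minimal usage sketch for the constructor above; the directory path and station id are hypothetical, and only names that appear in these examples (ApiReader, io.ReadFilter, with_station_ids, index_summary) are used:

reader = ApiReader(
    base_dir="/data/redvox",        # hypothetical input directory
    structured_dir=True,
    read_filter=io.ReadFilter().with_station_ids({"1637610021"}),  # hypothetical station id
    debug=True,
)
print(reader.index_summary)         # summary of the files that matched the filter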
Example #2
    def __init__(
        self,
        base_dir: str,
        structured_dir: bool = False,
        read_filter: Optional[io.ReadFilter] = None,
        debug: bool = False,
        pool: Optional[multiprocessing.pool.Pool] = None,
    ):
        """
        Initialize the ApiReader object
        :param base_dir: directory containing the files to read
        :param structured_dir: if True, base_dir contains a specific directory structure used by the respective
                                api formats.  If False, base_dir only has the data files.  Default False.
        :param read_filter: ReadFilter for the data files, if None, get everything.  Default None
        :param debug: if True, output additional statements during function execution.  Default False.
        :param pool: optional multiprocessing pool to use; if None, one is created for this call and closed before returning.  Default None
        """
        _pool: multiprocessing.pool.Pool = (
            multiprocessing.Pool() if pool is None else pool
        )

        if read_filter:
            self.filter = read_filter
            if self.filter.station_ids:
                self.filter.station_ids = set(self.filter.station_ids)
        else:
            self.filter = io.ReadFilter()
        self.base_dir = base_dir
        self.structured_dir = structured_dir
        self.debug = debug
        self.files_index = self._get_all_files(_pool)
        self.index_summary = io.IndexSummary.from_index(self.files_index)

        if pool is None:
            _pool.close()
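Because the constructor only closes a pool it created itself (the `if pool is None` check above), a caller can share one pool across several readers and let the `with` block close it; a sketch with hypothetical directories:

import multiprocessing

with multiprocessing.Pool() as shared_pool:
    readers = [
        ApiReader(base_dir=d, structured_dir=True, pool=shared_pool)   # reader leaves the shared pool open
        for d in ("/data/redvox/day1", "/data/redvox/day2")            # hypothetical paths
    ]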
Example #3
def extract_stats_serial(index: io.Index) -> List[StationStat]:
    """
    Extracts StationStat information from packets stored in the provided index.

    :param index: Index of packets to extract information from.
    :return: A list of StationStat objects.
    """
    # noinspection Mypy
    stats_900: Iterator[StationStat] = map(
        StationStat.from_api_900,
        index.stream(io.ReadFilter(api_versions={io.ApiVersion.API_900})),
    )
    # noinspection Mypy
    stats_1000: Iterator[StationStat] = map(
        StationStat.from_api_1000,
        index.stream(io.ReadFilter(api_versions={io.ApiVersion.API_1000})),
    )
    return list(stats_900) + list(stats_1000)
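A usage sketch that feeds this helper the per-station indexes from Example #1; it assumes `files_index` holds `io.Index` objects, as the entries loop in Example #1 suggests, and `reader` is the hypothetical object from the sketch after Example #1:

all_stats: List[StationStat] = []
for station_index in reader.files_index:           # one io.Index per station group
    all_stats.extend(extract_stats_serial(station_index))

for s in all_stats:
    print(s.app_start_dt)                          # the field Example #4 groups packets by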
Example #4
    def _check_station_stats(
        self,
        station_index: io.Index,
        pool: Optional[multiprocessing.pool.Pool] = None,
    ) -> List[io.Index]:
        """
        check the index's results; if it has enough information, return it, otherwise search for more data.
        The index should only request one station id.
        If the station was restarted during the request period, a new group of indexes will be created
        to represent the change in station metadata.

        :param station_index: index representing the requested information
        :param pool: optional multiprocessing pool to use; if None, one is created for this call and closed before returning
        :return: List of Indexes that includes as much information as possible that fits the request
        """
        _pool: multiprocessing.pool.Pool = (
            multiprocessing.Pool() if pool is None else pool
        )
        # if we found nothing, return the index
        if len(station_index.entries) < 1:
            return [station_index]

        stats = fs.extract_stats(station_index, pool=_pool)
        # Close pool if created here
        if pool is None:
            _pool.close()

        timing_offsets: Optional[offset_model.TimingOffsets] = (
            offset_model.compute_offsets(stats)
        )

        # punt if duration or other important values are invalid or if the latency array was empty
        if timing_offsets is None:
            return [station_index]

        diff_s = diff_e = timedelta(seconds=0)

        # if our filtered files do not encompass the request even when the packet times are updated
        # try getting 1.5 times the difference of the expected start/end and the start/end of the data
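        # illustrative numbers: if the request starts at 10:00:00 but the corrected data only starts
        # at 10:00:20, the query below reaches a further 1.5 * 20 = 30 seconds before the original
        # (buffered) start to look for packets that may cover the gap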
        insufficient_str = ""
        if self.filter.start_dt and timing_offsets.adjusted_start > self.filter.start_dt:
            insufficient_str += f" {self.filter.start_dt} (start)"
            # diff_s = self.filter.start_dt_buf + 1.5 * (timing_offsets.adjusted_start - self.filter.start_dt)
            new_end = self.filter.start_dt - self.filter.start_dt_buf
            new_start = new_end - 1.5 * (timing_offsets.adjusted_start -
                                         self.filter.start_dt)
            new_index = self._apply_filter(
                io.ReadFilter()
                .with_start_dt(new_start)
                .with_end_dt(new_end)
                .with_extensions(self.filter.extensions)
                .with_api_versions(self.filter.api_versions)
                .with_station_ids(set(station_index.summarize().station_ids()))
                .with_start_dt_buf(diff_s)
                .with_end_dt_buf(diff_e)
            )
            if len(new_index.entries) > 0:
                station_index.append(new_index.entries)
                stats.extend(fs.extract_stats(new_index))
        if self.filter.end_dt and timing_offsets.adjusted_end < self.filter.end_dt:
            insufficient_str += f" {self.filter.end_dt} (end)"
            # diff_e = self.filter.end_dt_buf + 1.5 * (self.filter.end_dt - timing_offsets.adjusted_end)
            new_start = self.filter.end_dt + self.filter.end_dt_buf
            new_end = new_start + 1.5 * (self.filter.end_dt -
                                         timing_offsets.adjusted_end)
            new_index = self._apply_filter(
                io.ReadFilter()
                .with_start_dt(new_start)
                .with_end_dt(new_end)
                .with_extensions(self.filter.extensions)
                .with_api_versions(self.filter.api_versions)
                .with_station_ids(set(station_index.summarize().station_ids()))
                .with_start_dt_buf(diff_s)
                .with_end_dt_buf(diff_e)
            )
            if len(new_index.entries) > 0:
                station_index.append(new_index.entries)
                stats.extend(fs.extract_stats(new_index))
        if len(insufficient_str) > 0:
            self.errors.append(
                f"Data for {station_index.summarize().station_ids()} exists, "
                f"but not at:{insufficient_str}")

        # group entries by the station's app start time: a change in app start time means the
        # station restarted, so each group becomes its own Index
        results = {}
        keys = []

        for v, e in enumerate(stats):
            key = e.app_start_dt
            if key not in keys:
                keys.append(key)
                results[key] = io.Index()

            results[key].append(entries=[station_index.entries[v]])

        return list(results.values())
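The grouping at the end of this method can be hard to follow; here is a simplified, self-contained sketch of the same idea (group items by a key in first-seen order), using plain Python types instead of the redvox classes:

from typing import Dict, List, Tuple

def group_by_app_start(entries: List[Tuple[str, float]]) -> List[List[str]]:
    """Group entry names by app start time; each station restart opens a new group."""
    groups: Dict[float, List[str]] = {}
    for name, app_start in entries:
        groups.setdefault(app_start, []).append(name)
    return list(groups.values())   # dicts keep insertion order, so groups stay in first-seen order

# illustrative data: the station restarted once, giving a new app start time of 200.0
packets = [("p1", 100.0), ("p2", 100.0), ("p3", 200.0)]
assert group_by_app_start(packets) == [["p1", "p2"], ["p3"]]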
Example #5
    def create_data_window(self, pool: Optional[multiprocessing.pool.Pool] = None):
        """
        updates the DataWindow to contain only the data within the window parameters.
        Stations without audio or any data outside the window are removed.

        :param pool: optional multiprocessing pool to use; if None, one is created for this call and closed before returning
        """
        # Let's create and manage a single pool of workers that we can utilize throughout
        # the instantiation of the data window.
        _pool: multiprocessing.pool.Pool = multiprocessing.Pool() if pool is None else pool

        r_f = io.ReadFilter()
        if self._config.start_datetime:
            r_f.with_start_dt(self._config.start_datetime)
        if self._config.end_datetime:
            r_f.with_end_dt(self._config.end_datetime)
        if self._config.station_ids:
            r_f.with_station_ids(self._config.station_ids)
        if self._config.extensions:
            r_f.with_extensions(self._config.extensions)
        else:
            self._config.extensions = r_f.extensions
        if self._config.api_versions:
            r_f.with_api_versions(self._config.api_versions)
        else:
            self._config.api_versions = r_f.api_versions
        r_f.with_start_dt_buf(self._config.start_buffer_td)
        r_f.with_end_dt_buf(self._config.end_buffer_td)

        if self.debug:
            print("Reading files from disk.  This may take a few minutes to complete.")

        # get the data to convert into a window
        a_r = ApiReaderDw(self._config.input_dir, self._config.structured_layout, r_f,
                          correct_timestamps=self._config.apply_correction,
                          use_model_correction=self._config.use_model_correction,
                          dw_base_dir=self.save_dir(),
                          dw_save_mode=self._fs_writer.save_mode(),
                          debug=self.debug, pool=_pool)

        self._errors.extend_error(a_r.errors)

        if self._fs_writer.is_use_mem() and a_r.dw_save_mode != self._fs_writer.save_mode():
            if self.debug:
                print("Estimated size of files exceeds available memory.")
                print("Automatically using temporary directory to store data.")
            self._fs_writer.set_use_temp(True)

        # Parallel update
        # Apply timing correction in parallel by station
        sts = a_r.get_stations()
        if self.debug:
            print("num stations loaded: ", len(sts))
        # if self._config.apply_correction:
            # for st in maybe_parallel_map(_pool, Station.update_timestamps,
            #                              iter(sts), chunk_size=1):
            #     self._add_sensor_to_window(st)
            #     if self.debug:
            #         print("station processed: ", st.id())
        for st in maybe_parallel_map(_pool, Station.update_timestamps, iter(sts), chunk_size=1):
            self.create_window_in_sensors(st, self._config.start_datetime, self._config.end_datetime)
            if self.debug:
                print("station processed: ", st.id())

        # check for stations without data
        self._check_for_audio()
        self._check_valid_ids()

        # update the default data window name if we have data and the default name exists
        if self.event_name == "dw" and len(self._stations) > 0:
            self.event_name = f"dw_{int(self.start_date())}_{len(self._stations)}"

        # must update the start and end in order for the data to be saved
        # update remaining data window values if they're still default
        if not self._config.start_datetime and len(self._stations) > 0:
            self._config.start_datetime = dtu.datetime_from_epoch_microseconds_utc(
                np.min([t.first_data_timestamp() for t in self._stations]))
        # end_datetime is non-inclusive, so it must be greater than our latest timestamp
        if not self._config.end_datetime and len(self._stations) > 0:
            self._config.end_datetime = dtu.datetime_from_epoch_microseconds_utc(
                np.max([t.last_data_timestamp() for t in self._stations]) + 1)

        # If the pool was created by this function, then it needs to be managed by this function.
        if pool is None:
            _pool.close()
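The filter set up at the top of this example can also be written as one builder chain; a sketch with hypothetical dates, buffers, and a station id, using only ReadFilter methods that appear in these examples:

from datetime import datetime, timedelta

r_f = (
    io.ReadFilter()
    .with_start_dt(datetime(2021, 1, 1, 0, 0))     # hypothetical request window
    .with_end_dt(datetime(2021, 1, 1, 1, 0))
    .with_station_ids({"1637610021"})              # hypothetical station id
    .with_start_dt_buf(timedelta(seconds=120))     # hypothetical padding before the window
    .with_end_dt_buf(timedelta(seconds=120))       # hypothetical padding after the window
)
reader = ApiReader("/data/redvox", structured_dir=True, read_filter=r_f)   # hypothetical directory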