def __init__(
    self,
    base_dir: str,
    structured_dir: bool = False,
    read_filter: io.ReadFilter = None,
    debug: bool = False,
    pool: Optional[multiprocessing.pool.Pool] = None,
):
    """
    Initialize the ApiReader object

    :param base_dir: directory containing the files to read
    :param structured_dir: if True, base_dir contains a specific directory structure used by the
                            respective api formats.  If False, base_dir only has the data files.
                            Default False.
    :param read_filter: ReadFilter for the data files; if None, get everything.  Default None
    :param debug: if True, output program warnings/errors during function execution.  Default False.
    :param pool: optional multiprocessing pool to use; if None, one is created here and closed
                  before returning.  Default None
    """
    _pool: multiprocessing.pool.Pool = multiprocessing.Pool() if pool is None else pool

    if read_filter:
        self.filter = read_filter
        if self.filter.station_ids:
            self.filter.station_ids = set(self.filter.station_ids)
    else:
        self.filter = io.ReadFilter()
    self.base_dir = base_dir
    self.structured_dir = structured_dir
    self.debug = debug
    self.errors = RedVoxExceptions("APIReader")
    self.files_index = self._get_all_files(_pool)
    self.index_summary = io.IndexSummary.from_index(self._flatten_files_index())
    mem_split_factor = 1
    if len(self.files_index) > 0:
        if settings.is_parallelism_enabled():
            mem_split_factor = len(self.files_index)
        self.chunk_limit = psutil.virtual_memory().available * PERCENT_FREE_MEM_USE / mem_split_factor
        max_file_size = max([fe.file_size_bytes for fi in self.files_index for fe in fi.entries])
        if max_file_size > self.chunk_limit:
            raise MemoryError(
                f"System requires {max_file_size} bytes of memory to process a file but only has "
                f"{self.chunk_limit} available.  Please free or add more RAM."
            )
        if debug:
            print(
                f"{mem_split_factor} stations each have {int(self.chunk_limit)} bytes for loading files in "
                f"memory."
            )
    else:
        self.chunk_limit = 0
    if debug:
        self.errors.print()

    if pool is None:
        _pool.close()
def __init__(
    self,
    base_dir: str,
    structured_dir: bool = False,
    read_filter: io.ReadFilter = None,
    debug: bool = False,
    pool: Optional[multiprocessing.pool.Pool] = None,
):
    """
    Initialize the ApiReader object

    :param base_dir: directory containing the files to read
    :param structured_dir: if True, base_dir contains a specific directory structure used by the
                            respective api formats.  If False, base_dir only has the data files.
                            Default False.
    :param read_filter: ReadFilter for the data files; if None, get everything.  Default None
    :param debug: if True, output additional statements during function execution.  Default False.
    """
    _pool: multiprocessing.pool.Pool = (
        multiprocessing.Pool() if pool is None else pool
    )

    if read_filter:
        self.filter = read_filter
        if self.filter.station_ids:
            self.filter.station_ids = set(self.filter.station_ids)
    else:
        self.filter = io.ReadFilter()
    self.base_dir = base_dir
    self.structured_dir = structured_dir
    self.debug = debug
    self.files_index = self._get_all_files(_pool)
    self.index_summary = io.IndexSummary.from_index(self.files_index)

    if pool is None:
        _pool.close()
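# --- Usage sketch (not part of the module above) ----------------------------------
# A minimal example of constructing an ApiReader with an explicit ReadFilter and a
# caller-managed pool.  The import paths and the station id "0000000001" are
# assumptions for illustration only; adjust them to your installation and data set.
import multiprocessing

import redvox.common.io as io
from redvox.common.api_reader import ApiReader


if __name__ == "__main__":
    read_filter = io.ReadFilter().with_station_ids({"0000000001"})
    with multiprocessing.Pool() as pool:
        reader = ApiReader(
            base_dir="/path/to/data",   # directory holding the RedVox data files
            structured_dir=False,
            read_filter=read_filter,
            debug=True,
            pool=pool,                  # a caller-supplied pool is not closed by ApiReader
        )
        # files_index and index_summary are populated during construction (see __init__ above)
        print(reader.index_summary)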
def extract_stats_serial(index: io.Index) -> List[StationStat]:
    """
    Extracts StationStat information from packets stored in the provided index.

    :param index: Index of packets to extract information from.
    :return: A list of StationStat objects.
    """
    # noinspection Mypy
    stats_900: Iterator[StationStat] = map(
        StationStat.from_api_900,
        index.stream(io.ReadFilter(api_versions={io.ApiVersion.API_900})),
    )
    # noinspection Mypy
    stats_1000: Iterator[StationStat] = map(
        StationStat.from_api_1000,
        index.stream(io.ReadFilter(api_versions={io.ApiVersion.API_1000})),
    )
    return list(stats_900) + list(stats_1000)
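# --- Usage sketch (not part of the module above) ----------------------------------
# extract_stats_serial pulls one StationStat per packet, for both API 900 and API 1000
# data.  The helper below collects the distinct app start times it reports, which is
# the same signal _check_station_stats uses to detect station restarts.  The import
# path for StationStat/extract_stats_serial is an assumption for illustration.
from datetime import datetime
from typing import List

import redvox.common.io as io
from redvox.common.file_statistics import StationStat, extract_stats_serial


def distinct_app_starts(index: io.Index) -> List[datetime]:
    """Return the sorted, distinct app start times seen across the packets in the index."""
    stats: List[StationStat] = extract_stats_serial(index)
    # skip packets without a reported app start time
    return sorted({s.app_start_dt for s in stats if s.app_start_dt is not None})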
def _check_station_stats(
    self,
    station_index: io.Index,
    pool: Optional[multiprocessing.pool.Pool] = None,
) -> List[io.Index]:
    """
    Check the index's results; if it has enough information, return it, otherwise search for more data.
    The index should only request one station id.
    If the station was restarted during the request period, a new group of indexes will be created
    to represent the change in station metadata.

    :param station_index: index representing the requested information
    :param pool: optional multiprocessing pool to use; if None, one is created here and closed
                  after the stats are extracted.
    :return: List of Indexes that includes as much information as possible that fits the request
    """
    _pool: multiprocessing.pool.Pool = multiprocessing.Pool() if pool is None else pool

    # if we found nothing, return the index
    if len(station_index.entries) < 1:
        return [station_index]

    stats = fs.extract_stats(station_index, pool=_pool)

    # Close pool if created here
    if pool is None:
        _pool.close()

    timing_offsets: Optional[offset_model.TimingOffsets] = offset_model.compute_offsets(stats)

    # punt if duration or other important values are invalid or if the latency array was empty
    if timing_offsets is None:
        return [station_index]

    diff_s = diff_e = timedelta(seconds=0)

    # if our filtered files do not encompass the request even when the packet times are updated,
    # try getting 1.5 times the difference of the expected start/end and the start/end of the data
    insufficient_str = ""
    if self.filter.start_dt and timing_offsets.adjusted_start > self.filter.start_dt:
        insufficient_str += f" {self.filter.start_dt} (start)"
        # diff_s = self.filter.start_dt_buf + 1.5 * (timing_offsets.adjusted_start - self.filter.start_dt)
        new_end = self.filter.start_dt - self.filter.start_dt_buf
        new_start = new_end - 1.5 * (timing_offsets.adjusted_start - self.filter.start_dt)
        new_index = self._apply_filter(
            io.ReadFilter()
            .with_start_dt(new_start)
            .with_end_dt(new_end)
            .with_extensions(self.filter.extensions)
            .with_api_versions(self.filter.api_versions)
            .with_station_ids(set(station_index.summarize().station_ids()))
            .with_start_dt_buf(diff_s)
            .with_end_dt_buf(diff_e)
        )
        if len(new_index.entries) > 0:
            station_index.append(new_index.entries)
            stats.extend(fs.extract_stats(new_index))
    if self.filter.end_dt and timing_offsets.adjusted_end < self.filter.end_dt:
        insufficient_str += f" {self.filter.end_dt} (end)"
        # diff_e = self.filter.end_dt_buf + 1.5 * (self.filter.end_dt - timing_offsets.adjusted_end)
        new_start = self.filter.end_dt + self.filter.end_dt_buf
        new_end = new_start + 1.5 * (self.filter.end_dt - timing_offsets.adjusted_end)
        new_index = self._apply_filter(
            io.ReadFilter()
            .with_start_dt(new_start)
            .with_end_dt(new_end)
            .with_extensions(self.filter.extensions)
            .with_api_versions(self.filter.api_versions)
            .with_station_ids(set(station_index.summarize().station_ids()))
            .with_start_dt_buf(diff_s)
            .with_end_dt_buf(diff_e)
        )
        if len(new_index.entries) > 0:
            station_index.append(new_index.entries)
            stats.extend(fs.extract_stats(new_index))
    if len(insufficient_str) > 0:
        self.errors.append(
            f"Data for {station_index.summarize().station_ids()} exists, "
            f"but not at:{insufficient_str}"
        )

    # group packets by app start time; each distinct start time marks a station restart
    results = {}
    keys = []
    for v, e in enumerate(stats):
        key = e.app_start_dt
        if key not in keys:
            keys.append(key)
            results[key] = io.Index()
        results[key].append(entries=[station_index.entries[v]])
    return list(results.values())
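# --- Worked example (not part of the module above) ---------------------------------
# The backfill in _check_station_stats extends the search window by 1.5 times the gap
# between the requested start/end and the corrected data start/end.  The made-up
# timestamps below only illustrate that arithmetic for the "start" case.
from datetime import datetime, timedelta

request_start = datetime(2021, 1, 1, 10, 0, 0)       # self.filter.start_dt
start_buffer = timedelta(minutes=2)                  # self.filter.start_dt_buf
adjusted_start = datetime(2021, 1, 1, 10, 0, 30)     # timing_offsets.adjusted_start

shortfall = adjusted_start - request_start           # 30 s of data missing at the front
new_end = request_start - start_buffer               # 09:58:00, where the buffered request began
new_start = new_end - 1.5 * shortfall                # 09:57:15, i.e. 45 s further back

print(new_start, new_end)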
def create_data_window(self, pool: Optional[multiprocessing.pool.Pool] = None):
    """
    Updates the DataWindow to contain only the data within the window parameters.
    Stations without audio or any data outside the window are removed.
    """
    # Create and manage a single pool of workers that we can utilize throughout
    # the instantiation of the data window.
    _pool: multiprocessing.pool.Pool = multiprocessing.Pool() if pool is None else pool

    r_f = io.ReadFilter()
    if self._config.start_datetime:
        r_f.with_start_dt(self._config.start_datetime)
    if self._config.end_datetime:
        r_f.with_end_dt(self._config.end_datetime)
    if self._config.station_ids:
        r_f.with_station_ids(self._config.station_ids)
    if self._config.extensions:
        r_f.with_extensions(self._config.extensions)
    else:
        self._config.extensions = r_f.extensions
    if self._config.api_versions:
        r_f.with_api_versions(self._config.api_versions)
    else:
        self._config.api_versions = r_f.api_versions
    r_f.with_start_dt_buf(self._config.start_buffer_td)
    r_f.with_end_dt_buf(self._config.end_buffer_td)

    if self.debug:
        print("Reading files from disk.  This may take a few minutes to complete.")

    # get the data to convert into a window
    a_r = ApiReaderDw(
        self._config.input_dir,
        self._config.structured_layout,
        r_f,
        correct_timestamps=self._config.apply_correction,
        use_model_correction=self._config.use_model_correction,
        dw_base_dir=self.save_dir(),
        dw_save_mode=self._fs_writer.save_mode(),
        debug=self.debug,
        pool=_pool,
    )

    self._errors.extend_error(a_r.errors)

    if self._fs_writer.is_use_mem() and a_r.dw_save_mode != self._fs_writer.save_mode():
        if self.debug:
            print("Estimated size of files exceeds available memory.")
            print("Automatically using temporary directory to store data.")
        self._fs_writer.set_use_temp(True)

    # Parallel update
    # Apply timing correction in parallel by station
    sts = a_r.get_stations()

    if self.debug:
        print("num stations loaded: ", len(sts))

    # if self._config.apply_correction:
    #     for st in maybe_parallel_map(_pool, Station.update_timestamps,
    #                                  iter(sts), chunk_size=1):
    #         self._add_sensor_to_window(st)
    #         if self.debug:
    #             print("station processed: ", st.id())
    for st in maybe_parallel_map(_pool, Station.update_timestamps, iter(sts), chunk_size=1):
        self.create_window_in_sensors(st, self._config.start_datetime, self._config.end_datetime)
        if self.debug:
            print("station processed: ", st.id())

    # check for stations without data
    self._check_for_audio()
    self._check_valid_ids()

    # update the default data window name if we have data and the default name exists
    if self.event_name == "dw" and len(self._stations) > 0:
        self.event_name = f"dw_{int(self.start_date())}_{len(self._stations)}"

    # must update the start and end in order for the data to be saved
    # update remaining data window values if they're still default
    if not self._config.start_datetime and len(self._stations) > 0:
        self._config.start_datetime = dtu.datetime_from_epoch_microseconds_utc(
            np.min([t.first_data_timestamp() for t in self._stations])
        )
    # end_datetime is non-inclusive, so it must be greater than our latest timestamp
    if not self._config.end_datetime and len(self._stations) > 0:
        self._config.end_datetime = dtu.datetime_from_epoch_microseconds_utc(
            np.max([t.last_data_timestamp() for t in self._stations]) + 1
        )

    # If the pool was created by this function, then it needs to be managed (closed) by this function.
    if pool is None:
        _pool.close()
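# --- Usage sketch (not part of the module above) ----------------------------------
# create_data_window only closes a pool it created itself, so a caller can share one
# worker pool across several windows.  'windows' below is assumed to be an iterable of
# already-configured objects exposing the create_data_window method defined above.
import multiprocessing
from typing import Iterable


def build_windows(windows: Iterable) -> None:
    """Build several data windows while reusing a single worker pool."""
    with multiprocessing.Pool() as pool:
        for dw in windows:
            dw.create_data_window(pool=pool)  # the shared pool stays open for the next window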