def validate_station_key_list(data_packets: List[api_m.RedvoxPacketM],
                              errors: RedVoxExceptions) -> bool:
    """
    Checks the station key fields of the data packets for consistency.  Returns False if any field
    contains more than one unique value and records each discrepancy in the errors object.

    :param data_packets: list of RedvoxPacketM to check
    :param errors: RedVoxExceptions detailing errors found while validating
    :return: True if no discrepancies found.  False otherwise
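
    Example (a minimal sketch; assumes packets is a list of RedvoxPacketM read elsewhere):
        errs = RedVoxExceptions("StationKeyValidation")
        if not validate_station_key_list(packets, errs):
            errs.print()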
    """
    my_errors = RedVoxExceptions("StationKeyValidation")
    if len(data_packets) < 2:
        return True
    j: np.ndarray = np.transpose([[
        t.station_information.id,
        t.station_information.uuid,
        t.timing_information.app_start_mach_timestamp,
        t.api,
        t.sub_api,
        t.station_information.make,
        t.station_information.model,
        t.station_information.os,
        t.station_information.os_version,
        t.station_information.app_version,
        t.station_information.is_private,
        len(t.sensors.audio.samples.values) / t.sensors.audio.sample_rate,
    ] for t in data_packets])

    k: Dict[str, np.ndarray] = {
        "ids": j[0],
        "uuids": j[1],
        "station_start_times": j[2],
        "apis": j[3],
        "sub_apis": j[4],
        "makes": j[5],
        "models": j[6],
        "os": j[7],
        "os_versions": j[8],
        "app_versions": j[9],
        "privates": j[10],
        "durations": j[11],
    }

    for key, value in k.items():
        result = np.unique(value)
        if len(result) > 1:
            my_errors.append(
                f"WARNING: {data_packets[0].station_information.id} "
                f"{key} contains multiple unique values: {result}.\n"
                "Please update your query to focus on one of these values.")

    if my_errors.get_num_errors() > 0:
        errors.extend_error(my_errors)
        return False

    return True  # if here, everything is consistent
class DataWindow:
    """
    Holds the data for a given time window; adds interpolated timestamps to fill gaps and pad start and end values

    Properties:
        event_name: str, name of the DataWindow.  defaults to "dw"

        event_origin: Optional EventOrigin which describes the physical location and radius of the
        origin event.  Default empty EventOrigin (no valid data)

        config: optional DataWindowConfig with information on how to construct DataWindow from
        Redvox (.rdvx*) files.  Default None

        sdk_version: str, the version of the Redvox SDK used to create the DataWindow

        debug: bool, if True, outputs additional information during initialization. Default False

    Protected:
        _fs_writer: DataWindowFileSystemWriter; includes event_name, output directory (Default "."),
        output type (options: "PARQUET", "LZ4", "NONE".  Default NONE), and option to make a
        runme.py example file (Default False)

        _stations: List of Stations that belong to the DataWindow

        _errors: RedVoxExceptions; contains a list of all errors encountered by the DataWindow
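
    Example (a minimal sketch; the input path is hypothetical, and the DataWindowConfig arguments
    follow the positional order used in from_config() below):
        config = DataWindowConfig("/path/to/redvox_data", True)
        dw = DataWindow(event_name="example_event", config=config)
        for station in dw.stations():
            print(station.id(), dw.start_date(), dw.end_date())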
    """
    def __init__(
            self,
            event_name: str = "dw",
            event_origin: Optional[EventOrigin] = None,
            config: Optional[DataWindowConfig] = None,
            output_dir: str = ".",
            out_type: str = "NONE",
            make_runme: bool = False,
            debug: bool = False,
    ):
        """
        Initialize the DataWindow

        :param event_name: name of the DataWindow.  defaults to "dw"
        :param event_origin: Optional EventOrigin which describes the physical location and radius of the
                                origin event.  Default empty EventOrigin (no valid data)
        :param config: Optional DataWindowConfig which describes how to extract data from Redvox files.
                        Default None
        :param output_dir: output directory for saving files.  Default "." (current directory)
        :param out_type: type of file to save the DataWindow as.  Options: "PARQUET", "LZ4", "NONE".
                            Default "NONE" (no saving)
        :param make_runme: if True, saves an example runme.py file with the data.  Default False
        :param debug: if True, outputs additional information during initialization.  Default False
        """
        self.event_name: str = event_name
        self.event_origin: EventOrigin = event_origin if event_origin else EventOrigin()
        self._fs_writer = dw_io.DataWindowFileSystemWriter(self.event_name, out_type, output_dir, make_runme)
        self.debug: bool = debug
        self._sdk_version: str = redvox.VERSION
        self._errors = RedVoxExceptions("DataWindow")
        self._stations: List[Station] = []
        self._config = config
        if config:
            if config.start_datetime and config.end_datetime and (config.end_datetime <= config.start_datetime):
                self._errors.append("DataWindow will not work when end datetime is before or equal to start datetime.\n"
                                    f"Your times: {config.end_datetime} <= {config.start_datetime}")
            else:
                self.create_data_window()
        if self.debug:
            self.print_errors()

    # def __repr__(self):
    #     # todo: use representations for the datetime and timedelta objects
    #     # todo: use the dictionary function
    #     return dw_io.dict_to_json({
    #         "event_name": self.event_name,
    #         "event_origin": repr(self.event_origin),
    #         "config": repr(self._config),
    #         "base_dir": self.save_dir(),
    #         "out_type": self._fs_writer.file_extension,
    #         "make_runme": self._fs_writer.make_run_me,
    #         "sdk_version": self._sdk_version,
    #         "errors": repr(self._errors),
    #         "debug": self.debug
    #     })
    #
    # def __str__(self):
    #     # todo: use representations for the datetime and timedelta objects
    #     # todo: use the dictionary function
    #     return dw_io.dict_to_json(
    #         {"event_name": self.event_name,
    #          "event_origin": str(self.event_origin),
    #          "config": str(self._config),
    #          "base_dir": self.save_dir(),
    #          "stations": [s.default_station_json_file_name() for s in self._stations],
    #          "out_type": self._fs_writer.file_extension,
    #          "make_runme": self._fs_writer.make_run_me,
    #          "sdk_version": self._sdk_version,
    #          "errors": str(self._errors),
    #          "debug": self.debug
    #          })

    def save_dir(self) -> str:
        """
        :return: directory data is saved to (empty string means saving to memory)
        """
        return self._fs_writer.save_dir()

    def set_save_dir(self, new_save_dir: Optional[str] = "."):
        """
        :param new_save_dir: directory to save data to; default current directory, or "."
        """
        self._fs_writer.base_dir = new_save_dir

    def is_make_runme(self) -> bool:
        """
        :return: if DataWindow will be saved with a runme file
        """
        return self._fs_writer.make_run_me

    def set_make_runme(self, make_runme: bool = False):
        """
        :param make_runme: if True, DataWindow will create a runme file when saved.  Default False
        """
        self._fs_writer.make_run_me = make_runme

    def fs_writer(self) -> dw_io.DataWindowFileSystemWriter:
        """
        :return: DataWindowFileSystemWriter for DataWindow
        """
        return self._fs_writer

    def out_type(self) -> str:
        """
        :return: string of the output type of the DataWindow
        """
        return self._fs_writer.file_extension

    def set_out_type(self, new_out_type: str):
        """
        set the output type of the DataWindow.  options are "NONE", "PARQUET" and "LZ4".  invalid values become "NONE"

        :param new_out_type: new output type of the DataWindow
        """
        self._fs_writer.set_extension(new_out_type)

    def as_dict(self) -> Dict:
        """
        :return: DataWindow properties as dictionary
        """
        return {"event_name": self.event_name,
                "event_origin": self.event_origin.as_dict(),
                "start_time": self.start_date(),
                "end_time": self.end_date(),
                "base_dir": self.save_dir(),
                "stations": [s.default_station_json_file_name() for s in self._stations],
                "config": self._config.as_dict(),
                "debug": self.debug,
                "errors": self._errors.as_dict(),
                "sdk_version": self._sdk_version,
                "out_type": self._fs_writer.file_extension,
                "make_runme": self._fs_writer.make_run_me
                }

    def pretty(self) -> str:
        """
        :return: DataWindow as dictionary, but easier to read
        """
        # noinspection Mypy
        return pprint.pformat(self.as_dict())

    @staticmethod
    def from_config(config: DataWindowConfigFile) -> "DataWindow":
        """
        Use a config file to create a DataWindow

        :param config: DataWindowConfigFile to load from
        :return: DataWindow
        """
        event_origin = EventOrigin(config.origin_provider, config.origin_latitude, config.origin_latitude_std,
                                   config.origin_longitude, config.origin_longitude_std, config.origin_altitude,
                                   config.origin_altitude_std, config.origin_event_radius_m)
        dw_config = DataWindowConfig(config.input_directory, config.structured_layout, config.start_dt(),
                                     config.end_dt(), config.start_buffer_td(), config.end_buffer_td(),
                                     config.drop_time_seconds, config.station_ids, config.extensions,
                                     config.api_versions, config.apply_correction, config.use_model_correction,
                                     config.copy_edge_points())
        return DataWindow(config.event_name, event_origin, dw_config, config.output_dir, config.output_type,
                          config.make_runme, config.debug)

    @staticmethod
    def from_config_file(file: str) -> "DataWindow":
        """
        Loads a configuration file to create the DataWindow

        :param file: full path to config file
        :return: DataWindow
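
        Example (hypothetical path and file name):
            dw = DataWindow.from_config_file("/path/to/dw_config.toml")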
        """
        return DataWindow.from_config(DataWindowConfigFile.from_path(file))

    @staticmethod
    def deserialize(path: str) -> "DataWindow":
        """
        Decompresses and deserializes a DataWindow written to disk.

        :param path: Path to the serialized and compressed DataWindow.
        :return: An instance of a DataWindow.
        """
        return dw_io.deserialize_data_window(path)

    def serialize(self, compression_factor: int = 4) -> Path:
        """
        Serializes and compresses this DataWindow to a file.
        Uses the event_name and out_dir to name the file.

        :param compression_factor: A value between 1 and 12. Higher values provide better compression, but take
        longer. (default=4).
        :return: The path to the written file.
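
        Example (a minimal sketch; assumes dw is a DataWindow with a valid output directory):
            path_to_file = dw.serialize()
            dw_copy = DataWindow.deserialize(str(path_to_file))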
        """
        return dw_io.serialize_data_window(self, self.save_dir(), f"{self.event_name}.pkl.lz4", compression_factor)

    def _to_json_file(self) -> Path:
        """
        Converts the DataWindow metadata into a JSON file and compresses the DataWindow and writes it to disk.

        :return: The path to the written file
        """
        return dw_io.data_window_to_json(self, self.save_dir())

    def to_json(self) -> str:
        """
        :return: The DataWindow metadata into a JSON string.
        """
        return dw_io.data_window_as_json(self)

    @staticmethod
    def from_json(json_str: str) -> "DataWindow":
        """
        Read the DataWindow from a JSON string.  If the string is improperly formatted, raises a ValueError.

        :param json_str: the JSON to read
        :return: The DataWindow as defined by the JSON
        """
        return DataWindow.from_json_dict(dw_io.json_to_dict(json_str))

    @staticmethod
    def from_json_dict(json_dict: Dict) -> "DataWindow":
        """
        Reads a JSON dictionary and loads the data into the DataWindow.
        If dictionary is improperly formatted, raises a ValueError.

        :param json_dict: the dictionary to read
        :return: The DataWindow as defined by the JSON
        """
        if "out_type" not in json_dict.keys() \
                or json_dict["out_type"].upper() not in dw_io.DataWindowOutputType.list_names():
            raise ValueError('Dictionary loading type is invalid or unknown.  '
                             'Check the value "out_type"; it must be one of: '
                             f'{dw_io.DataWindowOutputType.list_non_none_names()}')
        else:
            out_type = dw_io.DataWindowOutputType.str_to_type(json_dict["out_type"])
            if out_type == dw_io.DataWindowOutputType.PARQUET:
                dwin = DataWindow(json_dict["event_name"], EventOrigin.from_dict(json_dict["event_origin"]),
                                  None, json_dict["base_dir"], json_dict["out_type"], json_dict["make_runme"],
                                  json_dict["debug"])
                dwin._config = DataWindowConfig.from_dict(json_dict["config"])
                dwin._errors = RedVoxExceptions.from_dict(json_dict["errors"])
                dwin._sdk_version = json_dict["sdk_version"]
                for st in json_dict["stations"]:
                    dwin.add_station(Station.from_json_file(os.path.join(json_dict["base_dir"], st), f"{st}.json"))
            elif out_type == dw_io.DataWindowOutputType.LZ4:
                dwin = DataWindow.deserialize(os.path.join(json_dict["base_dir"],
                                                           f"{json_dict['event_name']}.pkl.lz4"))
            else:
                dwin = DataWindow()
            return dwin

    def save(self) -> Path:
        """
        save the DataWindow to disk if saving is enabled
        if saving is not enabled, adds an error to the DataWindow and returns an empty path.

        :return: the path to where the files exist; an empty path means no files were saved
        """
        if self._fs_writer.is_save_disk():
            if self._fs_writer.is_use_disk() and self._fs_writer.make_run_me:
                shutil.copyfile(os.path.abspath(inspect.getfile(run_me)),
                                os.path.join(self._fs_writer.save_dir(), "runme.py"))
            if self._fs_writer.file_extension == "parquet":
                return self._to_json_file()
            elif self._fs_writer.file_extension == "lz4":
                return self.serialize()
        else:
            self._errors.append("Saving not enabled.")
            print("WARNING: Cannot save data window without knowing extension.")
            return Path()

    @staticmethod
    def load(file_path: str) -> "DataWindow":
        """
        load from json metadata and lz4 compressed file or directory of files

        :param file_path: full path of file to load
        :return: DataWindow from json metadata
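
        Example (hypothetical path to a saved DataWindow's JSON metadata):
            dw = DataWindow.load("/path/to/output/example_event.json")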
        """
        cur_path = os.getcwd()
        os.chdir(os.path.dirname(file_path))
        result = DataWindow.from_json_dict(dw_io.json_file_to_data_window(file_path))
        os.chdir(cur_path)
        return result

    def config(self) -> DataWindowConfig:
        """
        :return: settings used to create the DataWindow
        """
        return self._config

    def sdk_version(self) -> str:
        """
        :return: sdk version used to create the DataWindow
        """
        return self._sdk_version

    def set_sdk_version(self, version: str):
        """
        :param version: the sdk version to set
        """
        self._sdk_version = version

    def start_date(self) -> float:
        """
        :return: minimum start timestamp of the data or np.nan if no data
        """
        if len(self._stations) > 0:
            return np.min([s.first_data_timestamp() for s in self._stations])
        return np.nan

    def end_date(self) -> float:
        """
        :return: maximum end timestamp of the data or np.nan if no data
        """
        if len(self._stations) > 0:
            return np.max([s.last_data_timestamp() for s in self._stations])
        return np.nan

    def stations(self) -> List[Station]:
        """
        :return: list of stations in the DataWindow
        """
        return self._stations

    def station_ids(self) -> List[str]:
        """
        :return: ids of stations in the DataWindow
        """
        return [s.id() for s in self._stations]

    def add_station(self, station: Station):
        """
        add a station to the DataWindow
        :param station: Station to add
        """
        self._stations.append(station)

    def remove_station(self, station_id: Optional[str] = None, start_date: Optional[float] = None):
        """
        remove a station from the DataWindow; with no arguments, removes the last station in the list
        if an id is given, the last station with that id will be removed
        if a start date is given, the removed station will start at or after the start date
        start date is in microseconds since epoch UTC

        :param station_id: id of station to remove
        :param start_date: start date that is at or before the station to remove
        """
        id_removals = []
        sd_removals = []
        if station_id is None and start_date is None:
            self._stations.pop()
        else:
            if station_id is not None:
                for s in range(len(self._stations)):
                    if self._stations[s].id() == station_id:
                        id_removals.append(s)
            if start_date is not None:
                for s in range(len(self._stations)):
                    if self._stations[s].start_date() >= start_date:
                        sd_removals.append(s)
            if len(id_removals) > 0 and start_date is None:
                self._stations.pop(id_removals.pop())
            elif len(sd_removals) > 0 and station_id is None:
                self._stations.pop(sd_removals.pop())
            elif len(id_removals) > 0 and len(sd_removals) > 0:
                # remove the first station that matches both the id and the start date criteria
                for a in id_removals:
                    if a in sd_removals:
                        self._stations.pop(a)
                        return

    def first_station(self, station_id: Optional[str] = None) -> Optional[Station]:
        """
        :param station_id: optional station id to filter on
        :return: first station matching params; if no params given, gets first station in list.
                    returns None if no station with given station_id exists.
        """
        if len(self._stations) < 1:
            self._errors.append(f"Attempted to get a station, but there are no stations in the data window!")
            if self.debug:
                print(f"Attempted to get a station, but there are no stations in the data window!")
            return None
        elif station_id:
            result = [s for s in self._stations if s.get_key().check_key(station_id, None, None)]
            if len(result) > 0:
                return result[0]
            self._errors.append(f"Attempted to get station {station_id}, but that station is not in this data window!")
            if self.debug:
                print(f"Attempted to get station {station_id}, but that station is not in this data window!")
            return None
        return self._stations[0]

    def get_station(self, station_id: str, station_uuid: Optional[str] = None,
                    start_timestamp: Optional[float] = None) -> Optional[List[Station]]:
        """
        Get stations from the DataWindow.  Must give at least the station's id.  Other parameters may be None,
        which means the value will be ignored when searching.  Results will match all non-None parameters given.

        :param station_id: station id
        :param station_uuid: station uuid, default None
        :param start_timestamp: station start timestamp in microseconds since UTC epoch, default None
        :return: A list of valid stations or None if the station cannot be found
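
        Example (hypothetical station id):
            stations = dw.get_station("0000000001")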
        """
        result = [s for s in self._stations if s.get_key().check_key(station_id, station_uuid, start_timestamp)]
        if len(result) > 0:
            return result
        self._errors.append(f"Attempted to get station {station_id}, but that station is not in this data window!")
        if self.debug:
            print(f"Attempted to get station {station_id}, but that station is not in this data window!")
        return None

    # def _add_sensor_to_window(self, station: Station):
        # set the window start and end if they were specified, otherwise use the bounds of the data
        # self.create_window_in_sensors(station, self._config.start_datetime, self._config.end_datetime)

    def create_data_window(self, pool: Optional[multiprocessing.pool.Pool] = None):
        """
        updates the DataWindow to contain only the data within the window parameters
        stations without audio or any data outside the window are removed
        """
        # Let's create and manage a single pool of workers that we can utilize throughout
        # the instantiation of the data window.
        _pool: multiprocessing.pool.Pool = multiprocessing.Pool() if pool is None else pool

        r_f = io.ReadFilter()
        if self._config.start_datetime:
            r_f.with_start_dt(self._config.start_datetime)
        if self._config.end_datetime:
            r_f.with_end_dt(self._config.end_datetime)
        if self._config.station_ids:
            r_f.with_station_ids(self._config.station_ids)
        if self._config.extensions:
            r_f.with_extensions(self._config.extensions)
        else:
            self._config.extensions = r_f.extensions
        if self._config.api_versions:
            r_f.with_api_versions(self._config.api_versions)
        else:
            self._config.api_versions = r_f.api_versions
        r_f.with_start_dt_buf(self._config.start_buffer_td)
        r_f.with_end_dt_buf(self._config.end_buffer_td)

        if self.debug:
            print("Reading files from disk.  This may take a few minutes to complete.")

        # get the data to convert into a window
        a_r = ApiReaderDw(self._config.input_dir, self._config.structured_layout, r_f,
                          correct_timestamps=self._config.apply_correction,
                          use_model_correction=self._config.use_model_correction,
                          dw_base_dir=self.save_dir(),
                          dw_save_mode=self._fs_writer.save_mode(),
                          debug=self.debug, pool=_pool)

        self._errors.extend_error(a_r.errors)

        if self._fs_writer.is_use_mem() and a_r.dw_save_mode != self._fs_writer.save_mode():
            if self.debug:
                print("Estimated size of files exceeds available memory.")
                print("Automatically using temporary directory to store data.")
            self._fs_writer.set_use_temp(True)

        # Parallel update
        # Apply timing correction in parallel by station
        sts = a_r.get_stations()
        if self.debug:
            print("num stations loaded: ", len(sts))
        # if self._config.apply_correction:
            # for st in maybe_parallel_map(_pool, Station.update_timestamps,
            #                              iter(sts), chunk_size=1):
            #     self._add_sensor_to_window(st)
            #     if self.debug:
            #         print("station processed: ", st.id())
        for st in maybe_parallel_map(_pool, Station.update_timestamps, iter(sts), chunk_size=1):
            self.create_window_in_sensors(st, self._config.start_datetime, self._config.end_datetime)
            if self.debug:
                print("station processed: ", st.id())

        # check for stations without data
        self._check_for_audio()
        self._check_valid_ids()

        # update the default data window name if we have data and the default name exists
        if self.event_name == "dw" and len(self._stations) > 0:
            self.event_name = f"dw_{int(self.start_date())}_{len(self._stations)}"

        # must update the start and end in order for the data to be saved
        # update remaining data window values if they're still default
        if not self._config.start_datetime and len(self._stations) > 0:
            self._config.start_datetime = dtu.datetime_from_epoch_microseconds_utc(
                np.min([t.first_data_timestamp() for t in self._stations]))
        # end_datetime is non-inclusive, so it must be greater than our latest timestamp
        if not self._config.end_datetime and len(self._stations) > 0:
            self._config.end_datetime = dtu.datetime_from_epoch_microseconds_utc(
                np.max([t.last_data_timestamp() for t in self._stations]) + 1)

        # If the pool was created by this function, then it needs to managed by this function.
        if pool is None:
            _pool.close()

    def _check_for_audio(self):
        """
        removes any station without audio data from the DataWindow
        """
        remove = []
        for s in self._stations:
            if not s.has_audio_sensor():
                remove.append(s.id())
        if len(remove) > 0:
            self._stations = [s for s in self._stations if s.id() not in remove]

    def _check_valid_ids(self):
        """
        if there are stations, searches the station_ids for any ids not in the data collected
        and creates an error message for each id requested but has no data
        if there are no stations, creates a single error message declaring no data found
        """
        if len(self._stations) < 1 and self._config.station_ids:
            if len(self._config.station_ids) > 1:
                add_ids = f"for all stations {self._config.station_ids} "
            else:
                add_ids = ""
            self._errors.append(f"No data matching criteria {add_ids}in {self._config.input_dir}"
                                f"\nPlease adjust parameters of DataWindow")
        elif len(self._stations) > 0 and self._config.station_ids:
            for ids in self._config.station_ids:
                if ids.zfill(10) not in [i.id() for i in self._stations]:
                    self._errors.append(
                        f"Requested {ids} but there is no data to read for that station"
                    )

    def create_window_in_sensors(
            self, station: Station, start_datetime: Optional[dtu.datetime] = None,
            end_datetime: Optional[dtu.datetime] = None
    ):
        """
        truncate the sensors in the station to only contain data from start_date_timestamp to end_date_timestamp
        if the start and/or end are not specified, keeps all audio data that fits and uses it
        to truncate the other sensors.
        returns nothing, updates the station in place

        :param station: station object to truncate sensors of
        :param start_datetime: datetime of start of window, default None
        :param end_datetime: datetime of end of window, default None
        """
        if start_datetime:
            start_datetime = dtu.datetime_to_epoch_microseconds_utc(start_datetime)
        else:
            start_datetime = 0
        if end_datetime:
            end_datetime = dtu.datetime_to_epoch_microseconds_utc(end_datetime)
        else:
            end_datetime = dtu.datetime_to_epoch_microseconds_utc(dtu.datetime.max)
        self.process_sensor(station.audio_sensor(), station.id(), start_datetime, end_datetime)
        for sensor in [s for s in station.data() if s.type() != SensorType.AUDIO]:
            self.process_sensor(sensor, station.id(), station.audio_sensor().first_data_timestamp(),
                                station.audio_sensor().last_data_timestamp())
        # recalculate metadata
        station.update_first_and_last_data_timestamps()
        station.set_packet_metadata([meta for meta in station.packet_metadata()
                                     if meta.packet_start_mach_timestamp < station.last_data_timestamp() and
                                     meta.packet_end_mach_timestamp >= station.first_data_timestamp()])
        if self._fs_writer.is_save_disk():
            station.set_save_mode(io.FileSystemSaveMode.DISK)
            station.set_save_dir(self.save_dir() if self._fs_writer.is_use_disk() else self._fs_writer.get_temp())
        self._stations.append(station)

    def process_sensor(self, sensor: SensorData, station_id: str, start_date_timestamp: float,
                       end_date_timestamp: float):
        """
        process a non audio sensor to fit within the DataWindow.  Updates sensor in place, returns nothing.

        :param sensor: sensor to process
        :param station_id: station id
        :param start_date_timestamp: start of DataWindow
        :param end_date_timestamp: end of DataWindow
        """
        if sensor.num_samples() > 0:
            # get only the timestamps between the start and end timestamps
            before_start = np.where(sensor.data_timestamps() < start_date_timestamp)[0]
            after_end = np.where(end_date_timestamp <= sensor.data_timestamps())[0]
            # start_index is inclusive of window start
            if len(before_start) > 0:
                last_before_start = before_start[-1]
                start_index = last_before_start + 1
            else:
                last_before_start = None
                start_index = 0
            # end_index is non-inclusive of window end
            if len(after_end) > 0:
                first_after_end = after_end[0]
                end_index = first_after_end
            else:
                first_after_end = None
                end_index = sensor.num_samples()
            # check if all the samples have been cut off
            is_audio = sensor.type() == SensorType.AUDIO
            if end_index <= start_index:
                if is_audio:
                    self._errors.append(f"Data window for {station_id} "
                                        f"Audio sensor has truncated all data points")
                elif last_before_start is not None and first_after_end is None:
                    first_entry = sensor.pyarrow_table().slice(last_before_start, 1).to_pydict()
                    first_entry["timestamps"] = [start_date_timestamp]
                    sensor.write_pyarrow_table(pa.Table.from_pydict(first_entry))
                elif last_before_start is None and first_after_end is not None:
                    last_entry = sensor.pyarrow_table().slice(first_after_end, 1).to_pydict()
                    last_entry["timestamps"] = [start_date_timestamp]
                    sensor.write_pyarrow_table(pa.Table.from_pydict(last_entry))
                elif last_before_start is not None and first_after_end is not None:
                    sensor.write_pyarrow_table(
                        sensor.interpolate(start_date_timestamp, last_before_start, 1,
                                           self._config.copy_edge_points == gpu.DataPointCreationMode.COPY))
                else:
                    self._errors.append(
                        f"Data window for {station_id} {sensor.type().name} "
                        f"sensor has truncated all data points"
                    )
            else:
                _arrow = sensor.pyarrow_table().slice(start_index, end_index-start_index)
                # if sensor is audio or location, we want nan'd edge points
                if sensor.type() in [SensorType.LOCATION, SensorType.AUDIO]:
                    new_point_mode = gpu.DataPointCreationMode.NAN
                else:
                    new_point_mode = self._config.copy_edge_points
                # add in the data points at the edges of the window if there are defined start and/or end times
                slice_start = _arrow["timestamps"].to_numpy()[0]
                slice_end = _arrow["timestamps"].to_numpy()[-1]
                if not is_audio:
                    end_sample_interval = end_date_timestamp - slice_end
                    end_samples_to_add = 1
                    start_sample_interval = start_date_timestamp - slice_start
                    start_samples_to_add = 1
                else:
                    end_sample_interval = dtu.seconds_to_microseconds(sensor.sample_interval_s())
                    start_sample_interval = -end_sample_interval
                    if self._config.end_datetime:
                        end_samples_to_add = int((dtu.datetime_to_epoch_microseconds_utc(self._config.end_datetime)
                                                  - slice_end) / end_sample_interval)
                    else:
                        end_samples_to_add = 0
                    if self._config.start_datetime:
                        start_samples_to_add = int((slice_start -
                                                    dtu.datetime_to_epoch_microseconds_utc(self._config.start_datetime))
                                                   / end_sample_interval)
                    else:
                        start_samples_to_add = 0
                # add to end
                _arrow = (gpu.add_data_points_to_df(data_table=_arrow, start_index=_arrow.num_rows - 1,
                                                    sample_interval_micros=end_sample_interval,
                                                    num_samples_to_add=end_samples_to_add,
                                                    point_creation_mode=new_point_mode))
                # add to begin
                _arrow = (gpu.add_data_points_to_df(data_table=_arrow, start_index=0,
                                                    sample_interval_micros=start_sample_interval,
                                                    num_samples_to_add=start_samples_to_add,
                                                    point_creation_mode=new_point_mode))
                sensor.sort_by_data_timestamps(_arrow)
        else:
            self._errors.append(f"Data window for {station_id} {sensor.type().name} "
                                f"sensor has no data points!")

    def print_errors(self):
        """
        prints errors to screen
        """
        self._errors.print()
        for stn in self._stations:
            stn.print_errors()
class EventStream:
    """
    stores event stream data gathered from a single station.
    ALL timestamps in microseconds since epoch UTC unless otherwise stated
    """
    def __init__(self, name: str = "event",
                 schema: Optional[Dict[str, list]] = None,
                 save_mode: FileSystemSaveMode = FileSystemSaveMode.MEM,
                 base_dir: str = "."):
        """
        initialize EventStream for a station

        :param name: name of the EventStream.  Default "event"
        :param schema: a structured dictionary of the data table schema.  Dictionary must look like:
                    {"string": [s_values], "numeric": [n_values], "boolean": [o_values], "byte": [b_values]}
                    where [*_values] is a list of strings and can be empty.  Default None
        :param save_mode: FileSystemSaveMode that determines how data is saved.
                            Default FileSystemSaveMode.MEM (use RAM).  Other options are DISK (save to directory)
                            and TEMP (save to temporary directory)
        :param base_dir: the location of the parquet file that holds the data.  Not used unless saving to disk.
                            Default current directory (".")
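
        Example (a minimal sketch with hypothetical column names):
            stream = EventStream("example_event",
                                 {"string": ["label"], "numeric": ["value"], "boolean": [], "byte": []})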
        """
        self.name = name
        self.timestamps_metadata = {}
        self.metadata = {}

        self._errors = RedVoxExceptions("EventStream")
        self._is_timestamps_corrected = False
        self._fs_writer = Fsw(f"event_{name}", "parquet", base_dir, save_mode)
        self._data = None
        self._schema = {"string": [], "numeric": [], "boolean": [], "byte": []}
        if schema is not None:
            self.set_schema(schema)

    def as_dict(self) -> dict:
        """
        :return: EventStream as a dictionary
        """
        return {
            "name": self.name,
            "metadata": self.metadata,
            "timestamps_metadata": self.timestamps_metadata,
            "is_timestamps_corrected": self._is_timestamps_corrected,
            "schema": self._schema,
            "file_path": self.full_path(),
            "errors": self._errors.as_dict()
        }

    @staticmethod
    def __get_items(payload):
        return payload.get_metadata().items()

    @staticmethod
    def __get_items_raw(payload):
        return payload.items()

    @staticmethod
    def __get_keys(ptype: str, payload):
        return ptype, payload.get_metadata().keys()

    @staticmethod
    def __get_keys_raw(ptype: str, payload):
        return ptype, payload.keys()

    def __set_schema(self, name: str, value: str):
        self._schema[name].append(value)

    def _get_tbl_schema(self) -> Dict[str, list]:
        """
        :return: the dictionary used to create the EventStream data object
        """
        if self._data:
            result = {}
            for f in self._data.schema.names:
                result[f] = []
        else:
            result = {"timestamps": [], "unaltered_timestamps": []}
            for t, s in self._schema.items():
                for k in s:
                    result[k] = []
        return result

    def read_events(self, eventstream: es.EventStream):
        """
        read the payloads of each event in the eventstream and separate the data by payload type

        :param eventstream: stream of events to process
        """
        self.name = eventstream.get_name()
        self._fs_writer.file_name = f"event_{self.name}"
        num_events = eventstream.get_events().get_count()
        if num_events > 1:
            tbl = self._get_tbl_schema()
            self.timestamps_metadata = eventstream.get_timestamps().get_metadata()
            self.metadata = eventstream.get_metadata()
            first_event = eventstream.get_events().get_values()[0]
            for t, c in map(self.__get_keys, ["string", "numeric", "boolean", "byte"],
                            [first_event.get_string_payload(), first_event.get_numeric_payload(),
                             first_event.get_boolean_payload(), first_event.get_byte_payload()]):
                for k in c:
                    self.add_to_schema(t, k)
                    tbl[k] = []
            for i in range(num_events):
                tbl["timestamps"].append(eventstream.get_timestamps().get_timestamps()[i])
                tbl["unaltered_timestamps"].append(eventstream.get_timestamps().get_timestamps()[i])
                evnt = eventstream.get_events().get_values()[i]
                for items in map(self.__get_items, [evnt.get_string_payload(), evnt.get_numeric_payload(),
                                                    evnt.get_boolean_payload(), evnt.get_byte_payload()]):
                    for c, st in items:
                        tbl[c].append(st)
            self._data = pa.Table.from_pydict(tbl)

    def read_raw(self, stream: RedvoxPacketM.EventStream) -> 'EventStream':
        """
        read the contents of a protobuf stream

        :param stream: the protobuf stream to read
        """
        self.name = stream.name
        self._fs_writer.file_name = f"event_{self.name}"
        num_events = len(stream.events)
        if num_events > 1:
            tbl = self._get_tbl_schema()
            self.timestamps_metadata = stream.timestamps.metadata
            self.metadata = stream.metadata
            first_event = stream.events[0]
            for t, c in map(EventStream.__get_keys_raw, ["string", "numeric", "boolean", "byte"],
                            [first_event.string_payload, first_event.numeric_payload,
                             first_event.boolean_payload, first_event.byte_payload]):
                for k in c:
                    self.add_to_schema(t, k)
                    tbl[k] = []
            for i in range(num_events):
                tbl["timestamps"].append(stream.timestamps.timestamps[i])
                tbl["unaltered_timestamps"].append(stream.timestamps.timestamps[i])
                evnt = stream.events[i]
                for items in map(EventStream.__get_items_raw, [evnt.string_payload, evnt.numeric_payload,
                                                               evnt.boolean_payload, evnt.byte_payload]):
                    for c, st in items:
                        tbl[c].append(st)
            self._data = pa.Table.from_pydict(tbl)
        return self

    def read_from_dir(self, file: str):
        """
        read a pyarrow table from a file on disk

        :param file: full path to the file to read
        """
        try:
            tbl = pq.read_table(file)
            if tbl.schema.names == list(self._get_tbl_schema().keys()):
                self._data = tbl
        except FileNotFoundError:
            self._errors.append("No data file was found; this event is empty.")
            self._data = None

    def get_string_schema(self) -> List[str]:
        """
        :return: the column names of string typed data as a list of strings
        """
        return self._schema["string"]

    def get_numeric_schema(self) -> List[str]:
        """
        :return: the column names of numeric typed data as a list of strings
        """
        return self._schema["numeric"]

    def get_boolean_schema(self) -> List[str]:
        """
        :return: the column names of boolean typed data as a list of strings
        """
        return self._schema["boolean"]

    def get_byte_schema(self) -> List[str]:
        """
        :return: the column names of byte typed data as a list of strings
        """
        return self._schema["byte"]

    def get_schema(self) -> dict:
        """
        :return: the schema of the EventStream
        """
        return self._schema

    def get_string_values(self) -> pa.Table:
        """
        :return: the string data as a pyarrow table
        """
        return self._data.select(self.get_string_schema()) if self._data else pa.Table.from_pydict({})

    def get_numeric_values(self) -> pa.Table:
        """
        :return: the numeric data as a pyarrow table
        """
        return self._data.select(self.get_numeric_schema()) if self._data else pa.Table.from_pydict({})

    def get_boolean_values(self) -> pa.Table:
        """
        :return: the boolean data as a pyarrow table
        """
        return self._data.select(self.get_boolean_schema()) if self._data else pa.Table.from_pydict({})

    def get_byte_values(self) -> pa.Table:
        """
        :return: the byte data as a pyarrow table
        """
        return self._data.select(self.get_byte_schema()) if self._data else pa.Table.from_pydict({})

    def _check_for_name(self, column_name: str, schema: List[str]) -> bool:
        """
        :param column_name: name of column to check for
        :param schema: list of allowed names
        :return: True if column_name is in schema, sets error and returns False if not
        """
        if column_name not in schema:
            self._errors.append(f"WARNING: Column {column_name} does not exist; try one of {schema}")
            return False
        return True

    def __get_column_data(self, schema: List[str], column_name: str) -> np.array:
        """
        :param schema: list of column names to search
        :param column_name: column name to get
        :return: the data as an np.array; if empty, column name or data doesn't exist
        """
        return self._data[column_name].to_numpy() if self._check_for_name(column_name, schema) else np.array([])

    def get_string_column(self, column_name: str) -> np.array:
        """
        :param column_name: name of string payload to retrieve
        :return: string data from the column specified
        """
        return self.__get_column_data(self.get_string_schema(), column_name)

    def get_numeric_column(self, column_name: str) -> np.array:
        """
        :param column_name: name of numeric payload to retrieve
        :return: numeric data from the column specified
        """
        return self.__get_column_data(self.get_numeric_schema(), column_name)

    def get_boolean_column(self, column_name: str) -> np.array:
        """
        :param column_name: name of boolean payload to retrieve
        :return: boolean data from the column specified
        """
        return self.__get_column_data(self.get_boolean_schema(), column_name)

    def get_byte_column(self, column_name: str) -> np.array:
        """
        :param column_name: name of byte payload to retrieve
        :return: bytes data from the column specified
        """
        return self.__get_column_data(self.get_byte_schema(), column_name)

    def set_schema(self, schema: Dict[str, list]):
        """
        sets the schema of the EventStream using a specially structured dictionary.
        Structure is:

        {"string": [s_values], "numeric": [n_values], "boolean": [o_values], "byte": [b_values]}

        where [*_values] is a list of strings and can be empty

        :param schema: specially structured dictionary of data table schema
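
        Example (hypothetical column names):
            stream.set_schema({"string": ["label"], "numeric": ["value"], "boolean": [], "byte": []})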
        """
        if schema.keys() != self._schema.keys():
            self._errors.append(f"Attempted to add invalid schema with keys {list(schema.keys())} to EventStreams.\n"
                                f"Valid keys are: {list(self._schema.keys())}")
        else:
            self._schema = schema

    def add_to_schema(self, key: str, value: str):
        """
        adds a value to the schema, under the specified key

        :param key: one of "string", "numeric", "boolean", or "byte"
        :param value: the name of the column to add to the schema
        """
        if key not in self._schema.keys():
            self._errors.append("Attempted to add an unknown key to the EventStream schema.\n"
                                f"You must use one of {self._schema.keys()}.")
        elif value not in self._schema[key]:
            self._schema[key].append(value)

    def add(self, other_stream: es.EventStream):
        """
        adds a Redvox Api1000 EventStream with the same name to the data

        :param other_stream: another EventStream with the same name
        """
        if self.name != other_stream.get_name():
            self._errors.append(f"Attempted to add a stream with a different name ({other_stream.get_name()})")
        else:
            self.timestamps_metadata = {**self.timestamps_metadata, **other_stream.get_timestamps().get_metadata()}
            self.metadata = {**self.metadata, **other_stream.get_metadata()}
            num_events = other_stream.get_events().get_count()
            if num_events > 1:
                tbl = self._get_tbl_schema()
                for i in range(num_events):
                    tbl["timestamps"].append(other_stream.get_timestamps().get_timestamps()[i])
                    tbl["unaltered_timestamps"].append(other_stream.get_timestamps().get_timestamps()[i])
                    evnt = other_stream.get_events().get_values()[i]
                    for items in map(self.__get_items, [evnt.get_string_payload(), evnt.get_numeric_payload(),
                                                        evnt.get_boolean_payload(), evnt.get_byte_payload()]):
                        for c, st in items:
                            tbl[c].append(st)
                self._data = pa.concat_tables([self._data, pa.Table.from_pydict(tbl)])

    def add_raw(self, other_stream: RedvoxPacketM.EventStream):
        """
        add a protobuf EventStream with the same name to the data

        :param other_stream: a protobuf EventStream to add
        """
        if self.name != other_stream.name:
            self._errors.append(f"Attempted to add a stream with a different name ({other_stream.name})")
        else:
            self.timestamps_metadata = {**self.timestamps_metadata, **other_stream.timestamps.metadata}
            self.metadata = {**self.metadata, **other_stream.metadata}
            num_events = len(other_stream.events)
            if num_events > 1:
                tbl = self._get_tbl_schema()
                for i in range(num_events):
                    tbl["timestamps"].append(other_stream.timestamps.timestamps[i])
                    tbl["unaltered_timestamps"].append(other_stream.timestamps.timestamps[i])
                    evnt = other_stream.events[i]
                    for items in map(EventStream.__get_items_raw, [evnt.string_payload, evnt.numeric_payload,
                                                                   evnt.boolean_payload, evnt.byte_payload]):
                        for c, st in items:
                            tbl[c].append(st)
                self._data = pa.concat_tables([self._data, pa.Table.from_pydict(tbl)])

    def append(self, other_stream: "EventStream"):
        """
        add another EventStream onto the calling one if they have the same name

        :param other_stream: other stream to add to current
        """
        if other_stream.name == self.name:
            self._data = pa.concat_tables([self._data, other_stream._data])
            self.timestamps_metadata = {**self.timestamps_metadata, **other_stream.timestamps_metadata}
            self.metadata = {**self.metadata, **other_stream.metadata}
            self._errors.extend_error(other_stream.errors())

    def timestamps(self) -> np.array:
        """
        :return: the timestamps as a numpy array; returns empty array if no timestamps exist
        """
        if "timestamps" in self.data().schema.names:
            return self.data()["timestamps"].to_numpy()
        else:
            return np.array([])

    def unaltered_timestamps(self) -> np.array:
        """
        :return: the unaltered timestamps as a numpy array; returns empty array if no timestamps exist
        """
        if "unaltered_timestamps" in self.data().schema.names:
            return self.data()["unaltered_timestamps"].to_numpy()
        else:
            return np.array([])

    def update_timestamps(self, offset_model: om.OffsetModel, use_model_function: bool = False):
        """
        updates the timestamps of the data points

        :param offset_model: model used to update the timestamps
        :param use_model_function: if True, use the model's slope function to update the timestamps.
                                    otherwise uses the best offset (model's intercept value).  Default False
        """
        if self._data is not None and self._data.num_rows > 0:
            timestamps = pa.array(offset_model.update_timestamps(self._data["timestamps"].to_numpy(),
                                                                 use_model_function))
            # pyarrow Tables are immutable; set_column returns a new table
            self._data = self._data.set_column(0, "timestamps", timestamps)

    def default_json_file_name(self) -> str:
        """
        :return: default event stream json file name (event_[event.name]): note there is no extension
        """
        return f"event_{self.name}"

    def is_save_to_disk(self) -> bool:
        """
        :return: True if the EventStream will be saved to disk
        """
        return self._fs_writer.is_save_disk()

    def set_save_to_disk(self, save: bool):
        """
        :param save: If True, save to disk
        """
        self._fs_writer.save_to_disk = save

    def set_save_mode(self, save_mode: FileSystemSaveMode):
        """
        set the save mode

        :param save_mode: new save mode
        """
        self._fs_writer.set_save_mode(save_mode)

    def set_file_name(self, new_file: Optional[str] = None):
        """
        * set the pyarrow file name or use the default: event_{EventStream.name}
        * Do not give an extension

        :param new_file: optional file name to change to; default None (use default name)
        """
        self._fs_writer.file_name = new_file if new_file else f"event_{self.name}"

    def full_file_name(self) -> str:
        """
        :return: full name of parquet file containing the data
        """
        return self._fs_writer.full_name()

    def file_name(self) -> str:
        """
        :return: file name without extension
        """
        return self._fs_writer.file_name

    def set_save_dir(self, new_dir: Optional[str] = None):
        """
        set the pyarrow directory or use the default: "." (current directory)

        :param new_dir: the directory to change to; default None (use current directory)
        """
        self._fs_writer.base_dir = new_dir if new_dir else "."

    def save_dir(self) -> str:
        """
        :return: directory containing the parquet file for the EventStream
        """
        return self._fs_writer.save_dir()

    def full_path(self) -> str:
        """
        :return: the full path to the data file
        """
        return self._fs_writer.full_path()

    def fs_writer(self) -> Fsw:
        """
        :return: FileSystemWriter object
        """
        return self._fs_writer

    def write_table(self):
        """
        writes the event stream data to disk.
        """
        if self._data is not None:
            pq.write_table(self._data, self.full_path())

    def has_data(self) -> bool:
        """
        :return: True if EventStream contains at least one data point
        """
        return self.data().num_rows > 0

    def data(self) -> pa.Table:
        """
        :return: the data as a pyarrow table
        """
        if self._data is None:
            if self.is_save_to_disk():
                self._data = pq.read_table(self.full_path())
            else:
                return pa.Table.from_pydict({})
        return self._data

    @staticmethod
    def from_json_file(file_dir: str, file_name: Optional[str] = None) -> "EventStream":
        """
        :param file_dir: full path to containing directory for the file
        :param file_name: name of the json file to load, without extension.  If None, searches file_dir
                            for a json file to load.  Default None
        :return: EventStream from json file
        """
        if file_name is None:
            file_name = io.get_json_file(file_dir)
            if file_name is None:
                result = EventStream("Empty")
                result.append_error("JSON file to load EventStream from not found.")
                return result
        json_data = io.json_file_to_dict(os.path.join(file_dir, f"{file_name}.json"))
        if "name" in json_data.keys():
            result = EventStream(json_data["name"], json_data["schema"], FileSystemSaveMode.DISK, file_dir)
            result.metadata = json_data["metadata"]
            result.timestamps_metadata = json_data["timestamps_metadata"]
            result.set_errors(RedVoxExceptions.from_dict(json_data["errors"]))
            result.read_from_dir(json_data["file_path"])
        else:
            result = EventStream("Empty")
            result.append_error(f"Loading from {file_name} failed; missing EventStream name.")
        return result

    def to_json_file(self, file_name: Optional[str] = None) -> Path:
        """
        saves the EventStream as a json file

        :param file_name: the optional base file name.  Do not include a file extension.
                            If None, a default file name is created using this format:
                            event_[event.name].json
        :return: path to json file
        """
        if self._fs_writer.file_extension == "parquet" and self._data is not None:
            self.write_table()
        return io.to_json_file(self, file_name)

    def errors(self) -> RedVoxExceptions:
        """
        :return: errors of the EventStream
        """
        return self._errors

    def set_errors(self, errors: RedVoxExceptions):
        """
        sets the errors of the EventStream

        :param errors: errors to set
        """
        self._errors = errors

    def append_error(self, error: str):
        """
        add an error to the EventStream

        :param error: error to add
        """
        self._errors.append(error)

    def print_errors(self):
        """
        print all errors to screen
        """
        self._errors.print()