Example #1
    def __init__(self, name: str = "event",
                 schema: Optional[Dict[str, list]] = None,
                 save_mode: FileSystemSaveMode = FileSystemSaveMode.MEM,
                 base_dir: str = "."):
        """
        initialize EventStream for a station

        :param name: name of the EventStream.  Default "event"
        :param schema: a structured dictionary of the data table schema.  Dictionary must look like:
                    {"string": [s_values], "numeric": [n_values], "boolean": [o_values], "byte": [b_values]}
                    where [*_values] is a list of strings and can be empty.  Default None
        :param save_mode: FileSystemSaveMode that determines how data is saved.
                            Default FileSystemSaveMode.MEM (use RAM).  Other options are DISK (save to directory)
                            and TEMP (save to temporary directory)
        :param base_dir: the location of the parquet file that holds the data.  Not used if save_mode is
                            FileSystemSaveMode.MEM.  Default current directory (".")
        """
        self.name = name
        self.timestamps_metadata = {}
        self.metadata = {}

        self._errors = RedVoxExceptions("EventStream")
        self._is_timestamps_corrected = False
        self._fs_writer = Fsw(f"event_{name}", "parquet", base_dir, save_mode)
        self._data = None
        self._schema = {"string": [], "numeric": [], "boolean": [], "byte": []}
        if schema is not None:
            self.set_schema(schema)
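
# A minimal usage sketch for the EventStream above, assuming it is importable
# together with FileSystemSaveMode.  The schema keys must be exactly "string",
# "numeric", "boolean", and "byte"; the channel names below are illustrative.
schema = {
    "string": ["event_label"],
    "numeric": ["magnitude"],
    "boolean": ["is_confirmed"],
    "byte": [],
}

# keep the data table in memory (the default save mode)
stream = EventStream(name="quake", schema=schema,
                     save_mode=FileSystemSaveMode.MEM)

# or write the parquet file under base_dir instead
disk_stream = EventStream(name="quake", schema=schema,
                          save_mode=FileSystemSaveMode.DISK, base_dir="./out")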
Example #2
    def __init__(
        self,
        base_dir: str,
        structured_dir: bool = False,
        read_filter: Optional[io.ReadFilter] = None,
        debug: bool = False,
        pool: Optional[multiprocessing.pool.Pool] = None,
    ):
        """
        Initialize the ApiReader object

        :param base_dir: directory containing the files to read
        :param structured_dir: if True, base_dir contains a specific directory structure used by the respective
                                api formats.  If False, base_dir only has the data files.  Default False.
        :param read_filter: ReadFilter for the data files, if None, get everything.  Default None
        :param debug: if True, output program warnings/errors during function execution.  Default False.
        :param pool: optional multiprocessing pool for parallel reads.  If None, a pool is created and
                        closed internally.  Default None
        """
        _pool: multiprocessing.pool.Pool = (multiprocessing.Pool()
                                            if pool is None else pool)

        if read_filter:
            self.filter = read_filter
            if self.filter.station_ids:
                self.filter.station_ids = set(self.filter.station_ids)
        else:
            self.filter = io.ReadFilter()
        self.base_dir = base_dir
        self.structured_dir = structured_dir
        self.debug = debug
        self.errors = RedVoxExceptions("APIReader")
        self.files_index = self._get_all_files(_pool)
        self.index_summary = io.IndexSummary.from_index(
            self._flatten_files_index())
        mem_split_factor = 1
        if len(self.files_index) > 0:
            if settings.is_parallelism_enabled():
                mem_split_factor = len(self.files_index)
            self.chunk_limit = psutil.virtual_memory(
            ).available * PERCENT_FREE_MEM_USE / mem_split_factor
            max_file_size = max([
                fe.file_size_bytes for fi in self.files_index
                for fe in fi.entries
            ])
            if max_file_size > self.chunk_limit:
                raise MemoryError(
                    f"System requires {max_file_size} bytes of memory to process a file but only has "
                    f"{self.chunk_limit} available.  Please free or add more RAM."
                )
            if debug:
                print(
                    f"{mem_split_factor} stations each have {int(self.chunk_limit)} bytes for loading files in "
                    f"memory.")
        else:
            self.chunk_limit = 0

        if debug:
            self.errors.print()

        if pool is None:
            _pool.close()
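
# A hedged usage sketch for the ApiReader above.  "/data/redvox" and the station
# id are placeholders; the ReadFilter builder methods chained here are the same
# ones exercised elsewhere in this document.
from datetime import datetime, timezone

flt = (io.ReadFilter()
       .with_station_ids({"1637680001"})
       .with_start_dt(datetime(2021, 1, 1, tzinfo=timezone.utc))
       .with_end_dt(datetime(2021, 1, 2, tzinfo=timezone.utc)))

reader = ApiReader(base_dir="/data/redvox", structured_dir=True,
                   read_filter=flt, debug=True)
stations = reader.get_stations()  # one Station per contiguous group of files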
    def __init__(
        self,
        station_id: str = "",
        audio_sample_rate_hz: float = np.nan,
        station_start_timestamp: float = np.nan,
        time_sync_data: Optional[List[TimeSyncData]] = None,
    ):
        """
        Initialize the object

        :param station_id: id of the station to analyze, default empty string
        :param audio_sample_rate_hz: audio sample rate in hz of the station, default np.nan
        :param station_start_timestamp: timestamp of when station started recording, default np.nan
        :param time_sync_data: the TimeSyncData objects created from the packets of the station, default None
        """
        self.station_id: str = station_id
        self.sample_rate_hz: float = audio_sample_rate_hz
        self.station_start_timestamp: float = station_start_timestamp
        self.best_latency_index: int = np.nan
        self.latency_stats = sh.StatsContainer("latency")
        self.offset_stats = sh.StatsContainer("offset")
        self.errors = RedVoxExceptions("TimeSyncAnalysis")
        if time_sync_data:
            self.timesync_data: List[TimeSyncData] = time_sync_data
            self.evaluate_and_validate_data()
        else:
            self.timesync_data = []
            self.offset_model = OffsetModel.empty_model()
    @staticmethod
    def from_json_dict(json_dict: Dict) -> "DataWindow":
        """
        Reads a JSON dictionary and loads the data into the DataWindow.
        If dictionary is improperly formatted, raises a ValueError.

        :param json_dict: the dictionary to read
        :return: The DataWindow as defined by the JSON
        """
        if "out_type" not in json_dict.keys() \
                or json_dict["out_type"].upper() not in dw_io.DataWindowOutputType.list_names():
            raise ValueError('Dictionary loading type is invalid or unknown.  '
                             'Check the value "out_type"; it must be one of: '
                             f'{dw_io.DataWindowOutputType.list_non_none_names()}')
        else:
            out_type = dw_io.DataWindowOutputType.str_to_type(json_dict["out_type"])
            if out_type == dw_io.DataWindowOutputType.PARQUET:
                dwin = DataWindow(json_dict["event_name"], EventOrigin.from_dict(json_dict["event_origin"]),
                                  None, json_dict["base_dir"], json_dict["out_type"], json_dict["make_runme"],
                                  json_dict["debug"])
                dwin._config = DataWindowConfig.from_dict(json_dict["config"])
                dwin._errors = RedVoxExceptions.from_dict(json_dict["errors"])
                dwin._sdk_version = json_dict["sdk_version"]
                for st in json_dict["stations"]:
                    dwin.add_station(Station.from_json_file(os.path.join(json_dict["base_dir"], st), f"{st}.json"))
            elif out_type == dw_io.DataWindowOutputType.LZ4:
                dwin = DataWindow.deserialize(os.path.join(json_dict["base_dir"],
                                                           f"{json_dict['event_name']}.pkl.lz4"))
            else:
                dwin = DataWindow()
            return dwin
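
# A hedged sketch of reloading a DataWindow from a saved JSON description.
# The path is a placeholder for wherever the window's JSON file was written.
import json

with open("/data/saved/dw.json", "r") as f:
    dw = DataWindow.from_json_dict(json.load(f))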
Example #5
def validate_station_key_list(data_packets: List[api_m.RedvoxPacketM],
                              errors: RedVoxExceptions) -> bool:
    """
    Checks for consistency in the data packets.  Returns False if discrepancies are found.
    Any discrepancies found are appended to the errors passed in.

    :param data_packets: list of RedvoxPacketM to look at
    :param errors: RedVoxExceptions to append any errors found while validating
    :return: True if no discrepancies found.  False otherwise
    """
    my_errors = RedVoxExceptions("StationKeyValidation")
    if len(data_packets) < 2:
        return True
    j: np.ndarray = np.transpose([[
        t.station_information.id,
        t.station_information.uuid,
        t.timing_information.app_start_mach_timestamp,
        t.api,
        t.sub_api,
        t.station_information.make,
        t.station_information.model,
        t.station_information.os,
        t.station_information.os_version,
        t.station_information.app_version,
        t.station_information.is_private,
        len(t.sensors.audio.samples.values) / t.sensors.audio.sample_rate,
    ] for t in data_packets])

    k: Dict[str, np.ndarray] = {
        "ids": j[0],
        "uuids": j[1],
        "station_start_times": j[2],
        "apis": j[3],
        "sub_apis": j[4],
        "makes": j[5],
        "models": j[6],
        "os": j[7],
        "os_versions": j[8],
        "app_versions": j[9],
        "privates": j[10],
        "durations": j[11],
    }

    for key, value in k.items():
        result = np.unique(value)
        if len(result) > 1:
            my_errors.append(
                f"WARNING: {data_packets[0].station_information.id} "
                f"{key} contains multiple unique values: {result}.\n"
                "Please update your query to focus on one of these values.")

    if my_errors.get_num_errors() > 0:
        errors.extend_error(my_errors)
        return False

    return True  # if here, everything is consistent
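
# A hedged usage sketch: `packets` is assumed to be a pre-existing
# List[api_m.RedvoxPacketM] decoded for a single station.
errs = RedVoxExceptions("MyValidation")
if not validate_station_key_list(packets, errs):
    errs.print()  # discrepancies were copied into errs by the validator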
    def __init__(
            self,
            event_name: str = "dw",
            event_origin: Optional[EventOrigin] = None,
            config: Optional[DataWindowConfig] = None,
            output_dir: str = ".",
            out_type: str = "NONE",
            make_runme: bool = False,
            debug: bool = False,
    ):
        """
        Initialize the DataWindow

        :param event_name: name of the DataWindow.  defaults to "dw"
        :param event_origin: Optional EventOrigin which describes the physical location and radius of the
                                origin event.  Default empty EventOrigin (no valid data)
        :param config: Optional DataWindowConfig which describes how to extract data from Redvox files.
                        Default None
        :param output_dir: output directory for saving files.  Default "." (current directory)
        :param out_type: type of file to save the DataWindow as.  Options: "PARQUET", "LZ4", "NONE".
                            Default "NONE" (no saving)
        :param make_runme: if True, saves an example runme.py file with the data.  Default False
        :param debug: if True, outputs additional information during initialization.  Default False
        """
        self.event_name: str = event_name
        self.event_origin: EventOrigin = event_origin if event_origin else EventOrigin()
        self._fs_writer = dw_io.DataWindowFileSystemWriter(self.event_name, out_type, output_dir, make_runme)
        self.debug: bool = debug
        self._sdk_version: str = redvox.VERSION
        self._errors = RedVoxExceptions("DataWindow")
        self._stations: List[Station] = []
        self._config = config
        if config:
            if config.start_datetime and config.end_datetime and (config.end_datetime <= config.start_datetime):
                self._errors.append("DataWindow will not work when end datetime is before or equal to start datetime.\n"
                                    f"Your times: {config.end_datetime} <= {config.start_datetime}")
            else:
                self.create_data_window()
        if self.debug:
            self.print_errors()
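
# A minimal construction sketch using only the arguments shown above.  Without
# a DataWindowConfig no data is read, so this only exercises naming and saving.
dw = DataWindow(event_name="my_event", output_dir="./dw_out",
                out_type="LZ4", make_runme=True, debug=True)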
@dataclass
class AudioWithGaps:
    """
    Represents methods of reconstructing audio data with or without gaps in it

    Properties:
        sample_interval_micros: microseconds between sample points

        metadata: list of start times in microseconds since epoch UTC and the data to add

        gaps: the list of start and end points of gaps (the start and end are actual data points)

        errors: the errors encountered while getting the data
    """
    sample_interval_micros: float
    metadata: Optional[List[Tuple[float, pa.Table]]] = None
    gaps: List[Tuple[float, float]] = field(default_factory=lambda: [])
    errors: RedVoxExceptions = field(
        default_factory=lambda: RedVoxExceptions("AudioWithGaps"))

    def create_timestamps(self) -> pa.Table:
        """
        :return: the audio metadata and gaps combined into a pyarrow Table sorted by timestamp,
                    with gap samples set to np.nan
        """
        result_array = [[], [], []]
        for m in self.metadata:
            timestamps = calc_evenly_sampled_timestamps(
                m[0], m[1].num_rows, self.sample_interval_micros)
            result_array[0].extend(timestamps)
            result_array[1].extend(timestamps)
            result_array[2].extend(m[1]["microphone"].to_numpy())
        for gs, ge in self.gaps:
            num_samples = int((ge - gs) / self.sample_interval_micros) - 1
            timestamps = calc_evenly_sampled_timestamps(
                gs + self.sample_interval_micros, num_samples,
                self.sample_interval_micros)
            gap_array = [timestamps, np.full(len(timestamps), np.nan)]
            result_array[0].extend(gap_array[0])
            result_array[1].extend(gap_array[0])
            result_array[2].extend(gap_array[1])
        ptable = pa.Table.from_pydict(dict(zip(AUDIO_DF_COLUMNS,
                                               result_array)))
        return pc.take(
            ptable,
            pc.sort_indices(ptable, sort_keys=[("timestamps", "ascending")]))

    def add_error(self, error: str):
        """
        add an error to the result
        :param error: error message to add
        """
        self.errors.append(error)
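
# A hedged usage sketch of AudioWithGaps: two 800-sample packets of 800 Hz audio
# with roughly one second of missing data between them.  The microphone values
# are synthetic, and the gap endpoints follow the convention in the docstring
# above (the gap start and end are actual data points).
import numpy as np
import pyarrow as pa

interval_us = 1e6 / 800.0
first = pa.Table.from_pydict({"microphone": np.random.default_rng(1).normal(size=800)})
second = pa.Table.from_pydict({"microphone": np.random.default_rng(2).normal(size=800)})

audio = AudioWithGaps(
    sample_interval_micros=interval_us,
    metadata=[(0.0, first), (2_000_000.0, second)],  # packet start times in microseconds
    gaps=[(799 * interval_us, 2_000_000.0)],         # last sample of packet 1 to first sample of packet 2
)
gap_filled = audio.create_timestamps()               # gap rows are filled with np.nan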
@dataclass
class GapPadResult:
    """
    The result of filling gaps or padding a time series
    """
    result_df: Optional[pd.DataFrame] = None
    gaps: List[Tuple[float, float]] = field(default_factory=lambda: [])
    errors: RedVoxExceptions = field(
        default_factory=lambda: RedVoxExceptions("GapPadResult"))

    def add_error(self, error: str):
        """
        add an error to the result
        :param error: error message to add
        """
        self.errors.append(error)
Example #9
    def __init__(
        self,
        data_packets: Optional[List[api_m.RedvoxPacketM]] = None,
        station_id: Optional[str] = None,
        uuid: Optional[str] = None,
        start_time: float = np.nan,
        use_model_correction: bool = True,
    ):
        """
        initialize Station

        :param data_packets: optional list of data packets representing the station, default None
        :param station_id: optional id if no data packets, default None
        :param uuid: optional uuid if no data packets, default None
        :param start_time: optional start time in microseconds since epoch UTC if no data packets, default np.nan
        :param use_model_correction: if True, use OffsetModel functions for time correction; otherwise add
                                        the OffsetModel best offset (intercept value).  Default True
        """
        self.data = []
        self.packet_metadata: List[st_utils.StationPacketMetadata] = []
        self.is_timestamps_updated = False
        self._gaps: List[Tuple[float, float]] = []
        self.errors: RedVoxExceptions = RedVoxExceptions("Station")
        self.use_model_correction = use_model_correction
        if data_packets and st_utils.validate_station_key_list(
                data_packets, self.errors):
            # noinspection Mypy
            self._load_metadata_from_packet(data_packets[0])
            self.timesync_analysis = TimeSyncAnalysis(
                self.id, self.audio_sample_rate_nominal_hz,
                self.start_timestamp).from_raw_packets(data_packets)
            if self.timesync_analysis.errors.get_num_errors() > 0:
                self.errors.extend_error(self.timesync_analysis.errors)
            self._set_all_sensors(data_packets)
            self._get_start_and_end_timestamps()
        else:
            self.id = station_id
            self.uuid = uuid
            self.metadata = st_utils.StationMetadata("None")
            self.start_timestamp = start_time
            self.first_data_timestamp = np.nan
            self.last_data_timestamp = np.nan
            self.audio_sample_rate_nominal_hz = np.nan
            self.is_audio_scrambled = False
            self.timesync_analysis = TimeSyncAnalysis()
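
# A hedged sketch: with no packets, a Station is an empty shell keyed by id and
# uuid (placeholders below).  A populated Station would normally be built from a
# List[api_m.RedvoxPacketM], e.g. the output of ApiReader.read_files_by_id.
empty_station = Station(station_id="1637680001", uuid="0000000000")

# populated = Station(data_packets=packets)  # `packets` decoded elsewhere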
 def __init__(self,
              sensor_name: str,
              sensor_data: pd.DataFrame,
              sensor_type: SensorType = SensorType.UNKNOWN_SENSOR,
              sample_rate_hz: float = np.nan,
              sample_interval_s: float = np.nan,
              sample_interval_std_s: float = np.nan,
              is_sample_rate_fixed: bool = False,
              are_timestamps_altered: bool = False,
              calculate_stats: bool = False):
     """
     initialize the sensor data with params

     :param sensor_name: name of the sensor
     :param sensor_data: dataframe with the timestamps and sensor data; first column is always the timestamps,
                         the other columns are the data channels in the sensor
     :param sensor_type: enumerated type of the sensor, default SensorType.UNKNOWN_SENSOR
     :param sample_rate_hz: sample rate in hz of the data, default np.nan
     :param sample_interval_s: sample interval in seconds of the data, default np.nan
     :param sample_interval_std_s: std dev of sample interval in seconds of the data, default np.nan
     :param is_sample_rate_fixed: if True, sample rate is constant for all data, default False
     :param are_timestamps_altered: if True, timestamps in the sensor have been altered from their
                                     original values, default False
     :param calculate_stats: if True, calculate sample_rate_hz, sample_interval_s, and sample_interval_std_s,
                             default False
     """
     if "timestamps" not in sensor_data.columns:
         raise AttributeError(
             'SensorData requires the data frame to contain a column titled "timestamps"'
         )
     self.name: str = sensor_name
     self.type: SensorType = sensor_type
     self.data_df: pd.DataFrame = sensor_data.infer_objects()
     self.sample_rate_hz: float = sample_rate_hz
     self.sample_interval_s: float = sample_interval_s
     self.sample_interval_std_s: float = sample_interval_std_s
     self.is_sample_rate_fixed: bool = is_sample_rate_fixed
     self.timestamps_altered: bool = are_timestamps_altered
     self.errors: RedVoxExceptions = RedVoxExceptions("Sensor")
     if calculate_stats:
         self.organize_and_update_stats()
     else:
         self.sort_by_data_timestamps()
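
# A hedged usage sketch: wrap a pandas DataFrame as a SensorData.  The frame
# must contain a "timestamps" column; the values below are synthetic and the
# timestamps are expressed in microseconds.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "timestamps": np.arange(10) * 1_000_000.0,   # 1 Hz of synthetic samples
    "pressure": np.random.default_rng(0).normal(101.3, 0.01, 10),
})
barometer = SensorData("barometer", df, calculate_stats=True)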
Example #11
 @staticmethod
 def from_json_file(file_dir: str, file_name: Optional[str]) -> "EventStream":
     """
      :param file_dir: full path to containing directory for the file
      :param file_name: name of the file (without extension) to load data from; if None, searches
                          file_dir for a JSON file
     :return: EventStream from json file
     """
     if file_name is None:
         file_name = io.get_json_file(file_dir)
         if file_name is None:
             result = EventStream("Empty")
             result.append_error("JSON file to load EventStream from not found.")
             return result
     json_data = io.json_file_to_dict(os.path.join(file_dir, f"{file_name}.json"))
     if "name" in json_data.keys():
         result = EventStream(json_data["name"], json_data["schema"], FileSystemSaveMode.DISK, file_dir)
         result.metadata = json_data["metadata"]
         result.timestamps_metadata = json_data["timestamps_metadata"]
         result.set_errors(RedVoxExceptions.from_dict(json_data["errors"]))
         result.read_from_dir(json_data["file_path"])
     else:
         result = EventStream("Empty")
         result.append_error(f"Loading from {file_name} failed; missing EventStream name.")
     return result
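
# A hedged sketch: reload an EventStream previously saved to disk.  The
# directory is a placeholder; passing None for file_name makes the loader
# search the directory for a JSON file.
loaded = EventStream.from_json_file("/data/dw_out/quake", None)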
Example #12
@dataclass
class AggregateSummary:
    """
    aggregate of summaries

    properties:
        summaries: the summaries of sensors
        gaps: gaps in audio data as a list of tuples of start and end time
    """
    summaries: List[PyarrowSummary] = field(default_factory=lambda: [])
    gaps: List[Tuple[float, float]] = field(default_factory=lambda: [])
    errors: RedVoxExceptions = field(
        default_factory=lambda: RedVoxExceptions("AggregateSummary"))

    def to_dict(self) -> dict:
        """
        :return: dictionary representation of all summaries
        """
        result = {}
        for ps in self.summaries:
            result[ps.stype.name] = ps.to_dict()
        return result

    @staticmethod
    def from_dict(summary_dict: dict) -> "AggregateSummary":
        """
        :param summary_dict: dictionary to load data from
        :return: AggregateSummary from a dictionary
        """
        result = AggregateSummary()
        for v in summary_dict.values():
            result.summaries.append(PyarrowSummary(v["name"], SensorType[v["stype"]], v["start"], v["srate_hz"],
                                                   v["fdir"], v["scount"], v["smint_s"], v["sstd_s"]))
        return result

    def add_aggregate_summary(self, agg_sum: 'AggregateSummary'):
        """
        adds another aggregate summary to this one

        :param agg_sum: another aggregate summary to add
        """
        self.summaries.extend(agg_sum.summaries)

    def add_summary(self, pya_sum: PyarrowSummary):
        """
        adds a summary to the aggregate

        :param pya_sum: the summary to add
        """
        self.summaries.append(pya_sum)

    def merge_audio_summaries(self):
        """
        combines all Audio summaries into a single summary, replacing the originals; also records any
        gaps found in the audio data
        """
        pckt_info = []
        audio_lst = self.get_audio()
        frst_audio = audio_lst[0]
        use_mem = frst_audio.check_data()
        for adl in audio_lst:
            pckt_info.append((int(adl.start), adl.data()))

        audio_data = gpu.fill_audio_gaps2(pckt_info,
                                          dtu.seconds_to_microseconds(1 / frst_audio.srate_hz)
                                          )
        tbl = audio_data.create_timestamps()
        frst_audio = PyarrowSummary(frst_audio.name, frst_audio.stype, frst_audio.start, frst_audio.srate_hz,
                                    frst_audio.fdir, tbl.num_rows, frst_audio.smint_s, frst_audio.sstd_s,
                                    tbl)
        if not use_mem:
            frst_audio.write_data(True)

        self.gaps = audio_data.gaps
        self.summaries = self.get_non_audio_list()
        self.add_summary(frst_audio)

    def merge_non_audio_summaries(self):
        """
        combines and replaces all summaries per type except for audio summaries
        """
        smrs_dict = {}
        for smry in self.summaries:
            if smry.stype != SensorType.AUDIO:
                if smry.stype in smrs_dict.keys():
                    smrs_dict[smry.stype].append(smry)
                else:
                    smrs_dict[smry.stype] = [smry]
        self.summaries = self.get_audio()
        for styp, smrys in smrs_dict.items():
            first_summary = smrys.pop(0)
            tbl = first_summary.data()
            combined_mint = np.mean([smrs.smint_s for smrs in smrys])
            combined_std = np.mean([smrs.sstd_s for smrs in smrys])
            if not first_summary.check_data():
                os.makedirs(first_summary.fdir, exist_ok=True)
            for smrs in smrys:
                tbl = pa.concat_tables([tbl, smrs.data()])
                if not first_summary.check_data():
                    os.remove(smrs.file_name())
            if first_summary.check_data():
                first_summary._data = tbl
            else:
                pq.write_table(tbl, first_summary.file_name())
            mnint = dtu.microseconds_to_seconds(float(np.mean(np.diff(tbl["timestamps"].to_numpy()))))
            stdint = dtu.microseconds_to_seconds(float(np.std(np.diff(tbl["timestamps"].to_numpy()))))
            if not combined_mint + combined_std > mnint > combined_mint - combined_std:
                self.errors.append(f"Mean interval s of combined {styp.name} sensor does not match the "
                                   f"compilation of individual mean interval s per packet.  Will use compilation of "
                                   f"individual values.")
                mnint = combined_mint
                stdint = combined_std
            single_smry = PyarrowSummary(first_summary.name, styp, first_summary.start,
                                         1 / mnint, first_summary.fdir, tbl.num_rows, mnint, stdint,
                                         first_summary.data() if first_summary.check_data() else None
                                         )
            self.summaries.append(single_smry)

    def merge_summaries_of_type(self, stype: SensorType):
        """
        combines and replaces multiple summaries of one SensorType into a single one

        *caution: using this on an audio sensor may cause data validation issues*

        :param stype: the type of sensor to combine
        """
        smrs = []
        other_smrs = []
        for smry in self.summaries:
            if smry.stype == stype:
                smrs.append(smry)
            else:
                other_smrs.append(smry)
        first_summary = smrs.pop(0)
        tbl = first_summary.data()
        if not first_summary.check_data():
            os.makedirs(first_summary.fdir, exist_ok=True)
        for smrys in smrs:
            tbl = pa.concat_tables([first_summary.data(), smrys.data()])
            if first_summary.check_data():
                first_summary._data = tbl
            else:
                pq.write_table(tbl, first_summary.file_name())
                os.remove(smrys.file_name())
        mnint = dtu.microseconds_to_seconds(float(np.mean(np.diff(tbl["timestamps"].to_numpy()))))
        stdint = dtu.microseconds_to_seconds(float(np.std(np.diff(tbl["timestamps"].to_numpy()))))
        single_smry = PyarrowSummary(first_summary.name, first_summary.stype, first_summary.start,
                                     1 / mnint, first_summary.fdir, tbl.num_rows, mnint, stdint,
                                     first_summary.data() if first_summary.check_data() else None
                                     )
        self.summaries = other_smrs
        self.summaries.append(single_smry)

    def merge_all_summaries(self):
        """
        merge all PyarrowSummary with the same sensor type into single PyarrowSummary per type
        """
        self.merge_audio_summaries()
        self.merge_non_audio_summaries()

    def get_audio(self) -> List[PyarrowSummary]:
        """
        :return: a list of PyarrowSummary of only Audio data
        """
        return [s for s in self.summaries if s.stype == srupa.SensorType.AUDIO]

    def get_non_audio(self) -> Dict[srupa.SensorType, List[PyarrowSummary]]:
        """
        :return: a dictionary of non-Audio SensorType: PyarrowSummary
        """
        result = {}
        for k in self.sensor_types():
            if k != srupa.SensorType.AUDIO:
                result[k] = [s for s in self.summaries if s.stype == k]
        return result

    def get_non_audio_list(self) -> List[PyarrowSummary]:
        """
        :return: a list of all non-Audio PyarrowSummary
        """
        return [s for s in self.summaries if s.stype != srupa.SensorType.AUDIO]

    def get_sensor(self, stype: srupa.SensorType) -> List[PyarrowSummary]:
        """
        :param stype: type of sensor to find
        :return: a list of all PyarrowSummary of the specified type
        """
        return [s for s in self.summaries if s.stype == stype]

    def sensor_types(self) -> List[srupa.SensorType]:
        """
        :return: a list of sensor types in self.summaries
        """
        result = []
        for s in self.summaries:
            if s.stype not in result:
                result.append(s.stype)
        return result
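
# A hedged usage sketch; the PyarrowSummary argument order follows from_dict
# above: (name, stype, start, srate_hz, fdir, scount, smint_s, sstd_s[, data]).
# The values are illustrative and no parquet files are touched.
agg = AggregateSummary()
agg.add_summary(PyarrowSummary("audio", SensorType.AUDIO, 0.0, 800.0,
                               "/tmp/summaries", 800, 1.0 / 800.0, 0.0))
print(agg.sensor_types())      # [SensorType.AUDIO]
print(len(agg.get_audio()))    # 1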
Example #13
class ApiReader:
    """
    Reads data from api 900 or api 1000 format, converting all data read into RedvoxPacketM for
    ease of comparison and use.

    Properties:
        filter: io.ReadFilter with the station ids, start and end time, start and end time padding, and
        types of files to read

        base_dir: str of the directory containing all the files to read

        structured_dir: bool, if True, the base_dir contains a specific directory structure used by the
        respective api formats.  If False, base_dir only has the data files.  Default False.

        files_index: io.Index of the files that match the filter that are in base_dir

        index_summary: io.IndexSummary of the filtered data

        debug: bool, if True, output additional information during function execution.  Default False.
    """
    def __init__(
        self,
        base_dir: str,
        structured_dir: bool = False,
        read_filter: Optional[io.ReadFilter] = None,
        debug: bool = False,
        pool: Optional[multiprocessing.pool.Pool] = None,
    ):
        """
        Initialize the ApiReader object

        :param base_dir: directory containing the files to read
        :param structured_dir: if True, base_dir contains a specific directory structure used by the respective
                                api formats.  If False, base_dir only has the data files.  Default False.
        :param read_filter: ReadFilter for the data files, if None, get everything.  Default None
        :param debug: if True, output program warnings/errors during function execution.  Default False.
        :param pool: optional multiprocessing pool for parallel reads.  If None, a pool is created and
                        closed internally.  Default None
        """
        _pool: multiprocessing.pool.Pool = (multiprocessing.Pool()
                                            if pool is None else pool)

        if read_filter:
            self.filter = read_filter
            if self.filter.station_ids:
                self.filter.station_ids = set(self.filter.station_ids)
        else:
            self.filter = io.ReadFilter()
        self.base_dir = base_dir
        self.structured_dir = structured_dir
        self.debug = debug
        self.errors = RedVoxExceptions("APIReader")
        self.files_index = self._get_all_files(_pool)
        self.index_summary = io.IndexSummary.from_index(
            self._flatten_files_index())
        mem_split_factor = 1
        if len(self.files_index) > 0:
            if settings.is_parallelism_enabled():
                mem_split_factor = len(self.files_index)
            self.chunk_limit = psutil.virtual_memory(
            ).available * PERCENT_FREE_MEM_USE / mem_split_factor
            max_file_size = max([
                fe.file_size_bytes for fi in self.files_index
                for fe in fi.entries
            ])
            if max_file_size > self.chunk_limit:
                raise MemoryError(
                    f"System requires {max_file_size} bytes of memory to process a file but only has "
                    f"{self.chunk_limit} available.  Please free or add more RAM."
                )
            if debug:
                print(
                    f"{mem_split_factor} stations each have {int(self.chunk_limit)} bytes for loading files in "
                    f"memory.")
        else:
            self.chunk_limit = 0

        if debug:
            self.errors.print()

        if pool is None:
            _pool.close()

    def _flatten_files_index(self):
        """
        :return: flattened version of files_index
        """
        result = io.Index()
        for i in self.files_index:
            result.append(i.entries)
        return result

    def _get_all_files(
            self,
            pool: Optional[multiprocessing.pool.Pool] = None
    ) -> List[io.Index]:
        """
        get all files in the base dir of the ApiReader

        :param pool: optional multiprocessing pool, default None
        :return: index with all the files that match the filter
        """
        _pool: multiprocessing.pool.Pool = (multiprocessing.Pool()
                                            if pool is None else pool)
        index: List[io.Index] = []
        # this guarantees that all ids we search for are valid
        all_index = self._apply_filter(pool=_pool)
        for station_id in all_index.summarize().station_ids():
            id_index = all_index.get_index_for_station_id(station_id)
            checked_index = self._check_station_stats(id_index, pool=_pool)
            index.extend(checked_index)

        if pool is None:
            _pool.close()

        return index

    def _apply_filter(
        self,
        reader_filter: Optional[io.ReadFilter] = None,
        pool: Optional[multiprocessing.pool.Pool] = None,
    ) -> io.Index:
        """
        apply the filter of the reader, or another filter if specified

        :param reader_filter: optional filter; if None, use the reader's filter, default None
        :param pool: optional multiprocessing pool, default None
        :return: index of the filtered files
        """
        _pool: multiprocessing.pool.Pool = (multiprocessing.Pool()
                                            if pool is None else pool)
        if not reader_filter:
            reader_filter = self.filter
        if self.structured_dir:
            index = io.index_structured(self.base_dir,
                                        reader_filter,
                                        pool=_pool)
        else:
            index = io.index_unstructured(self.base_dir,
                                          reader_filter,
                                          pool=_pool)
        if pool is None:
            _pool.close()
        return index

    def _check_station_stats(
        self,
        station_index: io.Index,
        pool: Optional[multiprocessing.pool.Pool] = None,
    ) -> List[io.Index]:
        """
        check the index's results; if it has enough information, return it, otherwise search for more data.
        The index should only request one station id.
        If the station was restarted during the request period, a new group of indexes will be created
        to represent the change in station metadata.

        :param station_index: index representing the requested information
        :param pool: optional multiprocessing pool, default None
        :return: List of Indexes that includes as much information as possible that fits the request
        """
        _pool: multiprocessing.pool.Pool = multiprocessing.Pool(
        ) if pool is None else pool
        # if we found nothing, return the index
        if len(station_index.entries) < 1:
            return [station_index]

        stats = fs.extract_stats(station_index, pool=_pool)
        # Close pool if created here
        if pool is None:
            _pool.close()

        timing_offsets: Optional[
            offset_model.TimingOffsets] = offset_model.compute_offsets(stats)

        # punt if duration or other important values are invalid or if the latency array was empty
        if timing_offsets is None:
            return [station_index]

        diff_s = diff_e = timedelta(seconds=0)

        # if our filtered files do not encompass the request even when the packet times are updated
        # try getting 1.5 times the difference of the expected start/end and the start/end of the data
        insufficient_str = ""
        if self.filter.start_dt and timing_offsets.adjusted_start > self.filter.start_dt:
            insufficient_str += f" {self.filter.start_dt} (start)"
            # diff_s = self.filter.start_dt_buf + 1.5 * (timing_offsets.adjusted_start - self.filter.start_dt)
            new_end = self.filter.start_dt - self.filter.start_dt_buf
            new_start = new_end - 1.5 * (timing_offsets.adjusted_start -
                                         self.filter.start_dt)
            new_index = self._apply_filter(io.ReadFilter().with_start_dt(
                new_start).with_end_dt(new_end).with_extensions(
                    self.filter.extensions).with_api_versions(
                        self.filter.api_versions).with_station_ids(
                            set(station_index.summarize().station_ids())
                        ).with_start_dt_buf(diff_s).with_end_dt_buf(diff_e))
            if len(new_index.entries) > 0:
                station_index.append(new_index.entries)
                stats.extend(fs.extract_stats(new_index))
        if self.filter.end_dt and timing_offsets.adjusted_end < self.filter.end_dt:
            insufficient_str += f" {self.filter.end_dt} (end)"
            # diff_e = self.filter.end_dt_buf + 1.5 * (self.filter.end_dt - timing_offsets.adjusted_end)
            new_start = self.filter.end_dt + self.filter.end_dt_buf
            new_end = new_start + 1.5 * (self.filter.end_dt -
                                         timing_offsets.adjusted_end)
            new_index = self._apply_filter(io.ReadFilter().with_start_dt(
                new_start).with_end_dt(new_end).with_extensions(
                    self.filter.extensions).with_api_versions(
                        self.filter.api_versions).with_station_ids(
                            set(station_index.summarize().station_ids())
                        ).with_start_dt_buf(diff_s).with_end_dt_buf(diff_e))
            if len(new_index.entries) > 0:
                station_index.append(new_index.entries)
                stats.extend(fs.extract_stats(new_index))
        if len(insufficient_str) > 0:
            self.errors.append(
                f"Data for {station_index.summarize().station_ids()} exists, "
                f"but not at:{insufficient_str}")

        results = {}
        keys = []

        for v, e in enumerate(stats):
            key = e.app_start_dt
            if key not in keys:
                keys.append(key)
                results[key] = io.Index()

            results[key].append(entries=[station_index.entries[v]])

        return list(results.values())

    def _split_workload(self, findex: io.Index) -> List[io.Index]:
        """
        takes an index and splits it into chunks based on a size limit
        while running_total + next_file_size < limit, adds files to a chunk (Index)
        if limit is exceeded, adds the chunk and puts the next file into a new chunk

        :param findex: index of files to split
        :return: list of Index to process
        """
        packet_list = []
        chunk_queue = 0
        chunk_list = []
        for f in findex.entries:
            chunk_queue += f.file_size_bytes
            if chunk_queue > self.chunk_limit:
                packet_list.append(io.Index(chunk_list))
                chunk_queue = f.file_size_bytes  # start the new chunk's running total with this file
                chunk_list = []
            chunk_list.append(f)
        packet_list.append(io.Index(chunk_list))
        return packet_list

    @staticmethod
    def read_files_in_index(indexf: io.Index) -> List[api_m.RedvoxPacketM]:
        """
        read all the files in the index

        :param indexf: io.Index of files to read
        :return: list of RedvoxPacketM, converted from API 900 if necessary
        """
        result: List[api_m.RedvoxPacketM] = []

        # Iterate over the API 900 packets in a memory efficient way
        # and convert to API 1000
        # noinspection PyTypeChecker
        for packet_900 in indexf.stream_raw(
                io.ReadFilter.empty().with_api_versions(
                    {io.ApiVersion.API_900})):
            # noinspection Mypy
            result.append(ac.convert_api_900_to_1000_raw(packet_900))

        # Grab the API 1000 packets
        # noinspection PyTypeChecker
        for packet in indexf.stream_raw(
                io.ReadFilter.empty().with_api_versions(
                    {io.ApiVersion.API_1000})):
            # noinspection Mypy
            result.append(packet)

        return result

    # noinspection PyTypeChecker
    def read_files_by_id(
            self, station_id: str) -> Optional[List[api_m.RedvoxPacketM]]:
        """
        :param station_id: the id to filter on
        :return: the list of packets with the requested id, or None if the id can't be found
        """

        result: List[api_m.RedvoxPacketM] = []

        # Iterate over the API 900 packets in a memory efficient way
        # and convert to API 1000
        for packet_900 in self._flatten_files_index().stream_raw(
                io.ReadFilter.empty().with_api_versions(
                    {io.ApiVersion.API_900}).with_station_ids({station_id})):
            # noinspection Mypy
            result.append(ac.convert_api_900_to_1000_raw(packet_900))

        # Grab the API 1000 packets
        for packet in self._flatten_files_index().stream_raw(
                io.ReadFilter.empty().with_api_versions(
                    {io.ApiVersion.API_1000}).with_station_ids({station_id})):
            # noinspection Mypy
            result.append(packet)

        if len(result) == 0:
            return None

        return result

    def _station_by_index(self, findex: io.Index) -> Station:
        """
        :param findex: index with files to build a station with
        :return: Station built from files in findex
        """
        return Station.create_from_packets(self.read_files_in_index(findex))

    def get_stations(
            self,
            pool: Optional[multiprocessing.pool.Pool] = None) -> List[Station]:
        """
        :param pool: optional multiprocessing pool
        :return: List of all stations in the ApiReader
        """
        return list(
            maybe_parallel_map(pool,
                               self._station_by_index,
                               self.files_index,
                               chunk_size=1))

    def get_station_by_id(self, get_id: str) -> Optional[List[Station]]:
        """
        :param get_id: the id to filter on
        :return: list of all stations with the requested id or None if id can't be found
        """
        result = [s for s in self.get_stations() if s.id() == get_id]
        if len(result) < 1:
            return None
        return result
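
# A hedged sketch: share one multiprocessing pool between indexing and station
# construction.  The directory and station id are placeholders.
import multiprocessing

if __name__ == "__main__":
    with multiprocessing.Pool() as pool:
        reader = ApiReader("/data/redvox", structured_dir=True, pool=pool)
        stations = reader.get_stations(pool=pool)
        one_station = reader.get_station_by_id("1637680001")  # None if the id is absent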
Example #14
class TimeSyncAnalysis:
    """
    Used for multiple TimeSyncData objects from a station
    properties:
        station_id: string, the station_id of the station being analyzed, default empty string
        best_latency_index: int, the index of the TimeSyncData object with the best latency, default np.nan
        latency_stats: StatsContainer, the statistics of the latencies
        offset_stats: StatsContainer, the statistics of the offsets
        offset_model: optional OffsetModel, used to calculate offset at a given point in time
        sample_rate_hz: float, the audio sample rate in hz of the station, default np.nan
        timesync_data: list of TimeSyncData, the TimeSyncData to analyze, default empty list
        station_start_timestamp: float, the timestamp of when the station became active, default np.nan
    """
    def __init__(
        self,
        station_id: str = "",
        audio_sample_rate_hz: float = np.nan,
        station_start_timestamp: float = np.nan,
        time_sync_data: Optional[List[TimeSyncData]] = None,
    ):
        """
        Initialize the object

        :param station_id: id of the station to analyze, default empty string
        :param audio_sample_rate_hz: audio sample rate in hz of the station, default np.nan
        :param station_start_timestamp: timestamp of when station started recording, default np.nan
        :param time_sync_data: the TimeSyncData objects created from the packets of the station, default None
        """
        self.station_id: str = station_id
        self.sample_rate_hz: float = audio_sample_rate_hz
        self.station_start_timestamp: float = station_start_timestamp
        self.best_latency_index: int = np.nan
        self.latency_stats = sh.StatsContainer("latency")
        self.offset_stats = sh.StatsContainer("offset")
        self.errors = RedVoxExceptions("TimeSyncAnalysis")
        if time_sync_data:
            self.timesync_data: List[TimeSyncData] = time_sync_data
            self.evaluate_and_validate_data()
        else:
            self.timesync_data = []
            self.offset_model = OffsetModel.empty_model()

    def evaluate_and_validate_data(self):
        """
        check the data for errors and update the analysis statistics
        """
        self.evaluate_latencies()
        self.validate_start_timestamp()
        self.validate_sample_rate()
        self._calc_timesync_stats()
        self.offset_model = self.get_offset_model()

    def get_offset_model(self) -> OffsetModel:
        """
        :return: an OffsetModel based on the information in the timesync analysis
        """
        return OffsetModel(
            self.get_latencies(), self.get_offsets(),
            np.array([
                td.get_best_latency_timestamp() for td in self.timesync_data
            ]), self.timesync_data[0].packet_start_timestamp,
            self.timesync_data[-1].packet_end_timestamp)

    def _calc_timesync_stats(self):
        """
        calculates the mean and std deviation for latencies and offsets
        """
        if len(self.timesync_data) < 1:
            self.errors.append(
                "Nothing to calculate stats; length of timesync data is less than 1"
            )
        else:
            for index in range(len(self.timesync_data)):
                # add the stats of the latency
                self.latency_stats.add(
                    self.timesync_data[index].mean_latency,
                    self.timesync_data[index].latency_std,
                    self.timesync_data[index].num_tri_messages() * 2,
                )
                # add the stats of the offset
                self.offset_stats.add(
                    self.timesync_data[index].mean_offset,
                    self.timesync_data[index].offset_std,
                    self.timesync_data[index].num_tri_messages() * 2,
                )
            self.latency_stats.best_value = self.get_best_latency()
            self.offset_stats.best_value = self.get_best_offset()

    def from_packets(
        self, packets: List[Union[WrappedRedvoxPacketM, WrappedRedvoxPacket]]
    ) -> 'TimeSyncAnalysis':
        """
        converts packets into TimeSyncData objects, then performs analysis

        :param packets: list of WrappedRedvoxPacketM or WrappedRedvoxPacket to convert
        :return: modified version of self
        """
        self.timesync_data = [
            TimeSyncData(
                self.station_id,
                self.sample_rate_hz,
                packet.get_sensors().get_audio().get_num_samples(),
                self.station_start_timestamp,
                packet.get_timing_information(
                ).get_server_acquisition_arrival_timestamp(),
                packet.get_timing_information().
                get_packet_start_mach_timestamp(),
                packet.get_timing_information().get_packet_end_mach_timestamp(
                ),
                packet.get_timing_information().get_synch_exchange_array(),
                packet.get_timing_information().get_best_latency(),
                packet.get_timing_information().get_best_offset(),
            ) if isinstance(packet, WrappedRedvoxPacketM) else TimeSyncData(
                self.station_id,
                self.sample_rate_hz,
                packet.microphone_sensor().payload_values().size,
                self.station_start_timestamp,
                packet.server_timestamp_epoch_microseconds_utc(),
                packet.start_timestamp_us_utc(),
                packet.end_timestamp_us_utc(),
                list(packet.time_synchronization_sensor().payload_values()),
                packet.best_latency(),
                packet.best_offset(),
            ) for packet in packets
        ]
        if len(self.timesync_data) > 0:
            self.evaluate_and_validate_data()
        return self

    def from_raw_packets(
        self, packets: List[Union[RedvoxPacketM,
                                  RedvoxPacket]]) -> 'TimeSyncAnalysis':
        """
        converts packets into TimeSyncData objects, then performs analysis

        :param packets: list of RedvoxPacketM or RedvoxPacket to convert
        :return: modified version of self
        """
        timesync_data: List[TimeSyncData] = []

        packet: Union[RedvoxPacketM, RedvoxPacket]
        for packet in packets:
            tsd: TimeSyncData
            if isinstance(packet, RedvoxPacketM):
                exchanges: List[float] = reduce(
                    lambda acc, ex: acc +
                    [ex.a1, ex.a2, ex.a3, ex.b1, ex.b2, ex.b3],
                    packet.timing_information.synch_exchanges, [])
                tsd = TimeSyncData(
                    packet.station_information.id,
                    packet.sensors.audio.sample_rate,
                    len(packet.sensors.audio.samples.values),
                    packet.timing_information.app_start_mach_timestamp, packet.
                    timing_information.server_acquisition_arrival_timestamp,
                    packet.timing_information.packet_start_mach_timestamp,
                    packet.timing_information.packet_end_mach_timestamp,
                    exchanges, packet.timing_information.best_latency,
                    packet.timing_information.best_offset)
            else:
                mtz: float = np.nan
                best_latency: float = np.nan
                best_offset: float = np.nan

                for i, v in enumerate(packet.metadata):
                    plus_1: int = i + 1
                    try:
                        if v == "machTimeZero" and plus_1 < len(
                                packet.metadata):
                            mtz = float(packet.metadata[plus_1])
                        if v == "bestLatency" and plus_1 < len(
                                packet.metadata):
                            best_latency = float(packet.metadata[plus_1])
                        if v == "bestOffset" and plus_1 < len(packet.metadata):
                            best_offset = float(packet.metadata[plus_1])
                    except (KeyError, ValueError):
                        continue

                # Get synch exchanges
                exchanges: Optional[np.ndarray] = None
                ch: api900_pb2.UnevenlySampledChannel
                for ch in packet.unevenly_sampled_channels:
                    if api900_pb2.TIME_SYNCHRONIZATION in ch.channel_types:
                        exchanges = util_900.extract_payload(ch)

                tsd = TimeSyncData(
                    packet.redvox_id,
                    packet.evenly_sampled_channels[0].sample_rate_hz,
                    util_900.payload_len(packet.evenly_sampled_channels[0]),
                    mtz,
                    packet.evenly_sampled_channels[0].
                    first_sample_timestamp_epoch_microseconds_utc,
                    packet.server_timestamp_epoch_microseconds_utc,
                    packet.app_file_start_timestamp_machine,
                    list(exchanges),
                    best_latency,
                    best_offset,
                )

            timesync_data.append(tsd)

        self.timesync_data = timesync_data

        if len(self.timesync_data) > 0:
            self.evaluate_and_validate_data()

        return self

    def add_timesync_data(self, timesync_data: TimeSyncData):
        """
        adds a TimeSyncData object to the analysis

        :param timesync_data: TimeSyncData to add
        """
        self.timesync_data.append(timesync_data)
        self.evaluate_and_validate_data()

    def get_num_packets(self) -> int:
        """
        :return: number of packets analyzed
        """
        return len(self.timesync_data)

    def get_best_latency(self) -> float:
        """
        :return: the best latency
        """
        if np.isnan(self.best_latency_index):
            return np.nan
        return self.timesync_data[self.best_latency_index].best_latency

    def get_latencies(self) -> np.array:
        """
        :return: np.array containing all the latencies
        """
        return np.array(
            [ts_data.best_latency for ts_data in self.timesync_data])

    def get_mean_latency(self) -> float:
        """
        :return: the mean of the latencies, or np.nan if it doesn't exist
        """
        return self.latency_stats.mean_of_means()

    def get_latency_stdev(self) -> float:
        """
        :return: the standard deviation of the latencies, or np.nan if it doesn't exist
        """
        return self.latency_stats.total_std_dev()

    def get_best_offset(self) -> float:
        """
        :return: offset associated with the best latency
        """
        if np.isnan(self.best_latency_index):
            return np.nan
        return self.timesync_data[self.best_latency_index].best_offset

    def get_offsets(self) -> np.array:
        """
        :return: np.array containing all the offsets
        """
        return np.array(
            [ts_data.best_offset for ts_data in self.timesync_data])

    def get_mean_offset(self) -> float:
        """
        :return: the mean of the offsets, or np.nan if it doesn't exist
        """
        return self.offset_stats.mean_of_means()

    def get_offset_stdev(self) -> float:
        """
        :return: the standard deviation of the offsets, or np.nan if it doesn't exist
        """
        return self.offset_stats.total_std_dev()

    def get_best_packet_latency_index(self) -> int:
        """
        :return: the best latency's index in the packet with the best latency
        """
        if np.isnan(self.best_latency_index):
            return np.nan
        return self.timesync_data[self.best_latency_index].best_latency_index

    def get_best_start_time(self) -> float:
        """
        :return: start timestamp associated with the best latency
        """
        if np.isnan(self.best_latency_index):
            return np.nan
        return self.timesync_data[
            self.best_latency_index].packet_start_timestamp

    def get_start_times(self) -> np.array:
        """
        :return: list of the start timestamps of each packet
        """
        start_times = []
        for ts_data in self.timesync_data:
            start_times.append(ts_data.packet_start_timestamp)
        return np.array(start_times)

    def get_bad_packets(self) -> List[int]:
        """
        :return: list of all packets that contains invalid data
        """
        bad_packets = []
        for idx in range(self.get_num_packets()
                         ):  # mark bad indices (they have a 0 or less value)
            if self.get_latencies()[idx] <= 0 or np.isnan(
                    self.get_latencies()[idx]):
                bad_packets.append(idx)
        return bad_packets

    def evaluate_latencies(self):
        """
        finds the best latency
        outputs warnings if a change in timestamps is detected
        """
        if self.get_num_packets() < 1:
            self.errors.append(
                "Latencies cannot be evaluated; length of timesync data is less than 1"
            )
        else:
            self.best_latency_index = 0
            # assume the first element has the best timesync values for now, then compare with the others
            for index in range(1, self.get_num_packets()):
                best_latency = self.get_best_latency()
                # find the best latency; in this case, the minimum
                # if new value exists and if the current best does not or new value is better than current best, update
                if (not np.isnan(self.timesync_data[index].best_latency) and
                    (np.isnan(best_latency)) or
                        self.timesync_data[index].best_latency < best_latency):
                    self.best_latency_index = index

    def validate_start_timestamp(self, debug: bool = False) -> bool:
        """
        confirms if station_start_timestamp differs in any of the timesync_data
        outputs warnings if a change in timestamps is detected

        :param debug: if True, output warning message, default False
        :return: True if no change
        """
        for index in range(self.get_num_packets()):
            # compare station start timestamps; notify when they are different
            if (self.timesync_data[index].station_start_timestamp !=
                    self.station_start_timestamp):
                self.errors.append(
                    f"Change in station start timestamp detected; "
                    f"expected: {self.station_start_timestamp}, read: "
                    f"{self.timesync_data[index].station_start_timestamp}")
                if debug:
                    self.errors.print()
                return False
        # if here, all the sample timestamps are the same
        return True

    def validate_sample_rate(self, debug: bool = False) -> bool:
        """
        confirms if sample rate is the same across all timesync_data
        outputs warning if a change in sample rate is detected

        :param debug: if True, output warning message, default False
        :return: True if no change
        """
        for index in range(self.get_num_packets()):
            # compare sample rates; notify when they are different
            if (np.isnan(self.timesync_data[index].sample_rate_hz)
                    or self.timesync_data[index].sample_rate_hz !=
                    self.sample_rate_hz):
                self.errors.append(
                    f"Change in station sample rate detected; "
                    f"expected: {self.sample_rate_hz}, read: {self.timesync_data[index].sample_rate_hz}"
                )
                if debug:
                    self.errors.print()
                return False
        # if here, all the sample rates are the same
        return True

    def validate_time_gaps(self,
                           gap_duration_s: float,
                           debug: bool = False) -> bool:
        """
        confirms there are no data gaps between packets
        outputs warning if a gap is detected

        :param gap_duration_s: length of time in seconds to be detected as a gap
        :param debug: if True, output warning message, default False
        :return: True if no gap
        """
        if self.get_num_packets() < 2:
            self.errors.append(
                "Fewer than 2 timesync data objects to evaluate gaps with")
            if debug:
                self.errors.print()
        else:
            for index in range(1, self.get_num_packets()):
                # compare last packet's end timestamp with current start timestamp
                if (dt.microseconds_to_seconds(
                        self.timesync_data[index].packet_start_timestamp -
                        self.timesync_data[index - 1].packet_end_timestamp) >
                        gap_duration_s):
                    self.errors.append(
                        f"Gap detected at packet number: {index}")
                    if debug:
                        self.errors.print()
                    return False
        # if here, no gaps
        return True

    def update_timestamps(self, use_model: bool = True):
        """
        update timestamps by adding microseconds based on the OffsetModel.

        :param use_model: if True, use the model, otherwise use best offset
        """
        if use_model and self.offset_model:
            self.station_start_timestamp += self.offset_model.get_offset_at_time(
                self.station_start_timestamp)
            for tsd in self.timesync_data:
                tsd.update_timestamps(self.offset_model)
        else:
            self.station_start_timestamp += self.get_best_offset()
            for tsd in self.timesync_data:
                tsd.update_timestamps()
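
A hedged usage sketch of the latency-evaluation flow above, assuming `ts` is an already-populated
instance of the timesync collection class these methods belong to (its constructor is not shown in
this excerpt); the 5-second gap threshold is an arbitrary example value:

    ts.evaluate_latencies()                    # pick the packet with the smallest valid latency
    ts.validate_start_timestamp(debug=True)    # warn if the station restarted mid-window
    ts.validate_sample_rate(debug=True)        # warn if the sample rate changed between packets
    ts.validate_time_gaps(gap_duration_s=5.0)  # flag gaps longer than 5 seconds between packets
    print("bad packet indices:", ts.get_bad_packets())
    ts.update_timestamps(use_model=True)       # apply the OffsetModel-based correction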
示例#15
0
class DataWindow:
    """
    Holds the data for a given time window; adds interpolated timestamps to fill gaps and pad start and end values

    Properties:
        event_name: str, name of the DataWindow.  defaults to "dw"

        event_origin: Optional EventOrigin which describes the physical location and radius of the
        origin event.  Default empty EventOrigin (no valid data)

        config: optional DataWindowConfig with information on how to construct DataWindow from
        Redvox (.rdvx*) files.  Default None

        sdk_version: str, the version of the Redvox SDK used to create the DataWindow

        debug: bool, if True, outputs additional information during initialization. Default False

    Protected:
        _fs_writer: DataWindowFileSystemWriter; includes event_name, output directory (Default "."),
        output type (options: "PARQUET", "LZ4", "NONE".  Default NONE), and option to make a
        runme.py example file (Default False)

        _stations: List of Stations that belong to the DataWindow

        _errors: RedVoxExceptions; contains a list of all errors encountered by the DataWindow
    """
    def __init__(
            self,
            event_name: str = "dw",
            event_origin: Optional[EventOrigin] = None,
            config: Optional[DataWindowConfig] = None,
            output_dir: str = ".",
            out_type: str = "NONE",
            make_runme: bool = False,
            debug: bool = False,
    ):
        """
        Initialize the DataWindow

        :param event_name: name of the DataWindow.  defaults to "dw"
        :param event_origin: Optional EventOrigin which describes the physical location and radius of the
                                origin event.  Default empty EventOrigin (no valid data)
        :param config: Optional DataWindowConfig which describes how to extract data from Redvox files.
                        Default None
        :param output_dir: output directory for saving files.  Default "." (current directory)
        :param out_type: type of file to save the DataWindow as.  Options: "PARQUET", "LZ4", "NONE".
                            Default "NONE" (no saving)
        :param make_runme: if True, saves an example runme.py file with the data.  Default False
        :param debug: if True, outputs additional information during initialization.  Default False
        """
        self.event_name: str = event_name
        self.event_origin: EventOrigin = event_origin if event_origin else EventOrigin()
        self._fs_writer = dw_io.DataWindowFileSystemWriter(self.event_name, out_type, output_dir, make_runme)
        self.debug: bool = debug
        self._sdk_version: str = redvox.VERSION
        self._errors = RedVoxExceptions("DataWindow")
        self._stations: List[Station] = []
        self._config = config
        if config:
            if config.start_datetime and config.end_datetime and (config.end_datetime <= config.start_datetime):
                self._errors.append("DataWindow will not work when end datetime is before or equal to start datetime.\n"
                                    f"Your times: {config.end_datetime} <= {config.start_datetime}")
            else:
                self.create_data_window()
        if self.debug:
            self.print_errors()
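
    # Hedged usage sketch (kept as comments so the class body stays valid): constructing a
    # DataWindow from a DataWindowConfig.  The input directory below is a placeholder, and it
    # is assumed the remaining DataWindowConfig parameters have usable defaults.
    #
    #   config = DataWindowConfig("/data/redvox", True)   # input directory, structured layout
    #   dw = DataWindow(event_name="example_dw", config=config,
    #                   output_dir=".", out_type="PARQUET", debug=True)
    #   dw.print_errors()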

    # def __repr__(self):
    #     # todo: use representations for the datetime and timedelta objects
    #     # todo: use the dictionary function
    #     return dw_io.dict_to_json({
    #         "event_name": self.event_name,
    #         "event_origin": repr(self.event_origin),
    #         "config": repr(self._config),
    #         "base_dir": self.save_dir(),
    #         "out_type": self._fs_writer.file_extension,
    #         "make_runme": self._fs_writer.make_run_me,
    #         "sdk_version": self._sdk_version,
    #         "errors": repr(self._errors),
    #         "debug": self.debug
    #     })
    #
    # def __str__(self):
    #     # todo: use representations for the datetime and timedelta objects
    #     # todo: use the dictionary function
    #     return dw_io.dict_to_json(
    #         {"event_name": self.event_name,
    #          "event_origin": str(self.event_origin),
    #          "config": str(self._config),
    #          "base_dir": self.save_dir(),
    #          "stations": [s.default_station_json_file_name() for s in self._stations],
    #          "out_type": self._fs_writer.file_extension,
    #          "make_runme": self._fs_writer.make_run_me,
    #          "sdk_version": self._sdk_version,
    #          "errors": str(self._errors),
    #          "debug": self.debug
    #          })

    def save_dir(self) -> str:
        """
        :return: directory data is saved to (empty string means saving to memory)
        """
        return self._fs_writer.save_dir()

    def set_save_dir(self, new_save_dir: Optional[str] = "."):
        """
        :param new_save_dir: directory to save data to; default current directory, or "."
        """
        self._fs_writer.base_dir = new_save_dir

    def is_make_runme(self) -> bool:
        """
        :return: if DataWindow will be saved with a runme file
        """
        return self._fs_writer.make_run_me

    def set_make_runme(self, make_runme: bool = False):
        """
        :param make_runme: if True, DataWindow will create a runme file when saved.  Default False
        """
        self._fs_writer.make_run_me = make_runme

    def fs_writer(self) -> dw_io.DataWindowFileSystemWriter:
        """
        :return: DataWindowFileSystemWriter for DataWindow
        """
        return self._fs_writer

    def out_type(self) -> str:
        """
        :return: string of the output type of the DataWindow
        """
        return self._fs_writer.file_extension

    def set_out_type(self, new_out_type: str):
        """
        set the output type of the DataWindow.  options are "NONE", "PARQUET" and "LZ4".  invalid values become "NONE"

        :param new_out_type: new output type of the DataWindow
        """
        self._fs_writer.set_extension(new_out_type)

    def as_dict(self) -> Dict:
        """
        :return: DataWindow properties as dictionary
        """
        return {"event_name": self.event_name,
                "event_origin": self.event_origin.as_dict(),
                "start_time": self.start_date(),
                "end_time": self.end_date(),
                "base_dir": self.save_dir(),
                "stations": [s.default_station_json_file_name() for s in self._stations],
                "config": self._config.as_dict(),
                "debug": self.debug,
                "errors": self._errors.as_dict(),
                "sdk_version": self._sdk_version,
                "out_type": self._fs_writer.file_extension,
                "make_runme": self._fs_writer.make_run_me
                }

    def pretty(self) -> str:
        """
        :return: the DataWindow's dictionary representation as an easier-to-read formatted string
        """
        # noinspection Mypy
        return pprint.pformat(self.as_dict())

    @staticmethod
    def from_config(config: DataWindowConfigFile) -> "DataWindow":
        """
        Use a config file to create a DataWindow

        :param config: DataWindowConfigFile to load from
        :return: DataWindow
        """
        event_origin = EventOrigin(config.origin_provider, config.origin_latitude, config.origin_latitude_std,
                                   config.origin_longitude, config.origin_longitude_std, config.origin_altitude,
                                   config.origin_altitude_std, config.origin_event_radius_m)
        dw_config = DataWindowConfig(config.input_directory, config.structured_layout, config.start_dt(),
                                     config.end_dt(), config.start_buffer_td(), config.end_buffer_td(),
                                     config.drop_time_seconds, config.station_ids, config.extensions,
                                     config.api_versions, config.apply_correction, config.use_model_correction,
                                     config.copy_edge_points())
        return DataWindow(config.event_name, event_origin, dw_config, config.output_dir, config.output_type,
                          config.make_runme, config.debug)

    @staticmethod
    def from_config_file(file: str) -> "DataWindow":
        """
        Loads a configuration file to create the DataWindow

        :param file: full path to config file
        :return: DataWindow
        """
        return DataWindow.from_config(DataWindowConfigFile.from_path(file))

    @staticmethod
    def deserialize(path: str) -> "DataWindow":
        """
        Decompresses and deserializes a DataWindow written to disk.

        :param path: Path to the serialized and compressed DataWindow.
        :return: An instance of a DataWindow.
        """
        return dw_io.deserialize_data_window(path)

    def serialize(self, compression_factor: int = 4) -> Path:
        """
        Serializes and compresses this DataWindow to a file.
        Uses the event_name and out_dir to name the file.

        :param compression_factor: A value between 1 and 12. Higher values provide better compression, but take
        longer. (default=4).
        :return: The path to the written file.
        """
        return dw_io.serialize_data_window(self, self.save_dir(), f"{self.event_name}.pkl.lz4", compression_factor)

    def _to_json_file(self) -> Path:
        """
        Writes the DataWindow metadata to a JSON file and writes the compressed DataWindow to disk.

        :return: The path to the written file
        """
        return dw_io.data_window_to_json(self, self.save_dir())

    def to_json(self) -> str:
        """
        :return: The DataWindow metadata as a JSON string.
        """
        return dw_io.data_window_as_json(self)

    @staticmethod
    def from_json(json_str: str) -> "DataWindow":
        """
        Read the DataWindow from a JSON string.  If the string is improperly formatted, raises a ValueError.

        :param json_str: the JSON to read
        :return: The DataWindow as defined by the JSON
        """
        return DataWindow.from_json_dict(dw_io.json_to_dict(json_str))

    @staticmethod
    def from_json_dict(json_dict: Dict) -> "DataWindow":
        """
        Reads a JSON dictionary and loads the data into the DataWindow.
        If dictionary is improperly formatted, raises a ValueError.

        :param json_dict: the dictionary to read
        :return: The DataWindow as defined by the JSON
        """
        if "out_type" not in json_dict.keys() \
                or json_dict["out_type"].upper() not in dw_io.DataWindowOutputType.list_names():
            raise ValueError('Dictionary loading type is invalid or unknown.  '
                             'Check the value "out_type"; it must be one of: '
                             f'{dw_io.DataWindowOutputType.list_non_none_names()}')
        else:
            out_type = dw_io.DataWindowOutputType.str_to_type(json_dict["out_type"])
            if out_type == dw_io.DataWindowOutputType.PARQUET:
                dwin = DataWindow(json_dict["event_name"], EventOrigin.from_dict(json_dict["event_origin"]),
                                  None, json_dict["base_dir"], json_dict["out_type"], json_dict["make_runme"],
                                  json_dict["debug"])
                dwin._config = DataWindowConfig.from_dict(json_dict["config"])
                dwin._errors = RedVoxExceptions.from_dict(json_dict["errors"])
                dwin._sdk_version = json_dict["sdk_version"]
                for st in json_dict["stations"]:
                    dwin.add_station(Station.from_json_file(os.path.join(json_dict["base_dir"], st), f"{st}.json"))
            elif out_type == dw_io.DataWindowOutputType.LZ4:
                dwin = DataWindow.deserialize(os.path.join(json_dict["base_dir"],
                                                           f"{json_dict['event_name']}.pkl.lz4"))
            else:
                dwin = DataWindow()
            return dwin

    def save(self) -> Path:
        """
        save the DataWindow to disk if saving is enabled
        if saving is not enabled, adds an error to the DataWindow and returns an empty path.

        :return: the path to where the files exist; an empty path means no files were saved
        """
        if self._fs_writer.is_save_disk():
            if self._fs_writer.is_use_disk() and self._fs_writer.make_run_me:
                shutil.copyfile(os.path.abspath(inspect.getfile(run_me)),
                                os.path.join(self._fs_writer.save_dir(), "runme.py"))
            if self._fs_writer.file_extension == "parquet":
                return self._to_json_file()
            elif self._fs_writer.file_extension == "lz4":
                return self.serialize()
        else:
            self._errors.append("Saving not enabled.")
            print("WARNING: Cannot save data window without knowing extension.")
            return Path()

    @staticmethod
    def load(file_path: str) -> "DataWindow":
        """
        load from json metadata and lz4 compressed file or directory of files

        :param file_path: full path of file to load
        :return: DataWindow from json metadata
        """
        cur_path = os.getcwd()
        os.chdir(os.path.dirname(file_path))
        result = DataWindow.from_json_dict(dw_io.json_file_to_data_window(file_path))
        os.chdir(cur_path)
        return result

    def config(self) -> DataWindowConfig:
        """
        :return: settings used to create the DataWindow
        """
        return self._config

    def sdk_version(self) -> str:
        """
        :return: sdk version used to create the DataWindow
        """
        return self._sdk_version

    def set_sdk_version(self, version: str):
        """
        :param version: the sdk version to set
        """
        self._sdk_version = version

    def start_date(self) -> float:
        """
        :return: minimum start timestamp of the data or np.nan if no data
        """
        if len(self._stations) > 0:
            return np.min([s.first_data_timestamp() for s in self._stations])
        return np.nan

    def end_date(self) -> float:
        """
        :return: maximum end timestamp of the data or np.nan if no data
        """
        if len(self._stations) > 0:
            return np.max([s.last_data_timestamp() for s in self._stations])
        return np.nan

    def stations(self) -> List[Station]:
        """
        :return: list of stations in the DataWindow
        """
        return self._stations

    def station_ids(self) -> List[str]:
        """
        :return: ids of stations in the DataWindow
        """
        return [s.id() for s in self._stations]

    def add_station(self, station: Station):
        """
        add a station to the DataWindow
        :param station: Station to add
        """
        self._stations.append(station)

    def remove_station(self, station_id: Optional[str] = None, start_date: Optional[float] = None):
        """
        remove the first station from the DataWindow, or a specific station if given an id and/or start date.
        if an id is given, the first station with that id will be removed.
        if a start date is given, the removed station will start at or after the start date.
        start date is in microseconds since epoch UTC

        :param station_id: id of station to remove
        :param start_date: start date that is at or before the station to remove
        """
        id_removals = []
        sd_removals = []
        if station_id is None and start_date is None:
            self._stations.pop()
        else:
            if station_id is not None:
                for s in range(len(self._stations)):
                    if self._stations[s].id() == station_id:
                        id_removals.append(s)
            if start_date is not None:
                for s in range(len(self._stations)):
                    if self._stations[s].start_date() >= start_date:
                        sd_removals.append(s)
            if len(id_removals) > 0 and start_date is None:
                self._stations.pop(id_removals.pop())
            elif len(sd_removals) > 0 and station_id is None:
                self._stations.pop(sd_removals.pop())
            elif len(id_removals) > 0 and len(sd_removals) > 0:
                for a in id_removals:
                    for b in sd_removals:
                        if a == b:
                            self._stations.pop(a)
                            return
                        if a < b:
                            continue

    def first_station(self, station_id: Optional[str] = None) -> Optional[Station]:
        """
        :param station_id: optional station id to filter on
        :return: first station matching params; if no params given, gets first station in list.
                    returns None if no station with given station_id exists.
        """
        if len(self._stations) < 1:
            self._errors.append(f"Attempted to get a station, but there are no stations in the data window!")
            if self.debug:
                print(f"Attempted to get a station, but there are no stations in the data window!")
            return None
        elif station_id:
            result = [s for s in self._stations if s.get_key().check_key(station_id, None, None)]
            if len(result) > 0:
                return result[0]
            self._errors.append(f"Attempted to get station {station_id}, but that station is not in this data window!")
            if self.debug:
                print(f"Attempted to get station {station_id}, but that station is not in this data window!")
            return None
        return self._stations[0]

    def get_station(self, station_id: str, station_uuid: Optional[str] = None,
                    start_timestamp: Optional[float] = None) -> Optional[List[Station]]:
        """
        Get stations from the DataWindow.  Must give at least the station's id.  Other parameters may be None,
        which means the value will be ignored when searching.  Results will match all non-None parameters given.

        :param station_id: station id
        :param station_uuid: station uuid, default None
        :param start_timestamp: station start timestamp in microseconds since UTC epoch, default None
        :return: A list of valid stations or None if the station cannot be found
        """
        result = [s for s in self._stations if s.get_key().check_key(station_id, station_uuid, start_timestamp)]
        if len(result) > 0:
            return result
        self._errors.append(f"Attempted to get station {station_id}, but that station is not in this data window!")
        if self.debug:
            print(f"Attempted to get station {station_id}, but that station is not in this data window!")
        return None
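
    # Hedged lookup sketch (comments only); the station id below is a placeholder:
    #
    #   station = dw.first_station()              # first station in the window, or None
    #   matches = dw.get_station("1637610021")    # list of stations with that id, or None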

    # def _add_sensor_to_window(self, station: Station):
        # set the window start and end if they were specified, otherwise use the bounds of the data
        # self.create_window_in_sensors(station, self._config.start_datetime, self._config.end_datetime)

    def create_data_window(self, pool: Optional[multiprocessing.pool.Pool] = None):
        """
        updates the DataWindow to contain only the data within the window parameters.
        stations without audio, and any data outside the window, are removed

        :param pool: optional multiprocessing pool to use when reading the data.
                        Default None (a pool is created and closed internally)
        """
        # Let's create and manage a single pool of workers that we can utilize throughout
        # the instantiation of the data window.
        _pool: multiprocessing.pool.Pool = multiprocessing.Pool() if pool is None else pool

        r_f = io.ReadFilter()
        if self._config.start_datetime:
            r_f.with_start_dt(self._config.start_datetime)
        if self._config.end_datetime:
            r_f.with_end_dt(self._config.end_datetime)
        if self._config.station_ids:
            r_f.with_station_ids(self._config.station_ids)
        if self._config.extensions:
            r_f.with_extensions(self._config.extensions)
        else:
            self._config.extensions = r_f.extensions
        if self._config.api_versions:
            r_f.with_api_versions(self._config.api_versions)
        else:
            self._config.api_versions = r_f.api_versions
        r_f.with_start_dt_buf(self._config.start_buffer_td)
        r_f.with_end_dt_buf(self._config.end_buffer_td)

        if self.debug:
            print("Reading files from disk.  This may take a few minutes to complete.")

        # get the data to convert into a window
        a_r = ApiReaderDw(self._config.input_dir, self._config.structured_layout, r_f,
                          correct_timestamps=self._config.apply_correction,
                          use_model_correction=self._config.use_model_correction,
                          dw_base_dir=self.save_dir(),
                          dw_save_mode=self._fs_writer.save_mode(),
                          debug=self.debug, pool=_pool)

        self._errors.extend_error(a_r.errors)

        if self._fs_writer.is_use_mem() and a_r.dw_save_mode != self._fs_writer.save_mode():
            if self.debug:
                print("Estimated size of files exceeds available memory.")
                print("Automatically using temporary directory to store data.")
            self._fs_writer.set_use_temp(True)

        # Parallel update
        # Apply timing correction in parallel by station
        sts = a_r.get_stations()
        if self.debug:
            print("num stations loaded: ", len(sts))
        # if self._config.apply_correction:
            # for st in maybe_parallel_map(_pool, Station.update_timestamps,
            #                              iter(sts), chunk_size=1):
            #     self._add_sensor_to_window(st)
            #     if self.debug:
            #         print("station processed: ", st.id())
        for st in maybe_parallel_map(_pool, Station.update_timestamps, iter(sts), chunk_size=1):
            self.create_window_in_sensors(st, self._config.start_datetime, self._config.end_datetime)
            if self.debug:
                print("station processed: ", st.id())

        # check for stations without data
        self._check_for_audio()
        self._check_valid_ids()

        # update the default data window name if we have data and the default name exists
        if self.event_name == "dw" and len(self._stations) > 0:
            self.event_name = f"dw_{int(self.start_date())}_{len(self._stations)}"

        # must update the start and end in order for the data to be saved
        # update remaining data window values if they're still default
        if not self._config.start_datetime and len(self._stations) > 0:
            self._config.start_datetime = dtu.datetime_from_epoch_microseconds_utc(
                np.min([t.first_data_timestamp() for t in self._stations]))
        # end_datetime is non-inclusive, so it must be greater than our latest timestamp
        if not self._config.end_datetime and len(self._stations) > 0:
            self._config.end_datetime = dtu.datetime_from_epoch_microseconds_utc(
                np.max([t.last_data_timestamp() for t in self._stations]) + 1)

        # If the pool was created by this function, then it needs to be managed (closed) by this function.
        if pool is None:
            _pool.close()

    def _check_for_audio(self):
        """
        removes any station without audio data from the DataWindow
        """
        remove = []
        for s in self._stations:
            if not s.has_audio_sensor():
                remove.append(s.id())
        if len(remove) > 0:
            self._stations = [s for s in self._stations if s.id() not in remove]

    def _check_valid_ids(self):
        """
        if there are stations, searches the requested station_ids for any ids not in the data collected
        and creates an error message for each id that was requested but has no data.
        if there are no stations, creates a single error message declaring no data found
        """
        if len(self._stations) < 1 and self._config.station_ids:
            if len(self._config.station_ids) > 1:
                add_ids = f"for all stations {self._config.station_ids} "
            else:
                add_ids = ""
            self._errors.append(f"No data matching criteria {add_ids}in {self._config.input_dir}"
                                f"\nPlease adjust parameters of DataWindow")
        elif len(self._stations) > 0 and self._config.station_ids:
            for ids in self._config.station_ids:
                if ids.zfill(10) not in [i.id() for i in self._stations]:
                    self._errors.append(
                        f"Requested {ids} but there is no data to read for that station"
                    )

    def create_window_in_sensors(
            self, station: Station, start_datetime: Optional[dtu.datetime] = None,
            end_datetime: Optional[dtu.datetime] = None
    ):
        """
        truncate the sensors in the station to only contain data from start_datetime to end_datetime.
        if the start and/or end are not specified, keeps all audio data that fits and uses it
        to truncate the other sensors.
        returns nothing; updates the station in place

        :param station: station object to truncate sensors of
        :param start_datetime: datetime of start of window, default None
        :param end_datetime: datetime of end of window, default None
        """
        if start_datetime:
            start_datetime = dtu.datetime_to_epoch_microseconds_utc(start_datetime)
        else:
            start_datetime = 0
        if end_datetime:
            end_datetime = dtu.datetime_to_epoch_microseconds_utc(end_datetime)
        else:
            end_datetime = dtu.datetime_to_epoch_microseconds_utc(dtu.datetime.max)
        self.process_sensor(station.audio_sensor(), station.id(), start_datetime, end_datetime)
        for sensor in [s for s in station.data() if s.type() != SensorType.AUDIO]:
            self.process_sensor(sensor, station.id(), station.audio_sensor().first_data_timestamp(),
                                station.audio_sensor().last_data_timestamp())
        # recalculate metadata
        station.update_first_and_last_data_timestamps()
        station.set_packet_metadata([meta for meta in station.packet_metadata()
                                     if meta.packet_start_mach_timestamp < station.last_data_timestamp() and
                                     meta.packet_end_mach_timestamp >= station.first_data_timestamp()])
        if self._fs_writer.is_save_disk():
            station.set_save_mode(io.FileSystemSaveMode.DISK)
            station.set_save_dir(self.save_dir() if self._fs_writer.is_use_disk() else self._fs_writer.get_temp())
        self._stations.append(station)

    def process_sensor(self, sensor: SensorData, station_id: str, start_date_timestamp: float,
                       end_date_timestamp: float):
        """
        process a non audio sensor to fit within the DataWindow.  Updates sensor in place, returns nothing.

        :param sensor: sensor to process
        :param station_id: station id
        :param start_date_timestamp: start of DataWindow
        :param end_date_timestamp: end of DataWindow
        """
        if sensor.num_samples() > 0:
            # get only the timestamps between the start and end timestamps
            before_start = np.where(sensor.data_timestamps() < start_date_timestamp)[0]
            after_end = np.where(end_date_timestamp <= sensor.data_timestamps())[0]
            # start_index is inclusive of window start
            if len(before_start) > 0:
                last_before_start = before_start[-1]
                start_index = last_before_start + 1
            else:
                last_before_start = None
                start_index = 0
            # end_index is non-inclusive of window end
            if len(after_end) > 0:
                first_after_end = after_end[0]
                end_index = first_after_end
            else:
                first_after_end = None
                end_index = sensor.num_samples()
            # check if all the samples have been cut off
            is_audio = sensor.type() == SensorType.AUDIO
            if end_index <= start_index:
                if is_audio:
                    self._errors.append(f"Data window for {station_id} "
                                        f"Audio sensor has truncated all data points")
                elif last_before_start is not None and first_after_end is None:
                    first_entry = sensor.pyarrow_table().slice(last_before_start, 1).to_pydict()
                    first_entry["timestamps"] = [start_date_timestamp]
                    sensor.write_pyarrow_table(pa.Table.from_pydict(first_entry))
                elif last_before_start is None and first_after_end is not None:
                    last_entry = sensor.pyarrow_table().slice(first_after_end, 1).to_pydict()
                    last_entry["timestamps"] = [start_date_timestamp]
                    sensor.write_pyarrow_table(pa.Table.from_pydict(last_entry))
                elif last_before_start is not None and first_after_end is not None:
                    sensor.write_pyarrow_table(
                        sensor.interpolate(start_date_timestamp, last_before_start, 1,
                                           self._config.copy_edge_points == gpu.DataPointCreationMode.COPY))
                else:
                    self._errors.append(
                        f"Data window for {station_id} {sensor.type().name} "
                        f"sensor has truncated all data points"
                    )
            else:
                _arrow = sensor.pyarrow_table().slice(start_index, end_index-start_index)
                # if sensor is audio or location, we want nan'd edge points
                if sensor.type() in [SensorType.LOCATION, SensorType.AUDIO]:
                    new_point_mode = gpu.DataPointCreationMode.NAN
                else:
                    new_point_mode = self._config.copy_edge_points
                # add in the data points at the edges of the window if there are defined start and/or end times
                slice_start = _arrow["timestamps"].to_numpy()[0]
                slice_end = _arrow["timestamps"].to_numpy()[-1]
                if not is_audio:
                    end_sample_interval = end_date_timestamp - slice_end
                    end_samples_to_add = 1
                    start_sample_interval = start_date_timestamp - slice_start
                    start_samples_to_add = 1
                else:
                    end_sample_interval = dtu.seconds_to_microseconds(sensor.sample_interval_s())
                    start_sample_interval = -end_sample_interval
                    if self._config.end_datetime:
                        end_samples_to_add = int((dtu.datetime_to_epoch_microseconds_utc(self._config.end_datetime)
                                                  - slice_end) / end_sample_interval)
                    else:
                        end_samples_to_add = 0
                    if self._config.start_datetime:
                        start_samples_to_add = int((slice_start -
                                                    dtu.datetime_to_epoch_microseconds_utc(self._config.start_datetime))
                                                   / end_sample_interval)
                    else:
                        start_samples_to_add = 0
                # add to end
                _arrow = (gpu.add_data_points_to_df(data_table=_arrow, start_index=_arrow.num_rows - 1,
                                                    sample_interval_micros=end_sample_interval,
                                                    num_samples_to_add=end_samples_to_add,
                                                    point_creation_mode=new_point_mode))
                # add to begin
                _arrow = (gpu.add_data_points_to_df(data_table=_arrow, start_index=0,
                                                    sample_interval_micros=start_sample_interval,
                                                    num_samples_to_add=start_samples_to_add,
                                                    point_creation_mode=new_point_mode))
                sensor.sort_by_data_timestamps(_arrow)
        else:
            self._errors.append(f"Data window for {station_id} {sensor.type().name} "
                                f"sensor has no data points!")

    def print_errors(self):
        """
        prints errors to screen
        """
        self._errors.print()
        for stn in self._stations:
            stn.print_errors()
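
A hedged end-to-end sketch of the DataWindow API above; the input directory is a placeholder and
it is assumed the remaining DataWindowConfig parameters have usable defaults:

    config = DataWindowConfig("/data/redvox", True)   # placeholder input directory, structured layout
    dw = DataWindow(event_name="example_dw", config=config,
                    output_dir="./out", out_type="LZ4", make_runme=True)
    for station in dw.stations():                     # stations that survived windowing
        print(station.id(), station.audio_sensor().num_samples(), "audio samples")
    saved_path = dw.save()                            # serialized DataWindow (lz4 in this case)
    print("saved to", saved_path)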
示例#16
0
class EventStream:
    """
    stores event stream data gathered from a single station.
    ALL timestamps in microseconds since epoch UTC unless otherwise stated
    """
    def __init__(self, name: str = "event",
                 schema: Optional[Dict[str, list]] = None,
                 save_mode: FileSystemSaveMode = FileSystemSaveMode.MEM,
                 base_dir: str = "."):
        """
        initialize EventStream for a station

        :param name: name of the EventStream.  Default "event"
        :param schema: a structured dictionary of the data table schema.  Dictionary must look like:
                    {"string": [s_values], "numeric": [n_values], "boolean": [o_values], "byte": [b_values]}
                    where [*_values] is a list of strings and can be empty.  Default None
        :param save_mode: FileSystemSaveMode that determines how data is saved.
                            Default FileSystemSaveMode.MEM (use RAM).  Other options are DISK (save to directory)
                            and TEMP (save to temporary directory)
        :param base_dir: the location of the parquet file that holds the data.  Not used if save_data is False.
                            Default current directory (".")
        """
        self.name = name
        self.timestamps_metadata = {}
        self.metadata = {}

        self._errors = RedVoxExceptions("EventStream")
        self._is_timestamps_corrected = False
        self._fs_writer = Fsw(f"event_{name}", "parquet", base_dir, save_mode)
        self._data = None
        self._schema = {"string": [], "numeric": [], "boolean": [], "byte": []}
        if schema is not None:
            self.set_schema(schema)

    def as_dict(self) -> dict:
        """
        :return: EventStream as a dictionary
        """
        return {
            "name": self.name,
            "metadata": self.metadata,
            "timestamps_metadata": self.timestamps_metadata,
            "is_timestamps_corrected": self._is_timestamps_corrected,
            "schema": self._schema,
            "file_path": self.full_path(),
            "errors": self._errors.as_dict()
        }

    @staticmethod
    def __get_items(payload):
        return payload.get_metadata().items()

    @staticmethod
    def __get_items_raw(payload):
        return payload.items()

    @staticmethod
    def __get_keys(ptype: str, payload):
        return ptype, payload.get_metadata().keys()

    @staticmethod
    def __get_keys_raw(ptype: str, payload):
        return ptype, payload.keys()

    def __set_schema(self, name: str, value: str):
        self._schema[name].append(value)

    def _get_tbl_schema(self) -> Dict[str, list]:
        """
        :return: the dictionary used to create the EventStream data object
        """
        if self._data:
            result = {}
            for f in self._data.schema.names:
                result[f] = []
        else:
            result = {"timestamps": [], "unaltered_timestamps": []}
            for t, s in self._schema.items():
                for k in s:
                    result[k] = []
        return result

    def read_events(self, eventstream: es.EventStream):
        """
        read the payloads of each event in the eventstream and separate the data by payload type

        :param eventstream: stream of events to process
        """
        self.name = eventstream.get_name()
        self._fs_writer.file_name = f"event_{self.name}"
        num_events = eventstream.get_events().get_count()
        if num_events > 1:
            tbl = self._get_tbl_schema()
            self.timestamps_metadata = eventstream.get_timestamps().get_metadata()
            self.metadata = eventstream.get_metadata()
            first_event = eventstream.get_events().get_values()[0]
            for t, c in map(self.__get_keys, ["string", "numeric", "boolean", "byte"],
                            [first_event.get_string_payload(), first_event.get_numeric_payload(),
                             first_event.get_boolean_payload(), first_event.get_byte_payload()]):
                for k in c:
                    self.add_to_schema(t, k)
                    tbl[k] = []
            for i in range(num_events):
                tbl["timestamps"].append(eventstream.get_timestamps().get_timestamps()[i])
                tbl["unaltered_timestamps"].append(eventstream.get_timestamps().get_timestamps()[i])
                evnt = eventstream.get_events().get_values()[i]
                for items in map(self.__get_items, [evnt.get_string_payload(), evnt.get_numeric_payload(),
                                                    evnt.get_boolean_payload(), evnt.get_byte_payload()]):
                    for c, st in items:
                        tbl[c].append(st)
            self._data = pa.Table.from_pydict(tbl)

    def read_raw(self, stream: RedvoxPacketM.EventStream) -> 'EventStream':
        """
        read the contents of a protobuf stream

        :param stream: the protobuf stream to read
        """
        self.name = stream.name
        self._fs_writer.file_name = f"event_{self.name}"
        num_events = len(stream.events)
        if num_events > 1:
            tbl = self._get_tbl_schema()
            self.timestamps_metadata = stream.timestamps.metadata
            self.metadata = stream.metadata
            first_event = stream.events[0]
            for t, c in map(EventStream.__get_keys_raw, ["string", "numeric", "boolean", "byte"],
                            [first_event.string_payload, first_event.numeric_payload,
                             first_event.boolean_payload, first_event.byte_payload]):
                for k in c:
                    self.add_to_schema(t, k)
                    tbl[k] = []
            for i in range(num_events):
                tbl["timestamps"].append(stream.timestamps.timestamps[i])
                tbl["unaltered_timestamps"].append(stream.timestamps.timestamps[i])
                evnt = stream.events[i]
                for items in map(EventStream.__get_items_raw, [evnt.string_payload, evnt.numeric_payload,
                                                               evnt.boolean_payload, evnt.byte_payload]):
                    for c, st in items:
                        tbl[c].append(st)
            self._data = pa.Table.from_pydict(tbl)
        return self

    def read_from_dir(self, file: str):
        """
        read a pyarrow table from a file on disk

        :param file: full path to the file to read
        """
        try:
            tbl = pq.read_table(file)
            if tbl.schema.names == list(self._get_tbl_schema().keys()):
                self._data = tbl
        except FileNotFoundError:
            self._errors.append("No data file was found; this event is empty.")
            self._data = None

    def get_string_schema(self) -> List[str]:
        """
        :return: the column names of string typed data as a list of strings
        """
        return self._schema["string"]

    def get_numeric_schema(self) -> List[str]:
        """
        :return: the column names of numeric typed data as a list of strings
        """
        return self._schema["numeric"]

    def get_boolean_schema(self) -> List[str]:
        """
        :return: the column names of boolean typed data as a list of strings
        """
        return self._schema["boolean"]

    def get_byte_schema(self) -> List[str]:
        """
        :return: the column names of byte typed data as a list of strings
        """
        return self._schema["byte"]

    def get_schema(self) -> dict:
        """
        :return: the schema of the EventStream
        """
        return self._schema

    def get_string_values(self) -> pa.Table:
        """
        :return: the string data as a pyarrow table
        """
        return self._data.select(self.get_string_schema()) if self._data else pa.Table.from_pydict({})

    def get_numeric_values(self) -> pa.Table:
        """
        :return: the numeric data as a pyarrow table
        """
        return self._data.select(self.get_numeric_schema()) if self._data else pa.Table.from_pydict({})

    def get_boolean_values(self) -> pa.Table:
        """
        :return: the boolean data as a pyarrow table
        """
        return self._data.select(self.get_boolean_schema()) if self._data else pa.Table.from_pydict({})

    def get_byte_values(self) -> pa.Table:
        """
        :return: the byte data as a pyarrow table
        """
        return self._data.select(self.get_byte_schema()) if self._data else pa.Table.from_pydict({})

    def _check_for_name(self, column_name: str, schema: List[str]) -> bool:
        """
        :param column_name: name of column to check for
        :param schema: list of allowed names
        :return: True if column_name is in schema, sets error and returns False if not
        """
        if column_name not in schema:
            self._errors.append(f"WARNING: Column {column_name} does not exist; try one of {schema}")
            return False
        return True

    def __get_column_data(self, schema: List[str], column_name: str) -> np.array:
        """
        :param schema: list of column names to search
        :param column_name: column name to get
        :return: the data as an np.array; an empty array means the column name or data doesn't exist
        """
        return self._data[column_name].to_numpy() if self._check_for_name(column_name, schema) else np.array([])

    def get_string_column(self, column_name: str) -> np.array:
        """
        :param column_name: name of string payload to retrieve
        :return: string data from the column specified
        """
        return self.__get_column_data(self.get_string_schema(), column_name)

    def get_numeric_column(self, column_name: str) -> np.array:
        """
        :param column_name: name of numeric payload to retrieve
        :return: numeric data from the column specified
        """
        return self.__get_column_data(self.get_numeric_schema(), column_name)

    def get_boolean_column(self, column_name: str) -> np.array:
        """
        :param column_name: name of boolean payload to retrieve
        :return: boolean data from the column specified
        """
        return self.__get_column_data(self.get_boolean_schema(), column_name)

    def get_byte_column(self, column_name: str) -> np.array:
        """
        :param column_name: name of byte payload to retrieve
        :return: bytes data from the column specified
        """
        return self.__get_column_data(self.get_byte_schema(), column_name)

    def set_schema(self, schema: Dict[str, list]):
        """
        sets the schema of the EventStream using a specially structured dictionary.
        Structure is:

        {"string": [s_values], "numeric": [n_values], "boolean": [o_values], "byte": [b_values]}

        where [*_values] is a list of strings and can be empty

        :param schema: specially structured dictionary of data table schema
        """
        if schema.keys() != self._schema.keys():
            self._errors.append(f"Attempted to add invalid schema with keys {list(schema.keys())} to EventStreams.\n"
                                f"Valid keys are: {list(self._schema.keys())}")
        else:
            self._schema = schema
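
    # Example of the dictionary shape set_schema() expects (comments only); the column names
    # below are illustrative placeholders:
    #
    #   stream.set_schema({"string": ["label"], "numeric": ["magnitude", "distance"],
    #                      "boolean": ["is_confirmed"], "byte": []})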

    def add_to_schema(self, key: str, value: str):
        """
        adds a value to the schema, under the specified key

        :param key: one of "string", "numeric", "boolean", or "byte"
        :param value: the name of the column to add to the schema
        """
        if key not in self._schema.keys():
            self._errors.append("Attempted to add an unknown key to the EventStream schema.\n"
                                f"You must use one of {self._schema.keys()}.")
        elif value not in self._schema[key]:
            self._schema[key].append(value)

    def add(self, other_stream: es.EventStream):
        """
        adds a Redvox Api1000 EventStream with the same name to the data

        :param other_stream: another EventStream with the same name
        """
        if self.name != other_stream.get_name():
            self._errors.append(f"Attempted to add a stream with a different name ({other_stream.get_name()})")
        else:
            self.timestamps_metadata = {**self.timestamps_metadata, **other_stream.get_timestamps().get_metadata()}
            self.metadata = {**self.metadata, **other_stream.get_metadata()}
            num_events = other_stream.get_events().get_count()
            if num_events > 1:
                tbl = self._get_tbl_schema()
                for i in range(num_events):
                    tbl["timestamps"].append(other_stream.get_timestamps().get_timestamps()[i])
                    tbl["unaltered_timestamps"].append(other_stream.get_timestamps().get_timestamps()[i])
                    evnt = other_stream.get_events().get_values()[i]
                    for items in map(self.__get_items, [evnt.get_string_payload(), evnt.get_numeric_payload(),
                                                        evnt.get_boolean_payload(), evnt.get_byte_payload()]):
                        for c, st in items:
                            tbl[c].append(st)
                self._data = pa.concat_tables([self._data, pa.Table.from_pydict(tbl)])

    def add_raw(self, other_stream: RedvoxPacketM.EventStream):
        """
        add a protobuf EventStream with the same name to the data

        :param other_stream: a protobuf EventStream to add
        """
        if self.name != other_stream.name:
            self._errors.append(f"Attempted to add a stream with a different name ({other_stream.name})")
        else:
            self.timestamps_metadata = {**self.timestamps_metadata, **other_stream.timestamps.metadata}
            self.metadata = {**self.metadata, **other_stream.metadata}
            num_events = len(other_stream.events)
            if num_events > 1:
                tbl = self._get_tbl_schema()
                for i in range(num_events):
                    tbl["timestamps"].append(other_stream.timestamps.timestamps[i])
                    tbl["unaltered_timestamps"].append(other_stream.timestamps.timestamps[i])
                    evnt = other_stream.events[i]
                    for items in map(EventStream.__get_items_raw, [evnt.string_payload, evnt.numeric_payload,
                                                                   evnt.boolean_payload, evnt.byte_payload]):
                        for c, st in items:
                            tbl[c].append(st)
                self._data = pa.concat_tables([self._data, pa.Table.from_pydict(tbl)])

    def append(self, other_stream: "EventStream"):
        """
        add another EventStream onto the calling one if they have the same name

        :param other_stream: other stream to add to current
        """
        if other_stream.name == self.name:
            self._data = pa.concat_tables([self._data, other_stream._data])
            self.timestamps_metadata = {**self.timestamps_metadata, **other_stream.timestamps_metadata}
            self.metadata = {**self.metadata, **other_stream.metadata}
            self._errors.extend_error(other_stream.errors())

    def timestamps(self) -> np.array:
        """
        :return: the timestamps as a numpy array; returns empty array if no timestamps exist
        """
        if "timestamps" in self.data().schema.names:
            return self.data()["timestamps"].to_numpy()
        else:
            return np.array([])

    def unaltered_timestamps(self) -> np.array:
        """
        :return: the unaltered timestamps as a numpy array; returns empty array if no timestamps exist
        """
        if "unaltered_timestamps" in self.data().schema.names:
            return self.data()["unaltered_timestamps"].to_numpy()
        else:
            return np.array([])

    def update_timestamps(self, offset_model: om.OffsetModel, use_model_function: bool = False):
        """
        updates the timestamps of the data points

        :param offset_model: model used to update the timestamps
        :param use_model_function: if True, use the model's slope function to update the timestamps.
                                    otherwise uses the best offset (model's intercept value).  Default False
        """
        if self._data is not None and self._data.num_rows > 0:
            timestamps = pa.array(offset_model.update_timestamps(self._data["timestamps"].to_numpy(),
                                                                 use_model_function))
            self._data = self._data.set_column(0, "timestamps", timestamps)

    def default_json_file_name(self) -> str:
        """
        :return: default event stream json file name (event_[event.name]): note there is no extension
        """
        return f"event_{self.name}"

    def is_save_to_disk(self) -> bool:
        """
        :return: True if the EventStream will be saved to disk
        """
        return self._fs_writer.is_save_disk()

    def set_save_to_disk(self, save: bool):
        """
        :param save: If True, save to disk
        """
        self._fs_writer.save_to_disk = save

    def set_save_mode(self, save_mode: FileSystemSaveMode):
        """
        set the save mode

        :param save_mode: new save mode
        """
        self._fs_writer.set_save_mode(save_mode)

    def set_file_name(self, new_file: Optional[str] = None):
        """
        * set the pyarrow file name or use the default: event_{EventStream.name}
        * Do not give an extension

        :param new_file: optional file name to change to; default None (use default name)
        """
        self._fs_writer.file_name = new_file if new_file else f"event_{self.name}"

    def full_file_name(self) -> str:
        """
        :return: full name of parquet file containing the data
        """
        return self._fs_writer.full_name()

    def file_name(self) -> str:
        """
        :return: file name without extension
        """
        return self._fs_writer.file_name

    def set_save_dir(self, new_dir: Optional[str] = None):
        """
        set the pyarrow directory or use the default: "." (current directory)

        :param new_dir: the directory to change to; default None (use current directory)
        """
        self._fs_writer.base_dir = new_dir if new_dir else "."

    def save_dir(self) -> str:
        """
        :return: directory containing parquet files for the EventStream
        """
        return self._fs_writer.save_dir()

    def full_path(self) -> str:
        """
        :return: the full path to the data file
        """
        return self._fs_writer.full_path()

    def fs_writer(self) -> Fsw:
        """
        :return: FileSystemWriter object
        """
        return self._fs_writer

    def write_table(self):
        """
        writes the event stream data to disk.
        """
        if self._data is not None:
            pq.write_table(self._data, self.full_path())

    def has_data(self) -> bool:
        """
        :return: True if EventStream contains at least one data point
        """
        return self.data().num_rows > 0

    def data(self) -> pa.Table:
        """
        :return: the data as a pyarrow table
        """
        if self._data is None:
            if self.is_save_to_disk():
                self._data = pq.read_table(self.full_path())
            else:
                return pa.Table.from_pydict({})
        return self._data

    @staticmethod
    def from_json_file(file_dir: str, file_name: str) -> "EventStream":
        """
        :param file_dir: full path to the directory containing the file
        :param file_name: name of the json file (without extension) to load data from.
                            If None, searches file_dir for a json file
        :return: EventStream from json file
        """
        if file_name is None:
            file_name = io.get_json_file(file_dir)
            if file_name is None:
                result = EventStream("Empty")
                result.append_error("JSON file to load EventStream from not found.")
                return result
        json_data = io.json_file_to_dict(os.path.join(file_dir, f"{file_name}.json"))
        if "name" in json_data.keys():
            result = EventStream(json_data["name"], json_data["schema"], FileSystemSaveMode.DISK, file_dir)
            result.metadata = json_data["metadata"]
            result.timestamps_metadata = json_data["timestamps_metadata"]
            result.set_errors(RedVoxExceptions.from_dict(json_data["errors"]))
            result.read_from_dir(json_data["file_path"])
        else:
            result = EventStream("Empty")
            result.append_error(f"Loading from {file_name} failed; missing EventStream name.")
        return result

    def to_json_file(self, file_name: Optional[str] = None) -> Path:
        """
        saves the EventStream as a json file

        :param file_name: the optional base file name.  Do not include a file extension.
                            If None, a default file name is created using this format:
                            event_[event.name].json
        :return: path to json file
        """
        if self._fs_writer.file_extension == "parquet" and self._data is not None:
            self.write_table()
        return io.to_json_file(self, file_name)

    def errors(self) -> RedVoxExceptions:
        """
        :return: errors of the EventStream
        """
        return self._errors

    def set_errors(self, errors: RedVoxExceptions):
        """
        sets the errors of the EventStream

        :param errors: errors to set
        """
        self._errors = errors

    def append_error(self, error: str):
        """
        add an error to the EventStream

        :param error: error to add
        """
        self._errors.append(error)

    def print_errors(self):
        """
        print all errors to screen
        """
        self._errors.print()
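
A hedged sketch of building and querying an EventStream in memory; the schema and column names
are placeholders, and `my_packet_stream` is assumed to be a RedvoxPacketM.EventStream obtained
elsewhere:

    stream = EventStream("example", schema={"string": ["label"], "numeric": ["magnitude"],
                                            "boolean": [], "byte": []})
    stream.read_raw(my_packet_stream)              # populate from a protobuf EventStream
    print(stream.timestamps())                     # event timestamps as a numpy array
    print(stream.get_numeric_column("magnitude"))  # one numeric column as a numpy array
    if stream.is_save_to_disk():
        stream.write_table()                       # persist the pyarrow table as parquet
    stream.print_errors()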