def extract_stats_parallel(
        index: io.Index,
        pool: Optional[multiprocessing.pool.Pool] = None) -> List[StationStat]:
    """
    Extracts StationStat information in parallel from packets stored in the provided index.

    :param index: Index of packets to extract information from.
    :param pool: optional multiprocessing pool.
    :return: A list of StationStat objects.
    """
    # Partition the index entries by number of cores
    num_cores: int = multiprocessing.cpu_count()
    partitioned: List[List[io.IndexEntry]] = _partition_list(
        index.entries, num_cores)
    indices: List[io.Index] = list(
        map(lambda entries: io.Index(entries), partitioned))

    # Run the extractions, in parallel when the workload is large enough for the pool to help
    nested: Iterator[List[StationStat]] = maybe_parallel_map(
        pool,
        extract_stats_serial,
        iter(indices),
        lambda: len(indices) > 128,
        chunk_size=64,
    )
    return [item for sublist in nested for item in sublist]
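# Below is a minimal sketch of how a helper like maybe_parallel_map might behave; the actual
# library implementation may differ. The condition callback decides whether the workload is
# large enough to be worth distributing, and a pool created here is also closed here,
# mirroring the pool-ownership convention used elsewhere on this page.
import multiprocessing
import multiprocessing.pool
from typing import Callable, Iterator, List, Optional, TypeVar

_T = TypeVar("_T")
_R = TypeVar("_R")


def _maybe_parallel_map_sketch(
        pool: Optional[multiprocessing.pool.Pool],
        fn: Callable[[_T], _R],
        values: Iterator[_T],
        condition: Callable[[], bool],
        chunk_size: int = 64) -> Iterator[_R]:
    items: List[_T] = list(values)
    # Small workloads are mapped serially to avoid pool overhead.
    if not condition():
        return map(fn, items)
    _pool: multiprocessing.pool.Pool = multiprocessing.Pool() if pool is None else pool
    results: List[_R] = _pool.map(fn, items, chunksize=chunk_size)
    # Only close the pool if it was created here; caller-owned pools stay open.
    if pool is None:
        _pool.close()
        _pool.join()
    return iter(results)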
Example #2
    def _get_all_files(
        self, pool: Optional[multiprocessing.pool.Pool] = None
    ) -> io.Index:
        """
        Get all files in the base directory of the ApiReader.

        :param pool: optional multiprocessing pool.
        :return: index with all the files that match the filter
        """
        _pool: multiprocessing.pool.Pool = (
            multiprocessing.Pool() if pool is None else pool
        )
        index = io.Index()
        # this guarantees that all ids we search for are valid
        all_index = self._apply_filter(pool=_pool)
        for station_id in all_index.summarize().station_ids():
            station_filter = self.filter.clone()
            checked_index = self._check_station_stats(
                station_filter.with_station_ids({station_id}),
                pool=_pool
            )
            index.append(checked_index.entries)

        if pool is None:
            _pool.close()

        return index
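# The "create the pool only if the caller did not supply one, and close it only if it was
# created here" pattern above repeats in several of these methods. A hypothetical
# context-manager sketch of that pattern (not part of the library shown here) could look like:
import contextlib
import multiprocessing
import multiprocessing.pool
from typing import Iterator, Optional


@contextlib.contextmanager
def _pool_or_default(
        pool: Optional[multiprocessing.pool.Pool] = None
) -> Iterator[multiprocessing.pool.Pool]:
    _pool = multiprocessing.Pool() if pool is None else pool
    try:
        yield _pool
    finally:
        # Caller-owned pools stay open; only pools created here are closed.
        if pool is None:
            _pool.close()
            _pool.join()


# Usage sketch:
#     with _pool_or_default(pool) as _pool:
#         all_index = self._apply_filter(pool=_pool)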
Example #3
    def _flatten_files_index(self):
        """
        :return: flattened version of files_index
        """
        result = io.Index()
        for i in self.files_index:
            result.append(i.entries)
        return result
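    # A sketch of the same flattening as a single expression; it assumes the io.Index(entries)
    # constructor shown in the other examples on this page, and the helper name is hypothetical.
    def _flatten_files_index_oneliner_sketch(self):
        return io.Index([e for sub in self.files_index for e in sub.entries])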
Example #4
    def _split_workload(self, findex: io.Index) -> List[io.Index]:
        """
        Takes an index and splits it into chunks based on a size limit.
        While the running total of file sizes stays at or below the limit, files are added to the
        current chunk (an Index); once the limit is exceeded, the chunk is stored and the file that
        exceeded it starts a new chunk.

        :param findex: index of files to split
        :return: list of Index to process
        """
        packet_list = []
        chunk_queue = 0
        chunk_list = []
        for f in findex.entries:
            chunk_queue += f.file_size_bytes
            if chunk_queue > self.chunk_limit:
                packet_list.append(io.Index(chunk_list))
                # the file that exceeded the limit starts the new chunk, so its size seeds the running total
                chunk_queue = f.file_size_bytes
                chunk_list = []
            chunk_list.append(f)
        if chunk_list:
            packet_list.append(io.Index(chunk_list))
        return packet_list
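# The same size-based chunking, sketched on plain integers so the behaviour is easy to check
# by hand (hypothetical standalone helper, not part of the class above):
from typing import List


def _chunk_by_size_sketch(sizes: List[int], limit: int) -> List[List[int]]:
    chunks: List[List[int]] = []
    current: List[int] = []
    running_total = 0
    for size in sizes:
        running_total += size
        if running_total > limit:
            # flush the current chunk; the file that exceeded the limit starts the next one
            chunks.append(current)
            current = []
            running_total = size
        current.append(size)
    if current:
        chunks.append(current)
    return chunks


# _chunk_by_size_sketch([40, 40, 40, 90], limit=100) -> [[40, 40], [40], [90]]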
Example #5
    def _check_station_stats(
        self,
        station_index: io.Index,
        pool: Optional[multiprocessing.pool.Pool] = None,
    ) -> List[io.Index]:
        """
        Check the index's results; if it has enough information, return it, otherwise search for more data.
        The index should only request one station id.
        If the station was restarted during the request period, a new group of indexes is created
        to represent the change in station metadata.

        :param station_index: index representing the requested information
        :param pool: optional multiprocessing pool.
        :return: list of Indexes that includes as much information as possible that fits the request
        """
        _pool: multiprocessing.pool.Pool = (
            multiprocessing.Pool() if pool is None else pool
        )
        # if we found nothing, return the index
        if len(station_index.entries) < 1:
            return [station_index]

        stats = fs.extract_stats(station_index, pool=_pool)
        # Close pool if created here
        if pool is None:
            _pool.close()

        timing_offsets: Optional[offset_model.TimingOffsets] = (
            offset_model.compute_offsets(stats)
        )

        # punt if duration or other important values are invalid or if the latency array was empty
        if timing_offsets is None:
            return [station_index]

        diff_s = diff_e = timedelta(seconds=0)

        # if the filtered files do not cover the request even after the packet times are adjusted,
        # try fetching 1.5 times the difference between the requested start/end and the start/end of the data
        insufficient_str = ""
        if self.filter.start_dt and timing_offsets.adjusted_start > self.filter.start_dt:
            insufficient_str += f" {self.filter.start_dt} (start)"
            # diff_s = self.filter.start_dt_buf + 1.5 * (timing_offsets.adjusted_start - self.filter.start_dt)
            new_end = self.filter.start_dt - self.filter.start_dt_buf
            new_start = new_end - 1.5 * (timing_offsets.adjusted_start -
                                         self.filter.start_dt)
            new_index = self._apply_filter(
                io.ReadFilter()
                .with_start_dt(new_start)
                .with_end_dt(new_end)
                .with_extensions(self.filter.extensions)
                .with_api_versions(self.filter.api_versions)
                .with_station_ids(set(station_index.summarize().station_ids()))
                .with_start_dt_buf(diff_s)
                .with_end_dt_buf(diff_e))
            if len(new_index.entries) > 0:
                station_index.append(new_index.entries)
                stats.extend(fs.extract_stats(new_index))
        if self.filter.end_dt and timing_offsets.adjusted_end < self.filter.end_dt:
            insufficient_str += f" {self.filter.end_dt} (end)"
            # diff_e = self.filter.end_dt_buf + 1.5 * (self.filter.end_dt - timing_offsets.adjusted_end)
            new_start = self.filter.end_dt + self.filter.end_dt_buf
            new_end = new_start + 1.5 * (self.filter.end_dt -
                                         timing_offsets.adjusted_end)
            new_index = self._apply_filter(
                io.ReadFilter()
                .with_start_dt(new_start)
                .with_end_dt(new_end)
                .with_extensions(self.filter.extensions)
                .with_api_versions(self.filter.api_versions)
                .with_station_ids(set(station_index.summarize().station_ids()))
                .with_start_dt_buf(diff_s)
                .with_end_dt_buf(diff_e))
            if len(new_index.entries) > 0:
                station_index.append(new_index.entries)
                stats.extend(fs.extract_stats(new_index))
        if len(insufficient_str) > 0:
            self.errors.append(
                f"Data for {station_index.summarize().station_ids()} exists, "
                f"but not at:{insufficient_str}")

        # group the entries by their packet's app start time; a station restart produces
        # a new app start time and therefore a new Index in the result
        results = {}

        for idx, stat in enumerate(stats):
            key = stat.app_start_dt
            if key not in results:
                results[key] = io.Index()

            results[key].append(entries=[station_index.entries[idx]])

        return list(results.values())
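# A small worked sketch of the "1.5 times the shortfall" window extension used above, with
# made-up timestamps (all values below are purely illustrative):
from datetime import datetime, timedelta

requested_start = datetime(2021, 1, 1, 0, 0, 0)
adjusted_start = datetime(2021, 1, 1, 0, 0, 30)  # data actually starts 30 s late
start_buf = timedelta(seconds=120)

# Search an earlier window that ends just before the buffered request start and spans
# 1.5 times the 30 s shortfall.
new_end = requested_start - start_buf
new_start = new_end - 1.5 * (adjusted_start - requested_start)
# new_start == 2020-12-31 23:57:15, new_end == 2020-12-31 23:58:00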