Example #1
0
    def from_metadata_csv(
        cls, data_root, meta_csv_file, network=None, temp_root=gettempdir()
    ):
        """
        Load a previously created and stored filelist from
        :func:`ismn.filecollection.IsmnFileCollection.to_metadata_csv`

        Parameters
        ----------
        data_root : IsmnRoot or str or Path
            Path where the ismn data is stored, can also be a zip file
        meta_csv_file : str or Path
            Csv file where the metadata is stored.
        network : list, optional (default: None)
            List of networks that are considered.
            Rows of other networks are skipped entirely.
        temp_root : str or Path, optional (default: gettempdir())
            Temporary folder where extracted data is copied during reading from
            zip archive.

        Returns
        -------
        collection : cls
            New collection built from the stored metadata csv.
        """
        if network is not None:
            network = np.atleast_1d(network)

        root = data_root if isinstance(data_root, IsmnRoot) else IsmnRoot(data_root)

        print(f"Found existing ismn metadata in {meta_csv_file}.")

        metadata_df = _load_metadata_df(meta_csv_file)

        filelist = OrderedDict([])

        all_networks = metadata_df["network"]["val"].values

        columns = np.array(list(metadata_df.columns))

        # The variable names are identical for every row, so compute them once
        # outside the loop. The last 2 columns are file_path and file_type;
        # all other columns come as (depth_from, depth_to, val) triples.
        var_names = np.unique(columns[:-2][:, 0])

        for i, row in enumerate(metadata_df.values):  # todo: slow!?? parallelise?
            this_nw = all_networks[i]
            # Skip rows belonging to networks that were not requested.
            if (network is not None) and not np.isin([this_nw], network)[0]:
                continue

            vals = row[:-2].reshape(-1, 3)

            metadata = MetaData(
                [
                    MetaVar.from_tuple(
                        (var_names[k], vals[k][2], vals[k][0], vals[k][1])
                    )
                    for k in range(len(var_names))
                ]
            )

            f = DataFile(
                root=root,
                file_path=str(PurePosixPath(row[-2])),
                load_metadata=False,  # metadata is attached from the csv below
                temp_root=temp_root,
            )

            f.metadata = metadata
            f.file_type = row[-1]

            this_nw = f.metadata["network"].val

            if this_nw not in filelist:
                filelist[this_nw] = []

            filelist[this_nw].append(f)

        if network is None:
            cls.metadata_df = metadata_df
        else:
            # Restrict the stored metadata frame to the selected networks only.
            flags = np.isin(metadata_df["network"]["val"].values, network)
            cls.metadata_df = metadata_df.loc[flags]

        return cls(root, filelist=filelist)
Example #2
0
def _read_station_dir(
    root: Union[IsmnRoot, Path, str],
    stat_dir: Union[Path, str],
    temp_root: Path,
) -> tuple:
    """
    Parallelizable function to read metadata for files in a station dir.

    Parameters
    ----------
    root : IsmnRoot or Path or str
        Archive root. A Path/str is opened here and closed again on exit;
        an already-open IsmnRoot is left open for the caller.
    stat_dir : Path or str
        Station directory (relative to root) to scan for csv/stm files.
    temp_root : Path
        Temporary folder where data is extracted when reading from a
        zip archive.

    Returns
    -------
    filelist : list of tuple
        (network, station, filehandler) for each successfully read .stm file.
    infos : list of str
        Log messages collected while processing.
    """
    infos = []

    # Remember whether this function opened the root itself, so it only
    # closes what it opened.
    if not isinstance(root, IsmnRoot):
        proc_root = True
        root = IsmnRoot(root)
    else:
        proc_root = False

    try:
        csv = root.find_files(stat_dir, "*.csv")

        try:
            if len(csv) == 0:
                raise IsmnFileError(
                    "Expected 1 csv file for station, found 0. "
                    "Use empty static metadata."
                )
            else:
                if len(csv) > 1:
                    infos.append(
                        f"Expected 1 csv file for station, found {len(csv)}. "
                        f"Use first file in dir."
                    )
                static_meta_file = StaticMetaFile(
                    root, csv[0], load_metadata=True, temp_root=temp_root
                )
                station_meta = static_meta_file.metadata
        except IsmnFileError as e:
            # Fall back to an empty static metadata template instead of failing.
            infos.append(f"Error loading static meta for station: {e}")
            station_meta = MetaData(
                [MetaVar(k, v) for k, v in CSV_META_TEMPLATE.items()]
            )

        data_files = root.find_files(stat_dir, "*.stm")

        filelist = []

        for file_path in data_files:
            try:
                f = DataFile(root, file_path, temp_root=temp_root)
            except IOError as e:
                # Skip unreadable files but keep a record of the failure.
                infos.append(f"Error loading ismn file: {e}")
                continue

            # Combine station-level metadata with the file's own metadata ...
            f.metadata.merge(station_meta, inplace=True)

            # ... and keep only the metadata that best matches the sensor depth.
            f.metadata = f.metadata.best_meta_for_depth(
                Depth(
                    f.metadata["instrument"].depth.start,
                    f.metadata["instrument"].depth.end,
                )
            )

            network = f.metadata["network"].val
            station = f.metadata["station"].val

            filelist.append((network, station, f))

            infos.append(f"Processed file {file_path}")
    finally:
        # Close the root even when reading raised, so a locally-opened
        # archive handle is never leaked.
        if proc_root:
            root.close()

    return filelist, infos
Example #3
0
    def from_metadata_csv(cls,
                          data_root,
                          meta_csv_file,
                          network=None,
                          temp_root=gettempdir()):
        """
        Load a previously created and stored filelist from a metadata csv file.

        Parameters
        ----------
        data_root : IsmnRoot or str or Path
            Path where the ismn data is stored, can also be a zip file
        meta_csv_file : str or Path
            Csv file where the metadata is stored.
        network : list, optional (default: None)
            List of networks that are considered. Rows of other networks
            are skipped entirely.
        temp_root : str or Path, optional (default: gettempdir())
            Temporary folder where extracted data is copied during reading from
            zip archive.

        Returns
        -------
        collection : cls
            New collection built from the stored metadata csv.
        """
        if network is not None:
            network = np.atleast_1d(network)

        if isinstance(data_root, IsmnRoot):
            root = data_root
        else:
            root = IsmnRoot(data_root)

        print(f"Found existing ismn metadata in {meta_csv_file}.")

        metadata_df = pd.read_csv(meta_csv_file,
                                  index_col=0,
                                  header=[0, 1],
                                  low_memory=False,
                                  engine='c')

        # parse date cols as datetime
        for col in ['timerange_from', 'timerange_to']:
            metadata_df[col, 'val'] = pd.to_datetime(metadata_df[col, 'val'])

        # collect unique first-level column names, preserving order
        lvars = []
        for c in metadata_df.columns:
            if c[0] not in lvars:
                lvars.append(c[0])

        # we assume triples for all vars except these, so they must be at the end
        assert lvars[-2:] == ['file_path', 'file_type'], \
            "file_type and file_path must be at the end."

        filelist = OrderedDict([])

        all_networks = metadata_df['network']['val'].values

        columns = np.array(list(metadata_df.columns))

        # Variable names are identical for every row -> compute them once
        # outside the loop instead of per row.
        var_names = np.unique(columns[:-2][:, 0])

        for i, row in enumerate(
                metadata_df.values):  # todo: slow!?? parallelise?
            this_nw = all_networks[i]
            # Skip rows belonging to networks that were not requested.
            if (network is not None) and not np.isin([this_nw], network)[0]:
                continue

            # All columns except the trailing file_path/file_type come as
            # (depth_from, depth_to, val) triples.
            vals = row[:-2].reshape(-1, 3)

            metadata = MetaData([
                MetaVar.from_tuple(
                    (var_names[k], vals[k][2], vals[k][0], vals[k][1]))
                for k in range(len(var_names))
            ])

            f = DataFile(root=root,
                         file_path=str(PurePosixPath(row[-2])),
                         load_metadata=False,
                         temp_root=temp_root)

            f.metadata = metadata
            f.file_type = row[-1]

            this_nw = f.metadata['network'].val

            if this_nw not in filelist:
                filelist[this_nw] = []

            filelist[this_nw].append(f)

        return cls(root, filelist=filelist)