示例#1
0
    def _verify_tag_path_exist(adls_file_system_client: core.AzureDLFileSystem,
                               path: str):
        """
        Probe the data lake for a tag path.

        The path is only handed to ``adls_file_system_client.info``; the result
        of that call is discarded. The check relies on ``info`` raising a
        FileNotFound error when the path does not exist, so a missing path
        surfaces as that exception rather than as a return value.

        Parameters
        ----------
        adls_file_system_client: core.AzureDLFileSystem
            Client used to query the data lake.
        path : str
            Tag path whose existence should be verified.
        """
        tag_path = "{}".format(path)
        adls_file_system_client.info(tag_path)
示例#2
0
    def read_tag_files(adls_file_system_client: core.AzureDLFileSystem,
                       tag: SensorTag, years: range) -> pd.Series:
        """
        Read one CSV file per requested year for a tag and merge them into a
        single series.

        Each file is expected at
        ``<base path>/<tag name>/<tag name>_<year>.csv``. The files are parsed
        as semicolon separated, header-less CSV with the columns Sensor, value,
        Timestamp and Status; only the value (as float32) and the Timestamp
        (parsed as UTC and used as index) are kept.

        Parameters
        ----------
        adls_file_system_client: core.AzureDLFileSystem
            Client used to inspect and open the files.
        tag: SensorTag
            Tag whose files should be read.
        years: range
            Years to read, one file per year.

        Returns
        -------
        pd.Series:
            Values for all requested years, named after the tag. When a
            timestamp occurs more than once only its last occurrence is kept.

        Raises
        ------
        ValueError
            If no base path can be derived from the asset of the tag.

        Notes
        -----
        Errors raised by the client or by the concatenation are not handled
        here and propagate to the caller.
        """
        base_path = NcsReader.base_path_from_asset(tag.asset)
        if not base_path:
            raise ValueError(f"Unable to find base path from tag {tag} ")

        def parse_utc(col):
            # Timestamps are always interpreted as UTC.
            return pd.to_datetime(col, utc=True)

        yearly_frames = []
        for year in years:
            file_path = f"{base_path}/{tag.name}/{tag.name}_{year}.csv"
            logger.info(f"Parsing file {file_path}")

            # The size is only reported in the log, converted from bytes to MB.
            size_mb = adls_file_system_client.info(file_path).get("length") / (1024**2)
            logger.info(f"File size: {size_mb:.2f}MB")

            with adls_file_system_client.open(file_path, "rb") as remote_file:
                yearly_frames.append(
                    pd.read_csv(
                        remote_file,
                        sep=";",
                        header=None,
                        names=["Sensor", tag.name, "Timestamp", "Status"],
                        usecols=[tag.name, "Timestamp"],
                        dtype={tag.name: np.float32},
                        parse_dates=["Timestamp"],
                        date_parser=parse_utc,
                        index_col="Timestamp",
                    )
                )
                logger.info(f"Done parsing file {file_path}")

        combined = pd.concat(yearly_frames)

        # Duplicated timestamps occur often; the last occurrence wins.
        superseded = combined.index.duplicated(keep="last")
        if superseded.any():
            combined = combined[~superseded]

        return combined[tag.name]
示例#3
0
    def read_tag_files(
        self,
        adls_file_system_client: core.AzureDLFileSystem,
        tag: SensorTag,
        years: range,
        dry_run: Optional[bool] = False,
        remove_status_codes: Optional[list] = None,
        dl_base_path: Optional[str] = None,
    ) -> pd.Series:
        """
        Download the files of one tag for the given years and return their
        values as a single series.

        For every year the entries of ``self.file_lookups`` are asked, in
        order, for the path of the tag's file. The first lookup returning a
        path decides which file type is used to read it; years for which no
        lookup returns a path are skipped.

        Parameters
        ----------
        adls_file_system_client: core.AzureDLFileSystem
            the AzureDLFileSystem client to use
        tag: SensorTag
            the tag to download data for
        years: range
            range object providing years to include
        dry_run: Optional[bool]
            if True, don't download data, just check info, log, and return an
            empty series as soon as the first file has been found
        remove_status_codes: Optional[list]
            Removes data with Status code(s) in the list. ``None`` (the
            default) removes data with Status code 0; pass an empty list to
            keep all rows.
        dl_base_path: Optional[str]
            Base path used to override the asset to path dictionary. Useful for
            demos and other non-production settings.

        Returns
        -------
        pd.Series:
            Series with all years for one tag. An empty series is returned for
            a dry run and when the yearly frames could not be concatenated
            (for instance because no file was found).

        Raises
        ------
        ValueError
            If no base path is given and none can be derived from the asset of
            the tag.

        Notes
        -----
        The tag directory is checked up front through
        ``NcsReader._verify_tag_path_exist``; an error raised by that check is
        not handled here. A ``FileNotFoundError`` for an individual yearly file
        is logged and the file is skipped.
        """
        tag_base_path = (dl_base_path if dl_base_path else
                         NcsReader.base_path_from_asset(tag.asset))

        if not tag_base_path:
            raise ValueError(f"Unable to find base path from tag {tag} ")

        # Resolve the default here instead of in the signature: this avoids a
        # mutable default shared between calls, and an explicit ``None`` no
        # longer reaches ``isin``, which only accepts list-like values.
        status_codes = [0] if remove_status_codes is None else remove_status_codes

        all_years = []
        logger.info(f"Downloading tag: {tag} for years: {years}")
        tag_name_encoded = quote(tag.name, safe=" ")
        dir_path = f"{tag_base_path}/{tag_name_encoded}"

        NcsReader._verify_tag_path_exist(adls_file_system_client, f"{dir_path}/")

        for year in years:
            file_path = None
            file_type = None
            for file_lookup in self.file_lookups:
                file_path = file_lookup.lookup(adls_file_system_client, dir_path,
                                               tag_name_encoded, year)
                if file_path is not None:
                    file_type = file_lookup.file_type
                    break
            else:
                # No lookup knows a file for this year.
                continue
            logger.info(f"Parsing file {file_path}")

            try:
                info = adls_file_system_client.info(file_path)
                file_size = info.get("length") / (1024**2)
                logger.debug(
                    f"File size for file {file_path}: {file_size:.2f}MB")

                if dry_run:
                    logger.info("Dry run only, returning empty frame early")
                    return pd.Series()
                before_downloading = timeit.default_timer()
                with adls_file_system_client.open(file_path, "rb") as f:
                    df = file_type.read_df(f)
                    df = df.rename(columns={"Value": tag.name})
                    df = df[~df["Status"].isin(status_codes)]
                    df = df.sort_index()
                    all_years.append(df)
                    elapsed = timeit.default_timer() - before_downloading
                    logger.info(f"Done in {elapsed:.2f} sec {file_path}")

            except FileNotFoundError as e:
                logger.debug(f"{file_path} not found, skipping it: {e}")

        # Best effort: a failing concatenation (e.g. nothing was read) yields an
        # empty series named after the tag instead of an error.
        try:
            combined = pd.concat(all_years)
        except Exception as e:
            logger.debug(f"Not able to concatinate all years: {e}.")
            return pd.Series(name=tag.name, data=None)

        # There often comes duplicated timestamps, keep the last
        superseded = combined.index.duplicated(keep="last")
        if superseded.any():
            combined = combined[~superseded]

        return combined[tag.name]
示例#4
0
def is_file(client: core.AzureDLFileSystem, path: str):
    """
    Tell whether ``path`` refers to a file according to ``client.info``.

    Parameters
    ----------
    client: core.AzureDLFileSystem
        Client used to look up the path.
    path: str
        Path to inspect.

    Returns
    -------
    ``False`` when the lookup raises ``FileNotFoundError``; otherwise the
    result of comparing the ``"type"`` entry of the returned info with
    ``"FILE"``.
    """
    try:
        entry = client.info(path)
    except FileNotFoundError:
        # A missing path is simply "not a file".
        return False
    else:
        return entry["type"] == "FILE"
示例#5
0
    def read_tag_files(
        adls_file_system_client: core.AzureDLFileSystem,
        tag: SensorTag,
        years: range,
        dry_run: Optional[bool] = False,
        remove_status_codes: Optional[list] = None,
        dl_base_path: Optional[str] = None,
    ) -> pd.Series:
        """
        Download the CSV files of one tag for the given years and return their
        values as a single series.

        Each file is expected at
        ``<base path>/<encoded tag name>/<encoded tag name>_<year>.csv`` and is
        parsed as semicolon separated, header-less CSV with the columns Sensor,
        value, Timestamp and Status. The value is read as float32 and the
        Timestamp is parsed as UTC and used as index.

        Parameters
        ----------
        adls_file_system_client: core.AzureDLFileSystem
            the AzureDLFileSystem client to use
        tag: SensorTag
            the tag to download data for
        years: range
            range object providing years to include
        dry_run: Optional[bool]
            if True, don't download data, just check info, log, and return an
            empty series as soon as the first file has been found
        remove_status_codes: Optional[list]
            Removes data with Status code(s) in the list. ``None`` (the
            default) removes data with Status code 0; pass an empty list to
            keep all rows.
        dl_base_path: Optional[str]
            Base path used to override the asset to path dictionary. Useful for
            demos and other non-production settings.

        Returns
        -------
        pd.Series:
            Series with all years for one tag. An empty series is returned for
            a dry run and when the yearly frames could not be concatenated
            (for instance because no file was found).

        Raises
        ------
        ValueError
            If no base path is given and none can be derived from the asset of
            the tag.

        Notes
        -----
        The tag directory is checked up front through
        ``NcsReader._verify_tag_path_exist``; an error raised by that check is
        not handled here. A ``FileNotFoundError`` for an individual yearly file
        is logged and the file is skipped.
        """
        tag_base_path = (
            dl_base_path if dl_base_path else NcsReader.base_path_from_asset(tag.asset)
        )

        if not tag_base_path:
            raise ValueError(f"Unable to find base path from tag {tag} ")

        # Resolve the default here instead of in the signature: this avoids a
        # mutable default shared between calls, and an explicit ``None`` no
        # longer reaches ``isin``, which only accepts list-like values.
        status_codes = [0] if remove_status_codes is None else remove_status_codes

        all_years = []
        logger.info(f"Downloading tag: {tag} for years: {years}")
        tag_name_encoded = quote(tag.name, safe=" ")
        tag_dir = f"{tag_base_path}/{tag_name_encoded}"

        NcsReader._verify_tag_path_exist(adls_file_system_client, f"{tag_dir}/")

        for year in years:
            file_path = f"{tag_dir}/{tag_name_encoded}_{year}.csv"
            logger.info(f"Parsing file {file_path}")

            try:
                info = adls_file_system_client.info(file_path)
                file_size = info.get("length") / (1024 ** 2)
                logger.debug(f"File size for file {file_path}: {file_size:.2f}MB")

                if dry_run:
                    logger.info("Dry run only, returning empty frame early")
                    return pd.Series()

                with adls_file_system_client.open(file_path, "rb") as f:
                    df = pd.read_csv(
                        f,
                        sep=";",
                        header=None,
                        names=["Sensor", tag.name, "Timestamp", "Status"],
                        usecols=[tag.name, "Timestamp", "Status"],
                        dtype={tag.name: np.float32},
                        parse_dates=["Timestamp"],
                        date_parser=lambda col: pd.to_datetime(col, utc=True),
                        index_col="Timestamp",
                    )
                    df = df[~df["Status"].isin(status_codes)]
                    all_years.append(df)
                    logger.info(f"Done parsing file {file_path}")

            except FileNotFoundError as e:
                logger.debug(f"{file_path} not found, skipping it: {e}")

        # Best effort: a failing concatenation (e.g. nothing was read) yields an
        # empty series named after the tag instead of an error.
        try:
            combined = pd.concat(all_years)
        except Exception as e:
            logger.debug(f"Not able to concatinate all years: {e}.")
            return pd.Series(name=tag.name, data=None)

        # There often comes duplicated timestamps, keep the last
        superseded = combined.index.duplicated(keep="last")
        if superseded.any():
            combined = combined[~superseded]

        return combined[tag.name]