Example #1
import stuett
from stuett.global_config import get_setting, setting_exists


def load_data(fname):

    # change here to get data from another sensor
    rock_temperature_file = "MH30_temperature_rock_2017.csv"
    # rock_temperature_file= "MH10_resistivity_rock_2017.csv"
    # rock_temperature_file= "MH25_vaisalawxt520windpth_2017.csv"
    # Getting cloud data
    account_name = (get_setting("azure")["account_name"]
                    if setting_exists("azure") else "storageaccountperma8980")
    account_key = (get_setting("azure")["account_key"]
                   if setting_exists("azure") else None)
    store = stuett.ABSStore(
        container="hackathon-on-permafrost",
        prefix="timeseries_derived_data_products",
        account_name=account_name,
        account_key=account_key,
    )

    rock_temperature_node = stuett.data.CsvSource(fname, store=store)
    rock_temperature = rock_temperature_node()

    # Drop the "position" column along the "name" dimension before returning.
    rock_temperature = rock_temperature.drop(dim="name", labels=["position"])
    return rock_temperature
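
A minimal usage sketch for the function above; network access to the public Azure container is assumed, and the print line is illustrative only:

# Hypothetical usage: load the default MH30 rock temperature series.
rock_temperature = load_data("MH30_temperature_rock_2017.csv")
print(rock_temperature.dims, rock_temperature.sizes)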
Example #2
import os
import time

import stuett
from stuett.global_config import get_setting, setting_exists

from scipy.fftpack import fft
from sklearn.covariance import EllipticEnvelope
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.impute import SimpleImputer
from sklearn.neighbors import LocalOutlierFactor

account_name = (get_setting("azure")["account_name"]
                if setting_exists("azure") else "storageaccountperma8980")
account_key = (get_setting("azure")["account_key"]
               if setting_exists("azure") else None)
store = stuett.ABSStore(
    container="hackathon-on-permafrost",
    prefix="seismic_data/4D/",
    account_name=account_name,
    account_key=account_key,
)
rock_temperature_file = "MH30_temperature_rock_2017.csv"
prec_file = "MH25_vaisalawxt520prec_2017.csv"
derived_store = stuett.ABSStore(
    container="hackathon-on-permafrost",
    prefix="timeseries_derived_data_products",
    account_name=account_name,
    account_key=account_key,
)
image_store = stuett.ABSStore(
    container="hackathon-on-permafrost",
    prefix="timelapse_images_fast",
    account_name=account_name,
    account_key=account_key,
)
Example #3
if args.reload_all:
    args.reload_frozen = True

############ SETTING UP DATA LOADERS ############
#################################################
if not args.local:
    from stuett.global_config import get_setting, setting_exists

    account_name = (get_setting("azure")["account_name"]
                    if setting_exists("azure") else "storageaccountperma8980")
    account_key = (get_setting("azure")["account_key"]
                   if setting_exists("azure") else None)
    store = stuett.ABSStore(
        container="hackathon-on-permafrost",
        prefix=prefix,
        account_name=account_name,
        account_key=account_key,
    )
    annotation_store = stuett.ABSStore(
        container="hackathon-on-permafrost",
        prefix="annotations",
        account_name=account_name,
        account_key=account_key,
    )
else:
    store = stuett.DirectoryStore(Path(data_path).joinpath(prefix))
    if "2017-01-01/20170101_080018.JPG" not in store and "MH36/2017/EHE.D/4D.MH36.A.EHE.D.20171231_230000.miniseed" not in store:
        raise RuntimeError(
            f"Please provide a valid path to the permafrost {prefix} data or see README how to download it"
        )
    annotation_store = stuett.DirectoryStore(
        Path(data_path).joinpath("annotations"))  # assumed location, mirroring the "annotations" prefix above
Example #4
                    help="Only use local files and not data from Azure")
args = parser.parse_args()
data_path = Path(args.path)

# change here to get data from another sensor
vaisalawxt520windpth_file = "MH25_vaisalawxt520windpth_2017.csv"

# Getting either cloud or local data file
if not args.local:
    account_name = (get_setting("azure")["account_name"]
                    if setting_exists("azure") else "storageaccountperma8980")
    account_key = (get_setting("azure")["account_key"]
                   if setting_exists("azure") else None)
    store = stuett.ABSStore(
        container="hackathon-on-permafrost",
        prefix="timeseries_derived_data_products",
        account_name=account_name,
        account_key=account_key,
    )
else:
    timeseries_folder = Path(data_path).joinpath(
        "timeseries_derived_data_products").resolve()
    store = stuett.DirectoryStore(timeseries_folder)
    if vaisalawxt520windpth_file not in store:
        raise RuntimeError(
            "Please provide a valid path to the permafrost data or see README how to download it"
        )

vaisalawxt520windpth_node = stuett.data.CsvSource(vaisalawxt520windpth_file,
                                                  store=store)
vaisalawxt520windpth = vaisalawxt520windpth_node()
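
A quick way to inspect the result; the "name" dimension is an assumption, matching the CsvSource results in the other examples, and the selected date is arbitrary:

# Illustrative inspection; check the "name" coordinate for the actual channels.
print(vaisalawxt520windpth["name"].values)          # available sensor channels
print(vaisalawxt520windpth.sel(time="2017-08-02"))  # one day of measurements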
Example #5
from azure.storage.blob import BlockBlobService  # azure-storage-blob==2.1.0 API (see note below)

# Create a blob service and list all data available
block_blob_service = BlockBlobService(account_name=account_name,
                                      account_key=account_key)
print("\nList blobs in the container")
generator = block_blob_service.list_blobs("hackathon-on-permafrost")
for i, blob in enumerate(generator):
    print("\t Blob name: " + blob.name)
    if i == 5:
        break

print("List some documents")
# In stuett we can use a zarr store and load the data from there
store = stuett.ABSStore(
    container="hackathon-on-permafrost",
    prefix="docs/",
    account_name=account_name,
    account_key=account_key,
    blob_service_kwargs={},
)

for i, key in enumerate(store.keys()):
    print(key)
    if i == 5:
        break

# # Currently, stuett (or zarr in the backend) only supports azure-storage-blob==2.1.0
# # But a newer version is available which you can use independently
# # using azure-storage-blob==12.0.0
# # untested
# from azure.storage.blob import BlobServiceClient
# service = BlobServiceClient(account_url="https://storageaccountperma8980.blob.core.windows.net/", credential=account_key)
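
The commented note above can be fleshed out into a rough (and, as the note says, untested) v12 equivalent of the BlockBlobService listing; in v12 the prefix filter is passed as name_starts_with:

# Hypothetical v12 equivalent of the listing above.
from azure.storage.blob import BlobServiceClient

service = BlobServiceClient(
    account_url="https://storageaccountperma8980.blob.core.windows.net/",
    credential=account_key,
)
container_client = service.get_container_client("hackathon-on-permafrost")
for i, blob in enumerate(container_client.list_blobs(name_starts_with="docs/")):
    print("\t Blob name: " + blob.name)
    if i == 5:
        break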
Example #6
if args.high_quality:
    prefix = "timelapse_images"
else:
    prefix = "timelapse_images_fast"

if not args.local:
    from stuett.global_config import get_setting, setting_exists

    account_name = (get_setting("azure")["account_name"]
                    if setting_exists("azure") else "storageaccountperma8980")
    account_key = (get_setting("azure")["account_key"]
                   if setting_exists("azure") else None)
    store = stuett.ABSStore(
        container="hackathon-on-permafrost",
        prefix=prefix,
        account_name=account_name,
        account_key=account_key,
    )
    annotation_store = stuett.ABSStore(
        container="hackathon-on-permafrost",
        prefix="annotations",
        account_name=account_name,
        account_key=account_key,
    )

else:
    store = stuett.DirectoryStore(Path(data_path).joinpath(prefix))
    if "2017-01-01/20170101_080018.JPG" not in store:
        raise RuntimeError(
            "Please provide a valid path to the permafrost timelapse_images data or see README how to download it"
        )
    "--local",
    action="store_true",
    help="Only use local files and not data from Azure",
)
args = parser.parse_args()

data_path = Path(args.path)

if not args.local:
    account_name = (get_setting("azure")["account_name"]
                    if setting_exists("azure") else "storageaccountperma8980")
    account_key = (get_setting("azure")["account_key"]
                   if setting_exists("azure") else None)
    store = stuett.ABSStore(
        container="hackathon-on-permafrost",
        prefix="seismic_data/4D/",
        account_name=account_name,
        account_key=account_key,
    )
else:
    seismic_folder = Path(data_path).joinpath("seismic_data/4D/")
    store = stuett.DirectoryStore(seismic_folder)
    if "MH36/2017/EHE.D/4D.MH36.A.EHE.D.20171231_230000.miniseed" not in store:
        raise RuntimeError(
            "Please provide a valid path to the permafrost data or see README how to download it"
        )

seismic_node = stuett.data.SeismicSource(
    store=store,
    station="MH36",
    channel=["EHE", "EHN", "EHZ"],
    start_time="2017-08-02 10:00:00",
Example #8
annotations_path = data_path.joinpath("annotations")

if args.azure:
    account_name = (
        get_setting("azure")["account_name"]
        if setting_exists("azure")
        else "storageaccountperma8980"
    )
    account_key = (
        get_setting("azure")["account_key"] if setting_exists("azure") else None
    )
    if args.to_data_storage:
        output_store = stuett.ABSStore(
            container="hackathon-on-permafrost",
            prefix="annotations",
            account_name=account_name,
            account_key=account_key,
            blob_service_kwargs={},
        )
    else:
        output_store = stuett.DirectoryStore(annotations_path)
        input_store = stuett.ABSStore(
            container="hackathon-public-rw",
            prefix="",
            account_name=account_name,
            account_key=account_key,
        )
else:
    input_store = stuett.DirectoryStore(args.user_annotations)
    output_store = stuett.DirectoryStore(annotations_path)
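
A sketch of how annotations could then be copied between the two stores configured above, assuming both behave as key-value mappings (both stuett store types shown here do):

# Hypothetical sync loop: copy every annotation key from input to output.
for key in input_store.keys():
    output_store[key] = input_store[key]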
Example #9
    def __init__(
        self,
        local,
        data_path="../data",
        transform=None,
        time_slice={
            "start_time": "2017-01-01",
            "end_time": "2017-12-31"
        },
    ):
        """
        Args:
            local (bool): Whether to read the dataset from a local storage
                location or from a public Azure share.
            data_path (str, optional): If the data should be read from a local
                location, then this folder will denote the location of the
                dataset.
            transform (callable, optional): Optional transform to be applied
                on images.
            time_slice (dict): Can be used to create different train and test
                sets. Note, this is not a pretty solution, especially because
                time values are not interleaved: if time information is used
                as input to a network but the network has never seen values
                from the corresponding month, it cannot make confident
                predictions.
        """
        if transform is not None:
            raise NotImplementedError("transform not implemented!")
        self.transform = transform

        # This sensor contains near-surface temperature readings and is on the
        # south side and therefore receives a lot of sunshine.
        rock_temperature_file_mh10 = "MH10_temperature_rock_2017.csv"  # South

        radiation_file = "MH15_radiometer__conv_2017.csv"

        if not local:
            account_name = (get_setting("azure")["account_name"]
                            if setting_exists("azure") else
                            "storageaccountperma8980")
            account_key = (get_setting("azure")["account_key"]
                           if setting_exists("azure") else None)

            ts_store = stuett.ABSStore(
                container="hackathon-on-permafrost",
                prefix="timeseries_derived_data_products",
                account_name=account_name,
                account_key=account_key,
            )

            img_store = stuett.ABSStore(
                container="hackathon-on-permafrost",
                prefix="timelapse_images_fast",
                account_name=account_name,
                account_key=account_key,
            )

        else:
            timeseries_folder = (Path(data_path).joinpath(
                "timeseries_derived_data_products").resolve())
            ts_store = stuett.DirectoryStore(timeseries_folder)
            if rock_temperature_file_mh10 not in ts_store:
                raise RuntimeError("Please provide a valid path to the " +
                                   "permafrost data!")

            img_store = stuett.DirectoryStore(
                Path(data_path).joinpath("timelapse_images_fast"))
            if "2017-01-01/20170101_080018.JPG" not in store:
                raise RuntimeError("Please provide a valid path to the " +
                                   "permafrost images.")

        # self._ts_store = ts_store
        self._img_store = img_store

        ### Load timeseries data.
        rock_temperature_node_mh10 = stuett.data.CsvSource(
            rock_temperature_file_mh10, store=ts_store)
        rock_temp_mh10 = rock_temperature_node_mh10(time_slice)

        radiation_node = stuett.data.CsvSource(radiation_file, store=ts_store)
        radiation = radiation_node(time_slice)

        net_radiation = radiation.loc[:, ["net_radiation"]]
        surface_temp = rock_temp_mh10.loc[:, ["temperature_nearsurface_t2"]]
        target_temp = rock_temp_mh10.loc[:, ["temperature_10cm"]]

        ### Load image filenames.
        image_node = stuett.data.MHDSLRFilenames(
            store=img_store,
            force_write_to_remote=True,
            as_pandas=False,
        )
        image_fns = image_node(time_slice)

        ### Find image filenames that were captured close to temperature
        ### measurements.
        # By "close" we mean within a 20-minute window (i.e., ±10 minutes).
        # Temperature/radiation values that have no corresponding image are
        # ignored.

        # Sanity check!
        # for t1, t2 in zip(radiation['time'], rock_temp_mh10['time']):
        #    assert (t1 == t2)

        j = 0
        n = len(image_fns["time"])

        measurement_pairs = []

        for i, t in enumerate(rock_temp_mh10["time"].values):
            while j < n:
                # Translate difference in timestamps to minutes before casting
                # to int.
                diff = ((image_fns["time"][j] -
                         t).values.astype("timedelta64[m]").astype(int))

                if diff > 10:
                    # Image too far in the future, ignore sensor value.
                    break

                absdiff = np.abs(diff)
                if absdiff < 10:
                    # The image is very close, simply check whether the next
                    # picture is even closer. Otherwise, we take the current
                    # image.
                    if j + 1 < n:
                        absdiff2 = np.abs(
                            (image_fns["time"][j + 1] -
                             t).values.astype("timedelta64[m]").astype(int))
                    else:
                        absdiff2 = None

                    if absdiff2 is None or absdiff < absdiff2:
                        measurement_pairs.append((i, j))
                        j += 1
                    else:
                        measurement_pairs.append((i, j + 1))
                        j += 2

                    break

                j += 1

        ### Build dataset (make sure that there are no None values in the
        ### timeseries measurements).
        self._img_fns = []
        self._surface_temp = []
        self._target_temp = []
        self._timestamps = []
        self._radiation = []

        # This is coarse time information that one may provide as additional
        # information. We encode the (normalized) month and daytime information,
        # as this information may be quite helpful when judging temperature
        # values.
        # Though, it might also tempt the regression system to ignore all
        # other information and solely predict based on this information
        # (as a strong local minimum).
        self._month = []
        self._daytime = []

        assert np.all(~np.isnan(net_radiation.values))
        assert np.all(~np.isnan(surface_temp.values))
        # assert(np.all(~np.isnan(target_temp.values)))

        for i, j in measurement_pairs:
            if np.any(np.isnan(target_temp.values[i, 0])):
                continue

            self._target_temp.append(target_temp.values[i, 0])
            self._surface_temp.append(surface_temp.values[i, 0])
            self._radiation.append(net_radiation.values[i, 0])

            self._timestamps.append(target_temp["time"].values[i])
            ts = pd.to_datetime(self._timestamps[-1])
            self._month.append(ts.month)
            self._daytime.append(ts.hour * 60 + ts.minute)

            self._img_fns.append(str(image_fns.values[0, j]))

        self._target_temp = np.array(self._target_temp, dtype=np.float32)
        self._surface_temp = np.array(self._surface_temp, dtype=np.float32)
        self._radiation = np.array(self._radiation, dtype=np.float32)

        self._month = np.array(self._month, dtype=np.float32)
        self._daytime = np.array(self._daytime, dtype=np.float32)

        # Normalize regression values.
        self.target_temp_mean = self._target_temp.mean()
        self.target_temp_std = self._target_temp.std()

        self.surface_temp_mean = self._surface_temp.mean()
        self.surface_temp_std = self._surface_temp.std()

        self.radiation_mean = self._radiation.mean()
        self.radiation_std = self._radiation.std()

        self._target_temp = (self._target_temp -
                             self.target_temp_mean) / self.target_temp_std

        self._surface_temp = (self._surface_temp -
                              self.surface_temp_mean) / self.surface_temp_std

        self._radiation = (self._radiation -
                           self.radiation_mean) / self.radiation_std

        self._month = (self._month - self._month.mean()) / self._month.std()
        self._daytime = (self._daytime -
                         self._daytime.mean()) / self._daytime.std()

        print("dataset contains %d samples." % len(self._img_fns))