Example #1
0
def load_data(fname):
    """Load a derived time-series CSV from the permafrost Azure blob store.

    Args:
        fname (str): Name of the CSV file inside the
            ``timeseries_derived_data_products`` prefix. Examples:
            ``"MH30_temperature_rock_2017.csv"`` (rock temperature),
            ``"MH10_resistivity_rock_2017.csv"`` (resistivity),
            ``"MH25_vaisalawxt520windpth_2017.csv"`` (weather station).

    Returns:
        The sensor data produced by ``stuett.data.CsvSource`` with the
        ``"position"`` label dropped from the ``name`` dimension.
    """
    # Use credentials from the local config when available; otherwise fall
    # back to the public hackathon account (anonymous read access).
    account_name = (get_setting("azure")["account_name"]
                    if setting_exists("azure") else "storageaccountperma8980")
    account_key = (get_setting("azure")["account_key"]
                   if setting_exists("azure") else None)
    store = stuett.ABSStore(
        container="hackathon-on-permafrost",
        prefix="timeseries_derived_data_products",
        account_name=account_name,
        account_key=account_key,
    )

    # NOTE: the original code assigned an unused local
    # `rock_temperature_file`; the file actually loaded is `fname`.
    rock_temperature_node = stuett.data.CsvSource(fname, store=store)
    rock_temperature = rock_temperature_node()

    # Drop the sensor-position metadata; only measurements are returned.
    rock_temperature = rock_temperature.drop(dim="name", labels=["position"])
    return rock_temperature
Example #2
0
from matplotlib.pyplot import imshow
import anomaly_visualization
from dateutil import rrule
from datetime import date, timedelta
from datetime import datetime
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from scipy.fftpack import fft
from sklearn.impute import SimpleImputer
import time
import os
from sklearn.decomposition import PCA

# Resolve Azure credentials: prefer a locally configured "azure" setting,
# otherwise fall back to the public hackathon account (anonymous read access,
# no key).
account_name = (get_setting("azure")["account_name"]
                if setting_exists("azure") else "storageaccountperma8980")
account_key = (get_setting("azure")["account_key"]
               if setting_exists("azure") else None)
# Blob store holding the raw 4D seismic data.
store = stuett.ABSStore(
    container="hackathon-on-permafrost",
    prefix="seismic_data/4D/",
    account_name=account_name,
    account_key=account_key,
)
# CSV files (under the derived-data-products prefix) used by this script:
# rock temperature and precipitation sensors.
rock_temperature_file = "MH30_temperature_rock_2017.csv"
prec_file = "MH25_vaisalawxt520prec_2017.csv"
derived_store = stuett.ABSStore(
    container="hackathon-on-permafrost",
    prefix="timeseries_derived_data_products",
    account_name=account_name,
    account_key=account_key,
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE."""

import stuett
from stuett.global_config import get_setting, setting_exists

from azure.storage.blob import (
    BlockBlobService,
)  # make sure to install 2.1 version with `pip install azure-storage-blob==2.1.0`
import zarr

# Get the account name which is "storageaccountperma8980" for the hackathon
# If you stored it in a config file, it will be loaded
# Resolve Azure credentials once: prefer the locally configured "azure"
# setting, otherwise fall back to the public hackathon account (read-only,
# no key required).
azure_configured = setting_exists("azure")
account_name = (get_setting("azure")["account_name"] if azure_configured
                else "storageaccountperma8980")
if azure_configured:
    account_key = get_setting("azure")["account_key"]
else:
    account_key = None

if account_key is not None:
    print("using credentials")

# Create a blob service and list all data available
block_blob_service = BlockBlobService(account_name=account_name,
                                      account_key=account_key)
print("\nList blobs in the container")
generator = block_blob_service.list_blobs("hackathon-on-permafrost")
for i, blob in enumerate(generator):
Example #4
0
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE."""

import stuett
from stuett.global_config import get_setting, setting_exists

from azure.storage.blob import (
    BlockBlobService,
)  # make sure to install 2.1 version with `pip install azure-storage-blob==2.1.0`
import zarr

# Resolve Azure credentials: use a locally configured "azure" setting when
# present, otherwise fall back to the public (read-only) hackathon account.
account_name = (get_setting("azure")["account_name"]
                if setting_exists("azure") else "storageaccountperma8980")
account_key = get_setting("azure")["account_key"] if setting_exists(
    "azure") else None

if account_key is not None:
    print("using credentials")

# Create a blob service and list all data available.
# BUG FIX: pass the resolved `account_name` instead of the hard-coded public
# account; otherwise a user-configured account_key would be paired with the
# wrong account and authentication would fail.
block_blob_service = BlockBlobService(account_name=account_name,
                                      account_key=account_key)
print("\nList blobs in the container")
generator = block_blob_service.list_blobs("hackathon-on-permafrost")
for blob in generator:
    print("\t Blob name: " + blob.name)
# Create a zarr store and load the data from the
    def __init__(
        self,
        local,
        data_path="../data",
        transform=None,
        time_slice=None,
    ):
        """Build the image/temperature dataset for the given time slice.

        Args:
            local (bool): Whether to read the dataset from a local storage
                location or from a public Azure share.
            data_path (str, optional): If the data should be read from a local
                location, then this folder will denote the location of the
                dataset.
            transform (callable, optional): Optional transform to be applied
                on images.
            time_slice (dict, optional): Can be used to create a different
                train and test set. Defaults to the full year 2017
                (``{"start_time": "2017-01-01", "end_time": "2017-12-31"}``).
                Note, this is not a pretty solution, especially because
                time values are not interleaved. I.e., if time information is
                used as input to a network, but the network has never seen
                values from the corresponding month, then it can't make
                confident predictions.

        Raises:
            NotImplementedError: If ``transform`` is given.
            RuntimeError: If ``local`` is True and the expected files are not
                found under ``data_path``.
        """
        # BUG FIX: avoid a mutable dict as a default argument; build the
        # default slice inside the body instead.
        if time_slice is None:
            time_slice = {
                "start_time": "2017-01-01",
                "end_time": "2017-12-31",
            }

        if transform is not None:
            raise NotImplementedError("transform not implemented!")
        self.transform = transform

        # This sensor contains near-surface temperature readings and is on the
        # south side and therefore receives a lot of sunshine.
        rock_temperature_file_mh10 = "MH10_temperature_rock_2017.csv"  # South

        radiation_file = "MH15_radiometer__conv_2017.csv"

        if not local:
            # Use configured Azure credentials when available; otherwise fall
            # back to the public (read-only) hackathon account.
            account_name = (get_setting("azure")["account_name"]
                            if setting_exists("azure") else
                            "storageaccountperma8980")
            account_key = (get_setting("azure")["account_key"]
                           if setting_exists("azure") else None)

            ts_store = stuett.ABSStore(
                container="hackathon-on-permafrost",
                prefix="timeseries_derived_data_products",
                account_name=account_name,
                account_key=account_key,
            )

            img_store = stuett.ABSStore(
                container="hackathon-on-permafrost",
                prefix="timelapse_images_fast",
                account_name=account_name,
                account_key=account_key,
            )

        else:
            timeseries_folder = (Path(data_path).joinpath(
                "timeseries_derived_data_products").resolve())
            ts_store = stuett.DirectoryStore(timeseries_folder)
            # BUG FIX: check membership in the store created above
            # (`ts_store`); the original referenced an undefined name
            # `store`, raising NameError on the local path.
            if rock_temperature_file_mh10 not in ts_store:
                raise RuntimeError("Please provide a valid path to the " +
                                   "permafrost data!")

            img_store = stuett.DirectoryStore(
                Path(data_path).joinpath("timelapse_images_fast"))
            # BUG FIX: same issue — validate against `img_store`, not an
            # undefined `store`.
            if "2017-01-01/20170101_080018.JPG" not in img_store:
                raise RuntimeError("Please provide a valid path to the " +
                                   "permafrost images.")

        # self._ts_store = ts_store
        self._img_store = img_store

        ### Load timeseries data.
        rock_temperature_node_mh10 = stuett.data.CsvSource(
            rock_temperature_file_mh10, store=ts_store)
        rock_temp_mh10 = rock_temperature_node_mh10(time_slice)

        radiation_node = stuett.data.CsvSource(radiation_file, store=ts_store)
        radiation = radiation_node(time_slice)

        net_radiation = radiation.loc[:, ["net_radiation"]]
        surface_temp = rock_temp_mh10.loc[:, ["temperature_nearsurface_t2"]]
        target_temp = rock_temp_mh10.loc[:, ["temperature_10cm"]]

        ### Load image filenames.
        image_node = stuett.data.MHDSLRFilenames(
            store=img_store,
            force_write_to_remote=True,
            as_pandas=False,
        )
        image_fns = image_node(time_slice)

        ### Find image filenames that were captured close to temperature
        ### measures.
        # With close we mean within a 20min window.
        # Temperature/radiation values that have no corresponding image are
        # ignored.

        # Sanity check!
        # for t1, t2 in zip(radiation['time'], rock_temp_mh10['time']):
        #    assert (t1 == t2)

        j = 0
        n = len(image_fns["time"])

        measurement_pairs = []

        for i, t in enumerate(rock_temp_mh10["time"].values):
            while j < n:
                # Translate difference in timestamps to minutes before casting
                # to int.
                # BUG FIX: `np.int` is deprecated and was removed in
                # NumPy 1.24; cast with the builtin `int` instead.
                diff = ((image_fns["time"][j] -
                         t).values.astype("timedelta64[m]").astype(int))

                if diff > 10:
                    # Image too far in the future, ignore sensor value.
                    break

                absdiff = np.abs(diff)
                if absdiff < 10:
                    # The image is very close, simply check whether the next
                    # picture is even closer. Otherwise, we take the current
                    # image.
                    if j + 1 < n:
                        absdiff2 = np.abs(
                            (image_fns["time"][j + 1] -
                             t).values.astype("timedelta64[m]").astype(int))
                    else:
                        absdiff2 = None

                    if absdiff2 is None or absdiff < absdiff2:
                        measurement_pairs.append((i, j))
                        j += 1
                    else:
                        measurement_pairs.append((i, j + 1))
                        j += 2

                    break

                j += 1

        ### Build dataset (make sure that there are no None values in the
        ### timeseries measurements).
        self._img_fns = []
        self._surface_temp = []
        self._target_temp = []
        self._timestamps = []
        self._radiation = []

        # This is coarse time information that one may provide as additional
        # information. We encode the (normalized) month and daytime information,
        # as this information may be quite helpful when judging temperature
        # values.
        # Though, it might also tempt the regression system to ignore all
        # other information and solely predict based on this information
        # (as a strong local minimum).
        self._month = []
        self._daytime = []

        assert np.all(~np.isnan(net_radiation.values))
        assert np.all(~np.isnan(surface_temp.values))
        # assert(np.all(~np.isnan(target_temp.values)))

        for i, j in measurement_pairs:
            # Skip samples whose regression target is missing.
            if np.any(np.isnan(target_temp.values[i, 0])):
                continue

            self._target_temp.append(target_temp.values[i, 0])
            self._surface_temp.append(surface_temp.values[i, 0])
            self._radiation.append(net_radiation.values[i, 0])

            self._timestamps.append(target_temp["time"].values[i])
            ts = pd.to_datetime(self._timestamps[-1])
            self._month.append(ts.month)
            # Minutes since midnight.
            self._daytime.append(ts.hour * 60 + ts.minute)

            self._img_fns.append(str(image_fns.values[0, j]))

        self._target_temp = np.array(self._target_temp, dtype=np.float32)
        self._surface_temp = np.array(self._surface_temp, dtype=np.float32)
        self._radiation = np.array(self._radiation, dtype=np.float32)

        self._month = np.array(self._month, dtype=np.float32)
        self._daytime = np.array(self._daytime, dtype=np.float32)

        # Normalize regression values (z-score); means/stds are kept so
        # predictions can be denormalized later.
        self.target_temp_mean = self._target_temp.mean()
        self.target_temp_std = self._target_temp.std()

        self.surface_temp_mean = self._surface_temp.mean()
        self.surface_temp_std = self._surface_temp.std()

        self.radiation_mean = self._radiation.mean()
        self.radiation_std = self._radiation.std()

        self._target_temp = (self._target_temp -
                             self.target_temp_mean) / self.target_temp_std

        self._surface_temp = (self._surface_temp -
                              self.surface_temp_mean) / self.surface_temp_std

        self._radiation = (self._radiation -
                           self.radiation_mean) / self.radiation_std

        self._month = (self._month - self._month.mean()) / self._month.std()
        # BUG FIX: the original normalized `self._month` a second time and
        # stored the result in `self._daytime`; normalize the daytime values
        # themselves instead.
        self._daytime = (self._daytime -
                         self._daytime.mean()) / self._daytime.std()

        print("dataset contains %d samples." % len(self._img_fns))