def load_data(fname):
    """Load one derived time-series CSV from the permafrost Azure container.

    Args:
        fname: Name of the CSV file below the
            ``timeseries_derived_data_products`` prefix, for example
            ``"MH30_temperature_rock_2017.csv"``. Pass a different file name
            to read another sensor.

    Returns:
        The data returned by the ``stuett.data.CsvSource`` node, with the
        ``"position"`` label dropped from the ``name`` dimension.
    """
    # Use the account configured in the stuett settings when there is one;
    # otherwise fall back to the default storage account without a key.
    account_name = (
        get_setting("azure")["account_name"]
        if setting_exists("azure")
        else "storageaccountperma8980"
    )
    account_key = (
        get_setting("azure")["account_key"] if setting_exists("azure") else None
    )

    store = stuett.ABSStore(
        container="hackathon-on-permafrost",
        prefix="timeseries_derived_data_products",
        account_name=account_name,
        account_key=account_key,
    )

    # Calling the node performs the actual load.
    node = stuett.data.CsvSource(fname, store=store)
    data = node()

    return data.drop(dim="name", labels=["position"])
from sklearn.covariance import EllipticEnvelope from sklearn.ensemble import IsolationForest from sklearn.neighbors import LocalOutlierFactor from scipy.fftpack import fft from sklearn.impute import SimpleImputer import time import os from sklearn.decomposition import PCA account_name = (get_setting("azure")["account_name"] if setting_exists("azure") else "storageaccountperma8980") account_key = (get_setting("azure")["account_key"] if setting_exists("azure") else None) store = stuett.ABSStore( container="hackathon-on-permafrost", prefix="seismic_data/4D/", account_name=account_name, account_key=account_key, ) rock_temperature_file = "MH30_temperature_rock_2017.csv" prec_file = "MH25_vaisalawxt520prec_2017.csv" derived_store = stuett.ABSStore( container="hackathon-on-permafrost", prefix="timeseries_derived_data_products", account_name=account_name, account_key=account_key, ) image_store = stuett.ABSStore( container="hackathon-on-permafrost", prefix="timelapse_images_fast", account_name=account_name, account_key=account_key,
if args.reload_all: args.reload_frozen = True ############ SETTING UP DATA LOADERS ############ ################################################# if not args.local: from stuett.global_config import get_setting, setting_exists account_name = (get_setting("azure")["account_name"] if setting_exists("azure") else "storageaccountperma8980") account_key = (get_setting("azure")["account_key"] if setting_exists("azure") else None) store = stuett.ABSStore( container="hackathon-on-permafrost", prefix=prefix, account_name=account_name, account_key=account_key, ) annotation_store = stuett.ABSStore( container="hackathon-on-permafrost", prefix="annotations", account_name=account_name, account_key=account_key, ) else: store = stuett.DirectoryStore(Path(data_path).joinpath(prefix)) if "2017-01-01/20170101_080018.JPG" not in store and "MH36/2017/EHE.D/4D.MH36.A.EHE.D.20171231_230000.miniseed" not in store: raise RuntimeError( f"Please provide a valid path to the permafrost {prefix} data or see README how to download it" ) annotation_store = stuett.DirectoryStore(
help="Only use local files and not data from Azure") args = parser.parse_args() data_path = Path(args.path) # change here to get data from another sensor vaisalawxt520windpth_file = "MH25_vaisalawxt520windpth_2017.csv" # Getting either cloud or local data file if not args.local: account_name = (get_setting("azure")["account_name"] if setting_exists("azure") else "storageaccountperma8980") account_key = (get_setting("azure")["account_key"] if setting_exists("azure") else None) store = stuett.ABSStore( container="hackathon-on-permafrost", prefix="timeseries_derived_data_products", account_name=account_name, account_key=account_key, ) else: timeseries_folder = Path(data_path).joinpath( "timeseries_derived_data_products").resolve() store = stuett.DirectoryStore(timeseries_folder) if vaisalawxt520windpth_file not in store: raise RuntimeError( "Please provide a valid path to the permafrost data or see README how to download it" ) vaisalawxt520windpth_node = stuett.data.CsvSource(vaisalawxt520windpth_file, store=store) vaisalawxt520windpth = vaisalawxt520windpth_node()
# Connect to the blob service and print a sample of the container contents.
block_blob_service = BlockBlobService(account_name=account_name,
                                      account_key=account_key)

print("\nList blobs in the container")
generator = block_blob_service.list_blobs("hackathon-on-permafrost")
# Pairing with range(6) caps the output at six entries and stops without
# pulling a seventh item from the listing.
for i, blob in zip(range(6), generator):
    print("\t Blob name: " + blob.name)

print("List some documents")

# stuett can expose the same container as a zarr store; keys are read
# straight from it.
store = stuett.ABSStore(
    container="hackathon-on-permafrost",
    prefix="docs/",
    account_name=account_name,
    account_key=account_key,
    blob_service_kwargs={},
)
for i, key in zip(range(6), store.keys()):
    print(key)

# NOTE: stuett (or zarr in the backend) currently only supports
# azure-storage-blob==2.1.0. A newer release can be used independently of
# stuett, e.g. azure-storage-blob==12.0.0 (untested):
#
# from azure.storage.blob import BlobServiceClient
# service = BlobServiceClient(
#     account_url="https://storageaccountperma8980.blob.core.windows.net/",
#     credential=account_key)
# Choose the image set: the full-quality pictures or the "fast" variant.
prefix = "timelapse_images" if args.high_quality else "timelapse_images_fast"

if not args.local:
    from stuett.global_config import get_setting, setting_exists

    # Use the configured Azure account when available, otherwise the default
    # storage account without a key.
    if setting_exists("azure"):
        account_name = get_setting("azure")["account_name"]
    else:
        account_name = "storageaccountperma8980"
    if setting_exists("azure"):
        account_key = get_setting("azure")["account_key"]
    else:
        account_key = None

    # Images and annotations live under different prefixes of one container.
    store = stuett.ABSStore(
        container="hackathon-on-permafrost",
        prefix=prefix,
        account_name=account_name,
        account_key=account_key,
    )
    annotation_store = stuett.ABSStore(
        container="hackathon-on-permafrost",
        prefix="annotations",
        account_name=account_name,
        account_key=account_key,
    )
else:
    store = stuett.DirectoryStore(Path(data_path) / prefix)
    # Probe for one known image to detect a wrong or empty data folder early.
    if "2017-01-01/20170101_080018.JPG" not in store:
        raise RuntimeError(
            "Please provide a valid path to the permafrost timelapse_images data or see README how to download it"
        )
"--local", action="store_true", help="Only use local files and not data from Azure", ) args = parser.parse_args() data_path = Path(args.path) if not args.local: account_name = (get_setting("azure")["account_name"] if setting_exists("azure") else "storageaccountperma8980") account_key = (get_setting("azure")["account_key"] if setting_exists("azure") else None) store = stuett.ABSStore( container="hackathon-on-permafrost", prefix="seismic_data/4D/", account_name=account_name, account_key=account_key, ) else: seismic_folder = Path(data_path).joinpath("seismic_data/4D/") store = stuett.DirectoryStore(seismic_folder) if "MH36/2017/EHE.D/4D.MH36.A.EHE.D.20171231_230000.miniseed" not in store: raise RuntimeError( "Please provide a valid path to the permafrost data or see README how to download it" ) seismic_node = stuett.data.SeismicSource( store=store, station="MH36", channel=["EHE", "EHN", "EHZ"], start_time="2017-08-02 10:00:00",
# Local folder that holds the annotation files.
annotations_path = data_path.joinpath("annotations")

if args.azure:
    # Use the configured Azure account when available, otherwise the default
    # storage account without a key.
    if setting_exists("azure"):
        account_name = get_setting("azure")["account_name"]
    else:
        account_name = "storageaccountperma8980"
    if setting_exists("azure"):
        account_key = get_setting("azure")["account_key"]
    else:
        account_key = None

    # Results go either back to the data storage container or to the local
    # annotations folder.
    if args.to_data_storage:
        output_store = stuett.ABSStore(
            container="hackathon-on-permafrost",
            prefix="annotations",
            account_name=account_name,
            account_key=account_key,
            blob_service_kwargs={},
        )
    else:
        output_store = stuett.DirectoryStore(annotations_path)

    # Input is read from the "hackathon-public-rw" container.
    input_store = stuett.ABSStore(
        container="hackathon-public-rw",
        prefix="",
        account_name=account_name,
        account_key=account_key,
    )
else:
    # Fully local operation: read from the given folder, write next to the
    # dataset.
    input_store = stuett.DirectoryStore(args.user_annotations)
    output_store = stuett.DirectoryStore(annotations_path)
def __init__(
    self,
    local,
    data_path="../data",
    transform=None,
    time_slice={
        "start_time": "2017-01-01",
        "end_time": "2017-12-31"
    },
):
    """Build the dataset by pairing temperature readings with images.

    Args:
        local (bool): Whether to read the dataset from a local storage
            location or from a public Azure share.
        data_path (str, optional): If the data should be read from a local
            location, then this folder will denote the location of the
            dataset.
        transform (callable, optional): Optional transform to be applied
            on images. Not implemented yet; must be ``None``.
        time_slice (dict): Can be used to create a different train and
            test set.
            Note, this is not a pretty solution, especially because time
            values are not interleaved. I.e., if time information is used
            as input to a network, but the network has never seen values
            from the corresponding month, then it can't make confident
            predictions.

    Raises:
        NotImplementedError: If ``transform`` is not ``None``.
        RuntimeError: If ``local`` is set and the expected time-series
            file or reference image is missing below ``data_path``.
    """
    if transform is not None:
        raise NotImplementedError("transform not implemented!")
    self.transform = transform

    # The default ``time_slice`` dict is shared by every call; hand a copy
    # to the data nodes so nothing downstream can modify the default.
    if time_slice is not None:
        time_slice = dict(time_slice)

    # This sensor contains near-surface temperature readings and is on the
    # south side and therefore receives a lot of sunshine.
    rock_temperature_file_mh10 = "MH10_temperature_rock_2017.csv"  # South
    radiation_file = "MH15_radiometer__conv_2017.csv"

    if not local:
        account_name = (get_setting("azure")["account_name"]
                        if setting_exists("azure") else
                        "storageaccountperma8980")
        account_key = (get_setting("azure")["account_key"]
                       if setting_exists("azure") else None)

        ts_store = stuett.ABSStore(
            container="hackathon-on-permafrost",
            prefix="timeseries_derived_data_products",
            account_name=account_name,
            account_key=account_key,
        )
        img_store = stuett.ABSStore(
            container="hackathon-on-permafrost",
            prefix="timelapse_images_fast",
            account_name=account_name,
            account_key=account_key,
        )
    else:
        timeseries_folder = (Path(data_path).joinpath(
            "timeseries_derived_data_products").resolve())
        ts_store = stuett.DirectoryStore(timeseries_folder)
        # Check the store that was just created (the previous code tested
        # an undefined name ``store``).
        if rock_temperature_file_mh10 not in ts_store:
            raise RuntimeError("Please provide a valid path to the " +
                               "permafrost data!")

        img_store = stuett.DirectoryStore(
            Path(data_path).joinpath("timelapse_images_fast"))
        if "2017-01-01/20170101_080018.JPG" not in img_store:
            raise RuntimeError("Please provide a valid path to the " +
                               "permafrost images.")

    self._img_store = img_store

    ### Load timeseries data.
    rock_temperature_node_mh10 = stuett.data.CsvSource(
        rock_temperature_file_mh10, store=ts_store)
    rock_temp_mh10 = rock_temperature_node_mh10(time_slice)

    radiation_node = stuett.data.CsvSource(radiation_file, store=ts_store)
    radiation = radiation_node(time_slice)

    net_radiation = radiation.loc[:, ["net_radiation"]]
    surface_temp = rock_temp_mh10.loc[:, ["temperature_nearsurface_t2"]]
    target_temp = rock_temp_mh10.loc[:, ["temperature_10cm"]]

    ### Load image filenames.
    image_node = stuett.data.MHDSLRFilenames(
        store=img_store,
        force_write_to_remote=True,
        as_pandas=False,
    )
    image_fns = image_node(time_slice)

    ### Find image filenames that were captured close to temperature
    ### measures.
    # With close we mean within a 20min window.
    # Temperature/radiation values that have no corresponding image are
    # ignored.
    image_times = image_fns["time"]
    n_images = len(image_times)

    def minutes_from(img_idx, sensor_time):
        """Return the signed offset in minutes of an image to a reading."""
        delta = (image_times[img_idx] - sensor_time).values
        # ``np.int`` was removed from NumPy; the builtin ``int`` is the
        # type it used to alias.
        return delta.astype("timedelta64[m]").astype(int)

    # ``j`` only ever moves forward: both sequences are walked once, in
    # step, so an image is paired with at most one sensor reading.
    j = 0
    measurement_pairs = []
    for i, t in enumerate(rock_temp_mh10["time"].values):
        while j < n_images:
            diff = minutes_from(j, t)
            if diff > 10:
                # Image too far in the future, ignore sensor value.
                break

            absdiff = np.abs(diff)
            if absdiff < 10:
                # The image is very close, simply check whether the next
                # picture is even closer. Otherwise, we take the current
                # image.
                if j + 1 < n_images:
                    absdiff2 = np.abs(minutes_from(j + 1, t))
                else:
                    absdiff2 = None

                if absdiff2 is None or absdiff < absdiff2:
                    measurement_pairs.append((i, j))
                    j += 1
                else:
                    measurement_pairs.append((i, j + 1))
                    j += 2
                break

            # Image too far in the past for this reading, try the next one.
            j += 1

    ### Build dataset (make sure that there are no None values in the
    ### timeseries measurements).
    self._img_fns = []
    self._surface_temp = []
    self._target_temp = []
    self._timestamps = []
    self._radiation = []

    # This is coarse time information that one may provide as additional
    # information. We encode the (normalized) month and daytime
    # information, as this information may be quite helpful when judging
    # temperature values.
    # Though, it might also tempt the regression system to ignore all
    # other information and solely predict based on this information
    # (as a strong local minimum).
    self._month = []
    self._daytime = []

    # Hoist the array views out of the loop below.
    radiation_vals = net_radiation.values
    surface_vals = surface_temp.values
    target_vals = target_temp.values
    target_times = target_temp["time"].values
    image_names = image_fns.values

    assert np.all(~np.isnan(radiation_vals))
    assert np.all(~np.isnan(surface_vals))

    for i, j in measurement_pairs:
        # Readings without a target temperature cannot be used as samples.
        if np.isnan(target_vals[i, 0]):
            continue

        self._target_temp.append(target_vals[i, 0])
        self._surface_temp.append(surface_vals[i, 0])
        self._radiation.append(radiation_vals[i, 0])

        self._timestamps.append(target_times[i])
        ts = pd.to_datetime(self._timestamps[-1])
        self._month.append(ts.month)
        # Time of day expressed as minutes since midnight.
        self._daytime.append(ts.hour * 60 + ts.minute)

        self._img_fns.append(str(image_names[0, j]))

    self._target_temp = np.array(self._target_temp, dtype=np.float32)
    self._surface_temp = np.array(self._surface_temp, dtype=np.float32)
    self._radiation = np.array(self._radiation, dtype=np.float32)
    self._month = np.array(self._month, dtype=np.float32)
    self._daytime = np.array(self._daytime, dtype=np.float32)

    # Normalize regression values. The statistics are kept on the instance
    # so that normalized values can be mapped back later.
    self.target_temp_mean = self._target_temp.mean()
    self.target_temp_std = self._target_temp.std()

    self.surface_temp_mean = self._surface_temp.mean()
    self.surface_temp_std = self._surface_temp.std()

    self.radiation_mean = self._radiation.mean()
    self.radiation_std = self._radiation.std()

    self._target_temp = (self._target_temp -
                         self.target_temp_mean) / self.target_temp_std
    self._surface_temp = (self._surface_temp -
                          self.surface_temp_mean) / self.surface_temp_std
    self._radiation = (self._radiation -
                       self.radiation_mean) / self.radiation_std

    self._month = (self._month - self._month.mean()) / self._month.std()
    # Center the daytime values themselves (the previous code subtracted
    # the daytime mean from the already normalized month values).
    self._daytime = (self._daytime -
                     self._daytime.mean()) / self._daytime.std()

    print("dataset contains %d samples." % len(self._img_fns))