def main(uri: str, filename: str): """Import Argo Profiles :param str uri: Database URI :param str filename: Argo NetCDF Filename, or directory of files """ data.observational.init_db(uri, echo=False) data.observational.create_tables() session = data.observational.db.session if os.path.isdir(filename): filenames = sorted(glob.glob(os.path.join(filename, "*.nc"))) else: filenames = [filename] for fname in filenames: print(fname) with xr.open_dataset(fname) as ds: times = pd.to_datetime(ds.JULD.values) for f in META_FIELDS: META_FIELDS[f] = ds[f].values.astype(str) for prof in ds.N_PROF.values: plat_number = ds.PLATFORM_NUMBER.values.astype(str)[prof] unique_id = f"argo_{plat_number}" # Grab the platform from the db base on the unique id platform = (session.query(Platform).filter( Platform.unique_id == unique_id, Platform.type == Platform.Type.argo, ).first()) if platform is None: # ... or make a new platform platform = Platform(type=Platform.Type.argo, unique_id=unique_id) attrs = {} for f in META_FIELDS: attrs[ds[f].long_name] = META_FIELDS[f][prof].strip() platform.attrs = attrs session.add(platform) # Make a new Station station = Station( time=times[prof], latitude=ds.LATITUDE.values[prof], longitude=ds.LONGITUDE.values[prof], ) platform.stations.append(station) # We need to commit the station here so that it'll have an id session.commit() depth = seawater.dpth(ds.PRES[prof].dropna("N_LEVELS").values, ds.LATITUDE.values[prof]) samples = [] for variable in VARIABLES: # First check our local cache for the DataType object, if # that comes up empty, check the db, and failing that, # create a new one from the variable's attributes if variable not in datatype_map: dt = DataType.query.get(ds[variable].standard_name) if dt is None: dt = DataType( key=ds[variable].standard_name, name=ds[variable].long_name, unit=ds[variable].units, ) data.observational.db.session.add(dt) # Commit the DataType right away. This might lead # to a few extra commits on the first import, but # reduces overall complexity in having to # 'remember' if we added a new one later. data.observational.db.session.commit() datatype_map[variable] = dt else: dt = datatype_map[variable] values = ds[variable][prof].dropna("N_LEVELS").values # Using station_id and datatype_key here instead of the # actual objects so that we can use bulk_save_objects--this # is much faster, but it doesn't follow any relationships. samples = [ Sample( depth=pair[0], datatype_key=dt.key, value=pair[1], station_id=station.id, ) for pair in zip(depth, values) ] data.observational.db.session.bulk_save_objects(samples) session.commit()
def main(uri: str, filename: str): """Import Glider NetCDF :param str uri: Database URI :param str filename: Glider Filename, or directory of NetCDF files """ data.observational.init_db(uri, echo=False) data.observational.create_tables() if os.path.isdir(filename): filenames = sorted(glob.glob(os.path.join(filename, "*.nc"))) else: filenames = [filename] datatype_map = {} for fname in filenames: print(fname) with xr.open_dataset(fname) as ds: variables = [v for v in VARIABLES if v in ds.variables] df = ds[['TIME', 'LATITUDE', 'LONGITUDE', 'PRES', *variables]].to_dataframe().reset_index().dropna() df['DEPTH'] = seawater.dpth(df.PRES, df.LATITUDE) for variable in variables: if variable not in datatype_map: dt = DataType.query.get(ds[variable].standard_name) if dt is None: dt = DataType(key=ds[variable].standard_name, name=ds[variable].long_name, unit=ds[variable].units) data.observational.db.session.add(dt) datatype_map[variable] = dt data.observational.db.session.commit() p = Platform(type=Platform.Type.glider, unique_id=f"glider_{ds.deployment_label}") attrs = { 'Glider Platform': ds.platform_code, 'WMO': ds.wmo_platform_code, 'Deployment': ds.deployment_label, 'Institution': ds.institution, 'Contact': ds.contact, } p.attrs = attrs data.observational.db.session.add(p) data.observational.db.session.commit() stations = [ Station( platform_id=p.id, time=row.TIME, latitude=row.LATITUDE, longitude=row.LONGITUDE, ) for idx, row in df.iterrows() ] # Using return_defaults=True here so that the stations will get # updated with id's. It's slower, but it means that we can just # put all the station ids into a pandas series to use when # constructing the samples. data.observational.db.session.bulk_save_objects( stations, return_defaults=True) df["STATION_ID"] = [s.id for s in stations] samples = [[ Sample(station_id=row.STATION_ID, depth=row.DEPTH, value=row[variable], datatype_key=datatype_map[variable].key) for variable in variables ] for idx, row in df.iterrows()] data.observational.db.session.bulk_save_objects( [item for sublist in samples for item in sublist]) data.observational.db.session.commit() data.observational.db.session.commit()
def main(uri: str, filename: str): """Import CONCEPTS drifter NetCDF :param str uri: Database URI :param str filename: Drifter Filename, or directory of NetCDF files """ data.observational.init_db(uri, echo=False) data.observational.create_tables() if os.path.isdir(filename): filenames = sorted(glob.glob(os.path.join(filename, "*.nc"))) else: filenames = [filename] for fname in filenames: print(fname) with xr.open_dataset(fname) as ds: df = ds.to_dataframe().drop(['wmo', 'deployment', 'imei'], axis=1) columns = list(filter(lambda c: c in DATATYPE_MAPPING, df.columns)) dt_map = {} for c in columns: # First check our local cache for the DataType object, if # that comes up empty, check the db, and failing that, # create a new one. if c not in dt_map: dt = DataType.query.get(DATATYPE_MAPPING[c][0]) if dt is None: dt = DataType(key=DATATYPE_MAPPING[c][0], name=DATATYPE_MAPPING[c][1], unit=DATATYPE_MAPPING[c][2]) data.observational.db.session.add(dt) dt_map[c] = dt # Commit to make sure all the variables are in the db so we don't # get any foreign key errors data.observational.db.session.commit() p = Platform(type=Platform.Type.drifter) attrs = dict(ds.attrs) attrs['wmo'] = ds.wmo.values[0] attrs['deployment'] = ds.deployment.values[0] attrs['imei'] = ds.imei.values[0] p.attrs = attrs data.observational.db.session.add(p) data.observational.db.session.commit() samples = [] for index, row in df.iterrows(): time = index[0] lat = row['latitude'] lon = row['longitude'] station = Station(time=time, latitude=lat, longitude=lon, platform_id=p.id) data.observational.db.session.bulk_save_objects( [station], return_defaults=True) for c in columns: value = row[c] if isinstance(value, pd.Timestamp): value = value.value / 10**9 if np.isfinite(value): samples.append( Sample(depth=0, datatype_key=DATATYPE_MAPPING[c][0], value=value, station_id=station.id)) # Commit every 1000 samples, that's a decent balance between # locking the db for too long and performance if len(samples) > 1000: data.observational.db.session.bulk_save_objects(samples) data.observational.db.session.commit() samples = [] # If there are any samples that haven't been committed yet, do so # now. if samples: data.observational.db.session.bulk_save_objects(samples) data.observational.db.session.commit() samples = [] data.observational.db.session.commit()
def main(uri: str, filename: str): """Import Seal Profiles :param str uri: Database URI :param str filename: Seal NetCDF Filename, or directory of files """ data.observational.init_db(uri, echo=False) data.observational.create_tables() if os.path.isdir(filename): filenames = sorted(glob.glob(os.path.join(filename, "*.nc"))) else: filenames = [filename] for fname in filenames: print(fname) # We're only loading Temperature and Salinity from these files, so # we'll just make sure the DataTypes are in the db now. if DataType.query.get("sea_water_temperature") is None: dt = DataType( key="sea_water_temperature", name="Water Temperature", unit="degree_Celsius", ) data.observational.db.session.add(dt) if DataType.query.get("sea_water_temperature") is None: dt = DataType(key="sea_water_salinity", name="Water Salinity", unit="PSU") data.observational.db.session.add(dt) data.observational.db.session.commit() with xr.open_dataset(fname) as ds: ds["TIME"] = ds.JULD.to_index().to_datetimeindex() ds["TIME"] = ds.TIME.swap_dims({"TIME": "N_PROF"}) depth = seawater.dpth( ds.PRES_ADJUSTED, np.tile(ds.LATITUDE, (ds.PRES.shape[1], 1)).transpose(), ) ds["DEPTH"] = (["N_PROF", "N_LEVELS"], depth) # This is a single platform, so we can construct it here. p = Platform(type=Platform.Type.animal, unique_id=ds.reference_file_name) p.attrs = { "Principle Investigator": ds.pi_name, "Platform Code": ds.platform_code, "Species": ds.species, } data.observational.db.session.add(p) data.observational.db.session.commit() # Generate Stations df = ds[["LATITUDE", "LONGITUDE", "TIME"]].to_dataframe() stations = [ Station( platform_id=p.id, latitude=row.LATITUDE, longitude=row.LONGITUDE, time=row.TIME, ) for idx, row in df.iterrows() ] # Using return_defaults=True here so that the stations will get # updated with id's. It's slower, but it means that we can just # put all the station ids into a pandas series to use when # constructing the samples. data.observational.db.session.bulk_save_objects( stations, return_defaults=True) df["STATION_ID"] = [s.id for s in stations] # Generate Samples df_samp = (ds[["TEMP_ADJUSTED", "PSAL_ADJUSTED", "DEPTH"]].to_dataframe().reorder_levels( ["N_PROF", "N_LEVELS"])) samples = [[ Sample( station_id=df.STATION_ID[idx[0]], datatype_key="sea_water_temperature", value=row.TEMP_ADJUSTED, depth=row.DEPTH, ), Sample( station_id=df.STATION_ID[idx[0]], datatype_key="sea_water_salinity", value=row.PSAL_ADJUSTED, depth=row.DEPTH, ), ] for idx, row in df_samp.iterrows()] samples = [item for sublist in samples for item in sublist] samples = [s for s in samples if not pd.isna(s.value)] data.observational.db.session.bulk_save_objects(samples) data.observational.db.session.commit()
def main(uri: str, filename: str): """Import NAFC CTD :param str uri: Database URI :param str filename: NetCDF file, or directory of files """ data.observational.init_db(uri, echo=False) data.observational.create_tables() datatype_map = {} if os.path.isdir(filename): filenames = sorted(glob.glob(os.path.join(filename, "*.nc"))) else: filenames = [filename] for fname in filenames: print(fname) with xr.open_dataset(fname) as ds: if len(datatype_map) == 0: # Generate the DataTypes; only consider variables that have depth for var in filter( lambda x, dataset=ds: 'level' in dataset[x].coords, [d for d in ds.data_vars]): dt = DataType.query.get(ds[var].standard_name) if dt is None: dt = DataType(key=ds[var].standard_name, name=ds[var].long_name, unit=ds[var].units) datatype_map[var] = dt data.observational.db.session.add_all(datatype_map.values()) # Query or generate the platform # The files I worked off of were not finalized -- in this case the # trip id also included the cast number, so I strip off the last 3 # digits. unique_id = f"nafc_ctd_{ds.trip_id[:-3]}" p = Platform.query.filter( Platform.unique_id == unique_id).one_or_none() if p is None: p = Platform(type=Platform.Type.mission, unique_id=unique_id) p.attrs = { 'Institution': ds.institution, 'Trip ID': ds.trip_id[:-3], 'Ship Name': ds.shipname, } data.observational.db.session.add(p) # Generate the station s = Station( latitude=ds.latitude.values[0], longitude=ds.longitude.values[0], time=pd.Timestamp(ds.time.values[0]), ) p.stations.append(s) data.observational.db.session.commit() ds['level'] = seawater.dpth(ds.level.values, ds.latitude[0].values) # Generate the samples for var, dt in datatype_map.items(): da = ds[var].dropna('level') samples = [ Sample(value=d.item(), depth=d.level.item(), datatype_key=dt.key, station_id=s.id) for d in da ] data.observational.db.session.bulk_save_objects(samples) data.observational.db.session.commit()
def main(uri: str, filename: str): """Import Seal Profiles :param str uri: Database URI :param str filename: CIOOS csv Filename, or directory of files """ data.observational.init_db(uri, echo=False) data.observational.create_tables() if os.path.isdir(filename): filenames = sorted(glob.glob(os.path.join(filename, "*.csv"))) else: filenames = [filename] for fname in filenames: print(fname) ds = pd.read_csv(fname) print(ds) # we'll just make sure the DataTypes are in the db now. if DataType.query.get('wind_speed') is None: dt = DataType(key='wind_speed', name='Wind Speed', unit='m s-1') data.observational.db.session.add(dt) data.observational.db.session.commit() # This is a single platform, so we can construct it here. p = Platform(type=Platform.Type.animal, unique_id=fname) p.attrs = { # 'Principle Investigator': ds.pi_name, # 'Platform Code': ds.platform_code, # 'Species': ds.species, } data.observational.db.session.add(p) data.observational.db.session.commit() #Generate Stations df = ds[['latitude', 'longitude', 'time']] stations = [ Station( platform_id=p.id, latitude=float(row.latitude), longitude=float(row.longitude), time=row.time, ) for idx, row in df.iterrows() ] # Using return_defaults=True here so that the stations will get # updated with id's. It's slower, but it means that we can just # put all the station ids into a pandas series to use when # constructing the samples. data.observational.db.session.bulk_save_objects(stations, return_defaults=True) df['station_name'] = [s.id for s in stations] # Generate Samples # df_samp = ds[ # ['TEMP_ADJUSTED', 'PSAL_ADJUSTED', 'DEPTH'] # ].to_dataframe().reorder_levels(['N_PROF', 'N_LEVELS']) # samples = [ [ Sample( station_id=df.STATION_ID[idx[0]], datatype_key='wind_speed', value=row.wind_spd_avg, depth=0, ), # Sample( # station_id=df.STATION_ID[idx[0]], # datatype_key='sea_water_salinity', # value=row.PSAL_ADJUSTED, # depth=row.DEPTH, # ) ] # for idx, row in df_samp.iterrows() ] samples = [item for sublist in samples for item in sublist] samples = [s for s in samples if not pd.isna(s.value)] data.observational.db.session.bulk_save_objects(samples) data.observational.db.session.commit()