def convert_ampds(input_path, output_filename, format='HDF'): """ Convert AMPds R2013 as seen on Dataverse. Download the files as CSVs and put them in the `input_path` folder for conversion. Download URL: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/MXB7VO Parameters: ----------- input_path: str The path of the directory where all the csv files are supposed to be stored output_filename: str The path of the h5 file where all the standardized data is supposed to go. The path should refer to a particular file and not just a random directory in order for this to work. format: str Defaults to HDF5 Example usage: -------------- convert('/AMPds/electricity', 'store.h5') """ check_directory_exists(input_path) files = [ f for f in listdir(input_path) if isfile(join(input_path, f)) and '.csv' in f and '.swp' not in f ] # Sorting Lexicographically files.sort() # Remove Whole Home and put it at top files.remove("WHE.csv") files.insert(0, "WHE.csv") assert isdir(input_path) store = get_datastore(output_filename, format, mode='w') for i, csv_file in enumerate(files): key = Key(building=1, meter=(i + 1)) print('Loading file #', (i + 1), ' : ', csv_file, '. Please wait...') df = pd.read_csv(join(input_path, csv_file)) # Due to fixed width, column names have spaces :( df.columns = [x.replace(" ", "") for x in df.columns] df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME], unit='s', utc=True) df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df = df.tz_convert(TIMEZONE) df.rename(columns=lambda x: columnNameMapping[x], inplace=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.apply(pd.to_numeric, errors='ignore') df = df.dropna() df = df.astype(np.float32) store.put(str(key), df) print("Done with file #", (i + 1)) store.close() metadata_path = join(_get_module_directory(), 'metadata') print('Processing metadata...') convert_yaml_to_hdf5(metadata_path, output_filename)
def _get_ac_type_map(ukdale_path):
    """First we need to convert the YAML metadata to HDF5
    so we can load the metadata into NILMTK to allow us to use
    NILMTK to find the ac_type for each channel.

    Parameters
    ----------
    ukdale_path : str

    Returns
    -------
    ac_type_map : dict.
        Keys are pairs of ints: (<house_instance>, <meter_instance>)
        Values are list of available power ac type for that meter.
    """
    hdf5_just_metadata = join(ukdale_path, 'metadata', 'ukdale_metadata.h5')
    convert_yaml_to_hdf5(join(ukdale_path, 'metadata'), hdf5_just_metadata)
    ukdale_dataset = DataSet(hdf5_just_metadata)
    ac_type_map = {}
    for building_i, building in iteritems(ukdale_dataset.buildings):
        elec = building.elec
        for meter in elec.meters + elec.disabled_meters:
            key = (building_i, meter.instance())
            ac_type_map[key] = meter.available_ac_types('power')
    ukdale_dataset.store.close()
    remove(hdf5_just_metadata)
    return ac_type_map
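# Hedged illustration (house/meter numbers and ac types below are invented):
# _get_ac_type_map() returns a dict keyed by (building_instance, meter_instance)
# whose values are the power ac types listed in the YAML metadata for that meter.
# convert_ukdale() then picks the first entry per channel.
example_ac_type_map = {
    (1, 1): ['active', 'apparent'],   # e.g. house 1 mains
    (1, 2): ['active'],               # e.g. house 1 submeter
}
assert example_ac_type_map[(1, 1)][0] == 'active'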
def convert_ukdale(ukdale_path, hdf_filename): """Converts the UK-DALE dataset to NILMTK HDF5 format. For more information about the UK-DALE dataset, and to download it, please see http://www.doc.ic.ac.uk/~dk3810/data/ Parameters ---------- ukdale_path : str The root path of the UK-DALE dataset. It is assumed that the YAML metadata is in 'ukdale_path/metadata'. hdf_filename : str The destination HDF5 filename (including path and suffix). """ ac_type_map = _get_ac_type_map(ukdale_path) def _ukdale_measurement_mapping_func(house_id, chan_id): ac_type = ac_type_map[(house_id, chan_id)][0] return [('power', ac_type)] # Convert 6-second data _convert(ukdale_path, hdf_filename, _ukdale_measurement_mapping_func, TZ) # Add metadata convert_yaml_to_hdf5(join(ukdale_path, 'metadata'), hdf_filename) # Convert 1-second data _convert_one_sec_data(ukdale_path, hdf_filename, ac_type_map) print("Done converting UK-DALE to HDF5!")
def convert_combed(combed_path, hdf_filename): """ Parameters ---------- combed_path : str The root path of the combed dataset. hdf_filename : str The destination HDF5 filename (including path and suffix). """ check_directory_exists(combed_path) # Open HDF5 file store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib') chan = 1 for building, meter_array in SUBMETER_PATHS.iteritems(): for meter in meter_array: key = Key(building=1, meter=chan) dfs = [] total = pd.DataFrame() for attribute in column_mapping.keys(): filename_attribute = join(combed_path, building, str(meter), "%s.csv" %attribute ) print(filename_attribute) dfs.append(pd.read_csv(filename_attribute, parse_dates = True, index_col = 0, header = True, names=[attribute])) total = pd.concat(dfs, axis = 1) total.rename(columns=lambda x: column_mapping[x], inplace=True) total.columns.set_names(LEVEL_NAMES, inplace=True) store.put(str(key), total, format='table') store.flush() chan = chan+ 1 convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), hdf_filename) print("Done converting COMBED to HDF5!")
def refresh_gjw_metadata(gjw_path, output_filename): """ Parameters ---------- gjw_path : str The root path of the gjw dataset. output_filename : str The destination filename (including path and suffix), will default if not specified directory and file structure nilm_gjw_data building<1> elec 4-POWER_REAL_FINE <date> Dump.csv 5-POWER_REACTIVE_STANDARD <date> Dump.csv ... ... building<n> HDF5 nilm_gjw_data.hdf5 metadata building1.yaml dataset.yaml meter_devices.yaml other files """ if gjw_path is None: gjw_path = home_dir check_directory_exists(gjw_path) os.chdir(gjw_path) gjw_path = os.getcwd() # sort out potential issue with slashes or backslashes if output_filename is None: output_filename =join(home_dir,'HDF5','nilm_gjw_data.hdf5') convert_yaml_to_hdf5(join(gjw_path, 'metadata'),output_filename) print("Done refreshing metadata")
def convert_combed(combed_path, output_filename, format='HDF'): """ Parameters ---------- combed_path : str The root path of the combed dataset. output_filename : str The destination HDF5 filename (including path and suffix). """ check_directory_exists(combed_path) # Open store store = get_datastore(output_filename, format, mode='w') for building_name, building_mapping in overall_dataset_mapping.iteritems(): for load_name, load_mapping in building_mapping.iteritems(): for load_mapping_path, meter_number in load_mapping.iteritems(): building_number = building_number_mapping[building_name] key = Key(building=building_number, meter=meter_number) dfs = [] for attribute in column_mapping.keys(): filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" %attribute) print(filename_attribute) dfs.append(pd.read_csv(filename_attribute, parse_dates=True, index_col=0, header=True, names=[attribute])) total = pd.concat(dfs, axis=1) total = total.tz_localize('UTC').tz_convert('Asia/Kolkata') total.rename(columns=lambda x: column_mapping[x], inplace=True) total.columns.set_names(LEVEL_NAMES, inplace=True) assert total.index.is_unique store.put(str(key), total) convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), output_filename) print("Done converting COMBED to HDF5!")
def convert_ukdale(ukdale_path, hdf_filename): """ Parameters ---------- ukdale_path : str The root path of the UK-DALE dataset. hdf_filename : str The destination HDF5 filename (including path and suffix). """ def _ukdale_measurement_mapping_func(house_id, chan_id): # TODO: This needs updating. It's wrong! ac_type = 'apparent' if chan_id <= 2 else 'active' return [('power', ac_type)] _convert(ukdale_path, hdf_filename, _ukdale_measurement_mapping_func, 'Europe/London') # Add metadata convert_yaml_to_hdf5(join(get_module_directory(), 'dataset_converters', 'ukdale', 'metadata'), hdf_filename) print("Done converting UK-DALE to HDF5!")
def convert_greend(greend_path, hdf_filename):
    """
    Parameters
    ----------
    greend_path : str
        The root path of the greend dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """
    store = pd.HDFStore(hdf_filename, "w", complevel=9, complib="zlib")
    houses = sorted(__get_houses(greend_path))
    print(houses)
    h = 1
    for house in houses:
        print("loading " + house)
        abs_house = join(greend_path, house)
        dates = [d for d in listdir(abs_house) if d.startswith("dataset")]
        house_data = []
        for date in dates:
            print("-----------------------", date)
            try:
                tmp_pandas = pd.read_csv(join(abs_house, date), na_values=["na"],
                                         error_bad_lines=False)
            except Exception:
                # A CParserError is raised for malformed files (irregular column number);
                # skip such files instead of silently reusing the previous dataframe.
                continue
            if "timestamp" not in tmp_pandas.columns:
                tmp_pandas["timestamp"] = tmp_pandas.index
            # coerce the timestamp column to numeric (convert_objects is deprecated)
            tmp_pandas.index = pd.to_numeric(tmp_pandas["timestamp"], errors="coerce").values
            tmp_pandas = tmp_pandas.drop("timestamp", 1)
            tmp_pandas = tmp_pandas.astype("float32")
            tmp_pandas.index = pd.to_datetime(tmp_pandas.index, unit="s")
            tmp_pandas = tmp_pandas.tz_localize("UTC").tz_convert("CET")
            tmp_pandas = tmp_pandas.drop_duplicates()
            # tmp_pandas = tmp_pandas.sort_index()
            house_data.append(tmp_pandas)
        overall_df = pd.concat(house_data)
        overall_df = overall_df.drop_duplicates()
        overall_df = overall_df.sort_index()
        m = 1
        for column in overall_df.columns:
            print("meter" + str(m) + ": " + column)
            key = Key(building=h, meter=m)
            print("Putting into store...")
            store.put(str(key), overall_df[column], format="table")
            m += 1
        print("Flushing store...")
        store.flush()
        h += 1
    store.close()
    # needs to be edited
    convert_yaml_to_hdf5("/path/to/metadata", hdf_filename)
def convert_combed(combed_path, output_filename, format='HDF'): """ Parameters ---------- combed_path : str The root path of the combed dataset. output_filename : str The destination HDF5 filename (including path and suffix). """ check_directory_exists(combed_path) # Open store store = get_datastore(output_filename, format, mode='w') any_file_converted = False for building_name, building_mapping in iteritems(overall_dataset_mapping): for load_name, load_mapping in iteritems(building_mapping): for load_mapping_path, meter_number in iteritems(load_mapping): building_number = building_number_mapping[building_name] key = Key(building=building_number, meter=meter_number) dfs = [] for attribute in column_mapping.keys(): filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" %attribute) if not os.path.isfile(filename_attribute): # File not found directly in the combed_path provided # Try adding 'iiitd' to it filename_attribute = join(combed_path, 'iiitd', building_name, load_name, load_mapping_path, "%s.csv" %attribute) if os.path.isfile(filename_attribute): exists = True print(filename_attribute) df = pd.read_csv(filename_attribute, names=["timestamp", attribute]) df.index = pd.to_datetime(df["timestamp"], unit='ms') df = df.drop("timestamp", 1) dfs.append(df) else: exists = False if exists: total = pd.concat(dfs, axis=1) total = total.tz_localize('UTC').tz_convert('Asia/Kolkata') total.columns = pd.MultiIndex.from_tuples([column_mapping[x] for x in total.columns]) total.columns.set_names(LEVEL_NAMES, inplace=True) assert total.index.is_unique store.put(str(key), total) any_file_converted = True if not any_file_converted: raise RuntimeError('No files converted, did you specify the correct path?') convert_yaml_to_hdf5( join(get_module_directory(), 'dataset_converters', 'combed', 'metadata'), output_filename ) print("Done converting COMBED to HDF5!")
def convert_combed(combed_path, output_filename, format='HDF'): """ Parameters ---------- combed_path : str The root path of the combed dataset. output_filename : str The destination HDF5 filename (including path and suffix). """ check_directory_exists(combed_path) # Open store store = get_datastore(output_filename, format, mode='w') any_file_converted = False for building_name, building_mapping in iteritems(overall_dataset_mapping): for load_name, load_mapping in iteritems(building_mapping): for load_mapping_path, meter_number in iteritems(load_mapping): building_number = building_number_mapping[building_name] key = Key(building=building_number, meter=meter_number) dfs = [] for attribute in column_mapping.keys(): filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" %attribute) if not os.path.isfile(filename_attribute): # File not found directly in the combed_path provided # Try adding 'iiitd' to it filename_attribute = join(combed_path, 'iiitd', building_name, load_name, load_mapping_path, "%s.csv" %attribute) if os.path.isfile(filename_attribute): exists = True print(filename_attribute) df = pd.read_csv(filename_attribute, names=["timestamp", attribute]) df.index = pd.to_datetime(df["timestamp"], unit='ms') df = df.drop("timestamp", 1) dfs.append(df) else: exists = False if exists: total = pd.concat(dfs, axis=1) total = total.tz_localize('UTC').tz_convert('Asia/Kolkata') total.columns = pd.MultiIndex.from_tuples([column_mapping[x] for x in total.columns]) total.columns.set_names(LEVEL_NAMES, inplace=True) assert total.index.is_unique store.put(str(key), total) any_file_converted = True if not any_file_converted: raise RuntimeError('No files converted, did you specify the correct path?') convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), output_filename) print("Done converting COMBED to HDF5!")
def convert_ampds(input_path, output_filename, format='HDF'): """ Convert AMPds R2013 as seen on Dataverse. Download the files as CSVs and put them in the `input_path` folder for conversion. Download URL: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/MXB7VO Parameters: ----------- input_path: str The path of the directory where all the csv files are supposed to be stored output_filename: str The path of the h5 file where all the standardized data is supposed to go. The path should refer to a particular file and not just a random directory in order for this to work. format: str Defaults to HDF5 Example usage: -------------- convert('/AMPds/electricity', 'store.h5') """ check_directory_exists(input_path) files = [f for f in listdir(input_path) if isfile(join(input_path, f)) and '.csv' in f and '.swp' not in f] # Sorting Lexicographically files.sort() # Remove Whole Home and put it at top files.remove("WHE.csv") files.insert(0, "WHE.csv") assert isdir(input_path) store = get_datastore(output_filename, format, mode='w') for i, csv_file in enumerate(files): key = Key(building=1, meter=(i + 1)) print('Loading file #', (i + 1), ' : ', csv_file, '. Please wait...') df = pd.read_csv(join(input_path, csv_file)) # Due to fixed width, column names have spaces :( df.columns = [x.replace(" ", "") for x in df.columns] df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME], unit='s', utc=True) df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df = df.tz_convert(TIMEZONE) df.rename(columns=lambda x: columnNameMapping[x], inplace=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.apply(pd.to_numeric, errors='ignore') df = df.dropna() df = df.astype(np.float32) store.put(str(key), df) print("Done with file #", (i + 1)) store.close() metadata_path = join(get_module_directory(), 'dataset_converters', 'ampds', 'metadata') print('Processing metadata...') convert_yaml_to_hdf5(metadata_path, output_filename)
def convert_ukdale(ukdale_path, output_filename, format='HDF', drop_duplicates=True): """Converts the UK-DALE dataset to NILMTK HDF5 format. For more information about the UK-DALE dataset, and to download it, please see http://www.doc.ic.ac.uk/~dk3810/data/ Parameters ---------- ukdale_path : str The root path of the UK-DALE dataset. It is assumed that the YAML metadata is in 'ukdale_path/metadata'. output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' drop_duplicates : bool Remove entries with duplicated timestamp (keeps the first value) Defaults to True. """ ac_type_map = _get_ac_type_map(ukdale_path) def _ukdale_measurement_mapping_func(house_id, chan_id): ac_type = ac_type_map[(house_id, chan_id)][0] return [('power', ac_type)] # Open DataStore store = get_datastore(output_filename, format, mode='w') # Convert 6-second data _convert(ukdale_path, store, _ukdale_measurement_mapping_func, TZ, sort_index=False, drop_duplicates=drop_duplicates) store.close() # Add metadata if format == 'HDF': convert_yaml_to_hdf5(join(ukdale_path, 'metadata'), output_filename) # Convert 1-second data store.open(mode='a') _convert_one_sec_data(ukdale_path, store, ac_type_map, drop_duplicates) store.close() print("Done converting UK-DALE to HDF5!")
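# Minimal usage sketch for the converter above. Paths are hypothetical and the import
# assumes the usual nilmtk package layout:
from nilmtk.dataset_converters import convert_ukdale
from nilmtk import DataSet

convert_ukdale('/data/ukdale', '/data/ukdale.h5')   # 6 s data, metadata, then 1 s data
ukdale = DataSet('/data/ukdale.h5')
print(ukdale.buildings[1].elec.mains())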
def convert_ampds(input_path, output_filename, format="HDF"): """ Parameters: ----------- input_path: str The path of the directory where all the csv files are supposed to be stored output_filename: str The path of the h5 file where all the standardized data is supposed to go. The path should refer to a particular file and not just a random directory in order for this to work. format: str Defaults to HDF5 Example usage: -------------- convert('/AMPds/electricity', 'store.h5') """ check_directory_exists(input_path) files = [f for f in listdir(input_path) if isfile(join(input_path, f)) and ".csv" in f and ".swp" not in f] # Sorting Lexicographically files.sort() # Remove Whole Home and put it at top files.remove("WHE.csv") files.insert(0, "WHE.csv") assert isdir(input_path) store = get_datastore(output_filename, format, mode="w") for i, csv_file in enumerate(files): key = Key(building=1, meter=(i + 1)) print("Loading file #", (i + 1), " : ", csv_file, ". Please wait...") df = pd.read_csv(join(input_path, csv_file)) # Due to fixed width, column names have spaces :( df.columns = [x.replace(" ", "") for x in df.columns] df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME], unit="s", utc=True) df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df = df.tz_localize("GMT").tz_convert(TIMEZONE) df.rename(columns=lambda x: columnNameMapping[x], inplace=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.convert_objects(convert_numeric=True) df = df.dropna() df = df.astype(np.float32) store.put(str(key), df) print("Done with file #", (i + 1)) store.close() metadata_path = join(_get_module_directory(), "metadata") print("Processing metadata...") convert_yaml_to_hdf5(metadata_path, output_filename)
def convert_combed(combed_path, output_filename, format='HDF'): """ Parameters ---------- combed_path : str The root path of the combed dataset. output_filename : str The destination HDF5 filename (including path and suffix). """ check_directory_exists(combed_path) # Open store store = get_datastore(output_filename, format, mode='w') for building_name, building_mapping in iteritems(overall_dataset_mapping): for load_name, load_mapping in iteritems(building_mapping): for load_mapping_path, meter_number in iteritems(load_mapping): building_number = building_number_mapping[building_name] key = Key(building=building_number, meter=meter_number) dfs = [] for attribute in column_mapping.keys(): filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" % attribute) if os.path.isfile(filename_attribute): exists = True print(filename_attribute) df = pd.read_csv(filename_attribute, header=True, names=["timestamp", attribute]) df.index = pd.to_datetime(df["timestamp"], unit='ms') df = df.drop("timestamp", 1) dfs.append(df) else: exists = False if exists: total = pd.concat(dfs, axis=1) total = total.tz_localize('UTC').tz_convert('Asia/Kolkata') total.rename(columns=lambda x: column_mapping[x], inplace=True) total.columns.set_names(LEVEL_NAMES, inplace=True) assert total.index.is_unique store.put(str(key), total) convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), output_filename) print("Done converting COMBED to HDF5!")
def convert_greend(greend_path, hdf_filename): """ Parameters ---------- greend_path : str The root path of the greend dataset. hdf_filename : str The destination HDF5 filename (including path and suffix). """ store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib') houses = sorted(__get_houses(greend_path)) print(houses) h = 1 for house in houses: print('loading '+house+"'s house...") abs_house = join(greend_path, house) dates = [d for d in listdir(abs_house) if d.startswith('dataset')] house_data = pd.DataFrame() for date in dates: print('-----------------------',date) tmp_pandas = pd.DataFrame.from_csv(join(abs_house, date)) tmp_pandas = tmp_pandas[tmp_pandas.index != 'timestamp'] tmp_pandas = tmp_pandas.sort_index() c = 0 tmp_pandas.index = [__timestamp(t) for t in tmp_pandas.index] house_data = house_data.append(tmp_pandas) #for testing metadata files: #break m = 1 for meter in house_data: print("meter" + str(m)+': ') key = Key(building = h, meter=m) print("Putting into store...") store.put(str(key), house_data[meter], format = 'table') m += 1 print('Flushing store...') store.flush() h += 1 store.close() #needs to be edited convert_yaml_to_hdf5('/path/to/metadata', hdf_filename)
def convert_iawe(iawe_path, output_filename, format="HDF"): """ Parameters ---------- iawe_path : str The root path of the iawe dataset. output_filename : str The destination filename (including path and suffix). """ check_directory_exists(iawe_path) idx = pd.DatetimeIndex(start=START_DATETIME, end=END_DATETIME, freq=FREQ) idx = idx.tz_localize('GMT').tz_convert(TIMEZONE) # Open data store store = get_datastore(output_filename, format, mode='w') electricity_path = join(iawe_path, "electricity") # Mains data for chan in range(1, 12): key = Key(building=1, meter=chan) filename = join(electricity_path, "%d.csv" % chan) print('Loading ', chan) df = pd.read_csv(filename, dtype=np.float64, na_values='\\N') df.drop_duplicates(subset=["timestamp"], inplace=True) df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True) df = df.tz_convert(TIMEZONE) df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df.columns = pd.MultiIndex.from_tuples( [column_mapping[x] for x in df.columns], names=LEVEL_NAMES) df = df.apply(pd.to_numeric, errors='ignore') df = df.dropna() df = df.astype(np.float32) df = df.sort_index() df = df.resample("1T").mean() df = reindex_fill_na(df, idx) assert df.isnull().sum().sum() == 0 store.put(str(key), df) store.close() metadata_dir = join(get_module_directory(), 'dataset_converters', 'iawe', 'metadata') convert_yaml_to_hdf5(metadata_dir, output_filename) print("Done converting iAWE to HDF5!")
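# Hedged usage sketch for the iAWE converter above (path and filename are hypothetical).
# Channels 1-11 are read, resampled to 1-minute means and reindexed onto the fixed
# START_DATETIME..END_DATETIME range before being written to the store.
convert_iawe('/data/iawe', '/data/iawe.h5')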
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('inpath', help='input directory (ANTgen output)', nargs='?', default='../output')
    parser.add_argument('outfile', help='output file (HDF5 file)', nargs='?', default='../output/ANTgen.h5')
    args = parser.parse_args()

    if not os.path.exists('metadata') or not os.path.isfile('metadata/building1.yaml'):
        print("No metadata found. Please run 'generate_metadata.py' before using this tool...")
        exit(1)

    print("Converting ANTgen output from '{}' to file '{}'".format(args.inpath, args.outfile))

    with open('metadata/building1.yaml', 'r') as f:
        yaml_dict = yaml.load(f, Loader=yaml.FullLoader)

    channel_list = ['total']  # pre-populate with aggregate data (total.csv)
    for app in yaml_dict['appliances']:
        channel_list.append(app['original_name'])

    store = get_datastore(args.outfile, 'HDF', mode='w')

    for i, app_name in enumerate(channel_list):
        print("Adding virtual meter ID {:02d}: {}".format(1 + i, app_name))
        key = Key(building=1, meter=(i + 1))
        csvfile = os.path.join(args.inpath, str(app_name) + '.csv')
        try:
            df = pd.read_csv(csvfile, sep=';', encoding='utf-8', index_col=0)
            df.columns = pd.MultiIndex.from_tuples([('power', 'active') for x in df.columns],
                                                   names=LEVEL_NAMES)
            df.index = pd.to_datetime(df.index)
            tz_naive = df.index
            tz_aware = tz_naive.tz_localize(tz='Europe/Vienna', ambiguous=True,
                                            nonexistent=pd.Timedelta('1H'))
            df.index = tz_aware
            df = df.tz_convert('Europe/Vienna')
            store.put(str(key), df)
        except FileNotFoundError:
            print("Input file '{}' not found - your HDF5 file will be incomplete!".format(csvfile))
            continue

    # Close the datastore before convert_yaml_to_hdf5() reopens the same HDF5 file.
    store.close()

    print('Adding metadata...')
    convert_yaml_to_hdf5('metadata/', args.outfile)
def convert_iawe(iawe_path, output_filename, format="HDF"): """ Parameters ---------- iawe_path : str The root path of the iawe dataset. output_filename : str The destination filename (including path and suffix). """ check_directory_exists(iawe_path) idx = pd.DatetimeIndex(start=START_DATETIME, end=END_DATETIME, freq=FREQ) idx = idx.tz_localize('GMT').tz_convert(TIMEZONE) # Open data store store = get_datastore(output_filename, format, mode='w') electricity_path = join(iawe_path, "electricity") # Mains data for chan in range(1, 12): key = Key(building=1, meter=chan) filename = join(electricity_path, "%d.csv" % chan) print('Loading ', chan) df = pd.read_csv(filename, dtype=np.float64, na_values='\\N') df.drop_duplicates(subset=["timestamp"], inplace=True) df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True) df = df.tz_convert(TIMEZONE) df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df.rename(columns=lambda x: column_mapping[x], inplace=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.apply(pd.to_numeric, errors='ignore') df = df.dropna() df = df.astype(np.float32) df = df.sort_index() df = df.resample("1T").mean() df = reindex_fill_na(df, idx) assert df.isnull().sum().sum() == 0 store.put(str(key), df) store.close() metadata_dir = join(get_module_directory(), 'dataset_converters', 'iawe', 'metadata') convert_yaml_to_hdf5(metadata_dir, output_filename) print("Done converting iAWE to HDF5!")
def convert_caxe(file_path): ''' Parameters ------------ Takes input csv_file name to be tested as string. Data columns of the csv should contain following the following values in columns: timestamp,reactive_power,apparent_power,current,frequency,voltage,active_power) Converts it into hdf5 Format and save as test.h5. ''' df = pd.read_csv(f'{file_path}', names=['timestamp', 'R', 'A', 'C', 'F', 'V', 'T']) column_mapping = { 'F': ('frequency', ""), 'V': ('voltage', ""), 'T': ('power', 'active'), 'C': ('current', ''), 'R': ('power', 'reactive'), 'A': ('power', 'apparent'), } output_filename = 'test.h5' # Open data store store = get_datastore(output_filename, format='HDF', mode='w') key = Key(building=1, meter=1) print('Loading ', 1) df.index = pd.to_datetime(df.timestamp.values) df = df.tz_convert( TIMEZONE) # if error occurs use tz_localize for tz naive timestamps df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df.index = pd.to_datetime(df.index.values) df.columns = pd.MultiIndex.from_tuples( [column_mapping[x] for x in df.columns], names=LEVEL_NAMES) df = df.apply(pd.to_numeric, errors='ignore') df = df.dropna() df = df.astype(np.float32) df = df.sort_index() df = df.resample("1T").mean() assert df.isnull().sum().sum() == 0 store.put(str(key), df) store.close() convert_yaml_to_hdf5('./metadata', output_filename) print("Done converting test data to HDF5!")
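# Hedged example for convert_caxe(): a hypothetical input row in the expected column
# order (timestamp, reactive_power, apparent_power, current, frequency, voltage,
# active_power), followed by the call. The output filename 'test.h5' is hard-coded
# in the function above.
#
#   2021-06-01 00:00:00,10.2,240.5,1.04,50.0,231.7,238.9
#
convert_caxe('caxe_readings.csv')   # 'caxe_readings.csv' is an assumed filename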
def convert_combed(combed_path, hdf_filename): """ Parameters ---------- combed_path : str The root path of the combed dataset. hdf_filename : str The destination HDF5 filename (including path and suffix). """ assert isdir(combed_path) # Open HDF5 file store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib') chan = 1 for building, meter_array in SUBMETER_PATHS.iteritems(): for meter in meter_array: key = Key(building=1, meter=chan) dfs = [] total = pd.DataFrame() for attribute in column_mapping.keys(): filename_attribute = join(combed_path, building, str(meter), "%s.csv" % attribute) print(filename_attribute) dfs.append( pd.read_csv(filename_attribute, parse_dates=True, index_col=0, header=True, names=[attribute])) total = pd.concat(dfs, axis=1) total.rename(columns=lambda x: column_mapping[x], inplace=True) total.columns.set_names(LEVEL_NAMES, inplace=True) store.put(str(key), total, format='table') store.flush() chan = chan + 1 convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), hdf_filename) print("Done converting COMBED to HDF5!")
def convert_iawe(iawe_path, hdf_filename): """ Parameters ---------- iawe_path : str The root path of the iawe dataset. hdf_filename : str The destination HDF5 filename (including path and suffix). """ check_directory_exists(iawe_path) # Open HDF5 file store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib') electricity_path = join(iawe_path, "electricity") # Mains data for chan in range(1, 13): key = Key(building=1, meter=chan) filename = join(electricity_path, "%d.csv" % chan) print('Loading ', chan) df = pd.read_csv(filename) df.index = pd.to_datetime( (df.timestamp.values * 1E9).astype(int), utc=True) df = df.tz_convert('Asia/Kolkata') df = df.drop('timestamp', 1) df.rename(columns=lambda x: column_mapping[x], inplace=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.convert_objects(convert_numeric=True) df = df.dropna() df = df.astype(np.float32) df = df.sort_index() store.put(str(key), df, format='table') store.flush() store.close() convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), hdf_filename) print("Done converting iAWE to HDF5!")
def convert_iawe(iawe_path, hdf_filename): """ Parameters ---------- iawe_path : str The root path of the iawe dataset. hdf_filename : str The destination HDF5 filename (including path and suffix). """ assert isdir(iawe_path) # Open HDF5 file store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib') electricity_path = join(iawe_path, "electricity") # Mains data for chan in range(1, 13): key = Key(building=1, meter=chan) filename = join(electricity_path, "%d.csv" % chan) print('Loading ', chan) df = pd.read_csv(filename) df.index = pd.to_datetime((df.timestamp.values * 1E9).astype(int), utc=True) df = df.tz_convert('Asia/Kolkata') df = df.drop('timestamp', 1) df.rename(columns=lambda x: column_mapping[x], inplace=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.convert_objects(convert_numeric=True) df = df.dropna() df = df.astype(np.float32) df = df.sort_index() store.put(str(key), df, format='table') store.flush() store.close() convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), hdf_filename) print("Done converting iAWE to HDF5!")
def convert_ukdale(ukdale_path, output_filename, format='HDF'): """Converts the UK-DALE dataset to NILMTK HDF5 format. For more information about the UK-DALE dataset, and to download it, please see http://www.doc.ic.ac.uk/~dk3810/data/ Parameters ---------- ukdale_path : str The root path of the UK-DALE dataset. It is assumed that the YAML metadata is in 'ukdale_path/metadata'. output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' """ ac_type_map = _get_ac_type_map(ukdale_path) def _ukdale_measurement_mapping_func(house_id, chan_id): ac_type = ac_type_map[(house_id, chan_id)][0] return [('power', ac_type)] # Open DataStore store = get_datastore(output_filename, format, mode='w') # Convert 6-second data _convert(ukdale_path, store, _ukdale_measurement_mapping_func, TZ, sort_index=False) store.close() # Add metadata if format == 'HDF': convert_yaml_to_hdf5(join(ukdale_path, 'metadata'), output_filename) # Convert 1-second data store.open(mode='a') _convert_one_sec_data(ukdale_path, store, ac_type_map) store.close() print("Done converting UK-DALE to HDF5!")
def convert_redd(redd_path, hdf_filename): """ Parameters ---------- redd_path : str The root path of the REDD low_freq dataset. hdf_filename : str The destination HDF5 filename (including path and suffix). """ assert isdir(redd_path) # Open HDF5 file store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib') # Iterate though all houses and channels houses = _find_all_houses(redd_path) for house_id in houses: print("Loading house", house_id, end="... ") stdout.flush() chans = _find_all_chans(redd_path, house_id) for chan_id in chans: print(chan_id, end=" ") stdout.flush() key = Key(building=house_id, meter=chan_id) ac_type = 'apparent' if chan_id <= 2 else 'active' df = _load_chan(redd_path, key, [('power', ac_type)]) store.put(str(key), df, format='table') store.flush() print() store.close() # Add metadata convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), hdf_filename) print("Done converting REDD to HDF5!")
def convert_iawe(iawe_path, output_filename, format="HDF"): """ Parameters ---------- iawe_path : str The root path of the iawe dataset. output_filename : str The destination filename (including path and suffix). """ check_directory_exists(iawe_path) # Open data store store = get_datastore(output_filename, format, mode='w') electricity_path = join(iawe_path, "electricity") # Mains data for chan in range(1, 13): key = Key(building=1, meter=chan) filename = join(electricity_path, "%d.csv" % chan) print('Loading ', chan) df = pd.read_csv(filename) df.drop_duplicates(subset=["timestamp"], inplace=True) df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True) df = df.tz_convert(TIMEZONE) df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df.rename(columns=lambda x: column_mapping[x], inplace=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.convert_objects(convert_numeric=True) df = df.dropna() df = df.astype(np.float32) df = df.sort_index() store.put(str(key), df) store.close() convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), output_filename) print("Done converting iAWE to HDF5!")
def convert_redd(redd_path, hdf_filename):
    """
    Parameters
    ----------
    redd_path : str
        The root path of the REDD low_freq dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """
    def _redd_measurement_mapping_func(house_id, chan_id):
        ac_type = 'apparent' if chan_id <= 2 else 'active'
        return [('power', ac_type)]

    _convert(redd_path, hdf_filename, _redd_measurement_mapping_func, 'US/Eastern')

    # Add metadata
    convert_yaml_to_hdf5(join(get_module_directory(), 'dataset_converters', 'redd', 'metadata'),
                         hdf_filename)

    print("Done converting REDD to HDF5!")
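# Hedged usage sketch for the REDD converter above (path is hypothetical). Channels 1-2
# are mapped to apparent-power mains, the remaining channels to active-power appliances.
convert_redd('/data/redd/low_freq', '/data/redd.h5')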
def convert_eco(dataset_loc, hdf_filename, timezone): """ Parameters: ----------- dataset_loc: str The root directory where the dataset is located. hdf_filename: str The location where the hdf_filename is present. The directory location has to contain the hdf5file name for the converter to work. timezone: str specifies the timezone of the dataset. """ # Creating a new HDF File store = pd.HDFStore(hdf_filename, 'w') check_directory_exists(dataset_loc) directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i] directory_list.sort() print directory_list # Traversing every folder for folder in directory_list: print 'Computing for folder',folder #Building number and meter_flag building_no = int(folder[:2]) meter_flag = 'sm' if 'sm_csv' in folder else 'plugs' dir_list = [i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc,folder,i))] dir_list.sort() print 'Current dir list:',dir_list for fl in dir_list: #Meter number to be used in key meter_num = 1 if meter_flag == 'sm' else int(fl) + 1 print 'Computing for Meter no.',meter_num fl_dir_list = [i for i in listdir(join(dataset_loc,folder,fl)) if '.csv' in i] fl_dir_list.sort() key = Key(building=building_no, meter=meter_num) for fi in fl_dir_list: #Getting dataframe for each csv file seperately df_fl = _get_df(join(dataset_loc,folder,fl),fi,meter_flag) df_fl.sort_index(ascending=True,inplace=True) df_fl = df_fl.tz_convert(timezone) # If table not present in hdf5, create or else append to existing data if not key in store: store.put(str(key), df_fl, format='Table') else: store.append(str(key), df_fl, format='Table') store.flush() print 'Building',building_no,', Meter no.',meter_num,'=> Done for ',fi[:-4] print "Data storage completed." store.close() # Adding the metadata to the HDF5file print "Proceeding to Metadata conversion..." meta_path = join(_get_module_directory(), 'metadata') convert_yaml_to_hdf5(meta_path, hdf_filename) print "Completed Metadata conversion."
def convert_eco(dataset_loc, hdf_filename, timezone): """ Parameters: ----------- dataset_loc: str The root directory where the dataset is located. hdf_filename: str The location where the hdf_filename is present. The directory location has to contain the hdf5file name for the converter to work. timezone: str specifies the timezone of the dataset. """ # Creating a new HDF File store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='blosc') check_directory_exists(dataset_loc) directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i] directory_list.sort() print directory_list # Traversing every folder for folder in directory_list: if folder[0] == '.' or folder[-3:] == '.h5': print 'Skipping ', folder continue print 'Computing for folder',folder #Building number and meter_flag building_no = int(folder[:2]) meter_flag = 'sm' if 'sm_csv' in folder else 'plugs' dir_list = [i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc,folder,i))] dir_list.sort() print 'Current dir list:',dir_list for fl in dir_list: print 'Computing for folder ',fl fl_dir_list = [i for i in listdir(join(dataset_loc,folder,fl)) if '.csv' in i] fl_dir_list.sort() if meter_flag == 'sm': for fi in fl_dir_list: df = pd.read_csv(join(dataset_loc,folder,fl,fi), names=[i for i in range(1,17)], dtype=np.float32) for phase in range(1,4): key = str(Key(building=building_no, meter=phase)) df_phase = df.ix[:,[1+phase, 5+phase, 8+phase, 13+phase]] # get reactive power power = df_phase.as_matrix([1+phase, 13+phase]) reactive = power[:,0] * np.tan(power[:,1] * np.pi / 180) df_phase['Q'] = reactive df_phase.index = pd.DatetimeIndex(start=fi[:-4], freq='s', periods=86400, tz='GMT') df_phase = df_phase.tz_convert(timezone) sm_column_name = {1+phase:('power', 'active'), 5+phase:('current', ''), 8+phase:('voltage', ''), 13+phase:('phase_angle', ''), 'Q': ('power', 'reactive'), }; df_phase.rename(columns=sm_column_name, inplace=True) tmp_before = np.size(df_phase.power.active) df_phase = df_phase[df_phase.power.active != -1] tmp_after = np.size(df_phase.power.active) if (tmp_before != tmp_after): print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after)) df_phase.columns.set_names(LEVEL_NAMES, inplace=True) if not key in store: store.put(key, df_phase, format='Table') else: store.append(key, df_phase, format='Table') store.flush() print 'Building',building_no,', Meter no.',phase,'=> Done for ',fi[:-4] else: #Meter number to be used in key meter_num = int(fl) + 3 key = str(Key(building=building_no, meter=meter_num)) #Getting dataframe for each csv file seperately for fi in fl_dir_list: df = pd.read_csv(join(dataset_loc,folder,fl ,fi), names=[1], dtype=np.float64) df.index = pd.DatetimeIndex(start=fi[:-4], freq='s', periods=86400, tz = 'GMT') df.rename(columns=plugs_column_name, inplace=True) df = df.tz_convert(timezone) df.columns.set_names(LEVEL_NAMES, inplace=True) tmp_before = np.size(df.power.active) df = df[df.power.active != -1] tmp_after = np.size(df.power.active) if (tmp_before != tmp_after): print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after)) # If table not present in hdf5, create or else append to existing data if not key in store: store.put(key, df, format='Table') print 'Building',building_no,', Meter no.',meter_num,'=> Done for ',fi[:-4] else: store.append(key, df, format='Table') store.flush() print 'Building',building_no,', Meter no.',meter_num,'=> Done for ',fi[:-4] print "Data storage 
completed." store.close() # Adding the metadata to the HDF5file print "Proceeding to Metadata conversion..." meta_path = join(_get_module_directory(), 'metadata') convert_yaml_to_hdf5(meta_path, hdf_filename) print "Completed Metadata conversion."
def convert_hes(data_dir, output_filename, format='HDF', max_chunks=None): metadata = { 'name': 'HES', 'geographic_coordinates': (51.464462,-0.076544), # London 'timezone': 'Europe/London' } # Open DataStore store = get_datastore(output_filename, format, mode='w') # load list of appliances hes_to_nilmtk_appliance_lookup = pd.read_csv(join(get_module_directory(), 'dataset_converters', 'hes', 'hes_to_nilmtk_appliance_lookup.csv')) # load list of houses hes_house_ids = load_list_of_house_ids(data_dir) nilmtk_house_ids = np.arange(1,len(hes_house_ids)+1) hes_to_nilmtk_house_ids = dict(zip(hes_house_ids, nilmtk_house_ids)) # array of hes_house_codes: nilmtk_building_code = house_codes.index(hes_house_code) house_codes = [] # map house_appliance_codes = dict() # Iterate over files for filename in FILENAMES: # Load appliance energy data chunk-by-chunk full_filename = join(data_dir, filename) print('loading', full_filename) try: reader = pd.read_csv(full_filename, names=COL_NAMES, index_col=False, chunksize=CHUNKSIZE) except IOError as e: print(e, file=stderr) continue # Iterate over chunks in file chunk_i = 0 for chunk in reader: if max_chunks is not None and chunk_i >= max_chunks: break print(' processing chunk', chunk_i, 'of', filename) # Convert date and time columns to np.datetime64 objects dt = chunk['date'] + ' ' + chunk['time'] del chunk['date'] del chunk['time'] chunk['datetime'] = dt.apply(datetime_converter) # Data is either tenths of a Wh or tenths of a degree chunk['data'] *= 10 chunk['data'] = chunk['data'].astype(np.float32) # Iterate over houses in chunk for hes_house_id, hes_house_id_df in chunk.groupby('house id'): if hes_house_id not in house_codes: house_codes.append(hes_house_id) if hes_house_id not in house_appliance_codes.keys(): house_appliance_codes[hes_house_id] = [] nilmtk_house_id = house_codes.index(hes_house_id)+1 # Iterate over appliances in house for appliance_code, appliance_df in chunk.groupby('appliance code'): if appliance_code not in house_appliance_codes[hes_house_id]: house_appliance_codes[hes_house_id].append(appliance_code) nilmtk_meter_id = house_appliance_codes[hes_house_id].index(appliance_code)+1 _process_meter_in_chunk(nilmtk_house_id, nilmtk_meter_id, hes_house_id_df, store, appliance_code) chunk_i += 1 print('houses with some data loaded:', house_appliance_codes.keys()) store.close() # generate building yaml metadata for hes_house_id in house_codes: nilmtk_building_id = house_codes.index(hes_house_id)+1 building_metadata = {} building_metadata['instance'] = nilmtk_building_id building_metadata['original_name'] = int(hes_house_id) # use python int building_metadata['elec_meters'] = {} building_metadata['appliances'] = [] # initialise dict of instances of each appliance type instance_counter = {} for appliance_code in house_appliance_codes[hes_house_id]: nilmtk_meter_id = house_appliance_codes[hes_house_id].index(appliance_code)+1 # meter metadata if appliance_code in MAINS_CODES: meter_metadata = {'device_model': 'multivoies', 'site_meter': True} break elif appliance_code in CIRCUIT_CODES: meter_metadata = {'device_model': 'multivoies'} break elif appliance_code in TEMPERATURE_CODES: break else: # is appliance meter_metadata = {'device_model': 'wattmeter'} # only appliance meters at this point building_metadata['elec_meters'][nilmtk_meter_id] = meter_metadata # appliance metadata lookup_row = hes_to_nilmtk_appliance_lookup[hes_to_nilmtk_appliance_lookup.Code==appliance_code].iloc[0] appliance_metadata = {'original_name': lookup_row.Name, 'meters': 
[nilmtk_meter_id] } # appliance type appliance_metadata.update({'type': lookup_row.nilmtk_name}) # TODO appliance room # appliance instance number if instance_counter.get(lookup_row.nilmtk_name) == None: instance_counter[lookup_row.nilmtk_name] = 0 instance_counter[lookup_row.nilmtk_name] += 1 appliance_metadata['instance'] = instance_counter[lookup_row.nilmtk_name] building_metadata['appliances'].append(appliance_metadata) building = 'building{:d}'.format(nilmtk_building_id) yaml_full_filename = join(_get_module_directory(), 'metadata', building + '.yaml') with open(yaml_full_filename, 'w') as outfile: #print(building_metadata) outfile.write(yaml.dump(building_metadata)) # write yaml metadata to hdf5 convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), output_filename)
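# Hedged usage sketch for convert_hes() above (paths are hypothetical). Limiting
# max_chunks gives a quick smoke test before committing to a full conversion.
convert_hes('/data/hes', '/data/hes_sample.h5', format='HDF', max_chunks=1)   # partial run
convert_hes('/data/hes', '/data/hes.h5')                                      # full run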
def download_dataport(database_username, database_password, hdf_filename, periods_to_load=None): """ Downloads data from dataport database into an HDF5 file. Parameters ---------- hdf_filename : str Output HDF filename. If file exists already then will be deleted. database_username, database_password : str periods_to_load : dict of tuples, optional Key of dict is the building number (int). Values are (<start date>, <end date>) e.g. ("2013-04-01", None) or ("2013-04-01", "2013-08-01") defaults to all buildings and all date ranges """ # dataport database settings database_host = 'dataport.pecanstreet.org' database_port = '5434' database_name = 'postgres' database_schema = 'university' # try to connect to database try: conn = db.connect('host=' + database_host + ' port=' + database_port + ' dbname=' + database_name + ' user=' + database_username + ' password=' + database_password) except: print('Could not connect to remote database') raise # set up a new HDF5 datastore (overwrites existing store) store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib') # remove existing building yaml files in module dir for f in os.listdir(join(_get_module_directory(), 'metadata')): if re.search('^building', f): os.remove(join(_get_module_directory(), 'metadata', f)) """ TODO: The section below can be altered or removed, since the restructured Dataport now has only one electricity_egauge_minutes table. """ # get tables in database schema sql_query = ("SELECT table_name" + " FROM information_schema.views" + " WHERE table_schema ='" + database_schema + "'" + " ORDER BY table_name") database_tables = pd.read_sql(sql_query, conn)['table_name'].tolist() database_tables = [t for t in database_tables if 'electricity_egauge_minutes' in t] # if user has specified buildings if periods_to_load: buildings_to_load = list(periods_to_load.keys()) else: # get buildings present in all tables sql_query = '' for table in database_tables: sql_query = (sql_query + '(SELECT DISTINCT dataid' + ' FROM "' + database_schema + '".'
+ table + ') UNION ') sql_query = sql_query[:-7] sql_query = (sql_query + ' ORDER BY dataid') buildings_to_load = pd.read_sql(sql_query, conn)['dataid'].tolist() # for each user specified building or all buildings in database for building_id in buildings_to_load: print("Loading building {:d} @ {}" .format(building_id, datetime.datetime.now())) sys.stdout.flush() # create new list of chunks for concatenating later dataframe_list = [] # for each table of 1 month data for database_table in database_tables: print(" Loading table {:s}".format(database_table)) sys.stdout.flush() # get buildings present in electricity_egauge_minutes table sql_query = ('SELECT DISTINCT dataid' + ' FROM university.metadata' + ' WHERE egauge_min_time IS NOT NULL' + ' ORDER BY dataid') buildings_in_table = pd.read_sql(sql_query, conn)['dataid'].tolist() if building_id in buildings_in_table: # get first and last timestamps for this house in electricity_egauge_minutes table sql_query = ('SELECT MIN(egauge_min_time) AS minlocalminute,' + ' MAX(egauge_max_time) AS maxlocalminute' + ' FROM university.metadata' + ' WHERE dataid=' + str(building_id)) range = pd.read_sql(sql_query, conn) first_timestamp_in_table = range['minlocalminute'][0] last_timestamp_in_table = range['maxlocalminute'][0] # get requested start and end and localize them requested_start = None requested_end = None database_timezone = 'US/Central' if periods_to_load: if periods_to_load[building_id][0]: requested_start = pd.Timestamp(periods_to_load[building_id][0]) requested_start = requested_start.tz_localize(database_timezone) if periods_to_load[building_id][1]: requested_end = pd.Timestamp(periods_to_load[building_id][1]) requested_end = requested_end.tz_localize(database_timezone) # check user start is not after end if requested_start > requested_end: print('requested end is before requested start') sys.stdout.flush() else: # clip data to smallest range if requested_start: start = max(requested_start, first_timestamp_in_table) else: start = first_timestamp_in_table if requested_end: end = min(requested_end, last_timestamp_in_table) else: end = last_timestamp_in_table # download data in chunks chunk_start = start chunk_size = datetime.timedelta(10) # 1 day while chunk_start < end: chunk_end = chunk_start + chunk_size if chunk_end > end: chunk_end = end # subtract 1 second so end is exclusive chunk_end = chunk_end - datetime.timedelta(0, 1) # query power data for all channels format = '%Y-%m-%d %H:%M:%S' sql_query = ('SELECT *' + ' FROM "' + database_schema + '".' 
+ database_table + ' WHERE dataid=' + str(building_id) + 'and localminute between ' + "'" + chunk_start.strftime(format) + "'" + " and " + "'" + chunk_end.strftime(format) + "'") chunk_dataframe = pd.read_sql(sql_query, conn) # nilmtk requires building indices to start at 1 nilmtk_building_id = buildings_to_load.index(building_id) + 1 # convert to nilmtk-df and save to disk nilmtk_dataframe = _dataport_dataframe_to_hdf(chunk_dataframe, store, nilmtk_building_id, building_id) # print progress print(' ' + str(chunk_start) + ' -> ' + str(chunk_end) + ': ' + str(len(chunk_dataframe.index)) + ' rows') sys.stdout.flush() # append all chunks into list for csv writing #dataframe_list.append(chunk_dataframe) # move on to next chunk chunk_start = chunk_start + chunk_size # saves all chunks in list to csv #if len(dataframe_list) > 0: #dataframe_concat = pd.concat(dataframe_list) #dataframe_concat.to_csv(output_directory + str(building_id) + '.csv') store.close() conn.close() # write yaml to hdf5 # dataset.yaml and meter_devices.yaml are static, building<x>.yaml are dynamic convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), hdf_filename)
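# Hedged usage sketch for download_dataport(); the credentials and building ids below are
# placeholders. periods_to_load restricts the date range per building, with None meaning
# "no bound on that side".
periods = {
    26: ("2014-01-01", "2014-02-01"),   # one month for building 26
    43: ("2014-04-01", None),           # everything from April 2014 onwards for building 43
}
download_dataport('my_username', 'my_password', '/data/dataport.h5', periods_to_load=periods)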
def convert_gjw(gjw_path, output_filename): """ Parameters ---------- gjw_path : str The root path of the gjw dataset. output_filename : str The destination filename (including path and suffix), will default if not specified directory and file structure nilm_gjw_data building<1> elec 4-POWER_REAL_FINE <date> Dump.csv 5-POWER_REACTIVE_STANDARD <date> Dump.csv ... ... building<n> HDF5 nilm_gjw_data.hdf5 metadata building1.yaml dataset.yaml meter_devices.yaml other files """ if gjw_path is None: gjw_path = home_dir check_directory_exists(gjw_path) os.chdir(gjw_path) gjw_path = os.getcwd( ) # sort out potential issue with slashes or backslashes if output_filename is None: output_filename = join(home_dir, 'HDF5', 'nilm_gjw_data.hdf5') # Open data store print('opening datastore', output_filename) store = get_datastore(output_filename, format, mode='w') # walk the directory tree from the dataset home directory #clear dataframe & add column headers df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME, REACTIVE_COLUMN_NAME]) found = False for current_dir, _, files in os.walk(gjw_path): #unused second parameter of for dirs_in_current_dir if current_dir.find('.git') != -1 or current_dir.find('.ipynb') != -1: #print( 'Skipping ', current_dir) continue print('checking', current_dir) m = bld_re.search(current_dir) if m: #The csv files may be further down the tree so this section may be repeated building_name = m.group() building_nbr = int(bld_nbr_re.search(building_name).group()) meter_nbr = 1 key = Key(building=building_nbr, meter=meter_nbr) for items in fnmatch.filter(files, "4*.csv"): # process any .CSV files found found = True ds = iso_date_re.search(items).group() # print( 'found files for date:', ds,end=" ") # found files to process df1 = _read_file_pair(current_dir, ds) # read two csv files into a dataframe df = pd.concat( [df, df1]) # concatenate the results into one long dataframe if found: found = False df = _prepare_data_for_toolkit(df) _summarise_dataframe(df, 'Prepared for tool kit') store.put(str(key), df) #clear dataframe & add column headers #df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME,REACTIVE_COLUMN_NAME]) break # only 1 folder with .csv files at present store.close() convert_yaml_to_hdf5(join(gjw_path, 'metadata'), output_filename) print("Done converting gjw to HDF5!")
def convert_eco(dataset_loc, hdf_filename, timezone): """ Parameters: ----------- dataset_loc: str The root directory where the dataset is located. hdf_filename: str The location where the hdf_filename is present. The directory location has to contain the hdf5file name for the converter to work. timezone: str specifies the timezone of the dataset. """ # Creating a new HDF File store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='blosc') check_directory_exists(dataset_loc) directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i] directory_list.sort() print(directory_list) found_any_sm = False found_any_plug = False # Traversing every folder for folder in directory_list: if folder[0] == '.' or folder[-3:] == '.h5': print('Skipping ', folder) continue #Building number and meter_flag building_no = int(folder[:2]) meter_flag = None if 'sm_csv' in folder: meter_flag = 'sm' elif 'plugs' in folder: meter_flag = 'plugs' else: print('Skipping folder', folder) continue print('Computing for folder', folder) dir_list = [i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc,folder,i))] dir_list.sort() if meter_flag == 'plugs' and len(dir_list) < 3: # Try harder to find the subfolders folder = join(folder, folder[:2]) dir_list = [i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc,folder,i))] print('Current dir list:', dir_list) for fl in dir_list: print('Computing for folder ', fl) fl_dir_list = [i for i in listdir(join(dataset_loc,folder,fl)) if '.csv' in i] fl_dir_list.sort() if meter_flag == 'sm': for fi in fl_dir_list: found_any_sm = True df = pd.read_csv(join(dataset_loc,folder,fl,fi), names=[i for i in range(1,17)], dtype=np.float32) for phase in range(1,4): key = str(Key(building=building_no, meter=phase)) df_phase = df.loc[:,[1+phase, 5+phase, 8+phase, 13+phase]] # get reactive power power = df_phase.loc[:, (1+phase, 13+phase)].values reactive = power[:,0] * np.tan(power[:,1] * np.pi / 180) df_phase['Q'] = reactive df_phase.index = pd.DatetimeIndex(start=fi[:-4], freq='s', periods=86400, tz='GMT') df_phase = df_phase.tz_convert(timezone) sm_column_name = { 1+phase:('power', 'active'), 5+phase:('current', ''), 8+phase:('voltage', ''), 13+phase:('phase_angle', ''), 'Q': ('power', 'reactive'), } df_phase.columns = pd.MultiIndex.from_tuples([ sm_column_name[col] for col in df_phase.columns ]) power_active = df_phase['power', 'active'] tmp_before = np.size(power_active) df_phase = df_phase[power_active != -1] power_active = df_phase['power', 'active'] tmp_after = np.size(power_active) if tmp_before != tmp_after: print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after)) df_phase.columns.set_names(LEVEL_NAMES, inplace=True) if not key in store: store.put(key, df_phase, format='Table') else: store.append(key, df_phase, format='Table') store.flush() print('Building', building_no, ', Meter no.', phase, '=> Done for ', fi[:-4]) else: #Meter number to be used in key meter_num = int(fl) + 3 key = str(Key(building=building_no, meter=meter_num)) current_folder = join(dataset_loc,folder,fl) if not fl_dir_list: raise RuntimeError("No CSV file found in " + current_folder) #Getting dataframe for each csv file seperately for fi in fl_dir_list: found_any_plug = True df = pd.read_csv(join(current_folder, fi), names=[1], dtype=np.float64) df.index = pd.DatetimeIndex(start=fi[:-4].replace('.', ':'), freq='s', periods=86400, tz = 'GMT') df.columns = pd.MultiIndex.from_tuples(plugs_column_name.values()) df = 
df.tz_convert(timezone) df.columns.set_names(LEVEL_NAMES, inplace=True) tmp_before = np.size(df.power.active) df = df[df.power.active != -1] tmp_after = np.size(df.power.active) if (tmp_before != tmp_after): print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after)) # If table not present in hdf5, create or else append to existing data if not key in store: store.put(key, df, format='Table') print('Building',building_no,', Meter no.',meter_num,'=> Done for ',fi[:-4]) else: store.append(key, df, format='Table') store.flush() print('Building',building_no,', Meter no.',meter_num,'=> Done for ',fi[:-4]) if not found_any_plug or not found_any_sm: raise RuntimeError('The files were not found! Please check the folder structure. Extract each ZIP file into a folder with its base name (e.g. extract "01_plugs_csv.zip" into a folder named "01_plugs_csv", etc.)') print("Data storage completed.") store.close() # Adding the metadata to the HDF5file print("Proceeding to Metadata conversion...") meta_path = join( get_module_directory(), 'dataset_converters', 'eco', 'metadata' ) convert_yaml_to_hdf5(meta_path, hdf_filename) print("Completed Metadata conversion.")
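# Usage sketch (illustrative only): a hypothetical call to the ECO converter above.
# The paths are placeholders and the timezone is an assumption; the converter expects the
# extracted "NN_sm_csv" and "NN_plugs_csv" folders directly under dataset_loc.
convert_eco('/data/eco',          # root containing 01_sm_csv/, 01_plugs_csv/, ...
            '/data/eco/eco.h5',   # destination HDF5 file
            'Europe/Zurich')      # assumed timezone for the Swiss ECO recordings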
def download_dataport(database_username, database_password, hdf_filename, periods_to_load=None): """ Downloads data from dataport database into an HDF5 file. Parameters ---------- hdf_filename : str Output HDF filename. If file exists already then will be deleted. database_username, database_password : str periods_to_load : dict of tuples, optional Key of dict is the building number (int). Values are (<start date>, <end date>) e.g. ("2013-04-01", None) or ("2013-04-01", "2013-08-01") defaults to all buildings and all date ranges """ # dataport database settings database_host = 'dataport.pecanstreet.org' database_port = '5434' database_name = 'postgres' database_schema = 'university' # try to connect to database try: conn = db.connect('host=' + database_host + ' port=' + database_port + ' dbname=' + database_name + ' user=' + database_username + ' password=' + database_password) except: print('Could not connect to remote database') raise # set up a new HDF5 datastore (overwrites existing store) store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib') # remove existing building yaml files in module dir for f in os.listdir(join(_get_module_directory(), 'metadata')): if re.search('^building', f): os.remove(join(_get_module_directory(), 'metadata', f)) """ TODO: The section below can be altered or removed, since the restructured Dataport now has only one electricity_egauge_minutes table. """ # get tables in database schema sql_query = ("SELECT table_name" + " FROM information_schema.views" + " WHERE table_schema ='" + database_schema + "'" + " ORDER BY table_name") database_tables = pd.read_sql(sql_query, conn)['table_name'].tolist() database_tables = [t for t in database_tables if 'electricity_egauge_minutes' in t] # if user has specified buildings if periods_to_load: buildings_to_load = list(periods_to_load.keys()) else: # get buildings present in all tables sql_query = '' for table in database_tables: sql_query = (sql_query + '(SELECT DISTINCT dataid' + ' FROM "' + database_schema + '".' 
+ table + ') UNION ') sql_query = sql_query[:-7] sql_query = (sql_query + ' ORDER BY dataid') buildings_to_load = pd.read_sql(sql_query, conn)['dataid'].tolist() # for each user specified building or all buildings in database for building_id in buildings_to_load: print("Loading building {:d} @ {}" .format(building_id, datetime.datetime.now())) sys.stdout.flush() # create new list of chunks for concatenating later dataframe_list = [] # for each table of 1 month data for database_table in database_tables: print(" Loading table {:s}".format(database_table)) sys.stdout.flush() # get buildings present in electricity_egauge_minutes table sql_query = ('SELECT DISTINCT dataid' + ' FROM university.metadata' + ' WHERE egauge_min_time IS NOT NULL' + ' ORDER BY dataid') buildings_in_table = pd.read_sql(sql_query, conn)['dataid'].tolist() if building_id in buildings_in_table: # get first and last timestamps for this house in electricity_egauge_minutes table sql_query = ('SELECT MIN(egauge_min_time) AS minlocalminute,' + ' MAX(egauge_max_time) AS maxlocalminute' + ' FROM university.metadata' + ' WHERE dataid=' + str(building_id)) range = pd.read_sql(sql_query, conn) first_timestamp_in_table = range['minlocalminute'][0] last_timestamp_in_table = range['maxlocalminute'][0] # get requested start and end and localize them requested_start = None requested_end = None database_timezone = 'US/Central' if periods_to_load: if periods_to_load[building_id][0]: requested_start = pd.Timestamp(periods_to_load[building_id][0]) requested_start = requested_start.tz_localize(database_timezone) if periods_to_load[building_id][1]: requested_end = pd.Timestamp(periods_to_load[building_id][1]) requested_end = requested_end.tz_localize(database_timezone) # check user start is not after end if requested_start and requested_end and requested_start > requested_end: print('requested end is before requested start') sys.stdout.flush() else: # clip data to smallest range if requested_start: start = max(requested_start, first_timestamp_in_table) else: start = first_timestamp_in_table if requested_end: end = min(requested_end, last_timestamp_in_table) else: end = last_timestamp_in_table # download data in chunks chunk_start = start chunk_size = datetime.timedelta(10) # 10 days while chunk_start < end: chunk_end = chunk_start + chunk_size if chunk_end > end: chunk_end = end # subtract 1 second so end is exclusive chunk_end = chunk_end - datetime.timedelta(0, 1) # query power data for all channels format = '%Y-%m-%d %H:%M:%S' sql_query = ('SELECT *' + ' FROM "' + database_schema + '".' 
+ database_table + ' WHERE dataid=' + str(building_id) + ' and localminute between ' + "'" + chunk_start.strftime(format) + "'" + " and " + "'" + chunk_end.strftime(format) + "'") chunk_dataframe = pd.read_sql(sql_query, conn) # nilmtk requires building indices to start at 1 nilmtk_building_id = buildings_to_load.index(building_id) + 1 # convert to nilmtk-df and save to disk nilmtk_dataframe = _dataport_dataframe_to_hdf(chunk_dataframe, store, nilmtk_building_id, building_id) # print progress print(' ' + str(chunk_start) + ' -> ' + str(chunk_end) + ': ' + str(len(chunk_dataframe.index)) + ' rows') sys.stdout.flush() # append all chunks into list for csv writing #dataframe_list.append(chunk_dataframe) # move on to next chunk chunk_start = chunk_start + chunk_size # saves all chunks in list to csv #if len(dataframe_list) > 0: #dataframe_concat = pd.concat(dataframe_list) #dataframe_concat.to_csv(output_directory + str(building_id) + '.csv') store.close() conn.close() # write yaml to hdf5 # dataset.yaml and meter_devices.yaml are static, building<x>.yaml are dynamic convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), hdf_filename)
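# Usage sketch (illustrative only): hypothetical credentials, building ids and date ranges.
# periods_to_load follows the docstring format: {building_id: (start, end)}, where either
# bound may be None to mean "no limit".
periods = {
    26: ("2014-01-01", "2014-02-01"),
    43: ("2014-01-01", None),
}
download_dataport('my_username', 'my_password',
                  '/data/dataport.h5', periods_to_load=periods)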
feed_ignore = ['gen', 'grid'] WEATHER_HVAC_STORE = os.path.join(script_path, '..', '..', '..', 'data/hvac/weather_hvac_2013.h5') temp_h5_path = os.path.expanduser("~/Downloads/wiki-temp.h5") store_total = pd.HDFStore(temp_h5_path) store_useful = pd.HDFStore(WEATHER_HVAC_STORE) useful_keys = [k[:-2] for k in store_useful.keys() if "X" in k] START, STOP = "2013-07-01", "2013-07-31" store_name = os.path.expanduser("~/wikienergy-2013.h5") with pd.HDFStore(store_name, "w") as store_to_write: for nilmtk_id, dataid_str in enumerate(useful_keys): dataid = int(dataid_str[1:]) df = store_total[dataid_str][START:STOP] if df['air1'].sum() > 0: print("Writing ", nilmtk_id, dataid) _dataport_dataframe_to_hdf(df, store_to_write, nilmtk_id + 1, dataid) else: print("Skipping", nilmtk_id, dataid) convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), store_name)
def convert_eco(dataset_loc, hdf_filename, timezone): """ Parameters: ----------- dataset_loc: str The root directory where the dataset is located. hdf_filename: str The location where the hdf_filename is present. The directory location has to contain the hdf5file name for the converter to work. timezone: str specifies the timezone of the dataset. """ # Creating a new HDF File store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='blosc') check_directory_exists(dataset_loc) directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i] directory_list.sort() print(directory_list) # Traversing every folder for folder in directory_list: if folder[0] == '.' or folder[-3:] == '.h5': print('Skipping ', folder) continue print('Computing for folder', folder) #Building number and meter_flag building_no = int(folder[:2]) meter_flag = 'sm' if 'sm_csv' in folder else 'plugs' dir_list = [ i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc, folder, i)) ] dir_list.sort() print('Current dir list:', dir_list) for fl in dir_list: print('Computing for folder ', fl) fl_dir_list = [ i for i in listdir(join(dataset_loc, folder, fl)) if '.csv' in i ] fl_dir_list.sort() if meter_flag == 'sm': for fi in fl_dir_list: df = pd.read_csv(join(dataset_loc, folder, fl, fi), names=[i for i in range(1, 17)], dtype=np.float32) for phase in range(1, 4): key = str(Key(building=building_no, meter=phase)) df_phase = df.ix[:, [ 1 + phase, 5 + phase, 8 + phase, 13 + phase ]] # get reactive power power = df_phase.as_matrix([1 + phase, 13 + phase]) reactive = power[:, 0] * np.tan( power[:, 1] * np.pi / 180) df_phase['Q'] = reactive df_phase.index = pd.DatetimeIndex(start=fi[:-4], freq='s', periods=86400, tz='GMT') df_phase = df_phase.tz_convert(timezone) sm_column_name = { 1 + phase: ('power', 'active'), 5 + phase: ('current', ''), 8 + phase: ('voltage', ''), 13 + phase: ('phase_angle', ''), 'Q': ('power', 'reactive'), } df_phase.rename(columns=sm_column_name, inplace=True) tmp_before = np.size(df_phase.power.active) df_phase = df_phase[df_phase.power.active != -1] tmp_after = np.size(df_phase.power.active) if (tmp_before != tmp_after): print( 'Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after)) df_phase.columns.set_names(LEVEL_NAMES, inplace=True) if not key in store: store.put(key, df_phase, format='Table') else: store.append(key, df_phase, format='Table') store.flush() print('Building', building_no, ', Meter no.', phase, '=> Done for ', fi[:-4]) else: #Meter number to be used in key meter_num = int(fl) + 3 key = str(Key(building=building_no, meter=meter_num)) #Getting dataframe for each csv file seperately for fi in fl_dir_list: df = pd.read_csv(join(dataset_loc, folder, fl, fi), names=[1], dtype=np.float64) df.index = pd.DatetimeIndex(start=fi[:-4], freq='s', periods=86400, tz='GMT') df.rename(columns=plugs_column_name, inplace=True) df = df.tz_convert(timezone) df.columns.set_names(LEVEL_NAMES, inplace=True) tmp_before = np.size(df.power.active) df = df[df.power.active != -1] tmp_after = np.size(df.power.active) if (tmp_before != tmp_after): print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after)) # If table not present in hdf5, create or else append to existing data if not key in store: store.put(key, df, format='Table') print('Building', building_no, ', Meter no.', meter_num, '=> Done for ', fi[:-4]) else: store.append(key, df, format='Table') store.flush() print('Building', building_no, ', 
Meter no.', meter_num, '=> Done for ', fi[:-4]) print("Data storage completed.") store.close() # Adding the metadata to the HDF5file print("Proceeding to Metadata conversion...") meta_path = join(_get_module_directory(), 'metadata') convert_yaml_to_hdf5(meta_path, hdf_filename) print("Completed Metadata conversion.")
def convert_greend(greend_path, hdf_filename, use_mp=True): """ Parameters ---------- greend_path : str The root path of the greend dataset. hdf_filename : str The destination HDF5 filename (including path and suffix). use_mp : bool Defaults to True. Use multiprocessing to load the files for each building. """ store = pd.HDFStore(hdf_filename, 'w', complevel=5, complib='zlib') houses = sorted(_get_houses(greend_path)) print('Houses found:', houses) if use_mp: pool = Pool() h = 1 # nilmtk counts buildings from 1 not from 0 as we do, so everything is shifted by 1 for house in houses: print('Loading', house) abs_house = join(greend_path, house) dates = [d for d in listdir(abs_house) if d.startswith('dataset')] target_filenames = [join(abs_house, date) for date in dates] if use_mp: house_data = pool.map(_get_blocks, target_filenames) # Ensure the blocks are sorted by date and make a plain list house_data_dfs = [] for date, data in sorted(house_data, key=lambda x: x[0]): house_data_dfs.extend(data) else: house_data_dfs = [] for fn in target_filenames: house_data_dfs.extend(_get_blocks(fn)[1]) overall_df = pd.concat(house_data_dfs).sort_index() dups_in_index = overall_df.index.duplicated(keep='first') if dups_in_index.any(): print("Found duplicated values in index, dropping them.") overall_df = overall_df[~dups_in_index] m = 1 for column in overall_df.columns: print("meter {}: {}".format(m, column)) key = Key(building=h, meter=m) print("Putting into store...") df = overall_df[column].to_frame() #.dropna(axis=0) # if drop_duplicates: # print("Dropping duplicated values in data...") # df = df.drop_duplicates() df.columns = pd.MultiIndex.from_tuples([('power', 'active')]) df.columns.set_names(LEVEL_NAMES, inplace=True) store.put(str(key), df, format='table') m += 1 # print('Flushing store...') # store.flush() h += 1 store.close() # retrieve the dataset metadata in the metadata subfolder metadata_dir = join(get_module_directory(), 'dataset_converters', 'greend', 'metadata') convert_yaml_to_hdf5(metadata_dir, hdf_filename)
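# Usage sketch (illustrative only): a hypothetical call to convert_greend. The paths are
# placeholders; set use_mp=False to load the per-building CSV files without a multiprocessing Pool.
convert_greend('/data/GREEND', '/data/greend.h5', use_mp=True)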
def convert_hes(data_dir, output_filename, format='HDF', max_chunks=None): metadata = { 'name': 'HES', 'geographic_coordinates': (51.464462, -0.076544), # London 'timezone': 'Europe/London' } # Open DataStore store = get_datastore(output_filename, format, mode='w') # load list of appliances hes_to_nilmtk_appliance_lookup = pd.read_csv( join(get_module_directory(), 'dataset_converters', 'hes', 'hes_to_nilmtk_appliance_lookup.csv')) # load list of houses hes_house_ids = load_list_of_house_ids(data_dir) nilmtk_house_ids = np.arange(1, len(hes_house_ids) + 1) hes_to_nilmtk_house_ids = dict(zip(hes_house_ids, nilmtk_house_ids)) # array of hes_house_codes: nilmtk_building_code = house_codes.index(hes_house_code) house_codes = [] # map house_appliance_codes = dict() # Create a temporary metadata dir original_metadata_dir = join(get_module_directory(), 'dataset_converters', 'hes', 'metadata') tmp_dir = tempfile.mkdtemp() metadata_dir = join(tmp_dir, 'metadata') shutil.copytree(original_metadata_dir, metadata_dir) print("Using temporary dir for metadata:", metadata_dir) # Iterate over files for filename in FILENAMES: # Load appliance energy data chunk-by-chunk full_filename = join(data_dir, filename) print('Loading', full_filename) try: reader = pd.read_csv(full_filename, names=COL_NAMES, index_col=False, chunksize=CHUNKSIZE) except IOError as e: print(e, file=stderr) continue # Iterate over chunks in file chunk_i = 0 for chunk in reader: if max_chunks is not None and chunk_i >= max_chunks: break print(' processing chunk', chunk_i, 'of', filename) # Convert date and time columns to np.datetime64 objects dt = chunk['date'] + ' ' + chunk['time'] del chunk['date'] del chunk['time'] chunk['datetime'] = pd.to_datetime(dt, format='%Y-%m-%d %H:%M:%S', utc=True) # Data is either tenths of a Wh or tenths of a degree chunk['data'] *= 10 chunk['data'] = chunk['data'].astype(np.float32) # Iterate over houses in chunk for hes_house_id, hes_house_id_df in chunk.groupby('house id'): if hes_house_id not in house_codes: house_codes.append(hes_house_id) if hes_house_id not in house_appliance_codes.keys(): house_appliance_codes[hes_house_id] = [] nilmtk_house_id = house_codes.index(hes_house_id) + 1 # Iterate over appliances in house for appliance_code, appliance_df in chunk.groupby( 'appliance code'): if appliance_code not in house_appliance_codes[ hes_house_id]: house_appliance_codes[hes_house_id].append( appliance_code) nilmtk_meter_id = house_appliance_codes[ hes_house_id].index(appliance_code) + 1 _process_meter_in_chunk(nilmtk_house_id, nilmtk_meter_id, hes_house_id_df, store, appliance_code) chunk_i += 1 print('houses with some data loaded:', house_appliance_codes.keys()) store.close() # generate building yaml metadata for hes_house_id in house_codes: nilmtk_building_id = house_codes.index(hes_house_id) + 1 building_metadata = {} building_metadata['instance'] = nilmtk_building_id building_metadata['original_name'] = int( hes_house_id) # use python int building_metadata['elec_meters'] = {} building_metadata['appliances'] = [] # initialise dict of instances of each appliance type instance_counter = {} for appliance_code in house_appliance_codes[hes_house_id]: nilmtk_meter_id = house_appliance_codes[hes_house_id].index( appliance_code) + 1 # meter metadata if appliance_code in MAINS_CODES: meter_metadata = { 'device_model': 'multivoies', 'site_meter': True } break elif appliance_code in CIRCUIT_CODES: meter_metadata = {'device_model': 'multivoies'} break elif appliance_code in TEMPERATURE_CODES: break else: # is 
appliance meter_metadata = {'device_model': 'wattmeter'} # only appliance meters at this point building_metadata['elec_meters'][nilmtk_meter_id] = meter_metadata # appliance metadata lookup_row = hes_to_nilmtk_appliance_lookup[ hes_to_nilmtk_appliance_lookup.Code == appliance_code].iloc[0] appliance_metadata = { 'original_name': lookup_row.Name, 'meters': [nilmtk_meter_id] } # appliance type appliance_metadata.update({'type': lookup_row.nilmtk_name}) # TODO appliance room # appliance instance number if instance_counter.get(lookup_row.nilmtk_name) is None: instance_counter[lookup_row.nilmtk_name] = 0 instance_counter[lookup_row.nilmtk_name] += 1 appliance_metadata['instance'] = instance_counter[ lookup_row.nilmtk_name] building_metadata['appliances'].append(appliance_metadata) building = 'building{:d}'.format(nilmtk_building_id) yaml_full_filename = join(metadata_dir, building + '.yaml') with open(yaml_full_filename, 'w') as outfile: #print(building_metadata) outfile.write(yaml.dump(building_metadata)) # write yaml metadata to hdf5 convert_yaml_to_hdf5(metadata_dir, output_filename) # remove the temporary dir when finished shutil.rmtree(tmp_dir)
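# Usage sketch (illustrative only): a hypothetical call to convert_hes. max_chunks limits how
# many CSV chunks are processed per file, which is useful for a quick smoke test; the paths
# are placeholders.
convert_hes('/data/HES/csv', '/data/hes.h5', format='HDF', max_chunks=3)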
def convert_greend(greend_path, hdf_filename): """ Parameters ---------- greend_path : str The root path of the greend dataset. hdf_filename : str The destination HDF5 filename (including path and suffix). """ store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib') houses = sorted(__get_houses(greend_path)) print(houses) h = 1 # nilmtk counts buildings from 1 not from 0 as we do, so everything is shifted by 1 for house in houses: print('loading '+house) abs_house = join(greend_path, house) dates = [d for d in listdir(abs_house) if d.startswith('dataset')] house_data = [] for date in dates: print('-----------------------',date) try: tmp_pandas = pd.read_csv(join(abs_house, date), na_values=['na'], error_bad_lines=False) except: # A CParserError is returned for malformed files (irregular column number) pass # for building0 either remove the first days (with less nodes) or use __preprocess_file #import StringIO as sio #tmp_pandas = pd.DataFrame.from_csv(sio.StringIO(__preprocess_file(abs_house, date))) # if the timestamp is not correctly parsed then it's an object dtype (string), else a float64 if tmp_pandas.timestamp.dtype != np.float64: tmp_pandas = tmp_pandas[tmp_pandas.timestamp != 'timestamp'] # remove all error rows # use the cleaned column as the index tmp_pandas.index = tmp_pandas["timestamp"].convert_objects(convert_numeric=True).values tmp_pandas = tmp_pandas.drop('timestamp', 1) # remove timestamp from the columns (it's the index already) tmp_pandas = tmp_pandas.astype("float32") # convert everything back to float32 # convert the index to datetime tmp_pandas.index = pd.to_datetime(tmp_pandas.index, unit='s') tmp_pandas = tmp_pandas.tz_localize("UTC").tz_convert("CET") tmp_pandas = tmp_pandas.drop_duplicates() #tmp_pandas = tmp_pandas.sort_index() house_data.append(tmp_pandas) overall_df = pd.concat(house_data) overall_df = overall_df.drop_duplicates() overall_df = overall_df.sort_index() m = 1 for column in overall_df.columns: print("meter" + str(m)+': '+column) key = Key(building = h, meter=m) print("Putting into store...") store.put(str(key), overall_df[column], format = 'table') m += 1 print('Flushing store...') store.flush() h += 1 store.close() # retrieve the dataset metadata in the metadata subfolder import inspect convert_yaml_to_hdf5(dirname(inspect.getfile(convert_greend))+'/metadata/', hdf_filename)
for i in range(1, num_rows): time_indices.append(time_indices[i-1] + np.timedelta64('1', 's')) return time_indices if not os.path.exists('../data/'): os.makedirs('../data/') store = get_datastore("../data/converted_sum.hdf5", 'HDF', mode='w') """ Gets CLEAR and MEDAL data and puts them into the store with the right key and instance numbers. """ frames = get_clear_data() for phase in range(1, 4): key = Key(building=1, meter=phase) print('Adding phase {}'.format(phase)) store.put(str(key), frames[phase-1]) for medal_id in range(1, 16): frames = get_summary_data(medal_id) for i in range(1, 7): key = Key(building=1, meter=(((medal_id-1) * 6) + i + 3)) print('Adding ' + str(key) + ' to Store') store.put(str(key), frames[i-1]) store.close() convert_yaml_to_hdf5("../metadata_converter/dist", "../data/converted_sum.hdf5")
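# Meter numbering used above, spelled out for clarity (illustrative helper, not part of the
# converter): the three CLEAR phases occupy meters 1-3, then each of the 15 MEDAL boxes
# gets six consecutive meter slots.
def medal_meter_number(medal_id, channel):
    # medal_id in 1..15, channel in 1..6 -> meter numbers 4..93
    return (medal_id - 1) * 6 + channel + 3

assert medal_meter_number(1, 1) == 4
assert medal_meter_number(15, 6) == 93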
def download_wikienergy(database_username, database_password, hdf_filename, periods_to_load=None): """ Downloads data from WikiEnergy database into an HDF5 file. Parameters ---------- hdf_filename : str Output HDF filename. If file exists already then will be deleted. database_username, database_password : str periods_to_load : dict of tuples, optional Key of dict is the building number (int). Values are (<start date>, <end date>) e.g. ("2013-04-01", None) or ("2013-04-01", "2013-08-01") defaults to all buildings and all date ranges """ # wiki-energy database settings database_host = "db.wiki-energy.org" database_name = "postgres" database_schema = "PecanStreet_SharedData" # try to connect to database try: conn = db.connect( "host=" + database_host + " dbname=" + database_name + " user=" + database_username + " password=" + database_password ) except: print("Could not connect to remote database") raise # set up a new HDF5 datastore (overwrites existing store) store = pd.HDFStore(hdf_filename, "w", complevel=9, complib="zlib") # remove existing building yaml files in module dir for f in os.listdir(join(_get_module_directory(), "metadata")): if re.search("^building", f): os.remove(join(_get_module_directory(), "metadata", f)) # get tables in database schema sql_query = ( "SELECT TABLE_NAME" + " FROM INFORMATION_SCHEMA.TABLES" + " WHERE TABLE_TYPE = 'BASE TABLE'" + " AND TABLE_SCHEMA='" + database_schema + "'" + " ORDER BY TABLE_NAME" ) database_tables = pd.read_sql(sql_query, conn)["table_name"].tolist() # if user has specified buildings if periods_to_load: buildings_to_load = list(periods_to_load.keys()) else: # get buildings present in all tables sql_query = "" for table in database_tables: sql_query = sql_query + "(SELECT DISTINCT dataid" + ' FROM "' + database_schema + '".' + table + ") UNION " sql_query = sql_query[:-7] sql_query = sql_query + " ORDER BY dataid" buildings_to_load = pd.read_sql(sql_query, conn)["dataid"].tolist() # for each user specified building or all buildings in database for building_id in buildings_to_load: print("Loading building {:d} @ {}".format(building_id, datetime.datetime.now())) sys.stdout.flush() # create new list of chunks for concatenating later dataframe_list = [] # for each table of 1 month data for database_table in database_tables: print(" Loading table {:s}".format(database_table)) sys.stdout.flush() # get buildings present in this table sql_query = ( "SELECT DISTINCT dataid" + ' FROM "' + database_schema + '".' + database_table + " ORDER BY dataid" ) buildings_in_table = pd.read_sql(sql_query, conn)["dataid"].tolist() if building_id in buildings_in_table: # get first and last timestamps for this house in this table sql_query = ( "SELECT MIN(localminute) AS minlocalminute," + " MAX(localminute) AS maxlocalminute" + ' FROM "' + database_schema + '".' 
+ database_table + " WHERE dataid=" + str(building_id) ) range = pd.read_sql(sql_query, conn) first_timestamp_in_table = range["minlocalminute"][0] last_timestamp_in_table = range["maxlocalminute"][0] # get requested start and end and localize them requested_start = None requested_end = None database_timezone = "US/Central" if periods_to_load: if periods_to_load[building_id][0]: requested_start = pd.Timestamp(periods_to_load[building_id][0]) requested_start = requested_start.tz_localize(database_timezone) if periods_to_load[building_id][1]: requested_end = pd.Timestamp(periods_to_load[building_id][1]) requested_end = requested_end.tz_localize(database_timezone) # check user start is not after end if requested_start and requested_end and requested_start > requested_end: print("requested end is before requested start") sys.stdout.flush() else: # clip data to smallest range if requested_start: start = max(requested_start, first_timestamp_in_table) else: start = first_timestamp_in_table if requested_end: end = min(requested_end, last_timestamp_in_table) else: end = last_timestamp_in_table # download data in chunks chunk_start = start chunk_size = datetime.timedelta(1) # 1 day while chunk_start < end: chunk_end = chunk_start + chunk_size if chunk_end > end: chunk_end = end # subtract 1 second so end is exclusive chunk_end = chunk_end - datetime.timedelta(0, 1) # query power data for all channels format = "%Y-%m-%d %H:%M:%S" sql_query = ( "SELECT *" + ' FROM "' + database_schema + '".' + database_table + " WHERE dataid=" + str(building_id) + " and localminute between " + "'" + chunk_start.strftime(format) + "'" + " and " + "'" + chunk_end.strftime(format) + "'" + " LIMIT 2000" ) chunk_dataframe = pd.read_sql(sql_query, conn) # nilmtk requires building indices to start at 1 nilmtk_building_id = buildings_to_load.index(building_id) + 1 # convert to nilmtk-df and save to disk nilmtk_dataframe = _wikienergy_dataframe_to_hdf( chunk_dataframe, store, nilmtk_building_id, building_id ) # print progress print( " " + str(chunk_start) + " -> " + str(chunk_end) + ": " + str(len(chunk_dataframe.index)) + " rows" ) sys.stdout.flush() # append all chunks into list for csv writing # dataframe_list.append(chunk_dataframe) # move on to next chunk chunk_start = chunk_start + chunk_size # saves all chunks in list to csv # if len(dataframe_list) > 0: # dataframe_concat = pd.concat(dataframe_list) # dataframe_concat.to_csv(output_directory + str(building_id) + '.csv') store.close() conn.close() # write yaml to hdf5 # dataset.yaml and meter_devices.yaml are static, building<x>.yaml are dynamic convert_yaml_to_hdf5(join(_get_module_directory(), "metadata"), hdf_filename)
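# Usage sketch (illustrative only): hypothetical credentials and a single building/date range,
# mirroring the Dataport example above.
download_wikienergy('my_username', 'my_password', '/data/wikienergy.h5',
                    periods_to_load={26: ("2014-04-01", "2014-05-01")})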
def convert_greend(greend_path, hdf_filename): """ Parameters ---------- greend_path : str The root path of the greend dataset. hdf_filename : str The destination HDF5 filename (including path and suffix). """ store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib') houses = sorted(__get_houses(greend_path)) print(houses) h = 1 # nilmtk counts buildings from 1 not from 0 as we do, so everything is shifted by 1 for house in houses: print('loading ' + house) abs_house = join(greend_path, house) dates = [d for d in listdir(abs_house) if d.startswith('dataset')] house_data = [] for date in dates: print('-----------------------', date) try: tmp_pandas = pd.read_csv(join(abs_house, date), na_values=['na'], error_bad_lines=False) except: # A ParserError/ValueError is returned for malformed files (irregular column number) pass # for building0 either remove the first days (with less nodes) or use __preprocess_file #import StringIO as sio #tmp_pandas = pd.DataFrame.from_csv(sio.StringIO(__preprocess_file(abs_house, date))) # if the timestamp is not correctly parsed then it's an object dtype (string), else a float64 if tmp_pandas.timestamp.dtype != np.float64: tmp_pandas = tmp_pandas[tmp_pandas.timestamp != 'timestamp'] # remove all error rows # use the cleaned column as the index tmp_pandas.index = tmp_pandas["timestamp"].apply( pd.to_numeric, errors='ignore').values tmp_pandas = tmp_pandas.drop( 'timestamp', 1 ) # remove timestamp from the columns (it's the index already) tmp_pandas = tmp_pandas.astype( "float32") # convert everything back to float32 # convert the index to datetime tmp_pandas.index = pd.to_datetime(tmp_pandas.index, unit='s') tmp_pandas = tmp_pandas.tz_localize("UTC").tz_convert("CET") tmp_pandas = tmp_pandas.drop_duplicates() #tmp_pandas = tmp_pandas.sort_index() house_data.append(tmp_pandas) overall_df = pd.concat(house_data) overall_df = overall_df.drop_duplicates() overall_df = overall_df.sort_index() m = 1 for column in overall_df.columns: print("meter" + str(m) + ': ' + column) key = Key(building=h, meter=m) print("Putting into store...") store.put(str(key), overall_df[column], format='table') m += 1 print('Flushing store...') store.flush() h += 1 store.close() # retrieve the dataset metadata in the metadata subfolder import inspect convert_yaml_to_hdf5( dirname(inspect.getfile(convert_greend)) + '/metadata/', hdf_filename)
def convert_eco(dataset_loc, hdf_filename, timezone): """ Parameters: ----------- dataset_loc: str The root directory where the dataset is located. hdf_filename: str The location where the hdf_filename is present. The directory location has to contain the hdf5file name for the converter to work. timezone: str specifies the timezone of the dataset. """ # Creating a new HDF File store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='blosc') check_directory_exists(dataset_loc) directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i] directory_list.sort() print(directory_list) found_any_sm = False found_any_plug = False # Traversing every folder for folder in directory_list: if folder[0] == '.' or folder[-3:] == '.h5': print('Skipping ', folder) continue #Building number and meter_flag building_no = int(folder[:2]) meter_flag = None if 'sm_csv' in folder: meter_flag = 'sm' elif 'plugs' in folder: meter_flag = 'plugs' else: print('Skipping folder', folder) continue print('Computing for folder', folder) dir_list = [ i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc, folder, i)) ] dir_list.sort() if meter_flag == 'plugs' and len(dir_list) < 3: # Try harder to find the subfolders folder = join(folder, folder[:2]) dir_list = [ i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc, folder, i)) ] print('Current dir list:', dir_list) for fl in dir_list: print('Computing for folder ', fl) fl_dir_list = [ i for i in listdir(join(dataset_loc, folder, fl)) if '.csv' in i ] fl_dir_list.sort() if meter_flag == 'sm': for fi in fl_dir_list: found_any_sm = True df = pd.read_csv(join(dataset_loc, folder, fl, fi), names=[i for i in range(1, 17)], dtype=np.float32) # SmartMeter for phase in range(1, 4): key = str(Key(building=building_no, meter=phase)) df_phase = df.loc[:, [ 1 + phase, 5 + phase, 8 + phase, 13 + phase ]] # get reactive power power = df_phase.loc[:, (1 + phase, 13 + phase)].values reactive = power[:, 0] * np.tan( power[:, 1] * np.pi / 180) df_phase['Q'] = reactive df_phase.index = pd.DatetimeIndex(start=fi[:-4], freq='s', periods=86400, tz='GMT') df_phase = df_phase.tz_convert(timezone) sm_column_name = { 1 + phase: ('power', 'active'), 5 + phase: ('current', ''), 8 + phase: ('voltage', ''), 13 + phase: ('phase_angle', ''), 'Q': ('power', 'reactive'), } df_phase.columns = pd.MultiIndex.from_tuples( [sm_column_name[col] for col in df_phase.columns]) power_active = df_phase['power', 'active'] tmp_before = np.size(power_active) df_phase = df_phase[power_active != -1] power_active = df_phase['power', 'active'] tmp_after = np.size(power_active) if tmp_before != tmp_after: print( 'Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after)) df_phase.columns.set_names(LEVEL_NAMES, inplace=True) if not key in store: store.put(key, df_phase, format='Table') else: store.append(key, df_phase, format='Table') store.flush() print('Building', building_no, ', Meter no.', phase, '=> Done for ', fi[:-4]) # Plug-level channels are also converted to meters, but appliances are attached to them directly else: #Meter number to be used in key meter_num = int(fl) + 3 key = str(Key(building=building_no, meter=meter_num)) current_folder = join(dataset_loc, folder, fl) if not fl_dir_list: raise RuntimeError("No CSV file found in " + current_folder) #Getting dataframe for each csv file separately for fi in fl_dir_list: found_any_plug = True df = pd.read_csv(join(current_folder, fi), names=[1], dtype=np.float64) df.index = 
pd.DatetimeIndex(start=fi[:-4].replace( '.', ':'), freq='s', periods=86400, tz='GMT') df.columns = pd.MultiIndex.from_tuples( plugs_column_name.values()) df = df.tz_convert(timezone) df.columns.set_names(LEVEL_NAMES, inplace=True) # Check whether measurements removed tmp_before = np.size(df.power.active) df = df[df.power.active != -1] tmp_after = np.size(df.power.active) if (tmp_before != tmp_after): print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after)) # If table not present in hdf5, create or else append to existing data if not key in store: store.put(key, df, format='Table') print('Building', building_no, ', Meter no.', meter_num, '=> Done for ', fi[:-4]) else: store.append(key, df, format='Table') store.flush() print('Building', building_no, ', Meter no.', meter_num, '=> Done for ', fi[:-4]) if not found_any_plug or not found_any_sm: raise RuntimeError( 'The files were not found! Please check the folder structure. Extract each ZIP file into a folder with its base name (e.g. extract "01_plugs_csv.zip" into a folder named "01_plugs_csv", etc.)' ) print("Data storage completed.") store.close() # Adding the metadata to the HDF5file print("Proceeding to Metadata conversion...") meta_path = join(get_module_directory(), 'dataset_converters', 'eco', 'metadata') convert_yaml_to_hdf5(meta_path, hdf_filename) print("Completed Metadata conversion.")
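# Assumed module-level constants used by the ECO converters above (sketch only; check the
# actual converter module for the authoritative definitions).
LEVEL_NAMES = ['physical_quantity', 'type']    # MultiIndex level names for measurement columns
plugs_column_name = {1: ('power', 'active')}   # plug CSVs carry a single active-power column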