def convert_redd(redd_path, output_filename, format='HDF'): """ Parameters ---------- redd_path : str The root path of the REDD low_freq dataset. output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' """ def _redd_measurement_mapping_func(house_id, chan_id): ac_type = 'apparent' if chan_id <= 2 else 'active' return [('power', ac_type)] # Open DataStore store = get_datastore(output_filename, format, mode='w') # Convert raw data to DataStore _convert(redd_path, store, _redd_measurement_mapping_func, 'US/Eastern') # Add metadata save_yaml_to_datastore( join(get_module_directory(), 'dataset_converters', 'redd', 'metadata'), store) store.close() print("Done converting REDD to HDF5!")
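A minimal usage sketch (not part of the converter module itself): assuming nilmtk is installed and the REDD low_freq archive has been extracted locally, the converter can be called and the resulting store opened with nilmtk's DataSet. The paths are placeholders.

from nilmtk.dataset_converters import convert_redd
from nilmtk import DataSet

# Convert the raw REDD low_freq CSVs into a single HDF5 store (placeholder paths).
convert_redd('/data/redd/low_freq', '/data/redd.h5')

# Open the converted store and inspect the meters of house 1.
redd = DataSet('/data/redd.h5')
print(redd.buildings[1].elec)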
def convert_refit(input_path, output_filename, format='HDF'): """ Parameters ---------- input_path : str The root path of the CSV files, i.e. the directory containing House1.csv etc. output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' """ # Open DataStore store = get_datastore(output_filename, format, mode='w') # Convert raw data to DataStore _convert(input_path, store, 'Europe/London') # Add metadata save_yaml_to_datastore( join(get_module_directory(), 'dataset_converters', 'refit', 'metadata'), store) store.close() print("Done converting REFIT to HDF5!")
def convert_ukdale(ukdale_path, hdf_filename): """ Parameters ---------- ukdale_path : str The root path of the UK-DALE dataset. hdf_filename : str The destination HDF5 filename (including path and suffix). """ def _ukdale_measurement_mapping_func(house_id, chan_id): # TODO: This needs updating. It's wrong! ac_type = 'apparent' if chan_id <= 2 else 'active' return [('power', ac_type)] _convert(ukdale_path, hdf_filename, _ukdale_measurement_mapping_func, 'Europe/London') # Add metadata convert_yaml_to_hdf5(join(get_module_directory(), 'dataset_converters', 'ukdale', 'metadata'), hdf_filename) print("Done converting UK-DALE to HDF5!")
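To make the channel mapping concrete, here is an illustrative standalone copy of the helper above and the tuples it yields for a mains channel versus a submetered channel (the TODO above notes the mapping itself may need revisiting):

def ukdale_mapping(chan_id):
    # mirrors _ukdale_measurement_mapping_func above (house_id is unused there)
    ac_type = 'apparent' if chan_id <= 2 else 'active'
    return [('power', ac_type)]

assert ukdale_mapping(1) == [('power', 'apparent')]   # mains channel
assert ukdale_mapping(5) == [('power', 'active')]     # submetered channel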
def convert_lab(lab_path, output_filename, format='HDF'): """ Parameters ---------- lab_path : str The root path of the LAB dataset. output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' """ # Originally this was `chan_id <= 2`, with ac_type = 'apparent' for the first channels def _lab_measurement_mapping_func(house_id, chan_id): ac_type = 'active' return [('power', ac_type)] # Open DataStore store = get_datastore(output_filename, format, mode='w') # Convert raw data to DataStore _convert(lab_path, store, _lab_measurement_mapping_func, 'America/Fortaleza') # Add metadata save_yaml_to_datastore( join(get_module_directory(), 'dataset_converters', 'lab', 'metadata'), store) store.close() print("Done converting LAB to HDF5!")
def convert_deddiag(connection, output_filename, format='HDF', start_date=DEFAULT_START_DATE, end_date=DEFAULT_END_DATE, tz=DEFAULT_TZ): """ Parameters ---------- connection: Connection Connection to the DEDDIAG database Example: connection = Connection(host="localhost", port="5432", db_name="postgres", user="******", password="******") output_filename : str The destination filename including path and suffix Example: ./data/deddiag.h5 format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' start_date, end_date : str Start and end of the period to convert. Default to DEFAULT_START_DATE and DEFAULT_END_DATE. tz : str Timezone of the dataset. Defaults to DEFAULT_TZ. """ # Open DataStore # TODO: wrap in try/except dest_file = get_datastore(output_filename, format, mode='w') # Convert raw data to DataStore _convert(connection, dest_file, start_date, end_date, tz) path_to_metadata = join(get_module_directory(), 'dataset_converters', 'deddiag', 'metadata') # Add metadata save_yaml_to_datastore(path_to_metadata, dest_file) dest_file.close() print("Done converting DEDDIAG to HDF5!")
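A hedged call sketch based on the docstring above: Connection is assumed to come from the deddiag-loader helper package (adjust the import to wherever Connection is defined in your setup), and the credentials, dates and timezone are placeholders.

from deddiag_loader import Connection  # assumed import; adjust to your environment

# Placeholder credentials and date range.
connection = Connection(host="localhost", port="5432", db_name="postgres",
                        user="deddiag_user", password="secret")
convert_deddiag(connection, "./data/deddiag.h5",
                start_date="2017-01-01", end_date="2017-12-31", tz="Europe/Berlin")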
def convert_alva(alva_path, output_filename, format='HDF'): """ Parameters ---------- alva_path : str The root path of the alva low_freq dataset. output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' """ def _alva_measurement_mapping_func(house_id, chan_id): ac_type = 'apparent' if chan_id <= 2 else 'active' return [('power', ac_type)] # Open DataStore store = get_datastore(output_filename, format, mode='w') # Convert raw data to DataStore _convert(alva_path, store, _alva_measurement_mapping_func, 'US/Eastern') # Add metadata save_yaml_to_datastore(join(get_module_directory(), 'dataset_converters', 'alva', 'metadata'), store) store.close() print("Done converting alva to HDF5!")
def convert_ideal(ideal_path, output_filename, format='HDF'): """ Convert the IDEAL dataset to NILMTK HDF5 format. From https://datashare.ed.ac.uk/handle/10283/3647 download the following zips: - household_sensors.zip (14.77 GB). - room_and_appliance_sensors.zip (9.317 GB). Both zips contain a folder called "sensordata". Create a new folder, e.g. called "ideal_dataset", and into it: - Extract the folder "household_sensors.zip/sensordata" with the name household_sensordata - Extract the folder "room_and_appliance_sensors.zip/sensordata" with the name rooms_appliance_sensordata Then run the function convert_ideal with ideal_path="ideal_dataset". Parameters ---------- ideal_path : str The root path of the ideal low_freq dataset. output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' """ def _ideal_measurement_mapping_func(house_id, chan_id, category_id): if (category_id == "electric-appliance"): ac_type = 'active' return [('power', ac_type)] else: ac_type = 'apparent' return [('power', ac_type)] # Open DataStore store = get_datastore(output_filename, format, mode='w') # household_sensordata contains mains readings # rooms_appliance_sensordata contains appliance readings folders = [] for root, dirs, files in os.walk(ideal_path): for folder in dirs: if (folder == "household_sensordata" or folder == "rooms_appliance_sensordata"): folders.append(folder) # valid_home_id are home ids which contain both mains and appliance readings valid_home_id = mains_plus_appliance_home_id(ideal_path, folders) for folder in folders: input_path = join(ideal_path, folder) # Convert raw data to DataStore _convert(input_path, store, _ideal_measurement_mapping_func, 'Europe/London', valid_home_id) metadata_path = join(get_module_directory(), 'dataset_converters', 'ideal', 'metadata') # Add metadata save_yaml_to_datastore(metadata_path, store) store.close() print("Done converting ideal to HDF5!")
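A short sketch of the folder preparation described in the docstring, with hypothetical local paths; the exact download and extraction steps will vary by platform.

import os

# Hypothetical layout after extracting the two zips as described above:
# ideal_dataset/
#     household_sensordata/          <- mains readings
#     rooms_appliance_sensordata/    <- appliance readings
ideal_path = os.path.expanduser("~/ideal_dataset")

convert_ideal(ideal_path, os.path.join(ideal_path, "ideal.h5"))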
def convert_ampds(input_path, output_filename, format_='HDF'): """ Convert AMPds R2013 as seen on Dataverse. Download the files as CSVs and put them in the `input_path` folder for conversion. Download URL: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/MXB7VO Parameters: ----------- input_path: str The path of the directory where all the csv files are supposed to be stored output_filename: str The path of the h5 file where all the standardized data is supposed to go. The path should refer to a particular file and not just a random directory in order for this to work. format_: str Defaults to 'HDF' Example usage: -------------- convert_ampds('/AMPds/electricity', 'store.h5') """ check_directory_exists(input_path) files = [f for f in listdir(input_path) if isfile(join(input_path, f)) and '.csv' in f and '.swp' not in f] # Sort lexicographically files.sort() # Remove Whole Home and put it at top files.remove("WHE.csv") files.insert(0, "WHE.csv") assert isdir(input_path) store = get_datastore(output_filename, format_, mode='w') for i, csv_file in enumerate(files): key = Key(building=1, meter=(i + 1)) print('Loading file #', (i + 1), ' : ', csv_file, '. Please wait...') df = pd.read_csv(join(input_path, csv_file)) # Due to fixed width, column names have spaces :( df.columns = [x.replace(" ", "") for x in df.columns] df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME], unit='s', utc=True) df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df = df.tz_convert(TIMEZONE) df.columns = pd.MultiIndex.from_tuples( [columnNameMapping[x] for x in df.columns], names=LEVEL_NAMES ) df = df.apply(pd.to_numeric, errors='ignore') df = df.dropna() df = df.astype(np.float32) store.put(str(key), df) print("Done with file #", (i + 1)) store.close() metadata_path = join(get_module_directory(), 'dataset_converters', 'ampds', 'metadata') print('Processing metadata...') convert_yaml_to_hdf5(metadata_path, output_filename)
def convert_combed(combed_path, output_filename, format='HDF'): """ Parameters ---------- combed_path : str The root path of the combed dataset. output_filename : str The destination HDF5 filename (including path and suffix). """ check_directory_exists(combed_path) # Open store store = get_datastore(output_filename, format, mode='w') any_file_converted = False for building_name, building_mapping in iteritems(overall_dataset_mapping): for load_name, load_mapping in iteritems(building_mapping): for load_mapping_path, meter_number in iteritems(load_mapping): building_number = building_number_mapping[building_name] key = Key(building=building_number, meter=meter_number) dfs = [] for attribute in column_mapping.keys(): filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" %attribute) if not os.path.isfile(filename_attribute): # File not found directly in the combed_path provided # Try adding 'iiitd' to it filename_attribute = join(combed_path, 'iiitd', building_name, load_name, load_mapping_path, "%s.csv" %attribute) if os.path.isfile(filename_attribute): exists = True print(filename_attribute) df = pd.read_csv(filename_attribute, names=["timestamp", attribute]) df.index = pd.to_datetime(df["timestamp"], unit='ms') df = df.drop("timestamp", 1) dfs.append(df) else: exists = False if exists: total = pd.concat(dfs, axis=1) total = total.tz_localize('UTC').tz_convert('Asia/Kolkata') total.columns = pd.MultiIndex.from_tuples([column_mapping[x] for x in total.columns]) total.columns.set_names(LEVEL_NAMES, inplace=True) assert total.index.is_unique store.put(str(key), total) any_file_converted = True if not any_file_converted: raise RuntimeError('No files converted, did you specify the correct path?') convert_yaml_to_hdf5( join(get_module_directory(), 'dataset_converters', 'combed', 'metadata'), output_filename ) print("Done converting COMBED to HDF5!")
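A hedged usage sketch; the directory layout shown in the comment follows the file lookup performed above (including the optional 'iiitd' subfolder), and the paths are placeholders.

# Layout assumed by the converter (it also retries under an 'iiitd' subfolder):
#   <combed_path>/<building_name>/<load_name>/<load_path>/<attribute>.csv
convert_combed('/data/combed', '/data/combed.h5')   # placeholder paths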
def convert_ampds(input_path, output_filename, format='HDF'): """ Convert AMPds R2013 as seen on Dataverse. Download the files as CSVs and put them in the `input_path` folder for conversion. Download URL: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/MXB7VO Parameters: ----------- input_path: str The path of the directory where all the csv files are supposed to be stored output_filename: str The path of the h5 file where all the standardized data is supposed to go. The path should refer to a particular file and not just a random directory in order for this to work. format: str Defaults to 'HDF' Example usage: -------------- convert_ampds('/AMPds/electricity', 'store.h5') """ check_directory_exists(input_path) files = [f for f in listdir(input_path) if isfile(join(input_path, f)) and '.csv' in f and '.swp' not in f] # Sort lexicographically files.sort() # Remove Whole Home and put it at top files.remove("WHE.csv") files.insert(0, "WHE.csv") assert isdir(input_path) store = get_datastore(output_filename, format, mode='w') for i, csv_file in enumerate(files): key = Key(building=1, meter=(i + 1)) print('Loading file #', (i + 1), ' : ', csv_file, '. Please wait...') df = pd.read_csv(join(input_path, csv_file)) # Due to fixed width, column names have spaces :( df.columns = [x.replace(" ", "") for x in df.columns] df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME], unit='s', utc=True) df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df = df.tz_convert(TIMEZONE) df.rename(columns=lambda x: columnNameMapping[x], inplace=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.apply(pd.to_numeric, errors='ignore') df = df.dropna() df = df.astype(np.float32) store.put(str(key), df) print("Done with file #", (i + 1)) store.close() metadata_path = join(get_module_directory(), 'dataset_converters', 'ampds', 'metadata') print('Processing metadata...') convert_yaml_to_hdf5(metadata_path, output_filename)
def convert_unifei(redd_path, output_filename, format='HDF'): """ Parameters ---------- redd_path : str The root path of the UNIFEI dataset (stored in REDD low_freq format). output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' """ def _redd_measurement_mapping_func(house_id, chan_id): ac_type = 'active' return [('power', ac_type)] # Open DataStore store = get_datastore(output_filename, format, mode='w') # Convert raw data to DataStore _convert(redd_path, store, _redd_measurement_mapping_func, 'America/Sao_Paulo') print("Done converting raw data...") # The metadata directory path must be set here print(get_module_directory()) s = join(get_module_directory(), 'dataset_converters', 'unifei', 'metadata') print(s) # Add metadata # The correct metadata path must also be used here save_yaml_to_datastore(join(get_module_directory(), 'dataset_converters', 'unifei', 'metadata'), store) store.close() print("Done converting UNIFEI to HDF5!")
def convert_iawe(iawe_path, output_filename, format="HDF"): """ Parameters ---------- iawe_path : str The root path of the iawe dataset. output_filename : str The destination filename (including path and suffix). """ check_directory_exists(iawe_path) idx = pd.DatetimeIndex(start=START_DATETIME, end=END_DATETIME, freq=FREQ) idx = idx.tz_localize('GMT').tz_convert(TIMEZONE) # Open data store store = get_datastore(output_filename, format, mode='w') electricity_path = join(iawe_path, "electricity") # Mains data for chan in range(1, 12): key = Key(building=1, meter=chan) filename = join(electricity_path, "%d.csv" % chan) print('Loading ', chan) df = pd.read_csv(filename, dtype=np.float64, na_values='\\N') df.drop_duplicates(subset=["timestamp"], inplace=True) df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True) df = df.tz_convert(TIMEZONE) df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df.columns = pd.MultiIndex.from_tuples( [column_mapping[x] for x in df.columns], names=LEVEL_NAMES) df = df.apply(pd.to_numeric, errors='ignore') df = df.dropna() df = df.astype(np.float32) df = df.sort_index() df = df.resample("1T").mean() df = reindex_fill_na(df, idx) assert df.isnull().sum().sum() == 0 store.put(str(key), df) store.close() metadata_dir = join(get_module_directory(), 'dataset_converters', 'iawe', 'metadata') convert_yaml_to_hdf5(metadata_dir, output_filename) print("Done converting iAWE to HDF5!")
def convert_iawe(iawe_path, output_filename, format="HDF"): """ Parameters ---------- iawe_path : str The root path of the iawe dataset. output_filename : str The destination filename (including path and suffix). """ check_directory_exists(iawe_path) idx = pd.DatetimeIndex(start=START_DATETIME, end=END_DATETIME, freq=FREQ) idx = idx.tz_localize('GMT').tz_convert(TIMEZONE) # Open data store store = get_datastore(output_filename, format, mode='w') electricity_path = join(iawe_path, "electricity") # Mains data for chan in range(1, 12): key = Key(building=1, meter=chan) filename = join(electricity_path, "%d.csv" % chan) print('Loading ', chan) df = pd.read_csv(filename, dtype=np.float64, na_values='\\N') df.drop_duplicates(subset=["timestamp"], inplace=True) df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True) df = df.tz_convert(TIMEZONE) df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df.rename(columns=lambda x: column_mapping[x], inplace=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.apply(pd.to_numeric, errors='ignore') df = df.dropna() df = df.astype(np.float32) df = df.sort_index() df = df.resample("1T").mean() df = reindex_fill_na(df, idx) assert df.isnull().sum().sum() == 0 store.put(str(key), df) store.close() metadata_dir = join(get_module_directory(), 'dataset_converters', 'iawe', 'metadata') convert_yaml_to_hdf5(metadata_dir, output_filename) print("Done converting iAWE to HDF5!")
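A hedged usage sketch: per the loop above, the converter expects the raw electricity CSVs at <iawe_path>/electricity/1.csv through 11.csv. The paths are placeholders.

# Assumes the raw files live at /data/iawe/electricity/1.csv ... 11.csv
convert_iawe('/data/iawe', '/data/iawe.h5')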
def convert_redd(redd_path, hdf_filename): """ Parameters ---------- redd_path : str The root path of the REDD low_freq dataset. hdf_filename : str The destination HDF5 filename (including path and suffix). """ def _redd_measurement_mapping_func(house_id, chan_id): ac_type = 'apparent' if chan_id <= 2 else 'active' return [('power', ac_type)] _convert(redd_path, hdf_filename, _redd_measurement_mapping_func, 'US/Eastern') # Add metadata convert_yaml_to_hdf5(join(get_module_directory(), 'dataset_converters', 'redd', 'metadata'), hdf_filename) print("Done converting REDD to HDF5!")
def convert_eco(dataset_loc, hdf_filename, timezone): """ Parameters: ----------- dataset_loc: str The root directory where the dataset is located. hdf_filename: str The destination HDF5 filename (including path and suffix). timezone: str specifies the timezone of the dataset. """ # Creating a new HDF File store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='blosc') check_directory_exists(dataset_loc) directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i] directory_list.sort() print(directory_list) found_any_sm = False found_any_plug = False # Traversing every folder for folder in directory_list: if folder[0] == '.' or folder[-3:] == '.h5': print('Skipping ', folder) continue #Building number and meter_flag building_no = int(folder[:2]) meter_flag = None if 'sm_csv' in folder: meter_flag = 'sm' elif 'plugs' in folder: meter_flag = 'plugs' else: print('Skipping folder', folder) continue print('Computing for folder', folder) dir_list = [i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc,folder,i))] dir_list.sort() if meter_flag == 'plugs' and len(dir_list) < 3: # Try harder to find the subfolders folder = join(folder, folder[:2]) dir_list = [i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc,folder,i))] print('Current dir list:', dir_list) for fl in dir_list: print('Computing for folder ', fl) fl_dir_list = [i for i in listdir(join(dataset_loc,folder,fl)) if '.csv' in i] fl_dir_list.sort() if meter_flag == 'sm': for fi in fl_dir_list: found_any_sm = True df = pd.read_csv(join(dataset_loc,folder,fl,fi), names=[i for i in range(1,17)], dtype=np.float32) for phase in range(1,4): key = str(Key(building=building_no, meter=phase)) df_phase = df.loc[:,[1+phase, 5+phase, 8+phase, 13+phase]] # get reactive power power = df_phase.loc[:, (1+phase, 13+phase)].values reactive = power[:,0] * np.tan(power[:,1] * np.pi / 180) df_phase['Q'] = reactive df_phase.index = pd.DatetimeIndex(start=fi[:-4], freq='s', periods=86400, tz='GMT') df_phase = df_phase.tz_convert(timezone) sm_column_name = { 1+phase:('power', 'active'), 5+phase:('current', ''), 8+phase:('voltage', ''), 13+phase:('phase_angle', ''), 'Q': ('power', 'reactive'), } df_phase.columns = pd.MultiIndex.from_tuples([ sm_column_name[col] for col in df_phase.columns ]) power_active = df_phase['power', 'active'] tmp_before = np.size(power_active) df_phase = df_phase[power_active != -1] power_active = df_phase['power', 'active'] tmp_after = np.size(power_active) if tmp_before != tmp_after: print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after)) df_phase.columns.set_names(LEVEL_NAMES, inplace=True) if not key in store: store.put(key, df_phase, format='Table') else: store.append(key, df_phase, format='Table') store.flush() print('Building', building_no, ', Meter no.', phase, '=> Done for ', fi[:-4]) else: #Meter number to be used in key meter_num = int(fl) + 3 key = str(Key(building=building_no, meter=meter_num)) current_folder = join(dataset_loc,folder,fl) if not fl_dir_list: raise RuntimeError("No CSV file found in " + current_folder) #Getting dataframe for each csv file separately for fi in fl_dir_list: found_any_plug = True df = pd.read_csv(join(current_folder, fi), names=[1], dtype=np.float64) df.index = pd.DatetimeIndex(start=fi[:-4].replace('.', ':'), freq='s', periods=86400, tz = 'GMT') df.columns = pd.MultiIndex.from_tuples(plugs_column_name.values()) df = df.tz_convert(timezone) df.columns.set_names(LEVEL_NAMES, inplace=True) tmp_before = np.size(df.power.active) df = df[df.power.active != -1] tmp_after = np.size(df.power.active) if (tmp_before != tmp_after): print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after)) # If table not present in hdf5, create or else append to existing data if not key in store: store.put(key, df, format='Table') print('Building',building_no,', Meter no.',meter_num,'=> Done for ',fi[:-4]) else: store.append(key, df, format='Table') store.flush() print('Building',building_no,', Meter no.',meter_num,'=> Done for ',fi[:-4]) if not found_any_plug or not found_any_sm: raise RuntimeError('The files were not found! Please check the folder structure. Extract each ZIP file into a folder with its base name (e.g. extract "01_plugs_csv.zip" into a folder named "01_plugs_csv", etc.)') print("Data storage completed.") store.close() # Adding the metadata to the HDF5 file print("Proceeding to Metadata conversion...") meta_path = join( get_module_directory(), 'dataset_converters', 'eco', 'metadata' ) convert_yaml_to_hdf5(meta_path, hdf_filename) print("Completed Metadata conversion.")
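A hedged call sketch for the ECO converter; the paths are placeholders and the timezone string is an assumption (any IANA/pytz name accepted by tz_convert works).

convert_eco('/data/eco', '/data/eco.h5', 'Europe/Zurich')   # placeholder paths; tz assumed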
def convert_hes(data_dir, output_filename, format='HDF', max_chunks=None): metadata = { 'name': 'HES', 'geographic_coordinates': (51.464462, -0.076544), # London 'timezone': 'Europe/London' } # Open DataStore store = get_datastore(output_filename, format, mode='w') # load list of appliances hes_to_nilmtk_appliance_lookup = pd.read_csv( join(get_module_directory(), 'dataset_converters', 'hes', 'hes_to_nilmtk_appliance_lookup.csv')) # load list of houses hes_house_ids = load_list_of_house_ids(data_dir) nilmtk_house_ids = np.arange(1, len(hes_house_ids) + 1) hes_to_nilmtk_house_ids = dict(zip(hes_house_ids, nilmtk_house_ids)) # array of hes_house_codes: nilmtk_building_code = house_codes.index(hes_house_code) house_codes = [] # map house_appliance_codes = dict() # Create a temporary metadata dir original_metadata_dir = join(get_module_directory(), 'dataset_converters', 'hes', 'metadata') tmp_dir = tempfile.mkdtemp() metadata_dir = join(tmp_dir, 'metadata') shutil.copytree(original_metadata_dir, metadata_dir) print("Using temporary dir for metadata:", metadata_dir) # Iterate over files for filename in FILENAMES: # Load appliance energy data chunk-by-chunk full_filename = join(data_dir, filename) print('Loading', full_filename) try: reader = pd.read_csv(full_filename, names=COL_NAMES, index_col=False, chunksize=CHUNKSIZE) except IOError as e: print(e, file=stderr) continue # Iterate over chunks in file chunk_i = 0 for chunk in reader: if max_chunks is not None and chunk_i >= max_chunks: break print(' processing chunk', chunk_i, 'of', filename) # Convert date and time columns to np.datetime64 objects dt = chunk['date'] + ' ' + chunk['time'] del chunk['date'] del chunk['time'] chunk['datetime'] = pd.to_datetime(dt, format='%Y-%m-%d %H:%M:%S', utc=True) # Data is either tenths of a Wh or tenths of a degree chunk['data'] *= 10 chunk['data'] = chunk['data'].astype(np.float32) # Iterate over houses in chunk for hes_house_id, hes_house_id_df in chunk.groupby('house id'): if hes_house_id not in house_codes: house_codes.append(hes_house_id) if hes_house_id not in house_appliance_codes.keys(): house_appliance_codes[hes_house_id] = [] nilmtk_house_id = house_codes.index(hes_house_id) + 1 # Iterate over appliances in house for appliance_code, appliance_df in chunk.groupby( 'appliance code'): if appliance_code not in house_appliance_codes[ hes_house_id]: house_appliance_codes[hes_house_id].append( appliance_code) nilmtk_meter_id = house_appliance_codes[ hes_house_id].index(appliance_code) + 1 _process_meter_in_chunk(nilmtk_house_id, nilmtk_meter_id, hes_house_id_df, store, appliance_code) chunk_i += 1 print('houses with some data loaded:', house_appliance_codes.keys()) store.close() # generate building yaml metadata for hes_house_id in house_codes: nilmtk_building_id = house_codes.index(hes_house_id) + 1 building_metadata = {} building_metadata['instance'] = nilmtk_building_id building_metadata['original_name'] = int( hes_house_id) # use python int building_metadata['elec_meters'] = {} building_metadata['appliances'] = [] # initialise dict of instances of each appliance type instance_counter = {} for appliance_code in house_appliance_codes[hes_house_id]: nilmtk_meter_id = house_appliance_codes[hes_house_id].index( appliance_code) + 1 # meter metadata if appliance_code in MAINS_CODES: meter_metadata = { 'device_model': 'multivoies', 'site_meter': True } break elif appliance_code in CIRCUIT_CODES: meter_metadata = {'device_model': 'multivoies'} break elif appliance_code in TEMPERATURE_CODES: break else: # is appliance meter_metadata = {'device_model': 'wattmeter'} # only appliance meters at this point building_metadata['elec_meters'][nilmtk_meter_id] = meter_metadata # appliance metadata lookup_row = hes_to_nilmtk_appliance_lookup[ hes_to_nilmtk_appliance_lookup.Code == appliance_code].iloc[0] appliance_metadata = { 'original_name': lookup_row.Name, 'meters': [nilmtk_meter_id] } # appliance type appliance_metadata.update({'type': lookup_row.nilmtk_name}) # TODO appliance room # appliance instance number if instance_counter.get(lookup_row.nilmtk_name) == None: instance_counter[lookup_row.nilmtk_name] = 0 instance_counter[lookup_row.nilmtk_name] += 1 appliance_metadata['instance'] = instance_counter[ lookup_row.nilmtk_name] building_metadata['appliances'].append(appliance_metadata) building = 'building{:d}'.format(nilmtk_building_id) yaml_full_filename = join(metadata_dir, building + '.yaml') with open(yaml_full_filename, 'w') as outfile: #print(building_metadata) outfile.write(yaml.dump(building_metadata)) # write yaml metadata to hdf5 convert_yaml_to_hdf5(metadata_dir, output_filename) # remove the temporary dir when finished shutil.rmtree(tmp_dir)
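A hedged call sketch: max_chunks caps how many chunks are read from each CSV (per the loop above), which is handy for a quick test conversion. The paths are placeholders.

convert_hes('/data/hes', '/data/hes.h5', max_chunks=10)   # placeholder paths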
def convert_deps(deps_path, input_filename, output_filename, format='HDF'): """ Parameters ---------- deps_path : str The root path of the DEPS dataset. e.g. 'C:/data/deps' input_filename : str The rawdata filename (including path and suffix). e.g. 'C:/data/rawdata.csv' output_filename : str The destination HDF5 filename (including path and suffix). e.g. 'C:/data/deps/DEPS_data.h5' format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' Meters & Measurements : ---------- Measurement assignment (idMeasurement) in rawdata to REDD format Measurements id's Units Meters Name 14011 14012 --> W VAr --> Main_RST 14001 14007 14014 14017 --> V A W VAr --> Main_R 14002 14008 14015 14018 --> V A W VAr --> Main_S 14003 14009 14016 14019 --> V A W VAr --> Main_T 13001 --> W --> Lights_1 13002 --> W --> Lights_2 10003 10006 10014 10018 --> V A W VAr --> HVAC_1 10002 10005 10013 10017 --> V A W VAr --> HVAC_2 10001 10004 10012 10016 --> V A W VAr --> HVAC_4 21001 21002 21003 21005 --> V A W VAr --> Rack Example ---------- raw_data.csv (input_filename): -- idMeasurement, UNIX_timestamp(tStampUTC), dataValue 14011, 1583103600, 123 14012, 1583103600, -416 14011, 1583103601, 126 14012, 1583103601, -416 ... ... ... 14011, 1583535599, 121 14012, 1583535599, -411 Outputs REDD format: deps_path/classroom1/ : -- channel_1.dat: 1583103600 123 -416 1583103601 126 -416 ... ... ... 1583535599 121 -411 -- labels.dat: 1 Main_RST Output HDF5 file: output_filename.h5 """ #-------------------------------------------------------------------- # Written by Andrés Arias Silva # Raw data converter to REDD format extracted from DEPS SQL database _deps_to_redd_format(deps_path, input_filename) #-------------------------------------------------------------------- def _deps_measurement_mapping_func(classroom_id, chan_id): if chan_id == 1: meas = ([('power', 'active'), ('power', 'reactive')]) elif chan_id > 1 and chan_id <= 4: meas = ([('voltage', ''), ('current', ''), ('power', 'active'), ('power', 'reactive')]) elif chan_id > 4 and chan_id <= 6: meas = ([('power', 'active')]) elif chan_id > 6 and chan_id <= 10: meas = ([ ('voltage', ''), ('current', ''), ('power', 'active'), ('power', 'reactive'), ]) else: raise ValueError('incorrect channel number') return meas # Open DataStore store = get_datastore(output_filename, format, mode='w') # s=join(get_module_directory(), # 'dataset_converters', # 'deps', # 'metadata') # Add metadata save_yaml_to_datastore( join(get_module_directory(), 'dataset_converters', 'deps', 'metadata'), store) store.close() print("Done converting DEPS data to HDF5!")
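A call sketch using the example paths from the docstring above:

convert_deps('C:/data/deps', 'C:/data/rawdata.csv', 'C:/data/deps/DEPS_data.h5')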
def download_dataport(database_username, database_password, hdf_filename, database_schema='university', user_selected_table='electricity_egauge_minutes', periods_to_load=None): """ Downloads data from dataport database into an HDF5 file. Parameters ---------- hdf_filename : str Output HDF filename. If file exists already then will be deleted. database_username, database_password, database_schema, user_selected_table, hdf_filename : str periods_to_load : dict of tuples, optional Key of dict is the building number (int). Values are (<start date>, <end date>) e.g. ("2013-04-01", None) or ("2013-04-01", "2013-08-01") defaults to all buildings and all date ranges """ database_assert(user_selected_table) # dataport database settings database_host = 'dataport.pecanstreet.org' database_port = '5434' database_name = 'postgres' # try to connect to database try: conn = db.connect('host=' + database_host + ' port=' + database_port + ' dbname=' + database_name + ' user=' + database_username + ' password=' + database_password) except: print('Could not connect to remote database') raise # map user_selected_table and timestamp column timestamp_map = {"electricity_egauge_15min": "local_15min", "electricity_egauge_hours": "localhour", "electricity_egauge_minutes": "localminute", "electricity_egauge_seconds": "localminute"} # set up a new HDF5 datastore (overwrites existing store) store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib') # Create a temporary metadata dir, remove existing building # yaml files in module dir (if any) original_metadata_dir = join(get_module_directory(), 'dataset_converters', 'dataport', 'metadata') tmp_dir = tempfile.mkdtemp() metadata_dir = join(tmp_dir, 'metadata') shutil.copytree(original_metadata_dir, metadata_dir) print("Using temporary dir for metadata:", metadata_dir) for f in os.listdir(metadata_dir): if re.search('^building', f): os.remove(join(metadata_dir, f)) """ TODO: The section below can be altered or removed, since the restructured Dataport now has only one electricity_egauge_minutes table. """ # get tables in database schema sql_query = ("SELECT table_name" + " FROM information_schema.views" + " WHERE table_schema ='" + database_schema + "'" + " ORDER BY table_name") database_tables = pd.read_sql(sql_query, conn)['table_name'].tolist() database_tables = [t for t in database_tables if user_selected_table in t] # if user has specified buildings if periods_to_load: buildings_to_load = list(periods_to_load.keys()) else: # get buildings present in all tables sql_query = '' for table in database_tables: sql_query = (sql_query + '(SELECT DISTINCT dataid' + ' FROM "' + database_schema + '".' + table + ') UNION ') sql_query = sql_query[:-7] sql_query = (sql_query + ' ORDER BY dataid') buildings_to_load = pd.read_sql(sql_query, conn)['dataid'].tolist() # for each user specified building or all buildings in database for building_id in buildings_to_load: print("Loading building {:d} @ {}" .format(building_id, datetime.datetime.now())) sys.stdout.flush() # create new list of chunks for concatenating later dataframe_list = [] # for each table of 1 month data for database_table in database_tables: print(" Loading table {:s}".format(database_table)) sys.stdout.flush() # get buildings present in electricity_egauge_minutes table sql_query = ('SELECT DISTINCT dataid' + ' FROM university.metadata' + ' WHERE egauge_min_time IS NOT NULL' + ' ORDER BY dataid') buildings_in_table = pd.read_sql(sql_query, conn)['dataid'].tolist() if building_id in buildings_in_table: # get first and last timestamps for this # house in electricity_egauge_minutes table sql_query = ('SELECT MIN(egauge_min_time) AS minlocalminute,' + ' MAX(egauge_max_time) AS maxlocalminute' + ' FROM university.metadata' + ' WHERE dataid=' + str(building_id)) range = pd.read_sql(sql_query, conn) first_timestamp_in_table = range['minlocalminute'][0] last_timestamp_in_table = range['maxlocalminute'][0] # get requested start and end and localize them requested_start = None requested_end = None database_timezone = 'US/Central' if periods_to_load: if periods_to_load[building_id][0]: requested_start = pd.Timestamp(periods_to_load[building_id][0]) requested_start = requested_start.tz_localize(database_timezone) if periods_to_load[building_id][1]: requested_end = pd.Timestamp(periods_to_load[building_id][1]) requested_end = requested_end.tz_localize(database_timezone) # check user start is not after end if requested_start > requested_end: print('requested end is before requested start') sys.stdout.flush() else: # clip data to smallest range if requested_start: start = max(requested_start, first_timestamp_in_table) else: start = first_timestamp_in_table if requested_end: end = min(requested_end, last_timestamp_in_table) else: end = last_timestamp_in_table # download data in chunks chunk_start = start chunk_size = datetime.timedelta(10) # 10 days while chunk_start < end: chunk_end = chunk_start + chunk_size if chunk_end > end: chunk_end = end # subtract 1 second so end is exclusive chunk_end = chunk_end - datetime.timedelta(0, 1) # query power data for all channels format = '%Y-%m-%d %H:%M:%S' sql_query = ('SELECT *' + ' FROM "' + database_schema + '".' + user_selected_table + ' WHERE dataid=' + str(building_id) + ' and "' + timestamp_map[user_selected_table] + '" between ' + "'" + chunk_start.strftime(format) + "'" + " and " + "'" + chunk_end.strftime(format) + "' ORDER BY "+timestamp_map[user_selected_table] ) chunk_dataframe = pd.read_sql(sql_query, conn) # nilmtk requires building indices to start at 1 nilmtk_building_id = buildings_to_load.index(building_id) + 1 # convert to nilmtk-df and save to disk nilmtk_dataframe = _dataport_dataframe_to_hdf( chunk_dataframe, store, nilmtk_building_id, building_id, timestamp_map[user_selected_table], metadata_dir ) # print progress print(' ' + str(chunk_start) + ' -> ' + str(chunk_end) + ': ' + str(len(chunk_dataframe.index)) + ' rows') sys.stdout.flush() # append all chunks into list for csv writing # dataframe_list.append(chunk_dataframe) # move on to next chunk chunk_start = chunk_start + chunk_size # saves all chunks in list to csv # if len(dataframe_list) > 0: # dataframe_concat = pd.concat(dataframe_list) # dataframe_concat.to_csv(output_directory + str(building_id) + '.csv') store.close() conn.close() # write yaml to hdf5 # dataset.yaml and meter_devices.yaml are static, building<x>.yaml are dynamic convert_yaml_to_hdf5(metadata_dir, hdf_filename) # remove the temporary dir when finished shutil.rmtree(tmp_dir)
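A hedged call sketch grounded in the docstring: the credentials and building ids are placeholders, and periods_to_load maps a building id to a (start, end) pair of date strings.

periods = {
    26: ("2013-04-01", "2013-08-01"),   # placeholder building id, fixed window
    43: ("2013-04-01", None),           # placeholder building id, open-ended
}
download_dataport('my_username', 'my_password', '/data/dataport.h5',
                  periods_to_load=periods)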
def convert_greend(greend_path, hdf_filename, use_mp=True): """ Parameters ---------- greend_path : str The root path of the greend dataset. hdf_filename : str The destination HDF5 filename (including path and suffix). use_mp : bool Defaults to True. Use multiprocessing to load the files for each building. """ store = pd.HDFStore(hdf_filename, 'w', complevel=5, complib='zlib') houses = sorted(_get_houses(greend_path)) print('Houses found:', houses) if use_mp: pool = Pool() h = 1 # nilmtk counts buildings from 1 not from 0 as we do, so everything is shifted by 1 for house in houses: print('Loading', house) abs_house = join(greend_path, house) dates = [d for d in listdir(abs_house) if d.startswith('dataset')] target_filenames = [join(abs_house, date) for date in dates] if use_mp: house_data = pool.map(_get_blocks, target_filenames) # Ensure the blocks are sorted by date and make a plain list house_data_dfs = [] for date, data in sorted(house_data, key=lambda x: x[0]): house_data_dfs.extend(data) else: house_data_dfs = [] for fn in target_filenames: house_data_dfs.extend(_get_blocks(fn)[1]) overall_df = pd.concat(house_data_dfs).sort_index() dups_in_index = overall_df.index.duplicated(keep='first') if dups_in_index.any(): print("Found duplicated values in index, dropping them.") overall_df = overall_df[~dups_in_index] m = 1 for column in overall_df.columns: print("meter {}: {}".format(m, column)) key = Key(building=h, meter=m) print("Putting into store...") df = overall_df[column].to_frame() #.dropna(axis=0) # if drop_duplicates: # print("Dropping duplicated values in data...") # df = df.drop_duplicates() df.columns = pd.MultiIndex.from_tuples([('power', 'active')]) df.columns.set_names(LEVEL_NAMES, inplace=True) store.put(str(key), df, format = 'table') m += 1 # print('Flushing store...') # store.flush() h += 1 store.close() # retrieve the dataset metadata in the metadata subfolder metadata_dir = join(get_module_directory(), 'dataset_converters', 'greend', 'metadata') convert_yaml_to_hdf5(metadata_dir, hdf_filename)
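A hedged usage sketch with placeholder paths; use_mp=False switches to the sequential loading branch above if multiprocessing is problematic in the environment.

convert_greend('/data/greend', '/data/greend.h5', use_mp=False)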
def convert_hes(data_dir, output_filename, format='HDF', max_chunks=None): metadata = { 'name': 'HES', 'geographic_coordinates': (51.464462,-0.076544), # London 'timezone': 'Europe/London' } # Open DataStore store = get_datastore(output_filename, format, mode='w') # load list of appliances hes_to_nilmtk_appliance_lookup = pd.read_csv(join(get_module_directory(), 'dataset_converters', 'hes', 'hes_to_nilmtk_appliance_lookup.csv')) # load list of houses hes_house_ids = load_list_of_house_ids(data_dir) nilmtk_house_ids = np.arange(1,len(hes_house_ids)+1) hes_to_nilmtk_house_ids = dict(zip(hes_house_ids, nilmtk_house_ids)) # array of hes_house_codes: nilmtk_building_code = house_codes.index(hes_house_code) house_codes = [] # map house_appliance_codes = dict() # Iterate over files for filename in FILENAMES: # Load appliance energy data chunk-by-chunk full_filename = join(data_dir, filename) print('loading', full_filename) try: reader = pd.read_csv(full_filename, names=COL_NAMES, index_col=False, chunksize=CHUNKSIZE) except IOError as e: print(e, file=stderr) continue # Iterate over chunks in file chunk_i = 0 for chunk in reader: if max_chunks is not None and chunk_i >= max_chunks: break print(' processing chunk', chunk_i, 'of', filename) # Convert date and time columns to np.datetime64 objects dt = chunk['date'] + ' ' + chunk['time'] del chunk['date'] del chunk['time'] chunk['datetime'] = dt.apply(datetime_converter) # Data is either tenths of a Wh or tenths of a degree chunk['data'] *= 10 chunk['data'] = chunk['data'].astype(np.float32) # Iterate over houses in chunk for hes_house_id, hes_house_id_df in chunk.groupby('house id'): if hes_house_id not in house_codes: house_codes.append(hes_house_id) if hes_house_id not in house_appliance_codes.keys(): house_appliance_codes[hes_house_id] = [] nilmtk_house_id = house_codes.index(hes_house_id)+1 # Iterate over appliances in house for appliance_code, appliance_df in chunk.groupby('appliance code'): if appliance_code not in house_appliance_codes[hes_house_id]: house_appliance_codes[hes_house_id].append(appliance_code) nilmtk_meter_id = house_appliance_codes[hes_house_id].index(appliance_code)+1 _process_meter_in_chunk(nilmtk_house_id, nilmtk_meter_id, hes_house_id_df, store, appliance_code) chunk_i += 1 print('houses with some data loaded:', house_appliance_codes.keys()) store.close() # generate building yaml metadata for hes_house_id in house_codes: nilmtk_building_id = house_codes.index(hes_house_id)+1 building_metadata = {} building_metadata['instance'] = nilmtk_building_id building_metadata['original_name'] = int(hes_house_id) # use python int building_metadata['elec_meters'] = {} building_metadata['appliances'] = [] # initialise dict of instances of each appliance type instance_counter = {} for appliance_code in house_appliance_codes[hes_house_id]: nilmtk_meter_id = house_appliance_codes[hes_house_id].index(appliance_code)+1 # meter metadata if appliance_code in MAINS_CODES: meter_metadata = {'device_model': 'multivoies', 'site_meter': True} break elif appliance_code in CIRCUIT_CODES: meter_metadata = {'device_model': 'multivoies'} break elif appliance_code in TEMPERATURE_CODES: break else: # is appliance meter_metadata = {'device_model': 'wattmeter'} # only appliance meters at this point building_metadata['elec_meters'][nilmtk_meter_id] = meter_metadata # appliance metadata lookup_row = hes_to_nilmtk_appliance_lookup[hes_to_nilmtk_appliance_lookup.Code==appliance_code].iloc[0] appliance_metadata = {'original_name': lookup_row.Name, 'meters': [nilmtk_meter_id] } # appliance type appliance_metadata.update({'type': lookup_row.nilmtk_name}) # TODO appliance room # appliance instance number if instance_counter.get(lookup_row.nilmtk_name) == None: instance_counter[lookup_row.nilmtk_name] = 0 instance_counter[lookup_row.nilmtk_name] += 1 appliance_metadata['instance'] = instance_counter[lookup_row.nilmtk_name] building_metadata['appliances'].append(appliance_metadata) building = 'building{:d}'.format(nilmtk_building_id) yaml_full_filename = join(_get_module_directory(), 'metadata', building + '.yaml') with open(yaml_full_filename, 'w') as outfile: #print(building_metadata) outfile.write(yaml.dump(building_metadata)) # write yaml metadata to hdf5 convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), output_filename)
def convert_combed(combed_path, output_filename, format='HDF'): """ Parameters ---------- combed_path : str The root path of the combed dataset. output_filename : str The destination HDF5 filename (including path and suffix). """ check_directory_exists(combed_path) # Open store store = get_datastore(output_filename, format, mode='w') any_file_converted = False for building_name, building_mapping in overall_dataset_mapping.items(): for load_name, load_mapping in building_mapping.items(): for load_mapping_path, meter_number in load_mapping.items(): building_number = building_number_mapping[building_name] key = Key(building=building_number, meter=meter_number) dfs = [] for attribute in column_mapping.keys(): filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" % attribute) if not os.path.isfile(filename_attribute): # File not found directly in the combed_path provided # Try adding 'iiitd' to it filename_attribute = join(combed_path, 'iiitd', building_name, load_name, load_mapping_path, "%s.csv" % attribute) if os.path.isfile(filename_attribute): exists = True print(filename_attribute) df = pd.read_csv(filename_attribute, names=["timestamp", attribute]) df.index = pd.to_datetime(df["timestamp"], unit='ms') df = df.drop("timestamp", 1) dfs.append(df) else: exists = False if exists: total = pd.concat(dfs, axis=1) total = total.tz_localize('UTC').tz_convert('Asia/Kolkata') total.columns = pd.MultiIndex.from_tuples( [column_mapping[x] for x in total.columns]) total.columns.set_names(LEVEL_NAMES, inplace=True) assert total.index.is_unique store.put(str(key), total) any_file_converted = True if not any_file_converted: raise RuntimeError( 'No files converted, did you specify the correct path?') convert_yaml_to_hdf5( join(get_module_directory(), 'dataset_converters', 'combed', 'metadata'), output_filename) print("Done converting COMBED to HDF5!")