def convert_redd(redd_path, output_filename, format='HDF'):
    """
    Parameters
    ----------
    redd_path : str
        The root path of the REDD low_freq dataset.
    output_filename : str
        The destination filename (including path and suffix).
    format : str
        format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF'
    """
    def _redd_measurement_mapping_func(house_id, chan_id):
        ac_type = 'apparent' if chan_id <= 2 else 'active'
        return [('power', ac_type)]

    # Open DataStore
    store = get_datastore(output_filename, format, mode='w')

    # Convert raw data to DataStore
    _convert(redd_path, store, _redd_measurement_mapping_func, 'US/Eastern')

    # Add metadata
    metadata_path = join(get_module_directory(), 'dataset_converters',
                         'redd', 'metadata')
    save_yaml_to_datastore(metadata_path, store)
    store.close()

    print("Done converting REDD to HDF5!")
Example #2
def convert_refit(input_path, output_filename, format='HDF'):
    """
    Parameters
    ----------
    input_path : str
        The root path of the CSV files, i.e. the directory containing House1.csv, House2.csv, etc.
    output_filename : str
        The destination filename (including path and suffix).
    format : str
        format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF'
    """

    # Open DataStore
    store = get_datastore(output_filename, format, mode='w')

    # Convert raw data to DataStore
    _convert(input_path, store, 'Europe/London')

    # Add metadata
    save_yaml_to_datastore(
        join(get_module_directory(), 'dataset_converters', 'refit',
             'metadata'), store)
    store.close()

    print("Done converting REFIT to HDF5!")
Example #3
def convert_refit(input_path, output_filename, format='HDF'):
    """
    Parameters
    ----------
    input_path : str
        The root path of the CSV files, i.e. the directory containing House1.csv, House2.csv, etc.
    output_filename : str
        The destination filename (including path and suffix).
    format : str
        format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF'
    """
        
    # Open DataStore
    store = get_datastore(output_filename, format, mode='w')

    # Convert raw data to DataStore
    _convert(input_path, store, 'Europe/London')

    # Add metadata
    save_yaml_to_datastore(join(get_module_directory(), 
                              'dataset_converters', 
                              'refit', 
                              'metadata'),
                         store)
    store.close()

    print("Done converting REFIT to HDF5!")
Example #4
def convert_ukdale(ukdale_path, hdf_filename):
    """
    Parameters
    ----------
    ukdale_path : str
        The root path of the UK-DALE dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    def _ukdale_measurement_mapping_func(house_id, chan_id):
        # TODO: This needs updating.  It's wrong!
        ac_type = 'apparent' if chan_id <= 2 else 'active'
        return [('power', ac_type)]

    _convert(ukdale_path, hdf_filename, _ukdale_measurement_mapping_func, 
             'Europe/London')

    # Add metadata
    convert_yaml_to_hdf5(join(get_module_directory(), 
                              'dataset_converters', 
                              'ukdale', 
                              'metadata'),
                         hdf_filename)

    print("Done converting UK-DALE to HDF5!")
Example #5
def convert_lab(lab_path, output_filename, format='HDF'):
    """
    Parameters
    ----------
    lab_path : str
        The root path of the LAB dataset.
    output_filename : str
        The destination filename (including path and suffix).
    format : str
        format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF'
    """

    # was: chan_id <= 2, with the first ac_type = 'apparent'
    def _lab_measurement_mapping_func(house_id, chan_id):
        ac_type = 'active'  # both branches were 'active', so the condition was redundant
        return [('power', ac_type)]

    # Open DataStore
    store = get_datastore(output_filename, format, mode='w')

    # Convert raw data to DataStore
    _convert(lab_path, store, _lab_measurement_mapping_func,
             'America/Fortaleza')

    # Add metadata
    save_yaml_to_datastore(
        join(get_module_directory(), 'dataset_converters', 'lab', 'metadata'),
        store)
    store.close()

    print("Done converting LAB to HDF5!")
Example #6
def convert_deddiag(connection,
                    output_filename,
                    format='HDF',
                    start_date=DEFAULT_START_DATE,
                    end_date=DEFAULT_END_DATE,
                    tz=DEFAULT_TZ):
    """
    Parameters
    ----------
    connection: Connection
        Connection to the DEDDIAG database
        Example: connection = Connection(host="localhost", port="5432", db_name="postgres", user="******", password="******")
    output_filename : str
        The destination filename including path and suffix
        Example: ./data/deddiag.h5
    format : str
        format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF'
    """

    # Open DataStore
    # todo try catch

    dest_file = get_datastore(output_filename, format, mode='w')

    # Convert raw data to DataStore
    _convert(connection, dest_file, start_date, end_date, tz)

    path_to_metadata = join(get_module_directory(), 'dataset_converters',
                            'deddiag', 'metadata')

    # Add metadata
    save_yaml_to_datastore(path_to_metadata, dest_file)
    dest_file.close()

    print("Done converting DEDDIAG to HDF5!")
Example #7
def convert_alva(alva_path, output_filename, format='HDF'):
    """
    Parameters
    ----------
    alva_path : str
        The root path of the alva low_freq dataset.
    output_filename : str
        The destination filename (including path and suffix).
    format : str
        format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF'
    """

    def _alva_measurement_mapping_func(house_id, chan_id):
        ac_type = 'apparent' if chan_id <= 2 else 'active'
        return [('power', ac_type)]
        
    # Open DataStore
    store = get_datastore(output_filename, format, mode='w')

    # Convert raw data to DataStore
    _convert(alva_path, store, _alva_measurement_mapping_func, 'US/Eastern')

    # Add metadata
    save_yaml_to_datastore(join(get_module_directory(), 
                              'dataset_converters', 
                              'alva', 
                              'metadata'),
                         store)
    store.close()

    print("Done converting alva to HDF5!")
Example #8
def convert_lab(lab_path, output_filename, format='HDF'):
    """
    Parameters
    ----------
    lab_path : str
        The root path of the LAB dataset.
    output_filename : str
        The destination filename (including path and suffix).
    format : str
        format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF'
    """

    # was: chan_id <= 2, with the first ac_type = 'apparent'
    def _lab_measurement_mapping_func(house_id, chan_id):
        ac_type = 'active'  # both branches were 'active', so the condition was redundant
        return [('power', ac_type)]
        
    # Open DataStore
    store = get_datastore(output_filename, format, mode='w')

    # Convert raw data to DataStore
    _convert(lab_path, store, _lab_measurement_mapping_func, 'America/Fortaleza')

    # Add metadata
    save_yaml_to_datastore(join(get_module_directory(), 
                              'dataset_converters', 
                              'lab', 
                              'metadata'),
                         store)
    store.close()

    print("Done converting LAB to HDF5!")
Example #9
def convert_redd(redd_path, output_filename, format="HDF"):
    """
    Parameters
    ----------
    redd_path : str
        The root path of the REDD low_freq dataset.
    output_filename : str
        The destination filename (including path and suffix).
    format : str
        format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF'
    """

    def _redd_measurement_mapping_func(house_id, chan_id):
        ac_type = "apparent" if chan_id <= 2 else "active"
        return [("power", ac_type)]

    # Open DataStore
    store = get_datastore(output_filename, format, mode="w")

    # Convert raw data to DataStore
    _convert(redd_path, store, _redd_measurement_mapping_func, "US/Eastern")

    # Add metadata
    save_yaml_to_datastore(join(get_module_directory(), "dataset_converters", "redd", "metadata"), store)
    store.close()

    print("Done converting REDD to HDF5!")
Example #10
def convert_ideal(ideal_path, output_filename, format='HDF'):
    """
    Convert the IDEAL dataset to NILMTK HDF5 format.
    From https://datashare.ed.ac.uk/handle/10283/3647 download these zips below:
        - household_sensors.zip (14.77Gb).
        - room_and_appliance_sensors.zip (9.317Gb).
    Both zips contain a folder called "sensorsdata".
    Create a new folder, e.g. called "ideal_dataset", and into it
        - Extract the folder "household_sensors.zip/sensordata" with the name
          household_sensordata
        - Extract the folder "room_and_appliance_sensors.zip/sensordata" with
          the name rooms_appliance_sensordata

    Then run the function convert_ideal with ideal_path="ideal_dataset".

    Parameters
    ----------
    ideal_path : str
        The root path of the ideal low_freq dataset.
    output_filename : str
        The destination filename (including path and suffix).
    format : str
        format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF'
    """
    def _ideal_measurement_mapping_func(house_id, chan_id, category_id):
        if (category_id == "electric-appliance"):
            ac_type = 'active'
            return [('power', ac_type)]
        else:
            ac_type = 'apparent'
            return [('power', ac_type)]

    # Open DataStore
    store = get_datastore(output_filename, format, mode='w')

    #household_sensordata contains mains reading
    #rooms_appliance_sensordata contains appliance reading
    folders = []
    for root, dirs, files in os.walk(ideal_path):
        for folder in dirs:
            if (folder == "household_sensordata"
                    or folder == "rooms_appliance_sensordata"):
                folders.append(folder)
    #valid_home_id are home ids which contain both mains and appliance reading
    valid_home_id = mains_plus_appliance_home_id(ideal_path, folders)
    for folder in folders:
        input_path = join(ideal_path, folder)
        # Convert raw data to DataStore
        _convert(input_path, store, _ideal_measurement_mapping_func,
                 'Europe/London', valid_home_id)

    metadata_path = join(get_module_directory(), 'dataset_converters', 'ideal',
                         'metadata')

    # Add metadata
    save_yaml_to_datastore(metadata_path, store)
    store.close()

    print("Done converting ideal to HDF5!")
Example #11
def convert_ampds(input_path, output_filename, format_='HDF'):
    """
    Convert AMPds R2013 as seen on Dataverse. Download the files
    as CSVs and put them in the `input_path` folder for conversion.
    
    Download URL: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/MXB7VO
    
    Parameters
    ----------
    input_path : str
        The path of the directory where all the CSV files are stored.
    output_filename : str
        The destination filename (including path and suffix). It must refer
        to a particular file, not just a directory.
    format_ : str
        Format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF'.

    Example usage
    -------------
    convert_ampds('/AMPds/electricity', 'store.h5')

    """
    check_directory_exists(input_path)
    files = [f for f in listdir(input_path) if isfile(join(input_path, f)) and
             '.csv' in f and '.swp' not in f]
    # Sorting Lexicographically
    files.sort()

    # Remove Whole Home and put it at top
    files.remove("WHE.csv")
    files.insert(0, "WHE.csv")
    assert isdir(input_path)
    store = get_datastore(output_filename, format_, mode='w')
    for i, csv_file in enumerate(files):
        key = Key(building=1, meter=(i + 1))
        print('Loading file #', (i + 1), ' : ', csv_file, '. Please wait...')
        df = pd.read_csv(join(input_path, csv_file))
        # Due to fixed width, column names have spaces :(
        df.columns = [x.replace(" ", "") for x in df.columns]
        df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME], unit='s', utc=True)
        df = df.drop(TIMESTAMP_COLUMN_NAME, 1)
        df = df.tz_convert(TIMEZONE)
        df.columns = pd.MultiIndex.from_tuples(
            [columnNameMapping[x] for x in df.columns],
            names=LEVEL_NAMES
        )
        df = df.apply(pd.to_numeric, errors='ignore')
        df = df.dropna()
        df = df.astype(np.float32)
        store.put(str(key), df)
        print("Done with file #", (i + 1))
        
    store.close()
    metadata_path = join(get_module_directory(), 'dataset_converters', 'ampds', 'metadata')
    print('Processing metadata...')
    convert_yaml_to_hdf5(metadata_path, output_filename)
Example #12
def convert_combed(combed_path, output_filename, format='HDF'):
    """
    Parameters
    ----------
    combed_path : str
        The root path of the combed dataset.
    output_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    check_directory_exists(combed_path)

    # Open store
    store = get_datastore(output_filename, format, mode='w')

    any_file_converted = False
    
    for building_name, building_mapping in iteritems(overall_dataset_mapping):
        for load_name, load_mapping in iteritems(building_mapping):
            for load_mapping_path, meter_number in iteritems(load_mapping):
                building_number = building_number_mapping[building_name]
                key = Key(building=building_number, meter=meter_number)
                dfs = []
                for attribute in column_mapping.keys():
                    filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" %attribute)
                    if not os.path.isfile(filename_attribute):
                        # File not found directly in the combed_path provided
                        # Try adding 'iiitd' to it
                        filename_attribute = join(combed_path, 'iiitd', building_name, load_name, load_mapping_path, "%s.csv" %attribute)
                    
                    if os.path.isfile(filename_attribute):
                        exists = True
                        print(filename_attribute)
                        df = pd.read_csv(filename_attribute, names=["timestamp", attribute])
                        df.index = pd.to_datetime(df["timestamp"], unit='ms')
                        df = df.drop("timestamp", 1)
                        dfs.append(df)
                    else:
                        exists = False
                        
                if exists:
                    total = pd.concat(dfs, axis=1)
                    total = total.tz_localize('UTC').tz_convert('Asia/Kolkata')
                    total.columns = pd.MultiIndex.from_tuples([column_mapping[x] for x in total.columns])
                    total.columns.set_names(LEVEL_NAMES, inplace=True)
                    assert total.index.is_unique
                    store.put(str(key), total)
                    any_file_converted = True
                    
    if not any_file_converted:
        raise RuntimeError('No files converted, did you specify the correct path?')
                    
    convert_yaml_to_hdf5(
        join(get_module_directory(), 'dataset_converters', 'combed', 'metadata'),
        output_filename
    )

    print("Done converting COMBED to HDF5!")
Example #13
def convert_ampds(input_path, output_filename, format='HDF'):
    """
    Convert AMPds R2013 as seen on Dataverse. Download the files
    as CSVs and put them in the `input_path` folder for conversion.
    
    Download URL: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/MXB7VO
    
    Parameters
    ----------
    input_path : str
        The path of the directory where all the CSV files are stored.
    output_filename : str
        The destination filename (including path and suffix). It must refer
        to a particular file, not just a directory.
    format : str
        Format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF'.

    Example usage
    -------------
    convert_ampds('/AMPds/electricity', 'store.h5')

    """
    check_directory_exists(input_path)
    files = [f for f in listdir(input_path) if isfile(join(input_path, f)) and
             '.csv' in f and '.swp' not in f]
    # Sorting Lexicographically
    files.sort()

    # Remove Whole Home and put it at top
    files.remove("WHE.csv")
    files.insert(0, "WHE.csv")
    assert isdir(input_path)
    store = get_datastore(output_filename, format, mode='w')
    for i, csv_file in enumerate(files):
        key = Key(building=1, meter=(i + 1))
        print('Loading file #', (i + 1), ' : ', csv_file, '. Please wait...')
        df = pd.read_csv(join(input_path, csv_file))
        # Due to fixed width, column names have spaces :(
        df.columns = [x.replace(" ", "") for x in df.columns]
        df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME], unit='s', utc=True)
        df = df.drop(TIMESTAMP_COLUMN_NAME, 1)
        df = df.tz_convert(TIMEZONE)
        df.rename(columns=lambda x: columnNameMapping[x], inplace=True)
        df.columns.set_names(LEVEL_NAMES, inplace=True)
        df = df.apply(pd.to_numeric, errors='ignore')
        df = df.dropna()
        df = df.astype(np.float32)
        store.put(str(key), df)
        print("Done with file #", (i + 1))
        
    store.close()
    metadata_path = join(get_module_directory(), 'dataset_converters', 'ampds', 'metadata')
    print('Processing metadata...')
    convert_yaml_to_hdf5(metadata_path, output_filename)
Example #14
def convert_unifei(redd_path, output_filename, format='HDF'):
    """
    Parameters
    ----------
    redd_path : str
        The root path of the UNIFEI dataset (stored in REDD low_freq format).
    output_filename : str
        The destination filename (including path and suffix).
    format : str
        format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF'
    """

    def _redd_measurement_mapping_func(house_id, chan_id):
        ac_type = 'active'
        return [('power', ac_type)]
        
    # Open DataStore
    store = get_datastore(output_filename, format, mode='w')
    # Convert raw data to DataStore
    _convert(redd_path, store, _redd_measurement_mapping_func, 'America/Sao_Paulo')
    print("Done convert...")
    
    #Aqui é necessário colocar o endereço de onde fica a metadata
    print(get_module_directory())
    s=join(get_module_directory(),
                              'dataset_converters',
                              'unifei',
                              'metadata')
    print(s)

    # Add metadata
    # Aqui também é necessário colocar o endereço correto da metadata
    save_yaml_to_datastore(join(get_module_directory(), 
                              'dataset_converters', 
                              'unifei', 
                              'metadata'),
                         store)
    store.close()

    print("Done converting REDD to HDF5!")
Example #15
def convert_iawe(iawe_path, output_filename, format="HDF"):
    """
    Parameters
    ----------
    iawe_path : str
        The root path of the iawe dataset.
    output_filename : str
        The destination filename (including path and suffix).
    """

    check_directory_exists(iawe_path)
    idx = pd.DatetimeIndex(start=START_DATETIME, end=END_DATETIME, freq=FREQ)
    idx = idx.tz_localize('GMT').tz_convert(TIMEZONE)

    # Open data store
    store = get_datastore(output_filename, format, mode='w')
    electricity_path = join(iawe_path, "electricity")

    # Mains data
    for chan in range(1, 12):
        key = Key(building=1, meter=chan)
        filename = join(electricity_path, "%d.csv" % chan)
        print('Loading ', chan)
        df = pd.read_csv(filename, dtype=np.float64, na_values='\\N')
        df.drop_duplicates(subset=["timestamp"], inplace=True)
        df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True)
        df = df.tz_convert(TIMEZONE)
        df = df.drop(TIMESTAMP_COLUMN_NAME, 1)
        df.columns = pd.MultiIndex.from_tuples(
            [column_mapping[x] for x in df.columns], names=LEVEL_NAMES)
        df = df.apply(pd.to_numeric, errors='ignore')
        df = df.dropna()
        df = df.astype(np.float32)
        df = df.sort_index()
        df = df.resample("1T").mean()
        df = reindex_fill_na(df, idx)
        assert df.isnull().sum().sum() == 0
        store.put(str(key), df)
    store.close()

    metadata_dir = join(get_module_directory(), 'dataset_converters', 'iawe',
                        'metadata')
    convert_yaml_to_hdf5(metadata_dir, output_filename)

    print("Done converting iAWE to HDF5!")
Example #16
def convert_iawe(iawe_path, output_filename, format="HDF"):
    """
    Parameters
    ----------
    iawe_path : str
        The root path of the iawe dataset.
    output_filename : str
        The destination filename (including path and suffix).
    """

    check_directory_exists(iawe_path)
    idx = pd.DatetimeIndex(start=START_DATETIME, end=END_DATETIME, freq=FREQ)
    idx = idx.tz_localize('GMT').tz_convert(TIMEZONE)

    # Open data store
    store = get_datastore(output_filename, format, mode='w')
    electricity_path = join(iawe_path, "electricity")

    # Mains data
    for chan in range(1, 12):
        key = Key(building=1, meter=chan)
        filename = join(electricity_path, "%d.csv" % chan)
        print('Loading ', chan)
        df = pd.read_csv(filename, dtype=np.float64, na_values='\\N')
        df.drop_duplicates(subset=["timestamp"], inplace=True)
        df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True)
        df = df.tz_convert(TIMEZONE)
        df = df.drop(TIMESTAMP_COLUMN_NAME, 1)
        df.rename(columns=lambda x: column_mapping[x], inplace=True)
        df.columns.set_names(LEVEL_NAMES, inplace=True)
        df = df.apply(pd.to_numeric, errors='ignore')
        df = df.dropna()
        df = df.astype(np.float32)
        df = df.sort_index()
        df = df.resample("1T").mean()
        df = reindex_fill_na(df, idx)
        assert df.isnull().sum().sum() == 0
        store.put(str(key), df)
    store.close()
    
    metadata_dir = join(get_module_directory(), 'dataset_converters', 'iawe', 'metadata')
    convert_yaml_to_hdf5(metadata_dir, output_filename)

    print("Done converting iAWE to HDF5!")
Example #17
def convert_ukdale(ukdale_path, hdf_filename):
    """
    Parameters
    ----------
    ukdale_path : str
        The root path of the UK-DALE dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """
    def _ukdale_measurement_mapping_func(house_id, chan_id):
        # TODO: This needs updating.  It's wrong!
        ac_type = 'apparent' if chan_id <= 2 else 'active'
        return [('power', ac_type)]

    _convert(ukdale_path, hdf_filename, _ukdale_measurement_mapping_func,
             'Europe/London')

    # Add metadata
    convert_yaml_to_hdf5(
        join(get_module_directory(), 'dataset_converters', 'ukdale',
             'metadata'), hdf_filename)

    print("Done converting UK-DALE to HDF5!")
Example #18
def convert_redd(redd_path, hdf_filename):
    """
    Parameters
    ----------
    redd_path : str
        The root path of the REDD low_freq dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    def _redd_measurement_mapping_func(house_id, chan_id):
        ac_type = 'apparent' if chan_id <= 2 else 'active'
        return [('power', ac_type)]

    _convert(redd_path, hdf_filename, _redd_measurement_mapping_func, 'US/Eastern')

    # Add metadata
    convert_yaml_to_hdf5(join(get_module_directory(), 
                              'dataset_converters', 
                              'redd', 
                              'metadata'),
                         hdf_filename)

    print("Done converting REDD to HDF5!")
Example #19
def convert_redd(redd_path, hdf_filename):
    """
    Parameters
    ----------
    redd_path : str
        The root path of the REDD low_freq dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    def _redd_measurement_mapping_func(house_id, chan_id):
        ac_type = 'apparent' if chan_id <= 2 else 'active'
        return [('power', ac_type)]

    _convert(redd_path, hdf_filename, _redd_measurement_mapping_func, 'US/Eastern')

    # Add metadata
    convert_yaml_to_hdf5(join(get_module_directory(), 
                              'dataset_converters', 
                              'redd', 
                              'metadata'),
                         hdf_filename)

    print("Done converting REDD to HDF5!")
Example #20
def convert_eco(dataset_loc, hdf_filename, timezone):
    """
    Parameters:
    -----------
    dataset_loc: str
        The root directory where the dataset is located.
    hdf_filename: str
        The destination HDF5 filename (including path and suffix);
        the path must include the HDF5 file name, not just a directory.
    timezone: str
        specifies the timezone of the dataset.
    """

    # Creating a new HDF File
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='blosc')    
    
    check_directory_exists(dataset_loc)
    directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i]
    directory_list.sort()
    print(directory_list)

    found_any_sm = False
    found_any_plug = False
    
    # Traversing every folder
    for folder in directory_list:
        if folder[0] == '.' or folder[-3:] == '.h5':
            print('Skipping ', folder)
            continue

        #Building number and meter_flag
        building_no = int(folder[:2])
        meter_flag = None 
        if 'sm_csv' in folder:
            meter_flag = 'sm'
        elif 'plugs' in folder:
            meter_flag = 'plugs'
        else:
            print('Skipping folder', folder)
            continue
            
        print('Computing for folder', folder)

        dir_list = [i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc,folder,i))]
        dir_list.sort()
        
        if meter_flag == 'plugs' and len(dir_list) < 3:
            # Try harder to find the subfolders
            folder = join(folder, folder[:2])
            dir_list = [i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc,folder,i))]
        
        print('Current dir list:', dir_list)

        for fl in dir_list:
            print('Computing for folder ', fl)
            
            fl_dir_list = [i for i in listdir(join(dataset_loc,folder,fl)) if '.csv' in i]
            fl_dir_list.sort()

            if meter_flag == 'sm':
                for fi in fl_dir_list:
                    found_any_sm = True
                    df = pd.read_csv(join(dataset_loc,folder,fl,fi), names=[i for i in range(1,17)], dtype=np.float32)
                    
                    for phase in range(1,4):
                        key = str(Key(building=building_no, meter=phase))
                        df_phase = df.loc[:,[1+phase, 5+phase, 8+phase, 13+phase]]

                        # get reactive power
                        power = df_phase.loc[:, (1+phase, 13+phase)].values
                        reactive = power[:,0] * np.tan(power[:,1] * np.pi / 180)
                        df_phase['Q'] = reactive
                        
                        df_phase.index = pd.DatetimeIndex(start=fi[:-4], freq='s', periods=86400, tz='GMT')
                        df_phase = df_phase.tz_convert(timezone)
                        
                        sm_column_name = {
                            1+phase:('power', 'active'),
                            5+phase:('current', ''),
                            8+phase:('voltage', ''),
                            13+phase:('phase_angle', ''),
                            'Q': ('power', 'reactive'),
                        }
                        df_phase.columns = pd.MultiIndex.from_tuples([
                            sm_column_name[col] for col in df_phase.columns
                        ])
                        
                        power_active = df_phase['power', 'active']
                        tmp_before = np.size(power_active)
                        df_phase = df_phase[power_active != -1]
                        power_active = df_phase['power', 'active']
                        tmp_after = np.size(power_active)
                        
                        if tmp_before != tmp_after:
                            print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after))
                        
                        df_phase.columns.set_names(LEVEL_NAMES, inplace=True)
                        if not key in store:
                            store.put(key, df_phase, format='Table')
                        else:
                            store.append(key, df_phase, format='Table')
                            store.flush()
                        print('Building', building_no, ', Meter no.', phase,
                              '=> Done for ', fi[:-4])
                
            else:
                #Meter number to be used in key
                meter_num = int(fl) + 3
                
                key = str(Key(building=building_no, meter=meter_num))

                current_folder = join(dataset_loc,folder,fl)
                if not fl_dir_list:
                    raise RuntimeError("No CSV file found in " + current_folder)
                    
                #Getting dataframe for each csv file separately
                for fi in fl_dir_list:
                    found_any_plug = True
                    df = pd.read_csv(join(current_folder, fi), names=[1], dtype=np.float64)
                    df.index = pd.DatetimeIndex(start=fi[:-4].replace('.', ':'), freq='s', periods=86400, tz = 'GMT')
                    df.columns = pd.MultiIndex.from_tuples(plugs_column_name.values())
                    df = df.tz_convert(timezone)
                    df.columns.set_names(LEVEL_NAMES, inplace=True)

                    tmp_before = np.size(df.power.active)
                    df = df[df.power.active != -1]
                    tmp_after = np.size(df.power.active)
                    if (tmp_before != tmp_after):
                        print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after))
                    
                    # If table not present in hdf5, create or else append to existing data
                    if not key in store:
                        store.put(key, df, format='Table')
                        print('Building',building_no,', Meter no.',meter_num,'=> Done for ',fi[:-4])
                    else:
                        store.append(key, df, format='Table')
                        store.flush()
                        print('Building',building_no,', Meter no.',meter_num,'=> Done for ',fi[:-4])
            
            
    if not found_any_plug or not found_any_sm:
        raise RuntimeError('The files were not found! Please check the folder structure. Extract each ZIP file into a folder with its base name (e.g. extract "01_plugs_csv.zip" into a folder named "01_plugs_csv", etc.)')
        
    print("Data storage completed.")
    store.close()

    # Adding the metadata to the HDF5file
    print("Proceeding to Metadata conversion...")
    meta_path = join(
        get_module_directory(), 
        'dataset_converters',
        'eco',
        'metadata'
    )
    convert_yaml_to_hdf5(meta_path, hdf_filename)
    print("Completed Metadata conversion.")
Example #21
def convert_hes(data_dir, output_filename, format='HDF', max_chunks=None):
    metadata = {
        'name': 'HES',
        'geographic_coordinates': (51.464462, -0.076544),  # London
        'timezone': 'Europe/London'
    }

    # Open DataStore
    store = get_datastore(output_filename, format, mode='w')

    # load list of appliances
    hes_to_nilmtk_appliance_lookup = pd.read_csv(
        join(get_module_directory(), 'dataset_converters', 'hes',
             'hes_to_nilmtk_appliance_lookup.csv'))

    # load list of houses
    hes_house_ids = load_list_of_house_ids(data_dir)
    nilmtk_house_ids = np.arange(1, len(hes_house_ids) + 1)
    hes_to_nilmtk_house_ids = dict(zip(hes_house_ids, nilmtk_house_ids))

    # array of hes_house_codes: nilmtk_building_code = house_codes.index(hes_house_code)
    house_codes = []

    # map
    house_appliance_codes = dict()

    # Create a temporary metadata dir
    original_metadata_dir = join(get_module_directory(), 'dataset_converters',
                                 'hes', 'metadata')
    tmp_dir = tempfile.mkdtemp()
    metadata_dir = join(tmp_dir, 'metadata')
    shutil.copytree(original_metadata_dir, metadata_dir)
    print("Using temporary dir for metadata:", metadata_dir)

    # Iterate over files
    for filename in FILENAMES:
        # Load appliance energy data chunk-by-chunk
        full_filename = join(data_dir, filename)
        print('Loading', full_filename)
        try:
            reader = pd.read_csv(full_filename,
                                 names=COL_NAMES,
                                 index_col=False,
                                 chunksize=CHUNKSIZE)
        except IOError as e:
            print(e, file=stderr)
            continue

        # Iterate over chunks in file
        chunk_i = 0
        for chunk in reader:
            if max_chunks is not None and chunk_i >= max_chunks:
                break

            print(' processing chunk', chunk_i, 'of', filename)
            # Convert date and time columns to np.datetime64 objects
            dt = chunk['date'] + ' ' + chunk['time']
            del chunk['date']
            del chunk['time']
            chunk['datetime'] = pd.to_datetime(dt,
                                               format='%Y-%m-%d %H:%M:%S',
                                               utc=True)

            # Data is either tenths of a Wh or tenths of a degree
            chunk['data'] *= 10
            chunk['data'] = chunk['data'].astype(np.float32)

            # Iterate over houses in chunk
            for hes_house_id, hes_house_id_df in chunk.groupby('house id'):
                if hes_house_id not in house_codes:
                    house_codes.append(hes_house_id)

                if hes_house_id not in house_appliance_codes.keys():
                    house_appliance_codes[hes_house_id] = []

                nilmtk_house_id = house_codes.index(hes_house_id) + 1

                # Iterate over appliances in house
                for appliance_code, appliance_df in chunk.groupby(
                        'appliance code'):
                    if appliance_code not in house_appliance_codes[
                            hes_house_id]:
                        house_appliance_codes[hes_house_id].append(
                            appliance_code)
                    nilmtk_meter_id = house_appliance_codes[
                        hes_house_id].index(appliance_code) + 1
                    _process_meter_in_chunk(nilmtk_house_id, nilmtk_meter_id,
                                            hes_house_id_df, store,
                                            appliance_code)

            chunk_i += 1

    print('houses with some data loaded:', house_appliance_codes.keys())

    store.close()

    # generate building yaml metadata
    for hes_house_id in house_codes:
        nilmtk_building_id = house_codes.index(hes_house_id) + 1
        building_metadata = {}
        building_metadata['instance'] = nilmtk_building_id
        building_metadata['original_name'] = int(
            hes_house_id)  # use python int
        building_metadata['elec_meters'] = {}
        building_metadata['appliances'] = []

        # initialise dict of instances of each appliance type
        instance_counter = {}

        for appliance_code in house_appliance_codes[hes_house_id]:
            nilmtk_meter_id = house_appliance_codes[hes_house_id].index(
                appliance_code) + 1
            # meter metadata
            if appliance_code in MAINS_CODES:
                meter_metadata = {
                    'device_model': 'multivoies',
                    'site_meter': True
                }
                break
            elif appliance_code in CIRCUIT_CODES:
                meter_metadata = {'device_model': 'multivoies'}
                break
            elif appliance_code in TEMPERATURE_CODES:
                break
            else:  # is appliance
                meter_metadata = {'device_model': 'wattmeter'}

            # only appliance meters at this point
            building_metadata['elec_meters'][nilmtk_meter_id] = meter_metadata
            # appliance metadata
            lookup_row = hes_to_nilmtk_appliance_lookup[
                hes_to_nilmtk_appliance_lookup.Code == appliance_code].iloc[0]
            appliance_metadata = {
                'original_name': lookup_row.Name,
                'meters': [nilmtk_meter_id]
            }
            # appliance type
            appliance_metadata.update({'type': lookup_row.nilmtk_name})
            # TODO appliance room

            # appliance instance number
            if instance_counter.get(lookup_row.nilmtk_name) == None:
                instance_counter[lookup_row.nilmtk_name] = 0
            instance_counter[lookup_row.nilmtk_name] += 1
            appliance_metadata['instance'] = instance_counter[
                lookup_row.nilmtk_name]

            building_metadata['appliances'].append(appliance_metadata)

        building = 'building{:d}'.format(nilmtk_building_id)

        yaml_full_filename = join(metadata_dir, building + '.yaml')

        with open(yaml_full_filename, 'w') as outfile:
            #print(building_metadata)
            outfile.write(yaml.dump(building_metadata))

    # write yaml metadata to hdf5
    convert_yaml_to_hdf5(metadata_dir, output_filename)

    # remove the temporary dir when finished
    shutil.rmtree(tmp_dir)
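
A usage sketch; max_chunks limits how many chunks are read from each HES file, which is handy for a quick test run (hypothetical paths):

# Full conversion: leave max_chunks=None. Here only two chunks per file are read.
convert_hes('/data/HES', '/data/hes.h5', format='HDF', max_chunks=2)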
Example #22
def convert_deps(deps_path, input_filename, output_filename, format='HDF'):
    """
    Parameters
    ----------
    deps_path : str
        The root path of the DEPS dataset,
        e.g. 'C:/data/deps'
    input_filename : str
        The raw data filename (including path and suffix),
        e.g. 'C:/data/rawdata.csv'
    output_filename : str
        The destination HDF5 filename (including path and suffix),
        e.g. 'C:/data/deps/DEPS_data.h5'
    format : str
        format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF'
        
    Meters & Measurements
    ---------------------
    Measurement assignment (idMeasurement) in raw data to REDD format
    
    Measurement IDs             Units           Meter Name
    14011 14012             --> W VAr       --> Main_RST 
    14001 14007 14014 14017 --> V A W VAr   --> Main_R 
    14002 14008 14015 14018 --> V A W VAr   --> Main_S
    14003 14009 14016 14019 --> V A W VAr   --> Main_T
    13001                   --> W           --> Lights_1
    13002                   --> W           --> Lights_2
    10003 10006 10014 10018 --> V A W VAr   --> HVAC_1 
    10002 10005 10013 10017 --> V A W VAr   --> HVAC_2
    10001 10004 10012 10016 --> V A W VAr   --> HVAC_4
    21001 21002 21003 21005 --> V A W VAr   --> Rack    
          
    Example
    ----------
    raw_data.csv (input_filename):
    --
    idMeasurement, UNIX_timestamp(tStampUTC), dataValue
    14011,         1583103600,                      123
    14012,         1583103600,                     -416
    14011,         1583103601,                      126
    14012,         1583103601,                     -416
    ...            ...                              ...
    14011,         1583535599,                      121
    14012,         1583535599,                     -411
    
    Outputs REDD format: deps_path/classroom1/ :
    --
    channel_1.dat: 
    1583103600 123 -416
    1583103601 126 -416
    ...        ...  ...
    1583535599 121 -411
    --
    labels.dat:   
    1 Main_RST
    
    Output HDF5 file: output_filename.h5    
        
    """
    #--------------------------------------------------------------------
    # written by Andrés Arias Silva
    # Raw data converter to REDD format extracted from DEPS SQL database
    _deps_to_redd_format(deps_path, input_filename)

    #--------------------------------------------------------------------

    def _deps_measurement_mapping_func(classroom_id, chan_id):

        if chan_id == 1:
            meas = ([('power', 'active'), ('power', 'reactive')])
        elif chan_id > 1 and chan_id <= 4:
            meas = ([('voltage', ''), ('current', ''), ('power', 'active'),
                     ('power', 'reactive')])
        elif chan_id > 4 and chan_id <= 6:
            meas = ([('power', 'active')])
        elif chan_id > 6 and chan_id <= 10:
            meas = ([
                ('voltage', ''),
                ('current', ''),
                ('power', 'active'),
                ('power', 'reactive'),
            ])
        else:
            raise NameError('incorrect channel number')
        return meas

    # Open DataStore
    store = get_datastore(output_filename, format, mode='w')

    # Convert raw data to DataStore
    _convert(deps_path, store, _deps_measurement_mapping_func, 'Europe/Madrid')

    #    s=join(get_module_directory(),
    #                              'dataset_converters',
    #                              'deps',
    #                              'metadata')

    # Add metadata
    save_yaml_to_datastore(
        join(get_module_directory(), 'dataset_converters', 'deps', 'metadata'),
        store)
    store.close()

    print("Done converting DEPS data to HDF5!")
Example #23
def download_dataport(database_username,
                      database_password, hdf_filename,
                      database_schema='university',
                      user_selected_table='electricity_egauge_minutes',
                      periods_to_load=None):
    """
    Downloads data from dataport database into an HDF5 file.

    Parameters
    ----------
    hdf_filename : str
        Output HDF filename.  If the file already exists it will be deleted.
    database_username, database_password, database_schema, user_selected_table : str
    periods_to_load : dict of tuples, optional
       Key of dict is the building number (int).
       Values are (<start date>, <end date>)
       e.g. ("2013-04-01", None) or ("2013-04-01", "2013-08-01")
       defaults to all buildings and all date ranges
    """

    database_assert(user_selected_table)
    # dataport database settings
    database_host = 'dataport.pecanstreet.org'
    database_port = '5434'
    database_name = 'postgres'

    # try to connect to database
    try:
        conn = db.connect('host=' + database_host +
                          ' port=' + database_port +
                          ' dbname=' + database_name +
                          ' user=' + database_username +
                          ' password=' + database_password)
    except:
        print('Could not connect to remote database')
        raise

    # map user_selected_table and timestamp column
    timestamp_map = {"electricity_egauge_15min": "local_15min",
                     "electricity_egauge_hours": "localhour",
                     "electricity_egauge_minutes": "localminute",
                     "electricity_egauge_seconds": "localminute"}

    # set up a new HDF5 datastore (overwrites existing store)
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')

    # Create a temporary metadata dir, remove existing building
    # yaml files in module dir (if any)
    original_metadata_dir = join(get_module_directory(),
                                 'dataset_converters',
                                 'dataport',
                                 'metadata')
    tmp_dir = tempfile.mkdtemp()
    metadata_dir = join(tmp_dir, 'metadata')
    shutil.copytree(original_metadata_dir, metadata_dir)
    print("Using temporary dir for metadata:", metadata_dir)

    for f in os.listdir(metadata_dir):
        if re.search('^building', f):
            os.remove(join(metadata_dir, f))

    """
    TODO:
    The section below can be altered or removed,
    since the restructured Dataport
    now has only one electricity_egauge_minutes table.
    """
    # get tables in database schema
    sql_query = ("SELECT table_name" +
                 " FROM information_schema.views" +
                 " WHERE table_schema ='" + database_schema + "'" +
                 " ORDER BY table_name")
    database_tables = pd.read_sql(sql_query, conn)['table_name'].tolist()
    database_tables = [t for t in database_tables if user_selected_table in t]
    # if user has specified buildings
    if periods_to_load:
        buildings_to_load = list(periods_to_load.keys())
    else:
        # get buildings present in all tables
        sql_query = ''
        for table in database_tables:
            sql_query = (sql_query + '(SELECT DISTINCT dataid' +
                         ' FROM "' + database_schema + '".' + table +
                         ') UNION ')
        sql_query = sql_query[:-7]
        sql_query = (sql_query + ' ORDER BY dataid')
        buildings_to_load = pd.read_sql(sql_query, conn)['dataid'].tolist()

    # for each user specified building or all buildings in database
    for building_id in buildings_to_load:
        print("Loading building {:d} @ {}"
              .format(building_id, datetime.datetime.now()))
        sys.stdout.flush()

        # create new list of chunks for concatenating later
        dataframe_list = []

        # for each table of 1 month data
        for database_table in database_tables:
            print("  Loading table {:s}".format(database_table))
            sys.stdout.flush()

            # get buildings present in electricity_egauge_minutes table
            sql_query = ('SELECT DISTINCT dataid' +
                         ' FROM university.metadata' +
                         ' WHERE egauge_min_time IS NOT NULL' +
                         ' ORDER BY dataid')

            buildings_in_table = pd.read_sql(sql_query,
                                             conn)['dataid'].tolist()
            if building_id in buildings_in_table:
                # get first and last timestamps for this
                # house in electricity_egauge_minutes table
                sql_query = ('SELECT MIN(egauge_min_time) AS minlocalminute,' +
                             ' MAX(egauge_max_time) AS maxlocalminute' +
                             ' FROM university.metadata' +
                             ' WHERE dataid=' + str(building_id))

                range = pd.read_sql(sql_query, conn)

                first_timestamp_in_table = range['minlocalminute'][0]
                last_timestamp_in_table = range['maxlocalminute'][0]

                # get requested start and end and localize them
                requested_start = None
                requested_end = None
                database_timezone = 'US/Central'
                if periods_to_load:
                    if periods_to_load[building_id][0]:
                        requested_start = pd.Timestamp(periods_to_load[building_id][0])
                        requested_start = requested_start.tz_localize(database_timezone)
                    if periods_to_load[building_id][1]:
                        requested_end = pd.Timestamp(periods_to_load[building_id][1])
                        requested_end = requested_end.tz_localize(database_timezone)

                # check user start is not after end
                if requested_start > requested_end:
                    print('requested end is before requested start')
                    sys.stdout.flush()
                else:
                    # clip data to smallest range
                    if requested_start:
                        start = max(requested_start, first_timestamp_in_table)
                    else:
                        start = first_timestamp_in_table
                    if requested_end:
                        end = min(requested_end, last_timestamp_in_table)
                    else:
                        end = last_timestamp_in_table

                    # download data in chunks
                    chunk_start = start
                    chunk_size = datetime.timedelta(10)  # 10 days
                    while chunk_start < end:
                        chunk_end = chunk_start + chunk_size
                        if chunk_end > end:
                            chunk_end = end
                        # subtract 1 second so end is exclusive
                        chunk_end = chunk_end - datetime.timedelta(0, 1)

                        # query power data for all channels
                        format = '%Y-%m-%d %H:%M:%S'
                        sql_query = ('SELECT *' +
                                     ' FROM "' + database_schema + '".' + user_selected_table +
                                     ' WHERE dataid=' + str(building_id) +
                                     'and "' + timestamp_map[user_selected_table] + '" between ' +
                                     "'" + chunk_start.strftime(format) + "'" +
                                     " and " +
                                     "'" + chunk_end.strftime(format) +
                                     "' ORDER BY "+timestamp_map[user_selected_table]
                                     )
                        chunk_dataframe = pd.read_sql(sql_query, conn)
                        # nilmtk requires building indices to start at 1
                        nilmtk_building_id = buildings_to_load.index(building_id) + 1
                        # convert to nilmtk-df and save to disk
                        nilmtk_dataframe = _dataport_dataframe_to_hdf(
                            chunk_dataframe, store,
                            nilmtk_building_id,
                            building_id,
                            timestamp_map[user_selected_table],
                            metadata_dir
                        )

                        # print progress
                        print('    ' + str(chunk_start) + ' -> ' +
                              str(chunk_end) + ': ' +
                              str(len(chunk_dataframe.index)) + ' rows')
                        sys.stdout.flush()

                        # append all chunks into list for csv writing
                        # dataframe_list.append(chunk_dataframe)

                        # move on to next chunk
                        chunk_start = chunk_start + chunk_size

        # saves all chunks in list to csv
        # if len(dataframe_list) > 0:
            # dataframe_concat = pd.concat(dataframe_list)
            # dataframe_concat.to_csv(output_directory + str(building_id) + '.csv')

    store.close()
    conn.close()

    # write yaml to hdf5
    # dataset.yaml and meter_devices.yaml are static, building<x>.yaml are dynamic
    convert_yaml_to_hdf5(metadata_dir, hdf_filename)

    # remove the temporary dir when finished
    shutil.rmtree(tmp_dir)
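
A usage sketch showing the periods_to_load format described in the docstring; the credentials and building ids are placeholders:

# Keys of periods_to_load are dataport building ids (int); values are
# (start, end) date strings, with None meaning "no bound on that side".
download_dataport('my_username', 'my_password', '/data/dataport.h5',
                  periods_to_load={26: ('2013-04-01', '2013-08-01'),
                                   43: ('2013-04-01', None)})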
Example #24
def convert_greend(greend_path, hdf_filename, use_mp=True):
    """
    Parameters
    ----------
    greend_path : str
        The root path of the greend dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    use_mp : bool 
        Defaults to True. Use multiprocessing to load the files for
        each building.
    """
    store = pd.HDFStore(hdf_filename, 'w', complevel=5, complib='zlib')
    houses = sorted(_get_houses(greend_path))
    
    print('Houses found:', houses)
    if use_mp:
        pool = Pool()
    
    h = 1 # nilmtk counts buildings from 1 not from 0 as we do, so everything is shifted by 1
    
    for house in houses:
        print('Loading', house)
        abs_house = join(greend_path, house)
        dates = [d for d in listdir(abs_house) if d.startswith('dataset')]
        target_filenames = [join(abs_house, date) for date in dates]
        if use_mp:
            house_data = pool.map(_get_blocks, target_filenames)

            # Ensure the blocks are sorted by date and make a plain list
            house_data_dfs = []
            for date, data in sorted(house_data, key=lambda x: x[0]):
                house_data_dfs.extend(data)
        else:
            house_data_dfs = []
            for fn in target_filenames:
                house_data_dfs.extend(_get_blocks(fn)[1])
            
        overall_df = pd.concat(house_data_dfs).sort_index()
        dups_in_index = overall_df.index.duplicated(keep='first')
        if dups_in_index.any():
            print("Found duplicated values in index, dropping them.")
            overall_df = overall_df[~dups_in_index]
        
        m = 1
        for column in overall_df.columns:
            print("meter {}: {}".format(m, column))
            key = Key(building=h, meter=m)
            print("Putting into store...")
            
            df = overall_df[column].to_frame() #.dropna(axis=0)
            
            # if drop_duplicates:
                # print("Dropping duplicated values in data...")
                # df = df.drop_duplicates()
            
            df.columns = pd.MultiIndex.from_tuples([('power', 'active')])
            df.columns.set_names(LEVEL_NAMES, inplace=True)
            
            store.put(str(key), df, format = 'table')
            m += 1
            # print('Flushing store...')
            # store.flush()
            
        h += 1

    store.close()
	
    # retrieve the dataset metadata in the metadata subfolder
    metadata_dir = join(get_module_directory(), 'dataset_converters', 'greend', 'metadata')
    convert_yaml_to_hdf5(metadata_dir, hdf_filename)
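
A usage sketch; set use_mp=False to load each building's files serially instead of through a multiprocessing Pool (hypothetical paths):

convert_greend('/data/GREEND', '/data/greend.h5', use_mp=False)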
Example #25
def convert_hes(data_dir, output_filename, format='HDF', max_chunks=None):
    metadata = {
        'name': 'HES',
        'geographic_coordinates': (51.464462,-0.076544), # London
        'timezone': 'Europe/London'
    }
    
    # Open DataStore
    store = get_datastore(output_filename, format, mode='w')
    
    # load list of appliances
    hes_to_nilmtk_appliance_lookup = pd.read_csv(join(get_module_directory(), 
                                        'dataset_converters', 
                                        'hes', 
                                        'hes_to_nilmtk_appliance_lookup.csv'))

    # load list of houses
    hes_house_ids = load_list_of_house_ids(data_dir)
    nilmtk_house_ids = np.arange(1,len(hes_house_ids)+1)
    hes_to_nilmtk_house_ids = dict(zip(hes_house_ids, nilmtk_house_ids))

    # array of hes_house_codes: nilmtk_building_code = house_codes.index(hes_house_code)
    house_codes = []
    # map 
    house_appliance_codes = dict()

    # Iterate over files
    for filename in FILENAMES:
        # Load appliance energy data chunk-by-chunk
        full_filename = join(data_dir, filename)
        print('loading', full_filename)
        try:
            reader = pd.read_csv(full_filename, names=COL_NAMES, 
                                 index_col=False, chunksize=CHUNKSIZE)
        except IOError as e:
            print(e, file=stderr)
            continue

        # Iterate over chunks in file
        chunk_i = 0
        for chunk in reader:
            if max_chunks is not None and chunk_i >= max_chunks:
                break

            print(' processing chunk', chunk_i, 'of', filename)
            # Convert date and time columns to np.datetime64 objects
            dt = chunk['date'] + ' ' + chunk['time']
            del chunk['date']
            del chunk['time']
            chunk['datetime'] = dt.apply(datetime_converter)

            # Data is either tenths of a Wh or tenths of a degree
            chunk['data'] *= 10
            chunk['data'] = chunk['data'].astype(np.float32)

            # Iterate over houses in chunk
            for hes_house_id, hes_house_id_df in chunk.groupby('house id'):
                if hes_house_id not in house_codes:
                    house_codes.append(hes_house_id)
                    
                if hes_house_id not in house_appliance_codes.keys():
                    house_appliance_codes[hes_house_id] = []
                
                nilmtk_house_id = house_codes.index(hes_house_id)+1
                
                # Iterate over appliances in house
                for appliance_code, appliance_df in hes_house_id_df.groupby('appliance code'):
                    if appliance_code not in house_appliance_codes[hes_house_id]:
                        house_appliance_codes[hes_house_id].append(appliance_code)
                    nilmtk_meter_id = house_appliance_codes[hes_house_id].index(appliance_code)+1
                    _process_meter_in_chunk(nilmtk_house_id, nilmtk_meter_id, hes_house_id_df, store, appliance_code)
                    
            chunk_i += 1
    print('houses with some data loaded:', house_appliance_codes.keys())
    
    store.close()
    
    # generate building yaml metadata
    for hes_house_id in house_codes:
        nilmtk_building_id = house_codes.index(hes_house_id)+1
        building_metadata = {}
        building_metadata['instance'] = nilmtk_building_id
        building_metadata['original_name'] = int(hes_house_id) # use python int
        building_metadata['elec_meters'] = {}
        building_metadata['appliances'] = []
        
        # initialise dict of instances of each appliance type
        instance_counter = {}
        
        for appliance_code in house_appliance_codes[hes_house_id]:
            nilmtk_meter_id = house_appliance_codes[hes_house_id].index(appliance_code)+1
            # meter metadata
            if appliance_code in MAINS_CODES:
                meter_metadata = {'device_model': 'multivoies',
                                  'site_meter': True}
            elif appliance_code in CIRCUIT_CODES:
                meter_metadata = {'device_model': 'multivoies'}
            elif appliance_code in TEMPERATURE_CODES:
                # temperature channels are not electricity meters, so skip them
                continue
            else:  # is an appliance meter
                meter_metadata = {'device_model': 'wattmeter'}
            building_metadata['elec_meters'][nilmtk_meter_id] = meter_metadata

            # mains and circuit meters carry no appliance metadata
            if appliance_code in MAINS_CODES or appliance_code in CIRCUIT_CODES:
                continue

            # appliance metadata (only appliance meters from here on)
            lookup_row = hes_to_nilmtk_appliance_lookup[hes_to_nilmtk_appliance_lookup.Code==appliance_code].iloc[0]
            appliance_metadata = {'original_name': lookup_row.Name, 
                                      'meters': [nilmtk_meter_id] }
            # appliance type
            appliance_metadata.update({'type': lookup_row.nilmtk_name})
            # TODO appliance room
            
            # appliance instance number
            if instance_counter.get(lookup_row.nilmtk_name) is None:
                instance_counter[lookup_row.nilmtk_name] = 0
            instance_counter[lookup_row.nilmtk_name] += 1 
            appliance_metadata['instance'] = instance_counter[lookup_row.nilmtk_name]
            
            building_metadata['appliances'].append(appliance_metadata)
        building = 'building{:d}'.format(nilmtk_building_id)
        yaml_full_filename = join(_get_module_directory(), 'metadata', building + '.yaml')
        with open(yaml_full_filename, 'w') as outfile:
            #print(building_metadata)
            outfile.write(yaml.dump(building_metadata))
            
    
    # write yaml metadata to hdf5
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         output_filename)
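
# Minimal usage sketch for convert_hes; the paths below are hypothetical
# placeholders, and max_chunks=2 keeps a trial run short.
if __name__ == '__main__':
    convert_hes('/data/hes/raw', '/data/hes/hes.h5', format='HDF', max_chunks=2)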
Example #26
def convert_eco(dataset_loc, hdf_filename, timezone):
    """
    Parameters:
    -----------
    dataset_loc: str
        The root directory where the dataset is located.
    hdf_filename: str
        The location where the hdf_filename is present. 
        The directory location has to contain the 
        hdf5file name for the converter to work.
    timezone: str
        specifies the timezone of the dataset.
    """

    # Creating a new HDF File
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='blosc')

    check_directory_exists(dataset_loc)
    directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i]
    directory_list.sort()
    print(directory_list)

    found_any_sm = False
    found_any_plug = False

    # Traversing every folder
    for folder in directory_list:
        if folder[0] == '.' or folder[-3:] == '.h5':
            print('Skipping ', folder)
            continue

        #Building number and meter_flag
        building_no = int(folder[:2])
        meter_flag = None
        if 'sm_csv' in folder:
            meter_flag = 'sm'
        elif 'plugs' in folder:
            meter_flag = 'plugs'
        else:
            print('Skipping folder', folder)
            continue

        print('Computing for folder', folder)

        dir_list = [
            i for i in listdir(join(dataset_loc, folder))
            if isdir(join(dataset_loc, folder, i))
        ]
        dir_list.sort()

        if meter_flag == 'plugs' and len(dir_list) < 3:
            # Try harder to find the subfolders
            folder = join(folder, folder[:2])
            dir_list = [
                i for i in listdir(join(dataset_loc, folder))
                if isdir(join(dataset_loc, folder, i))
            ]

        print('Current dir list:', dir_list)

        for fl in dir_list:
            print('Computing for folder ', fl)

            fl_dir_list = [
                i for i in listdir(join(dataset_loc, folder, fl))
                if '.csv' in i
            ]
            fl_dir_list.sort()

            if meter_flag == 'sm':
                for fi in fl_dir_list:
                    found_any_sm = True
                    df = pd.read_csv(join(dataset_loc, folder, fl, fi),
                                     names=[i for i in range(1, 17)],
                                     dtype=np.float32)
                    # SmartMeter
                    for phase in range(1, 4):
                        key = str(Key(building=building_no, meter=phase))
                        df_phase = df.loc[:, [
                            1 + phase, 5 + phase, 8 + phase, 13 + phase
                        ]]

                        # derive reactive power Q from active power P and the
                        # phase angle phi (in degrees): Q = P * tan(phi)
                        power = df_phase.loc[:, [1 + phase, 13 + phase]].values
                        reactive = power[:, 0] * np.tan(
                            power[:, 1] * np.pi / 180)
                        df_phase['Q'] = reactive

                        df_phase.index = pd.date_range(start=fi[:-4],
                                                       freq='s',
                                                       periods=86400,
                                                       tz='GMT')
                        df_phase = df_phase.tz_convert(timezone)

                        sm_column_name = {
                            1 + phase: ('power', 'active'),
                            5 + phase: ('current', ''),
                            8 + phase: ('voltage', ''),
                            13 + phase: ('phase_angle', ''),
                            'Q': ('power', 'reactive'),
                        }
                        df_phase.columns = pd.MultiIndex.from_tuples(
                            sm_column_name[col] for col in df_phase.columns)

                        power_active = df_phase['power', 'active']
                        tmp_before = np.size(power_active)
                        df_phase = df_phase[power_active != -1]
                        power_active = df_phase['power', 'active']
                        tmp_after = np.size(power_active)

                        if tmp_before != tmp_after:
                            print(
                                'Removed missing measurements - Size before: '
                                + str(tmp_before) + ', size after: ' +
                                str(tmp_after))

                        df_phase.columns.set_names(LEVEL_NAMES, inplace=True)
                        if key not in store:
                            store.put(key, df_phase, format='Table')
                        else:
                            store.append(key, df_phase, format='Table')
                            store.flush()
                        print('Building', building_no, ', Meter no.', phase,
                              '=> Done for ', fi[:-4])
            # Plugs are also mapped to meters, which are then associated directly with appliances
            else:
                #Meter number to be used in key
                meter_num = int(fl) + 3

                key = str(Key(building=building_no, meter=meter_num))

                current_folder = join(dataset_loc, folder, fl)
                if not fl_dir_list:
                    raise RuntimeError("No CSV file found in " +
                                       current_folder)

                # Get a dataframe for each CSV file separately
                for fi in fl_dir_list:
                    found_any_plug = True
                    df = pd.read_csv(join(current_folder, fi),
                                     names=[1],
                                     dtype=np.float64)
                    df.index = pd.date_range(start=fi[:-4].replace('.', ':'),
                                             freq='s',
                                             periods=86400,
                                             tz='GMT')
                    df.columns = pd.MultiIndex.from_tuples(
                        plugs_column_name.values())
                    df = df.tz_convert(timezone)
                    df.columns.set_names(LEVEL_NAMES, inplace=True)

                    # Check whether measurements removed
                    tmp_before = np.size(df.power.active)
                    df = df[df.power.active != -1]
                    tmp_after = np.size(df.power.active)
                    if tmp_before != tmp_after:
                        print('Removed missing measurements - Size before: ' +
                              str(tmp_before) + ', size after: ' +
                              str(tmp_after))

                    # If table not present in hdf5, create or else append to existing data
                    if key not in store:
                        store.put(key, df, format='Table')
                        print('Building', building_no, ', Meter no.',
                              meter_num, '=> Done for ', fi[:-4])
                    else:
                        store.append(key, df, format='Table')
                        store.flush()
                        print('Building', building_no, ', Meter no.',
                              meter_num, '=> Done for ', fi[:-4])

    if not found_any_plug or not found_any_sm:
        raise RuntimeError(
            'The files were not found! Please check the folder structure. Extract each ZIP file into a folder with its base name (e.g. extract "01_plugs_csv.zip" into a folder named "01_plugs_csv", etc.)'
        )

    print("Data storage completed.")
    store.close()

    # Adding the metadata to the HDF5file
    print("Proceeding to Metadata conversion...")
    meta_path = join(get_module_directory(), 'dataset_converters', 'eco',
                     'metadata')
    convert_yaml_to_hdf5(meta_path, hdf_filename)
    print("Completed Metadata conversion.")
Example #27
def convert_greend(greend_path, hdf_filename, use_mp=True):
    """
    Parameters
    ----------
    greend_path : str
        The root path of the greend dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    use_mp : bool 
        Defaults to True. Use multiprocessing to load the files for
        each building.
    """
    store = pd.HDFStore(hdf_filename, 'w', complevel=5, complib='zlib')
    houses = sorted(_get_houses(greend_path))

    print('Houses found:', houses)
    if use_mp:
        pool = Pool()

    h = 1  # nilmtk counts buildings from 1 not from 0 as we do, so everything is shifted by 1

    for house in houses:
        print('Loading', house)
        abs_house = join(greend_path, house)
        dates = [d for d in listdir(abs_house) if d.startswith('dataset')]
        target_filenames = [join(abs_house, date) for date in dates]
        if use_mp:
            house_data = pool.map(_get_blocks, target_filenames)

            # Ensure the blocks are sorted by date and make a plain list
            house_data_dfs = []
            for date, data in sorted(house_data, key=lambda x: x[0]):
                house_data_dfs.extend(data)
        else:
            house_data_dfs = []
            for fn in target_filenames:
                house_data_dfs.extend(_get_blocks(fn)[1])

        overall_df = pd.concat(house_data_dfs).sort_index()
        dups_in_index = overall_df.index.duplicated(keep='first')
        if dups_in_index.any():
            print("Found duplicated values in index, dropping them.")
            overall_df = overall_df[~dups_in_index]

        m = 1
        for column in overall_df.columns:
            print("meter {}: {}".format(m, column))
            key = Key(building=h, meter=m)
            print("Putting into store...")

            df = overall_df[column].to_frame()  #.dropna(axis=0)

            # if drop_duplicates:
            # print("Dropping duplicated values in data...")
            # df = df.drop_duplicates()

            df.columns = pd.MultiIndex.from_tuples([('power', 'active')])
            df.columns.set_names(LEVEL_NAMES, inplace=True)

            store.put(str(key), df, format='table')
            m += 1
            # print('Flushing store...')
            # store.flush()

        h += 1

    store.close()

    # retrieve the dataset metadata in the metadata subfolder
    metadata_dir = join(get_module_directory(), 'dataset_converters', 'greend',
                        'metadata')
    convert_yaml_to_hdf5(metadata_dir, hdf_filename)
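
# Minimal usage sketch for convert_greend; the paths are hypothetical
# placeholders. use_mp=False avoids spawning a multiprocessing Pool,
# which can make debugging easier.
if __name__ == '__main__':
    convert_greend('/data/greend', '/data/greend/greend.h5', use_mp=False)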
Example #28
def convert_combed(combed_path, output_filename, format='HDF'):
    """
    Parameters
    ----------
    combed_path : str
        The root path of the combed dataset.
    output_filename : str
        The destination filename (including path and suffix).
    format : str
        format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF'
    """

    check_directory_exists(combed_path)

    # Open store
    store = get_datastore(output_filename, format, mode='w')

    any_file_converted = False

    for building_name, building_mapping in overall_dataset_mapping.items():
        for load_name, load_mapping in building_mapping.items():
            for load_mapping_path, meter_number in load_mapping.items():
                building_number = building_number_mapping[building_name]
                key = Key(building=building_number, meter=meter_number)
                dfs = []
                for attribute in column_mapping.keys():
                    filename_attribute = join(combed_path, building_name,
                                              load_name, load_mapping_path,
                                              "%s.csv" % attribute)
                    if not os.path.isfile(filename_attribute):
                        # File not found directly in the combed_path provided
                        # Try adding 'iiitd' to it
                        filename_attribute = join(combed_path, 'iiitd',
                                                  building_name, load_name,
                                                  load_mapping_path,
                                                  "%s.csv" % attribute)

                    if os.path.isfile(filename_attribute):
                        exists = True
                        print(filename_attribute)
                        df = pd.read_csv(filename_attribute,
                                         names=["timestamp", attribute])
                        df.index = pd.to_datetime(df["timestamp"], unit='ms')
                        df = df.drop("timestamp", 1)
                        dfs.append(df)
                    else:
                        exists = False

                if exists:
                    total = pd.concat(dfs, axis=1)
                    total = total.tz_localize('UTC').tz_convert('Asia/Kolkata')
                    total.columns = pd.MultiIndex.from_tuples(
                        [column_mapping[x] for x in total.columns])
                    total.columns.set_names(LEVEL_NAMES, inplace=True)
                    assert total.index.is_unique
                    store.put(str(key), total)
                    any_file_converted = True

    if not any_file_converted:
        raise RuntimeError(
            'No files converted, did you specify the correct path?')

    convert_yaml_to_hdf5(
        join(get_module_directory(), 'dataset_converters', 'combed',
             'metadata'), output_filename)

    print("Done converting COMBED to HDF5!")
Example #29
def download_dataport(database_username,
                      database_password,
                      hdf_filename,
                      database_schema='university',
                      user_selected_table='electricity_egauge_minutes',
                      periods_to_load=None):
    """
    Downloads data from dataport database into an HDF5 file.

    Parameters
    ----------
    hdf_filename : str
        Output HDF filename.  If file exists already then will be deleted.
    database_username, database_password, database_schema, user_selected_table : str
    periods_to_load : dict of tuples, optional
       Key of dict is the building number (int).
       Values are (<start date>, <end date>)
       e.g. ("2013-04-01", None) or ("2013-04-01", "2013-08-01")
       defaults to all buildings and all date ranges
    """

    database_assert(user_selected_table)
    # dataport database settings
    database_host = 'dataport.pecanstreet.org'
    database_port = '5434'
    database_name = 'postgres'

    # try to connect to database
    try:
        conn = db.connect('host=' + database_host +
                          ' port=' + database_port +
                          ' dbname=' + database_name +
                          ' user=' + database_username +
                          ' password=' + database_password)
    except:
        print('Could not connect to remote database')
        raise

    # map user_selected_table and timestamp column
    timestamp_map = {
        "electricity_egauge_15min": "local_15min",
        "electricity_egauge_hours": "localhour",
        "electricity_egauge_minutes": "localminute",
        "electricity_egauge_seconds": "localminute"
    }

    # set up a new HDF5 datastore (overwrites existing store)
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')

    # Create a temporary metadata dir, remove existing building
    # yaml files in module dir (if any)
    original_metadata_dir = join(get_module_directory(), 'dataset_converters',
                                 'dataport', 'metadata')
    tmp_dir = tempfile.mkdtemp()
    metadata_dir = join(tmp_dir, 'metadata')
    shutil.copytree(original_metadata_dir, metadata_dir)
    print("Using temporary dir for metadata:", metadata_dir)

    for f in os.listdir(metadata_dir):
        if re.search('^building', f):
            os.remove(join(metadata_dir, f))
    """
    TODO:
    The section below can be altered or removed,
    since the restructured Dataport
    now has only one electricity_egauge_minutes table.
    """
    # get tables in database schema
    sql_query = ("SELECT table_name" + " FROM information_schema.views" +
                 " WHERE table_schema ='" + database_schema + "'" +
                 " ORDER BY table_name")
    database_tables = pd.read_sql(sql_query, conn)['table_name'].tolist()
    database_tables = [t for t in database_tables if user_selected_table in t]
    # if user has specified buildings
    if periods_to_load:
        buildings_to_load = list(periods_to_load.keys())
    else:
        # get buildings present in all tables
        sql_query = ''
        for table in database_tables:
            sql_query = (sql_query + '(SELECT DISTINCT dataid' + ' FROM "' +
                         database_schema + '".' + table + ') UNION ')
        sql_query = sql_query[:-7]
        sql_query = (sql_query + ' ORDER BY dataid')
        buildings_to_load = pd.read_sql(sql_query, conn)['dataid'].tolist()

    # for each user specified building or all buildings in database
    for building_id in buildings_to_load:
        print("Loading building {:d} @ {}".format(building_id,
                                                  datetime.datetime.now()))
        sys.stdout.flush()

        # create new list of chunks for concatenating later
        dataframe_list = []

        # for each table of 1 month data
        for database_table in database_tables:
            print("  Loading table {:s}".format(database_table))
            sys.stdout.flush()

            # get buildings present in electricity_egauge_minutes table
            sql_query = ('SELECT DISTINCT dataid' +
                         ' FROM university.metadata' +
                         ' WHERE egauge_min_time IS NOT NULL' +
                         ' ORDER BY dataid')

            buildings_in_table = pd.read_sql(sql_query,
                                             conn)['dataid'].tolist()
            if building_id in buildings_in_table:
                # get first and last timestamps for this
                # house in electricity_egauge_minutes table
                sql_query = ('SELECT MIN(egauge_min_time) AS minlocalminute,' +
                             ' MAX(egauge_max_time) AS maxlocalminute' +
                             ' FROM university.metadata' + ' WHERE dataid=' +
                             str(building_id))

                timestamp_range = pd.read_sql(sql_query, conn)

                first_timestamp_in_table = timestamp_range['minlocalminute'][0]
                last_timestamp_in_table = timestamp_range['maxlocalminute'][0]

                # get requested start and end and localize them
                requested_start = None
                requested_end = None
                database_timezone = 'US/Central'
                if periods_to_load:
                    if periods_to_load[building_id][0]:
                        requested_start = pd.Timestamp(
                            periods_to_load[building_id][0])
                        requested_start = requested_start.tz_localize(
                            database_timezone)
                    if periods_to_load[building_id][1]:
                        requested_end = pd.Timestamp(
                            periods_to_load[building_id][1])
                        requested_end = requested_end.tz_localize(
                            database_timezone)

                # check user start is not after end (only when both are given)
                if (requested_start is not None and requested_end is not None
                        and requested_start > requested_end):
                    print('requested end is before requested start')
                    sys.stdout.flush()
                else:
                    # clip data to smallest range
                    if requested_start:
                        start = max(requested_start, first_timestamp_in_table)
                    else:
                        start = first_timestamp_in_table
                    if requested_end:
                        end = min(requested_end, last_timestamp_in_table)
                    else:
                        end = last_timestamp_in_table

                    # download data in chunks
                    chunk_start = start
                    chunk_size = datetime.timedelta(10)  # 10 days
                    while chunk_start < end:
                        chunk_end = chunk_start + chunk_size
                        if chunk_end > end:
                            chunk_end = end
                        # subtract 1 second so end is exclusive
                        chunk_end = chunk_end - datetime.timedelta(0, 1)

                        # query power data for all channels
                        format = '%Y-%m-%d %H:%M:%S'
                        sql_query = (
                            'SELECT *' + ' FROM "' + database_schema + '".' +
                            user_selected_table + ' WHERE dataid=' +
                            str(building_id) + ' and "' +
                            timestamp_map[user_selected_table] + '" between ' +
                            "'" + chunk_start.strftime(format) + "'" +
                            " and " + "'" + chunk_end.strftime(format) +
                            "' ORDER BY " + timestamp_map[user_selected_table])
                        chunk_dataframe = pd.read_sql(sql_query, conn)
                        # nilmtk requires building indices to start at 1
                        nilmtk_building_id = buildings_to_load.index(
                            building_id) + 1
                        # convert to nilmtk-df and save to disk
                        nilmtk_dataframe = _dataport_dataframe_to_hdf(
                            chunk_dataframe, store, nilmtk_building_id,
                            building_id, timestamp_map[user_selected_table],
                            metadata_dir)

                        # print progress
                        print('    ' + str(chunk_start) + ' -> ' +
                              str(chunk_end) + ': ' +
                              str(len(chunk_dataframe.index)) + ' rows')
                        sys.stdout.flush()

                        # append all chunks into list for csv writing
                        # dataframe_list.append(chunk_dataframe)

                        # move on to next chunk
                        chunk_start = chunk_start + chunk_size

        # saves all chunks in list to csv
        # if len(dataframe_list) > 0:
        # dataframe_concat = pd.concat(dataframe_list)
        # dataframe_concat.to_csv(output_directory + str(building_id) + '.csv')

    store.close()
    conn.close()

    # write yaml to hdf5
    # dataset.yaml and meter_devices.yaml are static, building<x>.yaml are dynamic
    convert_yaml_to_hdf5(metadata_dir, hdf_filename)

    # remove the temporary dir when finished
    shutil.rmtree(tmp_dir)
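
# Minimal usage sketch for download_dataport; the credentials, building ids
# and date ranges are hypothetical placeholders. Each periods_to_load value
# is a (<start date>, <end date>) tuple, with None meaning "no bound".
if __name__ == '__main__':
    download_dataport(
        'my_username', 'my_password', '/data/dataport/dataport.h5',
        periods_to_load={26: ('2014-01-01', '2014-02-01'),
                         54: ('2014-01-01', None)})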