Example #1
def convert_ampds(input_path, output_filename, format='HDF'):
    """
    Convert AMPds R2013 as seen on Dataverse. Download the files
    as CSVs and put them in the `input_path` folder for conversion.
    
    Download URL: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/MXB7VO
    
    Parameters:
    -----------
    input_path: str
        The path of the directory where all the CSV files are stored.
    output_filename: str
        The destination path of the HDF5 file. The path must point to
        a particular file, not just a directory.
    format: str
        Defaults to 'HDF'.
    Example usage:
    --------------
    convert_ampds('/AMPds/electricity', 'store.h5')

    """
    check_directory_exists(input_path)
    files = [
        f for f in listdir(input_path)
        if isfile(join(input_path, f)) and '.csv' in f and '.swp' not in f
    ]
    # Sorting Lexicographically
    files.sort()

    # Remove Whole Home and put it at top
    files.remove("WHE.csv")
    files.insert(0, "WHE.csv")
    assert isdir(input_path)
    store = get_datastore(output_filename, format, mode='w')
    for i, csv_file in enumerate(files):
        key = Key(building=1, meter=(i + 1))
        print('Loading file #', (i + 1), ' : ', csv_file, '. Please wait...')
        df = pd.read_csv(join(input_path, csv_file))
        # Due to fixed width, column names have spaces :(
        df.columns = [x.replace(" ", "") for x in df.columns]
        df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME],
                                  unit='s',
                                  utc=True)
        df = df.drop(TIMESTAMP_COLUMN_NAME, axis=1)
        df = df.tz_convert(TIMEZONE)
        df.rename(columns=lambda x: columnNameMapping[x], inplace=True)
        df.columns.set_names(LEVEL_NAMES, inplace=True)
        df = df.apply(pd.to_numeric, errors='ignore')
        df = df.dropna()
        df = df.astype(np.float32)
        store.put(str(key), df)
        print("Done with file #", (i + 1))

    store.close()
    metadata_path = join(_get_module_directory(), 'metadata')
    print('Processing metadata...')
    convert_yaml_to_hdf5(metadata_path, output_filename)
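A minimal usage sketch for the converter above (the paths are placeholders, and it assumes NILMTK is installed so the converted file can be inspected afterwards):

from nilmtk import DataSet

convert_ampds('/data/AMPds/electricity', '/data/ampds.h5')

# Re-open the converted file and list the meters of building 1.
ampds = DataSet('/data/ampds.h5')
print(ampds.buildings[1].elec)
ampds.store.close()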
Example #2
def _get_ac_type_map(ukdale_path):
    """First we need to convert the YAML metadata to HDF5
    so we can load the metadata into NILMTK to allow
    us to use NILMTK to find the ac_type for each channel.
    
    Parameters
    ----------
    ukdale_path : str

    Returns
    -------
    ac_type_map : dict.  
        Keys are pairs of ints: (<house_instance>, <meter_instance>)
        Values are list of available power ac type for that meter.
    """

    hdf5_just_metadata = join(ukdale_path, 'metadata', 'ukdale_metadata.h5')
    convert_yaml_to_hdf5(join(ukdale_path, 'metadata'), hdf5_just_metadata)
    ukdale_dataset = DataSet(hdf5_just_metadata)
    ac_type_map = {}
    for building_i, building in iteritems(ukdale_dataset.buildings):
        elec = building.elec
        for meter in elec.meters + elec.disabled_meters:
            key = (building_i, meter.instance())
            ac_type_map[key] = meter.available_ac_types('power')
    ukdale_dataset.store.close()
    remove(hdf5_just_metadata)
    return ac_type_map
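For illustration only, the returned mapping has roughly this shape (the actual keys and AC types depend entirely on the UK-DALE metadata; the entries below are made up):

ac_type_map = _get_ac_type_map('/data/ukdale')  # placeholder path
# ac_type_map might look like:
# {(1, 1): ['active', 'apparent'],   # house 1, mains channel
#  (1, 2): ['active'],               # house 1, a submeter
#  ...}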
Example #3
def convert_ukdale(ukdale_path, hdf_filename):
    """Converts the UK-DALE dataset to NILMTK HDF5 format.

    For more information about the UK-DALE dataset, and to download
    it, please see http://www.doc.ic.ac.uk/~dk3810/data/

    Parameters
    ----------
    ukdale_path : str
        The root path of the UK-DALE dataset.  It is assumed that the YAML
        metadata is in 'ukdale_path/metadata'.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """
    ac_type_map = _get_ac_type_map(ukdale_path)

    def _ukdale_measurement_mapping_func(house_id, chan_id):
        ac_type = ac_type_map[(house_id, chan_id)][0]
        return [('power', ac_type)]

    # Convert 6-second data
    _convert(ukdale_path, hdf_filename, _ukdale_measurement_mapping_func, TZ)

    # Add metadata
    convert_yaml_to_hdf5(join(ukdale_path, 'metadata'), hdf_filename)

    # Convert 1-second data
    _convert_one_sec_data(ukdale_path, hdf_filename, ac_type_map)

    print("Done converting UK-DALE to HDF5!")
Example #4
def convert_combed(combed_path, hdf_filename):
    """
    Parameters
    ----------
    combed_path : str
        The root path of the combed dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    check_directory_exists(combed_path)

    # Open HDF5 file
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')
    chan = 1
    for building, meter_array in SUBMETER_PATHS.items():
        for meter in meter_array:
            key = Key(building=1, meter=chan)
            dfs = []
            total = pd.DataFrame()
            for attribute in column_mapping.keys():
                filename_attribute = join(combed_path, building, str(meter), "%s.csv" % attribute)
                print(filename_attribute)
                dfs.append(pd.read_csv(filename_attribute, parse_dates=True, index_col=0, header=0, names=[attribute]))
            total = pd.concat(dfs, axis = 1)
                   
            total.rename(columns=lambda x: column_mapping[x], inplace=True)
            total.columns.set_names(LEVEL_NAMES, inplace=True)
            store.put(str(key), total, format='table')
            store.flush()
            chan = chan + 1
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         hdf_filename)

    print("Done converting COMBED to HDF5!")
Example #5
def refresh_gjw_metadata(gjw_path, output_filename):
    """
    Parameters
    ----------
    gjw_path : str
        The root path of the gjw dataset.
    output_filename : str
        The destination filename (including path and suffix).
        Defaults if not specified.

    Expected directory and file structure:
    nilm_gjw_data
        building<1>
            elec
                4-POWER_REAL_FINE <date> Dump.csv
                5-POWER_REACTIVE_STANDARD <date> Dump.csv
                ...
        ...
        building<n>
        HDF5
            nilm_gjw_data.hdf5
        metadata
            building1.yaml
            dataset.yaml
            meter_devices.yaml
        other files    
    """
    if gjw_path is None: gjw_path = home_dir
    check_directory_exists(gjw_path)
    os.chdir(gjw_path)
    gjw_path = os.getcwd()  # sort out potential issue with slashes or backslashes
    if output_filename is None:
        output_filename = join(home_dir, 'HDF5', 'nilm_gjw_data.hdf5')
    convert_yaml_to_hdf5(join(gjw_path, 'metadata'),output_filename)
    print("Done refreshing metadata")
Example #6
def refresh_gjw_metadata(gjw_path, output_filename):
    """
    Parameters
    ----------
    gjw_path : str
        The root path of the gjw dataset.
    output_filename : str
        The destination filename (including path and suffix).
        Defaults if not specified.

    Expected directory and file structure:
    nilm_gjw_data
        building<1>
            elec
                4-POWER_REAL_FINE <date> Dump.csv
                5-POWER_REACTIVE_STANDARD <date> Dump.csv
                ...
        ...
        building<n>
        HDF5
            nilm_gjw_data.hdf5
        metadata
            building1.yaml
            dataset.yaml
            meter_devices.yaml
        other files    
    """
    if gjw_path is None: gjw_path = home_dir
    check_directory_exists(gjw_path)
    os.chdir(gjw_path)
    gjw_path = os.getcwd()  # sort out potential issue with slashes or backslashes
    if output_filename is None:
        output_filename = join(home_dir, 'HDF5', 'nilm_gjw_data.hdf5')
    convert_yaml_to_hdf5(join(gjw_path, 'metadata'), output_filename)
    print("Done refreshing metadata")
Example #7
def _get_ac_type_map(ukdale_path):
    """First we need to convert the YAML metadata to HDF5
    so we can load the metadata into NILMTK to allow
    us to use NILMTK to find the ac_type for each channel.
    
    Parameters
    ----------
    ukdale_path : str

    Returns
    -------
    ac_type_map : dict.  
        Keys are pairs of ints: (<house_instance>, <meter_instance>)
        Values are list of available power ac type for that meter.
    """

    hdf5_just_metadata = join(ukdale_path, 'metadata', 'ukdale_metadata.h5')
    convert_yaml_to_hdf5(join(ukdale_path, 'metadata'), hdf5_just_metadata)
    ukdale_dataset = DataSet(hdf5_just_metadata)
    ac_type_map = {}
    for building_i, building in iteritems(ukdale_dataset.buildings):
        elec = building.elec
        for meter in elec.meters + elec.disabled_meters:
            key = (building_i, meter.instance())
            ac_type_map[key] = meter.available_ac_types('power')

    ukdale_dataset.store.close()
    remove(hdf5_just_metadata)
    return ac_type_map
Example #8
def convert_combed(combed_path, output_filename, format='HDF'):
    """
    Parameters
    ----------
    combed_path : str
        The root path of the combed dataset.
    output_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    check_directory_exists(combed_path)

    # Open store
    store = get_datastore(output_filename, format, mode='w')

    for building_name, building_mapping in overall_dataset_mapping.items():
        for load_name, load_mapping in building_mapping.items():
            for load_mapping_path, meter_number in load_mapping.items():
                building_number = building_number_mapping[building_name]
                key = Key(building=building_number, meter=meter_number)
                dfs = []
                for attribute in column_mapping.keys():
                    filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" % attribute)
                    print(filename_attribute)
                    dfs.append(pd.read_csv(filename_attribute, parse_dates=True, index_col=0, header=0, names=[attribute]))
                total = pd.concat(dfs, axis=1)
                total = total.tz_localize('UTC').tz_convert('Asia/Kolkata')
                total.rename(columns=lambda x: column_mapping[x], inplace=True)
                total.columns.set_names(LEVEL_NAMES, inplace=True)
                assert total.index.is_unique
                store.put(str(key), total)
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         output_filename)

    print("Done converting COMBED to HDF5!")
Example #9
def convert_ukdale(ukdale_path, hdf_filename):
    """
    Parameters
    ----------
    ukdale_path : str
        The root path of the UK-DALE dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    def _ukdale_measurement_mapping_func(house_id, chan_id):
        # TODO: This needs updating.  It's wrong!
        ac_type = 'apparent' if chan_id <= 2 else 'active'
        return [('power', ac_type)]

    _convert(ukdale_path, hdf_filename, _ukdale_measurement_mapping_func, 
             'Europe/London')

    # Add metadata
    convert_yaml_to_hdf5(join(get_module_directory(), 
                              'dataset_converters', 
                              'ukdale', 
                              'metadata'),
                         hdf_filename)

    print("Done converting UK-DALE to HDF5!")
Example #10
def convert_greend(greend_path, hdf_filename):
    """
    Parameters
    ----------
    greend_path : str
        The root path of the greend dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """
    store = pd.HDFStore(hdf_filename, "w", complevel=9, complib="zlib")
    houses = sorted(__get_houses(greend_path))
    print(houses)
    h = 1
    for house in houses:
        print("loading " + house)
        abs_house = join(greend_path, house)
        dates = [d for d in listdir(abs_house) if d.startswith("dataset")]
        house_data = []
        for date in dates:
            print("-----------------------", date)
            try:
                tmp_pandas = pd.read_csv(join(abs_house, date), na_values=["na"],
                                         on_bad_lines="skip")
            except pd.errors.ParserError:
                # Skip malformed files (irregular column number)
                continue

            if "timestamp" in tmp_pandas.columns:
                pass
            else:
                tmp_pandas["timestamp"] = tmp_pandas.index
            tmp_pandas.index = tmp_pandas["timestamp"].convert_objects(convert_numeric=True).values
            tmp_pandas = tmp_pandas.drop("timestamp", 1)

            tmp_pandas = tmp_pandas.astype("float32")

            tmp_pandas.index = pd.to_datetime(tmp_pandas.index, unit="s")
            tmp_pandas = tmp_pandas.tz_localize("UTC").tz_convert("CET")
            tmp_pandas = tmp_pandas.drop_duplicates()
            # tmp_pandas = tmp_pandas.sort_index()
            house_data.append(tmp_pandas)
        overall_df = pd.concat(house_data)
        overall_df = overall_df.drop_duplicates()
        overall_df = overall_df.sort_index()

        m = 1

        for column in overall_df.columns:
            print("meter" + str(m) + ": " + column)
            key = Key(building=h, meter=m)
            print("Putting into store...")
            store.put(str(key), overall_df[column], format="table")
            m += 1
            print("Flushing store...")
            store.flush()
        h += 1

    store.close()

    # needs to be edited
    convert_yaml_to_hdf5("/path/to/metadata", hdf_filename)
Example #11
def convert_combed(combed_path, output_filename, format='HDF'):
    """
    Parameters
    ----------
    combed_path : str
        The root path of the combed dataset.
    output_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    check_directory_exists(combed_path)

    # Open store
    store = get_datastore(output_filename, format, mode='w')

    any_file_converted = False
    
    for building_name, building_mapping in iteritems(overall_dataset_mapping):
        for load_name, load_mapping in iteritems(building_mapping):
            for load_mapping_path, meter_number in iteritems(load_mapping):
                building_number = building_number_mapping[building_name]
                key = Key(building=building_number, meter=meter_number)
                dfs = []
                for attribute in column_mapping.keys():
                    filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" %attribute)
                    if not os.path.isfile(filename_attribute):
                        # File not found directly in the combed_path provided
                        # Try adding 'iiitd' to it
                        filename_attribute = join(combed_path, 'iiitd', building_name, load_name, load_mapping_path, "%s.csv" %attribute)
                    
                    if os.path.isfile(filename_attribute):
                        exists = True
                        print(filename_attribute)
                        df = pd.read_csv(filename_attribute, names=["timestamp", attribute])
                        df.index = pd.to_datetime(df["timestamp"], unit='ms')
                        df = df.drop("timestamp", 1)
                        dfs.append(df)
                    else:
                        exists = False
                        
                if exists:
                    total = pd.concat(dfs, axis=1)
                    total = total.tz_localize('UTC').tz_convert('Asia/Kolkata')
                    total.columns = pd.MultiIndex.from_tuples([column_mapping[x] for x in total.columns])
                    total.columns.set_names(LEVEL_NAMES, inplace=True)
                    assert total.index.is_unique
                    store.put(str(key), total)
                    any_file_converted = True
                    
    if not any_file_converted:
        raise RuntimeError('No files converted, did you specify the correct path?')
                    
    convert_yaml_to_hdf5(
        join(get_module_directory(), 'dataset_converters', 'combed', 'metadata'),
        output_filename
    )

    print("Done converting COMBED to HDF5!")
Example #12
def convert_combed(combed_path, output_filename, format='HDF'):
    """
    Parameters
    ----------
    combed_path : str
        The root path of the combed dataset.
    output_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    check_directory_exists(combed_path)

    # Open store
    store = get_datastore(output_filename, format, mode='w')

    any_file_converted = False
    
    for building_name, building_mapping in iteritems(overall_dataset_mapping):
        for load_name, load_mapping in iteritems(building_mapping):
            for load_mapping_path, meter_number in iteritems(load_mapping):
                building_number = building_number_mapping[building_name]
                key = Key(building=building_number, meter=meter_number)
                dfs = []
                for attribute in column_mapping.keys():
                    filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" %attribute)
                    if not os.path.isfile(filename_attribute):
                        # File not found directly in the combed_path provided
                        # Try adding 'iiitd' to it
                        filename_attribute = join(combed_path, 'iiitd', building_name, load_name, load_mapping_path, "%s.csv" %attribute)
                    
                    if os.path.isfile(filename_attribute):
                        exists = True
                        print(filename_attribute)
                        df = pd.read_csv(filename_attribute, names=["timestamp", attribute])
                        df.index = pd.to_datetime(df["timestamp"], unit='ms')
                        df = df.drop("timestamp", 1)
                        dfs.append(df)
                    else:
                        exists = False
                        
                if exists:
                    total = pd.concat(dfs, axis=1)
                    total = total.tz_localize('UTC').tz_convert('Asia/Kolkata')
                    total.columns = pd.MultiIndex.from_tuples([column_mapping[x] for x in total.columns])
                    total.columns.set_names(LEVEL_NAMES, inplace=True)
                    assert total.index.is_unique
                    store.put(str(key), total)
                    any_file_converted = True
                    
    if not any_file_converted:
        raise RuntimeError('No files converted, did you specify the correct path?')
                    
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         output_filename)

    print("Done converting COMBED to HDF5!")
Example #13
def convert_ampds(input_path, output_filename, format='HDF'):
    """
    Convert AMPds R2013 as seen on Dataverse. Download the files
    as CSVs and put them in the `input_path` folder for conversion.
    
    Download URL: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/MXB7VO
    
    Parameters:
    -----------
    input_path: str
        The path of the directory where all the CSV files are stored.
    output_filename: str
        The destination path of the HDF5 file. The path must point to
        a particular file, not just a directory.
    format: str
        Defaults to 'HDF'.
    Example usage:
    --------------
    convert_ampds('/AMPds/electricity', 'store.h5')

    """
    check_directory_exists(input_path)
    files = [f for f in listdir(input_path) if isfile(join(input_path, f)) and
             '.csv' in f and '.swp' not in f]
    # Sorting Lexicographically
    files.sort()

    # Remove Whole Home and put it at top
    files.remove("WHE.csv")
    files.insert(0, "WHE.csv")
    assert isdir(input_path)
    store = get_datastore(output_filename, format, mode='w')
    for i, csv_file in enumerate(files):
        key = Key(building=1, meter=(i + 1))
        print('Loading file #', (i + 1), ' : ', csv_file, '. Please wait...')
        df = pd.read_csv(join(input_path, csv_file))
        # Due to fixed width, column names have spaces :(
        df.columns = [x.replace(" ", "") for x in df.columns]
        df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME], unit='s', utc=True)
        df = df.drop(TIMESTAMP_COLUMN_NAME, axis=1)
        df = df.tz_convert(TIMEZONE)
        df.rename(columns=lambda x: columnNameMapping[x], inplace=True)
        df.columns.set_names(LEVEL_NAMES, inplace=True)
        df = df.apply(pd.to_numeric, errors='ignore')
        df = df.dropna()
        df = df.astype(np.float32)
        store.put(str(key), df)
        print("Done with file #", (i + 1))
        
    store.close()
    metadata_path = join(get_module_directory(), 'dataset_converters', 'ampds', 'metadata')
    print('Processing metadata...')
    convert_yaml_to_hdf5(metadata_path, output_filename)
Example #14
def convert_ukdale(ukdale_path,
                   output_filename,
                   format='HDF',
                   drop_duplicates=True):
    """Converts the UK-DALE dataset to NILMTK HDF5 format.

    For more information about the UK-DALE dataset, and to download
    it, please see http://www.doc.ic.ac.uk/~dk3810/data/

    Parameters
    ----------
    ukdale_path : str
        The root path of the UK-DALE dataset.  It is assumed that the YAML
        metadata is in 'ukdale_path/metadata'.
    output_filename : str
        The destination filename (including path and suffix).
    format : str
        format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF'
    drop_duplicates : bool
        Remove entries with duplicated timestamp (keeps the first value)
        Defaults to True.
    """
    ac_type_map = _get_ac_type_map(ukdale_path)

    def _ukdale_measurement_mapping_func(house_id, chan_id):
        ac_type = ac_type_map[(house_id, chan_id)][0]
        return [('power', ac_type)]

    # Open DataStore
    store = get_datastore(output_filename, format, mode='w')

    # Convert 6-second data
    _convert(ukdale_path,
             store,
             _ukdale_measurement_mapping_func,
             TZ,
             sort_index=False,
             drop_duplicates=drop_duplicates)
    store.close()

    # Add metadata
    if format == 'HDF':
        convert_yaml_to_hdf5(join(ukdale_path, 'metadata'), output_filename)

    # Convert 1-second data
    store.open(mode='a')
    _convert_one_sec_data(ukdale_path, store, ac_type_map, drop_duplicates)

    store.close()
    print("Done converting UK-DALE to HDF5!")
Example #15
def convert_ampds(input_path, output_filename, format="HDF"):
    """
    Parameters:
    -----------
    input_path: str
        The path of the directory where all the CSV files are stored.
    output_filename: str
        The destination path of the HDF5 file. The path must point to
        a particular file, not just a directory.
    format: str
        Defaults to 'HDF'.
    Example usage:
    --------------
    convert_ampds('/AMPds/electricity', 'store.h5')

    """
    check_directory_exists(input_path)
    files = [f for f in listdir(input_path) if isfile(join(input_path, f)) and ".csv" in f and ".swp" not in f]
    # Sorting Lexicographically
    files.sort()

    # Remove Whole Home and put it at top
    files.remove("WHE.csv")
    files.insert(0, "WHE.csv")
    assert isdir(input_path)
    store = get_datastore(output_filename, format, mode="w")
    for i, csv_file in enumerate(files):
        key = Key(building=1, meter=(i + 1))
        print("Loading file #", (i + 1), " : ", csv_file, ". Please wait...")
        df = pd.read_csv(join(input_path, csv_file))
        # Due to fixed width, column names have spaces :(
        df.columns = [x.replace(" ", "") for x in df.columns]
        df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME], unit="s", utc=True)
        df = df.drop(TIMESTAMP_COLUMN_NAME, axis=1)
        # The index is already UTC-aware, so just convert it to the local timezone.
        df = df.tz_convert(TIMEZONE)
        df.rename(columns=lambda x: columnNameMapping[x], inplace=True)
        df.columns.set_names(LEVEL_NAMES, inplace=True)
        df = df.apply(pd.to_numeric, errors="coerce")
        df = df.dropna()
        df = df.astype(np.float32)
        store.put(str(key), df)
        print("Done with file #", (i + 1))
    store.close()
    metadata_path = join(_get_module_directory(), "metadata")
    print("Processing metadata...")
    convert_yaml_to_hdf5(metadata_path, output_filename)
Example #16
def convert_combed(combed_path, output_filename, format='HDF'):
    """
    Parameters
    ----------
    combed_path : str
        The root path of the combed dataset.
    output_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    check_directory_exists(combed_path)

    # Open store
    store = get_datastore(output_filename, format, mode='w')

    for building_name, building_mapping in iteritems(overall_dataset_mapping):
        for load_name, load_mapping in iteritems(building_mapping):
            for load_mapping_path, meter_number in iteritems(load_mapping):
                building_number = building_number_mapping[building_name]
                key = Key(building=building_number, meter=meter_number)
                dfs = []
                for attribute in column_mapping.keys():
                    filename_attribute = join(combed_path, building_name,
                                              load_name, load_mapping_path,
                                              "%s.csv" % attribute)
                    if os.path.isfile(filename_attribute):
                        exists = True
                        print(filename_attribute)
                        df = pd.read_csv(filename_attribute,
                                         header=0,
                                         names=["timestamp", attribute])
                        df.index = pd.to_datetime(df["timestamp"], unit='ms')
                        df = df.drop("timestamp", axis=1)
                        dfs.append(df)
                    else:
                        exists = False
                if exists:
                    total = pd.concat(dfs, axis=1)
                    total = total.tz_localize('UTC').tz_convert('Asia/Kolkata')
                    total.rename(columns=lambda x: column_mapping[x],
                                 inplace=True)
                    total.columns.set_names(LEVEL_NAMES, inplace=True)
                    assert total.index.is_unique
                    store.put(str(key), total)
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         output_filename)

    print("Done converting COMBED to HDF5!")
Example #17
def convert_greend(greend_path, hdf_filename):
    """
    Parameters
    ----------
    greend_path : str
        The root path of the greend dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """


    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')
    houses = sorted(__get_houses(greend_path))
    print(houses)
    h = 1
    for house in houses:
        print('loading '+house+"'s house...")
        abs_house = join(greend_path, house)
        dates = [d for d in listdir(abs_house) if d.startswith('dataset')]
        house_data = pd.DataFrame()
        for date in dates:
            print('-----------------------',date)
            tmp_pandas = pd.read_csv(join(abs_house, date), index_col=0)
            tmp_pandas = tmp_pandas[tmp_pandas.index != 'timestamp']
            tmp_pandas = tmp_pandas.sort_index()
            tmp_pandas.index = [__timestamp(t) for t in tmp_pandas.index]
            house_data = pd.concat([house_data, tmp_pandas])

            #for testing metadata files:
            #break
        m = 1 


        for meter in house_data:
            print("meter" + str(m)+': ')
            key = Key(building = h, meter=m)
            print("Putting into store...")
            store.put(str(key), house_data[meter], format = 'table')
            m += 1
            print('Flushing store...')
            store.flush()
        h += 1

    store.close()

    #needs to be edited
    convert_yaml_to_hdf5('/path/to/metadata', hdf_filename)
Example #18
def convert_iawe(iawe_path, output_filename, format="HDF"):
    """
    Parameters
    ----------
    iawe_path : str
        The root path of the iawe dataset.
    output_filename : str
        The destination filename (including path and suffix).
    """

    check_directory_exists(iawe_path)
    idx = pd.date_range(start=START_DATETIME, end=END_DATETIME, freq=FREQ)
    idx = idx.tz_localize('GMT').tz_convert(TIMEZONE)

    # Open data store
    store = get_datastore(output_filename, format, mode='w')
    electricity_path = join(iawe_path, "electricity")

    # Mains data
    for chan in range(1, 12):
        key = Key(building=1, meter=chan)
        filename = join(electricity_path, "%d.csv" % chan)
        print('Loading ', chan)
        df = pd.read_csv(filename, dtype=np.float64, na_values='\\N')
        df.drop_duplicates(subset=["timestamp"], inplace=True)
        df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True)
        df = df.tz_convert(TIMEZONE)
        df = df.drop(TIMESTAMP_COLUMN_NAME, axis=1)
        df.columns = pd.MultiIndex.from_tuples(
            [column_mapping[x] for x in df.columns], names=LEVEL_NAMES)
        df = df.apply(pd.to_numeric, errors='ignore')
        df = df.dropna()
        df = df.astype(np.float32)
        df = df.sort_index()
        df = df.resample("1T").mean()
        df = reindex_fill_na(df, idx)
        assert df.isnull().sum().sum() == 0
        store.put(str(key), df)
    store.close()

    metadata_dir = join(get_module_directory(), 'dataset_converters', 'iawe',
                        'metadata')
    convert_yaml_to_hdf5(metadata_dir, output_filename)

    print("Done converting iAWE to HDF5!")
Example #19
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('inpath', help='input directory (ANTgen output)', nargs='?', default='../output')
    parser.add_argument('outfile', help='output file (HDF5 file)', nargs='?', default='../output/ANTgen.h5')
    args = parser.parse_args()

    if not os.path.exists('metadata') or not os.path.isfile('metadata/building1.yaml'):
        print("No metadata found. Please run 'generate_metadata.py' before using this tool...")
        exit(1)

    print("Converting ANTgen output from '{}' to file '{}'".format(args.inpath, args.outfile))

    with open('metadata/building1.yaml', 'r') as f:
        yaml_dict = yaml.load(f, Loader=yaml.FullLoader)

    channel_list = ['total']  # pre-populate with aggregate data (total.csv)
    for app in yaml_dict['appliances']:
        channel_list.append(app['original_name'])

    store = get_datastore(args.outfile, 'HDF', mode='w')

    for i, app_name in enumerate(channel_list):
        print("Adding virtual meter ID {:02d}: {}".format(1+i, app_name))
        key = Key(building=1, meter=(i + 1))

        csvfile = os.path.join(args.inpath, str(app_name)+'.csv')
        try:
            df = pd.read_csv(csvfile, sep=';', encoding='utf-8', index_col=0)
            df.columns = pd.MultiIndex.from_tuples([('power', 'active') for x in df.columns], names=LEVEL_NAMES)
            df.index = pd.to_datetime(df.index)

            tz_naive = df.index
            tz_aware = tz_naive.tz_localize(tz='Europe/Vienna', ambiguous=True, nonexistent=pd.Timedelta('1H'))
            df.index = tz_aware

            df = df.tz_convert('Europe/Vienna')

            store.put(str(key), df)
        except FileNotFoundError:
            print("Input file '{}' not found - your HDF5 file will be incomplete!".format(csvfile))
            continue

    print('Adding metadata...')
    convert_yaml_to_hdf5('metadata/', args.outfile)
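Assuming the script above is saved as convert_antgen.py (the file name is an assumption) and is run from the directory that contains the generated metadata/ folder, it could be invoked like this:

# From a shell:
#   python convert_antgen.py ../output ../output/ANTgen.h5
# or, equivalently, from Python:
import sys
sys.argv = ['convert_antgen.py', '../output', '../output/ANTgen.h5']
main()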
Example #20
def convert_iawe(iawe_path, output_filename, format="HDF"):
    """
    Parameters
    ----------
    iawe_path : str
        The root path of the iawe dataset.
    output_filename : str
        The destination filename (including path and suffix).
    """

    check_directory_exists(iawe_path)
    idx = pd.date_range(start=START_DATETIME, end=END_DATETIME, freq=FREQ)
    idx = idx.tz_localize('GMT').tz_convert(TIMEZONE)

    # Open data store
    store = get_datastore(output_filename, format, mode='w')
    electricity_path = join(iawe_path, "electricity")

    # Mains data
    for chan in range(1, 12):
        key = Key(building=1, meter=chan)
        filename = join(electricity_path, "%d.csv" % chan)
        print('Loading ', chan)
        df = pd.read_csv(filename, dtype=np.float64, na_values='\\N')
        df.drop_duplicates(subset=["timestamp"], inplace=True)
        df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True)
        df = df.tz_convert(TIMEZONE)
        df = df.drop(TIMESTAMP_COLUMN_NAME, axis=1)
        df.rename(columns=lambda x: column_mapping[x], inplace=True)
        df.columns.set_names(LEVEL_NAMES, inplace=True)
        df = df.apply(pd.to_numeric, errors='ignore')
        df = df.dropna()
        df = df.astype(np.float32)
        df = df.sort_index()
        df = df.resample("1T").mean()
        df = reindex_fill_na(df, idx)
        assert df.isnull().sum().sum() == 0
        store.put(str(key), df)
    store.close()
    
    metadata_dir = join(get_module_directory(), 'dataset_converters', 'iawe', 'metadata')
    convert_yaml_to_hdf5(metadata_dir, output_filename)

    print("Done converting iAWE to HDF5!")
Example #21
def convert_caxe(file_path):
    '''
    Parameters
    ------------
    file_path : str
        Name of the input CSV file to be converted.
        The CSV columns should contain the following values, in order:
        timestamp, reactive_power, apparent_power, current, frequency,
        voltage, active_power.
    Converts the data into HDF5 format and saves it as test.h5.
    '''
    df = pd.read_csv(f'{file_path}',
                     names=['timestamp', 'R', 'A', 'C', 'F', 'V', 'T'])
    column_mapping = {
        'F': ('frequency', ""),
        'V': ('voltage', ""),
        'T': ('power', 'active'),
        'C': ('current', ''),
        'R': ('power', 'reactive'),
        'A': ('power', 'apparent'),
    }

    output_filename = 'test.h5'

    # Open data store
    store = get_datastore(output_filename, format='HDF', mode='w')
    key = Key(building=1, meter=1)
    print('Loading ', 1)
    df.index = pd.to_datetime(df.timestamp.values)
    df = df.tz_convert(TIMEZONE)  # if an error occurs, use tz_localize for tz-naive timestamps
    df = df.drop(TIMESTAMP_COLUMN_NAME, axis=1)
    df.index = pd.to_datetime(df.index.values)
    df.columns = pd.MultiIndex.from_tuples(
        [column_mapping[x] for x in df.columns], names=LEVEL_NAMES)
    df = df.apply(pd.to_numeric, errors='ignore')
    df = df.dropna()
    df = df.astype(np.float32)
    df = df.sort_index()
    df = df.resample("1T").mean()
    assert df.isnull().sum().sum() == 0
    store.put(str(key), df)
    store.close()
    convert_yaml_to_hdf5('./metadata', output_filename)

    print("Done converting test data to HDF5!")
Example #22
def convert_combed(combed_path, hdf_filename):
    """
    Parameters
    ----------
    combed_path : str
        The root path of the combed dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    assert isdir(combed_path)

    # Open HDF5 file
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')
    chan = 1
    for building, meter_array in SUBMETER_PATHS.items():
        for meter in meter_array:
            key = Key(building=1, meter=chan)
            dfs = []
            total = pd.DataFrame()
            for attribute in column_mapping.keys():
                filename_attribute = join(combed_path, building, str(meter),
                                          "%s.csv" % attribute)
                print(filename_attribute)
                dfs.append(
                    pd.read_csv(filename_attribute,
                                parse_dates=True,
                                index_col=0,
                                header=0,
                                names=[attribute]))
            total = pd.concat(dfs, axis=1)

            total.rename(columns=lambda x: column_mapping[x], inplace=True)
            total.columns.set_names(LEVEL_NAMES, inplace=True)
            store.put(str(key), total, format='table')
            store.flush()
            chan = chan + 1
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         hdf_filename)

    print("Done converting COMBED to HDF5!")
Example #23
def convert_iawe(iawe_path, hdf_filename):
    """
    Parameters
    ----------
    iawe_path : str
        The root path of the iawe dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    check_directory_exists(iawe_path)

    # Open HDF5 file
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')

    electricity_path = join(iawe_path, "electricity")

    # Mains data
    for chan in range(1, 13):
        key = Key(building=1, meter=chan)
        filename = join(electricity_path, "%d.csv" % chan)
        print('Loading ', chan)
        df = pd.read_csv(filename)
        df.index = pd.to_datetime(
            (df.timestamp.values * 1E9).astype(int), utc=True)
        df = df.tz_convert('Asia/Kolkata')
        df = df.drop('timestamp', axis=1)
        df.rename(columns=lambda x: column_mapping[x], inplace=True)
        df.columns.set_names(LEVEL_NAMES, inplace=True)
        df = df.apply(pd.to_numeric, errors='coerce')
        df = df.dropna()
        df = df.astype(np.float32)
        df = df.sort_index()
        store.put(str(key), df, format='table')
        store.flush()
    store.close()
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         hdf_filename)

    print("Done converting iAWE to HDF5!")
Example #24
def convert_iawe(iawe_path, hdf_filename):
    """
    Parameters
    ----------
    iawe_path : str
        The root path of the iawe dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    assert isdir(iawe_path)

    # Open HDF5 file
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')

    electricity_path = join(iawe_path, "electricity")

    # Mains data
    for chan in range(1, 13):
        key = Key(building=1, meter=chan)
        filename = join(electricity_path, "%d.csv" % chan)
        print('Loading ', chan)
        df = pd.read_csv(filename)
        df.index = pd.to_datetime((df.timestamp.values * 1E9).astype(int),
                                  utc=True)
        df = df.tz_convert('Asia/Kolkata')
        df = df.drop('timestamp', axis=1)
        df.rename(columns=lambda x: column_mapping[x], inplace=True)
        df.columns.set_names(LEVEL_NAMES, inplace=True)
        df = df.apply(pd.to_numeric, errors='coerce')
        df = df.dropna()
        df = df.astype(np.float32)
        df = df.sort_index()
        store.put(str(key), df, format='table')
        store.flush()
    store.close()
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         hdf_filename)

    print("Done converting iAWE to HDF5!")
Example #25
def convert_ukdale(ukdale_path, output_filename, format='HDF'):
    """Converts the UK-DALE dataset to NILMTK HDF5 format.

    For more information about the UK-DALE dataset, and to download
    it, please see http://www.doc.ic.ac.uk/~dk3810/data/

    Parameters
    ----------
    ukdale_path : str
        The root path of the UK-DALE dataset.  It is assumed that the YAML
        metadata is in 'ukdale_path/metadata'.
    output_filename : str
        The destination filename (including path and suffix).
    format : str
        format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF'
    """
    ac_type_map = _get_ac_type_map(ukdale_path)

    def _ukdale_measurement_mapping_func(house_id, chan_id):
        ac_type = ac_type_map[(house_id, chan_id)][0]
        return [('power', ac_type)]

    # Open DataStore
    store = get_datastore(output_filename, format, mode='w')

    # Convert 6-second data
    _convert(ukdale_path, store, _ukdale_measurement_mapping_func, TZ,
             sort_index=False)
    store.close()

    # Add metadata
    if format == 'HDF':
        convert_yaml_to_hdf5(join(ukdale_path, 'metadata'), output_filename)

    # Convert 1-second data
    store.open(mode='a')
    _convert_one_sec_data(ukdale_path, store, ac_type_map)

    store.close()
    print("Done converting UK-DALE to HDF5!")
Example #26
def convert_redd(redd_path, hdf_filename):
    """
    Parameters
    ----------
    redd_path : str
        The root path of the REDD low_freq dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    assert isdir(redd_path)

    # Open HDF5 file
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')

    # Iterate though all houses and channels
    houses = _find_all_houses(redd_path)
    for house_id in houses:
        print("Loading house", house_id, end="... ")
        stdout.flush()
        chans = _find_all_chans(redd_path, house_id)
        for chan_id in chans:
            print(chan_id, end=" ")
            stdout.flush()
            key = Key(building=house_id, meter=chan_id)
            ac_type = 'apparent' if chan_id <= 2 else 'active'
            df = _load_chan(redd_path, key, [('power', ac_type)])
            store.put(str(key), df, format='table')
            store.flush()
        print()

    store.close()
    
    # Add metadata
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         hdf_filename)

    print("Done converting REDD to HDF5!")
Example #27
def convert_redd(redd_path, hdf_filename):
    """
    Parameters
    ----------
    redd_path : str
        The root path of the REDD low_freq dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    assert isdir(redd_path)

    # Open HDF5 file
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')

    # Iterate though all houses and channels
    houses = _find_all_houses(redd_path)
    for house_id in houses:
        print("Loading house", house_id, end="... ")
        stdout.flush()
        chans = _find_all_chans(redd_path, house_id)
        for chan_id in chans:
            print(chan_id, end=" ")
            stdout.flush()
            key = Key(building=house_id, meter=chan_id)
            ac_type = 'apparent' if chan_id <= 2 else 'active'
            df = _load_chan(redd_path, key, [('power', ac_type)])
            store.put(str(key), df, format='table')
            store.flush()
        print()

    store.close()

    # Add metadata
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         hdf_filename)

    print("Done converting REDD to HDF5!")
Example #28
def convert_iawe(iawe_path, output_filename, format="HDF"):
    """
    Parameters
    ----------
    iawe_path : str
        The root path of the iawe dataset.
    output_filename : str
        The destination filename (including path and suffix).
    """

    check_directory_exists(iawe_path)

    # Open data store
    store = get_datastore(output_filename, format, mode='w')
    electricity_path = join(iawe_path, "electricity")

    # Mains data
    for chan in range(1, 13):
        key = Key(building=1, meter=chan)
        filename = join(electricity_path, "%d.csv" % chan)
        print('Loading ', chan)
        df = pd.read_csv(filename)
        df.drop_duplicates(subset=["timestamp"], inplace=True)
        df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True)
        df = df.tz_convert(TIMEZONE)
        df = df.drop(TIMESTAMP_COLUMN_NAME, axis=1)
        df.rename(columns=lambda x: column_mapping[x], inplace=True)
        df.columns.set_names(LEVEL_NAMES, inplace=True)
        df = df.apply(pd.to_numeric, errors='coerce')
        df = df.dropna()
        df = df.astype(np.float32)
        df = df.sort_index()
        store.put(str(key), df)
    store.close()
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         output_filename)

    print("Done converting iAWE to HDF5!")
Example #29
def convert_ukdale(ukdale_path, hdf_filename):
    """
    Parameters
    ----------
    ukdale_path : str
        The root path of the UK-DALE dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """
    def _ukdale_measurement_mapping_func(house_id, chan_id):
        # TODO: This needs updating.  It's wrong!
        ac_type = 'apparent' if chan_id <= 2 else 'active'
        return [('power', ac_type)]

    _convert(ukdale_path, hdf_filename, _ukdale_measurement_mapping_func,
             'Europe/London')

    # Add metadata
    convert_yaml_to_hdf5(
        join(get_module_directory(), 'dataset_converters', 'ukdale',
             'metadata'), hdf_filename)

    print("Done converting UK-DALE to HDF5!")
Example #30
def convert_redd(redd_path, hdf_filename):
    """
    Parameters
    ----------
    redd_path : str
        The root path of the REDD low_freq dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    def _redd_measurement_mapping_func(house_id, chan_id):
        ac_type = 'apparent' if chan_id <= 2 else 'active'
        return [('power', ac_type)]

    _convert(redd_path, hdf_filename, _redd_measurement_mapping_func, 'US/Eastern')

    # Add metadata
    convert_yaml_to_hdf5(join(get_module_directory(), 
                              'dataset_converters', 
                              'redd', 
                              'metadata'),
                         hdf_filename)

    print("Done converting REDD to HDF5!")
Example #31
def convert_redd(redd_path, hdf_filename):
    """
    Parameters
    ----------
    redd_path : str
        The root path of the REDD low_freq dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    def _redd_measurement_mapping_func(house_id, chan_id):
        ac_type = 'apparent' if chan_id <= 2 else 'active'
        return [('power', ac_type)]

    _convert(redd_path, hdf_filename, _redd_measurement_mapping_func, 'US/Eastern')

    # Add metadata
    convert_yaml_to_hdf5(join(get_module_directory(), 
                              'dataset_converters', 
                              'redd', 
                              'metadata'),
                         hdf_filename)

    print("Done converting REDD to HDF5!")
Example #32
def convert_eco(dataset_loc, hdf_filename, timezone):
	"""
	Parameters:
	-----------
	dataset_loc: str
		The root directory where the dataset is located.
	hdf_filename: str
		The destination HDF5 filename (including path and suffix).
	timezone: str
		Specifies the timezone of the dataset.
	"""

	# Creating a new HDF File
	store = pd.HDFStore(hdf_filename, 'w')

	check_directory_exists(dataset_loc)
	directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i]
	directory_list.sort()
	print(directory_list)

	# Traversing every folder
	for folder in directory_list:
		print('Computing for folder', folder)

		#Building number and meter_flag
		building_no = int(folder[:2])
		meter_flag = 'sm' if 'sm_csv' in folder else 'plugs'

		dir_list = [i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc,folder,i))]
		dir_list.sort()
		print('Current dir list:', dir_list)

		for fl in dir_list:
			#Meter number to be used in key
			meter_num = 1 if meter_flag == 'sm' else int(fl) + 1

			print('Computing for Meter no.', meter_num)

			fl_dir_list = [i for i in listdir(join(dataset_loc,folder,fl)) if '.csv' in i]
			fl_dir_list.sort()

			key = Key(building=building_no, meter=meter_num)

			for fi in fl_dir_list:

				#Getting dataframe for each csv file seperately
				df_fl = _get_df(join(dataset_loc,folder,fl),fi,meter_flag)
				df_fl.sort_index(ascending=True,inplace=True)
				df_fl = df_fl.tz_convert(timezone)

				# If table not present in hdf5, create or else append to existing data
				if str(key) not in store:
					store.put(str(key), df_fl, format='Table')
				else:
					store.append(str(key), df_fl, format='Table')
				store.flush()
				print('Building', building_no, ', Meter no.', meter_num, '=> Done for ', fi[:-4])

	print("Data storage completed.")
	store.close()

	# Adding the metadata to the HDF5file
	print("Proceeding to Metadata conversion...")
	meta_path = join(_get_module_directory(), 'metadata')
	convert_yaml_to_hdf5(meta_path, hdf_filename)
	print("Completed Metadata conversion.")
Example #33
def convert_eco(dataset_loc, hdf_filename, timezone):
    """
    Parameters:
    -----------
    dataset_loc: str
        The root directory where the dataset is located.
    hdf_filename: str
        The destination HDF5 filename (including path and suffix).
    timezone: str
        Specifies the timezone of the dataset.
    """

    # Creating a new HDF File
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='blosc')    
    
    check_directory_exists(dataset_loc)
    directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i]
    directory_list.sort()
    print(directory_list)

    # Traversing every folder
    for folder in directory_list:

        if folder[0] == '.' or folder[-3:] == '.h5':
            print('Skipping ', folder)
            continue
        print('Computing for folder', folder)

        #Building number and meter_flag
        building_no = int(folder[:2])
        meter_flag = 'sm' if 'sm_csv' in folder else 'plugs'

        dir_list = [i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc,folder,i))]
        dir_list.sort()
        print('Current dir list:', dir_list)

        for fl in dir_list:
            
            print('Computing for folder ', fl)
            
            fl_dir_list = [i for i in listdir(join(dataset_loc,folder,fl)) if '.csv' in i]
            fl_dir_list.sort()

            if meter_flag == 'sm':
                for fi in fl_dir_list:
                    df = pd.read_csv(join(dataset_loc,folder,fl,fi), names=[i for i in range(1,17)], dtype=np.float32)
                    
                    for phase in range(1,4):
                        key = str(Key(building=building_no, meter=phase))
                        df_phase = df.loc[:, [1+phase, 5+phase, 8+phase, 13+phase]]

                        # get reactive power
                        power = df_phase[[1+phase, 13+phase]].values
                        reactive = power[:,0] * np.tan(power[:,1] * np.pi / 180)
                        df_phase['Q'] = reactive
                        
                        df_phase.index = pd.date_range(start=fi[:-4], freq='s', periods=86400, tz='GMT')
                        df_phase = df_phase.tz_convert(timezone)
                        
                        sm_column_name = {1+phase:('power', 'active'),
                                            5+phase:('current', ''),
                                            8+phase:('voltage', ''),
                                            13+phase:('phase_angle', ''),
                                            'Q': ('power', 'reactive'),
                                            };
                        df_phase.rename(columns=sm_column_name, inplace=True)
                        
                        tmp_before = np.size(df_phase.power.active)
                        df_phase = df_phase[df_phase.power.active != -1]
                        tmp_after = np.size(df_phase.power.active)
                        if (tmp_before != tmp_after):
                            print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after))
                        
                        df_phase.columns.set_names(LEVEL_NAMES, inplace=True)
                        if not key in store:
                            store.put(key, df_phase, format='Table')
                        else:
                            store.append(key, df_phase, format='Table')
                            store.flush()
                        print('Building', building_no, ', Meter no.', phase, '=> Done for ', fi[:-4])
                
            else:
                #Meter number to be used in key
                meter_num = int(fl) + 3
                
                key = str(Key(building=building_no, meter=meter_num))
                
                #Getting dataframe for each csv file seperately
                for fi in fl_dir_list:
                    df = pd.read_csv(join(dataset_loc,folder,fl ,fi), names=[1], dtype=np.float64)
                    df.index = pd.date_range(start=fi[:-4], freq='s', periods=86400, tz='GMT')
                    df.rename(columns=plugs_column_name, inplace=True)
                    df = df.tz_convert(timezone)
                    df.columns.set_names(LEVEL_NAMES, inplace=True)

                    tmp_before = np.size(df.power.active)
                    df = df[df.power.active != -1]
                    tmp_after = np.size(df.power.active)
                    if (tmp_before != tmp_after):
                        print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after))
                    
                    # If table not present in hdf5, create or else append to existing data
                    if not key in store:
                        store.put(key, df, format='Table')
                        print('Building', building_no, ', Meter no.', meter_num, '=> Done for ', fi[:-4])
                    else:
                        store.append(key, df, format='Table')
                        store.flush()
                        print('Building', building_no, ', Meter no.', meter_num, '=> Done for ', fi[:-4])
            
    print "Data storage completed."
    store.close()

    # Adding the metadata to the HDF5file
    print "Proceeding to Metadata conversion..."
    meta_path = join(_get_module_directory(), 'metadata')
    convert_yaml_to_hdf5(meta_path, hdf_filename)
    print "Completed Metadata conversion."
Example #34
def convert_hes(data_dir, output_filename, format='HDF', max_chunks=None):
    metadata = {
        'name': 'HES',
        'geographic_coordinates': (51.464462,-0.076544), # London
        'timezone': 'Europe/London'
    }
    
    # Open DataStore
    store = get_datastore(output_filename, format, mode='w')
    
    # load list of appliances
    hes_to_nilmtk_appliance_lookup = pd.read_csv(join(get_module_directory(), 
                                        'dataset_converters', 
                                        'hes', 
                                        'hes_to_nilmtk_appliance_lookup.csv'))

    # load list of houses
    hes_house_ids = load_list_of_house_ids(data_dir)
    nilmtk_house_ids = np.arange(1,len(hes_house_ids)+1)
    hes_to_nilmtk_house_ids = dict(zip(hes_house_ids, nilmtk_house_ids))

    # array of hes_house_codes: nilmtk_building_code = house_codes.index(hes_house_code)
    house_codes = []
    # map of hes_house_id -> list of appliance codes seen so far
    house_appliance_codes = dict()

    # Iterate over files
    for filename in FILENAMES:
        # Load appliance energy data chunk-by-chunk
        full_filename = join(data_dir, filename)
        print('loading', full_filename)
        try:
            reader = pd.read_csv(full_filename, names=COL_NAMES, 
                                 index_col=False, chunksize=CHUNKSIZE)
        except IOError as e:
            print(e, file=stderr)
            continue

        # Iterate over chunks in file
        chunk_i = 0
        for chunk in reader:
            if max_chunks is not None and chunk_i >= max_chunks:
                break

            print(' processing chunk', chunk_i, 'of', filename)
            # Convert date and time columns to np.datetime64 objects
            dt = chunk['date'] + ' ' + chunk['time']
            del chunk['date']
            del chunk['time']
            chunk['datetime'] = dt.apply(datetime_converter)

            # Data is either tenths of a Wh or tenths of a degree
            chunk['data'] *= 10
            chunk['data'] = chunk['data'].astype(np.float32)

            # Iterate over houses in chunk
            for hes_house_id, hes_house_id_df in chunk.groupby('house id'):
                if hes_house_id not in house_codes:
                    house_codes.append(hes_house_id)
                    
                if hes_house_id not in house_appliance_codes.keys():
                    house_appliance_codes[hes_house_id] = []
                
                nilmtk_house_id = house_codes.index(hes_house_id)+1
                
                # Iterate over appliances in house
                for appliance_code, appliance_df in chunk.groupby('appliance code'):
                    if appliance_code not in house_appliance_codes[hes_house_id]:
                        house_appliance_codes[hes_house_id].append(appliance_code)
                    nilmtk_meter_id = house_appliance_codes[hes_house_id].index(appliance_code)+1
                    _process_meter_in_chunk(nilmtk_house_id, nilmtk_meter_id, hes_house_id_df, store, appliance_code)
                    
            chunk_i += 1
    print('houses with some data loaded:', house_appliance_codes.keys())
    
    store.close()
    
    # generate building yaml metadata
    for hes_house_id in house_codes:
        nilmtk_building_id = house_codes.index(hes_house_id)+1
        building_metadata = {}
        building_metadata['instance'] = nilmtk_building_id
        building_metadata['original_name'] = int(hes_house_id) # use python int
        building_metadata['elec_meters'] = {}
        building_metadata['appliances'] = []
        
        # initialise dict of instances of each appliance type
        instance_counter = {}
        
        for appliance_code in house_appliance_codes[hes_house_id]:
            nilmtk_meter_id = house_appliance_codes[hes_house_id].index(appliance_code)+1
            # meter metadata
            if appliance_code in MAINS_CODES:
                meter_metadata = {'device_model': 'multivoies',
                                  'site_meter': True}
                break
            elif appliance_code in CIRCUIT_CODES:
                meter_metadata = {'device_model': 'multivoies'}
                break
            elif appliance_code in TEMPERATURE_CODES:
                break
            else: # is appliance
                meter_metadata = {'device_model': 'wattmeter'}
                
            # only appliance meters at this point
            building_metadata['elec_meters'][nilmtk_meter_id] = meter_metadata
            # appliance metadata
            lookup_row = hes_to_nilmtk_appliance_lookup[hes_to_nilmtk_appliance_lookup.Code==appliance_code].iloc[0]
            appliance_metadata = {'original_name': lookup_row.Name, 
                                      'meters': [nilmtk_meter_id] }
            # appliance type
            appliance_metadata.update({'type': lookup_row.nilmtk_name})
            # TODO appliance room
            
            # appliance instance number
            if instance_counter.get(lookup_row.nilmtk_name) == None:
                instance_counter[lookup_row.nilmtk_name] = 0
            instance_counter[lookup_row.nilmtk_name] += 1 
            appliance_metadata['instance'] = instance_counter[lookup_row.nilmtk_name]
            
            building_metadata['appliances'].append(appliance_metadata)
        building = 'building{:d}'.format(nilmtk_building_id)
        yaml_full_filename = join(_get_module_directory(), 'metadata', building + '.yaml')
        with open(yaml_full_filename, 'w') as outfile:
            #print(building_metadata)
            outfile.write(yaml.dump(building_metadata))
            
    
    # write yaml metadata to hdf5
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         output_filename)
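
# A minimal usage sketch, not taken from the source: it assumes the raw HES
# CSV files listed in FILENAMES sit under /data/HES (the path and the small
# max_chunks value are illustrative only, chosen for a quick test run).
if __name__ == '__main__':
    convert_hes('/data/HES', 'hes.h5', format='HDF', max_chunks=4)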
Example #35
0
def download_dataport(database_username, database_password, 
                     hdf_filename, periods_to_load=None):
    """
    Downloads data from dataport database into an HDF5 file.

    Parameters
    ----------
    hdf_filename : str
        Output HDF filename.  If file exists already then will be deleted.
    database_username, database_password : str
    periods_to_load : dict of tuples, optional
       Key of dict is the building number (int).
       Values are (<start date>, <end date>)
       e.g. ("2013-04-01", None) or ("2013-04-01", "2013-08-01")
       defaults to all buildings and all date ranges
    """

    # dataport database settings
    database_host = 'dataport.pecanstreet.org'
    database_port = '5434'
    database_name = 'postgres'
    database_schema = 'university'

    # try to connect to database
    try:
        conn = db.connect('host=' + database_host + 
                          ' port=' + database_port + 
                          ' dbname=' + database_name + 
                          ' user=' + database_username +
                          ' password=' + database_password)
    except:
        print('Could not connect to remote database')
        raise

    # set up a new HDF5 datastore (overwrites existing store)
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')
    
    # remove existing building yaml files in module dir
    for f in os.listdir(join(_get_module_directory(), 'metadata')):
        if re.search('^building', f):
            os.remove(join(_get_module_directory(), 'metadata', f))
    
    """
    TODO:
    The section below can be altered or removed, since the restructured Dataport
    now has only one electricity_egauge_minutes table.
    """
    # get tables in database schema
    sql_query = ("SELECT table_name" +
                 " FROM information_schema.views" +
                 " WHERE table_schema ='" + database_schema + "'" +
                 " ORDER BY table_name")
    database_tables = pd.read_sql(sql_query, conn)['table_name'].tolist()
    database_tables = [t for t in database_tables if 'electricity_egauge_minutes' in t]

    # if user has specified buildings
    if periods_to_load:
        buildings_to_load = list(periods_to_load.keys())
    else:
        # get buildings present in all tables
        sql_query = ''
        for table in database_tables:
            sql_query = (sql_query + '(SELECT DISTINCT dataid' + 
                         ' FROM "' + database_schema + '".' + table + 
                         ') UNION ')
        sql_query = sql_query[:-7]
        sql_query = (sql_query + ' ORDER BY dataid') 
        buildings_to_load = pd.read_sql(sql_query, conn)['dataid'].tolist()

    # for each user specified building or all buildings in database
    for building_id in buildings_to_load:
        print("Loading building {:d} @ {}"
              .format(building_id, datetime.datetime.now()))
        sys.stdout.flush()

        # create new list of chunks for concatenating later
        dataframe_list = []

        # for each table of 1 month data
        for database_table in database_tables:
            print("  Loading table {:s}".format(database_table))
            sys.stdout.flush()

            # get buildings present in electricity_egauge_minutes table
            sql_query = ('SELECT DISTINCT dataid' +
                         ' FROM university.metadata' +
                         ' WHERE egauge_min_time IS NOT NULL' +
                         ' ORDER BY dataid')
            buildings_in_table = pd.read_sql(sql_query, conn)['dataid'].tolist()

            if building_id in buildings_in_table:
                # get first and last timestamps for this house in electricity_egauge_minutes table
                sql_query = ('SELECT MIN(egauge_min_time) AS minlocalminute,' +
                             ' MAX(egauge_max_time) AS maxlocalminute' +
                             ' FROM university.metadata' +
                             ' WHERE dataid=' + str(building_id))
                range = pd.read_sql(sql_query, conn)
                first_timestamp_in_table = range['minlocalminute'][0]
                last_timestamp_in_table = range['maxlocalminute'][0]

                # get requested start and end and localize them
                requested_start = None
                requested_end = None
                database_timezone = 'US/Central'
                if periods_to_load:
                    if periods_to_load[building_id][0]:
                        requested_start = pd.Timestamp(periods_to_load[building_id][0])
                        requested_start = requested_start.tz_localize(database_timezone)
                    if periods_to_load[building_id][1]:
                        requested_end = pd.Timestamp(periods_to_load[building_id][1])
                        requested_end = requested_end.tz_localize(database_timezone)

                # check user start is not after end
                if requested_start and requested_end and requested_start > requested_end:
                    print('requested end is before requested start')
                    sys.stdout.flush()
                else:                        
                    # clip data to smallest range
                    if requested_start:
                        start = max(requested_start, first_timestamp_in_table)
                    else:
                        start = first_timestamp_in_table
                    if requested_end:
                        end = min(requested_end, last_timestamp_in_table)
                    else:
                        end = last_timestamp_in_table

                    # download data in chunks
                    chunk_start = start
                    chunk_size = datetime.timedelta(10)  # 10 days
                    while chunk_start < end:
                        chunk_end = chunk_start + chunk_size 
                        if chunk_end > end:
                            chunk_end = end
                        # subtract 1 second so end is exclusive
                        chunk_end = chunk_end - datetime.timedelta(0, 1)

                        # query power data for all channels
                        format = '%Y-%m-%d %H:%M:%S'
                        sql_query = ('SELECT *' + 
                                     ' FROM "' + database_schema + '".' + database_table + 
                                     ' WHERE dataid=' + str(building_id) + 
                                     ' and localminute between ' + 
                                     "'" + chunk_start.strftime(format) + "'" + 
                                     " and " + 
                                     "'" + chunk_end.strftime(format) + "'")
                        chunk_dataframe = pd.read_sql(sql_query, conn)
                        
                        # nilmtk requires building indices to start at 1
                        nilmtk_building_id = buildings_to_load.index(building_id) + 1
                        # convert to nilmtk-df and save to disk
                        nilmtk_dataframe = _dataport_dataframe_to_hdf(chunk_dataframe, store,
                                                                         nilmtk_building_id,
                                                                         building_id)

                        # print progress
                        print('    ' + str(chunk_start) + ' -> ' + 
                              str(chunk_end) + ': ' + 
                              str(len(chunk_dataframe.index)) + ' rows')
                        sys.stdout.flush()

                        # append all chunks into list for csv writing
                        #dataframe_list.append(chunk_dataframe)

                        # move on to next chunk
                        chunk_start = chunk_start + chunk_size

        # saves all chunks in list to csv
        #if len(dataframe_list) > 0:
            #dataframe_concat = pd.concat(dataframe_list)
            #dataframe_concat.to_csv(output_directory + str(building_id) + '.csv')
            
    store.close()
    conn.close()
    
    # write yaml to hdf5
    # dataset.yaml and meter_devices.yaml are static, building<x>.yaml are dynamic  
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         hdf_filename)
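
# A minimal usage sketch based on the docstring above; the credentials, output
# path and building numbers are placeholders, not values from the source.
# periods_to_load maps a building number to a (start, end) pair, where None
# leaves that end of the range open.
if __name__ == '__main__':
    download_dataport(
        database_username='my_username',    # placeholder credential
        database_password='my_password',    # placeholder credential
        hdf_filename='dataport.h5',
        periods_to_load={26: ('2013-04-01', '2013-08-01'),
                         54: ('2013-04-01', None)})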
Example #36
0
def convert_gjw(gjw_path, output_filename):
    """
    Parameters
    ----------
    gjw_path : str
        The root path of the gjw dataset.
    output_filename : str
        The destination filename (including path and suffix).
        Defaults to HDF5/nilm_gjw_data.hdf5 under the dataset home
        directory if not specified.

    Directory and file structure:
    nilm_gjw_data
        building<1>
            elec
                4-POWER_REAL_FINE <date> Dump.csv
                5-POWER_REACTIVE_STANDARD <date> Dump.csv
                ...
        ...
        building<n>
        HDF5
            nilm_gjw_data.hdf5
        metadata
            building1.yaml
            dataset.yaml
            meter_devices.yaml
        other files    
    """
    if gjw_path is None: gjw_path = home_dir
    check_directory_exists(gjw_path)
    os.chdir(gjw_path)
    gjw_path = os.getcwd()  # sort out potential issue with slashes or backslashes
    if output_filename is None:
        output_filename = join(home_dir, 'HDF5', 'nilm_gjw_data.hdf5')
    # Open data store
    print('opening datastore', output_filename)
    store = get_datastore(output_filename, format, mode='w')
    # walk the directory tree from the dataset home directory
    #clear dataframe & add column headers
    df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME, REACTIVE_COLUMN_NAME])
    found = False
    for current_dir, _, files in os.walk(gjw_path):
        # the unused second element of the os.walk() tuple would be dirs_in_current_dir
        if current_dir.find('.git') != -1 or current_dir.find('.ipynb') != -1:
            #print( 'Skipping ', current_dir)
            continue
        print('checking', current_dir)
        m = bld_re.search(current_dir)
        if m:  #The csv files may be further down the tree so this section may be repeated
            building_name = m.group()
            building_nbr = int(bld_nbr_re.search(building_name).group())
            meter_nbr = 1
            key = Key(building=building_nbr, meter=meter_nbr)
        for items in fnmatch.filter(files, "4*.csv"):
            # process any .CSV files found
            found = True
            ds = iso_date_re.search(items).group()
            # print( 'found files for date:', ds,end=" ")
            # found files to process
            df1 = _read_file_pair(current_dir,
                                  ds)  # read two csv files into a dataframe
            df = pd.concat(
                [df, df1])  # concatenate the results into one long dataframe
        if found:
            found = False
            df = _prepare_data_for_toolkit(df)
            _summarise_dataframe(df, 'Prepared for tool kit')
            store.put(str(key), df)
            #clear dataframe & add column headers
            #df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME,REACTIVE_COLUMN_NAME])
            break  # only 1 folder with .csv files at present
    store.close()
    convert_yaml_to_hdf5(join(gjw_path, 'metadata'), output_filename)
    print("Done converting gjw to HDF5!")
Example #37
0
def convert_gjw(gjw_path, output_filename):
    """
    Parameters
    ----------
    gjw_path : str
        The root path of the gjw dataset.
    output_filename : str
        The destination filename (including path and suffix).
        Defaults to HDF5/nilm_gjw_data.hdf5 under the dataset home
        directory if not specified.

    Directory and file structure:
    nilm_gjw_data
        building<1>
            elec
                4-POWER_REAL_FINE <date> Dump.csv
                5-POWER_REACTIVE_STANDARD <date> Dump.csv
                ...
        ...
        building<n>
        HDF5
            nilm_gjw_data.hdf5
        metadata
            building1.yaml
            dataset.yaml
            meter_devices.yaml
        other files    
    """
    if gjw_path is None: gjw_path = home_dir
    check_directory_exists(gjw_path)
    os.chdir(gjw_path)
    gjw_path = os.getcwd()  # sort out potential issue with slashes or backslashes
    if output_filename is None:
        output_filename = join(home_dir, 'HDF5', 'nilm_gjw_data.hdf5')
    # Open data store
    print( 'opening datastore', output_filename)
    store = get_datastore(output_filename, format, mode='w')
    # walk the directory tree from the dataset home directory
    #clear dataframe & add column headers
    df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME,REACTIVE_COLUMN_NAME])
    found = False
    for current_dir, _, files in os.walk(gjw_path):
        # the unused second element of the os.walk() tuple would be dirs_in_current_dir
        if current_dir.find('.git')!=-1 or current_dir.find('.ipynb') != -1:
            #print( 'Skipping ', current_dir)
            continue
        print( 'checking', current_dir)
        m = bld_re.search(current_dir)
        if m: #The csv files may be further down the tree so this section may be repeated
            building_name = m.group()
            building_nbr = int(bld_nbr_re.search(building_name).group())
            meter_nbr = 1
            key = Key(building=building_nbr, meter=meter_nbr)
        for items in fnmatch.filter(files, "4*.csv"):
            # process any .CSV files found
            found = True
            ds = iso_date_re.search(items).group()
            # print( 'found files for date:', ds,end=" ")
            # found files to process
            df1 = _read_file_pair(current_dir,ds) # read two csv files into a dataframe    
            df = pd.concat([df,df1]) # concatenate the results into one long dataframe
        if found:
            found = False
            df = _prepare_data_for_toolkit(df)
            _summarise_dataframe(df,'Prepared for tool kit')
            store.put(str(key), df)
            #clear dataframe & add column headers
            #df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME,REACTIVE_COLUMN_NAME])
            break # only 1 folder with .csv files at present
    store.close()
    convert_yaml_to_hdf5(join(gjw_path, 'metadata'),output_filename)
    print("Done converting gjw to HDF5!")
Example #38
0
def convert_eco(dataset_loc, hdf_filename, timezone):
    """
    Parameters:
    -----------
    dataset_loc: str
        The root directory where the dataset is located.
    hdf_filename: str
        The destination path of the HDF5 file. The path must
        include the output file name itself, not just a
        directory, for the converter to work.
    timezone: str
        specifies the timezone of the dataset.
    """

    # Creating a new HDF File
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='blosc')    
    
    check_directory_exists(dataset_loc)
    directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i]
    directory_list.sort()
    print(directory_list)

    found_any_sm = False
    found_any_plug = False
    
    # Traversing every folder
    for folder in directory_list:
        if folder[0] == '.' or folder[-3:] == '.h5':
            print('Skipping ', folder)
            continue

        #Building number and meter_flag
        building_no = int(folder[:2])
        meter_flag = None 
        if 'sm_csv' in folder:
            meter_flag = 'sm'
        elif 'plugs' in folder:
            meter_flag = 'plugs'
        else:
            print('Skipping folder', folder)
            continue
            
        print('Computing for folder', folder)

        dir_list = [i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc,folder,i))]
        dir_list.sort()
        
        if meter_flag == 'plugs' and len(dir_list) < 3:
            # Try harder to find the subfolders
            folder = join(folder, folder[:2])
            dir_list = [i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc,folder,i))]
        
        print('Current dir list:', dir_list)

        for fl in dir_list:
            print('Computing for folder ', fl)
            
            fl_dir_list = [i for i in listdir(join(dataset_loc,folder,fl)) if '.csv' in i]
            fl_dir_list.sort()

            if meter_flag == 'sm':
                for fi in fl_dir_list:
                    found_any_sm = True
                    df = pd.read_csv(join(dataset_loc,folder,fl,fi), names=[i for i in range(1,17)], dtype=np.float32)
                    
                    for phase in range(1,4):
                        key = str(Key(building=building_no, meter=phase))
                        df_phase = df.loc[:,[1+phase, 5+phase, 8+phase, 13+phase]]

                        # get reactive power
                        power = df_phase.loc[:, (1+phase, 13+phase)].values
                        reactive = power[:,0] * np.tan(power[:,1] * np.pi / 180)
                        df_phase['Q'] = reactive
                        
                        df_phase.index = pd.DatetimeIndex(start=fi[:-4], freq='s', periods=86400, tz='GMT')
                        df_phase = df_phase.tz_convert(timezone)
                        
                        sm_column_name = {
                            1+phase:('power', 'active'),
                            5+phase:('current', ''),
                            8+phase:('voltage', ''),
                            13+phase:('phase_angle', ''),
                            'Q': ('power', 'reactive'),
                        }
                        df_phase.columns = pd.MultiIndex.from_tuples([
                            sm_column_name[col] for col in df_phase.columns
                        ])
                        
                        power_active = df_phase['power', 'active']
                        tmp_before = np.size(power_active)
                        df_phase = df_phase[power_active != -1]
                        power_active = df_phase['power', 'active']
                        tmp_after = np.size(power_active)
                        
                        if tmp_before != tmp_after:
                            print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after))
                        
                        df_phase.columns.set_names(LEVEL_NAMES, inplace=True)
                        if not key in store:
                            store.put(key, df_phase, format='Table')
                        else:
                            store.append(key, df_phase, format='Table')
                            store.flush()
                        print('Building', building_no, ', Meter no.', phase,
                              '=> Done for ', fi[:-4])
                
            else:
                #Meter number to be used in key
                meter_num = int(fl) + 3
                
                key = str(Key(building=building_no, meter=meter_num))

                current_folder = join(dataset_loc,folder,fl)
                if not fl_dir_list:
                    raise RuntimeError("No CSV file found in " + current_folder)
                    
                #Getting dataframe for each csv file separately
                for fi in fl_dir_list:
                    found_any_plug = True
                    df = pd.read_csv(join(current_folder, fi), names=[1], dtype=np.float64)
                    df.index = pd.DatetimeIndex(start=fi[:-4].replace('.', ':'), freq='s', periods=86400, tz = 'GMT')
                    df.columns = pd.MultiIndex.from_tuples(plugs_column_name.values())
                    df = df.tz_convert(timezone)
                    df.columns.set_names(LEVEL_NAMES, inplace=True)

                    tmp_before = np.size(df.power.active)
                    df = df[df.power.active != -1]
                    tmp_after = np.size(df.power.active)
                    if (tmp_before != tmp_after):
                        print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after))
                    
                    # If table not present in hdf5, create or else append to existing data
                    if not key in store:
                        store.put(key, df, format='Table')
                        print('Building',building_no,', Meter no.',meter_num,'=> Done for ',fi[:-4])
                    else:
                        store.append(key, df, format='Table')
                        store.flush()
                        print('Building',building_no,', Meter no.',meter_num,'=> Done for ',fi[:-4])
            
            
    if not found_any_plug or not found_any_sm:
        raise RuntimeError('The files were not found! Please check the folder structure. Extract each ZIP file into a folder with its base name (e.g. extract "01_plugs_csv.zip" into a folder named "01_plugs_csv", etc.)')
        
    print("Data storage completed.")
    store.close()

    # Adding the metadata to the HDF5file
    print("Proceeding to Metadata conversion...")
    meta_path = join(
        get_module_directory(), 
        'dataset_converters',
        'eco',
        'metadata'
    )
    convert_yaml_to_hdf5(meta_path, hdf_filename)
    print("Completed Metadata conversion.")
Example #39
0
def download_dataport(database_username, database_password, 
                     hdf_filename, periods_to_load=None):
    """
    Downloads data from dataport database into an HDF5 file.

    Parameters
    ----------
    hdf_filename : str
        Output HDF filename.  If file exists already then will be deleted.
    database_username, database_password : str
    periods_to_load : dict of tuples, optional
       Key of dict is the building number (int).
       Values are (<start date>, <end date>)
       e.g. ("2013-04-01", None) or ("2013-04-01", "2013-08-01")
       defaults to all buildings and all date ranges
    """

    # dataport database settings
    database_host = 'dataport.pecanstreet.org'
    database_port = '5434'
    database_name = 'postgres'
    database_schema = 'university'

    # try to connect to database
    try:
        conn = db.connect('host=' + database_host + 
                          ' port=' + database_port + 
                          ' dbname=' + database_name + 
                          ' user=' + database_username +
                          ' password=' + database_password)
    except:
        print('Could not connect to remote database')
        raise

    # set up a new HDF5 datastore (overwrites existing store)
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')
    
    # remove existing building yaml files in module dir
    for f in os.listdir(join(_get_module_directory(), 'metadata')):
        if re.search('^building', f):
            os.remove(join(_get_module_directory(), 'metadata', f))
    
    """
    TODO:
    The section below can be altered or removed, since the restructured Dataport
    now has only one electricity_egauge_minutes table.
    """
    # get tables in database schema
    sql_query = ("SELECT table_name" +
                 " FROM information_schema.views" +
                 " WHERE table_schema ='" + database_schema + "'" +
                 " ORDER BY table_name")
    database_tables = pd.read_sql(sql_query, conn)['table_name'].tolist()
    database_tables = [t for t in database_tables if 'electricity_egauge_minutes' in t]

    # if user has specified buildings
    if periods_to_load:
        buildings_to_load = list(periods_to_load.keys())
    else:
        # get buildings present in all tables
        sql_query = ''
        for table in database_tables:
            sql_query = (sql_query + '(SELECT DISTINCT dataid' + 
                         ' FROM "' + database_schema + '".' + table + 
                         ') UNION ')
        sql_query = sql_query[:-7]
        sql_query = (sql_query + ' ORDER BY dataid') 
        buildings_to_load = pd.read_sql(sql_query, conn)['dataid'].tolist()

    # for each user specified building or all buildings in database
    for building_id in buildings_to_load:
        print("Loading building {:d} @ {}"
              .format(building_id, datetime.datetime.now()))
        sys.stdout.flush()

        # create new list of chunks for concatenating later
        dataframe_list = []

        # for each table of 1 month data
        for database_table in database_tables:
            print("  Loading table {:s}".format(database_table))
            sys.stdout.flush()

            # get buildings present in electricity_egauge_minutes table
            sql_query = ('SELECT DISTINCT dataid' +
                         ' FROM university.metadata' +
                         ' WHERE egauge_min_time IS NOT NULL' +
                         ' ORDER BY dataid')
            buildings_in_table = pd.read_sql(sql_query, conn)['dataid'].tolist()

            if building_id in buildings_in_table:
                # get first and last timestamps for this house in electricity_egauge_minutes table
                sql_query = ('SELECT MIN(egauge_min_time) AS minlocalminute,' +
                             ' MAX(egauge_max_time) AS maxlocalminute' +
                             ' FROM university.metadata' +
                             ' WHERE dataid=' + str(building_id))
                range = pd.read_sql(sql_query, conn)
                first_timestamp_in_table = range['minlocalminute'][0]
                last_timestamp_in_table = range['maxlocalminute'][0]

                # get requested start and end and localize them
                requested_start = None
                requested_end = None
                database_timezone = 'US/Central'
                if periods_to_load:
                    if periods_to_load[building_id][0]:
                        requested_start = pd.Timestamp(periods_to_load[building_id][0])
                        requested_start = requested_start.tz_localize(database_timezone)
                    if periods_to_load[building_id][1]:
                        requested_end = pd.Timestamp(periods_to_load[building_id][1])
                        requested_end = requested_end.tz_localize(database_timezone)

                # check user start is not after end
                if requested_start and requested_end and requested_start > requested_end:
                    print('requested end is before requested start')
                    sys.stdout.flush()
                else:                        
                    # clip data to smallest range
                    if requested_start:
                        start = max(requested_start, first_timestamp_in_table)
                    else:
                        start = first_timestamp_in_table
                    if requested_end:
                        end = min(requested_end, last_timestamp_in_table)
                    else:
                        end = last_timestamp_in_table

                    # download data in chunks
                    chunk_start = start
                    chunk_size = datetime.timedelta(10)  # 10 days
                    while chunk_start < end:
                        chunk_end = chunk_start + chunk_size 
                        if chunk_end > end:
                            chunk_end = end
                        # subtract 1 second so end is exclusive
                        chunk_end = chunk_end - datetime.timedelta(0, 1)

                        # query power data for all channels
                        format = '%Y-%m-%d %H:%M:%S'
                        sql_query = ('SELECT *' + 
                                     ' FROM "' + database_schema + '".' + database_table + 
                                     ' WHERE dataid=' + str(building_id) + 
                                     ' and localminute between ' + 
                                     "'" + chunk_start.strftime(format) + "'" + 
                                     " and " + 
                                     "'" + chunk_end.strftime(format) + "'")
                        chunk_dataframe = pd.read_sql(sql_query, conn)
                        
                        # nilmtk requires building indices to start at 1
                        nilmtk_building_id = buildings_to_load.index(building_id) + 1
                        # convert to nilmtk-df and save to disk
                        nilmtk_dataframe = _dataport_dataframe_to_hdf(chunk_dataframe, store,
                                                                         nilmtk_building_id,
                                                                         building_id)

                        # print progress
                        print('    ' + str(chunk_start) + ' -> ' + 
                              str(chunk_end) + ': ' + 
                              str(len(chunk_dataframe.index)) + ' rows')
                        sys.stdout.flush()

                        # append all chunks into list for csv writing
                        #dataframe_list.append(chunk_dataframe)

                        # move on to next chunk
                        chunk_start = chunk_start + chunk_size

        # saves all chunks in list to csv
        #if len(dataframe_list) > 0:
            #dataframe_concat = pd.concat(dataframe_list)
            #dataframe_concat.to_csv(output_directory + str(building_id) + '.csv')
            
    store.close()
    conn.close()
    
    # write yaml to hdf5
    # dataset.yaml and meter_devices.yaml are static, building<x>.yaml are dynamic  
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         hdf_filename)
Example #40
0

feed_ignore = ['gen', 'grid']

WEATHER_HVAC_STORE = os.path.join(script_path, '..', '..', '..',
                                  'data/hvac/weather_hvac_2013.h5')

temp_h5_path = os.path.expanduser("~/Downloads/wiki-temp.h5")

store_total = pd.HDFStore(temp_h5_path)

store_useful = pd.HDFStore(WEATHER_HVAC_STORE)
useful_keys = [k[:-2] for k in store_useful.keys() if "X" in k]

START, STOP = "2013-07-01", "2013-07-31"

store_name = os.path.expanduser("~/wikienergy-2013.h5")
with pd.HDFStore(store_name, "w") as store_to_write:
    for nilmtk_id, dataid_str in enumerate(useful_keys):

        dataid = int(dataid_str[1:])

        df = store_total[dataid_str][START:STOP]
        if df['air1'].sum() > 0:
            print("Writing ", nilmtk_id, dataid)
            _dataport_dataframe_to_hdf(df, store_to_write, nilmtk_id + 1,
                                       dataid)
        else:
            print("Skipping", nilmtk_id, dataid)
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), store_name)
Example #41
0
def convert_eco(dataset_loc, hdf_filename, timezone):
    """
    Parameters:
    -----------
    dataset_loc: str
        The root directory where the dataset is located.
    hdf_filename: str
        The destination path of the HDF5 file. The path must
        include the output file name itself, not just a
        directory, for the converter to work.
    timezone: str
        specifies the timezone of the dataset.
    """

    # Creating a new HDF File
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='blosc')

    check_directory_exists(dataset_loc)
    directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i]
    directory_list.sort()
    print(directory_list)

    # Traversing every folder
    for folder in directory_list:

        if folder[0] == '.' or folder[-3:] == '.h5':
            print('Skipping ', folder)
            continue
        print('Computing for folder', folder)

        #Building number and meter_flag
        building_no = int(folder[:2])
        meter_flag = 'sm' if 'sm_csv' in folder else 'plugs'

        dir_list = [
            i for i in listdir(join(dataset_loc, folder))
            if isdir(join(dataset_loc, folder, i))
        ]
        dir_list.sort()
        print('Current dir list:', dir_list)

        for fl in dir_list:

            print('Computing for folder ', fl)

            fl_dir_list = [
                i for i in listdir(join(dataset_loc, folder, fl))
                if '.csv' in i
            ]
            fl_dir_list.sort()

            if meter_flag == 'sm':
                for fi in fl_dir_list:
                    df = pd.read_csv(join(dataset_loc, folder, fl, fi),
                                     names=[i for i in range(1, 17)],
                                     dtype=np.float32)

                    for phase in range(1, 4):
                        key = str(Key(building=building_no, meter=phase))
                        df_phase = df.ix[:, [
                            1 + phase, 5 + phase, 8 + phase, 13 + phase
                        ]]

                        # get reactive power
                        power = df_phase.as_matrix([1 + phase, 13 + phase])
                        reactive = power[:, 0] * np.tan(
                            power[:, 1] * np.pi / 180)
                        df_phase['Q'] = reactive

                        df_phase.index = pd.DatetimeIndex(start=fi[:-4],
                                                          freq='s',
                                                          periods=86400,
                                                          tz='GMT')
                        df_phase = df_phase.tz_convert(timezone)

                        sm_column_name = {
                            1 + phase: ('power', 'active'),
                            5 + phase: ('current', ''),
                            8 + phase: ('voltage', ''),
                            13 + phase: ('phase_angle', ''),
                            'Q': ('power', 'reactive'),
                        }
                        df_phase.rename(columns=sm_column_name, inplace=True)

                        tmp_before = np.size(df_phase.power.active)
                        df_phase = df_phase[df_phase.power.active != -1]
                        tmp_after = np.size(df_phase.power.active)
                        if (tmp_before != tmp_after):
                            print(
                                'Removed missing measurements - Size before: '
                                + str(tmp_before) + ', size after: ' +
                                str(tmp_after))

                        df_phase.columns.set_names(LEVEL_NAMES, inplace=True)
                        if not key in store:
                            store.put(key, df_phase, format='Table')
                        else:
                            store.append(key, df_phase, format='Table')
                            store.flush()
                        print('Building', building_no, ', Meter no.', phase,
                              '=> Done for ', fi[:-4])

            else:
                #Meter number to be used in key
                meter_num = int(fl) + 3

                key = str(Key(building=building_no, meter=meter_num))

                #Getting dataframe for each csv file separately
                for fi in fl_dir_list:
                    df = pd.read_csv(join(dataset_loc, folder, fl, fi),
                                     names=[1],
                                     dtype=np.float64)
                    df.index = pd.DatetimeIndex(start=fi[:-4],
                                                freq='s',
                                                periods=86400,
                                                tz='GMT')
                    df.rename(columns=plugs_column_name, inplace=True)
                    df = df.tz_convert(timezone)
                    df.columns.set_names(LEVEL_NAMES, inplace=True)

                    tmp_before = np.size(df.power.active)
                    df = df[df.power.active != -1]
                    tmp_after = np.size(df.power.active)
                    if (tmp_before != tmp_after):
                        print('Removed missing measurements - Size before: ' +
                              str(tmp_before) + ', size after: ' +
                              str(tmp_after))

                    # If table not present in hdf5, create or else append to existing data
                    if not key in store:
                        store.put(key, df, format='Table')
                        print('Building', building_no, ', Meter no.',
                              meter_num, '=> Done for ', fi[:-4])
                    else:
                        store.append(key, df, format='Table')
                        store.flush()
                        print('Building', building_no, ', Meter no.',
                              meter_num, '=> Done for ', fi[:-4])

    print("Data storage completed.")
    store.close()

    # Adding the metadata to the HDF5file
    print("Proceeding to Metadata conversion...")
    meta_path = join(_get_module_directory(), 'metadata')
    convert_yaml_to_hdf5(meta_path, hdf_filename)
    print("Completed Metadata conversion.")
Example #42
0
def convert_greend(greend_path, hdf_filename, use_mp=True):
    """
    Parameters
    ----------
    greend_path : str
        The root path of the greend dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    use_mp : bool 
        Defaults to True. Use multiprocessing to load the files for
        each building.
    """
    store = pd.HDFStore(hdf_filename, 'w', complevel=5, complib='zlib')
    houses = sorted(_get_houses(greend_path))

    print('Houses found:', houses)
    if use_mp:
        pool = Pool()

    h = 1  # nilmtk counts buildings from 1 not from 0 as we do, so everything is shifted by 1

    for house in houses:
        print('Loading', house)
        abs_house = join(greend_path, house)
        dates = [d for d in listdir(abs_house) if d.startswith('dataset')]
        target_filenames = [join(abs_house, date) for date in dates]
        if use_mp:
            house_data = pool.map(_get_blocks, target_filenames)

            # Ensure the blocks are sorted by date and make a plain list
            house_data_dfs = []
            for date, data in sorted(house_data, key=lambda x: x[0]):
                house_data_dfs.extend(data)
        else:
            house_data_dfs = []
            for fn in target_filenames:
                house_data_dfs.extend(_get_blocks(fn)[1])

        overall_df = pd.concat(house_data_dfs).sort_index()
        dups_in_index = overall_df.index.duplicated(keep='first')
        if dups_in_index.any():
            print("Found duplicated values in index, dropping them.")
            overall_df = overall_df[~dups_in_index]

        m = 1
        for column in overall_df.columns:
            print("meter {}: {}".format(m, column))
            key = Key(building=h, meter=m)
            print("Putting into store...")

            df = overall_df[column].to_frame()  #.dropna(axis=0)

            # if drop_duplicates:
            # print("Dropping duplicated values in data...")
            # df = df.drop_duplicates()

            df.columns = pd.MultiIndex.from_tuples([('power', 'active')])
            df.columns.set_names(LEVEL_NAMES, inplace=True)

            store.put(str(key), df, format='table')
            m += 1
            # print('Flushing store...')
            # store.flush()

        h += 1

    store.close()

    # retrieve the dataset metadata in the metadata subfolder
    metadata_dir = join(get_module_directory(), 'dataset_converters', 'greend',
                        'metadata')
    convert_yaml_to_hdf5(metadata_dir, hdf_filename)
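
# A minimal usage sketch (paths illustrative). Multiprocessing is on by
# default; use_mp=False loads each building's files sequentially instead.
if __name__ == '__main__':
    convert_greend('/data/GREEND', '/data/greend.h5', use_mp=False)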
Example #43
0
def convert_greend(greend_path, hdf_filename, use_mp=True):
    """
    Parameters
    ----------
    greend_path : str
        The root path of the greend dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    use_mp : bool 
        Defaults to True. Use multiprocessing to load the files for
        each building.
    """
    store = pd.HDFStore(hdf_filename, 'w', complevel=5, complib='zlib')
    houses = sorted(_get_houses(greend_path))
    
    print('Houses found:', houses)
    if use_mp:
        pool = Pool()
    
    h = 1 # nilmtk counts buildings from 1 not from 0 as we do, so everything is shifted by 1
    
    for house in houses:
        print('Loading', house)
        abs_house = join(greend_path, house)
        dates = [d for d in listdir(abs_house) if d.startswith('dataset')]
        target_filenames = [join(abs_house, date) for date in dates]
        if use_mp:
            house_data = pool.map(_get_blocks, target_filenames)

            # Ensure the blocks are sorted by date and make a plain list
            house_data_dfs = []
            for date, data in sorted(house_data, key=lambda x: x[0]):
                house_data_dfs.extend(data)
        else:
            house_data_dfs = []
            for fn in target_filenames:
                house_data_dfs.extend(_get_blocks(fn)[1])
            
        overall_df = pd.concat(house_data_dfs).sort_index()
        dups_in_index = overall_df.index.duplicated(keep='first')
        if dups_in_index.any():
            print("Found duplicated values in index, dropping them.")
            overall_df = overall_df[~dups_in_index]
        
        m = 1
        for column in overall_df.columns:
            print("meter {}: {}".format(m, column))
            key = Key(building=h, meter=m)
            print("Putting into store...")
            
            df = overall_df[column].to_frame() #.dropna(axis=0)
            
            # if drop_duplicates:
                # print("Dropping duplicated values in data...")
                # df = df.drop_duplicates()
            
            df.columns = pd.MultiIndex.from_tuples([('power', 'active')])
            df.columns.set_names(LEVEL_NAMES, inplace=True)
            
            store.put(str(key), df, format = 'table')
            m += 1
            # print('Flushing store...')
            # store.flush()
            
        h += 1

    store.close()
	
    # retrieve the dataset metadata in the metadata subfolder
    metadata_dir = join(get_module_directory(), 'dataset_converters', 'greend', 'metadata')
    convert_yaml_to_hdf5(metadata_dir, hdf_filename)
Example #44
0
    return path_to_this_file


feed_ignore = ['gen', 'grid']

WEATHER_HVAC_STORE = os.path.join(script_path, '..', '..', '..', 'data/hvac/weather_hvac_2013.h5')

temp_h5_path = os.path.expanduser("~/Downloads/wiki-temp.h5")

store_total = pd.HDFStore(temp_h5_path)

store_useful = pd.HDFStore(WEATHER_HVAC_STORE)
useful_keys = [k[:-2] for k in store_useful.keys() if "X" in k]

START, STOP = "2013-07-01", "2013-07-31"

store_name = os.path.expanduser("~/wikienergy-2013.h5")
with pd.HDFStore(store_name, "w") as store_to_write:
    for nilmtk_id, dataid_str in enumerate(useful_keys):

        dataid = int(dataid_str[1:])

        df = store_total[dataid_str][START:STOP]
        if df['air1'].sum() > 0:
            print("Writing ", nilmtk_id, dataid)
            _dataport_dataframe_to_hdf(df, store_to_write, nilmtk_id + 1, dataid)
        else:
            print("Skipping", nilmtk_id, dataid)
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         store_name)
Example #45
0
def convert_hes(data_dir, output_filename, format='HDF', max_chunks=None):
    metadata = {
        'name': 'HES',
        'geographic_coordinates': (51.464462, -0.076544),  # London
        'timezone': 'Europe/London'
    }

    # Open DataStore
    store = get_datastore(output_filename, format, mode='w')

    # load list of appliances
    hes_to_nilmtk_appliance_lookup = pd.read_csv(
        join(get_module_directory(), 'dataset_converters', 'hes',
             'hes_to_nilmtk_appliance_lookup.csv'))

    # load list of houses
    hes_house_ids = load_list_of_house_ids(data_dir)
    nilmtk_house_ids = np.arange(1, len(hes_house_ids) + 1)
    hes_to_nilmtk_house_ids = dict(zip(hes_house_ids, nilmtk_house_ids))

    # array of hes_house_codes: nilmtk_building_code = house_codes.index(hes_house_code)
    house_codes = []

    # map
    house_appliance_codes = dict()

    # Create a temporary metadata dir
    original_metadata_dir = join(get_module_directory(), 'dataset_converters',
                                 'hes', 'metadata')
    tmp_dir = tempfile.mkdtemp()
    metadata_dir = join(tmp_dir, 'metadata')
    shutil.copytree(original_metadata_dir, metadata_dir)
    print("Using temporary dir for metadata:", metadata_dir)

    # Iterate over files
    for filename in FILENAMES:
        # Load appliance energy data chunk-by-chunk
        full_filename = join(data_dir, filename)
        print('Loading', full_filename)
        try:
            reader = pd.read_csv(full_filename,
                                 names=COL_NAMES,
                                 index_col=False,
                                 chunksize=CHUNKSIZE)
        except IOError as e:
            print(e, file=stderr)
            continue

        # Iterate over chunks in file
        chunk_i = 0
        for chunk in reader:
            if max_chunks is not None and chunk_i >= max_chunks:
                break

            print(' processing chunk', chunk_i, 'of', filename)
            # Convert date and time columns to np.datetime64 objects
            dt = chunk['date'] + ' ' + chunk['time']
            del chunk['date']
            del chunk['time']
            chunk['datetime'] = pd.to_datetime(dt,
                                               format='%Y-%m-%d %H:%M:%S',
                                               utc=True)

            # Data is either tenths of a Wh or tenths of a degree
            chunk['data'] *= 10
            chunk['data'] = chunk['data'].astype(np.float32)

            # Iterate over houses in chunk
            for hes_house_id, hes_house_id_df in chunk.groupby('house id'):
                if hes_house_id not in house_codes:
                    house_codes.append(hes_house_id)

                if hes_house_id not in house_appliance_codes.keys():
                    house_appliance_codes[hes_house_id] = []

                nilmtk_house_id = house_codes.index(hes_house_id) + 1

                # Iterate over appliances in house
                for appliance_code, appliance_df in chunk.groupby(
                        'appliance code'):
                    if appliance_code not in house_appliance_codes[
                            hes_house_id]:
                        house_appliance_codes[hes_house_id].append(
                            appliance_code)
                    nilmtk_meter_id = house_appliance_codes[
                        hes_house_id].index(appliance_code) + 1
                    _process_meter_in_chunk(nilmtk_house_id, nilmtk_meter_id,
                                            hes_house_id_df, store,
                                            appliance_code)

            chunk_i += 1

    print('houses with some data loaded:', house_appliance_codes.keys())

    store.close()

    # generate building yaml metadata
    for hes_house_id in house_codes:
        nilmtk_building_id = house_codes.index(hes_house_id) + 1
        building_metadata = {}
        building_metadata['instance'] = nilmtk_building_id
        building_metadata['original_name'] = int(
            hes_house_id)  # use python int
        building_metadata['elec_meters'] = {}
        building_metadata['appliances'] = []

        # initialise dict of instances of each appliance type
        instance_counter = {}

        for appliance_code in house_appliance_codes[hes_house_id]:
            nilmtk_meter_id = house_appliance_codes[hes_house_id].index(
                appliance_code) + 1
            # meter metadata
            if appliance_code in MAINS_CODES:
                meter_metadata = {
                    'device_model': 'multivoies',
                    'site_meter': True
                }
                break
            elif appliance_code in CIRCUIT_CODES:
                meter_metadata = {'device_model': 'multivoies'}
                break
            elif appliance_code in TEMPERATURE_CODES:
                break
            else:  # is appliance
                meter_metadata = {'device_model': 'wattmeter'}

            # only appliance meters at this point
            building_metadata['elec_meters'][nilmtk_meter_id] = meter_metadata
            # appliance metadata
            lookup_row = hes_to_nilmtk_appliance_lookup[
                hes_to_nilmtk_appliance_lookup.Code == appliance_code].iloc[0]
            appliance_metadata = {
                'original_name': lookup_row.Name,
                'meters': [nilmtk_meter_id]
            }
            # appliance type
            appliance_metadata.update({'type': lookup_row.nilmtk_name})
            # TODO appliance room

            # appliance instance number
            if instance_counter.get(lookup_row.nilmtk_name) == None:
                instance_counter[lookup_row.nilmtk_name] = 0
            instance_counter[lookup_row.nilmtk_name] += 1
            appliance_metadata['instance'] = instance_counter[
                lookup_row.nilmtk_name]

            building_metadata['appliances'].append(appliance_metadata)

        building = 'building{:d}'.format(nilmtk_building_id)

        yaml_full_filename = join(metadata_dir, building + '.yaml')

        with open(yaml_full_filename, 'w') as outfile:
            #print(building_metadata)
            outfile.write(yaml.dump(building_metadata))

    # write yaml metadata to hdf5
    convert_yaml_to_hdf5(metadata_dir, output_filename)

    # remove the temporary dir when finished
    shutil.rmtree(tmp_dir)
Example #46
0
def convert_greend(greend_path, hdf_filename):
    """
    Parameters
    ----------
    greend_path : str
        The root path of the greend dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')
    houses = sorted(__get_houses(greend_path))
    print(houses)
    h = 1 # nilmtk counts buildings from 1 not from 0 as we do, so everything is shifted by 1
    for house in houses:
        print('loading '+house)
        abs_house = join(greend_path, house)
        dates = [d for d in listdir(abs_house) if d.startswith('dataset')]
        house_data = []
        for date in dates:
            print('-----------------------',date)
            try:
                tmp_pandas = pd.read_csv(join(abs_house, date), na_values=['na'], error_bad_lines=False)
            except: # A CParserError is returned for malformed files (irregular column number)
                pass 
                # for building0 either remove the first days (with less nodes) or use __preprocess_file
                #import StringIO as sio
                #tmp_pandas = pd.DataFrame.from_csv(sio.StringIO(__preprocess_file(abs_house, date)))
            
            # if the timestamp is not correctly parsed then it's an object dtype (string), else a float64
            if tmp_pandas.timestamp.dtype != np.float64:
                tmp_pandas = tmp_pandas[tmp_pandas.timestamp != 'timestamp'] # remove all error rows
            # use the cleaned column as the index
            tmp_pandas.index = tmp_pandas["timestamp"].convert_objects(convert_numeric=True).values
            tmp_pandas = tmp_pandas.drop('timestamp', 1) # remove timestamp from the columns (it's the index already)
            tmp_pandas = tmp_pandas.astype("float32") # convert everything back to float32
            # convert the index to datetime
            tmp_pandas.index = pd.to_datetime(tmp_pandas.index, unit='s')
            tmp_pandas = tmp_pandas.tz_localize("UTC").tz_convert("CET")
            tmp_pandas = tmp_pandas.drop_duplicates()
            #tmp_pandas = tmp_pandas.sort_index()
            house_data.append(tmp_pandas)
        overall_df = pd.concat(house_data)
        overall_df = overall_df.drop_duplicates()
        overall_df = overall_df.sort_index()

        m = 1

        for column in overall_df.columns:
            print("meter" + str(m)+': '+column)
            key = Key(building = h, meter=m)
            print("Putting into store...")
            store.put(str(key), overall_df[column], format = 'table')
            m += 1
            print('Flushing store...')
            store.flush()
        h += 1

    store.close()
	
    # retrieve the dataset metadata in the metadata subfolder
    import inspect
    convert_yaml_to_hdf5(dirname(inspect.getfile(convert_greend))+'/metadata/', hdf_filename)
Example #47
0
    for i in range(1, num_rows):
        time_indices.append(time_indices[i-1] + np.timedelta64('1', 's'))
    return time_indices


if not os.path.exists('../data/'):
    os.makedirs('../data/')

store = get_datastore("../data/converted_sum.hdf5", 'HDF', mode='w')

"""
Gets CLEAR and MEDAL data and puts them into the store
with the right key and instance numbers.
"""
frames = get_clear_data()
for phase in range(1, 4):
    key = Key(building=1, meter=phase)
    print('Adding phase {}'.format(phase))
    store.put(str(key), frames[phase-1])


for medal_id in range(1, 16):
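    # Meter numbering: CLEAR occupies meters 1-3 (one per phase); each of the
    # 15 MEDAL units then contributes 6 channels, so MEDAL `medal_id`, channel
    # `i` maps to meter (medal_id - 1) * 6 + i + 3, i.e. meters 4-93.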
    frames = get_summary_data(medal_id)
    for i in range(1, 7):
        key = Key(building=1, meter=(((medal_id-1) * 6) + i + 3))
        print('Adding ' + str(key) + ' to Store')
        store.put(str(key), frames[i-1])

store.close()
convert_yaml_to_hdf5("../metadata_converter/dist", "../data/converted_sum.hdf5")
Example #48
def download_wikienergy(database_username, database_password, hdf_filename, periods_to_load=None):
    """
    Downloads data from WikiEnergy database into an HDF5 file.

    Parameters
    ----------
    hdf_filename : str
        Output HDF filename.  If file exists already then will be deleted.
    database_username, database_password : str
    periods_to_load : dict of tuples, optional
       Key of dict is the building number (int).
       Values are (<start date>, <end date>)
       e.g. ("2013-04-01", None) or ("2013-04-01", "2013-08-01")
       defaults to all buildings and all date ranges
    """

    # wiki-energy database settings
    database_host = "db.wiki-energy.org"
    database_name = "postgres"
    database_schema = "PecanStreet_SharedData"

    # try to connect to database
    try:
        conn = db.connect(
            "host="
            + database_host
            + " dbname="
            + database_name
            + " user="
            + database_username
            + " password="
            + database_password
        )
    except Exception:
        print("Could not connect to remote database")
        raise

    # set up a new HDF5 datastore (overwrites existing store)
    store = pd.HDFStore(hdf_filename, "w", complevel=9, complib="zlib")

    # remove existing building yaml files in module dir
    for f in os.listdir(join(_get_module_directory(), "metadata")):
        if re.search("^building", f):
            os.remove(join(_get_module_directory(), "metadata", f))

    # get tables in database schema
    sql_query = (
        "SELECT TABLE_NAME"
        + " FROM INFORMATION_SCHEMA.TABLES"
        + " WHERE TABLE_TYPE = 'BASE TABLE'"
        + " AND TABLE_SCHEMA='"
        + database_schema
        + "'"
        + " ORDER BY TABLE_NAME"
    )
    database_tables = pd.read_sql(sql_query, conn)["table_name"].tolist()

    # if user has specified buildings
    if periods_to_load:
        buildings_to_load = periods_to_load.keys()
    else:
        # get buildings present in all tables
        sql_query = ""
        for table in database_tables:
            sql_query = sql_query + "(SELECT DISTINCT dataid" + ' FROM "' + database_schema + '".' + table + ") UNION "
        sql_query = sql_query[:-7]
        sql_query = sql_query + " ORDER BY dataid"
        buildings_to_load = pd.read_sql(sql_query, conn)["dataid"].tolist()

    # for each user specified building or all buildings in database
    for building_id in buildings_to_load:
        print("Loading building {:d} @ {}".format(building_id, datetime.datetime.now()))
        sys.stdout.flush()

        # create new list of chunks for concatenating later
        dataframe_list = []

        # for each table of 1 month data
        for database_table in database_tables:
            print("  Loading table {:s}".format(database_table))
            sys.stdout.flush()

            # get buildings present in this table
            sql_query = (
                "SELECT DISTINCT dataid" + ' FROM "' + database_schema + '".' + database_table + " ORDER BY dataid"
            )
            buildings_in_table = pd.read_sql(sql_query, conn)["dataid"].tolist()

            if building_id in buildings_in_table:
                # get first and last timestamps for this house in this table
                sql_query = (
                    "SELECT MIN(localminute) AS minlocalminute,"
                    + " MAX(localminute) AS maxlocalminute"
                    + ' FROM "'
                    + database_schema
                    + '".'
                    + database_table
                    + " WHERE dataid="
                    + str(building_id)
                )
                timestamp_range = pd.read_sql(sql_query, conn)
                first_timestamp_in_table = timestamp_range["minlocalminute"][0]
                last_timestamp_in_table = timestamp_range["maxlocalminute"][0]

                # get requested start and end and localize them
                requested_start = None
                requested_end = None
                database_timezone = "US/Central"
                if periods_to_load:
                    if periods_to_load[building_id][0]:
                        requested_start = pd.Timestamp(periods_to_load[building_id][0])
                        requested_start = requested_start.tz_localize(database_timezone)
                    if periods_to_load[building_id][1]:
                        requested_end = pd.Timestamp(periods_to_load[building_id][1])
                        requested_end = requested_end.tz_localize(database_timezone)

                # check user start is not after end
                if requested_start and requested_end and requested_start > requested_end:
                    print("requested end is before requested start")
                    sys.stdout.flush()
                else:
                    # clip data to smallest range
                    if requested_start:
                        start = max(requested_start, first_timestamp_in_table)
                    else:
                        start = first_timestamp_in_table
                    if requested_end:
                        end = min(requested_end, last_timestamp_in_table)
                    else:
                        end = last_timestamp_in_table

                    # download data in chunks
                    chunk_start = start
                    chunk_size = datetime.timedelta(1)  # 1 day
                    while chunk_start < end:
                        chunk_end = chunk_start + chunk_size
                        if chunk_end > end:
                            chunk_end = end
                        # subtract 1 second so end is exclusive
                        chunk_end = chunk_end - datetime.timedelta(0, 1)

                        # query power data for all channels
                        format = "%Y-%m-%d %H:%M:%S"
                        sql_query = (
                            "SELECT *"
                            + ' FROM "'
                            + database_schema
                            + '".'
                            + database_table
                            + " WHERE dataid="
                            + str(building_id)
                            + " and localminute between "
                            + "'"
                            + chunk_start.strftime(format)
                            + "'"
                            + " and "
                            + "'"
                            + chunk_end.strftime(format)
                            + "'"
                            + " LIMIT 2000"
                        )
                        chunk_dataframe = pd.read_sql(sql_query, conn)

                        # nilmtk requires building indices to start at 1
                        nilmtk_building_id = buildings_to_load.index(building_id) + 1
                        # convert to nilmtk-df and save to disk
                        nilmtk_dataframe = _wikienergy_dataframe_to_hdf(
                            chunk_dataframe, store, nilmtk_building_id, building_id
                        )

                        # print progress
                        print(
                            "    "
                            + str(chunk_start)
                            + " -> "
                            + str(chunk_end)
                            + ": "
                            + str(len(chunk_dataframe.index))
                            + " rows"
                        )
                        sys.stdout.flush()

                        # append all chunks into list for csv writing
                        # dataframe_list.append(chunk_dataframe)

                        # move on to next chunk
                        chunk_start = chunk_start + chunk_size

        # saves all chunks in list to csv
        # if len(dataframe_list) > 0:
        # dataframe_concat = pd.concat(dataframe_list)
        # dataframe_concat.to_csv(output_directory + str(building_id) + '.csv')

    store.close()
    conn.close()

    # write yaml to hdf5
    # dataset.yaml and meter_devices.yaml are static, building<x>.yaml are dynamic
    convert_yaml_to_hdf5(join(_get_module_directory(), "metadata"), hdf_filename)
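
# A minimal usage sketch for download_wikienergy; the credentials, output path
# and periods_to_load values below are placeholders, not real account data.
example_periods = {
    22: ("2014-01-01", "2014-02-01"),   # building 22, one month
    46: ("2014-01-01", None),           # building 46, from 2014-01-01 onwards
}
download_wikienergy("my_username", "my_password", "wikienergy.h5",
                    periods_to_load=example_periods)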
Example #49
def convert_greend(greend_path, hdf_filename):
    """
    Parameters
    ----------
    greend_path : str
        The root path of the greend dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')
    houses = sorted(__get_houses(greend_path))
    print(houses)
    h = 1  # nilmtk counts buildings from 1 not from 0 as we do, so everything is shifted by 1
    for house in houses:
        print('loading ' + house)
        abs_house = join(greend_path, house)
        dates = [d for d in listdir(abs_house) if d.startswith('dataset')]
        house_data = []
        for date in dates:
            print('-----------------------', date)
            try:
                tmp_pandas = pd.read_csv(join(abs_house, date),
                                         na_values=['na'],
                                         error_bad_lines=False)
            except Exception:  # a ParserError/ValueError is raised for malformed files (irregular column number)
                # for building0 either remove the first days (with fewer nodes) or use __preprocess_file
                #import StringIO as sio
                #tmp_pandas = pd.DataFrame.from_csv(sio.StringIO(__preprocess_file(abs_house, date)))
                continue  # skip this file so a stale/undefined dataframe is not reused below

            # if the timestamp is not correctly parsed then it's an object dtype (string), else a float64
            if tmp_pandas.timestamp.dtype != np.float64:
                tmp_pandas = tmp_pandas[tmp_pandas.timestamp !=
                                        'timestamp']  # remove all error rows
                # use the cleaned column as the index
            tmp_pandas.index = tmp_pandas["timestamp"].apply(
                pd.to_numeric, errors='ignore').values
            tmp_pandas = tmp_pandas.drop(
                'timestamp', 1
            )  # remove timestamp from the columns (it's the index already)
            tmp_pandas = tmp_pandas.astype(
                "float32")  # convert everything back to float32
            # convert the index to datetime
            tmp_pandas.index = pd.to_datetime(tmp_pandas.index, unit='s')
            tmp_pandas = tmp_pandas.tz_localize("UTC").tz_convert("CET")
            tmp_pandas = tmp_pandas.drop_duplicates()
            #tmp_pandas = tmp_pandas.sort_index()
            house_data.append(tmp_pandas)
        overall_df = pd.concat(house_data)
        overall_df = overall_df.drop_duplicates()
        overall_df = overall_df.sort_index()

        m = 1

        for column in overall_df.columns:
            print("meter" + str(m) + ': ' + column)
            key = Key(building=h, meter=m)
            print("Putting into store...")
            store.put(str(key), overall_df[column], format='table')
            m += 1
            print('Flushing store...')
            store.flush()
        h += 1

    store.close()

    # retrieve the dataset metadata in the metadata subfolder
    import inspect
    convert_yaml_to_hdf5(
        dirname(inspect.getfile(convert_greend)) + '/metadata/', hdf_filename)
Example #50
def convert_eco(dataset_loc, hdf_filename, timezone):
    """
    Parameters:
    -----------
    dataset_loc: str
        The root directory where the dataset is located.
    hdf_filename: str
        The destination HDF5 filename. The path must include the
        file name itself (not just a directory) for the converter
        to work.
    timezone: str
        specifies the timezone of the dataset.
    """

    # Creating a new HDF File
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='blosc')

    check_directory_exists(dataset_loc)
    directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i]
    directory_list.sort()
    print(directory_list)

    found_any_sm = False
    found_any_plug = False

    # Traversing every folder
    for folder in directory_list:
        if folder[0] == '.' or folder[-3:] == '.h5':
            print('Skipping ', folder)
            continue

        #Building number and meter_flag
        building_no = int(folder[:2])
        meter_flag = None
        if 'sm_csv' in folder:
            meter_flag = 'sm'
        elif 'plugs' in folder:
            meter_flag = 'plugs'
        else:
            print('Skipping folder', folder)
            continue

        print('Computing for folder', folder)

        dir_list = [
            i for i in listdir(join(dataset_loc, folder))
            if isdir(join(dataset_loc, folder, i))
        ]
        dir_list.sort()

        if meter_flag == 'plugs' and len(dir_list) < 3:
            # Try harder to find the subfolders
            folder = join(folder, folder[:2])
            dir_list = [
                i for i in listdir(join(dataset_loc, folder))
                if isdir(join(dataset_loc, folder, i))
            ]

        print('Current dir list:', dir_list)

        for fl in dir_list:
            print('Computing for folder ', fl)

            fl_dir_list = [
                i for i in listdir(join(dataset_loc, folder, fl))
                if '.csv' in i
            ]
            fl_dir_list.sort()

            if meter_flag == 'sm':
                for fi in fl_dir_list:
                    found_any_sm = True
                    df = pd.read_csv(join(dataset_loc, folder, fl, fi),
                                     names=[i for i in range(1, 17)],
                                     dtype=np.float32)
                    # SmartMeter
                    for phase in range(1, 4):
                        key = str(Key(building=building_no, meter=phase))
                        df_phase = df.loc[:, [
                            1 + phase, 5 + phase, 8 + phase, 13 + phase
                        ]]

                        # get reactive power
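                        # Q = P * tan(phi): the phase angle column (13 + phase)
                        # is in degrees, so it is converted to radians for np.tan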
                        power = df_phase.loc[:, (1 + phase, 13 + phase)].values
                        reactive = power[:, 0] * np.tan(
                            power[:, 1] * np.pi / 180)
                        df_phase['Q'] = reactive

                        df_phase.index = pd.date_range(start=fi[:-4],
                                                       freq='s',
                                                       periods=86400,
                                                       tz='GMT')
                        df_phase = df_phase.tz_convert(timezone)

                        sm_column_name = {
                            1 + phase: ('power', 'active'),
                            5 + phase: ('current', ''),
                            8 + phase: ('voltage', ''),
                            13 + phase: ('phase_angle', ''),
                            'Q': ('power', 'reactive'),
                        }
                        df_phase.columns = pd.MultiIndex.from_tuples(
                            sm_column_name[col] for col in df_phase.columns)

                        power_active = df_phase['power', 'active']
                        tmp_before = np.size(power_active)
                        df_phase = df_phase[power_active != -1]
                        power_active = df_phase['power', 'active']
                        tmp_after = np.size(power_active)

                        if tmp_before != tmp_after:
                            print(
                                'Removed missing measurements - Size before: '
                                + str(tmp_before) + ', size after: ' +
                                str(tmp_after))

                        df_phase.columns.set_names(LEVEL_NAMES, inplace=True)
                        if key not in store:
                            store.put(key, df_phase, format='Table')
                        else:
                            store.append(key, df_phase, format='Table')
                            store.flush()
                        print('Building', building_no, ', Meter no.', phase,
                              '=> Done for ', fi[:-4])
            # Plugs are also mapped to meters, but are then directly annotated with their appliances
            else:
                #Meter number to be used in key
                meter_num = int(fl) + 3

                key = str(Key(building=building_no, meter=meter_num))

                current_folder = join(dataset_loc, folder, fl)
                if not fl_dir_list:
                    raise RuntimeError("No CSV file found in " +
                                       current_folder)

                # Getting a dataframe for each CSV file separately
                for fi in fl_dir_list:
                    found_any_plug = True
                    df = pd.read_csv(join(current_folder, fi),
                                     names=[1],
                                     dtype=np.float64)
                    df.index = pd.date_range(start=fi[:-4].replace('.', ':'),
                                             freq='s',
                                             periods=86400,
                                             tz='GMT')
                    df.columns = pd.MultiIndex.from_tuples(
                        plugs_column_name.values())
                    df = df.tz_convert(timezone)
                    df.columns.set_names(LEVEL_NAMES, inplace=True)

                    # Check whether measurements removed
                    tmp_before = np.size(df.power.active)
                    df = df[df.power.active != -1]
                    tmp_after = np.size(df.power.active)
                    if tmp_before != tmp_after:
                        print('Removed missing measurements - Size before: ' +
                              str(tmp_before) + ', size after: ' +
                              str(tmp_after))

                    # If table not present in hdf5, create or else append to existing data
                    if key not in store:
                        store.put(key, df, format='Table')
                        print('Building', building_no, ', Meter no.',
                              meter_num, '=> Done for ', fi[:-4])
                    else:
                        store.append(key, df, format='Table')
                        store.flush()
                        print('Building', building_no, ', Meter no.',
                              meter_num, '=> Done for ', fi[:-4])

    if not found_any_plug or not found_any_sm:
        raise RuntimeError(
            'The files were not found! Please check the folder structure. Extract each ZIP file into a folder with its base name (e.g. extract "01_plugs_csv.zip" into a folder named "01_plugs_csv", etc.)'
        )

    print("Data storage completed.")
    store.close()

    # Adding the metadata to the HDF5file
    print("Proceeding to Metadata conversion...")
    meta_path = join(get_module_directory(), 'dataset_converters', 'eco',
                     'metadata')
    convert_yaml_to_hdf5(meta_path, hdf_filename)
    print("Completed Metadata conversion.")
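
# A minimal usage sketch for the ECO converter above; the dataset root, output
# path and timezone value are placeholder assumptions (ECO was recorded in
# Switzerland, so 'CET' is a plausible choice).
convert_eco('/data/ECO', '/data/eco.h5', 'CET')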