Example #1
def _main():

    index_store = pd.HDFStore(cn.FOLDER_TO_STORE_FILES+'/'+cn.INDEX_FILENAME)
    file_index = index_store[cn.FILE_INDEX]
    site_index = index_store[cn.SITE_INDEX]

    # TODO: Improve to handle multiple sites
    # merge site information into a single site
    index_length = len(site_index.index)-1
    for row in range(index_length, 0, -1):
        # TODO: Test if the distance between the averages of the two sets is within the variance before adding. If not, more sites should be created
        # TODO: Test if the equipment ID matches and consolidate information only for the same equipment or, better, process equipment information separately
        site_index.loc[row-1, H5.LATITUDE_MEMBER].add_set(site_index.loc[row, H5.LATITUDE_MEMBER])
        site_index.loc[row-1, H5.LONGITUDE_MEMBER].add_set(site_index.loc[row, H5.LONGITUDE_MEMBER])
        # site_index.loc[row-1, H5.LATITUDE_MEMBER].print("site row {}: ".format(row-1))
        # site_index.loc[row-1, H5.LONGITUDE_MEMBER].print("site row {}: ".format(row-1))
        site_index.drop(row, inplace=True)

    # create a data table that will be used for plotting
    site_data = pd.DataFrame(columns=[H5.CRFS_HOSTNAME,
                                      H5.LATITUDE_MEMBER,
                                      H5.LONGITUDE_MEMBER,
                                      H5.START_TIME_COARSE_ATTRIBUTE,
                                      H5.STOP_TIME_COARSE_ATTRIBUTE])

    # store the site data on the table
    # TODO: Add variance to allow ellipse plotting of the site
    site_data.loc[0, H5.CRFS_HOSTNAME] = site_index.loc[0, H5.CRFS_HOSTNAME]
    site_data.loc[0, H5.LATITUDE_MEMBER] = site_index.loc[0, H5.LATITUDE_MEMBER].mean_value
    site_data.loc[0, H5.LONGITUDE_MEMBER] = site_index.loc[0, H5.LONGITUDE_MEMBER].mean_value
    site_data.loc[0, H5.START_TIME_COARSE_ATTRIBUTE] = file_index.loc[0, H5.START_TIME_COARSE_ATTRIBUTE]
    site_data.loc[0, H5.STOP_TIME_COARSE_ATTRIBUTE] = file_index.loc[len(file_index.index)-1, H5.STOP_TIME_COARSE_ATTRIBUTE]

    # store the table on the index file
    index_store[cn.SITE_DATA_TABLE] = site_data

    output_file_name = cn.FOLDER_TO_STORE_FILES+'/'+cn.DATA_FILENAME
    file_data_store = pd.HDFStore(output_file_name)
    file_data_store[cn.SITE_DATA_TABLE] = site_data
    file_data_store.close()
    
    index_store.close()

    cl.log_message("Finish site data processing")
def _main():

    cl.log_message("Starting plotting level profile with average trace for all channels")

    # open the consolidated data file with the h5py interface (read-only)
    input_file_name = cn.FOLDER_TO_STORE_FILES+'/'+cn.DATA_FILENAME
    input_file = h5py.File(input_file_name, 'r')

    input_group_name = H5.CHANNEL_DATA_GROUP+"/"+cn.CHANNEL_MEAN_LEVEL_CATALOG
    input_group = input_file[input_group_name]

    # create dictionary to store channel traces
    channel_traces = {}
    channel_frequency = {}
    frequency_at_peak = {}
    standard_axis = [-20, 20, 5, 45] # [xmin, xmax, ymin, ymax]

    # loop through all channels, load data into the traces dictionary and extract reference information into the channel info dataframe
    for channel_id in input_group:

        # TODO: Channels should have attributes and those should be verified to confirm proper function
        one_channel = input_group[channel_id][1]

        channel_width = len(one_channel)

        # ignore channels that are too small
        if channel_width > cn.MINIMUM_CHANNEL_WIDTH:

            # find level range
            maximum_level = np.max(one_channel)
            minimum_level = np.min(one_channel)

            # locate the index of the maximum (np.argmax returns the first occurrence when several bins tie)
            index_of_maximum = int(np.argmax(one_channel))

            # store the frequency value for the maximum
            frequency_at_peak[channel_id] = input_group[channel_id][0][index_of_maximum]

            # store the level trace and the frequency axis shifted so the peak sits at 0 kHz
            channel_traces[channel_id] = np.array(one_channel, dtype='float64')
            channel_frequency[channel_id] = (np.array(input_group[channel_id][0], dtype='float64')-frequency_at_peak[channel_id])/1000

            """
            # find frequency range
            maximum_freq = np.max(channel_frequency[channel_id])
            minimum_freq = np.min(channel_frequency[channel_id])

            # set the global limiting axis
            if standard_axis[0] > minimum_freq:
                standard_axis[0] = minimum_freq
            if standard_axis[1] < maximum_freq:
                standard_axis[1] = maximum_freq
            if standard_axis[2] > minimum_level:
                standard_axis[2] = minimum_level
            if standard_axis[3] < maximum_level:
                standard_axis[3] = maximum_level
            """

    # close file since all data has been loaded into memory
    input_file.close()

    # create a list of keys used for the channels trace designation on the corresponding dictionary
    channel_list = list(channel_traces.keys())

    # count the number of channels to be processed
    number_of_channels = len(channel_list)

    # open data file to retrieve channel profile
    profile_file_name = cn.FOLDER_TO_STORE_FILES+'/'+cn.DATA_FILENAME
    profile_store = pd.HDFStore(profile_file_name)

    # loop through the channels and plot the level profile for each one
    for ref_channel_index in range(0, number_of_channels):

        # get key and trace from index
        channel_id = channel_list[ref_channel_index]

        cl.log_message("Starting channel {}".format(channel_id))

        # retrieve level profile for the channel 
        profile_group_name = H5.CHANNEL_DATA_GROUP+"/"+H5.LEVEL_PROFILE_DATA_GROUP+channel_id 
        channel_profile = profile_store[profile_group_name]

        figure_name = "Level profile channel {}".format(channel_id)
        plt.figure(figure_name)
        xy_array = channel_profile.to_numpy(dtype='float32')
        frequency_axis = channel_profile.columns.to_numpy(dtype='float64')
        x_axis = (frequency_axis-frequency_at_peak[channel_id])/1000
        y_axis = channel_profile.index.to_numpy(dtype='float64')

        plt.pcolormesh(x_axis, y_axis, xy_array, cmap='CMRmap_r')
        plt.xlabel("Frequency[kHz]")
        plt.ylabel("Level [dB\u03BCV/m]")
        plt.axis(standard_axis)

        plt.plot(channel_frequency[channel_id], channel_traces[channel_id], color='y', lw=2, path_effects=[pe.Stroke(linewidth=4, foreground='w'), pe.Normal()]) #, scaley=y_axis, 

        # plt.show()

        figure_file_name = "./Images/"+figure_name+".png"
        plt.savefig(figure_file_name)
        plt.close(figure_name)

    profile_store.close()

    cl.log_message("Finish processing")
Example #3
def _main():

    file_index_store = pd.HDFStore(cn.FOLDER_TO_STORE_FILES + '/' +
                                   cn.INDEX_FILENAME)

    channel_data = file_index_store[cn.CHANNEL_DATA_TABLE]
    index_length = len(channel_data.index) - 1

    channel_index = file_index_store[cn.CHANNEL_INDEX]
    channel_index.set_index(cn.CHANNEL_ID, inplace=True)

    #channel_data_mean_level = pd.DataFrame()
    #channel_data_frequency = pd.DataFrame()

    # Loop through channels grouped collecting the required information
    for row in range(index_length):

        # create empty dataframe to store the resulting profile
        spectrogram_result = pd.DataFrame()

        # Get the channel ID
        channel_id = channel_data.loc[row, cn.CHANNEL_ID]

        # get the cut frequencies at the inner edges, so that all traces can be trimmed to the same (minimum) span
        initial_cut_frequency = channel_data.loc[
            row, cn.CHANNEL_INNER_EDGE_INITIAL_FREQUENCY]
        final_cut_frequency = channel_data.loc[
            row, cn.CHANNEL_INNER_EDGE_FINAL_FREQUENCY]

        # Select files that contain the designated channel
        files_with_channel = channel_index[channel_index.index == channel_id]

        # Get the number of files to be processed
        channel_index_length = len(files_with_channel.index)

        # loop through files that are marked with the indicated channel
        for channel_row in range(channel_index_length):

            # get the file and group for the channel data
            input_file_name = files_with_channel.iloc[channel_row, 0]

            input_group_name = files_with_channel.iloc[channel_row, 1]
            input_group_name = input_group_name.replace(
                H5.ACTIVITY_PROFILE_DATA_GROUP, H5.EM_SPECTRUM_DATA_GROUP)

            # open the file
            input_file_object = h5py.File(input_file_name, 'r')
            # TODO: Include test if the file follows the standard

            # Get a handle on the group
            input_group = input_file_object[H5.CHANNEL_DATA_GROUP + "/" +
                                            input_group_name]

            # recover the dataset reference handle
            spectrogram_dataset = input_group[H5.SPECTROGRAM_DATASET]

            # Get the spectrogram into a new dataset
            spectrogram_new = pd.DataFrame(spectrogram_dataset[:])

            # set the dataset columns as frequency
            frequency_axis = np.array(input_group[H5.FREQUENCY_DATASET][:] /
                                      cn.FREQUENCY_RESOLUTION)
            frequency_axis = cn.FREQUENCY_RESOLUTION * frequency_axis.round(0)
            spectrogram_new.columns = frequency_axis

            # set the dataset index as timestamp
            timestamp_coarse = input_group[H5.TIMESTAMP_COARSE_DATASET][:]
            timestamp_fine = input_group[
                H5.TIMESTAMP_FINE_DATASET][:] / cn.NANOSECONDS_IN_SECOND
            spectrogram_new.index = timestamp_coarse + timestamp_fine

            # build a reduced frequency axis containing only the needed frequencies
            axis_has_been_cut = False
            if frequency_axis[0] < initial_cut_frequency:
                frequency_axis = frequency_axis[
                    frequency_axis > initial_cut_frequency]
                axis_has_been_cut = True
            if frequency_axis[-1] > final_cut_frequency:
                frequency_axis = frequency_axis[
                    frequency_axis < final_cut_frequency]
                axis_has_been_cut = True

            # if the frequency axis has been reduced
            if axis_has_been_cut:
                # cut the spectrogram dataset using the reduced frequency axis
                spectrogram_new = spectrogram_new.filter(items=frequency_axis)

            # Merge the new spectrogram into the result (pd.concat replaces the removed DataFrame.append)
            # Missing frequency bins are left as NaN
            spectrogram_result = pd.concat([spectrogram_result, spectrogram_new])

            cl.log_message("Processed {}/{}. Spectrogram shape: {}".format(
                channel_row + 1, channel_index_length,
                str(spectrogram_result.shape)))

        # Compute the mean level over all traces on the channel, for each frequency bin
        bin_mean_level = spectrogram_result.mean(axis=0)
        mean_level_data = bin_mean_level.index.to_numpy(dtype='float64')
        mean_level_data = np.append([mean_level_data],
                                    [bin_mean_level.to_numpy(dtype='float64')],
                                    axis=0)

        # fill NaN values resulting from incorrect channel splicing with the average channel level over each bin
        spectrogram_result.fillna(value=bin_mean_level, inplace=True)

        #bin_mean_level = pd.DataFrame(bin_mean_level, columns=[channel_id])
        #bin_frequency = pd.DataFrame(bin_mean_level.index, columns=[channel_id])
        #bin_mean_level.reset_index(inplace=True, drop=True)

        # Store mean level on a channel catalog dataframe using the channel ID as index
        #channel_data_mean_level = channel_data_mean_level.join(bin_mean_level, how='outer')
        #channel_data_frequency = channel_data_frequency.join(bin_frequency, how='outer')

        cl.log_message("PROCESSED CHANNEL: {}".format(channel_id))

        # store the dataframe with the merged spectrogram
        output_file_name = cn.FOLDER_TO_STORE_FILES + '/' + cn.DATA_FILENAME
        file_data_store = pd.HDFStore(output_file_name)
        output_group_name = H5.CHANNEL_DATA_GROUP + "/" + H5.EM_SPECTRUM_DATA_GROUP + channel_id
        file_data_store[output_group_name] = spectrogram_result
        file_data_store.close()

        # reopen the file with the h5py interface in append mode to add the raw dataset
        file_data_store = h5py.File(output_file_name, 'a')

        # Test if the datagroup exist and create it if not
        output_group_store = file_data_store[H5.CHANNEL_DATA_GROUP]
        if cn.CHANNEL_MEAN_LEVEL_CATALOG in output_group_store:
            output_group_store = output_group_store[
                cn.CHANNEL_MEAN_LEVEL_CATALOG]

            # test if channel_id already exists and delete it if so
            if channel_id in output_group_store:
                del output_group_store[channel_id]

            # store the channel data
            output_group_store.create_dataset(channel_id, data=mean_level_data)
        else:
            output_group_store.create_group(cn.CHANNEL_MEAN_LEVEL_CATALOG)
            output_group_name = H5.CHANNEL_DATA_GROUP + "/" + cn.CHANNEL_MEAN_LEVEL_CATALOG
            file_data_store[output_group_name].create_dataset(
                channel_id, data=mean_level_data)

        file_data_store.close()

        #cl.table_dataframe(spectrogram_result)

    #cl.table_dataframe(channel_data_mean_level)
    #cl.plot_dataframe(channel_data_mean_level.reset_index())

    # output message
    cl.log_message("Finish indexing {} channels".format(index_length))
Example #4
def _main():

    index_store = pd.HDFStore(cn.FOLDER_TO_STORE_FILES + '/' +
                              cn.INDEX_FILENAME)
    file_index = index_store[cn.FILE_INDEX]
    index_length = len(file_index.index) - 1

    # create empty dataframe to store the resulting profile
    profile_array_result = pd.DataFrame()
    number_of_traces = 0

    # Loop through files collecting the required information
    for row in range(index_length):
        # Open each file
        file_name = file_index[cn.FILENAME_ATTRIBUTE][row]

        file_object = h5py.File(file_name, 'r')
        # TODO: Include test if the file follows the standard

        # Test if there is a noise group. The noise group contains all traces and thus reference to the time and frequency scope of the file content
        if H5.NOISE_DATA_GROUP in file_object:
            # Get a handle on the noise group
            noise_group = file_object[H5.NOISE_DATA_GROUP]

            # get all sub groups group names within the noise group
            for sub_group in noise_group:

                # if the sub group corresponds to the level profile group (the noise group also includes the level profile group)
                if H5.LEVEL_PROFILE_CLASS in str(
                        noise_group[sub_group].attrs[H5.CLASS_ATTRIBUTE][0]):

                    # recover the dataset reference handle
                    profile_dataset = noise_group[sub_group + '/' +
                                                  H5.LEVEL_PROFILE_DATASET]
                    # Todo: test if level units are compatible

                    # update the counter for the number of traces on the profile
                    number_of_traces += profile_dataset.attrs[
                        H5.NUMBER_OF_PROFILE_TRACES_ATTRIBUTE][0]

                    # Get the level profile
                    profile_array_new = pd.DataFrame(profile_dataset[:])
                    profile_array_new.columns = noise_group[
                        sub_group + '/' + H5.FREQUENCY_DATASET][:]
                    profile_array_new.index = noise_group[sub_group + '/' +
                                                          H5.LEVEL_DATASET][:]

                    # Merge the new profile into the result
                    profile_array_result = profile_array_result.add(
                        profile_array_new, fill_value=0)

                    cl.log_message(
                        "File {} processed. Profile shape: {}".format(
                            file_name, str(profile_array_result.shape)))

    # close the index store used for reading the file list
    index_store.close()

    # store the combined level profile in the data file
    data_store = pd.HDFStore(cn.FOLDER_TO_STORE_FILES + '/' +
                             cn.DATA_FILENAME)
    data_store[H5.NOISE_DATA_GROUP + "/" +
               H5.LEVEL_PROFILE_DATASET] = profile_array_result
    data_store.close()

    # output message
    cl.log_message("Finish indexing {} files".format(index_length))
Example #5
def _main():

    # List files in the folder
    files = glob.glob(cn.FOLDER_TO_GET_FILES + cn.FILE_TYPE, recursive=False)
    index_length = len(files)

    file_index = pd.DataFrame(columns=[
        cn.FILENAME_ATTRIBUTE, H5.INITIAL_FREQUENCY_ATTRIBUTE,
        H5.FINAL_FREQUENCY_ATTRIBUTE, H5.START_TIME_COARSE_ATTRIBUTE,
        H5.STOP_TIME_COARSE_ATTRIBUTE
    ])

    # Create a dataframe with file names and reference information
    file_index[cn.FILENAME_ATTRIBUTE] = files
    file_index[H5.INITIAL_FREQUENCY_ATTRIBUTE] = [0.0] * index_length
    file_index[H5.FINAL_FREQUENCY_ATTRIBUTE] = [0.0] * index_length
    file_index[H5.START_TIME_COARSE_ATTRIBUTE] = [0.0] * index_length
    file_index[H5.STOP_TIME_COARSE_ATTRIBUTE] = [0.0] * index_length

    site_index = pd.DataFrame(
        columns=[H5.LATITUDE_MEMBER, H5.LONGITUDE_MEMBER, H5.CRFS_HOSTNAME])

    channel_index = pd.DataFrame(columns=[
        cn.CHANNEL_ID, cn.FILENAME_ATTRIBUTE, cn.GROUPNAME_ATTRIBUTE,
        H5.START_TIME_COARSE_ATTRIBUTE, H5.STOP_TIME_COARSE_ATTRIBUTE,
        H5.AVERAGE_CHANNEL_SAMPLE_RATE, H5.CHANNEL_EDGE_INITIAL_FREQUENCY,
        cn.CHANNEL_INNER_EDGE_INITIAL_FREQUENCY,
        H5.CHANNEL_CORE_INITIAL_FREQUENCY, H5.CHANNEL_CORE_FINAL_FREQUENCY,
        cn.CHANNEL_INNER_EDGE_FINAL_FREQUENCY, H5.CHANNEL_EDGE_FINAL_FREQUENCY
    ])

    channel_row = 0

    # Loop through files collecting the required information
    for row in range(index_length):
        # Open each file
        file_name = file_index[cn.FILENAME_ATTRIBUTE][row]
        cl.log_message("Processing file {}".format(file_name))

        file_object = h5py.File(file_name, 'r')
        # TODO: Include test if the file follows the standard

        # Get the site coordinates. Works only for a single-site set of files.
        if H5.SITE_GEOLOCATION_DATASET in file_object:
            site_dataset = file_object[H5.SITE_GEOLOCATION_DATASET]
            site_index.loc[row] = [cl.Normal(), cl.Normal(), "unknown"]
            site_index.loc[row, H5.LATITUDE_MEMBER].np_set(
                site_dataset.attrs[H5.LATITUDE_STATISTICS_ATTRIBUTE][0])
            site_index.loc[row, H5.LONGITUDE_MEMBER].np_set(
                site_dataset.attrs[H5.LONGITUDE_STATISTICS_ATTRIBUTE][0])

        # Get the unit information from the logbook. Only the first equipment ID is retrieved
        if H5.LOGBOOK_DATASET in file_object:
            logbook_dataset = file_object[H5.LOGBOOK_DATASET]
            for log_entry in logbook_dataset:
                if log_entry[H5.ENTRY_TYPE_MEMBER].decode(
                        "ascii") == H5.CRFS_HOSTNAME:
                    site_index.loc[row, H5.CRFS_HOSTNAME] = log_entry[
                        H5.ENTRY_VALUE_MEMBER].decode("ascii")
                    break

        # Test if there is a noise group. The noise group contains all traces and thus reference to the time and frequency scope of the file content
        if H5.NOISE_DATA_GROUP in file_object:
            # Get a handle on the noise group
            noise_group = file_object[H5.NOISE_DATA_GROUP]

            # get all sub groups group names within the noise group
            for sub_group in noise_group:

                # if the sub group corresponds to the EM spectrum group (the noise group also includes the level profile group)
                if H5.SPECTROGRAM_CLASS in str(
                        noise_group[sub_group].attrs[H5.CLASS_ATTRIBUTE][0]):

                    # Get the frequency reference data from the frequency axis dataset
                    frequency_dataset = noise_group[sub_group + '/' +
                                                    H5.FREQUENCY_DATASET]
                    file_index.loc[
                        row, H5.
                        INITIAL_FREQUENCY_ATTRIBUTE] = frequency_dataset.attrs[
                            H5.INITIAL_FREQUENCY_ATTRIBUTE][0]
                    file_index.loc[
                        row, H5.
                        FINAL_FREQUENCY_ATTRIBUTE] = frequency_dataset.attrs[
                            H5.FINAL_FREQUENCY_ATTRIBUTE][0]

                    # Get the time reference data from the timestamp coarse dataset
                    timestamp_coarse_dataset = noise_group[
                        sub_group + '/' + H5.TIMESTAMP_COARSE_DATASET]
                    file_index.loc[
                        row, H5.
                        START_TIME_COARSE_ATTRIBUTE] = timestamp_coarse_dataset.attrs[
                            H5.START_TIME_COARSE_ATTRIBUTE][0]
                    file_index.loc[
                        row, H5.
                        STOP_TIME_COARSE_ATTRIBUTE] = timestamp_coarse_dataset.attrs[
                            H5.STOP_TIME_COARSE_ATTRIBUTE][0]

        # If there is no noise group
        else:
            # Issue error message; the file will not contribute time/frequency reference data
            cl.log_message(
                "File {} does not include reference noise data and will be ignored"
                .format(file_name))

        if H5.CHANNEL_DATA_GROUP in file_object:
            # Get a handle on the activity profile group
            channel_group = file_object[H5.CHANNEL_DATA_GROUP]

            # get all sub groups group names within the noise group
            for sub_group in channel_group:

                # if sub group corresponds to activity profile group.
                if H5.ACTIVITY_PROFILE_CLASS in str(
                        channel_group[sub_group].attrs[H5.CLASS_ATTRIBUTE][0]):

                    data = [
                        "",
                        file_index.loc[row, cn.FILENAME_ATTRIBUTE],
                        sub_group,
                        file_index.loc[row, H5.START_TIME_COARSE_ATTRIBUTE],
                        file_index.loc[row, H5.STOP_TIME_COARSE_ATTRIBUTE],
                        channel_group[sub_group].attrs[
                            H5.AVERAGE_CHANNEL_SAMPLE_RATE][0],
                        channel_group[sub_group].attrs[
                            H5.CHANNEL_EDGE_INITIAL_FREQUENCY][0],
                        channel_group[sub_group].attrs[
                            H5.CHANNEL_EDGE_INITIAL_FREQUENCY]
                        [0],  # channel edge inner frequency is equal to edge frequency for a single file
                        channel_group[sub_group].attrs[
                            H5.CHANNEL_CORE_INITIAL_FREQUENCY][0],
                        channel_group[sub_group].attrs[
                            H5.CHANNEL_CORE_FINAL_FREQUENCY][0],
                        channel_group[sub_group].attrs[
                            H5.CHANNEL_EDGE_FINAL_FREQUENCY]
                        [0],  # channel edge inner frequency is equal to edge frequency for a single file
                        channel_group[sub_group].attrs[
                            H5.CHANNEL_EDGE_FINAL_FREQUENCY][0]
                    ]

                    channel_index.loc[channel_row] = data
                    channel_row += 1

    # sort files by timestamp
    file_index.sort_values(by=[H5.START_TIME_COARSE_ATTRIBUTE],
                           ascending=[True],
                           inplace=True)
    file_index.reset_index(inplace=True, drop=True)
    #file_index.to_csv(cn.FOLDER_TO_STORE_FILES+'/'+'file_index_after.csv', index=None, header=True)

    # sort channels by core initial frequency and timestamp
    channel_index.sort_values(
        by=[H5.CHANNEL_CORE_INITIAL_FREQUENCY, H5.START_TIME_COARSE_ATTRIBUTE],
        ascending=[True, True],
        inplace=True)
    channel_index.reset_index(inplace=True, drop=True)
    #channel_index.to_csv(cn.FOLDER_TO_STORE_FILES+'/'+'channel_index_after.csv', index=None, header=True)

    # store the index tables created
    index_store = pd.HDFStore(cn.FOLDER_TO_STORE_FILES + '/' +
                              cn.INDEX_FILENAME)
    index_store[cn.FILE_INDEX] = file_index
    index_store[cn.SITE_INDEX] = site_index
    index_store[cn.CHANNEL_INDEX] = channel_index
    index_store.close()

    # output message
    cl.log_message("Finish indexing {} files".format(index_length))
def _main():

    # constant to activate the demonstration plot
    PLOT_COMPARISON = True

    cl.log_message("Starting channel distance processing")

    # open file with h5Py method
    input_file_name = cn.FOLDER_TO_STORE_FILES+'/'+cn.DATA_FILENAME
    input_file = h5py.File(input_file_name, 'r')

    input_group_name = H5.CHANNEL_DATA_GROUP+"/"+cn.CHANNEL_MEAN_LEVEL_CATALOG
    input_group = input_file[input_group_name]

    # create dictionary to store channel traces
    channel_traces = {}
    channel_frequency = {}

    # create dataframe to store channel info
    channel_info = pd.DataFrame()

    # loop through all channels, load data into the traces dictionary and extract reference information into the channel info dataframe
    for channel_id in input_group:

        # TODO: Channels should have attributes and those should be verified to confirm proper function
        one_channel = input_group[channel_id][1]

        channel_width = len(one_channel)

        # ignore channels that are too small
        if channel_width > cn.MINIMUM_CHANNEL_WIDTH:

            # find maximum
            maximum_level = np.max(one_channel)

            # locate the index of the maximum (np.argmax returns the first occurrence when several bins tie)
            index_of_maximum = int(np.argmax(one_channel))

            frequency_at_peak = input_group[channel_id][0][index_of_maximum]

            # transform the level to a relative scale where the maximum equals 1
            channel_traces[channel_id] = np.array(one_channel/maximum_level, dtype='float64')
            channel_frequency[channel_id] = (np.array(input_group[channel_id][0], dtype='float64')-frequency_at_peak)/1000

            channel_info.loc[channel_id, 'Max Index'] = index_of_maximum
            channel_info.loc[channel_id, 'Bin Width'] = channel_width
            channel_info.loc[channel_id, 'Min Value'] = np.min(channel_traces[channel_id])
            channel_info.loc[channel_id, 'Max Value'] = maximum_level
            channel_info.loc[channel_id, 'Frequency Max Value'] = frequency_at_peak
    # close file since all data has been loaded into memory
    input_file.close()

    # create a list of keys used for the channels trace designation on the corresponding dictionary
    channel_list = list(channel_traces.keys())

    # create array to store the condensed distance matrix
    number_of_channels = len(channel_list)
    condensed_distance = np.empty(int(round(((number_of_channels**2)-number_of_channels)/2, 0)))
    c_d_index = 0

    # loop through channels to compute the distance between them
    for ref_channel_index in range(0, number_of_channels):

        # get key and trace from index
        ref_channel_id = channel_list[ref_channel_index]

        cl.log_message("Starting channel {}".format(ref_channel_id))

        # defines the lower limit to the reference channel level range
        min_ref_level = channel_info.loc[ref_channel_id, 'Min Value']

        #PLOT_THRESHOLD = 0.3
        #plot_this = (min_ref_level < PLOT_THRESHOLD)

        # loop through all channels and compute the distance
        for target_channel_index in range(ref_channel_index+1, number_of_channels):

            if PLOT_COMPARISON:
                percentage_string = "\r{}%".format(round(100.0*(target_channel_index/number_of_channels), ndigits=1))
                print(percentage_string, end="\r", flush=True)

            # get key and trace from index
            target_channel_id = channel_list[target_channel_index]
            target_channel_trace = channel_traces[target_channel_id]

            # defines the lower limit to the target channel level range
            min_target_level = channel_info.loc[target_channel_id, 'Min Value']

            #plot_that = (min_target_level < PLOT_THRESHOLD)
            #plot_all = plot_this and plot_that

            #if (target_channel_id == '462574') and (ref_channel_id == '451011'):
            #    print('gotcha')

            # Cut the channels in the level axis such that both have the same range. Equivalent to raising the noise floor
            # If the reference channel has larger minimum reference level
            if min_ref_level > min_target_level:
                # Means that the peak value of the reference channel is lower, closer to the noise level
                # cut the target channel to the same range as the reference channel and copy the reference to the work variable
                target_channel_trace = target_channel_trace[target_channel_trace[:] >= min_ref_level]
                work_ref_channel_trace = channel_traces[ref_channel_id]

            elif min_ref_level < min_target_level:
                # else, if the reference channel has a smaller minimum reference level
                # cut the reference channel to the same range as the target channel and leave the target channel as it is
                work_ref_channel_trace = channel_traces[ref_channel_id][channel_traces[ref_channel_id][:] >= min_target_level]

            else:
                # else, both are the same, and just update the work variable reference
                work_ref_channel_trace = channel_traces[ref_channel_id]

            # After level cut, assign first trace on the correlation process to be the longest.
            # The correlation values are not affected by the order but their relative indexing to the result is.
            # For the implemented method of alignment, the first trace should be the largest
        
            if work_ref_channel_trace.size > target_channel_trace.size:
                smaller_trace = target_channel_trace.view()
                larger_trace = work_ref_channel_trace.view()

                if PLOT_COMPARISON:
                    figure_file_name = "./Images/"+figure1_name+".png"
                    figure1_name = "Comparison_{}-{} ".format(ref_channel_id, target_channel_id)
                    plt.figure(figure1_name)

                    sp1 = plt.subplot(311)
                    plt.title('Channel [{}] (wider)'.format(ref_channel_id))
                    plt.ylabel('Norm. Level')
                    plt.plot(channel_traces[ref_channel_id], 'r-o')
                    plt.setp(sp1.get_xticklabels(), visible=False)

                    sp2 = plt.subplot(312, sharex=sp1, sharey=sp1)
                    plt.title('Channel [{}] (narrower)'.format(target_channel_id))
                    plt.ylabel('Norm. Level')
                    plt.plot(channel_traces[target_channel_id], 'b-^')
                    plt.setp(sp2.get_xticklabels(), visible=False)

            else:
                smaller_trace = work_ref_channel_trace.view()
                larger_trace = target_channel_trace.view()

                if PLOT_COMPARISON:
                    figure1_name = "Comparison_{}-{} ".format(ref_channel_id, target_channel_id)
                    plt.figure(figure1_name)

                    sp1 = plt.subplot(311)
                    plt.title('Channel [{}] (wider)'.format(target_channel_id))
                    plt.ylabel('Norm. Level')
                    plt.plot(channel_traces[target_channel_id], 'r-o')
                    plt.setp(sp1.get_xticklabels(), visible=False)

                    sp2 = plt.subplot(312, sharex=sp1, sharey=sp1)
                    plt.title('Channel [{}] (narrower)'.format(ref_channel_id))
                    plt.ylabel('Norm. Level')
                    plt.plot(channel_traces[ref_channel_id], 'b-^')
                    plt.setp(sp2.get_xticklabels(), visible=False)

            """
            larger_trace = work_ref_channel_trace.view()
            smaller_trace = target_channel_trace.view()
            """

            # computes the cross correlation between channels
            correlation = signal.correlate(larger_trace, smaller_trace, mode='full', method='fft')
            peak_correlation_index = np.argmax(correlation)

            # compute the number of samples to trim from the beginning of one of the traces so that both are aligned at the point of maximum correlation
            total_trace_shift = peak_correlation_index-(smaller_trace.size-1)
            size_difference = larger_trace.size - smaller_trace.size

            # if the total shift is negative
            if total_trace_shift < 0:
                # smaller trace needs to be moved to the left,
                # cut the begin of the smaller trace by the total shift
                smaller_trace = smaller_trace[-total_trace_shift:]
                # cut the larger trace to the same size as the second
                larger_trace = larger_trace[0:(larger_trace.size-size_difference+total_trace_shift)]
            # else, smaller trace needs to be moved to the right
            else:
                end_offset = size_difference-total_trace_shift
                # if shift is equal or smaller than the difference in size of the traces
                if end_offset >= 0:
                    # cut the begin of the larger trace by the required shift
                    # cut the end of the larger trace to match the sizes
                    larger_trace = larger_trace[total_trace_shift:larger_trace.size-end_offset]
                # else, smaller trace needs to be moved to the right but will overflow the larger trace
                else:
                    # cut the smaller trace by the difference from the shift and size difference
                    smaller_trace = smaller_trace[0:smaller_trace.size+end_offset]
                    # cut the beginning of the larger trace to match the smaller trace
                    larger_trace = larger_trace[size_difference-end_offset:]

            # Compute the error. Uses RMSE as a normalized approximation of the Euclidean distance (RSSE)
            # The use of the mean is necessary due to the variable number of bins

            rms_distance = np.sqrt(np.mean((smaller_trace-larger_trace)**2))

            channel_info.loc[ref_channel_id, target_channel_id] = rms_distance

            condensed_distance[c_d_index] = rms_distance
            c_d_index += 1


            if PLOT_COMPARISON:
                sp3 = plt.subplot(313, sharex=sp1, sharey=sp1)
                plt.title('Traces aligned and cropped for comparison')
                plt.ylabel('Norm. Level')
                plt.xlabel('Frequency bin index')
                plt.plot(larger_trace, 'r-o', smaller_trace, 'b-^')
                plt.setp(sp3.get_xticklabels(), visible=True)
                plt.tight_layout()

                figure_file_name = "./Images/Compare/"+figure1_name+".png"
                plt.savefig(figure_file_name)

                #plt.show()

                plt.close(figure1_name)

            """
            if ref_channel_id == '450188': # '466862':

                half_trace_length = int(work_ref_channel_trace.size/2)
                if 2*half_trace_length < work_ref_channel_trace.size:
                    ref_index = np.arange(-half_trace_length, half_trace_length+1, 1)
                else:
                    ref_index = np.arange(-half_trace_length, half_trace_length, 1)

                half_trace_length = int(target_channel_trace.size/2)
                if 2*half_trace_length < target_channel_trace.size:
                    target_index = np.arange(-half_trace_length, half_trace_length+1, 1)
                else:
                    target_index = np.arange(-half_trace_length, half_trace_length, 1)

                half_trace_length = int(correlation.size/2)
                if 2*half_trace_length < correlation.size:
                    cor_index = np.arange(-half_trace_length, half_trace_length+1, 1)
                else:
                    cor_index = np.arange(-half_trace_length, half_trace_length, 1)

                ref_index = np.arange(0, larger_trace.size, 1)
                target_index = np.arange(0, target_channel_trace.size, 1)
                cor_index = np.arange(0, correlation.size, 1)

                plt.figure(1)
                plt.subplot(211)
                plt.plot(larger_trace, 'r-', smaller_trace, 'b-')

                plt.subplot(212)
                plt.plot(correlation, 'g-')
                plt.show()

                plt.plot(larger_trace, 'r-', smaller_trace, 'b-',correlation/np.max(correlation), 'g-')
                plt.show()

            if is_it_autocorrelation:
                autocorrelation = np.max(correlation)-np.min(correlation)
                is_it_autocorrelation = False
                channel_info.loc[ref_channel_id, target_channel_id] = 1.0
            else:
                # store the relative correlation peak as reference for the channel similarity
                channel_info.loc[ref_channel_id, target_channel_id] = (np.max(correlation)-np.min(correlation))/autocorrelation
            """

        print("\n")

    # perform grouping by the distance and plot dendrograms
    NUMBER_OF_GROUPS = 6
    figure2_name = "Dendrogram cut p={}".format(NUMBER_OF_GROUPS)
    plt.figure(figure2_name, figsize=(8, 6), dpi=80, frameon=False)
    linkage_matrix = linkage(condensed_distance, method="complete", optimal_ordering=True)
    cut_dendo = dendrogram(linkage_matrix,
                           labels=channel_list,
                           truncate_mode='lastp',
                           p=NUMBER_OF_GROUPS,
                           leaf_rotation=90.,
                           leaf_font_size=9.,
                           show_contracted=True)

    figure_file_name = "./Images/"+figure2_name+".png"
    plt.savefig(figure_file_name)

    figure3_name = "Dendogram Complete"
    plt.figure(figure3_name, figsize=(8, 6), dpi=80, frameon=False)
    complete_dendo = dendrogram(linkage_matrix,
                                labels=channel_list,
                                leaf_rotation=90.,
                                leaf_font_size=8.)

    figure_file_name = "./Images/"+figure3_name+".png"
    plt.savefig(figure_file_name)

    # create a description of the channels in each group that composes a specific branch
    leaves = complete_dendo['ivl']
    branch_id = 0
    branch_dict = {branch_id:[]}
    number_of_leaves_in_branche = int(cut_dendo['ivl'][branch_id][1:-1])-1
    number_of_leaves_already_considered = 0

    for leave_index, channel_id in enumerate(leaves):
        if leave_index > number_of_leaves_in_branche+number_of_leaves_already_considered:
            branch_id += 1
            number_of_leaves_in_branche = int(cut_dendo['ivl'][branch_id][1:-1])-1
            number_of_leaves_already_considered = leave_index
            branch_dict[branch_id] = []

        branch_dict[branch_id] = branch_dict[branch_id]+[channel_id]

    classified_groups = pd.DataFrame({k: pd.Series(v) for k, v in branch_dict.items()})

    cl.table_dataframe(classified_groups)

    plt.show()

    plt.close(figure2_name)
    plt.close(figure3_name)

    cl.table_dataframe(channel_info)

    channel_ref_index = 0
    group_referece = 1

    """
    # identify channel within each group that has the highest signal to noise ratio to represent the group
    for group_size_string in cut_dendo['ivl']:
        group_size = int(group_size_string[1:-1])
        channel_id = complete_dendo['ivl'][channel_ref_index]
        minimum_channel_level = channel_info.loc[channel_id, 'Min Value']
        channel_group_reference = channel_id
        for channel_index in range(1,group_size):
            channel_ref_index += 1
            channel_id = complete_dendo['ivl'][channel_ref_index]
            current_channel_level = channel_info.loc[channel_id, 'Min Value']
            if minimum_channel_level > current_channel_level:
                minimum_channel_level = current_channel_level
                channel_group_reference = channel_id
        channel_ref_index += 1

        plt.figure("Channel {}. Reference to group {}".format(channel_group_reference, group_referece))
        plt.plot(channel_frequency[channel_group_reference],
                 channel_traces[channel_group_reference]*channel_info.loc[channel_group_reference, 'Max Value'])
        plt.ylabel("Level [dB\u03BCV/m]")
        plt.xlabel("Frequency[kHz]")

        group_referece += 1

    #dendrogram(linkage_matrix, labels=channel_list, leaf_rotation=90., leaf_font_size=12.)

    
    """

    # store the dataframe with the data.
    output_file_name = cn.FOLDER_TO_STORE_FILES+'/'+cn.DATA_FILENAME
    file_data_store = pd.HDFStore(output_file_name)
    output_group_name = H5.CHANNEL_DATA_GROUP+"/"+cn.INTER_CHANNEL_DISTANCES
    file_data_store[output_group_name] = channel_info
    output_group_name = H5.CHANNEL_DATA_GROUP+"/"+cn.INTER_CHANNEL_DISTANCES_CONDENSED_MATRIX
    file_data_store[output_group_name] = pd.DataFrame(condensed_distance)
    file_data_store.close()

    cl.log_message("Finish processing")
def _main():

    index_store = pd.HDFStore(cn.FOLDER_TO_STORE_FILES + '/' +
                              cn.INDEX_FILENAME)
    file_index = index_store[cn.FILE_INDEX]
    site_index = index_store[cn.SITE_INDEX]
    channel_index = index_store[cn.CHANNEL_INDEX]

    # update the store with the sorted file_index
    index_store[cn.FILE_INDEX] = file_index

    # create new dataframe that will store the consolidated data for the detected channels
    channel_data = pd.DataFrame(columns=[
        cn.CHANNEL_ID, H5.CHANNEL_EDGE_INITIAL_FREQUENCY,
        cn.CHANNEL_INNER_EDGE_INITIAL_FREQUENCY,
        H5.CHANNEL_CORE_INITIAL_FREQUENCY, H5.CHANNEL_CORE_FINAL_FREQUENCY,
        cn.CHANNEL_INNER_EDGE_FINAL_FREQUENCY, H5.CHANNEL_EDGE_FINAL_FREQUENCY
    ])

    # initialize the consolidated channel dataframe
    channel_row = 0
    input_file_name = channel_index.loc[channel_row, cn.FILENAME_ATTRIBUTE]
    center_frequency = round(
        (channel_index.loc[channel_row, H5.CHANNEL_CORE_FINAL_FREQUENCY] +
         channel_index.loc[channel_row, H5.CHANNEL_CORE_INITIAL_FREQUENCY]) /
        2000)
    cl.log_message("Starting channel {} at {}kHz with file {}".format(
        channel_row, center_frequency, input_file_name))

    current_channel_id = "Channel {}".format(channel_row)
    data = [
        current_channel_id,
        channel_index.iloc[channel_row, 6],  # edge initial frequency
        channel_index.iloc[channel_row, 7],  # edge inner initial frequency
        channel_index.iloc[channel_row, 8],  # core initial frequency
        channel_index.iloc[channel_row, 9],  # core final frequency
        channel_index.iloc[channel_row, 10],  # edge inner final frequency
        channel_index.iloc[channel_row, 11]
    ]  # edge final frequency

    channel_data.loc[channel_row] = data
    channel_index.loc[channel_row, cn.CHANNEL_ID] = current_channel_id

    # loop through all channel information from different files
    index_length = len(channel_index.index)
    for row in range(1, index_length, 1):
        # test if the channel core on the consolidated list 'd' (channel_data at channel_row) intersects the channel on the index file 'i' (channel_index at row), or the other way around
        index_core_first = channel_index.loc[
            row, H5.CHANNEL_CORE_INITIAL_FREQUENCY] <= channel_data.loc[
                channel_row, H5.CHANNEL_CORE_INITIAL_FREQUENCY]
        data_core_first = channel_data.loc[
            channel_row,
            H5.CHANNEL_CORE_INITIAL_FREQUENCY] <= channel_index.loc[
                row, H5.CHANNEL_CORE_INITIAL_FREQUENCY]
        i_inside_d = index_core_first and (
            channel_data.loc[channel_row, H5.CHANNEL_CORE_INITIAL_FREQUENCY] <=
            channel_index.loc[row, H5.CHANNEL_CORE_FINAL_FREQUENCY])
        d_inside_i = data_core_first and (
            channel_index.loc[row, H5.CHANNEL_CORE_INITIAL_FREQUENCY] <=
            channel_data.loc[channel_row, H5.CHANNEL_CORE_FINAL_FREQUENCY])

        # if they do intersect, merge the channel information
        if i_inside_d or d_inside_i:
            # link channel index with the channel data
            channel_index.loc[row, cn.CHANNEL_ID] = current_channel_id

            # update the channel data boundaries. Core is the intersection, edge is the union
            # if stored edge begin is larger than the new edge
            if channel_data.iloc[channel_row, 1] > channel_index.iloc[row, 6]:
                # update edge begin. (move edge to the left, expand)
                channel_data.iloc[channel_row, 1] = channel_index.iloc[row, 6]

            # if stored edge begin is smaller than the new edge
            if channel_data.iloc[channel_row, 2] < channel_index.iloc[row, 7]:
                # update inner edge begin (move edge to the right, contract). There is no risk of edges crossing since the core is in between, a condition guaranteed by the detection algorithm
                channel_data.iloc[channel_row, 2] = channel_index.iloc[row, 7]

            # if stored core begin is lower than the new core
            if channel_data.iloc[channel_row, 3] < channel_index.iloc[row, 8]:
                # if the new core begin is still smaller than the core end. Necessary to avoid a core width equal to or smaller than zero
                if channel_index.iloc[row, 8] < channel_data.iloc[channel_row,
                                                                  4]:
                    # update core begin (move core to the right, contract)
                    channel_data.iloc[channel_row, 3] = channel_index.iloc[row,
                                                                           8]

            # if stored core end is higher than the new core
            if channel_data.iloc[channel_row, 4] > channel_index.iloc[row, 9]:
                # if new core end still larger than the core begin
                if channel_index.iloc[row, 9] > channel_data.iloc[channel_row,
                                                                  3]:
                    # update core end (move core to the left, contract)
                    channel_data.iloc[channel_row, 4] = channel_index.iloc[row,
                                                                           9]

            # if stored edge end is larger than the new edge
            if channel_data.iloc[channel_row, 5] > channel_index.iloc[row, 10]:
                # update edge end (move edge to the left, contract)
                channel_data.iloc[channel_row, 5] = channel_index.iloc[row, 10]

            # if stored edge end is smaller than the new edge
            if channel_data.iloc[channel_row, 6] < channel_index.iloc[row, 11]:
                # update edge end (move edge to the right, expand)
                channel_data.iloc[channel_row, 6] = channel_index.iloc[row, 11]

            input_file_name = channel_index.loc[row, cn.FILENAME_ATTRIBUTE]
            center_frequency = round(
                (channel_data.loc[channel_row, H5.CHANNEL_CORE_FINAL_FREQUENCY]
                 + channel_data.loc[channel_row,
                                    H5.CHANNEL_CORE_INITIAL_FREQUENCY]) / 2000)
            cl.log_message(
                "   | Merged channel {} at {}kHz with file {}".format(
                    row, center_frequency, input_file_name))

        # if they do not intersect
        else:
            # a new channel needs to be assigned
            channel_row += 1
            current_channel_id = "Channel {}".format(channel_row)
            data = [
                current_channel_id, channel_index.iloc[row, 6],
                channel_index.iloc[row, 7], channel_index.iloc[row, 8],
                channel_index.iloc[row, 9], channel_index.iloc[row, 10],
                channel_index.iloc[row, 11]
            ]

            channel_data.loc[channel_row] = data
            channel_index.loc[row, cn.CHANNEL_ID] = current_channel_id

            input_file_name = channel_index.loc[row, cn.FILENAME_ATTRIBUTE]
            center_frequency = round(
                (channel_data.loc[channel_row, H5.CHANNEL_CORE_FINAL_FREQUENCY]
                 + channel_data.loc[channel_row,
                                    H5.CHANNEL_CORE_INITIAL_FREQUENCY]) / 2000)
            cl.log_message("Starting channel {} at {}kHz with file {}".format(
                channel_row, center_frequency, input_file_name))

    # loop through channel data and reassign names to the new center frequency in kHz rounded to an integer
    cl.log_message("Starting channel renaming")

    index_length = len(channel_data.index)
    for row in range(0, index_length, 1):
        center_frequency = round(
            (channel_data.loc[row, H5.CHANNEL_CORE_FINAL_FREQUENCY] +
             channel_data.loc[row, H5.CHANNEL_CORE_INITIAL_FREQUENCY]) / 2000)

        current_channel_id = "{:.0f}".format(center_frequency)

        channel_index.replace(to_replace=channel_data.loc[row, cn.CHANNEL_ID],
                              value=current_channel_id,
                              inplace=True)
        channel_data.loc[row, cn.CHANNEL_ID] = current_channel_id
        cl.log_message("Channel {} renamed at {}kHz".format(
            row, center_frequency))
        """
        print("{} - {}: {:.0f}, {:.0f}, {:.0f}, {:.0f}".format(row,
                                                               channel_data.loc[row, cn.CHANNEL_ID],
                                                               channel_data.loc[row, H5.CHANNEL_EDGE_INITIAL_FREQUENCY],
                                                               channel_data.loc[row, H5.CHANNEL_CORE_INITIAL_FREQUENCY],
                                                               channel_data.loc[row, H5.CHANNEL_CORE_FINAL_FREQUENCY],
                                                               channel_data.loc[row, H5.CHANNEL_EDGE_FINAL_FREQUENCY]))
        """
    #channel_data.to_csv(cn.FOLDER_TO_STORE_FILES+'/'+'channel_data.csv', index=None, header=True)

    index_store[cn.CHANNEL_DATA_TABLE] = channel_data
    index_store[cn.CHANNEL_INDEX] = channel_index

    index_store.close()

    cl.log_message("Finish data indexing")
Example #8
def _main():

    # open file and get a list of the channel spectrogram groups. Uses H5py for better efficiency
    data_store_file = h5py.File(cn.FOLDER_TO_STORE_FILES + '/' +
                                cn.DATA_FILENAME, 'r')
    channel_spectrogram_list = list(
        data_store_file[H5.CHANNEL_DATA_GROUP].keys())
    data_store_file.close()

    # create array with bin edges to be used on the histogram
    profile_histogram_bins = np.arange(1, 8, 0.05)
    numpy_histogram_bins = np.r_[-np.inf, profile_histogram_bins, np.inf]

    # create empty dataframe to store results
    channel_distances = pd.DataFrame()

    for spectrogram_group_name in channel_spectrogram_list:

        # reopen the file with pandas HDF
        data_store_file = pd.HDFStore(cn.FOLDER_TO_STORE_FILES + '/' +
                                      cn.DATA_FILENAME)

        # Test if dataset is of the spectrogram type
        if H5.EM_SPECTRUM_DATA_GROUP in spectrogram_group_name:

            # get the channel ID
            channel_id = spectrogram_group_name.split(
                H5.EM_SPECTRUM_DATA_GROUP)[1]

            # get the dataframe
            channel_traces = data_store_file[H5.CHANNEL_DATA_GROUP + '/' +
                                             spectrogram_group_name]
            frequency_at_peak = channel_traces.idxmax(axis=1).mean()

            number_of_time_samples = channel_traces.shape[0]

            if number_of_time_samples > cn.MINIMUM_NUMBER_SAMPLES_FOR_INNER_ANALYSIS:

                # reduce the number of traces to make computation viable
                if number_of_time_samples * channel_traces.shape[
                        1] > cn.MAXIMUM_NUMBER_DATAPOINTS_FOR_INNER_ANALYSIS:
                    number_of_time_samples = int(
                        round(
                            cn.MAXIMUM_NUMBER_DATAPOINTS_FOR_INNER_ANALYSIS /
                            channel_traces.shape[1], 0))
                    channel_traces = channel_traces.iloc[
                        0:number_of_time_samples, :]

                figure_name = "Spectrogram channel {}".format(channel_id)
                plt.figure(figure_name)
                plt.subplot(121)
                plt.title("Spectrogram for channel {}".format(channel_id))
                xy_array = channel_traces.to_numpy(dtype='float32')
                frequency_axis = channel_traces.columns.to_numpy(
                    dtype='float64')
                x_axis = (frequency_axis - frequency_at_peak) / 1000.0
                y_axis = channel_traces.index.to_numpy(dtype='float64')
                y_axis = y_axis - y_axis.min()

                plt.pcolormesh(x_axis,
                               np.arange(0, xy_array.shape[0]),
                               xy_array,
                               cmap='CMRmap_r')
                plt.xlabel("Frequency[kHz]")
                plt.ylabel("Index")

                # flatten the dataframe to a series
                flatten_trace = channel_traces.to_numpy(
                    dtype='float32').flatten()
                trace_bin_width = channel_traces.shape[1]

                profile, profile_index = matrixProfile.stomp(
                    flatten_trace, trace_bin_width)

                plt.subplot(122)
                plt.title("Matrix Profile".format(channel_id))
                plt.ylim((0, profile.size))
                plt.yticks(ticks=np.arange(
                    0, profile.size, profile.size / number_of_time_samples),
                           labels='')
                plt.grid(which='major', axis='y')
                plt.plot(profile, np.arange(0, profile.size))

                combined_dataframe = pd.Series(flatten_trace).to_frame()
                combined_dataframe['profile'] = np.append(
                    profile,
                    np.zeros(trace_bin_width - 1) + np.nan)

                # no exclusion zone
                ex_zone = trace_bin_width

                # TODO: instead of using fixed number, should use a measure above noise
                number_of_modes = math.ceil(number_of_time_samples *
                                            cn.PEAK_SAMPLE_RATIO)

                # get a maximum of one anomaly for each trace
                profile_peak = discords(combined_dataframe['profile'],
                                        ex_zone,
                                        k=number_of_modes + 1)

                # get the peaks into a dataframe with corresponding matrix profile values
                profile_peak_df = combined_dataframe.loc[profile_peak,
                                                         'profile']

                # select peaks that have large profile values considering the defined threshold
                # profile_peak_df=profile_peak_df[profile_peak_df.loc[:]>cn.MPROFILE_NOISE_THRESHOLD]

                profile_peak_df = profile_peak_df.reset_index()

                # compute the corresponding trace index based on the flatten index
                profile_peak_df['trace_index'] = round(
                    profile_peak_df['index'] / trace_bin_width, 0)
                order = 1

                for one_peak_index in profile_peak_df['trace_index']:
                    plt.subplot(121)
                    plot_name = "{}".format(order)
                    x_pos = x_axis.max()
                    y_pos = int(one_peak_index)
                    arrow_end_pos = x_pos + ((x_pos - x_axis.min()) / 25)
                    plt.annotate(plot_name,
                                 xy=(x_pos, y_pos),
                                 xycoords="data",
                                 xytext=(arrow_end_pos, y_pos),
                                 textcoords='data',
                                 arrowprops=dict(arrowstyle="->",
                                                 connectionstyle="arc3"))
                    order += 1

                figure_file_name = "./Images/" + figure_name + ".png"
                plt.savefig(figure_file_name)
                plt.close(figure_name)

                # TODO: Use profile_peak_df to split spectrogram into two subchannels. Compute new Profile Density

                # store the distance information as reference to the channel
                channel_distance_descriptor = pd.DataFrame()
                channel_distance_descriptor.loc[
                    channel_id,
                    cn.INNER_DISTANCE_MAX] = profile_peak_df.loc[0, 'profile']
                channel_distance_descriptor.loc[
                    channel_id, cn.INNER_DISTANCE_MEAN] = profile_peak_df.mean(
                        axis='rows')['profile']

                # compute histogram
                profile_histogram, discarded_bins = np.histogram(
                    profile, bins=numpy_histogram_bins, density=False)

                # add overflow to last bin
                profile_histogram[-2] += profile_histogram[-1]
                profile_histogram = profile_histogram[0:-1]
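                # e.g. illustrative counts [4, 7, 2, 9] become [4, 7, 11]: the
                # overflow count of the extra bin is folded into the last
                # regular bin and the extra bin is dropped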

                # plot histogram
                histogram_maximum = profile_histogram.max()
                figure_name = "Matrix Profile histogram for channel {}".format(
                    channel_id)
                plt.figure(figure_name)
                plt.ylim(0, histogram_maximum * 1.05)
                plt.bar(profile_histogram_bins,
                        height=profile_histogram,
                        width=1)
                plt.plot(profile_histogram_bins, profile_histogram)
                figure_file_name = "./Images/" + figure_name + ".png"
                plt.savefig(figure_file_name)
                plt.close(figure_name)

                # convert np array with the histogram to a dataframe, using the bin values as column names
                profile_histogram_df = pd.DataFrame(
                    [profile_histogram],
                    index=[channel_id],
                    columns=profile_histogram_bins)

                # merge dataframes to get the complete channel inner distance profile information into the same row
                channel_distance_descriptor = channel_distance_descriptor.join(
                    profile_histogram_df)

                channel_distances = pd.concat(
                    [channel_distances, channel_distance_descriptor])

                # close the file used to read the data
                data_store_file.close()

                # store the dataframe with the data. Since this is a slow process, it is stored each update for safe interruption
                output_file_name = cn.FOLDER_TO_STORE_FILES + '/' + cn.DATA_FILENAME
                file_data_store = pd.HDFStore(output_file_name)
                output_group_name = H5.CHANNEL_DATA_GROUP + "/" + cn.CHANNEL_DISTANCES_DATA_GROUP
                file_data_store[output_group_name] = channel_distances
                file_data_store.close()

                cl.log_message(
                    "Processed channel {}. Inner distance of {}".format(
                        channel_id,
                        channel_distances.loc[channel_id,
                                              cn.INNER_DISTANCE_MAX]))

                #plt.show()

            else:

                # close the file used to read the data
                data_store_file.close()

                channel_distances.loc[channel_id, :] = np.nan

                output_file_name = cn.FOLDER_TO_STORE_FILES + '/' + cn.DATA_FILENAME
                file_data_store = pd.HDFStore(output_file_name)
                output_group_name = H5.CHANNEL_DATA_GROUP + "/" + cn.CHANNEL_DISTANCES_DATA_GROUP
                file_data_store[output_group_name] = channel_distances
                file_data_store.close()

                cl.log_message(
                    "Processed channel {}. Too few traces to evaluate inner distance. # traces: {}"
                    .format(channel_id, number_of_time_samples))
Example #9
0
def _main():

    index_store = pd.HDFStore(cn.FOLDER_TO_STORE_FILES+'/'+cn.INDEX_FILENAME)

    channel_data = index_store[cn.CHANNEL_DATA_TABLE]
    index_length = len(channel_data.index)-1

    channel_index = index_store[cn.CHANNEL_INDEX]
    channel_index.set_index(cn.CHANNEL_ID, inplace=True)

    # Loop through the channels, collecting the required information
    for row in range(index_length):

        # create empty dataframe to store the resulting profile
        profile_array_result = pd.DataFrame()
        number_of_traces = 0

        # Get the channel ID
        channel_id = channel_data.loc[row, cn.CHANNEL_ID]

        # Select files that contain the designated channel
        files_with_channel = channel_index[channel_index.index == channel_id]

        # Get the number of files to be processed
        channel_index_length = len(files_with_channel.index)

        # initialize boolean variable that signals the existence of at least one profile
        shall_remove_channel_reference = True

        # initialize the total number of traces here, to avoid scope issues from conditional initialization inside the loop
        number_of_traces_sum = 0

        # loop through files that are marked with the indicated channel
        for channel_row in range(channel_index_length):

            # get the file and group for the channel data
            input_file_name = files_with_channel.iloc[channel_row, 0]

            input_group_name = files_with_channel.iloc[channel_row, 1]
            input_group_name = input_group_name.replace(H5.ACTIVITY_PROFILE_DATA_GROUP, H5.LEVEL_PROFILE_DATA_GROUP)

            # open the file
            input_file_object = h5py.File(input_file_name, 'r')
            # TODO: Include test if the file follows the standard

            # Get a handle on the group
            input_profile_group = input_file_object[H5.CHANNEL_DATA_GROUP+"/"+input_group_name]

            # recover the dataset reference handle
            profile_dataset = input_profile_group[H5.LEVEL_PROFILE_DATASET]
            # TODO: Test if level units are compatible

            # update the counter for the number of traces on the profile
            number_of_traces = profile_dataset.attrs[H5.NUMBER_OF_PROFILE_TRACES_ATTRIBUTE][0]
            if number_of_traces > 0:
                number_of_traces_sum += number_of_traces

                # Get the level profile
                profile_array_new = pd.DataFrame(profile_dataset[:])
                profile_array_new.columns = input_profile_group[H5.FREQUENCY_DATASET][:]
                profile_array_new.index = input_profile_group[H5.LEVEL_DATASET][:]
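                # the frame is a 2-D occupancy profile: rows are indexed by
                # level bins and columns by frequency bins (presumably counts,
                # given that profiles from different files are summed below)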

                # Merge the new profile into the result
                profile_array_result = profile_array_result.add(profile_array_new, fill_value=0.0)

                profile_array_result.fillna(0, inplace=True)
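                # DataFrame.add with fill_value=0 aligns the two profiles on
                # both level (index) and frequency (column) bins, summing
                # overlapping cells and keeping lone cells unchanged, e.g. a
                # cell holding 3 counts in one file and 2 in another ends up
                # with 5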

                shall_remove_channel_reference = False
            else:
                cl.log_message("File {} has no profile data".format(input_file_name))
                #TODO: Delete from index

            cl.log_message("Processed {}/{}. Profile shape: {}".format(input_file_name, input_group_name, str(profile_array_result.shape)))

        # No profile is actually stored for the channel; this may happen if the channel was created only from a database reference
        if shall_remove_channel_reference:
            cl.log_message("No profile for channel {}".format(channel_id))
        
        else:
            # store the dataframe with the merged level profile
            output_file_name = cn.FOLDER_TO_STORE_FILES+'/'+cn.DATA_FILENAME
            file_data_store = pd.HDFStore(output_file_name)
            output_group_name = H5.CHANNEL_DATA_GROUP+"/"+H5.LEVEL_PROFILE_DATA_GROUP+channel_id
            file_data_store[output_group_name] = profile_array_result
            file_data_store.close()
            
            # store the attributes to the group where the dataframe is stored
            output_file_object = h5py.File(output_file_name, 'a')
            output_file_object[output_group_name].attrs[H5.NUMBER_OF_PROFILE_TRACES_ATTRIBUTE] = number_of_traces_sum
            output_file_object.close()

    index_store.close()

    # output message
    cl.log_message("Finish processing {} channels".format(index_length))