# Shared imports used by the scripts below. The local modules (cn: constants,
# H5: HDF5 naming conventions, cl: common helpers) are project-specific; their
# import paths are assumed here.
import glob
import math

import h5py
import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
import numpy as np
import pandas as pd
from scipy import signal
from scipy.cluster.hierarchy import dendrogram, linkage
from matrixprofile import matrixProfile
from matrixprofile.discords import discords

import constants as cn          # assumed module name for the cn constants
import h5_spectrum as H5        # assumed module name for the H5 naming definitions
import common_library as cl     # assumed module name for the cl helpers


def _main():
    index_store = pd.HDFStore(cn.FOLDER_TO_STORE_FILES + '/' + cn.INDEX_FILENAME)
    file_index = index_store[cn.FILE_INDEX]
    site_index = index_store[cn.SITE_INDEX]

    # TODO: Improve to handle multiple sites
    # merge site information into a single site
    index_length = len(site_index.index) - 1
    for row in range(index_length, 0, -1):
        # TODO: Test if the distance between the averages of the two sets is within the variance before adding. If not, more sites should be created
        # TODO: Test if the equipment ID matches and consolidate information only for the same equipment or, better, process equipment information separately
        site_index.loc[row - 1, H5.LATITUDE_MEMBER].add_set(site_index.loc[row, H5.LATITUDE_MEMBER])
        site_index.loc[row - 1, H5.LONGITUDE_MEMBER].add_set(site_index.loc[row, H5.LONGITUDE_MEMBER])
        # site_index.loc[row-1, H5.LATITUDE_MEMBER].print("site row {}: ".format(row-1))
        # site_index.loc[row-1, H5.LONGITUDE_MEMBER].print("site row {}: ".format(row-1))
        site_index.drop(row, inplace=True)

    # create a data table that will be used for plotting
    site_data = pd.DataFrame(columns=[H5.CRFS_HOSTNAME,
                                      H5.LATITUDE_MEMBER,
                                      H5.LONGITUDE_MEMBER,
                                      H5.START_TIME_COARSE_ATTRIBUTE,
                                      H5.STOP_TIME_COARSE_ATTRIBUTE])

    # store the site data on the table
    # TODO: Add variance to allow ellipse plotting of the site
    site_data.loc[0, H5.CRFS_HOSTNAME] = site_index.loc[0, H5.CRFS_HOSTNAME]
    site_data.loc[0, H5.LATITUDE_MEMBER] = site_index.loc[0, H5.LATITUDE_MEMBER].mean_value
    site_data.loc[0, H5.LONGITUDE_MEMBER] = site_index.loc[0, H5.LONGITUDE_MEMBER].mean_value
    site_data.loc[0, H5.START_TIME_COARSE_ATTRIBUTE] = file_index.loc[0, H5.START_TIME_COARSE_ATTRIBUTE]
    site_data.loc[0, H5.STOP_TIME_COARSE_ATTRIBUTE] = file_index.loc[len(file_index.index) - 1, H5.STOP_TIME_COARSE_ATTRIBUTE]

    # store the table on the index file
    index_store[cn.SITE_DATA_TABLE] = site_data

    output_file_name = cn.FOLDER_TO_STORE_FILES + '/' + cn.DATA_FILENAME
    file_data_store = pd.HDFStore(output_file_name)
    file_data_store[cn.SITE_DATA_TABLE] = site_data
    file_data_store.close()

    index_store.close()

    cl.log_message("Finish site data processing")
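# Illustration: the site merge above depends on the cl.Normal accumulator and its
# np_set/add_set/mean_value members, whose internals do not appear in these scripts.
# The sketch below is a hypothetical stand-in, assuming Welford-style pooling of
# running statistics; the real cl.Normal may differ.
class _NormalSketch:
    """Running mean/variance accumulator; hypothetical stand-in for cl.Normal."""

    def __init__(self):
        self.count = 0
        self.mean_value = 0.0
        self.m2 = 0.0  # sum of squared deviations from the running mean

    def np_set(self, values):
        # initialize the statistics from a scalar or an array of samples
        values = np.atleast_1d(np.asarray(values, dtype='float64'))
        self.count = values.size
        self.mean_value = float(values.mean())
        self.m2 = float(((values - self.mean_value) ** 2).sum())

    def add_set(self, other):
        # pool another accumulator into this one (parallel variance formula)
        if other.count == 0:
            return
        total = self.count + other.count
        delta = other.mean_value - self.mean_value
        self.mean_value += delta * other.count / total
        self.m2 += other.m2 + delta ** 2 * self.count * other.count / total
        self.count = total

    @property
    def variance(self):
        # sample variance; would support the TODO above about distance-vs-variance tests
        return self.m2 / (self.count - 1) if self.count > 1 else 0.0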
def _main():
    cl.log_message("Starting plotting level profile with average trace for all channels")

    # open file with the h5py method
    input_file_name = cn.FOLDER_TO_STORE_FILES + '/' + cn.DATA_FILENAME
    input_file = h5py.File(input_file_name, 'r')
    input_group_name = H5.CHANNEL_DATA_GROUP + "/" + cn.CHANNEL_MEAN_LEVEL_CATALOG
    input_group = input_file[input_group_name]

    # create dictionaries to store channel traces
    channel_traces = {}
    channel_frequency = {}
    frequency_at_peak = {}
    standard_axis = [-20, 20, 5, 45]  # [xmin, xmax, ymin, ymax]

    # loop through all channels, load data into the traces dictionary and extract reference information into the channel info dataframe
    for channel_id in input_group:
        # TODO: Channels should have attributes and those should be verified to confirm the proper function
        one_channel = input_group[channel_id][1]
        channel_width = len(one_channel)

        # ignore channels that are too small
        if channel_width > cn.MINIMUM_CHANNEL_WIDTH:
            # find the level range
            maximum_level = np.max(one_channel)
            minimum_level = np.min(one_channel)
            # locate the index for the maximum
            index_of_maximum = np.argmax(one_channel)
            # handle the multiple-maxima case
            if isinstance(index_of_maximum, np.ndarray):
                index_of_maximum = np.mean(index_of_maximum)
            # store the frequency value at the maximum
            frequency_at_peak[channel_id] = input_group[channel_id][0][index_of_maximum]
            # store the trace and its frequency axis centred on the peak, in kHz
            channel_traces[channel_id] = np.array(one_channel, dtype='float64')
            channel_frequency[channel_id] = (np.array(input_group[channel_id][0], dtype='float64') - frequency_at_peak[channel_id]) / 1000
            """
            # find the frequency range
            maximum_freq = np.max(channel_frequency[channel_id])
            minimum_freq = np.min(channel_frequency[channel_id])
            # set the global limiting axis
            if standard_axis[0] > minimum_freq:
                standard_axis[0] = minimum_freq
            if standard_axis[1] < maximum_freq:
                standard_axis[1] = maximum_freq
            if standard_axis[2] > minimum_level:
                standard_axis[2] = minimum_level
            if standard_axis[3] < maximum_level:
                standard_axis[3] = maximum_level
            """

    # close the file since all data has been loaded into memory
    input_file.close()

    # create a list of keys used for the channel trace designation on the corresponding dictionary
    channel_list = list(channel_traces.keys())
    number_of_channels = len(channel_list)

    # open the data file to retrieve the channel profiles
    profile_file_name = cn.FOLDER_TO_STORE_FILES + '/' + cn.DATA_FILENAME
    profile_store = pd.HDFStore(profile_file_name)

    # loop through channels and plot the level profile with the average trace overlaid
    for ref_channel_index in range(0, number_of_channels):
        # get the key and trace from the index
        channel_id = channel_list[ref_channel_index]
        cl.log_message("Starting channel {}".format(channel_id))

        # retrieve the level profile for the channel
        profile_group_name = H5.CHANNEL_DATA_GROUP + "/" + H5.LEVEL_PROFILE_DATA_GROUP + channel_id
        channel_profile = profile_store[profile_group_name]

        figure_name = "Level profile channel {}".format(channel_id)
        plt.figure(figure_name)

        xy_array = channel_profile.to_numpy(dtype='float32')
        frequency_axis = channel_profile.columns.to_numpy(dtype='float64')
        x_axis = (frequency_axis - frequency_at_peak[channel_id]) / 1000
        y_axis = channel_profile.index.to_numpy(dtype='float64')
        plt.pcolormesh(x_axis, y_axis, xy_array, cmap='CMRmap_r')
        plt.xlabel("Frequency [kHz]")
        plt.ylabel("Level [dB\u03BCV/m]")
        plt.axis(standard_axis)
        plt.plot(channel_frequency[channel_id], channel_traces[channel_id],
                 color='y', lw=2,
                 path_effects=[pe.Stroke(linewidth=4, foreground='w'), pe.Normal()])
        # plt.show()
        figure_file_name = "./Images/" + figure_name + ".png"
        plt.savefig(figure_file_name)
        plt.close(figure_name)

    profile_store.close()

    cl.log_message("Finish processing")
def _main():
    file_index_store = pd.HDFStore(cn.FOLDER_TO_STORE_FILES + '/' + cn.INDEX_FILENAME)
    channel_data = file_index_store[cn.CHANNEL_DATA_TABLE]
    index_length = len(channel_data.index)
    channel_index = file_index_store[cn.CHANNEL_INDEX]
    channel_index.set_index(cn.CHANNEL_ID, inplace=True)
    # channel_data_mean_level = pd.DataFrame()
    # channel_data_frequency = pd.DataFrame()

    # Loop through the grouped channels collecting the required information
    for row in range(index_length):
        # create an empty dataframe to store the resulting spectrogram
        spectrogram_result = pd.DataFrame()

        # Get the channel ID
        channel_id = channel_data.loc[row, cn.CHANNEL_ID]
        # get the cut frequencies at the inner edges, so that all traces can be cut to the size of the smallest
        initial_cut_frequency = channel_data.loc[row, cn.CHANNEL_INNER_EDGE_INITIAL_FREQUENCY]
        final_cut_frequency = channel_data.loc[row, cn.CHANNEL_INNER_EDGE_FINAL_FREQUENCY]

        # Select files that contain the designated channel
        files_with_channel = channel_index[channel_index.index == channel_id]
        # Get the number of files to be processed
        channel_index_length = len(files_with_channel.index)

        # loop through files that are marked with the indicated channel
        for channel_row in range(channel_index_length):
            # get the file and group for the channel data
            input_file_name = files_with_channel.iloc[channel_row, 0]
            input_group_name = files_with_channel.iloc[channel_row, 1]
            input_group_name = input_group_name.replace(H5.ACTIVITY_PROFILE_DATA_GROUP, H5.EM_SPECTRUM_DATA_GROUP)

            # open the file
            input_file_object = h5py.File(input_file_name, 'r')
            # TODO: Include a test to verify that the file follows the standard

            # Get a handle on the group
            input_group = input_file_object[H5.CHANNEL_DATA_GROUP + "/" + input_group_name]
            # recover the dataset reference handle
            spectrogram_dataset = input_group[H5.SPECTROGRAM_DATASET]

            # Get the spectrogram into a new dataframe
            spectrogram_new = pd.DataFrame(spectrogram_dataset[:])

            # set the dataframe columns as frequency, snapped to the common resolution grid
            frequency_axis = np.array(input_group[H5.FREQUENCY_DATASET][:] / cn.FREQUENCY_RESOLUTION)
            frequency_axis = cn.FREQUENCY_RESOLUTION * frequency_axis.round(0)
            spectrogram_new.columns = frequency_axis

            # set the dataframe index as timestamp
            timestamp_coarse = input_group[H5.TIMESTAMP_COARSE_DATASET][:]
            timestamp_fine = input_group[H5.TIMESTAMP_FINE_DATASET][:] / cn.NANOSECONDS_IN_SECOND
            spectrogram_new.index = timestamp_coarse + timestamp_fine

            # build a reduced frequency axis containing only the needed frequencies
            axis_has_been_cut = False
            if frequency_axis[0] < initial_cut_frequency:
                frequency_axis = frequency_axis[frequency_axis > initial_cut_frequency]
                axis_has_been_cut = True
            if frequency_axis[-1] > final_cut_frequency:
                frequency_axis = frequency_axis[frequency_axis < final_cut_frequency]
                axis_has_been_cut = True

            # if the frequency axis has been reduced
            if axis_has_been_cut:
                # cut the spectrogram dataframe using the reduced frequency axis
                spectrogram_new = spectrogram_new.filter(items=frequency_axis)

            # Merge the new spectrogram into the result.
            # Missing frequency bins are left as np.NaN
            spectrogram_result = spectrogram_result.append(spectrogram_new)

            cl.log_message("Processed {}/{}. Spectrogram shape: {}".format(channel_row + 1, channel_index_length, str(spectrogram_result.shape)))

        # Compute the mean level over all traces on the channel, for each frequency bin
        bin_mean_level = spectrogram_result.mean(axis='rows')
        mean_level_data = bin_mean_level.index.to_numpy(dtype='float64')
        mean_level_data = np.append([mean_level_data], [bin_mean_level.to_numpy(dtype='float64')], axis=0)

        # fill NaN values resulting from imperfect channel splicing with the average channel level of each bin
        spectrogram_result.fillna(value=bin_mean_level, inplace=True)

        # bin_mean_level = pd.DataFrame(bin_mean_level, columns=[channel_id])
        # bin_frequency = pd.DataFrame(bin_mean_level.index, columns=[channel_id])
        # bin_mean_level.reset_index(inplace=True, drop=True)
        # Store mean level on a channel catalog dataframe using the channel ID as index
        # channel_data_mean_level = channel_data_mean_level.join(bin_mean_level, how='outer')
        # channel_data_frequency = channel_data_frequency.join(bin_frequency, how='outer')

        cl.log_message("PROCESSED CHANNEL: {}".format(channel_id))

        # store the dataframe with the merged spectrogram
        output_file_name = cn.FOLDER_TO_STORE_FILES + '/' + cn.DATA_FILENAME
        file_data_store = pd.HDFStore(output_file_name)
        output_group_name = H5.CHANNEL_DATA_GROUP + "/" + H5.EM_SPECTRUM_DATA_GROUP + channel_id
        file_data_store[output_group_name] = spectrogram_result
        file_data_store.close()

        # reopen the file with the h5py method
        file_data_store = h5py.File(output_file_name, 'a')
        # Test if the data group exists and create it if not
        output_group_store = file_data_store[H5.CHANNEL_DATA_GROUP]
        if cn.CHANNEL_MEAN_LEVEL_CATALOG in output_group_store:
            output_group_store = output_group_store[cn.CHANNEL_MEAN_LEVEL_CATALOG]
            # test if the channel_id already exists and delete it if so
            if channel_id in output_group_store:
                del output_group_store[channel_id]
            # store the channel data
            output_group_store.create_dataset(channel_id, data=mean_level_data)
        else:
            output_group_store.create_group(cn.CHANNEL_MEAN_LEVEL_CATALOG)
            output_group_name = H5.CHANNEL_DATA_GROUP + "/" + cn.CHANNEL_MEAN_LEVEL_CATALOG
            file_data_store[output_group_name].create_dataset(channel_id, data=mean_level_data)
        file_data_store.close()

    # cl.table_dataframe(spectrogram_result)
    # cl.table_dataframe(channel_data_mean_level)
    # cl.plot_dataframe(channel_data_mean_level.reset_index())

    file_index_store.close()

    # output message
    cl.log_message("Finish indexing {} channels".format(index_length))
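# Illustration: the splice above only works because every file's frequency axis is
# snapped to the same cn.FREQUENCY_RESOLUTION grid before appending, so equal bins
# share a column label and non-overlapping bins become NaN (later filled with the
# per-bin mean). A toy run with made-up values (pandas < 2.0, which still provides
# DataFrame.append):
def _demo_frequency_grid_splice():
    resolution = 1000.0  # stand-in for cn.FREQUENCY_RESOLUTION, in Hz

    def snap(axis):
        # snap raw frequencies to the common grid, as done for spectrogram_new.columns
        return resolution * (axis / resolution).round(0)

    # two sweeps whose raw axes differ by less than half a bin
    sweep_a = pd.DataFrame([[10.0, 11.0, 12.0]], columns=snap(np.array([100012.0, 101007.0, 102003.0])))
    sweep_b = pd.DataFrame([[20.0, 21.0, 22.0]], columns=snap(np.array([100998.0, 102001.0, 103004.0])))

    # append aligns on the column labels: shared bins line up, the rest are NaN
    merged = sweep_a.append(sweep_b)
    #    100000.0  101000.0  102000.0  103000.0
    # 0      10.0      11.0      12.0       NaN
    # 0       NaN      20.0      21.0      22.0

    # the NaN holes are then filled with the per-bin mean, as in the script
    merged.fillna(merged.mean(axis='rows'), inplace=True)
    return merged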
def _main():
    index_store = pd.HDFStore(cn.FOLDER_TO_STORE_FILES + '/' + cn.INDEX_FILENAME)
    file_index = index_store[cn.FILE_INDEX]
    index_store.close()
    index_length = len(file_index.index)

    # create an empty dataframe to store the resulting profile
    profile_array_result = pd.DataFrame()
    number_of_traces = 0

    # Loop through files collecting the required information
    for row in range(index_length):
        # Open each file
        file_name = file_index[cn.FILENAME_ATTRIBUTE][row]
        file_object = h5py.File(file_name, 'r')
        # TODO: Include a test to verify that the file follows the standard

        # Test if there is a noise group. The noise group contains all traces and thus references the time and frequency scope of the file content
        if H5.NOISE_DATA_GROUP in file_object:
            # Get a handle on the noise group
            noise_group = file_object[H5.NOISE_DATA_GROUP]

            # get all sub group names within the noise group
            for sub_group in noise_group:
                # the noise group also includes the level profile group; process only sub groups of the level profile class
                if H5.LEVEL_PROFILE_CLASS in str(noise_group[sub_group].attrs[H5.CLASS_ATTRIBUTE][0]):
                    # recover the dataset reference handle
                    profile_dataset = noise_group[sub_group + '/' + H5.LEVEL_PROFILE_DATASET]
                    # TODO: test if level units are compatible

                    # update the counter for the number of traces on the profile
                    number_of_traces += profile_dataset.attrs[H5.NUMBER_OF_PROFILE_TRACES_ATTRIBUTE][0]

                    # Get the level profile
                    profile_array_new = pd.DataFrame(profile_dataset[:])
                    profile_array_new.columns = noise_group[sub_group + '/' + H5.FREQUENCY_DATASET][:]
                    profile_array_new.index = noise_group[sub_group + '/' + H5.LEVEL_DATASET][:]

                    # Merge the new profile into the result
                    profile_array_result = profile_array_result.add(profile_array_new, fill_value=0)

            cl.log_message("File {} processed. Profile shape: {}".format(file_name, str(profile_array_result.shape)))

    # store the merged noise level profile on the data file
    data_store = pd.HDFStore(cn.FOLDER_TO_STORE_FILES + '/' + cn.DATA_FILENAME)
    data_store[H5.NOISE_DATA_GROUP + "/" + H5.LEVEL_PROFILE_DATASET] = profile_array_result
    data_store.close()

    # output message
    cl.log_message("Finish indexing {} files".format(index_length))
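# Illustration: the merge above relies on DataFrame.add(other, fill_value=0), which
# aligns both axes (level bins on the index, frequency bins on the columns) and sums
# overlapping cells, so per-file 2-D level histograms accumulate even when their
# ranges differ. A toy run with made-up counts:
def _demo_profile_accumulation():
    # index = level bin, columns = frequency bin
    profile_a = pd.DataFrame([[1, 2], [3, 4]], index=[10, 20], columns=[100e6, 101e6])
    profile_b = pd.DataFrame([[5, 6], [7, 8]], index=[20, 30], columns=[101e6, 102e6])

    # fill_value=0 treats a cell missing from only one operand as an empty count;
    # cells missing from both operands stay NaN
    return profile_a.add(profile_b, fill_value=0)
    #     100000000.0  101000000.0  102000000.0
    # 10          1.0          2.0          NaN
    # 20          3.0          9.0          6.0
    # 30          NaN          7.0          8.0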
def _main():
    # List files on the folder
    files = [f for f in glob.glob(cn.FOLDER_TO_GET_FILES + cn.FILE_TYPE, recursive=False)]
    index_length = len(files)

    file_index = pd.DataFrame(columns=[cn.FILENAME_ATTRIBUTE,
                                       H5.INITIAL_FREQUENCY_ATTRIBUTE,
                                       H5.FINAL_FREQUENCY_ATTRIBUTE,
                                       H5.START_TIME_COARSE_ATTRIBUTE,
                                       H5.STOP_TIME_COARSE_ATTRIBUTE])

    # Create a dataframe with file names and reference information
    file_index[cn.FILENAME_ATTRIBUTE] = files
    file_index[H5.INITIAL_FREQUENCY_ATTRIBUTE] = [0.0] * index_length
    file_index[H5.FINAL_FREQUENCY_ATTRIBUTE] = [0.0] * index_length
    file_index[H5.START_TIME_COARSE_ATTRIBUTE] = [0.0] * index_length
    file_index[H5.STOP_TIME_COARSE_ATTRIBUTE] = [0.0] * index_length

    site_index = pd.DataFrame(columns=[H5.LATITUDE_MEMBER, H5.LONGITUDE_MEMBER, H5.CRFS_HOSTNAME])

    channel_index = pd.DataFrame(columns=[cn.CHANNEL_ID,
                                          cn.FILENAME_ATTRIBUTE,
                                          cn.GROUPNAME_ATTRIBUTE,
                                          H5.START_TIME_COARSE_ATTRIBUTE,
                                          H5.STOP_TIME_COARSE_ATTRIBUTE,
                                          H5.AVERAGE_CHANNEL_SAMPLE_RATE,
                                          H5.CHANNEL_EDGE_INITIAL_FREQUENCY,
                                          cn.CHANNEL_INNER_EDGE_INITIAL_FREQUENCY,
                                          H5.CHANNEL_CORE_INITIAL_FREQUENCY,
                                          H5.CHANNEL_CORE_FINAL_FREQUENCY,
                                          cn.CHANNEL_INNER_EDGE_FINAL_FREQUENCY,
                                          H5.CHANNEL_EDGE_FINAL_FREQUENCY])
    channel_row = 0

    # Loop through files collecting the required information
    for row in range(index_length):
        # Open each file
        file_name = file_index[cn.FILENAME_ATTRIBUTE][row]
        cl.log_message("Processing file {}".format(file_name))
        file_object = h5py.File(file_name, 'r')
        # TODO: Include a test to verify that the file follows the standard

        # Get the site coordinates. Works only for a single-site set of files.
        if H5.SITE_GEOLOCATION_DATASET in file_object:
            site_dataset = file_object[H5.SITE_GEOLOCATION_DATASET]
            site_index.loc[row] = [cl.Normal(), cl.Normal(), "unknown"]
            site_index.loc[row, H5.LATITUDE_MEMBER].np_set(site_dataset.attrs[H5.LATITUDE_STATISTICS_ATTRIBUTE][0])
            site_index.loc[row, H5.LONGITUDE_MEMBER].np_set(site_dataset.attrs[H5.LONGITUDE_STATISTICS_ATTRIBUTE][0])

        # Get the unit information from the logbook. Only the first equipment ID is retrieved
        if H5.LOGBOOK_DATASET in file_object:
            logbook_dataset = file_object[H5.LOGBOOK_DATASET]
            for log_entry in logbook_dataset:
                if log_entry[H5.ENTRY_TYPE_MEMBER].decode("ascii") == H5.CRFS_HOSTNAME:
                    site_index.loc[row, H5.CRFS_HOSTNAME] = log_entry[H5.ENTRY_VALUE_MEMBER].decode("ascii")
                    break

        # Test if there is a noise group. The noise group contains all traces and thus references the time and frequency scope of the file content
        if H5.NOISE_DATA_GROUP in file_object:
            # Get a handle on the noise group
            noise_group = file_object[H5.NOISE_DATA_GROUP]

            # get all sub group names within the noise group
            for sub_group in noise_group:
                # the noise group also includes the level profile group; process only sub groups of the spectrogram class
                if H5.SPECTROGRAM_CLASS in str(noise_group[sub_group].attrs[H5.CLASS_ATTRIBUTE][0]):
                    # Get the frequency reference data from the frequency axis dataset
                    frequency_dataset = noise_group[sub_group + '/' + H5.FREQUENCY_DATASET]
                    file_index.loc[row, H5.INITIAL_FREQUENCY_ATTRIBUTE] = frequency_dataset.attrs[H5.INITIAL_FREQUENCY_ATTRIBUTE][0]
                    file_index.loc[row, H5.FINAL_FREQUENCY_ATTRIBUTE] = frequency_dataset.attrs[H5.FINAL_FREQUENCY_ATTRIBUTE][0]

                    # Get the time reference data from the timestamp coarse dataset
                    timestamp_coarse_dataset = noise_group[sub_group + '/' + H5.TIMESTAMP_COARSE_DATASET]
                    file_index.loc[row, H5.START_TIME_COARSE_ATTRIBUTE] = timestamp_coarse_dataset.attrs[H5.START_TIME_COARSE_ATTRIBUTE][0]
                    file_index.loc[row, H5.STOP_TIME_COARSE_ATTRIBUTE] = timestamp_coarse_dataset.attrs[H5.STOP_TIME_COARSE_ATTRIBUTE][0]

            if H5.CHANNEL_DATA_GROUP in file_object:
                # Get a handle on the channel data group
                channel_group = file_object[H5.CHANNEL_DATA_GROUP]

                # get all sub group names within the channel group
                for sub_group in channel_group:
                    # process only sub groups of the activity profile class
                    if H5.ACTIVITY_PROFILE_CLASS in str(channel_group[sub_group].attrs[H5.CLASS_ATTRIBUTE][0]):
                        data = ["",
                                file_index.loc[row, cn.FILENAME_ATTRIBUTE],
                                sub_group,
                                file_index.loc[row, H5.START_TIME_COARSE_ATTRIBUTE],
                                file_index.loc[row, H5.STOP_TIME_COARSE_ATTRIBUTE],
                                channel_group[sub_group].attrs[H5.AVERAGE_CHANNEL_SAMPLE_RATE][0],
                                channel_group[sub_group].attrs[H5.CHANNEL_EDGE_INITIAL_FREQUENCY][0],
                                # the inner edge frequency equals the edge frequency for a single file
                                channel_group[sub_group].attrs[H5.CHANNEL_EDGE_INITIAL_FREQUENCY][0],
                                channel_group[sub_group].attrs[H5.CHANNEL_CORE_INITIAL_FREQUENCY][0],
                                channel_group[sub_group].attrs[H5.CHANNEL_CORE_FINAL_FREQUENCY][0],
                                # the inner edge frequency equals the edge frequency for a single file
                                channel_group[sub_group].attrs[H5.CHANNEL_EDGE_FINAL_FREQUENCY][0],
                                channel_group[sub_group].attrs[H5.CHANNEL_EDGE_FINAL_FREQUENCY][0]]
                        channel_index.loc[channel_row] = data
                        channel_row += 1

        # If there is no noise group
        else:
            # Issue an error message and proceed to the next file
            cl.log_message("File {} does not include reference noise data and will be ignored".format(file_name))

    # sort files by timestamp
    file_index.sort_values(by=[H5.START_TIME_COARSE_ATTRIBUTE], ascending=[True], inplace=True)
    file_index.reset_index(inplace=True, drop=True)
    # file_index.to_csv(cn.FOLDER_TO_STORE_FILES+'/'+'file_index_after.csv', index=None, header=True)

    # sort channels by core initial frequency and timestamp
    channel_index.sort_values(by=[H5.CHANNEL_CORE_INITIAL_FREQUENCY, H5.START_TIME_COARSE_ATTRIBUTE], ascending=[True, True], inplace=True)
    channel_index.reset_index(inplace=True, drop=True)
    # channel_index.to_csv(cn.FOLDER_TO_STORE_FILES+'/'+'channel_index_after.csv', index=None, header=True)

    # store the index tables created
    index_store = pd.HDFStore(cn.FOLDER_TO_STORE_FILES + '/' + cn.INDEX_FILENAME)
    index_store[cn.FILE_INDEX] = file_index
    index_store[cn.SITE_INDEX] = site_index
    index_store[cn.CHANNEL_INDEX] = channel_index
    index_store.close()

    # output message
    cl.log_message("Finish indexing {} files".format(index_length))
def _main():
    # constant to activate the demonstration plot
    PLOT_COMPARISON = True

    cl.log_message("Starting channel distance processing")

    # open file with the h5py method
    input_file_name = cn.FOLDER_TO_STORE_FILES + '/' + cn.DATA_FILENAME
    input_file = h5py.File(input_file_name, 'r')
    input_group_name = H5.CHANNEL_DATA_GROUP + "/" + cn.CHANNEL_MEAN_LEVEL_CATALOG
    input_group = input_file[input_group_name]

    # create dictionaries to store channel traces
    channel_traces = {}
    channel_frequency = {}
    # create a dataframe to store channel info
    channel_info = pd.DataFrame()

    # loop through all channels, load data into the traces dictionary and extract reference information into the channel info dataframe
    for channel_id in input_group:
        # TODO: Channels should have attributes and those should be verified to confirm the proper function
        one_channel = input_group[channel_id][1]
        channel_width = len(one_channel)

        # ignore channels that are too small
        if channel_width > cn.MINIMUM_CHANNEL_WIDTH:
            # find the maximum
            maximum_level = np.max(one_channel)
            # locate the index for the maximum
            index_of_maximum = np.argmax(one_channel)
            # handle the multiple-maxima case
            if isinstance(index_of_maximum, np.ndarray):
                index_of_maximum = np.mean(index_of_maximum)
            frequency_at_peak = input_group[channel_id][0][index_of_maximum]

            # transform the level to a relative scale where the maximum equals 1
            channel_traces[channel_id] = np.array(one_channel / maximum_level, dtype='float64')
            channel_frequency[channel_id] = (np.array(input_group[channel_id][0], dtype='float64') - frequency_at_peak) / 1000
            channel_info.loc[channel_id, 'Max Index'] = index_of_maximum
            channel_info.loc[channel_id, 'Bin Width'] = channel_width
            channel_info.loc[channel_id, 'Min Value'] = np.min(channel_traces[channel_id])
            channel_info.loc[channel_id, 'Max Value'] = maximum_level
            channel_info.loc[channel_id, 'Frequency Max Value'] = frequency_at_peak

    # close the file since all data has been loaded into memory
    input_file.close()

    # create a list of keys used for the channel trace designation on the corresponding dictionary
    channel_list = list(channel_traces.keys())

    # create an array to store the condensed distance matrix
    number_of_channels = len(channel_list)
    condensed_distance = np.empty(int(round(((number_of_channels ** 2) - number_of_channels) / 2, 0)))
    c_d_index = 0

    # loop through channels to compute the distance between them
    for ref_channel_index in range(0, number_of_channels):
        # get key and trace from index
        ref_channel_id = channel_list[ref_channel_index]
        cl.log_message("Starting channel {}".format(ref_channel_id))

        # defines the lower limit of the reference channel level range
        min_ref_level = channel_info.loc[ref_channel_id, 'Min Value']
        # PLOT_THRESHOLD = 0.3
        # plot_this = (min_ref_level < PLOT_THRESHOLD)

        # loop through the remaining channels and compute the distance
        for target_channel_index in range(ref_channel_index + 1, number_of_channels):
            if PLOT_COMPARISON:
                percentage_string = "\r{}%".format(round(100.0 * (target_channel_index / number_of_channels), ndigits=1))
                print(percentage_string, end="\r", flush=True)

            # get key and trace from index
            target_channel_id = channel_list[target_channel_index]
            target_channel_trace = channel_traces[target_channel_id]

            # defines the lower limit of the target channel level range
            min_target_level = channel_info.loc[target_channel_id, 'Min Value']
            # plot_that = (min_target_level < PLOT_THRESHOLD)
            # plot_all = plot_this and plot_that
            # if (target_channel_id == '462574') and (ref_channel_id == '451011'):
            #     print('gotcha')

            # Cut the channels on the level axis such that both cover the same range. Equivalent to raising the noise floor
            # If the reference channel has the larger minimum level
            if min_ref_level > min_target_level:
                # the peak value of the reference channel is lower, closer to the noise level;
                # cut the target channel to the same range as the reference channel and copy the reference to the work variable
                target_channel_trace = target_channel_trace[target_channel_trace[:] >= min_ref_level]
                work_ref_channel_trace = channel_traces[ref_channel_id]
            elif min_ref_level < min_target_level:
                # else, if the reference channel has the smaller minimum level,
                # cut the reference channel to the same range as the target channel and leave the target channel as it is
                work_ref_channel_trace = channel_traces[ref_channel_id][channel_traces[ref_channel_id][:] >= min_target_level]
            else:
                # else, both are the same; just update the work variable reference
                work_ref_channel_trace = channel_traces[ref_channel_id]

            # After the level cut, assign the first trace on the correlation process to be the longest.
            # The correlation values are not affected by the order, but their relative indexing to the result is.
            # For the implemented alignment method, the first trace must be the largest
            if work_ref_channel_trace.size > target_channel_trace.size:
                smaller_trace = target_channel_trace.view()
                larger_trace = work_ref_channel_trace.view()
                if PLOT_COMPARISON:
                    figure1_name = "Comparison_{}-{} ".format(ref_channel_id, target_channel_id)
                    plt.figure(figure1_name)
                    sp1 = plt.subplot(311)
                    plt.title('Channel [{}] (wider)'.format(ref_channel_id))
                    plt.ylabel('Norm. Level')
                    plt.plot(channel_traces[ref_channel_id], 'r-o')
                    plt.setp(sp1.get_xticklabels(), visible=False)
                    sp2 = plt.subplot(312, sharex=sp1, sharey=sp1)
                    plt.title('Channel [{}] (narrower)'.format(target_channel_id))
                    plt.ylabel('Norm. Level')
                    plt.plot(channel_traces[target_channel_id], 'b-^')
                    plt.setp(sp2.get_xticklabels(), visible=False)
            else:
                smaller_trace = work_ref_channel_trace.view()
                larger_trace = target_channel_trace.view()
                if PLOT_COMPARISON:
                    figure1_name = "Comparison_{}-{} ".format(ref_channel_id, target_channel_id)
                    plt.figure(figure1_name)
                    sp1 = plt.subplot(311)
                    plt.title('Channel [{}] (wider)'.format(target_channel_id))
                    plt.ylabel('Norm. Level')
                    plt.plot(channel_traces[target_channel_id], 'r-o')
                    plt.setp(sp1.get_xticklabels(), visible=False)
                    sp2 = plt.subplot(312, sharex=sp1, sharey=sp1)
                    plt.title('Channel [{}] (narrower)'.format(ref_channel_id))
                    plt.ylabel('Norm. Level')
                    plt.plot(channel_traces[ref_channel_id], 'b-^')
                    plt.setp(sp2.get_xticklabels(), visible=False)
            """
            larger_trace = work_ref_channel_trace.view()
            smaller_trace = target_channel_trace.view()
            """

            # computes the cross correlation between channels
            correlation = signal.correlate(larger_trace, smaller_trace, mode='full', method='fft')
            peak_correlation_index = np.argmax(correlation)

            # compute the length to cut from the beginning of one of the traces so as to align both at maximum correlation
            total_trace_shift = peak_correlation_index - (smaller_trace.size - 1)
            size_difference = larger_trace.size - smaller_trace.size

            # if the total shift is negative
            if total_trace_shift < 0:
                # the smaller trace needs to be moved to the left;
                # cut the beginning of the smaller trace by the total shift
                smaller_trace = smaller_trace[-total_trace_shift:]
                # cut the larger trace to the same size as the smaller
                larger_trace = larger_trace[0:(larger_trace.size - size_difference + total_trace_shift)]
            # else, the smaller trace needs to be moved to the right
            else:
                end_offset = size_difference - total_trace_shift
                # if the shift is equal to or smaller than the difference in size of the traces
                if end_offset >= 0:
                    # cut the beginning of the larger trace by the required shift
                    # and cut its end to match the sizes
                    larger_trace = larger_trace[total_trace_shift:larger_trace.size - end_offset]
                # else, the smaller trace needs to be moved to the right but would overflow the larger trace
                else:
                    # cut the smaller trace by the difference between the shift and the size difference
                    smaller_trace = smaller_trace[0:smaller_trace.size + end_offset]
                    # cut the beginning of the larger trace to match the smaller trace
                    larger_trace = larger_trace[size_difference - end_offset:]

            # Compute the error. Uses RMSE as a normalized approximation of the euclidean distance (RSSE).
            # The use of the mean is necessary due to the variable number of bins
            rms_distance = np.sqrt(np.mean((smaller_trace - larger_trace) ** 2))
            channel_info.loc[ref_channel_id, target_channel_id] = rms_distance
            condensed_distance[c_d_index] = rms_distance
            c_d_index += 1

            if PLOT_COMPARISON:
                sp3 = plt.subplot(313, sharex=sp1, sharey=sp1)
                plt.title('Traces aligned and cropped for comparison')
                plt.ylabel('Norm. Level')
                plt.xlabel('Frequency bin index')
                plt.plot(larger_trace, 'r-o', smaller_trace, 'b-^')
                plt.setp(sp3.get_xticklabels(), visible=True)
                plt.tight_layout()
                figure_file_name = "./Images/Compare/" + figure1_name + ".png"
                plt.savefig(figure_file_name)
                # plt.show()
                plt.close(figure1_name)
            """
            if ref_channel_id == '450188':  # '466862':
                half_trace_length = int(work_ref_channel_trace.size/2)
                if 2*half_trace_length < work_ref_channel_trace.size:
                    ref_index = np.arange(-half_trace_length, half_trace_length+1, 1)
                else:
                    ref_index = np.arange(-half_trace_length, half_trace_length, 1)
                half_trace_length = int(target_channel_trace.size/2)
                if 2*half_trace_length < target_channel_trace.size:
                    target_index = np.arange(-half_trace_length, half_trace_length+1, 1)
                else:
                    target_index = np.arange(-half_trace_length, half_trace_length, 1)
                half_trace_length = int(correlation.size/2)
                if 2*half_trace_length < correlation.size:
                    cor_index = np.arange(-half_trace_length, half_trace_length+1, 1)
                else:
                    cor_index = np.arange(-half_trace_length, half_trace_length, 1)
                ref_index = np.arange(0, larger_trace.size, 1)
                target_index = np.arange(0, target_channel_trace.size, 1)
                cor_index = np.arange(0, correlation.size, 1)
                plt.figure(1)
                plt.subplot(211)
                plt.plot(larger_trace, 'r-', smaller_trace, 'b-')
                plt.subplot(212)
                plt.plot(correlation, 'g-')
                plt.show()
                plt.plot(larger_trace, 'r-', smaller_trace, 'b-', correlation/np.max(correlation), 'g-')
                plt.show()

            if is_it_autocorrelation:
                autocorrelation = np.max(correlation)-np.min(correlation)
                is_it_autocorrelation = False
                channel_info.loc[ref_channel_id, target_channel_id] = 1.0
            else:
                # store the relative correlation peak as reference for the channel similarity
                channel_info.loc[ref_channel_id, target_channel_id] = (np.max(correlation)-np.min(correlation))/autocorrelation
            """

    print("\n")

    # perform grouping by the distance and plot dendrograms
    NUMBER_OF_GROUPS = 6
    figure2_name = "Dendrogram cut p={}".format(NUMBER_OF_GROUPS)
    plt.figure(figure2_name, figsize=(8, 6), dpi=80, frameon=False)
    linkage_matrix = linkage(condensed_distance, method="complete", optimal_ordering=True)
    cut_dendo = dendrogram(linkage_matrix, labels=channel_list, truncate_mode='lastp', p=NUMBER_OF_GROUPS,
                           leaf_rotation=90., leaf_font_size=9., show_contracted=True)
    figure_file_name = "./Images/" + figure2_name + ".png"
    plt.savefig(figure_file_name)

    figure3_name = "Dendrogram Complete"
    plt.figure(figure3_name, figsize=(8, 6), dpi=80, frameon=False)
    complete_dendo = dendrogram(linkage_matrix, labels=channel_list, leaf_rotation=90., leaf_font_size=8.)
    figure_file_name = "./Images/" + figure3_name + ".png"
    plt.savefig(figure_file_name)

    # create a description of the channels on each group that composes a specific branch
    leaves = complete_dendo['ivl']
    branch_id = 0
    branch_dict = {branch_id: []}
    number_of_leaves_in_branch = int(cut_dendo['ivl'][branch_id][1:-1]) - 1
    number_of_leaves_already_considered = 0
    for leave_index, channel_id in enumerate(leaves):
        if leave_index > number_of_leaves_in_branch + number_of_leaves_already_considered:
            branch_id += 1
            number_of_leaves_in_branch = int(cut_dendo['ivl'][branch_id][1:-1]) - 1
            number_of_leaves_already_considered = leave_index
            branch_dict[branch_id] = []
        branch_dict[branch_id] = branch_dict[branch_id] + [channel_id]

    classified_groups = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in branch_dict.items()]))
    cl.table_dataframe(classified_groups)

    plt.show()
    plt.close(figure2_name)
    plt.close(figure3_name)

    cl.table_dataframe(channel_info)

    channel_ref_index = 0
    group_reference = 1
    """
    # identify the channel within each group that has the highest signal-to-noise ratio to represent the group
    for group_size_string in cut_dendo['ivl']:
        group_size = int(group_size_string[1:-1])
        channel_id = complete_dendo['ivl'][channel_ref_index]
        minimum_channel_level = channel_info.loc[channel_id, 'Min Value']
        channel_group_reference = channel_id
        for channel_index in range(1, group_size):
            channel_ref_index += 1
            channel_id = complete_dendo['ivl'][channel_ref_index]
            current_channel_level = channel_info.loc[channel_id, 'Min Value']
            if minimum_channel_level > current_channel_level:
                minimum_channel_level = current_channel_level
                channel_group_reference = channel_id
        channel_ref_index += 1
        plt.figure("Channel {}. Reference to group {}".format(channel_group_reference, group_reference))
        plt.plot(channel_frequency[channel_group_reference],
                 channel_traces[channel_group_reference] * channel_info.loc[channel_group_reference, 'Max Value'])
        plt.ylabel("Level [dB\u03BCV/m]")
        plt.xlabel("Frequency [kHz]")
        group_reference += 1
    # dendrogram(linkage_matrix, labels=channel_list, leaf_rotation=90., leaf_font_size=12.)
    """

    # store the dataframe with the data
    output_file_name = cn.FOLDER_TO_STORE_FILES + '/' + cn.DATA_FILENAME
    file_data_store = pd.HDFStore(output_file_name)
    output_group_name = H5.CHANNEL_DATA_GROUP + "/" + cn.INTER_CHANNEL_DISTANCES
    file_data_store[output_group_name] = channel_info
    output_group_name = H5.CHANNEL_DATA_GROUP + "/" + cn.INTER_CHANNEL_DISTANCES_CONDENSED_MATRIX
    file_data_store[output_group_name] = pd.DataFrame(condensed_distance)
    file_data_store.close()

    cl.log_message("Finish processing")
def _main():
    index_store = pd.HDFStore(cn.FOLDER_TO_STORE_FILES + '/' + cn.INDEX_FILENAME)
    file_index = index_store[cn.FILE_INDEX]
    site_index = index_store[cn.SITE_INDEX]
    channel_index = index_store[cn.CHANNEL_INDEX]

    # update the store with the sorted file_index
    index_store[cn.FILE_INDEX] = file_index

    # create a new dataframe that will store the consolidated data for the detected channels
    channel_data = pd.DataFrame(columns=[cn.CHANNEL_ID,
                                         H5.CHANNEL_EDGE_INITIAL_FREQUENCY,
                                         cn.CHANNEL_INNER_EDGE_INITIAL_FREQUENCY,
                                         H5.CHANNEL_CORE_INITIAL_FREQUENCY,
                                         H5.CHANNEL_CORE_FINAL_FREQUENCY,
                                         cn.CHANNEL_INNER_EDGE_FINAL_FREQUENCY,
                                         H5.CHANNEL_EDGE_FINAL_FREQUENCY])

    # initialize the consolidated channel dataframe
    channel_row = 0
    input_file_name = channel_index.loc[channel_row, cn.FILENAME_ATTRIBUTE]
    center_frequency = round((channel_index.loc[channel_row, H5.CHANNEL_CORE_FINAL_FREQUENCY] +
                              channel_index.loc[channel_row, H5.CHANNEL_CORE_INITIAL_FREQUENCY]) / 2000)
    cl.log_message("Starting channel {} at {}kHz with file {}".format(channel_row, center_frequency, input_file_name))

    current_channel_id = "Channel {}".format(channel_row)
    data = [current_channel_id,
            channel_index.iloc[channel_row, 6],   # edge initial frequency
            channel_index.iloc[channel_row, 7],   # inner edge initial frequency
            channel_index.iloc[channel_row, 8],   # core initial frequency
            channel_index.iloc[channel_row, 9],   # core final frequency
            channel_index.iloc[channel_row, 10],  # inner edge final frequency
            channel_index.iloc[channel_row, 11]]  # edge final frequency
    channel_data.loc[channel_row] = data
    channel_index.loc[channel_row, cn.CHANNEL_ID] = current_channel_id

    # loop through all channel information from the different files
    index_length = len(channel_index.index)
    for row in range(1, index_length, 1):
        # test if the channel core on the consolidated list 'd' (channel_data at channel_row) intersects the channel on the index file 'i' (channel_index at row), or the other way around
        index_core_first = channel_index.loc[row, H5.CHANNEL_CORE_INITIAL_FREQUENCY] <= channel_data.loc[channel_row, H5.CHANNEL_CORE_INITIAL_FREQUENCY]
        data_core_first = channel_data.loc[channel_row, H5.CHANNEL_CORE_INITIAL_FREQUENCY] <= channel_index.loc[row, H5.CHANNEL_CORE_INITIAL_FREQUENCY]
        i_inside_d = index_core_first and (channel_data.loc[channel_row, H5.CHANNEL_CORE_INITIAL_FREQUENCY] <= channel_index.loc[row, H5.CHANNEL_CORE_FINAL_FREQUENCY])
        d_inside_i = data_core_first and (channel_index.loc[row, H5.CHANNEL_CORE_INITIAL_FREQUENCY] <= channel_data.loc[channel_row, H5.CHANNEL_CORE_FINAL_FREQUENCY])

        # if they do intersect, merge the channel information
        if i_inside_d or d_inside_i:
            # link the channel index with the channel data
            channel_index.loc[row, cn.CHANNEL_ID] = current_channel_id

            # update the channel data boundaries. The core is the intersection; the edge is the union
            # if the stored edge begin is larger than the new edge
            if channel_data.iloc[channel_row, 1] > channel_index.iloc[row, 6]:
                # update the edge begin (move the edge to the left, expand)
                channel_data.iloc[channel_row, 1] = channel_index.iloc[row, 6]
            # if the stored inner edge begin is smaller than the new inner edge
            if channel_data.iloc[channel_row, 2] < channel_index.iloc[row, 7]:
                # update the inner edge begin (move the edge to the right, contract).
                # There is no risk of edges crossing since the core lies in between, a condition guaranteed by the detection algorithm
                channel_data.iloc[channel_row, 2] = channel_index.iloc[row, 7]
            # if the stored core begin is lower than the new core
            if channel_data.iloc[channel_row, 3] < channel_index.iloc[row, 8]:
                # only if the new core begin is still smaller than the core end. Necessary to avoid a core equal to or smaller than zero
                if channel_index.iloc[row, 8] < channel_data.iloc[channel_row, 4]:
                    # update the core begin (move the core to the right, contract)
                    channel_data.iloc[channel_row, 3] = channel_index.iloc[row, 8]
            # if the stored core end is higher than the new core
            if channel_data.iloc[channel_row, 4] > channel_index.iloc[row, 9]:
                # only if the new core end is still larger than the core begin
                if channel_index.iloc[row, 9] > channel_data.iloc[channel_row, 3]:
                    # update the core end (move the core to the left, contract)
                    channel_data.iloc[channel_row, 4] = channel_index.iloc[row, 9]
            # if the stored inner edge end is larger than the new inner edge
            if channel_data.iloc[channel_row, 5] > channel_index.iloc[row, 10]:
                # update the inner edge end (move the edge to the left, contract)
                channel_data.iloc[channel_row, 5] = channel_index.iloc[row, 10]
            # if the stored edge end is smaller than the new edge
            if channel_data.iloc[channel_row, 6] < channel_index.iloc[row, 11]:
                # update the edge end (move the edge to the right, expand)
                channel_data.iloc[channel_row, 6] = channel_index.iloc[row, 11]

            input_file_name = channel_index.loc[row, cn.FILENAME_ATTRIBUTE]
            center_frequency = round((channel_data.loc[channel_row, H5.CHANNEL_CORE_FINAL_FREQUENCY] +
                                      channel_data.loc[channel_row, H5.CHANNEL_CORE_INITIAL_FREQUENCY]) / 2000)
            cl.log_message(" | Merged channel {} at {}kHz with file {}".format(row, center_frequency, input_file_name))
        # if they do not intersect
        else:
            # a new channel needs to be assigned
            channel_row += 1
            current_channel_id = "Channel {}".format(channel_row)
            data = [current_channel_id,
                    channel_index.iloc[row, 6],
                    channel_index.iloc[row, 7],
                    channel_index.iloc[row, 8],
                    channel_index.iloc[row, 9],
                    channel_index.iloc[row, 10],
                    channel_index.iloc[row, 11]]
            channel_data.loc[channel_row] = data
            channel_index.loc[row, cn.CHANNEL_ID] = current_channel_id

            input_file_name = channel_index.loc[row, cn.FILENAME_ATTRIBUTE]
            center_frequency = round((channel_data.loc[channel_row, H5.CHANNEL_CORE_FINAL_FREQUENCY] +
                                      channel_data.loc[channel_row, H5.CHANNEL_CORE_INITIAL_FREQUENCY]) / 2000)
            cl.log_message("Starting channel {} at {}kHz with file {}".format(channel_row, center_frequency, input_file_name))

    # loop through the channel data and reassign names based on the new center frequency in kHz, rounded to integer
    cl.log_message("Starting channel renaming")
    index_length = len(channel_data.index)
    for row in range(0, index_length, 1):
        center_frequency = round((channel_data.loc[row, H5.CHANNEL_CORE_FINAL_FREQUENCY] +
                                  channel_data.loc[row, H5.CHANNEL_CORE_INITIAL_FREQUENCY]) / 2000)
        current_channel_id = "{:.0f}".format(center_frequency)
        channel_index.replace(to_replace=channel_data.loc[row, cn.CHANNEL_ID], value=current_channel_id, inplace=True)
        channel_data.loc[row, cn.CHANNEL_ID] = current_channel_id
        cl.log_message("Channel {} renamed at {}kHz".format(row, center_frequency))
        """
        print("{} - {}: {:.0f}, {:.0f}, {:.0f}, {:.0f}".format(row, channel_data.loc[row, cn.CHANNEL_ID],
              channel_data.loc[row, H5.CHANNEL_EDGE_INITIAL_FREQUENCY], channel_data.loc[row, H5.CHANNEL_CORE_INITIAL_FREQUENCY],
              channel_data.loc[row, H5.CHANNEL_CORE_FINAL_FREQUENCY], channel_data.loc[row, H5.CHANNEL_EDGE_FINAL_FREQUENCY]))
        """

    # channel_data.to_csv(cn.FOLDER_TO_STORE_FILES+'/'+'channel_data.csv', index=None, header=True)
    index_store[cn.CHANNEL_DATA_TABLE] = channel_data
    index_store[cn.CHANNEL_INDEX] = channel_index
    index_store.close()

    cl.log_message("Finish data indexing")
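# Illustration: the merge decision above is the standard interval-overlap test; two
# closed intervals [a0, a1] and [b0, b1] intersect iff a0 <= b1 and b0 <= a1. The
# script just splits the test into the two "which core starts first" cases. A minimal
# form of the predicate (frequency values below are hypothetical):
def _cores_intersect(a_initial, a_final, b_initial, b_final):
    # True when closed intervals [a_initial, a_final] and [b_initial, b_final] overlap
    return a_initial <= b_final and b_initial <= a_final

# _cores_intersect(450e3, 460e3, 455e3, 465e3)  -> True: partial overlap, merge
# _cores_intersect(450e3, 460e3, 461e3, 470e3)  -> False: disjoint cores, new channel
# _cores_intersect(450e3, 460e3, 452e3, 458e3)  -> True: containment, merge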
def _main():
    # open the file and get a list of the channel spectrogram groups. Uses h5py for better efficiency
    data_store_file = h5py.File(cn.FOLDER_TO_STORE_FILES + '/' + cn.DATA_FILENAME, 'r')
    channel_spectrogram_list = list(data_store_file[H5.CHANNEL_DATA_GROUP].keys())
    data_store_file.close()

    # create an array with the bin edges to be used on the histogram
    profile_histogram_bins = np.arange(1, 8, 0.05)
    numpy_histogram_bins = np.r_[-np.inf, profile_histogram_bins, np.inf]

    # create an empty dataframe to store the results
    channel_distances = pd.DataFrame()

    for spectrogram_group_name in channel_spectrogram_list:
        # reopen the file with pandas HDF
        data_store_file = pd.HDFStore(cn.FOLDER_TO_STORE_FILES + '/' + cn.DATA_FILENAME)

        # Test if the dataset is of the spectrogram type
        if H5.EM_SPECTRUM_DATA_GROUP in spectrogram_group_name:
            # get the channel ID
            channel_id = spectrogram_group_name.split(H5.EM_SPECTRUM_DATA_GROUP)[1]
            # get the dataframe
            channel_traces = data_store_file[H5.CHANNEL_DATA_GROUP + '/' + spectrogram_group_name]
            frequency_at_peak = channel_traces.idxmax(axis=1).mean()
            number_of_time_samples = channel_traces.shape[0]

            if number_of_time_samples > cn.MINIMUM_NUMBER_SAMPLES_FOR_INNER_ANALYSIS:
                # reduce the number of traces to make the computation viable
                if number_of_time_samples * channel_traces.shape[1] > cn.MAXIMUM_NUMBER_DATAPOINTS_FOR_INNER_ANALYSIS:
                    number_of_time_samples = int(round(cn.MAXIMUM_NUMBER_DATAPOINTS_FOR_INNER_ANALYSIS / channel_traces.shape[1], 0))
                    channel_traces = channel_traces.iloc[0:number_of_time_samples, :]

                figure_name = "Spectrogram channel {}".format(channel_id)
                plt.figure(figure_name)
                plt.subplot(121)
                plt.title("Spectrogram for channel {}".format(channel_id))
                xy_array = channel_traces.to_numpy(dtype='float32')
                frequency_axis = channel_traces.columns.to_numpy(dtype='float64')
                x_axis = (frequency_axis - frequency_at_peak) / 1000.0
                y_axis = channel_traces.index.to_numpy(dtype='float64')
                y_axis = y_axis - y_axis.min()
                plt.pcolormesh(x_axis, np.arange(0, xy_array.shape[0]), xy_array, cmap='CMRmap_r')
                plt.xlabel("Frequency [kHz]")
                plt.ylabel("Index")

                # flatten the dataframe to a series
                flatten_trace = channel_traces.to_numpy(dtype='float32').flatten()
                trace_bin_width = channel_traces.shape[1]

                # compute the matrix profile using a window of exactly one trace
                profile, profile_index = matrixProfile.stomp(flatten_trace, trace_bin_width)

                plt.subplot(122)
                plt.title("Matrix Profile")
                plt.ylim((0, profile.size))
                plt.yticks(ticks=np.arange(0, profile.size, profile.size / number_of_time_samples), labels='')
                plt.grid(which='major', axis='y')
                plt.plot(profile, np.arange(0, profile.size))

                combined_dataframe = pd.Series(flatten_trace).to_frame()
                combined_dataframe['profile'] = np.append(profile, np.zeros(trace_bin_width - 1) + np.nan)

                # exclusion zone of one trace around each discord
                ex_zone = trace_bin_width
                # TODO: instead of using a fixed number, should use a measure above the noise
                number_of_modes = math.ceil(number_of_time_samples * cn.PEAK_SAMPLE_RATIO)

                # get a maximum of one anomaly for each trace
                profile_peak = discords(combined_dataframe['profile'], ex_zone, k=number_of_modes + 1)
                # get the peaks into a dataframe with the corresponding matrix profile values
                profile_peak_df = combined_dataframe.loc[profile_peak, 'profile']
                # select peaks that have large profile values considering the defined threshold
                # profile_peak_df = profile_peak_df[profile_peak_df.loc[:] > cn.MPROFILE_NOISE_THRESHOLD]
                profile_peak_df = profile_peak_df.reset_index()
                # compute the corresponding trace index based on the flattened index
                profile_peak_df['trace_index'] = round(profile_peak_df['index'] / trace_bin_width, 0)

                # annotate the spectrogram with the discord positions
                order = 1
                for one_peak_index in profile_peak_df['trace_index']:
                    plt.subplot(121)
                    plot_name = "{}".format(order)
                    x_pos = x_axis.max()
                    y_pos = int(one_peak_index)
                    arrow_end_pos = x_pos + ((x_pos - x_axis.min()) / 25)
                    plt.annotate(plot_name,
                                 xy=(x_pos, y_pos), xycoords="data",
                                 xytext=(arrow_end_pos, y_pos), textcoords='data',
                                 arrowprops=dict(arrowstyle="->", connectionstyle="arc3"))
                    order += 1

                figure_file_name = "./Images/" + figure_name + ".png"
                plt.savefig(figure_file_name)
                plt.close(figure_name)

                # TODO: Use profile_peak_df to split the spectrogram into two subchannels and compute a new profile density

                # store the distance information as reference to the channel
                channel_distance_descriptor = pd.DataFrame()
                channel_distance_descriptor.loc[channel_id, cn.INNER_DISTANCE_MAX] = profile_peak_df.loc[0, 'profile']
                channel_distance_descriptor.loc[channel_id, cn.INNER_DISTANCE_MEAN] = profile_peak_df.mean(axis='rows')['profile']

                # compute the histogram
                profile_histogram, discarded_bins = np.histogram(profile, bins=numpy_histogram_bins, density=False)
                # add the overflow to the last bin
                profile_histogram[-2] += profile_histogram[-1]
                profile_histogram = profile_histogram[0:-1]

                # plot the histogram
                histogram_maximum = profile_histogram.max()
                figure_name = "Matrix Profile histogram for channel {}".format(channel_id)
                plt.figure(figure_name)
                plt.ylim(0, histogram_maximum * 1.05)
                plt.bar(profile_histogram_bins, height=profile_histogram, width=1)
                plt.plot(profile_histogram_bins, profile_histogram)
                figure_file_name = "./Images/" + figure_name + ".png"
                plt.savefig(figure_file_name)
                plt.close(figure_name)

                # convert the np array with the histogram to a dataframe, using the bin values as column names
                profile_histogram_df = pd.DataFrame([profile_histogram], index=[channel_id], columns=profile_histogram_bins)

                # merge the dataframes to get the complete channel inner distance profile information into the same row
                channel_distance_descriptor = channel_distance_descriptor.join(profile_histogram_df)
                channel_distances = channel_distances.append(channel_distance_descriptor)

                # close the file used to read the data
                data_store_file.close()

                # store the dataframe with the data. Since this is a slow process, it is stored at each update to allow safe interruption
                output_file_name = cn.FOLDER_TO_STORE_FILES + '/' + cn.DATA_FILENAME
                file_data_store = pd.HDFStore(output_file_name)
                output_group_name = H5.CHANNEL_DATA_GROUP + "/" + cn.CHANNEL_DISTANCES_DATA_GROUP
                file_data_store[output_group_name] = channel_distances
                file_data_store.close()

                cl.log_message("Processed channel {}. Inner distance of {}".format(channel_id, channel_distances.loc[channel_id, cn.INNER_DISTANCE_MAX]))
                # plt.show()
            else:
                # close the file used to read the data
                data_store_file.close()

                channel_distances.loc[channel_id, :] = np.NaN

                output_file_name = cn.FOLDER_TO_STORE_FILES + '/' + cn.DATA_FILENAME
                file_data_store = pd.HDFStore(output_file_name)
                output_group_name = H5.CHANNEL_DATA_GROUP + "/" + cn.CHANNEL_DISTANCES_DATA_GROUP
                file_data_store[output_group_name] = channel_distances
                file_data_store.close()

                cl.log_message("Processed channel {}. Too few traces to evaluate inner distance. # traces: {}".format(channel_id, number_of_time_samples))
        else:
            # not a spectrogram group; close the file and move on
            data_store_file.close()
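# Illustration: the inner-distance measure above flattens the spectrogram row by row
# and computes a matrix profile with a window of exactly one trace, so each profile
# value is the z-normalized distance from one trace-sized subsequence to its nearest
# neighbour; discords then pick the most anomalous positions. A toy run with the same
# matrixprofile-ts calls on synthetic data (shapes and values are illustrative, and
# the library API is assumed from its usage in the script above):
def _demo_trace_discords():
    rng = np.random.default_rng(0)

    # 50 near-identical "traces" of 32 bins each, one distorted trace at index 25
    traces = np.tile(np.sin(np.linspace(0, np.pi, 32)), (50, 1)) + rng.normal(0, 0.05, (50, 32))
    traces[25, :16] += 1.0  # inject a shape anomaly (a constant offset alone would be removed by z-normalization)

    flat = traces.flatten()
    window = traces.shape[1]  # one full trace per subsequence

    # profile[i]: distance from the subsequence starting at i to its nearest neighbour
    profile, profile_index = matrixProfile.stomp(flat, window)

    # one-trace exclusion zone, as in the script (which additionally NaN-pads the
    # profile to align it with the flattened series)
    anomalies = discords(profile, window, k=3)
    return np.array(anomalies) // window  # discord positions in trace units, near trace 25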
def _main():
    index_store = pd.HDFStore(cn.FOLDER_TO_STORE_FILES + '/' + cn.INDEX_FILENAME)
    channel_data = index_store[cn.CHANNEL_DATA_TABLE]
    index_length = len(channel_data.index)
    channel_index = index_store[cn.CHANNEL_INDEX]
    channel_index.set_index(cn.CHANNEL_ID, inplace=True)
    index_store.close()

    # Loop through the grouped channels collecting the required information
    for row in range(index_length):
        # create an empty dataframe to store the resulting profile
        profile_array_result = pd.DataFrame()
        number_of_traces = 0

        # Get the channel ID
        channel_id = channel_data.loc[row, cn.CHANNEL_ID]
        # Select files that contain the designated channel
        files_with_channel = channel_index[channel_index.index == channel_id]
        # Get the number of files to be processed
        channel_index_length = len(files_with_channel.index)

        # initialize the boolean variable that signals the existence of at least one profile
        shall_remove_channel_reference = True
        # initialize the variable that stores the total number of profiles, to avoid scope limitations due to conditional initialization
        number_of_traces_sum = 0

        # loop through files that are marked with the indicated channel
        for channel_row in range(channel_index_length):
            # get the file and group for the channel data
            input_file_name = files_with_channel.iloc[channel_row, 0]
            input_group_name = files_with_channel.iloc[channel_row, 1]
            input_group_name = input_group_name.replace(H5.ACTIVITY_PROFILE_DATA_GROUP, H5.LEVEL_PROFILE_DATA_GROUP)

            # open the file
            input_file_object = h5py.File(input_file_name, 'r')
            # TODO: Include a test to verify that the file follows the standard

            # Get a handle on the group
            input_profile_group = input_file_object[H5.CHANNEL_DATA_GROUP + "/" + input_group_name]
            # recover the dataset reference handle
            profile_dataset = input_profile_group[H5.LEVEL_PROFILE_DATASET]
            # TODO: test if level units are compatible

            # update the counter for the number of traces on the profile
            number_of_traces = profile_dataset.attrs[H5.NUMBER_OF_PROFILE_TRACES_ATTRIBUTE][0]
            if number_of_traces > 0:
                number_of_traces_sum += number_of_traces

                # Get the level profile
                profile_array_new = pd.DataFrame(profile_dataset[:])
                profile_array_new.columns = input_profile_group[H5.FREQUENCY_DATASET][:]
                profile_array_new.index = input_profile_group[H5.LEVEL_DATASET][:]

                # Merge the new profile into the result
                profile_array_result = profile_array_result.add(profile_array_new, fill_value=0.0)
                profile_array_result.fillna(0, inplace=True)
                shall_remove_channel_reference = False
            else:
                cl.log_message("File {} has no profile data".format(input_file_name))
                # TODO: Delete from the index

            cl.log_message("Processed {}/{}. Profile shape: {}".format(input_file_name, input_group_name, str(profile_array_result.shape)))

        # If no profile at all is stored for the channel. May happen if the channel was created due to a database reference
        if shall_remove_channel_reference:
            cl.log_message("No profile for channel {}/{}".format(input_file_name, input_group_name))
        else:
            # store the dataframe with the merged level profile
            output_file_name = cn.FOLDER_TO_STORE_FILES + '/' + cn.DATA_FILENAME
            file_data_store = pd.HDFStore(output_file_name)
            output_group_name = H5.CHANNEL_DATA_GROUP + "/" + H5.LEVEL_PROFILE_DATA_GROUP + channel_id
            file_data_store[output_group_name] = profile_array_result
            file_data_store.close()

            # store the attributes on the group where the dataframe is stored
            output_file_object = h5py.File(output_file_name, 'a')
            output_file_object[output_group_name].attrs[H5.NUMBER_OF_PROFILE_TRACES_ATTRIBUTE] = number_of_traces_sum
            output_file_object.close()

    # output message
    cl.log_message("Finish indexing {} channels".format(index_length))
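# Illustration: attaching the trace-count attribute works because pandas HDFStore
# writes each key as an ordinary HDF5 group, so the same file can be reopened with
# h5py and annotated in place. A minimal round trip (file and key names are
# illustrative):
def _demo_hdfstore_attrs(file_name='example.h5'):
    profile = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]], columns=[100e6, 101e6])

    # write the dataframe with pandas, then annotate its group with h5py
    with pd.HDFStore(file_name) as store:
        store['channel_data/level_profile_450123'] = profile

    with h5py.File(file_name, 'a') as h5_file:
        h5_file['channel_data/level_profile_450123'].attrs['number_of_profile_traces'] = 42

    with h5py.File(file_name, 'r') as h5_file:
        return h5_file['channel_data/level_profile_450123'].attrs['number_of_profile_traces']  # 42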