def test_save_to_msgpack():
    """Preprocess the full SIMAR series and save it to msgpack if gap-free.

    The series is read, passed through ``erase_null_values`` (method
    ``'all'``) and ``fill_missing_values`` (interpolation, ``'nearest'``),
    and written with ``save_to_msgpack`` only when ``find_missing_values``
    reports an empty result.
    """
    simar_file = os.path.join('simar', 'SIMAR_1052046')
    output_name = 'full_simar_preprocessed.msg'
    # NOTE(review): machine-specific absolute path; the save step only works
    # where this directory exists.
    output_dir = 'D:\\REPOSITORIO GIT\\protocol_project\\data\\intermediate_files'

    # Read simar
    raw, _ = read.simar(simar_file, tests.full_data_path)

    # Preprocessing
    step = missing_values.find_timestep(raw, n_random_values=10)
    without_nulls = missing_values.erase_null_values(raw, method='all')
    filled = missing_values.fill_missing_values(
        without_nulls, step,
        technique='interpolation', method='nearest',
        limit=720, limit_direction='both')

    # Only a series with no remaining gaps is stored
    remaining_gaps = missing_values.find_missing_values(filled, step)
    if not remaining_gaps.empty:
        return
    save_to_msgpack(filled, output_name, output_dir)
def test_extreme_events_full_simar_with_interpolation():
    """Run the extreme-event helpers on the full SIMAR series.

    Events of ``Hm0`` above its 95th percentile are extracted with
    ``interpolation=True``; the boundary, maximum, duration and magnitude
    helpers are then called on the resulting cycles. Nothing is asserted:
    the comparison of the number of cycles and calm periods is printed.
    """
    simar_dir = os.path.join('..', '..', '..', '..', '..', 'data', 'simar')
    series, _code = read.simar('SIMAR_1052046', path=simar_dir)

    variable = 'Hm0'
    level = np.percentile(series[variable], 95)

    cycles, calm_periods = extremal.extreme_events(
        series, variable, level,
        pd.Timedelta('3 days'),   # minimum interarrival time
        pd.Timedelta('3 hours'),  # minimum cycle length
        interpolation=True)

    # Return values are not inspected; the calls are kept so that any
    # exception raised by these helpers still surfaces.
    extremal.events_boundaries(cycles)
    extremal.events_max(cycles)
    extremal.events_duration(cycles)
    extremal.events_magnitude(cycles, level)

    # noinspection PyTypeChecker
    same_length = len(cycles) == len(calm_periods)
    print(same_length)
def test_min_cycles_duration():
    """Check that the shortest cycle lasts exactly the minimum cycle length."""
    # Read data
    simar_dir = os.path.join('..', '..', '..', '..', '..', 'data', 'simar')
    series, _code = read.simar('SIMAR_1052046', path=simar_dir)

    # Input
    level = np.percentile(series['Hm0'], 95)
    min_interarrival = pd.Timedelta('3 days')
    min_length = pd.Timedelta('3 hours')

    # Cycles and their durations
    cycles, _calm_periods = extremal.extreme_events(
        series, 'Hm0', level, min_interarrival, min_length)
    durations = extremal.events_duration(cycles)
    shortest = durations.min()

    # Positions of the cycles lasting exactly 2 hours. The list is a
    # debugging aid only: it is not used by the assertion below.
    two_hours = pd.Timedelta('2 hours')
    two_hour_cycles = [position
                       for position, length in enumerate(durations)
                       if length == two_hours]

    assert shortest == min_length
def test_fill_missing_values():
    """Check values filled by nearest interpolation in the short-gap sample."""
    # Read simar file
    simar_dir = os.path.join(tests.sample_data_path, 'simar')
    # noinspection PyTypeChecker
    gappy, _ = read.simar('SIMAR_1052046_short_gap', simar_dir)

    # Time step and gap filling
    step = missing_values.find_timestep(gappy)
    filled = missing_values.fill_missing_values(
        gappy, step,
        technique='interpolation', method='nearest',
        limit=24, limit_direction='both')

    # (timestamp, column, expected value) for each checked cell
    expected_cells = (
        ('1958-01-04 08', 'Hm0', 2.1),
        ('1958-01-04 12', 'Tp', 10.5),
        ('1958-01-04 14', 'Tp', 10.6),
        ('1960-12-31 19', 'Hm0', 0.6),
    )

    # Passed as the second positional argument of pytest.approx
    tolerance = 0.01
    for timestamp, column, value in expected_cells:
        assert filled.loc[timestamp, column] == pytest.approx(value, tolerance)
def test_missing_values_report():
    """Build the gaps report for the short-gap sample, write it and plot it.

    No assertion is made; the test only exercises the report, the
    report-to-file and the plotting helpers of ``missing_values``.
    """
    # Input
    report_name = 'gaps_report.csv'
    report_dir = os.path.join('.', '..', '..', 'report', 'tests', 'output',
                              'tables')

    # Read simar file
    simar_dir = os.path.join(tests.sample_data_path, 'simar')
    # noinspection PyTypeChecker
    series, _ = read.simar('SIMAR_1052046_short_gap', simar_dir)

    # Time step and gaps
    step = missing_values.find_timestep(series)
    gaps = missing_values.find_missing_values(series, step)

    # Gaps report
    report = missing_values.missing_values_report(series, gaps)
    missing_values.missing_values_report_to_file(report, report_name,
                                                 report_dir)

    # Plot
    plot_options = dict(title='', var_name='Hm0', var_unit='m',
                        fig_filename='', circular=False, label='Hm0')
    missing_values.plot_missing_values(data=series, data_column='Hm0',
                                       data_gaps=gaps, **plot_options)
def test_find_timestep():
    """Check that ``find_timestep`` returns one hour for the short sample."""
    # Read simar file
    data_path = os.path.join(tests.sample_data_path, 'simar')
    # noinspection PyTypeChecker
    data_simar, _ = read.simar('SIMAR_1052046_short', data_path)

    t_step = missing_values.find_timestep(data_simar)

    # pandas exposes the class as ``Timedelta``; ``pd.timedelta`` does not
    # exist and raised AttributeError before the comparison was evaluated.
    assert t_step == pd.Timedelta(hours=1)
def test_erase_null_values():
    """Check that three timestamps are absent after ``erase_null_values``."""
    # Read simar file
    simar_dir = os.path.join(tests.sample_data_path, 'simar')
    # noinspection PyTypeChecker
    gappy, _ = read.simar('SIMAR_1052046_short_gap', simar_dir)

    # Erase null values with method 'any'
    cleaned = missing_values.erase_null_values(gappy, 'any')

    # Timestamps that must no longer be in the index
    erased_timestamps = ('1958-01-04 08', '1958-01-04 19', '1960-12-31 18')
    for timestamp in erased_timestamps:
        assert timestamp not in cleaned.index
def test_find_missing_values():
    """Plot ``Hm0`` of the short-gap sample with the gap limits marked.

    No assertion is made; the figure is only shown.
    """
    # Read simar file
    simar_dir = os.path.join(tests.sample_data_path, 'simar')
    # noinspection PyTypeChecker
    series, _ = read.simar('SIMAR_1052046_short_gap', simar_dir)

    # Time step and gaps
    step = missing_values.find_timestep(series)
    gaps = missing_values.find_missing_values(series, step)

    # Representation of the gaps: the series plus a black dot at the rows
    # selected by the 'pos_ini' and 'pos_fin' columns of the gaps table
    fig = plt.figure()
    ax = fig.gca()
    ax.plot(series.loc[:, 'Hm0'])
    for limit_column in ('pos_ini', 'pos_fin'):
        ax.plot(series.loc[gaps.loc[:, limit_column], 'Hm0'], 'k.',
                markersize=10)
    fig.show()
def test_gev_fit_to_annual_maxima_confidence_bands():
    """Fit a GEV to the annual maxima of ``Hm0`` and plot bootstrap bands.

    No assertion is made; the test exercises the fit, the bootstrapping and
    the two plotting helpers of ``extremal``.
    """
    # Inputs
    simar_name = 'SIMAR_1052046'
    percentile = 95
    n_boot = 100
    significance = 0.05  # Confidence level

    # Read SIMAR
    simar_dir = os.path.join('..', '..', '..', '..', '..', 'data', 'simar')
    series, _code = read.simar(simar_name, path=simar_dir)
    level = np.percentile(series.loc[:, 'Hm0'], percentile)

    # Annual maxima sample and its empirical distribution
    yearly_max = extremal.annual_maxima_calculation(series['Hm0'])
    ecdf = empirical_distributions.ecdf_histogram(yearly_max)
    ecdf_rp = extremal.return_period_curve(1, ecdf)

    # Evaluation range shared by the fit and the bootstrapping
    lower = 0.90 * min(yearly_max)
    upper = 1.5 * max(yearly_max)

    # Fit Annual Maxima to GEV
    params, x_fit, cdf_fit, rp_fit = extremal.extremal_distribution_fit(
        data=series, var_name='Hm0', sample=yearly_max, threshold=level,
        fit_type='gev', x_min=lower, x_max=upper, n_points=1000,
        cumulative=True)

    # Confidence bands to assess the uncertainty (bootstrapping)
    boot = extremal.extremal_distribution_fit_bootstrapping(
        sample=yearly_max, n_sim_boot=n_boot, data=series, var_name='Hm0',
        threshold=level, param_orig=params, fit_type='gev',
        x_min=lower, x_max=upper, alpha=significance)

    # Representation: both figures share the same labelling options
    labels = dict(title='', var_name='Hm0', var_unit='m', fig_filename='',
                  circular=False, extremal_label='GEV Fit',
                  empirical_label='GEV ECDF')
    extremal.plot_extremal_cdf(x_fit, cdf_fit, ecdf, n_boot, boot,
                               significance, **labels)
    extremal.plot_extremal_return_period(x_fit, rp_fit, ecdf_rp, n_boot,
                                         boot, significance, **labels)
def test_gpd_fit_to_pot_confidence_bands():
    """Fit a GPD to the peaks over threshold of ``Hm0`` with bootstrap bands.

    No assertion is made; the test exercises the event extraction, the fit,
    the bootstrapping and the two plotting helpers of ``extremal``.
    """
    # Inputs
    simar_name = 'SIMAR_1052046'
    percentile = 95
    min_interarrival = pd.Timedelta('3 days')
    min_length = pd.Timedelta('3 hours')
    # Positional options of extreme_events, in call order: interpolation,
    # interpolation_method, interpolation_freq, truncate, extra_info
    event_options = (True, 'linear', '1min', False, False)
    n_boot = 100
    significance = 0.05  # Confidence level

    # Read SIMAR
    simar_dir = os.path.join('..', '..', '..', '..', '..', 'data', 'simar')
    series, _code = read.simar(simar_name, path=simar_dir)
    level = np.percentile(series.loc[:, 'Hm0'], percentile)

    # Storm cycles calculation
    cycles, _calm_periods = extremal.extreme_events(
        series, 'Hm0', level, min_interarrival, min_length, *event_options)

    # Peaks over threshold and their empirical distribution
    peaks = extremal.events_max(cycles)
    ecdf = empirical_distributions.ecdf_histogram(peaks)
    n_years = len(series['Hm0'].index.year.unique())
    peaks_per_year = len(peaks) / n_years
    ecdf_rp = extremal.return_period_curve(peaks_per_year, ecdf)

    # Evaluation range shared by the fit and the bootstrapping
    lower = 0.90 * min(peaks)
    upper = 1.5 * max(peaks)

    # Fit POT to Scipy-GPD
    params, x_fit, cdf_fit, rp_fit = extremal.extremal_distribution_fit(
        data=series, var_name='Hm0', sample=peaks, threshold=level,
        fit_type='gpd', x_min=lower, x_max=upper, n_points=1000,
        cumulative=True)

    # Confidence bands to assess the uncertainty (bootstrapping)
    boot = extremal.extremal_distribution_fit_bootstrapping(
        sample=peaks, n_sim_boot=n_boot, data=series, var_name='Hm0',
        threshold=level, param_orig=params, fit_type='gpd',
        x_min=lower, x_max=upper, alpha=significance)

    # Representation: both figures share the same labelling options
    labels = dict(title='', var_name='Hm0', var_unit='m', fig_filename='',
                  circular=False, extremal_label='GPD Fit',
                  empirical_label='POT ECDF')
    extremal.plot_extremal_cdf(x_fit, cdf_fit, ecdf, n_boot, boot,
                               significance, **labels)
    extremal.plot_extremal_return_period(x_fit, rp_fit, ecdf_rp, n_boot,
                                         boot, significance, **labels)
def test_poisson_pareto_fit_to_pot_and_gev_fit_to_annual_maxima():
    """Plot a GEV fit (annual maxima) next to a Poisson-Pareto fit (POT).

    Two figures are shown, one with the CDFs and one with the return
    periods on a logarithmic x axis. No assertion is made.
    """
    # Inputs
    simar_name = 'SIMAR_1052046'
    percentile = 95
    min_interarrival = pd.Timedelta('3 days')
    min_length = pd.Timedelta('3 hours')
    # Positional options of extreme_events, in call order: interpolation,
    # interpolation_method, interpolation_freq, truncate, extra_info
    event_options = (True, 'linear', '1min', False, False)

    # Read SIMAR
    simar_dir = os.path.join('..', '..', '..', '..', '..', 'data', 'simar')
    series, _code = read.simar(simar_name, path=simar_dir)
    level = np.percentile(series.loc[:, 'Hm0'], percentile)

    # Storm cycles calculation
    cycles, _calm_periods = extremal.extreme_events(
        series, 'Hm0', level, min_interarrival, min_length, *event_options)

    # Samples: peaks over threshold and annual maxima
    peaks = extremal.events_max(cycles)
    yearly_max = extremal.annual_maxima_calculation(series['Hm0'])

    # POT empirical distribution (the return-period curve is computed but
    # not plotted)
    ecdf_pot = empirical_distributions.ecdf_histogram(peaks)
    n_years = len(series['Hm0'].index.year.unique())
    peaks_per_year = len(peaks) / n_years
    ecdf_pot_rp = extremal.return_period_curve(peaks_per_year, ecdf_pot)

    # Annual maxima empirical distribution
    ecdf_am = empirical_distributions.ecdf_histogram(yearly_max)
    ecdf_am_rp = extremal.return_period_curve(1, ecdf_am)

    # Fit annual maxima to GEV (no threshold is passed for this fit)
    _params, x_gev, cdf_gev, rp_gev = extremal.extremal_distribution_fit(
        data=series, var_name='Hm0', sample=yearly_max, threshold=None,
        fit_type='gev', x_min=0.90 * min(yearly_max),
        x_max=1.5 * max(yearly_max), n_points=1000, cumulative=True)

    # Fit peaks over threshold to Poisson-Pareto
    _params, x_pp, cdf_pp, rp_pp = extremal.extremal_distribution_fit(
        data=series, var_name='Hm0', sample=peaks, threshold=level,
        fit_type='poisson', x_min=0.90 * min(peaks),
        x_max=1.5 * max(peaks), n_points=1000, cumulative=True)

    # Represent results: cumulative distribution functions
    plt.figure()
    cdf_axes = plt.axes()
    cdf_axes.plot(ecdf_am.index, ecdf_am, '.k', label='Annual maxima ECDF')
    cdf_axes.plot(x_gev, cdf_gev, 'k', label='GEV fit')
    cdf_axes.plot(x_pp, cdf_pp, label='Poisson-Pareto fit')
    plt.xlabel('Hm0 (m)')
    plt.ylabel('CDF')
    cdf_axes.legend()
    plt.grid()
    plt.show()

    # Represent results: return periods on a logarithmic x axis
    plt.figure()
    rp_axes = plt.axes()
    rp_axes.semilogx(ecdf_am_rp, ecdf_am_rp.index, '.k',
                     label='Annual maxima ECDF')
    rp_axes.semilogx(rp_gev, x_gev, 'k', label='GEV fit')
    rp_axes.semilogx(rp_pp, x_pp, label='Poisson-Pareto fit')
    plt.xlim(0, 500)
    plt.xlabel('Return Period (years)')
    plt.ylabel('Hm0 (m)')
    rp_axes.legend()
    plt.grid()
    plt.show()
def read_full_simar(data_file=full_data_file, data_path=full_data_path):
    """Read a SIMAR file and return only its data.

    Args:
        data_file: File name handed to ``read.simar``; defaults to the
            module-level ``full_data_file``.
        data_path: Directory handed to ``read.simar``; defaults to the
            module-level ``full_data_path``.

    Returns:
        The first element of the pair returned by ``read.simar``; the
        second element (the code) is discarded.
    """
    # noinspection PyTypeChecker
    simar_data, _code = read.simar(data_file, data_path)
    return simar_data
def read_sample_simar(data_file=sample_data_file, data_path=sample_data_path):
    """Read a sample SIMAR file and return only its data.

    Args:
        data_file: File name handed to ``read.simar``; defaults to the
            module-level ``sample_data_file``.
        data_path: Directory handed to ``read.simar``; defaults to the
            module-level ``sample_data_path``.

    Returns:
        The first element of the pair returned by ``read.simar``; the
        second element (the code) is discarded.
    """
    # noinspection PyTypeChecker
    simar_data, _code = read.simar(data_file, data_path)
    return simar_data
def test_simar_data():
    """Check the row count and one ``Tp`` value of the sample SIMAR data."""
    data_simar, _ = read.simar(tests.sample_data_file, tests.sample_data_path)

    assert data_simar.shape[0] == 216
    # ``DataFrame.ix`` was removed in pandas 1.0, so the row is now selected
    # by position with ``.iloc``.
    # NOTE(review): assumes the index is not integer-labelled, in which case
    # ``.ix[153, 'Tp']`` was already positional - confirm against read.simar.
    assert data_simar['Tp'].iloc[153] == 11.2
def test_simar_code():
    """Check the code returned by ``read.simar`` for the sample file."""
    simar_result = read.simar(tests.sample_data_file, tests.sample_data_path)
    _, station_code = simar_result

    assert station_code == '1052046'