# key to store the results of the significance test in the datastore hypotest.sk_significance_map = 'significance' # key to store the results of the residuals test in the datastore hypotest.sk_residuals_map = 'residuals' # key to store the results of the residuals test in the datastore, in format which # is makes further processing more easy hypotest.sk_residuals_overview = 'residuals_overview' # Advanced settings # Specify what categories to ignore. # hypotest.ignore_categories = ['None','Not_familair_with','NoFruit'] # hypotest.var_ignore_categories = ['obs1':'None','obs2':'Not_familiar_with','obs1:obs2':['None','pear']] # Hypothesis tester is also applicable to continues variables once they are categorised; # ie make bins. The number of bins can be set using the following options # hypotest.default_number_of_bins = 5 # hypotest.var_default_number_of_bins = ['obs1':10,'obs2':5,'obs1:obs2':[3,3]] hypotest.logger.log_level = LogLevel.DEBUG ch.add(hypotest) # --- 4. print contents of the datastore overview = Chain('Overview') hist_summary = visualization.DfSummary(name='HistogramSummary', read_key=hypotest.hist_dict_key, pages_key=hypotest.pages_key) overview.add(hist_summary) ######################################################################################### logger.debug('Done parsing configuration file esk410_testing_correlations_between_categories')
# --- now set up the chains and links based on configuration flags ch = proc_mgr.add_chain('Data') # --- 0. readdata keeps on opening the next file in the file list. # all kwargs are passed on to pandas file reader. readdata = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv', sep=' ') readdata.path = input_files ch.add_link(readdata) # --- 1. add data-frame summary link to "Summary" chain summarizer = visualization.DfSummary(name='Create_stats_overview', read_key=readdata.key, pages_key='report_pages') ch.add_link(summarizer) # --- 2. Fill 2d histogrammar histograms hf = analysis.HistogrammarFiller() hf.read_key = 'accounts' hf.store_key = 'hist' hf.set_log_level(logging.DEBUG) hf.columns = [['x1', 'x2'], ['x1', 'x3'], ['x1', 'x4'], ['x1', 'x5'], ['x2', 'x3'], ['x2', 'x4'], ['x2', 'x5'], ['x3', 'x4'], ['x3', 'x4'], ['x4', 'x5']] hf._unit_bin_specs = {'bin_width': 0.2, 'bin_offset': 0.0} ch.add_link(hf) hs = visualization.DfSummary(name='HistogramSummary1',
'latitude': {'bin_width': 5, 'bin_offset': 0}}

# as we are running in a loop, store the resulting histograms in the finalize() of the link,
# after having looped through all (small) datasets.
vc.store_at_finalize = True
ch.add(vc)

# --- this serves as the continue statement of the loop. go back to start of the chain.
repeater = core_ops.RepeatChain()
# repeat until readdata says halt.
# NOTE(review): the comment says 'readdata' but the code reads 'read_data' —
# confirm the reader link variable defined earlier in this file uses this exact name.
repeater.listen_to = 'chainRepeatRequestBy_' + read_data.name
ch.add(repeater)

# drop everything from the datastore except the accumulated results
link = core_ops.DsObjectDeleter()
link.keep_only = ['hist', 'n_sum_rc']
ch.add(link)

# --- print contents of the datastore
overview = Chain('Overview')
pds = core_ops.PrintDs(name='End')
pds.keys = ['n_sum_rc']
overview.add(pds)

# --- make a nice summary report of the created histograms
hist_summary = visualization.DfSummary(name='HistogramSummary',
                                       read_key=vc.store_key_hists)
overview.add(hist_summary)

#########################################################################################

logger.debug('Done parsing configuration file esk302_histogram_filler_plotter.')
hf = root_analysis.RootHistFiller() # columns is a list of single observables or sub-lists in case of multi-dimensional histograms hf.columns = [ 'x1', 'x2', 'x3', 'x4', 'x5', ['x1', 'x2'], ['x2', 'x3'], ['x4', 'x5'] ] hf.read_key = 'correlated_data' hf.store_key = 'hist' hf.var_min_value = {'x2': -5, 'x3': -5, 'x4': -5, 'x5': -5} hf.var_max_value = {'x2': 5, 'x3': 5, 'x4': 5, 'x5': 5} ch.add_link(hf) if settings['make_plot']: ch = proc_mgr.add_chain('Plotting') # --- 2. make a nice summary report of the created histograms hs = visualization.DfSummary(name='HistogramSummary', read_key=hf.store_key) ch.add_link(hs) if settings['convert_to_rdh']: ch = proc_mgr.add_chain('Convert1') # --- 3. convert a root histogram to a RooDataHist object h2rdh = root_analysis.ConvertRootHist2RooDataHist() h2rdh.read_key = 'x1' h2rdh.hist_dict_key = 'hist' h2rdh.create_hist_pdf = 'hpdf' #h2rds.into_ws = True ch.add_link(h2rdh) if settings['convert_to_rds']: ch = proc_mgr.add_chain('Convert2')
# --- configuration constants for the generated dataset
SIZE = 10000
# human-readable labels and units shown in the summary report
VAR_LABELS = {'var_a': 'Variable A', 'var_b': 'Variable B', 'var_c': 'Variable C'}
VAR_UNITS = {'var_b': 'm/s'}
# per-variable generator settings; var_c is drawn as integers
GEN_CONF = {'var_b': {'mean': 42., 'std': 2.},
            'var_c': {'mean': 42, 'std': 2, 'dtype': int}}

#########################################################################################

# --- now set up the chains and links based on configuration flags

# first chain: generate the toy dataframe
data = Chain('Data')
generator = analysis.BasicGenerator(name='Generate_data',
                                    key='data',
                                    columns=COLUMNS,
                                    size=SIZE,
                                    gen_config=GEN_CONF)
data.add(generator)

# second chain: summarize the generated dataframe,
# passing along the labels and units defined above
summary = Chain('Summary')
summarizer = visualization.DfSummary(name='Create_stats_overview',
                                     read_key=generator.key,
                                     var_labels=VAR_LABELS,
                                     var_units=VAR_UNITS)
summary.add(summarizer)

#########################################################################################

logger.debug('Done parsing configuration file esk301_dfsummary_plotter')
# --- convert the simulated RooDataSet back into a pandas dataframe
ch = proc_mgr.add_chain('Conversion2')
rds2df = root_analysis.ConvertRooDataSet2DataFrame()
rds2df.read_key = 'simdata'
rds2df.store_key = 'df_simdata'
rds2df.remove_original = True
ch.add_link(rds2df)

if settings['summary']:
    # bug fix: capture the new chain — previously the return value of
    # add_chain('Summary') was dropped, so the links below were attached to the
    # stale 'Conversion2' chain instead of the 'Summary' chain.
    ch = proc_mgr.add_chain('Summary')

    # print contents of the workspace
    pws = root_analysis.PrintWs()
    ch.add_link(pws)

    # print contents of datastore
    pds = core_ops.PrintDs(name='pds2')
    # pds.keys = ['accounts']
    ch.add_link(pds)

    # --- make a summary document of simulated dataframe
    summarizer = visualization.DfSummary(name='Create_stats_overview',
                                         read_key=rds2df.store_key)
    proc_mgr.get_chain('Summary').add_link(summarizer)

#########################################################################################

log.debug(
    'Done parsing configuration file esk404_workspace_createpdf_simulate_fit_plot'
)
data = Chain('Data') # add data-frame reader to "Data" chain reader = analysis.ReadToDf(name='Read_LA_ozone', path='LAozone.data', reader=pd.read_csv, key='data') data.add(reader) # add conversion functions to "Data" chain transform = analysis.ApplyFuncToDf(name='Transform', read_key=reader.key, store_key='transformed_data', apply_funcs=conv_funcs) data.add(transform) # create second chain summary = Chain('Summary') # add data-frame summary link to "Summary" chain summarizer = visualization.DfSummary(name='Create_stats_overview', read_key=transform.store_key, var_labels=VAR_LABELS, var_units=VAR_UNITS) summary.add(summarizer) ######################################################################################### # --- exercises # # 1. # Run the macro and take a look at the output. # 2. # Now add your own transformation to the ApplyFuncToDf class. # We want to transform the temperature to Celsius, so use the code in the comments and fill it out. # As you can see the output will be written in the DataFrame to the column 'temp_c'. # We also want to include in the plot the wind speed in km/h. # Rerun the macro and take a look at the output. The output can be found in decision_engine/results