# key under which the results of the significance test are stored in the datastore
hypotest.sk_significance_map = 'significance'
# key under which the results of the residuals test are stored in the datastore
hypotest.sk_residuals_map = 'residuals'
# key under which the results of the residuals test are stored in the datastore,
# in a format that makes further processing easier
hypotest.sk_residuals_overview = 'residuals_overview'

# Advanced settings
# Specify which categories to ignore.
# NOTE(review): the per-variable examples below are written as dicts because they map
# variable names to values -- confirm the expected type against the link's documentation.
# hypotest.ignore_categories = ['None', 'Not_familiar_with', 'NoFruit']
# hypotest.var_ignore_categories = {'obs1': 'None', 'obs2': 'Not_familiar_with', 'obs1:obs2': ['None', 'pear']}
# The hypothesis tester is also applicable to continuous variables once they are categorised,
# i.e. binned. The number of bins can be set using the following options:
# hypotest.default_number_of_bins = 5
# hypotest.var_default_number_of_bins = {'obs1': 10, 'obs2': 5, 'obs1:obs2': [3, 3]}

# set this link's logger to DEBUG level, then add the configured link to `ch`
hypotest.logger.log_level = LogLevel.DEBUG
ch.add(hypotest)

# --- 4. add a summary link that reads the histograms stored under hypotest.hist_dict_key
overview = Chain('Overview')
hist_summary = visualization.DfSummary(
    name='HistogramSummary',
    read_key=hypotest.hist_dict_key,
    pages_key=hypotest.pages_key)
overview.add(hist_summary)

#########################################################################################

logger.debug(
    'Done parsing configuration file esk410_testing_correlations_between_categories')
# --- now set up the chains and links based on configuration flags

ch = proc_mgr.add_chain('Data')

# --- 0. readdata keeps on opening the next file in the file list.
#     all keyword arguments are passed on to the pandas file reader.
readdata = analysis.ReadToDf(
    name='dflooper',
    key='accounts',
    reader='csv',
    sep=' ')
readdata.path = input_files
ch.add_link(readdata)

# --- 1. add a data-frame summary link (reading readdata.key) to the same chain
summarizer = visualization.DfSummary(
    name='Create_stats_overview',
    read_key=readdata.key,
    pages_key='report_pages')
ch.add_link(summarizer)

# --- 2. Fill 2d histogrammar histograms for every pair of the observables x1..x5
hf = analysis.HistogrammarFiller()
hf.read_key = 'accounts'
hf.store_key = 'hist'
hf.set_log_level(logging.DEBUG)
# All ten unordered pairs of the five observables, each listed exactly once.
# (The list previously repeated ['x3', 'x4'] and omitted ['x3', 'x5'].)
hf.columns = [['x1', 'x2'], ['x1', 'x3'], ['x1', 'x4'], ['x1', 'x5'],
              ['x2', 'x3'], ['x2', 'x4'], ['x2', 'x5'], ['x3', 'x4'],
              ['x3', 'x5'], ['x4', 'x5']]
# NOTE(review): this writes a private attribute of the link; presumably the default
# binning for columns without their own bin specification -- confirm.
hf._unit_bin_specs = {'bin_width': 0.2, 'bin_offset': 0.0}
ch.add_link(hf)

hs = visualization.DfSummary(name='HistogramSummary1',
예제 #3
0
                    'latitude': {'bin_width': 5, 'bin_offset': 0}}
    # as we are running in a loop, store the resulting histograms in the finalize() of the link,
    # after having looped through all (small) datasets.
    vc.store_at_finalize = True
    ch.add(vc)

    # --- this serves as the continue statement of the loop. go back to start of the chain.
    repeater = core_ops.RepeatChain()
    # repeat until readdata says halt.
    # NOTE(review): the key is built from read_data.name; read_data is created outside this
    # excerpt -- confirm it is the reader link at the start of this chain.
    repeater.listen_to = 'chainRepeatRequestBy_' + read_data.name
    ch.add(repeater)

    # --- clean up the datastore after the loop
    # NOTE(review): presumably removes every datastore object except the keys listed in
    # keep_only -- confirm against the DsObjectDeleter documentation.
    link = core_ops.DsObjectDeleter()
    link.keep_only = ['hist', 'n_sum_rc']
    ch.add(link)

# --- print the 'n_sum_rc' entry of the datastore
overview = Chain('Overview')
pds = core_ops.PrintDs(name='End')
pds.keys = ['n_sum_rc']
overview.add(pds)

# --- make a nice summary report of the created histograms
hist_summary = visualization.DfSummary(
    name='HistogramSummary',
    read_key=vc.store_key_hists)
overview.add(hist_summary)

#########################################################################################

logger.debug(
    'Done parsing configuration file esk302_histogram_filler_plotter.')
예제 #4
0
    hf = root_analysis.RootHistFiller()
    # columns lists single observables first, followed by sub-lists that
    # each describe one multi-dimensional histogram
    hf.columns = [
        'x1', 'x2', 'x3', 'x4', 'x5',
        ['x1', 'x2'],
        ['x2', 'x3'],
        ['x4', 'x5'],
    ]
    hf.read_key = 'correlated_data'
    hf.store_key = 'hist'
    # per-variable lower and upper values; x1 is given no explicit limits
    hf.var_min_value = dict(x2=-5, x3=-5, x4=-5, x5=-5)
    hf.var_max_value = dict(x2=5, x3=5, x4=5, x5=5)
    ch.add_link(hf)

if settings['make_plot']:
    ch = proc_mgr.add_chain('Plotting')

    # --- 2. summary report of the histograms stored under hf.store_key
    hs = visualization.DfSummary(
        name='HistogramSummary',
        read_key=hf.store_key)
    ch.add_link(hs)

if settings['convert_to_rdh']:
    ch = proc_mgr.add_chain('Convert1')

    # --- 3. convert a root histogram to a RooDataHist object
    h2rdh = root_analysis.ConvertRootHist2RooDataHist()
    # NOTE(review): presumably 'x1' selects the histogram inside the dict stored
    # under hist_dict_key ('hist') -- confirm against the link's documentation.
    h2rdh.read_key = 'x1'
    h2rdh.hist_dict_key = 'hist'
    h2rdh.create_hist_pdf = 'hpdf'
    # optional setting, disabled here:
    # h2rdh.into_ws = True
    ch.add_link(h2rdh)

if settings['convert_to_rds']:
    ch = proc_mgr.add_chain('Convert2')
예제 #5
0
# value passed as `size` to the data generator
SIZE = 10000

# display labels per variable
VAR_LABELS = {
    'var_a': 'Variable A',
    'var_b': 'Variable B',
    'var_c': 'Variable C',
}

# units per variable (only var_b has one)
VAR_UNITS = {'var_b': 'm/s'}

# per-variable generation settings passed as `gen_config` to the data generator
GEN_CONF = {
    'var_b': {'mean': 42., 'std': 2.},
    'var_c': {'mean': 42, 'std': 2, 'dtype': int},
}

#########################################################################################
# --- now set up the chains and links based on configuration flags

data = Chain('Data')
# the "Data" chain gets a single link: the data generator (key 'data')
generator = analysis.BasicGenerator(
    name='Generate_data',
    key='data',
    columns=COLUMNS,
    size=SIZE,
    gen_config=GEN_CONF)
data.add(generator)

# the "Summary" chain gets a data-frame summary link that reads generator.key;
# labels and units for the variables in the dataset are passed along
summary = Chain('Summary')
summarizer = visualization.DfSummary(
    name='Create_stats_overview',
    read_key=generator.key,
    var_labels=VAR_LABELS,
    var_units=VAR_UNITS)
summary.add(summarizer)

#########################################################################################

logger.debug(
    'Done parsing configuration file esk301_dfsummary_plotter')
    ch = proc_mgr.add_chain('Conversion2')

    # convert the object stored under 'simdata'; the result is stored under 'df_simdata'
    # NOTE(review): judging by the class name this turns a RooDataSet into a pandas
    # DataFrame -- confirm against the link's documentation.
    rds2df = root_analysis.ConvertRooDataSet2DataFrame()
    rds2df.read_key = 'simdata'
    rds2df.store_key = 'df_simdata'
    # NOTE(review): presumably removes the original object after conversion -- confirm
    rds2df.remove_original = True
    ch.add_link(rds2df)

if settings['summary']:
    # Keep a handle on the new chain. The return value used to be discarded, so the
    # PrintWs/PrintDs links below were added to whichever chain `ch` last referred to
    # (or raised NameError when no earlier chain had been created).
    ch = proc_mgr.add_chain('Summary')

    # print contents of the workspace
    pws = root_analysis.PrintWs()
    ch.add_link(pws)

    # print contents of datastore
    pds = core_ops.PrintDs(name='pds2')
    # pds.keys = ['accounts']
    ch.add_link(pds)

    # --- make a summary document of simulated dataframe
    # NOTE(review): rds2df is assigned inside an indented section earlier in the file --
    # confirm that 'summary' is never enabled without that section having run.
    summarizer = visualization.DfSummary(name='Create_stats_overview',
                                         read_key=rds2df.store_key)
    ch.add_link(summarizer)

#########################################################################################

log.debug(
    'Done parsing configuration file esk404_workspace_createpdf_simulate_fit_plot'
)
예제 #7
0
data = Chain('Data')

# first link of the "Data" chain: read 'LAozone.data' with pandas' read_csv
reader = analysis.ReadToDf(
    name='Read_LA_ozone',
    path='LAozone.data',
    reader=pd.read_csv,
    key='data')
data.add(reader)

# second link of the "Data" chain: apply the conversion functions to the data
# under reader.key; the result is stored under 'transformed_data'
transform = analysis.ApplyFuncToDf(
    name='Transform',
    read_key=reader.key,
    store_key='transformed_data',
    apply_funcs=conv_funcs)
data.add(transform)

# second chain: "Summary"
summary = Chain('Summary')

# its only link summarises the transformed data (transform.store_key),
# using the variable labels and units defined for this macro
summarizer = visualization.DfSummary(
    name='Create_stats_overview',
    read_key=transform.store_key,
    var_labels=VAR_LABELS,
    var_units=VAR_UNITS)
summary.add(summarizer)


#########################################################################################
# --- exercises
#
# 1.
# Run the macro and take a look at the output.

# 2.
# Now add your own transformation to the ApplyFuncToDf class.
# We want to transform the temperature to Celsius, so use the code in the comments and fill it out.
# As you can see the output will be written in the DataFrame to the column 'temp_c'.
# We also want to include in the plot the wind speed in km/h.
# Rerun the macro and take a look at the output. The output can be found in decision_engine/results