df2rdh.columns = ['longitude', 'age', 'eyeColor']
# be careful not to blow up the total number of bins:
# do this by setting the maximum total number of bins allowed.
df2rdh.n_max_total_bins = 1e6
# a histogram-based pdf is created out of the roodatahist object.
# we use this pdf below to simulate a new dataset with the same properties as the original.
df2rdh.create_hist_pdf = 'hpdf_Ndim'
# all output is stored in the workspace, not the datastore
df2rdh.into_ws = True
ch.add(df2rdh)

# --- Print overview
pws = root_analysis.PrintWs()
ch.add(pws)

pds = core_ops.PrintDs()
ch.add(pds)

# --- 3. resimulate the data with the created hist-pdf, and plot these data and the pdf
ch = Chain('WsOps')
wsu = root_analysis.WsUtils()
wsu.add_simulate(pdf='hpdf_Ndim', obs='rdh_vars', num=10000, key='simdata')
wsu.add_plot(obs='age', data='simdata', pdf='hpdf_Ndim', output_file='test.pdf',
             pdf_kwargs={'ProjWData': ('rdh_cats', 'simdata')})
ch.add(wsu)

#########################################################################################
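# Illustration only, not part of the macro above: a minimal PyROOT sketch of what the
# histogram-based pdf amounts to in plain RooFit. The variable name, range, binning and
# toy input values below are assumptions made for this example.
import ROOT

age = ROOT.RooRealVar('age', 'age', 0., 100.)
h_age = ROOT.TH1F('h_age', 'h_age', 20, 0., 100.)
for v in (23., 35., 35., 47., 60.):
    h_age.Fill(v)
# wrap the histogram in a RooDataHist, then build a histogram-based pdf from it
rdh = ROOT.RooDataHist('rdh_age', 'rdh_age', ROOT.RooArgList(age), h_age)
hpdf = ROOT.RooHistPdf('hpdf_age', 'hpdf_age', ROOT.RooArgSet(age), rdh)
# simulate a new dataset from the hist-pdf, as WsUtils.add_simulate does above
simdata = hpdf.generate(ROOT.RooArgSet(age), 1000)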
# Any other kwargs given to ApplySelectionToDf are passed on to the
# pandas query() function.
ch.add_link(link)

# --- As an example, we merge the reduced datasets back into a single, merged dataframe.
concat = analysis.DfConcatenator()
concat.readKeys = ['merged', 'reduced_data']
concat.storeKey = 'merged'
concat.ignore_missing_input = True  # in the first iteration the input 'merged' is missing.
ch.add_link(concat)

# --- this serves as the continue statement of the loop: go back to the start of the chain.
repeater = core_ops.RepeatChain()
# repeat until readdata says halt.
repeater.listenTo = 'chainRepeatRequestBy_' + readdata.name
# repeat a maximum of 10 times
#repeater.maxcount = 10
ch.add_link(repeater)

# --- print contents of the datastore
proc_mgr.add_chain('Overview')
pds = core_ops.PrintDs(name='End')
pds.keys = ['n_test1', 'n_sum_test1', 'n_test2', 'n_sum_test2', 'test2', 'n_merged']
proc_mgr.get_chain('Overview').add_link(pds)

#########################################################################################

log.debug('Done parsing configuration file esk209_read_big_data_itr')
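# Illustration only, not part of the macro above: the chunked-iteration pattern that
# ReadToDf, RepeatChain and DfConcatenator implement, written out in plain pandas.
# The file name, separator, chunk size and the selection column are assumptions.
import pandas as pd

merged = None
for chunk in pd.read_csv('dummy.csv', sep='|', chunksize=1000):
    reduced = chunk.query('x > 1')      # per-chunk selection, cf. ApplySelectionToDf
    # concatenate the reduced chunk onto the running result, cf. DfConcatenator
    merged = reduced if merged is None else pd.concat([merged, reduced])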
ch = proc_mgr.add_chain('chain1')

# the link ToDsDict adds objects to the datastore.
# by default this happens at the execution of the link
# (optionally, this can be done at initialization).
# Here it is used as a dummy data generator.
link = core_ops.ToDsDict(name='intods_1')
link.obj = f
# copydict = true: all items in dict f are added to the datastore
link.copydict = True
ch.add_link(link)

# print contents of datastore
link = core_ops.PrintDs()
link.keys = ['n_favorite', 'hello']
ch.add_link(link)

#########
# chain 2
# - asserting the presence of items in the datastore.
# - deleting individual items from the datastore.

ch = proc_mgr.add_chain('chain2')

# the link AssertInDs checks the presence
# of certain objects in the datastore
link = core_ops.AssertInDs()
link.keySet = ['hello', 'n_favorite']
ch.add_link(link)
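# Illustration only, not part of the macro above: the datastore behaves like a dict,
# and copydict controls whether the dict f is merged into it or stored as one object.
# The example dict and the store key below are assumptions made for this sketch.
f_example = {'hello': 'world', 'n_favorite': 7}
ds_example = {}
ds_example.update(f_example)   # copydict = True: every item becomes its own datastore entry
# ds_example['my_store_key'] = f_example   # copydict = False: one object under one (hypothetical) key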
data_path = persistence.io_path('data', settings.io_conf(), 'dummy.csv')

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch1 = proc_mgr.add_chain('Factorize')

# --- read dummy dataset
readdata = analysis.ReadToDf(key='test1', sep='|', reader='csv', path=data_path)
ch1.add_link(readdata)

# --- print contents of the datastore
pds = core_ops.PrintDs(name='printer1')
pds.keys = ['test1']
ch1.add_link(pds)

# --- add the record factorizer
# Here the columns dummy and loc of the input dataset are factorized,
# e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
# x = [0, 1, 2, 0, 2]
# By default, the mapping is stored in a dict under key: 'map_' + store_key + '_to_original'
fact = analysis.RecordFactorizer(name='rf1')
fact.columns = ['dummy', 'loc']
fact.read_key = 'test1'
fact.store_key = 'test1_fact'
fact.sk_map_to_original = 'to_original'
fact.set_log_level(logging.DEBUG)
ch1.add_link(fact)
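# Illustration only, not part of the macro above: the factorization described in the
# comments, done directly with pandas. pd.factorize assigns each distinct value an
# integer code in order of first appearance.
import pandas as pd

x = pd.Series(['apple', 'tree', 'pear', 'apple', 'pear'])
codes, uniques = pd.factorize(x)
# codes   -> array([0, 1, 2, 0, 2])
# uniques -> Index(['apple', 'tree', 'pear'], dtype='object')
# the mapping back to the original values is then dict(enumerate(uniques))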
#########################################################################################
# --- now set up the chains and links based on configuration flags

proc_mgr = ProcessManager()

ch = proc_mgr.add_chain('DataPrep')

# querySet = selections that are applied to incoming_records.
# after the selections, only the columns in selectColumns ('a', 'c') are kept.

# add conversion functions to the "Data" chain
link = analysis.ApplyFuncToDf(name='Transform',
                              read_key='incoming_data',
                              store_key='transformed_data',
                              apply_funcs=conv_funcs)
# Any other kwargs given to ApplyFuncToDf are passed on to the
# pandas query() function.
link.set_log_level(logging.DEBUG)
ch.add_link(link)

link = core_ops.DsObjectDeleter()
link.deletionKeys = ['incoming_data']
ch.add_link(link)

link = core_ops.PrintDs()
link.keys = ['transformed_data']
ch.add_link(link)

#########################################################################################

log.debug('Done parsing configuration file esk203_apply_func_to_pandas_df')
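# Illustration only, not part of the macro above: what a conversion function applied by
# ApplyFuncToDf boils down to in plain pandas. The column names and the conversions
# shown here are assumptions made for this example.
import pandas as pd

df_example = pd.DataFrame({'dt': ['2017-01-01', '2017-01-02'], 'x': [1, 2]})
df_example['dt'] = pd.to_datetime(df_example['dt'])          # convert a column in place
df_example['x2'] = df_example['x'].apply(lambda v: v * v)    # derive a new column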
ch = proc_mgr.add_chain('Conversion2')

rds2df = root_analysis.ConvertRooDataSet2DataFrame()
rds2df.read_key = 'simdata'
rds2df.store_key = 'df_simdata'
rds2df.remove_original = True
ch.add_link(rds2df)

if settings['summary']:
    ch = proc_mgr.add_chain('Summary')

    # print contents of the workspace
    pws = root_analysis.PrintWs()
    ch.add_link(pws)

    # print contents of the datastore
    pds = core_ops.PrintDs(name='pds2')
    #pds.keys = ['accounts']
    ch.add_link(pds)

    # --- make a summary document of the simulated dataframe
    summarizer = visualization.DfSummary(name='Create_stats_overview',
                                         read_key=rds2df.store_key)
    proc_mgr.get_chain('Summary').add_link(summarizer)

#########################################################################################

log.debug('Done parsing configuration file esk404_workspace_createpdf_simulate_fit_plot')
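# Illustration only, not part of the macro above: a rough plain-pandas counterpart of the
# per-column statistics collected in the DfSummary report. The small dataframe below is a
# stand-in for the simulated data stored under 'df_simdata'.
import pandas as pd

df_simdata_example = pd.DataFrame({'x2': [0.1, 0.5, 0.9], 'x3': [1.0, 2.0, 3.0]})
print(df_simdata_example.describe())   # count, mean, std, min/max and quartiles per column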
ch.add(read_data)

# --- 1. convert into a roofit dataset (roodataset);
#        build a KEYS pdf out of the dataset as well
df2rds = root_analysis.ConvertDataFrame2RooDataSet()
df2rds.read_key = read_data.key
df2rds.store_key = 'rds_' + read_data.key
df2rds.store_key_vars = 'keys_varset'
df2rds.columns = ['x2', 'x3', 'x4'] if settings['high_num_dims'] else ['x2', 'x3']
df2rds.store_index = False
# build a KEYS pdf out of the roodataset, used for simulation below
df2rds.create_keys_pdf = 'keys_Ndim'
ch.add(df2rds)

pds = core_ops.PrintDs(name='pds1')
ch.add(pds)

if settings['generate']:
    # --- 2. simulate a new dataset with the keys pdf, and then plot this dataset
    ch = Chain('WsOps')
    wsu = root_analysis.WsUtils()
    wsu.add_simulate(pdf='keys_Ndim', obs='keys_varset', num=5000, key='simdata',
                     into_ws=True)
    wsu.add_plot(obs='x2', data='simdata', output_file='x2_simdata.pdf')
    wsu.add_plot(obs='x3', data='simdata', output_file='x3_simdata.pdf')
    if settings['high_num_dims']:
        wsu.add_plot(obs='x4', data='simdata', output_file='x4_simdata.pdf')
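# Illustration only, not part of the macro above: a minimal one-dimensional PyROOT sketch
# of a KEYS (kernel density estimation) pdf built from a dataset, the same idea as the
# N-dimensional 'keys_Ndim' pdf created above. Variable name, range and event counts
# below are assumptions made for this example.
import ROOT

x2 = ROOT.RooRealVar('x2', 'x2', -5., 5.)
gauss = ROOT.RooGaussian('gauss', 'gauss', x2,
                         ROOT.RooFit.RooConst(0.), ROOT.RooFit.RooConst(1.))
data = gauss.generate(ROOT.RooArgSet(x2), 500)
# kernel-density pdf estimated directly from the data points
keys_pdf = ROOT.RooKeysPdf('keys', 'keys', x2, data)
simdata = keys_pdf.generate(ROOT.RooArgSet(x2), 5000)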
ds = ProcessManager().service(DataStore)
ds['hello'] = 'world'
ds['d'] = {'a': 1, 'b': 2, 'c': 3}

#########################################################################################
# --- now set up the chains and links based on configuration flags

proc_mgr = ProcessManager()

ch = proc_mgr.add_chain('Overview')

# 1. PrintDs prints an overview of the contents in the datastore
#    at the state of executing the link.
#    The overview consists of a list of keys in the datastore and the object types.
link = core_ops.PrintDs(name='printer1')
# keys are the items for which the contents of the actual item are printed.
link.keys = ['hello', 'd']
ch.add_link(link)

# 2. This link will start an interactive ipython session.
#    From this session, one can access the datastore and the config object with:
#    >>> ds
#    or
#    >>> settings
#    Try to add something to the datastore in this session!
#    >>> ds['foo'] = 'bar'
if not settings['TESTING']:
    link = core_ops.IPythonEmbed()
    ch.add_link(link)
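# Illustration only, not part of the macro above: what the IPythonEmbed link essentially
# does is drop into an interactive IPython shell at this point in the chain. A bare-bones
# equivalent (assuming IPython is installed) looks like this:
from IPython import embed

ds_example = {'hello': 'world'}   # stand-in for the datastore
embed()                           # opens an interactive session; ds_example is in scope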