# turn on/off the 2 examples
settings['do_example1'] = True
settings['do_example2'] = True

#########################################################################################
# --- Set path of data
data_path = persistence.io_path('data', settings.io_conf(), 'dummy.csv')

#########################################################################################
# --- now set up the chains and links, based on configuration flags

proc_mgr = ProcessManager()

# --- example 1: readdata with one input file
if settings['do_example1']:
    ch1 = proc_mgr.add_chain('MyChain1')
    readdata = analysis.ReadToDf(key='test1', sep='|', reader='csv', path=data_path)
    ch1.add_link(readdata)

    # --- do something useful with the test dataset here ...

# --- example 2: readdata with default settings reads all three input files simultaneously.
#     all extra keyword arguments are passed on to the pandas reader.
if settings['do_example2']:
    ch2 = proc_mgr.add_chain('MyChain2')

    # --- a loop is set up in the chain MyChain.
    #     we iterate over (chunks of) the next file in the list until the iterator is done.
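# --- Aside (not part of the original macro): ReadToDf hands its keyword arguments
#     straight to the pandas reader and stores the result in the datastore under
#     'key'. A rough plain-pandas sketch of example 1, commented out so the macro
#     is unchanged:
#
#     import pandas as pd
#     df = pd.read_csv(data_path, sep='|')  # the same kwargs end up in pandas
#     # the link then does, in effect: ds['test1'] = df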
# --- minimal analysis information

proc_mgr = ProcessManager()

settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'esk408_classification_error_propagation_after_fit'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

#########################################################################################
# --- now set up the chains and links based on configuration flags

# --- generate pdf, simulate, fit, and plot
ch = proc_mgr.add_chain('WsOps')

# 1. simulate output score of machine learning classifier
wsu = root_analysis.WsUtils(name='DataSimulator')
wsu.factory = ["RooGaussian::high_risk(score[0,1],1,0.15)",
               "RooPolynomial::low_risk(score,{-0.3,-0.3})",
               "SUM::model(frac[0.1,0.,1.]*high_risk,low_risk)"]
wsu.add_simulate(pdf='model', obs='score', num=500, key='data', into_ws=True)
wsu.add_fit(pdf='model', data='data', key='fit_result', into_ws=True)
wsu.add_plot(obs='score', data='data', pdf='model', key='simplot')
wsu.add_plot(obs='score', pdf='model',
             pdf_args=(RooFit.Components('low_risk'), RooFit.LineColor(ROOT.kRed),
                       RooFit.LineStyle(ROOT.kDashed)),
             file='data_with_generator_model.pdf', key='simplot')
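# --- Aside (not part of the original macro): each factory string is RooFit
#     workspace-factory shorthand. A minimal PyROOT sketch of what those strings
#     build, commented out so the macro is unchanged:
#
#     import ROOT
#     w = ROOT.RooWorkspace('w')
#     w.factory("RooGaussian::high_risk(score[0,1],1,0.15)")       # peak near score=1
#     w.factory("RooPolynomial::low_risk(score,{-0.3,-0.3})")      # falling background
#     w.factory("SUM::model(frac[0.1,0.,1.]*high_risk,low_risk)")  # 10% high-risk mixture
#     model = w.pdf('model')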
settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'esk405_simulation_based_on_binned_data'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['high_num_dims'] = False

input_files = [os.environ['ESKAPADE'] + '/data/mock_accounts.csv.gz']

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = proc_mgr.add_chain('Data')

# --- 0. read input data
readdata = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv')
readdata.path = input_files
ch.add_link(readdata)

# --- 1. add the record factorizer
#     Here the columns isActive, eyeColor, favoriteFruit and gender of the input
#     dataset are factorized,
#     e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
#     x = [0, 1, 2, 0, 2]
#     By default, the mapping is stored in a dict under key: 'map_'+store_key+'_to_original'
fact = analysis.RecordFactorizer(name='rf1')
fact.columns = ['isActive', 'eyeColor', 'favoriteFruit', 'gender']
fact.read_key = 'accounts'
fact.inplace = True
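# --- Aside (not part of the original macro): the factorization is conceptually
#     pandas' own factorize. A minimal sketch, commented out so the macro is unchanged:
#
#     import pandas as pd
#     x = pd.Series(['apple', 'tree', 'pear', 'apple', 'pear'])
#     codes, labels = pd.factorize(x)    # codes: [0, 1, 2, 0, 2]
#     mapping = dict(enumerate(labels))  # {0: 'apple', 1: 'tree', 2: 'pear'}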
proc_mgr = ProcessManager()

settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'esk208_record_factorizer'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

# --- Set path of data
data_path = persistence.io_path('data', settings.io_conf(), 'dummy.csv')

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch1 = proc_mgr.add_chain('Factorize')

# --- read dummy dataset
readdata = analysis.ReadToDf(key='test1', sep='|', reader='csv', path=data_path)
ch1.add_link(readdata)

# --- print contents of the datastore
pds = core_ops.PrintDs(name='printer1')
pds.keys = ['test1']
ch1.add_link(pds)

# --- add the record factorizer
#     Here the columns dummy and loc of the input dataset are factorized
log.info('Successfully found ROOT class %s' % pdf_name)

#########################################################################################

msg = r"""
The plots and latex files produced by this tutorial can be found in dir:
%s
""" % (settings['resultsDir'] + '/' + settings['analysisName'] + '/data/v0/report/')
log.info(msg)

#########################################################################################
# --- now set up the chains and links based on configuration flags

# --- generate pdf, simulate, fit, and plot
ch = proc_mgr.add_chain('WsOps')

# --- 1. define a model by passing strings to the rooworkspace factory
#     For the workspace factory syntax, see:
#     https://root.cern.ch/doc/master/RooFactoryWSTool_8cxx_source.html#l00722
#     For rooworkspace factory examples see:
#     https://root.cern.ch/root/html/tutorials/roofit/rf511_wsfactory_basic.C.html
#     https://root.cern.ch/root/html/tutorials/roofit/rf512_wsfactory_oper.C.html
#     https://root.cern.ch/root/html/tutorials/roofit/rf513_wsfactory_tools.C.html
#     Here we use the pdf class we just created (MyPdfV3), with observable y and
#     parameters A and B, with ranges (-10,10), (0,100) and (-10,10) respectively.
#     The starting values of A and B are 10 and 2 respectively.
wsu = root_analysis.WsUtils(name='modeller')
wsu.factory = ["MyPdfV3::testpdf(y[-10,10],A[10,0,100],B[2,-10,10])"]
ch.add_link(wsu)
nan,3,bal,3,bla,bar,c,1
,nan,NaN,NaN,nan,nan,d,2
,,,,,,,3
1,2,,,,,,,6
"""

f = tempfile.NamedTemporaryFile(delete=False)
f.write(tmp.encode())  # NamedTemporaryFile opens in binary mode, so encode the string
f.close()
# file is not immediately deleted because we used delete=False
# used below with f.name

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = proc_mgr.add_chain('DataPrep')

# --- 0. pandas read_csv has multiple settings to help reading in of buggy csv's.
#     o The option error_bad_lines=False skips lines with too few or too many values
#     o The option encoding='latin1' interprets most non-standard characters
readdata = analysis.ReadToDf(key='vrh', reader='csv', path=f.name,
                             error_bad_lines=False, encoding='latin1')
ch.add_link(readdata)

# --- 1. standard setting:
#     o convert all nans to np.nan (= float)
#     o convert all rows in a column to the most occurring datatype in that column
fixer = data_quality.FixPandasDataFrame(name='fixer1')
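# --- Aside (not part of the original macro): the same forgiving read in plain
#     pandas, commented out so the macro is unchanged. Note that in pandas >= 1.3
#     error_bad_lines has been replaced by on_bad_lines='skip':
#
#     import pandas as pd
#     df = pd.read_csv(f.name, encoding='latin1', error_bad_lines=False)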
COLUMNS = ['var_a', 'var_b', 'var_c']
SIZE = 10000
VAR_LABELS = dict(var_a='Variable A', var_b='Variable B', var_c='Variable C')
VAR_UNITS = dict(var_b='m/s')
GEN_CONF = dict(var_b=dict(mean=42., std=2.),
                var_c=dict(mean=42, std=2, dtype=int))

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create process manager
proc_mgr = ProcessManager()

# create chains
proc_mgr.add_chain('Data')
proc_mgr.add_chain('Summary')

# add data-generator link to "Data" chain
generator = analysis.BasicGenerator(name='Generate_data',
                                    key='data',
                                    columns=COLUMNS,
                                    size=SIZE,
                                    gen_config=GEN_CONF)
proc_mgr.get_chain('Data').add_link(generator)

# add data-frame summary link to "Summary" chain
# can provide labels and units for the variables in the dataset
summarizer = visualization.DfSummary(name='Create_stats_overview',
                                     read_key=generator.key,
                                     var_labels=VAR_LABELS,
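# --- Aside (not part of the original macro): a rough numpy sketch of what the
#     GEN_CONF settings mean for the generated columns, commented out so the macro
#     is unchanged (the seed here is an illustrative assumption):
#
#     import numpy as np
#     rng = np.random.RandomState(42)
#     var_b = rng.normal(42., 2., SIZE)            # mean/std from GEN_CONF
#     var_c = rng.normal(42, 2, SIZE).astype(int)  # dtype=int casts the draws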
sm.spark_streaming_context = ssc

# define data stream
ds = proc_mgr.service(DataStore)

if not stream_type or stream_type == 'file':
    ds['dstream'] = ssc.textFileStream('/tmp/eskapade_stream_test/')
elif stream_type == 'tcp':
    ds['dstream'] = ssc.socketTextStream('localhost', 9999)
else:
    log.error('unsupported stream_type specified: {}'.format(stream_type))

##########################################################################
# --- now set up the chains and links based on configuration flags

proc_mgr.add_chain('SparkStreaming')

# the word count example
wordcount_link = spark_analysis.SparkStreamingWordCount(
    name='SparkStreamingWordCount', read_key='dstream', store_key='wordcounts')
proc_mgr.get_chain('SparkStreaming').add_link(wordcount_link)

# store output
writer_link = spark_analysis.SparkStreamingWriter(
    name='SparkStreamingWriter',
    read_key=wordcount_link.store_key,
    output_path='file:' + persistence.io_dir('results_data', settings.io_conf()) + '/dstream/wordcount',
    mode='overwrite',
    suffix='txt',
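# --- Aside (not part of the original macro): the classic DStream word count that
#     a link like SparkStreamingWordCount presumably wraps, sketched in plain
#     PySpark and commented out so the macro is unchanged:
#
#     counts = (ds['dstream']
#               .flatMap(lambda line: line.split(' '))
#               .map(lambda word: (word, 1))
#               .reduceByKey(lambda a, b: a + b))
#     counts.pprint()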
    var_range=(0., MAX_AGE),
    var=('redeem_age', 0.),
    max_var=('age', MAX_AGE),
    exp=[('rate_fast', FAST_REDEEM_RATE), ('rate_slow', SLOW_REDEEM_RATE)],
    fracs=[('frac_fast', FAST_FRAC)])
model.build_model()
model.var.SetTitle('Redeem age')
model.max_var.SetTitle('Age')
model.var.setUnit('days')
model.max_var.setUnit('days')

###############################################################################
# --- create chain for generating voucher redeem data

ch = proc_mgr.add_chain('Generation')
gen_link = TruncExpGen(name='Generate',
                       store_key=REDEEM_DATA_KEY,
                       max_var_data_key=AGE_DATA_KEY,
                       model_name=MODEL_NAME,
                       event_frac=REDEEM_FRAC)
ch.add_link(gen_link)

np.random.seed(settings['seeds']['NumPy'])
ROOT.RooRandom.randomGenerator().SetSeed(settings['seeds']['RooFit'])

###############################################################################
# --- create chain for fitting voucher redeem model to generated data

ch = proc_mgr.add_chain('Fitting')
fit_link = TruncExpFit(name='Fit',
COLUMNS = ['var_a', 'var_b', 'var_c']
SIZE = 10000
VAR_LABELS = dict(var_a='Variable A', var_b='Variable B', var_c='Variable C')
VAR_UNITS = dict(var_b='m/s')
GEN_CONF = dict(var_a=dict(choice=['alpha', 'beta', 'gamma'], dtype=str),
                var_b=dict(mean=3., std=1.),
                var_c=dict(choice=['delta', 'epsilon', 'zeta', 'eta'], dtype=str))

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create process manager
proc_mgr = ProcessManager()

# create chains
proc_mgr.add_chain('Data')
proc_mgr.add_chain('BoxPlot')

# add data-generator link to "Data" chain
generator = analysis.BasicGenerator(name='Generate_data',
                                    key='data',
                                    columns=COLUMNS,
                                    size=SIZE,
                                    gen_config=GEN_CONF)
proc_mgr.get_chain('Data').add_link(generator)

# add data-frame boxplot link to "BoxPlot" chain
# can provide labels and units for the variables in the dataset,
# and set the statistics to print in the output file
boxplot = visualization.DfBoxplot(name='Create_stats_overview',
                                  read_key=generator.key,
def firstword(x):
    return x.split()[0]

#########################################################################################
# --- now set up the chains and links, based on configuration flags

# This chain does 'mapping'. (Macro B does 'reduction'.)

proc_mgr = ProcessManager()

# --- mapper: chain with event looper
#     this eventlooper link serves as a mapper.
#     in this example the lines are converted to lower case, and the first word is selected.
if settings['do_map']:
    ch = proc_mgr.add_chain("Mapper")
    looper = core_ops.EventLooper(name='listener')
    looper.skip_line_beginning_with = ['#']
    looper.line_processor_set = [firstword, tolower]
    if settings['TESTING']:
        looper.filename = f.name
    ch.add_link(looper)

# --- reducer: chain with event looper
#     this eventlooper link serves as a reducer.
#     in this example the lines are grouped together into unique sets.
if settings['do_reduce']:
    ch = proc_mgr.add_chain("Reducer")
    looper = core_ops.EventLooper(name='grouper')
    # reducer selects all unique lines
    looper.sort = True
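# --- Aside (not part of the original macro): each line processor is applied in
#     turn to every accepted line. A sketch, commented out so the macro is
#     unchanged ('tolower' here is a hypothetical stand-in for the helper defined
#     earlier in the macro):
#
#     def tolower(x):
#         return x.lower()
#
#     line = 'Hello World'
#     for proc in [firstword, tolower]:
#         line = proc(line)  # -> 'hello'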
settings['do_readdata'] = True
settings['do_writedata'] = True

#########################################################################################
# --- Set path of data
data_path = persistence.io_path('data', settings.io_conf(), 'dummy.csv')

#########################################################################################
# --- now set up the chains and links based on configuration flags

proc_mgr = ProcessManager()

# --- readdata with default settings reads all three input files simultaneously.
#     all extra keyword arguments are passed on to the pandas reader.
if settings['do_readdata']:
    ch = proc_mgr.add_chain('ReadData')

    # --- readdata keeps on opening the next file in the file list.
    #     all kwargs are passed on to the pandas file reader.
    readdata = analysis.ReadToDf(name='reader', key='test', sep='|', reader='csv',
                                 path=[data_path] * 3)
    ch.add_link(readdata)

if settings['do_writedata']:
    ch = proc_mgr.add_chain('WriteData')

    # --- writedata needs a specified output format ('writer' argument).
    #     if this is not set, the format is inferred from the filename extension.
# turn on/off the example
settings['do_example'] = True

#########################################################################################
# --- now set up the chains and links, based on configuration flags

proc_mgr = ProcessManager()

# --- example loops over the first chain 10 times.
if settings['do_example']:
    # --- a loop is set up in the chain MyChain.
    #     we iterate over the chain until the link RepeatChain is done.
    #     then move on to the next chain (Overview)
    ch = proc_mgr.add_chain('MyChain')

    link = core_ops.HelloWorld(name='HelloWorld')
    link.set_log_level(logging.DEBUG)
    ch.add_link(link)

    # --- this link sends out a signal to repeat the execution of the chain.
    #     It serves as the 'continue' statement of the loop.
    #     go back to the start of the chain until the counter reaches 10.
    repeater = core_ops.RepeatChain()
    # repeat a maximum of 10 times
    repeater.maxcount = 10
    repeater.set_log_level(logging.DEBUG)
    ch.add_link(repeater)

# --- print contents of the datastore.
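# --- Aside (not part of the original macro): in plain-Python terms the chain
#     above behaves roughly like this loop, commented out so the macro is unchanged:
#
#     count = 0
#     while count < 10:         # repeater.maxcount
#         print('Hello World')  # the HelloWorld link executes
#         count += 1            # RepeatChain signals 'repeat' until the max is reached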
settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'esk404_workspace_createpdf_simulate_fit_plot'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['generate_fit_plot'] = True
settings['summary'] = True

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['generate_fit_plot']:
    # --- generate pdf, simulate, fit, and plot
    ch = proc_mgr.add_chain('WsOps')

    # --- 1. define a model by passing strings to the rooworkspace factory
    #     for details on the rooworkspace factory see:
    #     https://root.cern.ch/root/html/tutorials/roofit/rf511_wsfactory_basic.C.html
    wsu = root_analysis.WsUtils(name='modeller')
    wsu.factory = ["Gaussian::sig1(x[-10,10],mean[5,0,10],0.5)",
                   "Gaussian::sig2(x,mean,1)",
                   "Chebychev::bkg(x,{a0[0.5,0.,1],a1[-0.2,-1,1]})",
                   "SUM::sig(sig1frac[0.8,0.,1.]*sig1,sig2)",
                   "SUM::model(bkgfrac[0.5,0.,1.]*bkg,sig)"]
    ch.add_link(wsu)

    # --- 2. simulation: 1000 records of observable 'x' with pdf 'model'.
spark = proc_mgr.service(SparkManager).create_session(eskapade_settings=settings)

##########################################################################
# CSV and dataframe settings

# NB: local file may not be accessible to worker node in cluster mode
file_path = ['file:' + persistence.io_path('data', settings.io_conf(), 'dummy1.csv')]

##########################################################################
# Now set up the chains and links based on configuration flags

proc_mgr.add_chain('Read')

# create read link for each data file
read_link = spark_analysis.SparkDfReader(name='ReadFile',
                                         store_key='spark_df',
                                         read_methods=['csv'])

# set CSV read arguments
read_link.read_meth_args['csv'] = (file_path,)
read_link.read_meth_kwargs['csv'] = dict(sep='|',
                                         header=True,
                                         inferSchema=True)

# add link to chain
proc_mgr.get_chain('Read').add_link(read_link)
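# --- Aside (not part of the original macro): the call the link ends up issuing is
#     roughly the following plain-PySpark read, commented out so the macro is unchanged:
#
#     df = spark.read.csv(file_path, sep='|', header=True, inferSchema=True)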
settings['TESTING'] = False if 'TESTING' not in settings else settings['TESTING']

#########################################################################################
# --- fill the datastore with some example objects

ds = ProcessManager().service(DataStore)
ds['hello'] = 'world'
ds['d'] = {'a': 1, 'b': 2, 'c': 3}

#########################################################################################
# --- now set up the chains and links based on configuration flags

proc_mgr = ProcessManager()

ch = proc_mgr.add_chain('Overview')

# 1. printdatastore prints an overview of the contents in the datastore
#    at the state of executing the link.
#    The overview consists of a list of keys in the datastore and the object types.
link = core_ops.PrintDs(name='printer1')
# keys are the items for which the contents of the actual item is printed.
link.keys = ['hello', 'd']
ch.add_link(link)

# 2. This link will start an interactive ipython session.
#    from this session, one can access the datastore and the configobject with:
#    >>> ds
#    or
#    >>> settings
#    Try to add something to the datastore in this session!
        ts = pd.Timestamp(x.split()[0])
        return ts
    except Exception:
        pass
    return x

#########################################################################################
# --- now set up the chains and links, based on configuration flags

procMgr = ProcessManager()

# --- example 2: readdata loops over the input files, with file chunking.
if settings['do_loop']:
    ch = procMgr.add_chain('Data')

    # --- a loop is set up in the chain MyChain.
    #     we iterate over (chunks of) the next file in the list until the iterator is done.
    #     then move on to the next chain (Overview)

    # --- readdata keeps on opening the next 400 lines of the open or next file in the file list.
    #     all kwargs are passed on to the pandas file reader.
    readdata = analysis.ReadToDf(name='dflooper', key='rc', reader='csv')
    readdata.chunksize = chunksize
    readdata.path = input_files
    ch.add_link(readdata)

    # add conversion functions to "Data" chain
    # here, convert column 'registered', an integer, to an actual timestamp.
    conv_funcs = [{'func': to_date, 'colin': 'registered', 'colout': 'date'}]
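# --- Aside (not part of the original macro): the chunked looping above corresponds
#     to pandas' own chunked reader. A sketch, commented out so the macro is
#     unchanged ('process' is a hypothetical placeholder):
#
#     import pandas as pd
#     for chunk in pd.read_csv(input_files[0], chunksize=chunksize):
#         process(chunk)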
proc_mgr = ProcessManager()

settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'esk403_roodataset_convert'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

input_files = [os.environ['ESKAPADE'] + '/data/mock_accounts.csv.gz']

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = proc_mgr.add_chain('Data')

# --- 0. read the input data
#     all kwargs are passed on to the pandas file reader.
readdata = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv')
readdata.path = input_files
ch.add_link(readdata)

ch = proc_mgr.add_chain('Conversion1')

# --- 1. add the record factorizer
#     Here the categorical columns of the input dataset are factorized,
#     e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
#     x = [0, 1, 2, 0, 2]
#     By default, the mapping is stored in a dict under key: 'map_'+store_key+'_to_original'
fact = analysis.RecordFactorizer(name='rf1')
                                   schema.keys()))  # Spark data frame
ds['pd'] = pd.DataFrame(ds['rows'], columns=schema.keys())  # Pandas data frame

##########################################################################
# --- now set up the chains and links based on configuration flags

# define function to set number of partitions
def set_num_parts(df, max_num_parts):
    if df.rdd.getNumPartitions() > max_num_parts:
        df = df.repartition(max_num_parts)
    return df

# create chain and data-frame-creator links
chain = proc_mgr.add_chain('Create')
for ds_key, lnk_schema in zip(('rows', 'rdd', 'df', 'pd'),
                              (list(schema.keys()), schema, schema, None)):
    # create data-frame-creator link
    lnk = spark_analysis.SparkDfCreator(name='df_creator_{}'.format(ds_key),
                                        read_key=ds_key,
                                        store_key='{}_df'.format(ds_key),
                                        schema=lnk_schema,
                                        process_methods=['filter', set_num_parts, 'cache'])

    # set post-process-method arguments
    lnk.process_meth_args['filter'] = ('index > 19',)  # select rows with index > 19
    lnk.process_meth_kwargs[set_num_parts] = dict(max_num_parts=2)  # maximum of 2 partitions
                                                          schema=columns,
                                                          write_methods=['csv'],
                                                          num_files=num_files)

# create RDD-CSV-writer link
writers['rdd_csv_writer'] = spark_analysis.SparkDataToCsv(name='rdd_csv_writer',
                                                          read_key='rdd',
                                                          output_path='{}/rdd_csv'.format(output_dir),
                                                          mode='overwrite',
                                                          sep=separator,
                                                          header=columns if write_header else False,
                                                          num_files=num_files)

# set generic-writer arguments
for input_format in ('df', 'rdd'):
    key = '{}_generic_writer'.format(input_format)
    writers[key].write_meth_args['csv'] = ('{0:s}/{1:s}_generic'.format(output_dir, input_format),)
    writers[key].write_meth_kwargs['csv'] = dict(sep=separator,
                                                 header=write_header,
                                                 mode='overwrite')

# add links to chain
chain = proc_mgr.add_chain('Write')
for lnk in writers.values():
    chain.add_link(lnk)

##########################################################################

log.debug('Done parsing configuration file esk603_write_spark_data_to_csv')
#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['do_chain0'] = True
settings['do_chain1'] = True
settings['do_chain2'] = True

#########################################################################################
# --- now set up the chains and links based on configuration flags

# Three simple chains are set up.

proc_mgr = ProcessManager()

if settings['do_chain0']:
    ch = proc_mgr.add_chain('Chain0')
    link = core_ops.HelloWorld(name='hello0')
    link.hello = 'Town'
    ch.add_link(link)

# adding more chains is as easy as calling add_chain and passing a new name.
if settings['do_chain1']:
    ch = proc_mgr.add_chain('Chain1')
    link = core_ops.HelloWorld(name='hello1')
    link.hello = 'World'
    ch.add_link(link)

if settings['do_chain2']:
    ch = proc_mgr.add_chain('Chain2')
    link = core_ops.HelloWorld(name='hello2')
proc_mgr = ProcessManager()

settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'esk207_record_vectorizer'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

# --- Set path of data
data_path = persistence.io_path('data', settings.io_conf(), 'dummy.csv')

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch1 = proc_mgr.add_chain('MyChain1')

# --- read dummy dataset
readdata = analysis.ReadToDf(key='test1', sep='|', reader='csv', path=data_path)
ch1.add_link(readdata)

# --- print contents of the datastore
pds = core_ops.PrintDs(name='printer1')
pds.keys = ['test1']
ch1.add_link(pds)

# --- add the record vectorizer
#     Here the columns x and y of the input dataset are vectorized
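# --- Aside (not part of the original macro): 'vectorizing' a column here means
#     turning each category value into its own indicator column, as in this minimal
#     pandas sketch, commented out so the macro is unchanged:
#
#     import pandas as pd
#     df = pd.DataFrame({'x': [1, 2, 1]})
#     pd.get_dummies(df['x'], prefix='x')  # -> columns x_1, x_2 with 0/1 entries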