# by default all set to false, unless already configured in
# configobject or vars()

# Switch for the looping example below; it is set to True unconditionally here.
settings['do_example'] = True

#########################################################################################
# --- now set up the chains and links, based on configuration flags

# --- The example re-runs the first chain ('MyChain'), at most 10 times.
if settings['do_example']:
    # Chain that is looped over: it is executed again and again until the
    # RepeatChain link below is done; processing then continues with the
    # next chain.
    ch = Chain('MyChain')

    # First link in the chain, logging at debug level.
    link = core_ops.HelloWorld(name='HelloWorld')
    link.logger.log_level = LogLevel.DEBUG
    ch.add(link)

    # Last link in the chain: it signals that the chain has to be executed
    # once more, i.e. it acts as the 'continue' statement of the loop.
    repeater = core_ops.RepeatChain()
    repeater.logger.log_level = LogLevel.DEBUG
    # Upper limit on the number of repetitions.
    repeater.maxcount = 10
    ch.add(repeater)

# --- print contents of the datastore.
#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk410_testing_correlations_between_categories'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

# List of input files handed to the reader link below (a single fixture here).
input_files = [resources.fixture('mock_accounts.csv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('Data')

# --- 0. Reader link: reads the csv input and stores it under key 'accounts'.
#        All keyword arguments are passed on to the pandas file reader.
read_data = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv')
read_data.path = input_files
ch.add(read_data)

# --- 1. Record factorizer: converts categorical observables into integers.
#        The columns 'eyeColor' and 'favoriteFruit' of the data read above
#        are factorized, e.g.
#            x = ['apple', 'tree', 'pear', 'apple', 'pear']
#        becomes the column
#            x = [0, 1, 2, 0, 2]
#        By default, the mapping is stored in a dict under key:
#        'map_'+store_key+'_to_original'
fact = analysis.RecordFactorizer(name='rf1')
fact.read_key = read_data.key
fact.columns = ['eyeColor', 'favoriteFruit']  # ['Obs_*']
settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk405_simulation_based_on_binned_data'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['high_num_dims'] = False

# List of input files handed to the reader link below (a single fixture here).
input_files = [resources.fixture('mock_accounts.csv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('Data')

# --- 0. Reader link: reads the csv input and stores it under key 'accounts'.
read_data = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv')
read_data.path = input_files
ch.add(read_data)

# --- 1. Record factorizer, applied to the 'accounts' data read above.
#        The four columns listed below are factorized, e.g.
#            x = ['apple', 'tree', 'pear', 'apple', 'pear']
#        becomes the column
#            x = [0, 1, 2, 0, 2]
#        By default, the mapping is stored in a dict under key:
#        'map_'+store_key+'_to_original'
fact = analysis.RecordFactorizer(name='rf1')
fact.read_key = 'accounts'
fact.inplace = True
fact.columns = ['isActive', 'eyeColor', 'favoriteFruit', 'gender']
} process_meth_kwargs = { 'df': { set_num_parts: dict(max_num_parts=2) }, 'rdd': {}, 'list': { filter_list: dict(min_index=20) }, 'pd': { filter_pd: dict(min_index=20) } } # create chain and data-frame-creator links chain = Chain('Create') for out_format in process_methods: # create data-frame-conversion link lnk = spark_analysis.SparkDfConverter( name='df_to_{}_converter'.format(out_format), read_key='df', store_key='{}_output'.format(out_format), schema_key='{}_schema'.format(out_format), output_format=out_format, preserve_col_names=False, process_methods=process_methods[out_format], process_meth_args=process_meth_args[out_format], process_meth_kwargs=process_meth_kwargs[out_format]) # add link to chain chain.add(lnk)
import pandas as pd try: ts = pd.Timestamp(x.split()[0]) x = ts except Exception: logger.warning('Date conversion failed!') return x ######################################################################################### # --- now set up the chains and links, based on configuration flags # --- example 2: readdata loops over the input files, with file chunking. if settings['do_loop']: ch = Chain('Data') # --- a loop is set up in the chain MyChain. # we iterate over (chunks of) the next file in the list until the iterator is done. # then move on to the next chain (Overview) # --- readdata keeps on opening the next 400 lines of the open or next file in the file list. # all kwargs are passed on to pandas file reader. read_data = analysis.ReadToDf(name='dflooper', key='rc', reader='csv') read_data.chunksize = chunk_size read_data.path = input_files ch.add(read_data) # add conversion functions to "Data" chain # here, convert column 'registered', an integer, to an actual timestamp. conv_funcs = [{'func': to_date, 'colin': 'registered', 'colout': 'date'}]
data_path=settings['resultsDir'] + '/' + settings['analysisName'] + '/data/v0/', conf_path=settings['resultsDir'] + '/' + settings['analysisName'] + '/config/v0/') # dummy information used in this macro, added to each chain below. f = {'hello': 'world', 'v': [3, 1, 4, 1, 5], 'n_favorite': 7} g = {'a': 1, 'b': 2, 'c': 3} h = [2, 7] ######################################################################################### # --- now set up the chains and links based on configuration flags ######### # chain 1 ch = Chain('chain1') # the link ToDsDict adds objects to the datastore at link execution. link = core_ops.ToDsDict(name='intods_1') link.store_key = 'f' link.obj = f ch.add(link) # print contents of datastore link = core_ops.PrintDs() ch.add(link) ######### # chain 2 ch = Chain('chain2')
""" logger.info(msg, path=settings['resultsDir'] + '/' + settings['analysisName'] + '/data/v0/report/') COLUMNS = ['var_a', 'var_b', 'var_c'] SIZE = 10000 VAR_LABELS = dict(var_a='Variable A', var_b='Variable B', var_c='Variable C') VAR_UNITS = dict(var_b='m/s') GEN_CONF = dict(var_b=dict(mean=42., std=2.), var_c=dict(mean=42, std=2, dtype=int)) ######################################################################################### # --- now set up the chains and links based on configuration flags data = Chain('Data') # add data-generator link to "Data" chain generator = analysis.BasicGenerator(name='Generate_data', key='data', columns=COLUMNS, size=SIZE, gen_config=GEN_CONF) data.add(generator) # add data-frame summary link to "Summary" chain # can provide labels and units for the variables in the dataset summary = Chain('Summary') summarizer = visualization.DfSummary(name='Create_stats_overview', read_key=generator.key, var_labels=VAR_LABELS, var_units=VAR_UNITS)
######################################################################################### # --- Analysis values, settings, helper functions, configuration flags. settings['input_path'] = resources.fixture('correlated_data.sv.gz') settings['reader'] = 'csv' settings['separator'] = ' ' settings['correlations'] = [ 'pearson', 'kendall', 'spearman', 'correlation_ratio' ] ######################################################################################### # --- now set up the chains and links based on configuration flags # create chains data = Chain('Data') # load data reader = analysis.ReadToDf(name='reader', path=settings['input_path'], sep=settings['separator'], key='input_data', reader=settings['reader']) data.add(reader) # make visualizations of correlations summary = Chain('Summary') corr_link = visualization.CorrelationSummary(name='correlation_summary', read_key='input_data',
#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk101_helloworld'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

# Flags like the ones below turn chains (with their links) on or off.
# NOTE: both values are assigned unconditionally here.
settings['do_hello'] = True
settings['n_repeat'] = 2

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['do_hello']:
    hello = Chain(name='Hello')

    # Single HelloWorld link, logging at debug level; its 'repeat' attribute
    # is taken from the 'n_repeat' setting.
    link = core_ops.HelloWorld(name='HelloWorld')
    link.repeat = settings['n_repeat']
    link.logger.log_level = LogLevel.DEBUG
    hello.add(link)

#########################################################################################

logger.debug('Done parsing configuration file esk101_helloworld')
###############################################################################

# Message template; the {path} placeholder is filled in by the logger call.
msg = r""" The plots and latex files produced by this tutorial can be found in dir: {path} """
logger.info(
    msg,
    path='/'.join([settings['resultsDir'], settings['analysisName'], 'data/v0/report/']))

###############################################################################
# --- now set up the chains and links based on configuration flags

# --- generate pdf, simulate, fit, and plot
ch = Chain('WsOps')

# --- 1. define a model by passing strings to the rooworkspace factory
#     For the workspace factory syntax, see:
#     https://root.cern.ch/doc/master/RooFactoryWSTool_8cxx_source.html#l00722
#     For rooworkspace factory examples see:
#     https://root.cern.ch/root/html/tutorials/roofit/rf511_wsfactory_basic.C.html
#     https://root.cern.ch/root/html/tutorials/roofit/rf512_wsfactory_oper.C.html
#     https://root.cern.ch/root/html/tutorials/roofit/rf513_wsfactory_tools.C.html
#     The pdf class named by pdf_name is used with observable y and
#     parameters A and B, with ranges (-10,10), (0,100) and (-10,10)
#     respectively. The starting values of A and B are 10 and 2.
wsu = WsUtils(name='modeller')
factory_template = '{pdf}::testpdf(y[-10,10],A[10,0,100],B[2,-10,10])'
wsu.factory = [factory_template.format(pdf=pdf_name)]
logger.info(msg, path=persistence.io_path('results_data', 'report')) COLUMNS = ['var_a', 'var_b', 'var_c'] SIZE = 10000 VAR_LABELS = dict(var_a='Variable A', var_b='Variable B', var_c='Variable C') VAR_UNITS = dict(var_b='m/s') GEN_CONF = dict(var_a=dict(choice=['alpha', 'beta', 'gamma'], dtype=str), var_b=dict(mean=3., std=1.), var_c=dict(choice=['delta', 'epsilon', 'zeta', 'eta'], dtype=str)) ######################################################################################### # --- now set up the chains and links based on configuration flags # create chains data = Chain('Data') # add data-generator link to "Data" chain generator = analysis.BasicGenerator(name='Generate_data', key='data', columns=COLUMNS, size=SIZE, gen_config=GEN_CONF) data.add(generator) # add data-frame summary link to "Boxplot" chain # can provide labels and units for the variables in the dataset, and set the statistics to print in output file plot = Chain('BoxPlot') box_plot = visualization.DfBoxplot( name='Create_stats_overview', read_key=generator.key,
# --- now set up the chains and links based on configuration flags

# Reader link: starts out with the 'csv' read method only and stores its
# result under key 'spark_df'.
read_link = spark_analysis.SparkDfReader(
    name='Reader',
    store_key='spark_df',
    read_methods=['csv'],
)

# Positional and keyword arguments for the 'csv' read method.
read_link.read_meth_args['csv'] = (file_paths,)
read_link.read_meth_kwargs['csv'] = {
    'sep': separator,
    'header': has_header,
    'inferSchema': infer_schema,
}

# Optional extra steps, each appended only when its setting is non-empty.
if columns:
    # column selection
    read_link.read_methods.append('select')
    read_link.read_meth_args['select'] = tuple(columns)

if num_partitions:
    # repartitioning
    read_link.read_methods.append('repartition')
    read_link.read_meth_args['repartition'] = (num_partitions,)

# Chain holding the reader link.
read = Chain('Read')
read.add(read_link)

##########################################################################

logger.debug('Done parsing configuration file esk602_read_csv_to_spark_df.')
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk207_record_vectorizer'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

# --- Location of the input dataset
data_path = resources.fixture('dummy.csv')

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch1 = Chain('MyChain1')

# --- Reader link: '|'-separated csv input, stored under key 'test1'.
read_data = analysis.ReadToDf(
    key='test1',
    sep='|',
    reader='csv',
    path=data_path,
)
ch1.add(read_data)

# --- Printer link, restricted to the key written by the reader above.
pds = core_ops.PrintDs(name='printer1')
pds.keys = ['test1']
ch1.add(pds)

# --- add the record vectorizer
#     Here the columns x and y of the input dataset are vectorized
# --- METHOD 1: configuration file

# Create the Spark session from the Eskapade settings and log the resulting
# Spark configuration.
spark = sm.create_session(eskapade_settings=settings)
sc = spark.sparkContext

logger.info('---> METHOD 1: configuration file')
logger.info(str(sc.getConf().getAll()))

##########################################################################
# --- METHOD 2: link

# Configure Spark through a link instead; the settings are given as
# (key, value) pairs.
conf_link = SparkConfigurator(name='SparkConfigurator', log_level='WARN')
conf_link.spark_settings = [
    ('spark.app.name', settings['analysisName'] + '_link'),
    ('spark.master', 'local[42]'),
    ('spark.driver.host', '127.0.0.1'),
]

config = Chain('Config')
config.add(conf_link)

logger.info('---> METHOD 2: link')
logger.info('NB: settings will be printed at time of link execution.')

##########################################################################
# --- running spark session will be stopped automatically at end

###########################################################################
# --- the end

logger.debug('Done parsing configuration file esk601_spark_configuration.')
# Flags that switch the individual processing steps on or off.
settings['generate'] = True
# settings['read_data'] = not settings['generate']
settings['model'] = True
settings['process'] = True
settings['fit_plot'] = True
settings['summary'] = True

# Name of one of the pdfs defined in the factory list below.
fitpdf = 'sum3pdf'
n_percentile_bins = 300

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['model']:
    # --- generate pdf
    ch = Chain('Model')

    # --- 1. define a model
    #     Four Weibull pdfs in observable t (wb1..wb4), followed by three
    #     sums combining the first two, three and four of them.
    wsu = root_analysis.WsUtils(name='modeller')
    factory = [
        "RooWeibull::wb1(t[0,110000000],a1[0.93,0,2],b1[2.2e-4,0,1e-3])",
        "RooWeibull::wb2(t,a2[0.61,0,2],b2[1.1e-5,0,1e-3])",
        "RooWeibull::wb3(t,a3[0.43,0,2],b3[4.7e-7,0,1e-3])",
        "RooWeibull::wb4(t,a4[0.43,0,2],b4[2.2e-7,0,1e-3])",
        "SUM::sum2pdf(N1[580000,0,2e6]*wb1,N2[895000,0,2e6]*wb2)",
        "SUM::sum3pdf(N1[580000,0,2e6]*wb1,N2[895000,0,2e6]*wb2,N3[150500,0,2e6]*wb3)",
        "SUM::sum4pdf(N1[580000,0,2e6]*wb1,N2[895000,0,2e6]*wb2,N3[150500,0,2e6]*wb3,N4[1e5,0,2e6]*wb4)",
    ]
    # Extend (rather than replace) the factory commands already on the link.
    wsu.factory += factory
    ch.add(wsu)
settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk404_workspace_createpdf_simulate_fit_plot'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['generate_fit_plot'] = True
settings['summary'] = True

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['generate_fit_plot']:
    # --- generate pdf, simulate, fit, and plot
    ch = Chain('WsOps')

    # --- 1. define a model by passing strings to the rooworkspace factory
    #     For the workspace factory syntax, see:
    #     https://root.cern.ch/doc/master/RooFactoryWSTool_8cxx_source.html#l00722
    #     For rooworkspace factory examples see:
    #     https://root.cern.ch/root/html/tutorials/roofit/rf511_wsfactory_basic.C.html
    #     https://root.cern.ch/root/html/tutorials/roofit/rf512_wsfactory_oper.C.html
    #     https://root.cern.ch/root/html/tutorials/roofit/rf513_wsfactory_tools.C.html
    wsu = root_analysis.WsUtils(name='modeller')
    wsu.factory = [
        # two Gaussians sharing observable x and parameter mean
        "Gaussian::sig1(x[-10,10],mean[5,0,10],0.5)",
        "Gaussian::sig2(x,mean,1)",
        # Chebychev background
        "Chebychev::bkg(x,{a0[0.5,0.,1],a1[-0.2,-1,1]})",
        # sums: signal, then the full model
        "SUM::sig(sig1frac[0.8,0.,1.]*sig1,sig2)",
        "SUM::model(bkgfrac[0.5,0.,1.]*bkg,sig)",
    ]
    ch.add(wsu)
#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk306_concatenate_reports'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

# Path of the (single) input fixture; note this is a plain path, not a list.
input_files = resources.fixture('correlated_data.sv.gz')

#########################################################################################
# --- now set up the chains and links based on configuration flags

data = Chain('Data')

# --- 0. Reader link: space-separated csv input, stored under key 'accounts'.
#        All keyword arguments are passed on to the pandas file reader.
read_data = analysis.ReadToDf(
    name='dflooper',
    key='accounts',
    reader='csv',
    sep=' ',
)
read_data.path = input_files
data.add(read_data)

# --- 1. Data-frame summary link, added to the same "Data" chain. It reads
#        the data stored by the reader; 'report_pages' is passed as pages_key.
summarizer = visualization.DfSummary(
    name='Create_stats_overview',
    read_key=read_data.key,
    pages_key='report_pages',
)
data.add(summarizer)
#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk408_classification_error_propagation_after_fit'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

#########################################################################################
# --- now set up the chains and links based on configuration flags

# --- generate pdf, simulate, fit, and plot
ch = Chain('WsOps')

# 1. simulate output score of machine learning classifier
wsu = root_analysis.WsUtils(name='DataSimulator')
wsu.factory = [
    "RooGaussian::high_risk(score[0,1],1,0.15)",
    "RooPolynomial::low_risk(score,{-0.3,-0.3})",
    "SUM::model(frac[0.1,0.,1.]*high_risk,low_risk)",
]

# Simulate 500 'score' records from the model and fit the model to them.
wsu.add_simulate(pdf='model', obs='score', num=500, key='data', into_ws=True)
wsu.add_fit(pdf='model', data='data', key='fit_result', into_ws=True)

# Two plot requests under the same key 'simplot': first the data with the
# full model, then the 'low_risk' component as a red dashed line.
# NOTE(review): presumably both are drawn in the same frame - confirm.
wsu.add_plot(obs='score', data='data', pdf='model', key='simplot')
low_risk_style = (
    RooFit.Components('low_risk'),
    RooFit.LineColor(ROOT.kRed),
    RooFit.LineStyle(ROOT.kDashed),
)
wsu.add_plot(
    obs='score',
    pdf='model',
    pdf_args=low_risk_style,
    output_file='data_with_generator_model.pdf',
    key='simplot',
)
ch.add(wsu)
#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['read_data'] = True
settings['generate'] = True
settings['make_plot'] = True
settings['high_num_dims'] = False

# List of input files handed to the reader link below (a single fixture here).
input_files = [resources.fixture('correlated_data.sv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['read_data']:
    ch = Chain('Data')

    # --- 0. Reader link: space-separated csv input, stored under key
    #        'correlated_data'.
    read_data = analysis.ReadToDf(
        name='reader',
        key='correlated_data',
        reader='csv',
        sep=' ',
    )
    read_data.path = input_files
    ch.add(read_data)

    # --- 1. convert into a roofit dataset (roodataset)
    #        build a KEYS pdf out of the dataset as well
    #        The result is stored under the reader's key prefixed with 'rds_'.
    df2rds = root_analysis.ConvertDataFrame2RooDataSet()
    df2rds.read_key = read_data.key
    df2rds.store_key = 'rds_{}'.format(read_data.key)
    df2rds.store_key_vars = 'keys_varset'
#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk402_roodatahist_fill'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

# List of input files handed to the reader link below (a single fixture here).
input_files = [resources.fixture('mock_accounts.csv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('Data')

# --- 0. Reader link: reads the csv input and stores it under key 'accounts'.
#        All keyword arguments are passed on to the pandas file reader.
read_data = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv')
read_data.path = input_files
# readdata.itr_over_files = True
ch.add(read_data)

# --- 1. Record factorizer: converts categorical observables into integers.
#        The four columns listed below are factorized, e.g.
#            x = ['apple', 'tree', 'pear', 'apple', 'pear']
#        becomes the column
#            x = [0, 1, 2, 0, 2]
#        By default, the mapping is stored in a dict under key:
#        'map_'+store_key+'_to_original'
fact = analysis.RecordFactorizer(name='rf1')
fact.columns = [
    'isActive',
    'eyeColor',
    'favoriteFruit',
    'gender',
]
# Define the data stream and put it in the data store under key 'dstream'.
ds = process_manager.service(DataStore)
if stream_type == 'tcp':
    # text stream from a local TCP socket
    ds['dstream'] = ssc.socketTextStream('localhost', 9999)
elif not stream_type or stream_type == 'file':
    # text files appearing in a directory; also used when no type is given
    ds['dstream'] = ssc.textFileStream('/tmp/eskapade_stream_test/')
else:
    # Only an error is logged; no 'dstream' entry is created in this case.
    logger.error('unsupported stream_type specified: {type}.', type=stream_type)

##########################################################################
# --- now set up the chains and links based on configuration flags

spark_streaming = Chain('SparkStreaming')

# Word-count link: reads 'dstream' and stores its result as 'wordcounts'.
wordcount_link = SparkStreamingWordCount(
    name='SparkStreamingWordCount',
    read_key='dstream',
    store_key='wordcounts',
)
spark_streaming.add(wordcount_link)

# Writer link: takes the word-count output and writes it below the
# 'results_data' IO path.
writer_link = SparkStreamingWriter(
    name='SparkStreamingWriter',
    read_key=wordcount_link.store_key,
    output_path='file:' + persistence.io_path('results_data', '/dstream/wordcount'),
    suffix='txt',
    repartition=1,
)
##########################################################################
# Start Spark session

spark_manager = process_manager.service(SparkManager)
spark = spark_manager.create_session(eskapade_settings=settings)

##########################################################################
# CSV and dataframe settings

# NB: local file may not be accessible to worker node in cluster mode
# One-element list with the 'file:'-prefixed path of the input fixture.
file_path = ['file:' + resources.fixture('dummy1.csv')]

##########################################################################
# Now set up the chains and links based on configuration flags

read = Chain('Read')

# Reader link using the 'csv' read method; output goes to key 'spark_df'.
read_link = SparkDfReader(
    name='ReadFile',
    store_key='spark_df',
    read_methods=['csv'],
)

# Positional and keyword arguments for the 'csv' read method.
read_link.read_meth_args['csv'] = (file_path,)
read_link.read_meth_kwargs['csv'] = {
    'sep': '|',
    'header': True,
    'inferSchema': True,
}

# Register the reader with the chain.
read.add(read_link)
sum_bar = sum(r['bar'] for r in rows) return [r + (sum_bar, ) for r in rows] ########################################################################## # --- input data ds = process_manager.service(DataStore) rows = [(it, 'foo{:d}'.format(it), (it + 1) / 2.) for it in range(100)] ds['df'] = spark.createDataFrame(rows, schema=['index', 'foo', 'bar']) ########################################################################## # --- now set up the chains and links based on configuration flags # create chain chain = Chain('Map') # create a link to convert the data frame into an RDD conv_lnk = spark_analysis.SparkDfConverter(name='DfConverter', read_key='df', store_key='rdd', output_format='rdd', preserve_col_names=True) chain.add(conv_lnk) # create a link to calculate the sum of "bar" for each group of ten rows map_lnk = spark_analysis.RddGroupMapper(name='Mapper', read_key='rdd', store_key='map_rdd', group_map=sum, input_map=lambda r:
#########################################################################################

# when chunking through an input file, pick up only N lines in each iteration.
chunk_size = 5

#########################################################################################
# --- Set path of data
data_path = resources.fixture('dummy.csv')

#########################################################################################
# --- now set up the chains and links, based on configuration flags

# --- example 1: readdata loops over the input files, but no file chunking.
#     The example runs unless the 'do_example1' setting disables it.
if settings.get('do_example1', True):
    ch = Chain('MyChain1')

    # --- A loop is set up in this chain: the reader moves on to the next
    #     file in its list on each pass until the iterator is done, after
    #     which processing continues with the next chain.
    #     All keyword arguments are passed on to the pandas file reader.
    read_data = analysis.ReadToDf(
        name='dflooper1',
        key='test1',
        sep='|',
        reader='csv',
        usecols=['x', 'y'],
    )
    # The same file is listed three times.
    read_data.path = [data_path, data_path, data_path]
    read_data.itr_over_files = True
    ch.add(read_data)
spark_manager = process_manager.service(SparkManager)
spark = spark_manager.create_session(eskapade_settings=settings)

##########################################################################
# CSV and dataframe settings

# NB: local file may not be accessible to worker node in cluster mode
file_paths = [
    'file:' + resources.fixture('dummy1.csv'),
    'file:' + resources.fixture('dummy2.csv'),
]

# Data-store keys, one per input file (same order as file_paths).
STORE_KEYS = ['spark_df1', 'spark_df2']

##########################################################################
# Now set up the chains and links based on configuration flags

read = Chain('Read')

# One reader link per input file, named 'Reader1', 'Reader2', ...
for number, (path, key) in enumerate(zip(file_paths, STORE_KEYS), start=1):
    read_link = SparkDfReader(
        name='Reader{}'.format(number),
        store_key=key,
        read_methods=['csv'],
    )

    # Positional and keyword arguments for the 'csv' read method.
    read_link.read_meth_args['csv'] = (path,)
    read_link.read_meth_kwargs['csv'] = {
        'sep': '|',
        'header': True,
        'inferSchema': True,
    }

    # Register the reader with the chain.
    read.add(read_link)

# SQL-query link; its result is stored under key 'spark_df_sql'.
sql_link = SparkExecuteQuery(name='SparkSQL', store_key='spark_df_sql')
logger.debug( 'Now parsing configuration file esk407_classification_unbiased_fit_estimate' ) ######################################################################################### # --- minimal analysis information settings = process_manager.service(ConfigObject) settings['analysisName'] = 'esk407_classification_unbiased_fit_estimate' settings['version'] = 0 ######################################################################################### # --- now set up the chains and links based on configuration flags # --- generate pdf, simulate, fit, and plot ch = Chain('WsOps') # 1. simulate output score of machine learning classifier wsu = WsUtils(name='DataSimulator') wsu.factory = [ "expr::trans('@0-@1',{score[0,1],1})", "RooExponential::high_risk(trans,10)", "RooPolynomial::low_risk(score,{-0.4,-0.4})", "SUM::model(low_risk_frac[0.95,0.,1.]*low_risk,high_risk)" ] wsu.add_simulate(pdf='high_risk', obs='score', num=1000, key='unbiased_high_risk_testdata', into_ws=True) wsu.add_simulate(pdf='low_risk',
from eskapade import data_mimic from eskapade import process_manager from eskapade.logger import Logger, LogLevel logger = Logger() logger.debug('Now parsing configuration file esk703_mimic_data') ######################################################################################### # --- minimal analysis information settings = process_manager.service(ConfigObject) settings['analysisName'] = 'esk703_mimic_data' settings['version'] = 0 np.random.seed(42) ch = Chain('DataPrep') ch.logger.log_level = LogLevel.DEBUG sim_data = data_mimic.MixedVariablesSimulation(store_key='df', n_obs=100000, p_unordered=np.array( [[0.2, 0.2, 0.3, 0.3], [0.3, 0.7]])) sim_data.logger.log_level = LogLevel.DEBUG ch.add(sim_data) pre_data = data_mimic.KDEPreparation( read_key='df', data_store_key='data', data_smoothed_store_key='data_smoothed',
#########################################################################################

# when chunking through an input file, pick up only N lines in each iteration.
chunk_size = 5

#########################################################################################
# --- Set path of data
data_path = resources.fixture('dummy.csv')

#########################################################################################
# --- now set up the chains and links, based on configuration flags

# --- example 2: readdata loops over the input files, with file chunking.
#     The example runs unless the 'do_example2' setting disables it.
if settings.get('do_example2', True):
    ch = Chain('MyChain2')
    ch.n_fork = 10

    # --- A loop is set up in this chain: on each pass the reader picks up
    #     the next chunk_size lines of the open file, or of the next file in
    #     its list, until the iterator is done; processing then continues
    #     with the next chain.
    #     All keyword arguments are passed on to the pandas file reader.
    read_data = analysis.ReadToDf(
        name='dflooper2',
        key='test2',
        sep='|',
        reader='csv',
        usecols=['x', 'y'],
        chunksize=chunk_size,
    )
    # The same file is listed three times.
    read_data.path = [data_path, data_path, data_path]
    ch.add(read_data)

    # --- do something useful with the test dataset here ...
    #     e.g. apply selections, or collect into histograms.
var_range=(0., MAX_AGE), var=('redeem_age', 0.), max_var=('age', MAX_AGE), exp=[('rate_fast', FAST_REDEEM_RATE), ('rate_slow', SLOW_REDEEM_RATE)], fracs=[('frac_fast', FAST_FRAC)]) model.build_model() model.var.SetTitle('Redeem age') model.max_var.SetTitle('Age') model.var.setUnit('days') model.max_var.setUnit('days') ############################################################################### # --- create chain for generating voucher redeem data ch = Chain('Generation') gen_link = TruncExpGen(name='Generate', store_key=REDEEM_DATA_KEY, max_var_data_key=AGE_DATA_KEY, model_name=MODEL_NAME, event_frac=REDEEM_FRAC) ch.add(gen_link) np.random.seed(settings['seeds']['NumPy']) ROOT.RooRandom.randomGenerator().SetSeed(settings['seeds']['RooFit']) ############################################################################### # --- create chain for fitting voucher redeem model to generated data ch = Chain('Fitting') fit_link = TruncExpFit(name='Fit',
nan,3,bal,3,bla,bar,c,1 ,nan,NaN,NaN,nan,nan,d,2 ,,,,,,,3 1,2,,,,,,,6 """ f = tempfile.NamedTemporaryFile(delete=False) f.write(tmp) f.close() # file is not immediately deleted because we used delete=False # used below with f.name ######################################################################################### # --- now set up the chains and links based on configuration flags ch = Chain('DataPrep') # --- 0. pandas read_csv has multiple settings to help reading in of buggy csv's. # o The option error_bad_lines=False skips lines with too few or too many values # o The option encoding='latin1' interprets most non-standard characters read_data = analysis.ReadToDf(key='vrh', reader='csv', path=f.name, error_bad_lines=False, encoding='latin1') ch.add(read_data) # --- 1. standard setting: # o convert all nans to np.nan (= float) # o convert all rows in a column to most occuring datatype in that column fixer = data_quality.FixPandasDataFrame(name='fixer1')