Example #1
# turn on/off the 2 examples 
settings['do_example1'] = True 
settings['do_example2'] = True 

#########################################################################################
# --- Set path of data
data_path = persistence.io_path('data', settings.io_conf(), 'dummy.csv')

#########################################################################################
# --- now set up the chains and links, based on configuration flags

proc_mgr = ProcessManager()

# --- example 1: readdata with one input file
if settings['do_example1']:
    ch1 = proc_mgr.add_chain('MyChain1')

    readdata = analysis.ReadToDf(key='test1', sep='|', reader='csv', path=data_path)
    ch1.add_link(readdata)

    # --- do something useful with the test dataset here ...



# --- example 2: readdata with default settings reads all three input files simultaneously.
#                all extra keyword arguments are passed on to the pandas reader.
if settings['do_example2']:
    ch2 = proc_mgr.add_chain('MyChain2')

    # --- a loop is set up in the chain MyChain.
    #     we iterate over (chunks of) the next file in the list until the iterator is done.
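
    # A minimal sketch of such a looping reader, based on the ReadToDf pattern
    # used in examples #12 and #17 below (the tripled file list and the chunk
    # size here are illustrative assumptions):
    readdata = analysis.ReadToDf(name='dflooper2', key='test2', sep='|', reader='csv')
    readdata.path = [data_path] * 3
    readdata.chunksize = 400
    ch2.add_link(readdata)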

Example #2

# --- minimal analysis information

proc_mgr = ProcessManager()

settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'esk408_classification_error_propagation_after_fit'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

#########################################################################################
# --- now set up the chains and links based on configuration flags

# --- generate pdf, simulate, fit, and plot
ch = proc_mgr.add_chain('WsOps')

# --- 1. simulate the output score of a machine learning classifier
wsu = root_analysis.WsUtils(name='DataSimulator')
wsu.factory = [
    # narrow Gaussian peak for the high-risk class
    "RooGaussian::high_risk(score[0,1],1,0.15)",
    # falling polynomial for the low-risk class
    "RooPolynomial::low_risk(score,{-0.3,-0.3})",
    # total model: fraction 'frac' of high_risk plus the rest low_risk
    "SUM::model(frac[0.1,0.,1.]*high_risk,low_risk)"
]
wsu.add_simulate(pdf='model', obs='score', num=500, key='data', into_ws=True)
wsu.add_fit(pdf='model', data='data', key='fit_result', into_ws=True)
wsu.add_plot(obs='score', data='data', pdf='model', key='simplot')
wsu.add_plot(obs='score', pdf='model',
             pdf_args=(RooFit.Components('low_risk'), RooFit.LineColor(ROOT.kRed),
                       RooFit.LineStyle(ROOT.kDashed)),
             file='data_with_generator_model.pdf', key='simplot')

Example #3

proc_mgr = ProcessManager()

settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'esk405_simulation_based_on_binned_data'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['high_num_dims'] = False

input_files = [os.environ['ESKAPADE'] + '/data/mock_accounts.csv.gz']

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = proc_mgr.add_chain('Data')

# --- 0. read input data
readdata = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv')
readdata.path = input_files
ch.add_link(readdata)

# --- 1. add the record factorizer
#     Here the columns isActive, eyeColor, favoriteFruit and gender of the input dataset are factorized
#     e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
#     x = [0, 1, 2, 0, 2]
#     By default, the mapping is stored in a dict under key: 'map_'+store_key+'_to_original'
fact = analysis.RecordFactorizer(name='rf1')
fact.columns = ['isActive', 'eyeColor', 'favoriteFruit', 'gender']
fact.read_key = 'accounts'
fact.inplace = True
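
# For intuition: the factorization above is similar to this standalone pandas
# sketch (illustrative only; whether RecordFactorizer uses pd.factorize
# internally is an assumption):
import pandas as pd

x = pd.Series(['apple', 'tree', 'pear', 'apple', 'pear'])
codes, uniques = pd.factorize(x)
# codes   -> array([0, 1, 2, 0, 2])
# uniques -> Index(['apple', 'tree', 'pear'], dtype='object')
mapping = dict(enumerate(uniques))  # {0: 'apple', 1: 'tree', 2: 'pear'}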
Example #4
proc_mgr = ProcessManager()

settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'esk208_record_factorizer'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

# --- Set path of data
data_path = persistence.io_path('data', settings.io_conf(), 'dummy.csv')

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch1 = proc_mgr.add_chain('Factorize')

# --- read dummy dataset
readdata = analysis.ReadToDf(key='test1',
                             sep='|',
                             reader='csv',
                             path=data_path)
ch1.add_link(readdata)

# --- print contents of the datastore
pds = core_ops.PrintDs(name='printer1')
pds.keys = ['test1']
ch1.add_link(pds)

# --- add the record factorizer
#     Here the columns dummy and loc of the input dataset are factorized
Example #5
    log.info('Successfully found ROOT class %s' % pdf_name)

#########################################################################################

msg = r"""
The plots and latex files produced by this tutorial can be found in dir:
%s
""" % (settings['resultsDir'] + '/' + settings['analysisName'] +
       '/data/v0/report/')
log.info(msg)

#########################################################################################
# --- now set up the chains and links based on configuration flags

# --- generate pdf, simulate, fit, and plot
ch = proc_mgr.add_chain('WsOps')

# --- 1. define a model by passing strings to the RooWorkspace factory
#     For the workspace factory syntax, see:
#     https://root.cern.ch/doc/master/RooFactoryWSTool_8cxx_source.html#l00722
#     For RooWorkspace factory examples see:
#     https://root.cern.ch/root/html/tutorials/roofit/rf511_wsfactory_basic.C.html
#     https://root.cern.ch/root/html/tutorials/roofit/rf512_wsfactory_oper.C.html
#     https://root.cern.ch/root/html/tutorials/roofit/rf513_wsfactory_tools.C.html
#     Here we use the pdf class we just created (MyPdfV3), with observable y and parameters A and B,
#     with ranges (-10,10), (0,100) and (-10,10) respectively. The starting values of A and B are
#     10 and 2 respectively.
wsu = root_analysis.WsUtils(name='modeller')
wsu.factory = ["MyPdfV3::testpdf(y[-10,10],A[10,0,100],B[2,-10,10])"]
ch.add_link(wsu)

Example #6

tmp = """
nan,3,bal,3,bla,bar,c,1
,nan,NaN,NaN,nan,nan,d,2
,,,,,,,3
1,2,,,,,,,6
"""

f = tempfile.NamedTemporaryFile(mode='w', delete=False)  # text mode, so writing str works in python 3
f.write(tmp)
f.close()
# the file is not deleted on close because we used delete=False;
# it is read back below via f.name

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = proc_mgr.add_chain('DataPrep')

# --- 0. pandas read_csv has multiple settings that help with reading in buggy CSVs.
#     o The option error_bad_lines=False skips lines with too few or too many values
#     o The option encoding='latin1' interprets most non-standard characters
readdata = analysis.ReadToDf(key='vrh',
                             reader='csv',
                             path=f.name,
                             error_bad_lines=False,
                             encoding='latin1')
ch.add_link(readdata)

# --- 1. standard settings:
#     o convert all nans to np.nan (= float)
#     o convert all rows in a column to the most occurring datatype in that column
fixer = data_quality.FixPandasDataFrame(name='fixer1')
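
# For intuition: the standard fixing behaviour is roughly equivalent to this
# standalone pandas sketch (illustrative only; FixPandasDataFrame's internals
# may differ):
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': ['1', 2, None, '4']})
col = df['a'].fillna(np.nan)                 # unify all missing values as np.nan
types = col.dropna().map(type)               # datatype of each non-null entry
most_common = types.value_counts().idxmax()  # most occurring datatype in the column
df['a'] = col.map(lambda v: v if pd.isna(v) else most_common(v))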
Example #7
COLUMNS = ['var_a', 'var_b', 'var_c']
SIZE = 10000
VAR_LABELS = dict(var_a='Variable A', var_b='Variable B', var_c='Variable C')
VAR_UNITS = dict(var_b='m/s')
GEN_CONF = dict(var_b=dict(mean=42., std=2.),
                var_c=dict(mean=42, std=2, dtype=int))

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create process manager
proc_mgr = ProcessManager()

# create chains
proc_mgr.add_chain('Data')
proc_mgr.add_chain('Summary')

# add data-generator link to "Data" chain
generator = analysis.BasicGenerator(name='Generate_data',
                                    key='data',
                                    columns=COLUMNS,
                                    size=SIZE,
                                    gen_config=GEN_CONF)
proc_mgr.get_chain('Data').add_link(generator)

# add data-frame summary link to "Summary" chain
# can provide labels and units for the variables in the dataset
summarizer = visualization.DfSummary(name='Create_stats_overview',
                                     read_key=generator.key,
                                     var_labels=VAR_LABELS,
                                     var_units=VAR_UNITS)
proc_mgr.get_chain('Summary').add_link(summarizer)
Example #8
sm.spark_streaming_context = ssc

# define data stream
ds = proc_mgr.service(DataStore)

if not stream_type or stream_type == 'file':
    ds['dstream'] = ssc.textFileStream('/tmp/eskapade_stream_test/')
elif stream_type == 'tcp':
    ds['dstream'] = ssc.socketTextStream('localhost', 9999)
else:
    log.error('unsupported stream_type specified: {}'.format(stream_type))

##########################################################################
# --- now set up the chains and links based on configuration flags

proc_mgr.add_chain('SparkStreaming')

# the word count example
wordcount_link = spark_analysis.SparkStreamingWordCount(
    name='SparkStreamingWordCount', read_key='dstream', store_key='wordcounts')
proc_mgr.get_chain('SparkStreaming').add_link(wordcount_link)

# store output
writer_link = spark_analysis.SparkStreamingWriter(
    name='SparkStreamingWriter',
    read_key=wordcount_link.store_key,
    output_path='file:' +
    persistence.io_dir('results_data', settings.io_conf()) +
    '/dstream/wordcount',
    mode='overwrite',
    suffix='txt',

Example #9

                      var_range=(0., MAX_AGE),
                      var=('redeem_age', 0.),
                      max_var=('age', MAX_AGE),
                      exp=[('rate_fast', FAST_REDEEM_RATE),
                           ('rate_slow', SLOW_REDEEM_RATE)],
                      fracs=[('frac_fast', FAST_FRAC)])
    model.build_model()
    model.var.SetTitle('Redeem age')
    model.max_var.SetTitle('Age')
    model.var.setUnit('days')
    model.max_var.setUnit('days')

###############################################################################
# --- create chain for generating voucher redeem data

ch = proc_mgr.add_chain('Generation')
gen_link = TruncExpGen(name='Generate',
                       store_key=REDEEM_DATA_KEY,
                       max_var_data_key=AGE_DATA_KEY,
                       model_name=MODEL_NAME,
                       event_frac=REDEEM_FRAC)
ch.add_link(gen_link)

np.random.seed(settings['seeds']['NumPy'])
ROOT.RooRandom.randomGenerator().SetSeed(settings['seeds']['RooFit'])

###############################################################################
# --- create chain for fitting voucher redeem model to generated data

ch = proc_mgr.add_chain('Fitting')
fit_link = TruncExpFit(name='Fit',
Example #10
COLUMNS = ['var_a', 'var_b', 'var_c']
SIZE = 10000
VAR_LABELS = dict(var_a='Variable A', var_b='Variable B', var_c='Variable C')
VAR_UNITS = dict(var_b='m/s')
GEN_CONF = dict(var_a=dict(choice=['alpha', 'beta', 'gamma'], dtype=str), var_b=dict(mean=3., std=1.),
                var_c=dict(choice=['delta', 'epsilon', 'zeta', 'eta'], dtype=str))

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create process manager
proc_mgr = ProcessManager()

# create chains
proc_mgr.add_chain('Data')
proc_mgr.add_chain('BoxPlot')

# add data-generator link to "Data" chain

generator = analysis.BasicGenerator(name='Generate_data',
                                    key='data',
                                    columns=COLUMNS,
                                    size=SIZE,
                                    gen_config=GEN_CONF)
proc_mgr.get_chain('Data').add_link(generator)

# add data-frame boxplot link to "BoxPlot" chain
# can provide labels and units for the variables in the dataset, and set the statistics to print in the output file
boxplot = visualization.DfBoxplot(name='Create_stats_overview',
                                  read_key=generator.key,
Example #11
def firstword(x):
    return x.split()[0]
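

# tolower is referenced in the mapper chain below; its original definition is
# truncated from this snippet. A minimal version matching the comment there
# ("lines are converted to lower chars") would be:
def tolower(x):
    return x.lower()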


#########################################################################################
# --- now set up the chains and links based on configuration flags

# This chain does 'mapping'. (macro B does 'reduction'.)

proc_mgr = ProcessManager()

# --- mapper: chain with event looper
#     this eventlooper link serves as a mapper.
#     in this example each line is converted to lower case and its first word is selected.
if settings['do_map']:
    ch = proc_mgr.add_chain("Mapper")
    looper = core_ops.EventLooper(name='listener')
    looper.skip_line_beginning_with = ['#']
    looper.line_processor_set = [firstword, tolower]
    if settings['TESTING']:
        looper.filename = f.name
    ch.add_link(looper)

# --- reducer: chain with event looper
#     this eventlooper link serves as a reducer
#     in this example the lines are grouped together into unique sets.
if settings['do_reduce']:
    ch = proc_mgr.add_chain("Reducer")
    looper = core_ops.EventLooper(name='grouper')
    # reducer selects all unique lines
    looper.sort = True
Example #12
settings['do_readdata'] = True
settings['do_writedata'] = True

#########################################################################################
# --- Set path of data
data_path = persistence.io_path('data', settings.io_conf(), 'dummy.csv')

#########################################################################################
# --- now set up the chains and links based on configuration flags

proc_mgr = ProcessManager()

# --- readdata with default settings reads all three input files simultaneously.
#     all extra keyword arguments are passed on to the pandas reader.
if settings['do_readdata']:
    ch = proc_mgr.add_chain('ReadData')

    # --- readdata keeps on opening the next file in the file list.
    #     all kwargs are passed on to pandas file reader.
    readdata = analysis.ReadToDf(name='reader',
                                 key='test',
                                 sep='|',
                                 reader='csv',
                                 path=[data_path] * 3)
    ch.add_link(readdata)

if settings['do_writedata']:
    ch = proc_mgr.add_chain('WriteData')

    # --- writedata needs a specified output format ('writer' argument).
    #     if this is not set, the format is determined from the filename extension.
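
    # A sketch of the writer link itself; the WriteFromDf name and signature
    # below are assumptions based on other Eskapade macros, not confirmed by
    # this (truncated) snippet:
    writedata = analysis.WriteFromDf(name='writer', key='test',
                                     path='tmp3.csv', writer='csv')
    ch.add_link(writedata)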
Example #13
# turn on/off the example
settings['do_example'] = True

#########################################################################################
# --- now set up the chains and links, based on configuration flags

proc_mgr = ProcessManager()

# --- example loops over the first chain 10 times.

if settings['do_example']:
    # --- a loop is set up in the chain MyChain.
    #     we iterate over the chain until the link RepeatChain is done,
    #     then move on to the next chain (Overview).
    ch = proc_mgr.add_chain('MyChain')

    link = core_ops.HelloWorld(name='HelloWorld')
    link.set_log_level(logging.DEBUG)
    ch.add_link(link)

    # --- this link sends out a signal to repeat the execution of the chain.
    #     It serves as the 'continue' statement of the loop:
    #     go back to the start of the chain until the counter reaches 10.
    repeater = core_ops.RepeatChain()
    # repeat max of 10 times
    repeater.maxcount = 10
    repeater.set_log_level(logging.DEBUG)
    ch.add_link(repeater)

# --- print contents of the datastore.

Example #14

proc_mgr = ProcessManager()

settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'esk404_workspace_createpdf_simulate_fit_plot'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['generate_fit_plot'] = True
settings['summary'] = True

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['generate_fit_plot']:
    # --- generate pdf, simulate, fit, and plot
    ch = proc_mgr.add_chain('WsOps')

    # --- 1. define a model by passing strings to the RooWorkspace factory
    #     for details on the RooWorkspace factory see:
    #     https://root.cern.ch/root/html/tutorials/roofit/rf511_wsfactory_basic.C.html
    wsu = root_analysis.WsUtils(name='modeller')
    wsu.factory = [
        # two Gaussian signal components sharing the same mean
        "Gaussian::sig1(x[-10,10],mean[5,0,10],0.5)",
        "Gaussian::sig2(x,mean,1)",
        # Chebychev polynomial background
        "Chebychev::bkg(x,{a0[0.5,0.,1],a1[-0.2,-1,1]})",
        # composite signal, then the full signal-plus-background model
        "SUM::sig(sig1frac[0.8,0.,1.]*sig1,sig2)",
        "SUM::model(bkgfrac[0.5,0.,1.]*bkg,sig)"
    ]
    ch.add_link(wsu)

    # --- 2. simulation: 1000 records of observable 'x' with pdf 'model'.
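    #     A sketch of the corresponding call, mirroring the add_simulate
    #     signature used in Example #2 above:
    wsu.add_simulate(pdf='model', obs='x', num=1000, key='data', into_ws=True)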
Example #15
spark = proc_mgr.service(SparkManager).create_session(
    eskapade_settings=settings)

##########################################################################
# CSV and dataframe settings

# NB: a local file may not be accessible to worker nodes in cluster mode
file_path = [
    'file:' + persistence.io_path('data', settings.io_conf(), 'dummy1.csv')
]

##########################################################################
# Now set up the chains and links based on configuration flags

proc_mgr.add_chain('Read')

# create read link for each data file
read_link = spark_analysis.SparkDfReader(name='ReadFile',
                                         store_key='spark_df',
                                         read_methods=['csv'])

# set CSV read arguments
read_link.read_meth_args['csv'] = (file_path, )
read_link.read_meth_kwargs['csv'] = dict(sep='|',
                                         header=True,
                                         inferSchema=True)

# add link to chain
proc_mgr.get_chain('Read').add_link(read_link)
Example #16
if 'TESTING' not in settings:
    settings['TESTING'] = False

#########################################################################################
# --- fill the datastore with some example objects

ds = ProcessManager().service(DataStore)
ds['hello'] = 'world'
ds['d'] = {'a': 1, 'b': 2, 'c': 3}

#########################################################################################
# --- now set up the chains and links based on configuration flags

proc_mgr = ProcessManager()

ch = proc_mgr.add_chain('Overview')

# 1. PrintDs prints an overview of the contents of the datastore
# at the moment the link executes.
# The overview consists of the list of keys in the datastore and the object types.
link = core_ops.PrintDs(name='printer1')
# keys are the items whose full contents are printed as well.
link.keys = ['hello', 'd']
ch.add_link(link)

# 2. This link starts an interactive ipython session.
# From this session one can access the datastore and the configobject with:
# >>> ds
# or
# >>> settings
# Try to add something to the datastore in this session!
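
# A hedged sketch of such a link: core_ops.IPythonEmbed is an assumption based
# on other Eskapade macros; the original call is truncated from this snippet.
link = core_ops.IPythonEmbed()
ch.add_link(link)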
Example #17
def to_date(x):
    """Convert the first word of x to a pandas Timestamp; return x unchanged on failure."""
    try:
        ts = pd.Timestamp(x.split()[0])
        return ts
    except Exception:
        pass
    return x


#########################################################################################
# --- now set up the chains and links, based on configuration flags

procMgr = ProcessManager()

# --- example 2: readdata loops over the input files, with file chunking.

if settings['do_loop']:
    ch = procMgr.add_chain('Data')

    # --- a loop is set up in the chain MyChain.
    #     we iterate over (chunks of) the next file in the list until the iterator is done.
    #     then move on to the next chain (Overview)

    # --- readdata keeps on opening the next 400 lines of the open or next file in the file list.
    #     all kwargs are passed on to the pandas file reader.
    chunksize = 400
    readdata = analysis.ReadToDf(name='dflooper', key='rc', reader='csv')
    readdata.chunksize = chunksize
    readdata.path = input_files
    ch.add_link(readdata)

    # add conversion functions to "Data" chain
    # here, convert column 'registered', an integer, to an actual timestamp.
    conv_funcs = [{'func': to_date, 'colin': 'registered', 'colout': 'date'}]
Example #18
proc_mgr = ProcessManager()

settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'esk403_roodataset_convert'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

input_files = [os.environ['ESKAPADE'] + '/data/mock_accounts.csv.gz']

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = proc_mgr.add_chain('Data')

# --- 0. read the input data
#     all kwargs are passed on to pandas file reader.
readdata = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv')
readdata.path = input_files
ch.add_link(readdata)

ch = proc_mgr.add_chain('Conversion1')

# --- 1. add the record factorizer
#     Here the columns dummy and loc of the input dataset are factorized
#     e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
#     x = [0, 1, 2, 0, 2]
#     By default, the mapping is stored in a dict under key: 'map_'+store_key+'_to_original'
fact = analysis.RecordFactorizer(name='rf1')
Example #19
    schema.keys()))  # Spark data frame
ds['pd'] = pd.DataFrame(ds['rows'], columns=schema.keys())  # Pandas data frame

##########################################################################
# --- now set up the chains and links based on configuration flags


# define function to set number of partitions
def set_num_parts(df, max_num_parts):
    if df.rdd.getNumPartitions() > max_num_parts:
        df = df.repartition(max_num_parts)
    return df


# create chain and data-frame-creator links
chain = proc_mgr.add_chain('Create')
for ds_key, lnk_schema in zip(('rows', 'rdd', 'df', 'pd'),
                              (list(schema.keys()), schema, schema, None)):
    # create data-frame-creator link
    lnk = spark_analysis.SparkDfCreator(
        name='df_creator_{}'.format(ds_key),
        read_key=ds_key,
        store_key='{}_df'.format(ds_key),
        schema=lnk_schema,
        process_methods=['filter', set_num_parts, 'cache'])

    # set post-process-method arguments
    lnk.process_meth_args['filter'] = ('index > 19',)  # select rows with index > 19
    lnk.process_meth_kwargs[set_num_parts] = dict(max_num_parts=2)  # max 2 partitions

Example #20

                                                             schema=columns,
                                                             write_methods=['csv'],
                                                             num_files=num_files)

# create RDD-CSV-writer link
writers['rdd_csv_writer'] = spark_analysis.SparkDataToCsv(name='rdd_csv_writer',
                                                          read_key='rdd',
                                                          output_path='{}/rdd_csv'.format(output_dir),
                                                          mode='overwrite',
                                                          sep=separator,
                                                          header=columns if write_header else False,
                                                          num_files=num_files)

# set generic-writer arguments
for input_format in ('df', 'rdd'):
    key = '{}_generic_writer'.format(input_format)
    writers[key].write_meth_args['csv'] = ('{0:s}/{1:s}_generic'.format(output_dir, input_format),)
    writers[key].write_meth_kwargs['csv'] = dict(sep=separator,
                                                 header=write_header,
                                                 mode='overwrite')

# add links to chain
chain = proc_mgr.add_chain('Write')
for lnk in writers.values():
    chain.add_link(lnk)


##########################################################################

log.debug('Done parsing configuration file esk603_read_csv_to_spark_df')
Example #21
#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['do_chain0'] = True
settings['do_chain1'] = True
settings['do_chain2'] = True

#########################################################################################
# --- now set up the chains and links based on configuration flags

# Three simple chains are set up.

proc_mgr = ProcessManager()

if settings['do_chain0']:
    ch = proc_mgr.add_chain('Chain0')
    link = core_ops.HelloWorld(name='hello0')
    link.hello = 'Town'
    ch.add_link(link)

# adding more chains is as easy as calling add_chain and passing a new name.

if settings['do_chain1']:
    ch = proc_mgr.add_chain('Chain1')
    link = core_ops.HelloWorld(name='hello1')
    link.hello = 'World'
    ch.add_link(link)

if settings['do_chain2']:
    ch = proc_mgr.add_chain('Chain2')
    link = core_ops.HelloWorld(name='hello2')
Example #22
proc_mgr = ProcessManager()

settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'esk207_record_vectorizer'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

# --- Set path of data
data_path = persistence.io_path('data', settings.io_conf(), 'dummy.csv')

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch1 = proc_mgr.add_chain('MyChain1')

# --- read dummy dataset
readdata = analysis.ReadToDf(key='test1',
                             sep='|',
                             reader='csv',
                             path=data_path)
ch1.add_link(readdata)

# --- print contents of the datastore
pds = core_ops.PrintDs(name='printer1')
pds.keys = ['test1']
ch1.add_link(pds)

# --- add the record vectorizer
#     Here the columns x and y of the input dataset are vectorized
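
# For intuition: vectorizing categorical columns resembles one-hot encoding in
# pandas (illustrative only; RecordVectorizer's exact behaviour is not shown in
# this truncated snippet):
import pandas as pd

df = pd.DataFrame({'x': [1, 2, 1], 'y': ['a', 'b', 'a']})
dummies = pd.get_dummies(df, columns=['x', 'y'])
# columns become x_1, x_2, y_a, y_b: one indicator column per observed value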