Python Chain примеры, eskapade.Chain Python примеры использования

Пример #1

0

Показать файл

Файл: esk107_chain_looper.py Проект: evoloji/Eskapade

#     by default all set to false, unless already configured in
#     configobject or vars()

# turn on/off the example
settings['do_example'] = True

#########################################################################################
# --- now set up the chains and links, based on configuration flags

# --- example loops over the first chain 10 times.

if settings['do_example']:
    # --- a loop is set up in the chain MyChain.
    #     we iterate over the chain until the link RepeatChain is done.
    #     then move on to the next chain (Overview)
    ch = Chain('MyChain')

    link = core_ops.HelloWorld(name='HelloWorld')
    link.logger.log_level = LogLevel.DEBUG
    ch.add(link)

    # --- this link sends out a signal to repeat the execution of the chain.
    #     It serves as the 'continue' statement of the loop.
    #     go back to start of the chain until counter reaches 10.
    repeater = core_ops.RepeatChain()
    # repeat max of 10 times
    repeater.maxcount = 10
    repeater.logger.log_level = LogLevel.DEBUG
    ch.add(repeater)

# --- print contents of the datastore.

Пример #2

0

Показать файл

Файл: esk410_testing_correlations_between_categories.py Проект: evoloji/Eskapade

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk410_testing_correlations_between_categories'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

input_files = [resources.fixture('mock_accounts.csv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('Data')

# --- 0. readdata keeps on opening the next file in the file list.
#     all kwargs are passed on to pandas file reader.
read_data = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv')
read_data.path = input_files
ch.add(read_data)

# --- 1. add the record factorizer to convert categorical observables into integers
#     Here the columns eyeColor and favoriteFruit of the input dataset are factorized
#     e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
#     x = [0, 1, 2, 0, 2]
#     By default, the mapping is stored in a dict under key: 'map_'+store_key+'_to_original'
fact = analysis.RecordFactorizer(name='rf1')
fact.columns = ['eyeColor', 'favoriteFruit']  # ['Obs_*']
fact.read_key = read_data.key

Пример #3

0

Показать файл

Файл: esk405_simulation_based_on_binned_data.py Проект: evoloji/Eskapade

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk405_simulation_based_on_binned_data'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['high_num_dims'] = False

input_files = [resources.fixture('mock_accounts.csv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('Data')

# --- 0. read input data
read_data = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv')
read_data.path = input_files
ch.add(read_data)

# --- 1. add the record factorizer
#     Here the columns isActive, eyeColor, favoriteFruit and gender of the input dataset are factorized
#     e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
#     x = [0, 1, 2, 0, 2]
#     By default, the mapping is stored in a dict under key: 'map_'+store_key+'_to_original'
fact = analysis.RecordFactorizer(name='rf1')
fact.columns = ['isActive', 'eyeColor', 'favoriteFruit', 'gender']
fact.read_key = 'accounts'
fact.inplace = True

Пример #4

0

Показать файл

}
process_meth_kwargs = {
    'df': {
        set_num_parts: dict(max_num_parts=2)
    },
    'rdd': {},
    'list': {
        filter_list: dict(min_index=20)
    },
    'pd': {
        filter_pd: dict(min_index=20)
    }
}

# create chain and data-frame-creator links
chain = Chain('Create')
for out_format in process_methods:
    # create data-frame-conversion link
    lnk = spark_analysis.SparkDfConverter(
        name='df_to_{}_converter'.format(out_format),
        read_key='df',
        store_key='{}_output'.format(out_format),
        schema_key='{}_schema'.format(out_format),
        output_format=out_format,
        preserve_col_names=False,
        process_methods=process_methods[out_format],
        process_meth_args=process_meth_args[out_format],
        process_meth_kwargs=process_meth_kwargs[out_format])

    # add link to chain
    chain.add(lnk)

Пример #5

0

Показать файл

    import pandas as pd
    try:
        ts = pd.Timestamp(x.split()[0])
        x = ts
    except Exception:
        logger.warning('Date conversion failed!')
    return x


#########################################################################################
# --- now set up the chains and links, based on configuration flags

# --- example 2: readdata loops over the input files, with file chunking.

if settings['do_loop']:
    ch = Chain('Data')

    # --- a loop is set up in the chain Data.
    #     we iterate over (chunks of) the next file in the list until the iterator is done.
    #     then move on to the next chain (Overview)

    # --- readdata keeps on opening the next chunk_size lines of the open or next file in the file list.
    #     all kwargs are passed on to pandas file reader.
    read_data = analysis.ReadToDf(name='dflooper', key='rc', reader='csv')
    read_data.chunksize = chunk_size
    read_data.path = input_files
    ch.add(read_data)

    # add conversion functions to "Data" chain
    # here, convert column 'registered', an integer, to an actual timestamp.
    conv_funcs = [{'func': to_date, 'colin': 'registered', 'colout': 'date'}]

Пример #6

0

Показать файл

Файл: esk105_datastore_pickling.py Проект: evoloji/Eskapade

            data_path=settings['resultsDir'] + '/' + settings['analysisName'] +
            '/data/v0/',
            conf_path=settings['resultsDir'] + '/' + settings['analysisName'] +
            '/config/v0/')

# dummy information used in this macro, added to each chain below.
f = {'hello': 'world', 'v': [3, 1, 4, 1, 5], 'n_favorite': 7}
g = {'a': 1, 'b': 2, 'c': 3}
h = [2, 7]

#########################################################################################
# --- now set up the chains and links based on configuration flags

#########
# chain 1
ch = Chain('chain1')

# the link ToDsDict adds objects to the datastore at link execution.
link = core_ops.ToDsDict(name='intods_1')
link.store_key = 'f'
link.obj = f
ch.add(link)

# print contents of datastore
link = core_ops.PrintDs()
ch.add(link)

#########
# chain 2
ch = Chain('chain2')

Пример #7

0

Показать файл

"""
logger.info(msg,
            path=settings['resultsDir'] + '/' + settings['analysisName'] +
            '/data/v0/report/')

COLUMNS = ['var_a', 'var_b', 'var_c']
SIZE = 10000
VAR_LABELS = dict(var_a='Variable A', var_b='Variable B', var_c='Variable C')
VAR_UNITS = dict(var_b='m/s')
GEN_CONF = dict(var_b=dict(mean=42., std=2.),
                var_c=dict(mean=42, std=2, dtype=int))

#########################################################################################
# --- now set up the chains and links based on configuration flags

data = Chain('Data')
# add data-generator link to "Data" chain
generator = analysis.BasicGenerator(name='Generate_data',
                                    key='data',
                                    columns=COLUMNS,
                                    size=SIZE,
                                    gen_config=GEN_CONF)
data.add(generator)

# add data-frame summary link to "Summary" chain
# can provide labels and units for the variables in the dataset
summary = Chain('Summary')
summarizer = visualization.DfSummary(name='Create_stats_overview',
                                     read_key=generator.key,
                                     var_labels=VAR_LABELS,
                                     var_units=VAR_UNITS)

Пример #8

0

Показать файл

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['input_path'] = resources.fixture('correlated_data.sv.gz')
settings['reader'] = 'csv'
settings['separator'] = ' '
settings['correlations'] = [
    'pearson', 'kendall', 'spearman', 'correlation_ratio'
]

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create chains
data = Chain('Data')

# load data
reader = analysis.ReadToDf(name='reader',
                           path=settings['input_path'],
                           sep=settings['separator'],
                           key='input_data',
                           reader=settings['reader'])

data.add(reader)

# make visualizations of correlations
summary = Chain('Summary')

corr_link = visualization.CorrelationSummary(name='correlation_summary',
                                             read_key='input_data',

Пример #9

0

Показать файл

Файл: esk101_helloworld.py Проект: evoloji/Eskapade

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk101_helloworld'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

#     E.g. define flags to turn certain chains with links on or off.
#     by default all set to false, unless already configured in
#     configobject or vars()

settings['do_hello'] = True
settings['n_repeat'] = 2

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['do_hello']:
    hello = Chain(name='Hello')
    link = core_ops.HelloWorld(name='HelloWorld')
    link.logger.log_level = LogLevel.DEBUG
    link.repeat = settings['n_repeat']
    hello.add(link)

#########################################################################################

logger.debug('Done parsing configuration file esk101_helloworld')

Пример #10

0

Показать файл

###############################################################################

msg = r"""
The plots and latex files produced by this tutorial can be found in dir:
{path}
"""
logger.info(msg,
            path=settings['resultsDir'] + '/' + settings['analysisName'] +
            '/data/v0/report/')

###############################################################################
# --- now set up the chains and links based on configuration flags

# --- generate pdf, simulate, fit, and plot
ch = Chain('WsOps')

# --- 1. define a model by passing strings to the rooworkspace factory
#     For the workspace factory syntax, see:
#     https://root.cern.ch/doc/master/RooFactoryWSTool_8cxx_source.html#l00722
#     For rooworkspace factory examples see:
#     https://root.cern.ch/root/html/tutorials/roofit/rf511_wsfactory_basic.C.html
#     https://root.cern.ch/root/html/tutorials/roofit/rf512_wsfactory_oper.C.html
#     https://root.cern.ch/root/html/tutorials/roofit/rf513_wsfactory_tools.C.html
#     Here we use the pdf class we just created (MyPdfV3), with observable y and parameter A and B,
#     with ranges (-10,10), (0,100) and (-10,10) respectively. The starting values of A and B are
#     10 and 2 respectively.
wsu = WsUtils(name='modeller')
wsu.factory = [
    '{pdf}::testpdf(y[-10,10],A[10,0,100],B[2,-10,10])'.format(pdf=pdf_name)
]

Пример #11

0

Показать файл

logger.info(msg, path=persistence.io_path('results_data', 'report'))

COLUMNS = ['var_a', 'var_b', 'var_c']
SIZE = 10000
VAR_LABELS = dict(var_a='Variable A', var_b='Variable B', var_c='Variable C')
VAR_UNITS = dict(var_b='m/s')
GEN_CONF = dict(var_a=dict(choice=['alpha', 'beta', 'gamma'], dtype=str),
                var_b=dict(mean=3., std=1.),
                var_c=dict(choice=['delta', 'epsilon', 'zeta', 'eta'],
                           dtype=str))

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create chains
data = Chain('Data')

# add data-generator link to "Data" chain
generator = analysis.BasicGenerator(name='Generate_data',
                                    key='data',
                                    columns=COLUMNS,
                                    size=SIZE,
                                    gen_config=GEN_CONF)
data.add(generator)

# add data-frame box-plot link to "BoxPlot" chain
# can provide labels and units for the variables in the dataset, and set the statistics to print in output file
plot = Chain('BoxPlot')
box_plot = visualization.DfBoxplot(
    name='Create_stats_overview',
    read_key=generator.key,

Пример #12

0

Показать файл

Файл: esk602_read_csv_to_spark_df.py Проект: evoloji/Eskapade

# --- now set up the chains and links based on configuration flags

# create read link
read_link = spark_analysis.SparkDfReader(name='Reader',
                                         store_key='spark_df',
                                         read_methods=['csv'])

# set CSV read arguments
read_link.read_meth_args['csv'] = (file_paths,)
read_link.read_meth_kwargs['csv'] = dict(sep=separator,
                                         header=has_header,
                                         inferSchema=infer_schema)

if columns:
    # add select function
    read_link.read_methods.append('select')
    read_link.read_meth_args['select'] = tuple(columns)

if num_partitions:
    # add repartition function
    read_link.read_methods.append('repartition')
    read_link.read_meth_args['repartition'] = (num_partitions,)

# add link to chain
read = Chain('Read')
read.add(read_link)

##########################################################################

logger.debug('Done parsing configuration file esk602_read_csv_to_spark_df.')

Пример #13

0

Показать файл

Файл: esk207_record_vectorizer.py Проект: mbaak/Eskapade

# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk207_record_vectorizer'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

# --- Set path of data
data_path = resources.fixture('dummy.csv')

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch1 = Chain('MyChain1')

# --- read dummy dataset
read_data = analysis.ReadToDf(key='test1',
                              sep='|',
                              reader='csv',
                              path=data_path)
ch1.add(read_data)

# --- print contents of the datastore
pds = core_ops.PrintDs(name='printer1')
pds.keys = ['test1']
ch1.add(pds)

# --- add the record vectorizer
#     Here the columns x and y of the input dataset are vectorized

Пример #14

0

Показать файл

Файл: esk601_spark_configuration.py Проект: mbaak/Eskapade-Spark

# --- METHOD 1: configuration file

spark = sm.create_session(eskapade_settings=settings)
sc = spark.sparkContext

logger.info('---> METHOD 1: configuration file')
logger.info(str(sc.getConf().getAll()))

##########################################################################
# --- METHOD 2: link

conf_link = SparkConfigurator(name='SparkConfigurator', log_level='WARN')
conf_link.spark_settings = [('spark.app.name',
                             settings['analysisName'] + '_link'),
                            ('spark.master', 'local[42]'),
                            ('spark.driver.host', '127.0.0.1')]

config = Chain('Config')
config.add(conf_link)

logger.info('---> METHOD 2: link')
logger.info('NB: settings will be printed at time of link execution.')

##########################################################################
# --- running spark session will be stopped automatically at end

###########################################################################
# --- the end

logger.debug('Done parsing configuration file esk601_spark_configuration.')

Пример #15

0

Показать файл

settings['generate'] = True
# settings['read_data'] = not settings['generate']
settings['model'] = True
settings['process'] = True
settings['fit_plot'] = True
settings['summary'] = True

fitpdf = 'sum3pdf'
n_percentile_bins = 300

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['model']:
    # --- generate pdf
    ch = Chain('Model')

    # --- 1. define a model
    wsu = root_analysis.WsUtils(name='modeller')
    factory = [
        "RooWeibull::wb1(t[0,110000000],a1[0.93,0,2],b1[2.2e-4,0,1e-3])",
        "RooWeibull::wb2(t,a2[0.61,0,2],b2[1.1e-5,0,1e-3])",
        "RooWeibull::wb3(t,a3[0.43,0,2],b3[4.7e-7,0,1e-3])",
        "RooWeibull::wb4(t,a4[0.43,0,2],b4[2.2e-7,0,1e-3])",
        "SUM::sum2pdf(N1[580000,0,2e6]*wb1,N2[895000,0,2e6]*wb2)",
        "SUM::sum3pdf(N1[580000,0,2e6]*wb1,N2[895000,0,2e6]*wb2,N3[150500,0,2e6]*wb3)",
        "SUM::sum4pdf(N1[580000,0,2e6]*wb1,N2[895000,0,2e6]*wb2,N3[150500,0,2e6]*wb3,N4[1e5,0,2e6]*wb4)"
    ]
    wsu.factory += factory
    ch.add(wsu)

Пример #16

0

Показать файл

Файл: esk404_workspace_createpdf_simulate_fit_plot.py Проект: evoloji/Eskapade

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk404_workspace_createpdf_simulate_fit_plot'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['generate_fit_plot'] = True
settings['summary'] = True

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['generate_fit_plot']:
    # --- generate pdf, simulate, fit, and plot
    ch = Chain('WsOps')

    # --- 1. define a model by passing strings to the rooworkspace factory
    #     For the workspace factory syntax, see:
    #     https://root.cern.ch/doc/master/RooFactoryWSTool_8cxx_source.html#l00722
    #     For rooworkspace factory examples see:
    #     https://root.cern.ch/root/html/tutorials/roofit/rf511_wsfactory_basic.C.html
    #     https://root.cern.ch/root/html/tutorials/roofit/rf512_wsfactory_oper.C.html
    #     https://root.cern.ch/root/html/tutorials/roofit/rf513_wsfactory_tools.C.html
    wsu = root_analysis.WsUtils(name='modeller')
    wsu.factory = ["Gaussian::sig1(x[-10,10],mean[5,0,10],0.5)",
                   "Gaussian::sig2(x,mean,1)",
                   "Chebychev::bkg(x,{a0[0.5,0.,1],a1[-0.2,-1,1]})",
                   "SUM::sig(sig1frac[0.8,0.,1.]*sig1,sig2)",
                   "SUM::model(bkgfrac[0.5,0.,1.]*bkg,sig)"]
    ch.add(wsu)

Пример #17

0

Показать файл

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk306_concatenate_reports'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

input_files = resources.fixture('correlated_data.sv.gz')

#########################################################################################
# --- now set up the chains and links based on configuration flags

data = Chain('Data')

# --- 0. readdata keeps on opening the next file in the file list.
#     all kwargs are passed on to pandas file reader.
read_data = analysis.ReadToDf(name='dflooper',
                              key='accounts',
                              reader='csv',
                              sep=' ')
read_data.path = input_files
data.add(read_data)

# --- 1. add data-frame summary link to "Data" chain
summarizer = visualization.DfSummary(name='Create_stats_overview',
                                     read_key=read_data.key,
                                     pages_key='report_pages')
data.add(summarizer)

Пример #18

0

Показать файл

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk408_classification_error_propagation_after_fit'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

#########################################################################################
# --- now set up the chains and links based on configuration flags

# --- generate pdf, simulate, fit, and plot
ch = Chain('WsOps')

# 1. simulate output score of machine learning classifier
wsu = root_analysis.WsUtils(name='DataSimulator')
wsu.factory = ["RooGaussian::high_risk(score[0,1],1,0.15)",
               "RooPolynomial::low_risk(score,{-0.3,-0.3})",
               "SUM::model(frac[0.1,0.,1.]*high_risk,low_risk)"]
wsu.add_simulate(pdf='model', obs='score', num=500, key='data', into_ws=True)
wsu.add_fit(pdf='model', data='data', key='fit_result', into_ws=True)
wsu.add_plot(obs='score', data='data', pdf='model', key='simplot')
wsu.add_plot(obs='score', pdf='model',
             pdf_args=(RooFit.Components('low_risk'), RooFit.LineColor(ROOT.kRed),
                       RooFit.LineStyle(ROOT.kDashed)),
             output_file='data_with_generator_model.pdf', key='simplot')
ch.add(wsu)

Пример #19

0

Показать файл

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['read_data'] = True
settings['generate'] = True
settings['make_plot'] = True
settings['high_num_dims'] = False

input_files = [resources.fixture('correlated_data.sv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['read_data']:
    ch = Chain('Data')

    # --- 0. read the input dataset
    read_data = analysis.ReadToDf(name='reader',
                                  key='correlated_data',
                                  reader='csv',
                                  sep=' ')
    read_data.path = input_files
    ch.add(read_data)

    # --- 1. convert into a roofit dataset (roodataset)
    #        build a KEYS pdf out of the dataset as well
    df2rds = root_analysis.ConvertDataFrame2RooDataSet()
    df2rds.read_key = read_data.key
    df2rds.store_key = 'rds_' + read_data.key
    df2rds.store_key_vars = 'keys_varset'

Пример #20

0

Показать файл

Файл: esk402_roodatahist_fill.py Проект: mbaak/Eskapade-ROOT

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk402_roodatahist_fill'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

input_files = [resources.fixture('mock_accounts.csv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('Data')

# --- 0. readdata keeps on opening the next file in the file list.
#     all kwargs are passed on to pandas file reader.
read_data = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv')
read_data.path = input_files
# readdata.itr_over_files = True
ch.add(read_data)

# --- 1. add the record factorizer to convert categorical observables into integers
#     Here the columns isActive, eyeColor, favoriteFruit and gender of the input dataset are factorized
#     e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
#     x = [0, 1, 2, 0, 2]
#     By default, the mapping is stored in a dict under key: 'map_'+store_key+'_to_original'
fact = analysis.RecordFactorizer(name='rf1')
fact.columns = ['isActive', 'eyeColor', 'favoriteFruit', 'gender']

Пример #21

0

Показать файл

Файл: esk610_spark_streaming_wordcount.py Проект: mbaak/Eskapade-Spark

# Define the input data stream and store it in the data store under the
# key 'dstream'.
ds = process_manager.service(DataStore)

# An unset/empty stream_type is handled the same way as 'file'.
if not stream_type or stream_type == 'file':
    # text-file stream created on the path /tmp/eskapade_stream_test/
    ds['dstream'] = ssc.textFileStream('/tmp/eskapade_stream_test/')
elif stream_type == 'tcp':
    # socket text stream created for localhost, port 9999
    ds['dstream'] = ssc.socketTextStream('localhost', 9999)
else:
    # NOTE(review): this branch only logs the problem; ds['dstream'] is not
    # assigned here, so links reading the 'dstream' key afterwards will
    # presumably fail - confirm whether parsing should be aborted instead.
    logger.error('unsupported stream_type specified: {type}.',
                 type=stream_type)

##########################################################################
# --- now set up the chains and links based on configuration flags

spark_streaming = Chain('SparkStreaming')

# the word count example
wordcount_link = SparkStreamingWordCount(name='SparkStreamingWordCount',
                                         read_key='dstream',
                                         store_key='wordcounts')
spark_streaming.add(wordcount_link)

# store output
writer_link = SparkStreamingWriter(
    name='SparkStreamingWriter',
    read_key=wordcount_link.store_key,
    output_path='file:' +
    persistence.io_path('results_data', '/dstream/wordcount'),
    suffix='txt',
    repartition=1)

Пример #22

0

Показать файл

Файл: esk607_spark_with_column.py Проект: mbaak/Eskapade-Spark

##########################################################################
# Start Spark session

spark = process_manager.service(SparkManager).create_session(
    eskapade_settings=settings)

##########################################################################
# CSV and dataframe settings

# NB: local file may not be accessible to worker node in cluster mode
file_path = ['file:' + resources.fixture('dummy1.csv')]

##########################################################################
# Now set up the chains and links based on configuration flags

read = Chain('Read')

# create read link for the data file
read_link = SparkDfReader(name='ReadFile',
                          store_key='spark_df',
                          read_methods=['csv'])

# set CSV read arguments
read_link.read_meth_args['csv'] = (file_path, )
read_link.read_meth_kwargs['csv'] = dict(sep='|',
                                         header=True,
                                         inferSchema=True)

# add link to chain
read.add(read_link)

Пример #23

0

Показать файл

    sum_bar = sum(r['bar'] for r in rows)
    return [r + (sum_bar, ) for r in rows]


##########################################################################
# --- input data

ds = process_manager.service(DataStore)
rows = [(it, 'foo{:d}'.format(it), (it + 1) / 2.) for it in range(100)]
ds['df'] = spark.createDataFrame(rows, schema=['index', 'foo', 'bar'])

##########################################################################
# --- now set up the chains and links based on configuration flags

# create chain
chain = Chain('Map')

# create a link to convert the data frame into an RDD
conv_lnk = spark_analysis.SparkDfConverter(name='DfConverter',
                                           read_key='df',
                                           store_key='rdd',
                                           output_format='rdd',
                                           preserve_col_names=True)
chain.add(conv_lnk)

# create a link to calculate the sum of "bar" for each group of ten rows
map_lnk = spark_analysis.RddGroupMapper(name='Mapper',
                                        read_key='rdd',
                                        store_key='map_rdd',
                                        group_map=sum,
                                        input_map=lambda r:

Пример #24

0

Показать файл

Файл: esk209_read_big_data_itr.py Проект: mbaak/Eskapade

#########################################################################################

# when chunking through an input file, pick up only N lines in each iteration.
chunk_size = 5

#########################################################################################
# --- Set path of data
data_path = resources.fixture('dummy.csv')

#########################################################################################
# --- now set up the chains and links, based on configuration flags

# --- example 1: readdata loops over the input files, but no file chunking.

if settings.get('do_example1', True):
    ch = Chain('MyChain1')

    # --- a loop is set up in the chain MyChain1.
    #     we iterate over (chunks of) the next file in the list until the iterator is done.
    #     then move on to the next chain (Overview)

    # --- readdata keeps on opening the next file in the file list.
    #     all kwargs are passed on to pandas file reader.
    read_data = analysis.ReadToDf(name='dflooper1',
                                  key='test1',
                                  sep='|',
                                  reader='csv',
                                  usecols=['x', 'y'])
    read_data.path = [data_path] * 3
    read_data.itr_over_files = True
    ch.add(read_data)

Пример #25

0

Показать файл

Файл: esk604_spark_execute_query.py Проект: mbaak/Eskapade-Spark

spark = process_manager.service(SparkManager).create_session(eskapade_settings=settings)

##########################################################################
# CSV and dataframe settings

# NB: local file may not be accessible to worker node in cluster mode
file_paths = ['file:' + resources.fixture('dummy1.csv'),
              'file:' + resources.fixture('dummy2.csv')]

# define store_key for all data files to be read in
STORE_KEYS = ['spark_df1', 'spark_df2']

##########################################################################
# Now set up the chains and links based on configuration flags

read = Chain('Read')

# create read link for each data file
for index, key in enumerate(STORE_KEYS):
    read_link = SparkDfReader(name='Reader' + str(index + 1), store_key=key, read_methods=['csv'])

    # set CSV read arguments
    read_link.read_meth_args['csv'] = (file_paths[index],)
    read_link.read_meth_kwargs['csv'] = dict(sep='|', header=True, inferSchema=True)

    # add link to chain
    read.add(read_link)

# create SQL-query link
sql_link = SparkExecuteQuery(name='SparkSQL', store_key='spark_df_sql')

Пример #26

0

Показать файл

logger.debug(
    'Now parsing configuration file esk407_classification_unbiased_fit_estimate'
)

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk407_classification_unbiased_fit_estimate'
settings['version'] = 0

#########################################################################################
# --- now set up the chains and links based on configuration flags

# --- generate pdf, simulate, fit, and plot
ch = Chain('WsOps')

# 1. simulate output score of machine learning classifier
wsu = WsUtils(name='DataSimulator')
wsu.factory = [
    "expr::trans('@0-@1',{score[0,1],1})",
    "RooExponential::high_risk(trans,10)",
    "RooPolynomial::low_risk(score,{-0.4,-0.4})",
    "SUM::model(low_risk_frac[0.95,0.,1.]*low_risk,high_risk)"
]
wsu.add_simulate(pdf='high_risk',
                 obs='score',
                 num=1000,
                 key='unbiased_high_risk_testdata',
                 into_ws=True)
wsu.add_simulate(pdf='low_risk',

Пример #27

0

Показать файл

Файл: esk702_mimic_data_only_unordered.py Проект: mbaak/Eskapade

from eskapade import data_mimic
from eskapade import process_manager
from eskapade.logger import Logger, LogLevel

# Module-level logger used while this configuration macro is parsed.
logger = Logger()
logger.debug('Now parsing configuration file esk703_mimic_data')

#########################################################################################
# --- minimal analysis information
# NOTE(review): the log message and analysis name say 'esk703_mimic_data', while
# this macro is listed as esk702_mimic_data_only_unordered.py -- confirm which
# name is intended.
settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk703_mimic_data'
settings['version'] = 0

# fixed seed for NumPy's global random number generator
np.random.seed(42)

# chain to which the links below are added; its logger is set to DEBUG level
ch = Chain('DataPrep')
ch.logger.log_level = LogLevel.DEBUG

# --- simulation link; its output is configured with store key 'df'.
#     The two probability lists passed as p_unordered have different lengths
#     (4 and 2 entries), so the array is ragged. dtype=object is given
#     explicitly: NumPy >= 1.24 raises a ValueError when asked to build a ragged
#     array implicitly, while older versions silently produced this same
#     object array of lists.
sim_data = data_mimic.MixedVariablesSimulation(store_key='df',
                                               n_obs=100000,
                                               p_unordered=np.array(
                                                   [[0.2, 0.2, 0.3, 0.3],
                                                    [0.3, 0.7]],
                                                   dtype=object))

sim_data.logger.log_level = LogLevel.DEBUG
ch.add(sim_data)

pre_data = data_mimic.KDEPreparation(
    read_key='df',
    data_store_key='data',
    data_smoothed_store_key='data_smoothed',

Пример #28

0

Показать файл

#########################################################################################

# number of lines picked up in each iteration when chunking through an input file
chunk_size = 5

#########################################################################################
# --- Set path of data
data_path = resources.fixture('dummy.csv')

#########################################################################################
# --- now set up the chains and links, based on configuration flags

# --- example 2: readdata loops over the input files, with file chunking.

if settings.get('do_example2', True):
    ch = Chain('MyChain2')
    ch.n_fork = 10

    # --- a loop is set up in the chain MyChain2.
    #     we iterate over (chunks of) the next file in the list until the iterator is done.
    #     then move on to the next chain (Overview)

    # --- readdata keeps on opening the next chunk_size lines of the open or next
    #     file in the file list. all kwargs are passed on to pandas file reader.
    read_data = analysis.ReadToDf(
        name='dflooper2',
        key='test2',
        sep='|',
        reader='csv',
        usecols=['x', 'y'],
        chunksize=chunk_size,
    )
    # the same data file is listed three times
    read_data.path = [data_path, data_path, data_path]
    ch.add(read_data)

    # --- do something useful with the test dataset here ...
    #     e.g. apply selections, or collect into histograms.

Пример #29

0

Показать файл

                      var_range=(0., MAX_AGE),
                      var=('redeem_age', 0.),
                      max_var=('age', MAX_AGE),
                      exp=[('rate_fast', FAST_REDEEM_RATE),
                           ('rate_slow', SLOW_REDEEM_RATE)],
                      fracs=[('frac_fast', FAST_FRAC)])
    model.build_model()
    model.var.SetTitle('Redeem age')
    model.max_var.SetTitle('Age')
    model.var.setUnit('days')
    model.max_var.setUnit('days')

###############################################################################
# --- create chain for generating voucher redeem data

# Chain holding the single link that generates the voucher redeem data.
ch = Chain('Generation')
# Generator link, configured entirely through module-level constants defined
# earlier in this macro (data keys, model name, event fraction).
gen_link = TruncExpGen(name='Generate',
                       store_key=REDEEM_DATA_KEY,
                       max_var_data_key=AGE_DATA_KEY,
                       model_name=MODEL_NAME,
                       event_frac=REDEEM_FRAC)
ch.add(gen_link)

# Seed both random number generators from the 'seeds' entry of the settings:
# NumPy's global generator and the generator returned by RooFit's RooRandom.
# NOTE(review): the seeds are set after the link has been added to the chain;
# presumably the chain is only executed later, so they are in place before any
# data are generated -- confirm.
np.random.seed(settings['seeds']['NumPy'])
ROOT.RooRandom.randomGenerator().SetSeed(settings['seeds']['RooFit'])

###############################################################################
# --- create chain for fitting voucher redeem model to generated data

ch = Chain('Fitting')
fit_link = TruncExpFit(name='Fit',

Пример #30

0

Показать файл

Файл: esk501_fix_pandas_dataframe.py Проект: evoloji/Eskapade

nan,3,bal,3,bla,bar,c,1
,nan,NaN,NaN,nan,nan,d,2
,,,,,,,3
1,2,,,,,,,6
"""

# Write the test data to a named temporary file. The context manager closes
# the handle even when the write raises. Because delete=False is used, the
# file itself is not removed on close; it is used below through f.name.
with tempfile.NamedTemporaryFile(delete=False) as f:
    f.write(tmp)

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('DataPrep')

# --- 0. pandas read_csv has multiple settings to help reading in of buggy csv's.
#     o The option error_bad_lines=False skips lines with too few or too many values
#     o The option encoding='latin1' interprets most non-standard characters
#     NOTE(review): pandas deprecated error_bad_lines in 1.3 and removed it in 2.0
#     (replacement: on_bad_lines='skip') -- confirm the pandas version targeted here.
read_data = analysis.ReadToDf(
    key='vrh',
    reader='csv',
    path=f.name,
    error_bad_lines=False,
    encoding='latin1',
)
ch.add(read_data)

# --- 1. standard setting:
#     o convert all nans to np.nan (= float)
#     o convert all rows in a column to most occurring datatype in that column
fixer = data_quality.FixPandasDataFrame(name='fixer1')

Python Chain примеры использования