settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['high_num_dims'] = False

input_files = [resources.fixture('mock_accounts.csv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('Data')

# --- 0. read input data
read_data = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv')
read_data.path = input_files
ch.add(read_data)

# --- 1. add the record factorizer
#     Here the categorical columns isActive, eyeColor, favoriteFruit and gender of the
#     input dataset are factorized,
#     e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
#     x = [0, 1, 2, 0, 2]
#     By default, the mapping is stored in a dict under key: 'map_'+store_key+'_to_original'
fact = analysis.RecordFactorizer(name='rf1')
fact.columns = ['isActive', 'eyeColor', 'favoriteFruit', 'gender']
fact.read_key = 'accounts'
fact.inplace = True
fact.sk_map_to_original = 'to_original'
fact.sk_map_to_factorized = 'to_factorized'
fact.logger.log_level = LogLevel.DEBUG
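
# --- For intuition only: a minimal plain-pandas sketch of the factorization and the
#     inverse mapping described above (pd.factorize on a toy series, not the
#     RecordFactorizer link itself).
import pandas as pd

x = pd.Series(['apple', 'tree', 'pear', 'apple', 'pear'])
codes, uniques = pd.factorize(x)               # codes -> [0, 1, 2, 0, 2]
to_original = dict(enumerate(uniques))         # {0: 'apple', 1: 'tree', 2: 'pear'}
to_factorized = {v: k for k, v in to_original.items()}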
Example 2

msg = """
The plots and latex files produced by link hist_summary can be found in dir:
%s
""" % (settings['resultsDir'] + '/' + settings['analysisName'] +
       '/data/v0/report/')
log.info(msg)

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['read_data']:
    ch = proc_mgr.add_chain('Data')

    # --- 0. read input data
    readdata = analysis.ReadToDf(name='reader',
                                 key='correlated_data',
                                 reader='csv',
                                 sep=' ')
    readdata.path = input_files
    ch.add_link(readdata)

    # --- 1. Fill root histograms
    #        For now, RootHistFiller only accepts numeric observables
    hf = root_analysis.RootHistFiller()
    # columns is a list of single observables or sub-lists in case of multi-dimensional histograms
    hf.columns = [
        'x1', 'x2', 'x3', 'x4', 'x5', ['x1', 'x2'], ['x2', 'x3'], ['x4', 'x5']
    ]
    hf.read_key = 'correlated_data'
    hf.store_key = 'hist'
    hf.var_min_value = {'x2': -5, 'x3': -5, 'x4': -5, 'x5': -5}
    hf.var_max_value = {'x2': 5, 'x3': 5, 'x4': 5, 'x5': 5}
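
# --- For intuition only: the var_min_value/var_max_value settings above correspond to
#     fixing the histogram range per observable. A plain-numpy sketch on toy data
#     (not the RootHistFiller link itself):
import numpy as np
import pandas as pd

toy = pd.DataFrame(np.random.normal(size=(1000, 2)), columns=['x2', 'x3'])
h_x2, edges = np.histogram(toy['x2'], bins=50, range=(-5, 5))              # 1-dim histogram
h_x2x3, xe, ye = np.histogram2d(toy['x2'], toy['x3'], bins=50,
                                range=[(-5, 5), (-5, 5)])                  # 2-dim histogram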
Example 3

proc_mgr = ProcessManager()

# --- example 1: readdata loops over the input files, but no file chunking.

if settings.get('do_example1', True):
    ch = proc_mgr.add_chain('MyChain1')

    # --- a loop is set up in the chain MyChain1.
    #     we iterate over the next file in the list until the iterator is done,
    #     then move on to the next chain (Overview).

    # --- readdata keeps on opening the next file in the file list.
    #     all kwargs are passed on to pandas file reader.
    readdata = analysis.ReadToDf(name='dflooper1',
                                 key='test1',
                                 sep='|',
                                 reader='csv',
                                 usecols=['x', 'y'])
    readdata.path = [data_path] * 3
    readdata.itr_over_files = True
    ch.add_link(readdata)

    # --- this serves as the break statement of this loop:
    #     if the dataset 'test1' is empty, which can happen for the very last dataset
    #     returned by readdata, then the rest of this chain is skipped.
    skipper = core_ops.SkipChainIfEmpty()
    skipper.collectionSet = ['test1']
    skipper.checkAtInitialize = False
    skipper.checkAtExecute = True
    ch.add_link(skipper)
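
    # --- For intuition only: a rough plain-pandas analogue of the loop-plus-break pattern
    #     above (file paths below are hypothetical; this is not how Eskapade runs the chain).
    import pandas as pd

    file_list = ['part_1.csv', 'part_2.csv', 'part_3.csv']    # hypothetical input files
    for path in file_list:
        df = pd.read_csv(path, sep='|', usecols=['x', 'y'])
        if df.empty:
            break                                              # analogue of SkipChainIfEmpty
        # ... process df here ...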
Example 4

settings['separator'] = ' '
settings['correlations'] = [
    'pearson', 'kendall', 'spearman', 'correlation_ratio'
]

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create chains
proc_mgr.add_chain('Data')
proc_mgr.add_chain('Summary')

# load data
reader = analysis.ReadToDf(name='reader',
                           path=settings['input_path'],
                           sep=settings['separator'],
                           key='input_data',
                           reader=settings['reader'])

proc_mgr.get_chain('Data').add_link(reader)

# make visualizations of correlations
corr_link = visualization.CorrelationSummary(name='correlation_summary',
                                             read_key='input_data',
                                             store_key='correlations',
                                             methods=settings['correlations'])

proc_mgr.get_chain('Summary').add_link(corr_link)
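
# --- For reference: the first three correlation methods map directly onto pandas'
#     DataFrame.corr (sketch on toy data; 'correlation_ratio' is not a pandas built-in).
import numpy as np
import pandas as pd

toy = pd.DataFrame(np.random.normal(size=(500, 3)), columns=['a', 'b', 'c'])
for method in ('pearson', 'kendall', 'spearman'):
    print(method, toy.corr(method=method), sep='\n')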

#########################################################################################
Example 5

settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

# --- Set path of data
data_path = persistence.io_path('data', settings.io_conf(), 'dummy.csv')

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch1 = proc_mgr.add_chain('Factorize')

# --- read dummy dataset
readdata = analysis.ReadToDf(key='test1',
                             sep='|',
                             reader='csv',
                             path=data_path)
ch1.add_link(readdata)

# --- print contents of the datastore
pds = core_ops.PrintDs(name='printer1')
pds.keys = ['test1']
ch1.add_link(pds)

# --- add the record factorizer
#     Here the columns dummy and loc of the input dataset are factorized
#     e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
#     x = [0, 1, 2, 0, 2]
#     By default, the mapping is stored in a dict under key: 'map_'+store_key+'_to_original'
fact = analysis.RecordFactorizer(name='rf1')
fact.columns = ['dummy', 'loc']
Example 6

conv_funcs = [
    {
        'func': mi_to_km,
        'colin': 'vis',
        'colout': 'vis_km'
    },
    # {'func': mph_to_kph, 'colin': , 'colout': 'wind_kph'},
    # {'func': F_to_C, 'colin': , 'colout': 'temp_c'},
]
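
# --- For intuition only: a hedged sketch of what such a conversion function and its
#     application could look like in plain pandas (the real ApplyFuncToDf link may differ;
#     mi_to_km_sketch below is a stand-in, not the mi_to_km defined in the original macro).
import pandas as pd

def mi_to_km_sketch(vis_miles):
    """Convert a visibility column from miles to kilometres."""
    return vis_miles * 1.609344

toy = pd.DataFrame({'vis': [10.0, 2.5, 0.8]})
for spec in [{'func': mi_to_km_sketch, 'colin': 'vis', 'colout': 'vis_km'}]:
    toy[spec['colout']] = spec['func'](toy[spec['colin']])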

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create first chain
proc_mgr.add_chain('Data')

# add data-frame reader to "Data" chain
reader = analysis.ReadToDf(name='Read_LA_ozone',
                           path=DATA_FILE_PATH,
                           reader=pd.read_csv,
                           key='data')
proc_mgr.get_chain('Data').add_link(reader)

# add conversion functions to "Data" chain
transform = analysis.ApplyFuncToDf(name='Transform',
                                   read_key=reader.key,
                                   store_key='transformed_data',
                                   apply_funcs=conv_funcs)
proc_mgr.get_chain('Data').add_link(transform)

# create second chain
proc_mgr.add_chain('Summary')

## add data-frame summary link to "Summary" chain
#summarizer = visualization.DfSummary(name='Create_stats_overview', read_key=transform.store_key,
Example 7

# --- now set up the chains and links, based on configuration flags

procMgr = ProcessManager()

# --- example 2: readdata loops over the input files, with file chunking.

if settings['do_loop']:
    ch = procMgr.add_chain('Data')

    # --- a loop is set up in the chain Data.
    #     we iterate over (chunks of) the next file in the list until the iterator is done,
    #     then move on to the next chain (Overview).

    # --- readdata keeps reading the next 400 lines of the current or next file in the file list.
    #     all kwargs are passed on to the pandas file reader.
    readdata = analysis.ReadToDf(name='dflooper', key='rc', reader='csv')
    readdata.chunksize = chunksize
    readdata.path = input_files
    ch.add_link(readdata)

    # add conversion functions to "Data" chain
    # here, convert column 'registered', an integer, to an actual timestamp.
    conv_funcs = [{'func': to_date, 'colin': 'registered', 'colout': 'date'}]
    transform = analysis.ApplyFuncToDf(name='Transform',
                                       read_key=readdata.key,
                                       apply_funcs=conv_funcs)
    ch.add_link(transform)
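
    # --- For intuition only: a hedged sketch of what a helper like to_date could look
    #     like, assuming the 'registered' integers are unix epoch seconds; the actual
    #     to_date used above is defined elsewhere in the original macro.
    import pandas as pd

    def to_date_sketch(registered):
        return pd.to_datetime(registered, unit='s')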

    # --- As an example, we will fill a histogram iteratively over the file loop
    hf = analysis.HistogrammarFiller()
    hf.read_key = 'rc'
Example 8

f.write(tmp)
f.close()
# the file is not deleted on close because we created it with delete=False;
# its name (f.name) is used below as the input path

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('DataPrep')

# --- 0. pandas read_csv has multiple settings that help with reading in buggy csv files:
#     o The option error_bad_lines=False skips lines with too few or too many values
#     o The option encoding='latin1' interprets most non-standard characters
read_data = analysis.ReadToDf(key='vrh',
                              reader='csv',
                              path=f.name,
                              error_bad_lines=False,
                              encoding='latin1')
ch.add(read_data)

# --- 1. standard settings:
#     o convert all nans to np.nan (= float)
#     o convert all rows in a column to the most occurring datatype in that column
fixer = data_quality.FixPandasDataFrame(name='fixer1')
fixer.read_key = 'vrh'
fixer.store_key = 'vrh_fix1'
ch.add(fixer)
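
# --- For intuition only: a rough plain-pandas picture of the 'standard settings' above
#     (not the actual FixPandasDataFrame implementation). Coercing a messy column to one
#     numeric dtype turns non-convertible entries into NaN (float):
import pandas as pd

messy = pd.DataFrame({'a': ['1', '2', 'x', None]})
messy['a'] = pd.to_numeric(messy['a'], errors='coerce')   # 'x' and None become NaN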

# --- 2. force certain columns to specified datatype
fixer = data_quality.FixPandasDataFrame(name='fixer2')
fixer.read_key = 'vrh'
Example 9

# turn on/off the 2 examples
settings['do_example1'] = True
settings['do_example2'] = True

#########################################################################################
# --- Set path of data
data_path = resources.fixture('dummy.csv')

#########################################################################################
# --- now set up the chains and links, based on configuration flags

# --- example 1: readdata with one input file
if settings['do_example1']:
    ch1 = Chain('MyChain1')
    read_data = analysis.ReadToDf(key='test1', sep='|', reader='csv', path=data_path)
    ch1.add(read_data)

    # --- do something useful with the test dataset here ...

# --- example 2: readdata with default settings reads all three input files simultaneously.
#                all extra keyword arguments are passed on to the pandas reader.
if settings['do_example2']:
    ch2 = Chain('MyChain2')

    # --- a loop is set up in the chain MyChain2.
    #     we iterate over (chunks of) the next file in the list until the iterator is done,
    #     then move on to the next chain (Overview).

    # --- readdata keeps on opening the next file in the file list.
    #     all kwargs are passed on to pandas file reader.
Example 10


conv_funcs = [{'func': comp_date, 'colin': 'doy', 'colout': 'date'},
              {'func': mi_to_km, 'colin': 'vis', 'colout': 'vis_km'},
              # {'func': mph_to_kph, 'colin': , 'colout': 'wind_kph'},
              # {'func': F_to_C, 'colin': , 'colout': 'temp_c'},
              ]

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create first chain
data = Chain('Data')

# add data-frame reader to "Data" chain
reader = analysis.ReadToDf(name='Read_LA_ozone', path='LAozone.data', reader=pd.read_csv, key='data')
data.add(reader)

# add conversion functions to "Data" chain
transform = analysis.ApplyFuncToDf(name='Transform', read_key=reader.key, store_key='transformed_data',
                                   apply_funcs=conv_funcs)
data.add(transform)

# create second chain
summary = Chain('Summary')

# add data-frame summary link to "Summary" chain
summarizer = visualization.DfSummary(name='Create_stats_overview', read_key=transform.store_key,
                                     var_labels=VAR_LABELS, var_units=VAR_UNITS)
summary.add(summarizer)
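
# --- For reference only: var_labels and var_units are plain column -> text mappings.
#     A hypothetical sketch of their shape (the actual VAR_LABELS / VAR_UNITS are
#     defined earlier in the original macro and are not shown here):
var_labels_sketch = {'vis': 'visibility', 'temp': 'temperature', 'doy': 'day of year'}
var_units_sketch = {'vis': 'miles', 'temp': 'F'}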
Example 11

# var_binning = {'INDICATOR_MATERIAL': binind,
#                'INDICATOR_INJURY': binind}
var_binning = {}

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('Data')

# --- read materials file
read_data = analysis.ReadToDf(
    name='reader',
    path=input_path,
    sep=num_separator,
    decimal=num_decimal,
    key='input_data',
    usecols=readcols,
    # parse_dates=['DATE_OF_BIRTH'],
    reader=reader_type)

ch.add(read_data)

# --- filter data
link = analysis.ApplySelectionToDf(read_key=read_data.key,
                                   query_set=filter_query)
ch.add(link)
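
# --- For intuition only: query_set presumably holds pandas-style query strings; a
#     minimal plain-pandas sketch of such a selection (filter_query itself is defined
#     earlier in the original macro and is not shown here):
import pandas as pd

toy = pd.DataFrame({'age': [12, 34, 56], 'loc': ['NL', 'DE', 'NL']})
selected = toy.query('age > 18 and loc == "NL"')
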
ch = Chain('Fix')

# --- percentile binning, done *before* nans get converted into floats below,
#     such that these do not affect the percentile bins
# pb = RooFitPercentileBinning()
Example 12

#########################################################################################
# --- Set path of data
data_path = resources.fixture('dummy.csv')

#########################################################################################
# --- now set up the chains and links based on configuration flags

# --- readdata with default settings reads all three input files simultaneously.
#     all extra keyword arguments are passed on to the pandas reader.
if settings['do_readdata']:
    read = Chain('ReadData')
    # --- readdata keeps on opening the next file in the file list.
    #     all kwargs are passed on to pandas file reader.
    read_data = analysis.ReadToDf(name='reader',
                                  key='test',
                                  sep='|',
                                  reader='csv',
                                  path=[data_path] * 3)
    read.add(read_data)

if settings['do_writedata']:
    write = Chain('WriteData')
    # --- writedata needs a specified output format ('writer' argument).
    #     if this is not set, it is derived from the extension of the filename.
    #     'key' is picked up from the datastore. 'path' is the output filename.
    #     all other kwargs are passed on to the pandas file writer.
    write_data = analysis.WriteFromDf(name='writer',
                                      key='test',
                                      path='tmp3.csv',
                                      writer='csv')
    write.add(write_data)
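
    # --- For reference only: the plain-pandas equivalent of this write step, assuming the
    #     DataFrame were held in memory rather than picked up from the datastore:
    import pandas as pd

    toy = pd.DataFrame({'x': [1, 2, 3]})
    toy.to_csv('tmp3_sketch.csv', index=False)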