# during link execution.

# Generate a toy dataset and register it in the datastore under a fixed key.
df = DataFrame(randn(100, 3), columns=list('abc'))

ds = process_manager.service(DataStore)
ds['incoming_records'] = df

#########################################################################################
# --- Apply example selections to the dataframe picked up from the datastore.

data_prep = Chain('DataPrep')

# query_set = selections that are applied to 'incoming_records';
# after the selections, only the columns in select_columns ('a', 'c') are kept.
selection_link = analysis.ApplySelectionToDf(read_key='incoming_records',
                                             store_key='outgoing_records',
                                             query_set=['a>0', 'c<b'],
                                             select_columns=['a', 'c'])
# Any other kwargs given to ApplySelectionToDf are passed on to the
# pandas query() function.
selection_link.logger.log_level = LogLevel.DEBUG
data_prep.add(selection_link)

# Drop the raw input from the datastore once the selection has been stored.
deleter_link = core_ops.DsObjectDeleter()
deleter_link.deletion_keys = ['incoming_records']
data_prep.add(deleter_link)

# Print the selected records (and their count) for inspection.
printer_link = core_ops.PrintDs()
printer_link.keys = ['n_outgoing_records', 'outgoing_records']
data_prep.add(printer_link)

#########################################################################################
# ===================== Exemplo n.º 2 (score: 0) =====================
    # --- This serves as the break statement of this loop:
    #     if dataset 'test2' is empty — which can happen for the very last dataset
    #     produced by readdata — skip the rest of this chain.
    skipper = core_ops.SkipChainIfEmpty()
    skipper.collectionSet = ['test2']
    skipper.checkAtInitialize = False
    # only check for an empty collection at execute time, not at initialization
    skipper.checkAtExecute = True
    ch.add_link(skipper)

    # --- do something useful with the test dataset here ...
    #     e.g. apply selections, or collect into histograms.

    # querySet = selections that are applied to the 'test2' records;
    # after the selections all columns are kept (no selectColumns given here).
    link = analysis.ApplySelectionToDf(readKey='test2',
                                       storeKey='reduced_data',
                                       querySet=['x>1'])
    # Any other kwargs given to ApplySelectionToDf are passed on to the
    # pandas query() function.
    ch.add_link(link)

    # --- As an example, merge the reduced datasets back into a single, merged dataframe.
    #     Each iteration concatenates the newly reduced chunk onto 'merged'.
    concat = analysis.DfConcatenator()
    concat.readKeys = ['merged', 'reduced_data']
    concat.storeKey = 'merged'
    concat.ignore_missing_input = True  # in first iteration input 'merged' is missing.
    ch.add_link(concat)

    # --- This serves as the continue statement of the loop: go back to the start of the chain.
    #     NOTE(review): the repeater configuration is cut off here by the snippet boundary;
    #     presumably it is configured and added to the chain below — verify in the full macro.
    repeater = core_ops.RepeatChain()
    # repeat until readdata says halt.
# ===================== Exemplo n.º 3 (score: 0) =====================
# --- read materials file into the datastore under key 'input_data'
read_data = analysis.ReadToDf(
    name='reader',
    path=input_path,
    sep=num_separator,
    decimal=num_decimal,
    key='input_data',
    usecols=readcols,
    # parse_dates=['DATE_OF_BIRTH'],
    reader=reader_type)

ch.add(read_data)

# --- filter data: apply filter_query to the freshly read dataframe.
#     BUGFIX: the original snippet constructed this link but never added it to a
#     chain before rebinding `ch` below, so the filter silently never ran.
link = analysis.ApplySelectionToDf(read_key=read_data.key,
                                   query_set=filter_query)
ch.add(link)

ch = Chain('Fix')

# --- percentile binning, done *before* nans get converted into floats below,
# such that these do not affect the percentile bins
# pb = RooFitPercentileBinning()
# pb.read_key = read_data.key
# pb.var_number_of_bins = var_number_of_bins
# pb.binning_name = 'percentile'
# ch.add(pb)

# --- fix nans if they exist in a row (set to same dtype with convert_inconsistent_nans)
fixer = data_quality.FixPandasDataFrame(name='fixer')
fixer.read_key = read_data.key
# fixer.read_key = transform.store_key
fixer.store_key = 'fix_nan'