예제 #1
0
    # --- A loop is set up in the chain MyChain:
    #     we iterate over (chunks of) the next file in the list until the iterator is done,
    #     then move on to the next chain (Overview).

    # --- read_data keeps reading the next chunk of rows from the currently open file, or
    #     from the next file in the file list. The chunk length is taken from chunk_size
    #     (the original note said 400 lines -- TODO confirm the value of chunk_size).
    #     All kwargs are passed on to the pandas file reader.
    #     The link is configured with reader='csv' and key='rc'; presumably 'rc' is the
    #     key under which each chunk is made available to later links -- verify.
    read_data = analysis.ReadToDf(name='dflooper', key='rc', reader='csv')
    read_data.chunksize = chunk_size
    read_data.path = input_files
    ch.add(read_data)

    # Add conversion functions to the chain `ch`.
    # NOTE(review): the original comment called this the "Data" chain, but the link is
    # added to `ch` -- confirm which chain name is meant.
    # Here, convert column 'registered' (an integer) to an actual timestamp; each entry
    # names the function ('func'), the input column ('colin') and the output column ('colout').
    # No store_key is passed; the link reads from the reader's key (read_data.key).
    conv_funcs = [{'func': to_date, 'colin': 'registered', 'colout': 'date'}]
    transform = analysis.ApplyFuncToDf(name='Transform', read_key=read_data.key,
                                       apply_funcs=conv_funcs)
    ch.add(transform)

    # --- As an example, fill histograms iteratively over the file loop.
    vc = analysis.ValueCounter()
    # 'rc' is the same key string that is passed to the reader link (key='rc').
    vc.read_key = 'rc'
    # key name configured for the resulting histograms
    vc.store_key_hists = 'hist'
    vc.logger.log_level = LogLevel.DEBUG
    # Columns that are picked up to do value counting on in the input dataset.
    # Note: an entry can also be 2-dimensional, e.g. ['isActive', 'age'] (last entry below);
    # in this example, the rest are one-dimensional histograms.
    # 'date' is the output column ('colout') of the conversion function configured earlier.
    vc.columns = ['date', 'isActive', 'age', 'eyeColor', 'gender', 'company', 'latitude', 'longitude',
                  ['isActive', 'age']]
    # binning is applied to all input columns that are numeric or timestamps.
    # default binning is: bin_width = 1, bin_offset = 0
    # for timestamps, default binning is:
예제 #2
0
#########################################################################################
# --- now set up the chains and links based on configuration flags

# Create the first chain, named 'Data'.
proc_mgr.add_chain('Data')

# Add a data-frame reader to the "Data" chain.
# The link is given the file location (DATA_FILE_PATH), pandas' read_csv as the reader
# function, and key='data'; presumably the resulting data frame is stored under that
# key -- verify. The key is read back below through reader.key.
reader = analysis.ReadToDf(name='Read_LA_ozone',
                           path=DATA_FILE_PATH,
                           reader=pd.read_csv,
                           key='data')
proc_mgr.get_chain('Data').add_link(reader)

# Add conversion functions to the "Data" chain.
# The link reads from the reader's key (reader.key) and is configured with
# store_key='transformed_data'. conv_funcs is defined outside this excerpt.
transform = analysis.ApplyFuncToDf(name='Transform',
                                   read_key=reader.key,
                                   store_key='transformed_data',
                                   apply_funcs=conv_funcs)
proc_mgr.get_chain('Data').add_link(transform)

# Create the second chain, named 'Summary'.
# NOTE(review): the only link for this chain is commented out below, so in this excerpt
# the 'Summary' chain is created without any links -- confirm this is intended.
proc_mgr.add_chain('Summary')

## add data-frame summary link to "Summary" chain
#summarizer = visualization.DfSummary(name='Create_stats_overview', read_key=transform.store_key,
#                                     var_labels=VAR_LABELS, var_units=VAR_UNITS)
#proc_mgr.get_chain('Summary').add_link(summarizer)

#########################################################################################
# --- Exercises
#
# 1.
# Fetch the DataStore service from the process manager and put the data frame `df`
# (defined outside this excerpt) under 'incoming_data', the key that the 'Transform'
# link further down uses as its read_key.
ds = ProcessManager().service(DataStore)
ds['incoming_data'] = df

#########################################################################################
# --- now set up the chains and links based on configuration flags

# Get a process-manager handle; the chains are registered on it.
proc_mgr = ProcessManager()

# Create the 'DataPrep' chain; the links below are added to it through `ch`.
ch = proc_mgr.add_chain('DataPrep')

# NOTE(review): the next two comment lines mention querySet / selectColumns, but no such
# arguments are passed to the link below -- they look like leftovers from a selection
# example; confirm and remove.
# querySet = selections that are applied to incoming_records
# after selections, only keep column in selectColumns ('a', 'c')
# Add conversion functions to the 'DataPrep' chain (`ch`): the link reads 'incoming_data'
# and is configured with store_key='transformed_data'.
link = analysis.ApplyFuncToDf(name='Transform',
                              read_key='incoming_data',
                              store_key='transformed_data',
                              apply_funcs=conv_funcs)
# NOTE(review): the original comment stated that any other kwargs given to ApplyFuncToDf
# are passed on to the pandas query() function; nothing in this excerpt confirms that --
# verify against the link's documentation.
link.set_log_level(logging.DEBUG)
ch.add_link(link)

# Deleter link configured with the key 'incoming_data'; presumably it removes the raw
# input from the datastore once the transformed copy exists -- verify.
link = core_ops.DsObjectDeleter()
link.deletionKeys = ['incoming_data']
ch.add_link(link)

# Print link restricted to the key 'transformed_data', i.e. the store_key configured
# on the 'Transform' link above.
link = core_ops.PrintDs()
link.keys = ['transformed_data']
ch.add_link(link)

#########################################################################################
예제 #4
0
    nan_sizes=[2000, 2000],
    nan_columns=['b', 'f'])
# Switch sim_data's logger to DEBUG level and add it to the chain `ch`.
# (sim_data is constructed by the call that ends just above; its start is outside this excerpt.)
sim_data.logger.log_level = LogLevel.DEBUG
ch.add(sim_data)


# A 'business rule' column is added to the data for demoing purposes. Because the 'business rule' column has a high
# (100%) correlation with another column, this column is not used in the KDE and resample step.
def business_rule(x):
    """Return ``x`` increased by one.

    Demo 'business rule': the output is fully determined by the input, so a
    column derived with it is perfectly correlated with its source column.
    """
    incremented = x + 1
    return incremented


# Specification of the demo rule: apply business_rule to the first configured base
# column ('colin') and write the result to the first configured business-rule column ('colout').
business_rule_spec = {
    'colin': settings['business_rules_base_columns'][0],
    'colout': settings['business_rules_columns'][0],
    'func': business_rule,
}

# Link that applies the rule to the data frame stored under key 'df'.
add_business_rule = analysis.ApplyFuncToDf(read_key='df', apply_funcs=[business_rule_spec])
add_business_rule.logger.log_level = LogLevel.DEBUG
ch.add(add_business_rule)

#all data has been loaded, time to change column names
for column_name in settings['column_names_to_hash']:
    for setting in [
            settings['columns_to_hash'],
            settings['unordered_categorical_columns'],
            settings['ordered_categorical_columns'],
            settings['continuous_columns'], settings['string_columns']
    ]:
        if column_name in setting: