# --- a loop is set up in the chain MyChain. # we iterate over (chunks of) the next file in the list until the iterator is done. # then move on to the next chain (Overview) # --- readdata keeps on opening the next 400 lines of the open or next file in the file list. # all kwargs are passed on to pandas file reader. read_data = analysis.ReadToDf(name='dflooper', key='rc', reader='csv') read_data.chunksize = chunk_size read_data.path = input_files ch.add(read_data) # add conversion functions to "Data" chain # here, convert column 'registered', an integer, to an actual timestamp. conv_funcs = [{'func': to_date, 'colin': 'registered', 'colout': 'date'}] transform = analysis.ApplyFuncToDf(name='Transform', read_key=read_data.key, apply_funcs=conv_funcs) ch.add(transform) # --- As an example, will fill histogram iteratively over the file loop vc = analysis.ValueCounter() vc.read_key = 'rc' vc.store_key_hists = 'hist' vc.logger.log_level = LogLevel.DEBUG # colums that are picked up to do value_counting on in the input dataset # note: can also be 2-dim: ['isActive','age'] # in this example, the rest are one-dimensional histograms vc.columns = ['date', 'isActive', 'age', 'eyeColor', 'gender', 'company', 'latitude', 'longitude', ['isActive', 'age']] # binning is apply to all input columns that are numeric or timestamps. # default binning is: bin_width = 1, bin_offset = 0 # for timestamps, default binning is:
######################################################################################### # --- now set up the chains and links based on configuration flags # create first chain proc_mgr.add_chain('Data') # add data-frame reader to "Data" chain reader = analysis.ReadToDf(name='Read_LA_ozone', path=DATA_FILE_PATH, reader=pd.read_csv, key='data') proc_mgr.get_chain('Data').add_link(reader) # add conversion functions to "Data" chain transform = analysis.ApplyFuncToDf(name='Transform', read_key=reader.key, store_key='transformed_data', apply_funcs=conv_funcs) proc_mgr.get_chain('Data').add_link(transform) # create second chain proc_mgr.add_chain('Summary') ## add data-frame summary link to "Summary" chain #summarizer = visualization.DfSummary(name='Create_stats_overview', read_key=transform.store_key, # var_labels=VAR_LABELS, var_units=VAR_UNITS) #proc_mgr.get_chain('Summary').add_link(summarizer) ######################################################################################### # --- Exercises # # 1.
# seed the shared datastore with the incoming dataframe so the chain below can read it.
# NOTE(review): df and conv_funcs are assumed to be defined earlier in this
# script -- confirm against the full file.
ds = ProcessManager().service(DataStore)
ds['incoming_data'] = df

#########################################################################################
# --- now set up the chains and links based on configuration flags

proc_mgr = ProcessManager()
ch = proc_mgr.add_chain('DataPrep')

# add conversion functions: reads 'incoming_data' from the datastore, applies
# conv_funcs, and stores the result under 'transformed_data'.
link = analysis.ApplyFuncToDf(name='Transform', read_key='incoming_data', store_key='transformed_data',
                              apply_funcs=conv_funcs)
# Any other kwargs given to ApplyFuncToDf are passed on to the
# pandas query() function.
link.set_log_level(logging.DEBUG)
ch.add_link(link)

# the original input is no longer needed once transformed; drop it from the
# datastore to free memory. (link order in the chain matters: delete before print.)
link = core_ops.DsObjectDeleter()
link.deletionKeys = ['incoming_data']
ch.add_link(link)

# print a summary of the transformed dataframe at the end of the chain
link = core_ops.PrintDs()
link.keys = ['transformed_data']
ch.add_link(link)

#########################################################################################
nan_sizes=[2000, 2000],
                                  nan_columns=['b', 'f'])
# (the two keyword arguments above are the tail of the sim_data constructor
# call that starts before this section)
sim_data.logger.log_level = LogLevel.DEBUG
ch.add(sim_data)


# A 'business rule' column is added to the data for demoing purposes. Because the
# 'business rule' column has a high (100%) correlation with another column, this
# column is not used in the KDE and resample step.
def business_rule(x):
    """Toy business rule: return x + 1 (perfectly correlated with its input)."""
    return x + 1


# apply the business rule to the base column and store the result in the
# configured business-rules column of dataframe 'df'.
add_business_rule = analysis.ApplyFuncToDf(
    read_key='df',
    apply_funcs=[{
        'colin': settings['business_rules_base_columns'][0],
        'colout': settings['business_rules_columns'][0],
        'func': business_rule
    }])
add_business_rule.logger.log_level = LogLevel.DEBUG
ch.add(add_business_rule)

# all data has been loaded, time to change column names:
# for every column that must be hashed, find each settings list that mentions it
# (the replacement itself happens in the truncated body below this section).
for column_name in settings['column_names_to_hash']:
    for setting in [
            settings['columns_to_hash'], settings['unordered_categorical_columns'],
            settings['ordered_categorical_columns'], settings['continuous_columns'],
            settings['string_columns']
    ]:
        if column_name in setting: