settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['high_num_dims'] = False

input_files = [resources.fixture('mock_accounts.csv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('Data')

# --- 0. read input data
read_data = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv')
read_data.path = input_files
ch.add(read_data)

# --- 1. add the record factorizer
#     Here the columns isActive, eyeColor, favoriteFruit and gender of the input
#     dataset are factorized,
#     e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
#     x = [0, 1, 2, 0, 2]
#     By default, the mapping is stored in a dict under key: 'map_'+store_key+'_to_original'
fact = analysis.RecordFactorizer(name='rf1')
fact.columns = ['isActive', 'eyeColor', 'favoriteFruit', 'gender']
fact.read_key = 'accounts'
fact.inplace = True
fact.sk_map_to_original = 'to_original'
fact.sk_map_to_factorized = 'to_factorized'
fact.logger.log_level = LogLevel.DEBUG
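# --- Illustration (plain pandas, not part of this macro): a minimal sketch of
#     the per-column mapping that RecordFactorizer applies.
import pandas as pd

codes, uniques = pd.factorize(['apple', 'tree', 'pear', 'apple', 'pear'])
# codes   -> array([0, 1, 2, 0, 2])
# uniques -> array(['apple', 'tree', 'pear'], dtype=object)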
The plots and latex files produced by link hist_summary can be found in dir:
%s
""" % (settings['resultsDir'] + '/' + settings['analysisName'] + '/data/v0/report/')
log.info(msg)

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['read_data']:
    ch = proc_mgr.add_chain('Data')

    # --- 0. read input data
    readdata = analysis.ReadToDf(name='reader', key='correlated_data', reader='csv', sep=' ')
    readdata.path = input_files
    ch.add_link(readdata)

    # --- 1. Fill root histograms
    #     For now, RootHistFiller only accepts numeric observables
    hf = root_analysis.RootHistFiller()
    # columns is a list of single observables or sub-lists in case of multi-dimensional histograms
    hf.columns = ['x1', 'x2', 'x3', 'x4', 'x5',
                  ['x1', 'x2'], ['x2', 'x3'], ['x4', 'x5']]
    hf.read_key = 'correlated_data'
    hf.store_key = 'hist'
    hf.var_min_value = {'x2': -5, 'x3': -5, 'x4': -5, 'x5': -5}
    hf.var_max_value = {'x2': 5, 'x3': 5, 'x4': 5, 'x5': 5}
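# --- Illustration (plain numpy, not the RootHistFiller internals): a sub-list
#     such as ['x1', 'x2'] corresponds to a 2-dimensional histogram. A sketch
#     of the equivalent binning, with an arbitrary choice of 25 bins per axis:
import numpy as np

x1 = np.random.normal(size=1000)
x2 = np.random.normal(size=1000)
counts, x_edges, y_edges = np.histogram2d(x1, x2, bins=25, range=[[-5, 5], [-5, 5]])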
proc_mgr = ProcessManager()

# --- example 1: readdata loops over the input files, but no file chunking.
if settings.get('do_example1', True):
    ch = proc_mgr.add_chain('MyChain1')

    # --- a loop is set up in the chain MyChain1.
    #     we iterate over (chunks of) the next file in the list until the iterator is done.
    #     then move on to the next chain (Overview)

    # --- readdata keeps on opening the next file in the file list.
    #     all kwargs are passed on to the pandas file reader.
    readdata = analysis.ReadToDf(name='dflooper1', key='test1', sep='|', reader='csv',
                                 usecols=['x', 'y'])
    readdata.path = [data_path] * 3
    readdata.itr_over_files = True
    ch.add_link(readdata)

    # --- this serves as the break statement from this loop.
    #     if dataset test1 is empty, which can happen for the very last dataset
    #     produced by readdata, then skip the rest of this chain.
    skipper = core_ops.SkipChainIfEmpty()
    skipper.collectionSet = ['test1']
    skipper.checkAtInitialize = False
    skipper.checkAtExecute = True
    ch.add_link(skipper)
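# --- Conceptually (plain pandas, not the Eskapade API), this file loop with
#     SkipChainIfEmpty behaves like a guarded loop:
#
#     for path in [data_path] * 3:
#         df = pd.read_csv(path, sep='|', usecols=['x', 'y'])
#         if df.empty:
#             continue  # skip the remaining links for this iteration
#         # ... downstream links process df here ...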
settings['separator'] = ' '
settings['correlations'] = ['pearson', 'kendall', 'spearman', 'correlation_ratio']

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create chains
proc_mgr.add_chain('Data')
proc_mgr.add_chain('Summary')

# load data
reader = analysis.ReadToDf(name='reader',
                           path=settings['input_path'],
                           sep=settings['separator'],
                           key='input_data',
                           reader=settings['reader'])
proc_mgr.get_chain('Data').add_link(reader)

# make visualizations of correlations
corr_link = visualization.CorrelationSummary(name='correlation_summary',
                                             read_key='input_data',
                                             store_key='correlations',
                                             methods=settings['correlations'])
proc_mgr.get_chain('Summary').add_link(corr_link)

#########################################################################################
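# --- Illustration (plain pandas, not part of this macro): the first three
#     methods are supported by pandas directly; 'correlation_ratio' is not a
#     pandas method and is presumably computed by the CorrelationSummary link.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.normal(size=(100, 2)), columns=['a', 'b'])
for method in ('pearson', 'kendall', 'spearman'):
    print(method, '\n', df.corr(method=method))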
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

# --- Set path of data
data_path = persistence.io_path('data', settings.io_conf(), 'dummy.csv')

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch1 = proc_mgr.add_chain('Factorize')

# --- read dummy dataset
readdata = analysis.ReadToDf(key='test1', sep='|', reader='csv', path=data_path)
ch1.add_link(readdata)

# --- print contents of the datastore
pds = core_ops.PrintDs(name='printer1')
pds.keys = ['test1']
ch1.add_link(pds)

# --- add the record factorizer
#     Here the columns dummy and loc of the input dataset are factorized,
#     e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
#     x = [0, 1, 2, 0, 2]
#     By default, the mapping is stored in a dict under key: 'map_'+store_key+'_to_original'
fact = analysis.RecordFactorizer(name='rf1')
fact.columns = ['dummy', 'loc']
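# --- Illustration (plain Python, not part of this macro): the kind of dict
#     that ends up under 'map_' + store_key + '_to_original', and its inverse.
import pandas as pd

codes, uniques = pd.factorize(['apple', 'tree', 'pear', 'apple', 'pear'])
to_original = dict(enumerate(uniques))                  # {0: 'apple', 1: 'tree', 2: 'pear'}
to_factorized = {v: k for k, v in to_original.items()}  # {'apple': 0, 'tree': 1, 'pear': 2}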
               'colin': 'vis',
               'colout': 'vis_km'},
              # {'func': mph_to_kph, 'colin': , 'colout': 'wind_kph'},
              # {'func': F_to_C, 'colin': , 'colout': 'temp_c'},
              ]

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create first chain
proc_mgr.add_chain('Data')

# add data-frame reader to "Data" chain
reader = analysis.ReadToDf(name='Read_LA_ozone', path=DATA_FILE_PATH,
                           reader=pd.read_csv, key='data')
proc_mgr.get_chain('Data').add_link(reader)

# add conversion functions to "Data" chain
transform = analysis.ApplyFuncToDf(name='Transform', read_key=reader.key,
                                   store_key='transformed_data',
                                   apply_funcs=conv_funcs)
proc_mgr.get_chain('Data').add_link(transform)

# create second chain
proc_mgr.add_chain('Summary')

## add data-frame summary link to "Summary" chain
#summarizer = visualization.DfSummary(name='Create_stats_overview', read_key=transform.store_key,
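# --- Sketch (assumption: the conversion helpers are defined earlier in this
#     macro; this is one plausible mi_to_km, applied elementwise by
#     ApplyFuncToDf to the 'vis' column):
def mi_to_km(miles):
    """Convert miles to kilometres; works on scalars and pandas Series."""
    return miles * 1.609344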
# --- now set up the chains and links, based on configuration flags

procMgr = ProcessManager()

# --- example 2: readdata loops over the input files, with file chunking.
if settings['do_loop']:
    ch = procMgr.add_chain('Data')

    # --- a loop is set up in the chain Data.
    #     we iterate over (chunks of) the next file in the list until the iterator is done.
    #     then move on to the next chain (Overview)

    # --- readdata keeps on opening the next 400 lines of the open or next file in the file list.
    #     all kwargs are passed on to the pandas file reader.
    readdata = analysis.ReadToDf(name='dflooper', key='rc', reader='csv')
    readdata.chunksize = chunksize
    readdata.path = input_files
    ch.add_link(readdata)

    # add conversion functions to "Data" chain
    # here, convert column 'registered', an integer, to an actual timestamp.
    conv_funcs = [{'func': to_date, 'colin': 'registered', 'colout': 'date'}]
    transform = analysis.ApplyFuncToDf(name='Transform', read_key=readdata.key,
                                       apply_funcs=conv_funcs)
    ch.add_link(transform)

    # --- As an example, we fill a histogram iteratively over the file loop
    hf = analysis.HistogrammarFiller()
    hf.read_key = 'rc'
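# --- Illustration (plain pandas, not the ReadToDf internals): chunked reading
#     is conceptually pandas' chunksize iterator. 'some_file.csv' is a
#     placeholder name.
import pandas as pd

for chunk in pd.read_csv('some_file.csv', chunksize=400):
    pass  # each chunk is a DataFrame of at most 400 rows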
f.write(tmp)
f.close()
# file is not immediately deleted because we used delete=False
# used below with f.name

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('DataPrep')

# --- 0. pandas read_csv has multiple settings to help reading in of buggy csv's.
#     o The option error_bad_lines=False skips lines with too few or too many values
#     o The option encoding='latin1' interprets most non-standard characters
read_data = analysis.ReadToDf(key='vrh', reader='csv', path=f.name,
                              error_bad_lines=False, encoding='latin1')
ch.add(read_data)

# --- 1. standard setting:
#     o convert all nans to np.nan (= float)
#     o convert all rows in a column to the most occurring datatype in that column
fixer = data_quality.FixPandasDataFrame(name='fixer1')
fixer.read_key = 'vrh'
fixer.store_key = 'vrh_fix1'
ch.add(fixer)

# --- 2. force certain columns to specified datatype
fixer = data_quality.FixPandasDataFrame(name='fixer2')
fixer.read_key = 'vrh'
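# --- Version note (not part of this macro): error_bad_lines was deprecated in
#     pandas 1.3 and removed in 2.0; the modern equivalent of the call above is:
#
#     pd.read_csv(f.name, on_bad_lines='skip', encoding='latin1')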
# turn on/off the 2 examples
settings['do_example1'] = True
settings['do_example2'] = True

#########################################################################################
# --- Set path of data

data_path = resources.fixture('dummy.csv')

#########################################################################################
# --- now set up the chains and links, based on configuration flags

# --- example 1: readdata with one input file
if settings['do_example1']:
    ch1 = Chain('MyChain1')
    read_data = analysis.ReadToDf(key='test1', sep='|', reader='csv', path=data_path)
    ch1.add(read_data)

    # --- do something useful with the test dataset here ...

# --- example 2: readdata with default settings reads all three input files simultaneously.
#     all extra keyword arguments are passed on to the pandas reader.
if settings['do_example2']:
    ch2 = Chain('MyChain2')

    # --- readdata keeps on opening the next file in the file list.
    #     all kwargs are passed on to the pandas file reader.
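# --- Illustration (plain pandas, not the ReadToDf internals): reading several
#     files into a single DataFrame, which is what 'simultaneously' amounts to.
import pandas as pd

paths = [data_path] * 3  # data_path as set above
df = pd.concat((pd.read_csv(p, sep='|') for p in paths), ignore_index=True)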
conv_funcs = [{'func': comp_date, 'colin': 'doy', 'colout': 'date'},
              {'func': mi_to_km, 'colin': 'vis', 'colout': 'vis_km'},
              # {'func': mph_to_kph, 'colin': , 'colout': 'wind_kph'},
              # {'func': F_to_C, 'colin': , 'colout': 'temp_c'},
              ]

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create first chain
data = Chain('Data')

# add data-frame reader to "Data" chain
reader = analysis.ReadToDf(name='Read_LA_ozone', path='LAozone.data',
                           reader=pd.read_csv, key='data')
data.add(reader)

# add conversion functions to "Data" chain
transform = analysis.ApplyFuncToDf(name='Transform', read_key=reader.key,
                                   store_key='transformed_data',
                                   apply_funcs=conv_funcs)
data.add(transform)

# create second chain
summary = Chain('Summary')

# add data-frame summary link to "Summary" chain
summarizer = visualization.DfSummary(name='Create_stats_overview', read_key=transform.store_key,
                                     var_labels=VAR_LABELS, var_units=VAR_UNITS)
summary.add(summarizer)
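# --- Sketch (assumption: the helpers are defined earlier in this macro and the
#     LA ozone data covers a single year, here taken to be 1976). A plausible
#     comp_date turning a day-of-year number into a calendar date:
from datetime import datetime, timedelta

def comp_date(doy, year=1976):
    """Convert a day-of-year number (1-based) into a calendar date."""
    return datetime(year, 1, 1) + timedelta(days=int(doy) - 1)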
# var_binning = {'INDICATOR_MATERIAL': binind,
#                'INDICATOR_INJURY': binind}
var_binning = {}

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('Data')

# --- read materials file
read_data = analysis.ReadToDf(name='reader',
                              path=input_path,
                              sep=num_separator,
                              decimal=num_decimal,
                              key='input_data',
                              usecols=readcols,
                              # parse_dates=['DATE_OF_BIRTH'],
                              reader=reader_type)
ch.add(read_data)

# --- filter data
link = analysis.ApplySelectionToDf(read_key=read_data.key, query_set=filter_query)
ch.add(link)

ch = Chain('Fix')

# --- percentile binning, done *before* nans get converted into floats below,
#     such that these do not affect the percentile bins
# pb = RooFitPercentileBinning()
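# --- Illustration (plain numpy, not the RooFitPercentileBinning internals):
#     percentile bin edges computed while NaNs are still NaN, so that
#     placeholder values cannot skew the bins.
import numpy as np

x = np.array([1.0, 2.0, np.nan, 4.0, 5.0, np.nan, 7.0])
edges = np.nanpercentile(x, [0, 25, 50, 75, 100])  # NaNs are ignored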
#########################################################################################
# --- Set path of data

data_path = resources.fixture('dummy.csv')

#########################################################################################
# --- now set up the chains and links based on configuration flags

# --- readdata with default settings reads all three input files simultaneously.
#     all extra keyword arguments are passed on to the pandas reader.
if settings['do_readdata']:
    read = Chain('ReadData')

    # --- readdata keeps on opening the next file in the file list.
    #     all kwargs are passed on to the pandas file reader.
    read_data = analysis.ReadToDf(name='reader', key='test', sep='|', reader='csv',
                                  path=[data_path] * 3)
    read.add(read_data)

if settings['do_writedata']:
    write = Chain('WriteData')

    # --- writedata needs a specified output format ('writer' argument).
    #     if this is not set, try to determine this from the extension of the filename.
    #     'key' is picked up from the datastore. 'path' is the output filename.
    #     all other kwargs are passed on to the pandas file writer.
    write_data = analysis.WriteFromDf(name='writer', key='test', path='tmp3.csv', writer='csv')
    write.add(write_data)
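# --- Illustration (plain Python, not the WriteFromDf internals): deriving a
#     writer name from the file extension, as described above.
import os

ext = os.path.splitext('tmp3.csv')[1].lstrip('.')  # -> 'csv'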