# Datastore key under which the residuals-test results are stored.
hypotest.sk_residuals_map = 'residuals'
# Datastore key for the residuals results in a format that makes further
# processing easier.
hypotest.sk_residuals_overview = 'residuals_overview'

# Advanced settings
# Specify what categories to ignore.
# hypotest.ignore_categories = ['None','Not_familair_with','NoFruit']
# hypotest.var_ignore_categories = ['obs1':'None','obs2':'Not_familiar_with','obs1:obs2':['None','pear']]

# The hypothesis tester also works on continuous variables once they are
# categorised, i.e. binned. The number of bins is configurable:
# hypotest.default_number_of_bins = 5
# hypotest.var_default_number_of_bins = ['obs1':10,'obs2':5,'obs1:obs2':[3,3]]

hypotest.set_log_level(logging.DEBUG)
ch.add_link(hypotest)

# --- 4. print contents of the datastore
overview_chain = proc_mgr.add_chain('Overview')
summary_link = visualization.DfSummary(name='HistogramSummary',
                                       read_key=hypotest.hist_dict_key,
                                       pages_key=hypotest.pages_key)
overview_chain.add_link(summary_link)

#########################################################################################

log.debug('Done parsing configuration file esk410_testing_correlations_between_categories')
# Select the input source for the Spark DStream; an unset stream_type falls
# back to the file-based stream.
if not stream_type or stream_type == 'file':
    ds['dstream'] = ssc.textFileStream('/tmp/eskapade_stream_test/')
elif stream_type == 'tcp':
    ds['dstream'] = ssc.socketTextStream('localhost', 9999)
else:
    # Unsupported type: only an error is logged; no stream is created here.
    log.error('unsupported stream_type specified: {}'.format(stream_type))

##########################################################################
# --- now set up the chains and links based on configuration flags

proc_mgr.add_chain('SparkStreaming')

# the word count example: count words seen on the 'dstream' input
wordcount_link = spark_analysis.SparkStreamingWordCount(
    name='SparkStreamingWordCount', read_key='dstream', store_key='wordcounts')
proc_mgr.get_chain('SparkStreaming').add_link(wordcount_link)

# store output: write the word counts as text files under the results dir
writer_link = spark_analysis.SparkStreamingWriter(
    name='SparkStreamingWriter',
    read_key=wordcount_link.store_key,
    output_path='file:' + persistence.io_dir('results_data', settings.io_conf()) + '/dstream/wordcount',
    mode='overwrite',
    suffix='txt',
    repartition=1)
proc_mgr.get_chain('SparkStreaming').add_link(writer_link)

# start/stop of Spark Streaming
# NOTE(review): this statement continues beyond this chunk of the file.
control_link = spark_analysis.SparkStreamingController(
# obtain the process manager and set up the two chains of this macro
proc_mgr = ProcessManager()
data_chain = proc_mgr.add_chain('Data')
boxplot_chain = proc_mgr.add_chain('BoxPlot')

# "Data" chain: generate a toy dataset according to GEN_CONF
generator = analysis.BasicGenerator(name='Generate_data',
                                    key='data',
                                    columns=COLUMNS,
                                    size=SIZE,
                                    gen_config=GEN_CONF)
data_chain.add_link(generator)

# "BoxPlot" chain: create a stats overview of the generated dataframe.
# Labels and units for the variables in the dataset can be provided, as well
# as the set of statistics to print in the output file.
boxplot = visualization.DfBoxplot(
    name='Create_stats_overview',
    read_key=generator.key,
    statistics=['count', 'mean', 'min', 'max', 'std'],
    var_labels=VAR_LABELS,
    var_units=VAR_UNITS,
    column='var_b',
    cause_columns=['var_a', 'var_c'],
    results_path=persistence.io_path('results_data', settings.io_conf(), 'report'))
boxplot_chain.add_link(boxplot)

#########################################################################################
# --- 1. Factorize categorical observables of the accounts dataframe, in place.
factorizer = analysis.RecordFactorizer(name='rf1')
factorizer.read_key = 'accounts'
factorizer.inplace = True
factorizer.columns = ['isActive', 'eyeColor', 'favoriteFruit', 'gender']
# the factorizer stores a dict with the mappings that have been applied to
# all observables ...
factorizer.sk_map_to_original = 'to_original'
# ... and also a dict with the mappings back to the original observables
factorizer.sk_map_to_factorized = 'to_factorized'
factorizer.set_log_level(logging.DEBUG)
ch.add_link(factorizer)

# --- 2. Fill a roodatahist
rdh_filler = root_analysis.RooDataHistFiller()
rdh_filler.read_key = readdata.key
rdh_filler.store_key = 'rdh_' + readdata.key
rdh_filler.columns = ['transaction', 'latitude', 'longitude', 'age', 'eyeColor', 'favoriteFruit']
# the observables in this map are treated as categorical observables by
# roofit (roocategories)
rdh_filler.map_to_factorized = 'to_factorized'
#rdh_filler.into_ws = True
ch.add_link(rdh_filler)

# --- print contents of the datastore
overview_chain = proc_mgr.add_chain('Overview')
printer = core_ops.PrintDs()
printer.keys = ['n_rdh_accounts', 'n_accounts']
overview_chain.add_link(printer)

#########################################################################################

log.debug('Done parsing configuration file esk402_roodatahist_fill')
# --- Convert the simulated roodataset back into a pandas dataframe
ch = proc_mgr.add_chain('Conversion2')

rds2df = root_analysis.ConvertRooDataSet2DataFrame()
rds2df.read_key = 'simdata'
rds2df.store_key = 'df_simdata'
rds2df.remove_original = True
ch.add_link(rds2df)

if settings['summary']:
    # FIX: bind the newly created chain. Previously the return value of
    # add_chain('Summary') was discarded, so the pws/pds links below were
    # attached to the stale 'Conversion2' chain instead of 'Summary'.
    ch = proc_mgr.add_chain('Summary')

    # print contents of the workspace
    pws = root_analysis.PrintWs()
    ch.add_link(pws)

    # print contents of datastore
    pds = core_ops.PrintDs(name='pds2')
    #pds.keys = ['accounts']
    ch.add_link(pds)

    # --- make a summary document of simulated dataframe
    summarizer = visualization.DfSummary(name='Create_stats_overview',
                                         read_key=rds2df.store_key)
    proc_mgr.get_chain('Summary').add_link(summarizer)

#########################################################################################

log.debug('Done parsing configuration file esk404_workspace_createpdf_simulate_fit_plot')
#########################################################################################
# --- now set up the chains and links based on configuration flags

data_chain = proc_mgr.add_chain('Data')
summary_chain = proc_mgr.add_chain('Summary')

# "Data" chain: load the input file into the datastore under 'input_data'
reader = analysis.ReadToDf(name='reader',
                           path=settings['input_path'],
                           sep=settings['separator'],
                           key='input_data',
                           reader=settings['reader'])
data_chain.add_link(reader)

# "Summary" chain: one correlation-summary visualization per configured method
for corr in settings['correlations']:
    corr_link = visualization.CorrelationSummary(name=corr + '_summary',
                                                 read_key='input_data',
                                                 write_key=corr + '_correlations',
                                                 method=corr)
    summary_chain.add_link(corr_link)

#########################################################################################

log.debug('Done parsing configuration file esk305_correlation_summary')