Example #1

# Assumed import header for an Eskapade macro (the listing omits the original imports).
from eskapade import process_manager, resources, ConfigObject, Chain, analysis
#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk406_simulation_based_on_unbinned_data'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['read_data'] = True
settings['generate'] = True
settings['make_plot'] = True
settings['high_num_dims'] = False

input_files = [resources.fixture('correlated_data.sv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['read_data']:
    ch = Chain('Data')

    # --- 0. read the input dataset
    read_data = analysis.ReadToDf(name='reader',
                                  key='correlated_data',
                                  reader='csv',
                                  sep=' ')
    read_data.path = input_files
    ch.add(read_data)
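# Hedged usage note (illustrative, not part of the truncated macro): once the
# chain has executed, ReadToDf places the DataFrame in the DataStore under the
# configured key.
from eskapade import DataStore

ds = process_manager.service(DataStore)
df = ds['correlated_data']  # pandas DataFrame read from the fixture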
Example #2

logger.debug('Now parsing configuration file esk405_simulation_based_on_binned_data')

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk405_simulation_based_on_binned_data'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['high_num_dims'] = False

input_files = [resources.fixture('mock_accounts.csv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('Data')

# --- 0. read input data
read_data = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv')
read_data.path = input_files
ch.add(read_data)

# --- 1. add the record factorizer
#     Here the columns dummy and loc of the input dataset are factorized
#     e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
#     x = [0, 1, 2, 0, 2]
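# Hedged illustration (not from the original macro): plain pandas factorization
# of a string column, showing the mapping the record factorizer produces.
import pandas as pd

x = pd.Series(['apple', 'tree', 'pear', 'apple', 'pear'])
codes, uniques = pd.factorize(x)
# codes   -> array([0, 1, 2, 0, 2])
# uniques -> Index(['apple', 'tree', 'pear'], dtype='object')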
Example #3
settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk211_fork_read_data_itr'
settings['version'] = 0

# Normally there is no need to set this; it only illustrates how to throttle the
# number of concurrent processes. The default is the number of available CPU cores.
process_manager.num_cpu = 4

#########################################################################################

# when chunking through an input file, pick up only N lines in each iteration.
chunk_size = 5

#########################################################################################
# --- Set path of data
data_path = resources.fixture('dummy.csv')

#########################################################################################
# --- now set up the chains and links, based on configuration flags

# --- example 2: readdata loops over the input files, with file chunking.

if settings.get('do_example2', True):
    ch = Chain('MyChain2')
    ch.n_fork = 10

    # --- A loop is set up in the chain MyChain2:
    #     we iterate over (chunks of) the next file in the list until the iterator
    #     is done, then move on to the next chain (Overview).

    # --- The reader keeps opening the next chunk_size lines of the currently open
    #     file, or of the next file in the file list; see the sketch below.
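    # --- Hedged sketch (the original listing is truncated here): a chunked reader
    #     is typically configured along these lines. The itr_over_files and
    #     chunksize attributes are assumptions based on the related
    #     esk209_read_big_data_itr example, not confirmed from this macro.
    read_data = analysis.ReadToDf(name='dflooper2', key='test2', reader='csv')
    read_data.path = [data_path] * 3
    read_data.itr_over_files = True
    read_data.chunksize = chunk_size
    ch.add(read_data)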
Example #4

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk604_spark_execute_query'
settings['version'] = 0

##########################################################################
# Start Spark session

spark = process_manager.service(SparkManager).create_session(
    eskapade_settings=settings)

##########################################################################
# CSV and dataframe settings

# NB: local file may not be accessible to worker node in cluster mode
file_paths = [
    'file:' + resources.fixture('dummy1.csv'),
    'file:' + resources.fixture('dummy2.csv')
]

# define store_key for all data files to be read in
STORE_KEYS = ['spark_df1', 'spark_df2']

##########################################################################
# Now set up the chains and links based on configuration flags

read = Chain('Read')

# create read link for each data file
for index, key in enumerate(STORE_KEYS):
    read_link = spark_analysis.SparkDfReader(name='Reader' + str(index + 1),
                                             store_key=key,
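                                             # hedged completion: the listing is
                                             # truncated here; a CSV read method
                                             # is assumed, as in esk602
                                             read_methods=['csv'])
    read.add(read_link)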
Example #5
logger = Logger()

logger.debug('Now parsing configuration file esk305_correlation_summary.')

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk305_correlation_summary'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['input_path'] = resources.fixture('correlated_data.sv.gz')
settings['reader'] = 'csv'
settings['separator'] = ' '
settings['correlations'] = [
    'pearson', 'kendall', 'spearman', 'correlation_ratio'
]

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create chains
data = Chain('Data')

# load data
reader = analysis.ReadToDf(name='reader',
                           path=settings['input_path'],
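                           # hedged completion: the listing is truncated here;
                           # the reader type, separator, and store key are assumed
                           # to come from the settings defined above
                           reader=settings['reader'],
                           sep=settings['separator'],
                           key='input_data')
data.add(reader)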
Example #6

# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk602_read_csv_to_spark_df'
settings['version'] = 0

##########################################################################
# --- start Spark session

spark = process_manager.service(SparkManager).create_session(eskapade_settings=settings)

##########################################################################
# --- CSV and data-frame settings

# NB: local file may not be accessible to worker node in cluster mode
file_paths = ['file:' + resources.fixture('dummy1.csv'),
              'file:' + resources.fixture('dummy2.csv')]

separator = '|'
has_header = True
infer_schema = True
num_partitions = 5
columns = ['date', 'loc', 'x', 'y']

##########################################################################
# --- now set up the chains and links based on configuration flags

# create read link
read_link = spark_analysis.SparkDfReader(name='Reader',
                                         store_key='spark_df',
                                         read_methods=['csv'])
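# Hedged illustration (not part of the truncated macro, and not the Eskapade API):
# in plain PySpark the CSV settings defined above would map onto a read call
# roughly like this.
spark_df = spark.read.csv(file_paths, sep=separator, header=has_header,
                          inferSchema=infer_schema).select(*columns).repartition(num_partitions)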
Example #7

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk210_dataframe_restoration'
settings['version'] = 0

ds = process_manager.service(DataStore)
#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.
# Two writers that are able to restore dataframes are included in Eskapade.
# To turn one of them off, set the corresponding flag below to False.
settings['do_numpy'] = True
settings['do_feather'] = True

#########################################################################################
# --- Set path of data
# messy_dtypes is a small file with some complex data types that are
# not guaranteed to be read back correctly when using CSV
data_path = resources.fixture('messy_dtypes.csv')

# The actual fundamental data types are:
dtypes = [
    'str', 'int64', 'float32', 'float64', 'S32', 'str', 'bool', 'str',
    'uint64', 'str'
]

#   Inferred dtype |   True dtype
# --------------------------------
# 'object'         | 'str'
# 'int64'          | 'int64'
# 'float64'        | 'float32'
# 'float64'        | 'float64'
# 'object'         | 'S32'
# 'int64'          | 'str'
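# Hedged illustration (not from the original macro): a feather round trip keeps
# pandas dtypes intact, whereas a CSV round trip falls back to the inferred
# dtypes listed above. The file name and columns here are made up for the example.
import pandas as pd

df = pd.DataFrame({'a': ['x', 'y'],
                   'b': pd.Series([0.5, 1.5], dtype='float32')})
df.to_feather('dtypes_roundtrip.feather')
restored = pd.read_feather('dtypes_roundtrip.feather')
assert restored['b'].dtype == 'float32'  # float32 survives; CSV would give float64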