Example #1
def process(msg):

    att_dict = msg.attributes

    logger, log_stream = slog.set_logging('word_regex', api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    df = msg.body

    if not isinstance(df, pd.DataFrame) or df.empty:
        logger.warning('Empty dataframe, no output sent!')
        api.send(outports[0]['name'], log_stream.getvalue())
        api.send(outports[2]['name'], api.Message(attributes=att_dict,
                                                  body=df))
        return 0

    df['count'] = df['count'].astype('int32')

    # word type
    word_types = tfp.read_list(api.config.word_types)
    if word_types:
        df = df.loc[df['type'].isin(word_types)]

    # Language filter
    language_filter = tfp.read_list(api.config.language_filter)
    if language_filter:
        df = df.loc[df['language'].isin(language_filter)]

    df = df.groupby(['language', 'type',
                     'word'])['count'].agg('sum').reset_index()

    api.send(outports[1]['name'], api.Message(attributes=att_dict, body=df))
    api.send(outports[0]['name'], log_stream.getvalue())
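
The snippets in this collection assume scaffolding that they never show: the SAP Data Intelligence operator API (api.config, api.Message, api.send), helper modules aliased as slog (logging), tfp (textfield parsing) and tp (timing), and module-level names such as EXAMPLE_ROWS, outports and operator_name. The stand-ins below are a minimal sketch inferred from how those names are used in the examples, so the snippets can be exercised outside the platform; the real helpers (the sdi_utils package these aliases suggest) differ in detail.

# Minimal stand-ins, inferred from usage; real platform/helper modules differ.
import io
import logging
import re
import time

import numpy as np
import pandas as pd

EXAMPLE_ROWS = 5          # number of sample rows written to logs/attributes
outports = [{'name': 'log'}, {'name': 'data'}, {'name': 'extra'}]
operator_name = 'operator'

class Message:
    """Stand-in for api.Message: attribute dict plus arbitrary body."""
    def __init__(self, attributes=None, body=None):
        self.attributes = attributes if attributes is not None else {}
        self.body = body

class api:
    """Stand-in for the operator API: config container plus send()."""
    Message = Message
    config = type('Config', (), {'debug_mode': True})()
    @staticmethod
    def send(port, payload):
        print('send to port {}: {}'.format(port, type(payload).__name__))

class slog:
    """Stand-in logging helper returning a logger and its stream."""
    @staticmethod
    def set_logging(name, debug_mode=False, loglevel=None):
        log_stream = io.StringIO()
        logger = logging.getLogger(name)
        logger.handlers = [logging.StreamHandler(log_stream)]
        logger.setLevel(loglevel or ('DEBUG' if debug_mode else 'INFO'))
        return logger, log_stream

class tp:
    """Stand-in timing helper used as tp.progress()."""
    class progress:
        def __init__(self):
            self._start = time.time()
        def get_start_time(self):
            return time.strftime('%H:%M:%S', time.localtime(self._start))
        def elapsed_time(self):
            return '{:.3f} s'.format(time.time() - self._start)

class tfp:
    """Stand-in textfield parser; 'None' or empty means "not configured"."""
    @staticmethod
    def read_value(text, test_number=True):
        return None if not text or text.strip() == 'None' else text.strip()
    @staticmethod
    def read_list(text, all_values=None, test_number=True):
        if not text or text.strip() == 'None':
            return None
        if text.strip() == '*' and all_values is not None:
            return list(all_values)
        return [t.strip().strip('\'"') for t in text.split(',')]
    @staticmethod
    def read_dict(text, map_sep=':'):
        if not text or text.strip() == 'None':
            return None
        # raises IndexError for entries without a separator, matching the
        # except-IndexError fallbacks used in the examples
        return {p.split(map_sep)[0].strip(): p.split(map_sep)[1].strip()
                for p in text.split(',')}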
Example #2
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'groupby'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode:
        logger.setLevel('DEBUG')

    prev_att = msg.attributes
    df = msg.body

    ###### start of doing calculation

    # groupby list
    cols = tfp.read_list(api.config.groupby)
    att_dict['config']['groupby'] = api.config.groupby

    # mapping aggregation
    try:
        colagg = tfp.read_dict(api.config.aggregation)
    except IndexError:
        logger.info('Aggregation is not a map, try to parse a value instead')
        colagg = tfp.read_value(api.config.aggregation)
    att_dict['config']['aggregation'] = api.config.aggregation

    # groupby
    logger.debug('Group columns: {}'.format(cols))
    logger.debug('Aggregation: {}'.format(colagg))
    logger.debug('Index: {}'.format(api.config.index))
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop col
    att_dict['config']['dropcols'] = api.config.drop_columns
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols:
        df.drop(columns=dropcols, inplace=True)

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    att_dict['operator'] = 'groupbyDataFrame'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = list(df.columns)
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    example_rows = min(EXAMPLE_ROWS, att_dict['number_rows'])
    for i in range(example_rows):
        att_dict['row_' + str(i)] = str(
            [str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
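
For reference, pandas' groupby(...).agg(...) accepts either a column-to-function mapping or a single aggregation applied to all remaining columns, which is why the operator first tries read_dict and falls back to read_value. A small standalone illustration (toy data, hypothetical column names):

import pandas as pd

data = pd.DataFrame({'language': ['EN', 'EN', 'DE'],
                     'word': ['a', 'a', 'b'],
                     'count': [1, 2, 3]})

# mapping form, as parsed by tfp.read_dict (e.g. 'count:sum')
print(data.groupby(['language', 'word'], as_index=False).agg({'count': 'sum'}))

# single-value form, as parsed by the tfp.read_value fallback (e.g. 'sum')
print(data.groupby(['language', 'word'], as_index=False).agg('sum'))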
Example #3
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'groupby'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    prev_att = msg.attributes
    df = msg.body
    prev_shape = df.shape

    ###### start of doing calculation

    # groupby list
    cols = tfp.read_list(api.config.groupby)

    # mapping aggregation
    try:
        colagg = tfp.read_dict(api.config.aggregation)
    except IndexError:
        logger.info('Aggregation is not a map, try to parse a value instead')
        colagg = tfp.read_value(api.config.aggregation)

    # groupby
    logger.info('Group columns: {}'.format(cols))
    logger.info('Aggregation: {}'.format(colagg))
    logger.info('Index: {}'.format(api.config.index))
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop col
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols:
        logger.info('Drop columns: {}'.format(dropcols))
        df.drop(columns=dropcols, inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = min(EXAMPLE_ROWS, df.shape[0])
    for i in range(example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 != att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {}  - {}  '.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
Example #4
def process(msg):

    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation

    # groupby list
    cols = tfp.read_list(api.config.groupby)
    att_dict['config']['groupby'] = api.config.groupby

    # mapping
    colagg = tfp.read_dict(api.config.aggregation)
    att_dict['config']['aggregation'] = api.config.aggregation

    # groupby
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop col
    att_dict['config']['dropcols'] = api.config.drop_columns
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols:
        df.drop(columns=dropcols, inplace=True)

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    att_dict['operator'] = 'groupbyDataFrame'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = list(df.columns)
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    example_rows = min(EXAMPLE_ROWS, att_dict['number_rows'])
    for i in range(example_rows):
        att_dict['row_' + str(i)] = str(
            [str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
Example #5
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')
    time_monitor = tp.progress()

    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    # Columns with 1 unique value
    columns = tfp.read_list(api.config.columns, df.columns)
    col1val_data = {'column': [], 'type': [], 'unique_vals': [], 'action': []}
    for col in columns:
        vals = df[col].unique()
        if len(vals) == 1:
            col1val_data['column'].append(col)
            col1val_data['type'].append(str(df[col].dtype))
            col1val_data['unique_vals'].append(vals)
            col1val_data['action'].append('drop')
            if not api.config.info_only:
                df.drop(columns=[col], inplace=True)

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())
    return log_stream.getvalue(), api.Message(attributes={'name': 'drop_1valuecolumns', 'type': 'DataFrame'}, body=df),\
            api.Message(attributes={'name': 'transformation', 'type': 'DataFrame'}, body=pd.DataFrame(col1val_data))
Example #6
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')
    time_monitor = tp.progress()

    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body

    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold

    transform_data = {
        'column': [],
        'dtype': [],
        'unique_values': [],
        'action': []
    }
    for col in df[columns].select_dtypes(include='object').columns:
        unique_vals_num = len(df[col].unique())
        frac_unique_vals = unique_vals_num / df.shape[0]
        if frac_unique_vals > threshold:
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_values'].append(frac_unique_vals)
            transform_data['action'].append('drop')
            if not info_only:
                df.drop(columns=[col], inplace=True)

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())
    return log_stream.getvalue(), api.Message(attributes={'name':'filter_by_population','type':'DataFrame'},body=df),\
            api.Message(attributes={'name':'transformation','type':'DataFrame'},body=pd.DataFrame(transform_data))
Example #7
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'drop_duplicates'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode:
        logger.setLevel('DEBUG')

    time_monitor = tp.progress()

    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    before_num_rows = df.shape[0]
    drop_cols_test = tfp.read_list(api.config.columns, df.columns)
    keep = tfp.read_value(api.config.keep, test_number=False)
    df.drop_duplicates(subset=drop_cols_test, keep=keep, inplace=True)
    logger.debug('Duplicate Rows: {}'.format(before_num_rows - df.shape[0]))

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0],df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))

    return log_stream.getvalue(), api.Message(attributes={'name': 'drop_duplicates', 'type': 'DataFrame'}, body=df)
Example #8
def process(msg):

    att_dict = msg.attributes

    logger, log_stream = slog.set_logging('word_index_regex',
                                          api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    # regex patterns
    regex_patterns = tfp.read_list(api.config.patterns)

    # word type
    word_types = tfp.read_list(api.config.word_types)
    if not word_types:
        logger.warning(
            'Word types had to be defined. Default word type : \'PROPN\'')
        word_types = ['PROPN']

    # pandas Dataframe and select only values with right word_type
    cols = [c["name"] for c in msg.attributes['table']['columns']]
    df = pd.DataFrame(msg.body, columns=cols)
    df_p = df.loc[df['TYPE'].isin(word_types)]

    # Language filter
    language_filter = tfp.read_list(api.config.language_filter)
    if language_filter:
        df_p = df_p.loc[df['LANGUAGE'].isin(language_filter)]

    # get unique words to get words that comply with regex
    words = df_p['WORD'].unique()
    logger.info('Number of words to test with regex pattern: {}'.format(
        len(words)))

    cleansing_words = set()
    for ipat, pat in enumerate(regex_patterns):
        if pat == '':
            logger.warning('Empty pattern')
            continue
        logger.info('Execute pattern: {} ({}/{})'.format(
            pat, ipat + 1, len(regex_patterns)))
        # collect the matches of all patterns before filtering
        cleansing_words.update(w for w in words if re.match(pat, w))

    df = df.loc[~df['WORD'].isin(cleansing_words)]

    api.send(outports[1]['name'],
             api.Message(attributes=att_dict, body=df.values.tolist()))
    api.send(outports[0]['name'], log_stream.getvalue())
Example #9
def process(msg):
    global last_msg
    global hash_list
    global lexicon_stem, lexicon

    logger, log_stream = slog.set_logging(operator_name, api.config.debug_mode)

    # Check if setup complete
    msg = check_for_setup(logger, msg)
    if not msg:
        api.send(outports[0]['name'], log_stream.getvalue())
        return 0

    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    att_dict = msg.attributes

    # pandas Dataframe and select only values with right word_type
    cols = [c["name"] for c in msg.attributes['table']['columns']]
    df = pd.DataFrame(msg.body, columns=cols)

    # word type
    types = tfp.read_list(api.config.types)
    if not types:
        logger.warning(
            'Word types had to be defined. Default word type : \'PROPN\'')
        types = ['PROPN']

    # Language filter
    languages = tfp.read_list(api.config.languages)
    if not languages:
        logger.warning(
            'Languages had to be defined. Default languages : EN, FR, ES, DE')
        languages = ['EN', 'FR', 'ES', 'DE']

    for lang in lexicon:
        for w in lexicon[lang]:
            # replace only the WORD value of matching rows
            df.loc[(df['TYPE'].isin(types)) & (df['LANGUAGE'] == lang) &
                   (df['WORD'] == w), 'WORD'] = lexicon[lang][w]
        for w in lexicon_stem[lang]:
            df.loc[(df['TYPE'].isin(types)) & (df['LANGUAGE'] == lang) &
                   (df['WORD'] == w), 'WORD'] = lexicon_stem[lang][w]

    api.send(outports[1]['name'],
             api.Message(attributes=att_dict, body=df.values.tolist()))
    api.send(outports[0]['name'], log_stream.getvalue())
Example #10
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'drop_highly_unique'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode:
        logger.setLevel('DEBUG')

    time_monitor = tp.progress()
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body

    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold

    transform_data = {
        'column': [],
        'dtype': [],
        'unique_values': [],
        'action': []
    }
    for col in df[columns].select_dtypes(include='object').columns:
        unique_vals_num = len(df[col].unique())
        frac_unique_vals = unique_vals_num / df.shape[0]
        if frac_unique_vals > threshold:
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_values'].append(frac_unique_vals)
            transform_data['action'].append('drop')
            if not info_only:
                df.drop(columns=[col], inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = min(EXAMPLE_ROWS, df.shape[0])
    for i in range(example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 != att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'],
                                          att_dict['storage.fileCount'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict,body=df),\
            api.Message(attributes={'name':'transformation','type':'DataFrame'},body=pd.DataFrame(transform_data))
Example #11
def process(msg):

    att_dict = msg.attributes
    att_dict['operator'] = 'dropColumns'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    drop_cols = tfp.read_list(api.config.drop_columns, df.columns)
    if drop_cols:
        logger.debug("Drops columns: {}".format(str(drop_cols)))
        df = df.drop(columns=drop_cols)

    map_names = tfp.read_dict(api.config.rename_columns)
    if map_names:
        df.rename(columns=map_names, inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = min(EXAMPLE_ROWS, df.shape[0])
    for i in range(example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(
                att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(
                att_dict['storage.fileIndex'] + 1,
                att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
Example #12
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'drop_1valuecolumns'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_shape = df.shape
    # Columns with 1 unique value
    columns = tfp.read_list(api.config.columns, df.columns)
    transform_data = {
        'column': [],
        'type': [],
        'unique_vals': [],
        'action': []
    }
    for col in columns:
        vals = df[col].unique()
        if len(vals) == 1:
            transform_data['column'].append(col)
            transform_data['type'].append(str(df[col].dtype))
            transform_data['unique_vals'].append(vals)
            transform_data['action'].append('drop')
            if not api.config.info_only:
                df.drop(columns=[col], inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = min(EXAMPLE_ROWS, df.shape[0])
    for i in range(example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 != att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'],
                                          att_dict['storage.fileCount'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict,body=df),\
            api.Message(attributes={'name':'transformation','type':'DataFrame'},body=pd.DataFrame(transform_data))
Example #13
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'filter_by_population'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_cols = df.shape[1]

    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold
    logger.debug('Parameters - threshold: {}   info_only: {}'.format(threshold, info_only))

    transform_data = {'column': [], 'dtype': [], 'unique_vals': [],'action': []}
    for col in columns:
        population = df[col].count() / df.shape[0]
        unique_vals = df[col].unique()
        if population < threshold and not (len(unique_vals) == 1 and np.isnan(unique_vals[0])):
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_vals'].append(unique_vals)
            transform_data['action'].append('drop')
            if not info_only:
                df.drop(columns=[col], inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0],df.shape[1]))
    logger.debug('Memory: {} kB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = min(EXAMPLE_ROWS, df.shape[0])
    for i in range(example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att_dict['storage.fileIndex'] + 1,
                                                              att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {}  - {}  '.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict,body=df),\
            api.Message(attributes={'name':'transformation','type':'DataFrame'},body=pd.DataFrame(transform_data))
Example #14
def process(msg):

    words = msg.body
    att_dict = msg.attributes

    logger, log_stream = slog.set_logging('word_regex_cleansing',
                                          api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    if isinstance(words[0], list):
        words = [w[0] for w in words]

    regex_patterns = tfp.read_list(api.config.patterns)

    logger.info('Test mode: {}'.format(api.config.test_mode))
    logger.info('Number of words to cleanse: {}'.format(len(words)))

    word_type = tfp.read_value(api.config.word_type)
    if isinstance(word_type, list):
        # assumption: a list means several word types were configured;
        # only one can be processed, so keep the first one
        logger.warning(
            'Only one word type can be processed. Take first one only: {}'.
            format(word_type[0]))
        word_type = word_type[0]

    count = 0
    for ipat, pat in enumerate(regex_patterns):
        if pat == '':
            logger.warning('Empty pattern')
            continue
        cleansing_words = [w for w in words if re.match(pat, w)]
        logger.info('Execute pattern: {} ({}/{})'.format(
            pat, ipat, len(regex_patterns)))
        logger.info('Number of DELETE statements: {}'.format(
            len(cleansing_words)))
        api.send(outports[0]['name'], log_stream.getvalue())
        log_stream.seek(0)
        log_stream.truncate()
        if not api.config.test_mode:
            for iw, w in enumerate(cleansing_words):
                if word_type:
                    sql = 'DELETE FROM WORD_INDEX WHERE WORD = \'' + w + '\' AND WORD_TYPE = \'' + word_type + '\';'
                else:
                    sql = 'DELETE FROM WORD_INDEX WHERE WORD = \'' + w + '\';'
                att_dict['message.indexBatch'] = count
                att_dict['message.lastBatch'] = False
                api.send(outports[1]['name'],
                         api.Message(attributes=att_dict, body=sql))
                count += 1

    sql = 'SELECT * FROM DUMMY;'
    att_dict['message.indexBatch'] = count
    att_dict['message.lastBatch'] = True
    api.send(outports[1]['name'], api.Message(attributes=att_dict, body=sql))
    api.send(outports[0]['name'], log_stream.getvalue())
Example #15
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'filter_by_population'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode:
        logger.setLevel('DEBUG')

    time_monitor = tp.progress()

    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body

    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold

    transform_data = {
        'column': [],
        'dtype': [],
        'unique_vals': [],
        'action': []
    }
    for col in columns:
        population = df[col].count() / df.shape[0]
        unique_vals = df[col].unique()
        if population < threshold and not (len(unique_vals) == 1
                                           and np.isnan(unique_vals[0])):
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_vals'].append(unique_vals)
            transform_data['action'].append('drop')
            if not info_only:
                df.drop(columns=[col], inplace=True)

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))

    return log_stream.getvalue(), api.Message(attributes={'name':'filter_by_population','type':'DataFrame'},body=df),\
            api.Message(attributes={'name':'transformation','type':'DataFrame'},body=pd.DataFrame(transform_data))
Example #16
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'drop_highly_unique'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode:
        logger.setLevel('DEBUG')

    time_monitor = tp.progress()
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    prev_shape = df.shape

    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold

    transform_data = {
        'column': [],
        'dtype': [],
        'unique_values': [],
        'action': []
    }
    for col in df[columns].select_dtypes(include='object').columns:
        unique_vals_num = len(df[col].unique())
        frac_unique_vals = unique_vals_num / df.shape[0]
        if frac_unique_vals > threshold:
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_values'].append(frac_unique_vals)
            transform_data['action'].append('drop')
            if not info_only:
                df.drop(columns=[col], inplace=True)

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))

    logger.info('Dropped Columns: {}'.format(prev_shape[1] - df.shape[1]))

    return log_stream.getvalue(), api.Message(attributes={'name':'filter_by_population','type':'DataFrame'},body=df),\
            api.Message(attributes={'name':'transformation','type':'DataFrame'},body=pd.DataFrame(transform_data))
Example #17
def process(msg):

    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'dropColumns'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    att_dict['config']['drop_columns'] = api.config.drop_columns
    drop_cols = tfp.read_list(api.config.drop_columns, df.columns)
    if drop_cols:
        logger.debug("Drops columns: {}".format(str(drop_cols)))
        df = df.drop(columns=drop_cols)

    att_dict['config']['rename_columns'] = api.config.rename_columns
    map_names = tfp.read_dict(api.config.rename_columns)
    if map_names:
        df.rename(columns=map_names, inplace=True)
    ###### end of doing calculation
    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    # df from body
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))
    example_rows = min(EXAMPLE_ROWS, df.shape[0])
    for i in range(example_rows):
        att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
Example #18
def process(msg):
    global blacklist
    global last_msg
    global id_set

    logger, log_stream = slog.set_logging(operator_name, api.config.debug_mode)

    # Check if setup complete
    msg = check_for_setup(logger, msg)
    if not msg:
        api.send(outports[0]['name'], log_stream.getvalue())
        return 0

    logger.info("Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    att_dict = msg.attributes
    df = msg.body

    # word type
    word_types = tfp.read_list(api.config.word_types)
    if not word_types:
        word_types = list(df['type'].unique())

    # Language filter
    language_filter = tfp.read_list(api.config.language_filter)
    if not language_filter:
        language_filter = list(df['language'].unique())

    df = df.loc[~(df['type'].isin(word_types) & df['language'].isin(language_filter) & df['word'].isin(blacklist))]

    # test for duplicates
    dup_s = df.duplicated(subset=['text_id', 'language', 'type', 'word']).value_counts()
    num_duplicates = dup_s[True] if True in dup_s else 0
    logger.info('Duplicates: {} / {}'.format(num_duplicates, df.shape[0]))
    logger.info('End process: {}'.format(time_monitor.elapsed_time()))

    api.send(outports[1]['name'], api.Message(attributes=att_dict, body=df))
    api.send(outports[0]['name'], log_stream.getvalue())
Example #19
def process(msg):

    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation
    att_dict['config']['drop_columns'] = api.config.drop_columns
    drop_cols = tfp.read_list(api.config.drop_columns, df.columns)
    if drop_cols:
        logger.debug("Drops columns: {}".format(str(drop_cols)))
        df = df.drop(columns=drop_cols)

    att_dict['config']['rename_columns'] = api.config.rename_columns
    map_names = tfp.read_dict(api.config.rename_columns)
    if map_names:
        df.rename(columns=map_names, inplace=True)
    ###### end of doing calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################

    # df from body
    att_dict['operator'] = 'dropColumns'  # name of operator
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['name'] = prev_att['name']
    att_dict['columns'] = list(df.columns)
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    example_rows = min(EXAMPLE_ROWS, att_dict['number_rows'])
    for i in range(example_rows):
        att_dict['row_' + str(i)] = str(
            [str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])

    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
Example #20
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'drop_duplicates'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_shape = df.shape

    drop_cols_test = tfp.read_list(api.config.columns, df.columns)
    keep = tfp.read_value(api.config.keep, test_number=False)
    df.drop_duplicates(subset=drop_cols_test, keep=keep, inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = min(EXAMPLE_ROWS, df.shape[0])
    for i in range(example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(
                att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(
                att_dict['storage.fileIndex'] + 1,
                att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
Example #21
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'drop_1valuecolumns'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_shape = df.shape
    # Columns with 1 unique value
    columns = tfp.read_list(api.config.columns, df.columns)
    col1val_data = {'column': [], 'type': [], 'unique_vals': [], 'action': []}
    for col in columns:
        vals = df[col].unique()
        if len(vals) == 1:
            col1val_data['column'].append(col)
            col1val_data['type'].append(str(df[col].dtype))
            col1val_data['unique_vals'].append(vals)
            col1val_data['action'].append('drop')
            if not api.config.info_only:
                df.drop(columns=[col], inplace=True)

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))

    logger.debug('Dropped columns: {}'.format(prev_shape[1] - df.shape[1]))
    logger.info('Dropped columns: {}'.format(prev_shape[1] - df.shape[1]))

    return log_stream.getvalue(), api.Message(attributes={'name': 'drop_1valuecolumns', 'type': 'DataFrame'}, body=df),\
            api.Message(attributes={'name': 'transformation', 'type': 'DataFrame'}, body=pd.DataFrame(col1val_data))
Example #22
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'drop_duplicates'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_shape = df.shape

    drop_cols_test = tfp.read_list(api.config.columns, df.columns)
    keep = tfp.read_value(api.config.keep, test_number=False)
    df.drop_duplicates(subset=drop_cols_test, keep=keep, inplace=True)

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))

    logger.debug('Dropped duplicates: {}'.format(prev_shape[0] - df.shape[0]))
    logger.info('Dropped duplicates: {}'.format(prev_shape[0] - df.shape[0]))

    return log_stream.getvalue(), api.Message(
        attributes={'name': 'drop_duplicates', 'type': 'DataFrame'}, body=df)
Example #23
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')
    time_monitor = tp.progress()

    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    before_num_rows = df.shape[0]
    drop_cols_test = tfp.read_list(api.config.columns, df.columns)
    keep = tfp.read_value(api.config.keep, test_number=False)
    df.drop_duplicates(subset=drop_cols_test, keep=keep, inplace=True)
    logger.debug('Duplicate Rows: {}'.format(before_num_rows - df.shape[0]))

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())
    return log_stream.getvalue(), api.Message(
        attributes={'name': 'drop_duplicates', 'type': 'DataFrame'}, body=df)
Example #24
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')
    time_monitor = tp.progress()

    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body

    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold

    transform_data = {
        'column': [],
        'dtype': [],
        'unique_vals': [],
        'action': []
    }
    for col in columns:
        population = df[col].count() / df.shape[0]
        unique_vals = df[col].unique()
        if population < threshold and not (len(unique_vals) == 1
                                           and np.isnan(unique_vals[0])):
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_vals'].append(unique_vals)
            transform_data['action'].append('drop')
            if not info_only:
                df.drop(columns=[col], inplace=True)

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())
    return log_stream.getvalue(), api.Message(attributes={'name':'filter_by_population','type':'DataFrame'},body=df),\
            api.Message(attributes={'name':'transformation','type':'DataFrame'},body=pd.DataFrame(transform_data))
Example #25
def process(db_msg):

    logger, log_stream = slog.set_logging('topic_identification',
                                          api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    columns = [c['name'] for c in db_msg.attributes['table']['columns']]
    df = pd.DataFrame(db_msg.body, columns=columns)

    # Language filter
    language_filter = tfp.read_list(api.config.language_filter)
    if language_filter:
        df = df.loc[df["LANGUAGE"].isin(language_filter)]
        logger.info('Languages : {}'.format(language_filter))
    # Word type filter
    word_type_filter = tfp.read_value(api.config.word_type_filter)
    if word_type_filter:
        types = tfp.read_list(api.config.word_type_filter)
        df = df.loc[df["TYPE"].isin(types)]
        logger.info('Word restricted to types : {}'.format(word_type_filter))

    # groupby and concatenate words
    gdf = df.groupby('HASH_TEXT').agg({
        "LANGUAGE":
        'first',
        "WORD": [(lambda x: ' '.join(x)), 'count']
    })
    gdf.columns = gdf.columns.droplevel(level=0)
    gdf.rename(columns={
        "first": 'LANGUAGE',
        "count": 'NUM_WORDS',
        '<lambda_0>': 'WORDS'
    },
               inplace=True)

    # create document-term matrix - no tokenization or text prep are needed
    tf_vectorizer = CountVectorizer(analyzer='word',
                                    min_df=1,
                                    lowercase=False,
                                    tokenizer=str.split)

    # tf means term-frequency in a document for each language
    date_today = str(date.today())

    # 2-array with TOPIC, LANGUAGE, TYPE, DATE, EXPIRY_DATE, ATTRIBUTE, KEYWORD_i (num of topics)
    topic_list = list()
    if not language_filter:
        # fall back to all languages present when no filter is configured
        language_filter = list(gdf['LANGUAGE'].unique())
    for lang in language_filter:
        lang_gdf = gdf.loc[gdf['LANGUAGE'] == lang]
        logger.info(
            "Language: {}    #articles: {}    av.words/article: {:.1f}".format(
                lang, lang_gdf.shape[0], lang_gdf['NUM_WORDS'].mean()))
        dtm_tf = tf_vectorizer.fit_transform(lang_gdf['WORDS'])
        # for tf dtm
        lda_tf = LatentDirichletAllocation(n_components=api.config.num_topics,
                                           learning_method='online',
                                           evaluate_every=-1,
                                           n_jobs=-1)
        lda_tf.fit(dtm_tf)
        feature_names = tf_vectorizer.get_feature_names()

        for i, topic in enumerate(lda_tf.components_):
            topic_words = [
                feature_names[f]
                for f in topic.argsort()[:-api.config.topic_words - 1:-1]
            ]
            logger.debug('Len: {}  topic_words: {}'.format(len(topic_words),
                                                           topic_words))
            row = [
                date_today + "-" + str(i), lang, 'ALGO', date_today, None, None
            ] + topic_words
            logger.debug('Len: {}  record: {}'.format(len(row), row))
            topic_list.append(row)

    attributes = {
        "table": {
            "columns": [{
                "class": "string",
                "name": "TOPIC",
                "nullable": False,
                "size": 80,
                "type": {
                    "hana": "NVARCHAR"
                }
            }, {
                "class": "string",
                "name": "LANGUAGE",
                "nullable": False,
                "size": 2,
                "type": {
                    "hana": "NVARCHAR"
                }
            }, {
                "class": "string",
                "name": "TYPE",
                "nullable": False,
                "size": 10,
                "type": {
                    "hana": "NVARCHAR"
                }
            }, {
                "class": "string",
                "name": "DATE",
                "nullable": True,
                "type": {
                    "hana": "DATE"
                }
            }, {
                "class": "string",
                "name": "EXPIRY_DATE",
                "nullable": True,
                "type": {
                    "hana": "DATE"
                }
            }, {
                "class": "string",
                "name": "ATTRIBUTE",
                "nullable": True,
                "size": 25,
                "type": {
                    "hana": "NVACHAR"
                }
            }],
            "name":
            "DIPROJECTS.WORD_INDEX",
            "version":
            1
        }
    }
    for i in range(1, api.config.topic_words + 1):
        attributes['table']['columns'].append({
            "class": "string",
            "name": "KEYWORD_" + str(i),
            "nullable": True,
            "size": 80,
            "type": {
                "hana": "NVARCHAR"
            }
        })

    msg = api.Message(attributes=attributes, body=topic_list)
    logger.debug('Process ended, topics processed: {}'.format(
        time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())
    api.send(outports[1]['name'], msg)
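
The topic step in the example above can be reproduced in isolation: CountVectorizer builds the document-term matrix from the already tokenized, space-joined words, and the top-n terms of each fitted LDA component are read off via argsort. A minimal self-contained sketch on toy documents (all data hypothetical):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

docs = ['market stock price market', 'goal match player goal',
        'stock trade price', 'player team match']

# no tokenization or text prep needed: words are already space-separated
tf_vectorizer = CountVectorizer(analyzer='word', min_df=1,
                                lowercase=False, tokenizer=str.split)
dtm_tf = tf_vectorizer.fit_transform(docs)

lda_tf = LatentDirichletAllocation(n_components=2, learning_method='online',
                                   evaluate_every=-1, n_jobs=-1)
lda_tf.fit(dtm_tf)

# get_feature_names() on older scikit-learn, as in the example above
feature_names = tf_vectorizer.get_feature_names_out()
num_topic_words = 3
for i, topic in enumerate(lda_tf.components_):
    top = [feature_names[f] for f in topic.argsort()[:-num_topic_words - 1:-1]]
    print('topic {}: {}'.format(i, top))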
Example #26
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'lgbm_classifier'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation

    model = LGBMRegressor(
        n_estimators=200,
        learning_rate=0.03,
        num_leaves=32,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.073,
        min_split_gain=0.0222415,
        min_child_weight=40)

    train_cols = tfp.read_list(api.config.train_cols, df.columns)
    logger.info('Train columns: {}'.format(train_cols))

    label = tfp.read_value(api.config.label_col)
    logger.info('Label column: {}'.format(label))
    if not label:
        raise ValueError('Label is mandatory')

    # cast categorical feature columns to integer codes
    for c in df[train_cols].select_dtypes(include='category').columns:
        unique_num = len(df[c].unique())
        nan_num = df[c].isna().sum()
        logger.debug('Cast to codes - {}: unique {}, nan: {} of {}'.format(c, unique_num, nan_num, df.shape[0]))
        df[c] = df[c].cat.codes
        df[c] = df[c].astype('int32')

    if pd.api.types.is_categorical_dtype(df[label]):
        logger.debug('Cast label to integer codes')
        df[label] = df[label].cat.codes
        df[label] = df[label].astype('int32')

    logger.debug('Categorical columns:\n{}'.format(df.select_dtypes(include='category').head(10)))
    logger.debug('Train with {} features'.format(len(train_cols)))
    logger.debug('Train columns: {}'.format(train_cols))
    model.fit(df[train_cols], df[label], eval_metric='auc')

    ###### end of doing calculation

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0],df.shape[1]))
    logger.debug('Memory: {} kB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = min(EXAMPLE_ROWS, df.shape[0])
    for i in range(example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att_dict['storage.fileIndex'] + 1,
                                                              att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {}  - {}  '.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
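
Note that the operator above is named 'lgbm_classifier' and passes eval_metric='auc', yet it instantiates LGBMRegressor; if classification is the intent, the matching estimator is LGBMClassifier. A minimal sketch on synthetic data (feature and label names are illustrative only):

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier

# synthetic training frame with a binary label
rng = np.random.default_rng(0)
data = pd.DataFrame({'f1': rng.random(200), 'f2': rng.random(200)})
data['label'] = (data['f1'] + data['f2'] > 1.0).astype('int32')

model = LGBMClassifier(n_estimators=200, learning_rate=0.03,
                       num_leaves=32, max_depth=8)
model.fit(data[['f1', 'f2']], data['label'], eval_metric='auc')
print(model.predict_proba(data[['f1', 'f2']])[:3])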
Example #27
def process(left_msg, right_msg):

    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'join'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode:
        logger.setLevel('DEBUG')

    # start custom process definition

    l_att = left_msg.attributes
    r_att = right_msg.attributes

    if l_att['name'] == r_att['name']:
        att_dict['name'] = l_att['name']
    else:
        att_dict['name'] = l_att['name'] + '-' + r_att['name']

    # read stream from memory
    left_df = left_msg.body
    right_df = right_msg.body

    ###### start of doing calculation
    how = tfp.read_value(api.config.how)

    # merge according to config
    att_dict['config']['on_index'] = api.config.on_index
    if api.config.on_index:
        df = pd.merge(left_df,
                      right_df,
                      how=how,
                      left_index=True,
                      right_index=True)
    elif api.config.left_on and api.config.right_on:
        att_dict['config']['left_on'] = api.config.left_on
        att_dict['config']['right_on'] = api.config.right_on

        left_on_list = tfp.read_list(api.config.left_on)
        right_on_list = tfp.read_list(api.config.right_on)
        left_df.reset_index(inplace=True)
        right_df.reset_index(inplace=True)

        df = pd.merge(left_df,
                      right_df,
                      how=how,
                      left_on=left_on_list,
                      right_on=right_on_list)

        # removing second index - might be a more elegant solution
        if 'index_x' in df.columns:
            df.drop(columns=['index_x'], inplace=True)
    else:
        raise ValueError(
            "Config setting: Either <on> or both <left_on> and <right_on> has to be set in order to join the dataframes"
        )

    att_dict['config']['new_indices'] = api.config.new_indices
    index_list = tfp.read_list(api.config.new_indices)
    if index_list:
        df.set_index(keys=index_list, inplace=True)

    att_dict['config']['drop_columns'] = api.config.drop_columns
    col_list = tfp.read_list(api.config.drop_columns)
    if col_list:
        df.drop(labels=col_list, axis=1, inplace=True)

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    if df.empty:
        raise ValueError('Merged Dataframe is empty')

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))
    example_rows = min(EXAMPLE_ROWS, df.shape[0])
    for i in range(example_rows):
        att_dict['row_' + str(i)] = str(
            [str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))

    # end custom process definition

    log = log_stream.getvalue()

    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
Example #28
def process(msg):

    att_dict = msg.attributes

    global result_df

    att_dict['operator'] = 'fromCSV'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    logger.info('Filename: {} index: {}  count: {}  endofSeq: {}'.format(msg.attributes["storage.filename"], \
                                                                         msg.attributes["storage.fileIndex"], \
                                                                         msg.attributes["storage.fileCount"], \
                                                                         msg.attributes["storage.endOfSequence"]))

    if msg.body is None:
        logger.info('Process ended.')
        msg = api.Message(attributes=att_dict, body=result_df)
        log = log_stream.getvalue()
        return log, msg
    elif isinstance(msg.body, str):
        csv_io = io.StringIO(msg.body)
        logger.debug("Input format: <string>")
    elif isinstance(msg.body, bytes):
        csv_io = io.BytesIO(msg.body)
        logger.debug("Input format: <bytes>")
    elif isinstance(msg.body, io.BytesIO):
        logger.debug("Input format: <io.Bytes>")
        csv_io = msg.body
    else:
        raise TypeError('Message body has unsupported type: ' +
                        str(type(msg.body)))

    # nrows: limit_rows == 0 means read all rows
    nrows = None
    if api.config.limit_rows != 0:
        nrows = api.config.limit_rows

    # usecols
    use_cols = tfp.read_list(api.config.use_columns)
    logger.debug('Columns used: {}'.format(use_cols))

    # dtypes mapping
    typemap = tfp.read_dict(api.config.dtypes)
    logger.debug('Type cast: {}'.format(str(typemap)))

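    # additional keyword arguments are passed through to pd.read_csv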
    kwargs = tfp.read_dict(text=api.config.keyword_args, map_sep='=')

    ##### Read string from buffer
    logger.debug("Read from input")
    df = pd.read_csv(csv_io, sep=api.config.separator, usecols=use_cols, dtype=typemap,
                     decimal=api.config.decimal, nrows=nrows, **kwargs)

    # Data from filename: config format "<column>:<regex with one capture group>"
    if api.config.data_from_filename and api.config.data_from_filename != 'None':
        col = api.config.data_from_filename.split(':')[0].strip().strip(
            "'").strip('"')
        pat = api.config.data_from_filename.split(':')[1].strip().strip(
            "'").strip('"')
        logger.debug('Filename: {}  pattern: {}'.format(
            att_dict['storage.filename'], pat))
        try:
            # apply the configured pattern to the filename and take the first capture group
            match = re.match(pat, att_dict['storage.filename'])
            df[col] = match.group(1)
        except AttributeError:
            raise ValueError(
                'Pattern not found - Filename: {}  pattern: {}'.format(
                    att_dict['storage.filename'], pat))

    # To Datetime: config maps column name -> datetime format string
    if api.config.todatetime and api.config.todatetime != 'None':
        dt_fmt = tfp.read_dict(api.config.todatetime)
        logger.debug('Time conversion {} by using UTC {}'.format(
            api.config.todatetime, api.config.utc))
        for col, fmt in dt_fmt.items():
            df[col] = pd.to_datetime(df[col], format=fmt, utc=api.config.utc)

    ###### Downcasting
    # log the memory footprint before downcasting to gauge the savings
    logger.debug('Memory used before downcast: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    if api.config.downcast_int:
        df, dci = downcast(df, 'int', 'unsigned')
    if api.config.downcast_float:
        df, dcf = downcast(df, 'float', 'float')

    # check if index is provided and set
    index_list = tfp.read_list(api.config.index_cols)
    if index_list:
        df.set_index(index_list, inplace=True)

    if api.config.collect:
        # stores the result in global variable result_df
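        # a new file sequence (fileIndex == 0) starts a fresh result_df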
        if msg.attributes['storage.fileIndex'] == 0:
            logger.debug('Added to DataFrame: {}'.format(
                att_dict['storage.filename']))
            result_df = df
        else:
            try:
                result_df = pd.concat([result_df, df], axis=0, sort=False)
            except Exception as e:
                logger.error(str(e))
                result_df = df
    else:
        result_df = df

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

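    # report progress as <fileIndex>/<fileCount> until the last file of the batch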
    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 != att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'],
                                          att_dict['storage.fileCount'])
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
Example #29
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'anonymizeData'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode:
        logger.setLevel('DEBUG')

    time_monitor = tp.progress()

    result = ''
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation

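    # gradient boosting regressor (LightGBM) with fixed hyperparameters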
    model = LGBMRegressor(n_estimators=200,
                          learning_rate=0.03,
                          num_leaves=32,
                          colsample_bytree=0.9497036,
                          subsample=0.8715623,
                          max_depth=8,
                          reg_alpha=0.04,
                          reg_lambda=0.073,
                          min_split_gain=0.0222415,
                          min_child_weight=40)

    att_dict['config']['train columns'] = api.config.train_cols
    train_cols = tfp.read_list(api.config.train_cols, df.columns)

    att_dict['config']['label'] = api.config.label
    label = tfp.read_value(api.config.label)
    if not label:
        raise ValueError('Label is mandatory')

    # cast to categorical dtype
    for c in df[train_cols].select_dtypes(include='category').columns:
        unique_num = len(df[c].unique())
        nan_num = df[c].isna().sum()
        logger.debug('Cast to category - {}: unique {}, nan: {} of {}'.format(
            c, unique_num, nan_num, df.shape[0]))
        df[c] = df[c].cat.codes
        df[c] = df[c].astype('int32')

    if pd.api.types.is_categorical_dtype(df[label]):
        logger.debug('Cast label to <category> codes')
        df[label] = df[label].cat.codes
        df[label] = df[label].astype('int32')

    logger.debug('Category sample:\n{}'.format(
        df.select_dtypes(include='category').head(10)))
    logger.debug('Train with {} features: {}'.format(len(train_cols), train_cols))
    # note: eval_metric only takes effect when an eval_set is passed to fit
    model.fit(df[train_cols], df[label], eval_metric='auc')

    ###### end of doing calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################

    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())
    return log_stream.getvalue(), api.Message(attributes=att_dict, body=model)
Example #30
def process(left_msg, right_msg):

    att_dict = left_msg.attributes
    att_dict['operator'] = 'join'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition

    l_att = left_msg.attributes
    r_att = right_msg.attributes

    # read stream from memory
    left_df = left_msg.body
    right_df = right_msg.body

    ###### start of doing calculation
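    # how: pandas merge strategy, e.g. 'inner', 'left', 'right' or 'outer'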
    how = tfp.read_value(api.config.how)

    # merge according to config
    if api.config.on_index:
        df = pd.merge(left_df,
                      right_df,
                      how=how,
                      left_index=True,
                      right_index=True)
    elif api.config.left_on and api.config.right_on:
        left_on_list = tfp.read_list(api.config.left_on)
        right_on_list = tfp.read_list(api.config.right_on)
        logger.info('Join DataFrames on {} - {}'.format(
            left_on_list, right_on_list))
        left_df.reset_index(inplace=True)
        right_df.reset_index(inplace=True)

        df = pd.merge(left_df,
                      right_df,
                      how=how,
                      left_on=left_on_list,
                      right_on=right_on_list)

        # drop the surplus index column created by reset_index - there might be a more elegant solution
        if 'index_x' in df.columns:
            df.drop(columns=['index_x'], inplace=True)
    else:
        raise ValueError(
            "Config setting: Either <on> or both <left_on> and <right_on> has to be set in order to join the dataframes"
        )

    index_list = tfp.read_list(api.config.new_indices)
    if index_list:
        df.set_index(keys=index_list, inplace=True)
        logger.info('Set index: {}'.format(index_list))

    col_list = tfp.read_list(api.config.drop_columns)
    if col_list:
        df.drop(labels=col_list, axis=1, inplace=True)
        logger.info('Drop columns: {}'.format(col_list))

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 != att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'],
                                          att_dict['storage.fileCount'])
    att_dict.setdefault('process_list', []).append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)