Example #1
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'drop_duplicates'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode:
        logger.setLevel('DEBUG')

    time_monitor = tp.progress()

    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    before_num_rows = df.shape[0]
    drop_cols_test = tfp.read_list(api.config.columns,df.columns)
    keep = tfp.read_value(api.config.keep,test_number=False)
    df.drop_duplicates(subset=drop_cols_test, keep=keep, inplace=True)
    logger.debug('Duplicate Rows: {}'.format(before_num_rows - df.shape[0]))

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0],df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))

    return log_stream.getvalue(), api.Message(attributes={'name': 'drop_duplicates', 'type': 'DataFrame'}, body=df)
Example #2
def process(msg):
    att_dict = msg.attributes

    att_dict['operator'] = 'keyword_search'
    logger, log_stream = slog.set_logging(att_dict['operator'],
                                          loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    adict = msg.body

    global word_count

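    # accumulate word frequencies across messages: add this message's counts
    # to the global running total (fill_value=0 covers words missing on either side)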
    for a in adict:
        cw = Counter(a['words'])
        word_count = word_count.add(pd.Series(cw), fill_value=0)

    word_count.sort_values(ascending=False, inplace=True)
    msg, progress_str = create_msg(attributes=att_dict,
                                   body=word_count.to_dict(),
                                   collect=api.config.collect)
    if msg:
        word_count.sort_values(ascending=False, inplace=True)
        msg.body = word_count.head(api.config.num_words).to_dict()
        api.send(outports[1]['name'], msg)

    logger.debug('Process ended, articles processed {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())
Example #3
def process(msg):

    logger, log_stream = slog.set_logging('topic dispatcher',
                                          loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    table = tfp.read_value(api.config.topic_table)
    column = tfp.read_value(api.config.table_colum)

    topics = msg.body
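    # each topic row carries its keywords from column 6 onwards; build one
    # SELECT per topic with the quoted keywords as an IN-list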
    for t in topics:
        topic_keywords = [
            "'" + t[i] + "'" for i in range(6, len(t)) if not t[i] == ''
        ]
        sql = 'SELECT * FROM ' + table + ' WHERE ' + column + ' IN(' + ','.join(
            topic_keywords) + ')'
        att_dict = {'topic': t[0], 'keywords': topic_keywords}
        sql_msg = api.Message(attributes=att_dict, body=sql)
        api.send(outports[1]['name'], sql_msg)
        logger.debug('Send sql: {}'.format(sql))

    logger.debug('Process ended, topics processed {}  - {}  '.format(
        len(topics), time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())
Example #4
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')
    time_monitor = tp.progress()

    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body

    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold

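    # collect string (object) columns whose share of unique values exceeds the
    # threshold (typically IDs or free text) and drop them unless info_only is set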
    transform_data = {
        'column': [],
        'dtype': [],
        'unique_values': [],
        'action': []
    }
    for col in df[columns].select_dtypes(include='object').columns:
        unique_vals_num = len(df[col].unique())
        frac_unique_vals = unique_vals_num / df.shape[0]
        if frac_unique_vals > threshold:
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_values'].append(frac_unique_vals)
            transform_data['action'].append('drop')
            if not info_only:
                df.drop(columns=[col], inplace=True)

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())
    return log_stream.getvalue(), api.Message(attributes={'name':'filter_by_population','type':'DataFrame'},body=df),\
            api.Message(attributes={'name':'transformation','type':'DataFrame'},body=pd.DataFrame(transform_data))
Example #5
def process(msg):

    logger, log_stream = slog.set_logging('word_indexing', loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    articles = msg.body
    word_index = list()
    # as table
    for article in articles:
        word_index.extend([[article[0], article[1], article[2], w[0],w[1]] for w in article[3]])

    att_dict = msg.attributes
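    # table descriptor for the word index, presumably consumed by a downstream
    # HANA table writer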
    att_dict['table'] = {"columns": [{"class": "string", "name": "HASH_TEXT", "nullable": True, "type": {"hana": "INTEGER"}},
                              {"class": "string", "name": "LANGUAGE", "nullable": True, "size": 2,"type": {"hana": "NVARCHAR"}},
                              {"class": "string", "name": "TYPE", "nullable": True, "size": 1,"type": {"hana": "NVARCHAR"}},
                              {"class": "string", "name": "WORD", "nullable": True, "size": 80,"type": {"hana": "NVARCHAR"}},
                              {"class": "string", "name": "COUNT", "nullable": True, "type": {"hana": "INTEGER"}}],
                         "name": "DIPROJECTS.WORD_INDEX", "version": 1}

    logger.debug('Process ended, articles processed {}  - {}  '.format(len(articles), time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())

    msg = api.Message(attributes=att_dict,body=word_index)
    api.send(outports[1]['name'], msg)
Example #6
def process():

    operator_name = 'sql_word_index'
    logger, log_stream = slog.set_logging(operator_name, api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    language = tfp.read_value(api.config.language)
    type_limit = tfp.read_dict(api.config.type_limit_map)
    table_name = tfp.read_value(api.config.table_name)
    text_id_col = tfp.read_value(api.config.text_id_col)

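    # build one SQL statement per word type: join the word-index table with its
    # per-word cumulative counts and keep only words whose total reaches the limit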
    for i, [wtype, limit] in enumerate(type_limit.items()):
        sql_s = "SELECT {tid}, \"{tn}\".LANGUAGE, \"{tn}\".TYPE, \"{tn}\".WORD, COUNT FROM \"{tn}\" INNER JOIN"\
                "(SELECT WORD, TYPE, LANGUAGE, SUM(COUNT) as CUMS FROM \"{tn}\" "\
                "WHERE LANGUAGE = \'{lang}\' AND TYPE = \'{wt}\' "\
                "GROUP BY WORD, TYPE, LANGUAGE) AS CTABLE ON "\
                "\"{tn}\".WORD = CTABLE.WORD AND \"{tn}\".TYPE = CTABLE.TYPE AND \"{tn}\".LANGUAGE = CTABLE.LANGUAGE "\
                "WHERE CUMS >= {lt}".format(tid = text_id_col,tn = table_name,lang=language,wt = wtype,lt=limit)

        lastbatch = (i + 1 == len(type_limit))
        att_dict = {'operator': operator_name, 'parameter': {'type': wtype, 'limit': limit, 'language': language},
                    'message.batchIndex': i, 'message.batchSize': len(type_limit), 'message.lastBatch': lastbatch}
        msg = api.Message(attributes=att_dict, body=sql_s)
        api.send(outports[1]['name'], msg)

    api.send(outports[0]['name'], log_stream.getvalue())
Example #7
def process(msg):

    att_dict = msg.attributes

    logger, log_stream = slog.set_logging('word_regex', api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    df = msg.body

    if not isinstance(df, pd.DataFrame) or df.empty:
        logger.warning('Empty dataframe, no output sent!')
        api.send(outports[0]['name'], log_stream.getvalue())
        api.send(outports[2]['name'], api.Message(attributes=att_dict,
                                                  body=df))
        return 0

    df['count'] = df['count'].astype('int32')

    # word type
    word_types = tfp.read_list(api.config.word_types)
    if word_types:
        df = df.loc[df['type'].isin(word_types)]

    # Language filter
    language_filter = tfp.read_list(api.config.language_filter)
    if language_filter:
        df = df.loc[df['language'].isin(language_filter)]

    df = df.groupby(['language', 'type',
                     'word'])['count'].agg('sum').reset_index()

    api.send(outports[1]['name'], api.Message(attributes=att_dict, body=df))
    api.send(outports[0]['name'], log_stream.getvalue())
Example #8
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')
    time_monitor = tp.progress()

    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    # Columns with 1 unique value
    columns = tfp.read_list(api.config.columns,df.columns)
    col1val_data = {'column': [], 'type': [], 'unique_vals': [], 'action': []}
    for col in columns:
        vals = df[col].unique()
        if len(vals) == 1:
            col1val_data['column'].append(col)
            col1val_data['type'].append(str(df[col].dtype))
            col1val_data['unique_vals'].append(vals)
            col1val_data['action'].append('drop')
            if not api.config.info_only:
                df.drop(columns=[col], inplace=True)

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())
    return log_stream.getvalue(), api.Message(attributes={'name':'drop_duplicates','type':'DataFrame'},body=df),\
            api.Message(attributes={'name':'transformation','type':'DataFrame'},body=pd.DataFrame(col1val_data))
Example #9
def process(msg):

    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'setValue'
    logger, log_stream = slog.set_logging(att_dict['operator'],
                                          loglevel='DEBUG' if api.config.debug_mode else 'INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')


    ###### start of doing calculation

    # map_values : column1: {from_value: to_value}, column2: {from_value: to_value}
    att_dict['config']['set_value'] = api.config.map_values
    maps_map = tfp.read_dict_of_dict(api.config.map_values)
    df.replace(maps_map,inplace=True)

    # Fill NaN value : column1: value, column2: value,
    att_dict['config']['fill_nan_values'] = api.config.fill_nan_values
    map_dict = tfp.read_dict(api.config.fill_nan_values)
    if map_dict:
        df.fillna(map_dict,inplace=True)

    ##############################################
    #  final infos to attributes and info message
    ##############################################

    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(example_rows):
        att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))

    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict,body=df)
    return log, msg
Example #10
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'setValue'
    logger, log_stream = slog.set_logging(att_dict['operator'],
                                          loglevel='DEBUG' if api.config.debug_mode else 'INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation

    # map_values : column1: {from_value: to_value}, column2: {from_value: to_value}
    maps_map = tfp.read_dict_of_dict(api.config.map_values)
    df.replace(maps_map, inplace=True)
    logger.info('Replace values: {}'.format(maps_map))

    # Fill NaN value : column1: value, column2: value,
    map_dict = tfp.read_dict(api.config.fill_nan_values)
    if map_dict:
        df.fillna(map_dict, inplace=True)
        logger.info('Fill nan values: {}'.format(map_dict))

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

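    # derive a batch-progress string from the storage.* attributes that an
    # upstream file reader is expected to set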
    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(
                att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(
                att_dict['storage.fileIndex'] + 1,
                att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
Example #11
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'drop_highly_unique'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode:
        logger.setLevel('DEBUG')

    time_monitor = tp.progress()
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body

    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold

    transform_data = {
        'column': [],
        'dtype': [],
        'unique_values': [],
        'action': []
    }
    for col in df[columns].select_dtypes(include='object').columns:
        unique_vals_num = len(df[col].unique())
        frac_unique_vals = unique_vals_num / df.shape[0]
        if frac_unique_vals > threshold:
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_values'].append(frac_unique_vals)
            transform_data['action'].append('drop')
            if not info_only:
                df.drop(columns=[col], inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict[
                'storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'],
                                          att_dict['storage.fileCount'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict,body=df),\
            api.Message(attributes={'name':'transformation','type':'DataFrame'},body=pd.DataFrame(transform_data))
Example #12
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'drop_1valuecolumns'
    logger, log_stream = slog.set_logging(att_dict['operator'],
                                          loglevel='DEBUG' if api.config.debug_mode else 'INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_shape = df.shape
    # Columns with 1 unique value
    columns = tfp.read_list(api.config.columns, df.columns)
    transform_data = {
        'column': [],
        'type': [],
        'unique_vals': [],
        'action': []
    }
    for col in columns:
        vals = df[col].unique()
        if len(vals) == 1:
            transform_data['column'].append(col)
            transform_data['type'].append(str(df[col].dtype))
            transform_data['unique_vals'].append(vals)
            transform_data['action'].append('drop')
            if not api.config.info_only:
                df.drop(columns=[col], inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict[
                'storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'],
                                          att_dict['storage.fileCount'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict,body=df),\
            api.Message(attributes={'name':'transformation','type':'DataFrame'},body=pd.DataFrame(transform_data))
Example #13
def process(msg):
    global setup_data
    global last_msg
    global hash_text_list

    operator_name = 'sentiment analysis'
    logger, log_stream = slog.set_logging(operator_name, loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    if api.config.debug_mode:
        api.send(outports[0]['name'], log_stream.getvalue())

    article_list = msg.body
    att_dict = msg.attributes
    att_dict['operator'] = operator_name

    sentiments_list = list()
    sentiments_table = list()
    media_set = set()
    for article in article_list:
        media_set.add(article['media'])

        # Ensure that text only analysed once
        if article['hash_text'] in hash_text_list:
            continue
        hash_text_list.append(article['hash_text'])

        if language_dict[article['media']] not in supported_languages:
            continue

        polarity, subjectivity = get_article_sentiment(article)

        sentiments_list.append({'HASH_TEXT': article['hash_text'],'POLARITY': polarity, 'SUBJECTIVITY': subjectivity})
        sentiments_table.append([article['hash_text'],polarity,subjectivity])


    logger.debug('Process ended, analysed media: {} - article sentiments analysed {}  - {}'.format(str(media_set), len(sentiments_list),\
                                                                                      time_monitor.elapsed_time()))


    table_att = {"columns": [
        {"class": "string", "name": "HASH_TEXT", "nullable": False, "type": {"hana": "INTEGER"}},
        {"class": "string", "name": "POLARITY", "nullable": True,"type": {"hana": "DOUBLE"}},
        {"class": "string", "name": "SUBJECTIVITY", "nullable": True, "type": {"hana": "DOUBLE"}}],
              "name": "DIPROJECTS.SENTIMENTS", "version": 1}

    api.send(outports[0]['name'], log_stream.getvalue())
    if sentiments_list:
        logger.debug("First Record: {}".format(str(sentiments_list[0])))
    api.send(outports[2]['name'], api.Message(attributes=att_dict, body=sentiments_list))

    att_dict['table'] = table_att
    if sentiments_table:
        logger.debug("First Record: {}".format(str(sentiments_table[0])))
    msg = api.Message(attributes=att_dict, body=sentiments_table)
    api.send(outports[1]['name'], msg)
Example #14
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'groupby'
    logger, log_stream = slog.set_logging(att_dict['operator'],
                                          loglevel='DEBUG' if api.config.debug_mode else 'INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    prev_att = msg.attributes
    df = msg.body
    prev_shape = df.shape

    ###### start of doing calculation

    # groupby list
    cols = tfp.read_list(api.config.groupby)

    # mapping aggregation
    try:
        colagg = tfp.read_dict(api.config.aggregation)
    except IndexError:
        logger.info('Aggregation is not a map; trying to parse a value instead')
        colagg = tfp.read_value(api.config.aggregation)

    # groupby
    logger.info('Group columns: {}'.format(cols))
    logger.info('Aggregation: {}'.format(colagg))
    logger.info('Index: {}'.format(api.config.index))
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop col
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols:
        logger.info('Drop columns: {}'.format(dropcols))
        df.drop(columns=dropcols,inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {}  - {}  '.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
Example #15
def process(db_msg):

    logger, log_stream = slog.set_logging('topic dispatcher',
                                          loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    columns = [c['name'] for c in db_msg.attributes['table']['columns']]
    df = pd.DataFrame(db_msg.body, columns=columns)

    # groupby and concatenate words
    gdf = df.groupby(['HASH_TEXT'])['WORD'].apply(' '.join)

    # create document-term matrix
    #tf_vectorizer = CountVectorizer(analyzer='word',
    #                                min_df=1,  # minimum reqd occurences of a word
    #                                # stop_words='german',             # remove stop words
    #                                lowercase=False,  # convert all words to lowercase
    #                                # token_pattern='[a-zA-Z0-9]{1,}',  # num chars > 3
    #                                # max_features=5000,             # max number of uniq words
    #                                )

    # tf means term-frequency in a document
    #dtm_tf = tf_vectorizer.fit_transform(gdf)
    # for tf dtm
    #lda_tf = LatentDirichletAllocation(n_components=30, learning_method='online', evaluate_every=-1, n_jobs=-1)
    #lda_tf.fit(dtm_tf)

    # get the first 10 keywords of each topic
    # get the words
    #feature_names = tf_vectorizer.get_feature_names()
    # topics can be extracted from components_
    #    date_today = str(date.today()) + '-'

    #    for topic_ii, topic in enumerate(lda_tf.components_):
    #        topic_id = str(date.today()) + '-' + str(topic_ii)
    #        language = 'DE'
    #        topic_type = 'LDA'
    #        topic_date = str(date.today())
    #        experiy_date = None
    #        attribute = None
    #        topic_words = [feature_names[ii] for ii in topic.argsort()[:-11:-1]]
    #        row = [topic_id, language, topic_type, topic_date, experiy_date, attribute]
    #        row.extend(topic_words)
    #        topic_list.append(row)

    #topic_np = np.array(topic_list, dtype='object')
    #col_names = ['TOPIC', 'LANGUAGE', 'TYPE', 'DATE', 'EXPERIY_DATE', 'ATTRIBUTE']
    #for ii in range(1, 11):
    #    col_names.append(f'KEYWORD_{ii}')

    #self.topic_df = pd.DataFrame(topic_np, columns=col_names)

    logger.debug('Process ended, {}'.format(time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())
Example #16
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'filter_by_population'
    logger, log_stream = slog.set_logging(att_dict['operator'],
                                          loglevel='DEBUG' if api.config.debug_mode else 'INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_cols = df.shape[1]

    columns = tfp.read_list(api.config.columns,df.columns,test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold
    logger.debug('Parameters: threshold={}, info_only={}'.format(threshold, info_only))

    transform_data = {'column': [], 'dtype': [], 'unique_vals': [],'action': []}
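    # population = share of non-null values; columns below the threshold are
    # dropped, except all-NaN columns whose single unique value is NaN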
    for col in columns:
        population = df[col].count() / df.shape[0]
        unique_vals = df[col].unique()
        if population < threshold and not (len(unique_vals) == 1 and np.isnan(unique_vals[0])):
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_vals'].append(unique_vals)
            transform_data['action'].append('drop')
            if not info_only:
                df.drop(columns=[col], inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0],df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att_dict['storage.fileIndex'] + 1,
                                                              att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {}  - {}  '.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict,body=df),\
            api.Message(attributes={'name':'transformation','type':'DataFrame'},body=pd.DataFrame(transform_data))
Example #17
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'filter_by_population'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode:
        logger.setLevel('DEBUG')

    time_monitor = tp.progress()

    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body

    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold

    transform_data = {
        'column': [],
        'dtype': [],
        'unique_vals': [],
        'action': []
    }
    for col in columns:
        population = df[col].count() / df.shape[0]
        unique_vals = df[col].unique()
        if population < threshold and not (len(unique_vals) == 1
                                           and np.isnan(unique_vals[0])):
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_vals'].append(unique_vals)
            transform_data['action'].append('drop')
            if not info_only:
                df.drop(columns=[col], inplace=True)

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))

    return log_stream.getvalue(), api.Message(attributes={'name':'filter_by_population','type':'DataFrame'},body=df),\
            api.Message(attributes={'name':'transformation','type':'DataFrame'},body=pd.DataFrame(transform_data))
Example #18
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'toCSV'
    logger, log_stream = slog.set_logging(att_dict['operator'],
                                          loglevel='DEBUG' if api.config.debug_mode else 'INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body
    if api.config.reset_index:
        logger.debug('Reset Index')
        df = df.reset_index()

    kwargs = tfp.read_dict(text=api.config.keyword_args, map_sep='=')

    if kwargs is not None:
        data_str = df.to_csv(sep=api.config.separator,
                             index=api.config.write_index,
                             **kwargs)
    else:
        data_str = df.to_csv(sep=api.config.separator,
                             index=api.config.write_index)
    # end custom process definition
    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    # create dict of columns and types for HANA
    map_hana = {'int8': 'TINYINT', 'int16': 'SMALLINT', 'int32': 'INTEGER', 'int64': 'BIGINT', 'float32': 'FLOAT',
                'float64': 'DOUBLE', \
                'object': 'VARCHAR', 'datetime64': 'TIMESTAMP'}
    col_dict = {c: str(df[c].dtype) for c in df.columns}
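    # object columns map to VARCHAR sized by the longest string in the column;
    # datetime columns map to TIMESTAMP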
    hana_table_dict = list()
    for c, ty in col_dict.items():
        if ty == 'object':
            size = df[c].str.len().max()
            hana_table_dict.append({
                'name': c,
                'type': map_hana[col_dict[c]],
                'size': size
            })
        elif 'datetime64' in ty:
            hana_table_dict.append({'name': c, 'type': 'TIMESTAMP'})
        else:
            hana_table_dict.append({'name': c, 'type': map_hana[col_dict[c]]})
    logger.info('For Hana table definition: {}'.format(hana_table_dict))

    log = log_stream.getvalue()
    return log, data_str
Example #19
def process(msg):

    words = msg.body
    att_dict = msg.attributes

    logger, log_stream = slog.set_logging('word_regex_cleansing',
                                          api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    if isinstance(words[0], list):
        words = [w[0] for w in words]

    regex_patterns = tfp.read_list(api.config.patterns)

    logger.info('Test mode: {}'.format(api.config.test_mode))
    logger.info('Number of words to cleanse: {}'.format(len(words)))

    word_type = tfp.read_value(api.config.word_type)
    if len(word_type) > 1:
        logger.warning(
            'Only one word type can be processed. Take first one only: {}'.
            format(word_type[0]))

    count = 0
    for ipat, pat in enumerate(regex_patterns):
        if pat == '':
            logger.warning('Empty pattern')
            continue
        cleansing_words = [w for w in words if re.match(pat, w)]
        logger.info('Execute pattern: {} ({}/{})'.format(
            pat, ipat, len(regex_patterns)))
        logger.info('Number of DELETE statements: {}'.format(
            len(cleansing_words)))
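        # send the log incrementally per pattern, then reset the stream so the
        # next send does not repeat earlier content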
        api.send(outports[0]['name'], log_stream.getvalue())
        log_stream.seek(0)
        log_stream.truncate()
        if not api.config.test_mode:
            for iw, w in enumerate(cleansing_words):
                if word_type:
                    sql = 'DELETE FROM WORD_INDEX WHERE WORD = \'' + w + '\' AND WORD_TYPE = \'' + word_type + '\';'
                else:
                    sql = 'DELETE FROM WORD_INDEX WHERE WORD = \'' + w + '\';'
                att_dict['message.indexBatch'] = count
                att_dict['message.lastBatch'] = False
                api.send(outports[1]['name'],
                         api.Message(attributes=att_dict, body=sql))
                count += 1

    sql = 'SELECT * FROM DUMMY;'
    att_dict['message.indexBatch'] = count
    att_dict['message.lastBatch'] = True
    api.send(outports[1]['name'], api.Message(attributes=att_dict, body=sql))
    api.send(outports[0]['name'], log_stream.getvalue())
Example #20
def process(msg):

    att_dict = msg.attributes
    att_dict['operator'] = 'castColumns'
    logger, log_stream = slog.set_logging(att_dict['operator'],
                                          loglevel='DEBUG' if api.config.debug_mode else 'INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body

    castmap = tfp.read_dict(api.config.cast)

    if castmap:
        for col, casttype in castmap.items():
            if api.config.round:
                df[col] = df[col].round()
            df[col] = df[col].astype(casttype)

    ###### end calculation

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(
                att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(
                att_dict['storage.fileIndex'] + 1,
                att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
Example #21
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'drop_highly_unique'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode:
        logger.setLevel('DEBUG')

    time_monitor = tp.progress()
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    prev_shape = df.shape

    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold

    transform_data = {
        'column': [],
        'dtype': [],
        'unique_values': [],
        'action': []
    }
    for col in df[columns].select_dtypes(include='object').columns:
        unique_vals_num = len(df[col].unique())
        frac_unique_vals = unique_vals_num / df.shape[0]
        if frac_unique_vals > threshold:
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_values'].append(frac_unique_vals)
            transform_data['action'].append('drop')
            if not info_only:
                df.drop(columns=[col], inplace=True)

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))

    logger.info('Dropped Columns: {}'.format(prev_shape[1] - df.shape[1]))

    return log_stream.getvalue(), api.Message(attributes={'name':'filter_by_population','type':'DataFrame'},body=df),\
            api.Message(attributes={'name':'transformation','type':'DataFrame'},body=pd.DataFrame(transform_data))
Example #22
def process(msg):

    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'dropColumns'
    logger, log_stream = slog.set_logging(att_dict['operator'],
                                          loglevel='DEBUG' if api.config.debug_mode else 'INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    att_dict['config']['drop_columns'] = api.config.drop_columns
    drop_cols = tfp.read_list(api.config.drop_columns, df.columns)
    if drop_cols:
        logger.debug("Drops columns: {}".format(str(drop_cols)))
        df = df.drop(columns=drop_cols)

    att_dict['config']['rename_columns'] = api.config.rename_columns
    map_names = tfp.read_dict(api.config.rename_columns)
    if map_names:
        df.rename(columns=map_names, inplace=True)
    ###### end of doing calculation
    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    # df from body
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(example_rows):
        att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict,body=df)
    return log, msg
Example #23
def process(msg):

    att_dict = msg.attributes

    logger, log_stream = slog.set_logging('word_index_regex',
                                          api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    # regex patterns
    regex_patterns = tfp.read_list(api.config.patterns)

    # word type
    word_types = tfp.read_list(api.config.word_types)
    if not word_types:
        logger.warning(
            'Word types had to be defined. Default word type : \'PROPN\'')
        word_types = ['PROPN']

    # pandas Dataframe and select only values with right word_type
    cols = [c["name"] for c in msg.attributes['table']['columns']]
    df = pd.DataFrame(msg.body, columns=cols)
    df_p = df.loc[df['TYPE'].isin(word_types)]

    # Language filter
    language_filter = tfp.read_list(api.config.language_filter)
    if language_filter:
        df_p = df_p.loc[df['LANGUAGE'].isin(language_filter)]

    # get unique words to get words that comply with regex
    words = df_p['WORD'].unique()
    logger.info('Number of words to test with regex pattern: {}'.format(
        len(words)))

    cleansing_words = set()
    for ipat, pat in enumerate(regex_patterns):
        if pat == '':
            logger.warning('Empty pattern')
            continue
        logger.info('Execute pattern: {} ({}/{})'.format(
            pat, ipat, len(regex_patterns)))
        # collect the matching words of all patterns
        cleansing_words.update(w for w in words if re.match(pat, w))

    df = df.loc[~df['WORD'].isin(cleansing_words)]

    api.send(outports[1]['name'],
             api.Message(attributes=att_dict, body=df.values.tolist()))
    api.send(outports[0]['name'], log_stream.getvalue())
Example #24
def process(msg):
    global last_msg
    global hash_list
    global lexicon_stem, lexicon

    logger, log_stream = slog.set_logging(operator_name, api.config.debug_mode)

    # Check if setup complete
    msg = check_for_setup(logger, msg)
    if not msg:
        api.send(outports[0]['name'], log_stream.getvalue())
        return 0

    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    att_dict = msg.attributes

    # pandas Dataframe and select only values with right word_type
    cols = [c["name"] for c in msg.attributes['table']['columns']]
    df = pd.DataFrame(msg.body, columns=cols)

    # word type
    types = tfp.read_list(api.config.types)
    if not types:
        logger.warning(
            'Word types had to be defined. Default word type : \'PROPN\'')
        types = ['PROPN']

    # Language filter
    languages = tfp.read_list(api.config.languages)
    if not languages:
        logger.warning(
            'Languages had to be defined. Default languages : EN, FR, ES, DE')
        languages = ['EN', 'FR', 'ES', 'DE']

    for lang in lexicon:
        for w in lexicon[lang]:
            df.loc[(df['TYPE'].isin(types)) & (df['LANGUAGE'] == lang) &
                   (df['WORD'] == w), 'WORD'] = lexicon[lang][w]
        for w in lexicon_stem[lang]:
            df.loc[(df['TYPE'].isin(types)) & (df['LANGUAGE'] == lang) &
                   (df['WORD'] == w), 'WORD'] = lexicon_stem[lang][w]

    api.send(outports[1]['name'],
             api.Message(attributes=att_dict, body=df.values.tolist()))
    api.send(outports[0]['name'], log_stream.getvalue())
Example #25
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'drop_duplicates'
    logger, log_stream = slog.set_logging(att_dict['operator'],
                                          loglevel='DEBUG' if api.config.debug_mode else 'INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_shape = df.shape

    drop_cols_test = tfp.read_list(api.config.columns, df.columns)
    keep = tfp.read_value(api.config.keep, test_number=False)
    df.drop_duplicates(subset=drop_cols_test, keep=keep, inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(
                att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(
                att_dict['storage.fileIndex'] + 1,
                att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
Example #26
def process(msg):

    att_dict = msg.attributes
    att_dict['operator'] = 'dropColumns'
    logger, log_stream = slog.set_logging(att_dict['operator'],
                                          loglevel='DEBUG' if api.config.debug_mode else 'INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    drop_cols = tfp.read_list(api.config.drop_columns, df.columns)
    if drop_cols:
        logger.debug("Drops columns: {}".format(str(drop_cols)))
        df = df.drop(columns=drop_cols)

    map_names = tfp.read_dict(api.config.rename_columns)
    if map_names:
        df.rename(columns=map_names, inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {}  - {}  '.format(progress_str,time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict,body=df)
Example #27
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')
    time_monitor = tp.progress()

    result = ''
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())
    for i in range(0, msg.body):
        result += str(
            i) + ':' + api.config.var1 + ' - ' + api.config.var2 + '    '
    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())
    return api.Message(attributes={
        'name': 'concat',
        'type': 'str'
    },
                       body=result), log_stream.getvalue()
Example #28
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'drop_1valuecolumns'
    logger, log_stream = slog.set_logging(att_dict['operator'],
                                          loglevel='DEBUG' if api.config.debug_mode else 'INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_shape = df.shape
    # Columns with 1 unique value
    columns = tfp.read_list(api.config.columns, df.columns)
    col1val_data = {'column': [], 'type': [], 'unique_vals': [], 'action': []}
    for col in columns:
        vals = df[col].unique()
        if len(vals) == 1:
            col1val_data['column'].append(col)
            col1val_data['type'].append(str(df[col].dtype))
            col1val_data['unique_vals'].append(vals)
            col1val_data['action'].append('drop')
            if not api.config.info_only:
                df.drop(columns=[col], inplace=True)

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))

    logger.info('Dropped columns: {}'.format(prev_shape[1] - df.shape[1]))

    return log_stream.getvalue(), api.Message(attributes={'name':'drop_duplicates','type':'DataFrame'},body=df),\
            api.Message(attributes={'name':'transformation','type':'DataFrame'},body=pd.DataFrame(col1val_data))
Example #29
def process(msg):

    logger, log_stream = slog.set_logging('metadata_articles', loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    adict = msg.body
    att_dict = msg.attributes

    metadata_articles = list()
    articles_table = list()

    for index_article, article in enumerate(adict):
        metadata = {'media': article['media'], 'date': article['date'], 'language': language[article['media']], \
                    'hash_text': article['hash_text'], 'url': article['url'][:255], 'rubrics': article['rubrics'],
                    'title': article['title'][:255]}
        metadata['num_characters'] = len(article['text'])
        metadata_articles.append(metadata)
        datea = datetime.strptime(article['date'], '%Y-%m-%d').replace(tzinfo=timezone.utc)
        articles_table.append([article['media'],datea,language[article['media']],article['hash_text'],\
                              article['url'][:255],article['rubrics'],article['title'][:255]])


    table_att = {"columns": [
        {"class": "string", "name": "MEDIA", "nullable": True,  "size": 80, "type": {"hana": "NVARCHAR"}},
        {"class": "string", "name": "DATE", "nullable": False, "type": {"hana": "DATETIME"}},
        {"class": "string", "name": "LANGUAGE", "nullable": True, "size": 2, "type": {"hana": "NVARCHAR"}},
        {"class": "string", "name": "HASH_TEXT", "nullable": True, "type": {"hana": "INTEGER"}},
        {"class": "string", "name": "URL", "nullable": True, "size": 255,"type": {"hana": "NVARCHAR"}},
        {"class": "string", "name": "RUBRICS", "nullable": True, "size": 80,"type": {"hana": "NVARCHAR"}},
        {"class": "string", "name": "TITLE", "nullable": True, "size": 255,"type": {"hana": "NVARCHAR"}}],
              "name": "DIPROJECTS.ARTICLES_METADATA2", "version": 1}

    logger.debug('Process ended, articles processed {}  - {}  '.format(len(adict), time_monitor.elapsed_time()))

    att_dict['content'] = 'metadata for articles'
    # JSON
    msg = api.Message(attributes=att_dict, body=metadata_articles)
    api.send(outports[2]['name'], msg)
    # TABLE
    att_dict['table'] = table_att
    msg = api.Message(attributes=att_dict, body=articles_table)
    api.send(outports[1]['name'], msg)

    api.send(outports[0]['name'], log_stream.getvalue())
Example #30
def process(db_msg):

    logger, log_stream = slog.set_logging('topic_index',
                                          loglevel=api.config.debug_mode)
    logger.info("Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    att_dict = dict()
    topic = db_msg.attributes['topic']
    keywords = db_msg.attributes['keywords']
    columns = [c['name'] for c in db_msg.attributes['table']['columns']]
    columns = ['KEYWORD' if c == 'WORD' else c for c in columns]

    att_dict['topic'] = topic
    att_dict['tolerance'] = api.config.tolerance
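    # tolerance < 1.0 is read as the fraction of keywords that may be missing;
    # larger values are read as an absolute number of keywords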
    if api.config.tolerance < 1.0:
        min_keyword_num = len(keywords) - int(
            api.config.tolerance * len(keywords))
    else:
        min_keyword_num = len(keywords) - api.config.tolerance

    df = pd.DataFrame(db_msg.body, columns=columns)
    logger.debug('Input DataFrame: {} - {}'.format(df.shape[0], df.shape[1]))
    # filter out all indices of not containing keyword - not needed when getting the sql output directly
    #df = df.loc[df['KEYWORD'].isin(keywords)]

    num_keyword_articles = df.shape[0]
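    # count keyword hits per article and keep articles reaching min_keyword_num;
    # the simultaneous rename swaps labels so 'count' holds the hit count and
    # 'min_num' the threshold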
    g_df = df.groupby(['HASH_TEXT']).count().reset_index()
    g_df = g_df.loc[g_df['KEYWORD'] >= min_keyword_num]
    g_df['count'] = min_keyword_num
    g_df['topic'] = topic
    g_df.rename(columns={'KEYWORD': 'count', 'count': 'min_num'}, inplace=True)
    g_df = g_df[['HASH_TEXT', 'topic', 'count', 'min_num']]

    topic_index = g_df.to_dict('records')

    topic_index_msg = api.Message(attributes=att_dict, body=topic_index)
    logger.info('Topic found in articles: \"{}\" in {}/{} ({}/{})'.format(topic,g_df.shape[0],num_keyword_articles,\
                                                                          min_keyword_num,len(keywords)))

    logger.debug('Process ended,  {}'.format(time_monitor.elapsed_time()))
    api.send(outports[1]['name'], topic_index_msg)
    api.send(outports[0]['name'], log_stream.getvalue())