Example #1
def process(msg):

    logger, log_stream = slog.set_logging('topic dispatcher',
                                          loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    table = tfp.read_value(api.config.topic_table)
    column = tfp.read_value(api.config.table_colum)

    topics = msg.body
    for t in topics:
        topic_keywords = [
            "'" + t[i] + "'" for i in range(6, len(t)) if not t[i] == ''
        ]
        sql = 'SELECT * FROM ' + table + ' WHERE ' + column + ' IN(' + ','.join(
            topic_keywords) + ')'
        att_dict = {'topic': t[0], 'keywords': topic_keywords}
        sql_msg = api.Message(attributes=att_dict, body=sql)
        api.send(outports[1]['name'], sql_msg)
        logger.debug('Send sql: {}'.format(sql))

    logger.debug('Process ended, topics processed {}  - {}  '.format(
        len(topics), time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())
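
The loop above turns every topic row into a SELECT ... WHERE ... IN (...) statement. A minimal stand-alone sketch of that string assembly, with a hypothetical table, column and topic row (keywords start at index 6, as in the snippet):

# hypothetical values, for illustration only
table = 'TOPICS'
column = 'KEYWORD'
t = ['politics', '', '', '', '', '', 'election', 'vote', '']

topic_keywords = ["'" + t[i] + "'" for i in range(6, len(t)) if not t[i] == '']
sql = 'SELECT * FROM ' + table + ' WHERE ' + column + ' IN(' + ','.join(topic_keywords) + ')'
print(sql)  # SELECT * FROM TOPICS WHERE KEYWORD IN('election','vote')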
Example #2
def process():

    operator_name = 'sql_word_index'
    logger, log_stream = slog.set_logging(operator_name, api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    language = tfp.read_value(api.config.language)
    type_limit = tfp.read_dict(api.config.type_limit_map)
    table_name = tfp.read_value(api.config.table_name)
    text_id_col = tfp.read_value(api.config.text_id_col)

    for i, [wtype, limit] in enumerate(type_limit.items()):
        sql_s = "SELECT {tid}, \"{tn}\".LANGUAGE, \"{tn}\".TYPE, \"{tn}\".WORD, COUNT FROM \"{tn}\" INNER JOIN"\
                "(SELECT WORD, TYPE, LANGUAGE, SUM(COUNT) as CUMS FROM \"{tn}\" "\
                "WHERE LANGUAGE = \'{lang}\' AND TYPE = \'{wt}\' "\
                "GROUP BY WORD, TYPE, LANGUAGE) AS CTABLE ON "\
                "\"{tn}\".WORD = CTABLE.WORD AND \"{tn}\".TYPE = CTABLE.TYPE AND \"{tn}\".LANGUAGE = CTABLE.LANGUAGE "\
                "WHERE CUMS >= {lt}".format(tid = text_id_col,tn = table_name,lang=language,wt = wtype,lt=limit)

        lastbatch = (len(type_limit) == i + 1)
        att_dict = {'operator': operator_name,
                    'parameter': {'type': wtype, 'limit': limit, 'language': language},
                    'message.batchIndex': i, 'message.batchSize': len(type_limit),
                    'message.lastBatch': lastbatch}
        msg = api.Message(attributes=att_dict, body=sql_s)
        api.send(outports[1]['name'], msg)

    api.send(outports[0]['name'], log_stream.getvalue())
Example #3
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'drop_duplicates'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')

    time_monitor = tp.progress()

    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    before_num_rows = df.shape[0]
    drop_cols_test = tfp.read_list(api.config.columns,df.columns)
    keep = tfp.read_value(api.config.keep,test_number=False)
    df.drop_duplicates(subset=drop_cols_test, keep=keep, inplace=True)
    logger.debug('Duplicate Rows: {}'.format(before_num_rows - df.shape[0]))

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0],df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))

    return log_stream.getvalue(), api.Message(attributes={'name':'drop_duplicates','type':'DataFrame'},body=df),
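
A self-contained illustration of the drop_duplicates call used above; the column names and data are made up for the example:

import pandas as pd

df = pd.DataFrame({'id': [1, 1, 2, 2], 'city': ['Rome', 'Rome', 'Oslo', 'Bergen']})
before_num_rows = df.shape[0]
df.drop_duplicates(subset=['id', 'city'], keep='first', inplace=True)
print('Duplicate rows:', before_num_rows - df.shape[0])  # Duplicate rows: 1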
Example #4
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'groupby'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')

    prev_att = msg.attributes
    df = msg.body

    ###### start of doing calculation

    # groupby list
    cols = tfp.read_list(api.config.groupby)
    att_dict['config']['groupby'] = api.config.groupby

    # mapping aggregation
    try:
        colagg = tfp.read_dict(api.config.aggregation)
    except IndexError:
        logger.info('Aggregation is not a map, try to parse a value instead')
        colagg = tfp.read_value(api.config.aggregation)
    att_dict['config']['aggregation'] = api.config.aggregation

    # groupby
    logger.debug('Group columns: {}'.format(cols))
    logger.debug('Aggregation: {}'.format(colagg))
    logger.debug('Index: {}'.format(api.config.index))
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop col
    att_dict['config']['dropcols'] = api.config.drop_columns
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols:
        df.drop(columns=dropcols, inplace=True)

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    att_dict['operator'] = 'groupbyDataFrame'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = list(df.columns)
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    example_rows = EXAMPLE_ROWS if att_dict[
        'number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str(
            [str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
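
A small sketch of the groupby/aggregation pattern above, using a hypothetical column-to-aggregation mapping of the kind tfp.read_dict would return:

import pandas as pd

df = pd.DataFrame({'shop': ['A', 'A', 'B'], 'sales': [10, 20, 5], 'visits': [1, 2, 3]})
colagg = {'sales': 'sum', 'visits': 'mean'}

df = df.groupby(['shop'], as_index=False).agg(colagg)
print(df)
#   shop  sales  visits
# 0    A     30     1.5
# 1    B      5     3.0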
Example #5
def process(msg) :
    att_dict = msg.attributes
    att_dict['operator'] = 'groupby'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    prev_att = msg.attributes
    df = msg.body
    prev_shape = df.shape

    ###### start of doing calculation

    # groupby list
    cols = tfp.read_list(api.config.groupby)

    # mapping aggregation
    try :
        colagg = tfp.read_dict(api.config.aggregation)
    except IndexError :
        logger.info('Aggregation is not a map, try to parse a value instead')
        colagg = tfp.read_value(api.config.aggregation)

    # groupby
    logger.info('Group columns: {}'.format(cols))
    logger.info('Aggregation: {}'.format(colagg))
    logger.info('Index: {}'.format(api.config.index))
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop col
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols :
        logger.info('Drop columns: {}'.format(dropcols))
        df.drop(columns=dropcols,inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {}  - {}  '.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
Example #6
def process(msg):

    words = msg.body
    att_dict = msg.attributes

    logger, log_stream = slog.set_logging('word_regex_cleansing',
                                          api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    if isinstance(words[0], list):
        words = [w[0] for w in words]

    regex_patterns = tfp.read_list(api.config.patterns)

    logger.info('Test mode: {}'.format(api.config.test_mode))
    logger.info('Number of words to cleanse: {}'.format(len(words)))

    word_type = tfp.read_value(api.config.word_type)
    if len(word_type) > 1:
        logger.warning(
            'Only one word type can be processed. Take first one only: {}'.
            format(word_type[0]))

    count = 0
    for ipat, pat in enumerate(regex_patterns):
        if pat == '':
            logger.warning('Empty pattern')
            continue
        cleansing_words = [w for w in words if re.match(pat, w)]
        logger.info('Execute pattern: {} ({}/{})'.format(
            pat, ipat, len(regex_patterns)))
        logger.info('Number of DELETE statements: {}'.format(
            len(cleansing_words)))
        api.send(outports[0]['name'], log_stream.getvalue())
        log_stream.seek(0)
        log_stream.truncate()
        if not api.config.test_mode:
            for iw, w in enumerate(cleansing_words):
                if word_type:
                    sql = 'DELETE FROM WORD_INDEX WHERE WORD = \'' + w + '\' AND WORD_TYPE = \'' + word_type + '\';'
                else:
                    sql = 'DELETE FROM WORD_INDEX WHERE WORD = \'' + w + '\';'
                att_dict['message.indexBatch'] = count
                att_dict['message.lastBatch'] = False
                api.send(outports[1]['name'],
                         api.Message(attributes=att_dict, body=sql))
                count += 1

    sql = 'SELECT * FROM DUMMY;'
    att_dict['message.indexBatch'] = count
    att_dict['message.lastBatch'] = True
    api.send(outports[1]['name'], api.Message(attributes=att_dict, body=sql))
    api.send(outports[0]['name'], log_stream.getvalue())
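
The filtering step above relies on re.match, which anchors the pattern at the start of the word. A stand-alone sketch with an illustrative pattern and word list:

import re

words = ['2020', 'covid19', 'election', '15th']
pattern = r'\d+'

cleansing_words = [w for w in words if re.match(pattern, w)]
print(cleansing_words)  # ['2020', '15th']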
Example #7
def process(msg):
    global blacklist
    global last_msg
    global id_set

    logger, log_stream = slog.set_logging(operator_name, api.config.debug_mode)

    # Check if setup complete
    msg = check_for_setup(logger, msg)
    if not msg:
        api.send(outports[0]['name'], log_stream.flush())
        return 0

    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    att_dict = msg.attributes

    language = tfp.read_value(api.config.language)
    type = tfp.read_value(api.config.type)
    if len(type) > 1:
        logger.warning(
            'Only one type can be transformed. Take only first one: {}'.format(
                type[0]))
        type = type[0]

    # DELETE all new type rows
    sql = 'DELETE FROM "WORD_INDEX" WHERE  "TYPE" = \'Q\' '
    # COPY 'TYPE' to 'NEW TYPE'
    sql = 'SELECT * FROM "WORD_INDEX" WHERE "TYPE" = \'' + type + '\' '
    if language:
        sql += 'AND "LANGUAGE" = \'' + language + '\' '

    # REMOVE ALL BLACKLIST

    # REPLACE LEXICON

    api.send(outports[0]['name'], log_stream.getvalue())
Example #8
def process(msg):

    logger, log_stream = slog.set_logging('word_frequency_sql', loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    language = tfp.read_value(api.config.language)
    type = tfp.read_value(api.config.type)
    min_count = api.config.min_count

    sql_statement = 'INSERT INTO WORD_FREQUENCY ("DATE", "LANGUAGE", "TYPE", "WORD", "COUNT") '\
                    'SELECT * FROM (SELECT AM."DATE", WI."LANGUAGE", "TYPE","WORD", sum("COUNT") ' \
                    'AS "COUNT" FROM "WORD_INDEX" AS WI INNER JOIN "ARTICLES_METADATA" AS AM ' \
                    'ON WI."HASH_TEXT" = AM."HASH_TEXT"'

    if type or language :
        sql_statement += ' WHERE'

    if language :
        sql_statement += ' WI."LANGUAGE" = \'' + language + '\''

    if type :
        if language :
            sql_statement += ' AND ('
        for i, t in enumerate(type):
            if i > 0 :
                sql_statement += ' OR'
            sql_statement += ' "TYPE" = \'' + t + '\''
        if language :
            sql_statement += ')'

    sql_statement += ' GROUP BY "DATE", "WORD","TYPE",WI."LANGUAGE") WHERE "COUNT">= ' + str(min_count) + ' ;'

    logger.debug('Process ended, articles processed {}'.format(time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())
    api.send(outports[1]['name'], sql_statement)
Example #9
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'drop_duplicates'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_shape = df.shape

    drop_cols_test = tfp.read_list(api.config.columns, df.columns)
    keep = tfp.read_value(api.config.keep, test_number=False)
    df.drop_duplicates(subset=drop_cols_test, keep=keep, inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(
                att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(
                att_dict['storage.fileIndex'] + 1,
                att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
Example #10
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'drop_duplicates'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_shape = df.shape

    drop_cols_test = tfp.read_list(api.config.columns, df.columns)
    keep = tfp.read_value(api.config.keep, test_number=False)
    df.drop_duplicates(subset=drop_cols_test, keep=keep, inplace=True)

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))

    logger.info('Dropped duplicates: {}'.format(prev_shape[0] - df.shape[0]))

    return log_stream.getvalue(), api.Message(attributes={
        'name': 'drop_duplicates',
        'type': 'DataFrame'
    },
                                              body=df),
Example #11
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')
    time_monitor = tp.progress()

    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    before_num_rows = df.shape[0]
    drop_cols_test = tfp.read_list(api.config.columns, df.columns)
    keep = tfp.read_value(api.config.keep, test_number=False)
    df.drop_duplicates(subset=drop_cols_test, keep=keep, inplace=True)
    logger.debug('Duplicate Rows: {}'.format(before_num_rows - df.shape[0]))

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())
    return log_stream.getvalue(), api.Message(attributes={
        'name': 'drop_duplicates',
        'type': 'DataFrame'
    },
                                              body=df),
Example #12
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'anonymizeData'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')

    time_monitor = tp.progress()

    result = ''
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation

    model = LGBMRegressor(n_estimators=200,
                          learning_rate=0.03,
                          num_leaves=32,
                          colsample_bytree=0.9497036,
                          subsample=0.8715623,
                          max_depth=8,
                          reg_alpha=0.04,
                          reg_lambda=0.073,
                          min_split_gain=0.0222415,
                          min_child_weight=40)

    att_dict['config']['train columns'] = api.config.train_cols
    train_cols = tfp.read_list(api.config.train_cols, df.columns)

    att_dict['config']['label'] = api.config.label
    label = tfp.read_value(api.config.label)
    if not label:
        raise ValueError('Label is mandatory')

    # cast to categorical dtype
    for c in df[train_cols].select_dtypes(include='category').columns:
        unique_num = len(df[c].unique())
        nan_num = df[c].isna().count()
        logger.debug('Cast to category - {}: unique {}, nan: {} of {}'.format(
            c, unique_num, nan_num, df.shape[0]))
        df[c] = df[c].cat.codes
        df[c] = df[c].astype('int32')

    if pd.api.types.is_categorical(df[label]):
        df[label] = df[label].astype('category')
        logger.debug('Cast label to <category>')
        df[label] = df[label].cat.codes
        df[label] = df[label].astype('int32')

    print(df.select_dtypes(include='category').head(10))
    logger.debug('Train with {} features'.format(len(train_cols)))
    print(train_cols)
    model.fit(df[train_cols], df[label], eval_metric='auc')

    ###### end of doing calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################

    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))
    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())
    return log_stream.getvalue(), api.Message(attributes=att_dict, body=model)
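
A minimal fit/predict sketch for the LGBMRegressor used above, assuming the lightgbm package is installed; the data and hyperparameters are illustrative only:

import numpy as np
from lightgbm import LGBMRegressor

X = np.random.rand(200, 3)
y = 2 * X[:, 0] + 0.1 * np.random.rand(200)

model = LGBMRegressor(n_estimators=50, learning_rate=0.1, num_leaves=15)
model.fit(X, y)
print(model.predict(X[:5]))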
Example #13
def process(msg):

    global blacklist
    global last_msg
    global hash_text_list

    operator_name = 'keyword_search'
    logger, log_stream = slog.set_logging(operator_name,
                                          loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    # Case: Keywords has been set before
    msg = check_for_setup(logger, msg)
    if not msg:
        api.send(outports[0]['name'], log_stream.flush())
        return 0

    if api.config.debug_mode:
        api.send(outports[0]['name'], log_stream.getvalue())

    article_list = msg.body
    att_dict = msg.attributes
    att_dict['operator'] = operator_name
    kw_index = list()

    word_mode = tfp.read_value(api.config.mode)
    media_set = set()
    for article in article_list:
        media_set.add(article['media'])
        if article['hash_text'] in hash_text_list:
            continue
        hash_text_list.append(article['hash_text'])
        alanguage = language_dict[article['media']]
        if not alanguage in setup_data:
            continue
        words = get_words(logger,
                          article['text'],
                          language=alanguage,
                          mode=word_mode)
        matched_words = [w for w in words if w in setup_data[alanguage]]
        s_words = [w for w in words if len(w) > 4 and w[-1] == 's']
        s_matched_words = [
            w[:-1] for w in s_words if w in setup_data[alanguage]
        ]
        matched_words.extend(s_matched_words)
        word_counter = Counter(matched_words)
        for word in word_counter:
            keyword_rec = {
                'hash_text': article['hash_text'],
                'keyword': word,
                'count': word_counter[word]
            }
            kw_index.append(keyword_rec)
            #logger.debug('Keyword: {}'.format(str(keyword_rec)))
    logger.debug(
        'Process ended, searched media: {} - keywords found {}  - {}'.format(
            str(media_set), len(kw_index), time_monitor.elapsed_time()))

    api.send(outports[0]['name'], log_stream.getvalue())
    api.send(outports[1]['name'],
             api.Message(attributes=att_dict, body=kw_index))
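
The matching above also keeps plural forms whose singular is a keyword. A sketch of that intent with a hypothetical keyword set (in this sketch the stripped singular is checked against the set):

from collections import Counter

keywords = {'election', 'vote'}
words = ['election', 'elections', 'votes', 'weather']

matched_words = [w for w in words if w in keywords]
s_words = [w for w in words if len(w) > 4 and w[-1] == 's']
matched_words.extend(w[:-1] for w in s_words if w[:-1] in keywords)
print(Counter(matched_words))  # Counter({'election': 2, 'vote': 1})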
Example #14
def process(msg):
    global blacklist
    global last_msg
    global word_counter
    global hash_list
    global article_count
    global word_lang_counter

    logger, log_stream = slog.set_logging('word_frequency',
                                          loglevel=api.config.debug_mode)
    logger.info("Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    msg = check_for_setup(logger, msg)
    if not msg:
        api.send(outports[0]['name'], log_stream.flush())
        return 0

    adict = msg.body
    att_dict = msg.attributes

    end_date = datetime.strptime(api.config.date, '%Y-%m-%d')
    start_date = end_date - timedelta(days=api.config.days_into_past)

    language_filter = tfp.read_value(api.config.language)
    media_filter = tfp.read_value(api.config.media)

    for index_article, article in enumerate(adict):

        # filter article
        adate = datetime.strptime(article['date'], '%Y-%m-%d')
        if not start_date <= adate <= end_date:
            #logger.debug('Date of article out of range: {} ({} - {})'.format(adate,start_date,end_date))
            continue

        language = language_dict[article['media']]

        # filter language
        if language_filter and not language_filter == language:
            #logger.debug('Language filtered out: {} ({})'.format(language, language_filter))
            continue

        # filter media
        if media_filter and not media_filter == article['media']:
            #logger.debug('Media filtered out: {} ({})'.format(article['media'], media_filter))
            continue

        article_count += 1
        # check if article has been processed
        if article['hash_text'] in hash_list:
            logger.debug(
                'Article has already been processed: {} - {} - {}'.format(
                    article['date'], article['media'], article['hash_text']))
            word_counter.update(hash_list[article['hash_text']])
            continue

        text = article['text']

        # Language settings
        if language == 'DE':
            doc = nlp_g(text)
        elif language == 'FR':
            doc = nlp_fr(text)
        elif language == 'ES':
            doc = nlp_es(text)
        else:
            logger.warning('Language not implemented')
            doc = None
            words = []

        # only when doc has been created - language exists
        if doc:
            if api.config.mode == 'NOUN':
                words = [
                    token.lemma_[:api.config.max_word_len] for token in doc
                    if token.pos_ in ['PROPN', 'NOUN']
                ]
            elif api.config.mode == 'PROPN':
                words = [
                    token.lemma_[:api.config.max_word_len] for token in doc
                    if token.pos_ == 'PROPN'
                ]
            else:
                words = [
                    token.text[:api.config.max_word_len] for token in doc
                    if not token.is_stop
                ]

        # Remove blacklist words
        words = [w for w in words if w not in setup_data]

        word_lang_counter[language].update(words)
        #word_counter.update(words)
        hash_list[article['hash_text']] = words

    result, progress_str = test_last_batch(attributes=att_dict,
                                           collect=api.config.collect)
    if result:
        # check for ending 's'
        word_freq = list()
        for lang in word_lang_counter:
            word_lang_counter[lang] = check_ending_s(logger,
                                                     word_lang_counter[lang])
            if api.config.limit > 0:
                common_words = word_lang_counter[lang].most_common(
                    api.config.limit)
            else:
                common_words = word_lang_counter[lang].most_common()
            wf = [ {'date':api.config.date,'days_into_past':api.config.days_into_past, 'language':lang, \
                           'media':api.config.media,'mode':api.config.mode, 'index':i,'word':w[0], 'frequency': w[1]} \
                          for i,w in enumerate(common_words) ]
            word_freq.extend(wf)
        msg = api.Message(attributes=att_dict, body=word_freq)
        api.send(outports[1]['name'], msg)
        logger.info("Articles processed:{}  - Hashed articles: {}".format(
            article_count, len(hash_list)))

    logger.info('File processed: {} #Articles: {}  Collection:{}'.format(
        att_dict["storage.filename"], len(adict), api.config.collect))
    logger.debug('Process ended,  {}  - {}'.format(
        progress_str, time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())
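
A sketch of the per-language Counter bookkeeping used above: one Counter per language is updated per article, and the most frequent words are read out at the end. Words and languages are illustrative:

from collections import Counter, defaultdict

word_lang_counter = defaultdict(Counter)
word_lang_counter['DE'].update(['wahl', 'wahl', 'regierung'])
word_lang_counter['FR'].update(['election'])

for lang, counter in word_lang_counter.items():
    print(lang, counter.most_common(2))
# DE [('wahl', 2), ('regierung', 1)]
# FR [('election', 1)]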
Example #15
def process(left_msg, right_msg):

    att_dict = left_msg.attributes
    att_dict['operator'] = 'join'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition

    l_att = left_msg.attributes
    r_att = right_msg.attributes

    # read stream from memory
    left_df = left_msg.body
    right_df = right_msg.body

    ###### start of doing calculation
    how = tfp.read_value(api.config.how)

    # merge according to config
    if api.config.on_index:
        df = pd.merge(left_df,
                      right_df,
                      how=how,
                      left_index=True,
                      right_index=True)
    elif api.config.left_on and api.config.right_on:
        left_on_list = tfp.read_list(api.config.left_on)
        right_on_list = tfp.read_list(api.config.right_on)
        logger.info('Join DataFrames on {} - {}'.format(
            left_on_list, right_on_list))
        left_df.reset_index(inplace=True)
        right_df.reset_index(inplace=True)

        df = pd.merge(left_df,
                      right_df,
                      how=how,
                      left_on=left_on_list,
                      right_on=right_on_list)

        # removing second index - might be a more elegant solution
        if 'index_x' in df.columns:
            df.drop(columns=['index_x'], inplace=True)
    else:
        raise ValueError(
            "Config setting: Either <on> or both <left_on> and <right_on> has to be set in order to join the dataframes"
        )

    index_list = tfp.read_list(api.config.new_indices)
    if index_list:
        df.set_index(keys=index_list, inplace=True)
        logger.info('Set index: {}'.format(index_list))

    col_list = tfp.read_list(api.config.drop_columns)
    if col_list:
        df.drop(labels=col_list, axis=1, inplace=True)
        logger.info('Drop columns: {}'.format(col_list))

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict[
                'storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'],
                                          att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
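
A small pd.merge sketch mirroring the left_on/right_on branch above; column names and data are illustrative:

import pandas as pd

left_df = pd.DataFrame({'key': [1, 2, 3], 'left_val': ['a', 'b', 'c']})
right_df = pd.DataFrame({'rkey': [2, 3, 4], 'right_val': ['x', 'y', 'z']})

df = pd.merge(left_df, right_df, how='inner', left_on=['key'], right_on=['rkey'])
print(df)
#    key left_val  rkey right_val
# 0    2        b     2         x
# 1    3        c     3         y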
Example #16
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'splitSample'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')

    time_monitor = tp.progress()

    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    att_dict['config']['split'] = api.config.split
    if api.config.split > df.shape[0]:
        logger.warning('Split larger than whole sample')
        split = 1
    elif api.config.split > 1:
        split = api.config.split / df.shape[0]
    else:
        split = api.config.split

    att_dict['config']['to_category'] = api.config.to_category
    if api.config.to_category:
        for col in df.select_dtypes(include=np.object).columns:
            unique_num = len(df[col].unique())
            nan_num = df[col].isna().count()
            logger.debug(
                'Cast to category - {}: unique {}, nan: {} of {}'.format(
                    col, unique_num, nan_num, df.shape[0]))
            df[col] = df[col].astype('category')

    att_dict['config']['label'] = api.config.label
    label = tfp.read_value(api.config.label)
    if label:
        label_vals = list(df[label].unique())
        tdf = list()
        for lab in label_vals:
            tdf.append(df.loc[df[label] == lab].sample(
                frac=split, random_state=api.config.seed))
        train_df = pd.concat(tdf)
    else:
        train_df = df.sample(
            frac=split,
            random_state=api.config.seed)  # random state is a seed value

    test_df = df.drop(train_df.index)
    ###### end of doing calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################

    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str(
            [str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))

    train_msg = api.Message(attributes=att_dict, body=train_df)
    test_msg = api.Message(attributes=att_dict, body=test_df)
    logger.debug('End time: ' + time_monitor.elapsed_time())

    return log_stream.getvalue(), train_msg, test_msg
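
The label-aware split above samples a fraction per label value and keeps the remaining rows as the test set. A stand-alone sketch with illustrative data:

import pandas as pd

df = pd.DataFrame({'feature': range(10), 'label': [0, 1] * 5})
split, seed = 0.6, 1

parts = [df.loc[df['label'] == lab].sample(frac=split, random_state=seed)
         for lab in df['label'].unique()]
train_df = pd.concat(parts)
test_df = df.drop(train_df.index)
print(train_df.shape[0], test_df.shape[0])  # 6 4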
Example #17
def process(test_msg, base_msg):
    att_dict = base_msg.attributes
    att_dict['operator'] = 'fuzzyjoin'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    testdf_index = tfp.read_value(api.config.test_index)
    if not testdf_index:
        logger.error('Index of test data is mandatory')
        raise ValueError('Index of test data is mandatory')

    # get the columns to check
    mapping = tfp.read_dict(api.config.check_columns)
    df = pd.DataFrame()

    if mapping:
        # read stream from memory
        test_df = test_msg.body

        # test if all mapping cols in testdf
        checkcols = [
            elem in list(test_df.columns) for elem in list(mapping.keys())
        ]
        if not all(checkcols):
            error_txt = 'Elements in mapping are not contained in columns of test df : ' + \
                        str(list(mapping.keys())) + '-' + str(list(test_df.columns)) + ' - ' + str(checkcols)
            logger.error(error_txt)
            raise ValueError(error_txt)

        if not testdf_index in test_df.columns:
            logger.error('Test index needs to be column')
            raise ValueError('Test index needs to be column')

        tcols = ['t_' + c for c in list(mapping.keys())]
        tdf = pd.DataFrame(columns=tcols)

        df = base_msg.body
        df = pd.concat([df, tdf], axis=1)

        num_cols = len(mapping)
        # run over all left df rows to test in right_df
        for index, test_row in test_df.iterrows():
            # apply function
            def get_ratio(row):
                sc = 0
                for tcol, bcol in mapping.items():
                    sc = sc + fuzz.token_sort_ratio(test_row[tcol], row[bcol])
                return sc / num_cols

            df['tscore'] = df.apply(get_ratio, axis=1)
            # get best matching and store index in v_dict
            max_score = df['tscore'].max()
            if max_score >= api.config.limit:
                mask = (df['tscore'] == max_score)
                df.loc[mask, 'score'] = max_score
                df.loc[mask, 'external_id'] = test_row[testdf_index]
                for coli in mapping:
                    df.loc[mask, 't_' + coli] = test_row[coli]

            df.drop(columns=['tscore'], inplace=True)

        # remove external_id when test column value has none

        t_cols = ['t_' + t for t in mapping.keys()] + ['external_id', 'score']
        for bcol in mapping.values():
            mask = df[bcol].isna()
            df.loc[mask, t_cols] = np.nan

        if api.config.only_index:
            df = df[list(base_msg.body.columns) + ['external_id']]

        if api.config.only_matching_rows:
            df = df.loc[~df['score'].isna()]

        basedf_index = tfp.read_value(api.config.base_index)

        if api.config.joint_id:
            if not basedf_index:
                raise ValueError(
                    "For <joint_id> a value for <base_index> is necessary ")
            df.loc[~df['external_id'].isna(),
                   'joint_id'] = df.loc[~df['external_id'].isna(),
                                        'external_id']
            df.loc[df['external_id'].isna(),
                   'joint_id'] = df.loc[df['external_id'].isna(), basedf_index]

        if api.config.add_non_matching:
            # test if same columns
            if not all(
                [elem in test_df.columns for elem in base_msg.body.columns]):
                raise ValueError("Adding test dataframe only possible when having same columns " + str(test_df.columns) \
                                 + ' vs. ' + str(base_msg.body.columns))
            matched_ids = df['external_id'].unique()
            addto_df = test_df.loc[~test_df[testdf_index].isin(matched_ids
                                                               )].copy()
            addto_df['joint_id'] = addto_df[testdf_index]
            df = pd.concat([df, addto_df], axis=0, sort=False)
    else:
        logger.warning('No columns to check')

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict[
                'storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'],
                                          att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
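
The score above averages fuzz.token_sort_ratio over the mapped column pairs; token_sort_ratio ignores word order. A minimal sketch, assuming the fuzzywuzzy package is installed:

from fuzzywuzzy import fuzz

print(fuzz.token_sort_ratio('Miller GmbH Berlin', 'Berlin Miller GmbH'))  # 100
print(fuzz.token_sort_ratio('Miller GmbH Berlin', 'Mueller AG Hamburg'))  # lower score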
Example #18
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'splitSample'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    if api.config.split > df.shape[0]:
        logger.warning('Split larger than whole sample')
        split = 1
    elif api.config.split > 1:
        split = api.config.split / df.shape[0]
    else:
        split = api.config.split
    logger.info('Split DataFrame: {}'.format(split))

    if api.config.to_category:
        cast_cols = df.select_dtypes(include=np.object).columns
        for col in cast_cols:
            unique_num = len(df[col].unique())
            nan_num = df[col].isna().count()
            logger.debug(
                'Cast to category - {}: unique {}, nan: {} of {}'.format(
                    col, unique_num, nan_num, df.shape[0]))
            df[col] = df[col].astype('category')
        logger.info('Cast to category type: {}'.format(cast_cols))

    label = tfp.read_value(api.config.label_col)
    if label:
        label_vals = list(df[label].unique())
        tdf = list()
        for lab in label_vals:
            tdf.append(df.loc[df[label] == lab].sample(
                frac=split, random_state=api.config.seed))
        train_df = pd.concat(tdf)
        logger.info('Consider label ratio for splitting: {}'.format(label))
    else:
        train_df = df.sample(
            frac=split,
            random_state=api.config.seed)  # random state is a seed value

    test_df = df.drop(train_df.index)
    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict[
                'storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'],
                                          att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict,
                                              body=train_df), api.Message(
                                                  attributes=att_dict,
                                                  body=test_df)
Example #19
def process(test_msg, base_msg) :

    logger, log_stream = set_logging('DEBUG')

    # start custom process definition
    test_att = test_msg.attributes
    base_att = base_msg.attributes

    att_dict = dict()

    if test_att['name'] == base_att['name']:
        att_dict['name'] = test_att['name']
    else:
        att_dict['name'] = test_att['name'] + '-' + base_att['name']
    att_dict['config'] = dict()

    att_dict['config']['test_index'] = api.config.test_index
    testdf_index = tfp.read_value(api.config.test_index)
    if not testdf_index:
        logger.error('Index of test data is mandatory')
        raise ValueError('Index of test data is mandatory')

    att_dict['number_rows'] = str(base_msg.body.shape[0])

    # get the columns to check

    mapping = tfp.read_dict(api.config.check_columns)
    df = pd.DataFrame()

    if mapping:

        att_dict['config']['check_columns'] = str(mapping)
        att_dict['config']['limit'] = api.config.limit

        # read stream from memory
        test_df = test_msg.body

        # test if all mapping cols in testdf
        checkcols = [elem in list(test_df.columns) for elem in list(mapping.keys())]
        if not all(checkcols):
            error_txt = 'Elements in mapping are not contained in columns of test df : ' + \
                        str(list(mapping.keys())) + '-' + str(list(test_df.columns)) + ' - ' + str(checkcols)
            logger.error(error_txt)
            raise ValueError(error_txt)

        if not testdf_index in test_df.columns:
            logger.error('Test index needs to be column')
            raise ValueError('Test index needs to be column')

        tcols = ['t_' + c for c in list(mapping.keys())]
        tdf = pd.DataFrame(columns=tcols)

        df = base_msg.body
        df = pd.concat([df, tdf], axis=1)

        num_cols = len(mapping)
        # run over all left df rows to test in right_df
        for index, test_row in test_df.iterrows():
            # apply function
            def get_ratio(row):
                sc = 0
                for tcol, bcol in mapping.items():
                    sc = sc + fuzz.token_sort_ratio(test_row[tcol], row[bcol])
                return sc / num_cols

            df['tscore'] = df.apply(get_ratio, axis=1)
            # get best matching and store index in v_dict
            max_score = df['tscore'].max()
            if max_score >= api.config.limit:
                mask = (df['tscore'] == max_score)
                df.loc[mask, 'score'] = max_score
                df.loc[mask, 'external_id'] = test_row[testdf_index]
                for coli in mapping:
                    df.loc[mask, 't_' + coli] = test_row[coli]

            df.drop(columns=['tscore'], inplace=True)

        # remove external_id when test column value has none

        t_cols = ['t_' + t for t in mapping.keys()] + ['external_id', 'score']
        for bcol in mapping.values():
            mask = df[bcol].isna()
            df.loc[mask, t_cols] = np.nan

        if api.config.only_index:
            df = df[list(base_msg.body.columns) + ['external_id']]
        att_dict['config']['only_index'] = api.config.only_index

        if api.config.only_matching_rows:
            df = df.loc[~df['score'].isna()]
        att_dict['config']['only_matching_rows'] = api.config.only_matching_rows

        basedf_index = tfp.read_value(api.config.base_index)
        att_dict['config']['base_index'] = basedf_index

        if api.config.joint_id:
            if not basedf_index:
                raise ValueError("For <joint_id> a value for <base_index> is necessary ")
            df.loc[~df['external_id'].isna(), 'joint_id'] = df.loc[~df['external_id'].isna(), 'external_id']
            df.loc[df['external_id'].isna(), 'joint_id'] = df.loc[df['external_id'].isna(), basedf_index]
        att_dict['config']['joint_id'] = api.config.joint_id

        if api.config.add_non_matching:
            # test if same columns
            if not all([elem in test_df.columns for elem in base_msg.body.columns]):
                raise ValueError("Adding test dataframe only possible when having same columns " + str(test_df.columns) \
                                 + ' vs. ' + str(base_msg.body.columns))
            matched_ids = df['external_id'].unique()
            addto_df = test_df.loc[~test_df[testdf_index].isin(matched_ids)].copy()
            addto_df['joint_id'] = addto_df[testdf_index]
            df = pd.concat([df, addto_df], axis=0, sort=False)
        att_dict['config']['add_non_matching'] = api.config.add_non_matching

    else:
        logger.warning('No columns to check')

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    if df.empty:
        logger.warning('DataFrame is empty')
    else :
        att_dict['operator'] = 'fuzzyjoinDataFrames'
        att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
        att_dict['columns'] = str(list(df.columns))
        att_dict['number_columns'] = df.shape[1]
        att_dict['number_rows'] = df.shape[0]
        if 'id' in base_att.keys():
            att_dict['id'] = base_att['id'] + '; ' + att_dict['operator'] + ': ' + str(id(df))
        else:
            att_dict['id'] = att_dict['operator'] + ': ' + str(id(df))

        example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
        for i in range(0, example_rows):
            att_dict['row_' + str(i)] = str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict,body = df)
    return log, msg
Example #20
def process(msg) :

    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation

    att_dict['config']['reset_index'] = api.config.reset_index
    if api.config.reset_index:
        df.reset_index(inplace=True)

    # create DataFrame with numbered columns add concat it to df
    att_dict['config']['transpose_column'] = api.config.transpose_column
    trans_col = tfp.read_value(api.config.transpose_column)

    att_dict['config']['value_column'] = api.config.value_column
    val_col = tfp.read_value(api.config.value_column)

    # new columns
    tvals = list(df[trans_col].unique())
    if api.config.prefix:
        new_cols = {trans_col + '_' + str(v): v for v in tvals}
    else:
        new_cols = {str(v): v for v in tvals}
    t_df = pd.DataFrame(columns=new_cols.keys(), index=df.index)
    df = pd.concat([df, t_df], axis=1)

    # setting the corresponding column to the value of the value column
    for col, val in new_cols.items():
        df.loc[df[trans_col] == val, col] = df.loc[df[trans_col] == val, val_col]
    df.drop(columns=[trans_col, val_col], inplace=True)

    att_dict['config']['groupby'] = api.config.groupby
    gbcols = tfp.read_list(api.config.groupby, df.columns)
    # group df
    if gbcols:
        aggr_trans = api.config.aggr_trans.strip()
        aggr_default = api.config.aggr_default.strip()

        aggregation = dict()
        for col in df.columns:
            aggregation[col] = aggr_trans if col in new_cols else aggr_default
        aggregation = {c: a for c, a in aggregation.items() if c not in gbcols}

        df = df.groupby(gbcols, as_index=api.config.as_index).agg(aggregation)

    #####################
    #  final infos to attributes and info message
    #####################

    # df from body
    att_dict['operator'] = 'transposeColumnDataFrame'  # name of operator
    att_dict['mem_usage'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['name'] = prev_att['name']
    att_dict['columns'] = list(df.columns)
    att_dict['number_columns'] = len(att_dict['columns'])
    att_dict['number_rows'] = len(df.index)
    att_dict['example_row_1'] = str(df.iloc[0, :].tolist())

    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict,body=df)
    return log, msg
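
A toy illustration of the transpose step above: one new column per distinct value of the transpose column, filled from the value column. Column names and data are illustrative:

import pandas as pd

df = pd.DataFrame({'id': [1, 1, 2], 'kpi': ['sales', 'costs', 'sales'], 'value': [10, 4, 7]})
trans_col, val_col = 'kpi', 'value'

tvals = list(df[trans_col].unique())
new_cols = {trans_col + '_' + str(v): v for v in tvals}
t_df = pd.DataFrame(columns=new_cols.keys(), index=df.index)
df = pd.concat([df, t_df], axis=1)
for col, val in new_cols.items():
    df.loc[df[trans_col] == val, col] = df.loc[df[trans_col] == val, val_col]
df.drop(columns=[trans_col, val_col], inplace=True)
print(df)
#    id kpi_sales kpi_costs
# 0   1        10       NaN
# 1   1       NaN         4
# 2   2         7       NaN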
Example #21
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'transposeColumn'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    if api.config.reset_index:
        df.reset_index(inplace=True)
        logger.info('Reset index')

    # create DataFrame with numbered columns add concat it to df
    trans_col = tfp.read_value(api.config.transpose_column)
    logger.info('Transpose column: {}'.format(trans_col))

    val_col = tfp.read_value(api.config.value_column)
    logger.info('Value column: {}'.format(val_col))

    # new columns
    tvals = list(df[trans_col].unique())
    if api.config.prefix:
        new_cols = {trans_col + '_' + str(v): v for v in tvals}
    else:
        new_cols = {str(v): v for v in tvals}
    t_df = pd.DataFrame(columns=new_cols.keys(), index=df.index)
    df = pd.concat([df, t_df], axis=1)

    # setting the corresponding column to the value of the value column
    for col, val in new_cols.items():
        df.loc[df[trans_col] == val, col] = df.loc[df[trans_col] == val,
                                                   val_col]
    df.drop(columns=[trans_col, val_col], inplace=True)

    gbcols = tfp.read_list(api.config.groupby, df.columns)
    # group df
    if gbcols:
        aggr_trans = api.config.aggr_trans.strip()
        aggr_default = api.config.aggr_default.strip()

        aggregation = dict()
        for col in df.columns:
            aggregation[col] = aggr_trans if col in new_cols else aggr_default
        aggregation = {c: a for c, a in aggregation.items() if c not in gbcols}

        df = df.groupby(gbcols, as_index=api.config.as_index).agg(aggregation)
        logger.info('Groupby: {}'.format(gbcols))

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(
                att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(
                att_dict['storage.fileIndex'] + 1,
                att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
Example #22
def process(msg):
    global id_set

    operator_name = 'text_words'
    logger, log_stream = slog.set_logging(operator_name, api.config.debug_mode)

    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    df = msg.body
    att_dict = msg.attributes

    # Remove ID that has been processed
    df = df.loc[~df['ID'].isin(id_set)]
    id_set.update(df['ID'].unique().tolist())

    # Languages
    language_filter = tfp.read_value(api.config.language)
    logger.info('Language filter: {}'.format(language_filter))
    if not language_filter:
        language_filter = df['LANGUAGE'].unique().tolist()

    # Warning for languages not supported by the loaded models
    languages_not_supported = [
        lang for lang in language_filter if lang not in language_models.keys()
    ]
    if languages_not_supported:
        logger.warning((
            'The text of the following languages is not analysed due to unsupported language: {}'
            .format(languages_not_supported)))

    language_filter = [
        lang for lang in language_filter if lang in language_models.keys()
    ]
    nlp = dict()
    for lc in language_filter:
        nlp[lc] = spacy.load(language_models[lc])
    df = df.loc[df['LANGUAGE'].isin(language_filter)]

    # word types
    types = tfp.read_list(api.config.types)
    logger.info('Word types to be extracted: {}'.format(types))

    entity_types = tfp.read_list(api.config.entity_types)
    logger.info('Entity types to be extracted: {}'.format(entity_types))

    # Create doc for all
    word_bag_list = list()

    def get_words(id, language, text):
        if not isinstance(text, str):
            logger.warning(('Record with error - ID: {} - {}'.format(id,
                                                                     text)))
            return -1
        doc = nlp[language](text)
        words = list()
        for t in types:
            words.extend(
                [[id, language, t, token.lemma_[:api.config.max_word_len]]
                 for token in doc if token.pos_ == t])
        for et in entity_types:
            words.extend(
                [[id, language, et, ent.text[:api.config.max_word_len]]
                 for ent in doc.ents if ent.label_ == et])
        word_bag_list.append(
            pd.DataFrame(words, columns=['ID', 'LANGUAGE', 'TYPE', 'WORD']))

    df.apply(lambda x: get_words(x['ID'], x['LANGUAGE'], x['TEXT']), axis=1)
    word_bag = pd.concat(word_bag_list)
    word_bag = word_bag.loc[
        word_bag['WORD'].str.len() >= api.config.min_word_len]
    word_bag['COUNT'] = 1
    word_bag = word_bag.groupby(['ID', 'LANGUAGE', 'TYPE',
                                 'WORD'])['COUNT'].sum().reset_index()

    # test for duplicates
    dup_s = word_bag.duplicated(
        subset=['ID', 'LANGUAGE', 'TYPE', 'WORD']).value_counts()
    num_duplicates = dup_s[True] if True in dup_s else 0
    logger.info('Duplicates: {} / {}'.format(num_duplicates,
                                             word_bag.shape[0]))

    att_dict['message.lastBatch'] = True
    table_msg = api.Message(attributes=att_dict, body=word_bag)

    logger.info('Labels in document: {}'.format(
        word_bag['TYPE'].unique().tolist()))
    api.send(outports[0]['name'], log_stream.getvalue())
    api.send(outports[1]['name'], table_msg)
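The word extraction above can be tried in isolation. A small sketch, assuming spaCy and the en_core_web_sm model are installed (the operator loads one model per language from language_models; here English is hardcoded): lemmas of selected POS tags and entities of selected labels are collected and counted per ID/LANGUAGE/TYPE/WORD.

import pandas as pd
import spacy

# assumes: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')
doc = nlp('Berlin is a city. Berlin has many museums.')

rows = []
# nouns and proper nouns, lemmatized (mirrors the token.pos_ filter above)
for token in doc:
    if token.pos_ in ('NOUN', 'PROPN'):
        rows.append([1, 'EN', token.pos_, token.lemma_])
# named entities of selected labels (mirrors the ent.label_ filter above)
for ent in doc.ents:
    if ent.label_ in ('GPE',):
        rows.append([1, 'EN', ent.label_, ent.text])

word_bag = pd.DataFrame(rows, columns=['ID', 'LANGUAGE', 'TYPE', 'WORD'])
word_bag['COUNT'] = 1
word_bag = word_bag.groupby(['ID', 'LANGUAGE', 'TYPE', 'WORD'])['COUNT'].sum().reset_index()
print(word_bag)   # e.g. 'Berlin' counted twice as PROPN and, if recognised, as GPE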
Exemplo n.º 23
0
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'lgbm_classifier'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'],loglevel='DEBUG')
    else :
        logger, log_stream = slog.set_logging(att_dict['operator'],loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation

    model = LGBMRegressor(
        n_estimators=200,
        learning_rate=0.03,
        num_leaves=32,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.073,
        min_split_gain=0.0222415,
        min_child_weight=40)

    train_cols = tfp.read_list(api.config.train_cols, df.columns)
    logger.info('Train columns: {}'.format(train_cols))

    label = tfp.read_value(api.config.label_col)
    logger.info('Label column: {}'.format(label))
    if not label:
        raise ValueError('Label is mandatory')

    # cast categorical feature columns to their integer codes
    for c in df[train_cols].select_dtypes(include='category').columns:
        unique_num = len(df[c].unique())
        nan_num = df[c].isna().sum()
        logger.debug('Cast to codes - {}: unique {}, nan: {} of {}'.format(c, unique_num, nan_num, df.shape[0]))
        df[c] = df[c].cat.codes
        df[c] = df[c].astype('int32')

    # cast a categorical label column to integer codes as well
    if pd.api.types.is_categorical_dtype(df[label]):
        logger.debug('Cast label to integer codes')
        df[label] = df[label].cat.codes
        df[label] = df[label].astype('int32')

    logger.debug('Category-dtype sample:\n{}'.format(df.select_dtypes(include='category').head(10)))
    logger.debug('Train with {} features'.format(len(train_cols)))
    logger.debug('Train columns: {}'.format(train_cols))
    model.fit(df[train_cols], df[label], eval_metric='auc')

    ###### end of doing calculation

    # end custom process definition
    if df.empty :
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0],df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i,str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att_dict['storage.fileIndex'] + 1,
                                                              att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {}  - {}  '.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict,body=df)
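A toy sketch of the categorical preparation used above before fitting LightGBM, assuming the lightgbm package is available; column names and hyperparameters are illustrative only.

import pandas as pd
from lightgbm import LGBMRegressor

df = pd.DataFrame({'color': pd.Categorical(['red', 'blue', 'red', 'green']),
                   'size':  [1.0, 2.0, 3.0, 4.0],
                   'price': [10.0, 20.0, 15.0, 30.0]})

# replace pandas categoricals by their integer codes, as the operator does
for c in df.select_dtypes(include='category').columns:
    df[c] = df[c].cat.codes.astype('int32')

# tiny model; min_child_samples=1 only so the toy data can be split at all
model = LGBMRegressor(n_estimators=10, min_child_samples=1)
model.fit(df[['color', 'size']], df['price'])
print(model.predict(df[['color', 'size']]))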
Exemplo n.º 24
0
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'sample'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    # test if body refers to a DataFrame type
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        logger.error('Message body does not contain a pandas DataFrame')
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start  calculation

    sample_size = api.config.sample_size
    if sample_size < 1:
        sample_size = int(sample_size * df.shape[0])
        if sample_size < 1:
            sample_size = 1
            logger.warning(
                "Fraction of sample size too small. Set sample size to 1.")
    elif sample_size > df.shape[0]:
        sample_size = df.shape[0]
        logger.warning("Sample size larger than number of rows. Set sample size to number of rows.")

    logger.debug("Sample size: {}/{} ({:.2f})".format(sample_size, df.shape[0],
                                                      sample_size / df.shape[0]))
    random_state = api.config.random_state

    invariant_column = tfp.read_value(api.config.invariant_column)
    if invariant_column and sample_size < df.shape[0]:
        # get the average number of records for each value of invariant
        sc_df = df.groupby(invariant_column)[invariant_column].count()
        sample_size_invariant = int(sample_size / sc_df.mean())
        sample_size_invariant = 1 if sample_size_invariant == 0 else sample_size_invariant  # ensure minimum
        sc_df = sc_df.sample(n=sample_size_invariant,
                             random_state=random_state).to_frame()
        sc_df.rename(columns={invariant_column: 'sum'}, inplace=True)
        # sample the df by merge 2 df
        df = pd.merge(df,
                      sc_df,
                      how='inner',
                      right_index=True,
                      left_on=invariant_column)
        df.drop(columns=['sum'], inplace=True)
    else:
        df = df.sample(n=sample_size, random_state=random_state)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(
                att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(
                att_dict['storage.fileIndex'] + 1,
                att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
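The invariant-column sampling above can be illustrated on a toy frame: whole groups of the invariant column are drawn so that all rows sharing a value stay together (column names are hypothetical).

import pandas as pd

df = pd.DataFrame({'order_id': [1, 1, 2, 2, 3, 3, 4, 4],
                   'item':     list('abcdefgh')})

sample_size = 4                      # target number of rows
invariant_column = 'order_id'

# average rows per invariant value -> number of groups to draw
sc_df = df.groupby(invariant_column)[invariant_column].count()
n_groups = max(1, int(sample_size / sc_df.mean()))

sc_df = sc_df.sample(n=n_groups, random_state=1).to_frame()
sc_df.rename(columns={invariant_column: 'sum'}, inplace=True)

# keep only rows whose invariant value was drawn
df = pd.merge(df, sc_df, how='inner', right_index=True, left_on=invariant_column)
df.drop(columns=['sum'], inplace=True)
print(df)   # two complete order_id groups, 4 rows in total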
Exemplo n.º 25
0
def process(msg) :
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'sample'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    # test if body refers to a DataFrame type
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        logger.error('Message body does not contain a pandas DataFrame')
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start  calculation

    sample_size = api.config.sample_size
    if sample_size < 1 :
        sample_size = int(sample_size * df.shape[0])
        if sample_size < 1 :
            sample_size = 1
            logger.warning("Fraction of sample size too small. Set sample size to 1.")
    elif sample_size > df.shape[0]:
        sample_size = df.shape[0]
        logger.warning("Sample size larger than number of rows. Set sample size to number of rows.")

    logger.debug("Sample size: {}/{} ({:.2f})".format(sample_size, df.shape[0], sample_size/df.shape[0]))
    random_state = api.config.random_state

    invariant_column = tfp.read_value(api.config.invariant_column)
    if invariant_column and sample_size < df.shape[0]:
        # get the average number of records for each value of invariant
        sc_df = df.groupby(invariant_column)[invariant_column].count()
        sample_size_invariant = int(sample_size / sc_df.mean())
        sample_size_invariant = 1 if sample_size_invariant == 0 else sample_size_invariant  # ensure minimum
        sc_df = sc_df.sample(n=sample_size_invariant, random_state=random_state).to_frame()
        sc_df.rename(columns={invariant_column: 'sum'}, inplace=True)
        # sample the df by merge 2 df
        df = pd.merge(df, sc_df, how='inner', right_index=True, left_on=invariant_column)
        df.drop(columns=['sum'], inplace=True)
    else:
        df = df.sample(n=sample_size, random_state=random_state)

    ###### end  calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################

    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))

    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict,body=df)
    return log, msg
Exemplo n.º 26
0
def process(msg):
    global setup_data
    global last_msg
    global hash_list

    logger, log_stream = slog.set_logging('text cleansing',
                                          loglevel=api.config.debug_mode)
    logger.info("Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    msg = check_for_setup(logger, msg)
    if not msg:
        api.send(outports[0]['name'], log_stream.getvalue())
        return 0

    adict = msg.body

    language_filter = tfp.read_value(api.config.language)
    mode = tfp.read_value(api.config.mode)
    supported_modes = 'PNX'
    if not mode or not any(m in mode for m in supported_modes):
        raise Exception(
            'Mode is mandatory parameter and valid values are: {}'.format(
                supported_modes))

    article_words = list()
    article_count = 0
    for index_article, article in enumerate(adict):

        language = language_dict[article['media']]

        # filter language
        if language_filter and not language_filter == language:
            #logger.debug('Language filtered out: {} ({})'.format(language, language_filter))
            continue

        article_count += 1
        # check if article has been processed
        if article['hash_text'] in hash_list:
            logger.debug(
                'Article has already been processed: {} - {} - {}'.format(
                    article['date'], article['media'], article['hash_text']))
            continue
        hash_list.append(article['hash_text'])

        text = article['text']
        # disabled: these cleanups might interfere with the extraction below
        #text = re.sub(r'\d+', '', text.lower())
        #text = re.sub(r'\b[a-z]\b', '', text)

        # Language settings
        if language == 'DE':
            doc = nlp_g(text)
        elif language == 'FR':
            doc = nlp_fr(text)
        elif language == 'ES':
            doc = nlp_es(text)
        else:
            logger.warning('Language not implemented: {}'.format(language))
            doc = None

        words = dict()
        # only when doc has been created - language exists
        if doc:
            if 'P' in api.config.mode:
                words['P'] = [
                    token.lemma_[:api.config.max_word_len] for token in doc
                    if token.pos_ == 'PROPN'
                ]
            if 'N' in api.config.mode:
                words['N'] = [
                    token.lemma_[:api.config.max_word_len] for token in doc
                    if token.pos_ == 'NOUN'
                ]
            if 'X' in api.config.mode:
                words['X'] = [
                    token.text[:api.config.max_word_len] for token in doc
                    if not token.is_stop
                ]

        for m in words:
            # heuristics
            #min_length = 2
            #words[m] = [ re.sub('^[-]','',w) for w in words[m] if len(w) > min_length]
            # Remove blacklist words
            words[m] = [w for w in words[m] if w not in setup_data]
            if api.config.counter:
                article_words.append([
                    article['hash_text'], language, m,
                    collections.Counter(words[m]).most_common()
                ])
            else:
                article_words.append(
                    [article['hash_text'], language, m, words[m]])

    attributes = {
        "table": {
            "columns": [{
                "class": "string",
                "name": "HASH_TEXT",
                "nullable": True,
                "type": {
                    "hana": "INTEGER"
                }
            }, {
                "class": "string",
                "name": "LANGUAGE",
                "nullable": True,
                "size": 2,
                "type": {
                    "hana": "NVARCHAR"
                }
            }, {
                "class": "string",
                "name": "TYPE",
                "nullable": True,
                "size": 1,
                "type": {
                    "hana": "NVARCHAR"
                }
            }, {
                "class": "string",
                "name": "WORDS",
                "nullable": True,
                "type": {
                    "hana": "ARRAY"
                }
            }],
            "name":
            "DIPROJECTS.WORD_INDEX3",
            "version":
            1
        },
        "storage.filename": msg.attributes["storage.filename"]
    }

    attributes['counter'] = 'Y' if api.config.counter else 'N'

    table_msg = api.Message(attributes=attributes, body=article_words)
    logger.info('File processed: {} #Articles: {} '.format(
        msg.attributes["storage.filename"], len(adict)))
    api.send(outports[0]['name'], log_stream.getvalue())
    api.send(outports[1]['name'], table_msg)
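A small sketch of the per-article word handling above when the counter flag is set: blacklisted words are removed and the remainder is condensed into (word, count) pairs with collections.Counter (toy data).

import collections

blacklist = {'the', 'and'}
words = ['city', 'museum', 'city', 'the', 'park']

# remove blacklisted words, then count occurrences as (word, count) pairs
words = [w for w in words if w not in blacklist]
counted = collections.Counter(words).most_common()
print(counted)   # [('city', 2), ('museum', 1), ('park', 1)]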
Exemplo n.º 27
0
def process(left_msg, right_msg):

    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'join'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')

    # start custom process definition

    l_att = left_msg.attributes
    r_att = right_msg.attributes

    if l_att['name'] == r_att['name']:
        att_dict['name'] = l_att['name']
    else:
        att_dict['name'] = l_att['name'] + '-' + r_att['name']
    att_dict['config'] = dict()

    # read stream from memory
    left_df = left_msg.body
    right_df = right_msg.body

    ###### start of doing calculation
    how = tfp.read_value(api.config.how)

    # merge according to config
    att_dict['config']['on_index'] = api.config.on_index
    if api.config.on_index:
        df = pd.merge(left_df,
                      right_df,
                      how=how,
                      left_index=True,
                      right_index=True)
    elif api.config.left_on and api.config.right_on:
        att_dict['config']['left_on'] = api.config.left_on
        att_dict['config']['right_on'] = api.config.right_on

        left_on_list = tfp.read_list(api.config.left_on)
        right_on_list = tfp.read_list(api.config.right_on)
        left_df.reset_index(inplace=True)
        right_df.reset_index(inplace=True)

        df = pd.merge(left_df,
                      right_df,
                      how=how,
                      left_on=left_on_list,
                      right_on=right_on_list)

        # drop the leftover index columns created by reset_index before the merge
        drop_idx_cols = [c for c in ('index_x', 'index_y') if c in df.columns]
        if drop_idx_cols:
            df.drop(columns=drop_idx_cols, inplace=True)
    else:
        raise ValueError(
            "Config setting: Either <on_index> or both <left_on> and <right_on> have to be set in order to join the DataFrames"
        )

    att_dict['config']['new_indices'] = api.config.new_indices
    index_list = tfp.read_list(api.config.new_indices)
    if index_list:
        df.set_index(keys=index_list, inplace=True)

    att_dict['config']['drop_columns'] = api.config.drop_columns
    col_list = tfp.read_list(api.config.drop_columns)
    if col_list:
        df.drop(labels=col_list, axis=1, inplace=True)

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    if df.empty:
        raise ValueError('Merged Dataframe is empty')

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str(
            [str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))

    # end custom process definition

    log = log_stream.getvalue()

    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
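A toy example of the join above showing where the index_x/index_y leftovers come from: reset_index on a default RangeIndex adds a plain 'index' column to both frames, which pandas suffixes on merge (column names are illustrative).

import pandas as pd

left_df = pd.DataFrame({'k': ['a', 'b'], 'x': [1, 2]})
right_df = pd.DataFrame({'k': ['a', 'b'], 'y': [3, 4]})

# reset_index on a default RangeIndex adds a plain 'index' column to each frame
left_df.reset_index(inplace=True)
right_df.reset_index(inplace=True)

df = pd.merge(left_df, right_df, how='inner', left_on=['k'], right_on=['k'])

# the two 'index' columns collide and are suffixed 'index_x'/'index_y';
# both are leftovers and can be dropped
df.drop(columns=[c for c in ('index_x', 'index_y') if c in df.columns], inplace=True)
print(df)   # k  x  y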
Exemplo n.º 28
0
def process(msg):

    att_dict = msg.attributes
    att_dict['operator'] = 'anonymizeData'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')

    logger.debug("Process started")
    time_monitor = tp.progress()

    result = ''
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    prev_att = msg.attributes
    df = msg.body

    ###### start of doing calculation
    to_nan = tfp.read_value(api.config.to_nan, test_number=False)
    if to_nan:
        df.replace(to_nan, np.nan, inplace=True)

    anonymize_to_int_cols = tfp.read_list(api.config.anonymize_to_int_cols,
                                          list(df.columns))
    anonymize_cols = tfp.read_list(api.config.anonymize_cols, list(df.columns))

    ## Anonymize columns
    if anonymize_cols:
        logger.debug('Anonymize Columns: {}'.format(str(anonymize_cols)))
        # ensure that id columns are not anonymized here but exclusively in the anonymize-to-integer section
        anonymize_cols = [
            c for c in anonymize_cols if c not in anonymize_to_int_cols
        ]

        # replace string values with random strings
        for c in df[anonymize_cols].select_dtypes(include='object'):
            unique_list = df[c].unique()
            n = int(math.log10(len(unique_list))) + 2
            # build a random replacement map first, then overwrite entries listed in keep_terms with their fixed values
            rep_map = {
                x: ''.join(random.choices(string.ascii_letters, k=n))
                for x in unique_list if isinstance(x, str)
            }
            for ktk, ktv in keep_terms.items():
                if ktk in rep_map.keys():
                    rep_map[ktk] = ktv
            df[c].replace(rep_map, inplace=True)

        # linear shift of integer
        for c in df[anonymize_cols].select_dtypes(include='int'):
            unique_i = df[c].unique()
            max_i = max(unique_i)
            min_i = min(unique_i)
            length = max_i - min_i
            rand_int1 = random.randint(0, 100)
            rand_int2 = random.randint(0, 100)
            # preserves existing/binary values 0 and 1
            if not (len(unique_i) == 2 and 0 in unique_i and 1 in unique_i):
                df[c] = ((df[c] - min_i) / length * rand_int1 +
                         rand_int2).astype('int')

        # linear shift of float
        for c in df[anonymize_cols].select_dtypes(include='float'):
            unique_f = df[c].unique()
            max_f = max(unique_f)
            min_f = min(unique_f)
            length = max_f - min_f
            rand_float1 = random.random()
            rand_float2 = random.random()
            df[c] = (
                (df[c] - min_f) / length * rand_float1 + rand_float2) / 2.0

    if anonymize_to_int_cols:
        logger.debug('Anonymize to Integer Columns: {}'.format(
            str(anonymize_to_int_cols)))
        # replace values with random integers
        for c in df[anonymize_to_int_cols]:
            unique_list = df[c].unique()
            rand_list = list(
                np.random.choice(1000 * len(unique_list),
                                 len(unique_list),
                                 replace=False))
            # build a random replacement map and apply it
            rep_map = dict(zip(unique_list, rand_list))
            df[c].replace(rep_map, inplace=True)

    enumerate_cols = tfp.read_list(api.config.enumerate_cols, list(df.columns))
    if enumerate_cols:
        ncols = int(math.log10(len(enumerate_cols))) + 1
        prefix_cols = tfp.read_value(api.config.prefix_cols)
        if not prefix_cols:
            prefix_cols = 'Att_'
        cols_map = {
            oc: prefix_cols + str(i).zfill(ncols)
            for i, oc in enumerate(enumerate_cols)
        }
        df.rename(columns=cols_map, inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(
                att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(
                att_dict['storage.fileIndex'] + 1,
                att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
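A minimal sketch of the numeric anonymization above: integer columns are rescaled and shifted by random factors so the relative order is preserved while the original values are hidden, and binary 0/1 indicator columns are left untouched (toy data, slightly simplified).

import random
import pandas as pd

df = pd.DataFrame({'salary': [3000, 4000, 5000], 'flag': [0, 1, 0]})

for c in df.select_dtypes(include='int'):
    unique_i = df[c].unique()
    # keep binary indicator columns as they are
    if len(unique_i) == 2 and 0 in unique_i and 1 in unique_i:
        continue
    length = unique_i.max() - unique_i.min()
    rand_scale, rand_shift = random.randint(1, 100), random.randint(0, 100)
    df[c] = ((df[c] - unique_i.min()) / length * rand_scale + rand_shift).astype('int')

print(df)   # 'salary' rescaled with order preserved; 'flag' unchanged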
Exemplo n.º 29
0
def process(db_msg):

    logger, log_stream = slog.set_logging('topic_identification',
                                          loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    columns = [c['name'] for c in db_msg.attributes['table']['columns']]
    df = pd.DataFrame(db_msg.body, columns=columns)

    # Language filter
    language_filter = tfp.read_list(api.config.language_filter)
    if language_filter:
        df = df.loc[df["LANGUAGE"].isin(language_filter)]
    else:
        # no filter configured: process all languages found in the data
        language_filter = df["LANGUAGE"].unique().tolist()
    logger.info('Languages: {}'.format(language_filter))
    # Word type filter
    word_type_filter = tfp.read_value(api.config.word_type_filter)
    if word_type_filter:
        types = [c for c in word_type_filter]
        df = df.loc[df["TYPE"].isin(types)]
        logger.info('Word restricted to types : {}'.format(word_type_filter))

    # groupby and concatenate words
    gdf = df.groupby('HASH_TEXT').agg({
        "LANGUAGE":
        'first',
        "WORD": [(lambda x: ' '.join(x)), 'count']
    })
    gdf.columns = gdf.columns.droplevel(level=0)
    gdf.rename(columns={
        "first": 'LANGUAGE',
        "count": 'NUM_WORDS',
        '<lambda_0>': 'WORDS'
    },
               inplace=True)

    # create document-term matrix - no tokenization or text prep are needed
    tf_vectorizer = CountVectorizer(analyzer='word',
                                    min_df=1,
                                    lowercase=False,
                                    tokenizer=str.split)

    # tf means term-frequency in a document for each language
    date_today = str(date.today())

    # 2d array with TOPIC, LANGUAGE, TYPE, DATE, EXPIRY_DATE, ATTRIBUTE and one KEYWORD_i column per topic word
    topic_list = list()
    for lang in language_filter:
        lang_gdf = gdf.loc[gdf['LANGUAGE'] == lang]
        logger.info(
            "Language: {}    #articles: {}    av.words/article: {:.1f}".format(
                lang, lang_gdf.shape[0], lang_gdf['NUM_WORDS'].mean()))
        dtm_tf = tf_vectorizer.fit_transform(lang_gdf['WORDS'])
        # for tf dtm
        lda_tf = LatentDirichletAllocation(n_components=api.config.num_topics,
                                           learning_method='online',
                                           evaluate_every=-1,
                                           n_jobs=-1)
        lda_tf.fit(dtm_tf)
        feature_names = tf_vectorizer.get_feature_names()

        for i, topic in enumerate(lda_tf.components_):
            topic_words = [
                feature_names[f]
                for f in topic.argsort()[:-api.config.topic_words - 1:-1]
            ]
            logger.debug('Len: {}  topic_words: {}'.format(len(topic_words),
                                                           topic_words))
            row = [
                date_today + "-" + str(i), lang, 'ALGO', date_today, None, None
            ] + topic_words
            logger.debug('Len: {}  record: {}'.format(len(row), row))
            topic_list.append(row)

    attributes = {
        "table": {
            "columns": [{
                "class": "string",
                "name": "TOPIC",
                "nullable": False,
                "size": 80,
                "type": {
                    "hana": "NVARCHAR"
                }
            }, {
                "class": "string",
                "name": "LANGUAGE",
                "nullable": False,
                "size": 2,
                "type": {
                    "hana": "NVARCHAR"
                }
            }, {
                "class": "string",
                "name": "TYPE",
                "nullable": False,
                "size": 10,
                "type": {
                    "hana": "NVARCHAR"
                }
            }, {
                "class": "string",
                "name": "DATE",
                "nullable": True,
                "type": {
                    "hana": "DATE"
                }
            }, {
                "class": "string",
                "name": "EXPIRY_DATE",
                "nullable": True,
                "type": {
                    "hana": "DATE"
                }
            }, {
                "class": "string",
                "name": "ATTRIBUTE",
                "nullable": True,
                "size": 25,
                "type": {
                    "hana": "NVACHAR"
                }
            }],
            "name":
            "DIPROJECTS.WORD_INDEX",
            "version":
            1
        }
    }
    for i in range(1, api.config.topic_words + 1):
        attributes['table']['columns'].append({
            "class": "string",
            "name": "KEYWORD_" + str(i),
            "nullable": True,
            "size": 80,
            "type": {
                "hana": "NVARCHAR"
            }
        })

    msg = api.Message(attributes=attributes, body=topic_list)
    logger.debug('Process ended, topics processed {}'.format(
        time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())
    api.send(outports[1]['name'], msg)
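A standalone sketch of the topic-modelling step above, assuming scikit-learn is installed: the documents are already space-separated word lists, so tokenization is just str.split, and the top words of each LDA component become the topic keywords (toy documents, two topics).

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

docs = ['wine cheese bread wine', 'goal match player goal', 'bread cheese butter']

tf_vectorizer = CountVectorizer(analyzer='word', min_df=1, lowercase=False,
                                tokenizer=str.split)
dtm_tf = tf_vectorizer.fit_transform(docs)

lda_tf = LatentDirichletAllocation(n_components=2, learning_method='online',
                                   evaluate_every=-1, n_jobs=-1)
lda_tf.fit(dtm_tf)

# get_feature_names_out() in scikit-learn >= 1.0; older versions use get_feature_names()
feature_names = tf_vectorizer.get_feature_names_out()
topic_words = 3
for i, topic in enumerate(lda_tf.components_):
    top = [feature_names[f] for f in topic.argsort()[:-topic_words - 1:-1]]
    print('topic', i, top)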
Exemplo n.º 30
0
def process(msg):
    global id_set
    global list_df
    global sentiment_words_df
    global att_dict

    logger, log_stream = slog.set_logging(operator_name, api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    if msg:
        att_dict = msg.attributes

    # sync data and setup msg
    # Case: do sentiment AND sentiment_setup not done/msg with data
    if api.config.sentiments and sentiment_words_df.empty:
        logger.info('Sentiment word list not setup yet!')
        api.send(outports[0]['name'], log_stream.getvalue())
        att_dict = msg.attributes
        if list_df.empty:
            list_df = msg.body
        else:
            list_df = pd.concat([list_df, msg.body])
        return 0
    # Case: Sentiment setup called process and data has been sent previously
    elif msg is None and not list_df.empty:
        df = list_df
    # Case: Sentiment setup called process and data has NOT been sent previously
    elif msg is None and list_df.empty:
        return 0
    # Case:  data sent and no sentiment analysis is required
    else:
        df = msg.body

    logger.debug('Attributes: {}'.format(att_dict))
    logger.debug('DataFrame: {} - {}'.format(df.shape[0], df.shape[1]))

    if df.shape[0] == 0:
        logger.warning('Empty DataFrame')
        return 0

    # Remove IDs that have already been processed
    # (should already be filtered out by 'Doc Prepare')
    prev_num_rows = df.shape[0]
    df = df.loc[~df['text_id'].isin(id_set)]
    post_num_rows = df.shape[0]
    if prev_num_rows != post_num_rows:
        logger.warning('Already processed text_ids found:  {} -> {}'.format(
            prev_num_rows, post_num_rows))
    id_set.update(df['text_id'].unique().tolist())

    # Languages
    language_filter = tfp.read_value(api.config.language)
    logger.info('Language filter: {}'.format(language_filter))
    if not language_filter:
        language_filter = df['language'].unique().tolist()

    # Warn about requested languages without a model (checked before the filter
    # is restricted to supported languages, otherwise this list would always be empty)
    languages_not_supported = [
        lang for lang in language_filter if lang not in language_models.keys()
    ]
    if languages_not_supported:
        logger.warning(
            'Texts of the following languages are not analysed (unsupported language): {}'
            .format(languages_not_supported))

    language_filter = [
        lang for lang in language_filter if lang in language_models.keys()
    ]
    nlp = dict()
    for lc in language_filter:
        nlp[lc] = spacy.load(language_models[lc])
    df = df.loc[df['language'].isin(language_filter)]

    # word types
    types = tfp.read_list(api.config.types)
    logger.info('Word types to be extracted: {}'.format(types))

    entity_types = tfp.read_list(api.config.entity_types)
    logger.info('Entity types to be extracted: {}'.format(entity_types))

    # Create doc for all
    word_bag_list = list()
    sentiment_list = list()

    def get_words(id, language, text):
        if not isinstance(text, str):
            logger.warning(('Record with error - ID: {} - {}'.format(id,
                                                                     text)))
            return -1
        doc = nlp[language](text)
        words = list()
        for t in types:
            words.extend(
                [[id, language, t, token.lemma_[:api.config.max_word_len]]
                 for token in doc if token.pos_ == t])
        for et in entity_types:
            words.extend(
                [[id, language, et, ent.text[:api.config.max_word_len]]
                 for ent in doc.ents if ent.label_ == et])
        # sentiment
        if language == 'DE' and api.config.sentiments:
            # collect all the lemmatized words of text
            # get all rows from sentiment_words_df and calculate mean
            lemmatized = [
                token.lemma_.lower() for token in doc
                if token.pos_ in ['NOUN', 'VERB', 'ADJ']
            ]
            # logger.debug('lemmatized words: {}'.format(lemmatized[0:10]))
            text_sentiment_words = sentiment_words_df.loc[
                sentiment_words_df.index.isin(lemmatized), 'value']
            text_sentiment_value = text_sentiment_words.mean()
            sentiment_list.append([id, text_sentiment_value])
            # logger.debug('id: {}  #sentiment words: {}   sentitment:{}'.format(id,text_sentiment_words.shape[0],text_sentiment_value))

        word_bag_list.append(
            pd.DataFrame(words,
                         columns=['text_id', 'language', 'type', 'word']))

    df.apply(lambda x: get_words(x['text_id'], x['language'], x['text']),
             axis=1)

    # data message
    try:
        word_bag = pd.concat(word_bag_list)
    except ValueError as ex:
        logger.error('No words in message: {}'.format(att_dict))
        api.send(outports[0]['name'], log_stream.getvalue())
        log_stream.seek(0)
        log_stream.truncate()
        return 0

    word_bag = word_bag.loc[
        word_bag['word'].str.len() >= api.config.min_word_len]
    word_bag['count'] = 1
    word_bag = word_bag.groupby(['text_id', 'language', 'type',
                                 'word'])['count'].sum().reset_index()

    # check for duplicates - should be unnecessary
    prev_num_rows = word_bag.shape[0]
    word_bag.drop_duplicates(ignore_index=True, inplace=True)
    post_num_rows = word_bag.shape[0]
    if prev_num_rows != post_num_rows:
        logger.warning('Duplicates have been found:  {} -> {}'.format(
            prev_num_rows, post_num_rows))
        api.send(outports[0]['name'], log_stream.getvalue())
        log_stream.seek(0)
        log_stream.truncate()

    data_msg = api.Message(attributes=att_dict, body=word_bag)
    logger.info('Labels in document: {}'.format(
        word_bag['type'].unique().tolist()))
    logger.debug('DataFrame shape: {} - {}'.format(word_bag.shape[0],
                                                   word_bag.shape[1]))
    api.send(outports[2]['name'], data_msg)

    # sentiment message
    if api.config.sentiments and len(sentiment_list) > 0:
        sentiment_df = pd.DataFrame(sentiment_list,
                                    columns=['text_id', 'sentiment'])
        sentiment_df.drop_duplicates(inplace=True)
        sentiment_csv = sentiment_df.to_csv(index=False)
        sentiment_msg = api.Message(attributes=att_dict, body=sentiment_csv)
        api.send(outports[1]['name'], sentiment_msg)

    # log message
    api.send(outports[0]['name'], log_stream.getvalue())
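A toy illustration of the sentiment scoring above: lemmatized tokens are looked up in a sentiment word table (index = lemma, one 'value' column) and the text score is the mean of the matched values; the word list here is made up, and spaCy is not needed for the lookup itself.

import pandas as pd

# hypothetical sentiment word list, index = lemma, 'value' in [-1, 1]
sentiment_words_df = pd.DataFrame({'value': [0.8, -0.6, 0.4]},
                                  index=['gut', 'schlecht', 'schoen'])

# lemmatized words of one text (in the operator they come from spaCy)
lemmatized = ['gut', 'tag', 'schoen', 'gut']

text_sentiment_words = sentiment_words_df.loc[
    sentiment_words_df.index.isin(lemmatized), 'value']
text_sentiment_value = text_sentiment_words.mean()
print(text_sentiment_value)   # mean of 0.8 and 0.4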