def process(msg):
    logger, log_stream = slog.set_logging('topic dispatcher', loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    table = tfp.read_value(api.config.topic_table)
    column = tfp.read_value(api.config.table_colum)

    topics = msg.body
    for t in topics:
        topic_keywords = ["'" + t[i] + "'" for i in range(6, len(t)) if not t[i] == '']
        sql = 'SELECT * FROM ' + table + ' WHERE ' + column + ' IN(' + ','.join(topic_keywords) + ')'
        att_dict = {'topic': t[0], 'keywords': topic_keywords}
        sql_msg = api.Message(attributes=att_dict, body=sql)
        api.send(outports[1]['name'], sql_msg)
        logger.debug('Send sql: {}'.format(sql))

    logger.debug('Process ended, topics processed {} - {} '.format(len(topics), time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())
def process():
    operator_name = 'sql_word_index'
    logger, log_stream = slog.set_logging(operator_name, api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    language = tfp.read_value(api.config.language)
    type_limit = tfp.read_dict(api.config.type_limit_map)
    table_name = tfp.read_value(api.config.table_name)
    text_id_col = tfp.read_value(api.config.text_id_col)

    for i, [wtype, limit] in enumerate(type_limit.items()):
        sql_s = "SELECT {tid}, \"{tn}\".LANGUAGE, \"{tn}\".TYPE, \"{tn}\".WORD, COUNT FROM \"{tn}\" INNER JOIN "\
                "(SELECT WORD, TYPE, LANGUAGE, SUM(COUNT) as CUMS FROM \"{tn}\" "\
                "WHERE LANGUAGE = \'{lang}\' AND TYPE = \'{wt}\' "\
                "GROUP BY WORD, TYPE, LANGUAGE) AS CTABLE ON "\
                "\"{tn}\".WORD = CTABLE.WORD AND \"{tn}\".TYPE = CTABLE.TYPE AND \"{tn}\".LANGUAGE = CTABLE.LANGUAGE "\
                "WHERE CUMS >= {lt}".format(tid=text_id_col, tn=table_name, lang=language, wt=wtype, lt=limit)
        lastbatch = True if len(type_limit) == i + 1 else False
        att_dict = {'operator': operator_name,
                    'parameter': {'type': wtype, 'limit': limit, 'language': language},
                    'message.batchIndex': i,
                    'message.batchSize': len(type_limit),
                    'message.lastBatch': lastbatch}
        msg = api.Message(attributes=att_dict, body=sql_s)
        api.send(outports[1]['name'], msg)

    api.send(outports[0]['name'], log_stream.getvalue())
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'drop_duplicates'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')
    time_monitor = tp.progress()
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    before_num_rows = df.shape[0]
    drop_cols_test = tfp.read_list(api.config.columns, df.columns)
    keep = tfp.read_value(api.config.keep, test_number=False)
    df.drop_duplicates(subset=drop_cols_test, keep=keep, inplace=True)
    logger.debug('Duplicate Rows: {}'.format(before_num_rows - df.shape[0]))

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))

    return log_stream.getvalue(), api.Message(attributes={'name': 'drop_duplicates', 'type': 'DataFrame'}, body=df)
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'groupby'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')

    prev_att = msg.attributes
    df = msg.body

    ###### start of doing calculation

    # groupby list
    cols = tfp.read_list(api.config.groupby)
    att_dict['config']['groupby'] = api.config.groupby

    # mapping aggregation
    try:
        colagg = tfp.read_dict(api.config.aggregation)
    except IndexError:
        logger.info('Aggregation is not a map, try to parse a value instead')
        colagg = tfp.read_value(api.config.aggregation)
    att_dict['config']['aggregation'] = api.config.aggregation

    # groupby
    logger.debug('Group columns: {}'.format(cols))
    logger.debug('Aggregation: {}'.format(colagg))
    logger.debug('Index: {}'.format(api.config.index))
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop col
    att_dict['config']['dropcols'] = api.config.drop_columns
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols:
        df.drop(columns=dropcols, inplace=True)

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    att_dict['operator'] = 'groupbyDataFrame'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = list(df.columns)
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])

    # end custom process definition
    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'groupby'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    prev_att = msg.attributes
    df = msg.body
    prev_shape = df.shape

    ###### start of doing calculation

    # groupby list
    cols = tfp.read_list(api.config.groupby)

    # mapping aggregation
    try:
        colagg = tfp.read_dict(api.config.aggregation)
    except IndexError:
        logger.info('Aggregation is not a map, try to parse a value instead')
        colagg = tfp.read_value(api.config.aggregation)

    # groupby
    logger.info('Group columns: {}'.format(cols))
    logger.info('Aggregation: {}'.format(colagg))
    logger.info('Index: {}'.format(api.config.index))
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop col
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols:
        logger.info('Drop columns: {}'.format(dropcols))
        df.drop(columns=dropcols, inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(msg):
    words = msg.body
    att_dict = msg.attributes

    logger, log_stream = slog.set_logging('word_regex_cleansing', api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    if isinstance(words[0], list):
        words = [w[0] for w in words]

    regex_patterns = tfp.read_list(api.config.patterns)

    logger.info('Test mode: {}'.format(api.config.test_mode))
    logger.info('Number of words to cleanse: {}'.format(len(words)))

    word_type = tfp.read_value(api.config.word_type)
    if len(word_type) > 1:
        logger.warning('Only one word type can be processed. Take first one only: {}'.format(word_type[0]))

    count = 0
    for ipat, pat in enumerate(regex_patterns):
        if pat == '':
            logger.warning('Empty pattern')
            continue
        cleansing_words = [w for w in words if re.match(pat, w)]
        logger.info('Execute pattern: {} ({}/{})'.format(pat, ipat, len(regex_patterns)))
        logger.info('Number of DELETE statements: {}'.format(len(cleansing_words)))
        api.send(outports[0]['name'], log_stream.getvalue())
        log_stream.seek(0)
        log_stream.truncate()
        if not api.config.test_mode:
            for iw, w in enumerate(cleansing_words):
                if word_type:
                    sql = 'DELETE FROM WORD_INDEX WHERE WORD = \'' + w + '\' AND WORD_TYPE = \'' + word_type + '\';'
                else:
                    sql = 'DELETE FROM WORD_INDEX WHERE WORD = \'' + w + '\';'
                att_dict['message.indexBatch'] = count
                att_dict['message.lastBatch'] = False
                api.send(outports[1]['name'], api.Message(attributes=att_dict, body=sql))
                count += 1

    sql = 'SELECT * FROM DUMMY;'
    att_dict['message.indexBatch'] = count
    att_dict['message.lastBatch'] = True
    api.send(outports[1]['name'], api.Message(attributes=att_dict, body=sql))

    api.send(outports[0]['name'], log_stream.getvalue())
def process(msg):
    global blacklist
    global last_msg
    global id_set

    logger, log_stream = slog.set_logging(operator_name, api.config.debug_mode)

    # Check if setup complete
    msg = check_for_setup(logger, msg)
    if not msg:
        api.send(outports[0]['name'], log_stream.flush())
        return 0

    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    att_dict = msg.attributes

    language = tfp.read_value(api.config.language)
    type = tfp.read_value(api.config.type)
    if len(type) > 1:
        logger.warning('Only one type can be transformed. Take only first one: {}'.format(type[0]))
        type = type[0]

    # DELETE all new type rows
    sql = 'DELETE FROM "WORD_INDEX" WHERE "TYPE" = \'Q\' '

    # COPY 'TYPE' to 'NEW TYPE'
    sql = 'SELECT * FROM "WORD_INDEX" WHERE "TYPE" = \'' + type + '\' '
    if language:
        sql += 'AND "LANGUAGE" = \'' + language + '\' '

    # REMOVE ALL BLACKLIST

    # REPLACE LEXICON

    api.send(outports[0]['name'], log_stream.getvalue())
def process(msg):
    logger, log_stream = slog.set_logging('word_frequency_sql', loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    language = tfp.read_value(api.config.language)
    type = tfp.read_value(api.config.type)
    min_count = api.config.min_count

    sql_statement = 'INSERT INTO WORD_FREQUENCY ("DATE", "LANGUAGE", "TYPE", "WORD", "COUNT") '\
                    'SELECT * FROM (SELECT AM."DATE", WI."LANGUAGE", "TYPE","WORD", sum("COUNT") ' \
                    'AS "COUNT" FROM "WORD_INDEX" AS WI INNER JOIN "ARTICLES_METADATA" AS AM ' \
                    'ON WI."HASH_TEXT" = AM."HASH_TEXT"'

    if type or language:
        sql_statement += ' WHERE'
    if language:
        sql_statement += ' WI."LANGUAGE" = \'' + language + '\''
    if type:
        if language:
            sql_statement += ' AND ('
        for i, t in enumerate(type):
            if i > 0:
                sql_statement += ' OR'
            sql_statement += ' "TYPE" = \'' + t + '\''
        if language:
            sql_statement += ')'

    sql_statement += ' GROUP BY "DATE", "WORD","TYPE",WI."LANGUAGE") WHERE "COUNT">= ' + str(min_count) + ' ;'

    logger.debug('Process ended, articles processed {}'.format(time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())
    api.send(outports[1]['name'], sql_statement)
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'drop_duplicates'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_shape = df.shape
    drop_cols_test = tfp.read_list(api.config.columns, df.columns)
    keep = tfp.read_value(api.config.keep, test_number=False)
    df.drop_duplicates(subset=drop_cols_test, keep=keep, inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att_dict['storage.fileIndex'] + 1, att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'drop_duplicates'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_shape = df.shape
    drop_cols_test = tfp.read_list(api.config.columns, df.columns)
    keep = tfp.read_value(api.config.keep, test_number=False)
    df.drop_duplicates(subset=drop_cols_test, keep=keep, inplace=True)

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))
    logger.debug('Dropped duplicates: {}'.format(prev_shape[0] - df.shape[0]))
    logger.info('Dropped duplicates: {}'.format(prev_shape[0] - df.shape[0]))

    return log_stream.getvalue(), api.Message(attributes={'name': 'drop_duplicates', 'type': 'DataFrame'}, body=df)
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')
    time_monitor = tp.progress()
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    before_num_rows = df.shape[0]
    drop_cols_test = tfp.read_list(api.config.columns, df.columns)
    keep = tfp.read_value(api.config.keep, test_number=False)
    df.drop_duplicates(subset=drop_cols_test, keep=keep, inplace=True)
    logger.debug('Duplicate Rows: {}'.format(before_num_rows - df.shape[0]))

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())

    return log_stream.getvalue(), api.Message(attributes={'name': 'drop_duplicates', 'type': 'DataFrame'}, body=df)
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'anonymizeData'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')
    time_monitor = tp.progress()
    result = ''
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation
    model = LGBMRegressor(n_estimators=200,
                          learning_rate=0.03,
                          num_leaves=32,
                          colsample_bytree=0.9497036,
                          subsample=0.8715623,
                          max_depth=8,
                          reg_alpha=0.04,
                          reg_lambda=0.073,
                          min_split_gain=0.0222415,
                          min_child_weight=40)

    att_dict['config']['train columns'] = api.config.train_cols
    train_cols = tfp.read_list(api.config.train_cols, df.columns)

    att_dict['config']['label'] = api.config.label
    label = tfp.read_value(api.config.label)
    if not label:
        raise ValueError('Label is mandatory')

    # cast to categorical dtype
    for c in df[train_cols].select_dtypes(include='category').columns:
        unique_num = len(df[c].unique())
        nan_num = df[c].isna().count()
        logger.debug('Cast to category - {}: unique {}, nan: {} of {}'.format(c, unique_num, nan_num, df.shape[0]))
        df[c] = df[c].cat.codes
        df[c] = df[c].astype('int32')

    if pd.api.types.is_categorical(df[label]):
        df[label] = df[label].astype('category')
        logger.debug('Cast label to <category>')
        df[label] = df[label].cat.codes
        df[label] = df[label].astype('int32')

    print(df.select_dtypes(include='category').head(10))
    logger.debug('Train with {} features'.format(len(train_cols)))
    print(train_cols)
    model.fit(df[train_cols], df[label], eval_metric='auc')
    ###### end of doing calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=model)
def process(msg):
    global blacklist
    global last_msg
    global hash_text_list

    operator_name = 'keyword_search'
    logger, log_stream = slog.set_logging(operator_name, loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    # Case: Keywords has been set before
    msg = check_for_setup(logger, msg)
    if not msg:
        api.send(outports[0]['name'], log_stream.flush())
        return 0

    if api.config.debug_mode:
        api.send(outports[0]['name'], log_stream.getvalue())

    article_list = msg.body
    att_dict = msg.attributes
    att_dict['operator'] = operator_name

    kw_index = list()
    word_mode = tfp.read_value(api.config.mode)
    media_set = set()
    for article in article_list:
        media_set.add(article['media'])
        if article['hash_text'] in hash_text_list:
            continue
        hash_text_list.append(article['hash_text'])
        alanguage = language_dict[article['media']]
        if not alanguage in setup_data:
            continue
        words = get_words(logger, article['text'], language=alanguage, mode=word_mode)
        matched_words = [w for w in words if w in setup_data[alanguage]]
        s_words = [w for w in words if len(w) > 4 and w[-1] == 's']
        s_matched_words = [w[:-1] for w in s_words if w in setup_data[alanguage]]
        matched_words.extend(s_matched_words)
        word_counter = Counter(matched_words)
        for word in word_counter:
            keyword_rec = {'hash_text': article['hash_text'], 'keyword': word, 'count': word_counter[word]}
            kw_index.append(keyword_rec)
            #logger.debug('Keyword: {}'.format(str(keyword_rec)))

    logger.debug('Process ended, searched media: {} - keywords found {} - {}'.format(
        str(media_set), len(kw_index), time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())
    api.send(outports[1]['name'], api.Message(attributes=att_dict, body=kw_index))
def process(msg):
    global blacklist
    global last_msg
    global word_counter
    global hash_list
    global article_count
    global word_lang_counter

    logger, log_stream = slog.set_logging('word_frequency', loglevel=api.config.debug_mode)
    logger.info("Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    msg = check_for_setup(logger, msg)
    if not msg:
        api.send(outports[0]['name'], log_stream.flush())
        return 0

    adict = msg.body
    att_dict = msg.attributes

    end_date = datetime.strptime(api.config.date, '%Y-%m-%d')
    start_date = end_date - timedelta(days=api.config.days_into_past)

    language_filter = tfp.read_value(api.config.language)
    media_filter = tfp.read_value(api.config.media)

    for index_article, article in enumerate(adict):
        # filter article
        adate = datetime.strptime(article['date'], '%Y-%m-%d')
        if not start_date <= adate <= end_date:
            #logger.debug('Date of article out of range: {} ({} - {})'.format(adate,start_date,end_date))
            continue
        language = language_dict[article['media']]
        # filter language
        if language_filter and not language_filter == language:
            #logger.debug('Language filtered out: {} ({})'.format(language, language_filter))
            continue
        # filter media
        if media_filter and not media_filter == article['media']:
            #logger.debug('Media filtered out: {} ({})'.format(article['media'], media_filter))
            continue
        article_count += 1

        # check if article has been processed
        if article['hash_text'] in hash_list:
            logger.debug('Article has already been processed: {} - {} - {}'.format(
                article['date'], article['media'], article['hash_text']))
            word_counter.update(hash_list[article['hash_text']])
            continue

        text = article['text']

        # Language settings
        if language == 'DE':
            doc = nlp_g(text)
        elif language == 'FR':
            doc = nlp_fr(text)
        elif language == 'ES':
            doc = nlp_es(text)
        else:
            logger.warning('Language not implemented')
            doc = None

        words = []
        # only when doc has been created - language exists
        if doc:
            if api.config.mode == 'NOUN':
                words = [token.lemma_[:api.config.max_word_len] for token in doc if token.pos_ in ['PROPN', 'NOUN']]
            elif api.config.mode == 'PROPN':
                words = [token.lemma_[:api.config.max_word_len] for token in doc if token.pos_ == 'PROPN']
            else:
                words = [token.text[:api.config.max_word_len] for token in doc if not token.is_stop]

        # Remove blacklist words
        words = [w for w in words if w not in setup_data]
        word_lang_counter[language].update(words)
        #word_counter.update(words)
        hash_list[article['hash_text']] = words

    result, progress_str = test_last_batch(attributes=att_dict, collect=api.config.collect)
    if result:
        # check for ending 's'
        word_freq = list()
        for lang in word_lang_counter:
            word_lang_counter[lang] = check_ending_s(logger, word_lang_counter[lang])
            if api.config.limit > 0:
                common_words = word_lang_counter[lang].most_common(api.config.limit)
            else:
                common_words = word_lang_counter[lang].most_common()
            wf = [{'date': api.config.date, 'days_into_past': api.config.days_into_past, 'language': lang,
                   'media': api.config.media, 'mode': api.config.mode, 'index': i, 'word': w[0], 'frequency': w[1]}
                  for i, w in enumerate(common_words)]
            word_freq.extend(wf)
        msg = api.Message(attributes=att_dict, body=word_freq)
        api.send(outports[1]['name'], msg)

    logger.info("Articles processed:{} - Hashed articles: {}".format(article_count, len(hash_list)))
    logger.info('File processed: {} #Articles: {} Collection:{}'.format(
        att_dict["storage.filename"], len(adict), api.config.collect))
    logger.debug('Process ended, {} - {}'.format(progress_str, time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())
def process(left_msg, right_msg):
    att_dict = left_msg.attributes
    att_dict['operator'] = 'join'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    l_att = left_msg.attributes
    r_att = right_msg.attributes

    # read stream from memory
    left_df = left_msg.body
    right_df = right_msg.body

    ###### start of doing calculation
    how = tfp.read_value(api.config.how)

    # merge according to config
    if api.config.on_index:
        df = pd.merge(left_df, right_df, how=how, left_index=True, right_index=True)
    elif api.config.left_on and api.config.right_on:
        left_on_list = tfp.read_list(api.config.left_on)
        right_on_list = tfp.read_list(api.config.right_on)
        logger.info('Join DataFrames on {} - {}'.format(left_on_list, right_on_list))

        left_df.reset_index(inplace=True)
        right_df.reset_index(inplace=True)
        df = pd.merge(left_df, right_df, how=how, left_on=left_on_list, right_on=right_on_list)

        # removing second index - might be a more elegant solution
        if 'index_x' in df.columns:
            df.drop(columns=['index_x'], inplace=True)
    else:
        raise ValueError("Config setting: Either <on> or both <left_on> and <right_on> has to be set in order to join the dataframes")

    index_list = tfp.read_list(api.config.new_indices)
    if index_list:
        df.set_index(keys=index_list, inplace=True)
        logger.info('Set index: {}'.format(index_list))

    col_list = tfp.read_list(api.config.drop_columns)
    if col_list:
        df.drop(labels=col_list, axis=1, inplace=True)
        logger.info('Drop columns: {}'.format(col_list))

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'splitSample'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')
    time_monitor = tp.progress()
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    att_dict['config']['split'] = api.config.split
    if api.config.split > df.shape[0]:
        warning = 'Split larger than whole sample'
        split = 1
    elif api.config.split > 1:
        split = api.config.split / df.shape[0]
    else:
        split = api.config.split

    att_dict['config']['to_category'] = api.config.to_category
    if api.config.to_category:
        for col in df.select_dtypes(include=np.object).columns:
            unique_num = len(df[col].unique())
            nan_num = df[col].isna().count()
            logger.debug('Cast to category - {}: unique {}, nan: {} of {}'.format(col, unique_num, nan_num, df.shape[0]))
            df[col] = df[col].astype('category')

    att_dict['config']['label'] = api.config.label
    label = tfp.read_value(api.config.label)
    if label:
        label_vals = list(df[label].unique())
        tdf = list()
        for lab in label_vals:
            tdf.append(df.loc[df[label] == lab].sample(frac=split, random_state=api.config.seed))
        train_df = pd.concat(tdf)
    else:
        train_df = df.sample(frac=split, random_state=api.config.seed)  # random state is a seed value

    test_df = df.drop(train_df.index)
    ###### end of doing calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))

    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))

    train_msg = api.Message(attributes=att_dict, body=train_df)
    test_msg = api.Message(attributes=att_dict, body=test_df)

    logger.debug('End time: ' + time_monitor.elapsed_time())
    return log_stream.getvalue(), train_msg, test_msg
def process(test_msg, base_msg):
    att_dict = base_msg.attributes
    att_dict['operator'] = 'fuzzyjoin'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    testdf_index = tfp.read_value(api.config.test_index)
    if not testdf_index:
        logger.error('Index of test data is mandatory')
        raise ValueError('Index of test data is mandatory')

    # get the columns to check
    mapping = tfp.read_dict(api.config.check_columns)
    df = pd.DataFrame()

    if mapping:
        # read stream from memory
        test_df = test_msg.body

        # test if all mapping cols in testdf
        checkcols = [elem in list(test_df.columns) for elem in list(mapping.keys())]
        if not all(checkcols):
            error_txt = 'Elements in mapping are not contained in columns of test df : ' + \
                        str(list(mapping.keys())) + '-' + str(list(test_df.columns)) + ' - ' + str(checkcols)
            logger.error(error_txt)
            raise ValueError(error_txt)

        if not testdf_index in test_df.columns:
            logger.error('Test index needs to be column')
            raise ValueError('Test index needs to be column')

        tcols = ['t_' + c for c in list(mapping.keys())]
        tdf = pd.DataFrame(columns=tcols)

        df = base_msg.body
        df = pd.concat([df, tdf], axis=1)

        num_cols = len(mapping)
        # run over all left df rows to test in right_df
        for index, test_row in test_df.iterrows():
            # apply function
            def get_ratio(row):
                sc = 0
                for tcol, bcol in mapping.items():
                    sc = sc + fuzz.token_sort_ratio(test_row[tcol], row[bcol])
                return sc / num_cols
            df['tscore'] = df.apply(get_ratio, axis=1)

            # get best matching and store index in v_dict
            max_score = df['tscore'].max()
            if max_score >= api.config.limit:
                mask = (df['tscore'] == max_score)
                df.loc[mask, 'score'] = max_score
                df.loc[mask, 'external_id'] = test_row[testdf_index]
                for coli in mapping:
                    df.loc[mask, 't_' + coli] = test_row[coli]
            df.drop(columns=['tscore'], inplace=True)

        # remove external_id when test column value has none
        t_cols = ['t_' + t for t in mapping.keys()] + ['external_id', 'score']
        for bcol in mapping.values():
            mask = df[bcol].isna()
            df.loc[mask, t_cols] = np.nan

        if api.config.only_index:
            df = df[list(base_msg.body.columns) + ['external_id']]

        if api.config.only_matching_rows:
            df = df.loc[~df['score'].isna()]

        basedf_index = tfp.read_value(api.config.base_index)

        if api.config.joint_id:
            if not basedf_index:
                raise ValueError("For <joint_id> a value for <base_index> is necessary ")
            df.loc[~df['external_id'].isna(), 'joint_id'] = df.loc[~df['external_id'].isna(), 'external_id']
            df.loc[df['external_id'].isna(), 'joint_id'] = df.loc[df['external_id'].isna(), basedf_index]

        if api.config.add_non_matching:
            # test if same columns
            if not all([elem in test_df.columns for elem in base_msg.body.columns]):
                raise ValueError("Adding test dataframe only possible when having same columns " +
                                 str(test_df.columns) + ' vs. ' + str(base_msg.body.columns))
            matched_ids = df['external_id'].unique()
            addto_df = test_df.loc[~test_df[testdf_index].isin(matched_ids)].copy()
            addto_df['joint_id'] = addto_df[testdf_index]
            df = pd.concat([df, addto_df], axis=0, sort=False)
    else:
        logger.warning('No columns to check')

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'splitSample'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    if api.config.split > df.shape[0]:
        warning = 'Split larger than whole sample'
        split = 1
    elif api.config.split > 1:
        split = api.config.split / df.shape[0]
    else:
        split = api.config.split
    logger.info('Split DataFrame: {}'.format(split))

    if api.config.to_category:
        cast_cols = df.select_dtypes(include=np.object).columns
        for col in cast_cols:
            unique_num = len(df[col].unique())
            nan_num = df[col].isna().count()
            logger.debug('Cast to category - {}: unique {}, nan: {} of {}'.format(col, unique_num, nan_num, df.shape[0]))
            df[col] = df[col].astype('category')
        logger.info('Cast to category type: {}'.format(cast_cols))

    label = tfp.read_value(api.config.label_col)
    if label:
        label_vals = list(df[label].unique())
        tdf = list()
        for lab in label_vals:
            tdf.append(df.loc[df[label] == lab].sample(frac=split, random_state=api.config.seed))
        train_df = pd.concat(tdf)
        logger.info('Consider label ratio for splitting: {}'.format(label))
    else:
        train_df = df.sample(frac=split, random_state=api.config.seed)  # random state is a seed value

    test_df = df.drop(train_df.index)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=train_df), api.Message(attributes=att_dict, body=test_df)
def process(test_msg, base_msg):
    logger, log_stream = set_logging('DEBUG')

    # start custom process definition
    test_att = test_msg.attributes
    base_att = base_msg.attributes

    att_dict = dict()
    if test_att['name'] == base_att['name']:
        att_dict['name'] = test_att['name']
    else:
        att_dict['name'] = test_att['name'] + '-' + base_att['name']
    att_dict['config'] = dict()

    att_dict['config']['test_index'] = api.config.test_index
    testdf_index = tfp.read_value(api.config.test_index)
    if not testdf_index:
        logger.error('Index of test data is mandatory')
        raise ValueError('Index of test data is mandatory')

    att_dict['number_rows'] = str(base_msg.body.shape[0])

    # get the columns to check
    mapping = tfp.read_dict(api.config.check_columns)
    df = pd.DataFrame()

    if mapping:
        att_dict['config']['check_columns'] = str(mapping)
        att_dict['config']['limit'] = api.config.limit

        # read stream from memory
        test_df = test_msg.body

        # test if all mapping cols in testdf
        checkcols = [elem in list(test_df.columns) for elem in list(mapping.keys())]
        if not all(checkcols):
            error_txt = 'Elements in mapping are not contained in columns of test df : ' + \
                        str(list(mapping.keys())) + '-' + str(list(test_df.columns)) + ' - ' + str(checkcols)
            logger.error(error_txt)
            raise ValueError(error_txt)

        if not testdf_index in test_df.columns:
            logger.error('Test index needs to be column')
            raise ValueError('Test index needs to be column')

        tcols = ['t_' + c for c in list(mapping.keys())]
        tdf = pd.DataFrame(columns=tcols)

        df = base_msg.body
        df = pd.concat([df, tdf], axis=1)

        num_cols = len(mapping)
        # run over all left df rows to test in right_df
        for index, test_row in test_df.iterrows():
            # apply function
            def get_ratio(row):
                sc = 0
                for tcol, bcol in mapping.items():
                    sc = sc + fuzz.token_sort_ratio(test_row[tcol], row[bcol])
                return sc / num_cols
            df['tscore'] = df.apply(get_ratio, axis=1)

            # get best matching and store index in v_dict
            max_score = df['tscore'].max()
            if max_score >= api.config.limit:
                mask = (df['tscore'] == max_score)
                df.loc[mask, 'score'] = max_score
                df.loc[mask, 'external_id'] = test_row[testdf_index]
                for coli in mapping:
                    df.loc[mask, 't_' + coli] = test_row[coli]
            df.drop(columns=['tscore'], inplace=True)

        # remove external_id when test column value has none
        t_cols = ['t_' + t for t in mapping.keys()] + ['external_id', 'score']
        for bcol in mapping.values():
            mask = df[bcol].isna()
            df.loc[mask, t_cols] = np.nan

        if api.config.only_index:
            df = df[list(base_msg.body.columns) + ['external_id']]
        att_dict['config']['only_index'] = api.config.only_index

        if api.config.only_matching_rows:
            df = df.loc[~df['score'].isna()]
        att_dict['config']['only_matching_rows'] = api.config.only_matching_rows

        basedf_index = tfp.read_value(api.config.base_index)
        att_dict['config']['base_index'] = basedf_index

        if api.config.joint_id:
            if not basedf_index:
                raise ValueError("For <joint_id> a value for <base_index> is necessary ")
            df.loc[~df['external_id'].isna(), 'joint_id'] = df.loc[~df['external_id'].isna(), 'external_id']
            df.loc[df['external_id'].isna(), 'joint_id'] = df.loc[df['external_id'].isna(), basedf_index]
        att_dict['config']['joint_id'] = api.config.joint_id

        if api.config.add_non_matching:
            # test if same columns
            if not all([elem in test_df.columns for elem in base_msg.body.columns]):
                raise ValueError("Adding test dataframe only possible when having same columns " +
                                 str(test_df.columns) + ' vs. ' + str(base_msg.body.columns))
            matched_ids = df['external_id'].unique()
            addto_df = test_df.loc[~test_df[testdf_index].isin(matched_ids)].copy()
            addto_df['joint_id'] = addto_df[testdf_index]
            df = pd.concat([df, addto_df], axis=0, sort=False)
        att_dict['config']['add_non_matching'] = api.config.add_non_matching
    else:
        logger.warning('No columns to check')

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    if df.empty:
        logger.warning('DataFrame is empty')
    else:
        att_dict['operator'] = 'fuzzyjoinDataFrames'
        att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
        att_dict['columns'] = str(list(df.columns))
        att_dict['number_columns'] = df.shape[1]
        att_dict['number_rows'] = df.shape[0]
        if 'id' in base_att.keys():
            att_dict['id'] = base_att['id'] + '; ' + att_dict['operator'] + ': ' + str(id(df))
        else:
            att_dict['id'] = att_dict['operator'] + ': ' + str(id(df))

        example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
        for i in range(0, example_rows):
            att_dict['row_' + str(i)] = str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])

    # end custom process definition
    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation
    att_dict['config']['reset_index'] = api.config.reset_index
    if api.config.reset_index:
        df.reset_index(inplace=True)

    # create DataFrame with numbered columns and concat it to df
    att_dict['config']['transpose_column'] = api.config.transpose_column
    trans_col = tfp.read_value(api.config.transpose_column)

    att_dict['config']['value_column'] = api.config.value_column
    val_col = tfp.read_value(api.config.value_column)

    # new columns
    tvals = list(df[trans_col].unique())
    if api.config.prefix:
        new_cols = {trans_col + '_' + str(v): v for v in tvals}
    else:
        new_cols = {str(v): v for v in tvals}
    t_df = pd.DataFrame(columns=new_cols.keys(), index=df.index)
    df = pd.concat([df, t_df], axis=1)

    # setting the corresponding column to the value of the value column
    for col, val in new_cols.items():
        df.loc[df[trans_col] == val, col] = df.loc[df[trans_col] == val, val_col]
    df.drop(columns=[trans_col, val_col], inplace=True)

    att_dict['config']['groupby'] = api.config.groupby
    gbcols = tfp.read_list(api.config.groupby, df.columns)
    # group df
    if gbcols:
        aggr_trans = api.config.aggr_trans.strip()
        aggr_default = api.config.aggr_default.strip()

        aggregation = dict()
        for col in df.columns:
            aggregation[col] = aggr_trans if col in new_cols else aggr_default
        aggregation = {c: a for c, a in aggregation.items() if c not in gbcols}

        df = df.groupby(gbcols, as_index=api.config.as_index).agg(aggregation)

    #####################
    #  final infos to attributes and info message
    #####################
    # df from body
    att_dict['operator'] = 'transposeColumnDataFrame'  # name of operator
    att_dict['mem_usage'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['name'] = prev_att['name']
    att_dict['columns'] = list(df.columns)
    att_dict['number_columns'] = len(att_dict['columns'])
    att_dict['number_rows'] = len(df.index)
    att_dict['example_row_1'] = str(df.iloc[0, :].tolist())

    # end custom process definition
    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'transposeColumn'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    if api.config.reset_index:
        df.reset_index(inplace=True)
        logger.info('Reset index')

    # create DataFrame with numbered columns and concat it to df
    trans_col = tfp.read_value(api.config.transpose_column)
    logger.info('Transpose column: {}'.format(trans_col))
    val_col = tfp.read_value(api.config.value_column)
    logger.info('Value column: {}'.format(val_col))

    # new columns
    tvals = list(df[trans_col].unique())
    if api.config.prefix:
        new_cols = {trans_col + '_' + str(v): v for v in tvals}
    else:
        new_cols = {str(v): v for v in tvals}
    t_df = pd.DataFrame(columns=new_cols.keys(), index=df.index)
    df = pd.concat([df, t_df], axis=1)

    # setting the corresponding column to the value of the value column
    for col, val in new_cols.items():
        df.loc[df[trans_col] == val, col] = df.loc[df[trans_col] == val, val_col]
    df.drop(columns=[trans_col, val_col], inplace=True)

    gbcols = tfp.read_list(api.config.groupby, df.columns)
    # group df
    if gbcols:
        aggr_trans = api.config.aggr_trans.strip()
        aggr_default = api.config.aggr_default.strip()

        aggregation = dict()
        for col in df.columns:
            aggregation[col] = aggr_trans if col in new_cols else aggr_default
        aggregation = {c: a for c, a in aggregation.items() if c not in gbcols}

        df = df.groupby(gbcols, as_index=api.config.as_index).agg(aggregation)
        logger.info('Groupby: {}'.format(gbcols))

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att_dict['storage.fileIndex'] + 1, att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(msg):
    global id_set

    operator_name = 'text_words'
    logger, log_stream = slog.set_logging(operator_name, api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    df = msg.body
    att_dict = msg.attributes

    # Remove ID that has been processed
    df = df.loc[~df['ID'].isin(id_set)]
    id_set.update(df['ID'].unique().tolist())

    # Languages
    language_filter = tfp.read_value(api.config.language)
    logger.info('Language filter: {}'.format(language_filter))
    if not language_filter:
        language_filter = df['LANGUAGE'].unique().tolist()
    language_filter = [lang for lang in language_filter if lang in language_models.keys()]
    nlp = dict()
    for lc in language_filter:
        nlp[lc] = spacy.load(language_models[lc])
    df = df.loc[df['LANGUAGE'].isin(language_filter)]

    # Warning for languages not supported
    languages_not_supported = [lang for lang in language_filter if not lang in language_models.keys()]
    if languages_not_supported:
        logger.warning(('The text of following languages not analysed due to unsupported language: {}'.format(languages_not_supported)))

    # word types
    types = tfp.read_list(api.config.types)
    logger.info('Word types to be extracted: {}'.format(types))

    entity_types = tfp.read_list(api.config.entity_types)
    logger.info('Entity types to be extracted: {}'.format(entity_types))

    # Create doc for all
    word_bag_list = list()

    def get_words(id, language, text):
        if not isinstance(text, str):
            logger.warning(('Record with error - ID: {} - {}'.format(id, text)))
            return -1
        doc = nlp[language](text)
        words = list()
        for t in types:
            words.extend([[id, language, t, token.lemma_[:api.config.max_word_len]] for token in doc if token.pos_ == t])
        for et in entity_types:
            words.extend([[id, language, et, ent.text[:api.config.max_word_len]] for ent in doc.ents if ent.label_ == et])
        word_bag_list.append(pd.DataFrame(words, columns=['ID', 'LANGUAGE', 'TYPE', 'WORD']))

    df.apply(lambda x: get_words(x['ID'], x['LANGUAGE'], x['TEXT']), axis=1)

    word_bag = pd.concat(word_bag_list)
    word_bag = word_bag.loc[word_bag['WORD'].str.len() >= api.config.min_word_len]
    word_bag['COUNT'] = 1
    word_bag = word_bag.groupby(['ID', 'LANGUAGE', 'TYPE', 'WORD'])['COUNT'].sum().reset_index()

    # test for duplicates
    dup_s = word_bag.duplicated(subset=['ID', 'LANGUAGE', 'TYPE', 'WORD']).value_counts()
    num_duplicates = dup_s[True] if True in dup_s else 0
    logger.info('Duplicates: {} / {}'.format(num_duplicates, word_bag.shape[0]))

    att_dict['message.lastBatch'] = True
    table_msg = api.Message(attributes=att_dict, body=word_bag)

    logger.info('Labels in document: {}'.format(word_bag['TYPE'].unique().tolist()))

    api.send(outports[0]['name'], log_stream.getvalue())
    api.send(outports[1]['name'], table_msg)
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'lgbm_classifier'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    model = LGBMRegressor(n_estimators=200,
                          learning_rate=0.03,
                          num_leaves=32,
                          colsample_bytree=0.9497036,
                          subsample=0.8715623,
                          max_depth=8,
                          reg_alpha=0.04,
                          reg_lambda=0.073,
                          min_split_gain=0.0222415,
                          min_child_weight=40)

    train_cols = tfp.read_list(api.config.train_cols, df.columns)
    logger.info('Train columns: {}'.format(train_cols))

    label = tfp.read_value(api.config.label_col)
    logger.info('Label column: {}'.format(label))
    if not label:
        raise ValueError('Label is mandatory')

    # cast to categorical dtype
    for c in df[train_cols].select_dtypes(include='category').columns:
        unique_num = len(df[c].unique())
        nan_num = df[c].isna().count()
        logger.debug('Cast to category - {}: unique {}, nan: {} of {}'.format(c, unique_num, nan_num, df.shape[0]))
        df[c] = df[c].cat.codes
        df[c] = df[c].astype('int32')

    if pd.api.types.is_categorical(df[label]):
        df[label] = df[label].astype('category')
        logger.debug('Cast label to <category>')
        df[label] = df[label].cat.codes
        df[label] = df[label].astype('int32')

    print(df.select_dtypes(include='category').head(10))
    logger.debug('Train with {} features'.format(len(train_cols)))
    print(train_cols)
    model.fit(df[train_cols], df[label], eval_metric='auc')
    ###### end of doing calculation

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att_dict['storage.fileIndex'] + 1, att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'sample'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    # test if body refers to a DataFrame type
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        logger.error('Message body does not contain a pandas DataFrame')
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start calculation
    sample_size = api.config.sample_size
    if sample_size < 1:
        sample_size = int(sample_size * df.shape[0])
        if sample_size < 1:
            sample_size = 1
            logger.warning("Fraction of sample size too small. Set sample size to 1.")
    elif sample_size > df.shape[0]:
        logger.warning("Sample size larger than number of rows")

    logger.debug("Sample size: {}/{} ({})".format(sample_size, df.shape[0], sample_size / df.shape[0]))

    random_state = api.config.random_state
    invariant_column = tfp.read_value(api.config.invariant_column)
    if invariant_column and sample_size < df.shape[0]:
        # get the average number of records for each value of invariant
        sc_df = df.groupby(invariant_column)[invariant_column].count()
        sample_size_invariant = int(sample_size / sc_df.mean())
        sample_size_invariant = 1 if sample_size_invariant == 0 else sample_size_invariant  # ensure minimum
        sc_df = sc_df.sample(n=sample_size_invariant, random_state=random_state).to_frame()
        sc_df.rename(columns={invariant_column: 'sum'}, inplace=True)
        # sample the df by merge 2 df
        df = pd.merge(df, sc_df, how='inner', right_index=True, left_on=invariant_column)
        df.drop(columns=['sum'], inplace=True)
    else:
        df = df.sample(n=sample_size, random_state=random_state)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att_dict['storage.fileIndex'] + 1, att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'sample'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    # test if body refers to a DataFrame type
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        logger.error('Message body does not contain a pandas DataFrame')
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start calculation
    sample_size = api.config.sample_size
    if sample_size < 1:
        sample_size = int(sample_size * df.shape[0])
        if sample_size < 1:
            sample_size = 1
            logger.warning("Fraction of sample size too small. Set sample size to 1.")
    elif sample_size > df.shape[0]:
        logger.warning("Sample size larger than number of rows")

    logger.debug("Sample size: {}/{} ({})".format(sample_size, df.shape[0], sample_size / df.shape[0]))

    random_state = api.config.random_state
    invariant_column = tfp.read_value(api.config.invariant_column)
    if invariant_column and sample_size < df.shape[0]:
        # get the average number of records for each value of invariant
        sc_df = df.groupby(invariant_column)[invariant_column].count()
        sample_size_invariant = int(sample_size / sc_df.mean())
        sample_size_invariant = 1 if sample_size_invariant == 0 else sample_size_invariant  # ensure minimum
        sc_df = sc_df.sample(n=sample_size_invariant, random_state=random_state).to_frame()
        sc_df.rename(columns={invariant_column: 'sum'}, inplace=True)
        # sample the df by merge 2 df
        df = pd.merge(df, sc_df, how='inner', right_index=True, left_on=invariant_column)
        df.drop(columns=['sum'], inplace=True)
    else:
        df = df.sample(n=sample_size, random_state=random_state)
    ###### end calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))

    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))

    # end custom process definition
    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
def process(msg):
    global setup_data
    global last_msg
    global hash_list

    logger, log_stream = slog.set_logging('text cleansing', loglevel=api.config.debug_mode)
    logger.info("Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    msg = check_for_setup(logger, msg)
    if not msg:
        api.send(outports[0]['name'], log_stream.getvalue())
        return 0

    adict = msg.body

    language_filter = tfp.read_value(api.config.language)
    mode = tfp.read_value(api.config.mode)
    supported_modes = 'PNX'
    if not mode or not any(m in mode for m in supported_modes):
        raise Exception('Mode is a mandatory parameter and valid values are: {}'.format(supported_modes))

    article_words = list()
    article_count = 0
    for index_article, article in enumerate(adict):
        language = language_dict[article['media']]
        # filter language
        if language_filter and not language_filter == language:
            # logger.debug('Language filtered out: {} ({})'.format(language, language_filter))
            continue
        article_count += 1

        # check if article has already been processed
        if article['hash_text'] in hash_list:
            logger.debug('Article has already been processed: {} - {} - {}'.format(
                article['date'], article['media'], article['hash_text']))
            continue
        hash_list.append(article['hash_text'])

        text = article['text']
        # might interfere with proper nouns:
        # text = re.sub(r'\d+', '', text.lower())
        # text = re.sub(r'\b[a-z]\b', '', text)

        # Language settings
        if language == 'DE':
            doc = nlp_g(text)
        elif language == 'FR':
            doc = nlp_fr(text)
        elif language == 'ES':
            doc = nlp_es(text)
        else:
            logger.warning('Language not implemented: {}'.format(language))
            doc = None

        words = dict()
        # only when doc has been created - language exists
        if doc:
            if 'P' in api.config.mode:
                words['P'] = [token.lemma_[:api.config.max_word_len] for token in doc if token.pos_ == 'PROPN']
            if 'N' in api.config.mode:
                words['N'] = [token.lemma_[:api.config.max_word_len] for token in doc if token.pos_ == 'NOUN']
            if 'X' in api.config.mode:
                words['X'] = [token.text[:api.config.max_word_len] for token in doc if not token.is_stop]

            for m in words:
                # heuristics
                # min_length = 2
                # words[m] = [re.sub('^[-]', '', w) for w in words[m] if len(w) > min_length]

                # Remove blacklist words
                words[m] = [w for w in words[m] if w not in setup_data]
                if api.config.counter:
                    article_words.append([article['hash_text'], language, m,
                                          collections.Counter(words[m]).most_common()])
                else:
                    article_words.append([article['hash_text'], language, m, words[m]])

    attributes = {
        "table": {
            "columns": [
                {"class": "string", "name": "HASH_TEXT", "nullable": True, "type": {"hana": "INTEGER"}},
                {"class": "string", "name": "LANGUAGE", "nullable": True, "size": 2, "type": {"hana": "NVARCHAR"}},
                {"class": "string", "name": "TYPE", "nullable": True, "size": 1, "type": {"hana": "NVARCHAR"}},
                {"class": "string", "name": "WORDS", "nullable": True, "type": {"hana": "ARRAY"}}
            ],
            "name": "DIPROJECTS.WORD_INDEX3",
            "version": 1
        },
        "storage.filename": msg.attributes["storage.filename"]
    }
    attributes['counter'] = 'Y' if api.config.counter else 'N'
    table_msg = api.Message(attributes=attributes, body=article_words)

    logger.info('File processed: {} #Articles: {} '.format(msg.attributes["storage.filename"], len(adict)))
    api.send(outports[0]['name'], log_stream.getvalue())
    api.send(outports[1]['name'], table_msg)
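# Illustrative sketch (not part of the operator): the P/N/X extraction modes on a single
# text with spaCy. Assumes the small English model is installed
# (python -m spacy download en_core_web_sm); model name, text and max_word_len are assumptions.
import collections
import spacy

nlp = spacy.load('en_core_web_sm')
max_word_len = 30
doc = nlp("Berlin startups hire data engineers faster than most banks do.")

words = {
    'P': [t.lemma_[:max_word_len] for t in doc if t.pos_ == 'PROPN'],  # proper nouns
    'N': [t.lemma_[:max_word_len] for t in doc if t.pos_ == 'NOUN'],   # nouns
    'X': [t.text[:max_word_len] for t in doc if not t.is_stop],        # all non-stopwords
}
for mode, wlist in words.items():
    print(mode, collections.Counter(wlist).most_common())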
def process(left_msg, right_msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'join'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')

    # start custom process definition
    l_att = left_msg.attributes
    r_att = right_msg.attributes
    if l_att['name'] == r_att['name']:
        att_dict['name'] = l_att['name']
    else:
        att_dict['name'] = l_att['name'] + '-' + r_att['name']

    # read stream from memory
    left_df = left_msg.body
    right_df = right_msg.body

    ###### start of doing calculation
    how = tfp.read_value(api.config.how)

    # merge according to config
    att_dict['config']['on_index'] = api.config.on_index
    if api.config.on_index:
        df = pd.merge(left_df, right_df, how=how, left_index=True, right_index=True)
    elif api.config.left_on and api.config.right_on:
        att_dict['config']['left_on'] = api.config.left_on
        att_dict['config']['right_on'] = api.config.right_on
        left_on_list = tfp.read_list(api.config.left_on)
        right_on_list = tfp.read_list(api.config.right_on)
        left_df.reset_index(inplace=True)
        right_df.reset_index(inplace=True)
        df = pd.merge(left_df, right_df, how=how, left_on=left_on_list, right_on=right_on_list)
        # removing second index - there might be a more elegant solution
        if 'index_x' in df.columns:
            df.drop(columns=['index_x'], inplace=True)
    else:
        raise ValueError("Config setting: Either <on_index> or both <left_on> and <right_on> "
                         "have to be set in order to join the DataFrames")

    att_dict['config']['new_indices'] = api.config.new_indices
    index_list = tfp.read_list(api.config.new_indices)
    if index_list:
        df.set_index(keys=index_list, inplace=True)

    att_dict['config']['drop_columns'] = api.config.drop_columns
    col_list = tfp.read_list(api.config.drop_columns)
    if col_list:
        df.drop(labels=col_list, axis=1, inplace=True)

    ##############################################
    # final infos to attributes and info message
    ##############################################
    if df.empty:
        raise ValueError('Merged DataFrame is empty')

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))

    # end custom process definition
    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
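# Illustrative sketch (not part of the operator): the two merge paths used above,
# on the index versus on explicit key columns. DataFrames and column names are made up.
import pandas as pd

left = pd.DataFrame({'key': [1, 2, 3], 'a': ['x', 'y', 'z']})
right = pd.DataFrame({'key': [2, 3, 4], 'b': [20, 30, 40]})

# path 1: on_index=True - both sides joined on their index
on_index = pd.merge(left.set_index('key'), right.set_index('key'),
                    how='inner', left_index=True, right_index=True)

# path 2: left_on/right_on - joined on named key columns
on_columns = pd.merge(left, right, how='inner', left_on=['key'], right_on=['key'])

print(on_index)
print(on_columns)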
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'anonymizeData'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')
    logger.debug("Process started")
    time_monitor = tp.progress()
    result = ''
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    prev_att = msg.attributes
    df = msg.body

    ###### start of doing calculation
    to_nan = tfp.read_value(api.config.to_nan, test_number=False)
    if to_nan:
        df.replace(to_nan, np.nan, inplace=True)

    anonymize_to_int_cols = tfp.read_list(api.config.anonymize_to_int_cols, list(df.columns))
    anonymize_cols = tfp.read_list(api.config.anonymize_cols, list(df.columns))

    ## Anonymize columns
    if anonymize_cols:
        logger.debug('Anonymize Columns: {}'.format(str(anonymize_cols)))
        # ensure that id columns are not anonymized in this section but exclusively in the to-integer section
        anonymize_cols = [c for c in anonymize_cols if not c in anonymize_to_int_cols]

        # replacing strings with random strings
        for c in df[anonymize_cols].select_dtypes(include='object'):
            unique_list = df[c].unique()
            n = int(math.log10(len(unique_list))) + 2
            # create random map first, then check if keys are in the keep_terms list and replace the random values
            rep_map = {x: ''.join(random.choices(string.ascii_letters, k=n))
                       for x in unique_list if isinstance(x, str)}
            for ktk, ktv in keep_terms.items():
                if ktk in rep_map.keys():
                    rep_map[ktk] = ktv
            df[c].replace(rep_map, inplace=True)

        # linear shift of integers
        for c in df[anonymize_cols].select_dtypes(include='int'):
            unique_i = df[c].unique()
            max_i = max(unique_i)
            min_i = min(unique_i)
            length = max_i - min_i
            rand_int1 = random.randint(0, 100)
            rand_int2 = random.randint(0, 100)
            # preserves existing/binary values 0 and 1
            if not (len(unique_i) == 2 and 0 in unique_i and 1 in unique_i):
                df[c] = ((df[c] - min_i) / length * rand_int1 + rand_int2).astype('int')

        # linear shift of floats
        for c in df[anonymize_cols].select_dtypes(include='float'):
            unique_f = df[c].unique()
            max_f = max(unique_f)
            min_f = min(unique_f)
            length = max_f - min_f
            rand_float1 = random.random()
            rand_float2 = random.random()
            df[c] = ((df[c] - min_f) / length * rand_float1 + rand_float2) / 2.0

    if anonymize_to_int_cols:
        logger.debug('Anonymize to Integer Columns: {}'.format(str(anonymize_to_int_cols)))
        # replacing values with random integers
        for c in df[anonymize_to_int_cols]:
            unique_list = df[c].unique()
            rand_list = list(np.random.choice(1000 * len(unique_list), len(unique_list), replace=False))
            # create random map from unique values to random integers
            rep_map = dict(zip(unique_list, rand_list))
            df[c].replace(rep_map, inplace=True)

    enumerate_cols = tfp.read_list(api.config.enumerate_cols, list(df.columns))
    if enumerate_cols:
        ncols = int(math.log10(len(enumerate_cols))) + 1
        prefix_cols = tfp.read_value(api.config.prefix_cols)
        if not prefix_cols:
            prefix_cols = 'Att_'
        cols_map = {oc: prefix_cols + str(i).zfill(ncols) for i, oc in enumerate(enumerate_cols)}
        df.rename(columns=cols_map, inplace=True)
    # end custom process definition

    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att_dict['storage.fileIndex'] + 1,
                                                              att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
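# Illustrative sketch (not part of the operator): the string-anonymization mapping for one
# column - every distinct string is replaced by a random token whose length grows with the
# number of distinct values. Column name, data and keep_terms are made-up assumptions.
import math
import random
import string
import pandas as pd

df = pd.DataFrame({'city': ['Berlin', 'Paris', 'Berlin', 'Rome']})
keep_terms = {'Rome': 'Rome'}  # values that must survive anonymization unchanged

unique_vals = df['city'].unique()
n = int(math.log10(len(unique_vals))) + 2  # token length derived from cardinality
rep_map = {v: ''.join(random.choices(string.ascii_letters, k=n))
           for v in unique_vals if isinstance(v, str)}
rep_map.update({k: v for k, v in keep_terms.items() if k in rep_map})

df['city'] = df['city'].replace(rep_map)
print(df)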
def process(db_msg):
    logger, log_stream = slog.set_logging('topic_identification', loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    columns = [c['name'] for c in db_msg.attributes['table']['columns']]
    df = pd.DataFrame(db_msg.body, columns=columns)

    # Language filter
    language_filter = tfp.read_list(api.config.language_filter)
    if language_filter:
        df = df.loc[df["LANGUAGE"].isin(language_filter)]
        logger.info('Languages: {}'.format(language_filter))

    # Word type filter
    word_type_filter = tfp.read_value(api.config.word_type_filter)
    if word_type_filter:
        types = [c for c in word_type_filter]
        df = df.loc[df["TYPE"].isin(types)]
        logger.info('Words restricted to types: {}'.format(word_type_filter))

    # group by text and concatenate words
    gdf = df.groupby('HASH_TEXT').agg({"LANGUAGE": 'first',
                                       "WORD": [(lambda x: ' '.join(x)), 'count']})
    gdf.columns = gdf.columns.droplevel(level=0)
    gdf.rename(columns={"first": 'LANGUAGE', "count": 'NUM_WORDS', '<lambda_0>': 'WORDS'}, inplace=True)

    # create document-term matrix - no tokenization or text prep needed
    tf_vectorizer = CountVectorizer(analyzer='word', min_df=1, lowercase=False, tokenizer=str.split)

    # tf means term-frequency in a document, computed per language
    date_today = str(date.today())
    # 2d-array with TOPIC, LANGUAGE, TYPE, DATE, EXPIRY_DATE, ATTRIBUTE, KEYWORD_i (num of topics)
    topic_list = list()
    for lang in language_filter:
        lang_gdf = gdf.loc[gdf['LANGUAGE'] == lang]
        logger.info("Language: {} #articles: {} av.words/article: {:.1f}".format(
            lang, lang_gdf.shape[0], lang_gdf['NUM_WORDS'].mean()))
        dtm_tf = tf_vectorizer.fit_transform(lang_gdf['WORDS'])
        # LDA on the tf document-term matrix
        lda_tf = LatentDirichletAllocation(n_components=api.config.num_topics,
                                           learning_method='online',
                                           evaluate_every=-1,
                                           n_jobs=-1)
        lda_tf.fit(dtm_tf)

        feature_names = tf_vectorizer.get_feature_names()
        for i, topic in enumerate(lda_tf.components_):
            topic_words = [feature_names[f] for f in topic.argsort()[:-api.config.topic_words - 1:-1]]
            logger.debug('Len: {} topic_words: {}'.format(len(topic_words), topic_words))
            row = [date_today + "-" + str(i), lang, 'ALGO', date_today, None, None] + topic_words
            logger.debug('Len: {} record: {}'.format(len(row), row))
            topic_list.append(row)

    attributes = {
        "table": {
            "columns": [
                {"class": "string", "name": "TOPIC", "nullable": False, "size": 80, "type": {"hana": "NVARCHAR"}},
                {"class": "string", "name": "LANGUAGE", "nullable": False, "size": 2, "type": {"hana": "NVARCHAR"}},
                {"class": "string", "name": "TYPE", "nullable": False, "size": 10, "type": {"hana": "NVARCHAR"}},
                {"class": "string", "name": "DATE", "nullable": True, "type": {"hana": "DATE"}},
                {"class": "string", "name": "EXPIRY_DATE", "nullable": True, "type": {"hana": "DATE"}},
                {"class": "string", "name": "ATTRIBUTE", "nullable": True, "size": 25, "type": {"hana": "NVARCHAR"}}
            ],
            "name": "DIPROJECTS.WORD_INDEX",
            "version": 1
        }
    }
    for i in range(1, api.config.topic_words + 1):
        attributes['table']['columns'].append({"class": "string", "name": "KEYWORD_" + str(i),
                                               "nullable": True, "size": 80, "type": {"hana": "NVARCHAR"}})

    msg = api.Message(attributes=attributes, body=topic_list)
    logger.debug('Process ended, topics processed {}'.format(time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())
    api.send(outports[1]['name'], msg)
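# Illustrative sketch (not part of the operator): document-term matrix from pre-tokenized
# word strings, LDA on top, and the top-n words per topic. Corpus and parameters are toy
# values; on scikit-learn < 1.0 use get_feature_names() instead of get_feature_names_out().
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

docs = ["wheat corn harvest field", "goal match player league",
        "harvest corn price wheat", "league player transfer goal"]
num_topics, n_topic_words = 2, 3

tf_vectorizer = CountVectorizer(analyzer='word', min_df=1, lowercase=False, tokenizer=str.split)
dtm = tf_vectorizer.fit_transform(docs)

lda = LatentDirichletAllocation(n_components=num_topics, learning_method='online', n_jobs=-1)
lda.fit(dtm)

feature_names = tf_vectorizer.get_feature_names_out()
for i, topic in enumerate(lda.components_):
    top_words = [feature_names[f] for f in topic.argsort()[:-n_topic_words - 1:-1]]
    print('topic', i, top_words)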
def process(msg):
    global id_set
    global list_df
    global sentiment_words_df
    global att_dict

    logger, log_stream = slog.set_logging(operator_name, api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    if msg:
        att_dict = msg.attributes

    # sync data and setup msg
    # Case: do sentiment AND sentiment setup not done yet / msg with data
    if api.config.sentiments and sentiment_words_df.empty:
        logger.info('Sentiment word list not setup yet!')
        api.send(outports[0]['name'], log_stream.getvalue())
        att_dict = msg.attributes
        if list_df.empty:
            list_df = msg.body
        else:
            list_df = pd.concat([list_df, msg.body])
        return 0
    # Case: sentiment setup called process and data has been sent previously
    elif msg == None and not list_df.empty:
        df = list_df
    # Case: sentiment setup called process and data has NOT been sent previously
    elif msg == None and list_df.empty:
        return 0
    # Case: data sent and no sentiment analysis is required
    else:
        df = msg.body

    logger.debug('Attributes: {}'.format(att_dict))
    logger.debug('DataFrame: {} - {}'.format(df.shape[0], df.shape[1]))
    if df.shape[0] == 0:
        logger.warning('Empty DataFrame')
        return 0

    # Remove IDs that have already been processed
    # Should already have been filtered out by 'Doc Prepare'
    prev_num_rows = df.shape[0]
    df = df.loc[~df['text_id'].isin(id_set)]
    post_num_rows = df.shape[0]
    if prev_num_rows != post_num_rows:
        logger.warning('Processed text_id has been found: {} -> {}'.format(prev_num_rows, post_num_rows))
    id_set.update(df['text_id'].unique().tolist())

    # Languages
    language_filter = tfp.read_value(api.config.language)
    logger.info('Language filter: {}'.format(language_filter))
    if not language_filter:
        language_filter = df['language'].unique().tolist()

    # Warning for languages not supported (checked before restricting to supported models)
    languages_not_supported = [lang for lang in language_filter if lang not in language_models.keys()]
    if languages_not_supported:
        logger.warning('The text of the following languages is not analysed due to unsupported language: {}'.format(
            languages_not_supported))

    language_filter = [lang for lang in language_filter if lang in language_models.keys()]
    nlp = dict()
    for lc in language_filter:
        nlp[lc] = spacy.load(language_models[lc])
    df = df.loc[df['language'].isin(language_filter)]

    # word types
    types = tfp.read_list(api.config.types)
    logger.info('Word types to be extracted: {}'.format(types))
    entity_types = tfp.read_list(api.config.entity_types)
    logger.info('Entity types to be extracted: {}'.format(entity_types))

    # Create doc for all texts
    word_bag_list = list()
    sentiment_list = list()

    def get_words(id, language, text):
        if not isinstance(text, str):
            logger.warning('Record with error - ID: {} - {}'.format(id, text))
            return -1
        doc = nlp[language](text)
        words = list()
        for t in types:
            words.extend([[id, language, t, token.lemma_[:api.config.max_word_len]]
                          for token in doc if token.pos_ == t])
        for et in entity_types:
            words.extend([[id, language, et, ent.text[:api.config.max_word_len]]
                          for ent in doc.ents if ent.label_ == et])
        # sentiment
        if language == 'DE' and api.config.sentiments:
            # collect all the lemmatized words of the text,
            # get the matching rows from sentiment_words_df and calculate the mean
            lemmatized = [token.lemma_.lower() for token in doc if token.pos_ in ['NOUN', 'VERB', 'ADJ']]
            # logger.debug('lemmatized words: {}'.format(lemmatized[0:10]))
            text_sentiment_words = sentiment_words_df.loc[sentiment_words_df.index.isin(lemmatized), 'value']
            text_sentiment_value = text_sentiment_words.mean()
            sentiment_list.append([id, text_sentiment_value])
            # logger.debug('id: {} #sentiment words: {} sentiment: {}'.format(
            #     id, text_sentiment_words.shape[0], text_sentiment_value))
        word_bag_list.append(pd.DataFrame(words, columns=['text_id', 'language', 'type', 'word']))

    df.apply(lambda x: get_words(x['text_id'], x['language'], x['text']), axis=1)

    # data message
    try:
        word_bag = pd.concat(word_bag_list)
    except ValueError as ex:
        logger.error('No words in message: {}'.format(att_dict))
        api.send(outports[0]['name'], log_stream.getvalue())
        log_stream.seek(0)
        log_stream.truncate()
        return 0

    word_bag = word_bag.loc[word_bag['word'].str.len() >= api.config.min_word_len]
    word_bag['count'] = 1
    word_bag = word_bag.groupby(['text_id', 'language', 'type', 'word'])['count'].sum().reset_index()

    # check for duplicates - should be unnecessary
    prev_num_rows = word_bag.shape[0]
    word_bag.drop_duplicates(ignore_index=True, inplace=True)
    post_num_rows = word_bag.shape[0]
    if not prev_num_rows == post_num_rows:
        logger.warning('Duplicates have been found: {} -> {}'.format(prev_num_rows, post_num_rows))

    api.send(outports[0]['name'], log_stream.getvalue())
    log_stream.seek(0)
    log_stream.truncate()

    data_msg = api.Message(attributes=att_dict, body=word_bag)
    logger.info('Labels in document: {}'.format(word_bag['type'].unique().tolist()))
    logger.debug('DataFrame shape: {} - {}'.format(word_bag.shape[0], word_bag.shape[1]))
    api.send(outports[2]['name'], data_msg)

    # sentiment message
    if api.config.sentiments and len(sentiment_list) > 0:
        sentiment_df = pd.DataFrame(sentiment_list, columns=['text_id', 'sentiment'])
        sentiment_df.drop_duplicates(inplace=True)
        sentiment_csv = sentiment_df.to_csv(index=False)
        sentiment_msg = api.Message(attributes=att_dict, body=sentiment_csv)
        api.send(outports[1]['name'], sentiment_msg)

    # log message
    api.send(outports[0]['name'], log_stream.getvalue())
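# Illustrative sketch (not part of the operator): the per-text word-bag aggregation - lemmas
# of selected POS types plus named entities, counted per (text_id, language, type, word).
# Model name, the sample row and the type lists are made-up assumptions.
import pandas as pd
import spacy

nlp = spacy.load('en_core_web_sm')
types, entity_types, max_word_len = ['NOUN'], ['ORG'], 30

texts = pd.DataFrame({'text_id': [1], 'language': ['EN'],
                      'text': ['Siemens builds turbines and turbines need steel.']})

rows = []
for _, r in texts.iterrows():
    doc = nlp(r['text'])
    for t in types:
        rows += [[r['text_id'], r['language'], t, tok.lemma_[:max_word_len]]
                 for tok in doc if tok.pos_ == t]
    for et in entity_types:
        rows += [[r['text_id'], r['language'], et, ent.text[:max_word_len]]
                 for ent in doc.ents if ent.label_ == et]

word_bag = pd.DataFrame(rows, columns=['text_id', 'language', 'type', 'word'])
word_bag['count'] = 1
word_bag = word_bag.groupby(['text_id', 'language', 'type', 'word'])['count'].sum().reset_index()
print(word_bag)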