def process():
    operator_name = 'sql_word_index'
    logger, log_stream = slog.set_logging(operator_name, api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    language = tfp.read_value(api.config.language)
    type_limit = tfp.read_dict(api.config.type_limit_map)
    table_name = tfp.read_value(api.config.table_name)
    text_id_col = tfp.read_value(api.config.text_id_col)

    for i, (wtype, limit) in enumerate(type_limit.items()):
        sql_s = "SELECT {tid}, \"{tn}\".LANGUAGE, \"{tn}\".TYPE, \"{tn}\".WORD, COUNT FROM \"{tn}\" INNER JOIN "\
                "(SELECT WORD, TYPE, LANGUAGE, SUM(COUNT) as CUMS FROM \"{tn}\" "\
                "WHERE LANGUAGE = '{lang}' AND TYPE = '{wt}' "\
                "GROUP BY WORD, TYPE, LANGUAGE) AS CTABLE ON "\
                "\"{tn}\".WORD = CTABLE.WORD AND \"{tn}\".TYPE = CTABLE.TYPE AND \"{tn}\".LANGUAGE = CTABLE.LANGUAGE "\
                "WHERE CUMS >= {lt}".format(tid=text_id_col, tn=table_name, lang=language, wt=wtype, lt=limit)
        lastbatch = (i + 1 == len(type_limit))
        att_dict = {'operator': operator_name,
                    'parameter': {'type': wtype, 'limit': limit, 'language': language},
                    'message.batchIndex': i, 'message.batchSize': len(type_limit),
                    'message.lastBatch': lastbatch}
        msg = api.Message(attributes=att_dict, body=sql_s)
        api.send(outports[1]['name'], msg)
    api.send(outports[0]['name'], log_stream.getvalue())
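# Illustration of the statement the loop above renders, assuming hypothetical config
# values table_name='WORD_INDEX', text_id_col='TEXT_ID', language='EN' and a single
# type_limit entry {'PROPN': 5}: the inner SELECT sums COUNT per word over the whole
# corpus, and the join keeps only index rows of words that reach the limit.
#
#   SELECT TEXT_ID, "WORD_INDEX".LANGUAGE, "WORD_INDEX".TYPE, "WORD_INDEX".WORD, COUNT
#   FROM "WORD_INDEX" INNER JOIN
#     (SELECT WORD, TYPE, LANGUAGE, SUM(COUNT) as CUMS FROM "WORD_INDEX"
#      WHERE LANGUAGE = 'EN' AND TYPE = 'PROPN'
#      GROUP BY WORD, TYPE, LANGUAGE) AS CTABLE
#   ON "WORD_INDEX".WORD = CTABLE.WORD AND "WORD_INDEX".TYPE = CTABLE.TYPE
#      AND "WORD_INDEX".LANGUAGE = CTABLE.LANGUAGE
#   WHERE CUMS >= 5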
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'setValue'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    # map_values : column1: {from_value: to_value}, column2: {from_value: to_value}
    att_dict['config']['set_value'] = api.config.map_values
    maps_map = tfp.read_dict_of_dict(api.config.map_values)
    df.replace(maps_map, inplace=True)

    # fill NaN values : column1: value, column2: value
    att_dict['config']['fill_nan_values'] = api.config.fill_nan_values
    map_dict = tfp.read_dict(api.config.fill_nan_values)
    if map_dict:
        df.fillna(map_dict, inplace=True)

    ##############################################
    # final infos to attributes and info message
    ##############################################
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'groupby'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode:
        logger.setLevel('DEBUG')

    prev_att = msg.attributes
    df = msg.body

    ###### start of doing calculation
    # groupby list
    cols = tfp.read_list(api.config.groupby)
    att_dict['config']['groupby'] = api.config.groupby

    # mapping aggregation
    try:
        colagg = tfp.read_dict(api.config.aggregation)
    except IndexError:
        logger.info('Aggregation is not a map, trying to parse a value instead')
        colagg = tfp.read_value(api.config.aggregation)
    att_dict['config']['aggregation'] = api.config.aggregation

    # groupby
    logger.debug('Group columns: {}'.format(cols))
    logger.debug('Aggregation: {}'.format(colagg))
    logger.debug('Index: {}'.format(api.config.index))
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop columns
    att_dict['config']['dropcols'] = api.config.drop_columns
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols:
        df.drop(columns=dropcols, inplace=True)

    ##############################################
    # final infos to attributes and info message
    ##############################################
    att_dict['operator'] = 'groupbyDataFrame'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = list(df.columns)
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]
    example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'setValue'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    # map_values : column1: {from_value: to_value}, column2: {from_value: to_value}
    maps_map = tfp.read_dict_of_dict(api.config.map_values)
    df.replace(maps_map, inplace=True)
    logger.info('Replace values: {}'.format(maps_map))

    # fill NaN values : column1: value, column2: value
    map_dict = tfp.read_dict(api.config.fill_nan_values)
    if map_dict:
        df.fillna(map_dict, inplace=True)
        logger.info('Fill nan values: {}'.format(map_dict))
    # end custom process definition

    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att_dict['storage.fileIndex'] + 1,
                                                              att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {} - {}'.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'groupby'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    prev_att = msg.attributes
    df = msg.body
    prev_shape = df.shape

    ###### start of doing calculation
    # groupby list
    cols = tfp.read_list(api.config.groupby)

    # mapping aggregation
    try:
        colagg = tfp.read_dict(api.config.aggregation)
    except IndexError:
        logger.info('Aggregation is not a map, trying to parse a value instead')
        colagg = tfp.read_value(api.config.aggregation)

    # groupby
    logger.info('Group columns: {}'.format(cols))
    logger.info('Aggregation: {}'.format(colagg))
    logger.info('Index: {}'.format(api.config.index))
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop columns
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols:
        logger.info('Drop columns: {}'.format(dropcols))
        df.drop(columns=dropcols, inplace=True)
    # end custom process definition

    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {} - {}'.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'toCSV'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body
    if api.config.reset_index:
        logger.debug('Reset Index')
        df = df.reset_index()

    kwargs = tfp.read_dict(text=api.config.keyword_args, map_sep='=')
    if kwargs is not None:
        data_str = df.to_csv(sep=api.config.separator, index=api.config.write_index, **kwargs)
    else:
        data_str = df.to_csv(sep=api.config.separator, index=api.config.write_index)
    # end custom process definition

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    # create dict of columns and types for HANA
    map_hana = {'int8': 'TINYINT', 'int16': 'SMALLINT', 'int32': 'INTEGER', 'int64': 'BIGINT',
                'float32': 'FLOAT', 'float64': 'DOUBLE', 'object': 'VARCHAR', 'datetime64': 'TIMESTAMP'}
    col_dict = {c: str(df[c].dtype) for c in df.columns}
    hana_table_dict = list()
    for c, ty in col_dict.items():
        if ty == 'object':
            size = df[c].str.len().max()
            hana_table_dict.append({'name': c, 'type': map_hana[col_dict[c]], 'size': size})
        elif 'datetime64' in ty:
            hana_table_dict.append({'name': c, 'type': 'TIMESTAMP'})
        else:
            hana_table_dict.append({'name': c, 'type': map_hana[col_dict[c]]})
    logger.info('For Hana table definition: {}'.format(hana_table_dict))

    log = log_stream.getvalue()
    return log, data_str
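# Minimal sketch of the dtype-to-HANA mapping above on a small made-up frame; the
# column names and the expected result are assumptions derived from map_hana, not an
# official HANA API:
import pandas as pd

sample = pd.DataFrame({'id': pd.array([1, 2], dtype='int32'),
                       'name': ['Alice', 'Bob'],
                       'ts': pd.to_datetime(['2020-01-01', '2020-01-02'])})
# dtypes: 'int32' -> INTEGER; 'object' -> VARCHAR with size = longest string (5 here);
# 'datetime64[ns]' -> TIMESTAMP. The loop above would produce:
#   [{'name': 'id', 'type': 'INTEGER'},
#    {'name': 'name', 'type': 'VARCHAR', 'size': 5},
#    {'name': 'ts', 'type': 'TIMESTAMP'}]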
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'castColumns'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body
    castmap = tfp.read_dict(api.config.cast)
    if castmap:
        for col, casttype in castmap.items():
            if api.config.round:
                df[col] = df[col].round()
            df[col] = df[col].astype(casttype)
    ###### end calculation
    # end custom process definition

    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att_dict['storage.fileIndex'] + 1,
                                                              att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {} - {}'.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'dropColumns'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    att_dict['config']['drop_columns'] = api.config.drop_columns
    drop_cols = tfp.read_list(api.config.drop_columns, df.columns)
    if drop_cols:
        logger.debug("Drop columns: {}".format(str(drop_cols)))
        df = df.drop(columns=drop_cols)

    att_dict['config']['rename_columns'] = api.config.rename_columns
    map_names = tfp.read_dict(api.config.rename_columns)
    if map_names:
        df.rename(columns=map_names, inplace=True)
    ###### end of doing calculation

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    # df from body
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation
    # map_values : column1: {from_value: to_value}, column2: {from_value: to_value}
    att_dict['config']['set_value'] = api.config.map_values
    maps_map = tfp.read_dict_of_dict(api.config.map_values)
    df.replace(maps_map, inplace=True)

    # fill NaN values : column1: value, column2: value
    att_dict['config']['fill_nan_values'] = api.config.fill_nan_values
    map_dict = tfp.read_dict(api.config.fill_nan_values)
    if map_dict:
        df.fillna(map_dict, inplace=True)

    ##############################################
    # final infos to attributes and info message
    ##############################################
    if df.empty:
        raise ValueError('DataFrame is empty')
    att_dict['operator'] = 'setValue'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]
    example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation
    att_dict['config']['drop_columns'] = api.config.drop_columns
    drop_cols = tfp.read_list(api.config.drop_columns, df.columns)
    if drop_cols:
        logger.debug("Drop columns: {}".format(str(drop_cols)))
        df = df.drop(columns=drop_cols)

    att_dict['config']['rename_columns'] = api.config.rename_columns
    map_names = tfp.read_dict(api.config.rename_columns)
    if map_names:
        df.rename(columns=map_names, inplace=True)
    ###### end of doing calculation

    ##############################################
    # final infos to attributes and info message
    ##############################################
    # df from body
    att_dict['operator'] = 'dropColumns'  # name of operator
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['name'] = prev_att['name']
    att_dict['columns'] = list(df.columns)
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]
    example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation
    # groupby list
    cols = tfp.read_list(api.config.groupby)
    att_dict['config']['groupby'] = api.config.groupby

    # mapping
    colagg = tfp.read_dict(api.config.aggregation)
    att_dict['config']['aggregation'] = api.config.aggregation

    # groupby
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop columns
    att_dict['config']['dropcols'] = api.config.drop_columns
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols:
        df.drop(columns=dropcols, inplace=True)

    ##############################################
    # final infos to attributes and info message
    ##############################################
    att_dict['operator'] = 'groupbyDataFrame'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = list(df.columns)
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]
    example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body

    att_dict = dict()
    att_dict['config'] = dict()

    castmap = tfp.read_dict(api.config.cast)
    if castmap:
        for col, casttype in castmap.items():
            if api.config.round:
                df[col] = df[col].round()
            df[col] = df[col].astype(casttype)
    ###### end calculation

    ##############################################
    # final infos to attributes and info message
    ##############################################
    att_dict['operator'] = 'castDataFrame'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]
    if 'id' in prev_att.keys():
        att_dict['id'] = prev_att['id'] + '; ' + att_dict['operator'] + ': ' + str(id(df))
    else:
        att_dict['id'] = att_dict['operator'] + ': ' + str(id(df))
    example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])

    msg = api.Message(attributes=att_dict, body=df)
    # end custom process definition

    log = log_stream.getvalue()
    return log, msg
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'castColumns'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode:
        logger.setLevel('DEBUG')
    logger.debug("Process started")

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body

    castmap = tfp.read_dict(api.config.cast)
    if castmap:
        for col, casttype in castmap.items():
            if api.config.round:
                df[col] = df[col].round()
            df[col] = df[col].astype(casttype)
    ###### end calculation

    ##############################################
    # final infos to attributes and info message
    ##############################################
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))

    msg = api.Message(attributes=att_dict, body=df)
    # end custom process definition

    log = log_stream.getvalue()
    return log, msg
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'dropColumns'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    drop_cols = tfp.read_list(api.config.drop_columns, df.columns)
    if drop_cols:
        logger.debug("Drop columns: {}".format(str(drop_cols)))
        df = df.drop(columns=drop_cols)

    map_names = tfp.read_dict(api.config.rename_columns)
    if map_names:
        df.rename(columns=map_names, inplace=True)
    # end custom process definition

    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {} - {}'.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(msg_coef, msg_data):
    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    prev_att = msg_data.attributes
    df = msg_data.body
    coef_df = msg_coef.body
    if not isinstance(df, pd.DataFrame):
        logger.error('Message body does not contain a pandas DataFrame')
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation
    # segment columns
    segment_cols = None
    if 'segmentation_columns' in msg_coef.attributes:
        segment_cols = msg_coef.attributes['segmentation_columns']

    # regression columns
    regression_cols = msg_coef.attributes['regression_columns']

    # prediction column
    prediction_col = msg_coef.attributes['prediction_column']

    # setting values of the regression columns (if not already done in the data msg)
    att_dict['config']['regresssion_cols_value'] = api.config.regresssion_cols_value
    valmap = tfp.read_dict(api.config.regresssion_cols_value)
    if valmap:
        for col, val in valmap.items():
            if np.issubdtype(df[col].dtype, np.integer):
                val = int(val)
            elif np.issubdtype(df[col].dtype, np.floating):
                val = float(val)
            else:
                raise ValueError('Regression value needs to be numeric')
            df[col] = val

    # merge data and coef df
    if segment_cols:
        df = pd.merge(df, coef_df, how='inner', left_on=segment_cols, right_on=segment_cols)

    prefix = tfp.read_value(api.config.prediction_prefix)
    if prefix is None:
        prefix = ''
    pcol = prefix + prediction_col

    if segment_cols:
        def predict(x):
            x[pcol] = np.dot(x['coef'], x[regression_cols].values) + x['intercept']
            return x

        df = df.apply(predict, axis=1, result_type=None)
        df.drop(columns=['coef', 'intercept'], inplace=True)
    else:
        def predict(x):
            x[pcol] = np.dot(coef_df['coef'], x[regression_cols].values) + coef_df['intercept']
            return x

        df = df.apply(predict, axis=1, result_type=None)

    # cast the prediction column to the dtype of the target variable
    if np.issubdtype(df[prediction_col].dtype, np.integer):
        logger.debug('Cast prediction column to <int>')
        df[pcol] = df[pcol].round().astype(df[prediction_col].dtype)

    if api.config.prediction_col_only:
        logger.debug('Output only prediction columns')
        if segment_cols:
            df[prediction_col] = df[pcol]
            df = df[segment_cols + [prediction_col]]
        else:
            df = df[prediction_col]
    att_dict['config']['prediction_col_only'] = api.config.prediction_col_only
    ###### end of doing calculation

    ##############################################
    # final infos to attributes and info message
    ##############################################
    if df.empty:
        raise ValueError('DataFrame is empty')
    att_dict['operator'] = 'regressionTrainingDataFrame'
    att_dict['name'] = prev_att['name']
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
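# A minimal sketch of the per-row prediction used above: y = coef . x + intercept.
# The column names ('x1', 'x2', 'coef', 'intercept') and values are made up for
# illustration only.
import numpy as np
import pandas as pd

regression_cols = ['x1', 'x2']
row = pd.Series({'x1': 2.0, 'x2': 3.0, 'coef': np.array([0.5, 1.0]), 'intercept': 1.0})
y = np.dot(row['coef'], row[regression_cols].values) + row['intercept']
# 0.5*2.0 + 1.0*3.0 + 1.0 = 5.0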
def process(test_msg, base_msg):
    att_dict = base_msg.attributes
    att_dict['operator'] = 'fuzzyjoin'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    testdf_index = tfp.read_value(api.config.test_index)
    if not testdf_index:
        logger.error('Index of test data is mandatory')
        raise ValueError('Index of test data is mandatory')

    # get the columns to check
    mapping = tfp.read_dict(api.config.check_columns)
    df = pd.DataFrame()

    if mapping:
        # read stream from memory
        test_df = test_msg.body

        # test if all mapping columns are contained in test_df
        checkcols = [elem in list(test_df.columns) for elem in list(mapping.keys())]
        if not all(checkcols):
            error_txt = 'Elements in mapping are not contained in columns of test df : ' + \
                        str(list(mapping.keys())) + '-' + str(list(test_df.columns)) + ' - ' + str(checkcols)
            logger.error(error_txt)
            raise ValueError(error_txt)

        if not testdf_index in test_df.columns:
            logger.error('Test index needs to be column')
            raise ValueError('Test index needs to be column')

        tcols = ['t_' + c for c in list(mapping.keys())]
        tdf = pd.DataFrame(columns=tcols)
        df = base_msg.body
        df = pd.concat([df, tdf], axis=1)
        num_cols = len(mapping)

        # run over all test df rows and score them against the base df rows
        for index, test_row in test_df.iterrows():
            # apply function
            def get_ratio(row):
                sc = 0
                for tcol, bcol in mapping.items():
                    sc = sc + fuzz.token_sort_ratio(test_row[tcol], row[bcol])
                return sc / num_cols

            df['tscore'] = df.apply(get_ratio, axis=1)

            # get best matching and store index in v_dict
            max_score = df['tscore'].max()
            if max_score >= api.config.limit:
                mask = (df['tscore'] == max_score)
                df.loc[mask, 'score'] = max_score
                df.loc[mask, 'external_id'] = test_row[testdf_index]
                for coli in mapping:
                    df.loc[mask, 't_' + coli] = test_row[coli]
        df.drop(columns=['tscore'], inplace=True)

        # remove external_id when test column value has none
        t_cols = ['t_' + t for t in mapping.keys()] + ['external_id', 'score']
        for bcol in mapping.values():
            mask = df[bcol].isna()
            df.loc[mask, t_cols] = np.nan

        if api.config.only_index:
            df = df[list(base_msg.body.columns) + ['external_id']]

        if api.config.only_matching_rows:
            df = df.loc[~df['score'].isna()]

        basedf_index = tfp.read_value(api.config.base_index)
        if api.config.joint_id:
            if not basedf_index:
                raise ValueError("For <joint_id> a value for <base_index> is necessary ")
            df.loc[~df['external_id'].isna(), 'joint_id'] = df.loc[~df['external_id'].isna(), 'external_id']
            df.loc[df['external_id'].isna(), 'joint_id'] = df.loc[df['external_id'].isna(), basedf_index]

        if api.config.add_non_matching:
            # test if same columns
            if not all([elem in test_df.columns for elem in base_msg.body.columns]):
                raise ValueError('Adding test dataframe only possible when having same columns ' +
                                 str(test_df.columns) + ' vs. ' + str(base_msg.body.columns))
            matched_ids = df['external_id'].unique()
            addto_df = test_df.loc[~test_df[testdf_index].isin(matched_ids)].copy()
            addto_df['joint_id'] = addto_df[testdf_index]
            df = pd.concat([df, addto_df], axis=0, sort=False)
    else:
        logger.warning('No columns to check')
    # end custom process definition

    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {} - {}'.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
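# Minimal sketch of the scoring used above: the average token_sort_ratio over the
# mapped column pairs. Column names and values are made up; requires the
# fuzzywuzzy (or thefuzz) package.
from fuzzywuzzy import fuzz

mapping = {'name': 'company_name', 'city': 'town'}      # test column -> base column
test_row = {'name': 'Acme Corp Ltd', 'city': 'Berlin'}
base_row = {'company_name': 'Ltd. ACME Corp', 'town': 'Berlin'}
score = sum(fuzz.token_sort_ratio(test_row[t], base_row[b])
            for t, b in mapping.items()) / len(mapping)  # 0..100, higher = closer match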
def process(test_msg, base_msg):
    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    test_att = test_msg.attributes
    base_att = base_msg.attributes

    att_dict = dict()
    if test_att['name'] == base_att['name']:
        att_dict['name'] = test_att['name']
    else:
        att_dict['name'] = test_att['name'] + '-' + base_att['name']
    att_dict['config'] = dict()

    att_dict['config']['test_index'] = api.config.test_index
    testdf_index = tfp.read_value(api.config.test_index)
    if not testdf_index:
        logger.error('Index of test data is mandatory')
        raise ValueError('Index of test data is mandatory')

    att_dict['number_rows'] = str(base_msg.body.shape[0])

    # get the columns to check
    mapping = tfp.read_dict(api.config.check_columns)
    df = pd.DataFrame()

    if mapping:
        att_dict['config']['check_columns'] = str(mapping)
        att_dict['config']['limit'] = api.config.limit

        # read stream from memory
        test_df = test_msg.body

        # test if all mapping columns are contained in test_df
        checkcols = [elem in list(test_df.columns) for elem in list(mapping.keys())]
        if not all(checkcols):
            error_txt = 'Elements in mapping are not contained in columns of test df : ' + \
                        str(list(mapping.keys())) + '-' + str(list(test_df.columns)) + ' - ' + str(checkcols)
            logger.error(error_txt)
            raise ValueError(error_txt)

        if not testdf_index in test_df.columns:
            logger.error('Test index needs to be column')
            raise ValueError('Test index needs to be column')

        tcols = ['t_' + c for c in list(mapping.keys())]
        tdf = pd.DataFrame(columns=tcols)
        df = base_msg.body
        df = pd.concat([df, tdf], axis=1)
        num_cols = len(mapping)

        # run over all test df rows and score them against the base df rows
        for index, test_row in test_df.iterrows():
            # apply function
            def get_ratio(row):
                sc = 0
                for tcol, bcol in mapping.items():
                    sc = sc + fuzz.token_sort_ratio(test_row[tcol], row[bcol])
                return sc / num_cols

            df['tscore'] = df.apply(get_ratio, axis=1)

            # get best matching and store index in v_dict
            max_score = df['tscore'].max()
            if max_score >= api.config.limit:
                mask = (df['tscore'] == max_score)
                df.loc[mask, 'score'] = max_score
                df.loc[mask, 'external_id'] = test_row[testdf_index]
                for coli in mapping:
                    df.loc[mask, 't_' + coli] = test_row[coli]
        df.drop(columns=['tscore'], inplace=True)

        # remove external_id when test column value has none
        t_cols = ['t_' + t for t in mapping.keys()] + ['external_id', 'score']
        for bcol in mapping.values():
            mask = df[bcol].isna()
            df.loc[mask, t_cols] = np.nan

        if api.config.only_index:
            df = df[list(base_msg.body.columns) + ['external_id']]
        att_dict['config']['only_index'] = api.config.only_index

        if api.config.only_matching_rows:
            df = df.loc[~df['score'].isna()]
        att_dict['config']['only_matching_rows'] = api.config.only_matching_rows

        basedf_index = tfp.read_value(api.config.base_index)
        att_dict['config']['base_index'] = basedf_index
        if api.config.joint_id:
            if not basedf_index:
                raise ValueError("For <joint_id> a value for <base_index> is necessary ")
            df.loc[~df['external_id'].isna(), 'joint_id'] = df.loc[~df['external_id'].isna(), 'external_id']
            df.loc[df['external_id'].isna(), 'joint_id'] = df.loc[df['external_id'].isna(), basedf_index]
        att_dict['config']['joint_id'] = api.config.joint_id

        if api.config.add_non_matching:
            # test if same columns
            if not all([elem in test_df.columns for elem in base_msg.body.columns]):
                raise ValueError('Adding test dataframe only possible when having same columns ' +
                                 str(test_df.columns) + ' vs. ' + str(base_msg.body.columns))
            matched_ids = df['external_id'].unique()
            addto_df = test_df.loc[~test_df[testdf_index].isin(matched_ids)].copy()
            addto_df['joint_id'] = addto_df[testdf_index]
            df = pd.concat([df, addto_df], axis=0, sort=False)
        att_dict['config']['add_non_matching'] = api.config.add_non_matching
    else:
        logger.warning('No columns to check')

    ##############################################
    # final infos to attributes and info message
    ##############################################
    if df.empty:
        logger.warning('DataFrame is empty')
    else:
        att_dict['operator'] = 'fuzzyjoinDataFrames'
        att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
        att_dict['columns'] = str(list(df.columns))
        att_dict['number_columns'] = df.shape[1]
        att_dict['number_rows'] = df.shape[0]
        if 'id' in base_att.keys():
            att_dict['id'] = base_att['id'] + '; ' + att_dict['operator'] + ': ' + str(id(df))
        else:
            att_dict['id'] = att_dict['operator'] + ': ' + str(id(df))
        example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
        for i in range(0, example_rows):
            att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')
    logger.debug("Process started")

    # start custom process definition
    att_dict = dict()
    att_dict['config'] = dict()

    global result_df

    # json string of attributes already converted to dict
    # att_dict['prev_attributes'] = msg.attributes
    att_dict['filename'] = msg.attributes["storage.filename"]
    logger.info('Filename: {} index: {} count: {} endofSeq: {}'.format(msg.attributes["storage.filename"],
                                                                       msg.attributes["storage.fileIndex"],
                                                                       msg.attributes["storage.fileCount"],
                                                                       msg.attributes["storage.endOfSequence"]))

    # using file name from attributes of ReadFile
    if not api.config.df_name or api.config.df_name == "DataFrame":
        att_dict['name'] = att_dict['filename'].split(".")[0]

    if isinstance(msg.body, str):
        csv_io = io.StringIO(msg.body)
        logger.debug("Input format: <string>")
    elif isinstance(msg.body, bytes):
        csv_io = io.BytesIO(msg.body)
        logger.debug("Input format: <bytes>")
    elif isinstance(msg.body, io.BytesIO):
        logger.debug("Input format: <io.Bytes>")
        csv_io = msg.body
    else:
        raise TypeError('Message body has unsupported type' + str(type(msg.body)))

    # nrows
    nrows = None
    if not api.config.limit_rows == 0:
        nrows = api.config.limit_rows

    # usecols
    att_dict['config']['use_columns'] = api.config.use_columns
    use_cols = tfp.read_list(api.config.use_columns)

    # dtypes mapping
    att_dict['config']['dtypes'] = api.config.dtypes
    typemap = tfp.read_dict(api.config.dtypes)

    kwargs = tfp.read_dict(text=api.config.keyword_args, map_sep='=')
    if kwargs is None:
        kwargs = dict()

    ##### Read string from buffer
    logger.debug("Read from input")
    df = pd.read_csv(csv_io, sep=api.config.separator, usecols=use_cols, dtype=typemap,
                     decimal=api.config.decimal, nrows=nrows, **kwargs)

    # data from filename
    if api.config.data_from_filename and not api.config.data_from_filename == 'None':
        col = api.config.data_from_filename.split(':')[0].strip().strip("'").strip('"')
        pat = api.config.data_from_filename.split(':')[1].strip().strip("'").strip('"')
        logger.debug('Filename: {} pattern: {}'.format(att_dict['filename'], pat))
        try:
            dataff = re.match(pat, att_dict['filename'])
            df[col] = dataff.group(1)
        except AttributeError:
            raise ValueError('Pattern not found - Filename: {} pattern: {}'.format(att_dict['filename'], pat))

    # to datetime
    if api.config.todatetime and not api.config.todatetime == 'None':
        coldate = api.config.todatetime.split(':')[0].strip().strip("'").strip('"')
        dformat = api.config.todatetime.split(':')[1].strip().strip("'").strip('"')
        df[coldate] = pd.to_datetime(df[coldate], format=dformat)

    ###### Downcasting
    # save memory footprint for calculating the savings of the downcast
    att_dict['previous_memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    if api.config.downcast_int:
        df, dci = downcast(df, 'int', 'unsigned')
    if api.config.downcast_float:
        df, dcf = downcast(df, 'float', 'float')

    # check if index is provided and set
    index_list = tfp.read_list(api.config.index_cols)
    att_dict['config']['index_cols'] = str(index_list)
    att_dict['index_cols'] = str(index_list)
    if index_list:
        df.set_index(index_list, inplace=True)

    # stores the result in global variable result_df
    if msg.attributes['storage.fileIndex'] == 0:
        result_df = df
    else:
        result_df = pd.concat([result_df, df], axis=0, sort=False)

    ##############################################
    # final infos to attributes and info message
    ##############################################
    att_dict['operator'] = 'fromCSVDataFrame'
    att_dict['memory'] = result_df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = list(result_df.columns)
    att_dict['dtypes'] = {col: str(ty) for col, ty in df.dtypes.to_dict().items()}
    att_dict['number_columns'] = result_df.shape[1]
    att_dict['number_rows'] = result_df.shape[0]
    att_dict['id'] = str(id(result_df))
    example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in result_df.iloc[i, :].tolist()])
    # end custom process definition

    msg = api.Message(attributes=att_dict, body=result_df)
    log = log_stream.getvalue()
    return log, msg
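# The downcast() helper is defined elsewhere in the repo; the following is a minimal
# sketch of what it presumably does, inferred only from how it is called above
# ((df, kind, target) -> (df, savings)). The real implementation may differ.
import pandas as pd

def downcast(df, dtype_kind, target):
    """Downcast all columns of the given kind ('int'/'float') via pd.to_numeric and
    return the frame plus the memory saved in MB. Sketch only, an assumption."""
    before = df.memory_usage(deep=True).sum() / 1024 ** 2
    for col in df.select_dtypes(include=[dtype_kind]).columns:
        df[col] = pd.to_numeric(df[col], downcast=target)
    return df, before - df.memory_usage(deep=True).sum() / 1024 ** 2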
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'groupby'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    prev_att = msg.attributes
    df = msg.body
    prev_shape = df.shape

    ###### start of doing calculation
    # groupby list
    cols = tfp.read_list(api.config.groupby)
    att_dict['config']['groupby'] = api.config.groupby

    # mapping aggregation
    try:
        colagg = tfp.read_dict(api.config.aggregation)
    except IndexError:
        logger.info('Aggregation is not a map, trying to parse a value instead')
        colagg = tfp.read_value(api.config.aggregation)
    att_dict['config']['aggregation'] = api.config.aggregation

    # groupby
    logger.debug('Group columns: {}'.format(cols))
    logger.debug('Aggregation: {}'.format(colagg))
    logger.debug('Index: {}'.format(api.config.index))
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop columns
    att_dict['config']['dropcols'] = api.config.drop_columns
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols:
        df.drop(columns=dropcols, inplace=True)

    ##############################################
    # final infos to attributes and info message
    ##############################################
    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    # logger.debug('Dropped duplicates: {}'.format(prev_shape[0] - df.shape[0]))
    logger.info('Dropped rows: {}'.format(prev_shape[0] - df.shape[0]))
    logger.info('Dropped columns: {}'.format(prev_shape[1] - df.shape[1]))
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'toCSV'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body
    if api.config.reset_index:
        logger.debug('Reset Index')
        df = df.reset_index()

    kwargs = tfp.read_dict(text=api.config.keyword_args, map_sep='=')
    if kwargs is not None:
        data_str = df.to_csv(sep=api.config.separator, index=api.config.write_index, **kwargs)
    else:
        data_str = df.to_csv(sep=api.config.separator, index=api.config.write_index)
    # end custom process definition

    att_dict['process_list'].append(att_dict['operator'])
    logger.info('Process list: {}'.format(att_dict['process_list']))

    # create dict of columns and types for HANA
    map_hana = {'int8': 'TINYINT', 'int16': 'SMALLINT', 'int32': 'INTEGER', 'int64': 'BIGINT',
                'float32': 'FLOAT', 'float64': 'DOUBLE', 'object': 'VARCHAR', 'datetime64': 'TIMESTAMP'}
    col_dict = {c: str(df[c].dtype) for c in df.columns}
    hana_table_dict = list()
    for c, ty in col_dict.items():
        if ty == 'object':
            size = df[c].str.len().max()
            hana_table_dict.append({'name': c, 'type': map_hana[col_dict[c]], 'size': size})
        elif 'datetime64' in ty:
            hana_table_dict.append({'name': c, 'type': 'TIMESTAMP'})
        else:
            hana_table_dict.append({'name': c, 'type': map_hana[col_dict[c]]})
    logger.info('For Hana table definition: {}'.format(hana_table_dict))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att_dict['storage.fileIndex'] + 1,
                                                              att_dict['storage.fileCount'])
    # note: the operator was already appended to process_list above
    logger.debug('Process ended: {} - {}'.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    log = log_stream.getvalue()
    return log, data_str
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'word_regex'
    logger, log_stream = slog.set_logging(att_dict['operator'], api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    # DataFrame
    logger.debug('Attributes: {}'.format(str(msg.attributes)))
    df = msg.body
    if not isinstance(df, pd.DataFrame) or df.empty:
        logger.warning('Empty dataframe, no output send!')
        api.send(outports[0]['name'], log_stream.getvalue())
        api.send(outports[2]['name'], api.Message(attributes=att_dict, body=df))
        return 0

    # in case the input comes from a DB
    df.rename(columns={'TEXT_ID': 'text_id', 'LANGUAGE': 'language', 'TYPE': 'type',
                       'WORD': 'word', 'COUNT': 'count'}, inplace=True)
    logger.debug('DataFrame columns: {}'.format(df.columns))

    df['word_m'] = np.nan
    df['word_r'] = np.nan
    df['word_orig'] = df['word']

    # word type
    word_types = tfp.read_list(api.config.word_types)
    if not word_types:
        word_types = list(df['type'].unique())
    logger.debug('Word types: {}'.format(word_types))

    # language filter
    language_filter = tfp.read_list(api.config.language_filter)
    if not language_filter:
        language_filter = list(df['language'].unique())
    logger.debug('Language filter: {}'.format(language_filter))

    mask = df['language'].isin(language_filter) & df['type'].isin(word_types)

    # regex patterns for word removal
    regex_wordr = tfp.read_list(api.config.pattern_word_removal)
    if regex_wordr:
        for ipat, pat in enumerate(regex_wordr):
            logger.info('Execute pattern: {} ({}/{})'.format(pat, ipat, len(regex_wordr)))
            api.send(outports[0]['name'], log_stream.getvalue())
            log_stream.truncate()
            log_stream.seek(0)
            df.loc[mask & df['word'].str.contains(pat=pat), 'word_r'] = pat

    # regex patterns for substring replacement
    regex_ssr = tfp.read_dict(api.config.pattern_substring_replace)
    if regex_ssr:
        for ipat, pat in enumerate(regex_ssr.items()):
            logger.info('Execute replace pattern: {} ({}/{})'.format(pat, ipat, len(regex_ssr)))
            api.send(outports[0]['name'], log_stream.getvalue())
            log_stream.truncate()
            log_stream.seek(0)
            df.loc[mask & df['word'].str.contains(pat=pat[0]), 'word_m'] = pat[0]
            df.loc[mask, 'word'] = df.loc[mask, 'word'].str.replace(pat[0], pat[1], regex=True)

    # send removed or replaced words to the "removed" port
    rm_df = df.loc[df[['word_r', 'word_m']].any(axis=1),
                   ['word_orig', 'word', 'word_r', 'word_m']].drop_duplicates()
    rm_csv = rm_df.to_csv(index=False)
    attributes_removed = att_dict.copy()
    attributes_removed['port'] = outports[1]['name']
    logger.debug('CSV send to port {} with #rows: {}'.format(attributes_removed['port'], rm_df.shape[0]))
    api.send(outports[1]['name'], api.Message(attributes=attributes_removed, body=rm_csv))

    # delete rows with not-nan value in 'word_r' and drop the word_r, word_m columns
    df.drop(df.loc[~df['word_r'].isnull()].index, axis=0, inplace=True)
    df.drop(columns=['word_r', 'word_m'], inplace=True)

    # group on text_id, language, type, word
    df = df.groupby(by=['text_id', 'language', 'type', 'word'])['count'].sum().reset_index()
    logger.info('Dataframe shape: {} - {}'.format(df.shape[0], df.shape[1]))

    att_dict['port'] = outports[2]['name']
    api.send(outports[2]['name'], api.Message(attributes=att_dict, body=df))
    api.send(outports[0]['name'], log_stream.getvalue())
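# Minimal sketch of the marking-and-replacing logic above in plain pandas; the
# patterns and words are made up for illustration:
import numpy as np
import pandas as pd

words = pd.DataFrame({'word': ['Berlin', '123', 'co-operate']})
words['word_r'] = np.nan
pat = r'^\d+$'                                              # hypothetical removal pattern
words.loc[words['word'].str.contains(pat), 'word_r'] = pat  # marks '123' for removal
words['word'] = words['word'].str.replace('-', '', regex=True)  # hypothetical replace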
def process(msg):
    att_dict = msg.attributes
    global result_df
    att_dict['operator'] = 'fromCSV'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    logger.info('Filename: {} index: {} count: {} endofSeq: {}'.format(msg.attributes["storage.filename"],
                                                                       msg.attributes["storage.fileIndex"],
                                                                       msg.attributes["storage.fileCount"],
                                                                       msg.attributes["storage.endOfSequence"]))

    if msg.body is None:
        logger.info('Process ended.')
        msg = api.Message(attributes=att_dict, body=result_df)
        log = log_stream.getvalue()
        return log, msg
    elif isinstance(msg.body, str):
        csv_io = io.StringIO(msg.body)
        logger.debug("Input format: <string>")
    elif isinstance(msg.body, bytes):
        csv_io = io.BytesIO(msg.body)
        logger.debug("Input format: <bytes>")
    elif isinstance(msg.body, io.BytesIO):
        logger.debug("Input format: <io.Bytes>")
        csv_io = msg.body
    else:
        raise TypeError('Message body has unsupported type' + str(type(msg.body)))

    # nrows
    nrows = None
    if not api.config.limit_rows == 0:
        nrows = api.config.limit_rows

    # usecols
    use_cols = tfp.read_list(api.config.use_columns)
    logger.debug('Columns used: {}'.format(use_cols))

    # dtypes mapping
    typemap = tfp.read_dict(api.config.dtypes)
    logger.debug('Type cast: {}'.format(str(typemap)))

    kwargs = tfp.read_dict(text=api.config.keyword_args, map_sep='=')
    if kwargs is None:
        kwargs = dict()

    ##### Read string from buffer
    logger.debug("Read from input")
    df = pd.read_csv(csv_io, sep=api.config.separator, usecols=use_cols, dtype=typemap,
                     decimal=api.config.decimal, nrows=nrows, **kwargs)

    # data from filename
    if api.config.data_from_filename and not api.config.data_from_filename == 'None':
        col = api.config.data_from_filename.split(':')[0].strip().strip("'").strip('"')
        pat = api.config.data_from_filename.split(':')[1].strip().strip("'").strip('"')
        logger.debug('Filename: {} pattern: {}'.format(att_dict['storage.filename'], pat))
        try:
            dataff = re.match(pat, att_dict['storage.filename'])
            df[col] = dataff.group(1)
        except AttributeError:
            raise ValueError('Pattern not found - Filename: {} pattern: {}'.format(
                att_dict['storage.filename'], pat))

    # to datetime
    if api.config.todatetime and not api.config.todatetime == 'None':
        dt_fmt = tfp.read_dict(api.config.todatetime)
        logger.debug('Time conversion {} by using UTC {}'.format(api.config.todatetime, api.config.utc))
        for col, fmt in dt_fmt.items():
            df[col] = pd.to_datetime(df[col], format=fmt, utc=api.config.utc)

    ###### Downcasting
    # save memory footprint for calculating the savings of the downcast
    logger.debug('Memory used before downcast: {}'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    if api.config.downcast_int:
        df, dci = downcast(df, 'int', 'unsigned')
    if api.config.downcast_float:
        df, dcf = downcast(df, 'float', 'float')

    # check if index is provided and set
    index_list = tfp.read_list(api.config.index_cols)
    if index_list:
        df.set_index(index_list, inplace=True)

    if api.config.collect:
        # stores the result in global variable result_df
        if msg.attributes['storage.fileIndex'] == 0:
            logger.debug('Added to DataFrame: {}'.format(att_dict['storage.filename']))
            result_df = df
        else:
            try:
                result_df = pd.concat([result_df, df], axis=0, sort=False)
            except Exception as e:
                logger.error(str(e))
                result_df = df
    else:
        result_df = df
    # end custom process definition

    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])
    logger.debug('Process ended: {} - {}'.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(msg):
    global ID_set
    operator_name = 'doc_prepare'
    logger, log_stream = slog.set_logging(operator_name, api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    att_dict = msg.attributes
    df = msg.body

    text_column = tfp.read_value(api.config.text_column)
    if not text_column:
        text_column = 'text'
    id_column = tfp.read_value(api.config.id_column)
    if not id_column:
        id_column = 'text_id'

    default_language = 'DE'
    if api.config.media_docs:
        language_column = 'language'
        df['language'] = default_language
        df.loc[df['media'].isin(['Lefigaro', 'Lemonde']), 'language'] = 'FR'
        df.loc[df['media'].isin(['Elpais', 'Elmundo']), 'language'] = 'ES'
    else:
        language_column = tfp.read_value(api.config.language_column)
        if not language_column:
            language_column = 'language'
        r_default_language = tfp.read_value(api.config.default_language)
        if r_default_language:
            default_language = r_default_language
        if not language_column in df.columns:
            df[language_column] = default_language
        else:
            df.loc[df[language_column].isna(), language_column] = default_language

    df.rename(columns={text_column: 'text', id_column: 'text_id', language_column: 'language'},
              inplace=True)
    logger.debug('Columns: {}'.format(df.columns))
    logger.info("Default language: {}".format(default_language))

    # if text is a binary
    if type(df['text'].iloc[0]) == bytes:
        logger.info('Text is bytes. Decoded to \'utf-8\'')
        df.text = df.text.str.decode('utf-8')

    # remove duplicates and already processed documents
    prev_num_rows = df.shape[0]
    df.drop_duplicates(subset=['text_id'], inplace=True)
    df = df.loc[~df['text_id'].isin(ID_set)]
    post_num_rows = df.shape[0]
    logger.debug('Docs reduced due to be already processed: {} - {}'.format(prev_num_rows, post_num_rows))
    ID_set.update(df.text_id.values.tolist())

    # replace html tags
    if api.config.remove_html_tags:
        df['text'] = df['text'].str.replace('<.*?>', '', regex=True)

    # correct common text format errors, e.g.
    #   [r'([\:,\.?!\)])([A-Z])', r'\1 \2']
    #   [r'(,)([a-z])', r'\1 \2']
    #   [r'(\"\.)([A-Z])', r'\1 \2']
    #   [r'(\.)(\"[A-Z])', r'\1 \2']
    repl_pattern = tfp.read_dict(api.config.pattern_substring_replace)
    if repl_pattern:
        logger.info('Apply regex to text: {}'.format(repl_pattern))
        for pat, repl in repl_pattern.items():
            mask = df['text'].str.contains(pat)
            df.loc[mask, 'text'] = df.loc[mask, 'text'].str.replace(pat, repl, regex=True)

    api.send(outports[0]['name'], log_stream.getvalue())
    api.send(outports[1]['name'],
             api.Message(attributes=att_dict, body=df[['text_id', 'language', 'text']]))
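# Minimal sketch of the substring correction above, using the first commented-out
# pattern (inserting a missing space after punctuation); the sample text is made up:
import pandas as pd

texts = pd.Series(['First sentence.Second sentence'])
fixed = texts.str.replace(r'([\:,\.?!\)])([A-Z])', r'\1 \2', regex=True)
# -> 'First sentence. Second sentence'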