def process(msg):
    global setup_data
    global last_msg
    global hash_list

    logger, log_stream = slog.set_logging('text cleansing', loglevel=api.config.debug_mode)
    logger.info("Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    msg = check_for_setup(logger, msg)
    if not msg:
        api.send(outports[0]['name'], log_stream.flush())
        return 0

    adict = msg.body
    att_dict = msg.attributes

    language_filter = tfp.read_value(api.config.language)

    article_words = dict()
    article_count = 0
    for index_article, article in enumerate(adict):
        language = language_dict[article['media']]

        # filter language
        if language_filter and not language_filter == language:
            # logger.debug('Language filtered out: {} ({})'.format(language, language_filter))
            continue
        article_count += 1

        # check if article has been processed
        if article['hash_text'] in hash_list:
            logger.debug('Article has already been processed: {} - {} - {}'.format(
                article['date'], article['media'], article['hash_text']))
            continue
        hash_list.append(article['hash_text'])

        text = article['text']
        text = re.sub(r'\d+', '', text.lower())
        text = re.sub(r'\b[a-z]\b', '', text)

        # Language settings
        if language == 'DE':
            doc = nlp_g(text)
        elif language == 'FR':
            doc = nlp_fr(text)
        elif language == 'ES':
            doc = nlp_es(text)
        else:
            logger.warning('Language not implemented')
            doc = None

        words = []
        # only when doc has been created - language exists
        if doc:
            if api.config.mode == 'P+NOUN':
                words = [token.lemma_[:api.config.max_word_len] for token in doc if token.pos_ in ['PROPN', 'NOUN']]
            elif api.config.mode == 'NOUN':
                words = [token.lemma_[:api.config.max_word_len] for token in doc if token.pos_ == 'NOUN']
            elif api.config.mode == 'PROPN':
                words = [token.lemma_[:api.config.max_word_len] for token in doc if token.pos_ == 'PROPN']
            else:
                words = [token.text[:api.config.max_word_len] for token in doc if not token.is_stop]

        # Remove blacklist words
        words = [w for w in words if w not in setup_data]

        if api.config.counter:
            article_words[article['hash_text']] = collections.Counter(words)
        else:
            article_words[article['hash_text']] = words

    msg = api.Message(attributes=att_dict, body=article_words)
    logger.info('File processed: {} #Articles: {} '.format(att_dict["storage.filename"], len(adict)))
    api.send(outports[0]['name'], log_stream.getvalue())
    api.send(outports[1]['name'], msg)
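# Standalone sketch (not part of the operator above) of the mode-dependent token
# extraction with spaCy. The model name and the 'mode'/'max_word_len' parameters
# are illustrative assumptions, not the operator's API.
#
#   import spacy
#
#   def extract_words(text, mode='NOUN', max_word_len=80):
#       nlp = spacy.load('de_core_news_sm')   # assumed German model
#       doc = nlp(text)
#       if mode == 'P+NOUN':
#           return [t.lemma_[:max_word_len] for t in doc if t.pos_ in ('PROPN', 'NOUN')]
#       elif mode == 'NOUN':
#           return [t.lemma_[:max_word_len] for t in doc if t.pos_ == 'NOUN']
#       elif mode == 'PROPN':
#           return [t.lemma_[:max_word_len] for t in doc if t.pos_ == 'PROPN']
#       # fallback: all non-stopword tokens
#       return [t.text[:max_word_len] for t in doc if not t.is_stop]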
def process(msg):
    att_dict = msg.attributes
    global result_df

    att_dict['operator'] = 'fromCSV'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    logger.info('Filename: {} index: {} count: {} endofSeq: {}'.format(
        msg.attributes["storage.filename"], msg.attributes["storage.fileIndex"],
        msg.attributes["storage.fileCount"], msg.attributes["storage.endOfSequence"]))

    if msg.body == None:
        logger.info('Process ended.')
        msg = api.Message(attributes=att_dict, body=result_df)
        log = log_stream.getvalue()
        return log, msg
    elif isinstance(msg.body, str):
        csv_io = io.StringIO(msg.body)
        logger.debug("Input format: <string>")
    elif isinstance(msg.body, bytes):
        csv_io = io.BytesIO(msg.body)
        logger.debug("Input format: <bytes>")
    elif isinstance(msg.body, io.BytesIO):
        logger.debug("Input format: <io.Bytes>")
        csv_io = msg.body
    else:
        raise TypeError('Message body has unsupported type' + str(type(msg.body)))

    # nrows
    nrows = None
    if not api.config.limit_rows == 0:
        nrows = api.config.limit_rows

    # usecols
    use_cols = tfp.read_list(api.config.use_columns)
    logger.debug('Columns used: {}'.format(use_cols))

    # dtypes mapping
    typemap = tfp.read_dict(api.config.dtypes)
    logger.debug('Type cast: {}'.format(str(typemap)))

    kwargs = tfp.read_dict(text=api.config.keyword_args, map_sep='=')

    ##### Read string from buffer
    logger.debug("Read from input")
    df = pd.read_csv(csv_io, api.config.separator, usecols=use_cols, dtype=typemap,
                     decimal=api.config.decimal, nrows=nrows, **kwargs)

    # Data from filename
    if api.config.data_from_filename and not api.config.data_from_filename == 'None':
        col = api.config.data_from_filename.split(':')[0].strip().strip("'").strip('"')
        pat = api.config.data_from_filename.split(':')[1].strip().strip("'").strip('"')
        logger.debug('Filename: {} pattern: {}'.format(att_dict['filename'], pat))
        try:
            dataff = re.match('.*(\d{4}-\d+-\d+).*', att_dict['filename'])
            df[col] = dataff.group(1)
        except AttributeError:
            raise ValueError('Pattern not found - Filename: {} pattern: {}'.format(att_dict['filename'], pat))

    # To Datetime
    if api.config.todatetime and not api.config.todatetime == 'None':
        dt_fmt = tfp.read_dict(api.config.todatetime)
        logger.debug('Time conversion {} by using UTC {}'.format(api.config.todatetime, api.config.utc))
        for col, fmt in dt_fmt.items():
            df[col] = pd.to_datetime(df[col], format=fmt, utc=api.config.utc)

    ###### Downcasting
    # save memory footprint for calculating the savings of the downcast
    logger.debug('Memory used before downcast: {}'.format(df.memory_usage(deep=True).sum() / 1024**2))
    if api.config.downcast_int:
        df, dci = downcast(df, 'int', 'unsigned')
    if api.config.downcast_float:
        df, dcf = downcast(df, 'float', 'float')

    # check if index is provided and set
    index_list = tfp.read_list(api.config.index_cols)
    if index_list:
        df.set_index(index_list, inplace=True)

    if api.config.collect:
        # stores the result in global variable result_df
        if msg.attributes['storage.fileIndex'] == 0:
            logger.debug('Added to DataFrame: {}'.format(att_dict['storage.filename']))
            result_df = df
        else:
            try:
                result_df = pd.concat([result_df, df], axis=0, sort=False)
            except Exception as e:
                logger.error(str(e))
                result_df = df
    else:
        result_df = df

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(
                att_dict['storage.fileIndex'] + 1, att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
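# The 'downcast' helper called above is not defined in this snippet. A minimal
# sketch of what it could look like, assuming it returns the DataFrame plus the
# memory saved in MB (the caller discards the second tuple element anyway).
#
#   import pandas as pd
#
#   def downcast(df, include_type, downcast_to):
#       before = df.memory_usage(deep=True).sum() / 1024**2
#       for col in df.select_dtypes(include=include_type).columns:
#           df[col] = pd.to_numeric(df[col], downcast=downcast_to)
#       saved = before - df.memory_usage(deep=True).sum() / 1024**2
#       return df, saved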
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'transposeColumn'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    att_dict['config']['reset_index'] = api.config.reset_index
    if api.config.reset_index:
        df.reset_index(inplace=True)

    # create DataFrame with numbered columns and concat it to df
    att_dict['config']['transpose_column'] = api.config.transpose_column
    trans_col = tfp.read_value(api.config.transpose_column)
    att_dict['config']['value_column'] = api.config.value_column
    val_col = tfp.read_value(api.config.value_column)

    # new columns
    tvals = list(df[trans_col].unique())
    if api.config.prefix:
        new_cols = {trans_col + '_' + str(v): v for v in tvals}
    else:
        new_cols = {str(v): v for v in tvals}
    t_df = pd.DataFrame(columns=new_cols.keys(), index=df.index)
    df = pd.concat([df, t_df], axis=1)

    # setting the corresponding column to the value of the value column
    for col, val in new_cols.items():
        df.loc[df[trans_col] == val, col] = df.loc[df[trans_col] == val, val_col]
    df.drop(columns=[trans_col, val_col], inplace=True)

    att_dict['config']['groupby'] = api.config.groupby
    gbcols = tfp.read_list(api.config.groupby, df.columns)
    # group df
    if gbcols:
        aggr_trans = api.config.aggr_trans.strip()
        aggr_default = api.config.aggr_default.strip()
        aggregation = dict()
        for col in df.columns:
            aggregation[col] = aggr_trans if col in new_cols else aggr_default
        aggregation = {c: a for c, a in aggregation.items() if c not in gbcols}
        df = df.groupby(gbcols, as_index=api.config.as_index).agg(aggregation)

    #####################
    # final infos to attributes and info message
    #####################
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')
    time_monitor = tp.progress()
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    equal_only = api.config.equal_only
    threshold = api.config.threshold
    upper_threshold = api.config.upper_threshold
    num_values = api.config.num_values

    transform_data = {'column': [], 'dtype': [], 'unique_values': [], 'action': []}

    for col in df[columns].select_dtypes(np.object):
        unique_vals = df.loc[df[col].notnull(), col].unique()
        if (len(unique_vals) == num_values) or (len(unique_vals) <= num_values and not equal_only):
            population = df[col].count() / df.shape[0]
            if population > upper_threshold and len(unique_vals) == 2:
                transform_data['column'].append(col)
                transform_data['dtype'].append(df[col].dtype)
                v0 = 0
                v1 = 1
                if df.loc[df[col] == unique_vals[0], col].count() > df.shape[0] * 0.5:
                    v0 = 1
                    v1 = 0
                # per definition first unique value 0, second unique value 1
                if v0 == 0:
                    transform_data['unique_values'].append(unique_vals)
                else:
                    transform_data['unique_values'].append([unique_vals[1], unique_vals[0]])
                transform_data['action'].append('map2')
                # print('{}: {} -> {}'.format(vals[0], df.loc[df[col]==vals[0], col].count(), v0))
                # print('{}: {} -> {}'.format(vals[1], df.loc[df[col]==vals[1], col].count(), v1))
                if not info_only:
                    df.loc[df[col] == unique_vals[0], col] = v0
                    df.loc[df[col] == unique_vals[1], col] = v1
                    df.loc[df[col].isnull(), col] = 0
                    df[col] = df[col].astype('int8')
            elif population < threshold or len(unique_vals) == 1:
                transform_data['column'].append(col)
                transform_data['dtype'].append(df[col].dtype)
                transform_data['unique_values'].append(unique_vals)
                transform_data['action'].append('map1')
                if not info_only:
                    df.loc[df[col].isin(unique_vals), col] = 1
                    df.loc[df[col].isnull(), col] = 0
                    df[col] = df[col].astype('int8')

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())
    return log_stream.getvalue(), \
           api.Message(attributes={'name': 'filter_by_population', 'type': 'DataFrame'}, body=df), \
           api.Message(attributes={'name': 'transformation', 'type': 'DataFrame'}, body=pd.DataFrame(transform_data))
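# Toy illustration (outside the operator) of the 'map2' action above: a two-valued
# object column is recoded to int8 and NaNs become 0. Column name and values are
# made up for the example.
#
#   import pandas as pd
#
#   df = pd.DataFrame({'smoker': ['yes', 'no', None, 'yes']})
#   vals = df.loc[df['smoker'].notnull(), 'smoker'].unique()   # ['yes', 'no']
#   df.loc[df['smoker'] == vals[0], 'smoker'] = 0
#   df.loc[df['smoker'] == vals[1], 'smoker'] = 1
#   df.loc[df['smoker'].isnull(), 'smoker'] = 0
#   df['smoker'] = df['smoker'].astype('int8')
#   print(df)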
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')
    time_monitor = tp.progress()
    result = ''
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation
    model = LGBMRegressor(
        n_estimators=200,
        learning_rate=0.03,
        num_leaves=32,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.073,
        min_split_gain=0.0222415,
        min_child_weight=40)

    att_dict['config']['train columns'] = api.config.train_cols
    train_cols = tfp.read_list(api.config.train_cols, df.columns)

    att_dict['config']['label'] = api.config.label
    label = tfp.read_value(api.config.label)
    if not label:
        raise ValueError('Label is mandatory')

    # cast to categorical dtype
    for c in df[train_cols].select_dtypes(include='category').columns:
        unique_num = len(df[c].unique())
        nan_num = df[c].isna().count()
        logger.debug('Cast to category - {}: unique {}, nan: {} of {}'.format(c, unique_num, nan_num, df.shape[0]))
        df[c] = df[c].cat.codes
        df[c] = df[c].astype('int32')

    if pd.api.types.is_categorical(df[label]):
        df[label] = df[label].astype('category')
        logger.debug('Cast label to <category>')
        df[label] = df[label].cat.codes
        df[label] = df[label].astype('int32')

    print(df.select_dtypes(include='category').head(10))
    logger.debug('Train with {} features'.format(len(train_cols)))
    print(train_cols)
    model.fit(df[train_cols], df[label], eval_metric='auc')
    ###### end of doing calculation

    ##############################################
    # final infos to attributes and info message
    ##############################################
    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['operator'] = 'lgbm_classifier'
    att_dict['name'] = prev_att['name']

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())
    return log_stream.getvalue(), api.Message(attributes=att_dict, body=model)
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'sample'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    # test if body refers to a DataFrame type
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        logger.error('Message body does not contain a pandas DataFrame')
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start calculation
    sample_size = api.config.sample_size
    if sample_size < 1:
        sample_size = int(sample_size * df.shape[0])
        if sample_size < 1:
            sample_size = 1
            logger.warning("Fraction of sample size too small. Set sample size to 1.")
    elif sample_size > df.shape[0]:
        logger.warning("Sample size larger than number of rows")

    logger.debug("Sample size: {}/{} ({})".format(sample_size, df.shape[0], sample_size / df.shape[0]))

    random_state = api.config.random_state
    invariant_column = tfp.read_value(api.config.invariant_column)
    if invariant_column and sample_size < df.shape[0]:
        # get the average number of records for each value of the invariant column
        sc_df = df.groupby(invariant_column)[invariant_column].count()
        sample_size_invariant = int(sample_size / sc_df.mean())
        sample_size_invariant = 1 if sample_size_invariant == 0 else sample_size_invariant  # ensure minimum
        sc_df = sc_df.sample(n=sample_size_invariant, random_state=random_state).to_frame()
        sc_df.rename(columns={invariant_column: 'sum'}, inplace=True)
        # sample the df by merging the two DataFrames
        df = pd.merge(df, sc_df, how='inner', right_index=True, left_on=invariant_column)
        df.drop(columns=['sum'], inplace=True)
    else:
        df = df.sample(n=sample_size, random_state=random_state)
    # end custom process definition

    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
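# Toy illustration of the invariant-column sampling above: whole groups (here
# 'order_id') are sampled instead of individual rows, so all rows of a sampled
# group stay together. Data and column names are made up.
#
#   import pandas as pd
#
#   df = pd.DataFrame({'order_id': [1, 1, 2, 2, 3, 3], 'item': list('abcdef')})
#   counts = df.groupby('order_id')['order_id'].count()
#   n_groups = 2                                   # e.g. sample_size / mean group size
#   picked = counts.sample(n=n_groups, random_state=1).to_frame('sum')
#   sampled = pd.merge(df, picked, how='inner', right_index=True, left_on='order_id')
#   sampled = sampled.drop(columns=['sum'])
#   print(sampled)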
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'selectValues'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body
    att_dict['config'] = dict()  # container for the config entries set below

    ######################### Start Calculation
    # save and reset indices
    index_names = df.index.names
    if index_names[0]:
        logger.debug("Reset index")
        df.reset_index(inplace=True)

    # prepare selection for numbers
    if api.config.selection_num and not api.config.selection_num.upper() == 'NONE':
        selection_map = tfp.read_relations(api.config.selection_num)
        for s in selection_map:
            if s[1] == '≤':
                df = df.loc[df[s[0]] <= s[2]]
            elif s[1] == '<':
                df = df.loc[df[s[0]] < s[2]]
            elif s[1] == '≥':
                df = df.loc[df[s[0]] >= s[2]]
            elif s[1] == '>':
                df = df.loc[df[s[0]] > s[2]]
            elif s[1] == '=':
                df = df.loc[df[s[0]] == s[2]]
            elif s[1] == '!':
                df = df.loc[df[s[0]] != s[2]]
            else:
                raise ValueError('Unknown relation: ' + str(s))
        att_dict['config']['selection_num'] = api.config.selection_num

    if api.config.selection_list and not api.config.selection_list.upper() == 'NONE':
        value_list_dict = tfp.read_dict_of_list(api.config.selection_list)
        for key, vl in value_list_dict.items():
            df = df.loc[df[key].isin(vl)]
        att_dict['config']['selection_list'] = api.config.selection_list

    # set index again
    if index_names[0]:
        att_dict['indices'] = index_names
        logger.debug('Set indices to: {}'.format(str(index_names)))
        df.set_index(keys=index_names, inplace=True)

    if df.empty:
        logger.error('DataFrame is empty')
        raise ValueError('DataFrame is empty')
    # end custom process definition

    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'splitSample'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    if api.config.split > df.shape[0]:
        logger.warning('Split larger than whole sample')
        split = 1
    elif api.config.split > 1:
        split = api.config.split / df.shape[0]
    else:
        split = api.config.split
    logger.info('Split DataFrame: {}'.format(split))

    if api.config.to_category:
        cast_cols = df.select_dtypes(include=np.object).columns
        for col in cast_cols:
            unique_num = len(df[col].unique())
            nan_num = df[col].isna().count()
            logger.debug('Cast to category - {}: unique {}, nan: {} of {}'.format(
                col, unique_num, nan_num, df.shape[0]))
            df[col] = df[col].astype('category')
        logger.info('Cast to category type: {}'.format(cast_cols))

    label = tfp.read_value(api.config.label_col)
    if label:
        label_vals = list(df[label].unique())
        tdf = list()
        for lab in label_vals:
            tdf.append(df.loc[df[label] == lab].sample(frac=split, random_state=api.config.seed))
        train_df = pd.concat(tdf)
        logger.info('Consider label ratio for splitting: {}'.format(label))
    else:
        train_df = df.sample(frac=split, random_state=api.config.seed)  # random state is a seed value
    test_df = df.drop(train_df.index)
    # end custom process definition

    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(
                att_dict['storage.fileIndex'] + 1, att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=train_df), \
           api.Message(attributes=att_dict, body=test_df)
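# Toy illustration of the label-aware split above: the same fraction is sampled
# from every label value so the class ratio is preserved in the training set.
# Data is made up.
#
#   import pandas as pd
#
#   df = pd.DataFrame({'label': ['a'] * 6 + ['b'] * 4, 'x': range(10)})
#   frac = 0.5
#   parts = [df.loc[df['label'] == lab].sample(frac=frac, random_state=1)
#            for lab in df['label'].unique()]
#   train_df = pd.concat(parts)
#   test_df = df.drop(train_df.index)
#   print(train_df.shape, test_df.shape)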
def process(msg):
    global blacklist
    global last_msg
    global hash_list

    logger, log_stream = slog.set_logging(operator_name, api.config.debug_mode)

    # Check if setup complete
    msg = check_for_setup(logger, msg, mode=api.config.mode, use_blacklist=api.config.use_blacklist)
    if not msg:
        api.send(outports[0]['name'], log_stream.flush())
        return 0

    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    adict = msg.body

    language_filter = tfp.read_value(api.config.language)
    mode = tfp.read_value(api.config.mode)
    if not mode or not any(m in mode for m in supported_word_types):
        raise Exception('Mode is mandatory parameter and valid values are: {}'.format(supported_word_types))
    use_keywords = True if 'K' in mode else False
    use_lexicon = True if 'L' in mode else False
    use_blacklist = api.config.use_blacklist
    logger.info('Usage: keywords: {} lexicon: {} blacklist: {} '.format(use_keywords, use_lexicon, use_blacklist))

    article_words = list()
    article_count = 0
    for index_article, article in enumerate(adict):
        language = media_languages[article['media']]

        # filter language
        if language_filter and not language_filter == language:
            # logger.debug('Language filtered out: {} ({})'.format(language, language_filter))
            continue
        article_count += 1

        # check if article has been processed
        if article['hash_text'] in hash_list:
            logger.debug('Article has already been processed: {} - {} - {}'.format(
                article['date'], article['media'], article['hash_text']))
            continue
        hash_list.append(article['hash_text'])

        doc = nlp_doc(logger, language, article['text'])

        words = dict()
        # only when doc has been created - language exists
        if doc:
            if 'P' in api.config.mode:
                words['P'] = [token.lemma_[:api.config.max_word_len] for token in doc if token.pos_ == 'PROPN']
            if 'N' in api.config.mode:
                words['N'] = [token.lemma_[:api.config.max_word_len] for token in doc if token.pos_ == 'NOUN']
            if 'X' in api.config.mode:
                words['X'] = [token.text[:api.config.max_word_len] for token in doc if not token.is_stop]
            if use_keywords:
                # words['K'] = [token.lemma_ for kw in keywords for token in doc if re.match(kw, token.lemma_)]
                words['K'] = [token.lemma_ for kw in keywords for token in doc if kw == token.lemma_]
            if use_lexicon and lexicon_languages[language]:
                words['L'] = [lexicon_stem[language][lw] for lw in lexicon_stem[language]
                              for token in doc if re.match(lw, token.lemma_)]
                words['L'] = [lexicon[language][lw] for lw in lexicon[language]
                              for token in doc if lw == token.lemma_]

        for m in words:
            # heuristics
            # remove preceding non-alpha characters
            words[m] = [re.sub('^[-\'\./+]', '', w) for w in words[m]]
            # remove trailing non-alpha characters
            words[m] = [re.sub('[-\./+]$', '', w) for w in words[m]]
            # minimum word length
            words[m] = [w for w in words[m] if len(w) >= api.config.min_word_len]
            # remove numbers and dates
            words[m] = [w for w in words[m] if not (re.findall('\d+[\.,]\d+', w) or re.findall('^\d+$', w))]
            # Remove blacklist words
            if use_blacklist:
                words[m] = [w for w in words[m] if w not in blacklist]

            article_words.append([article['hash_text'], language, m,
                                  collections.Counter(words[m]).most_common()])

    attributes = {
        "table": {
            "columns": [
                {"class": "string", "name": "HASH_TEXT", "nullable": True, "type": {"hana": "INTEGER"}},
                {"class": "string", "name": "LANGUAGE", "nullable": True, "size": 2, "type": {"hana": "NVARCHAR"}},
                {"class": "string", "name": "TYPE", "nullable": True, "size": 1, "type": {"hana": "NVARCHAR"}},
                {"class": "string", "name": "WORDS", "nullable": True, "type": {"hana": "ARRAY"}}
            ],
            "name": "DIPROJECTS.WORD_INDEX",
            "version": 1
        },
        "storage.filename": msg.attributes["storage.filename"]
    }
    attributes['counter'] = 'Y' if api.config.counter else 'N'
    table_msg = api.Message(attributes=attributes, body=article_words)

    logger.info('File processed: {} #Articles: {} '.format(msg.attributes["storage.filename"], len(adict)))
    api.send(outports[0]['name'], log_stream.getvalue())
    api.send(outports[1]['name'], table_msg)
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'transposeColumn'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    if api.config.reset_index:
        df.reset_index(inplace=True)
        logger.info('Reset index')

    # create DataFrame with numbered columns and concat it to df
    trans_col = tfp.read_value(api.config.transpose_column)
    logger.info('Transpose column: {}'.format(trans_col))
    val_col = tfp.read_value(api.config.value_column)
    logger.info('Value column: {}'.format(val_col))

    # new columns
    tvals = list(df[trans_col].unique())
    if api.config.prefix:
        new_cols = {trans_col + '_' + str(v): v for v in tvals}
    else:
        new_cols = {str(v): v for v in tvals}
    t_df = pd.DataFrame(columns=new_cols.keys(), index=df.index)
    df = pd.concat([df, t_df], axis=1)

    # setting the corresponding column to the value of the value column
    for col, val in new_cols.items():
        df.loc[df[trans_col] == val, col] = df.loc[df[trans_col] == val, val_col]
    df.drop(columns=[trans_col, val_col], inplace=True)

    gbcols = tfp.read_list(api.config.groupby, df.columns)
    # group df
    if gbcols:
        aggr_trans = api.config.aggr_trans.strip()
        aggr_default = api.config.aggr_default.strip()
        aggregation = dict()
        for col in df.columns:
            aggregation[col] = aggr_trans if col in new_cols else aggr_default
        aggregation = {c: a for c, a in aggregation.items() if c not in gbcols}
        df = df.groupby(gbcols, as_index=api.config.as_index).agg(aggregation)
        logger.info('Groupby: {}'.format(gbcols))
    # end custom process definition

    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
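# Toy illustration of the transpose step above: each distinct value of the
# transpose column becomes its own column, filled from the value column; a
# subsequent groupby can then collapse the rows per key. Data and column
# names are made up.
#
#   import pandas as pd
#
#   df = pd.DataFrame({'id': [1, 1, 2], 'kpi': ['rev', 'cost', 'rev'], 'val': [10, 7, 5]})
#   new_cols = {str(v): v for v in df['kpi'].unique()}
#   t_df = pd.DataFrame(columns=new_cols.keys(), index=df.index)
#   df = pd.concat([df, t_df], axis=1)
#   for col, val in new_cols.items():
#       df.loc[df['kpi'] == val, col] = df.loc[df['kpi'] == val, 'val']
#   df = df.drop(columns=['kpi', 'val'])
#   print(df)   # columns: id, rev, cost - one row per original record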
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'lgbm_classifier'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    model = LGBMRegressor(
        n_estimators=200,
        learning_rate=0.03,
        num_leaves=32,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.073,
        min_split_gain=0.0222415,
        min_child_weight=40)

    train_cols = tfp.read_list(api.config.train_cols, df.columns)
    logger.info('Train columns: {}'.format(train_cols))
    label = tfp.read_value(api.config.label_col)
    logger.info('Label column: {}'.format(label))
    if not label:
        raise ValueError('Label is mandatory')

    # cast to categorical dtype
    for c in df[train_cols].select_dtypes(include='category').columns:
        unique_num = len(df[c].unique())
        nan_num = df[c].isna().count()
        logger.debug('Cast to category - {}: unique {}, nan: {} of {}'.format(c, unique_num, nan_num, df.shape[0]))
        df[c] = df[c].cat.codes
        df[c] = df[c].astype('int32')

    if pd.api.types.is_categorical(df[label]):
        df[label] = df[label].astype('category')
        logger.debug('Cast label to <category>')
        df[label] = df[label].cat.codes
        df[label] = df[label].astype('int32')

    print(df.select_dtypes(include='category').head(10))
    logger.debug('Train with {} features'.format(len(train_cols)))
    print(train_cols)
    model.fit(df[train_cols], df[label], eval_metric='auc')
    ###### end of doing calculation

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
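# Minimal, self-contained sketch of the LightGBM training call above on a tiny
# synthetic dataset. Data and hyperparameters are assumptions for illustration,
# not the operator's tuned settings.
#
#   import pandas as pd
#   from lightgbm import LGBMRegressor
#
#   df = pd.DataFrame({'f1': [1, 2, 3, 4, 5, 6], 'f2': [1, 0, 1, 0, 1, 0],
#                      'target': [0, 1, 0, 1, 0, 1]})
#   model = LGBMRegressor(n_estimators=20, learning_rate=0.1, num_leaves=8,
#                         min_child_samples=1)
#   model.fit(df[['f1', 'f2']], df['target'])
#   print(model.predict(df[['f1', 'f2']])[:3])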
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'anonymizeData'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')
    logger.debug("Process started")
    time_monitor = tp.progress()
    result = ''
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    prev_att = msg.attributes
    df = msg.body

    ###### start of doing calculation
    att_dict['config']['to_nan'] = api.config.to_nan
    to_nan = tfp.read_value(api.config.to_nan, test_number=False)
    if to_nan:
        df.replace(to_nan, np.nan, inplace=True)

    att_dict['config']['anonymize_to_int_cols'] = api.config.anonymize_to_int_cols
    anonymize_to_int_cols = tfp.read_list(api.config.anonymize_to_int_cols, list(df.columns))

    att_dict['config']['anonymize_cols'] = api.config.anonymize_cols
    anonymize_cols = tfp.read_list(api.config.anonymize_cols, list(df.columns))

    ## Anonymize columns
    if anonymize_cols:
        logger.debug('Anonymize Columns: {}'.format(str(anonymize_cols)))
        # ensure that ids are not anonymized in this section but exclusively in the id-section
        anonymize_cols = [c for c in anonymize_cols if not c in anonymize_to_int_cols]

        # replacing strings with a random string
        for c in df[anonymize_cols].select_dtypes(include='object'):
            unique_list = df[c].unique()
            n = int(math.log10(len(unique_list))) + 2
            # create random map first, then check if keys have the values of the keep_list and replace the random values
            rep_map = {x: ''.join(random.choices(string.ascii_letters, k=n))
                       for x in unique_list if isinstance(x, str)}
            for ktk, ktv in keep_terms.items():
                if ktk in rep_map.keys():
                    rep_map[ktk] = ktv
            df[c].replace(rep_map, inplace=True)

        # linear shift of integers
        for c in df[anonymize_cols].select_dtypes(include='int'):
            unique_i = df[c].unique()
            max_i = max(unique_i)
            min_i = min(unique_i)
            length = max_i - min_i
            rand_int1 = random.randint(0, 100)
            rand_int2 = random.randint(0, 100)
            # preserves existing/binary values 0 and 1
            if not (len(unique_i) == 2 and 0 in unique_i and 1 in unique_i):
                df[c] = ((df[c] - min_i) / length * rand_int1 + rand_int2).astype('int')

        # linear shift of floats
        for c in df[anonymize_cols].select_dtypes(include='float'):
            unique_f = df[c].unique()
            max_f = max(unique_f)
            min_f = min(unique_f)
            length = max_f - min_f
            rand_float1 = random.random()
            rand_float2 = random.random()
            df[c] = ((df[c] - min_f) / length * rand_float1 + rand_float2) / 2.0

    if anonymize_to_int_cols:
        logger.debug('Anonymize to Integer Columns: {}'.format(str(anonymize_to_int_cols)))
        # replacing values with random integers
        for c in df[anonymize_to_int_cols]:
            unique_list = df[c].unique()
            rand_list = list(np.random.choice(1000 * len(unique_list), len(unique_list), replace=False))
            # create random map first, then check if keys have the values of the keep_list and replace the random values
            rep_map = dict(zip(unique_list, rand_list))
            df[c].replace(rep_map, inplace=True)

    att_dict['config']['enumerate_cols'] = api.config.enumerate_cols
    att_dict['config']['prefix_cols'] = api.config.prefix_cols
    enumerate_cols = tfp.read_list(api.config.enumerate_cols, list(df.columns))
    if enumerate_cols:
        ncols = int(math.log10(len(enumerate_cols))) + 1
        prefix_cols = tfp.read_value(api.config.prefix_cols)
        if not prefix_cols:
            prefix_cols = 'Att_'
        cols_map = {oc: prefix_cols + str(i).zfill(ncols) for i, oc in enumerate(enumerate_cols)}
        df.rename(columns=cols_map, inplace=True)
    ###### end of doing calculation

    ##############################################
    # final infos to attributes and info message
    ##############################################
    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())
    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body

    att_dict = dict()
    att_dict['config'] = dict()

    ######################### Start Calculation
    # save and reset indices
    index_names = df.index.names
    if index_names[0]:
        logger.debug("Reset index")
        df.reset_index(inplace=True)

    # prepare selection for numbers
    if api.config.selection_num and not api.config.selection_num.upper() == 'NONE':
        selection_map = tfp.read_relations(api.config.selection_num)
        for s in selection_map:
            if s[1] == '≤':
                df = df.loc[df[s[0]] <= s[2]]
            elif s[1] == '<':
                df = df.loc[df[s[0]] < s[2]]
            elif s[1] == '≥':
                df = df.loc[df[s[0]] >= s[2]]
            elif s[1] == '>':
                df = df.loc[df[s[0]] > s[2]]
            elif s[1] == '=':
                df = df.loc[df[s[0]] == s[2]]
            elif s[1] == '!':
                df = df.loc[df[s[0]] != s[2]]
            else:
                raise ValueError('Unknown relation: ' + str(s))
        att_dict['config']['selection_num'] = api.config.selection_num

    if api.config.selection_list and not api.config.selection_list.upper() == 'NONE':
        value_list_dict = tfp.read_dict_of_list(api.config.selection_list)
        for key, vl in value_list_dict.items():
            df = df.loc[df[key].isin(vl)]
        att_dict['config']['selection_list'] = api.config.selection_list

    # set index again
    if index_names[0]:
        att_dict['indices'] = index_names
        logger.debug('Set indices to: {}'.format(str(index_names)))
        df.set_index(keys=index_names, inplace=True)

    if df.empty:
        logger.error('DataFrame is empty')
        raise ValueError('DataFrame is empty')
    ######################### End Calculation

    ##############################################
    # final infos to attributes and info message
    ##############################################
    att_dict['operator'] = 'selectDataFrame'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]
    if 'id' in prev_att.keys():
        att_dict['id'] = prev_att['id'] + '; ' + att_dict['operator'] + ': ' + str(id(df))
    else:
        att_dict['id'] = att_dict['operator'] + ': ' + str(id(df))

    example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
def process(msg1, msg2, msg3, msg4, msg5):
    adict = msg1.attributes
    msg_list = [msg1, msg2, msg3, msg4, msg5]

    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging('scrapy', loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging('scrapy', loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()
    # logger, log_stream = slog.set_logging('scrapy', loglevel=api.config.debug_mode)

    scrapy_dir = tfp.read_value(api.config.scrapy_dir)
    if not scrapy_dir:
        logger.error('Scrapy directory is a mandatory entry field')
        raise ValueError('Missing Scrapy Directory')
    logger.info('Change directory to: {}'.format(scrapy_dir))
    os.chdir(scrapy_dir)

    project_dir = tfp.read_value(api.config.project_dir)
    if not project_dir:
        logger.error('Scrapy project directory is a mandatory entry field')
        raise ValueError('Missing Scrapy Project Directory')
    project_dir = os.path.join(scrapy_dir, project_dir)

    new_file_list = []
    for msg in msg_list:
        filename = msg.attributes["storage.filename"]
        if filename == 'spider.py':
            filename = os.path.join(project_dir, 'spiders', filename)
        else:
            filename = os.path.join(project_dir, filename)
        # copy files to directories
        with open(filename, 'wb') as fout:
            logger.info('Write to filename (binary): {}'.format(filename))
            fout.write(msg.body)
        new_file_list.append(filename)

    for f in new_file_list:
        if os.path.isfile(f):
            logger.info('File successfully written: {} ({})'.format(f, time.ctime(os.path.getmtime(f))))
        else:
            logger.error('File does not exist: {}'.format(f))

    api.send(outports[0]['name'], log_stream.getvalue())
    log_stream.truncate(0)

    spiderlist = tfp.read_list(api.config.spider)
    num_spiders = len(spiderlist)
    num_batches = 0
    num_all_articles = 0
    if api.config.start_cmd:
        for i, spider in enumerate(spiderlist):
            media = spider.split('_')[0]
            cmd = ['scrapy', 'crawl', spider]
            logger.info('Start scrapy: {} ({}/{})'.format(cmd, i, num_spiders))
            api.send(outports[0]['name'], log_stream.getvalue())
            log_stream.truncate(0)

            # proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=scrapy_dir, universal_newlines=True)
            proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                  cwd=scrapy_dir, universal_newlines=True)
            # proc = subprocess.Popen(['python', '/Users/Shared/data/onlinemedia/outputgen.py'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
            # print('CWD: {}'.format(os.getcwd()))
            api.send(outports[1]['name'], proc.stderr)

            count_articles = 0
            articles_list = list()
            # run through stdout after scrape has ended and add to batch_output
            last_article = dict()
            for line in proc.stdout.splitlines():
                adict = format_check_output(line, logger)
                if adict:
                    adict['media'] = media
                    articles_list.append(adict)
                    last_article = adict
                    count_articles += 1

            # send result to outport
            if len(articles_list) == 0:
                logger.warning('No articles found: {}'.format(media))
                continue
            num_batches += 1

            attributes = {k: v for k, v in last_article.items() if k in ['website', 'date', 'columns']}
            attributes['filename'] = '{}_{}.json'.format(media, last_article['date'])
            attributes['batch.index'] = i
            attributes['batch.number'] = num_spiders
            if i + 1 == num_spiders:
                attributes['batch.last'] = True

            msg = api.Message(attributes=attributes, body=articles_list)
            api.send(outports[3]['name'], msg)

            if api.config.json_string_output:
                attributes['format'] = 'JSON String'
                msg = api.Message(attributes=attributes,
                                  body=json.dumps(articles_list, ensure_ascii=False, indent=4))
                api.send(outports[2]['name'], msg)

            logger.info('Spider completed: {} - #articles: {}'.format(spider, count_articles))
            num_all_articles += count_articles

    logger.info('Process ended: {} '.format(time_monitor.elapsed_time()))
    logger.info('<SCAN ENDED><{}>'.format(num_batches))
    api.send(outports[0]['name'], log_stream.getvalue())
    return 0
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'splitSample'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')
    time_monitor = tp.progress()
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    att_dict['config']['split'] = api.config.split
    if api.config.split > df.shape[0]:
        logger.warning('Split larger than whole sample')
        split = 1
    elif api.config.split > 1:
        split = api.config.split / df.shape[0]
    else:
        split = api.config.split

    att_dict['config']['to_category'] = api.config.to_category
    if api.config.to_category:
        for col in df.select_dtypes(include=np.object).columns:
            unique_num = len(df[col].unique())
            nan_num = df[col].isna().count()
            logger.debug('Cast to category - {}: unique {}, nan: {} of {}'.format(
                col, unique_num, nan_num, df.shape[0]))
            df[col] = df[col].astype('category')

    att_dict['config']['label'] = api.config.label
    label = tfp.read_value(api.config.label)
    if label:
        label_vals = list(df[label].unique())
        tdf = list()
        for lab in label_vals:
            tdf.append(df.loc[df[label] == lab].sample(frac=split, random_state=api.config.seed))
        train_df = pd.concat(tdf)
    else:
        train_df = df.sample(frac=split, random_state=api.config.seed)  # random state is a seed value
    test_df = df.drop(train_df.index)
    ###### end of doing calculation

    ##############################################
    # final infos to attributes and info message
    ##############################################
    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))

    train_msg = api.Message(attributes=att_dict, body=train_df)
    test_msg = api.Message(attributes=att_dict, body=test_df)
    logger.debug('End time: ' + time_monitor.elapsed_time())
    return log_stream.getvalue(), train_msg, test_msg
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation
    att_dict['prev_number_columns'] = df.shape[1]
    att_dict['prev_number_rows'] = df.shape[0]

    # att_dict['config']['remove_duplicates_cols'] = api.config.remove_duplicates_cols
    remove_duplicates_cols = tfp.read_list(api.config.remove_duplicates_cols)
    if remove_duplicates_cols:
        df = df.groupby(remove_duplicates_cols).first().reset_index()
        logger.debug('#Dropped duplicates: {} - {} = {}'.format(
            att_dict['prev_number_rows'], df.shape[0], att_dict['prev_number_rows'] - df.shape[0]))

    att_dict['config']['value_to_nan'] = api.config.value_to_nan
    value_to_nan = tfp.read_value(api.config.value_to_nan)
    if value_to_nan:
        # replace the value with NaN in all object-typed columns
        obj_cols = df.select_dtypes(include='object').columns
        df[obj_cols] = df[obj_cols].replace(value_to_nan, np.nan)

    att_dict['config']['yes_no_to_boolean'] = str(api.config.yes_no_to_num)
    if api.config.yes_no_to_num:
        prev_categoricals = len(df.select_dtypes(include=np.object).columns)
        for col in df.select_dtypes(include=np.object):
            df[col] = df[col].str.upper()
            vals = [x for x in df.loc[df[col].notnull(), col].unique()]
            if len(vals) == 1 and vals[0] in ['YES', 'Y']:
                df.loc[df[col].notnull(), col] = 1
                df.loc[df[col].isnull(), col] = 0
                try:
                    df[col] = df[col].astype('int8')
                except ValueError:
                    print('Value Error: {}'.format(col))
                    print(df[col].unique())
            if len(vals) == 1 and vals[0] in ['NO', 'N']:
                df.loc[df[col].notnull(), col] = 1
                df.loc[df[col].isnull(), col] = 0
                df[col] = df[col].astype('int8')
            if len(vals) == 2 and (all(i in vals for i in ['YES', 'NO']) or all(i in vals for i in ['Y', 'N'])):
                df[col] = df[col].replace(to_replace={'NO': 0, 'N': 0, 'no': 0, 'n': 0,
                                                      'YES': 1, 'Y': 1, 'yes': 1, 'y': 1})
                df[col] = df[col].astype('int8')
        after_categoricals = len(df.select_dtypes(include=np.object).columns)
        logger.debug('<yes_no_to_boolean> impact: {} -> {}'.format(prev_categoricals, after_categoricals))

    att_dict['config']['all_constant_to_NaN'] = str(api.config.all_constant_to_NaN)
    if api.config.all_constant_to_NaN:
        num_constant_cols = 0
        for col in df.columns:
            unique_vals = df[col].unique()
            if len(unique_vals) == 1:
                df[col] = np.nan
                num_constant_cols = num_constant_cols + 1
        logger.debug('<all_constant_to_NaN> number of columns: {}'.format(num_constant_cols))

    # remove rare value rows with quantile
    att_dict['config']['rare_value_cols'] = api.config.rare_value_cols
    att_dict['config']['rare_value_quantile'] = api.config.rare_value_quantile
    att_dict['config']['rare_value_std'] = api.config.rare_value_std
    rare_value_cols = tfp.read_list(api.config.rare_value_cols, list(df.columns))
    if rare_value_cols:
        logger.debug('quantile')
        # drop rare values by quantile
        if api.config.rare_value_quantile > 0:
            if not (api.config.rare_value_quantile >= 0 and api.config.rare_value_quantile < 1):
                raise ValueError('Quantile value range: [0,1[, not {}'.format(api.config.rare_value_quantile))
            num_reduce_categoricals_col = 0
            for col in rare_value_cols:
                unique_num = len(df[col].unique())
                val_num = df[col].count()
                ratio = df[col].count() / len(df[col].unique())
                threshold = df[col].count() / len(df[col].unique()) * api.config.rare_value_quantile
                value_counts = df[col].value_counts()
                # kept_values = value_counts[value_counts > threshold].count()
                if value_counts[value_counts > threshold].count() > 1:
                    to_remove = value_counts[value_counts <= threshold].index
                    if len(to_remove) > 0:
                        logger.debug('Drop rare value by quantile: Column {}: {}/{} '.format(
                            col, len(to_remove), unique_num))
                        df[col].replace(to_remove, np.nan, inplace=True)
                        num_reduce_categoricals_col += 1
            logger.debug('<rare_value_quantile> impact on columns: {}/{}'.format(
                num_reduce_categoricals_col, len(rare_value_cols)))

        # drop rare values by std
        if api.config.rare_value_std > 0:
            num_reduce_categoricals_col = 0
            for col in df.columns:
                unique_num = len(df[col].unique())
                value_counts = df[col].value_counts()
                mean = value_counts.mean()
                threshold = value_counts.mean() - value_counts.std() * api.config.rare_value_std
                if threshold > 1:
                    to_remove = value_counts[value_counts <= threshold].index
                    if len(to_remove) > 0:
                        logger.debug('Drop rare value by std: Column {}: {}/{} '.format(
                            col, len(to_remove), unique_num))
                        df[col].replace(to_remove, np.nan, inplace=True)
                        num_reduce_categoricals_col += 1
            logger.debug('<rare_value_std> impact on columns: {}/{}'.format(
                num_reduce_categoricals_col, len(rare_value_cols)))

    # for unique values less than threshold_unique set to 1. All NaN set to 0
    att_dict['config']['threshold_unique_cols'] = api.config.threshold_unique_cols
    att_dict['config']['threshold_unique'] = api.config.threshold_unique
    threshold_unique_cols = tfp.read_list(api.config.threshold_unique_cols, list(df.columns))
    if threshold_unique_cols:
        prev_obj_cols = len(df.select_dtypes("object"))
        for col in threshold_unique_cols:
            if df[col].dtype == np.object:
                unique_vals = list(df[col].unique())
                if len(unique_vals) <= api.config.threshold_unique:
                    # test if one of the values is nan
                    if np.nan in unique_vals:
                        df.loc[df[col].notnull(), col] = 1
                        df.loc[df[col].isnull(), col] = 0
                        df[col] = df[col].astype('int8')
        after_obj_cols = len(df.select_dtypes("object"))
        logger.debug('Threshold unique effect on number of categorical columns: {} -> {}'.format(
            prev_obj_cols, after_obj_cols))

    # for count values less than threshold_count set to NaN
    att_dict['config']['sparse_cols'] = api.config.sparse_cols
    att_dict['config']['sparse'] = api.config.sparse
    sparse_cols = tfp.read_list(api.config.sparse_cols)
    if sparse_cols:
        logger.debug('Sparse check')
        if api.config.reduce_categoricals_only:
            test_cols = [ot for ot in sparse_cols if df[ot].dtype == np.object]
        if api.config.sparse < 1:
            api.config.sparse = api.config.sparse * df.shape[0]
        for col in sparse_cols:
            if df[col].count() < api.config.sparse_freq:
                logger.debug('Threshold_count: Removed column {} (#values {})'.format(col, df[col].count()))
                df[col] = np.nan

    # removes columns with too many category values that could not be transposed
    att_dict['config']['max_cat_num'] = api.config.max_cat_num
    att_dict['config']['max_cat_num_cols'] = api.config.max_cat_num_cols
    max_cat_num_cols = tfp.read_list(api.config.max_cat_num_cols)
    if api.config.max_cat_num > 0 and max_cat_num_cols:
        drop_cols = list()
        for col in max_cat_num_cols:
            if df[col].dtype == np.object:
                if len(df[col].unique()) > api.config.max_cat_num:
                    drop_cols.append(col)
        df.drop(columns=drop_cols, inplace=True)

    # remove cols with only NaN
    att_dict['config']['drop_nan_columns'] = api.config.drop_nan_columns
    if api.config.drop_nan_columns:
        df.dropna(axis='columns', how='all', inplace=True)

    # remove rows with NaN except for dimension cols
    att_dict['config']['drop_nan_rows_cols'] = api.config.drop_nan_rows_cols
    drop_nan_rows_cols = tfp.read_list(api.config.drop_nan_rows_cols, df.columns)
    if drop_nan_rows_cols:
        prev_row_num = df.shape[0]
        df.dropna(subset=drop_nan_rows_cols, how='all', inplace=True)
        logger.debug('<drop_nan_rows_cols> deleted rows: {}/{}'.format(prev_row_num - df.shape[0], prev_row_num))

    # maps a certain value to NaN for all object type columns
    if tfp.read_value(api.config.fill_categoricals_nan):
        cat_cols = df.select_dtypes(include='object')
        for col in cat_cols:
            df[col].fillna(value=api.config.fill_categoricals_nan, inplace=True)

    # in construction: error-prone and ugly
    # if api.config.cut_obj_size > 0:
    #     cols_obj = df.select_dtypes(include='object')
    #     dict_mapping = dict()
    #     for col in cols_obj:
    #         if df[col].str.len().max() > api.config.cut_obj_size:
    #             catmap = dict(enumerate(df[col].unique()))
    #             valmap = {val: val[:api.config.cut_obj_size - 3] + '_' + str(cat) for cat, val in catmap.items()}
    #             if len(api.config.fill_categoricals_nan) > 0:
    #                 if api.config.fill_categoricals_nan in valmap.keys():
    #                     valmap[api.config.fill_categoricals_nan] = api.config.fill_categoricals_nan
    #             df[col] = df[col].map(valmap)  # problem
    #             df[col].str.replace(r'[,\.:;]', '')
    #     print(dict_mapping)

    if api.config.fill_numeric_nan_zero:
        cols_num = df.select_dtypes(include=np.number)
        for col in cols_num:
            df[col] = df[col].fillna(0.0)

    print('Cols: {} -> {} Rows: {} -> {}'.format(att_dict['prev_number_columns'], df.shape[1],
                                                 att_dict['prev_number_rows'], df.shape[0]))
    ###### end of doing calculation

    ##############################################
    # final infos to attributes and info message
    ##############################################
    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['operator'] = 'selectDataFrame'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])
    # end custom process definition

    log = log_stream.getvalue()
    return log, msg
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'groupby'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    prev_att = msg.attributes
    df = msg.body
    prev_shape = df.shape

    ###### start of doing calculation
    # groupby list
    cols = tfp.read_list(api.config.groupby)

    # mapping aggregation
    try:
        colagg = tfp.read_dict(api.config.aggregation)
    except IndexError:
        logger.info('Aggregation is not a map, try to parse a value instead')
        colagg = tfp.read_value(api.config.aggregation)

    # groupby
    logger.info('Group columns: {}'.format(cols))
    logger.info('Aggregation: {}'.format(colagg))
    logger.info('Index: {}'.format(api.config.index))
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop columns
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols:
        logger.info('Drop columns: {}'.format(dropcols))
        df.drop(columns=dropcols, inplace=True)
    # end custom process definition

    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(left_msg, right_msg): att_dict = left_msg.attributes att_dict['operator'] = 'join' if api.config.debug_mode == True: logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG') else: logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO') logger.info("Process started") time_monitor = tp.progress() # start custom process definition l_att = left_msg.attributes r_att = right_msg.attributes # read stream from memory left_df = left_msg.body right_df = right_msg.body ###### start of doing calculation how = tfp.read_value(api.config.how) # merge according to config if api.config.on_index: df = pd.merge(left_df, right_df, how=how, left_index=True, right_index=True) elif api.config.left_on and api.config.right_on: left_on_list = tfp.read_list(api.config.left_on) right_on_list = tfp.read_list(api.config.right_on) logger.info('Join DataFrames on {} - {}'.format( left_on_list, right_on_list)) left_df.reset_index(inplace=True) right_df.reset_index(inplace=True) df = pd.merge(left_df, right_df, how=how, left_on=left_on_list, right_on=right_on_list) # removing second index - might be a more elegant solution if 'index_x' in df.columns: df.drop(columns=['index_x'], inplace=True) else: raise ValueError( "Config setting: Either <on> or both <left_on> and <right_on> has to be set in order to join the dataframes" ) index_list = tfp.read_list(api.config.new_indices) if index_list: df.set_index(keys=index_list, inplace=True) logger.info('Set index: {}'.format(index_list)) col_list = tfp.read_list(api.config.drop_columns) if col_list: df.drop(labels=col_list, axis=1, inplace=True) logger.info('Drop columns: {}'.format(col_list)) # end custom process definition if df.empty: raise ValueError('DataFrame is empty') logger.debug('Columns: {}'.format(str(df.columns))) logger.debug('Shape (#rows - #columns): {} - {}'.format( df.shape[0], df.shape[1])) logger.debug('Memory: {} kB'.format( df.memory_usage(deep=True).sum() / 1024**2)) example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0] for i in range(0, example_rows): logger.debug('Row {}: {}'.format( i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()]))) progress_str = '>BATCH ENDED<' if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict: if not att_dict['storage.fileIndex'] + 1 == att_dict[ 'storage.fileCount']: progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount']) att_dict['process_list'].append(att_dict['operator']) logger.debug('Past process steps: {}'.format(att_dict['process_list'])) logger.debug('Process ended: {} - {} '.format( progress_str, time_monitor.elapsed_time())) return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(msg): att_dict = dict() att_dict['config'] = dict() att_dict['operator'] = 'fromCSV' logger, log_stream = slog.set_logging(att_dict['operator']) if api.config.debug_mode == True: logger.setLevel('DEBUG') logger.debug("Process started") global result_df att_dict['filename'] = msg.attributes["storage.filename"] logger.info('Filename: {} index: {} count: {} endofSeq: {}'.format(msg.attributes["storage.filename"], \ msg.attributes["storage.fileIndex"], \ msg.attributes["storage.fileCount"], \ msg.attributes["storage.endOfSequence"])) # using file name from attributes of ReadFile if not api.config.df_name or api.config.df_name == "DataFrame": att_dict['name'] = att_dict['filename'].split(".")[0] if isinstance(msg.body, str): csv_io = io.StringIO(msg.body) logger.debug("Input format: <string>") elif isinstance(msg.body, bytes): csv_io = io.BytesIO(msg.body) logger.debug("Input format: <bytes>") elif isinstance(msg.body, io.BytesIO): logger.debug("Input format: <io.Bytes>") csv_io = msg.body else: raise TypeError('Message body has unsupported type' + str(type(msg.body))) # nrows nrows = None if not api.config.limit_rows == 0: nrows = api.config.limit_rows # usecols att_dict['config']['use_columns'] = api.config.use_columns use_cols = tfp.read_list(api.config.use_columns) # dtypes mapping att_dict['config']['dtypes'] = api.config.dtypes typemap = tfp.read_dict(api.config.dtypes) kwargs = tfp.read_dict(text=api.config.keyword_args, map_sep='=') ##### Read string from buffer logger.debug("Read from input") df = pd.read_csv(csv_io, api.config.separator, usecols=use_cols, dtype=typemap, decimal=api.config.decimal, \ nrows=nrows, **kwargs) # Data from filename if api.config.data_from_filename and not api.config.data_from_filename == 'None': col = api.config.data_from_filename.split(':')[0].strip().strip( "'").strip('"') pat = api.config.data_from_filename.split(':')[1].strip().strip( "'").strip('"') logger.debug('Filename: {} pattern: {}'.format( att_dict['filename'], pat)) try: dataff = re.match('.*(\d{4}-\d+-\d+).*', att_dict['filename']) df[col] = dataff.group(1) except AttributeError: raise ValueError( 'Pattern not found - Filename: {} pattern: {}'.format( att_dict['filename'], pat)) # To Datetime if api.config.todatetime and not api.config.todatetime == 'None': coldate = api.config.todatetime.split(':')[0].strip().strip("'").strip( '"') dformat = api.config.todatetime.split(':')[1].strip().strip("'").strip( '"') df[coldate] = pd.to_datetime(df[coldate], format=dformat) ###### Downcasting # save memory footprint for calculating the savings of the downcast att_dict['previous_memory'] = df.memory_usage(deep=True).sum() / 1024**2 if api.config.downcast_int: df, dci = downcast(df, 'int', 'unsigned') if api.config.downcast_float: df, dcf = downcast(df, 'float', 'float') # check if index is provided and set index_list = tfp.read_list(api.config.index_cols) att_dict['config']['index_cols'] = str(index_list) att_dict['index_cols'] = str(index_list) if index_list: df.set_index(index_list, inplace=True) # stores the result in global variable result_df if msg.attributes['storage.fileIndex'] == 0: logger.debug('Added to DataFrame: {}'.format(att_dict['filename'])) result_df = df else: result_df = pd.concat([result_df, df], axis=0, sort=False) ############################################## # final infos to attributes and info message ############################################## att_dict['memory'] = result_df.memory_usage(deep=True).sum() / 1024**2 att_dict['columns'] = list(result_df.columns) 
    att_dict['dtypes'] = {col: str(ty) for col, ty in df.dtypes.to_dict().items()}
    att_dict['shape'] = result_df.shape
    att_dict['id'] = str(id(result_df))

    logger.debug('Columns: {}'.format(str(result_df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(result_df.shape[0], result_df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))
    example_rows = EXAMPLE_ROWS if result_df.shape[0] > EXAMPLE_ROWS else result_df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(i)[:10].ljust(10) for i in result_df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))
    # end custom process definition

    msg = api.Message(attributes=att_dict, body=result_df)
    log = log_stream.getvalue()
    return log, msg
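# Hypothetical sketch of the downcast helper referenced above (the original
# implementation is not shown in this section): downcast all columns of a given
# kind with pd.to_numeric and report the memory saved in MB.
import pandas as pd

def downcast(df: pd.DataFrame, include: str, downcast_to: str):
    cols = df.select_dtypes(include=[include]).columns
    before = df[cols].memory_usage(deep=True).sum() / 1024 ** 2
    for col in cols:
        df[col] = pd.to_numeric(df[col], downcast=downcast_to)
    saved = before - df[cols].memory_usage(deep=True).sum() / 1024 ** 2
    return df, saved

# usage analogous to the operator code:
# df, dci = downcast(df, 'int', 'unsigned')
# df, dcf = downcast(df, 'float', 'float')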
def process(msg): logger, log_stream = slog.set_logging('DEBUG') # start custom process definition prev_att = msg.attributes df = msg.body if not isinstance(df, pd.DataFrame): logger.error('Message body does not contain a pandas DataFrame') raise TypeError('Message body does not contain a pandas DataFrame') att_dict = dict() att_dict['config'] = dict() ###### start of doing calculation # segment columns att_dict['config']['segment_cols'] = api.config.segment_cols segment_cols = tfp.read_list(api.config.segment_cols) # regression columns att_dict['config']['regression_cols'] = api.config.regression_cols regression_cols = tfp.read_list(api.config.regression_cols) if not regression_cols: logger.error('No Regression Columns - mandatory data') raise ValueError('No Regression Columns - mandatory data') # prediction column att_dict['config']['prediction_col'] = api.config.prediction_col prediction_col = tfp.read_value(api.config.prediction_col) if not prediction_col: raise ValueError('No Predicition Column - mandatory data') training_cols = regression_cols + [prediction_col] model = LinearRegression(fit_intercept=True) def fit(x): model.fit(x[regression_cols], x[prediction_col]) return pd.Series([model.coef_, model.intercept_], index=['coef', 'intercept']) if segment_cols: coef_df = df.groupby(segment_cols)[training_cols].apply( fit).reset_index() else: model.fit(df[regression_cols], df[prediction_col]) coef_df = pd.Series([model.coef_, model.intercept_], index=['coef', 'intercept']) ############################################## # final infos to attributes and info message ############################################## if df.empty: raise ValueError('DataFrame is empty') att_dict['operator'] = 'regressionTrainingDataFrame' att_dict['name'] = prev_att['name'] att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2 att_dict['columns'] = str(list(df.columns)) att_dict['number_columns'] = df.shape[1] att_dict['number_rows'] = df.shape[0] example_rows = EXAMPLE_ROWS if att_dict[ 'number_rows'] > EXAMPLE_ROWS else att_dict['number_rows'] for i in range(0, example_rows): att_dict['row_' + str(i)] = str( [str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()]) # end custom process definition log = log_stream.getvalue() coef_att = { 'segmentation_columns': segment_cols, 'regression_columns': regression_cols, 'prediction_column': prediction_col } msg_coef = api.Message(attributes=coef_att, body=coef_df) msg_data = api.Message(attributes=att_dict, body=df) return log, msg_coef, msg_data
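# Illustrative sketch (synthetic data, hypothetical column names): how the
# segment-wise regression above yields one coefficient set per segment via
# groupby().apply() on the training columns.
import pandas as pd
from sklearn.linear_model import LinearRegression

data = pd.DataFrame({'segment': ['a', 'a', 'a', 'b', 'b', 'b'],
                     'x': [1, 2, 3, 1, 2, 3],
                     'y': [2, 4, 6, 1, 2, 3]})
model = LinearRegression(fit_intercept=True)

def fit(g):
    # fit one model per segment and return its parameters as a Series
    model.fit(g[['x']], g['y'])
    return pd.Series([model.coef_, model.intercept_], index=['coef', 'intercept'])

coef_df = data.groupby('segment')[['x', 'y']].apply(fit).reset_index()
print(coef_df)   # one row per segment with its coefficient array and intercept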
def process(msg):
    logger, log_stream = slog.set_logging('word_indexing', loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    articles = msg.body
    word_index = list()
    # as table
    for article in articles:
        word_index.extend([[article[0], article[1], article[2], w[0], w[1]] for w in article[3]])

    attributes = {
        "table": {
            "columns": [
                {"class": "string", "name": "HASH_TEXT", "nullable": True, "type": {"hana": "INTEGER"}},
                {"class": "string", "name": "LANGUAGE", "nullable": True, "size": 2, "type": {"hana": "NVARCHAR"}},
                {"class": "string", "name": "TYPE", "nullable": True, "size": 1, "type": {"hana": "NVARCHAR"}},
                {"class": "string", "name": "WORD", "nullable": True, "size": 80, "type": {"hana": "NVARCHAR"}},
                {"class": "string", "name": "COUNT", "nullable": True, "type": {"hana": "INTEGER"}}
            ],
            "name": "DIPROJECTS.WORD_INDEX3",
            "version": 1
        }
    }

    logger.debug('Process ended, articles processed {} - {} '.format(len(articles), time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())
    msg = api.Message(attributes=attributes, body=word_index)
    api.send(outports[1]['name'], msg)
def process(msg): global blacklist global last_msg global word_counter global hash_list logger, log_stream = slog.set_logging('word_frequency', loglevel=api.config.debug_mode) logger.info("Process started") time_monitor = tp.progress() msg = check_for_setup(logger, msg) if not msg: api.send(outports[0]['name'], log_stream.flush()) return 0 adict = msg.body att_dict = msg.attributes end_date = datetime.strptime(api.config.date, '%Y-%m-%d') start_date = end_date - timedelta(days=api.config.days_into_past) language_filter = tfp.read_value(api.config.language) media_filter = tfp.read_value(api.config.media) for index_article, article in enumerate(adict): # filter article adate = datetime.strptime(article['date'], '%Y-%m-%d') if not start_date <= adate <= end_date: #logger.debug('Date of article out of range: {} ({} - {})'.format(adate,start_date,end_date)) continue # filter language if language_filter and not language_filter == language_dict[ article['media']]: #logger.debug('Language filtered out: {} ({})'.format(language_dict[article['media']], language_filter)) continue # filter media if media_filter and not media_filter == article['media']: #logger.debug('Media filtered out: {} ({})'.format(article['media'], media_filter)) continue # check if article has been processed if article['hash_text'] in hash_list: logger.debug( 'Article has already been processed: {} - {} - {}'.format( article['date'], article['media'], article['hash_text'])) word_counter.update(hash_list[article['hash_text']]) continue language = language_dict[article['media']] text = article['text'] # Language settings if language == 'G': doc = nlp_g(text) elif language == 'F': doc = nlp_fr(text) elif language == 'S': doc = nlp_es(text) else: logger.warning('Language not implmented') doc = None words = [] # only when doc has been created - language exists if doc: if api.config.mode == 'NOUN': words = [ token.lemma_[:api.config.max_word_len] for token in doc if token.pos_ in ['PROPN', 'NOUN'] ] elif api.config.mode == 'PROPN': words = [ token.lemma_[:api.config.max_word_len] for token in doc if token.pos_ == 'PROPN' ] else: words = [ token.text[:api.config.max_word_len] for token in doc if not token.is_stop ] word_counter.update(words) hash_list[article['hash_text']] = words if api.config.limit > 0: common_words = word_counter.most_common(api.config.limit) result, progress_str = test_last_batch(attributes=att_dict, collect=api.config.collect) if result: word_freq = [ {'date':api.config.date,'days_into_past':api.config.days_into_past, 'language':api.config.language, \ 'media':api.config.media,'mode':api.config.mode, 'word':w, 'frequency': f} for w,f in common_words ] msg = api.Message(attributes=att_dict, body=word_freq) api.send(outports[2]['name'], msg) if api.config.json_string_output: json_data = json.dumps(word_freq) api.send(outports[1]['name'], msg) logger.debug('Process ended, {} - {} '.format( progress_str, time_monitor.elapsed_time())) api.send(outports[0]['name'], log_stream.getvalue())
def process(db_msg): logger, log_stream = slog.set_logging('topic_identification', loglevel=api.config.debug_mode) logger.info("Process started") time_monitor = tp.progress() columns = [c['name'] for c in db_msg.attributes['table']['columns']] df = pd.DataFrame(db_msg.body, columns=columns) # Language filter language_filter = tfp.read_list(api.config.language_filter) if language_filter: df = df.loc[df["LANGUAGE"].isin(language_filter)] else: language_filter = list(df['LANGUAGE'].unique()) logger.info('Languages : {}'.format(language_filter)) # Word type filter word_type_filter = tfp.read_value(api.config.word_type_filter) if word_type_filter: types = [c for c in word_type_filter] df = df.loc[df["TYPE"].isin(types)] logger.info('Word restricted to types : {}'.format(word_type_filter)) # groupby and concatenate words gdf = df.groupby(by=['HASH_TEXT', 'LANGUAGE'])['WORD'].apply( lambda x: ' '.join(x)).reset_index() logger.info('Topic identification: ') for lang in language_filter: logger.info('Language: {} #Documents: {} #Words: {}'.format(lang,gdf.loc[gdf['LANGUAGE']==lang].shape[0],\ df.loc[df['LANGUAGE'] == lang].shape[0])) api.send(outports[0]['name'], log_stream.getvalue()) log_stream.seek(0) # create document-term matrix - no tokenization or text prep are needed tf_vectorizer = CountVectorizer(analyzer='word', min_df=1, lowercase=False, tokenizer=str.split) # tf means term-frequency in a document for each language date_today = str(date.today()) # 2-array with TOPIC, LANGUAGE, TYPE, DATE, EXPIRY_DATE, ATTRIBUTE, KEYWORD_i (num of topics) topic_list = list() for lang in language_filter: logger.info('Process all texts for language: {}'.format(lang)) lang_gdf = gdf.loc[gdf['LANGUAGE'] == lang] dtm_tf = tf_vectorizer.fit_transform(lang_gdf['WORD']) # for tf dtm lda_tf = LatentDirichletAllocation(n_components=api.config.num_topics, learning_method='online', evaluate_every=-1, n_jobs=-1) lda_tf.fit(dtm_tf) feature_names = tf_vectorizer.get_feature_names() for i, topic in enumerate(lda_tf.components_): topic_words = [ feature_names[f] for f in topic.argsort()[:-api.config.topic_num_words - 1:-1] ] logger.debug('Len: {} topic_words:{}'.format( len(topic_words), topic_words)) row = [ date_today + "-" + str(i), lang, 'ALGO', date_today, None, None ] + topic_words topic_list.append(row) attributes = { "table": { "columns": [{ "class": "string", "name": "TOPIC", "nullable": False, "size": 80, "type": { "hana": "NVARCHAR" } }, { "class": "string", "name": "LANGUAGE", "nullable": False, "size": 2, "type": { "hana": "NVARCHAR" } }, { "class": "string", "name": "TYPE", "nullable": False, "size": 10, "type": { "hana": "NVARCHAR" } }, { "class": "string", "name": "DATE", "nullable": True, "type": { "hana": "DATE" } }, { "class": "string", "name": "EXPIRY_DATE", "nullable": True, "type": { "hana": "DATE" } }, { "class": "string", "name": "ATTRIBUTE", "nullable": True, "size": 25, "type": { "hana": "NVACHAR" } }], "name": "DIPROJECTS.WORD_INDEX", "version": 1 } } for i in range(1, api.config.topic_num_words + 1): attributes['table']['columns'].append({ "class": "string", "name": "KEYWORD_" + str(i), "nullable": True, "size": 80, "type": { "hana": "NVARCHAR" } }) msg = api.Message(attributes=attributes, body=topic_list) logger.debug('Process ended, topics processed {}'.format( time_monitor.elapsed_time())) api.send(outports[0]['name'], log_stream.getvalue()) api.send(outports[1]['name'], msg)
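# Tiny illustration (synthetic documents): the document-term-matrix and LDA
# steps used above. The operator calls get_feature_names(), which newer
# scikit-learn releases renamed to get_feature_names_out(); the newer name is
# used here.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

docs = ['wahl regierung partei', 'regierung koalition wahl', 'fussball spiel tor']
tf_vectorizer = CountVectorizer(analyzer='word', min_df=1, lowercase=False, tokenizer=str.split)
dtm_tf = tf_vectorizer.fit_transform(docs)
lda_tf = LatentDirichletAllocation(n_components=2, learning_method='online', evaluate_every=-1, n_jobs=-1)
lda_tf.fit(dtm_tf)
feature_names = tf_vectorizer.get_feature_names_out()
for i, topic in enumerate(lda_tf.components_):
    print(i, [feature_names[f] for f in topic.argsort()[:-4:-1]])   # top 3 words per topic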
def process(test_msg, base_msg): logger, log_stream = slog.set_logging('DEBUG') # start custom process definition test_att = test_msg.attributes base_att = base_msg.attributes att_dict = dict() if test_att['name'] == base_att['name']: att_dict['name'] = test_att['name'] else: att_dict['name'] = test_att['name'] + '-' + base_att['name'] att_dict['config'] = dict() att_dict['config']['test_index'] = api.config.test_index testdf_index = tfp.read_value(api.config.test_index) if not testdf_index: logger.error('Index of test data is mandatory') raise ValueError('Index of test data is mandatory') att_dict['number_rows'] = str(base_msg.body.shape[0]) # get the columns to check mapping = tfp.read_dict(api.config.check_columns) df = pd.DataFrame() if mapping: att_dict['config']['check_columns'] = str(mapping) att_dict['config']['limit'] = api.config.limit # read stream from memory test_df = test_msg.body # test if all mapping cols in testdf checkcols = [ elem in list(test_df.columns) for elem in list(mapping.keys()) ] if not all(checkcols): error_txt = 'Elements in mapping are not contained in columns of test df : ' + \ str(list(mapping.keys())) + '-' + str(list(test_df.columns)) + ' - ' + str(checkcols) logger.error(error_txt) raise ValueError(error_txt) if not testdf_index in test_df.columns: logger.error('Test index needs to be column') raise ValueError('Test index needs to be column') tcols = ['t_' + c for c in list(mapping.keys())] tdf = pd.DataFrame(columns=tcols) df = base_msg.body df = pd.concat([df, tdf], axis=1) num_cols = len(mapping) # run over all left df rows to test in right_df for index, test_row in test_df.iterrows(): # apply function def get_ratio(row): sc = 0 for tcol, bcol in mapping.items(): sc = sc + fuzz.token_sort_ratio(test_row[tcol], row[bcol]) return sc / num_cols df['tscore'] = df.apply(get_ratio, axis=1) # get best matching and store index in v_dict max_score = df['tscore'].max() if max_score >= api.config.limit: mask = (df['tscore'] == max_score) df.loc[mask, 'score'] = max_score df.loc[mask, 'external_id'] = test_row[testdf_index] for coli in mapping: df.loc[mask, 't_' + coli] = test_row[coli] df.drop(columns=['tscore'], inplace=True) # remove external_id when test column value has none t_cols = ['t_' + t for t in mapping.keys()] + ['external_id', 'score'] for bcol in mapping.values(): mask = df[bcol].isna() df.loc[mask, t_cols] = np.nan if api.config.only_index: df = df[list(base_msg.body.columns) + ['external_id']] att_dict['config']['only_index'] = api.config.only_index if api.config.only_matching_rows: df = df.loc[~df['score'].isna()] att_dict['config'][ 'only_matching_rows'] = api.config.only_matching_rows basedf_index = tfp.read_value(api.config.base_index) att_dict['config']['base_index'] = basedf_index if api.config.joint_id: if not basedf_index: raise ValueError( "For <joint_id> a value for <base_index> is necessary ") df.loc[~df['external_id'].isna(), 'joint_id'] = df.loc[~df['external_id'].isna(), 'external_id'] df.loc[df['external_id'].isna(), 'joint_id'] = df.loc[df['external_id'].isna(), basedf_index] att_dict['config']['joint_id'] = api.config.joint_id if api.config.add_non_matching: # test if same columns if not all( [elem in test_df.columns for elem in base_msg.body.columns]): raise ValueError("Adding test dataframe only possible when having same columns " + str(test_df.columns) \ + ' vs. 
' + str(base_msg.body.columns)) matched_ids = df['external_id'].unique() addto_df = test_df.loc[~test_df[testdf_index].isin(matched_ids )].copy() addto_df['joint_id'] = addto_df[testdf_index] df = pd.concat([df, addto_df], axis=0, sort=False) att_dict['config']['add_non_matching'] = api.config.add_non_matching else: logger.warning('No columns to check') ############################################## # final infos to attributes and info message ############################################## if df.empty: logger.warning('DataFrame is empty') else: att_dict['operator'] = 'fuzzyjoinDataFrames' att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2 att_dict['columns'] = str(list(df.columns)) att_dict['number_columns'] = df.shape[1] att_dict['number_rows'] = df.shape[0] if 'id' in base_att.keys(): att_dict['id'] = base_att['id'] + '; ' + att_dict[ 'operator'] + ': ' + str(id(df)) else: att_dict['id'] = att_dict['operator'] + ': ' + str(id(df)) example_rows = EXAMPLE_ROWS if att_dict[ 'number_rows'] > EXAMPLE_ROWS else att_dict['number_rows'] for i in range(0, example_rows): att_dict['row_' + str(i)] = str( [str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()]) # end custom process definition log = log_stream.getvalue() msg = api.Message(attributes=att_dict, body=df) return log, msg
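# Illustrative sketch (column names and values are hypothetical): the per-row
# score computed above is the mean fuzz.token_sort_ratio over the mapped column
# pairs, assuming the fuzz module from fuzzywuzzy (or rapidfuzz) imported by the
# operator elsewhere.
from fuzzywuzzy import fuzz

mapping = {'company': 'name', 'town': 'city'}            # e.g. parsed from api.config.check_columns
test_row = {'company': 'ACME Corp.', 'town': 'Berlin'}
base_row = {'name': 'Acme Corporation', 'city': 'Berlin'}

score = sum(fuzz.token_sort_ratio(test_row[t], base_row[b])
            for t, b in mapping.items()) / len(mapping)
print(score)   # rows whose best score reaches api.config.limit are treated as matches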
def process(msg): global blacklist global last_msg global hash_list logger, log_stream = slog.set_logging('word_extraction', loglevel=api.config.debug_mode) logger.info("Process started. Logging level: {}".format(logger.level)) time_monitor = tp.progress() msg = check_for_setup(logger, msg, mode=api.config.use_blacklist) if not msg: api.send(outports[0]['name'], log_stream.flush()) return 0 adict = msg.body language_filter = tfp.read_value(api.config.language) mode = tfp.read_value(api.config.mode) if not mode or not any(m in mode for m in supported_modes): raise Exception( 'Mode is mandatory parameter and valid values are: {}'.format( supported_modes)) article_words = list() article_count = 0 for index_article, article in enumerate(adict): language = language_dict[article['media']] # filter language if language_filter and not language_filter == language: #logger.debug('Language filtered out: {} ({})'.format(language, language_filter)) continue article_count += 1 # check if article has been processed if article['hash_text'] in hash_list: logger.debug( 'Article has already been processed: {} - {} - {}'.format( article['date'], article['media'], article['hash_text'])) continue hash_list.append(article['hash_text']) text = article['text'] # might interfer with #text = re.sub(r'\d+', '', text.lower()) #text = re.sub(r'\b[a-z]\b', '', text) # Language settings if language == 'DE': doc = nlp_g(text) elif language == 'FR': doc = nlp_fr(text) elif language == 'ES': doc = nlp_es(text) else: logger.warning('Language not implemented: {}'.format(language)) doc = None words = dict() # only when doc has been created - language exists if doc: if 'P' in api.config.mode: words['P'] = [ token.lemma_[:api.config.max_word_len] for token in doc if token.pos_ == 'PROPN' ] if 'N' in api.config.mode: words['N'] = [ token.lemma_[:api.config.max_word_len] for token in doc if token.pos_ == 'NOUN' ] if 'X' in api.config.mode: words['X'] = [ token.text[:api.config.max_word_len] for token in doc if not token.is_stop ] for m in words: # heuristics # remove preceding non-alpha characters words[m] = [re.sub('^[-\'\./]', '', w) for w in words[m]] # remove trailing non-alpha characters words[m] = [re.sub('[-\./]$', '', w) for w in words[m]] # minimum word length words[m] = [ w for w in words[m] if len(w) >= api.config.min_word_len ] # remove date like words words[m] = [w for w in words[m] if not re.findall('\d+\.\d+', w)] # Remove blacklist words if api.config.use_blacklist: words[m] = [w for w in words[m] if w not in blacklist] if api.config.counter: article_words.append([ article['hash_text'], language, m, collections.Counter(words[m]).most_common() ]) else: article_words.append( [article['hash_text'], language, m, words[m]]) attributes = { "table": { "columns": [{ "class": "string", "name": "HASH_TEXT", "nullable": True, "type": { "hana": "INTEGER" } }, { "class": "string", "name": "LANGUAGE", "nullable": True, "size": 2, "type": { "hana": "NVARCHAR" } }, { "class": "string", "name": "TYPE", "nullable": True, "size": 1, "type": { "hana": "NVARCHAR" } }, { "class": "string", "name": "WORDS", "nullable": True, "type": { "hana": "ARRAY" } }], "name": "DIPROJECTS.WORD_INDEX3", "version": 1 }, "storage.filename": msg.attributes["storage.filename"] } attributes['counter'] = 'Y' if api.config.counter else 'N' table_msg = api.Message(attributes=attributes, body=article_words) logger.info('File processed: {} #Articles: {} '.format( msg.attributes["storage.filename"], len(adict))) api.send(outports[0]['name'], 
             log_stream.getvalue())
    api.send(outports[1]['name'], table_msg)
def process(msg): att_dict = dict() att_dict['config'] = dict() att_dict['operator'] = 'categorical2exist' if api.config.debug_mode == True: logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG') else: logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO') logger.info("Process started") time_monitor = tp.progress() logger.info('Start Process') time_monitor.get_start_time() df = msg.body prev_cat_col = len(df.select_dtypes(np.object).columns) columns = tfp.read_list(api.config.columns, df.columns, test_number=False) info_only = api.config.info_only equal_only = api.config.equal_only threshold = api.config.threshold upper_threshold = api.config.upper_threshold num_values = api.config.num_values logger.debug('PARAMETER threshold: {} upper_threshold: {} num_values: {} Modification: {}'\ .format(threshold,upper_threshold,num_values,info_only)) transform_data = { 'column': [], 'dtype': [], 'unique_values': [], 'action': [] } for col in df[columns].select_dtypes(np.object).columns: unique_vals = df.loc[df[col].notnull(), col].unique() if (len(unique_vals) == num_values) or (len(unique_vals) <= num_values and not equal_only): population = df[col].count() / df.shape[0] if population > upper_threshold and len(unique_vals) == 2: transform_data['column'].append(col) transform_data['dtype'].append(df[col].dtype) v0 = 0 v1 = 1 if df.loc[df[col] == unique_vals[0], col].count() > df.shape[0] * 0.5: v0 = 1 v1 = 0 # per definition first unique value 0, second unique value 1 if v0 == 0: transform_data['unique_values'].append(unique_vals) else: transform_data['unique_values'].append( [unique_vals[1], unique_vals[0]]) transform_data['action'].append('map2') # print('{}: {} -> {}'.format(vals[0],df.loc[df[col]==vals[0],col].count(),v0)) # print('{}: {} -> {}'.format(vals[1],df.loc[df[col]==vals[1],col].count(),v1)) if not info_only: df.loc[df[col] == unique_vals[0], col] = v0 df.loc[df[col] == unique_vals[1], col] = v1 df.loc[df[col].isnull(), col] = 0 df[col] = df[col].astype('int8') elif population < threshold or len(unique_vals) == 1: transform_data['column'].append(col) transform_data['dtype'].append(df[col].dtype) transform_data['unique_values'].append(unique_vals) transform_data['action'].append('map1') if not info_only: df.loc[df[col].isin(unique_vals), col] = 1 df.loc[df[col].isnull(), col] = 0 df[col] = df[col].astype('int8') logger.info('End of Process: {}'.format(time_monitor.elapsed_time())) att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2 att_dict['columns'] = str(list(df.columns)) att_dict['shape'] = df.shape att_dict['id'] = str(id(df)) logger.debug('Columns: {}'.format(str(df.columns))) logger.debug('Shape (#rows - #columns): {} - {}'.format( df.shape[0], df.shape[1])) logger.debug('Memory: {} kB'.format(att_dict['memory'])) cat_cols = len(df.select_dtypes(np.object).columns) logger.info('Categoricals to Numeric: {} - {} = {}'.format( prev_cat_col, cat_cols, prev_cat_col - cat_cols)) return log_stream.getvalue(), api.Message(attributes={'name':'filter_by_population','type':'DataFrame'},body=df),\ api.Message(attributes={'name':'transformation','type':'DataFrame'},body=pd.DataFrame(transform_data))
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'filter_by_population'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_cols = df.shape[1]

    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold
    logger.debug('Parameter Threshold: {} Data Modification: {}'.format(threshold, info_only))

    transform_data = {'column': [], 'dtype': [], 'unique_vals': [], 'action': []}
    for col in columns:
        population = df[col].count() / df.shape[0]
        unique_vals = df[col].unique()
        if population < threshold and not (len(unique_vals) == 1 and np.isnan(unique_vals[0])):
            unique_vals = df[col].unique()
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_vals'].append(unique_vals)
            transform_data['action'].append('drop')
            if not info_only:
                df.drop(columns=[col], inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att_dict['storage.fileIndex'] + 1, att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df), \
           api.Message(attributes={'name': 'transformation', 'type': 'DataFrame'}, body=pd.DataFrame(transform_data))
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'sample'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')

    # start custom process definition
    # test if body refers to a DataFrame type
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        logger.error('Message body does not contain a pandas DataFrame')
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start calculation
    sample_size = api.config.sample_size
    if sample_size < 1:
        sample_size = int(sample_size * df.shape[0])
        if sample_size < 1:
            sample_size = 1
            logger.warning("Fraction of sample size too small. Set sample size to 1.")
    elif sample_size > df.shape[0]:
        logger.warning("Sample size larger than number of rows")
    logger.debug("Sample size: {}/{} ({})".format(sample_size, df.shape[0], sample_size / df.shape[0]))

    random_state = api.config.random_state
    invariant_column = tfp.read_value(api.config.invariant_column)
    if invariant_column and sample_size < df.shape[0]:
        # get the average number of records for each value of the invariant column
        sc_df = df.groupby(invariant_column)[invariant_column].count()
        sample_size_invariant = int(sample_size / sc_df.mean())
        sample_size_invariant = 1 if sample_size_invariant == 0 else sample_size_invariant  # ensure minimum
        sc_df = sc_df.sample(n=sample_size_invariant, random_state=random_state).to_frame()
        sc_df.rename(columns={invariant_column: 'sum'}, inplace=True)
        # sample the df by merging the two dataframes
        df = pd.merge(df, sc_df, how='inner', right_index=True, left_on=invariant_column)
        df.drop(columns=['sum'], inplace=True)
    else:
        df = df.sample(n=sample_size, random_state=random_state)
    ###### end calculation

    ##############################################
    # final infos to attributes and info message
    ##############################################
    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
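# Minimal sketch of the invariant-column sampling idea used above (frame and
# column names are illustrative): whole groups are kept or dropped together so
# that rows sharing the invariant value stay in the sample as a unit.
import pandas as pd

orders = pd.DataFrame({'order_id': [1, 1, 2, 2, 3], 'item': list('abcde')})
sample_size, random_state = 2, 1
counts = orders.groupby('order_id')['order_id'].count()
n_groups = max(1, int(sample_size / counts.mean()))
picked = counts.sample(n=n_groups, random_state=random_state).to_frame('sum')
sampled = pd.merge(orders, picked, how='inner',
                   left_on='order_id', right_index=True).drop(columns=['sum'])
print(sampled)   # all rows belonging to the sampled order_id values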
def process(msg1, msg2, msg3, msg4, msg5): adict = msg1.attributes msg_list = [msg1, msg2, msg3, msg4, msg5] logger, log_stream = slog.set_logging('scrapy', loglevel='DEBUG') logger.info("Process started") time_monitor = tp.progress() # logger, log_stream = slog.set_logging('scrapy',loglevel=api.config.debug_mode) scrapy_dir = tfp.read_value(api.config.scrapy_dir) if not scrapy_dir: logger.error('Scrapy direcory mandatory entry field') raise ValueError('Missing Scrapy Directory') logger.info('Change directory to: {}'.format(scrapy_dir)) os.chdir(scrapy_dir) project_dir = tfp.read_value(api.config.project_dir) if not project_dir: logger.error('Scrapy project direcory mandatory entry field') raise ValueError('Missing Scrapy Project Directory') project_dir = os.path.join(scrapy_dir,project_dir) new_file_list = [] for msg in msg_list: filename = os.path.basename(msg.attributes["file"]["path"]) if filename == 'spider.py': filename = os.path.join(project_dir, 'spiders', filename) else: filename = os.path.join(project_dir, filename) # copy files to directories try: with open(filename, 'wb') as fout: logger.info('Write to filename (binary): {}'.format(filename)) fout.write(msg.body) fout.close() except IOError: logger.warning('File not found: {}'.format(filename)) logger.debug('Current directory: {}'.format(os.getcwd())) f = [] for (dirpath, dirnames, filenames) in os.walk('/home/onlinemedia/'): f.extend(filenames) break logger.debug('Files under directory onlinemedia: {}'.format(f)) api.send(outports[0]['name'], log_stream.getvalue()) time.sleep(5) exit(-1) new_file_list.append(filename) for f in new_file_list: if os.path.isfile(filename): logger.info('File successfully written: {} ({})'.format(filename, time.ctime(os.path.getmtime(filename)))) else: logger.error('File does not exist: {}'.format(filename)) api.send(outports[0]['name'], log_stream.getvalue()) log_stream.seek(0) log_stream.truncate(0) spiderlist = tfp.read_list(api.config.spider) num_spiders = len(spiderlist) num_batches = 0 num_all_articles = 0 for i, spider in enumerate(spiderlist) : media = spider.split('_')[0] today_date = datetime.today().strftime('%Y-%m-%d') cmd = ['scrapy', 'crawl', spider] logger.info('Start scrapy: {} ({}/{})'.format(cmd,i,num_spiders)) #proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd = scrapy_dir,universal_newlines=True) proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=scrapy_dir,universal_newlines=True) #proc = subprocess.Popen(['python','/Users/Shared/data/onlinemedia/outputgen.py'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) #print('CWD: {}'.format(os.getcwd())) logger.info(proc.stderr) api.send(outports[0]['name'], log_stream.getvalue()) log_stream.seek(0) log_stream.truncate(0) count_articles = 0 articles_list = list() # run through stdout after scrape has ended and add to batch_output last_article = dict() for line in proc.stdout.splitlines(): adict = format_check_output(line, logger) if adict: adict['media'] = media adict['date'] = today_date articles_list.append(adict) last_article = adict count_articles += 1 # send result to outport if len(articles_list) == 0 : logger.warning('No articles found: {}'.format(media)) continue num_batches += 1 attributes = { k:v for k,v in last_article.items() if k in ['website','date','columns']} attributes['media'] = media if media in media_languages : attributes['language'] = media_languages[media] else : attributes['language'] = 'unknown' attributes['today_str'] = 
today_date
        attributes['month'] = datetime.today().strftime("%B")
        attributes['message.indexBatch'] = i
        attributes['message.countBatch'] = num_spiders
        attributes['message.lastBatch'] = True if i + 1 == num_spiders else False
        df = pd.DataFrame(articles_list)
        df = df.drop_duplicates(subset=['text_id'])
        df = df[['media', 'date', 'text_id', 'title', 'rubric', 'url', 'paywall', 'num_characters', 'text']]
        msg = api.Message(attributes=attributes, body=df)
        api.send(outports[1]['name'], msg)
        logger.info('Spider completed: {} - #articles: {}'.format(spider, count_articles))
        num_all_articles += count_articles

    logger.info('Process ended: {} Articles processed: {} '.format(time_monitor.elapsed_time(), num_all_articles))
    api.send(outports[0]['name'], log_stream.getvalue())
    return 0
operator_description = "Text Sentiment Analysis"
operator_description_long = "Text Sentiment Analysis using Textblob. "
add_readme = dict()

debug_mode = True
config_params['debug_mode'] = {
    'title': 'Debug mode',
    'description': 'Sending debug level information to log port',
    'type': 'boolean'
}

last_msg = None
id_set = set()

operator_name = 'sentiment analysis'
logger, log_stream = slog.set_logging(operator_name, loglevel=api.config.debug_mode)
logger.info("Process started")
time_monitor = tp.progress()


def get_sentiment(text, language):
    if isinstance(text, str):
        if language == 'DE':
            blob = TextBlobDE(text)
            return [blob.sentiment.polarity, blob.sentiment.subjectivity]
        elif language == 'FR':
            tb = Blobber(pos_tagger=PatternTaggerFR(), analyzer=PatternAnalyzerFR())
            blob = tb(text)
            return blob.sentiment
        else: