def clean_text_data(df):
    """
    Derive the normalized text columns of a DataFrame and return it.

    The frame is modified in place: ``merge_col`` is overwritten with its
    UTF-8 encoded values, and the columns ``merge_cols_simpl``,
    ``merge_col_3``, ``merge_col_4``, ``merge_col_5`` and ``merge_final``
    are added. ``df`` must already contain ``merge_col`` and
    ``brands_tags``.

    The actual text transformations are delegated to ``Normalizer``; see
    that class for what each step does to a value.

    :param df: DataFrame holding at least ``merge_col`` and ``brands_tags``.
    :return: the same DataFrame object, with the columns above set.
    """
    normalizer = Normalizer()

    def _encode_utf8(value):
        # Empty strings are passed through untouched; every other value is
        # encoded (so a non-string value such as NaN raises here).
        if value != '':
            return value.encode('utf-8')
        return value

    df['merge_col'] = df['merge_col'].map(_encode_utf8)

    # Full normalization first, then a second pass that removes repeated
    # words (the original comment notes that word order is kept).
    df['merge_cols_simpl'] = df['merge_col'].map(normalizer.end_to_end_normalize)
    df['merge_cols_simpl'] = df['merge_cols_simpl'].map(normalizer.clean_duplicate_string)

    # One extra column per prefix length, built from the simplified text.
    prefix_columns = (
        (3, 'merge_col_3'),
        (4, 'merge_col_4'),
        (5, 'merge_col_5'),
    )
    for width, column in prefix_columns:
        df[column] = df['merge_cols_simpl'].map(
            lambda text, width=width: normalizer.keep_first_letters(text, width)
        )

    # Space-join the simplified text, the prefix columns and the brand tags.
    # brands_tags is appended as stored; only its missing values are replaced
    # by ''. No de-duplication happens at this step.
    merged = df['merge_cols_simpl']
    for _, column in prefix_columns:
        merged = merged + ' ' + df[column]
    df['merge_final'] = merged + ' ' + df['brands_tags'].fillna('')

    return df
import pickle
import re

import boto3
import pandas as pd
import sklearn.metrics as m
from sklearn.model_selection import cross_val_score

from utilCsv import UnicodeWriter, Utils
from Code.Utils.utilNormalizer import Normalizer
from Code.db_access.dynamoDB import AccessDB
from tfidfClassification import Classification

# NOTE: everything below runs at import time and performs network calls to
# AWS (S3 downloads, DynamoDB resource creation).
# TODO : create context
a = AccessDB()
c = Classification()
n = Normalizer()
u = Utils()

# AWS Access
client = boto3.client('s3', region_name="eu-west-1")
s3 = boto3.resource('s3')

# Download and unpickle the two model dumps stored in the analytics bucket
# (presumably the fitted TF-IDF vectorizer and the random-forest classifier,
# judging by the key names -- confirm against the training script).
# SECURITY: pickle.loads executes arbitrary code contained in the payload.
# This is only acceptable as long as write access to the bucket is restricted
# to trusted producers.
tfidfp = client.get_object(Bucket='smartticket-analytics', Key='dumpTfIdf.pkl')
tf_idf_load_from_pickle = pickle.loads(tfidfp['Body'].read())

rfp = client.get_object(Bucket='smartticket-analytics', Key='dumpRf.pkl')
rf_load_from_pickle = pickle.loads(rfp['Body'].read())

# Handles on the DynamoDB table and on the S3 object 'classif_results.csv'.
table = boto3.resource('dynamodb').Table('prod-analytics-smarttickets')
classif_result = s3.Object(bucket_name='smartticket-analytics', key='classif_results.csv')