示例#1
0
    def clean_text_data(df):
        """Derive normalized text columns from ``df['merge_col']``.

        The frame is modified in place and is also returned. Columns written:
        ``merge_col`` (re-assigned after UTF-8 encoding), ``merge_cols_simpl``,
        ``merge_col_3``, ``merge_col_4``, ``merge_col_5`` and ``merge_final``.
        ``brands_tags`` is only read; it is appended un-normalized to
        ``merge_final`` with missing values replaced by ``''``.
        """
        normalizer = Normalizer()

        def _encode_non_empty(value):
            # Empty strings pass through untouched; anything else is encoded.
            # NOTE(review): a missing value (NaN) is != '' and would reach
            # .encode() here -- confirm callers fill 'merge_col' beforehand.
            if value != '':
                return value.encode('utf-8')
            return value

        df['merge_col'] = df['merge_col'].map(_encode_non_empty)

        # Two normalization passes over the same column. The second one is
        # presumably what removes repeated words while keeping their order
        # (behaviour of the Normalizer helpers is not visible from here).
        df['merge_cols_simpl'] = df['merge_col'].map(normalizer.end_to_end_normalize)
        df['merge_cols_simpl'] = df['merge_cols_simpl'].map(normalizer.clean_duplicate_string)

        # One extra column per prefix width: merge_col_3, merge_col_4, merge_col_5.
        for width in (3, 4, 5):
            df['merge_col_%d' % width] = df['merge_cols_simpl'].map(
                lambda text, width=width: normalizer.keep_first_letters(text, width)
            )

        # Concatenate the derived columns, separated by single spaces, then
        # append the raw brand tags (missing tags become an empty string).
        merged = df['merge_cols_simpl']
        for column in ('merge_col_3', 'merge_col_4', 'merge_col_5'):
            merged = merged + ' ' + df[column]
        df['merge_final'] = merged + ' ' + df['brands_tags'].fillna('')

        return df
示例#2
0
import pickle
import re

import boto3
import pandas as pd
import sklearn.metrics as m
from sklearn.model_selection import cross_val_score

from Code.db_access.dynamoDB import AccessDB
from Code.Utils.utilNormalizer import Normalizer
from tfidfClassification import Classification
from utilCsv import UnicodeWriter, Utils

# TODO : create context
# Module-level instances of the project helpers imported above.
a = AccessDB()
c = Classification()
n = Normalizer()
u = Utils()

# AWS Access
# Single source of truth for the S3 bucket used by every object below.
ANALYTICS_BUCKET = 'smartticket-analytics'

client = boto3.client('s3', region_name="eu-west-1")
s3 = boto3.resource('s3')

# SECURITY NOTE: pickle.loads() can execute arbitrary code contained in the
# payload. These objects are fetched from S3, so anyone able to write to the
# bucket can run code in this process -- keep write access to the bucket
# restricted, or switch to a safer serialization format.
# Requires `import pickle` at the top of the file.
# The two dumps are presumably a fitted TF-IDF transformer and a random-forest
# model (inferred from the key names only -- confirm against the code that
# produced them).
tfidfp = client.get_object(Bucket=ANALYTICS_BUCKET, Key='dumpTfIdf.pkl')
tf_idf_load_from_pickle = pickle.loads(tfidfp['Body'].read())
rfp = client.get_object(Bucket=ANALYTICS_BUCKET, Key='dumpRf.pkl')
rf_load_from_pickle = pickle.loads(rfp['Body'].read())

# Handles to the DynamoDB table and to the CSV object in the same bucket.
table = boto3.resource('dynamodb').Table('prod-analytics-smarttickets')
classif_result = s3.Object(bucket_name=ANALYTICS_BUCKET,
                           key='classif_results.csv')