def features_standardization(dataframe, fit=False):
    """
    Performs feature standardization using a Standard scaler.
    After standardization, features will have zero mean and unit standard deviation,
    changing the original distribution.
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with encoded data
        fit: boolean
            Indicates if we should train or load a scaler
    Returns
        dataframe: pd.DataFrame
            Dataframe with scaled features
    """
    # Train or load a scaler
    if fit:
        scaler = StandardScaler()
        scaler.fit(dataframe)
        pickle_obj(scaler, 'standard_scaler')
    else:
        scaler = unpickle_obj('standard_scaler')

    # Transform data and recreate dataframe from np.array
    X = scaler.transform(dataframe)
    df = pd.DataFrame(X, columns=dataframe.columns)
    return df


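# Usage sketch (assumption): the same fit/transform pattern applies to every scaler,
# imputer and encoder helper in this module -- fit=True fits and pickles the transformer
# on the training split, fit=False reloads the pickled artifact for validation/test data.
# `train_encoded_df` and `test_encoded_df` are hypothetical, already-encoded dataframes.
#
#     train_scaled_df = features_standardization(train_encoded_df, fit=True)
#     test_scaled_df = features_standardization(test_encoded_df)  # loads 'standard_scaler'

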
def save_feature_set(dataframe, attributes_df, label='features_of_interest', save_original_features=True):
    """
    Save list of features using their original or current names
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with pre-processed data
        attributes_df: pd.DataFrame
            Dataframe with feature attributes, based on the data dictionary
        label: string
            Filename for serialization
        save_original_features: boolean
            Flag indicating if we should save original or current feature names
    Returns
        None
    """
    # get current feature names
    renamed_features = set(dataframe.columns.values)

    # retrieve original feature names, using attributes dataframe
    original_features = attributes_df[attributes_df.Renaming.isin(renamed_features)].Feature.values

    # decide which feature set to save based on save_original_features flag
    features = original_features if save_original_features else renamed_features

    # serialize list of features
    pickle_obj(features, label)


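# Usage sketch (assumption): `attributes_df` is expected to map original `Feature` names to
# their `Renaming`; passing save_original_features=False pickles the current (renamed)
# columns instead. `processed_df` and `attributes_df` below are hypothetical inputs.
#
#     save_feature_set(processed_df, attributes_df, label='features_of_interest')
#     save_feature_set(processed_df, attributes_df, label='renamed_features',
#                      save_original_features=False)

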
def encode_high_cardinality_categorical_df(dataframe, fit=False):
    """
    Encode high cardinality categorical features using Binary Encoding and dropping invariant features.
    In Binary Encoding, features are converted to a binary representation and the binary digits
    are used as new features.
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with pre-processed data (i.e. renamed features), high card. categorical features only
        fit: boolean
            Indicates if we should train or load an encoder
    Returns
        dataframe: pd.DataFrame
            Dataframe with encoded data
    """
    # Train or load an encoder
    if fit:
        encoder = BinaryEncoder(cols=dataframe.columns.values, drop_invariant=True)
        encoder.fit(dataframe)
        pickle_obj(encoder, 'high_card_categorical_encoder')
    else:
        encoder = unpickle_obj('high_card_categorical_encoder')

    # transform data
    return encoder.transform(dataframe)


def features_normalization(dataframe, fit=False):
    """
    Performs feature normalization using a MinMax scaler.
    After normalization, feature values will be in the range [0, 1] while preserving the
    original distribution.
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with encoded data
        fit: boolean
            Indicates if we should train or load a scaler
    Returns
        dataframe: pd.DataFrame
            Dataframe with scaled features
    """
    # Train or load a scaler
    if fit:
        scaler = MinMaxScaler()
        scaler.fit(dataframe)
        pickle_obj(scaler, 'minmax_scaler')
    else:
        scaler = unpickle_obj('minmax_scaler')

    # Transform data and recreate dataframe from np.array
    X = scaler.transform(dataframe)
    df = pd.DataFrame(X, columns=dataframe.columns)
    return df


def ordinal_values_imputation(ordinal_dataframe, fit=False, nan_value=-1):
    """
    Perform imputation of missing values for ordinal features using the median value.
    ---
    Arguments
        ordinal_dataframe: pd.DataFrame
            Dataframe with pre-processed data (i.e. renamed features), ordinal features only
        fit: boolean
            Indicates if we should train or load an imputer
        nan_value: Any
            Value to be considered as missing value
    Returns
        dataframe: pd.DataFrame
            Dataframe with missing values imputed
    """
    # Train or load a simple imputer, responsible for filling missing values with the feature median
    if fit:
        imputer = SimpleImputer(missing_values=nan_value, strategy='median')
        imputer.fit(ordinal_dataframe)
        pickle_obj(imputer, 'ordinal_imputer')
    else:
        imputer = unpickle_obj('ordinal_imputer')

    # impute missing values
    transformed = imputer.transform(ordinal_dataframe)

    # construct a dataframe from np.array values and original column names
    return pd.DataFrame(transformed, columns=ordinal_dataframe.columns.values)


def train_and_save():
    """Train the skip-gram word2vec estimator and save the embedding matrix and word-id map."""
    # redirect tensorflow logging to file
    logger = logging.getLogger('tensorflow')
    logger.setLevel(logging.DEBUG)
    fhandler = logging.FileHandler('tensorflow.log')
    for hdlr in logger.handlers:
        logger.removeHandler(hdlr)
    logger.addHandler(fhandler)

    # Preprocess the files
    sgm_preprocessor = preprocess.SkipGramPreprocess(config['nltk_packages'],
                                                     config['tokenizer_path'],
                                                     config['data_root'],
                                                     config['vocab_size'])

    # Generate context-target pairs and shuffle them deterministically
    context_target_pairs = preprocess.SkipGramContextTargetPair(
        sgm_preprocessor, seed=42).get_context_target_pairs(20)
    np.random.seed(42)
    np.random.shuffle(context_target_pairs)
    contexts = context_target_pairs[:, 0]
    targets = np.expand_dims(context_target_pairs[:, 1], 1)

    input_fn_ = lambda: input_fn(
        contexts, targets, batch_size=config['batch_size'])

    w2v = tf.estimator.Estimator(model_fn=word2vec,
                                 model_dir=config['model_dir'],
                                 params={
                                     'vocab_size': config['vocab_size'],
                                     'embedding_size': config['embedding_size'],
                                     'num_sampled': config['num_neg_samples']
                                 })

    steps = config['epochs'] * contexts.shape[0] // config['batch_size']
    print('* Starting to train')
    print('\t- Number of epochs: {0:,}'.format(config['epochs']))
    print('\t- Number of steps : {0:,}'.format(steps))
    w2v.train(input_fn=input_fn_, steps=steps)
    print('* End of training')
    print('\t- For training logs see tensorflow.log')

    print('* Collecting Embedding matrix')
    input_fn_ = lambda: input_fn(contexts[:10], targets[:10], repeat=1)
    embedding_matrix = next(w2v.predict(input_fn_))['embedding_matrix']

    # Save embeddings
    print('* Saving Embeddings')
    if not os.path.isdir(config['w2v_root']):
        os.makedirs(config['w2v_root'])
    pickle_obj(sgm_preprocessor.word_to_ids,
               os.path.join(config['w2v_root'], 'word_ids.pickle'))
    np.save(os.path.join(config['w2v_root'], 'embedding_matrix.npy'),
            embedding_matrix)


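# Invocation sketch (assumption): `config`, `input_fn`, the `word2vec` model_fn, `pickle_obj`
# and the `preprocess` module are expected to be defined/imported elsewhere in the project.
# The config keys read above are: nltk_packages, tokenizer_path, data_root, vocab_size,
# batch_size, model_dir, embedding_size, num_neg_samples, epochs and w2v_root.
#
#     if __name__ == '__main__':
#         train_and_save()

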
def main():
    """Preprocess the raw corpus, train the language model estimator, and save the word-id map."""
    print('* Preprocessing Raw Corpus')
    preprocessor = preprocess.LangModelPreprocess(config['nltk_packages'],
                                                  config['tokenizer_path'],
                                                  config['data_root'],
                                                  config['vocab_size'],
                                                  config['ppdata_root'])

    print('* Generating Sorted File')
    sort_seq_len(os.path.join(config['ppdata_root'], 'pp.txt'),
                 os.path.join(config['ppdata_root'], 'pp_sorted.txt'))

    print('* Build Logger')
    logger = get_tensorflow_logger('tensorflow.log')

    print('* Estimator Instance Created')
    train_1_input_fn = lambda: input_fn(glob.glob(os.path.join(config['ppdata_root'], 'pp_sorted.txt')),
                                        batch_size=config['batch_size'],
                                        padding_val=config['vocab_size'] - 1,
                                        shuffle=False)
    train_input_fn = lambda: input_fn(glob.glob(os.path.join(config['ppdata_root'], 'pp.txt')),
                                      batch_size=config['batch_size'],
                                      padding_val=config['vocab_size'] - 1)
    lang_model = tf.estimator.Estimator(model_fn,
                                        model_dir=config['model_dir'],
                                        params={
                                            'lr': config['lr'],
                                            'vocab_size': config['vocab_size'],
                                            'embedding_size': config['embedding_size'],
                                            'hidden_units': config['hidden_units'],
                                            'keep_rate': config['keep_rate'],
                                            'num_layers': config['num_layers'],
                                            'max_gradient_norm': config['max_gradient_norm']
                                        })

    print('* Start Training - Training logs to tensorflow.log')
    print('\t- Training 1 epoch over sorted sequences')
    lang_model.train(train_1_input_fn, steps=config['steps_per_epoch'] * 1)
    print('\t- Training {} epochs over random sequences'.format(config['epochs'] - 1))
    lang_model.train(train_input_fn, steps=config['steps_per_epoch'] * (config['epochs'] - 1))

    print('* Saving word id map')
    if os.path.isfile(os.path.join(config['ppdata_root'], 'word_ids.pickle')):
        print('\t- File {} already present'.format(os.path.join(config['ppdata_root'], 'word_ids.pickle')))
    else:
        pickle_obj(preprocessor.word_to_ids, os.path.join(config['ppdata_root'], 'word_ids.pickle'))


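# Invocation sketch (assumption): besides the shared preprocessing keys, this entry point
# also reads lr, hidden_units, keep_rate, num_layers, max_gradient_norm, steps_per_epoch
# and ppdata_root from `config`; `model_fn`, `input_fn`, `sort_seq_len` and
# `get_tensorflow_logger` are assumed to be defined elsewhere in the project.
#
#     if __name__ == '__main__':
#         main()

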
def get_low_and_high_cardinality_categorical_dfs(dataframe, attributes_df, threshold=5, fit=False):
    """
    Returns a tuple of dataframes containing categorical features only:
        - low cardinality: features with number of unique categories less than or equal to threshold
        - high cardinality: features with number of unique categories higher than threshold
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataset dataframe with pre-processed data (i.e. renamed features)
        attributes_df: pd.DataFrame
            Dataframe with feature attributes, based on the data dictionary
        threshold: int
            Threshold to consider high cardinality, based on number of categories
        fit: boolean
            Indicates if we should measure cardinality or consider previously measured data
    Returns
        tuple: (pd.DataFrame, pd.DataFrame)
            (Dataset view with low cardinality features, Dataset view with high cardinality features)
    """
    # retrieve categorical features
    categorical_df = get_categorical_dataframe(dataframe, attributes_df)
    features = categorical_df.columns.values
    cardinality_count = {}

    # measure or read features cardinality
    if fit:
        for col in features:
            cardinality_count[col] = len(categorical_df[col].unique())
        pickle_obj(cardinality_count, 'cardinality_count')
    else:
        cardinality_count = unpickle_obj('cardinality_count')

    # split low and high cardinality features, based on threshold
    high_cardinality_features = [feature for feature, cardinality in cardinality_count.items()
                                 if cardinality > threshold]
    # keep a list (not a set) so .loc indexing works and original column order is preserved
    low_cardinality_features = [feature for feature in features
                                if feature not in high_cardinality_features]

    # create cardinality views
    low_cardinality_cat_df = categorical_df.loc[:, low_cardinality_features]
    high_cardinality_cat_df = categorical_df.loc[:, high_cardinality_features]

    return low_cardinality_cat_df, high_cardinality_cat_df


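# Pipeline sketch (assumption): how the cardinality split is expected to feed the two
# categorical encoders defined in this module; `train_df` and `attributes_df` are
# hypothetical inputs.
#
#     low_card_df, high_card_df = get_low_and_high_cardinality_categorical_dfs(
#         train_df, attributes_df, threshold=5, fit=True)
#     low_card_encoded = encode_low_cardinality_categorical_df(low_card_df, fit=True)
#     high_card_encoded = encode_high_cardinality_categorical_df(high_card_df, fit=True)

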
def encode_low_cardinality_categorical_df(dataframe, fit=False):
    """
    Encode low cardinality categorical features using OneHot Encoding and dropping invariant features
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with pre-processed data (i.e. renamed features), low card. categorical features only
        fit: boolean
            Indicates if we should train or load an encoder
    Returns
        dataframe: pd.DataFrame
            Dataframe with encoded data
    """
    # Train or load an encoder
    if fit:
        encoder = OneHotEncoder(cols=dataframe.columns.values, drop_invariant=True)
        encoder.fit(dataframe)
        pickle_obj(encoder, 'low_card_categorical_encoder')
    else:
        encoder = unpickle_obj('low_card_categorical_encoder')

    # transform data
    return encoder.transform(dataframe)


def encode_ordinal_df(dataframe, fit=False):
    """
    Encode ordinal features, preserving the notion of order and dropping invariant features
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with pre-processed data (i.e. renamed features), ordinal features only
        fit: boolean
            Indicates if we should train or load an encoder
    Returns
        dataframe: pd.DataFrame
            Dataframe with encoded data
    """
    # Train or load an encoder
    if fit:
        encoder = OrdinalEncoder(cols=dataframe.columns.values, drop_invariant=True)
        encoder.fit(dataframe)
        pickle_obj(encoder, 'ordinal_encoder')
    else:
        encoder = unpickle_obj('ordinal_encoder')

    # transform data
    return encoder.transform(dataframe)


def save_activations(idx, activation, dump_path):
    """Persist the activations collected for model `idx` under `dump_path`."""
    myutils.mkdir(dump_path)
    myutils.pickle_obj(
        activation,
        os.path.join(dump_path, 'model_{}_activations'.format(idx)))
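

# Usage sketch (assumption): `model_activations` is a hypothetical array/dict of layer
# outputs collected for model number 3; the pickle is written to
# '<dump_path>/model_3_activations'.
#
#     save_activations(3, model_activations, dump_path='activations/')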