enc = CategoricalEncoder(encoding='onehot-dense')
X_2 = np.array(X[:, 0].reshape(-1, 1))
Xq_2 = np.array(X_q[:, 0].reshape(-1, 1))
attributes = [dataset['attributes'][0][0]]
for i, (name, relation) in enumerate(dataset['attributes'][1:-1]):
    if relation == 'NUMERIC':
        X_2 = np.hstack((X_2, X[:, i + 1].reshape(-1, 1)))
        Xq_2 = np.hstack((Xq_2, X_q[:, i + 1].reshape(-1, 1)))
        attributes.append(name)
        continue
    X_2 = np.hstack((X_2, enc.fit_transform(X[:, i + 1].reshape(-1, 1))))
    Xq_2 = np.hstack((Xq_2, enc.transform(X_q[:, i + 1].reshape(-1, 1))))
    for category in enc.categories_[0]:
        attributes.append(category)

X = X_2.astype(float)
X_q = Xq_2.astype(float)

print('Num features: %d' % len(attributes))
print(attributes)

# We now have 51 features. For example, the feature `entrepreneur` can take
# the value 0 or 1: 0 means the person is not an entrepreneur and 1 means
# they are.

# ### Most informative features

# Before we use PCA to remove some features, we will look at which features a
# Logistic Regression classifier considers most informative.
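# A minimal sketch of how such a ranking can be read off a fitted model: with
# a linear model like Logistic Regression, the magnitude of each coefficient
# indicates how informative the corresponding feature is. The label vector `y`
# is assumed to exist in the surrounding notebook; it is not shown above.

import numpy as np
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X, y)  # assumes a label vector y from the surrounding notebook

# For a binary problem, coef_ has shape (1, n_features).
importance = np.abs(clf.coef_[0])
ranked = sorted(zip(attributes, importance), key=lambda t: -t[1])
for name, weight in ranked[:10]:
    print('%-25s %.3f' % (name, weight))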
GloveDimOption = '50'  # this could be 50 (171.4 MB), 100 (347.1 MB), 200 (693.4 MB), or 300 (1 GB)
embeddings_index = loadGloveModel('data/glove.6B.' + GloveDimOption + 'd.txt')
# print(embeddings_index['apple'])
# print(embeddings_index['mango'])

embeddings_index[''] = np.zeros(50)
embeddings_index['*root'] = np.ones(50)

enc = CategoricalEncoder(encoding='onehot')
X_pos = [['ADJ'], ['ADP'], ['ADV'], ['AUX'], ['CCONJ'], ['DET'], ['INTJ'],
         ['NOUN'], ['NUM'], ['PART'], ['PRON'], ['PROPN'], ['PUNCT'],
         ['SCONJ'], ['SYM'], ['VERB'], ['X']]
enc.fit(X_pos)
for i in X_pos:
    # Zero-pad each one-hot POS vector (length 17) to the embedding size (50)
    # so POS tags live in the same vector space as the word embeddings.
    embeddings_index[i[0]] = pad_sequences(enc.transform([[i[0]]]).toarray(),
                                           maxlen=50, padding='post')[0]
    # embeddings_index[i[0]] = pad_sequences(enc.transform([[i[0]]]).toarray(), maxlen=18, padding='post')[0]
    # print(embeddings_index[i[0]])
# print(embeddings_index['apple'])

feat_vect, transit_vect = [], []
# feat_vect = np.array(())
# transit_vect = np.array(())
for i in feat:
    # print(i)
    sd = np.array(())
    for w in i:
        # if w in embeddings_index.keys():
        #     sd = np.concatenate(sd, embeddings_index[w])
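# `loadGloveModel`, used at the top of this snippet, is defined elsewhere in
# the project. A minimal sketch of what such a loader typically looks like
# for the glove.6B text files (this body is an assumption, not the author's
# implementation):

import numpy as np

def loadGloveModel(path):
    """Parse a GloVe .txt file into a {word: vector} dict.

    Assumes each line is a token followed by its space-separated float
    components, as in the glove.6B distribution.
    """
    embeddings = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            embeddings[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embeddings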
def extract_features(df_train, df_inference, selected_feature_names_categ,
                     selected_feature_names_interval, shuffle=True,
                     fuzzy_matching=True, use_onehot=True,
                     use_sentence_vec=False):
    features_to_use = []
    variable_types = []

    if not use_onehot:
        for feature in selected_feature_names_categ:
            features_to_use.append(feature + '_encoded')
            variable_types.append('categorical_nominal')

    # Append interval AFTER categorical!
    for feature in selected_feature_names_interval:
        features_to_use.append(feature + '_normed')
        variable_types.append('numerical')

    # Check that all columns exist (avoid KeyErrors)
    for df in [df_train, df_inference]:
        df[selected_feature_names_categ + selected_feature_names_interval]
        print(df['combined_str'])

    # for feature in selected_feature_names_categ:
    #     le = preprocessing.LabelEncoder()
    #     print(print_attr_overview(df[feature], True, topn=10))
    #     df[feature + '_encoded'] = le.fit_transform(df[feature])
    #     features_to_use.append(feature + '_encoded')

    if use_onehot:
        # Each feature has its own vocabulary
        vocabs = defaultdict(list)
        X = pd.concat([df_train[colnames_categ], df_inference[colnames_categ]])
        X = df_train[colnames_categ]
        X = X.apply(preprocess_categ_series)
        enc = CategoricalEncoder(handle_unknown='ignore')
        enc.fit_transform(X)
        # pprint(enc.categories_)
    else:
        le = preprocessing.LabelEncoder()
        all_unique = []
        # Fit the LabelEncoder (combine vocabularies for train and inference)
        for df in [df_train, df_inference]:
            for feature in selected_feature_names_categ:
                # print(print_attr_overview(df[feature]))
                s = df[feature]
                # Remove categorical entries with fewer than 12 occurrences
                a = s.value_counts()
                s[s.isin(a.index[a < 12])] = np.nan
                s[s.isnull()] = "EMPTY_PLACEHOLDER"
                s = s.map(lambda x: x.lower() if type(x) == str else x)
                # print(np.unique(df[feature]))
                all_unique.extend(np.unique(s))
        le.fit(all_unique)

        # Transform with the fitted LabelEncoder
        for df in [df_train, df_inference]:
            for feature in selected_feature_names_categ:
                print(feature)
                # print(df[feature])
                s = df[feature]
                s = s.map(lambda x: x.lower() if type(x) == str else x)
                df[feature + '_encoded'] = le.transform(s)
                print(feature, len(np.unique(s)))

    for df in [df_train, df_inference]:
        for feature in selected_feature_names_interval:
            s = df[feature]
            s = s.map(lambda x: x.replace(',', '') if type(x) == str else x)
            # print(s)
            s = pd.to_numeric(s, errors='coerce')
            # Set null values to zero
            # TODO: try setting NaN to the mean instead of zero
            # TODO: try different types of normalisation
            s[np.logical_not(s.notnull())] = 0.0
            df[feature + '_normed'] = norm_zscore(s)

    # features_to_use.append('sentence_vec')
    # variable_types.append('embedding')
    if use_sentence_vec:
        from ft_embedding import get_sentence_vec
        print('Computing sentence vectors for dataset')
        train_embedding_mat = np.asarray(
            [get_sentence_vec(x) for x in df_train['combined_str']])
        inference_embedding_mat = np.asarray(
            [get_sentence_vec(x) for x in df_inference['combined_str']])
        variable_types.append('ft_embedding')

    if use_onehot:
        print(features_to_use)
        # One-hot categorical encoding
        train_X_onehot = enc.transform(df_train[colnames_categ]).toarray()
        train_X_interval = df_train[features_to_use].as_matrix()
        print(train_X_onehot.shape)
        print(train_X_interval.shape)
        train_X = np.hstack([train_X_onehot, train_X_interval])

        inference_X_onehot = enc.transform(
            df_inference[colnames_categ]).toarray()
        inference_X_interval = df_inference[features_to_use].as_matrix()
        print(inference_X_onehot.shape)
        print(inference_X_interval.shape)
        inference_X = np.hstack([inference_X_onehot, inference_X_interval])

        # Add (one-hot encoded) numerical features to variable_types
        len_onehot = train_X_onehot.shape[1]
        print(len_onehot)
        features_to_use = ['numerical' for i in range(len_onehot)] + features_to_use
    else:
        # Index (integer) categorical encoding
        train_X = df_train[features_to_use].as_matrix()
        inference_X = df_inference[features_to_use].as_matrix()

    train_y = df_train['case_status'].as_matrix()

    if use_sentence_vec:
        # Stack with the sentence embeddings
        train_X = np.hstack([train_X.copy(), train_embedding_mat])
        inference_X = np.hstack([inference_X.copy(), inference_embedding_mat])
        print(train_embedding_mat.shape)
        print(inference_embedding_mat.shape)
        print(train_X.shape)
        print(inference_X.shape)
        # exit()

    inference_row_id = df_inference['row ID']

    if shuffle:
        train_X, train_y = skl_shuffle(train_X, train_y)
        # print(X.shape)
        # print(y.shape)

    if use_onehot:
        vocab_size = 0
    else:
        vocab_size = len(list(le.classes_))

    return (train_X, train_y, inference_row_id, inference_X, vocab_size,
            variable_types, features_to_use)
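# A hypothetical call site for extract_features (the dataframes and the two
# feature-name lists are assumed to come from the surrounding pipeline; only
# the signature and return order match the function above):

(train_X, train_y, inference_row_id, inference_X,
 vocab_size, variable_types, features_to_use) = extract_features(
    df_train, df_inference,
    selected_feature_names_categ,
    selected_feature_names_interval,
    shuffle=True, use_onehot=True)

print(train_X.shape, train_y.shape, inference_X.shape)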
        X_categorical = X[:, :idx_end_categorical + 1]
        # Select only the numerical columns of X (including ft_embedding if present)
        X_numerical = X[:, idx_end_categorical + 1:]
        return X_categorical, X_numerical
    else:
        return np.zeros((X.shape[0], 0)), X


if __name__ == "__main__":
    df_train = load_and_preprocess('TrainingSet(3).csv', nrows=10000)
    # print(df_train.combined_str)

    # X = pd.concat([df_train[colnames_categ], df_inference[colnames_categ]])
    X = df_train[colnames_categ]
    X = X.apply(preprocess_categ_series)
    enc = CategoricalEncoder(handle_unknown='ignore')
    enc.fit(X)
    len_onehot = enc.transform(
        df_train[colnames_categ].iloc[:1]).toarray().shape[1]
    print(len_onehot)
    # train_X_onehot = enc.transform(df_train[colnames_categ]).toarray()
    # # inference_X_onehot = enc.transform(df_train[colnames_categ]).toarray()
    # print(train_X_onehot.shape)
    # print(train_X_onehot[0])
    # pprint(enc.categories_)
# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,
                          random_state=0)
rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = CategoricalEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

# Supervised transformation based on gradient boosted trees
grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd_enc = CategoricalEncoder()
grd_lm = LogisticRegression()
grd.fit(X_train, y_train)
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
y_pred_grd_lm = grd_lm.predict_proba(
    grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)
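# The fpr/tpr pairs computed above can be overlaid on a single ROC plot. A
# minimal sketch with matplotlib (this plotting code is an assumption, not
# part of the original snippet):

import matplotlib.pyplot as plt

plt.figure()
plt.plot([0, 1], [0, 1], 'k--')  # chance line
plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT embedding + LR')
plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='best')
plt.show()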