# Debug check on the parsed feature list size (the bare `25` looks like an
# expected-length reference value — NOTE(review): confirm it is still wanted).
print(len(feat), 25)

# Which pretrained GloVe file to load; the dimension is baked into the filename.
GloveDimOption = '50'  # this could be 50 (171.4 MB), 100 (347.1 MB), 200 (693.4 MB), or 300 (1 GB)
embeddings_index = loadGloveModel('data/glove.6B.' + GloveDimOption + 'd.txt')
# print(embeddings_index['apple'])
# print(embeddings_index['mango'])

# Special vocabulary entries: the empty token maps to the zero vector and the
# synthetic '*root' token to all-ones.
# NOTE(review): both lengths are hard-coded to 50 — they will no longer match
# the embedding width if GloveDimOption is changed above; confirm intended.
embeddings_index[''] = np.zeros(50)
embeddings_index['*root'] = np.ones(50)

# One-hot encode the universal POS tag set and store each tag as a pseudo
# "embedding" so tags and words live in the same lookup table.
enc = CategoricalEncoder(encoding='onehot')
X_pos = [['ADJ'], ['ADP'], ['ADV'], ['AUX'], ['CCONJ'], ['DET'], ['INTJ'], ['NOUN'], ['NUM'], ['PART'], ['PRON'], ['PROPN'], ['PUNCT'], ['SCONJ'], ['SYM'], ['VERB'], ['X']]
enc.fit(X_pos)
for i in X_pos:
    # Pad the 17-dim one-hot tag vector out to 50 entries so it has the same
    # length as the word vectors above.
    embeddings_index[i[0]] = pad_sequences(enc.transform([[i[0]]]).toarray(), maxlen=50, padding='post')[0]
    #embeddings_index[i[0]] = pad_sequences(enc.transform([[i[0]]]).toarray(), maxlen=18, padding='post')[0]
    # print(embeddings_index[i[0]])
# print(embeddings_index['apple'])

# Accumulators for per-sample feature vectors and transition labels.
feat_vect, transit_vect = [], []
# feat_vect = np.array(())
# transit_vect = np.array(())
# NOTE(review): this loop body continues beyond this chunk; `sd` is its
# per-item accumulator, started empty here.
for i in feat:
    #print(i)
    sd = np.array(())
X_categorical = X[:, :idx_end_categorical + 1] # Select only the numerical columns of X (including ft_embedding if present) X_numerical = X[:, idx_end_categorical + 1:] return X_categorical, X_numerical else: return np.zeros((X.shape[0], 0)), X if __name__ == "__main__": df_train = load_and_preprocess('TrainingSet(3).csv', nrows=10000) # print(df_train.combined_str) # X = pd.concat([df_train[colnames_categ], df_inference[colnames_categ]]) X = df_train[colnames_categ] X = X.apply(preprocess_categ_series) enc = CategoricalEncoder(handle_unknown='ignore') enc.fit(X) len_onehot = enc.transform( df_train[colnames_categ].iloc[:1]).toarray().shape[1] print(len_onehot) # train_X_onehot = enc.transform(df_train[colnames_categ]).toarray() # # inference_X_onehot = enc.transform(df_train[colnames_categ]).toarray() # print(train_X_onehot.shape) # print(train_X_onehot[0]) # pprint(enc.categories_)
# Unsupervised transformation: embed samples into the sparse one-hot space of
# totally-random-tree leaves, then fit a logistic regression on that space.
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator, random_state=0)
rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)
# Probability of the positive class, for ROC computation.
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = CategoricalEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
# rf.apply(...) yields each sample's leaf index per tree; one-hot those
# indices, and train the downstream LR on a separate split (X_train_lr /
# y_train_lr) so it is not fit on the forest's own training data.
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

# Same leaf-encoding recipe with gradient boosting. grd.apply(...) is 3-D
# (samples x stages x outputs); [:, :, 0] selects the single output column
# used here — presumably a binary task; TODO confirm.
grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd_enc = CategoricalEncoder()
grd_lm = LogisticRegression()
grd.fit(X_train, y_train)
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
y_pred_grd_lm = grd_lm.predict_proba(
    grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)
# Unsupervised transformation based on totally random trees rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator, random_state=0) rt_lm = LogisticRegression() pipeline = make_pipeline(rt, rt_lm) pipeline.fit(X_train, y_train) y_pred_rt = pipeline.predict_proba(X_test)[:, 1] fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt) # Supervised transformation based on random forests rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator) rf_enc = CategoricalEncoder() rf_lm = LogisticRegression() rf.fit(X_train, y_train) rf_enc.fit(rf.apply(X_train)) rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr) y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1] fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm) grd = GradientBoostingClassifier(n_estimators=n_estimator) grd_enc = CategoricalEncoder() grd_lm = LogisticRegression() grd.fit(X_train, y_train) grd_enc.fit(grd.apply(X_train)[:, :, 0]) grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr) y_pred_grd_lm = grd_lm.predict_proba( grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1] fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)