def gbdt_lr(para):
    # Assumed aliases (not shown in this snippet): GBDT = a GradientBoosting
    # model, OHE = OneHotEncoder, sgd = SGDClassifier, auc = roc_auc_score.
    print("gbdt_lr")
    x_train, x_train_lr, x_test = para[0], para[1], para[2]
    y_train, y_train_lr, y_test = para[3], para[4], para[5]

    maxleafnodes = 11
    gbc = GBDT(max_leaf_nodes=maxleafnodes - 1, n_estimators=600,
               min_samples_leaf=5, max_depth=3, learning_rate=0.02,
               subsample=0.2, max_features=0.1)
    gbc.fit(x_train, y_train)

    # One-hot encode the leaf index each sample reaches in every tree.
    ohe = OHE()
    ohe.fit(gbc.apply(x_train)[:, :])

    li = gbc.apply(x_train_lr)[:, :]
    x_train_lr_gbc = ohe.transform(li)
    # x_train_lr_gbc = myTransform(li, max_leaf_nodes=maxleafnodes)
    li = gbc.apply(x_test)[:, :]
    x_test_gbc = ohe.transform(li)
    # x_test_gbc = myTransform(li, max_leaf_nodes=maxleafnodes)
    del li

    # Fit a linear model on the encoded leaf features.
    lr = sgd(max_iter=50)  # `n_iter=50` in scikit-learn < 0.19
    lr.fit(x_train_lr_gbc, y_train_lr)
    yp = lr.predict(x_test_gbc)
    print("GBDT+SGD: " + str(auc(y_test, yp)))
    return (gbc, yp)
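# A minimal usage sketch for gbdt_lr on synthetic data; the three-way split
# (GBDT set, LR set, test set) mirrors the `para` tuple the function expects.
# All names below are illustrative, not part of the original code.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_all, y_all = make_classification(n_samples=2000, n_features=20, random_state=0)
x_rest, x_test, y_rest, y_test = train_test_split(X_all, y_all,
                                                  test_size=0.2, random_state=0)
x_train, x_train_lr, y_train, y_train_lr = train_test_split(x_rest, y_rest,
                                                            test_size=0.5,
                                                            random_state=0)
gbc, yp = gbdt_lr((x_train, x_train_lr, x_test, y_train, y_train_lr, y_test))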
def oneHotEncoding():
    print("-----------Try to ONE HOT ENCODING-----------------")
    setToCompare = 'abcdefghijklmnopqrstuvwxyz '
    ctoi = dict((c, i) for i, c in enumerate(setToCompare))
    itoc = dict((i, c) for i, c in enumerate(setToCompare))

    # Integer-encode the input data.
    integer_encoded = [ctoi[char] for char in musicdata]
    print(integer_encoded)

    # One-hot encode each integer manually.
    onehot = list()
    for value in integer_encoded:
        letter = [0 for _ in range(len(setToCompare))]
        letter[value] = 1
        onehot.append(letter)
    print(onehot)

    # Invert the encoding of the first character.
    inverted = itoc[np.argmax(onehot[0])]
    print(inverted)

    print("--------------------------ENCODING IN PROGRESS----------------------")
    labelencoder = LE()
    X1 = X
    Y1 = Y
    X1[:, 0] = labelencoder.fit_transform(X1[:, 0])
    # The keyword must be named explicitly: a bare OHE([0]) would set the old
    # `n_values` argument instead. `categorical_features` itself was removed
    # in scikit-learn 0.22 (use a ColumnTransformer in current versions).
    onehotencoder = OHE(categorical_features=[0])
    X1 = onehotencoder.fit_transform(X1)
    labelencoderY = LE()
    Y1 = labelencoderY.fit_transform(Y1)
    print(X1)
    print(Y1)
def graph_to_node_label(graphs, labels):
    # Repeat each graph's label once per node, then one-hot encode the result.
    targets = np.array(
        list(
            itertools.chain(*[[labels[i]] * graphs[i].number_of_nodes()
                              for i in range(len(graphs))])))
    enc = OHE(dtype=np.float32)
    return np.asarray(enc.fit_transform(targets.reshape(-1, 1)).todense())
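# Illustrative call, assuming networkx graphs: every node inherits its graph's
# label, so two graphs with 3 and 2 nodes yield 5 one-hot rows. The variable
# names here are hypothetical.
import networkx as nx

graphs = [nx.path_graph(3), nx.path_graph(2)]
labels = [0, 1]
node_targets = graph_to_node_label(graphs, labels)  # shape (5, 2)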
def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEncoder:
    self.check_requirements(X, y)
    self.preprocessor['categorical'] = OHE(categories=X['categories'],
                                           sparse=False,
                                           handle_unknown='error')
    return self
def fit(self, X, y):
    """Build a neural network classifier from the training set (X, y).

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape = [n_samples] or [n_samples, n_classes] (one-hot)
        The target values (class labels) as integers.

    Returns
    -------
    self : object
        Returns the instance itself.
    """
    X = check_array(X, ensure_min_samples=2, ensure_min_features=2)
    y = check_array(y, ensure_2d=False)
    if np.isnan(np.min(X)):
        raise ValueError("X contains NaN.")
    y = np.atleast_1d(y)
    if y.ndim == 1:
        # Convert integer labels to a one-hot encoding.
        from sklearn.preprocessing import OneHotEncoder as OHE
        y = np.reshape(y, (-1, 1))
        model = OHE(sparse=False)
        y = np.array(model.fit_transform(y))

    self.n_samples, self.n_features = X.shape
    self.n_input = self.n_features
    self.__variables()

    # Construct network.
    _, cost_op, optimizer_op = self.__network()
    init = tf.global_variables_initializer()  # `initialize_all_variables` is deprecated

    # Create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    self.__sess = tf.Session(config=config)

    if self.use_gpu:
        self.device = '/gpu:' + str(self.gpu_id)
    else:
        self.device = '/cpu'
    logger.info('Used device: ' + self.device)

    with tf.Graph().as_default():
        with tf.device(self.device):
            self.__sess.run(init)
            for epoch in range(self.training_epochs):
                for batch_X, batch_y in self.__next_batch(X, y):
                    cost, _ = self.__sess.run(
                        [cost_op, optimizer_op],
                        feed_dict={self.__X: batch_X, self.__y: batch_y})
                if epoch % self.display_step == 0:
                    logger.info("Epoch: %04d, cost = %.6f" % (epoch + 1, cost))
    logger.info("Optimization Finished!")
    self.fitted = True
    return self
def read_data(dataset):
    x = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values
    # One-hot encode column 3; pass all other columns through unchanged.
    ct = col_trans(transformers=[('encoder', OHE(), [3])],
                   remainder="passthrough")
    x = np.array(ct.fit_transform(x))
    x = check_dummy_variable(x)
    train_data(x, y)
def one_encoder(X_train, X_valid, object_cols):
    # One-hot encode the categorical columns; fit on train, reuse on valid.
    encoder = OHE(handle_unknown='ignore', sparse=False)
    OH_cols_train = pd.DataFrame(encoder.fit_transform(X_train[object_cols]))
    OH_cols_valid = pd.DataFrame(encoder.transform(X_valid[object_cols]))

    # One-hot encoding removed the index; put it back.
    OH_cols_train.index = X_train.index
    OH_cols_valid.index = X_valid.index

    # Drop the original categorical columns and append the encoded ones.
    num_X_train = X_train.drop(object_cols, axis=1)
    num_X_valid = X_valid.drop(object_cols, axis=1)
    OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
    OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
    return OH_X_train, OH_X_valid
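# A small worked example for one_encoder, assuming pandas is imported as pd;
# the frames and column names are made up for illustration.
X_train = pd.DataFrame({'color': ['red', 'blue'], 'price': [1.0, 2.0]})
X_valid = pd.DataFrame({'color': ['blue', 'green'], 'price': [3.0, 4.0]})
OH_X_train, OH_X_valid = one_encoder(X_train, X_valid, object_cols=['color'])
# 'green' is unseen at fit time; handle_unknown='ignore' encodes it as all zeros.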
def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEncoder:
    self.check_requirements(X, y)
    self.preprocessor['categorical'] = OHE(
        # It is safer to have the OHE produce a 0 array than to crash a good configuration
        categories=X['dataset_properties']['categories']
        if len(X['dataset_properties']['categories']) > 0 else 'auto',
        sparse=False,
        handle_unknown='ignore')
    return self
def fit(self, X, y=None):
    if isinstance(X, tuple):
        X, y = X
    # Select object-dtype columns with fewer than `max_values` distinct values.
    self.columns = X.columns[[
        (X[i].nunique() < self.max_values) & (X[i].dtype == 'object')
        for i in X.columns
    ]]
    # Fit one encoder per selected column.
    self.encoders = {
        column: OHE(handle_unknown='ignore').fit(X[column].values.reshape(-1, 1))
        for column in self.columns
    }
    return self
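# The class above stores one encoder per column, but its transform step is not
# shown; a minimal standalone sketch of what it might look like (hypothetical
# helper, taking a fitted object `enc_obj` as produced by fit()):
import numpy as np

def transform_columns(enc_obj, X):
    # Apply each stored per-column encoder and stack the dense results.
    encoded = [
        enc_obj.encoders[column].transform(X[column].values.reshape(-1, 1)).toarray()
        for column in enc_obj.columns
    ]
    return np.hstack(encoded) if encoded else np.empty((len(X), 0))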
# Fragment: this block appears to run inside predict_fasta's per-sequence
# loop, where `seq`, `pid`, `test_vector`, and `test_vector_frames` are set up.
for res in seq:
    if res.isalpha():
        test_vector.append(ord(res))
    else:
        test_vector.append(int(res))

# Load the previously fitted one-hot encoder and encode sliding windows.
vectenc = pickle.load(open("ohe.sav", "rb"))
for i in range(len(test_vector) - window + 1):
    test_vector_frames.append(test_vector[i:i + window])
test_vector_frames = np.array(test_vector_frames)
test_v_enc = vectenc.transform(test_vector_frames)

# Predict with the pickled linear SVC and decode back to characters.
linclf = pickle.load(open("LinearSVC_3SSTRIDE_w21.sav", "rb"))
out_prediction = linclf.predict(test_v_enc)
result = []
for char in out_prediction:
    result.append(chr(char))
result = ''.join(result)

with open("LinSVC21_predictions", "a+") as wh:
    wh.write(pid + "\n")
    wh.write(seq[window // 2:len(seq) - window // 2] + "\n")
    wh.write(result + "\n" + "\n")


if __name__ == "__main__":
    window = 21
    encoder = OHE()
    predict_fasta("testset.txt", window)
def encode(self, x):
    if not self._encoder:
        self._encoder = OHE(sparse=False, categories="auto")
    return self._encoder.fit_transform(x)
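# Standalone equivalent of what encode() produces for a toy input, assuming a
# scikit-learn version that still accepts `sparse=False` (renamed
# `sparse_output` in 1.2):
import numpy as np
from sklearn.preprocessing import OneHotEncoder as OHE

enc = OHE(sparse=False, categories="auto")
print(enc.fit_transform(np.array([['a'], ['b'], ['a']])))
# [[1. 0.]
#  [0. 1.]
#  [1. 0.]]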
# y = class_le.fit_transform(df2['classlabel'].values)
# print(y)
# print()

df3['classlabel'] = class_le.fit_transform(df3['classlabel'])
print('DataFrame with classlabel mapped to an integer class using Scikit-Learn.')
print('LabelEncoder - note size is not mapped.')
print(df3)
print()

print('Scikit-Learn One-hot Encoder')
df4['size'] = df4['size'].map(size_map)
X = df4[['color', 'size', 'price']].values
color_OHE = OHE()
Y = color_OHE.fit_transform(X[:, 0].reshape(-1, 1)).toarray()
print(X)
print()
print(Y)
print()
'''
[['green' 'M' 10.1]
 ['red' 'L' 13.5]
 ['blue' 'XL' 15.3]
 ['green' 'S' 9.1]]

[[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
# NOTE: `Imputer` and `categorical_features` were removed in scikit-learn 0.22;
# this snippet targets older versions (see the ColumnTransformer sketch below
# for the current API).
features = df.iloc[:, :-1].values
labels = df.iloc[:, -1:].values

# Impute missing values in column 1 with the median.
from sklearn.preprocessing import Imputer as ip
imp = ip(missing_values='NaN', strategy="median", axis=0)
imp = imp.fit(features[:, 1:2])
features[:, 1:2] = imp.transform(features[:, 1:2])

# Label-encode the categorical column, then one-hot encode it.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le = le.fit(features[:, 0])
features[:, 0] = le.transform(features[:, 0])

from sklearn.preprocessing import OneHotEncoder as OHE
ohe = OHE(categorical_features=[0])
features = ohe.fit_transform(features).toarray()
labels = le.fit_transform(labels.ravel())  # LabelEncoder expects a 1-D array

from sklearn.model_selection import train_test_split as TTS
x_train, x_test, y_train, y_test = TTS(features, labels,
                                       test_size=0.4, random_state=0)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
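# For current scikit-learn (>= 0.22), the same preprocessing can be expressed
# with SimpleImputer and ColumnTransformer; a sketch under that assumption
# (output column order follows the transformer list):
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

modern = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), [0]),
        ('impute', SimpleImputer(missing_values=np.nan, strategy='median'), [1]),
    ],
    remainder='passthrough')
# features_modern = modern.fit_transform(features)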
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier as KNC

df = pd.read_csv("../dataset/train.csv")
y = df['ACTION']
del df['ACTION']

X_test = pd.read_csv("../dataset/test.csv")
employee_id = X_test['id']
del X_test['id']

X = df[(df['RESOURCE'] == X_test['RESOURCE'])].dropna(how='all')

# Count the distinct values per column.
n_list = []
for column in X.columns:
    n_list.append(X[column].value_counts().size)
n2_list = []
for column in X_test.columns:
    n2_list.append(X_test[column].value_counts().size)

enc = OHE(handle_unknown='ignore')
X = enc.fit_transform(X)
# Reuse the fitted encoder; calling fit_transform here would re-fit on the
# test set and produce an incompatible feature layout.
X_test = enc.transform(X_test)

X_train, X_val, y_train, y_val = cv.train_test_split(X, y,
                                                     test_size=0.4,
                                                     random_state=0)
linSVC = svm.LinearSVC()
prediction = linSVC.fit(X, y).predict(X_test)
from sklearn.preprocessing import LabelEncoder as LE, OneHotEncoder as OHE
import numpy as np

a = np.array([[0, 1, 100], [1, 2, 200], [2, 3, 400]])
# One-hot encode the first two columns (`categorical_features` requires
# scikit-learn < 0.22).
oh = OHE(categorical_features=[0, 1])
a = oh.fit_transform(a).toarray()
a[:, 1:]

# Drop selected columns by keeping the complement of their indices.
idx_to_delete = [0, 3]
indices = [i for i in range(a.shape[-1]) if i not in idx_to_delete]
a[:, indices]
# %%
x = Data.iloc[:, 0:10]
y = Data.iloc[:, -1]

# %%
x.head()

# %%
y.head()

# %%
# `categorical_features` requires scikit-learn < 0.22.
ohe = OHE(sparse=False, categorical_features=[1])
x = ohe.fit_transform(x)

# %%
x

# %%
x_dummy = pd.DataFrame(x)
x_dummy.head()

# %%
# Drop one dummy column to avoid the dummy-variable trap.
x_dummy = x_dummy.drop(labels=[1], axis=1)
x_dummy.head()
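# %%
# Equivalent on current scikit-learn / pandas, where `categorical_features`
# no longer exists: dummy-encode column 1 directly and drop the first level
# to avoid the dummy-variable trap (a sketch using the same Data frame).
x_alt = pd.get_dummies(Data.iloc[:, 0:10],
                       columns=[Data.columns[1]],
                       drop_first=True)
x_alt.head()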
def __init__(self, data, labels):
    self._data = data
    self._labels = labels
    self._label_encoder = OHE(sparse=False)
    self._one_hot = self._label_encoder.fit_transform(self._labels)
def __init__(self, **kwargs):
    r"""Initialize feature encoder."""
    self.__one_hot_encoder = OHE(handle_unknown='ignore')
def evitram():
    # Restore pretrained model
    restorestr = pxfinestr.split('.meta')[0]
    # Save model str
    evitramstr = evitramfinestr.split('.meta')[0]

    # Load pretrained evidence representations for all sources
    K = []
    for e in sys.argv[3:]:
        cp2 = utils.load_config(e)
        K.append(cp2.get('Experiment', 'PX_Z_TRAIN'))

    sect = 'Experiment'
    ev_paths = [cp.get(sect, i) for i in cp.options(sect) if 'evidence' in i]

    if cp.get('Experiment', 'PREFIX') == 'MNIST' or \
            cp.get('Experiment', 'PREFIX') == 'AMNIST':
        evitram_dict = ConvAE.build_EviTRAM(cp, SEED)
    elif cp.get('Experiment', 'PREFIX') == 'WEATHER-TEMPO':
        evitram_dict = WConvAE.build_EviTRAM(cp, SEED)
    else:
        # Layerwise autoencoder number
        ae_ids = [str(i) for i in xrange(cp.getint('Experiment', 'AENUM'))]
        evitram_dict = SAE.build_EviTRAM(cp, ae_ids, SEED)

    # Get variables to restore from pretrained model P(x) Encoder.
    # Prepare the "restore" variable list by dropping the per-evidence heads
    # so that only the shared encoder weights are restored.
    var_list = tf.trainable_variables()
    for ev_path_id, ev_path in enumerate(ev_paths):
        if cp.get('Experiment', 'PREFIX') in ('MNIST', 'AMNIST', 'WEATHER-TEMPO'):
            for v in var_list:
                if v.name == 'Pre_Q' + str(ev_path_id) + '/kernel:0':
                    var_list.remove(v)
            for v in var_list:
                if v.name == 'Pre_Q' + str(ev_path_id) + '/bias:0':
                    var_list.remove(v)
        else:
            for v in var_list:
                if v.name == 'Pre_Q' + str(ev_path_id) + '/kernel:0':
                    var_list.remove(v)
            for v in var_list:
                if v.name == 'Pre_Q' + str(ev_path_id) + '/bias:0':
                    var_list.remove(v)
            for v in var_list:
                if v.name == 'Pre_Comp_Q' + str(ev_path_id) + '/kernel:0':
                    var_list.remove(v)
            for v in var_list:
                if v.name == 'Pre_Comp_Q' + str(ev_path_id) + '/bias:0':
                    var_list.remove(v)

    ##########################################################
    # Tensorboard (comment / uncomment)
    ##########################################################
    from datetime import datetime
    now = datetime.utcnow().strftime("%m-%d_%H-%M:%S")
    root_logdir = cp.get('Experiment', 'ModelOutputPath')
    logdir = "{}/{}{}-{}/".format(
        root_logdir,
        cp.get('Experiment', 'PREFIX') + '_' +
        cp.get('Experiment', 'Enumber') + '_cond',
        sys.argv[2], now)

    tf.summary.scalar(name='cond loss', tensor=evitram_dict['evitram_loss'])
    tf.summary.scalar(name='recon loss', tensor=evitram_dict['px_mse'])
    summary = tf.summary.merge_all()
    file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())
    ##########################################################

    # Initialize & restore P(x) AE weights
    init = tf.global_variables_initializer()
    saver = tf.train.Saver(var_list)
    saverCOND = tf.train.Saver()

    # Task outcomes
    # EV = [np.load(i) for i in K]
    K = utils.load_evidence(cp2.get('Experiment', 'EVIDENCEDATAPATH'))

    if cp.get('Experiment', 'PREFIX') == 'WEATHER-TEMPO':
        from sklearn.preprocessing import OneHotEncoder as OHE
        EV = K.train.labels
        EV = np.repeat(EV, cp.getint('Input', 'Frames'))
        EV = OHE(sparse=False).fit_transform(EV.reshape(len(EV), 1))
        EV = [EV]
    else:
        EV = K.train.one
        # EV = EV[p]
        EV = [EV]

    # Start Session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Init values
        init.run()
        # Restore finetuned model
        saver.restore(sess, restorestr)

        train_dict = {
            'cp': cp,
            'sess': sess,
            'data': XX_full,
            'sumr': summary,
            'savestr': evitramstr,
            'saver': saverCOND,
            'fw': file_writer,
            'EV': EV,
            'ev_paths': ev_paths
        }

        if cp.get('Experiment', 'PREFIX') == 'MNIST' or \
                cp.get('Experiment', 'PREFIX') == 'AMNIST':
            ConvAE.evitram_train(train_dict, evitram_dict)
        elif cp.get('Experiment', 'PREFIX') == 'WEATHER-TEMPO':
            WConvAE.evitram_train(train_dict, evitram_dict)
        else:
            SAE.evitram_train(train_dict, evitram_dict)

        # Get batch size for batch output save
        batch_size = train_dict['cp'].getint('Hyperparameters', 'BatchSize')

        if cp.get('Experiment', 'PREFIX') == 'MNIST' or \
                cp.get('Experiment', 'PREFIX') == 'AMNIST':
            # Save hidden/output layer results for pipeline training
            px_Z_latent = utils.run_OOM(sess, evitram_dict['conv_in'], XX_full,
                                        evitram_dict['conv_z'],
                                        batch_size=batch_size)
        elif cp.get('Experiment', 'PREFIX') == 'WEATHER-TEMPO':
            px_Z_latent = utils.run_OOM(sess, evitram_dict['conv_in'], XX_full,
                                        evitram_dict['conv_z'],
                                        batch_size=batch_size)
        else:
            px_Z_latent = utils.run_OOM(sess, evitram_dict['sda_in'], XX_full,
                                        evitram_dict['sda_hidden'],
                                        batch_size=batch_size)

        # utils.save_OOM(sess, pae_dict['conv_in'], XX_full,
        #                pae_dict['conv_out'],
        #                path=cp.get('Experiment', 'PX_XREC_TRAIN'),
        #                batch_size=batch_size)

        # Print clustering ACC
        utils.log_accuracy(cp, YY_full, px_Z_latent, 'COND - ACC FULL', SEED)
        # Print clustering NMI
        utils.log_NMI(cp, YY_full, px_Z_latent, 'COND - NMI FULL', SEED)

        sess.close()