def _make_oversampler(sampler):
    """Return a fresh over-sampler (seeded with ``random_state=0``) for *sampler*.

    Raises:
        ValueError: if *sampler* is not "RANDOM", "ADASYN" or "SMOTE".
    """
    if sampler == "RANDOM":
        return RandomOverSampler(random_state=0)
    if sampler == "ADASYN":
        return ADASYN(random_state=0)
    if sampler == "SMOTE":
        return SMOTE(random_state=0)
    raise ValueError(
        "Invalid sampler: %r (expected 'RANDOM', 'ADASYN' or 'SMOTE')"
        % (sampler,))


def _balanced_class_weights(y):
    """Return the 'balanced' class weights of the binary labels *y* as ``{0: w0, 1: w1}``."""
    # Local import keeps this block self-contained.
    import numpy as np

    # Keyword arguments and an ndarray of classes are accepted by both old
    # and current scikit-learn; positional arguments / plain lists are not
    # accepted by newer releases.
    weights = compute_class_weight(
        "balanced", classes=np.array([0, 1]), y=y)
    return dict(zip([0, 1], weights))


def _fit_resample(oversampler, X, y):
    """Fit *oversampler* on ``(X, y)`` and return the resampled ``(X, y)`` pair."""
    # The separate `fit(...)` + `sample(...)` pair is not available in newer
    # imbalanced-learn releases. Prefer `fit_resample`, and fall back to the
    # older `fit_sample` spelling when running against an old release.
    resample = getattr(oversampler, "fit_resample", None)
    if resample is None:
        resample = oversampler.fit_sample
    return resample(X, y)


def execute(trainfile, sampler):
    """Over-sample the minority class of a training file and save the result.

    Loads *trainfile* through ``Data.Data().load``, separates the
    ``is_attributed`` label column from the remaining columns, over-samples
    with the requested strategy, casts everything to ``int`` and writes the
    result as an HDF table to ``trainfile + "." + sampler``.

    Args:
        trainfile: Path of the training file to load; also the prefix of the
            output file name.
        sampler: Name of the over-sampling strategy: "RANDOM", "ADASYN" or
            "SMOTE".

    Raises:
        ValueError: if *sampler* is not one of the supported names. This is
            checked before any data is loaded.
    """
    print("--- Executing")
    print("Using trainfile: ", trainfile)

    # Fail fast: previously an unknown sampler only printed a message and the
    # function crashed later while working on `None` placeholders.
    oversampler = _make_oversampler(sampler)

    print("--- Loading (transformed) data")
    train_df = Data.Data().load(trainfile)

    y = train_df["is_attributed"]
    X = train_df.drop(["is_attributed"], axis=1)
    columns = X.columns.values

    print("Original weights: ", _balanced_class_weights(y))

    X_resampled, y_resampled = _fit_resample(oversampler, X, y)

    print("Sampler: ", sampler, ", weights: ",
          _balanced_class_weights(y_resampled))

    X_resampled = X_resampled.astype(int)
    y_resampled = y_resampled.astype(int)

    df = pd.DataFrame(data=X_resampled, columns=columns)
    df["is_attributed"] = y_resampled

    outfilename = trainfile + "." + sampler
    print("Output file (over-sampled): ", outfilename)
    df.to_hdf(outfilename, key="table", mode="w", append=True,
              complevel=9, complib="blosc")
def __init__(self, data_pool, parameters, training):
    """Store the data pool and batching parameters, and build a class-balanced index list.

    Args:
        data_pool: Object exposing ``relative_destination`` and/or
            ``track_class`` attributes with a ``.values`` array (one entry
            per sample); presumably a pandas DataFrame - confirm with caller.
        parameters: Mapping read for the keys ``'batch_size'``,
            ``'input_mask'``, ``'observation_steps'`` and
            ``'ibeo_data_columns'``.
        training: Boolean flag stating whether the data is for training or
            test. During training the data is sampled from a pool; during
            test it is sampled sequentially and exhaustively.
    """
    self.data_pool = data_pool
    self.parameters = parameters
    self.batch_size = parameters['batch_size']
    self.training = training
    # For test-time (sequential, exhaustive) sampling:
    #  - a vector needs to be given stating whether the data is padding data
    #    at the end of the dataset;
    #  - a return state needs to be given to state if all test data is given.
    self.categorical = True
    self.d_thresh_range = None
    self.val_minibatch_idx = 0
    self.d_thresh = None
    self.reduced_pool = None
    self.distance_pool_cache = {}

    # One tiled copy of the input mask per batch element; every entry of the
    # Series deliberately carries the same index label 0.
    self.input_mask = pd.Series(
        [
            np.tile(self.parameters['input_mask'],
                    (self.parameters['observation_steps'], 1))
            for _ in range(self.batch_size)
        ],
        dtype=object,
        index=([0] * self.batch_size))

    # Generate balanced index list: over-sample row positions so that every
    # class of the selection column is equally represented.
    if 'relative' in self.parameters['ibeo_data_columns'][0]:
        selection_data = list(data_pool.relative_destination.values)
    else:
        selection_data = list(data_pool.track_class.values)

    indexed_classes = np.array(
        preprocessing.LabelEncoder().fit_transform(selection_data))
    # Column vector of row positions: the sampler expects a 2-D feature array.
    row_positions = np.arange(len(indexed_classes)).reshape(-1, 1)

    ros = RandomOverSampler()
    # The separate `fit(...)` + `sample(...)` pair is not available in newer
    # imbalanced-learn releases. Prefer `fit_resample`, and fall back to the
    # older `fit_sample` spelling when running against an old release.
    resample = getattr(ros, "fit_resample", None)
    if resample is None:
        resample = ros.fit_sample
    balanced_idxs, _balanced_classes = resample(row_positions, indexed_classes)
    self.balanced_idxs = np.squeeze(balanced_idxs)

    # Debug aid (kept from the original): class proportions after balancing.
    # bf = data_pool.iloc[balanced_idxs]
    # class_dict = {}
    # for class_t in data_pool.track_class.unique():
    #     class_dict[class_t] = len(bf[bf.track_class==class_t])/float(len(bf))
TerminateOnNaN(), ReduceLROnPlateau(verbose=1, patience=3) ] m = build_keras_embedding_classifier( embeddings=embeddings, #activation='elu', lr=args.learning_rate, depth=args.depth, hidden_size=args.hidden_size, #lr=2.5e-7, depth=5, hidden_size=20, decay=args.decay, dropout=args.dropout, recurrent_dropout=args.recurrent_dropout) print("Using random over-sample") rand_os = RandomOverSampler().fit(sequences, Y) os_sequences, os_Y = rand_os.sample(sequences, Y) print(sequences.shape) print(os_sequences.shape) hist = m.fit(os_sequences, os_Y, epochs=100, batch_size=128, validation_data=(test_sequences, Y_test), callbacks=cb) pred = m.predict(test_sequences).round().astype(int) metrics = binary_classification_metrics(Y_test, pred) print("Dev perf") print(metrics)
plot(hist_iphone_3v)

# NOTE(review): this binds a second name to the SAME object, so the mapping
# below also rewrites `galaxy_corr['galaxysentiment']`, which is over-sampled
# further down. Use `galaxy_corr.copy()` if that is not intended.
galaxy_cor_3v = galaxy_corr
galaxy_cor_3v['galaxysentiment'] = galaxy_cor_3v['galaxysentiment'].map(mapper)
galaxy_cor_3v['galaxysentiment'] = galaxy_cor_3v['galaxysentiment'].astype("category")
# Bare expressions: only useful for interactive inspection.
galaxy_cor_3v.dtypes
galaxy_cor_3v['galaxysentiment'].unique()

hist_galaxy_3v = px.histogram(galaxy_cor_3v, x="galaxysentiment")
plot(hist_galaxy_3v)

### Over sampling
# Random over sampler
ros = RandomOverSampler(random_state=0)
# The separate `fit(...)` + `sample(...)` pair is not available in newer
# imbalanced-learn releases. Prefer `fit_resample`, and fall back to the
# older `fit_sample` spelling when running against an old release. Each call
# refits the sampler on the data it is given.
_resample = getattr(ros, "fit_resample", None) or ros.fit_sample

# iPhone: the first 46 columns are used as features.
iphone_resampled, isent_resampled = _resample(
    iphone_corr.iloc[:, 0:46], iphone_corr['iphonesentiment'])
iphone_resampled_complete = pd.DataFrame(iphone_resampled)
iphone_resampled_complete['iphonesentiment'] = isent_resampled

hist_iphone_resampled = px.histogram(iphone_resampled_complete, x='iphonesentiment')
plot(hist_iphone_resampled)

# Galaxy: the first 45 columns are used as features.
galaxy_resampled, gsent_resampled = _resample(
    galaxy_corr.iloc[:, 0:45], galaxy_corr['galaxysentiment'])
galaxy_resampled_complete = pd.DataFrame(galaxy_resampled)
galaxy_resampled_complete['galaxysentiment'] = gsent_resampled

hist_galaxy_resampled = px.histogram(galaxy_resampled_complete, x='galaxysentiment')
plot(hist_galaxy_resampled)