def execute(trainfile, sampler):
    print("--- Executing")
    print("Using trainfile: ", trainfile)

    print("--- Loading (transformed) data")
    data = Data.Data()
    train_df = data.load(trainfile)
    y = train_df["is_attributed"]
    X = train_df.drop(["is_attributed"], axis=1)
    columns = X.columns.values

    before_class_weight = dict(
        zip([0, 1], compute_class_weight('balanced', [0, 1], y)))
    print("Original weights: ", before_class_weight)

    X_resampled = None
    y_resampled = None
    if sampler == "RANDOM":
        oversampler = RandomOverSampler(random_state=0)
        oversampler.fit(X, y)
        X_resampled, y_resampled = oversampler.sample(X, y)
    elif sampler == "ADASYN":
        oversampler = ADASYN(random_state=0)
        oversampler.fit(X, y)
        X_resampled, y_resampled = oversampler.sample(X, y)
    elif sampler == "SMOTE":
        oversampler = SMOTE(random_state=0)
        oversampler.fit(X, y)
        X_resampled, y_resampled = oversampler.sample(X, y)
    else:
        print("Invalid sampler: ", sampler)

    after_class_weight = dict(
        zip([0, 1], compute_class_weight('balanced', [0, 1], y_resampled)))
    print("Sampler: ", sampler, ", weights: ", after_class_weight)

    X_resampled = X_resampled.astype(int)
    y_resampled = y_resampled.astype(int)

    # print("X_resampled: ", X_resampled)
    # print("y_resampled: ", y_resampled)

    df = pd.DataFrame(data=X_resampled, columns=columns)
    df["is_attributed"] = y_resampled
    # df["is_attributed"] = df["is_attributed"].astype(int)

    compressor = "blosc"
    outfilename = trainfile + "." + sampler
    print("Output file (over-sampled): ", outfilename)
    df.to_hdf(outfilename, "table", mode="w", append=True,
              complevel=9, complib=compressor)

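# Note: the fit()/sample() pair above reflects an older imbalanced-learn API.
# A minimal sketch of the same resampling step with the current API, where the
# two calls are combined into fit_resample (same RandomOverSampler import assumed):
from imblearn.over_sampling import RandomOverSampler

oversampler = RandomOverSampler(random_state=0)
X_resampled, y_resampled = oversampler.fit_resample(X, y)
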
def classify(self):
    y_data = self.get_result(self.task.label)
    X_data = self.get_result(self.task.features)
    y = np.array(y_data.data).ravel()
    X = np.array(pd.get_dummies(X_data.data))
    #X = MinMaxScaler().fit_transform(X)
    X_train = X[:-TILE_SIZE]
    y_train = y[:-TILE_SIZE]
    X_test = X[-TILE_SIZE:]
    y_test = y[-TILE_SIZE:]

    cw = compute_class_weight('auto', np.array([0, 1]), y)
    cw = {0: cw[0], 1: cw[1]}
    b = get_classifier(self.task.classifier, cw)
    b.partial_fit(X_train, y_train, classes=np.array([0, 1]))

    y_prob = None
    y_pred = None
    if self.task.classifier in ['perceptron', 'svm']:
        y_pred = b.predict(X_test)
        y_prob = np.array([[0, y] for y in y_pred])
    else:
        y_prob = b.predict_proba(X_test)
        # threshold on the probability of the positive class (column 1)
        y_pred = [1 if t[1] >= 0.5 else 0 for t in y_prob]

    cm = confusion_matrix(y_test, y_pred)
    stats = classify_stats(cm, y_test, y_prob, TILE_SIZE)
    result = ClassifyResult(self.task, 1.0, b, stats)
    self.results[self.task.uuid] = result

def class_weight(labels):
    y_train = [np.array(x[1].split()).astype(np.int) for x in labels]
    y_count = []
    for y in y_train:
        y_count.extend(y)
    cw = compute_class_weight('balanced', np.arange(28), y_count)
    return cw

def __init__(self, pattern1train, pattern2train, pattern3train,
             pattern1test, pattern2test, pattern3test, y_train, y_test,
             batch_size=args.batch_size, lr=args.lr, epochs=args.epochs):
    input_dim = tf.keras.Input(shape=(1, 250), name='input')
    self.pattern1train = pattern1train
    self.pattern2train = pattern2train
    self.pattern3train = pattern3train
    self.pattern1test = pattern1test
    self.pattern2test = pattern2test
    self.pattern3test = pattern3test
    self.y_train = y_train
    self.y_test = y_test
    self.batch_size = batch_size
    self.epochs = epochs
    self.class_weight = compute_class_weight(class_weight='balanced',
                                             classes=[0, 1], y=y_train)

    pattern1vec = tf.keras.layers.Dense(250, activation='relu', name='outputpattern1vec')(input_dim)
    pattern2vec = tf.keras.layers.Dense(250, activation='relu', name='outputpattern2vec')(input_dim)
    pattern3vec = tf.keras.layers.Dense(250, activation='relu', name='outputpattern3vec')(input_dim)
    mergevec = tf.keras.layers.Concatenate(axis=1, name='mergevec')(
        [pattern1vec, pattern2vec, pattern3vec])  # concatenate patterns
    flattenvec = tf.keras.layers.Flatten(name='flattenvec')(mergevec)  # flatten pattern vectors into one vec
    finalmergevec = tf.keras.layers.Dense(100, activation='relu', name='outputmergevec')(flattenvec)
    prediction = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(finalmergevec)

    model = tf.keras.Model(inputs=[input_dim], outputs=[prediction])
    adama = tf.keras.optimizers.Adam(lr)
    loss = tf.keras.losses.binary_crossentropy
    model.compile(optimizer=adama, loss=loss, metrics=['accuracy'])
    model.summary()
    self.model = model
    self.finalmergevec = finalmergevec

def test_auto_weight():
    # Test class weights for imbalanced data
    from sklearn.linear_model import LogisticRegression
    # We take as dataset the two-dimensional projection of iris so
    # that it is not separable and remove half of predictors from
    # class 1.
    # We add one to the targets as a non-regression test:
    # class_weight="balanced" used to work only when the labels
    # were a range [0..K).
    from sklearn.utils import compute_class_weight
    X, y = iris.data[:, :2], iris.target + 1
    unbalanced = np.delete(np.arange(y.size), np.where(y > 2)[0][::2])

    classes = np.unique(y[unbalanced])
    class_weights = compute_class_weight('balanced', classes, y[unbalanced])
    assert_true(np.argmax(class_weights) == 2)

    for clf in (svm.SVC(kernel='linear'), svm.LinearSVC(random_state=0),
                LogisticRegression()):
        # check that score is better when class='balanced' is set.
        y_pred = clf.fit(X[unbalanced], y[unbalanced]).predict(X)
        clf.set_params(class_weight='balanced')
        y_pred_balanced = clf.fit(X[unbalanced], y[unbalanced],).predict(X)
        assert_true(metrics.f1_score(y, y_pred, average='weighted')
                    <= metrics.f1_score(y, y_pred_balanced, average='weighted'))

def classifier(X, Y, clusters):
    X_train = X.sample(frac=0.8)
    Y_train = Y.loc[X_train.index]
    X_val = X.drop(X_train.index)
    Y_val = Y.drop(X_train.index)

    c_w = compute_class_weight('balanced', np.unique(clusters), clusters)
    c_w = dict(enumerate(c_w))

    METRICS = [Recall(name='recall'), AUC(name='auc', multi_label=False)]
    es = EarlyStopping(monitor='weighted_recall', mode='max', verbose=0, patience=6)

    model = Sequential()
    model.add(Dense(32, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(Y_train.shape[1], activation='softmax'))

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=METRICS, weighted_metrics=METRICS)
    model.fit(X_train, Y_train, epochs=500, batch_size=128,
              validation_data=(X_val, Y_val), shuffle=False, verbose=1,
              callbacks=[es], class_weight=c_w)
    return model

def trainModel(self, train, train_labels):
    # Set start time
    e1 = cv2.getTickCount()

    print 'Training Keras DNN ...'
    print train.shape
    print train_labels.shape

    # distribute all the class the same way
    numeric_Y = np.dot(train_labels, range(0, self.numberOutput))
    classes = np.unique(numeric_Y)
    class_weight = compute_class_weight("balanced", classes, numeric_Y)
    print 'class weigtht =', class_weight

    early_stopping = EarlyStopping(monitor='acc', patience=5, verbose=0, mode='auto')
    self.model.fit(train, train_labels, nb_epoch=100, batch_size=32, shuffle=True,
                   class_weight=class_weight, callbacks=[early_stopping])
    #self.model.fit(train, train_labels,nb_epoch=50, batch_size=32,shuffle=True,class_weight=class_weight)

    # Set end time
    e2 = cv2.getTickCount()
    time = (e2 - e1) / cv2.getTickFrequency()
    print 'Training duration:', time

def weigh_and_show(dataframe):
    from sklearn.utils import compute_class_weight, compute_sample_weight
    x = compute_class_weight("balanced", sorted(dataframe.class_id.unique()), dataframe.class_id)

    # show class weight
    for i, proposed_weights in enumerate(x):
        # print(f"{df.class_name.unique()[i]}: {x[i]}")
        print('{:<25s}: {:<}'.format(disease[i], x[i]))

    # build dataframe with list of class_weights, class_name, sum of class names
    # and product of class weights and sum of individual classes
    temp_weights = list(x)
    # temp_sample_weights = list(x2)
    temp_class = list(sorted(dataframe.class_id.unique()))
    temp_class_name = disease
    temp_sum = list(dataframe.class_id.value_counts()[temp_class])
    temp_weight_products = [temp_weights[i] * temp_sum[i] for i in range(len(temp_weights))]

    # array check
    print(f"temp_weights: {len(temp_weights)}")
    # print(f"sample_weights: {len(temp_sample_weights)}")
    print(f"class_name: {len(df.class_name.unique())}")
    print(f"sum_of_class_names: {len(temp_sum)}")

    # build dataframe
    temp_dataframe = pd.DataFrame({'class_weights': temp_weights,
                                   # 'sample_weights': temp_sample_weights,
                                   'class_name': temp_class,
                                   'sum_of_class_names': temp_sum,
                                   'weight_products': temp_weight_products,})
    return temp_dataframe

def make_predictions(x_train, x_test, y_train, y_test):
    poly = PolynomialFeatures(interaction_only=False, include_bias=False)
    x_train = poly.fit_transform(x_train)

    # Let's target class imbalance problem.
    class_weights = compute_class_weight('balanced', np.unique(y_train), y_train)
    class_weights_dict = {
        np.unique(y_train)[i]: w
        for i, w in enumerate(class_weights)
    }
    print(class_weights_dict)

    clf = LogisticRegression(penalty='l2',
                             random_state=222,
                             solver='newton-cg',
                             C=999999,
                             class_weight=class_weights_dict,
                             multi_class='multinomial').fit(x_train, y_train)

    IDs = x_test.index
    x_test = poly.fit_transform(x_test)
    y_pred = clf.predict(x_test)
    print('Accuracy of logistic regression classifier on test set: {:.2f}'.
          format(clf.score(x_test, y_test)))
    confusion_matrix_ = confusion_matrix(y_test, y_pred)
    print('Confusion matrix:')
    print(confusion_matrix_)

    testres = pd.DataFrame(np.column_stack([IDs.values, y_pred]))
    testres.columns = ['ID', 'conicSection']
    return (testres)

def run(self):
    # Define generators
    data = EMNIST('../data/emnist/')
    train = SimnetGenerator(data.get_training_batch, data.get_sizes()[0])
    val = SimnetGenerator(data.get_validation_batch, data.get_sizes()[1])

    # class weights
    class_weights = compute_class_weight('balanced', np.unique(data._test_labels), data._test_labels)

    with tf.Session() as sess:
        # Define models
        simnet = Simnet()

        # Do fitting
        train_history, val_history = simnet.fit(sess, train, val, 100, 1024, 100)

        acc, avg_acc, weighted_acc = simnet.evaluate_special(
            sess, data.get_test_batch, 1024, data.get_classification_samples,
            data.get_sizes()[2], class_weights=class_weights)

        print("Test ACC: ", acc, " TEST AVG ACC: ", avg_acc, "Weightes AVG ACC:", weighted_acc)

def __init__(self, x_train, y_train, x_test, y_test, batch_size, epochs, dropout, lr, name):
    self.x_train = x_train
    self.x_test = x_test
    self.y_train = y_train
    self.y_test = y_test
    self.name = name
    self.lr = lr
    self.batch_size = batch_size
    self.epochs = epochs
    self.class_weight = compute_class_weight(class_weight='balanced',
                                             classes=np.arange(10),
                                             y=y_train.argmax(axis=1))

    model = Sequential()
    model.add(Bidirectional(LSTM(300, return_sequences=True),
                            input_shape=(n_steps, dim_input)))
    model.add(AttentionWithContext())
    model.add(Addition())
    model.add(Dense(300))
    model.add(LeakyReLU())
    model.add(Dropout(dropout))
    model.add(Dense(300))
    model.add(LeakyReLU())
    model.add(Dropout(dropout))
    model.add(Dense(10, activation='softmax'))
    # Lower learning rate to prevent divergence
    adamax = Adamax(self.lr)
    model.compile(adamax, 'categorical_crossentropy', metrics=['accuracy'])
    self.model = model

def test_multiclass_classifier_class_weight():
    """tests multiclass with classweights for each class"""
    alpha = .1
    n_samples = 20
    tol = .00001
    max_iter = 50
    class_weight = {0: .45, 1: .55, 2: .75}
    fit_intercept = True
    X, y = make_blobs(n_samples=n_samples, centers=3, random_state=0,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
    classes = np.unique(y)

    clf1 = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                              max_iter=max_iter, tol=tol, random_state=77,
                              fit_intercept=fit_intercept,
                              class_weight=class_weight)
    clf2 = clone(clf1)
    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    le = LabelEncoder()
    class_weight_ = compute_class_weight(class_weight, np.unique(y), y)
    sample_weight = class_weight_[le.fit_transform(y)]

    coef1 = []
    intercept1 = []
    coef2 = []
    intercept2 = []
    for cl in classes:
        y_encoded = np.ones(n_samples)
        y_encoded[y != cl] = -1

        spweights1, spintercept1 = sag_sparse(X, y_encoded, step_size, alpha,
                                              n_iter=max_iter, dloss=log_dloss,
                                              sample_weight=sample_weight)
        spweights2, spintercept2 = sag_sparse(X, y_encoded, step_size, alpha,
                                              n_iter=max_iter, dloss=log_dloss,
                                              sample_weight=sample_weight,
                                              sparse=True)
        coef1.append(spweights1)
        intercept1.append(spintercept1)
        coef2.append(spweights2)
        intercept2.append(spintercept2)

    coef1 = np.vstack(coef1)
    intercept1 = np.array(intercept1)
    coef2 = np.vstack(coef2)
    intercept2 = np.array(intercept2)

    for i, cl in enumerate(classes):
        assert_array_almost_equal(clf1.coef_[i].ravel(),
                                  coef1[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf1.intercept_[i], intercept1[i], decimal=1)

        assert_array_almost_equal(clf2.coef_[i].ravel(),
                                  coef2[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf2.intercept_[i], intercept2[i], decimal=1)

def __init__(self, data, name="", batch_size=args.batch_size, lr=args.lr, epochs=args.epochs, dropout=args.dropout): vectors = np.stack(data.iloc[:, 1].values) labels = data.iloc[:, 0].values positive_idxs = np.where(labels == 1)[0] negative_idxs = np.where(labels == 0)[0] undersampled_negative_idxs = np.random.choice(negative_idxs, len(positive_idxs), replace=False) resampled_idxs = np.concatenate([positive_idxs, undersampled_negative_idxs]) x_train, x_test, y_train, y_test = train_test_split(vectors[resampled_idxs], labels[resampled_idxs], test_size=0.2, stratify=labels[resampled_idxs]) self.x_train = x_train self.x_test = x_test self.y_train = to_categorical(y_train) self.y_test = to_categorical(y_test) self.name = name self.batch_size = batch_size self.epochs = epochs self.class_weight = compute_class_weight(class_weight='balanced', classes=[0, 1], y=labels) model = Sequential() model.add(LSTM(300, input_shape=(vectors.shape[1], vectors.shape[2]))) model.add(Dense(300)) model.add(LeakyReLU()) model.add(Dropout(dropout)) model.add(Dense(300)) model.add(LeakyReLU()) model.add(Dropout(dropout)) model.add(Dense(2, activation='softmax')) # Lower learning rate to prevent divergence adamax = Adamax(lr) model.compile(adamax, 'categorical_crossentropy', metrics=['accuracy']) self.model = model
def initialize_labels(self, Y):
    y_nodes_flat = [y_val for y in Y for y_val in y.nodes]
    y_links_flat = [y_val for y in Y for y_val in y.links]
    self.prop_encoder_ = LabelEncoder().fit(y_nodes_flat)
    self.link_encoder_ = LabelEncoder().fit(y_links_flat)

    self.n_prop_states = len(self.prop_encoder_.classes_)
    self.n_link_states = len(self.link_encoder_.classes_)

    self.prop_cw_ = np.ones_like(self.prop_encoder_.classes_, dtype=np.double)
    self.link_cw_ = compute_class_weight(self.class_weight,
                                         self.link_encoder_.classes_,
                                         y_links_flat)

    self.link_cw_ /= self.link_cw_.min()

    logging.info('Setting node class weights {}'.format(", ".join(
        "{}: {}".format(lbl, cw) for lbl, cw in zip(self.prop_encoder_.classes_,
                                                    self.prop_cw_))))

    logging.info('Setting link class weights {}'.format(", ".join(
        "{}: {}".format(lbl, cw) for lbl, cw in zip(self.link_encoder_.classes_,
                                                    self.link_cw_))))

def test_auto_weight():
    # Test class weights for imbalanced data
    from sklearn.linear_model import LogisticRegression
    # We take as dataset the two-dimensional projection of iris so
    # that it is not separable and remove half of predictors from
    # class 1.
    # We add one to the targets as a non-regression test:
    # class_weight="balanced" used to work only when the labels
    # were a range [0..K).
    from sklearn.utils import compute_class_weight
    X, y = iris.data[:, :2], iris.target + 1
    unbalanced = np.delete(np.arange(y.size), np.where(y > 2)[0][::2])

    classes = np.unique(y[unbalanced])
    class_weights = compute_class_weight('balanced', classes=classes,
                                         y=y[unbalanced])
    assert np.argmax(class_weights) == 2

    for clf in (svm.SVC(kernel='linear'), svm.LinearSVC(random_state=0),
                LogisticRegression()):
        # check that score is better when class='balanced' is set.
        y_pred = clf.fit(X[unbalanced], y[unbalanced]).predict(X)
        clf.set_params(class_weight='balanced')
        y_pred_balanced = clf.fit(X[unbalanced], y[unbalanced],).predict(X)
        assert (metrics.f1_score(y, y_pred, average='macro')
                <= metrics.f1_score(y, y_pred_balanced, average='macro'))

def classifier_training_example(mels_path: str) -> None:
    batch_size = 64
    epochs = 50
    model_name = 'class_cnn3'
    patience = 10
    val_split = 0.25

    classifier = build_cnn3_classifier()
    classifier.compile(optimizer='adam',
                       loss='categorical_crossentropy',
                       metrics=['acc'])
    classifier.summary()

    mels_npz = np.load(mels_path)
    samples_x = mels_npz['samples_x']
    samples_y = mels_npz['samples_y']
    log.info(f'Training samples x shape = {samples_x.shape}')
    log.info(f'Training samples y shape = {samples_y.shape}')

    class_weights = compute_class_weight('balanced',
                                         classes=np.unique(np.argmax(samples_y, axis=-1)),
                                         y=np.argmax(samples_y, axis=-1))
    log.info(f'Training samples class weights = {class_weights}')
    class_weights = {idx: w for idx, w in enumerate(class_weights)}

    train_drum_classifier(classifier,
                          x=samples_x,
                          y=samples_y,
                          model_name=model_name,
                          batch_size=batch_size,
                          epochs=epochs,
                          val_split=val_split,
                          patience=patience,
                          class_weights=class_weights)

def test_auto_weight():
    """Test class weights for imbalanced data"""
    from sklearn.linear_model import LogisticRegression
    # we take as dataset the two-dimensional projection of iris so
    # that it is not separable and remove half of predictors from
    # class 1
    from sklearn.utils import compute_class_weight
    X, y = iris.data[:, :2], iris.target
    unbalanced = np.delete(np.arange(y.size), np.where(y > 1)[0][::2])

    classes = np.unique(y[unbalanced])
    class_weights = compute_class_weight('auto', classes, y[unbalanced])
    assert_true(np.argmax(class_weights) == 2)

    for clf in (svm.SVC(kernel='linear'), svm.LinearSVC(random_state=0),
                LogisticRegression()):
        # check that score is better when class='auto' is set.
        y_pred = clf.fit(X[unbalanced], y[unbalanced]).predict(X)
        clf.set_params(class_weight='auto')
        y_pred_balanced = clf.fit(X[unbalanced], y[unbalanced],).predict(X)
        assert_true(metrics.f1_score(y, y_pred)
                    <= metrics.f1_score(y, y_pred_balanced))

def create_criterion(device, weight_with=None):
    """
    Creates a `torch.nn.BCEWithLogitsLoss`. If weight_with is not None,
    uses class weight for the positive class

    Arguments:
    ----------
    device: "cuda" or "cpu"
    weight_with: data.Dataset
    """
    if weight_with:
        y = [row.subtask_a for row in weight_with]
        class_weights = compute_class_weight('balanced', ['NOT', 'OFF'], y)
        # normalize it
        class_weights = class_weights / class_weights[0]
        criterion = nn.BCEWithLogitsLoss(pos_weight=torch.Tensor([class_weights[1]]))
    else:
        criterion = nn.BCEWithLogitsLoss()
    criterion = criterion.to(device)
    return criterion

def get_synthetic_data():
    x, y = make_classification(n_samples=10000, n_features=200, n_informative=200,
                               n_redundant=0, n_repeated=0, n_classes=3,
                               n_clusters_per_class=2, weights=None, flip_y=0.02,
                               class_sep=0.4, hypercube=True, shift=0.0, scale=1.0,
                               shuffle=True, random_state=2)
    log("class weights", compute_class_weight('balanced', np.unique(y), y))

    encoder = LabelEncoder()
    encoder.fit(y)
    encoded_y = encoder.transform(y)
    y = np_utils.to_categorical(encoded_y)

    mm_scaler = MinMaxScaler()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    x_train = mm_scaler.fit_transform(x_train)
    x_test = mm_scaler.transform(x_test)

    return x_train, y_train, x_test, y_test

def fit(self, dataset):
    """Fits the intent classifier with a valid Snips dataset

    Returns:
        :class:`LogRegIntentClassifier`: The same instance, trained
    """
    from sklearn.linear_model import SGDClassifier
    from sklearn.utils import compute_class_weight

    logger.info("Fitting LogRegIntentClassifier...")
    dataset = validate_and_format_dataset(dataset)
    self.load_resources_if_needed(dataset[LANGUAGE])
    self.fit_builtin_entity_parser_if_needed(dataset)
    self.fit_custom_entity_parser_if_needed(dataset)
    language = dataset[LANGUAGE]

    data_augmentation_config = self.config.data_augmentation_config
    utterances, classes, intent_list = build_training_data(
        dataset, language, data_augmentation_config, self.resources,
        self.random_state)

    self.intent_list = intent_list
    if len(self.intent_list) <= 1:
        return self

    self.featurizer = Featurizer(
        config=self.config.featurizer_config,
        builtin_entity_parser=self.builtin_entity_parser,
        custom_entity_parser=self.custom_entity_parser,
        resources=self.resources,
        random_state=self.random_state,
    )
    self.featurizer.language = language

    none_class = max(classes)
    try:
        x = self.featurizer.fit_transform(dataset, utterances, classes, none_class)
    except _EmptyDatasetUtterancesError:
        logger.warning("No (non-empty) utterances found in dataset")
        self.featurizer = None
        return self

    alpha = get_regularization_factor(dataset)
    class_weights_arr = compute_class_weight("balanced", range(none_class + 1),
                                             classes)
    # Re-weight the noise class
    class_weights_arr[-1] *= self.config.noise_reweight_factor
    class_weight = {idx: w for idx, w in enumerate(class_weights_arr)}
    self.classifier = SGDClassifier(random_state=self.random_state,
                                    alpha=alpha, class_weight=class_weight,
                                    **LOG_REG_ARGS)
    self.classifier.fit(x, classes)
    logger.debug("%s", DifferedLoggingMessage(self.log_best_features))
    return self

def test_binary_classifier_class_weight():
    """tests binary classifier with classweights for each class"""
    alpha = .1
    n_samples = 50
    n_iter = 20
    tol = .00001
    fit_intercept = True
    X, y = make_blobs(n_samples=n_samples, centers=2, random_state=10,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
    classes = np.unique(y)
    y_tmp = np.ones(n_samples)
    y_tmp[y != classes[1]] = -1
    y = y_tmp

    class_weight = {1: .45, -1: .55}
    clf1 = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                              max_iter=n_iter, tol=tol, random_state=77,
                              fit_intercept=fit_intercept, multi_class='ovr',
                              class_weight=class_weight)
    clf2 = clone(clf1)
    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    le = LabelEncoder()
    class_weight_ = compute_class_weight(class_weight, np.unique(y), y)
    sample_weight = class_weight_[le.fit_transform(y)]
    spweights, spintercept = sag_sparse(X, y, step_size, alpha,
                                        n_iter=n_iter,
                                        dloss=log_dloss,
                                        sample_weight=sample_weight,
                                        fit_intercept=fit_intercept)
    spweights2, spintercept2 = sag_sparse(X, y, step_size, alpha,
                                          n_iter=n_iter,
                                          dloss=log_dloss, sparse=True,
                                          sample_weight=sample_weight,
                                          fit_intercept=fit_intercept)

    assert_array_almost_equal(clf1.coef_.ravel(),
                              spweights.ravel(),
                              decimal=2)
    assert_almost_equal(clf1.intercept_, spintercept, decimal=1)

    assert_array_almost_equal(clf2.coef_.ravel(),
                              spweights2.ravel(),
                              decimal=2)
    assert_almost_equal(clf2.intercept_, spintercept2, decimal=1)

def pre_init(self, data: LoadedKata):
    # initialize the array with the labels to avoid zero division
    observed_labels = copy.deepcopy(self.labels)
    for x in data.data_loader:
        observed_labels += [self.labels[label_id] for label_id in x[self.label_name]]

    weights = compute_class_weight("balanced", classes=np.asarray(self.labels),
                                   y=observed_labels)
    self.register_buffer("balance", torch.tensor(weights.astype(np.float32)))

def get_sample_weights(self):
    labels = self.get_dataframe()[self.get_label_column()]
    categories = labels.cat.categories
    class_weights = compute_class_weight('balanced', classes=categories, y=labels)
    sample_weights = class_weights[labels.cat.codes]
    return sample_weights

def __init__(self, tree_model,
             X_train,
             y_train,
             feature_names: List[str],
             class_names: (List[str], Mapping[int, str]) = None):
    self.tree_model = tree_model
    self.feature_names = feature_names
    self.class_names = class_names
    self.class_weight = tree_model.class_weight

    if getattr(tree_model, 'tree_') is None:  # make sure model is fit
        tree_model.fit(X_train, y_train)

    if tree_model.tree_.n_classes > 1:
        if isinstance(self.class_names, dict):
            self.class_names = self.class_names
        elif isinstance(self.class_names, Sequence):
            self.class_names = {i: n for i, n in enumerate(self.class_names)}
        else:
            raise Exception(f"class_names must be dict or sequence, not {self.class_names.__class__.__name__}")

    if isinstance(X_train, pd.DataFrame):
        X_train = X_train.values
    self.X_train = X_train
    if isinstance(y_train, pd.Series):
        y_train = y_train.values
    self.y_train = y_train
    self.node_to_samples = ShadowDecTree.node_samples(tree_model, X_train)
    if self.isclassifier():
        self.unique_target_values = np.unique(y_train)
        self.class_weights = compute_class_weight(tree_model.class_weight,
                                                  self.unique_target_values,
                                                  self.y_train)

    tree = tree_model.tree_
    children_left = tree.children_left
    children_right = tree.children_right

    # use locals not args to walk() for recursion speed in python
    leaves = []
    internal = []  # non-leaf nodes

    def walk(node_id):
        if (children_left[node_id] == -1 and children_right[node_id] == -1):  # leaf
            t = ShadowDecTreeNode(self, node_id)
            leaves.append(t)
            return t
        else:  # decision node
            left = walk(children_left[node_id])
            right = walk(children_right[node_id])
            t = ShadowDecTreeNode(self, node_id, left, right)
            internal.append(t)
            return t

    root_node_id = 0
    # record root to actual shadow nodes
    self.root = walk(root_node_id)
    self.leaves = leaves
    self.internal = internal

def calculate_class_weights(self, labels, occured_labels):
    from sklearn.utils import compute_class_weight
    label_indexes = [labels.index(i) for i in occured_labels]
    class_weight_list = compute_class_weight('balanced',
                                             np.unique(label_indexes),
                                             label_indexes)
    class_weight = dict(zip(np.unique(label_indexes), class_weight_list))
    return class_weight

def get_class_weight(y_true):
    classes = np.arange(y_true.shape[-1])
    class_counts = y_true.sum(0).astype(np.int64)

    pos = 0
    y_weight = np.zeros(int(class_counts.sum()))
    for i, count in enumerate(class_counts):
        y_weight[pos:pos + count] = i
        pos += count

    weights = np.sqrt(compute_class_weight("balanced", classes, y_weight))
    return {i: weight for i, weight in enumerate(weights)}

def cw_to_dict(y_class):
    """
    input: 1D array, labels
    output: balanced class weight dictionary
    """
    cw = utils.compute_class_weight('balanced', [0, 1], y_class)  # compute class weight
    cw_dict = {}
    for idx in range(len(cw)):
        cw_dict[idx] = cw[idx]
    return cw_dict

def getData():
    X = []
    Y = []
    void_label = -1
    train_path = '/content/input'
    label_path = '/content/highway200'
    train_files = sorted(os.listdir(train_path))
    label_files = sorted(os.listdir(label_path))
    for i in range(len(train_files)):
        img = load_img(os.path.join(train_path, train_files[i]))
        img = img_to_array(img)
        X.append(img)

        img = load_img(os.path.join(label_path, label_files[i]), color_mode='grayscale')
        img = img_to_array(img)
        shape = img.shape
        img /= 255.0
        img = img.reshape(-1)
        idx = np.where(np.logical_and(img > 0.25, img < 0.8))[0]  # find non-ROI
        if len(idx) > 0:
            img[idx] = -1
        img = img.reshape(shape)
        img = np.floor(img)
        Y.append(img)

    X = np.asarray(X)
    Y = np.asarray(Y)

    idx = list(range(X.shape[0]))
    np.random.shuffle(idx)
    np.random.shuffle(idx)
    X = X[idx]
    Y = Y[idx]

    cls_weight_list = []
    for i in range(Y.shape[0]):
        y = Y[i].reshape(-1)
        idx = np.where(y != void_label)[0]
        if (len(idx) > 0):
            y = y[idx]
        lb = np.unique(y)  # 0., 1
        cls_weight = compute_class_weight('balanced', lb, y)
        class_0 = cls_weight[0]
        class_1 = cls_weight[1] if len(lb) > 1 else 1.0
        cls_weight_dict = {0: class_0, 1: class_1}
        cls_weight_list.append(cls_weight_dict)
    cls_weight_list = np.asarray(cls_weight_list)

    return [X, Y, cls_weight_list]

def get_class_weight(y_output):
    #news = pd.read_csv(file,encoding='utf-8',header=None)
    #tags = news.ix[:,1:].as_matrix()
    class_weight = []
    tags_num = y_output.shape[1]
    for i in range(tags_num):
        sample_weight = compute_class_weight('balanced', [0, 1], y_output[:, i])
        class_weight.append(sample_weight)
    class_weight = np.vstack(class_weight).T
    return class_weight

def _get_class_weights(loader: DataLoader) -> np.ndarray:
    """
    compute class weights
    :param loader:
    :return:
    """
    labels = loader.dataset.metadata["dx"].cat.codes  # type: ignore
    classes = np.unique(labels)
    return compute_class_weight(class_weight="balanced", classes=classes, y=labels)

def train():
    #X_lst, y_lst = load_dataset_folder(DATASET_DIR)
    X_lst, y_lst = load(DATASET_DIR)
    X, y = stack_data(X_lst, y_lst, onehot=True, num_classes=NUM_CLASSES)
    normalized_X = normalize(X, axis=1, norm='l2')
    print(X.shape, y.shape)
    logging.debug(f'X shape: {X.shape}\ty shape: {y.shape}')

    X_train, X_test, y_train, y_test = train_test_split(normalized_X, y,
                                                        test_size=0.33,
                                                        random_state=42)

    # Convert to tf.data.Dataset
    train_data = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    test_data = tf.data.Dataset.from_tensor_slices((X_test, y_test))
    train_data = train_data.batch(BATCH_SIZE).cache().repeat()
    test_data = test_data.batch(BATCH_SIZE)

    model = generate_model_01(BATCH_SIZE, num_classes=NUM_CLASSES)

    # Compute class weights
    class_weights = compute_class_weight('balanced',
                                         np.unique(np.argmax(y_train, axis=1)),
                                         np.argmax(y_train, axis=1))

    # TensorBoard callback
    log_dir = SAVE_LOGS_DIR + 'fit/' + datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    cb_tb = tf.keras.callbacks.TensorBoard(log_dir=log_dir)
    cb_es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=200,
                                             restore_best_weights=True, verbose=True)

    training_history = model.fit(train_data,
                                 epochs=EPOCHS,
                                 validation_data=test_data,
                                 steps_per_epoch=15,
                                 class_weight={
                                     0: class_weights[0],
                                     1: class_weights[1],
                                     2: class_weights[2]
                                 },
                                 callbacks=[cb_tb, cb_es])

    # Store training history
    np.savetxt(SAVE_LOGS_DIR + 'acc.txt', training_history.history['acc'])
    np.savetxt(SAVE_LOGS_DIR + 'val_acc.txt', training_history.history['val_acc'])
    np.savetxt(SAVE_LOGS_DIR + 'loss.txt', training_history.history['loss'])
    np.savetxt(SAVE_LOGS_DIR + 'val_loss.txt', training_history.history['val_loss'])

    model.save_weights(SAVE_LOGS_DIR + 'weights.h5')

def load_training_data():
    raw_training_data = pd.read_csv('train.csv')

    # convert types to ints
    raw_training_data['target'] = raw_training_data['target'].apply(class_to_int)
    raw_training_data = raw_training_data.astype('float32')
    raw_training_data['target'] = raw_training_data['target'].astype('int32')
    raw_training_data = raw_training_data.iloc[np.random.permutation(len(raw_training_data))]  # shuffle data

    # Get the features and the classes
    features = np.log(raw_training_data.iloc[:, 1:94] + 1).values  # apply log function
    classes = raw_training_data['target'].values
    print np.unique(classes)

    # split train/validate
    feat_train, feat_test, class_train, class_test = cross_validation.train_test_split(
        features, classes, test_size=0.3, random_state=1232)
    feat_train, feat_val, class_train, class_val = cross_validation.train_test_split(
        feat_train, class_train, test_size=0.3, random_state=1232)

    # scale the features
    std_scale = preprocessing.StandardScaler().fit(feat_train)
    feat_train = std_scale.transform(feat_train)
    feat_val = std_scale.transform(feat_val)
    feat_test = std_scale.transform(feat_test)

    # class weights
    weights = compute_class_weight('auto', np.unique(classes), class_train)
    weights = weights.astype('float32')
    print weights

    train_weights = []
    val_weights = []
    for i in class_train:
        train_weights.append(weights[i])
    for i in list(class_val):
        val_weights.append(weights[i])

    # convert to np array for theanets
    training_data = [feat_train, class_train, np.array(train_weights)]
    validation_data = [feat_val, class_val, np.array(val_weights)]
    test_data = [feat_test, class_test]

    return training_data, validation_data, test_data, std_scale

def fit(self, X, y):
    from sklearn.preprocessing import LabelEncoder
    from sklearn.utils import compute_class_weight

    label_encoder = LabelEncoder().fit(y)
    classes = label_encoder.classes_
    class_weight = compute_class_weight(self.class_weight, classes, y)

    # Intentionally modify the balanced class_weight
    # to simulate a bug and raise an exception
    if self.class_weight == "balanced":
        class_weight += 1.

    # Simply assigning coef_ to the class_weight
    self.coef_ = class_weight
    return self

def test_binary_classifier_class_weight():
    """tests binary classifier with classweights for each class"""
    alpha = .1
    n_samples = 50
    n_iter = 20
    tol = .00001
    fit_intercept = True
    X, y = make_blobs(n_samples=n_samples, centers=2, random_state=10,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
    classes = np.unique(y)
    y_tmp = np.ones(n_samples)
    y_tmp[y != classes[1]] = -1
    y = y_tmp

    class_weight = {1: .45, -1: .55}
    clf1 = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                              max_iter=n_iter, tol=tol, random_state=77,
                              fit_intercept=fit_intercept,
                              class_weight=class_weight)
    clf2 = clone(clf1)
    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    le = LabelEncoder()
    class_weight_ = compute_class_weight(class_weight, np.unique(y), y)
    sample_weight = class_weight_[le.fit_transform(y)]
    spweights, spintercept = sag_sparse(X, y, step_size, alpha,
                                        n_iter=n_iter,
                                        dloss=log_dloss,
                                        sample_weight=sample_weight,
                                        fit_intercept=fit_intercept)
    spweights2, spintercept2 = sag_sparse(X, y, step_size, alpha,
                                          n_iter=n_iter,
                                          dloss=log_dloss, sparse=True,
                                          sample_weight=sample_weight,
                                          fit_intercept=fit_intercept)

    assert_array_almost_equal(clf1.coef_.ravel(),
                              spweights.ravel(),
                              decimal=2)
    assert_almost_equal(clf1.intercept_, spintercept, decimal=1)

    assert_array_almost_equal(clf2.coef_.ravel(),
                              spweights2.ravel(),
                              decimal=2)
    assert_almost_equal(clf2.intercept_, spintercept2, decimal=1)

def classify(self):
    y_data = self.results[self.task.label].data
    X_data = self.results[self.task.features].data
    y = np.array(y_data)
    X = np.array(X_data)

    cw = compute_class_weight('auto', np.array([0, 1]), y)
    cw = {0: cw[0], 1: cw[1]}
    print cw
    b = get_classifier(self.task.classifier, cw)

    tile_size = 1000
    num_tiles = y.size / tile_size
    for i in range(num_tiles):
        pos = i * tile_size
        X_sub = X[pos:pos + tile_size]
        y_sub = y[pos:pos + tile_size]

        y_prob = None
        y_pred = None
        if self.task.classifier == 'svm':
            y_pred = b.predict(X_sub)
            y_prob = np.array([[0, y] for y in y_pred])
        else:
            y_prob = b.predict_proba(X_sub)
            y_pred = [1 if y[1] >= 0.5 else 0 for y in y_prob]

        cm = confusion_matrix(y_sub, y_pred)
        # score the current tile against its own labels
        stats = classify_stats(cm, y_sub, y_prob)
        y_pred = pd.DataFrame(y_pred, columns=y_data.columns)
        result = ClassifyResult(self.task, 1.0, b, stats)
        self.results[self.task.uuid] = result

        b.partial_fit(X_sub, y_sub)

def _compute_class_weight_dictionary(y):
    # helper for returning a dictionary instead of an array
    classes = np.unique(y)
    class_weight = compute_class_weight("balanced", classes, y)
    class_weight_dict = dict(zip(classes, class_weight))
    return class_weight_dict

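# A quick usage sketch of the helper above with made-up labels, assuming a
# scikit-learn version that still accepts the positional call used in it.
# The 'balanced' heuristic weights each class by n_samples / (n_classes * class_count).
import numpy as np

y = np.array([0, 0, 0, 0, 0, 0, 1, 1])      # six samples of class 0, two of class 1
print(_compute_class_weight_dictionary(y))  # {0: 8 / (2 * 6) ~= 0.67, 1: 8 / (2 * 2) = 2.0}
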
# path to image folder
base_path = os.path.join(base_path, caltech101.config.tar_inner_dirname)

# X_test contain only paths to images
(X_test, y_test) = util.load_paths_from_files(base_path, 'X_test.txt', 'y_test.txt')

for cv_fold in [0]:  # on which cross val folds to run; cant loop over several folds due to some bug
    print("fold {}".format(cv_fold))

    experiment_name = '_bn_triangular_cv{}_e{}'.format(cv_fold, nb_epoch)

    # load cross val split
    (X_train, y_train), (X_val, y_val) = util.load_cv_split_paths(base_path, cv_fold)

    # compute class weights, since classes are highly imbalanced
    class_weight = compute_class_weight('auto', range(nb_classes), y_train)

    if normalize_data:
        print("Load mean and std...")
        X_mean, X_std = util.load_cv_stats(base_path, cv_fold)
        normalize_data = (X_mean, X_std)

    nb_train_sample = X_train.shape[0]
    nb_val_sample = X_val.shape[0]
    nb_test_sample = X_test.shape[0]

    print('X_train shape:', X_train.shape)
    print(nb_train_sample, 'train samples')
    if X_val is not None:
        print(nb_val_sample, 'validation samples')
    print(nb_test_sample, 'test samples')

from sklearn.svm import LinearSVC
from sklearn.metrics import average_precision_score
from sklearn.utils import compute_class_weight
import numpy as np
import logging
logging.basicConfig(level=logging.DEBUG)

iris = load_iris()
X = iris.data
y = iris.target
y[y != 1] = -1
y[y == 1] = 1

weights = compute_class_weight("auto", np.unique(y), y)
sample_weight = np.zeros(y.shape, dtype=np.float)
sample_weight[y == 1] = weights[0]
sample_weight[y == -1] = weights[1]

# n_iter = int(1e6 / X.shape[0])
vw_clf = VWClassifier(quiet=False, loss_function="hinge", passes=500)
vw_clf.fit(X, y.astype(np.double), sample_weight)
scores = vw_clf.decision_function(X)
print "VW AP: %.3f" % average_precision_score(y, scores)

vw_clf.set_params(l2=0.1)
vw_clf.fit(X, y.astype(np.double), sample_weight)
print "VW AP: %.3f" % average_precision_score(y, scores)
# vw_clf.fit(X, y.astype(np.double), sample_weight)

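# Several snippets above use the legacy positional call and the old 'auto' mode.
# For reference, a minimal sketch of the current scikit-learn form: 'auto' has been
# replaced by 'balanced', and recent releases expect classes/y as keyword arguments.
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y = np.array([0, 0, 0, 1, 0, 0, 1, 0])  # hypothetical labels with a 3:1 imbalance
weights = compute_class_weight(class_weight="balanced", classes=np.unique(y), y=y)
print(dict(zip(np.unique(y), weights)))  # the minority class receives the larger weight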