def execute(trainfile, sampler):

    print("--- Executing")
    print("Using trainfile:  ", trainfile)

    print("--- Loading (transformed) data")
    data = Data.Data()
    train_df = data.load(trainfile)
    y = train_df["is_attributed"]
    X = train_df.drop(["is_attributed"], axis=1)
    columns = X.columns.values

    before_class_weight = dict(
        zip([0, 1], compute_class_weight('balanced', classes=[0, 1], y=y)))
    print("Original weights: ", before_class_weight)

    if sampler == "RANDOM":
        oversampler = RandomOverSampler(random_state=0)
    elif sampler == "ADASYN":
        oversampler = ADASYN(random_state=0)
    elif sampler == "SMOTE":
        oversampler = SMOTE(random_state=0)
    else:
        print("Invalid sampler: ", sampler)
        return

    # imbalanced-learn replaced the fit()/sample() pair with a single fit_resample()
    X_resampled, y_resampled = oversampler.fit_resample(X, y)

    after_class_weight = dict(
        zip([0, 1], compute_class_weight('balanced', classes=[0, 1], y=y_resampled)))
    print("Sampler: ", sampler, ", weights: ", after_class_weight)

    X_resampled = X_resampled.astype(int)
    y_resampled = y_resampled.astype(int)

    # print("X_resampled: ", X_resampled)
    # print("y_resampled: ", y_resampled)

    df = pd.DataFrame(data=X_resampled, columns=columns)
    df["is_attributed"] = y_resampled
    # df["is_attributed"] = df["is_attributed"].astype(int)

    compressor = "blosc"
    outfilename = trainfile + "." + sampler
    print("Output file (over-sampled): ", outfilename)
    df.to_hdf(outfilename,
              "table",
              mode="w",
              append=True,
              complevel=9,
              complib=compressor)
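For reference, scikit-learn's 'balanced' mode computes n_samples / (n_classes * np.bincount(y)), which is exactly what the weight dictionaries printed above contain. A standalone sketch on toy labels (not from this project):

import numpy as np
from sklearn.utils import compute_class_weight

# Toy imbalanced labels: 8 negatives, 2 positives.
y = np.array([0] * 8 + [1] * 2)
classes = np.array([0, 1])

weights = compute_class_weight(class_weight='balanced', classes=classes, y=y)

# 'balanced' is n_samples / (n_classes * per-class counts):
expected = len(y) / (2 * np.bincount(y))
assert np.allclose(weights, expected)  # [0.625, 2.5]
print(dict(zip(classes.tolist(), weights)))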
Example #2
    def classify(self):
        y_data = self.get_result(self.task.label)
        X_data = self.get_result(self.task.features)

        y = np.array(y_data.data).ravel()
        X = np.array(pd.get_dummies(X_data.data))
        #X = MinMaxScaler().fit_transform(X)

        X_train = X[:-TILE_SIZE]
        y_train = y[:-TILE_SIZE]
        X_test = X[-TILE_SIZE:]
        y_test = y[-TILE_SIZE:]

        # 'auto' was removed from scikit-learn; 'balanced' is its documented replacement
        cw = compute_class_weight('balanced', classes=np.array([0, 1]), y=y)
        cw = {0: cw[0], 1: cw[1]}

        b = get_classifier(self.task.classifier, cw)
        b.partial_fit(X_train, y_train, classes=np.array([0,1]))

        y_prob = None
        y_pred = None
        if self.task.classifier in ['perceptron','svm']:
            y_pred = b.predict(X_test)
            y_prob = np.array([[0,y] for y in y_pred])
        else:
            y_prob = b.predict_proba(X_test)
            y_pred = [1 if t[1] >= 0.5 else 0 for t in y_prob]  # column 1 of predict_proba is P(class 1)

        cm = confusion_matrix(y_test, y_pred)
        stats = classify_stats(cm, y_test, y_prob, TILE_SIZE)

        result = ClassifyResult(self.task, 1.0, b, stats)
        self.results[self.task.uuid] = result
Example #3
def class_weight(labels):
  y_train = [np.array(x[1].split()).astype(int) for x in labels]  # np.int was removed from NumPy; use the builtin int
  y_count = []
  for y in y_train:
    y_count.extend(y)
  cw = compute_class_weight('balanced', classes=np.arange(28), y=y_count)
  return cw
Example #4
    def __init__(self, pattern1train, pattern2train, pattern3train, pattern1test, pattern2test,
                 pattern3test, y_train, y_test, batch_size=args.batch_size, lr=args.lr, epochs=args.epochs):
        input_dim = tf.keras.Input(shape=(1, 250), name='input')
        self.pattern1train = pattern1train
        self.pattern2train = pattern2train
        self.pattern3train = pattern3train
        self.pattern1test = pattern1test
        self.pattern2test = pattern2test
        self.pattern3test = pattern3test
        self.y_train = y_train
        self.y_test = y_test
        self.batch_size = batch_size
        self.epochs = epochs
        self.class_weight = compute_class_weight(class_weight='balanced', classes=[0, 1], y=y_train)

        pattern1vec = tf.keras.layers.Dense(250, activation='relu', name='outputpattern1vec')(input_dim)
        pattern2vec = tf.keras.layers.Dense(250, activation='relu', name='outputpattern2vec')(input_dim)
        pattern3vec = tf.keras.layers.Dense(250, activation='relu', name='outputpattern3vec')(input_dim)

        mergevec = tf.keras.layers.Concatenate(axis=1, name='mergevec')(
            [pattern1vec, pattern2vec, pattern3vec])  # concatenate patterns
        flattenvec = tf.keras.layers.Flatten(name='flattenvec')(mergevec)  # flatten pattern vectors into one vec

        finalmergevec = tf.keras.layers.Dense(100, activation='relu', name='outputmergevec')(flattenvec)
        prediction = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(finalmergevec)
        model = tf.keras.Model(inputs=[input_dim], outputs=[prediction])

        adama = tf.keras.optimizers.Adam(lr)
        loss = tf.keras.losses.binary_crossentropy
        model.compile(optimizer=adama, loss=loss, metrics=['accuracy'])
        model.summary()

        self.model = model
        self.finalmergevec = finalmergevec
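One caveat when wiring weights like self.class_weight above into Keras: compute_class_weight returns a plain ndarray ordered like classes, while tf.keras Model.fit expects class_weight as a {class_index: weight} dict. A short sketch of the conversion (toy labels, hypothetical fit call):

import numpy as np
from sklearn.utils import compute_class_weight

y_train = np.array([0, 0, 0, 1])  # toy stand-in for the snippet's labels
weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=y_train)

class_weight = dict(enumerate(weights))  # {0: 0.666..., 1: 2.0}
# model.fit(x, y, class_weight=class_weight, ...)  # hypothetical call site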
Example #5
def test_auto_weight():
    # Test class weights for imbalanced data
    from sklearn.linear_model import LogisticRegression
    # We take as dataset the two-dimensional projection of iris so
    # that it is not separable and remove half of predictors from
    # class 1.
    # We add one to the targets as a non-regression test: class_weight="balanced"
    # used to work only when the labels were a range [0..K).
    from sklearn.utils import compute_class_weight
    X, y = iris.data[:, :2], iris.target + 1
    unbalanced = np.delete(np.arange(y.size), np.where(y > 2)[0][::2])

    classes = np.unique(y[unbalanced])
    class_weights = compute_class_weight('balanced', classes=classes, y=y[unbalanced])
    assert_true(np.argmax(class_weights) == 2)

    for clf in (svm.SVC(kernel='linear'), svm.LinearSVC(random_state=0),
                LogisticRegression()):
        # check that score is better when class='balanced' is set.
        y_pred = clf.fit(X[unbalanced], y[unbalanced]).predict(X)
        clf.set_params(class_weight='balanced')
        y_pred_balanced = clf.fit(X[unbalanced], y[unbalanced],).predict(X)
        assert_true(metrics.f1_score(y, y_pred, average='weighted')
                    <= metrics.f1_score(y, y_pred_balanced,
                                        average='weighted'))
Example #6
def classifier(X, Y, clusters):

    X_train = X.sample(frac=0.8)
    Y_train = Y.loc[X_train.index]
    X_val = X.drop(X_train.index)
    Y_val = Y.drop(X_train.index)

    c_w = compute_class_weight('balanced', classes=np.unique(clusters), y=clusters)
    c_w = dict(enumerate(c_w))

    METRICS = [Recall(name='recall'), AUC(name='auc', multi_label=False)]

    es = EarlyStopping(monitor='weighted_recall',
                       mode='max',
                       verbose=0,
                       patience=6)
    model = Sequential()
    model.add(Dense(32, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(Y_train.shape[1], activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=METRICS,
                  weighted_metrics=METRICS)
    model.fit(X_train,
              Y_train,
              epochs=500,
              batch_size=128,
              validation_data=(X_val, Y_val),
              shuffle=False,
              verbose=1,
              callbacks=[es],
              class_weight=c_w)
    return model
Example #7
    def trainModel(self, train, train_labels):
        # Set start time
        e1 = cv2.getTickCount()

        print('Training Keras DNN ...')
        print(train.shape)
        print(train_labels.shape)

        # distribute all the class the same way
        numeric_Y = np.dot(train_labels, range(0, self.numberOutput))
        classes = np.unique(numeric_Y)
        class_weight = compute_class_weight("balanced", classes, numeric_Y)
        print 'class weigtht =', class_weight

        early_stopping = EarlyStopping(monitor='acc',
                                       patience=5,
                                       verbose=0,
                                       mode='auto')
        self.model.fit(train,
                       train_labels,
                       epochs=100,  # Keras 2 renamed nb_epoch to epochs
                       batch_size=32,
                       shuffle=True,
                       class_weight=dict(zip(classes, class_weight)),  # Keras expects a dict, not an ndarray
                       callbacks=[early_stopping])
        #self.model.fit(train, train_labels,nb_epoch=50, batch_size=32,shuffle=True,class_weight=class_weight)

        # Set end time
        e2 = cv2.getTickCount()
        time = (e2 - e1) / cv2.getTickFrequency()
        print('Training duration:', time)
Example #9
File: chest4.py  Project: gyasis/chest
def weigh_and_show(dataframe):
  from sklearn.utils import compute_class_weight, compute_sample_weight
  x = compute_class_weight("balanced", classes=sorted(dataframe.class_id.unique()), y=dataframe.class_id)

  # show class weight
  for i, proposed_weight in enumerate(x):
      # print(f"{dataframe.class_name.unique()[i]}: {x[i]}")
      print('{:<25s}: {:<}'.format(disease[i], proposed_weight))
      
  # build dataframe with list of class_weights, class_name, sum of class names and product of class weights and sum of individual classes
  temp_weights = list(x)
  # temp_sample_weights = list(x2)
  
  temp_class = list(sorted(dataframe.class_id.unique()))
  temp_class_name = disease
  temp_sum = list(dataframe.class_id.value_counts()[temp_class])
  temp_weight_products = [temp_weights[i] * temp_sum[i] for i in range(len(temp_weights))]
  
  # array check
  print(f"temp_weights: {len(temp_weights)}")
  # print(f"sample_weights: {len(temp_sample_weights)}")

  print(f"class_name: {len(df.class_name.unique())}")
  print(f"sum_of_class_names: {len(temp_sum)}")
  #build dataframe
  temp_dataframe = pd.DataFrame({'class_weights': temp_weights, 
                                  # 'sample_weights': temp_sample_weights,
                                 
                                  'class_name': temp_class, 
                                  'sum_of_class_names': temp_sum, 
                                  'weight_products': temp_weight_products,})
                                 
  return temp_dataframe
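The commented-out "sample_weights" lines above hint at the companion helper compute_sample_weight, which expands per-class weights into one weight per row; a quick standalone sketch (toy labels, not from this project):

import numpy as np
from sklearn.utils import compute_class_weight, compute_sample_weight

y = np.array([0, 0, 0, 1, 1, 2])
classes = np.unique(y)

cw = compute_class_weight("balanced", classes=classes, y=y)
sw = compute_sample_weight("balanced", y)

# compute_sample_weight is the per-row expansion of the per-class weights:
assert np.allclose(sw, cw[np.searchsorted(classes, y)])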
Example #10
def make_predictions(x_train, x_test, y_train, y_test):
    poly = PolynomialFeatures(interaction_only=False, include_bias=False)
    x_train = poly.fit_transform(x_train)

    # Let's target class imbalance problem.
    class_weights = compute_class_weight('balanced', classes=np.unique(y_train),
                                         y=y_train)
    class_weights_dict = {
        np.unique(y_train)[i]: w
        for i, w in enumerate(class_weights)
    }
    print(class_weights_dict)

    clf = LogisticRegression(penalty='l2',
                             random_state=222,
                             solver='newton-cg',
                             C=999999,
                             class_weight=class_weights_dict,
                             multi_class='multinomial').fit(x_train, y_train)

    IDs = x_test.index
    x_test = poly.transform(x_test)  # transform only; the expansion was already fitted on the training set
    y_pred = clf.predict(x_test)
    print('Accuracy of logistic regression classifier on test set: {:.2f}'.
          format(clf.score(x_test, y_test)))
    confusion_matrix_ = confusion_matrix(y_test, y_pred)
    print('Confusion matrix:')
    print(confusion_matrix_)

    testres = pd.DataFrame(np.column_stack([IDs.values, y_pred]))
    testres.columns = ['ID', 'conicSection']
    return (testres)
Example #11
    def run(self):

        # Define generators
        data = EMNIST('../data/emnist/')
        train = SimnetGenerator(data.get_training_batch, data.get_sizes()[0])
        val = SimnetGenerator(data.get_validation_batch, data.get_sizes()[1])

        #class weights
        class_weights = compute_class_weight('balanced',
                                             classes=np.unique(data._test_labels),
                                             y=data._test_labels)

        with tf.Session() as sess:

            # Define models
            simnet = Simnet()

            # Do fitting
            train_history, val_history = simnet.fit(sess, train, val, 100,
                                                    1024, 100)

            acc, avg_acc, weighted_acc = simnet.evaluate_special(
                sess,
                data.get_test_batch,
                1024,
                data.get_classification_samples,
                data.get_sizes()[2],
                class_weights=class_weights)

            print("Test ACC: ", acc, " TEST AVG ACC: ", avg_acc,
                  "Weightes AVG ACC:", weighted_acc)
Example #12
 def __init__(self, x_train, y_train, x_test, y_test, batch_size, epochs,
              dropout, lr, name):
     self.x_train = x_train
     self.x_test = x_test
     self.y_train = y_train
     self.y_test = y_test
     self.name = name
     self.lr = lr
     self.batch_size = batch_size
     self.epochs = epochs
     self.class_weight = compute_class_weight(class_weight='balanced',
                                              classes=np.arange(10),
                                              y=y_train.argmax(axis=1))
     model = Sequential()
     model.add(
         Bidirectional(LSTM(300, return_sequences=True),
                       input_shape=(n_steps, dim_input)))
     model.add(AttentionWithContext())
     model.add(Addition())
     model.add(Dense(300))
     model.add(LeakyReLU())
     model.add(Dropout(dropout))
     model.add(Dense(300))
     model.add(LeakyReLU())
     model.add(Dropout(dropout))
     model.add(Dense(10, activation='softmax'))
     # Lower learning rate to prevent divergence
     adamax = Adamax(self.lr)
     model.compile(adamax, 'categorical_crossentropy', metrics=['accuracy'])
     self.model = model
Example #13
def test_multiclass_classifier_class_weight():
    """tests multiclass with classweights for each class"""
    alpha = .1
    n_samples = 20
    tol = .00001
    max_iter = 50
    class_weight = {0: .45, 1: .55, 2: .75}
    fit_intercept = True
    X, y = make_blobs(n_samples=n_samples, centers=3, random_state=0,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
    classes = np.unique(y)

    clf1 = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                              max_iter=max_iter, tol=tol, random_state=77,
                              fit_intercept=fit_intercept,
                              class_weight=class_weight)
    clf2 = clone(clf1)
    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    le = LabelEncoder()
    class_weight_ = compute_class_weight(class_weight, classes=np.unique(y), y=y)
    sample_weight = class_weight_[le.fit_transform(y)]

    coef1 = []
    intercept1 = []
    coef2 = []
    intercept2 = []
    for cl in classes:
        y_encoded = np.ones(n_samples)
        y_encoded[y != cl] = -1

        spweights1, spintercept1 = sag_sparse(X, y_encoded, step_size, alpha,
                                              n_iter=max_iter, dloss=log_dloss,
                                              sample_weight=sample_weight)
        spweights2, spintercept2 = sag_sparse(X, y_encoded, step_size, alpha,
                                              n_iter=max_iter, dloss=log_dloss,
                                              sample_weight=sample_weight,
                                              sparse=True)
        coef1.append(spweights1)
        intercept1.append(spintercept1)
        coef2.append(spweights2)
        intercept2.append(spintercept2)

    coef1 = np.vstack(coef1)
    intercept1 = np.array(intercept1)
    coef2 = np.vstack(coef2)
    intercept2 = np.array(intercept2)

    for i, cl in enumerate(classes):
        assert_array_almost_equal(clf1.coef_[i].ravel(),
                                  coef1[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf1.intercept_[i], intercept1[i], decimal=1)

        assert_array_almost_equal(clf2.coef_[i].ravel(),
                                  coef2[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf2.intercept_[i], intercept2[i], decimal=1)
Example #14
    def __init__(self, data, name="", batch_size=args.batch_size, lr=args.lr, epochs=args.epochs, dropout=args.dropout):
        vectors = np.stack(data.iloc[:, 1].values)
        labels = data.iloc[:, 0].values
        positive_idxs = np.where(labels == 1)[0]
        negative_idxs = np.where(labels == 0)[0]
        undersampled_negative_idxs = np.random.choice(negative_idxs, len(positive_idxs), replace=False)
        resampled_idxs = np.concatenate([positive_idxs, undersampled_negative_idxs])

        x_train, x_test, y_train, y_test = train_test_split(vectors[resampled_idxs], labels[resampled_idxs],
                                                            test_size=0.2, stratify=labels[resampled_idxs])
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = to_categorical(y_train)
        self.y_test = to_categorical(y_test)
        self.name = name
        self.batch_size = batch_size
        self.epochs = epochs
        self.class_weight = compute_class_weight(class_weight='balanced', classes=[0, 1], y=labels)
        model = Sequential()
        model.add(LSTM(300, input_shape=(vectors.shape[1], vectors.shape[2])))
        model.add(Dense(300))
        model.add(LeakyReLU())
        model.add(Dropout(dropout))
        model.add(Dense(300))
        model.add(LeakyReLU())
        model.add(Dropout(dropout))
        model.add(Dense(2, activation='softmax'))
        # Lower learning rate to prevent divergence
        adamax = Adamax(lr)
        model.compile(adamax, 'categorical_crossentropy', metrics=['accuracy'])
        self.model = model
Example #15
    def initialize_labels(self, Y):

        y_nodes_flat = [y_val for y in Y for y_val in y.nodes]
        y_links_flat = [y_val for y in Y for y_val in y.links]
        self.prop_encoder_ = LabelEncoder().fit(y_nodes_flat)
        self.link_encoder_ = LabelEncoder().fit(y_links_flat)

        self.n_prop_states = len(self.prop_encoder_.classes_)
        self.n_link_states = len(self.link_encoder_.classes_)

        self.prop_cw_ = np.ones_like(self.prop_encoder_.classes_,
                                     dtype=np.double)
        self.link_cw_ = compute_class_weight(self.class_weight,
                                             classes=self.link_encoder_.classes_,
                                             y=y_links_flat)

        self.link_cw_ /= self.link_cw_.min()

        logging.info('Setting node class weights {}'.format(", ".join(
            "{}: {}".format(lbl, cw)
            for lbl, cw in zip(self.prop_encoder_.classes_, self.prop_cw_))))

        logging.info('Setting link class weights {}'.format(", ".join(
            "{}: {}".format(lbl, cw)
            for lbl, cw in zip(self.link_encoder_.classes_, self.link_cw_))))
Example #16
def test_auto_weight():
    # Test class weights for imbalanced data
    from sklearn.linear_model import LogisticRegression
    # We take as dataset the two-dimensional projection of iris so
    # that it is not separable and remove half of predictors from
    # class 1.
    # We add one to the targets as a non-regression test:
    # class_weight="balanced"
    # used to work only when the labels were a range [0..K).
    from sklearn.utils import compute_class_weight
    X, y = iris.data[:, :2], iris.target + 1
    unbalanced = np.delete(np.arange(y.size), np.where(y > 2)[0][::2])

    classes = np.unique(y[unbalanced])
    class_weights = compute_class_weight('balanced', classes=classes,
                                         y=y[unbalanced])
    assert np.argmax(class_weights) == 2

    for clf in (svm.SVC(kernel='linear'), svm.LinearSVC(random_state=0),
                LogisticRegression()):
        # check that score is better when class='balanced' is set.
        y_pred = clf.fit(X[unbalanced], y[unbalanced]).predict(X)
        clf.set_params(class_weight='balanced')
        y_pred_balanced = clf.fit(X[unbalanced], y[unbalanced],).predict(X)
        assert (metrics.f1_score(y, y_pred, average='macro')
                <= metrics.f1_score(y, y_pred_balanced,
                                    average='macro'))
Example #17
def classifier_training_example(mels_path: str) -> None:
    batch_size = 64
    epochs = 50
    model_name = 'class_cnn3'
    patience = 10
    val_split = 0.25

    classifier = build_cnn3_classifier()
    classifier.compile(optimizer='adam',
                       loss='categorical_crossentropy',
                       metrics=['acc'])
    classifier.summary()

    mels_npz = np.load(mels_path)
    samples_x = mels_npz['samples_x']
    samples_y = mels_npz['samples_y']
    log.info(f'Training samples x shape = {samples_x.shape}')
    log.info(f'Training samples y shape = {samples_y.shape}')

    class_weights = compute_class_weight('balanced',
                                         classes=np.unique(
                                             np.argmax(samples_y, axis=-1)),
                                         y=np.argmax(samples_y, axis=-1))
    log.info(f'Training samples class weights = {class_weights}')
    class_weights = {idx: w for idx, w in enumerate(class_weights)}

    train_drum_classifier(classifier,
                          x=samples_x,
                          y=samples_y,
                          model_name=model_name,
                          batch_size=batch_size,
                          epochs=epochs,
                          val_split=val_split,
                          patience=patience,
                          class_weights=class_weights)
Example #18
def test_auto_weight():
    """Test class weights for imbalanced data"""
    from sklearn.linear_model import LogisticRegression
    # we take as dataset the two-dimensional projection of iris so
    # that it is not separable and remove half of predictors from
    # class 1
    from sklearn.utils import compute_class_weight
    X, y = iris.data[:, :2], iris.target
    unbalanced = np.delete(np.arange(y.size), np.where(y > 1)[0][::2])

    classes = np.unique(y[unbalanced])
    # 'auto' was removed from scikit-learn; 'balanced' is its documented replacement
    class_weights = compute_class_weight('balanced', classes=classes, y=y[unbalanced])
    assert_true(np.argmax(class_weights) == 2)

    for clf in (svm.SVC(kernel='linear'), svm.LinearSVC(random_state=0),
                LogisticRegression()):
        # check that score is better when class_weight='balanced' is set.
        y_pred = clf.fit(X[unbalanced], y[unbalanced]).predict(X)
        clf.set_params(class_weight='balanced')
        y_pred_balanced = clf.fit(
            X[unbalanced],
            y[unbalanced],
        ).predict(X)
        assert_true(
            metrics.f1_score(y, y_pred, average='weighted') <= metrics.f1_score(
                y, y_pred_balanced, average='weighted'))
Example #20
def create_criterion(device, weight_with=None):
    """
    Creates a `torch.nn.BCEWithLogitsLoss`.

    If weight_with is not None, uses class weight for the positive class

    Arguments:
    ----------

    device: "cuda" or "cpu"

    weight_with: data.Dataset
    """
    if weight_with:
        y = [row.subtask_a for row in weight_with]

        class_weights = compute_class_weight('balanced', classes=['NOT', 'OFF'], y=y)

        # normalize it
        class_weights = class_weights / class_weights[0]
        criterion = nn.BCEWithLogitsLoss(pos_weight=torch.Tensor([class_weights[1]]))
    else:
        criterion = nn.BCEWithLogitsLoss()

    criterion = criterion.to(device)
    return criterion
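The normalization above works out neatly: after dividing by class_weights[0], the remaining class_weights[1] equals w_OFF / w_NOT, which for 'balanced' weights is exactly count('NOT') / count('OFF'), i.e. the negatives-to-positives ratio that BCEWithLogitsLoss's pos_weight expects. A quick standalone check (toy labels):

import numpy as np
from sklearn.utils import compute_class_weight

y = np.array(['NOT'] * 90 + ['OFF'] * 10)  # toy labels; 'OFF' plays the positive class
w = compute_class_weight('balanced', classes=np.array(['NOT', 'OFF']), y=y)

# w = n_samples / (2 * counts), so w[1] / w[0] == count('NOT') / count('OFF')
assert np.isclose(w[1] / w[0], 90 / 10)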
Example #21
def get_synthetic_data():
    x, y = make_classification(n_samples=10000,
                               n_features=200,
                               n_informative=200,
                               n_redundant=0,
                               n_repeated=0,
                               n_classes=3,
                               n_clusters_per_class=2,
                               weights=None,
                               flip_y=0.02,
                               class_sep=0.4,
                               hypercube=True,
                               shift=0.0,
                               scale=1.0,
                               shuffle=True,
                               random_state=2)
    log("class weights", compute_class_weight('balanced', np.unique(y), y))
    encoder = LabelEncoder()
    encoder.fit(y)
    encoded_y = encoder.transform(y)
    y = np_utils.to_categorical(encoded_y)
    mm_scaler = MinMaxScaler()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    x_train = mm_scaler.fit_transform(x_train)
    x_test = mm_scaler.transform(x_test)
    return x_train, y_train, x_test, y_test
Example #22
    def fit(self, dataset):
        """Fits the intent classifier with a valid Snips dataset

        Returns:
            :class:`LogRegIntentClassifier`: The same instance, trained
        """
        from sklearn.linear_model import SGDClassifier
        from sklearn.utils import compute_class_weight

        logger.info("Fitting LogRegIntentClassifier...")
        dataset = validate_and_format_dataset(dataset)
        self.load_resources_if_needed(dataset[LANGUAGE])
        self.fit_builtin_entity_parser_if_needed(dataset)
        self.fit_custom_entity_parser_if_needed(dataset)
        language = dataset[LANGUAGE]

        data_augmentation_config = self.config.data_augmentation_config
        utterances, classes, intent_list = build_training_data(
            dataset, language, data_augmentation_config, self.resources,
            self.random_state)

        self.intent_list = intent_list
        if len(self.intent_list) <= 1:
            return self

        self.featurizer = Featurizer(
            config=self.config.featurizer_config,
            builtin_entity_parser=self.builtin_entity_parser,
            custom_entity_parser=self.custom_entity_parser,
            resources=self.resources,
            random_state=self.random_state,
        )
        self.featurizer.language = language

        none_class = max(classes)
        try:
            x = self.featurizer.fit_transform(dataset, utterances, classes,
                                              none_class)
        except _EmptyDatasetUtterancesError:
            logger.warning("No (non-empty) utterances found in dataset")
            self.featurizer = None
            return self

        alpha = get_regularization_factor(dataset)

        class_weights_arr = compute_class_weight("balanced",
                                                 range(none_class + 1),
                                                 classes)
        # Re-weight the noise class
        class_weights_arr[-1] *= self.config.noise_reweight_factor
        class_weight = {idx: w for idx, w in enumerate(class_weights_arr)}

        self.classifier = SGDClassifier(random_state=self.random_state,
                                        alpha=alpha,
                                        class_weight=class_weight,
                                        **LOG_REG_ARGS)
        self.classifier.fit(x, classes)
        logger.debug("%s", DifferedLoggingMessage(self.log_best_features))
        return self
Example #23
File: test_sag.py  Project: AnAnteup/icp4
def test_binary_classifier_class_weight():
    """tests binary classifier with classweights for each class"""
    alpha = .1
    n_samples = 50
    n_iter = 20
    tol = .00001
    fit_intercept = True
    X, y = make_blobs(n_samples=n_samples,
                      centers=2,
                      random_state=10,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
    classes = np.unique(y)
    y_tmp = np.ones(n_samples)
    y_tmp[y != classes[1]] = -1
    y = y_tmp

    class_weight = {1: .45, -1: .55}
    clf1 = LogisticRegression(solver='sag',
                              C=1. / alpha / n_samples,
                              max_iter=n_iter,
                              tol=tol,
                              random_state=77,
                              fit_intercept=fit_intercept,
                              multi_class='ovr',
                              class_weight=class_weight)
    clf2 = clone(clf1)

    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    le = LabelEncoder()
    class_weight_ = compute_class_weight(class_weight, classes=np.unique(y), y=y)
    sample_weight = class_weight_[le.fit_transform(y)]
    spweights, spintercept = sag_sparse(X,
                                        y,
                                        step_size,
                                        alpha,
                                        n_iter=n_iter,
                                        dloss=log_dloss,
                                        sample_weight=sample_weight,
                                        fit_intercept=fit_intercept)
    spweights2, spintercept2 = sag_sparse(X,
                                          y,
                                          step_size,
                                          alpha,
                                          n_iter=n_iter,
                                          dloss=log_dloss,
                                          sparse=True,
                                          sample_weight=sample_weight,
                                          fit_intercept=fit_intercept)

    assert_array_almost_equal(clf1.coef_.ravel(), spweights.ravel(), decimal=2)
    assert_almost_equal(clf1.intercept_, spintercept, decimal=1)

    assert_array_almost_equal(clf2.coef_.ravel(),
                              spweights2.ravel(),
                              decimal=2)
    assert_almost_equal(clf2.intercept_, spintercept2, decimal=1)
Example #24
    def pre_init(self, data: LoadedKata):
        # initialize the array with the labels to avoid zero division
        observed_labels = copy.deepcopy(self.labels)
        for x in data.data_loader:
            observed_labels += [self.labels[label_id] for label_id in x[self.label_name]]

        weights = compute_class_weight("balanced", classes=np.asarray(self.labels), y=observed_labels)
        self.register_buffer("balance", torch.tensor(weights.astype(np.float32)))
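A buffer registered this way is typically consumed as the weight argument of a loss; a minimal sketch under that assumption (toy tensors, not from the original module):

import torch
import torch.nn as nn

balance = torch.tensor([0.8, 1.5, 2.7])  # stand-in for the registered "balance" buffer
criterion = nn.CrossEntropyLoss(weight=balance)  # per-class weights applied inside the loss

logits = torch.randn(4, 3)             # batch of 4 samples, 3 classes
targets = torch.tensor([0, 2, 1, 0])
loss = criterion(logits, targets)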
Example #25
 def get_sample_weights(self):
     labels = self.get_dataframe()[self.get_label_column()]
     categories = labels.cat.categories
     class_weights = compute_class_weight('balanced',
                                          classes=categories,
                                          y=labels)
     sample_weights = class_weights[labels.cat.codes]
     return sample_weights
Example #26
    def __init__(self, tree_model,
                 X_train,
                 y_train,
                 feature_names : List[str],
                 class_names : (List[str],Mapping[int,str])=None):
        self.tree_model = tree_model
        self.feature_names = feature_names
        self.class_names = class_names
        self.class_weight = tree_model.class_weight

        if getattr(tree_model, 'tree_', None) is None: # make sure model is fit
            tree_model.fit(X_train, y_train)

        if tree_model.tree_.n_classes > 1:
            if isinstance(self.class_names, dict):
                self.class_names = self.class_names
            elif isinstance(self.class_names, Sequence):
                self.class_names = {i:n for i, n in enumerate(self.class_names)}
            else:
                raise Exception(f"class_names must be dict or sequence, not {self.class_names.__class__.__name__}")

        if isinstance(X_train, pd.DataFrame):
            X_train = X_train.values
        self.X_train = X_train
        if isinstance(y_train, pd.Series):
            y_train = y_train.values
        self.y_train = y_train
        self.node_to_samples = ShadowDecTree.node_samples(tree_model, X_train)
        if self.isclassifier():
            self.unique_target_values = np.unique(y_train)
            self.class_weights = compute_class_weight(tree_model.class_weight, classes=self.unique_target_values, y=self.y_train)

        tree = tree_model.tree_
        children_left = tree.children_left
        children_right = tree.children_right

        # use locals not args to walk() for recursion speed in python
        leaves = []
        internal = [] # non-leaf nodes

        def walk(node_id):
            if (children_left[node_id] == -1 and children_right[node_id] == -1):  # leaf
                t = ShadowDecTreeNode(self, node_id)
                leaves.append(t)
                return t
            else:  # decision node
                left = walk(children_left[node_id])
                right = walk(children_right[node_id])
                t = ShadowDecTreeNode(self, node_id, left, right)
                internal.append(t)
                return t

        root_node_id = 0
        # record root to actual shadow nodes
        self.root = walk(root_node_id)
        self.leaves = leaves
        self.internal = internal
Example #27
 def calculate_class_weights(self, labels, occured_labels):
     from sklearn.utils import compute_class_weight        
     
     label_indexes = [labels.index(i) for i in occured_labels]
     
     class_weight_list = compute_class_weight('balanced', classes=np.unique(label_indexes), y=label_indexes)
     
     class_weight = dict(zip(np.unique(label_indexes), class_weight_list))
     
     return class_weight
Example #28
 def get_class_weight(y_true):
     classes = np.arange(y_true.shape[-1])
     class_counts = y_true.sum(0).astype(np.int64)
     pos = 0
     y_weight = np.zeros(int(class_counts.sum()))
     for i, count in enumerate(class_counts):
         y_weight[pos:pos + count] = i
         pos += count
     weights = np.sqrt(compute_class_weight("balanced", classes=classes, y=y_weight))
     return {i: weight for i, weight in enumerate(weights)}
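The np.sqrt here is a damping heuristic: square-rooting pulls extreme 'balanced' weights toward 1 so minority classes are up-weighted less aggressively. A toy illustration (numbers made up):

import numpy as np

counts = np.array([990, 10])          # highly imbalanced toy counts
raw = counts.sum() / (2 * counts)     # 'balanced' recipe: [0.505..., 50.0]
damped = np.sqrt(raw)                 # [0.71..., 7.07...]: same ordering, much milder ratio
print(raw, damped)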
Example #29
def cw_to_dict(y_class):
    """
    input: 1D array, labels
    output: balanced class weight dictionary
    """
    cw = utils.compute_class_weight('balanced', classes=[0, 1], y=y_class)  # compute class weight
    cw_dict = {}
    for idx in range(len(cw)):
        cw_dict[idx] = cw[idx]
    return cw_dict
Example #30
def getData():
  X = []
  Y = []
  void_label = -1

  train_path = '/content/input'
  label_path = '/content/highway200'

  train_files = sorted(os.listdir(train_path))
  label_files = sorted(os.listdir(label_path))

  for i in range(len(train_files)):
    img = load_img(os.path.join(train_path, train_files[i]))
    img = img_to_array(img)
    X.append(img)

    img = load_img(os.path.join(label_path, label_files[i]), color_mode = 'grayscale')
    img = img_to_array(img)
    shape = img.shape
    img /= 255.0
    img = img.reshape(-1)
    idx = np.where(np.logical_and(img > 0.25, img < 0.8))[0] # find non-ROI
    if len(idx) > 0:
      img[idx] = -1
    img = img.reshape(shape)
    img = np.floor(img)
    Y.append(img)

  X = np.asarray(X)
  Y = np.asarray(Y)

  idx = list(range(X.shape[0]))
  np.random.shuffle(idx)
  np.random.shuffle(idx)
  X = X[idx]
  Y = Y[idx]

  cls_weight_list = []
  for i in range(Y.shape[0]):
      y = Y[i].reshape(-1)
      idx = np.where(y!=void_label)[0]
      if(len(idx)>0):
          y = y[idx]
      lb = np.unique(y) #  0., 1
      cls_weight = compute_class_weight('balanced', classes=lb, y=y)
      class_0 = cls_weight[0]
      class_1 = cls_weight[1] if len(lb)>1 else 1.0
          
      cls_weight_dict = {0:class_0, 1: class_1}
      cls_weight_list.append(cls_weight_dict)
          
  cls_weight_list = np.asarray(cls_weight_list)

  return [X, Y, cls_weight_list]
Example #31
def get_class_weight(y_output):
    #news = pd.read_csv(file,encoding='utf-8',header=None)
    #tags = news.ix[:,1:].as_matrix()
    class_weight = []
    tags_num = y_output.shape[1]
    for i in range(tags_num):
        sample_weight = compute_class_weight('balanced', classes=[0, 1], y=y_output[:, i])
        class_weight.append(sample_weight)
    class_weight = np.vstack(class_weight).T
    return class_weight
Example #32
def _get_class_weights(loader: DataLoader) -> np.ndarray:
    """
    compute class weights
    :param loader:
    :return:
    """
    labels = loader.dataset.metadata["dx"].cat.codes  # type: ignore
    classes = np.unique(labels)
    return compute_class_weight(class_weight="balanced",
                                classes=classes,
                                y=labels)
Example #33
def train():
    #X_lst, y_lst = load_dataset_folder(DATASET_DIR)
    X_lst, y_lst = load(DATASET_DIR)
    X, y = stack_data(X_lst, y_lst, onehot=True, num_classes=NUM_CLASSES)
    normalized_X = normalize(X, axis=1, norm='l2')
    print(X.shape, y.shape)
    logging.debug(f'X shape: {X.shape}\ty shape: {y.shape}')
    X_train, X_test, y_train, y_test = train_test_split(normalized_X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)
    # Convert to tf.data.Dataset
    train_data = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    test_data = tf.data.Dataset.from_tensor_slices((X_test, y_test))

    train_data = train_data.batch(BATCH_SIZE).cache().repeat()
    test_data = test_data.batch(BATCH_SIZE)

    model = generate_model_01(BATCH_SIZE, num_classes=NUM_CLASSES)

    # Compute class weights
    class_weights = compute_class_weight('balanced',
                                         classes=np.unique(np.argmax(y_train, axis=1)),
                                         y=np.argmax(y_train, axis=1))

    # TensorBoard callback
    log_dir = SAVE_LOGS_DIR + 'fit/' + datetime.datetime.now().strftime(
        '%Y%m%d-%H%M%S')
    cb_tb = tf.keras.callbacks.TensorBoard(log_dir=log_dir)
    cb_es = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                             patience=200,
                                             restore_best_weights=True,
                                             verbose=True)

    training_history = model.fit(train_data,
                                 epochs=EPOCHS,
                                 validation_data=test_data,
                                 steps_per_epoch=15,
                                 class_weight={
                                     0: class_weights[0],
                                     1: class_weights[1],
                                     2: class_weights[2]
                                 },
                                 callbacks=[cb_tb, cb_es])

    # Store training history
    np.savetxt(SAVE_LOGS_DIR + 'acc.txt', training_history.history['acc'])
    np.savetxt(SAVE_LOGS_DIR + 'val_acc.txt',
               training_history.history['val_acc'])
    np.savetxt(SAVE_LOGS_DIR + 'loss.txt', training_history.history['loss'])
    np.savetxt(SAVE_LOGS_DIR + 'val_loss.txt',
               training_history.history['val_loss'])

    model.save_weights(SAVE_LOGS_DIR + 'weights.h5')
Example #34
def load_training_data():
    raw_training_data = pd.read_csv('train.csv')

    # convert types to ints
    raw_training_data['target'] = raw_training_data['target'].apply(class_to_int)
    raw_training_data = raw_training_data.astype('float32')
    raw_training_data['target'] = raw_training_data['target'].astype('int32')

    raw_training_data = raw_training_data.iloc[np.random.permutation(len(raw_training_data))] #shuffle data
    # Get the features and the classes
    features = np.log(raw_training_data.iloc[:, 1:94] + 1).values # apply log function

    classes = raw_training_data['target'].values

    print(np.unique(classes))

    #split train/validate
    # sklearn.cross_validation was removed; train_test_split now lives in sklearn.model_selection
    feat_train, feat_test, class_train, class_test = model_selection.train_test_split(features, classes,
                                                                                      test_size=0.3,
                                                                                      random_state=1232)

    feat_train, feat_val, class_train, class_val = model_selection.train_test_split(feat_train, class_train,
                                                                                    test_size=0.3,
                                                                                    random_state=1232)


    #scale the features
    std_scale = preprocessing.StandardScaler().fit(feat_train)
    feat_train = std_scale.transform(feat_train)
    feat_val = std_scale.transform(feat_val)
    feat_test = std_scale.transform(feat_test)

    #class weights
    # 'auto' was removed from scikit-learn; 'balanced' is its documented replacement
    weights = compute_class_weight('balanced', classes=np.unique(classes), y=class_train)
    weights = weights.astype('float32')
    print(weights)
    train_weights = []
    val_weights = []
    for i in class_train:
        train_weights.append(weights[i])

    for i in list(class_val):
        val_weights.append(weights[i])

    #convert to np array for theanets
    training_data = [feat_train, class_train, np.array(train_weights)]
    validation_data = [feat_val, class_val, np.array(val_weights)]
    test_data = [feat_test, class_test]

    return training_data, validation_data, test_data, std_scale
Example #35
    def fit(self, X, y):
        from sklearn.preprocessing import LabelEncoder
        from sklearn.utils import compute_class_weight

        label_encoder = LabelEncoder().fit(y)
        classes = label_encoder.classes_
        class_weight = compute_class_weight(self.class_weight, classes=classes, y=y)

        # Intentionally modify the balanced class_weight
        # to simulate a bug and raise an exception
        if self.class_weight == "balanced":
            class_weight += 1.

        # Simply assigning coef_ to the class_weight
        self.coef_ = class_weight
        return self
Example #36
def test_binary_classifier_class_weight():
    """tests binary classifier with classweights for each class"""
    alpha = .1
    n_samples = 50
    n_iter = 20
    tol = .00001
    fit_intercept = True
    X, y = make_blobs(n_samples=n_samples, centers=2, random_state=10,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
    classes = np.unique(y)
    y_tmp = np.ones(n_samples)
    y_tmp[y != classes[1]] = -1
    y = y_tmp

    class_weight = {1: .45, -1: .55}
    clf1 = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                              max_iter=n_iter, tol=tol, random_state=77,
                              fit_intercept=fit_intercept,
                              class_weight=class_weight)
    clf2 = clone(clf1)

    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    le = LabelEncoder()
    class_weight_ = compute_class_weight(class_weight, classes=np.unique(y), y=y)
    sample_weight = class_weight_[le.fit_transform(y)]
    spweights, spintercept = sag_sparse(X, y, step_size, alpha, n_iter=n_iter,
                                        dloss=log_dloss,
                                        sample_weight=sample_weight,
                                        fit_intercept=fit_intercept)
    spweights2, spintercept2 = sag_sparse(X, y, step_size, alpha,
                                          n_iter=n_iter,
                                          dloss=log_dloss, sparse=True,
                                          sample_weight=sample_weight,
                                          fit_intercept=fit_intercept)

    assert_array_almost_equal(clf1.coef_.ravel(),
                              spweights.ravel(),
                              decimal=2)
    assert_almost_equal(clf1.intercept_, spintercept, decimal=1)

    assert_array_almost_equal(clf2.coef_.ravel(),
                              spweights2.ravel(),
                              decimal=2)
    assert_almost_equal(clf2.intercept_, spintercept2, decimal=1)
Example #37
def test_auto_weight():
    """Test class weights for imbalanced data"""
    from sklearn.linear_model import LogisticRegression
    # we take as dataset the two-dimensional projection of iris so
    # that it is not separable and remove half of predictors from
    # class 1
    from sklearn.utils import compute_class_weight
    X, y = iris.data[:, :2], iris.target
    unbalanced = np.delete(np.arange(y.size), np.where(y > 1)[0][::2])

    classes = np.unique(y[unbalanced])
    # 'auto' was removed from scikit-learn; 'balanced' is its documented replacement
    class_weights = compute_class_weight('balanced', classes=classes, y=y[unbalanced])
    assert_true(np.argmax(class_weights) == 2)

    for clf in (svm.SVC(kernel='linear'), svm.LinearSVC(random_state=0),
                LogisticRegression()):
        # check that score is better when class_weight='balanced' is set.
        y_pred = clf.fit(X[unbalanced], y[unbalanced]).predict(X)
        clf.set_params(class_weight='balanced')
        y_pred_balanced = clf.fit(X[unbalanced], y[unbalanced],).predict(X)
        assert_true(metrics.f1_score(y, y_pred, average='weighted')
                    <= metrics.f1_score(y, y_pred_balanced, average='weighted'))
Example #38
    def classify(self):
        y_data = self.results[self.task.label].data
        X_data = self.results[self.task.features].data
        y = np.array(y_data)
        X = np.array(X_data)

        # 'auto' was removed from scikit-learn; 'balanced' is its documented replacement
        cw = compute_class_weight('balanced', classes=np.array([0, 1]), y=y)
        cw = {0: cw[0], 1: cw[1]}
        print(cw)

        b = get_classifier(self.task.classifier, cw)

        tile_size = 1000
        num_tiles = y.size // tile_size  # integer division; range() needs an int
        for i in range(num_tiles):
            pos = i * tile_size
            X_sub = X[pos : pos + tile_size]
            y_sub = y[pos : pos + tile_size]

            y_prob = None
            y_pred = None
            if self.task.classifier == 'svm':
                y_pred = b.predict(X_sub)
                y_prob = np.array([[0,y] for y in y_pred])
            else:
                y_prob = b.predict_proba(X_sub)
                y_pred = [1 if y[1] >= 0.5 else 0 for y in y_prob]

            cm = confusion_matrix(y_sub, y_pred)
            stats = classify_stats(cm, y_sub, y_prob)  # y_test is undefined here; the tile's labels are y_sub

            y_pred = pd.DataFrame(y_pred, columns=y_data.columns)
            result = ClassifyResult(self.task, 1.0, b, stats)
            self.results[self.task.uuid] = result

            b.partial_fit(X_sub, y_sub)
Example #39
def _compute_class_weight_dictionary(y):
    # helper for returning a dictionary instead of an array
    classes = np.unique(y)
    class_weight = compute_class_weight("balanced", classes, y)
    class_weight_dict = dict(zip(classes, class_weight))
    return class_weight_dict
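A usage sketch for a helper like this (toy data; the estimator choice is just for illustration): the resulting dictionary plugs straight into an estimator's class_weight parameter.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.utils import compute_class_weight

y = np.array([0, 0, 0, 0, 1, 2])
classes = np.unique(y)
cw_dict = dict(zip(classes, compute_class_weight("balanced", classes=classes, y=y)))

X = np.random.RandomState(0).randn(len(y), 3)  # toy features
clf = LogisticRegression(class_weight=cw_dict).fit(X, y)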
Example #40
# path to image folder
base_path = os.path.join(base_path, caltech101.config.tar_inner_dirname)

# X_test contain only paths to images
(X_test, y_test) = util.load_paths_from_files(base_path, 'X_test.txt', 'y_test.txt')

for cv_fold in [0]: # on which cross val folds to run; can't loop over several folds due to some bug
    print("fold {}".format(cv_fold))

    experiment_name = '_bn_triangular_cv{}_e{}'.format(cv_fold, nb_epoch)

    # load cross val split
    (X_train, y_train), (X_val, y_val) = util.load_cv_split_paths(base_path, cv_fold)

    # compute class weights, since classes are highly imbalanced
    # 'auto' was removed from scikit-learn; 'balanced' is its documented replacement
    class_weight = compute_class_weight('balanced', classes=list(range(nb_classes)), y=y_train)

    if normalize_data:
        print("Load mean and std...")
        X_mean, X_std = util.load_cv_stats(base_path, cv_fold)
        normalize_data = (X_mean, X_std)

    nb_train_sample = X_train.shape[0]
    nb_val_sample = X_val.shape[0]
    nb_test_sample = X_test.shape[0]

    print('X_train shape:', X_train.shape)
    print(nb_train_sample, 'train samples')
    if X_val is not None:
        print(nb_val_sample, 'validation samples')
    print(nb_test_sample, 'test samples')
Example #41
from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC
from sklearn.metrics import average_precision_score
from sklearn.utils import compute_class_weight
import numpy as np
import logging
# VWClassifier comes from the Vowpal Wabbit Python bindings (import path varies by version)


logging.basicConfig(level=logging.DEBUG)

iris = load_iris()
X = iris.data
y = iris.target
y[y != 1] = -1
y[y == 1] = 1

weights = compute_class_weight("auto", np.unique(y), y)
sample_weight = np.zeros(y.shape, dtype=np.float)
sample_weight[y==1] = weights[0]
sample_weight[y==-1] = weights[1]

# n_iter = int(1e6 / X.shape[0])
vw_clf = VWClassifier(quiet=False, loss_function="hinge", passes=500)
vw_clf.fit(X, y.astype(np.double), sample_weight)
scores = vw_clf.decision_function(X)
print "VW AP: %.3f" % average_precision_score(y, scores)

vw_clf.set_params(l2=0.1)
vw_clf.fit(X, y.astype(np.double), sample_weight)
scores = vw_clf.decision_function(X)  # recompute scores after refitting
print("VW AP: %.3f" % average_precision_score(y, scores))

# vw_clf.fit(X, y.astype(np.double), sample_weight)