Example #1
# Imports assumed by this snippet (not shown in the excerpt); StemTokenizer is a
# project-specific tokenizer class defined elsewhere in the source file.
import sys

import numpy as np
from scipy import sparse
from scipy.sparse import csr_matrix
from nltk.corpus import stopwords as sw
from nltk.stem import snowball as sb
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import LabelBinarizer
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout


def run_classifer(X_train, s_train, y_train, X_test, s_test, y_test):
    s_train = np.array(s_train)  # samples x features
    s_test = np.array(s_test)

    num_labels = 15
    batch_size = 100

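    # build a stop word list that also covers stemmed forms, publication names,
    # and (optionally) punctuation tokens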
    stemmer = sb.SnowballStemmer('english')

    swlist = sw.words('english')
    swlist += [stemmer.stem(w) for w in swlist]
    swlist += [
        "'d", "'s", 'abov', 'ani', 'becaus', 'befor', 'could', 'doe', 'dure',
        'might', 'must', "n't", 'need', 'onc', 'onli', 'ourselv', 'sha',
        'themselv', 'veri', 'whi', 'wo', 'would', 'yourselv'
    ]  # the vectorizer complained about these not being treated as stop words
    pubs = [
        'buzzfe', 'buzzf', 'npr', 'cnn', 'vox', 'reuter', 'breitbart', 'fox',
        'guardian', 'review', 'theatlant'
    ]
    # punctuation is left in for now; to strip it, populate this list, e.g.
    # [':', '..', '“', '@', '%', ';', '→', ')', '#', '(', '*', '&', '[', ']', '…', '?', '—', '‘', '$']
    punct = []

    swlist += pubs
    swlist += punct
    if sys.argv[4].lower() == 'true':
        tkzr = StemTokenizer()
    else:
        tkzr = None

    if sys.argv[5].lower() != 'true':
        swlist = []

    # what features are we using?
    if sys.argv[7].lower() == 'word':
        count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr)
        count_vect.fit(X_train)
        X_train = count_vect.transform(X_train)
        X_test = count_vect.transform(X_test)
        tfidf_transformer = TfidfTransformer()
        tfidf_transformer.fit(X_train)
        X_train = tfidf_transformer.transform(X_train)
        X_test = tfidf_transformer.transform(X_test)

    elif sys.argv[7].lower() == 'topic':
        count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr)
        count_vect.fit(X_train)
        X_train = count_vect.transform(X_train)
        X_test = count_vect.transform(X_test)
        lda_model = LatentDirichletAllocation(n_components=10)
        lda_model.fit(X_train)
        X_train = lda_model.transform(X_train)
        X_test = lda_model.transform(X_test)

    elif sys.argv[7].lower() == 'style':
        X_train = csr_matrix(s_train)
        X_test = csr_matrix(s_test)

    elif sys.argv[7].lower() == 'all':
        count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr)
        count_vect.fit(X_train)
        X_train = count_vect.transform(X_train)
        X_test = count_vect.transform(X_test)

        tfidf_transformer = TfidfTransformer()
        tfidf_transformer.fit(X_train)
        X_train_tf = tfidf_transformer.transform(X_train)
        X_test_tf = tfidf_transformer.transform(X_test)
        print(type(X_train_tf))

        lda_model = LatentDirichletAllocation(n_components=10)
        lda_model.fit(X_train)
        X_train_lda = lda_model.transform(X_train)
        X_test_lda = lda_model.transform(X_test)
        print(type(X_train_lda))

        X_train = csr_matrix(
            sparse.hstack(
                [X_train_tf,
                 csr_matrix(X_train_lda),
                 csr_matrix(s_train)]))
        X_test = csr_matrix(
            sparse.hstack(
                [X_test_tf,
                 csr_matrix(X_test_lda),
                 csr_matrix(s_test)]))

        print(type(X_train))

        # sparse.save_npz("X_train" + sys.argv[6] + ".npz", X_train)
        # sparse.save_npz("X_test" + sys.argv[6] + ".npz", X_test)

    else:
        sys.exit('unknown features')

    encoder = LabelBinarizer()
    encoder.fit(y_train)
    y_train = encoder.transform(y_train)
    y_test = encoder.transform(y_test)
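    # one-hot encode the class labels for the softmax output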

    # np.save('X_train.npy', X_train)
    # np.save('X_test.npy', X_test)
    # np.save('y_train.npy', y_train)
    # np.save('y_test.npy', y_test)

    # sparse.save_npz("y_train" + sys.argv[6] + ".npz", y_train)
    # sparse.save_npz("y_test" + sys.argv[6] + ".npz", y_test)

    # load everything back
    # X_train = sparse.load_npz("X_train.npz")

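    # feed-forward classifier: two 512-unit ReLU layers with dropout and a softmax over num_labels classes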
    input_dim = X_train.shape[1]
    model = Sequential()
    model.add(Dense(512, input_shape=(input_dim, )))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_labels))
    model.add(Activation('softmax'))

    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    history = model.fit(X_train,
                        y_train,
                        batch_size=batch_size,
                        epochs=5,
                        verbose=1,
                        validation_split=0.1)

    # model.model.save(sys.argv[6] + '.h5')

    # X_train = np.load('X_train.npy')
    # X_test = np.load('X_test.npy')
    # y_train = np.load('y_train.npy')
    # y_test = np.load('y_test.npy')

    # model = keras.models.load_model(sys.argv[6] + '.h5')
    score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)

    print('Test accuracy:', score[1])

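    # per-class precision, recall, F1, and support from the argmax predictions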
    y_pred = model.predict(X_test, batch_size=batch_size, verbose=1)
    predicted = np.argmax(y_pred, axis=1)
    p, r, fs, s = precision_recall_fscore_support(np.argmax(y_test, axis=1),
                                                  predicted)
    print(p, r, fs, s)
Example #2
# Imports assumed by this snippet (not shown in the excerpt):
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential


# The class and method headers are reconstructed from context: the excerpt
# starts inside an on_epoch_end callback that stops training early.
class EarlyStopCallback(Callback):
    def on_epoch_end(self, epoch, logs=None):
        if logs is None:
            logs = {}
        # default to 0 so a missing 'val_accuracy' key cannot raise a TypeError
        if logs.get('val_accuracy', 0) > 0.98:
            print("Reached 98% accuracy on validation")
            self.model.stop_training = True


escb1 = EarlyStopCallback()

csv = pd.read_csv('data/demo74_bmi.csv')
print(csv.shape)
print(csv.head(n=10))
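
# scale height and weight so the features fall roughly within [0, 1]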
csv['height'] = csv['height'] / 200
csv['weight'] = csv['weight'] / 100

encoder = LabelBinarizer()
transformedLabel = encoder.fit_transform(csv['label'])
print(csv['label'][:10])
print(transformedLabel[:10])

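# the first 90,000 rows are used for training; the rows after the split index form the test set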
DATA_FOR_TEST = 90000
test_csv = csv[DATA_FOR_TEST:]
test_part = test_csv[['weight', 'height']]
test_answer = transformedLabel[DATA_FOR_TEST:]
train_csv = csv[:DATA_FOR_TEST]
train_part = train_csv[['weight', 'height']]
train_answer = transformedLabel[:DATA_FOR_TEST]
print(test_part.shape, train_part.shape, test_answer.shape, train_answer.shape)

model = Sequential()
model.add(Dense(10, activation='relu', input_shape=(2, )))
Example #3
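# Assumed context for this snippet (not shown in the excerpt): CATEGORIES and
# DIRECTORY are defined earlier (one sub-folder per class), load_img / img_to_array /
# ImageDataGenerator come from keras.preprocessing.image, preprocess_input from the
# chosen backbone's application module, plus LabelBinarizer, to_categorical,
# train_test_split, numpy as np, and os.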
data = []
labels = []

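# read each image from its class folder, resize to 224x224, and apply the
# backbone's preprocess_input scaling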
for category in CATEGORIES:
    path = os.path.join(DIRECTORY, category)
    for img in os.listdir(path):
        img_path = os.path.join(path, img)
        image = load_img(img_path, target_size=(224, 224))
        image = img_to_array(image)
        image = preprocess_input(image)

        data.append(image)
        labels.append(category)

# perform one-hot encoding on the labels
lb = LabelBinarizer()
labels = lb.fit_transform(labels)
labels = to_categorical(labels)
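# with exactly two categories LabelBinarizer yields a single 0/1 column,
# which to_categorical then expands into two one-hot columns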

data = np.array(data, dtype="float32")
labels = np.array(labels)

(trainX, testX, trainY, testY) = train_test_split(data,
                                                  labels,
                                                  test_size=0.20,
                                                  stratify=labels,
                                                  random_state=42)

# construct the training image generator for data augmentation
aug = ImageDataGenerator(rotation_range=20,
                         zoom_range=0.15,
Example #4
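# Assumed context for this snippet (not shown in the excerpt): parse_args,
# SimpleDatasetLoader, SimplePreprocessor, ImageToArrayPreprocessor, and MiniVGGNet
# are project-specific modules, and img_rows, img_cols, learning_rate, nb_epoch,
# and batch_size are module-level constants; sklearn, keras, numpy, and
# matplotlib.pyplot (plt) are imported as usual.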
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]
    options = parse_args(argv)

    print("[INFO] loading images...")

    loader = SimpleDatasetLoader(preprocessors=[
        SimplePreprocessor(width=img_cols, height=img_rows),
        ImageToArrayPreprocessor(),
    ])
    data, labels = loader.load(
        driving_log_path=options.driving_log,
        data_path=options.dataset,
        verbose=True,
    )
    data = data.astype('float32')
    # ipdb breakpoint left over from debugging; disabled so the script runs end-to-end
    # import ipdb; ipdb.set_trace()

    # # horizontal reflection for augmentation
    # data = np.append(data, data[:, :, ::-1], axis=0)
    # labels = np.append(labels, -labels, axis=0)

    # split train and validation
    data, labels = shuffle(data, labels)
    x_train, x_test, y_train, y_test = train_test_split(
        data,
        labels,
        random_state=13,
        test_size=0.1,
    )
    # x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
    # x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)

    lb = LabelBinarizer()
    y_train = lb.fit_transform(y_train)
    y_test = lb.transform(y_test)

    label_names = ['straight', 'left', 'right']

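    # mild augmentation; horizontal_flip stays off, presumably because mirroring
    # frames would swap the 'left' and 'right' steering labels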
    aug = ImageDataGenerator(
        rotation_range=1,
        width_shift_range=0.1,
        height_shift_range=0.1,
        zoom_range=0.2,
        horizontal_flip=False,
        fill_mode="nearest",
    )

    print('[INFO] compiling model...')
    # model = NvidiaNet.build(width=img_cols, height=img_rows, depth=1)
    # model = TinyNet.build(width=img_cols, height=img_rows, depth=1)
    # model = ShallowNet.build(width=img_cols, height=img_rows, depth=1, classes=len(label_names))
    model = MiniVGGNet.build(width=img_cols,
                             height=img_rows,
                             depth=1,
                             classes=len(label_names))

    opt = SGD(lr=learning_rate,
              momentum=0.9,
              decay=learning_rate / nb_epoch,
              nesterov=True)
    # opt = SGD(lr=learning_rate)
    # opt = Adam(lr=learning_rate)
    # model.compile(
    #     loss='mean_squared_error',
    #     metrics=["accuracy"],
    #     optimizer=opt,
    # )
    model.compile(
        loss='categorical_crossentropy',
        metrics=['accuracy'],
        optimizer=opt,
    )

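    # train on augmented batches streamed from the generator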
    history = model.fit_generator(
        aug.flow(x_train, y_train, batch_size=batch_size),
        epochs=nb_epoch,
        steps_per_epoch=(len(x_train) // batch_size),
        verbose=1,
        validation_data=(x_test, y_test),
    )
    # alternative without augmentation:
    # history = model.fit(x_train, y_train, batch_size=batch_size, epochs=nb_epoch,
    #                     verbose=1, validation_data=(x_test, y_test))

    predictions = model.predict(x_test, batch_size=batch_size)
    print(
        classification_report(
            y_test.argmax(axis=1),
            predictions.argmax(axis=1),
            target_names=label_names,
        ))

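    # plot training/validation loss and accuracy against epoch on twin y-axes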
    plt.style.use("ggplot")
    fig, ax_acc = plt.subplots(1, 1)

    ax_acc.set_xlabel("Epoch #")

    ax_loss = ax_acc.twinx()
    ax_loss.grid(None)
    ax_loss.set_ylabel("Loss")

    ax_acc.grid(None)
    ax_acc.set_ylabel("Accuracy")
    ax_acc.set_ylim([0, 1])

    ax_loss.plot(np.arange(0, nb_epoch),
                 history.history["loss"],
                 label="train_loss")
    ax_loss.plot(np.arange(0, nb_epoch),
                 history.history["val_loss"],
                 label="val_loss")
    ax_acc.plot(np.arange(0, nb_epoch),
                history.history["acc"],
                label="train_acc")
    ax_acc.plot(np.arange(0, nb_epoch),
                history.history["val_acc"],
                label="val_acc")
    fig.suptitle("Training Loss and Accuracy")
    fig.legend()
    plt.show()

    model.save(options.model)

    return 0


def load_encoder(data, column="ocean_proximity"):
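    """One-hot encode a single categorical column with LabelBinarizer."""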
    encoder = LabelBinarizer()
    data_cat = data[column]
    data_cat_1hot = encoder.fit_transform(data_cat)
    return data_cat_1hot