Example #1
def train_model():
    """
    Train the job application model and save the model
    :return: 
    """
    jobs_training = get_training_data(TRAINING__DATA_PATH)
    jobs_labels = get_training_labels("applications")
    logging.info("Transforming data")
    jobs_prepared = full_pipeline.fit_transform(jobs_training)
    logging.info("Training the model")
    lin_reg = LinearRegression(copy_X=True,
                               fit_intercept=True,
                               n_jobs=1,
                               normalize=False)
    lin_reg.fit(jobs_prepared, jobs_labels)
    logging.info("Saving the model")
    save_trained_model(lin_reg)
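

# A minimal companion sketch: reuse the pipeline fitted in train_model() to
# transform new rows, then predict with the saved regressor.
# `load_trained_model` is a hypothetical counterpart to save_trained_model(),
# assumed here purely for illustration.
def predict_applications(jobs_new):
    model = load_trained_model()
    jobs_prepared = full_pipeline.transform(jobs_new)  # transform only, no refit
    return model.predict(jobs_prepared)
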
def run_experiment(num_models,
                   training_dir,
                   test_dir,
                   results_dir,
                   transform,
                   weights_file=None):
    """Train num_models models on one data split; each run early-stops on
    validation accuracy and optionally checkpoints to weights_file."""
    training_set = get_training_data(training_dir,
                                     COLOURS,
                                     IMG_ROWS,
                                     IMG_COLS,
                                     transform=transform)
    test_set = get_test_data(test_dir,
                             COLOURS,
                             IMG_ROWS,
                             IMG_COLS,
                             transform=transform)
    for epoch in range(num_models):
        callbacks_list = []
        if weights_file:
            callbacks_list.append(
                ModelCheckpoint(weights_file,
                                monitor='val_accuracy',
                                verbose=1,
                                save_best_only=True))
        callbacks_list.append(
            LearnedAccuracyWriter(COLOURS, test_set, epoch, PATIENCE,
                                  results_dir))
        callbacks_list.append(
            EarlyStopping(monitor='val_accuracy', patience=PATIENCE))
        model = colour_net(NUM_CLASSES, weights_file)
        model.fit_generator(training_set,
                            steps_per_epoch=STEPS_PER_EPOCH,
                            epochs=MAX_EPOCHS,
                            validation_data=test_set,
                            callbacks=callbacks_list,
                            verbose=1)
            predicted_tags = self.decode(word_array)
            tag_sequences.append([(word_array[i], predicted_tags[i])
                                  for i in range(len(word_array))])

        self.create_test_result_file(tag_sequences, outfile)

    def create_test_result_file(self, test_result, filename):
        with open(filename, "w", encoding="utf-8") as f:
            for sequence in test_result:
                for word, tag in sequence:
                    f.write(f"{word} {tag}\n")
                f.write("\n")


if __name__ == "__main__":
    if len(sys.argv) < 4:
        print('Please make sure you have installed Python 3.4 or above!')
        print(
            "Usage on Windows:  python emission.py [train file] [dev.in file] [result filepath]"
        )
        print(
            "Usage on Linux/Mac:  python3 emission.py [train file] [dev.in file] [result filepath]"
        )
        sys.exit()

    train_data = get_training_data(sys.argv[1])

    sp = StructuredPerceptron()
    sp.fit(train_data, no_of_epochs=10, learning_rate=0.2)
    sp.predict(sys.argv[2], sys.argv[3])
Example #4
import utils
import nn.nnutils as nnutils
import numpy as np
import keras
from keras.layers.core import Dense

training_points, training_labels = \
    utils.get_training_data("../data/train_2008.csv")

nnutils.standardize_labels(training_labels)

assert len(training_points) == len(training_labels)
validation_set_size = len(training_points) // 6
validation_indices = np.random.choice(a=len(training_points),
                                      size=validation_set_size,
                                      replace=False)

print("Separating into training and validation sets...")
X_valid = training_points[validation_indices]
y_valid = training_labels[validation_indices]
X_train = np.delete(training_points, obj=validation_indices, axis=0)
y_train = np.delete(training_labels, obj=validation_indices, axis=0)

print("Creating Keras model...")
model = keras.models.Sequential()
model.add(Dense(30, input_dim=X_train.shape[1], activation='tanh'))
model.add(Dense(100, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))  # a single-unit softmax is constant (always 1); sigmoid keeps the output in (0, 1)
model.compile(loss='mean_squared_error',
              optimizer='adam',
              metrics=['accuracy'])
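
# A minimal training sketch for the compiled model, reusing the validation
# split built above (the epoch count and batch size are assumed values):
model.fit(X_train, y_train,
          validation_data=(X_valid, y_valid),
          epochs=20,
          batch_size=128)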
Example #5
from find_cars import find_cars_in_windows

if __name__ == '__main__':

    # Hyperparameters
    color_space = 'YCrCb'
    orient = 9
    pix_per_cell = 8
    cell_per_block = 2
    hog_channel = 'ALL'  # options: 0, 1, 2, or 'ALL'
    spatial_size = (32, 32)
    hist_bins = 32

    # Read training data
    train_cars, train_notcars = get_training_data()

    t = time.time()

    # Extract Features
    car_features = extract_features_from_image_list(
        train_cars,
        color_space=color_space,
        spatial_size=spatial_size,
        hist_bins=hist_bins,
        orient=orient,
        pix_per_cell=pix_per_cell,
        cell_per_block=cell_per_block,
        hog_channel=hog_channel,
        spatial_feat=True,
        hist_feat=True,
Example #6
from sklearn import ensemble

import utils

# Quick-and-dirty baseline: AdaBoost with its default decision-tree base estimator.

training_points, training_labels = utils.get_training_data('../train_2008.csv')
test_points = utils.get_test_points('../test_2008.csv')

clf = ensemble.AdaBoostClassifier()  # uses DT by default
clf.fit(training_points, training_labels)

utils.prepare_submission_sklearn(clf.predict, test_points)
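
# A quick sanity check one might run before writing a submission: estimate the
# baseline's accuracy with k-fold cross-validation (cv=5 is an assumed choice,
# not taken from the original snippet).
from sklearn.model_selection import cross_val_score

scores = cross_val_score(ensemble.AdaBoostClassifier(), training_points,
                         training_labels, cv=5)
print('CV accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))
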
def purity_score(c, y):
    """Cluster purity: the fraction of points whose true label matches the
    majority true label of the cluster they were assigned to.

    c -- cluster assignments, y -- ground-truth labels (integer arrays).
    """
    A = np.c_[(c, y)]
    n_accurate = 0.
    for j in np.unique(A[:, 0]):
        z = A[A[:, 0] == j, 1]          # true labels of the points in cluster j
        x = np.argmax(np.bincount(z))   # majority true label within cluster j
        n_accurate += len(z[z == x])
    return n_accurate / A.shape[0]
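
# Tiny illustrative check (np is assumed to be imported at the top of this
# module, since purity_score already relies on it): clusters [0, 0, 1, 1] with
# true labels [1, 1, 2, 3] give 2 + 1 correctly-grouped points out of 4,
# so the purity is 0.75.
assert purity_score(np.array([0, 0, 1, 1]), np.array([1, 1, 2, 3])) == 0.75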


if __name__ == "__main__":
    args = get_argparser()
    input_dir_path = args.input
    output_dir_path = args.output
    print("Reading data")
    indices, reviews, labels = utils.get_training_data(input_dir_path)
    print("Training model")
    if args.features == 'sparse':
        feature_mat_file_path = 'reviews_features_sparse.pkl'
    elif args.features == 'pretrained':
        feature_mat_file_path = 'reviews_features_dense_pretrained.pkl'
    else:
        feature_mat_file_path = 'reviews_features_dense_custom.pkl'
    with open(feature_mat_file_path, 'rb') as f:
        feature_mat = pickle.load(f)
    if args.k is None:
        plot_k_vs_inertia(feature_mat, args.features + '_k_vs_inertia.png')
    else:
        train_feature_mat, val_feature_mat = split_feature_mat(feature_mat)
        train_size = train_feature_mat.shape[0]
        clusters, inertia = clustering(train_feature_mat, val_feature_mat,
                                       args.k)
        print("Writing clusters")
        write_clusters(indices[train_size:],
                       clusters,
Example #8
# Convert categorical features (with 20 or fewer categories) to 1-hot vectors.
# Normalize each feature to zero mean and unit variance.

import utils
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Maximum number of unique values a feature can attain and still be considered
#   categorical
max_categories = 20

X_train, y_train = \
    utils.get_training_data('../data/train_2008.csv')
X_test = utils.get_test_points('../data/test_2008.csv')

print("X_train shape" + str(X_train.shape))
print("y_train shape" + str(y_train.shape))
print("X_test shape" + str(X_test.shape))

training_set_size = X_train.shape[0]

# Concatenate X_train and X_test (vertically) to apply operations together.
X_all = np.concatenate([X_train, X_test], axis=0)

# Count the number of unique values assumed by each feature.
cat_cols = []  # columns deemed categorical
cat_unique_values = []  # unique values for each categorical feature
noncat_cols = []  # non-categorical columns
for col_num in range(X_all.shape[1]):
    column = X_all[:, col_num:col_num+1]  # (N,1) ndarray
    unique_values = np.unique(column)
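    # Sketch continuation (an assumption, based on the comments at the top and
    # the list names above): classify the column by its number of unique values.
    if len(unique_values) <= max_categories:
        cat_cols.append(col_num)
        cat_unique_values.append(unique_values)
    else:
        noncat_cols.append(col_num)

# One possible way to finish what the opening comments describe: one-hot encode
# the categorical columns and standardize the rest. The encoder/scaler usage
# below is illustrative, not taken from the original.
from sklearn.preprocessing import StandardScaler

X_cat = OneHotEncoder().fit_transform(X_all[:, cat_cols]).toarray()
X_num = StandardScaler().fit_transform(X_all[:, noncat_cols].astype(float))
X_all_processed = np.concatenate([X_num, X_cat], axis=1)

# Split back into train and test rows using the original training-set size.
X_train_processed = X_all_processed[:training_set_size]
X_test_processed = X_all_processed[training_set_size:]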
Example #9
def build_and_train_models():
    # load MNIST dataset
    (x_train, y_train), (_, _) = mnist.load_data()

    # load small_norb data
    # (x_train, y_train) = utils.load_data('train')
    # np.save('x_train', x_train)
    # x_train = np.load('x_train.npy')
    # print('x_train.npy', np.shape(x_train))
    # np.save('y_train', y_train)
    #
    # y_train = np.load('y_train.npy')
    # print('y_train.npy', np.shape(y_train))
    # from sklearn.preprocessing import LabelEncoder
    # labelencoder = LabelEncoder()
    # y_train = labelencoder.fit_transform(y_train)

    I, T = utils.get_training_data()
    print('I.shape[1:]', I.shape[1:])
    # assert I.shape[1:] == (96, 96, 1)

    train_size = int(I.shape[0] * 0.8)
    x_train = I[:train_size, :, :, :]
    y_train = T[:train_size, :]
    print('x_train[1]:', x_train[1])
    print('y_train[1]', y_train[1])
    return  # NOTE: this early return leaves the rest of the CGAN setup below unreachable

    # val_x = I[train_size:, :, :, :]
    # val_y = T[train_size:, :]

    # reshape data for CNN as (28, 28, 1) and normalize
    image_size = x_train.shape[1]
    x_train = np.reshape(x_train, [-1, image_size, image_size, 1])
    x_train = x_train.astype('float32') / 255

    num_labels = np.amax(y_train) + 1
    y_train = to_categorical(y_train)

    model_name = "cgan_mnist"
    # network parameters
    # the latent or z vector is 100-dim
    latent_size = 100
    batch_size = 64
    train_steps = 40000
    lr = 2e-4
    decay = 6e-8
    input_shape = (image_size, image_size, 1)
    label_shape = (num_labels, )

    # build discriminator model
    inputs = Input(shape=input_shape, name='discriminator_input')
    labels = Input(shape=label_shape, name='class_labels')

    discriminator = build_discriminator(inputs, labels, image_size)
    # [1] or original paper uses Adam,
    # but discriminator converges easily with RMSprop
    optimizer = RMSprop(lr=lr, decay=decay)
    discriminator.compile(loss='binary_crossentropy',
                          optimizer=optimizer,
                          metrics=['accuracy'])
    discriminator.summary()

    # build generator model
    input_shape = (latent_size, )
    inputs = Input(shape=input_shape, name='z_input')
    generator = build_generator(inputs, labels, image_size)
    generator.summary()

    # build adversarial model = generator + discriminator
    optimizer = RMSprop(lr=lr * 0.5, decay=decay * 0.5)
    # freeze the weights of discriminator during adversarial training
    discriminator.trainable = False
    outputs = discriminator([generator([inputs, labels]), labels])
    adversarial = Model([inputs, labels], outputs, name=model_name)
    adversarial.compile(loss='binary_crossentropy',
                        optimizer=optimizer,
                        metrics=['accuracy'])
    adversarial.summary()

    # train discriminator and adversarial networks
    models = (generator, discriminator, adversarial)
    data = (x_train, y_train)
    params = (batch_size, latent_size, train_steps, num_labels, model_name)
    train(models, data, params)
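
# The train() helper is not shown in this example; a minimal sketch of one
# alternating CGAN training step under the same setup (real samples labelled 1,
# generated samples labelled 0; batch_size, latent_size and num_labels as
# defined above) might look like this:
def train_step(generator, discriminator, adversarial,
               x_train, y_train, batch_size, latent_size, num_labels):
    # Sample a batch of real images with their one-hot labels.
    idx = np.random.randint(0, x_train.shape[0], size=batch_size)
    real_images, real_labels = x_train[idx], y_train[idx]
    # Generate fake images from noise paired with random one-hot labels.
    noise = np.random.uniform(-1.0, 1.0, size=[batch_size, latent_size])
    fake_labels = np.eye(num_labels)[np.random.choice(num_labels, batch_size)]
    fake_images = generator.predict([noise, fake_labels])
    # 1) Train the discriminator on real (1.0) vs generated (0.0) samples.
    x = np.concatenate((real_images, fake_images))
    labels = np.concatenate((real_labels, fake_labels))
    y = np.ones([2 * batch_size, 1])
    y[batch_size:, :] = 0.0
    d_loss = discriminator.train_on_batch([x, labels], y)
    # 2) Train the generator through the frozen discriminator, labelling
    #    generated samples as real.
    noise = np.random.uniform(-1.0, 1.0, size=[batch_size, latent_size])
    fake_labels = np.eye(num_labels)[np.random.choice(num_labels, batch_size)]
    y = np.ones([batch_size, 1])
    a_loss = adversarial.train_on_batch([noise, fake_labels], y)
    return d_loss, a_loss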
Example #10
    return features


if __name__ == '__main__':
    # Hyperparameters

    color_space = 'RGB'
    hog_channel = 0
    orient = 9
    pix_per_cell = 8
    cell_per_block = 2
    spatial_size = (32, 32)
    hist_bins = 32

    cars, noncars = get_training_data(nsamples=1)
    car_image = mpimg.imread(cars[0])
    noncar_image = mpimg.imread(noncars[0])

    hog_features, car_hog_image = get_hog_features(car_image[:, :, hog_channel],
                                                   orient,
                                                   pix_per_cell,
                                                   cell_per_block,
                                                   vis=True,
                                                   feature_vec=True)

    hog_features, noncar_hog_image = get_hog_features(
        noncar_image[:, :, hog_channel],
        orient,
        pix_per_cell,
Example #11
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelBinarizer

from transformers import DataFrameSelector, CombinedAttributesAdder, NumpySelector
from utils import get_training_data

jobs = get_training_data()

jobs_num = jobs.drop(["job_id", "job_type", "city"], axis=1)
jobs_cat = jobs.drop(["job_id", "salary"], axis=1)

num_attribs_hour = ["salary", "hours"]
num_attribs_all = ["salary", "salary_per_hour"]

num_pipeline = Pipeline([
    ('selector_1', DataFrameSelector(num_attribs_hour)),
    ('attribs_adder', CombinedAttributesAdder(0, 1)),
    ('selector_2', NumpySelector([0, 2])),
    ('std_scaler', StandardScaler()),
])

mapper = DataFrameMapper([(cat, LabelBinarizer()) for cat in jobs_cat])

cat_pipeline = Pipeline([('label_multibinarizer', mapper)])

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
Example #12
    def __init__(self, features, save_model_name=None):
        super(DecisionTree, self).__init__(features,
                                           save_model_name=save_model_name)
        if not (self.save_model_path).exists():
            self.call_model = DecisionTreeClassifier()

    def show(self):
        plot_tree(self.call_model)
        plt.show()


if __name__ == "__main__":
    from utils import get_testing_data, get_training_data
    import numpy as np

    train_data = get_training_data(nrows=None)

    features = [
        c for c in train_data.columns.values if c not in classification_cols
    ]

    contamination = train_data["malicious"].sum() / len(train_data)
    test_data = get_testing_data(nrows=None)
    AD_models = [LOF, ISOF, OneClassSVM]

    print("contamination ratio", contamination)

    # time model
    for model_name, mc in [("dt", DecisionTree), ("MLP", MLP),
                           ("rf", RandomForest), ("ISOF", ISOF), ("svm", SVM),
                           ("OneClassSVM", OneClassSVM), ("LOF", LOF)]:
Example #13
        
        return backward
    
    def logsumexp(self, a):
        """Log sum exp trick to solve underflow issue"""
        b = a.max()
        return b + np.log((np.exp(a - b)).sum())

    def read_test_file(self, file):
        with open(file, 'r', encoding="utf-8") as f:
            test_data = f.read().rstrip().split('\n\n')
        return test_data
    
    def create_test_result_file(self, test_result, filename):
        with open(filename, "w",  encoding="utf-8") as f:
            for sequence in test_result:
                for word, tag in sequence:
                    f.write(f"{word} {tag}\n")
                f.write("\n")

if __name__ == "__main__":
    if len(sys.argv) < 4:
        print('Please make sure you have installed Python 3.4 or above!')
        print("Usage on Windows:  python emission.py [train file] [dev.in file] [result filepath]")
        print("Usage on Linux/Mac:  python3 emission.py [train file] [dev.in file] [result filepath]")
        sys.exit()
    train_data, tags = get_training_data(sys.argv[1])
    crf = CRF(tags, train_data)
    crf.train(iterations=10)
    crf.predict(sys.argv[2], sys.argv[3])
from __future__ import print_function
import random
import sys
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM, Dropout, Embedding
from keras.optimizers import RMSprop

from utils import init_stop_words, get_training_data, check_rythm
from constant import wu_yan_lv_shi

# init
iter_num = 200

text = get_training_data()
stop_words = init_stop_words()
print('stop_words:', stop_words)

for i in stop_words:
    text = text.replace(i, '')
print('corpus length:', len(text))

chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
txt_maxlen = 15000
maxlen = 5
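
# A sketch of the sequence-cutting step the comment above describes. The step
# size and the integer/one-hot encoding follow the standard Keras char-level
# LSTM recipe and are assumptions, as is treating txt_maxlen as a cap on the
# corpus length.
step = 1
corpus = text[:txt_maxlen]
sentences, next_chars = [], []
for i in range(0, len(corpus) - maxlen, step):
    sentences.append(corpus[i: i + maxlen])
    next_chars.append(corpus[i + maxlen])
print('nb sequences:', len(sentences))

# Encode inputs as integer indices (suitable for the Embedding layer imported
# above) and targets as one-hot vectors over the character vocabulary.
X = np.array([[char_indices[c] for c in s] for s in sentences])
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, c in enumerate(next_chars):
    y[i, char_indices[c]] = 1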