def train_model():
    """Train the job application model and save it to disk."""
    jobs_training = get_training_data(TRAINING__DATA_PATH)
    jobs_labels = get_training_labels("applications")

    logging.info("Transforming data")
    jobs_prepared = full_pipeline.fit_transform(jobs_training)

    logging.info("Training the model")
    lin_reg = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1,
                               normalize=False)
    lin_reg.fit(jobs_prepared, jobs_labels)

    logging.info("Saving the model")
    save_trained_model(lin_reg)
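# Illustrative sketch only: save_trained_model above is defined elsewhere in
# the project and its real signature is unknown. A minimal joblib-based
# version (an assumption, not the original implementation) could look like:
import joblib

def save_trained_model(model, path="model.joblib"):
    # Persist a fitted scikit-learn estimator; reload later with joblib.load(path).
    joblib.dump(model, path)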
def run_experiment(num_models, training_dir, test_dir, results_dir, transform,
                   weights_file=None):
    training_set = get_training_data(training_dir, COLOURS, IMG_ROWS, IMG_COLS,
                                     transform=transform)
    test_set = get_test_data(test_dir, COLOURS, IMG_ROWS, IMG_COLS,
                             transform=transform)

    for epoch in range(num_models):
        callbacks_list = []
        if weights_file:
            callbacks_list.append(
                ModelCheckpoint(weights_file, monitor='val_accuracy',
                                verbose=1, save_best_only=True))
        callbacks_list.append(
            LearnedAccuracyWriter(COLOURS, test_set, epoch, PATIENCE, results_dir))
        callbacks_list.append(
            EarlyStopping(monitor='val_accuracy', patience=PATIENCE))

        model = colour_net(NUM_CLASSES, weights_file)
        model.fit_generator(training_set,
                            steps_per_epoch=STEPS_PER_EPOCH,
                            epochs=MAX_EPOCHS,
                            validation_data=test_set,
                            callbacks=callbacks_list,
                            verbose=1)
        predicted_tags = self.decode(word_array)
        tag_sequences.append([(word_array[i], predicted_tags[i])
                              for i in range(len(word_array))])
        self.create_test_result_file(tag_sequences, outfile)

    def create_test_result_file(self, test_result, filename):
        with open(filename, "w", encoding="utf-8") as f:
            for sequence in test_result:
                for word, tag in sequence:
                    f.write(f"{word} {tag}\n")
                f.write("\n")


if __name__ == "__main__":
    # Three arguments are required: the train file, the dev.in file and the
    # result filepath.
    if len(sys.argv) < 4:
        print('Please make sure you have installed Python 3.4 or above!')
        print("Usage on Windows: python emission.py [train file] [dev.in file] [result filepath]")
        print("Usage on Linux/Mac: python3 emission.py [train file] [dev.in file] [result filepath]")
        sys.exit()

    train_data = get_training_data(sys.argv[1])
    sp = StructuredPerceptron()
    sp.fit(train_data, no_of_epochs=10, learning_rate=0.2)
    sp.predict(sys.argv[2], sys.argv[3])
import utils
import nn.nnutils as nnutils
import numpy as np
import keras
from keras.layers.core import Dense

training_points, training_labels = \
    utils.get_training_data("../data/train_2008.csv")
nnutils.standardize_labels(training_labels)
assert len(training_points) == len(training_labels)

# Hold out one sixth of the training data for validation.
validation_set_size = len(training_points) // 6
validation_indices = np.random.choice(a=len(training_points),
                                      size=int(validation_set_size),
                                      replace=False)

print("Separating into training and validation sets...")
X_valid = training_points[validation_indices]
y_valid = training_labels[validation_indices]
X_train = np.delete(training_points, obj=validation_indices, axis=0)
y_train = np.delete(training_labels, obj=validation_indices, axis=0)

print("Creating Keras model...")
model = keras.models.Sequential()
model.add(Dense(30, input_dim=X_train.shape[1], activation='tanh'))
model.add(Dense(100, activation='tanh'))
model.add(Dense(1, activation='softmax'))
model.compile(loss='mean_squared_error', optimizer='adam',
              metrics=['accuracy'])
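# Illustrative continuation (not part of the original excerpt): train the
# compiled model on the split created above, validating on the held-out set.
# The epoch and batch-size values are placeholders.
model.fit(X_train, y_train,
          validation_data=(X_valid, y_valid),
          epochs=20,
          batch_size=128,
          verbose=1)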
import time

from find_cars import find_cars_in_windows

if __name__ == '__main__':
    # Hyperparameters
    color_space = 'YCrCb'
    orient = 9
    pix_per_cell = 8
    cell_per_block = 2
    hog_channel = 'ALL'  # can be 0, 1, 2 or 'ALL'
    spatial_size = (32, 32)
    hist_bins = 32

    # Read training data
    train_cars, train_notcars = get_training_data()
    t = time.time()

    # Extract features
    car_features = extract_features_from_image_list(
        train_cars, color_space=color_space, spatial_size=spatial_size,
        hist_bins=hist_bins, orient=orient, pix_per_cell=pix_per_cell,
        cell_per_block=cell_per_block, hog_channel=hog_channel,
        spatial_feat=True, hist_feat=True,
from sklearn import ensemble

import utils

training_points, training_labels = utils.get_training_data('../train_2008.csv')
test_points = utils.get_test_points('../test_2008.csv')

# AdaBoost with the default base estimator (a shallow decision tree).
clf = ensemble.AdaBoostClassifier()
clf.fit(training_points, training_labels)

utils.prepare_submission_sklearn(clf.predict, test_points)
def purity_score(c, y):
    """Cluster purity: for each cluster, count the points whose label matches
    the cluster's majority label, then divide by the total number of points."""
    A = np.c_[(c, y)]
    n_accurate = 0.
    for j in np.unique(A[:, 0]):
        z = A[A[:, 0] == j, 1]
        x = np.argmax(np.bincount(z))
        n_accurate += len(z[z == x])
    return n_accurate / A.shape[0]


if __name__ == "__main__":
    args = get_argparser()
    input_dir_path = args.input
    output_dir_path = args.output

    print("Reading data")
    indices, reviews, labels = utils.get_training_data(input_dir_path)

    print("Training model")
    if args.features == 'sparse':
        feature_mat_file_path = 'reviews_features_sparse.pkl'
    elif args.features == 'pretrained':
        feature_mat_file_path = 'reviews_features_dense_pretrained.pkl'
    else:
        feature_mat_file_path = 'reviews_features_dense_custom.pkl'
    with open(feature_mat_file_path, 'rb') as f:
        feature_mat = pickle.load(f)

    if args.k is None:
        plot_k_vs_inertia(feature_mat, args.features + '_k_vs_inertia.png')
    else:
        train_feature_mat, val_feature_mat = split_feature_mat(feature_mat)
        train_size = train_feature_mat.shape[0]
        clusters, inertia = clustering(train_feature_mat, val_feature_mat, args.k)
        print("Writing clusters")
        write_clusters(indices[train_size:], clusters,
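# Worked example (illustration only, not part of the original script): purity
# of a toy clustering. Cluster 0's majority label is 1 (2 of 2 points match)
# and cluster 1's majority label is also 1 (1 of 2 points match), so
# purity = (2 + 1) / 4 = 0.75.
#
#   c_toy = np.array([0, 0, 1, 1])   # predicted cluster ids
#   y_toy = np.array([1, 1, 1, 2])   # true labels
#   purity_score(c_toy, y_toy)       # -> 0.75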
# Convert categorical features (with 20 or fewer categories) to one-hot vectors.
# Normalize each feature to have mean 0 and variance 1.
import utils
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Maximum number of unique values a feature can attain and still be considered
# categorical.
max_categories = 20

X_train, y_train = \
    utils.get_training_data('../data/train_2008.csv')
X_test = utils.get_test_points('../data/test_2008.csv')
print("X_train shape: " + str(X_train.shape))
print("y_train shape: " + str(y_train.shape))
print("X_test shape: " + str(X_test.shape))

training_set_size = X_train.shape[0]

# Concatenate X_train and X_test (vertically) to apply operations together.
X_all = np.concatenate([X_train, X_test], axis=0)

# Count the number of unique values assumed by each feature.
cat_cols = []           # columns deemed categorical
cat_unique_values = []  # unique values for each categorical feature
noncat_cols = []        # non-categorical columns
for col_num in range(X_all.shape[1]):
    column = X_all[:, col_num:col_num+1]  # (N, 1) ndarray
    unique_values = np.unique(column)
def build_and_train_models():
    # load the MNIST dataset
    (x_train, y_train), (_, _) = mnist.load_data()

    # Alternative: load the small_norb data instead of MNIST.
    # (x_train, y_train) = utils.load_data('train')
    # np.save('x_train', x_train)
    # x_train = np.load('x_train.npy')
    # np.save('y_train', y_train)
    # y_train = np.load('y_train.npy')
    # from sklearn.preprocessing import LabelEncoder
    # labelencoder = LabelEncoder()
    # y_train = labelencoder.fit_transform(y_train)

    I, T = utils.get_training_data()
    print('I.shape[1:]', I.shape[1:])
    # assert I.shape[1:] == (96, 96, 1)
    train_size = int(I.shape[0] * 0.8)
    x_train = I[:train_size, :, :, :]
    y_train = T[:train_size, :]
    # val_x = I[train_size:, :, :, :]
    # val_y = T[train_size:, :]

    # reshape data for the CNN as (image_size, image_size, 1) and normalize
    image_size = x_train.shape[1]
    x_train = np.reshape(x_train, [-1, image_size, image_size, 1])
    x_train = x_train.astype('float32') / 255

    num_labels = np.amax(y_train) + 1
    y_train = to_categorical(y_train)

    model_name = "cgan_mnist"

    # network parameters
    # the latent or z vector is 100-dim
    latent_size = 100
    batch_size = 64
    train_steps = 40000
    lr = 2e-4
    decay = 6e-8
    input_shape = (image_size, image_size, 1)
    label_shape = (num_labels, )

    # build the discriminator model
    inputs = Input(shape=input_shape, name='discriminator_input')
    labels = Input(shape=label_shape, name='class_labels')
    discriminator = build_discriminator(inputs, labels, image_size)
    # [1] and the original paper use Adam,
    # but the discriminator converges easily with RMSprop
    optimizer = RMSprop(lr=lr, decay=decay)
    discriminator.compile(loss='binary_crossentropy',
                          optimizer=optimizer,
                          metrics=['accuracy'])
    discriminator.summary()

    # build the generator model
    input_shape = (latent_size, )
    inputs = Input(shape=input_shape, name='z_input')
    generator = build_generator(inputs, labels, image_size)
    generator.summary()

    # build the adversarial model = generator + discriminator
    optimizer = RMSprop(lr=lr * 0.5, decay=decay * 0.5)
    # freeze the weights of the discriminator during adversarial training
    discriminator.trainable = False
    outputs = discriminator([generator([inputs, labels]), labels])
    adversarial = Model([inputs, labels], outputs, name=model_name)
    adversarial.compile(loss='binary_crossentropy',
                        optimizer=optimizer,
                        metrics=['accuracy'])
    adversarial.summary()

    # train the discriminator and adversarial networks
    models = (generator, discriminator, adversarial)
    data = (x_train, y_train)
    params = (batch_size, latent_size, train_steps, num_labels, model_name)
    train(models, data, params)
    return features


if __name__ == '__main__':
    # Hyperparameters
    color_space = 'RGB'
    hog_channel = 0
    orient = 9
    pix_per_cell = 8
    cell_per_block = 2
    spatial_size = (32, 32)
    hist_bins = 32

    cars, noncars = get_training_data(nsamples=1)
    car_image = mpimg.imread(cars[0])
    noncar_image = mpimg.imread(noncars[0])

    hog_features, car_hog_image = get_hog_features(car_image[:, :, hog_channel],
                                                   orient, pix_per_cell,
                                                   cell_per_block, vis=True,
                                                   feature_vec=True)
    hog_features, noncar_hog_image = get_hog_features(
        noncar_image[:, :, hog_channel], orient, pix_per_cell,
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn_pandas import DataFrameMapper

from transformers import DataFrameSelector, CombinedAttributesAdder, NumpySelector
from utils import get_training_data

jobs = get_training_data()
jobs_num = jobs.drop(["job_id", "job_type", "city"], axis=1)
jobs_cat = jobs.drop(["job_id", "salary"], axis=1)

num_attribs_hour = ["salary", "hours"]
num_attribs_all = ["salary", "salary_per_hour"]

num_pipeline = Pipeline([
    ('selector_1', DataFrameSelector(num_attribs_hour)),
    ('attribs_adder', CombinedAttributesAdder(0, 1)),
    ('selector_2', NumpySelector([0, 2])),
    ('std_scaler', StandardScaler()),
])

mapper = DataFrameMapper([(cat, LabelBinarizer()) for cat in jobs_cat])
cat_pipeline = Pipeline([('label_multibinarizer', mapper)])

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
    def __init__(self, features, save_model_name=None):
        super(DecisionTree, self).__init__(features,
                                           save_model_name=save_model_name)
        if not self.save_model_path.exists():
            self.call_model = DecisionTreeClassifier()

    def show(self):
        plot_tree(self.call_model)
        plt.show()


if __name__ == "__main__":
    from utils import get_testing_data, get_training_data
    import numpy as np

    train_data = get_training_data(nrows=None)
    features = [
        c for c in train_data.columns.values
        if c not in classification_cols
    ]
    contamination = train_data["malicious"].sum() / len(train_data)
    test_data = get_testing_data(nrows=None)
    AD_models = [LOF, ISOF, OneClassSVM]
    print("contamination ratio", contamination)

    # time each model
    for model_name, mc in [("dt", DecisionTree), ("MLP", MLP),
                           ("rf", RandomForest), ("ISOF", ISOF),
                           ("svm", SVM), ("OneClassSVM", OneClassSVM),
                           ("LOF", LOF)]:
        return backward

    def logsumexp(self, a):
        """Log-sum-exp trick to avoid underflow."""
        b = a.max()
        return b + np.log((np.exp(a - b)).sum())

    def read_test_file(self, file):
        with open(file, 'r', encoding="utf-8") as f:
            test_data = f.read().rstrip().split('\n\n')
        return test_data

    def create_test_result_file(self, test_result, filename):
        with open(filename, "w", encoding="utf-8") as f:
            for sequence in test_result:
                for word, tag in sequence:
                    f.write(f"{word} {tag}\n")
                f.write("\n")


if __name__ == "__main__":
    # Three arguments are required: the train file, the dev.in file and the
    # result filepath.
    if len(sys.argv) < 4:
        print('Please make sure you have installed Python 3.4 or above!')
        print("Usage on Windows: python emission.py [train file] [dev.in file] [result filepath]")
        print("Usage on Linux/Mac: python3 emission.py [train file] [dev.in file] [result filepath]")
        sys.exit()

    train_data, tags = get_training_data(sys.argv[1])
    crf = CRF(tags, train_data)
    crf.train(iterations=10)
    crf.predict(sys.argv[2], sys.argv[3])
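# Illustration (not part of the original file): why the log-sum-exp trick in
# logsumexp above matters. With very negative log-scores the naive computation
# underflows to log(0) = -inf, while subtracting the max keeps it finite.
import numpy as np

a = np.array([-1000.0, -1001.0, -1002.0])
naive = np.log(np.exp(a).sum())           # exp underflows to 0 -> -inf
b = a.max()
stable = b + np.log(np.exp(a - b).sum())  # ~ -999.59
print(naive, stable)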
from __future__ import print_function

import random
import sys

import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM, Dropout, Embedding
from keras.optimizers import RMSprop

from utils import init_stop_words, get_training_data, check_rythm
from constant import wu_yan_lv_shi

# init
iter_num = 200
text = get_training_data()
stop_words = init_stop_words()
print('stop_words:', stop_words)
for i in stop_words:
    text = text.replace(i, '')
print('corpus length:', len(text))

chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text into semi-redundant sequences of maxlen characters
txt_maxlen = 15000
maxlen = 5