def create_lang_dataset(out_data_path, min_samples=0):
    """
    Create the files holding the data for the language prediction model.

    :param out_data_path: where to save the files
    :param min_samples: maximum number of samples kept per language
                        (0 or less means effectively unlimited)
    """
    if out_data_path[-1] != '/':
        out_data_path = out_data_path + '/'
    if min_samples <= 0:
        min_samples = 2 ** 20

    en_input, en_output = get_data("accent", english_dataset_path,
                                   file_list[5], out_accent_file)
    fr_input, fr_output = get_data("accent", french_dataset_path,
                                   file_list[5], out_accent_file)
    de_input, de_output = get_data("accent", german_dataset_path,
                                   file_list[5], out_accent_file)

    en_input = en_input[:min_samples]
    fr_input = fr_input[:min_samples]
    de_input = de_input[:min_samples]

    # The language itself is the label, one per input sample.
    en_output = ["english" for _ in range(len(en_input))]
    fr_output = ["french" for _ in range(len(fr_input))]
    de_output = ["german" for _ in range(len(de_input))]

    inputs = en_input + fr_input + de_input
    outputs = en_output + fr_output + de_output

    print(len(inputs))
    print(len(outputs))
    print(get_count(outputs))

    get_features(out_data_path + "lang_", inputs, ['delta', 'delta2', 'sdc'])
    write_to_file_labels(out_data_path + "lang_out", outputs)

    in_files = ["lang_input" + str(i + 1) for i in range(6)]
    concat_files(out_data_path, in_files, "lang_in")
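if __name__ == '__main__':
    # Hypothetical invocation (path and cap are placeholders): assuming the
    # module-level corpus paths (english_dataset_path, french_dataset_path,
    # german_dataset_path) and file_list are configured, this writes the
    # "lang_" feature files and the "lang_out" labels under ./data/,
    # keeping at most 5000 samples per language.
    create_lang_dataset('./data/', min_samples=5000)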
def create_gender_dataset(out_data_path, min_samples=0):
    """
    Create the files holding the data for the gender prediction model.

    :param out_data_path: where to save the files
    :param min_samples: minimum number of samples per class
    """
    if out_data_path[-1] != '/':
        out_data_path = out_data_path + '/'
    if min_samples <= 0:
        min_samples = 2 ** 20

    en_input, en_output = get_data(["gender", "age"], english_dataset_path,
                                   file_list[5], out_gender_file)
    inputs, outputs = clean_gender_dataset(en_input, en_output)
    inputs, outputs = create_equal_dataset(inputs, outputs, min_samples)

    print(len(inputs))
    print(len(outputs))
    print(get_count(outputs))

    get_features(out_data_path + "gender_", inputs, ['delta', 'delta2', 'pitch'])
    write_to_file_labels(out_data_path + "gender_out", outputs)

    in_files = ["gender_input" + str(i + 1) for i in range(6)]
    concat_files(out_data_path, in_files, "gender_in")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", default=".")
    parser.add_argument("--modes", action='append', required=True)
    parser.add_argument("--sets", action='append', required=True)
    parser.add_argument("--normalize", default=True)
    parser.add_argument("model_path", help="Pylearn2 model")
    options = parser.parse_args()

    from extract_features import get_features
    from emotiw.bouthilx.datasets import FeaturesDataset

    out = options.out
    d_modes = options.modes
    sets = options.sets
    model_path = options.model_path
    normalize = options.normalize

    targets = os.path.join(base_path, "afew2_train_targets.npy")

    from theano import config

    for s in sets:
        features = [os.path.join(base_path, modes[mode], base_name % s)
                    for mode in d_modes]
        fd = FeaturesDataset(features, targets, "", normalize, shuffle=False)
        data = np.cast[config.floatX](fd.get_design_matrix())
        preds = get_features(model_path, data, layer_idx=None)
        np.save(os.path.join(out, "_".join(d_modes) + "_" + s), preds)
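# Hypothetical command line for the script above (the script name is a
# placeholder; the mode names must be keys of the module-level `modes`
# dict, and --modes/--sets may each be repeated to accumulate values):
#
#   python extract_preds.py --modes audio --modes video \
#       --sets train --sets val path/to/model.pkl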
def create_age_dataset(out_data_path, min_samples=0):
    """
    Create the files holding the data for the age prediction model.

    :param out_data_path: where to save the files
    :param min_samples: minimum number of samples per class
    """
    if out_data_path[-1] != '/':
        out_data_path = out_data_path + '/'
    if min_samples <= 0:
        min_samples = 2 ** 20

    en_input, en_output = get_data("age", english_dataset_path,
                                   file_list[5], out_age_file)
    inputs, outputs = clean_age_dataset(en_input, en_output)
    inputs, outputs = create_equal_dataset(inputs, outputs, min_samples)

    print(len(inputs))
    print(len(outputs))
    print(get_count(outputs))

    get_features(out_data_path + "age_", inputs, ['delta', 'delta2', 'pitch'])
    write_to_file_labels(out_data_path + "age_out", outputs)

    in_files = ["age_input" + str(i + 1) for i in range(6)]
    concat_files(out_data_path, in_files, "age_in")
def calculate_featuresX(filename, a, sw):
    # All samples for activity `a`; columns are the x, y, z axes
    X = genfromtxt(filename, delimiter=' ')
    i = 0
    # Get functions for features
    features = extract_features.generate_features()
    # Calculated features matrix
    outf = None
    while i + sw < X.shape[0]:
        fx = extract_features.get_features(X[i:i + sw, 0], features)
        fy = extract_features.get_features(X[i:i + sw, 1], features)
        fz = extract_features.get_features(X[i:i + sw, 2], features)
        # Concatenate the per-axis feature vectors and append the activity label
        feat = np.concatenate((fx, fy, fz, [a]))
        if outf is None:
            outf = feat
        else:
            # Stack feature rows into a matrix
            outf = np.vstack((outf, feat))
        # Move the window forward by half its length (50% overlap)
        i += sw // 2
    savetxt('../data/huawei-p7/' + filename.split('/')[-1].split('.')[0] + 'X.txt',
            outf, delimiter=',')
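# Sanity check of the window bookkeeping above (values are illustrative):
# with sw = 128 and the 50% overlap step of sw // 2 = 64, a 1000-sample
# file yields windows starting at 0, 64, ..., 832, i.e. 14 feature rows
# per axis, since each window must fit strictly inside the signal.
sw, n = 128, 1000
starts = list(range(0, n - sw, sw // 2))
assert len(starts) == 14 and starts[-1] == 832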
def engine_evaluate(position):
    '''
    The zero-search engine's evaluation of `position`; a higher number
    means that the engine evaluates that the position favors white.
    '''
    # Extract a feature vector, scale it, run the model, and map the
    # prediction back to the original evaluation scale.
    x_unscaled = np.array([extract_features.get_features(position)]).astype(float)
    x_scaled = train.scaler_X.transform(x_unscaled)
    y_scaled = model.predict(extract_features.split_features(x_scaled))
    y_unscaled = train.scaler_Y.inverse_transform(y_scaled)
    return y_unscaled[0][0]
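if __name__ == '__main__':
    # Hypothetical smoke test, assuming `model` and the fitted scalers in
    # `train` are already loaded at import time: evaluate the starting
    # position, where a positive score favors white.
    import chess
    print(engine_evaluate(chess.Board()))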
def make_prediction(url):
    features = get_features(url)
    # print(features)
    features_extracted = convertEncodingToPositive(features)
    # print(features_extracted)
    # Load the fitted one-hot encoder and the trained random forest from disk.
    with open("One_Hot_Encoder", "rb") as f:
        one_hot_enc = pickle.load(f)
    transformed_point = one_hot_enc.transform(
        np.array(features_extracted).reshape(1, -1))
    with open("RF_Final_Model.pkl", "rb") as f:
        model = pickle.load(f)
    prediction = model.predict(transformed_point)[0]
    return prediction
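if __name__ == '__main__':
    # Hypothetical usage: classify a single URL with the pickled encoder
    # and random-forest model loaded above (the URL is a placeholder).
    print(make_prediction('http://example.com/login'))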
def predict(filename, le, model_file):
    model = load_model(model_file)
    prediction_feature = extract_features.get_features(filename)
    # Reshape the feature vector to match the input the model was trained on.
    if model_file == "trained_mlp.h5":
        prediction_feature = np.array([prediction_feature])
    elif model_file == "trained_cnn.h5":
        prediction_feature = np.expand_dims(np.array([prediction_feature]), axis=2)
    predicted_vector = model.predict_classes(prediction_feature)
    predicted_class = le.inverse_transform(predicted_vector)
    print("Predicted class", predicted_class[0])
    # Print the probability the model assigns to each class.
    predicted_proba_vector = model.predict_proba(prediction_feature)
    predicted_proba = predicted_proba_vector[0]
    for i in range(len(predicted_proba)):
        category = le.inverse_transform(np.array([i]))
        print(category[0], "\t\t : ", format(predicted_proba[i], '.32f'))
def predict(filename, model_file):
    model = load_model(model_file)
    prediction_feature = extract_features.get_features(filename)
    if model_file == "trained_mlp.h5":
        prediction_feature = np.array([prediction_feature])
    elif model_file == "trained_cnn.h5":
        prediction_feature = np.expand_dims(np.array([prediction_feature]), axis=2)
    predicted_vector = model.predict_classes(prediction_feature)
    # predicted_class = le.inverse_transform(predicted_vector)
    classes = predicted_vector[0]
    # print("Predicted: ", classes)
    # print("Predicted class", predicted_class[0])
    predicted_proba_vector = model.predict_proba(prediction_feature)
    predicted_proba = predicted_proba_vector[0]
    # print("Prob:", predicted_proba[1])
    # Return the probability of the positive class as a percentage.
    return predicted_proba[1] * 100
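if __name__ == '__main__':
    # Hypothetical usage: score one audio file with the CNN weights; the
    # file name is a placeholder and the returned value is the
    # positive-class probability in percent.
    score = predict('sample.wav', 'trained_cnn.h5')
    print('Positive-class probability: {:.2f}%'.format(score))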
def main(_):
    trn_snt_files = [
        # '../datasets/training/as_simplified_training.utf8',
        '../datasets/training/cityu_simplified_training.utf8',
        '../datasets/training/msr_training.utf8',
        '../datasets/training/pku_training.utf8'
    ]
    trn_lbl_files = [splitext(f)[0] + '.bies' for f in trn_snt_files]

    tf.logging.info('Loading training data...')
    trn_examples = read_examples(trn_snt_files)
    trn_features = np.asarray(get_features(trn_examples, ''))
    y_trn = read_labels(trn_lbl_files)

    tf.logging.info('Creating model...')
    model = create_model()
    model.summary()

    tf.logging.info('Training model...')
    epochs = 1
    batch_size = 32
    steps = int(len(trn_features) / batch_size)
    for epoch in range(epochs):
        print('Epoch', epoch + 1)
        for uni_b, lbl_b in tqdm(train_data_generator([trn_features, y_trn],
                                                      batch_size, shuffle=True),
                                 desc='Training Loop', total=steps):
            try:
                loss, acc = model.train_on_batch(uni_b, lbl_b)
                # print('Loss:', loss, 'Acc:', acc)
            except Exception as e:
                print(e)
    model.save('combined_bert_model.h5')
def random_guess():
    data_random = {}
    data_random['domain'] = random.sample(domain_value_list, 1)[0]
    data_random['intent'] = random.sample(intent_value_list, 1)[0]
    slot = {}
    slot[random.sample(slots_key_list, 1)[0]] = \
        random.sample(slots_value_list, 1)[0]
    data_random['slots'] = slot
    return data_random


if __name__ == '__main__':
    import json

    dev_dct = json.load(open(sys.argv[1], encoding='utf8'))
    domain_value_list, intent_value_list, slots_key_list, slots_value_list = \
        get_features(sys.argv[1])
    rguess_dct = []
    for dev_data in dev_dct:
        text_dic = {"text": dev_data['text']}
        rguess_dct.append(dict(text_dic, **random_guess()))
    json.dump(rguess_dct, open(sys.argv[2], 'w'))
def create_data(n_samples, verbose=False):
    file_game_pgns, file_stockfish_evals = (
        open(
            '/Users/colinni/evAl-chess/game_database.pgn',
            encoding='utf-8-sig',
            errors='surrogateescape'
        ),
        open('/Users/colinni/evAl-chess/stockfish_evaluations.csv')
    )
    # Discard the first line; it contains headers.
    file_stockfish_evals.readline()

    # The accumulated data samples.
    data_X, data_Y = [], []

    # Iterate through every game in the archive.
    # (`chess.pgn.read_game()` returns None when it reaches the EOF.)
    curr_game = chess.pgn.read_game(file_game_pgns)
    n_curr_sample = 0
    while curr_game is not None and n_curr_sample < n_samples:
        print('\rcurr game |', n_curr_sample, end='')
        # The evaluations of each position of each game.
        stockfish_evals = (
            # The evaluations are given in centi-pawns. Convert to the
            # more standard pawn scale. Stockfish gives 'NA' for forced
            # mates.
            float(stockfish_eval) / 100.0
            if stockfish_eval != 'NA'
            else None
            # The lines each begin with a number and comma (e.g., '451,')
            # which aren't part of the evaluations. Discard by splitting
            # the string by the comma, taking the second part, and
            # splitting once again to get the individual numbers.
            for stockfish_eval in (
                file_stockfish_evals.readline()
                .split(',')[1]
                .split()
            )
        )
        # Iterate through every move played using the `chess.Game` class.
        # Setting `curr_game_node` to `None` as a flag is sloppy, but the
        # `chess.Game` class doesn't have a better way of detecting 0-move
        # games, which the database does contain.
        curr_game_node = (
            curr_game.root().variation(0)
            if not curr_game.root().variation(0).is_end()
            else None
        )
        while curr_game_node is not None and n_curr_sample < n_samples:
            features, stockfish_eval = (
                extract_features.get_features(
                    curr_game_node.board(),
                    verbose=verbose
                ),
                next(stockfish_evals)
            )
            # Stockfish gives 'NA' for forced mates, which we earlier set
            # to `None`; skip those positions.
            if stockfish_eval is not None:
                data_X.append(features)
                data_Y.append(stockfish_eval)
                n_curr_sample += 1
            # Set `curr_game_node` to the next position in the game. If
            # it's the end of the game, set it to None as a flag.
            curr_game_node = (
                curr_game_node.variation(0)
                if not curr_game_node.is_end()
                else None
            )
        # Get the next game in the pgn file.
        curr_game = chess.pgn.read_game(file_game_pgns)

    # Convert `data_X` and `data_Y` into numpy arrays and store them in
    # numpy's npy format. To load, `np.load(path)`.
    np.save('../evAl-chess/X.npy', np.array(data_X).astype(float))
    np.save('../evAl-chess/Y.npy', np.array(data_Y))
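if __name__ == '__main__':
    # Build a dataset and reload it, as the closing comment above notes
    # (`np.load(path)`); the sample count here is a placeholder.
    create_data(n_samples=10000)
    X = np.load('../evAl-chess/X.npy')
    Y = np.load('../evAl-chess/Y.npy')
    print(X.shape, Y.shape)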
def train_imageAVmodel():
    training_file = sys.argv[1]
    data_directory = sys.argv[2]
    parameter_file = 'params.json'
    params = json.loads(open(parameter_file).read())

    if params['extract_features'] == 'true':
        x_raw, y_raw = get_features(training_file, data_directory,
                                    params['vgg_file'], params['gistFile'],
                                    params['semF_file'])
        with open('x_data.pickle', 'wb') as f:
            pickle.dump(x_raw, f, protocol=pickle.HIGHEST_PROTOCOL)
        with open('y_data.pickle', 'wb') as f:
            pickle.dump(y_raw, f, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        with open(params['x_file'], 'rb') as f:
            x_raw = pickle.load(f)
        with open(params['y_file'], 'rb') as g:
            y_raw = pickle.load(g)

    x = np.array(x_raw)
    y = np.array(y_raw)

    """ randomly shuffle data """
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    """ split the original dataset into train_ and test sets """
    x_, x_test, y_, y_test = train_test_split(x_shuffled, y_shuffled,
                                              test_size=0.1, random_state=42)

    """ shuffle the train_ set and split the train set into train and val sets """
    shuffle_indices = np.random.permutation(np.arange(len(y_)))
    x_shuffled = x_[shuffle_indices]
    y_shuffled = y_[shuffle_indices]
    x_train, x_val, y_train, y_val = train_test_split(x_shuffled, y_shuffled,
                                                      test_size=0.1)

    logging.info('x_train: {}, x_val: {}, x_test: {}'.format(
        len(x_train), len(x_val), len(x_test)))
    logging.info('y_train: {}, y_val: {}, y_test: {}'.format(
        len(y_train), len(y_val), len(y_test)))

    """ build a graph """
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            imageAV = ImageAVmodel(
                input_length=x_train.shape[1],
                num_neurons_in_layers=params['num_neurons_in_layers'])

            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.MomentumOptimizer(params['learning_rate'],
                                                   params['momentum'])
            # grads_and_vars = optimizer.compute_gradients(imageAV.loss)
            train_op = optimizer.minimize(imageAV.loss, global_step=global_step)

            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
                "modelData/trained_model_" + timestamp))
            saved_model_dir = os.path.abspath(os.path.join(out_dir, "saved_model"))
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables())

            if params['warm_start'] == 'true':
                saver.restore(sess, params['save_path'])
                logging.info('Model loaded from {}'.format(params['save_path']))

            # One training step: train the model with one batch
            def train_step(x_batch, y_batch):
                y_batch = np.reshape(y_batch, (len(y_batch), 1))
                feed_dict = {imageAV.input_x: x_batch,
                             imageAV.input_y: y_batch}
                _, step, loss, loss_S = sess.run(
                    [train_op, global_step, imageAV.loss, imageAV.loss_summary],
                    feed_dict)
                return loss, loss_S

            # One evaluation step: evaluate the model with one batch
            def val_step(x_batch, y_batch):
                y_batch = np.reshape(y_batch, (len(y_batch), 1))
                feed_dict = {imageAV.input_x: x_batch,
                             imageAV.input_y: y_batch}
                step, loss, loss_S = sess.run(
                    [global_step, imageAV.loss, imageAV.loss_summary],
                    feed_dict)
                return loss, loss_S

            sess.run(tf.global_variables_initializer())
            writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph())

            train_batches = batch_iter(list(zip(x_train, y_train)),
                                       params['batch_size'], params['num_epochs'])
            min_loss, min_at_step = float("inf"), 0

            logging.info('<--------------Training has begun--------------->')
            """ train the cnn model with x_train and y_train (batch by batch) """
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_loss, _train_loss_summary = train_step(x_train_batch,
                                                             y_train_batch)
                current_step = tf.train.global_step(sess, global_step)
                logging.debug('Train Step {}, Loss: {}'.format(current_step,
                                                               train_loss))
                writer.add_summary(_train_loss_summary, current_step)

                """ evaluate the model with x_val and y_val (batch by batch) """
                if current_step % params['evaluate_every'] == 0:
                    val_batches = batch_iter(list(zip(x_val, y_val)),
                                             params['batch_size'], 1)
                    total_val_loss = 0.0
                    for val_batch in val_batches:
                        x_val_batch, y_val_batch = zip(*val_batch)
                        val_loss, _val_loss_summary = val_step(x_val_batch,
                                                               y_val_batch)
                        total_val_loss += val_loss
                        writer.add_summary(_val_loss_summary, current_step)
                    # avg_val_loss = total_val_loss / len(y_val)
                    logging.info('At step {}, Total loss on val set: {}'.format(
                        current_step, total_val_loss))
                    # logging.info('At step {}, Average loss on val set: {}'.format(current_step, avg_val_loss))

                    """ save the model if it is the best based on loss on the val set """
                    if total_val_loss <= min_loss:
                        min_loss, min_at_step = total_val_loss, current_step
                        path = saver.save(sess, checkpoint_prefix,
                                          global_step=current_step)
                        logging.debug('Saved model {} at step {}'.format(path, min_at_step))
                        logging.debug('Best accuracy {} at step {}'.format(min_loss, min_at_step))

            """ predict x_test (batch by batch) """
            test_batches = batch_iter(list(zip(x_test, y_test)),
                                      params['batch_size'], 1)
            total_test_loss = 0.0
            logging.info("Testing Now.")
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                test_loss, _test_loss_summary = val_step(x_test_batch, y_test_batch)
                total_test_loss += test_loss
            # avg_test_loss = total_test_loss / len(y_test)
            logging.info('Total loss on the test set is {} based on the best model {}'.format(
                total_test_loss, path))
            # logging.critical('Average loss on test set is {} based on the best model {}'.format(avg_test_loss, path))
            logging.info('The training is complete.')

            """ saving the model """