def evaluate(X, y, model):
    with timing():
        scores, averages = evaluate_model(X, y, model)
        print(scores)
        print(f'Averages: {averages}')
        confusion_matrix(X, y, model, XGBOOST_VISUALIZATION_PATH)
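# Usage sketch for the XGBoost evaluation above. The pipeline construction mirrors
# the MLP variant below; the XGBClassifier import and the create_pipeline arguments
# shown here are assumptions for illustration, not this module's exact entry point.
#
#   from xgboost import XGBClassifier
#   model = create_pipeline(XGBClassifier(),
#                           sampling_strategy=SamplingStrategy.OVERSAMPLING, y=y)
#   evaluate(X, y, model)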
def evaluate(X, y):
    with timing():
        model_for_evaluation = KerasClassifier(
            build_fn=create_model,
            epochs=num_epochs,
            batch_size=batch_size,
            verbose=0)
        model = create_pipeline(
            model_for_evaluation,
            sampling_strategy=SamplingStrategy.OVERSAMPLING,
            y=y)
        scores, averages = evaluate_model(X, y, model, gpu_mode=True)
        print('\n\n', scores)
        print(f'Averages: {averages}')
        confusion_matrix(X, y, model, MLP_VISUALIZATION_PATH)
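# create_model is referenced above as the KerasClassifier build_fn but is not defined
# in this snippet. A minimal sketch of the kind of function KerasClassifier expects
# (a hypothetical binary-classification MLP, not the project's actual architecture):
#
#   from keras.models import Sequential
#   from keras.layers import Dense
#
#   def create_model(input_dim=30):
#       model = Sequential()
#       model.add(Dense(64, activation='relu', input_dim=input_dim))
#       model.add(Dense(1, activation='sigmoid'))
#       model.compile(loss='binary_crossentropy', optimizer='adam',
#                     metrics=['accuracy'])
#       return model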
x_validation, y_validation = helpers.x_and_y_separation(validation)
y_train = pandas.DataFrame(y_train)
y_test = pandas.DataFrame(y_test)
y_validation = pandas.DataFrame(y_validation)

# Rebalance the training set with SMOTE oversampling.
# Note: fit_sample was renamed fit_resample in newer imbalanced-learn releases.
x_train, y_train = SMOTE().fit_sample(x_train, y_train)

# ---------------------------------------------------------- Tree
tree = models.TreeDecision(x_train, y_train).fit()
y_train['tree'] = tree.predict(x_train)
y_test['tree'] = tree.predict(x_test)
y_validation['tree'] = tree.predict(x_validation)

cm_tree_train = helpers.confusion_matrix(y_train, ['Fraude', 'tree'])
cm_tree_test = helpers.confusion_matrix(y_test, ['Fraude', 'tree'])
cm_tree_val = helpers.confusion_matrix(y_validation, ['Fraude', 'tree'])

# Export the fitted tree as a PNG via graphviz.
feature_cols = x_train.columns
dot_data = StringIO()
export_graphviz(tree, out_file=dot_data, filled=True, rounded=True,
                special_characters=True, feature_names=feature_cols,
                class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('../data/tree_entropy_{}.png'.format(datetime.date.today()))
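# helpers.confusion_matrix above is assumed to cross-tabulate the true 'Fraude'
# column against a prediction column of the same frame. A minimal sketch of such a
# helper under that assumption (hypothetical, not the project's implementation):
def crosstab_confusion_matrix_sketch(frame, cols):
    true_col, pred_col = cols
    return pandas.crosstab(frame[true_col], frame[pred_col],
                           rownames=['actual'], colnames=['predicted'])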
def train(
        input_file="clean_train.csv",
        text_col="question_text",
        label_col="target",
        valid_ratio=0.2,
        max_sentence_length=91,
        sample_percent=1,
        class_weights=None,
        cell_type="gru",
        embedding="word2vec",
        embedding_path="GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin",
        embedding_dim=300,
        rnn_layers=3,
        hidden_size=128,
        one_minus_dropout=0.5,
        l2_reg=3.0,
        batch_size=32,
        epochs=5,
        learning_rate=1e-3,
        allow_soft_placement=True,
        log_device_placement=False,
        display_every=10,
        evaluate_every=100,
        checkpoint_every=100,
        num_checkpoints=5):

    # Load and split data
    print("Loading data..")
    X, Y = read_data(input_file, text_col, label_col,
                     sample_percent=sample_percent)

    # Create a vocabulary processor.
    # Its job is to assign each unique word an integer and replace each word in
    # our sentences with its corresponding integer. These mappings are later used
    # again to substitute each word with its embedding. It also trims sentences or
    # adds trailing zeros to pad each sentence to a fixed length.
    print("Setting up vocabulary..")
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        max_sentence_length)
    X = np.array(list(vocab_processor.fit_transform(X)))
    print("Vocabulary Size: ", len(vocab_processor.vocabulary_))
    num_classes = len(Y[0])

    # Split into train and validation sets
    X, Y, x_val, y_val = split_data(X, Y, valid_ratio)

    # Initialize tensorflow config
    print("Initializing tensorflow session..")
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=allow_soft_placement,
            log_device_placement=log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            print("Initializing our RNN:")
            print("\nseq_length : ", X.shape[1],
                  "\nnum_classes : ", Y.shape[1],
                  "\nvocab_size : ", len(vocab_processor.vocabulary_),
                  "\nembedding_size : ", embedding_dim,
                  "\ncell_type : ", cell_type,
                  "\nhidden_size : ", hidden_size,
                  "\nl2 : ", l2_reg,
                  "\nclass_weights : ", class_weights,
                  "\nbatch_size : ", batch_size,
                  "\nrnn_layers : ", rnn_layers)

            # Initialize our RNN
            rnn = RNN(seq_length=X.shape[1],
                      num_classes=Y.shape[1],
                      vocab_size=len(vocab_processor.vocabulary_),
                      embedding_size=embedding_dim,
                      cell_type=cell_type,
                      hidden_size=hidden_size,
                      l2=l2_reg,
                      class_weights=class_weights,
                      batch_size=batch_size,
                      rnn_layers=rnn_layers)

            # Define training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            train_op = tf.train.AdamOptimizer(learning_rate).minimize(
                rnn.loss, global_step=global_step)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", rnn.loss)
            acc_summary = tf.summary.scalar("accuracy", rnn.accuracy)

            # Train summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Validation summaries
            val_summary_op = tf.summary.merge([loss_summary, acc_summary])
            val_summary_dir = os.path.join(out_dir, "summaries", "val")
            val_summary_writer = tf.summary.FileWriter(val_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. TensorFlow assumes this directory already
            # exists, so we need to create it.
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=num_checkpoints)

            # Write vocabulary
            vocab_processor.save(os.path.join(out_dir, "text_vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Initialize pretrained embeddings if the embedding flag is set
            if embedding:
                # Initial matrix with random uniform values
                initW = np.random.uniform(
                    -0.25, 0.25,
                    (len(vocab_processor.vocabulary_), embedding_dim))

                # For GloVe, loading embeddings is straightforward: read each
                # line, the first token is the word and everything else on the
                # line is the embedding vector for that word.
                if "glove" in embedding:
                    with open(embedding_path, "r", encoding="utf8") as f:
                        for line in f:
                            first_word = line.partition(' ')[0]
                            rest = line[line.index(' ') + 1:]
                            # Check whether the word is in our vocabulary
                            idx = vocab_processor.vocabulary_.get(first_word)
                            if idx != 0:
                                # If so, substitute the GloVe embedding for the
                                # random one
                                initW[idx] = np.fromstring(rest,
                                                           dtype='float32',
                                                           sep=" ")

                # For word2vec, we are given a binary file
                elif "word2vec" in embedding:
                    with open(embedding_path, "rb") as f:
                        # The first line is a header with the number of records
                        # and the size of one record
                        header = f.readline()
                        vocab_size, layer1_size = map(int, header.split())
                        # Number of bytes in each record =
                        # (size of a float) * size of one record
                        binary_len = np.dtype('float32').itemsize * layer1_size
                        # For each record
                        for line in range(vocab_size):
                            word = []
                            while True:
                                # Keep reading characters
                                ch = f.read(1).decode('latin-1')
                                if ch == ' ':
                                    # until a space is found; the word is then
                                    # complete
                                    word = ''.join(word)
                                    break
                                if ch != '\n':
                                    word.append(ch)
                            # Try to find that word in our vocabulary
                            idx = vocab_processor.vocabulary_.get(word)
                            if idx != 0:
                                # If found, substitute the corresponding
                                # embedding vector for the random one
                                initW[idx] = np.fromstring(f.read(binary_len),
                                                           dtype='float32')
                            else:
                                f.read(binary_len)

                sess.run(rnn.W_text.assign(initW))
                print("Successfully loaded ", embedding, "!\n")

            # Once the embeddings and basic tensorflow settings are in place,
            # we start the actual training routine.

            # Generate batches
            itr = batch_iterator(X, Y, batch_size, epochs)
            # For each batch
            for x_batch, y_batch, start, end in itr:
                # Train
                feed_dict = {
                    rnn.input_text: x_batch,
                    rnn.input_label: y_batch,
                    rnn.keep_prob: one_minus_dropout
                }
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, rnn.loss,
                    rnn.accuracy
                ], feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display
                if step % display_every == 0:
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, loss, accuracy))

                # Evaluation
                if step % evaluate_every == 0:
                    print("\nEvaluation:")
                    total_preds = np.zeros(y_val.shape)
                    itr2 = batch_iterator(x_val, y_val, batch_size, 1,
                                          shuffle=False)
                    avg_acc = 0
                    avg_loss = 0
                    steps = 0
                    for x_eval_batch, y_eval_batch, s, e in itr2:
                        feed_dict_val = {
                            rnn.input_text: x_eval_batch,
                            rnn.input_label: y_eval_batch,
                            rnn.keep_prob: 1.0
                        }
                        summaries_val, loss, accuracy, preds = sess.run([
                            val_summary_op, rnn.loss, rnn.accuracy,
                            rnn.predictions
                        ], feed_dict_val)
                        val_summary_writer.add_summary(summaries_val, step)
                        # One-hot encode the batch predictions so they can be
                        # written into the full prediction matrix
                        k = np.array([
                            one_hot_encode(num_classes, label)
                            for label in preds
                        ])
                        avg_acc += accuracy
                        avg_loss += loss
                        steps += 1
                        total_preds[s:e] = k

                    cf, f_score = confusion_matrix(y_val, total_preds, 2)
                    avg_acc /= steps
                    avg_loss /= steps
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: loss {:g}, acc {:g}, fscore {:g}\n".format(
                        time_str, avg_loss, avg_acc, f_score))
                    print("Confusion Matrix")
                    print(cf)

                # Model checkpoint
                if step % checkpoint_every == 0:
                    path = saver.save(sess, checkpoint_prefix,
                                      global_step=step)
                    print("Saved model checkpoint to {}\n".format(path))
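# one_hot_encode is used in train() above but is not defined in this snippet.
# A minimal sketch of what it is assumed to do (turn a class index into a one-hot
# vector of length num_classes); named *_sketch so it does not shadow the
# project's own helper:
def one_hot_encode_sketch(num_classes, label):
    vec = np.zeros(num_classes, dtype=np.float32)
    vec[int(label)] = 1.0
    return vec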
def evaluate(X, colname, batch_size, checkpoint_dir, labels=None,
             allow_soft_placement=True, log_device_placement=False):
    text_path = os.path.join(checkpoint_dir, "..", "text_vocab")
    text_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor.restore(
        text_path)
    X = [str(x) for x in X]
    x_eval = np.array(list(text_vocab_processor.transform(X)))
    if labels is not None:
        classes = len(labels[0])
        y_eval = np.argmax(labels, axis=1)
    else:
        y_eval = None
        classes = None

    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=allow_soft_placement,
            log_device_placement=log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_text = graph.get_operation_by_name("input_text").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name(
                "output/logits").outputs[0]

            # Generate batches for one epoch
            iterator = batch_iterator(x_eval, y_eval, batch_size, 1,
                                      shuffle=False)

            # Collect the predictions here
            all_predictions = []
            for item in iterator:
                x = item[0]
                batch_predictions = sess.run(predictions, {
                    input_text: x,
                    dropout_keep_prob: 1.0
                })
                print(batch_predictions.shape)
                print(batch_predictions[0])
                all_predictions = np.concatenate(
                    [all_predictions, batch_predictions])

    all_predictions = [
        one_hot_encode(classes, int(pred)) for pred in all_predictions
    ]
    print("predictions\n", all_predictions)
    if labels is not None:
        c, f = confusion_matrix(labels, all_predictions, classes)
        print("fscore ", f)
        print("confusion_matrix:")
        print(c)
    return all_predictions
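# Usage sketch for the evaluation routine above. The column names and the checkpoint
# path mirror the defaults used in train(); the read_data call and the concrete run
# directory are assumptions for illustration:
#
#   X_test, Y_test = read_data("clean_train.csv", "question_text", "target")
#   preds = evaluate(X_test, "question_text", batch_size=32,
#                    checkpoint_dir="runs/<timestamp>/checkpoints", labels=Y_test)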