def evaluate(X, y, model):
    with timing():
        scores, averages = evaluate_model(X, y, model)
        print(scores)
        print(f'Averages: {averages}')

        confusion_matrix(X, y, model, XGBOOST_VISUALIZATION_PATH)
def evaluate(X, y):
    with timing():
        # Wrap the Keras model factory in a scikit-learn compatible classifier
        model_for_evaluation = KerasClassifier(
            build_fn=create_model, epochs=num_epochs, batch_size=batch_size, verbose=0)
        # Build the evaluation pipeline with an oversampling strategy
        model = create_pipeline(
            model_for_evaluation, sampling_strategy=SamplingStrategy.OVERSAMPLING, y=y)
        scores, averages = evaluate_model(X, y, model, gpu_mode=True)
        print('\n\n', scores)
        print(f'Averages: {averages}')

        confusion_matrix(X, y, model, MLP_VISUALIZATION_PATH)
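
# Both evaluate() examples above use a timing() context manager that is not shown
# in these snippets. A minimal sketch of such a helper (hypothetical, not the
# original implementation) might look like this:
from contextlib import contextmanager
import time

@contextmanager
def timing():
    start = time.perf_counter()
    try:
        yield
    finally:
        print(f'Elapsed: {time.perf_counter() - start:.2f}s')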
Example #3
import datetime
from io import StringIO

import pandas
import pydotplus
from imblearn.over_sampling import SMOTE
from sklearn.tree import export_graphviz

import helpers  # project-local helper module (assumed available)
import models   # project-local model definitions (assumed available)

# Note: the train/test frames (x_train, y_train, x_test, y_test) and the
# validation split are created earlier in the original script (not shown here).
x_validation, y_validation = helpers.x_and_y_separation(validation)

y_train = pandas.DataFrame(y_train)
y_test = pandas.DataFrame(y_test)
y_validation = pandas.DataFrame(y_validation)

# Oversample the minority class with SMOTE
x_train, y_train = SMOTE().fit_resample(x_train, y_train)

# ---------------------------------------------------------- Tree
tree = models.TreeDecision(x_train, y_train).fit()

y_train['tree'] = tree.predict(x_train)
y_test['tree'] = tree.predict(x_test)
y_validation['tree'] = tree.predict(x_validation)

cm_tree_train = helpers.confusion_matrix(y_train, ['Fraude', 'tree'])
cm_tree_test = helpers.confusion_matrix(y_test, ['Fraude', 'tree'])
cm_tree_val = helpers.confusion_matrix(y_validation, ['Fraude', 'tree'])

feature_cols = x_train.columns
dot_data = StringIO()

export_graphviz(tree,
                out_file=dot_data,
                filled=True,
                rounded=True,
                special_characters=True,
                feature_names=feature_cols,
                class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('../data/tree_entropy_{}.png'.format(datetime.date.today()))
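
# The helpers.confusion_matrix calls above receive a DataFrame and a pair of column
# names ([true_label_column, predicted_column]). The original helper is not shown;
# a minimal sketch with that assumed interface, built on pandas.crosstab:
def confusion_matrix_sketch(df, columns):
    true_col, pred_col = columns
    return pandas.crosstab(df[true_col], df[pred_col],
                           rownames=['actual'], colnames=['predicted'])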
def train(
        input_file="clean_train.csv",
        text_col="question_text",
        label_col="target",
        valid_ratio=0.2,
        max_sentence_length=91,
        sample_percent=1,
        class_weights=None,
        cell_type="gru",
        embedding="word2vec",
        embedding_path="GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin",
        embedding_dim=300,
        rnn_layers=3,
        hidden_size=128,
        one_minus_dropout=0.5,
        l2_reg=3.0,
        batch_size=32,
        epochs=5,
        learning_rate=1e-3,
        allow_soft_placement=True,
        log_device_placement=False,
        display_every=10,
        evaluate_every=100,
        checkpoint_every=100,
        num_checkpoints=5):
    # Load and split data
    print("Loading data..")
    X, Y = read_data(input_file,
                     text_col,
                     label_col,
                     sample_percent=sample_percent)

    # Create a vocabulary processor.
    # Its job is to assign each unique word an integer and then replace each word
    # in our sentences with its corresponding integer.
    # These mappings are later used again to substitute each word with its embedding.
    # It also trims sentences or pads them with trailing zeros to fit a fixed length.
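    # For example (illustrative ids only): with max_sentence_length = 5,
    #   "is this real life"  ->  [4, 7, 12, 9, 0]   (index 0 is reserved for padding / unknown words)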
    print("Setting up vocabulary..")
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        max_sentence_length)
    X = np.array(list(vocab_processor.fit_transform(X)))
    print("Vocabulary Size: ", len(vocab_processor.vocabulary_))
    num_classes = len(Y[0])

    # Split into train and validation sets
    X, Y, x_val, y_val = split_data(X, Y, valid_ratio)

    # initialize tensorflow config
    print("Initializing tensorflow session..")
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=allow_soft_placement,
            log_device_placement=log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            print("Initializing our RNN:")
            print("\nseq_length : ", X.shape[1], "\nnum_classes : ",
                  Y.shape[1], "\nvocab_size : ",
                  len(vocab_processor.vocabulary_), "\nembedding_size : ",
                  embedding_dim, "\ncell_type : ", cell_type,
                  "\nhidden_size : ", hidden_size, "\nl2 : ", l2_reg,
                  "\nclass_weights :  ", class_weights, "\nbatch_size : ",
                  batch_size, "\nrnn_layers :  ", rnn_layers)
            # Initialize our RNN
            rnn = RNN(seq_length=X.shape[1],
                      num_classes=Y.shape[1],
                      vocab_size=len(vocab_processor.vocabulary_),
                      embedding_size=embedding_dim,
                      cell_type=cell_type,
                      hidden_size=hidden_size,
                      l2=l2_reg,
                      class_weights=class_weights,
                      batch_size=batch_size,
                      rnn_layers=rnn_layers)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            train_op = tf.train.AdamOptimizer(learning_rate).minimize(
                rnn.loss, global_step=global_step)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", rnn.loss)
            acc_summary = tf.summary.scalar("accuracy", rnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Validation summaries
            val_summary_op = tf.summary.merge([loss_summary, acc_summary])
            val_summary_dir = os.path.join(out_dir, "summaries", "val")
            val_summary_writer = tf.summary.FileWriter(val_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")

            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=num_checkpoints)

            # Write vocabulary
            vocab_processor.save(os.path.join(out_dir, "text_vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Initialize pretrained embeddings if the embedding flag is set
            if embedding:
                # initial matrix with random uniform
                initW = np.random.uniform(
                    -0.25, 0.25,
                    (len(vocab_processor.vocabulary_), embedding_dim))

                # For GloVe, loading the embeddings is straightforward:
                # read each line; the first token is the word and everything
                # else on the line is the embedding vector for that word.
                if "glove" in embedding:
                    with open(embedding_path, "r", encoding="utf8") as f:
                        for line in f:
                            first_word = line.partition(' ')[0]
                            rest = line[line.index(' ') + 1:]
                            # Find if word in our vocabulary
                            idx = vocab_processor.vocabulary_.get(first_word)
                            if idx != 0:
                                # If so, substitute the GloVe embedding for the random one
                                initW[idx] = np.fromstring(rest,
                                                           dtype='float32',
                                                           sep=" ")
                # In case of word2vec, we are given a bin file
                elif "word2vec" in embedding:
                    with open(embedding_path, "rb") as f:
                        # The first line is a header with the number of records and the size of one record
                        header = f.readline()
                        vocab_size, layer1_size = map(int, header.split())
                        # Number of bytes in each record = (size of a float32) * (size of one record)
                        binary_len = np.dtype('float32').itemsize * layer1_size
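                        # e.g. for 300-dimensional float32 vectors:
                        #   binary_len = 4 bytes * 300 = 1200 bytes per embedding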
                        # for each record
                        for line in range(vocab_size):
                            word = []
                            while True:
                                # Keep reading one character at a time
                                ch = f.read(1).decode('latin-1')
                                if ch == ' ':
                                    # until you find a space, then the first word is complete
                                    word = ''.join(word)
                                    break
                                if ch != '\n':
                                    word.append(ch)
                            # Try to find that first word in our vocabulary
                            idx = vocab_processor.vocabulary_.get(word)
                            if idx != 0:
                                # if found, substitute the corresponding embedding vector for the random one
                                initW[idx] = np.fromstring(f.read(binary_len),
                                                           dtype='float32')
                            else:
                                f.read(binary_len)

                sess.run(rnn.W_text.assign(initW))
                print("Successful to load ", embedding, "!\n")

            # Once we are done with the embeddings and basic tensorflow settings
            # We now start with actual training routine

            # Generate batches
            itr = batch_iterator(X, Y, batch_size, epochs)
            # For each batch
            for x_batch, y_batch, start, end in itr:
                # Train
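                # keep_prob receives one_minus_dropout, i.e. the probability of keeping
                # a unit (1 - dropout rate); it is set to 1.0 at evaluation time below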
                feed_dict = {
                    rnn.input_text: x_batch,
                    rnn.input_label: y_batch,
                    rnn.keep_prob: one_minus_dropout
                }
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, rnn.loss,
                    rnn.accuracy
                ], feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display
                if step % display_every == 0:
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, loss, accuracy))

                # Evaluation
                if step % evaluate_every == 0:
                    print("\nEvaluation:")
                    total_preds = np.zeros(y_val.shape)
                    itr2 = batch_iterator(x_val,
                                          y_val,
                                          batch_size,
                                          1,
                                          shuffle=False)
                    avg_acc = 0
                    avg_loss = 0
                    steps = 0
                    for x_eval_batch, y_eval_batch, s, e in itr2:
                        feed_dict_val = {
                            rnn.input_text: x_eval_batch,
                            rnn.input_label: y_eval_batch,
                            rnn.keep_prob: 1.0
                        }
                        summaries_val, loss, accuracy, preds = sess.run([
                            val_summary_op, rnn.loss, rnn.accuracy,
                            rnn.predictions
                        ], feed_dict_val)
                        val_summary_writer.add_summary(summaries_val, step)
                        k = np.array([
                            one_hot_encode(num_classes, label)
                            for label in preds
                        ])
                        avg_acc += accuracy
                        avg_loss += loss
                        steps += 1
                        total_preds[s:e] = k
                    cf, f_score = confusion_matrix(y_val, total_preds, 2)
                    avg_acc /= steps
                    avg_loss /= steps
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: loss {:g}, acc {:g}, fscore {:g}\n".format(
                        time_str, avg_loss, avg_acc, f_score))
                    print("Confusion Matrix")
                    print(cf)
                # Model checkpoint
                if step % checkpoint_every == 0:
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=step)
                    print("Saved model checkpoint to {}\n".format(path))
def evaluate(X,
             colname,
             batch_size,
             checkpoint_dir,
             labels=None,
             allow_soft_placement=True,
             log_device_placement=False):
    text_path = os.path.join(checkpoint_dir, "..", "text_vocab")
    text_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor.restore(
        text_path)
    X = [str(x) for x in X]
    x_eval = np.array(list(text_vocab_processor.transform(X)))
    if labels is not None:
        classes = len(labels[0])
        y_eval = np.argmax(labels, axis=1)
    else:
        y_eval = None
        classes = None

    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=allow_soft_placement,
            log_device_placement=log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_text = graph.get_operation_by_name("input_text").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name(
                "output/logits").outputs[0]
            # Generate batches for one epoch
            iterator = batch_iterator(x_eval,
                                      y_eval,
                                      batch_size,
                                      1,
                                      shuffle=False)

            # Collect the predictions here
            all_predictions = []
            for item in iterator:
                x = item[0]
                batch_predictions = sess.run(predictions, {
                    input_text: x,
                    dropout_keep_prob: 1.0
                })
                print(batch_predictions.shape)
                print(batch_predictions[0])
                all_predictions = np.concatenate(
                    [all_predictions, batch_predictions])

            all_predictions = [
                one_hot_encode(classes, int(pred)) for pred in all_predictions
            ]
            print("predictions\n", all_predictions)
            if labels is not None:
                c, f = confusion_matrix(labels, all_predictions, classes)
                print("fscore ", f)
                print("confusion_matrix:")
                print(c)
                return all_predictions, c, f
            return all_predictions
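
# one_hot_encode is another helper referenced above but not shown. Assuming it simply
# returns a one-hot vector of length num_classes with a 1 at position `label`, a
# minimal sketch could be:
def one_hot_encode_sketch(num_classes, label):
    vec = np.zeros(num_classes, dtype=np.float32)
    vec[int(label)] = 1.0
    return vec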