Example #1
    def test_and_compute_score(self, x_test_opcode, x_test_assembly, x_test_seq_len, y_test):
        with tf.Session(config=utils.get_default_config()) as sess:
            saver = tf.train.Saver(tf.global_variables())

            check_point = tf.train.get_checkpoint_state(self.checkpoint_path)
            if check_point and tf.train.checkpoint_exists(check_point.model_checkpoint_path):
                message = "Load model parameters from %s\n" % check_point.model_checkpoint_path
                utils.print_and_write_logging_file(self.logging_path, message, self.running_mode, self.datetime)
                saver.restore(sess, check_point.model_checkpoint_path)
            else:
                raise Exception('Saved model not found.')

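            # drop the tail so the test set size is an exact multiple of batch_size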
            testing_set = x_test_opcode.shape[0] - x_test_opcode.shape[0] % self.batch_size
            testing_batches = utils.make_batches(testing_set, self.batch_size)

            average_test_loss = 0.0
            full_y_pred = np.array([])
            for batch_idx, (batch_start, batch_end) in enumerate(testing_batches):
                batch_x_opcode = x_test_opcode[batch_start:batch_end]
                batch_x_assembly = x_test_assembly[batch_start:batch_end]

                batch_y = y_test[batch_start:batch_end]
                batch_sequence_length = x_test_seq_len[batch_start:batch_end]

                feed_dict = {
                    self.X_opcode: batch_x_opcode,
                    self.X_assembly: batch_x_assembly,
                    self.Y: batch_y,
                    self.sequence_length: batch_sequence_length,
                }

                # fetch the loss and the SVM predictions in a single run per batch
                batch_test_loss, batch_y_pred = sess.run(
                    [self.loss, self.y_pred_svm], feed_dict=feed_dict)
                full_y_pred = np.append(full_y_pred, batch_y_pred)

                average_test_loss += batch_test_loss / len(testing_batches)

            full_accuracy_score = mt.accuracy_score(y_true=y_test[:testing_set], y_pred=full_y_pred)
            full_pre_score = mt.precision_score(y_true=y_test[:testing_set], y_pred=full_y_pred)
            full_f1_score = mt.f1_score(y_true=y_test[:testing_set], y_pred=full_y_pred)
            full_recall_score = mt.recall_score(y_true=y_test[:testing_set], y_pred=full_y_pred)
            full_auc_score = mt.roc_auc_score(y_true=y_test[:testing_set], y_score=full_y_pred)

            message = "testing loss %.5f\n" % average_test_loss
            message += "accuracy %.2f\n" % (full_accuracy_score * 100)
            message += "compute score:\n"
            message += '\tprecision score %.5f\n' % (full_pre_score * 100)
            message += '\tf1 score %.5f\n' % (full_f1_score * 100)
            message += '\trecall score %.5f\n' % (full_recall_score * 100)
            message += '\tAUC score %.5f\n' % (full_auc_score * 100)
            message += "-----------------------------------------------------\n"
            message += "Finish computing score process.\n"
            utils.print_and_write_logging_file(self.logging_path, message, self.running_mode, self.datetime)
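All five examples lean on utils.make_batches to slice an index range into fixed-size batches. The helper itself is not shown in the source; judging by how the loops above unpack and count its result, a minimal stand-in would look like the sketch below (assumed behavior, not the original implementation):

def make_batches(size, batch_size):
    """Assumed behavior of utils.make_batches: a list of (start, end) index pairs.

    Callers trim `size` to a multiple of `batch_size` beforehand
    (e.g. testing_set = n - n % batch_size), so every slice is full-sized.
    """
    return [(start, start + batch_size) for start in range(0, size, batch_size)]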
Example #2
    def visualization(self, x_test_opcode, x_test_assembly, x_test_seq_len, y_test):
        with tf.Session(config=utils.get_default_config()) as sess:
            self.checkpoint_path = 'saved-model/full_dataset/good_result/2018-5-8-19-36-50'
            saver = tf.train.Saver(tf.global_variables())

            check_point = tf.train.get_checkpoint_state(self.checkpoint_path)
            if check_point and tf.train.checkpoint_exists(check_point.model_checkpoint_path):
                message = "Load model parameters from %s\n" % check_point.model_checkpoint_path
                utils.print_and_write_logging_file(self.logging_path, message, self.running_mode, self.datetime)
                saver.restore(sess, check_point.model_checkpoint_path)
            else:
                raise Exception('Saved model not found.')

            testing_set = x_test_opcode.shape[0] - x_test_opcode.shape[0] % self.batch_size
            batch_x_opcode = []
            batch_x_assembly = []
            batch_y = []
            batch_sequence_length = []

            for i in range(3498, testing_set):
                # keep positive-class samples (switch to y_test[i] == 0 to inspect the negative class)
                if y_test[i] == 1:
                    batch_x_opcode.append(x_test_opcode[i])
                    batch_x_assembly.append(x_test_assembly[i])
                    batch_sequence_length.append(x_test_seq_len[i])
                    batch_y.append(y_test[i])
                if len(batch_y) == self.batch_size:
                    break

            feed_dict = {
                self.X_opcode: batch_x_opcode,
                self.X_assembly: batch_x_assembly,
                self.Y: batch_y,
                self.sequence_length: batch_sequence_length,
            }

            image = sess.run(
                [self.cnn_input],
                feed_dict=feed_dict)

            layers = ["r", "p", "c"]
            path_log = os.path.join('visualize', '_epoch1000000_good_log_1_cap_tuong_tu')
            path_output = os.path.join('visualize', '_epoch1000000_good_output_1_cap_tuong_tu')

            # alternative visualizations from tf_cnnvis (disabled):
            # activation_visualization(sess_graph_path=sess, value_feed_dict=feed_dict,
            #                          layers=layers, path_logdir=path_log, path_outdir=path_output)
            # deconv_visualization(sess_graph_path=sess, value_feed_dict=feed_dict,
            #                      input_tensor=self.cnn_input, layers=layers,
            #                      path_logdir=path_log, path_outdir=path_output)
            # dumping the raw input as an image (disabled):
            # img_normalize = image_normalization(image[0][0])
            # imsave(os.path.join('visualize', '_epoch100_good_image_1_cap_tuong_tu.png'),
            #        np.reshape(img_normalize, [img_normalize.shape[0], img_normalize.shape[1]]))

            layer = 'cnn/relu3/Relu'
            deepdream_visualization(sess_graph_path=sess, input_tensor=self.two_dimension_image,
                                    value_feed_dict=feed_dict, layer=layer,
                                    classes=[1, 2, 3, 4, 5],
                                    path_logdir=path_log, path_outdir=path_output)

            print('Ok, I got it.')
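The visualization helpers used here, deepdream_visualization plus the disabled activation_visualization and deconv_visualization calls, come from the tf_cnnvis package. None of the snippets show their import block; the imports they appear to rely on are roughly the following (the exact layout is an assumption):

import os
import sys
import time

import numpy as np
import tensorflow as tf          # TF 1.x API: tf.Session, tf.train.Saver, ...
import sklearn.metrics as mt     # the `mt` alias is inferred from the calls above

# assumed for Example #2 only
from tf_cnnvis import activation_visualization, deconv_visualization, deepdream_visualization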
Example #3
    def test(self, checkpoint_path, x_test_opcode, x_test_assembly, x_test_seq_len, y_test):
        graph = tf.Graph()
        with graph.as_default():
            with tf.Session(config=utils.get_default_config()) as sess:
                check_point = tf.train.get_checkpoint_state(checkpoint_path)

                if check_point is None:
                    raise Exception('Saved model not found.')

                try:
                    saver = tf.train.import_meta_graph("{}.meta".format(check_point.model_checkpoint_path))
                    saver.restore(sess, check_point.model_checkpoint_path)
                except Exception:
                    # fail fast instead of continuing with an unrestored graph
                    raise Exception("Cannot restore the saved model from %s." % checkpoint_path)

                message = "Loaded model parameters from %s\n" % check_point.model_checkpoint_path
                utils.print_and_write_logging_file(self.logging_path, message, self.running_mode, self.datetime)

                # get the placeholders from the graph by name
                X_opcode = graph.get_operation_by_name("input/x_opcode_input").outputs[0]
                X_assembly = graph.get_operation_by_name("input/x_assemply_input").outputs[0]  # "assemply" matches the name stored in the saved graph
                Y = graph.get_operation_by_name("input/true_label").outputs[0]
                sequence_length = graph.get_operation_by_name("input/seq_length").outputs[0]

                # get tensor to visualize
                phi_x_tilde = graph.get_operation_by_name("oc-svm/phi_x_tilde").outputs[0]
                # get tensor for prediction
                w_phi_minus_rho2 = graph.get_operation_by_name("oc-svm/sub").outputs[0]

                test_set = x_test_opcode.shape[0] - x_test_opcode.shape[0] % self.batch_size
                test_batches = utils.make_batches(test_set, self.batch_size)

                full_phi_x_tilde = np.zeros((test_set, 2 * self.num_random_features))  # (test_set, 2 * n_random_features)
                test_full_pred = np.array([])  # grows to (test_set,): one decision value per example
                for batch_idx, (batch_start, batch_end) in enumerate(test_batches):
                    test_x_opcode = utils.convert_list_sparse_to_dense(x_test_opcode[batch_start:batch_end])
                    test_x_assembly = utils.convert_list_sparse_to_dense(x_test_assembly[batch_start:batch_end])
                    test_y = y_test[batch_start:batch_end]
                    test_seq_len = x_test_seq_len[batch_start:batch_end]

                    test_feed_dict = {
                        X_opcode: test_x_opcode,
                        X_assembly: test_x_assembly,
                        Y: test_y,
                        sequence_length: test_seq_len,
                    }

                    batch_phi_x_tilde, test_batch_pred = sess.run([phi_x_tilde, w_phi_minus_rho2],
                                                                feed_dict=test_feed_dict)
                    test_full_pred = np.append(test_full_pred, test_batch_pred)
                    full_phi_x_tilde[batch_start:batch_end] = batch_phi_x_tilde

                (test_min_cost_value, test_y_pred_0, test_y_pred_with_optimal_hyperplane,
                 test_y_pred_1, test_n_data_in_strip) = self.compute_cost_sensitive_loss_and_y_pred(
                    test_full_pred, y_test[:test_set])

                test_acc_0, test_pre_0, test_f1_0, test_rec_0, test_auc_0 = self.compute_score(
                    y_true=y_test[:test_set],
                    y_pred=test_y_pred_0)

                test_acc_opt, test_pre_opt, test_f1_opt, test_rec_opt, test_auc_opt = self.compute_score(
                    y_true=y_test[:test_set],
                    y_pred=test_y_pred_with_optimal_hyperplane)

                test_acc_1, test_pre_1, test_f1_1, test_rec_1, test_auc_1 = self.compute_score(
                    y_true=y_test[:test_set],
                    y_pred=test_y_pred_1)

                message = "[test] cost_sensitive_loss %.5f\n" % test_min_cost_value

                message += "[test] accuracy_0 %.2f\n" % (test_acc_0 * 100)
                message += "[test] precision_0 %.2f\n" % (test_pre_0 * 100)
                message += "[test] f1_0 %.2f\n" % (test_f1_0 * 100)
                message += "[test] recall_0 %.2f\n" % (test_rec_0 * 100)
                message += "[test] auc_0 %.2f\n" % (test_auc_0 * 100)

                message += "[test] accuracy_opt %.2f\n" % (test_acc_opt * 100)
                message += "[test] precision_opt %.2f\n" % (test_pre_opt * 100)
                message += "[test] f1_opt %.2f\n" % (test_f1_opt * 100)
                message += "[test] recall_opt %.2f\n" % (test_rec_opt * 100)
                message += "[test] auc_opt %.2f\n" % (test_auc_opt * 100)

                message += "[test] accuracy_1 %.2f\n" % (test_acc_1 * 100)
                message += "[test] precision_1 %.2f\n" % (test_pre_1 * 100)
                message += "[test] f1_1 %.2f\n" % (test_f1_1 * 100)
                message += "[test] recall_1 %.2f\n" % (test_rec_1 * 100)
                message += "[test] auc_1 %.2f\n" % (test_auc_1 * 100)

                message += "[test] n_data_in_strip %d\n" % test_n_data_in_strip
                message += "-----------------------------------------------------\n"
                utils.print_and_write_logging_file(self.logging_path, message, self.running_mode)
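Examples #3 and #5 delegate scoring to self.compute_score, which the source does not show. Given the five values its callers unpack (accuracy, precision, F1, recall, AUC, in that order) and the sklearn.metrics alias used elsewhere, a plausible sketch is:

    def compute_score(self, y_true, y_pred):
        """Plausible reconstruction: bundle the five scores the callers unpack."""
        acc = mt.accuracy_score(y_true=y_true, y_pred=y_pred)
        pre = mt.precision_score(y_true=y_true, y_pred=y_pred)
        f1 = mt.f1_score(y_true=y_true, y_pred=y_pred)
        rec = mt.recall_score(y_true=y_true, y_pred=y_pred)
        auc = mt.roc_auc_score(y_true=y_true, y_score=y_pred)
        return acc, pre, f1, rec, auc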
Example #4
    def train(self, x_train_opcode, x_train_assembly, x_train_seq_len, y_train,
              x_valid_opcode, x_valid_assembly, x_valid_seq_len, y_valid,
              x_test_opcode, x_test_assembly, x_test_seq_len, y_test):
        outFile = open(self.OutName, 'w')
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)

        with tf.Session(config=utils.get_default_config()) as sess:
            writer = tf.summary.FileWriter(self.graph_path, sess.graph)

            check_point = tf.train.get_checkpoint_state(self.checkpoint_path)
            if check_point and tf.train.checkpoint_exists(check_point.model_checkpoint_path):
                message = "Load model parameters from %s\n" % check_point.model_checkpoint_path
                utils.print_and_write_logging_file(self.logging_path, message, self.running_mode)
                saver.restore(sess, check_point.model_checkpoint_path)
            else:
                message = "Create the model with fresh parameters\n"
                utils.print_and_write_logging_file(self.logging_path, message, self.running_mode)
                sess.run(tf.global_variables_initializer())

            #### Separate the dataset by class label
            x_train_opcode_0 = []
            x_train_opcode_1 = []
            x_train_assembly_0 = []
            x_train_assembly_1 = []
            y_train_0 = []
            y_train_1 = []
            x_train_seq_len_0 = []
            x_train_seq_len_1 = []

            for index, aLabel in enumerate(y_train):
                if aLabel == 0.0:
                    x_train_opcode_0.append(x_train_opcode[index, :, :, :])
                    x_train_assembly_0.append(x_train_assembly[index, :, :, :])
                    y_train_0.append(y_train[index])
                    x_train_seq_len_0.append(x_train_seq_len[index])
                else:
                    x_train_opcode_1.append(x_train_opcode[index, :, :, :])
                    x_train_assembly_1.append(x_train_assembly[index, :, :, :])
                    y_train_1.append(y_train[index])
                    x_train_seq_len_1.append(x_train_seq_len[index])

            x_train_opcode_0 = np.array(x_train_opcode_0)
            x_train_opcode_1 = np.array(x_train_opcode_1)
            x_train_assembly_0 = np.array(x_train_assembly_0)
            x_train_assembly_1 = np.array(x_train_assembly_1)

            min_train_0_1 = min(x_train_opcode_0.shape[0], x_train_opcode_1.shape[0])
            training_set = min_train_0_1 - min_train_0_1 % (self.batch_size // 2)
            training_batches = utils.make_batches(training_set, (self.batch_size // 2))
            #### Separate dataset (end)

            step_loss = 0.0  # average loss per epoch
            step_time = 0.0
            full_train_accuracy_score = []
            full_train_pre_score = []
            full_train_f1_score = []
            full_train_recall_score = []
            full_train_auc_score = []
            initial_step = self.global_step.eval()
            for step in range(initial_step, initial_step + self.num_train_steps):

                loss_per_batch = 0.0
                start_time = time.time()
                full_y_predic_train = np.array([])
                full_y_target_train = np.array([])
                for batch_idx, (batch_start, batch_end) in enumerate(training_batches):

                    #### Separate batch: draw half from each class
                    batch_x_opcode_0 = x_train_opcode_0[batch_start:batch_end]
                    batch_x_assembly_0 = x_train_assembly_0[batch_start:batch_end]
                    batch_y_0 = y_train_0[batch_start:batch_end]
                    batch_sequence_length_0 = x_train_seq_len_0[batch_start:batch_end]

                    batch_x_opcode_1 = x_train_opcode_1[batch_start:batch_end]
                    batch_x_assembly_1 = x_train_assembly_1[batch_start:batch_end]
                    batch_y_1 = y_train_1[batch_start:batch_end]
                    batch_sequence_length_1 = x_train_seq_len_1[batch_start:batch_end]

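                    # stack the two half-batches: each training batch is half label-0, half label-1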
                    batch_x_opcode = np.concatenate((batch_x_opcode_0, batch_x_opcode_1), axis=0)
                    batch_x_assembly = np.concatenate((batch_x_assembly_0, batch_x_assembly_1), axis=0)
                    batch_y = batch_y_0 + batch_y_1
                    batch_sequence_length = batch_sequence_length_0 + batch_sequence_length_1
                    #### Separate batch (end)
                    full_y_target_train = np.append(full_y_target_train, batch_y)

                    feed_dict = {
                        self.X_opcode: batch_x_opcode,
                        self.X_assembly: batch_x_assembly,
                        self.Y: batch_y,
                        self.sequence_length: batch_sequence_length,
                    }

                    _, summary, batch_loss, batch_y_pred_train = sess.run(
                        [self.training_op, self.summary_op, self.loss, self.y_pred_svm],
                        feed_dict=feed_dict)
                    full_y_predic_train = np.append(full_y_predic_train, batch_y_pred_train)

                    if (batch_idx + 1) % max(1, len(training_batches) // 10) == 0:
                        writer.add_summary(summary, global_step=step)

                    loss_per_batch += batch_loss / len(training_batches)

                batch_train_accuracy_score = mt.accuracy_score(y_true=full_y_target_train, y_pred=full_y_predic_train)
                batch_train_pre_score = mt.precision_score(y_true=full_y_target_train, y_pred=full_y_predic_train)
                batch_train_f1_score = mt.f1_score(y_true=full_y_target_train, y_pred=full_y_predic_train)
                batch_train_recall_score = mt.recall_score(y_true=full_y_target_train, y_pred=full_y_predic_train)
                batch_train_auc_score = mt.roc_auc_score(y_true=full_y_target_train, y_score=full_y_predic_train)
                full_y_predic_train = np.array([])
                full_y_target_train = np.array([])
                full_train_accuracy_score.append(batch_train_accuracy_score)
                full_train_pre_score.append(batch_train_pre_score)
                full_train_f1_score.append(batch_train_f1_score)
                full_train_recall_score.append(batch_train_recall_score)
                full_train_auc_score.append(batch_train_auc_score)

                step_time += (time.time() - start_time)
                step_loss += loss_per_batch

                # if (step + 1) % 10 == 0:
                #     # Save checkpoint and zero timer and loss.
                #     checkpoint_path = os.path.join(self.checkpoint_path, "rnn_classifier_" + self.data_size + ".ckpt")
                #     saver.save(sess, checkpoint_path, global_step=step)

                if (step + 1) % self.display_step == 0:

                    # Train plot
                    ave_train_accuracy_score = np.mean(full_train_accuracy_score)
                    ave_train_pre_score = np.mean(full_train_pre_score)
                    ave_train_f1_score = np.mean(full_train_f1_score)
                    ave_train_recall_score = np.mean(full_train_recall_score)
                    ave_train_auc_score = np.mean(full_train_auc_score)

                    full_train_accuracy_score = []
                    full_train_pre_score = []
                    full_train_f1_score = []
                    full_train_recall_score = []
                    full_train_auc_score = []
                    message = "global step %d/%d step-time %.2fs average loss %.5f acc %.2f pre %.2f f1 %.2f rec %.2f auc %.2f\n" % (
                        step, self.num_train_steps - 1, step_time, step_loss, ave_train_accuracy_score, ave_train_pre_score, ave_train_f1_score, ave_train_recall_score, ave_train_auc_score)
                    utils.print_and_write_logging_file(self.logging_path, message, self.running_mode)

                    outFile.write("%.2f\n" %(ave_train_accuracy_score * 100))
                    outFile.write("%.2f\n" %(ave_train_pre_score * 100))
                    outFile.write("%.2f\n" %(ave_train_f1_score * 100))
                    outFile.write("%.2f\n" %(ave_train_recall_score * 100))
                    outFile.write("%.2f\n" %(ave_train_auc_score * 100))
                    #Train plot

                    # Dev plot
                    step_time, step_loss = 0.0, 0.0

                    dev_set = x_valid_opcode.shape[0] - x_valid_opcode.shape[0] % self.batch_size
                    dev_batches = utils.make_batches(dev_set, self.batch_size)

                    average_dev_loss = 0.0
                    full_y_pred_svm = np.array([])
                    for batch_idx, (batch_start, batch_end) in enumerate(dev_batches):

                        valid_x_opcode = x_valid_opcode[batch_start:batch_end]
                        valid_x_assembly = x_valid_assembly[batch_start:batch_end]

                        valid_y = y_valid[batch_start:batch_end]
                        valid_seq_len = x_valid_seq_len[batch_start:batch_end]

                        feed_dict = {
                            self.X_opcode: valid_x_opcode,
                            self.X_assembly: valid_x_assembly,
                            self.Y: valid_y,
                            self.sequence_length: valid_seq_len,
                        }

                        batch_dev_loss, batch_y_pred = sess.run([self.loss, self.y_pred_svm], feed_dict=feed_dict)
                        full_y_pred_svm = np.append(full_y_pred_svm, batch_y_pred)

                        average_dev_loss += batch_dev_loss / len(dev_batches)
                    message = "eval: accuracy_svm %.2f\n" % (
                                mt.accuracy_score(y_true=y_valid[:dev_set], y_pred=full_y_pred_svm) * 100)
                    message += "eval: precision_svm %.2f\n" % (
                                mt.precision_score(y_true=y_valid[:dev_set], y_pred=full_y_pred_svm) * 100)
                    message += "eval: f1_svm %.2f\n" % (
                                mt.f1_score(y_true=y_valid[:dev_set], y_pred=full_y_pred_svm) * 100)
                    message += "eval: recall_svm %.2f\n" % (
                                mt.recall_score(y_true=y_valid[:dev_set], y_pred=full_y_pred_svm) * 100)
                    message += "eval: roc_auc_svm %.2f\n" % (
                                mt.roc_auc_score(y_true=y_valid[:dev_set], y_score=full_y_pred_svm) * 100)
                    message += "-----------------------------------------------------\n"
                    outFile.write("%.2f\n" %(mt.accuracy_score(y_true=y_valid[:dev_set], y_pred=full_y_pred_svm) * 100))
                    outFile.write("%.2f\n" %(mt.precision_score(y_true=y_valid[:dev_set], y_pred=full_y_pred_svm) * 100))
                    outFile.write("%.2f\n" %(mt.f1_score(y_true=y_valid[:dev_set], y_pred=full_y_pred_svm) * 100))
                    outFile.write("%.2f\n" %(mt.recall_score(y_true=y_valid[:dev_set], y_pred=full_y_pred_svm) * 100))
                    outFile.write("%.2f\n" %(mt.roc_auc_score(y_true=y_valid[:dev_set], y_score=full_y_pred_svm) * 100))
                    utils.print_and_write_logging_file(self.logging_path, message, self.running_mode)
                    #Dev plot

                    # Test plot

                    test_set = x_test_opcode.shape[0] - x_test_opcode.shape[0] % self.batch_size
                    test_batches = utils.make_batches(test_set, self.batch_size)

                    average_test_loss = 0.0
                    full_y_pred_svm_test = np.array([])
                    for batch_idx, (batch_start, batch_end) in enumerate(test_batches):

                        test_x_opcode = x_test_opcode[batch_start:batch_end]
                        test_x_assembly = x_test_assembly[batch_start:batch_end]

                        test_y = y_test[batch_start:batch_end]
                        test_seq_len = x_test_seq_len[batch_start:batch_end]

                        feed_dict = {
                            self.X_opcode: test_x_opcode,
                            self.X_assembly: test_x_assembly,
                            self.Y: test_y,
                            self.sequence_length: test_seq_len,
                        }

                        batch_test_loss, batch_y_pred_test = sess.run([self.loss, self.y_pred_svm], feed_dict=feed_dict)
                        full_y_pred_svm_test = np.append(full_y_pred_svm_test, batch_y_pred_test)

                        average_test_loss += batch_test_loss / len(test_batches)

                    message = "test: accuracy_svm %.2f\n" % (
                                mt.accuracy_score(y_true=y_test[:test_set], y_pred=full_y_pred_svm_test) * 100)
                    message += "test: precision_svm %.2f\n" % (
                                mt.precision_score(y_true=y_test[:test_set], y_pred=full_y_pred_svm_test) * 100)
                    message += "test: f1_svm %.2f\n" % (
                                mt.f1_score(y_true=y_test[:test_set], y_pred=full_y_pred_svm_test) * 100)
                    message += "test: recall_svm %.2f\n" % (
                                mt.recall_score(y_true=y_test[:test_set], y_pred=full_y_pred_svm_test) * 100)
                    message += "test: roc_auc_svm %.2f\n" % (
                                mt.roc_auc_score(y_true=y_test[:test_set], y_score=full_y_pred_svm_test) * 100)
                    message += "-----------------------------------------------------\n"
                    outFile.write("%.2f\n" %(mt.accuracy_score(y_true=y_test[:test_set], y_pred=full_y_pred_svm_test) * 100))
                    outFile.write("%.2f\n" %(mt.precision_score(y_true=y_test[:test_set], y_pred=full_y_pred_svm_test) * 100))
                    outFile.write("%.2f\n" %(mt.f1_score(y_true=y_test[:test_set], y_pred=full_y_pred_svm_test) * 100))
                    outFile.write("%.2f\n" %(mt.recall_score(y_true=y_test[:test_set], y_pred=full_y_pred_svm_test) * 100))
                    outFile.write("%.2f\n" %(mt.roc_auc_score(y_true=y_test[:test_set], y_score=full_y_pred_svm_test) * 100))
                    utils.print_and_write_logging_file(self.logging_path, message, self.running_mode)
                    #Test plot                    
            writer.close()
        message = "Finish training process.\n"
        utils.print_and_write_logging_file(self.logging_path, message, self.running_mode)
        outFile.close()
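The distinctive step in Example #4 is the class-balanced batching: the training data is split by label, batch_size // 2 items are drawn from each half, and the halves are concatenated so every batch stays roughly 50/50 even on imbalanced data. A standalone sketch of the same idea (names are illustrative, not from the source):

import numpy as np

def balanced_batches(x, y, batch_size):
    """Illustrative sketch of Example #4's balanced batching."""
    idx_0 = np.flatnonzero(y == 0)
    idx_1 = np.flatnonzero(y == 1)
    half = batch_size // 2
    # truncate to the smaller class so every batch is full, as the example does
    n = min(len(idx_0), len(idx_1))
    n -= n % half
    for start in range(0, n, half):
        picked = np.concatenate((idx_0[start:start + half], idx_1[start:start + half]))
        yield x[picked], y[picked]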
Example #5
    def train(self, x_train_opcode, x_train_assembly, x_train_seq_len, y_train,
              x_valid_opcode, x_valid_assembly, x_valid_seq_len, y_valid):

        saver = tf.train.Saver(tf.global_variables())
        with tf.Session(config=utils.get_default_config()) as sess:
            writer = tf.summary.FileWriter(self.graph_path, sess.graph)

            check_point = tf.train.get_checkpoint_state(self.checkpoint_path)
            if check_point and tf.train.checkpoint_exists(check_point.model_checkpoint_path):
                message = "Load model parameters from %s\n" % check_point.model_checkpoint_path
                utils.print_and_write_logging_file(self.logging_path, message, self.running_mode)
                saver.restore(sess, check_point.model_checkpoint_path)
            else:
                message = "Create the model with fresh parameters\n"
                utils.print_and_write_logging_file(self.logging_path, message, self.running_mode)
                sess.run(tf.global_variables_initializer())

            training_set = x_train_opcode.shape[0] - x_train_opcode.shape[0] % self.batch_size
            training_batches = utils.make_batches(training_set, self.batch_size)

            step_loss = 0.0  # average loss per epoch
            step_time = 0.0
            initial_step = self.global_step.eval()

            for step in range(initial_step, initial_step + self.num_train_steps):
                loss_per_batch = 0.0
                start_time = time.time()
                step_predict_train = np.array([])

                for batch_idx, (batch_start, batch_end) in enumerate(training_batches):
                    batch_x_opcode = utils.convert_list_sparse_to_dense(x_train_opcode[batch_start:batch_end])
                    batch_x_assembly = utils.convert_list_sparse_to_dense(x_train_assembly[batch_start:batch_end])
                    batch_sequence_length = x_train_seq_len[batch_start:batch_end]
                    batch_y = y_train[batch_start:batch_end]

                    train_feed_dict = {
                        self.X_opcode: batch_x_opcode,
                        self.X_assembly: batch_x_assembly,
                        self.Y: batch_y,
                        self.sequence_length: batch_sequence_length,
                    }
                    _, summary, batch_loss, train_batch_y_pred = sess.run(
                        [self.training_op, self.summary_op, self.loss, self.w_phi_minus_rho2],
                        feed_dict=train_feed_dict)

                    if (batch_idx + 1) % max(1, len(training_batches) // 10) == 0:
                        writer.add_summary(summary, global_step=step)

                    loss_per_batch += batch_loss / len(training_batches)
                    step_predict_train = np.append(step_predict_train, train_batch_y_pred)

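                    # in-place progress indicator: "\r" rewrites the same console line each batch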
                    sys.stdout.write("\rProcessed %.2f%% of mini-batches" % (((batch_idx + 1) / len(training_batches)) * 100))
                    sys.stdout.flush()

                # 'step_predict_train' now holds the full set of (w * phi_tilde(x) - rho2)
                # decision values accumulated over the training loop above

                step_time += (time.time() - start_time) / self.display_step
                step_loss += loss_per_batch / self.display_step

                if (step + 1) % 10 == 0:
                    # save checkpoint
                    checkpoint_path = os.path.join(self.checkpoint_path, "rnn_classifier_" + self.data_size + ".ckpt")
                    saver.save(sess, checkpoint_path, global_step=step)

                if (step + 1) % self.display_step == 0:
                    print("\n")
                    message = "global step %d/%d step-time %.2fs average total loss %.5f\n" % (
                        step, self.num_train_steps - 1, step_time, step_loss)
                    utils.print_and_write_logging_file(self.logging_path, message, self.running_mode)

                    # run evaluation and print the total loss
                    dev_set = x_valid_opcode.shape[0] - x_valid_opcode.shape[0] % self.batch_size
                    dev_batches = utils.make_batches(dev_set, self.batch_size)

                    average_dev_loss = 0.0
                    valid_full_pred = np.array([])
                    for batch_idx, (batch_start, batch_end) in enumerate(dev_batches):
                        valid_x_opcode = utils.convert_list_sparse_to_dense(x_valid_opcode[batch_start:batch_end])
                        valid_x_assembly = utils.convert_list_sparse_to_dense(x_valid_assembly[batch_start:batch_end])
                        valid_y = y_valid[batch_start:batch_end]
                        valid_seq_len = x_valid_seq_len[batch_start:batch_end]

                        valid_feed_dict = {
                            self.X_opcode: valid_x_opcode,
                            self.X_assembly: valid_x_assembly,
                            self.Y: valid_y,
                            self.sequence_length: valid_seq_len,
                        }
                        batch_dev_loss, valid_batch_pred = sess.run([self.loss, self.w_phi_minus_rho2],
                                                                      feed_dict=valid_feed_dict)
                        valid_full_pred = np.append(valid_full_pred, valid_batch_pred)
                        average_dev_loss += batch_dev_loss / len(dev_batches)

                    # 'valid_full_pred' now holds the full (w * phi_tilde(x) - rho2)
                    # decision values for the dev set
                    pred_train_and_valid_set = np.concatenate((step_predict_train, valid_full_pred), axis=0)
                    y_true_train_valid = np.concatenate((y_train[:training_set], y_valid[:dev_set]), axis=0)

                    (train_val_min_cost_value, train_val_y_pred_0,
                     train_val_y_pred_with_optimal_hyperplane, train_val_y_pred_1,
                     train_val_n_data_in_strip) = self.compute_cost_sensitive_loss_and_y_pred(
                        pred_train_and_valid_set, y_true_train_valid)

                    train_y_pred_0 = train_val_y_pred_0[:training_set]
                    valid_y_pred_0 = train_val_y_pred_0[training_set:]
                    step_train_acc_0, step_train_pre_0, step_train_f1_0, step_train_rec_0, step_train_auc_0 = self.compute_score(
                        y_true=y_train[:training_set], y_pred=train_y_pred_0)

                    train_y_pred_with_opt_hyperplane = train_val_y_pred_with_optimal_hyperplane[:training_set]
                    valid_y_pred_with_opt_hyperplane = train_val_y_pred_with_optimal_hyperplane[training_set:]
                    step_train_acc_opt, step_train_pre_opt, step_train_f1_opt, step_train_rec_opt, step_train_auc_opt = self.compute_score(
                        y_true=y_train[:training_set], y_pred=train_y_pred_with_opt_hyperplane)

                    train_y_pred_1 = train_val_y_pred_1[:training_set]
                    valid_y_pred_1 = train_val_y_pred_1[training_set:]
                    step_train_acc_1, step_train_pre_1, step_train_f1_1, step_train_rec_1, step_train_auc_1 = self.compute_score(
                        y_true=y_train[:training_set], y_pred=train_y_pred_1)

                    message = "[train] total_loss %.5f\n" % step_loss
                    message += "[train] cost_sensitive_loss %.5f\n" % train_val_min_cost_value

                    message += "[train] accuracy_0 %.2f\n" % (step_train_acc_0 * 100)
                    message += "[train] precision_0 %.2f\n" % (step_train_pre_0 * 100)
                    message += "[train] f1_0 %.2f\n" % (step_train_f1_0 * 100)
                    message += "[train] recall_0 %.2f\n" % (step_train_rec_0 * 100)
                    message += "[train] auc_0 %.2f\n" % (step_train_auc_0 * 100)

                    message += "[train] accuracy_opt %.2f\n" % (step_train_acc_opt * 100)
                    message += "[train] precision_opt %.2f\n" % (step_train_pre_opt * 100)
                    message += "[train] f1_opt %.2f\n" % (step_train_f1_opt * 100)
                    message += "[train] recall_opt %.2f\n" % (step_train_rec_opt * 100)
                    message += "[train] auc_opt %.2f\n" % (step_train_auc_opt * 100)

                    message += "[train] accuracy_1 %.2f\n" % (step_train_acc_1 * 100)
                    message += "[train] precision_1 %.2f\n" % (step_train_pre_1 * 100)
                    message += "[train] f1_1 %.2f\n" % (step_train_f1_1 * 100)
                    message += "[train] recall_1 %.2f\n" % (step_train_rec_1 * 100)
                    message += "[train] auc_1 %.2f\n" % (step_train_auc_1 * 100)

                    message += "[train] n_data_in_strip %d\n" % train_val_n_data_in_strip
                    utils.print_and_write_logging_file(self.logging_path, message, self.running_mode, show_message=False)

                    step_val_acc_0, step_val_pre_0, step_val_f1_0, step_val_rec_0, step_val_auc_0 = self.compute_score(
                        y_true=y_valid[:dev_set], y_pred=valid_y_pred_0)
                    step_val_acc_opt, step_val_pre_opt, step_val_f1_opt, step_val_rec_opt, step_val_auc_opt = self.compute_score(
                        y_true=y_valid[:dev_set], y_pred=valid_y_pred_with_opt_hyperplane)
                    step_val_acc_1, step_val_pre_1, step_val_f1_1, step_val_rec_1, step_val_auc_1 = self.compute_score(
                        y_true=y_valid[:dev_set], y_pred=valid_y_pred_1)

                    message = "[eval] total_loss %.5f\n" % average_dev_loss
                    message += "[eval] cost_sensitive_loss %.5f\n" % train_val_min_cost_value

                    message += "[eval] accuracy_0 %.2f\n" % (step_val_acc_0 * 100)
                    message += "[eval] precision_0 %.2f\n" % (step_val_pre_0 * 100)
                    message += "[eval] f1_0 %.2f\n" % (step_val_f1_0 * 100)
                    message += "[eval] recall_0 %.2f\n" % (step_val_rec_0 * 100)
                    message += "[eval] auc_0 %.2f\n" % (step_val_auc_0 * 100)

                    message += "[eval] accuracy_opt %.2f\n" % (step_val_acc_opt * 100)
                    message += "[eval] precision_opt %.2f\n" % (step_val_pre_opt * 100)
                    message += "[eval] f1_opt %.2f\n" % (step_val_f1_opt * 100)
                    message += "[eval] recall_opt %.2f\n" % (step_val_rec_opt * 100)
                    message += "[eval] auc_opt %.2f\n" % (step_val_auc_opt * 100)

                    message += "[eval] accuracy_1 %.2f\n" % (step_val_acc_1 * 100)
                    message += "[eval] precision_1 %.2f\n" % (step_val_pre_1 * 100)
                    message += "[eval] f1_1 %.2f\n" % (step_val_f1_1 * 100)
                    message += "[eval] recall_1 %.2f\n" % (step_val_rec_1 * 100)
                    message += "[eval] auc_1 %.2f\n" % (step_val_auc_1 * 100)

                    message += "[eval] n_data_in_strip %d\n" % train_val_n_data_in_strip
                    message += "-----------------------------------------------------\n"
                    utils.print_and_write_logging_file(self.logging_path, message, self.running_mode)

                    step_time, step_loss = 0.0, 0.0  # reset the step-time and loss accumulators for the next interval

            writer.close()
        message = "Finish training process.\n"
        utils.print_and_write_logging_file(self.logging_path, message, self.running_mode)
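compute_cost_sensitive_loss_and_y_pred is never shown. From its inputs (raw decision values w * phi_tilde(x) - rho2 plus true labels) and its outputs (a minimum cost, predictions for a default and a shifted hyperplane, predictions for an optimal offset, and a count of points in the strip between hyperplanes), one plausible reading is a cost-sensitive threshold search over the decision values. The sketch below only illustrates that idea; it is not the source's implementation, and the cost weights and sign convention are assumptions:

import numpy as np

def cost_sensitive_threshold(decision_values, y_true, fn_cost=1.0, fp_cost=1.0):
    """Illustrative only: choose the threshold on the decision values that
    minimizes a cost-sensitive loss, alongside the default threshold at 0."""
    best_cost, best_t = np.inf, 0.0
    for t in np.sort(decision_values):  # candidate thresholds
        y_pred = (decision_values > t).astype(float)
        cost = (fn_cost * np.sum((y_true == 1) & (y_pred == 0)) +
                fp_cost * np.sum((y_true == 0) & (y_pred == 1)))
        if cost < best_cost:
            best_cost, best_t = cost, t
    y_pred_default = (decision_values > 0).astype(float)
    y_pred_optimal = (decision_values > best_t).astype(float)
    return best_cost, y_pred_default, y_pred_optimal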