Example #1
    def load_proced_dir(self, csv_file):
        authors, file_ids, label_matrix = DataHelper.load_csv(
            csv_file_path=csv_file)
        self.num_of_classes = label_matrix.shape[1]

        logging.info("LABEL MATRIX HAS SHAPE: " + str(label_matrix.shape))

        data = AAData(name="ML", size=len(file_ids))
        data.file_id = file_ids

        origin_list = [None] * data.size
        doc_size = [None] * data.size

        # Walk the author folders and slot each file's content into its
        # CSV-ordered position (training_data_dir is assumed to end with a
        # path separator, matching its use below).
        folder_list = os.listdir(self.training_data_dir)
        for author in folder_list:
            f = self.training_data_dir + author
            if os.path.isdir(f):
                sub_file_list = os.listdir(f)
                for file_name in sub_file_list:
                    if file_name in data.file_id:
                        index = data.file_id.index(file_name)
                        file_content = DataHelperML.load_proced_file(
                            data_dir=self.training_data_dir,
                            author_code=author,
                            file_name=file_name)
                        origin_list[index] = file_content
                        doc_size[index] = len(file_content)

        doc_size = np.array(doc_size)

        data.raw = origin_list
        data.label_doc = label_matrix
        data.doc_size = doc_size

        return data
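
For context, DataHelper.load_csv is not shown in this snippet. A minimal,
self-contained sketch of the kind of (authors, file_ids, label_matrix) triple
load_proced_dir expects it to return might look like the following; the
two-column CSV layout and the helper name are assumptions for illustration,
not the project's actual loader.

import csv
import numpy as np

def load_csv_sketch(csv_file_path):
    # Hypothetical layout: one "file_id,author" pair per row. Builds a
    # one-hot label matrix with one row per file and one column per author,
    # matching the (authors, file_ids, label_matrix) triple used above.
    with open(csv_file_path, "r") as f:
        rows = [r for r in csv.reader(f) if r]
    file_ids = [r[0] for r in rows]
    authors = sorted({r[1] for r in rows})
    label_matrix = np.zeros((len(rows), len(authors)), dtype=int)
    for i, r in enumerate(rows):
        label_matrix[i, authors.index(r[1])] = 1
    return authors, file_ids, label_matrix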
Example #2
 def load_raw_file(data_dir, author_name, file_name):
     if not os.path.exists(os.path.dirname(data_dir + author_name + "/")):
         logging.error("error: " + author_name + " does not exit")
         return
     file_path = data_dir + author_name + "/txt/txt-preprocessed/" + file_name
     with open(file_path, "r") as f:  # close the file handle when done
         file_content = f.readlines()
     content = []
     paragraph = []
     for line in file_content:
         line = line.strip()
         if len(line) == 0 and len(
                 paragraph) > 0:  # end of paragraph, split and push
             paragraph = " ".join(paragraph)
             content.extend(DataHelper.split_sentence(paragraph))
             paragraph = []
         elif len(line.split()) <= 2:  # too short
             pass
         else:  # keep adding to paragraph
             paragraph.append(line)
     # Flush the final paragraph in case the file does not end with a
     # blank line; otherwise its sentences would be silently dropped.
     if len(paragraph) > 0:
         paragraph = " ".join(paragraph)
         content.extend(DataHelper.split_sentence(paragraph))
     return content
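
DataHelper.split_sentence is also not defined in this snippet. A naive
stand-in that breaks a joined paragraph on sentence-ending punctuation could
look like this; it is a sketch for illustration, not the project's
implementation.

import re

def split_sentence_sketch(paragraph):
    # Split on ., !, or ? followed by whitespace, keeping the terminating
    # punctuation attached to each sentence.
    parts = re.split(r"(?<=[.!?])\s+", paragraph.strip())
    return [p for p in parts if p]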
Example #3
    def evaluate(self,
                 experiment_dir,
                 checkpoint_step,
                 doc_acc=False,
                 do_is_training=True):
        if checkpoint_step is not None:
            checkpoint_file = experiment_dir + "/checkpoints/" + "model-" + str(
                checkpoint_step)
        else:
            checkpoint_file = tf.train.latest_checkpoint(experiment_dir +
                                                         "/checkpoints/",
                                                         latest_filename=None)
        file_name = os.path.basename(checkpoint_file)
        self.eval_log = open(os.path.join(experiment_dir,
                                          file_name + "_eval.log"),
                             mode="w+")
        console = logging.StreamHandler()  # also echo log records to the console
        logging.getLogger('').addHandler(console)

        self.eval_log.write("Evaluating: " + __file__ + "\n")
        self.eval_log.write("Test for prob: " + self.dater.problem_name + "\n")
        self.eval_log.write(checkpoint_file + "\n")
        self.eval_log.write(AM.get_time() + "\n")
        self.eval_log.write("Total number of test examples: {}\n".format(
            len(self.test_data.label_instance)))

        graph = tf.Graph()
        with graph.as_default():
            session_conf = tf.ConfigProto(allow_soft_placement=True,
                                          log_device_placement=False)
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph(
                    "{}.meta".format(checkpoint_file))
                saver.restore(sess, checkpoint_file)

                # Get the placeholders from the graph by name
                input_x = graph.get_operation_by_name("input_x").outputs[0]
                input_y = graph.get_operation_by_name("input_y").outputs[0]
                dropout_keep_prob = graph.get_operation_by_name(
                    "dropout_keep_prob").outputs[0]
                if do_is_training:
                    is_training = graph.get_operation_by_name(
                        "is_training").outputs[0]
                else:
                    is_training = None

                # Tensors we want to evaluate
                scores = graph.get_operation_by_name(
                    "output/scores").outputs[0]
                predictions = graph.get_operation_by_name(
                    "output/predictions").outputs[0]

                # Generate batches for one epoch
                x_batches = DataHelper.batch_iter(self.test_data.value,
                                                  64,
                                                  1,
                                                  shuffle=False)
                y_batches = DataHelper.batch_iter(
                    self.test_data.label_instance, 64, 1, shuffle=False)

                # Collect the predictions here
                all_score = None
                pred = None
                for [x_test_batch, y_test_batch] in zip(x_batches, y_batches):
                    if do_is_training:
                        batch_scores, batch_pred_max = sess.run(
                            [scores, predictions], {
                                input_x: x_test_batch,
                                dropout_keep_prob: 1.0,
                                is_training: 0
                            })
                    else:
                        batch_scores, batch_pred_max = sess.run(
                            [scores, predictions], {
                                input_x: x_test_batch,
                                dropout_keep_prob: 1.0
                            })

                    # Normalize logits to probabilities (note: this builds a
                    # new softmax op on the graph for every batch).
                    batch_scores = tf.nn.softmax(batch_scores).eval()

                    if all_score is None:
                        all_score = batch_scores
                        pred = batch_pred_max
                    else:
                        all_score = np.concatenate([all_score, batch_scores],
                                                   axis=0)
                        pred = np.concatenate([pred, batch_pred_max], axis=0)

        mi_prec = precision_score(y_true=self.y_test_scalar,
                                  y_pred=pred,
                                  average="micro")
        self.eval_log.write("micro prec:\t" + str(mi_prec) + "\n")

        mi_recall = recall_score(y_true=self.y_test_scalar,
                                 y_pred=pred,
                                 average="micro")
        self.eval_log.write("micro recall:\t" + str(mi_recall) + "\n")

        mi_f1 = f1_score(y_true=self.y_test_scalar,
                         y_pred=pred,
                         average="micro")
        self.eval_log.write("micro f1:\t" + str(mi_f1) + "\n")

        ma_prec = precision_score(y_true=self.y_test_scalar,
                                  y_pred=pred,
                                  average='macro')
        self.eval_log.write("macro prec:\t" + str(ma_prec) + "\n")

        ma_recall = recall_score(y_true=self.y_test_scalar,
                                 y_pred=pred,
                                 average='macro')
        self.eval_log.write("macro recall:\t" + str(ma_recall) + "\n")

        ma_f1 = f1_score(y_true=self.y_test_scalar,
                         y_pred=pred,
                         average='macro')
        self.eval_log.write("macro f1:\t" + str(ma_f1) + "\n")

        # Note: jaccard_similarity_score was renamed to jaccard_score in
        # newer scikit-learn releases.
        jaccard = jaccard_similarity_score(y_true=self.y_test_scalar,
                                           y_pred=pred)
        self.eval_log.write("jaccard:\t" + str(jaccard) + "\n")

        hamming = hamming_loss(y_true=self.y_test_scalar, y_pred=pred)
        self.eval_log.write("hamming:\t" + str(hamming) + "\n")

        acc = accuracy_score(y_true=self.y_test_scalar, y_pred=pred)
        self.eval_log.write("acc:\t" + str(acc) + "\n")

        self.eval_log.write("\n")
        self.eval_log.write("\n")

        self.print_a_csv(exp_dir=experiment_dir,
                         file_name=file_name,
                         method_name="NORM",
                         prob=all_score,
                         pred=pred,
                         true=self.y_test_scalar)
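
DataHelper.batch_iter is used here (and in the following examples) but not
defined in this snippet. A minimal sketch consistent with the
(data, batch_size, num_epochs, shuffle) call sites above, assuming it simply
yields array slices, is:

import numpy as np

def batch_iter_sketch(data, batch_size, num_epochs, shuffle=True):
    # Yield successive batch_size slices of data for num_epochs passes,
    # optionally reshuffling the order at the start of each epoch.
    data = np.asarray(data)
    n = len(data)
    num_batches = (n + batch_size - 1) // batch_size
    for _ in range(num_epochs):
        order = np.random.permutation(n) if shuffle else np.arange(n)
        for b in range(num_batches):
            yield data[order[b * batch_size:(b + 1) * batch_size]]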
Example #4
    def evaluate(self,
                 experiment_dir,
                 checkpoint_step,
                 doc_acc=True,
                 do_is_training=True):
        if checkpoint_step is not None:
            checkpoint_file = experiment_dir + "/checkpoints/" + "model-" + str(
                checkpoint_step)
        else:
            checkpoint_file = tf.train.latest_checkpoint(experiment_dir +
                                                         "/checkpoints/",
                                                         latest_filename=None)
        file_name = os.path.basename(checkpoint_file)
        self.eval_log = open(os.path.join(experiment_dir,
                                          file_name + "_eval.log"),
                             mode="w+")

        logging.info("Evaluating: " + __file__)
        self.eval_log.write("Evaluating: " + __file__ + "\n")
        logging.info("Test for prob: " + self.dater.problem_name)
        self.eval_log.write("Test for prob: " + self.dater.problem_name + "\n")
        logging.info(checkpoint_file)
        self.eval_log.write(checkpoint_file + "\n")
        logging.info(AM.get_time())
        self.eval_log.write(AM.get_time() + "\n")
        logging.info("Total number of test examples: {}".format(
            len(self.test_data.label_instance)))
        self.eval_log.write("Total number of test examples: {}\n".format(
            len(self.test_data.label_instance)))

        graph = tf.Graph()
        with graph.as_default():
            session_conf = tf.ConfigProto(allow_soft_placement=True,
                                          log_device_placement=False)
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph(
                    "{}.meta".format(checkpoint_file))
                saver.restore(sess, checkpoint_file)

                # Get the placeholders from the graph by name
                input_x = graph.get_operation_by_name("input_x").outputs[0]
                input_y = graph.get_operation_by_name("input_y").outputs[0]
                dropout_keep_prob = graph.get_operation_by_name(
                    "dropout_keep_prob").outputs[0]
                if do_is_training:
                    is_training = graph.get_operation_by_name(
                        "is_training").outputs[0]
                else:
                    is_training = None

                # Tensors we want to evaluate
                scores = graph.get_operation_by_name(
                    "output/scores").outputs[0]
                predictions_sigmoid = graph.get_operation_by_name(
                    "output/predictions_sigmoid").outputs[0]
                predictions_max = graph.get_operation_by_name(
                    "output/predictions_max").outputs[0]

                # Generate batches for one epoch
                x_batches = DataHelper.batch_iter(self.test_data.value,
                                                  64,
                                                  1,
                                                  shuffle=False)
                y_batches = DataHelper.batch_iter(
                    self.test_data.label_instance, 64, 1, shuffle=False)

                # Collect the predictions here
                all_score = None
                pred_sigmoid_value = None
                pred_max_bool = None
                pred_sigmoid_bool = None
                for [x_test_batch, y_test_batch] in zip(x_batches, y_batches):
                    if do_is_training:
                        batch_scores, batch_pred_sigmoid, batch_pred_max_index = sess.run(
                            [scores, predictions_sigmoid, predictions_max], {
                                input_x: x_test_batch,
                                dropout_keep_prob: 1.0,
                                is_training: 0
                            })
                    else:
                        batch_scores, batch_pred_sigmoid, batch_pred_max_index = sess.run(
                            [scores, predictions_sigmoid, predictions_max], {
                                input_x: x_test_batch,
                                dropout_keep_prob: 1.0
                            })

                    batch_pred_max_bool = tf.one_hot(
                        indices=batch_pred_max_index,
                        depth=self.dater.num_of_classes).eval(
                        ) == 1  # TODO temp

                    if all_score is None:
                        all_score = batch_scores
                        pred_max_bool = batch_pred_max_bool
                        pred_sigmoid_bool = batch_pred_sigmoid > 0.5
                        pred_sigmoid_value = batch_pred_sigmoid
                    else:
                        all_score = np.concatenate([all_score, batch_scores],
                                                   axis=0)
                        pred_max_bool = np.concatenate(
                            [pred_max_bool, batch_pred_max_bool], axis=0)
                        pred_sigmoid_bool = np.concatenate(
                            [pred_sigmoid_bool, batch_pred_sigmoid > 0.5],
                            axis=0)
                        pred_sigmoid_value = np.concatenate(
                            [pred_sigmoid_value, batch_pred_sigmoid], axis=0)

            # logging.info("== PRED MAX ==")
            # self.eval_log.write("== PRED MAX ==")
            # self.sent_accuracy(pred_max_bool)
            logging.info("== PRED SIGMOID ==")
            self.eval_log.write("== PRED SIGMOID ==")
            self.sent_accuracy(pred_sigmoid_bool)

            if doc_acc:
                # print("========== WITH MAX ==========")
                # self.doc_accuracy(pred_max)
                # print("========== WITH SIGMOID ==========")
                self.eval_log.write("========== WITH VOTE ==========\n\n")
                self.doc_accuracy(pred_sigmoid_bool)

                self.eval_log.write(
                    "========== WITH SIGMOID CUMU ==========\n\n")
                self.doc_accuracy_sigmoid_cumulation(pred_sigmoid_value)

            self.eval_log.write("\n")
            self.eval_log.write("\n")
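
The tf.one_hot(...).eval() == 1 conversion above adds a new op to the graph
on every batch. An equivalent numpy helper, assuming batch_pred_max_index is
a 1-D array of class indices, does the same conversion without touching the
TF graph; this is a sketch, not part of the original code.

import numpy as np

def one_hot_bool(indices, depth):
    # Boolean one-hot rows: True at each predicted class index, mirroring
    # tf.one_hot(indices, depth) == 1.
    indices = np.asarray(indices)
    out = np.zeros((len(indices), depth), dtype=bool)
    out[np.arange(len(indices)), indices] = True
    return out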
Example #5
    def write_file(self,
                   experiment_dir,
                   checkpoint_step,
                   doc_acc=True,
                   do_is_training=True):
        if checkpoint_step is not None:
            checkpoint_file = experiment_dir + "/checkpoints/" + "model-" + str(
                checkpoint_step)
        else:
            checkpoint_file = tf.train.latest_checkpoint(experiment_dir +
                                                         "/checkpoints/",
                                                         latest_filename=None)
        file_name = os.path.basename(checkpoint_file)
        self.eval_log = open(os.path.join(experiment_dir,
                                          file_name + "_eval.log"),
                             mode="w+")

        logging.info("Evaluating: " + __file__)
        self.eval_log.write("Evaluating: " + __file__ + "\n")
        logging.info("Test for prob: " + self.dater.problem_name)
        self.eval_log.write("Test for prob: " + self.dater.problem_name + "\n")
        logging.info(checkpoint_file)
        self.eval_log.write(checkpoint_file + "\n")
        logging.info(AM.get_time())
        self.eval_log.write(AM.get_time() + "\n")
        logging.info("Total number of test examples: {}".format(
            len(self.test_data.label_instance)))
        self.eval_log.write("Total number of test examples: {}\n".format(
            len(self.test_data.label_instance)))

        graph = tf.Graph()
        with graph.as_default():
            session_conf = tf.ConfigProto(allow_soft_placement=True,
                                          log_device_placement=False)
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph(
                    "{}.meta".format(checkpoint_file))
                saver.restore(sess, checkpoint_file)

                # Get the placeholders from the graph by name
                input_x = graph.get_operation_by_name("input_x").outputs[0]
                input_y = graph.get_operation_by_name("input_y").outputs[0]
                dropout_keep_prob = graph.get_operation_by_name(
                    "dropout_keep_prob").outputs[0]
                if do_is_training:
                    is_training = graph.get_operation_by_name(
                        "is_training").outputs[0]
                else:
                    is_training = None

                # Tensors we want to evaluate
                scores = graph.get_operation_by_name(
                    "output/scores").outputs[0]
                predictions = graph.get_operation_by_name(
                    "output/predictions").outputs[0]

                # TRAIN ===========================================================================
                x_batches = DataHelper.batch_iter(self.train_data.value,
                                                  64,
                                                  1,
                                                  shuffle=False)
                y_batches = DataHelper.batch_iter(
                    self.train_data.label_instance, 64, 1, shuffle=False)
                all_score = None
                pred_sigmoid = None

                for [x_test_batch, y_test_batch] in zip(x_batches, y_batches):
                    if do_is_training:
                        batch_scores, batch_pred_sigmoid = sess.run(
                            [scores, predictions], {
                                input_x: x_test_batch,
                                dropout_keep_prob: 1.0,
                                is_training: 0
                            })
                    else:
                        batch_scores, batch_pred_sigmoid = sess.run(
                            [scores, predictions], {
                                input_x: x_test_batch,
                                dropout_keep_prob: 1.0
                            })

                    if all_score is None:
                        all_score = batch_scores
                        pred_sigmoid = batch_pred_sigmoid
                    else:
                        all_score = np.concatenate([all_score, batch_scores],
                                                   axis=0)
                        pred_sigmoid = np.concatenate(
                            [pred_sigmoid, batch_pred_sigmoid], axis=0)

                self.write_dist_file(doc_size_list=self.train_data.doc_size,
                                     all_sigmoids=pred_sigmoid,
                                     label=self.train_data.label_doc,
                                     experiment_dir=experiment_dir,
                                     file_name="train")

                # TEST  ===========================================================================
                all_score = None
                pred_sigmoid = None
                x_batches = DataHelper.batch_iter(self.test_data.value,
                                                  64,
                                                  1,
                                                  shuffle=False)
                y_batches = DataHelper.batch_iter(
                    self.test_data.label_instance, 64, 1, shuffle=False)

                for [x_test_batch, y_test_batch] in zip(x_batches, y_batches):
                    if do_is_training:
                        batch_scores, batch_pred_sigmoid = sess.run(
                            [scores, predictions], {
                                input_x: x_test_batch,
                                dropout_keep_prob: 1.0,
                                is_training: 0
                            })
                    else:
                        batch_scores, batch_pred_sigmoid = sess.run(
                            [scores, predictions], {
                                input_x: x_test_batch,
                                dropout_keep_prob: 1.0
                            })

                    if all_score is None:
                        all_score = batch_scores
                        pred_sigmoid = batch_pred_sigmoid
                    else:
                        all_score = np.concatenate([all_score, batch_scores],
                                                   axis=0)
                        pred_sigmoid = np.concatenate(
                            [pred_sigmoid, batch_pred_sigmoid], axis=0)

            self.write_dist_file(doc_size_list=self.test_data.doc_size,
                                 all_sigmoids=pred_sigmoid,
                                 label=self.test_data.label_doc,
                                 experiment_dir=experiment_dir,
                                 file_name="test")