def load_proced_dir(self, csv_file):
    # Load the label CSV: author list, document file ids, and the doc-level label matrix.
    authors, file_ids, label_matrix = DataHelper.load_csv(csv_file_path=csv_file)
    self.num_of_classes = label_matrix.shape[1]
    logging.info("LABEL MATRIX HAS SHAPE: " + str(label_matrix.shape))

    data = AAData(name="ML", size=len(file_ids))
    data.file_id = file_ids

    origin_list = [None] * data.size
    doc_size = [None] * data.size

    # Walk the per-author folders and load every preprocessed file listed in the CSV.
    folder_list = os.listdir(self.training_data_dir)
    for author in folder_list:
        f = self.training_data_dir + author
        if os.path.isdir(f):
            sub_file_list = os.listdir(f)
            for file_name in sub_file_list:
                if file_name in data.file_id:
                    index = data.file_id.index(file_name)
                    file_content = DataHelperML.load_proced_file(
                        data_dir=self.training_data_dir,
                        author_code=author,
                        file_name=file_name)
                    origin_list[index] = file_content
                    doc_size[index] = len(file_content)

    doc_size = np.array(doc_size)

    data.raw = origin_list
    data.label_doc = label_matrix
    data.doc_size = doc_size

    return data
def load_raw_file(data_dir, author_name, file_name):
    if not os.path.exists(os.path.dirname(data_dir + author_name + "/")):
        logging.error("error: " + author_name + " does not exist")
        return
    with open(data_dir + author_name + "/txt/txt-preprocessed/" + file_name, "r") as f:
        file_content = f.readlines()

    content = []
    paragraph = []
    for line in file_content:
        line = line.strip()
        if len(line) == 0 and len(paragraph) > 0:  # end of paragraph, split and push
            paragraph = " ".join(paragraph)
            content.extend(DataHelper.split_sentence(paragraph))
            paragraph = []
        elif len(line.split()) <= 2:  # too short
            pass
        else:  # keep adding to paragraph
            paragraph.append(line)
    return content
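# Hypothetical usage sketch (not part of the original module): shows how the two
# loaders above might be driven together. The `DataHelperML(...)` constructor
# arguments, the "./data/labels.csv" path, and the author/file names are
# assumptions for illustration only.
#
#   helper = DataHelperML(...)                                   # assumed constructor
#   train_data = helper.load_proced_dir(csv_file="./data/labels.csv")
#   sentences = load_raw_file(data_dir=helper.training_data_dir,
#                             author_name="author_a",            # assumed author folder
#                             file_name="doc_001.txt")           # assumed file name
#   logging.info("loaded %d docs; example doc has %d sentences",
#                train_data.size, len(sentences))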
def evaluate(self, experiment_dir, checkpoint_step, doc_acc=False, do_is_training=True):
    if checkpoint_step is not None:
        checkpoint_file = experiment_dir + "/checkpoints/" + "model-" + str(checkpoint_step)
    else:
        checkpoint_file = tf.train.latest_checkpoint(experiment_dir + "/checkpoints/",
                                                     latest_filename=None)
    file_name = os.path.basename(checkpoint_file)
    self.eval_log = open(os.path.join(experiment_dir, file_name + "_eval.log"), mode="w+")

    console = logging.StreamHandler()
    logging.getLogger('').addHandler(console)

    self.eval_log.write("Evaluating: " + __file__ + "\n")
    self.eval_log.write("Test for prob: " + self.dater.problem_name + "\n")
    self.eval_log.write(checkpoint_file + "\n")
    self.eval_log.write(AM.get_time() + "\n")
    self.eval_log.write("Total number of test examples: {}\n".format(
        len(self.test_data.label_instance)))

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
            if do_is_training:
                is_training = graph.get_operation_by_name("is_training").outputs[0]
            else:
                is_training = None

            # Tensors we want to evaluate
            scores = graph.get_operation_by_name("output/scores").outputs[0]
            predictions = graph.get_operation_by_name("output/predictions").outputs[0]

            # Generate batches for one epoch
            x_batches = DataHelper.batch_iter(self.test_data.value, 64, 1, shuffle=False)
            y_batches = DataHelper.batch_iter(self.test_data.label_instance, 64, 1,
                                              shuffle=False)

            # Collect the predictions here
            all_score = None
            pred = None
            for [x_test_batch, y_test_batch] in zip(x_batches, y_batches):
                if do_is_training:
                    batch_scores, batch_pred_max = sess.run(
                        [scores, predictions],
                        {input_x: x_test_batch, dropout_keep_prob: 1.0, is_training: 0})
                else:
                    batch_scores, batch_pred_max = sess.run(
                        [scores, predictions],
                        {input_x: x_test_batch, dropout_keep_prob: 1.0})
                batch_scores = tf.nn.softmax(batch_scores).eval()
                if all_score is None:
                    all_score = batch_scores
                    pred = batch_pred_max
                else:
                    all_score = np.concatenate([all_score, batch_scores], axis=0)
                    pred = np.concatenate([pred, batch_pred_max], axis=0)

    mi_prec = precision_score(y_true=self.y_test_scalar, y_pred=pred, average="micro")
    self.eval_log.write("micro prec:\t" + str(mi_prec) + "\n")
    mi_recall = recall_score(y_true=self.y_test_scalar, y_pred=pred, average="micro")
    self.eval_log.write("micro recall:\t" + str(mi_recall) + "\n")
    mi_f1 = f1_score(y_true=self.y_test_scalar, y_pred=pred, average="micro")
    self.eval_log.write("micro f1:\t" + str(mi_f1) + "\n")

    ma_prec = precision_score(y_true=self.y_test_scalar, y_pred=pred, average='macro')
    self.eval_log.write("macro prec:\t" + str(ma_prec) + "\n")
    ma_recall = recall_score(y_true=self.y_test_scalar, y_pred=pred, average='macro')
    self.eval_log.write("macro recall:\t" + str(ma_recall) + "\n")
    ma_f1 = f1_score(y_true=self.y_test_scalar, y_pred=pred, average='macro')
    self.eval_log.write("macro f1:\t" + str(ma_f1) + "\n")

    jaccard = jaccard_similarity_score(y_true=self.y_test_scalar, y_pred=pred)
    self.eval_log.write("jaccard:\t" + str(jaccard) + "\n")
    hamming = hamming_loss(y_true=self.y_test_scalar, y_pred=pred)
    self.eval_log.write("hamming:\t" + str(hamming) + "\n")
    acc = accuracy_score(y_true=self.y_test_scalar, y_pred=pred)
    self.eval_log.write("acc:\t" + str(acc) + "\n")
    self.eval_log.write("\n")
    self.eval_log.write("\n")

    self.print_a_csv(exp_dir=experiment_dir, file_name=file_name, method_name="NORM",
                     prob=all_score, pred=pred, true=self.y_test_scalar)
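# Hypothetical usage sketch (assumption, not from the original source): this evaluate
# method restores a saved checkpoint by name, so a driver script might look like the
# following. `Evaluator` and its constructor arguments are placeholders for the actual
# class this method belongs to, and the run directory is made up for illustration.
#
#   ev = Evaluator(...)                              # assumed wrapper class
#   ev.evaluate(experiment_dir="./runs/1509465610",  # assumed experiment directory
#               checkpoint_step=5000,                # restores "model-5000"; None -> latest checkpoint
#               doc_acc=False,
#               do_is_training=True)                 # graph was exported with an is_training placeholder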
def evaluate(self, experiment_dir, checkpoint_step, doc_acc=True, do_is_training=True):
    if checkpoint_step is not None:
        checkpoint_file = experiment_dir + "/checkpoints/" + "model-" + str(checkpoint_step)
    else:
        checkpoint_file = tf.train.latest_checkpoint(experiment_dir + "/checkpoints/",
                                                     latest_filename=None)
    file_name = os.path.basename(checkpoint_file)
    self.eval_log = open(os.path.join(experiment_dir, file_name + "_eval.log"), mode="w+")

    logging.info("Evaluating: " + __file__)
    self.eval_log.write("Evaluating: " + __file__ + "\n")
    logging.info("Test for prob: " + self.dater.problem_name)
    self.eval_log.write("Test for prob: " + self.dater.problem_name + "\n")
    logging.info(checkpoint_file)
    self.eval_log.write(checkpoint_file + "\n")
    logging.info(AM.get_time())
    self.eval_log.write(AM.get_time() + "\n")
    logging.info("Total number of test examples: {}".format(
        len(self.test_data.label_instance)))
    self.eval_log.write("Total number of test examples: {}\n".format(
        len(self.test_data.label_instance)))

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
            if do_is_training:
                is_training = graph.get_operation_by_name("is_training").outputs[0]
            else:
                is_training = None

            # Tensors we want to evaluate
            scores = graph.get_operation_by_name("output/scores").outputs[0]
            predictions_sigmoid = graph.get_operation_by_name(
                "output/predictions_sigmoid").outputs[0]
            predictions_max = graph.get_operation_by_name(
                "output/predictions_max").outputs[0]

            # Generate batches for one epoch
            x_batches = DataHelper.batch_iter(self.test_data.value, 64, 1, shuffle=False)
            y_batches = DataHelper.batch_iter(self.test_data.label_instance, 64, 1,
                                              shuffle=False)

            # Collect the predictions here
            all_score = None
            pred_sigmoid_value = None
            pred_max_bool = None
            pred_sigmoid_bool = None
            for [x_test_batch, y_test_batch] in zip(x_batches, y_batches):
                if do_is_training:
                    batch_scores, batch_pred_sigmoid, batch_pred_max_index = sess.run(
                        [scores, predictions_sigmoid, predictions_max],
                        {input_x: x_test_batch, dropout_keep_prob: 1.0, is_training: 0})
                else:
                    batch_scores, batch_pred_sigmoid, batch_pred_max_index = sess.run(
                        [scores, predictions_sigmoid, predictions_max],
                        {input_x: x_test_batch, dropout_keep_prob: 1.0})
                batch_pred_max_bool = tf.one_hot(
                    indices=batch_pred_max_index,
                    depth=self.dater.num_of_classes).eval() == 1  # TODO temp

                if all_score is None:
                    all_score = batch_scores
                    pred_max_bool = batch_pred_max_bool
                    pred_sigmoid_bool = batch_pred_sigmoid > 0.5
                    pred_sigmoid_value = batch_pred_sigmoid
                else:
                    all_score = np.concatenate([all_score, batch_scores], axis=0)
                    pred_max_bool = np.concatenate(
                        [pred_max_bool, batch_pred_max_bool], axis=0)
                    pred_sigmoid_bool = np.concatenate(
                        [pred_sigmoid_bool, batch_pred_sigmoid > 0.5], axis=0)
                    pred_sigmoid_value = np.concatenate(
                        [pred_sigmoid_value, batch_pred_sigmoid], axis=0)

    # logging.info("== PRED MAX ==")
    # self.eval_log.write("== PRED MAX ==")
    # self.sent_accuracy(pred_max_bool)

    logging.info("== PRED SIGMOID ==")
    self.eval_log.write("== PRED SIGMOID ==")
    self.sent_accuracy(pred_sigmoid_bool)

    if doc_acc:
        # print("========== WITH MAX ==========")
        # self.doc_accuracy(pred_max)
        # print("========== WITH SIGMOID ==========")
        self.eval_log.write("========== WITH VOTE ==========\n\n")
        self.doc_accuracy(pred_sigmoid_bool)
        self.eval_log.write("========== WITH SIGMOID CUMU ==========\n\n")
        self.doc_accuracy_sigmoid_cumulation(pred_sigmoid_value)

    self.eval_log.write("\n")
    self.eval_log.write("\n")
def write_file(self, experiment_dir, checkpoint_step, doc_acc=True, do_is_training=True):
    if checkpoint_step is not None:
        checkpoint_file = experiment_dir + "/checkpoints/" + "model-" + str(checkpoint_step)
    else:
        checkpoint_file = tf.train.latest_checkpoint(experiment_dir + "/checkpoints/",
                                                     latest_filename=None)
    file_name = os.path.basename(checkpoint_file)
    self.eval_log = open(os.path.join(experiment_dir, file_name + "_eval.log"), mode="w+")

    logging.info("Evaluating: " + __file__)
    self.eval_log.write("Evaluating: " + __file__ + "\n")
    logging.info("Test for prob: " + self.dater.problem_name)
    self.eval_log.write("Test for prob: " + self.dater.problem_name + "\n")
    logging.info(checkpoint_file)
    self.eval_log.write(checkpoint_file + "\n")
    logging.info(AM.get_time())
    self.eval_log.write(AM.get_time() + "\n")
    logging.info("Total number of test examples: {}".format(
        len(self.test_data.label_instance)))
    self.eval_log.write("Total number of test examples: {}\n".format(
        len(self.test_data.label_instance)))

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
            if do_is_training:
                is_training = graph.get_operation_by_name("is_training").outputs[0]
            else:
                is_training = None

            # Tensors we want to evaluate
            scores = graph.get_operation_by_name("output/scores").outputs[0]
            predictions = graph.get_operation_by_name("output/predictions").outputs[0]

            # TRAIN ===========================================================================
            x_batches = DataHelper.batch_iter(self.train_data.value, 64, 1, shuffle=False)
            y_batches = DataHelper.batch_iter(self.train_data.label_instance, 64, 1,
                                              shuffle=False)

            all_score = None
            pred_sigmoid = None
            for [x_test_batch, y_test_batch] in zip(x_batches, y_batches):
                if do_is_training:
                    batch_scores, batch_pred_sigmoid = sess.run(
                        [scores, predictions],
                        {input_x: x_test_batch, dropout_keep_prob: 1.0, is_training: 0})
                else:
                    batch_scores, batch_pred_sigmoid = sess.run(
                        [scores, predictions],
                        {input_x: x_test_batch, dropout_keep_prob: 1.0})
                if all_score is None:
                    all_score = batch_scores
                    pred_sigmoid = batch_pred_sigmoid
                else:
                    all_score = np.concatenate([all_score, batch_scores], axis=0)
                    pred_sigmoid = np.concatenate(
                        [pred_sigmoid, batch_pred_sigmoid], axis=0)

            self.write_dist_file(doc_size_list=self.train_data.doc_size,
                                 all_sigmoids=pred_sigmoid,
                                 label=self.train_data.label_doc,
                                 experiment_dir=experiment_dir,
                                 file_name="train")

            # TEST ===========================================================================
            all_score = None
            pred_sigmoid = None
            x_batches = DataHelper.batch_iter(self.test_data.value, 64, 1, shuffle=False)
            y_batches = DataHelper.batch_iter(self.test_data.label_instance, 64, 1,
                                              shuffle=False)
            for [x_test_batch, y_test_batch] in zip(x_batches, y_batches):
                if do_is_training:
                    batch_scores, batch_pred_sigmoid = sess.run(
                        [scores, predictions],
                        {input_x: x_test_batch, dropout_keep_prob: 1.0, is_training: 0})
                else:
                    batch_scores, batch_pred_sigmoid = sess.run(
                        [scores, predictions],
                        {input_x: x_test_batch, dropout_keep_prob: 1.0})
                if all_score is None:
                    all_score = batch_scores
                    pred_sigmoid = batch_pred_sigmoid
                else:
                    all_score = np.concatenate([all_score, batch_scores], axis=0)
                    pred_sigmoid = np.concatenate(
                        [pred_sigmoid, batch_pred_sigmoid], axis=0)

            self.write_dist_file(doc_size_list=self.test_data.doc_size,
                                 all_sigmoids=pred_sigmoid,
                                 label=self.test_data.label_doc,
                                 experiment_dir=experiment_dir,
                                 file_name="test")
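# Hypothetical usage sketch (assumption): write_file restores a checkpoint once and
# dumps per-sentence sigmoid distributions for both the training and the test split
# via write_dist_file. `Evaluator` and the run directory below are placeholders, not
# names from the original source.
#
#   ev = Evaluator(...)                              # assumed wrapper class
#   ev.write_file(experiment_dir="./runs/1509465610",
#                 checkpoint_step=None,              # None -> use latest checkpoint
#                 do_is_training=True)               # produces "train" and "test" distribution files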