Example #1
File: segmentor.py  Project: memeda/cws
 def evaluate(self , gold_path , predict_path) :
     gold_ite = DatasetHandler.read_dev_data(gold_path)
     predict_ite = DatasetHandler.read_dev_data(predict_path)
     nr_processing = 0
     nr_gold = 0
     nr_processing_right = 0
     nr_line = 0
     while True :
         try :
             gold_instance = gold_ite.next()
             predict_instance = predict_ite.next()
         except StopIteration :
             break
         nr_line += 1
         gold_unigrams , gold_tags = self._processing_one_segmented_WSAtom_instance2unigrams_and_tags(gold_instance)
         predict_unigrams , predict_tags = self._processing_one_segmented_WSAtom_instance2unigrams_and_tags(predict_instance)
         gold_coor_seq = self.__innerfunc_4evaluate_generate_word_coordinate_sequence_from_tags(gold_tags)
         predict_coor_seq = self.__innerfunc_4evaluate_generate_word_coordinate_sequence_from_tags(predict_tags)
         cur_nr_gold , cur_nr_processing , cur_nr_processing_right = (
                 self.__innerfunc_4evaluate_get_nr_gold_and_processing_and_processing_right(gold_coor_seq , predict_coor_seq) )
         nr_gold += cur_nr_gold
         nr_processing += cur_nr_processing
         nr_processing_right += cur_nr_processing_right
     p , r , f = self.__innerfunc_4evaluate_calculate_prf(nr_gold , nr_processing , nr_processing_right)
     print ("Eval result :\np : %.2f%% r : %.2f%% f : %.2f%%\n"
           "line num : %d total word num : %d total predict word num : %d predict right num : %d ") %(
             p * 100 , r * 100, f * 100 , nr_line , nr_gold , nr_processing , nr_processing_right
             )
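A note on the metric helper: __innerfunc_4evaluate_calculate_prf is not shown in any of these examples. The sketch below is a minimal guess at the standard word-segmentation precision/recall/F1 computation it presumably performs; the standalone function name and the zero-division guards are assumptions, not the project's code.

def calculate_prf(nr_gold, nr_processing, nr_processing_right):
    # precision: correctly predicted words / all predicted words
    p = nr_processing_right / float(nr_processing) if nr_processing else 0.0
    # recall: correctly predicted words / all gold-standard words
    r = nr_processing_right / float(nr_gold) if nr_gold else 0.0
    # F1: harmonic mean of precision and recall
    f = 2 * p * r / (p + r) if (p + r) else 0.0
    return p, r, f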
Example #2
def plot_roc():
    for domain in domains:
        for model_name in model_names:
            for init_opt in init_opts:
                for preprocess_opt in preprocess_opts:
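                    # accumulate test-set predictions and labels over all CV folds for this configuration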
                    scores = []
                    labels = []
                    for ind_fold, fold in enumerate(folds):
                        K.clear_session()
                        dh = DatasetHandler(domain)
                        dataset_fold = dh.get_fold(ind_fold, preprocess_opt)

                        model_path = os.path.join('log', domain, model_name,
                                                  init_opt, preprocess_opt,
                                                  fold, 'stage_5.h5')
                        model = model_loader.load_full_model(model_name,
                                                             no_cats=2)
                        model.load_weights(model_path)

                        scores.append(
                            model.predict(dataset_fold['test_data'],
                                          batch_size=10))
                        labels.append(dataset_fold['test_labels'])

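                    # pool the folds and take class index 1 from the score matrix and one-hot labels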
                    scores = np.concatenate(scores)[:, 1]
                    labels = np.concatenate(labels)[:, 1]

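                    # compute the pooled ROC curve and its AUC, then save them to a .mat file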
                    fpr, tpr, _ = roc_curve(labels, scores)
                    roc_auc = auc(fpr, tpr)
                    savemat(
                        os.path.join(
                            'log', 'roc', domain + '_' + model_name + '_' +
                            init_opt + '_' + preprocess_opt + '.mat'), {
                                'fpr': fpr,
                                'tpr': tpr,
                                'roc_auc': roc_auc
                            })

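                    # plot the ROC curve against the chance diagonal and save the figure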
                    plt.plot(fpr,
                             tpr,
                             color='darkorange',
                             lw=2,
                             label='ROC curve (area = %0.2f)' % roc_auc)
                    plt.plot([0, 1], [0, 1],
                             color='navy',
                             lw=2,
                             linestyle='--')
                    plt.xlim([0.0, 1.0])
                    plt.ylim([0.0, 1.05])
                    plt.xlabel('False Positive Rate')
                    plt.ylabel('True Positive Rate')
                    plt.title('Receiver operating characteristics curve')
                    plt.legend(loc="lower right")
                    plt.savefig(
                        os.path.join(
                            'log', 'roc', domain + '_' + model_name + '_' +
                            init_opt + '_' + preprocess_opt))
                    plt.close()
Example #3
def seg_eval(args):
    if not (DatasetHandler.is_readable(args.gold_file)):
        logging.error("path '%s' open failed !" % (args.gold_file))
        logging.error('Exit!')
        exit(1)
    if not DatasetHandler.is_readable(args.predict_file):
        logging.error("path '%s' open failed ! predict file open error ." %
                      (args.predict_file))
        logging.error("Exit!")
        exit(1)
    segmentor = Segmentor()
    segmentor.evaluate(args.gold_file, args.predict_file)
Example #4
def main():
    for domain in domains:
        for model_name in model_names:
            for init_opt in init_opts:
                for preprocess_opt in preprocess_opts:
                    log_path = os.path.join(log_path_main, domain, model_name,
                                            init_opt)
                    if not os.path.exists(log_path):
                        os.makedirs(log_path)
                    for ind_fold, fold in enumerate(folds):
                        K.clear_session()
                        dh = DatasetHandler(domain)
                        dataset_fold = dh.get_fold(ind_fold, preprocess_opt)

                        model_path = os.path.join('log', domain, model_name,
                                                  init_opt, preprocess_opt,
                                                  fold, 'stage_5.h5')
                        model = model_loader.load_full_model(model_name,
                                                             no_cats=2)
                        model.load_weights(model_path)
                        model.compile(optimizer='adam',
                                      loss='categorical_crossentropy',
                                      metrics=['accuracy'])
                        guided_bprop = GuidedBackprop(model)

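                        # compute a guided-backprop saliency map for each test image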
                        for ind_image, image in enumerate(
                                dataset_fold['test_data']):
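                            # collapse VL images to grayscale (0.299/0.587/0.114 luma weights), then replicate to 3 channels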
                            if domain == 'VL':
                                image = np.dot(image[..., :3],
                                               [0.299, 0.587, 0.114])
                                image = image[:, :, np.newaxis]
                                image = np.repeat(image, 3, axis=2)

                            mask = guided_bprop.get_mask(image)

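                            # collapse the per-channel gradients into a single magnitude map (L2 norm over channels)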
                            mask = np.power(mask, 2)
                            mask = np.sum(mask, axis=2)
                            mask = np.sqrt(mask)

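                            # rescale to [0, 255] and save the map as an 8-bit grayscale PNG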
                            mask -= np.min(mask)
                            norm_max = np.max(mask)
                            if norm_max == 0:
                                norm_max = 1
                            mask /= norm_max
                            mask *= 255

                            mask = np.uint8(mask)
                            img = Image.fromarray(mask, 'L')
                            im_name = dataset_fold['test_image_names'][
                                ind_image].split('.')[0]
                            img.save(os.path.join(log_path, im_name + '.png'),
                                     'PNG')
Example #5
File: segmentor.py  Project: fseasy/cws
 def _4training_evaluate_processing(self, dev_path):
     nr_processing_right = 0
     nr_gold = 0
     nr_processing = 0
     for instance in DatasetHandler.read_dev_data(dev_path):
         unigrams, gold_tags = Segmentor._processing_one_segmented_WSAtom_instance2unigrams_and_tags(
             instance)
         predict_tags = Decoder.decode_for_predict(self.extractor,
                                                   self.model,
                                                   self.constrain, unigrams)
         gold_coor_seq = self.__innerfunc_4evaluate_generate_word_coordinate_sequence_from_tags(
             gold_tags)
         predict_coor_seq = self.__innerfunc_4evaluate_generate_word_coordinate_sequence_from_tags(
             predict_tags)
         cur_nr_gold, cur_nr_processing, cur_nr_processing_right = (
             self.
             __innerfunc_4evaluate_get_nr_gold_and_processing_and_processing_right(
                 gold_coor_seq, predict_coor_seq))
         nr_gold += cur_nr_gold
         nr_processing += cur_nr_processing
         nr_processing_right += cur_nr_processing_right
     p, r, f = self.__innerfunc_4evaluate_calculate_prf(
         nr_gold, nr_processing, nr_processing_right)
     print >> sys.stderr, (
         "Eval result :\np : %.2f%% r : %.2f%% f : %.2f%%\n"
         "total word num : %d total predict word num : %d predict right num : %d "
     ) % (p * 100, r * 100, f * 100, nr_gold, nr_processing,
          nr_processing_right)
     return f
Example #6
def seg_train(args):
    if not DatasetHandler.is_readable(args.training_file):
        logging.error("path '%s' open failed !" % (args.training_file))
        logging.error('Exit!')
        exit(1)
    if not DatasetHandler.is_readable(args.developing_file):
        logging.error("path '%s' open failed !" % (args.developing_file))
        logging.error("Exit!")
        exit(1)
    if not DatasetHandler.is_writeable(args.model_saving):
        logging.error("path '%s' open failed !" % (args.model_saving))
        logging.error('Exit!')
        exit(1)
    segmentor = Segmentor()
    segmentor.train(args.training_file, args.developing_file,
                    args.model_saving, args.max_iter)
Example #7
def seg_predict(args):
    if not DatasetHandler.is_readable(args.predict_file):
        logging.error("path '%s' open failed !" % (args.predict_file))
        logging.error('Exit!')
        exit(1)
    if not DatasetHandler.is_readable(args.model_loading):
        logging.error("path '%s' open failed ! Model load Error ." %
                      (args.model_loading))
        logging.error("Exit!")
        exit(1)
    if not DatasetHandler.is_writeable(
            args.output_path) and args.output_path != "stdout":
        logging.error("path '%s' open failed !" % (args.output_path))
        logging.error('Exit!')
        exit(1)
    segmentor = Segmentor()
    segmentor.predict(args.model_loading, args.predict_file, args.output_path)
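The args object passed to seg_train, seg_predict, and seg_eval comes from the project's command-line parser, which is not shown in these examples. Below is a minimal argparse sketch of how the attributes used above (training_file, developing_file, model_saving, max_iter, predict_file, model_loading, output_path, gold_file) could be wired up; the sub-command names, flag spellings, and defaults are assumptions for illustration only.

import argparse

def build_arg_parser():
    # hypothetical CLI wiring for the seg_* entry points shown above
    parser = argparse.ArgumentParser(description="CWS segmentor command line")
    sub = parser.add_subparsers(dest="command")

    train = sub.add_parser("train")
    train.add_argument("--training-file", dest="training_file", required=True)
    train.add_argument("--developing-file", dest="developing_file", required=True)
    train.add_argument("--model-saving", dest="model_saving", required=True)
    train.add_argument("--max-iter", dest="max_iter", type=int, default=None)
    train.set_defaults(func=seg_train)

    predict = sub.add_parser("predict")
    predict.add_argument("--predict-file", dest="predict_file", required=True)
    predict.add_argument("--model-loading", dest="model_loading", required=True)
    predict.add_argument("--output-path", dest="output_path", default="stdout")
    predict.set_defaults(func=seg_predict)

    evaluate = sub.add_parser("evaluate")
    evaluate.add_argument("--gold-file", dest="gold_file", required=True)
    evaluate.add_argument("--predict-file", dest="predict_file", required=True)
    evaluate.set_defaults(func=seg_eval)
    return parser

if __name__ == "__main__":
    cli_args = build_arg_parser().parse_args()
    cli_args.func(cli_args)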
Example #8
File: segmentor.py  Project: memeda/cws
 def train(self , training_path , dev_path , model_saving_path , max_iter=None) :
     self._set_max_iter(max_iter)
     self.raw_training_data = DatasetHandler.read_training_data(training_path)
     self._build_inner_lexicon(threshold=0.9)
     self._processing_raw_training_data2unigrams_and_tags()
     self._build_extractor()
     self._build_constrain()
     self._build_decoder()
     self._build_training_model()
     self._training_processing( model_saving_path , dev_path)
Example #9
File: segmentor.py  Project: fseasy/cws
 def train(self, training_path, dev_path, model_saving_path, max_iter=None):
     self._set_max_iter(max_iter)
     self.raw_training_data = DatasetHandler.read_training_data(
         training_path)
     self._build_inner_lexicon(threshold=0.9)
     self._processing_raw_training_data2unigrams_and_tags()
     self._build_extractor()
     self._build_constrain()
     self._build_decoder()
     self._build_training_model()
     self._training_processing(model_saving_path, dev_path)
Example #10
File: segmentor.py  Project: fseasy/cws
 def evaluate(self, gold_path, predict_path):
     gold_ite = DatasetHandler.read_dev_data(gold_path)
     predict_ite = DatasetHandler.read_dev_data(predict_path)
     nr_processing = 0
     nr_gold = 0
     nr_processing_right = 0
     nr_line = 0
     while True:
         try:
             gold_instance = gold_ite.next()
             predict_instance = predict_ite.next()
         except StopIteration:
             break
         nr_line += 1
         gold_unigrams, gold_tags = self._processing_one_segmented_WSAtom_instance2unigrams_and_tags(
             gold_instance)
         predict_unigrams, predict_tags = self._processing_one_segmented_WSAtom_instance2unigrams_and_tags(
             predict_instance)
         gold_coor_seq = self.__innerfunc_4evaluate_generate_word_coordinate_sequence_from_tags(
             gold_tags)
         predict_coor_seq = self.__innerfunc_4evaluate_generate_word_coordinate_sequence_from_tags(
             predict_tags)
         cur_nr_gold, cur_nr_processing, cur_nr_processing_right = (
             self.
             __innerfunc_4evaluate_get_nr_gold_and_processing_and_processing_right(
                 gold_coor_seq, predict_coor_seq))
         nr_gold += cur_nr_gold
         nr_processing += cur_nr_processing
         nr_processing_right += cur_nr_processing_right
     p, r, f = self.__innerfunc_4evaluate_calculate_prf(
         nr_gold, nr_processing, nr_processing_right)
     print(
         "Eval result :\np : %.2f%% r : %.2f%% f : %.2f%%\n"
         "line num : %d total word num : %d total predict word num : %d predict right num : %d "
     ) % (p * 100, r * 100, f * 100, nr_line, nr_gold, nr_processing,
          nr_processing_right)
Example #11
File: segmentor.py  Project: memeda/cws
 def _predict_processing(self , predict_path , output_path) :
     if isinstance(output_path , file) :
         output_f = output_path 
     else :
         if  output_path == "stdout" :
             output_f = sys.stdout
         else :
             output_f = open(output_path , "w")
     logging.info("set output %s " %(output_f.name))
     logging.info("reading instance from %s . predicting ." %(predict_path))
     for instance , separator_data in DatasetHandler.read_predict_data(predict_path) :
         self.constrain.set_constrain_data(separator_data)
         predict_tags = Decoder.decode_for_predict(self.extractor , self.model , self.constrain , instance)
         segmented_line = self._processing_unigrams_and_tags2segmented_line(instance,predict_tags)
         output_f.write("%s" %( "".join([segmented_line , os.linesep]) ) )
     if output_f is not sys.stdout :
         output_f.close()
     logging.info("predicting done.")
Example #12
File: segmentor.py  Project: memeda/cws
 def _4training_evaluate_processing(self , dev_path) :
     nr_processing_right = 0
     nr_gold = 0
     nr_processing = 0
     for instance in DatasetHandler.read_dev_data(dev_path) :
         unigrams , gold_tags = Segmentor._processing_one_segmented_WSAtom_instance2unigrams_and_tags(instance)
         predict_tags = Decoder.decode_for_predict(self.extractor , self.model , self.constrain , unigrams)
         gold_coor_seq = self.__innerfunc_4evaluate_generate_word_coordinate_sequence_from_tags(gold_tags)
         predict_coor_seq = self.__innerfunc_4evaluate_generate_word_coordinate_sequence_from_tags(predict_tags)
         cur_nr_gold , cur_nr_processing , cur_nr_processing_right = (
                         self.__innerfunc_4evaluate_get_nr_gold_and_processing_and_processing_right(gold_coor_seq , predict_coor_seq)
                 )
         nr_gold += cur_nr_gold
         nr_processing += cur_nr_processing
         nr_processing_right += cur_nr_processing_right
     p , r , f = self.__innerfunc_4evaluate_calculate_prf(nr_gold , nr_processing , nr_processing_right)
     print >>sys.stderr , ("Eval result :\np : %.2f%% r : %.2f%% f : %.2f%%\n"
            "total word num : %d total predict word num : %d predict right num : %d ")%(
             p * 100 , r * 100, f * 100 , nr_gold , nr_processing , nr_processing_right
             )
     return f
Example #13
File: segmentor.py  Project: fseasy/cws
 def _predict_processing(self, predict_path, output_path):
     if isinstance(output_path, file):
         output_f = output_path
     else:
         if output_path == "stdout":
             output_f = sys.stdout
         else:
             output_f = open(output_path, "w")
     logging.info("set output %s " % (output_f.name))
     logging.info("reading instance from %s . predicting ." %
                  (predict_path))
     for instance, separator_data in DatasetHandler.read_predict_data(
             predict_path):
         self.constrain.set_constrain_data(separator_data)
         predict_tags = Decoder.decode_for_predict(self.extractor,
                                                   self.model,
                                                   self.constrain, instance)
         segmented_line = self._processing_unigrams_and_tags2segmented_line(
             instance, predict_tags)
         output_f.write("%s" % ("".join([segmented_line, os.linesep])))
     if output_f is not sys.stdout:
         output_f.close()
     logging.info("predicting done.")
Example #14
def main():
    domains = ['IR', 'VL']
    preprocess_methods = ['mean_subtraction', 'scaling']
    init_methods = ['random', 'ImageNet']
    model_names = ['ResNet50', 'VGG19']
    stages = ['stage_1', 'stage_2', 'stage_3', 'stage_4', 'stage_5']

    ResNet50_layer_names = [
        'max_pooling2d_1', 'activation_10', 'activation_22', 'activation_40',
        'activation_49'
    ]

    VGG19_layer_names = [
        'block1_pool', 'block2_pool', 'block3_pool', 'block4_pool',
        'block5_pool'
    ]

    out_dict = {}
    sampling_method = 'pca'
    for domain in domains:
        for preprocess_method in preprocess_methods:
            # load dataset
            dh = DatasetHandler(domain)
            dataset_all = dh.get_all(preprocess_method)

            for init_method in init_methods:
                for model_name in model_names:
                    for ind_stage, stage in enumerate(stages):
                        # load model
                        keras.backend.clear_session()
                        if init_method == 'random':
                            model = model_loader.load_full_model(
                                model_name,
                                random_weights=True,
                                no_cats=2,
                                weight_decay=0.001)
                        elif init_method == 'ImageNet':
                            model = model_loader.load_full_model(
                                model_name,
                                random_weights=False,
                                no_cats=2,
                                weight_decay=0.001)

                        # strip layers
                        if model_name == 'ResNet50':
                            end_layer = ResNet50_layer_names[ind_stage]
                        elif model_name == 'VGG19':
                            end_layer = VGG19_layer_names[ind_stage]

                        model = keras.models.Model(
                            inputs=model.input,
                            outputs=model.get_layer(end_layer).output)

                        feats = model.predict(dataset_all['data'])
                        feats = np.reshape(feats,
                                           (dataset_all['data'].shape[0], -1))

                        if sampling_method == 'pca':
                            pca = PCA(1024)
                            feats = pca.fit_transform(feats)
                        elif sampling_method == 'uniform':
                            if feats.shape[1] > 16384:
                                sample_indices = np.round(
                                    np.linspace(0, feats.shape[1] - 1, 16384))
                                feats = feats[:, sample_indices.astype(int)]

                        name = domain + '_' + preprocess_method + '_' + init_method + '_' + model_name + '_' + stage
                        out_dict[name + '_feats'] = feats
                        out_dict[name + '_labels'] = dataset_all['labels']

    savemat('feats_' + sampling_method + '.mat', out_dict, do_compression=True)
Example #15
def main():
    # initialize log file
    file_log = open(os.path.join(log_path, 'log.txt'), 'w')

    file_log.write(domain + ' - ' + str(ind_fold) + '\n')
    file_log.write(model_name + '\n')
    file_log.write(init_method + '\n')
    file_log.write(preprocess_method + '\n')

    # read dataset
    dh = DatasetHandler(domain)
    dataset_fold = dh.get_fold(ind_fold, preprocess_method)

    if init_method == 'random':
        model = model_loader.load_full_model(model_name,
                                             random_weights=True,
                                             no_cats=2,
                                             weight_decay=0.001)
    elif init_method == 'ImageNet':
        model = model_loader.load_full_model(model_name,
                                             random_weights=False,
                                             no_cats=2,
                                             weight_decay=0.001)

    # train the last layer
    accs = []
    false_images = []

    model = model_loader.set_trainable_layers(model, model_name, 'final')
    learning_rate = 0.1
    for ind_iter in range(5):
        model = trainer.train_model(model, dataset_fold, learning_rate)
        learning_rate /= 2
    acc, false_image = trainer.test_model(model, dataset_fold)
    accs.append(acc)
    false_images.append(false_image)
    model.save_weights(os.path.join(log_path, 'final_layer.h5'))

    # fine-tune stage 5 and onwards
    model = model_loader.set_trainable_layers(model, model_name, '5')
    learning_rate = 0.01
    for ind_iter in range(5):
        model = trainer.train_model(model, dataset_fold, learning_rate)
        learning_rate /= 2
    acc, false_image = trainer.test_model(model, dataset_fold)
    accs.append(acc)
    false_images.append(false_image)
    model.save_weights(os.path.join(log_path, 'stage_5.h5'))

    # record accuracies
    file_log.write('Final layer\n')
    file_log.write(str(accs[0]) + '\n')
    file_log.write('Stage 5\n')
    file_log.write(str(accs[1]) + '\n')

    # record falsely classified images
    file_log.write('Final layer\n')
    for fi in false_images[0]:
        file_log.write(fi + '\n')
    file_log.write('Stage 5\n')
    for fi in false_images[1]:
        file_log.write(fi + '\n')

    file_log.close()