Example No. 1
    def train(self, args: ClassifierArgs):
        # get dataset
        dataset, data_loader = self.build_data_loader(args, 'train')

        # get trainer
        trainer = self.build_trainer(args, dataset, data_loader)

        best_metric = None
        for epoch in range(args.epochs):
            trainer.train_epoch(args, epoch)

            # save a periodic checkpoint for this epoch
            self.saving_model_by_epoch(args, epoch)

            # evaluate on the dev set after this epoch
            metric = self.evaluate(args, is_training=True)

            # track the best metric: initialize it on the first epoch, then
            # save the 'best' checkpoint whenever the epoch metric improves
            if best_metric is None or metric > best_metric:
                best_metric = metric
                self.save_model_to_file(
                    args.saving_dir,
                    args.build_saving_file_name(description='best'))

        # under sparse training with the incremental trick, overwrite the
        # 'best' checkpoint with the final epoch's weights before evaluating
        if args.training_type == 'sparse' and args.incremental_trick and args.saving_last_epoch:
            self.save_model_to_file(
                args.saving_dir,
                args.build_saving_file_name(description='best'))
        self.evaluate(args)
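Note that `metric > best_metric` compares Metric objects directly, so the Metric class must define rich comparison on its chosen key (Example No. 4 builds metrics with `compare_key=args.compare_key`). A minimal sketch of such a comparable metric; the field names here are assumptions, not the repo's actual implementation:

    class ComparableMetric:
        # hypothetical sketch of a metric ordered on a comparison key
        def __init__(self, scores: dict, compare_key: str = 'accuracy'):
            self.scores = scores
            self.compare_key = compare_key

        def __gt__(self, other) -> bool:
            return self.scores[self.compare_key] > other.scores[self.compare_key]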
Example No. 2
    def predict(self, args: ClassifierArgs, **kwargs):
        # self.evaluate(args, is_training=False)
        self.loading_model_from_file(
            args.saving_dir, args.build_saving_file_name(description='best'))
        self.model.eval()
        predictor = Predictor(self.model, self.data_processor, args.model_type)

        dataset, _ = self.build_data_loader(args,
                                            args.evaluation_data_type,
                                            tokenizer=False)
        assert isinstance(dataset, ListDataset)
        if args.predict_numbers == -1:
            predict_dataset = dataset.data
        else:
            predict_dataset = np.random.choice(dataset.data,
                                               size=(args.predict_numbers, ),
                                               replace=False)

        description = tqdm(predict_dataset)
        metric = RandomSmoothAccuracyMetrics()
        for data in description:
            tmp_instances = self.mask_instance_decorator(
                args, data, args.predict_ensemble)
            tmp_probs = predictor.predict_batch(tmp_instances)
            target = self.data_reader.get_label_to_idx(data.label)
            pred = predict(tmp_probs, args.alpha)
            metric(pred, target)
            description.set_description(str(metric))
        print(metric)
        logging.info(metric)
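The module-level `predict(tmp_probs, args.alpha)` call above reduces the ensemble's probabilities to a single label. A minimal sketch of one plausible aggregation, assuming a majority vote over each masked copy's argmax class (the real function presumably also uses `alpha`, e.g. for an abstention test, which this sketch omits):

    import numpy as np

    def majority_vote(tmp_probs: np.ndarray) -> int:
        # tmp_probs: (ensemble_size, num_labels) array, one row per masked copy
        votes = np.bincount(np.argmax(tmp_probs, axis=-1))
        return int(np.argmax(votes))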
Example No. 3
    def infer(self, args: ClassifierArgs) -> Dict:
        content = args.content
        assert content is not None, 'in infer mode, parameter content cannot be None!'
        content = content.strip()
        assert content != '', 'in infer mode, parameter content cannot be empty!'

        self.loading_model_from_file(
            args.saving_dir, args.build_saving_file_name(description='best'))
        self.model.eval()

        predictor = Predictor(self.model, self.data_processor, args.model_type)
        pred_probs = predictor.predict(content)
        pred_label = np.argmax(pred_probs)
        pred_label = self.data_reader.get_idx_to_label(pred_label)
        # map dataset-specific label ids ('100'/'101') onto binary labels
        if pred_label == '100':
            pred_label = '0'
        elif pred_label == '101':
            pred_label = '1'

        result_in_dict = {
            'content': content,
            'pred_label': pred_label,
            'pred_confidence': pred_probs
        }
        result_in_str = ', '.join([
            '{}: {}'.format(key, value) if not isinstance(value, list) else
            '{}: [{}]'.format(key, ', '.join(["%.4f" % val for val in value]))
            for key, value in result_in_dict.items()
        ])
        print(result_in_str)
        logging.info(result_in_str)
        return result_in_dict
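A hedged usage sketch for `infer`: assuming `classifier` is an instance of this class and `args.content` is writable, single-sentence inference looks roughly like this (the sentence and outputs are illustrative):

    args.content = 'the movie was surprisingly good'
    result = classifier.infer(args)
    # result resembles:
    # {'content': '...', 'pred_label': '1', 'pred_confidence': [...]}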
Example No. 4
    def evaluate(self, args: ClassifierArgs, is_training=False) -> Metric:
        if is_training:
            logging.info('Using current model parameters to evaluate')
            data_type = 'dev'
        else:
            self.loading_model_from_file(
                args.saving_dir,
                args.build_saving_file_name(description='best'))
            data_type = args.evaluation_data_type
        self.model.eval()

        dataset, data_loader = self.build_data_loader(args, data_type)
        epoch_iterator = tqdm(data_loader)

        metric = DATASET_TYPE.get_evaluation_metric(
            args.dataset_name, compare_key=args.compare_key)
        for step, batch in enumerate(epoch_iterator):
            assert isinstance(batch[0], torch.Tensor)
            batch = tuple(t.cuda() for t in batch)
            golds = batch[3]
            inputs = convert_batch_to_bert_input_dict(batch, args.model_type)
            logits = self.model.forward(**inputs)[0]
            losses = self.loss_function(
                logits.view(-1, self.data_reader.NUM_LABELS), golds.view(-1))
            epoch_iterator.set_description('loss: {:.4f}'.format(
                torch.mean(losses)))
            metric(losses, logits, golds)

        print(metric)
        logging.info(metric)
        return metric
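`convert_batch_to_bert_input_dict` is not shown in these examples. Since the gold labels sit at `batch[3]`, a plausible reconstruction is that the first three tensors are the standard BERT inputs; the sketch below is an assumption, not the repo's actual helper:

    def convert_batch_to_bert_input_dict(batch, model_type: str) -> dict:
        # assumed layout: batch = (input_ids, attention_mask, token_type_ids, labels)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        if model_type in ('bert', 'albert'):  # models that consume segment ids
            inputs['token_type_ids'] = batch[2]
        return inputs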
Example No. 5
    def saving_model_by_epoch(self, args: ClassifierArgs, epoch: int):
        # save a checkpoint every args.saving_step epochs
        if args.saving_step is not None and args.saving_step != 0:
            if (epoch - 1) % args.saving_step == 0:
                self.save_model_to_file(
                    args.saving_dir,
                    args.build_saving_file_name(
                        description='epoch{}'.format(epoch)))
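Because Python's `%` never returns a negative result here, epoch 0 fails the `(epoch - 1) % args.saving_step == 0` test, and checkpoints land on epochs 1, 1 + step, 1 + 2*step, ... (with the 0-based epoch counter from Example No. 1). A quick check with an illustrative step of 5:

    saved = [e for e in range(20) if (e - 1) % 5 == 0]
    # -> [1, 6, 11, 16]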
Example No. 6
    def attack(self, args: ClassifierArgs, **kwargs):
        # self.evaluate(args, is_training=False)
        self.loading_model_from_file(
            args.saving_dir, args.build_saving_file_name(description='best'))
        self.model.eval()

        # build test dataset
        dataset, _ = self.build_data_loader(args,
                                            args.evaluation_data_type,
                                            tokenizer=False)
        test_instances = dataset.data

        # build attacker
        attacker = self.build_attacker(args)

        attacker_log_path = os.path.join(args.logging_dir,
                                         args.build_logging_path())
        attacker_log_manager = AttackLogManager()
        # attacker_log_manager.enable_stdout()
        attacker_log_manager.add_output_file(
            os.path.join(attacker_log_path,
                         '{}.txt'.format(args.attack_method)))

        for i in range(args.attack_times):
            print("Attack time {}".format(i))

            choice_instances = np.random.choice(test_instances,
                                                size=(args.attack_numbers, ),
                                                replace=False)
            dataset = CustomTextAttackDataset.from_instances(
                args.dataset_name, choice_instances,
                self.data_reader.get_labels())
            results_iterable = attacker.attack_dataset(dataset)
            description = tqdm(results_iterable, total=len(choice_instances))
            result_statistics = SimplifidResult()
            for result in description:
                try:
                    attacker_log_manager.log_result(result)
                    result_statistics(result)
                    description.set_description(str(result_statistics))
                except RuntimeError as e:
                    print('error while processing attack result: {}'.format(e))

        attacker_log_manager.enable_stdout()
        attacker_log_manager.log_summary()
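One caveat in the sampling above: `np.random.choice(..., replace=False)` raises a `ValueError` whenever the requested size exceeds the population. If `args.attack_numbers` may exceed the test-set size, a guard like this sketch (an illustrative addition, not part of the original) avoids the crash:

    sample_size = min(args.attack_numbers, len(test_instances))
    choice_instances = np.random.choice(test_instances,
                                        size=(sample_size, ),
                                        replace=False)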
Example No. 7
    def build_writer(self, args: ClassifierArgs,
                     **kwargs) -> Union[SummaryWriter, None]:
        writer = None
        if args.tensorboard == 'yes':
            tensorboard_file_name = '{}-tensorboard'.format(
                args.build_logging_path())
            tensorboard_path = os.path.join(args.logging_dir,
                                            tensorboard_file_name)
            writer = SummaryWriter(tensorboard_path)
        return writer
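Downstream, the returned writer is used through the standard `SummaryWriter` API; a minimal usage sketch (the tag, value, and step are illustrative):

    # inside a method of this class
    writer = self.build_writer(args)
    if writer is not None:
        writer.add_scalar('train/loss', 0.25, global_step=100)
        writer.close()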
Example No. 8
    def augmentation(self, args: ClassifierArgs, **kwargs):
        self.loading_model_from_file(
            args.saving_dir, args.build_saving_file_name(description='best'))
        self.model.eval()

        train_instances, _ = self.build_data_loader(args,
                                                    'train',
                                                    tokenizer=False)
        train_dataset_len = len(train_instances.data)
        print('Training set: {} sentences.'.format(train_dataset_len))

        # drop instances shorter than 3 tokens
        train_instances_deleted = [
            instance for instance in train_instances.data
            if instance.length() >= 3
        ]
        dataset_to_aug = np.random.choice(train_instances_deleted,
                                          size=(int(train_dataset_len *
                                                    0.5), ),
                                          replace=False)

        dataset_to_write = np.random.choice(train_instances.data,
                                            size=(int(train_dataset_len *
                                                      0.5), ),
                                            replace=False).tolist()
        attacker = self.build_attacker(args)
        attacker_log_manager = AttackLogManager()
        dataset = CustomTextAttackDataset.from_instances(
            args.dataset_name, dataset_to_aug, self.data_reader.get_labels())
        results_iterable = attacker.attack_dataset(dataset)
        aug_instances = []
        for result, instance in tqdm(zip(results_iterable, dataset_to_aug),
                                     total=len(dataset)):
            try:
                adv_sentence = result.perturbed_text()
                aug_instances.append(
                    InputInstance.from_instance_and_perturb_sentence(
                        instance, adv_sentence))
            except Exception:
                print('an error occurred; skipping this instance')

        dataset_to_write.extend(aug_instances)
        self.data_reader.saving_instances(dataset_to_write, args.dataset_dir,
                                          'aug_{}'.format(args.attack_method))
        print('Writing {} sentences.'.format(len(dataset_to_write)))
        attacker_log_manager.enable_stdout()
        attacker_log_manager.log_summary()
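The written file therefore mixes a verbatim 50% sample of the training set with adversarial rewrites of a separately drawn 50% sample (so a sentence can appear in both halves), and failed attacks are skipped. The implied size bound, checked with an illustrative set size:

    N = 1000  # illustrative value of train_dataset_len
    max_written = int(N * 0.5) + int(N * 0.5)  # originals + successful augs
    assert max_written <= N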
Example No. 9
    @classmethod
    def run(cls, args: ClassifierArgs):
        # build logging: check the logging path and set the logging config
        args.build_logging_dir()
        args.build_logging()
        logging.info(args)

        args.build_environment()
        # check dataset and its path
        args.build_dataset_dir()

        args.build_saving_dir()
        args.build_caching_dir()

        if args.dataset_name in ['agnews', 'snli']:
            args.keep_sentiment_word = False

        classifier = cls(args)
        classifier.methods[args.mode](args)
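`classifier.methods[args.mode](args)` dispatches on a string-to-method table whose construction is not shown. A plausible sketch consistent with the modes appearing in these examples (an assumption about the real attribute, e.g. set in `__init__`):

    self.methods = {
        'train': self.train,
        'evaluate': self.evaluate,
        'predict': self.predict,
        'infer': self.infer,
        'attack': self.attack,
        'augmentation': self.augmentation,
        'certify': self.certify,
        'statistics': self.statistics,
    }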
Example No. 10
    def statistics(self, args: ClassifierArgs, **kwargs):
        # self.evaluate(args, is_training=False)
        self.loading_model_from_file(
            args.saving_dir, args.build_saving_file_name(description='best'))
        self.model.eval()
        predictor = Predictor(self.model, self.data_processor, args.model_type)

        dataset, _ = self.build_data_loader(args,
                                            args.evaluation_data_type,
                                            tokenizer=False)
        assert isinstance(dataset, ListDataset)
        if args.certify_numbers == -1:
            certify_dataset = dataset.data
        else:
            certify_dataset = np.random.choice(dataset.data,
                                               size=(args.certify_numbers, ),
                                               replace=False)

        description = tqdm(certify_dataset)
        num_labels = self.data_reader.NUM_LABELS
        metric = RandomAblationCertifyMetric()
        result_dicts = {"pix": []}
        for i in range(11):
            result_dicts[str(i)] = list()
        for data in description:
            target = self.data_reader.get_label_to_idx(data.label)
            data_length = data.length()
            keep_nums = data_length - round(
                data_length * args.sparse_mask_rate)

            tmp_instances = self.mask_instance_decorator(
                args, data, args.predict_ensemble)
            tmp_probs = predictor.predict_batch(tmp_instances)
            guess = np.argmax(
                np.bincount(np.argmax(tmp_probs, axis=-1),
                            minlength=num_labels))

            if guess != target:
                metric(np.nan, data_length)
                continue

            numbers = args.ceritfy_ensemble * 2
            tmp_instances, mask_indexes = self.mask_instance_decorator(
                args, data, numbers, return_indexes=True)
            ablation_indexes = [
                list(set(list(range(data_length))) - set(indexes))
                for indexes in mask_indexes
            ]
            tmp_probs = predictor.predict_batch(tmp_instances)
            tmp_preds = np.argmax(tmp_probs, axis=-1)
            p_i_x = np.bincount(tmp_preds,
                                minlength=num_labels)[guess] / numbers
            result_dicts["pix"].append(p_i_x)
            for i in range(1, 11):
                lambda_value = population_lambda(tmp_preds, ablation_indexes,
                                                 data_length, i, num_labels,
                                                 guess)
                result_dicts[str(i)].append(lambda_value)

        file_name = os.path.join(
            args.logging_dir, "{}-probs.txt".format(args.build_logging_path()))
        with open(file_name, 'w') as file:
            for key, value in result_dicts.items():
                file.write(key)
                file.write(":  ")
                file.write(" ".join([str(v) for v in value]))
                file.write("\n")
Example No. 11
    def certify(self, args: ClassifierArgs, **kwargs):
        # self.evaluate(args, is_training=False)
        self.loading_model_from_file(
            args.saving_dir, args.build_saving_file_name(description='best'))
        self.model.eval()
        predictor = Predictor(self.model, self.data_processor, args.model_type)

        dataset, _ = self.build_data_loader(args,
                                            args.evaluation_data_type,
                                            tokenizer=False)
        assert isinstance(dataset, ListDataset)
        if args.certify_numbers == -1:
            certify_dataset = dataset.data
        else:
            certify_dataset = np.random.choice(dataset.data,
                                               size=(args.certify_numbers, ),
                                               replace=False)

        description = tqdm(certify_dataset)
        num_labels = self.data_reader.NUM_LABELS
        metric = RandomAblationCertifyMetric()
        for data in description:
            target = self.data_reader.get_label_to_idx(data.label)
            data_length = data.length()
            keep_nums = data_length - round(
                data_length * args.sparse_mask_rate)

            tmp_instances = self.mask_instance_decorator(
                args, data, args.predict_ensemble)
            tmp_probs = predictor.predict_batch(tmp_instances)
            guess = np.argmax(
                np.bincount(np.argmax(tmp_probs, axis=-1),
                            minlength=num_labels))

            if guess != target:
                metric(np.nan, data_length)
                continue

            tmp_instances = self.mask_instance_decorator(
                args, data, args.ceritfy_ensemble)
            tmp_probs = predictor.predict_batch(tmp_instances)
            guess_counts = np.bincount(np.argmax(tmp_probs, axis=-1),
                                       minlength=num_labels)[guess]
            lower_bound, upper_bound = lc_bound(guess_counts,
                                                args.ceritfy_ensemble,
                                                args.alpha)
            if args.certify_lambda:
                # tmp_instances, mask_indexes = mask_instance(data, args.sparse_mask_rate, self.tokenizer.mask_token,nums=args.ceritfy_ensemble * 2, return_indexes=True)
                # tmp_probs = predictor.predict_batch(tmp_instances)
                # tmp_preds = np.argmax(tmp_probs, axis=-1)
                # ablation_indexes = [list(set(list(range(data_length))) - set(indexes.tolist())) for indexes in mask_indexes]
                # radius = population_radius_for_majority_by_estimating_lambda(lower_bound, data_length, keep_nums, tmp_preds, ablation_indexes, num_labels, guess, samplers = 200)
                radius = population_radius_for_majority(
                    lower_bound,
                    data_length,
                    keep_nums,
                    lambda_value=guess_counts / args.ceritfy_ensemble)
            else:
                radius = population_radius_for_majority(
                    lower_bound, data_length, keep_nums)

            metric(radius, data_length)

            result = metric.get_metric()
            description.set_description("Accu: {:.2f}%, Median: {}".format(
                result['accuracy'] * 100, result['median']))
        print(metric)
        logging.info(metric)

        # logging metric certify_radius and length
        logging.info(metric.certify_radius())
        logging.info(metric.sentence_length())
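`lc_bound` is not shown here; from its call site (`guess_counts` successes out of `args.ceritfy_ensemble` trials in, a `(lower, upper)` pair out) it behaves like an exact binomial confidence interval on the vote proportion. A sketch under that assumption, using the Clopper-Pearson interval from `statsmodels` (the repo's implementation may differ):

    from statsmodels.stats.proportion import proportion_confint

    def lc_bound(count: int, nobs: int, alpha: float):
        # exact (Clopper-Pearson) two-sided interval for count / nobs
        return proportion_confint(count, nobs, alpha=alpha, method='beta')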
Example No. 12
import logging
from args import ClassifierArgs
from classifier import Classifier

if __name__ == '__main__':
    args = ClassifierArgs()._parse_args()
    print(args)
    logging.info(args)
    Classifier.run(args)