Exemplo n.º 1
0
    def __init__(self,
                 model,
                 encoder,
                 class_names,
                 tokenize_on_space,
                 use_unk=False,
                 unk='<unk>',
                 exp_th=0.95,
                 seed=1):
        """
        Wire a model and its encoder to an Anchors text explainer.

        :param model: the model to explain; only stored on the instance here
        :param encoder: the encoder that goes with the model; only stored here
        :param class_names: class names handed to ``anchor_text.AnchorText``
        :param tokenize_on_space: If truthy, the spaCy pipeline gets a
            ``Tokenizer`` built from the vocabulary alone; otherwise the
            pipeline's default tokenizer is (re)created
        :param use_unk: If False, replaces words by similar words instead of UNKs
        :param unk: the symbol to use for unknown words
        :param exp_th: stored as ``self.threshold``
        :param seed: value passed to ``np.random.seed`` (global NumPy state)
        """
        # Plain attributes first; none of them depends on the others.
        self.threshold = exp_th
        self.unk = unk
        self.use_unk = use_unk
        self.encoder = encoder
        self.model = model

        # need to install this spacy module separately to enable word similarity
        pipeline = spacy.load("en_core_web_lg")
        if tokenize_on_space:
            chosen_tokenizer = Tokenizer(pipeline.vocab)
        else:
            chosen_tokenizer = pipeline.Defaults.create_tokenizer(pipeline)
        pipeline.tokenizer = chosen_tokenizer
        self.nlp = pipeline

        # Seed NumPy's global generator before the explainer is constructed.
        np.random.seed(seed)
        explainer_options = {
            'use_unk_distribution': self.use_unk,
            'mask_string': self.unk,
        }
        self.explainer = anchor_text.AnchorText(self.nlp, class_names, **explainer_options)
Exemplo n.º 2
0
def main():
    """Fine-tune BERT for a given task with given parameters.

    Parses the command line, validates the arguments and then runs every
    requested mode in this fixed order: repeated train+eval
    (``--do_train_eval``), training (``--do_train``), evaluation
    (``--do_eval``), cross-validation (``--do_cross_val``) and visualization
    (``--do_visualization``).

    Raises:
        ValueError: If no mode flag is given, the task name is not a key of
            ``processors``, ``--gradient_accumulation_steps`` is below 1, or
            the output directory is non-empty and ``--overwrite_output_dir``
            was not passed.
    """

    # Define all parameters, using argparse/Command Line Interface.
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    def add_args():
        """Add all possible options and defaults to the parser."""
        # Hyperparameters of BERT
        # Parameters often changed
        parser.add_argument("--bert_model",
                            default="bert-base-uncased",
                            type=str,
                            help="Bert pre-trained model selected in the list: bert-base-uncased, "
                                 "bert-large-uncased, bert-base-cased, bert-large-cased, "
                                 "bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese.")
        parser.add_argument("--max_seq_length",
                            default=128,
                            type=int,
                            help="The maximum total input sequence length after WordPiece tokenization. \n"
                                 "Sequences longer than this will be truncated, and sequences shorter \n"
                                 "than this will be padded.")
        parser.add_argument("--train_batch_size",
                            default=16,
                            type=int,
                            help="Total batch size for training.")
        parser.add_argument("--learning_rate",
                            default=2e-5,
                            type=float,
                            help="The initial learning rate for Adam.")
        parser.add_argument("--num_train_epochs",
                            default=3.0,
                            type=float,
                            help="Total number of training epochs to perform.")
        parser.add_argument("--do_lower_case",
                            action='store_true',
                            help="Set this flag if you are using an uncased model.")
        # Parameters usually unchanged
        parser.add_argument("--warmup_proportion",
                            default=0.1,
                            type=float,
                            help="Proportion of training to perform linear learning rate warmup for. "
                                 "E.g., 0.1 = 10%% of training.")
        parser.add_argument("--eval_batch_size",
                            default=8,
                            type=int,
                            help="Total batch size for eval.")
        # Parameters of the task
        parser.add_argument("--task_name",
                            default="node",
                            type=str,
                            help="The name of the task to train. One of node, political-as, "
                                 "political-ru, political-asu, agreement, node-ext, political-as-topics,"
                                 "political-ru-topics, political-asu-topics, agreement-topics")
        parser.add_argument("--input_to_use",
                            type=str,
                            default="both",
                            help="Which input to use. One of both, org, response, response-org.")
        # Parameters for reproduction
        parser.add_argument('--seed',
                            type=int,
                            default=42,
                            help="random seed for initialization")
        parser.add_argument('--gradient_accumulation_steps',
                            type=int,
                            default=1,
                            help="Number of updates steps to accumulate before performing a backward/update pass.")
        # Parameters for where to save/load data
        parser.add_argument("--data_dir",
                            default="../data",
                            type=str,
                            help="The input data dir. Should contain the .tsv file (or other data files) for the task.")
        parser.add_argument("--output_dir",
                            default="run",
                            type=str,
                            help="The output directory where the model predictions and checkpoints will be written.")
        parser.add_argument("--cache_dir",
                            default="",
                            type=str,
                            help="Where do you want to store the pre-trained models downloaded from s3")
        parser.add_argument('--overwrite_output_dir',
                            action='store_true',
                            help="Overwrite the content of the output directory")
        # Parameters to decide what to do (train, test, crossval, save the model)
        parser.add_argument("--do_train",
                            action='store_true',
                            help="Whether to run training.")
        parser.add_argument("--do_eval",
                            action='store_true',
                            help="Whether to run eval on the dev set.")
        parser.add_argument("--do_train_eval",
                            action='store_true',
                            help="Whether to run training and eval.")
        parser.add_argument('--n_times',
                            type=int,
                            default=10,
                            help="Number of restarts for every parameter setting in train&eval mode")
        parser.add_argument("--do_cross_val",
                            action='store_true',
                            help="Whether to run cross-validation.")
        parser.add_argument("--do_save",
                            action='store_true',
                            help="Whether to save the resulting model.")
        parser.add_argument("--do_visualization",
                            action='store_true',
                            help="Whether to run visualization.")
        # Additional parameters
        parser.add_argument("--no_cuda",
                            action='store_true',
                            help="Whether not to use CUDA when available")
        parser.add_argument('--log_level',
                            type=str,
                            default="info",
                            help="Verbosity of logging output. One of info or warn.")

    # Add all parameters to the parser and parse them.
    add_args()
    args = parser.parse_args()

    # Set up the device and the task name given the CLI arguments.
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
    args.device = device
    task_name = args.task_name.lower()

    # Prepare the logging.
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.log_level == "info" else logging.WARN)
    logger.info("device: {} n_gpu: {}".format(
        device, n_gpu))

    # Check the arguments and fail if the arguments are invalid. All checks run
    # before anything is looked up or created, so an invalid call leaves no
    # output directory behind and reports a ValueError instead of a KeyError.
    if not args.do_train and not args.do_eval and not args.do_cross_val and not args.do_visualization \
            and not args.do_train_eval:
        raise ValueError("At least one of `do_train`, `do_eval` `do_cross_val` "
                         "or `do_visualization` or 'do_train_eval` must be True.")
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty. "
                         "Use the --overwrite_output_dir option.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    # The task is known to exist at this point, so the lookup cannot fail.
    processor = processors[task_name](args.input_to_use)
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Training state shared between the nested helpers below.
    global_step = 0
    tr_loss = 0
    tb_writer = SummaryWriter()

    # Calculate the train_batch_size if gradient accumulation is used
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Set all seeds for reproducibility
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    def get_features_examples(mode):
        """Returns the features and examples of train or test mode.

        For ``"train"`` and ``"dev"`` a ``(features, examples, dataframe)``
        triple is returned. For ``"cross_val"`` six parallel lists are
        returned (train features/examples/dataframes followed by the test
        ones), one entry per split. Any other mode raises ``ValueError``.
        """

        def convert(split, modus, exs):
            """Converts the examples or load them from cache."""
            cache_dir = os.path.join(args.data_dir, 'cache')
            cached_features_file = os.path.join(cache_dir, '{0}_{1}_{2}_{3}_{4}_{5}'.format(
                modus,
                list(filter(None, args.bert_model.split('/'))).pop(),
                str(args.max_seq_length),
                str(task_name), str(args.input_to_use), split))
            # Try to load the cached features.
            # NOTE(review): pickle executes code on load; the cache directory
            # must only ever contain files written by this script.
            try:
                with open(cached_features_file, "rb") as reader:
                    fs = pickle.load(reader)
            # Creates and cache the features.
            except FileNotFoundError:
                os.makedirs(cache_dir, exist_ok=True)
                fs = convert_examples_to_features(
                    exs, label_list, args.max_seq_length, tokenizer)
                logger.info('Saving {0} features into cached file {1}'.format(mode, cached_features_file))
                with open(cached_features_file, "wb") as writer:
                    pickle.dump(fs, writer)

            return fs

        # Return the features, examples and dataframes depending on the mode.
        if mode == "train":
            train_ex, df = processor.get_train_examples(args.data_dir)
            return convert("X", mode, train_ex), train_ex, df
        elif mode == "dev":
            dev_ex, df = processor.get_dev_examples(args.data_dir)
            return convert("X", mode, dev_ex), dev_ex, df
        elif mode == "cross_val":
            data = processor.get_splits(args.data_dir)
            train_f_list, train_e_list, train_df_list, test_f_list, test_e_list, test_df_list = ([] for _ in range(6))
            for i, (train_ex, train_df, test_ex, test_df) in enumerate(data):
                train_e_list.append(train_ex)
                train_df_list.append(train_df)
                test_e_list.append(test_ex)
                test_df_list.append(test_df)
                # Create features from the examples
                train_f_list.append(convert(i, "train", train_ex))
                test_f_list.append(convert(i, "dev", test_ex))
            return train_f_list, train_e_list, train_df_list, test_f_list, test_e_list, test_df_list
        else:
            raise ValueError("Invalid feature mode.")

    def create_tensor_dataset(exfeatures):
        """Creates a TensorDataset out of the features.

        The tensors are ordered input ids, input mask, segment ids, label ids.
        """
        all_input_ids = torch.tensor([f.input_ids for f in exfeatures], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in exfeatures], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in exfeatures], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in exfeatures], dtype=torch.long)
        return TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

    def do_training(train_fs, train_exs):
        """Runs BERT fine-tuning on the current ``model``.

        Updates the enclosing ``global_step`` (one per optimizer step) and
        ``tr_loss`` (summed loss of the most recent epoch) so that
        ``do_eval`` can report them.
        """
        # Both counters live in main(); without declaring tr_loss nonlocal the
        # assignments below would only create a local that do_eval never sees.
        nonlocal global_step, tr_loss

        # Create the batched training data out of the features.
        train_data = create_tensor_dataset(train_fs)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        # Calculate the number of optimization steps.
        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        # Prepare optimizer: no weight decay for biases and LayerNorm parameters.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

        # Log some information about the training.
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_exs))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        loss_fct = CrossEntropyLoss()

        # Set the model to training mode and train for X epochs.
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            # Iterate over all batches.
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # Get the Logits and calculate the loss.
                logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
                loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

                # Scale the loss in gradient accumulation mode.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                # Calculate the gradients.
                loss.backward()
                tr_loss += loss.item()
                # Update the weights every gradient_accumulation_steps steps.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                    tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', loss.item(), global_step)

    def do_save():
        """Saves the current model, tokenizer and arguments."""
        # Unwrap the model if it carries a ``module`` attribute (wrapped model).
        model_to_save = model.module if hasattr(model, 'module') else model
        # Using the predefined names, we can load using `from_pretrained`.
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        # Save the trained model, configuration and tokenizer
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Save the training arguments together with the trained model.
        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
        torch.save(args, output_args_file)

    def do_eval(eval_features, eval_examples):
        """Do evaluation on the current model.

        Returns:
            A ``(result, preds)`` pair: ``result`` is the metrics dict from
            ``compute_metrics`` extended with the losses, the global step and
            the run settings (keys starting with ``_``); ``preds`` holds the
            arg-max class index for every evaluated example.
        """

        # Log some information.
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        # Get the eval data and create a sequential dataloader.
        eval_data = create_tensor_dataset(eval_features)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # Set the model to eval mode (disable dropout)
        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        # Collect per-batch arrays and join them once at the end instead of
        # re-copying the growing array on every batch.
        logits_batches = []
        label_batches = []
        loss_fct = CrossEntropyLoss()

        # Iterate over the evaluation data.
        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            # Forward pass and loss with deactivated autograd engine.
            with torch.no_grad():
                logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            logits_batches.append(logits.detach().cpu().numpy())
            label_batches.append(label_ids.detach().cpu().numpy())

        # Calculate the mean loss and get all predictions.
        eval_loss = eval_loss / nb_eval_steps
        # Training loss is only reported in --do_train mode and only once at
        # least one optimizer step has happened (avoids a division by zero).
        loss = tr_loss / global_step if args.do_train and global_step > 0 else None
        preds = np.argmax(np.concatenate(logits_batches, axis=0), axis=1)
        out_label_ids = np.concatenate(label_batches, axis=0)
        # Compute the metrics for the given task
        result = compute_metrics(task_name, preds, out_label_ids)

        # Save additional information in the result dict.
        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss
        # Save all settings for external evaluation
        result['_task'] = task_name
        result['_input_mode'] = args.input_to_use
        result['_learning_rate'] = args.learning_rate
        result['_bert-model'] = args.bert_model
        result['_batch_size'] = args.train_batch_size
        result['_warmup'] = args.warmup_proportion
        result['_num_epochs'] = args.num_train_epochs
        result['_seq_len'] = args.max_seq_length
        result['_seed'] = args.seed
        result['_gradient_acc'] = args.gradient_accumulation_steps

        return result, preds

    def save_results(result_list, pred_list):
        """Saves the results and the predictions.

        Appends the non-underscore keys of every result dict to
        ``eval_results.txt`` inside the output directory, and appends the full
        results and the predictions to ``eval_results.tsv`` / ``eval_preds.csv``
        in the parent of the output directory.
        """
        # Save the results in a text file.
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "a") as writer:
            logger.info("***** Eval results *****")
            for i, result_dict in enumerate(result_list):
                logger.info("Run %i", i)
                writer.write("Run %i\n" % i)
                for key in sorted(result_dict.keys()):
                    if not key.startswith("_"):
                        logger.info("  %s = %s", key, str(result_dict[key]))
                        writer.write("%s = %s\n" % (key, str(result_dict[key])))
        # Save the results and predictions in csv and tsv files.
        output_csv_file = os.path.join(args.output_dir, "../eval_results.tsv")
        output_preds_file = os.path.join(args.output_dir, "../eval_preds.csv")
        df_res = pd.DataFrame(result_list)
        df_preds = pd.DataFrame(pred_list)
        df_preds['run'] = '{0}_{1}_{2}_{3}'.format(
            args.bert_model, args.num_train_epochs, args.train_batch_size, args.learning_rate)
        # If the files do not exist, create them with headers.
        if not os.path.exists(output_csv_file):
            df_res.to_csv(output_csv_file, encoding='utf-8', sep='\t', index=False)
            df_preds.to_csv(output_preds_file, encoding='utf-8', index=False)
        # If the files already exist, just append to them without headers.
        else:
            df_res.to_csv(output_csv_file, mode='a', encoding='utf-8', sep='\t', index=False, header=False)
            df_preds.to_csv(output_preds_file, mode='a', encoding='utf-8', index=False, header=False)

    # Load the tokenizer and the model.
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
    model.to(device)

    # Train and test.
    if args.do_train_eval:
        # Get the train and test features only once.
        train_features, train_examples, _ = get_features_examples("train")
        test_features, test_examples, _ = get_features_examples("dev")

        # Repeat N times.
        for i in range(args.n_times):
            # Train.
            do_training(train_features, train_examples)
            # Eval.
            result, preds = do_eval(test_features, test_examples)
            # Save the results.
            save_results([result], [preds])
            # Reset and new seeds.
            if i+1 < args.n_times:
                args.seed += 1
                random.seed(args.seed)
                np.random.seed(args.seed)
                torch.manual_seed(args.seed)
                if n_gpu > 0:
                    torch.cuda.manual_seed_all(args.seed)
                # Reset model.
                model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
                model.to(device)

    # Training
    if args.do_train:
        # Get the train features.
        features, examples, df = get_features_examples("train")
        # Train.
        do_training(features, examples)
        # Save the model if wanted.
        if args.do_save:
            do_save()

    # Evaluation.
    if args.do_eval:
        # Get the dev features.
        features, examples, df = get_features_examples("dev")
        # Evaluate.
        result, preds = do_eval(features, examples)
        # Save the results.
        save_results([result], [preds])

    # CrossVal.
    if args.do_cross_val:
        # Get the data for all splits
        train_f_l, train_e_l, train_df_l, test_f_l, test_e_l, test_df_l = get_features_examples("cross_val")
        # Iterate over all splits
        for train_features, train_examples, test_features, test_examples in zip(
                train_f_l, train_e_l, test_f_l, test_e_l):
            # Reset model.
            model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
            model.to(device)
            # Train.
            do_training(train_features, train_examples)
            # Eval.
            result, preds = do_eval(test_features, test_examples)
            # Save results.
            save_results([result], [preds])

    # Visualization.
    if args.do_visualization:
        # Additional imports needed for the visualizations.
        import spacy
        from skorch import NeuralNetClassifier
        from sklearn.pipeline import make_pipeline
        from run_classifier_dataset_utils import InputExample
        from anchor import anchor_text
        from lime.lime_text import LimeTextExplainer

        # Example sentences.
        raw_text_1 = "But Mr. Nixon did n't say a word that was ever publicly recorded . Even more incredible , " \
                     "he did n't say a word when the Communists took power in Cuba - not 4 miles off their shores , " \
                     "but only 90 miles off our shores . Mr. Nixon saw what was happening in Cuba ."
        raw_text_2 = "Cordoba House is no act of tolerance, but of excess/arrogance. Building this structure on the " \
                     "edge of the battlefield created by radical Islamists is not a celebration of " \
                     "religious pluralism and mutual tolerance; it is a political statement of shocking arrogance " \
                     "and hypocrisy."
        raw_text_3 = "Are not right no does he alcohol child china play"
        raw_text_list = [raw_text_1, raw_text_2, raw_text_3]

        class BertConverter:
            """Pipeline-Class to convert text to the input format of BERT."""
            def transform(self, X, y=None, **fit_params):
                """Transforms a list of strings to a list of BERT inputs."""
                # The label is a placeholder; only the inputs are used downstream.
                exs = [InputExample(guid=None, text_a=text, text_b=None, label="attack") for text in X]
                visu_features = convert_examples_to_features(exs, label_list, args.max_seq_length, tokenizer)
                all_input_ids = torch.tensor([f.input_ids for f in visu_features], dtype=torch.long)
                all_input_mask = torch.tensor([f.input_mask for f in visu_features], dtype=torch.long)
                all_segment_ids = torch.tensor([f.segment_ids for f in visu_features], dtype=torch.long)
                # Order matches the positional call ``self.model(*X)`` in MyBERT.
                return [all_input_ids, all_segment_ids, all_input_mask]

            def fit(self, X, y=None, **fit_params):
                """No-op fit so the converter can be used as a pipeline step."""
                return self

        class MyBERT(torch.nn.Module):
            """Class to wrap the current BERT model."""
            def __init__(self):
                super(MyBERT, self).__init__()
                self.model = model

            def forward(self, X):
                """Apply a softmax function to the output of the BERT model."""
                return torch.nn.functional.softmax(self.model(*X), dim=1)

        # Creates a NeuralNetClassifier.
        if device == torch.device('cuda'):
            net = NeuralNetClassifier(MyBERT, device='cuda', max_epochs=0, lr=0.0, train_split=None)
        else:
            net = NeuralNetClassifier(MyBERT, max_epochs=0, lr=0.0, train_split=None)

        # Set up the pipeline.
        c = make_pipeline(BertConverter(), net)
        # To initialize the pipeline (does not train, because epochs=0).
        c.fit(raw_text_list, y=torch.zeros(len(raw_text_list), dtype=torch.long))

        # Print the predictions and probabilities for the example texts.
        print(c.predict_proba(raw_text_list))

        # Creates the LimeTextExplainer.
        # bow=False: explanations are per word position rather than per word,
        # so occurrences of the same word are perturbed independently.
        explainer = LimeTextExplainer(class_names=processor.get_labels(), bow=False, mask_string="[UNK]")

        # Explain the first example in the list and save the result using LIME.
        idx = 0
        exp = explainer.explain_instance(raw_text_list[idx], c.predict_proba)
        print('Document id: %d' % idx)
        print('Probability(support) =', c.predict_proba([raw_text_list[idx]])[0, 1])
        print('True class: %s' % "None")
        print(exp.as_list())
        exp.save_to_file(os.path.join(args.output_dir, "lime.html"))

        # Explain the first example using the ANCHOR explainer and save the result.
        nlp = spacy.load("en_core_web_sm")
        explainer2 = anchor_text.AnchorText(nlp, processor.get_labels(), use_unk_distribution=True)
        exp2 = explainer2.explain_instance(raw_text_list[idx], c.predict, threshold=0.95, use_proba=True)
        # Predict once and reuse the index for both class names.
        predicted_idx = c.predict([raw_text_list[idx]])[0]
        pred = explainer2.class_names[predicted_idx]
        # NOTE(review): ``1 - index`` only names the other class for a
        # two-class task - confirm before using with more labels.
        alternative = explainer2.class_names[1 - predicted_idx]
        print('Anchor: %s' % (' AND '.join(exp2.names())))
        print('Precision: %.2f\n' % exp2.precision())
        print('Examples where anchor applies and model predicts %s:\n' % pred)
        print('\n'.join([x[0] for x in exp2.examples(only_same_prediction=True)]))
        print('Examples where anchor applies and model predicts %s:\n' % alternative)
        print('\n'.join([x[0] for x in exp2.examples(only_different_prediction=True)]))
        exp2.save_to_file(os.path.join(args.output_dir, "anchor.html"))

    # Flush and release the summary writer once all modes have finished.
    tb_writer.close()
Exemplo n.º 3
0
# In[91]:


# this is the prediction function required by Anchors!

#@timeit
def predict_text(text):
    """Return the model's predictions for *text*.

    ``text`` is handed to ``vectorizer.transform`` as-is and the resulting
    features are passed to ``model.predict``.
    """
    features = vectorizer.transform(text)
    predictions = model.predict(features)
    return predictions


# In[57]:


# Build the Anchors text explainer for the two classes "negative"/"positive".
# use_unk_distribution=False is passed explicitly.
explanator = anchor_text.AnchorText(nlp, ["negative", "positive"], use_unk_distribution=False)


# In[55]:


# Smoke-test the prediction function on a one-element list of texts.
predict_text(["Good film"])


# In[85]:


# Keep the first 30 entries of `train` as the samples to explain.
explain_sample = train[:30]


# In[25]:
Exemplo n.º 4
0
# Build the deepmatcher model and restore its state from 'da_dm.pth'.
# The training call below is commented out, so no training happens here.
model = dm.MatchingModel(attr_summarizer='hybrid')
model.load_state('da_dm.pth')
#model.run_train(trainLab, validationLab, best_save_path='da_dm.pth', epochs=15)

# evaluate deepmatcher on test data
# NOTE(review): this name shadows the builtin `eval` for the rest of the module.
eval = model.run_eval(testLab)

# transform test data to feed it to anchors
test_df = pd.read_csv(datadir + '/merged_test.csv')
pairs_str_test = pairs_to_string(test_df,'ltable_','rtable_')

# create anchors text explainer instance
# Index 0 is "non-matching", index 1 is "matching".
class_names = ["non-matching","matching"]
nlp = spacy.load('en_core_web_lg')
explainer = anchor_text.AnchorText(nlp, class_names, mask_string='', use_unk_distribution=True, use_bert=False)

verbose = False
# Per-class collections, keyed by predicted class index; each starts with one
# empty-string entry.
e_values = {0: [''], 1: ['']}
# The loop below stops once both lists in e_values hold at least this many
# entries (presumably 50 collected values plus the initial '' - TODO confirm).
threshold = 51
print(f'using {len(pairs_str_test)} test samples')
for t_i in pairs_str_test:
    try:
        if len(e_values[0]) >= threshold and len(e_values[1]) >= threshold:
            print('finished!')
            break
        # perform prediction on test instance
        fn_result = predict_fn([t_i])
        result_key = fn_result[0]
        if len(e_values[result_key]) < threshold:
            pred = explainer.class_names[result_key]
Exemplo n.º 5
0
# Read the whole input document into memory as text.
with open(in_file, 'r') as f:
    file_data = f.read()

# In[11]:

# Parse the same file again for the summarizer, tokenized for LANGUAGE.
parser = PlaintextParser.from_file(in_file, Tokenizer(LANGUAGE))

# TextRank summarizer using a stemmer for LANGUAGE.
summarizer = TextRankSummarizer(Stemmer(LANGUAGE))
# NOTE(review): stop words are hard-coded to 'slovak' while the tokenizer and
# stemmer use LANGUAGE - confirm the two are meant to agree.
summarizer.stop_words = get_stop_words('slovak')

helper = _summarizer.AbstractSummarizer()

# In[36]:

# Anchors text explainer for the two classes "negative"/"positive".
explanator = anchor_text.AnchorText(nlp, ['negative', 'positive'],
                                    use_unk_distribution=True)

# In[13]:

# define a decorator to log execution time
# inspired by https://medium.com/pythonhive/python-decorator-to-measure-the-execution-time-of-methods-fa04cb6bb36d


def timeit(method):
    def timed(*args, **kw):
        timed.calls += 1
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        timed.time_taken += (te - ts) * 1000
        return result
Exemplo n.º 6
0
from anchor import anchor_text
from anecdotes_utils import anecdotes_predict_anchor, anecdotes_labels, get_merged_instance, anecdotes_exp_dir
import spacy
import time

# Index of the anecdote to explain.
instance_idx = 0
nlp = spacy.load('en_core_web_lg')
#nlp = spacy.load('en_core_web_sm') #FOR BERT but throws error when using
# Anchors text explainer over the anecdote labels, with use_bert=True.
explainer = anchor_text.AnchorText(nlp,
                                   anecdotes_labels,
                                   use_unk_distribution=False,
                                   use_bert=True)

# BERT limits to 512 tokens, anchor implementation does not take this into account
# so anecdotes are truncated before being explained
start_time = time.time()
exp = explainer.explain_instance(get_merged_instance(instance_idx,
                                                     truncate=True),
                                 anecdotes_predict_anchor,
                                 threshold=0.9)
end_time = time.time()
exp.save_to_file(anecdotes_exp_dir + "anchor0.html")
# print() does not apply %-formatting to extra arguments; format explicitly so
# the elapsed seconds replace the %f placeholder.
print('done in :%f' % (end_time - start_time))
            input_tensor = torch.stack(
                [tensor_txt_inputs, tensor_attention_masks], dim=1)
            output = net(input_tensor)
            #predicted = torch.argmax(output.data, 1)
            res[i] = output.numpy()[0]
            i = ++i
    print(res)
    return res


# Load the spaCy model from a local directory.
# NOTE(review): this is a machine-specific absolute path; the script only
# runs where this exact directory exists.
nlp = spacy.load(
    '/home/julien/Documents/stage/anchor/datasets/en_core_web_lg-2.2.5/en_core_web_lg/en_core_web_lg-2.2.5'
)
#nlp = spacy.load('/udd/jdelauna/Documents/anchor/datasets/fr_core_news_sm-2.2.5/fr_core_news_sm/fr_core_news_sm-2.2.5')

# Anchor explainer for a two-class problem; the class list is indexed by the
# predicted label (0 -> 'true news', 1 -> 'fake news').
explainer = anchor_text.AnchorText(nlp, ['true news', 'fake news'],
                                   use_unk_distribution=True)

# Seed NumPy's global random generator so repeated runs draw the same numbers.
np.random.seed(1)
#text = 'This is a good book . I will learn a lot in this book . Maybe one day I will be an expert in such a domain .'
#text = "DUBAI, April 19 (Reuters) - Singapore-based Lloyd’s of London insurer, Global Specialty Brokers (GSB), said on Monday.It had suspended flights to Hong Kong from Qatar Airways."
#text = "DUBAI , April 19 (Reuters) - Singapore - based Lloyd of London insurer , Global Specialty Brokers (GSB) , said on Monday . China destroys France ."

#text = "WARSAW (Reuters) - Three new cases of the new coronavirus have been diagnosed in Poland - one man in a critical condition, and two suspected cases - the Health Ministry said on Friday.In January, a 35-year-old Iraqi man died in Poland after suffering severe respiratory infection, possibly caused by the novel coronavirus, also known as NCoV.Authorities are still trying to determine the extent of any relationship between the man, who was admitted to hospital with respiratory illness and died last month, and other possible victims in the country.As in other parts of the world, some foreign universities and medical schools have cancelled conferences or seminars due to NCoV cases in different countries, as has Poland’s health ministry.There have been no reported cases of the novel coronavirus in Poland.NCoV is a virus from the same family as the SARS virus which killed around 800 people worldwide in 2002 and 2003. Scientists believe it may have circulated before the world had developed the ability to detect it through human-to-human transmission."
text = "DUBAI, April 19 (Reuters) - Singapore-based Lloyd’s of London insurer, Global Specialty Brokers (GSB), said on Monday it had suspended flights to Hong Kong from Qatar Airways after low demand since a state crackdown on fundraising by activist investors.“Since the implementation of Hong Kong regulations, low demand for our services from Qatar Airways has led us to suspend its operations,” GSB said in a statement.Hong Kong has tightened regulations on shareholder activism, including curbs on companies bringing in external directors, and launched a review of such matters after a wave of activist campaigns last year.The rules also require companies to publish a list of companies that have been conducting a financial or administrative audit for up to three years, citing concerns over the preparation of the financial statements of such companies.GSB offered alternative services to Qatar Airways, such as trading claims, claims management and reinsurance, through Lloyd’s of London in Hong Kong. It did not reveal how many passengers it had earned from services for Qatar Airways.GSB had offered “well over” 20 flights per month to Hong Kong from Doha since 2015, but with limited demand, the insurer said.The Qatar Airways spokesman said its policy is to not comment on media reports."
# The sample is passed on as a regular str. Round-tripping it through
# str(text.encode('utf-8')) would, on Python 3, produce the repr "b'...'"
# (literal b-prefix, quotes and \x escapes) and corrupt the classifier input.

#text = "We are going to extend this new method and prevent China from attacking France ."
#text = "This is a good book ."

# Run the classifier once and derive both the predicted class and the
# opposite class of this two-class problem from the same result.
predicted_class = predict_antoine(text)
pred = explainer.class_names[predicted_class]
alternative = explainer.class_names[1 - predicted_class]
Exemplo n.º 8
0
def main(TRAIN=False,
         TUNING=False,
         ANCHOR=False,
         LIME=True,
         STATISTICS=False,
         PROTODASH=False):
    """Train, evaluate and explain a stanza-level emotion classifier.

    Reads the English, German and Chinese poem files, labels every stanza
    with its most frequently annotated emotion, embeds the stanzas with a
    pretrained multilingual sentence encoder and evaluates a small dense
    network on a shuffled 75% / 12.5% / 12.5% train/dev/test split.
    The keyword flags switch the optional stages on.

    Args:
        TRAIN: Fit the final network on the training split before evaluating.
            NOTE(review): with TRAIN=False an empty, uncompiled ``Sequential``
            is evaluated (loading a saved model is commented out) -- confirm
            this is intended.
        TUNING: Run the grid search over learning rate, epochs and hidden
            layer size and print the best configuration found on the dev set.
        ANCHOR: Explain the example stanzas with Anchor.
        LIME: Explain the example stanzas with LIME.
        STATISTICS: Plot the label distribution per language.
        PROTODASH: Explain the example stanzas with ProtoDash prototypes.
    """
    # read poems using simplereader
    poems_english = readPoems('tsv/english.tsv')
    poems_german = readPoems('tsv/emotion.german.tsv')
    poems_chinese = readPoems('tsv/chinese.tsv')
    print(len(poems_english))
    print(len(poems_german))
    print(len(poems_chinese))

    # set up label dictionary; both spellings ("A / B" and "A/B") of the
    # combined labels map to the same class index
    label_dict = {
        'Sadness': 0,
        'Humor': 1,
        'Suspense': 2,
        'Nostalgia': 3,
        'Uneasiness': 4,
        'Annoyance': 5,
        'Awe / Sublime': 6,
        'Awe/Sublime': 6,
        'Vitality': 7,
        'Beauty / Joy': 8,
        'Beauty/Joy': 8
    }

    # class names ordered by class index
    class_names = [
        'Sadness', 'Humor', 'Suspense', 'Nostalgia', 'Uneasiness', 'Annoyance',
        'Awe / Sublime', 'Vitality', 'Beauty / Joy'
    ]
    num_classes = len(class_names)

    # array of stanzas
    stanzas = []

    # array of most prominent label for each stanza
    labels = []

    # language id of each stanza: 0 = English, 1 = German, 2 = Chinese
    lang = []

    # Walk the corpora in a fixed order so the language id is known directly
    # instead of being recovered by searching the poem lists.
    for lang_id, poems in enumerate(
            (poems_english, poems_german, poems_chinese)):
        for poem in poems:
            # the first entry of a poem is skipped; the rest are its stanzas
            for stanza in poem[1:]:
                lang.append(lang_id)
                # a stanza's text is its lines joined by single spaces; one
                # entry is appended per stanza so that stanzas, labels and
                # lang always stay aligned
                stanzas.append(" ".join(line[0] for line in stanza))

                counter = [0] * num_classes
                for line in stanza:
                    # column 1 and the optional column 2 each hold one or
                    # more labels separated by " --- "
                    annotations = line[1].split(" --- ")
                    if len(line) > 2:
                        annotations += line[2].split(" --- ")
                    for label in annotations:
                        counter[label_dict[label]] += 1
                labels.append(np.argmax(counter))

    # plot dataset statistics
    if STATISTICS is True:
        df = pd.DataFrame({
            "stanzas": stanzas,
            "labels": labels,
            "languages": lang
        })

        # exactly one tick label per class (the label dictionary contains
        # duplicate spellings and cannot be used directly)
        bar_labels = [name.replace(" ", "") for name in class_names]

        def label_counts(lang_id):
            # stanza count per class for one language; classes that do not
            # occur are filled with 0 so the three series can be stacked
            counts = df.loc[df["languages"] == lang_id,
                            "labels"].value_counts()
            return counts.reindex(range(num_classes), fill_value=0)

        ger_values = label_counts(1)
        en_values = label_counts(0)
        ch_values = label_counts(2)

        width = 0.5

        fig, ax = plt.subplots()
        plt.grid(zorder=0, alpha=0.7)
        ax.bar(bar_labels, ger_values, width, label='German')
        ax.bar(bar_labels,
               en_values,
               width,
               bottom=ger_values,
               label='English')
        ax.bar(bar_labels,
               ch_values,
               width,
               bottom=en_values + ger_values,
               label='Chinese')

        ax.set_ylabel('Number of stanzas', fontsize=18)
        ax.legend(prop={'size': 18})
        ax.tick_params(axis='both', which='major', labelsize=18)
        plt.xticks(rotation=16)

        plt.show()

    # transform labels into one hot encodings; the width is pinned to the
    # number of classes so it always matches the network's output layer
    one_hot_labels = to_categorical(labels, num_classes=num_classes)

    # analyze distribution of labels in dataset
    df = pd.DataFrame({"labels": labels})
    print(df['labels'].value_counts())

    # use pretrained multilingual model to encode sentences
    model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
    embeddings = model.encode(stanzas)

    # shuffle data and split into train, dev and test set; every sample
    # carries its original stanza index so it can be traced back
    all_data = [(embeddings[i], one_hot_labels[i], i)
                for i in range(len(embeddings))]
    random.shuffle(all_data)
    shuffled_embeddings = [emb for emb, _, _ in all_data]
    shuffled_labels = [lab for _, lab, _ in all_data]
    indices = [idx for _, _, idx in all_data]

    n_samples = len(all_data)
    train_end = int(0.75 * n_samples)
    dev_end = int(0.875 * n_samples)

    train_data = np.array(shuffled_embeddings[:train_end])
    train_labels = np.array(shuffled_labels[:train_end])
    dev_data = np.array(shuffled_embeddings[train_end:dev_end])
    dev_labels = np.array(shuffled_labels[train_end:dev_end])
    test_data = np.array(shuffled_embeddings[dev_end:])
    test_labels = np.array(shuffled_labels[dev_end:])
    # original stanza index of every test sample, in test-set order
    test_indices = indices[dev_end:]

    # Hyperparameter Tuning
    if TUNING is True:
        learning_rates = [0.001, 0.01, 0.1]
        epochs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        middle_nodes = [20, 50, 100, 150, 200]
        best_acc = 0
        best_config = None
        for lr in learning_rates:
            for epoch in epochs:
                for middle_node in middle_nodes:
                    print("Training with following hyperparameters:", lr,
                          epoch, middle_node)
                    adam = Adam(learning_rate=lr)
                    mdl = Sequential()
                    mdl.add(
                        Dense(middle_node,
                              input_dim=512,
                              kernel_initializer="uniform",
                              activation="relu"))
                    mdl.add(
                        Dense(num_classes,
                              activation="softmax",
                              kernel_initializer="uniform"))
                    mdl.compile(loss="categorical_crossentropy",
                                optimizer=adam,
                                metrics=["categorical_accuracy"])

                    mdl.fit(train_data, train_labels, epochs=epoch, verbose=1)
                    print("evaluating on dev set...")
                    (loss, accuracy) = mdl.evaluate(dev_data,
                                                    dev_labels,
                                                    verbose=1)
                    print("loss: {:.4f}, accuracy: {:.4f}%".format(
                        loss, accuracy * 100))
                    # keep the configuration with the best dev accuracy
                    if accuracy > best_acc:
                        best_acc = accuracy
                        best_config = (lr, epoch, middle_node)
        print(best_config)

    # (learning rate, epochs, hidden nodes) of the final model.
    # NOTE(review): this fixed configuration is used even when TUNING just
    # produced a different best configuration -- confirm this is intended.
    max_config = (0.01, 7, 150)
    mdl = Sequential()
    if TRAIN is True:
        # use final model
        adam = Adam(learning_rate=max_config[0])
        mdl = Sequential()
        mdl.add(
            Dense(max_config[2],
                  input_dim=512,
                  kernel_initializer="uniform",
                  activation="relu"))
        mdl.add(
            Dense(num_classes,
                  activation="softmax",
                  kernel_initializer="uniform"))
        mdl.compile(loss="categorical_crossentropy",
                    optimizer=adam,
                    metrics=["categorical_accuracy"])

        mdl.fit(train_data, train_labels, epochs=max_config[1], verbose=1)
        print("evaluating on test set...")
        (loss, accuracy) = mdl.evaluate(test_data, test_labels, verbose=1)
        print("loss={:.4f}, accuracy: {:.4f}%".format(loss, accuracy * 100))
        #print("precision={:.4f}%".format(precision * 100))
        #print("recall={:.4f}%".format(recall * 100))
        # mdl.save('emotion_classifier')

    #mdl = keras.models.load_model('emotion_classifier')
    (loss, accuracy) = mdl.evaluate(test_data, test_labels, verbose=1)

    y_pred = mdl.predict(test_data, batch_size=test_data.shape[0])

    # original stanza indices of the misclassified test samples; predicted
    # and true class are both compared as plain class indices
    wrong_classified_idx = [
        idx for j, idx in enumerate(test_indices)
        if np.argmax(y_pred[j]) != np.argmax(test_labels[j])
    ]

    print("These stanzas were wronlgy classified:")
    print(wrong_classified_idx)

    def count_in_language(stanza_ids, lang_id):
        # number of the given stanzas that belong to one language, looked up
        # in the per-stanza language list instead of fixed index boundaries
        return sum(1 for stanza_id in stanza_ids if lang[stanza_id] == lang_id)

    print("Number of wrongly classified stanzas - English: ",
          count_in_language(wrong_classified_idx, 0))
    print("Number of wrongly classified stanzas - German: ",
          count_in_language(wrong_classified_idx, 1))
    print("Number of wrongly classified stanzas - Chinese: ",
          count_in_language(wrong_classified_idx, 2))

    print("Total - English: ", count_in_language(test_indices, 0))
    print("Total - German: ", count_in_language(test_indices, 1))
    print("Total - Chinese: ", count_in_language(test_indices, 2))

    # original stanza indices that get explained by the stages below
    examples = [592, 9, 5]

    # ------------------------------ LIME ------------------------------
    # apply LIME to obtain explanations for a specific instance

    def pipeline(stanza, mdl=mdl, model=model):
        # text -> class probabilities, the classifier function given to LIME
        embedded = model.encode(stanza)
        return mdl.predict(embedded, batch_size=embedded.shape[0])

    def explain_with_lime(idx, show_prob_sum=False):
        # print label and model output of stanza idx, return its explanation
        print("True Label: ", one_hot_labels[idx])
        emb = np.array(model.encode(stanzas[idx])).reshape(1, -1)
        probs = mdl.predict(emb, batch_size=1)
        print("Predicted Probabilities: ", probs)
        if show_prob_sum:
            print(probs.sum())

        explainer = LimeTextExplainer(class_names=class_names)
        return explainer.explain_instance(stanzas[idx],
                                          pipeline,
                                          num_features=6,
                                          top_labels=2)

    def report_lime(exp, font_size=None):
        # print and plot the explanation of the two top labels
        top_labs = exp.available_labels()[:2]
        for lab in top_labs:
            print("Explanation for class {}".format(lab))
            print('\n'.join(map(str, exp.as_list(label=lab))))
        for lab in top_labs:
            exp.as_pyplot_figure(lab)
            if font_size is not None:
                plt.legend(prop={'size': font_size})
                plt.tick_params(axis='both',
                                which='major',
                                labelsize=font_size)
            plt.show()

    if LIME is True:
        # explain every example stanza
        for idx in examples:
            report_lime(explain_with_lime(idx))

        # explain one stanza once more, store the explanation on disk and
        # plot it with enlarged fonts
        idx = 5
        exp = explain_with_lime(idx, show_prob_sum=True)
        with open("explanation.pkl", "wb") as handle:
            pickle.dump(exp, handle)
        report_lime(exp, font_size=20)

    # ----------------------------- ANCHOR -----------------------------
    def predict_label(stanza):
        # Classifier function for Anchor: receives a list of texts and
        # returns one predicted class index per text.
        embedded = model.encode(stanza)
        probs = mdl.predict(embedded, batch_size=embedded.shape[0])
        return np.argmax(probs, axis=1)

    def predict_second_label(stanza, predicted_label):
        # most probable class of the first text once its predicted class has
        # been ruled out
        embedded = model.encode(stanza)
        probs = np.array(
            mdl.predict(embedded, batch_size=embedded.shape[0])[0])
        probs[predicted_label] = 0
        return [np.argmax(probs)]

    if ANCHOR is True:
        print()

        nlp = spacy.load('en_core_web_lg')
        explainer = anchor_text.AnchorText(nlp,
                                           class_names,
                                           use_unk_distribution=True)
        print("GPU's: ", get_available_gpus())

        for idx in examples:
            print()
            print("------------STANZA-", idx, "------------")
            print()
            text = stanzas[idx]
            # predict once and reuse the result below
            predicted_idx = predict_label([text])[0]
            print([predicted_idx])
            pred = explainer.class_names[predicted_idx]
            alternative = explainer.class_names[predict_second_label(
                [text], predicted_idx)[0]]
            print('Prediction: %s' % pred)
            # the true label is read from the unshuffled one-hot labels, which
            # are indexed by original stanza index like `stanzas`
            print("Stanza: ", stanzas[idx], "   True Label: ",
                  one_hot_labels[idx])
            exp = explainer.explain_instance(text,
                                             predict_label,
                                             threshold=0.95)

            print('Anchor: %s' % (' AND '.join(exp.names())))
            print('Precision: %.2f' % exp.precision())
            print()
            print('Examples where anchor applies and model predicts %s:' %
                  pred)
            print()
            print('\n'.join(
                [x[0] for x in exp.examples(only_same_prediction=True)]))
            print()
            print('Examples where anchor applies and model predicts %s:' %
                  alternative)
            print()
            print('\n'.join([
                x[0] for x in exp.examples(partial_index=0,
                                           only_different_prediction=True)
            ]))

    # ---------------------------- PROTODASH ---------------------------

    if PROTODASH is True:
        # imported lazily so aix360 is only needed for this stage
        from aix360.algorithms.protodash import ProtodashExplainer

        def predict_single_label(stanza):
            # predicted class index of a single stanza text
            embedded = model.encode(stanza).reshape(1, -1)
            probs = mdl.predict(embedded, batch_size=1)
            return [np.argmax(probs)]

        # position of every original stanza index inside the shuffled data
        position_of = {orig: pos for pos, orig in enumerate(indices)}

        num_prototypes = 5

        for idx in examples:
            explainer = ProtodashExplainer()

            print(train_data.shape)

            vector = np.asarray(
                shuffled_embeddings[position_of[idx]]).reshape(1, -1)

            # prototypes are selected from the training split
            (weights, proto_ind, _) = explainer.explain(vector,
                                                        train_data,
                                                        m=num_prototypes)

            # normalise the weights so they sum to one
            weights = np.around(weights / np.sum(weights), 2)

            # proto_ind refers to rows of train_data, i.e. positions in the
            # shuffled order; map them back to original stanza indices
            prototype_ids = [
                indices[proto_ind[i]] for i in range(num_prototypes)
            ]

            print()
            print("example: ", stanzas[idx])
            print("prototypes with weights:")
            print()
            print()
            for i, stanza_id in enumerate(prototype_ids):
                print(weights[i], stanzas[stanza_id])

            # show prediction and true label for the example and its prototypes
            for stanza_id in [idx] + prototype_ids:
                print()
                print(stanzas[stanza_id])
                print("Predicted Label: ",
                      predict_single_label(stanzas[stanza_id]))
                print("True Label: ", np.argmax(one_hot_labels[stanza_id]))