Example #1
def main(args):
    tokenizer = BertTokenizer.from_pretrained(args.bert_model)
    train_loader, _, _ = get_squad_data_loader(tokenizer,
                                               args.train_dir,
                                               shuffle=True,
                                               args=args)
    eval_data = get_squad_data_loader(tokenizer,
                                      args.dev_dir,
                                      shuffle=False,
                                      args=args)

    args.device = torch.cuda.current_device()

    trainer = VAETrainer(args)

    loss_log1 = tqdm(total=0, bar_format='{desc}', position=2)
    loss_log2 = tqdm(total=0, bar_format='{desc}', position=3)
    eval_log = tqdm(total=0, bar_format='{desc}', position=5)
    best_eval_log = tqdm(total=0, bar_format='{desc}', position=6)

    # Load checkpoint
    if args.load_checkpoint:
        epochs = trainer.load(args.model_dir)
        best_f1, best_bleu, best_em = VAETrainer.load_measures(args.model_dir)
        print(
            f"The current best measures are: F1  = {best_f1}, BLEU = {best_bleu} and EM = {best_em}."
        )
    else:
        epochs = -1
        best_bleu, best_em, best_f1 = 0.0, 0.0, 0.0

    print("MODEL DIR: " + args.model_dir)
    mlflow_logger = init_mlflow(args, f"{args.model_dir}/mlruns")
    for epoch in trange(int(args.epochs), desc="Epoch", position=0):
        if epoch <= epochs:
            print(f"jumping epoch {epoch}...")
        else:
            for batch in tqdm(train_loader,
                              desc="Train iter",
                              leave=False,
                              position=1):
                c_ids, q_ids, a_ids, start_positions, end_positions \
                = batch_to_device(batch, args.device)
                trainer.train(c_ids, q_ids, a_ids, start_positions,
                              end_positions)

                str1 = 'Q REC : {:06.4f} A REC : {:06.4f}'
                str2 = 'ZQ KL : {:06.4f} ZA KL : {:06.4f} INFO : {:06.4f}'
                str1 = str1.format(float(trainer.loss_q_rec),
                                   float(trainer.loss_a_rec))
                str2 = str2.format(float(trainer.loss_zq_kl),
                                   float(trainer.loss_za_kl),
                                   float(trainer.loss_info))
                loss_log1.set_description_str(str1)
                loss_log2.set_description_str(str2)

            if epoch >= 0:
                f1, em, bleu, _str = eval_measures(epoch, args, trainer,
                                                   eval_data)
                eval_log.set_description_str(_str)
                result = {"epoch": epoch, "em": em, "f1": f1, "bleu": bleu}
                mlflow_logger.on_result(result)
                if em > best_em:
                    best_em = em
                if f1 > best_f1:
                    best_f1 = f1
                    trainer.save(
                        os.path.join(args.model_dir, "best_f1_model.pt"),
                        epoch, f1, bleu, em)
                if bleu > best_bleu:
                    best_bleu = bleu
                    trainer.save(
                        os.path.join(args.model_dir, "best_bleu_model.pt"),
                        epoch, f1, bleu, em)
                trainer.save(os.path.join(args.model_dir, "checkpoint.pt"),
                             epoch, f1, bleu, em)
                mlflow_logger.on_checkpoint(
                    f"{args.model_dir}/mlruns/checkpoint")
                _str = 'BEST BLEU : {:02.2f} EM : {:02.2f} F1 : {:02.2f}'
                _str = _str.format(best_bleu, best_em, best_f1)
                best_eval_log.set_description_str(_str)
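# The loop above relies on a batch_to_device helper that is not shown in this excerpt.
# A minimal sketch of what such a helper might look like, assuming each batch is a tuple
# of tensors in the order unpacked above (the name and signature are taken from the call
# site, not from the original repository):
import torch


def batch_to_device(batch, device):
    """Move every tensor in a (c_ids, q_ids, a_ids, start_positions, end_positions) batch to device."""
    return tuple(t.to(device) if isinstance(t, torch.Tensor) else t for t in batch)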
Example #2
random.seed(2019)

logger = logging.getLogger('propaganda_predict_TC')

PROP_CLASS = ['Appeal_to_Authority', 'Appeal_to_fear-prejudice', 'Bandwagon,Reductio_ad_hitlerum',
                'Black-and-White_Fallacy', 'Causal_Oversimplification', 'Doubt', 'Exaggeration,Minimisation',
                'Flag-Waving', 'Loaded_Language', 'Name_Calling,Labeling', 'Repetition', 'Slogans',
                'Thought-terminating_Cliches', 'Whataboutism,Straw_Men,Red_Herring']


PRETRAINED_MODEL = 'bert-base-uncased'
MAX_TOKEN = 128
EPOCHS = 5
BATCH_SIZE = 64 

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)
logger.info("Bert pretrained model: {0}".format(PRETRAINED_MODEL))


# load article files
def loadArticleFiles(article_dir):
    articles = {}

    article_files = glob.glob(os.path.join(article_dir, "*.txt"))
    for filename in article_files:
        with open(filename, "r", encoding="utf-8") as f:
            content = f.read()
            article_id = os.path.basename(filename).split(".")[0][7:]

            if 'uncased' in PRETRAINED_MODEL:
                content = content.lower()

            articles[article_id] = content

    return articles
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--data_dir",
        default="whole_dataset_biobert",
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .json files (or other data files) for the task."
    )

    parser.add_argument(
        "--out_dir",
        default="whole_dataset",
        type=str,
        required=True,
        help=
        "The directory where the output embeddings will be stored as a pickled dictionary"
    )

    parser.add_argument(
        "--filter_file",
        default=None,
        type=str,
        required=False,
        help=
        "The input path to file which contains the names of files which should only be considered out of the entire dataset."
    )

    parser.add_argument("--model_path",
                        default=None,
                        type=str,
                        required=False,
                        help="The path to the .bin transformer model.")

    parser.add_argument(
        "--have_input_data",
        action="store_true",
        help="Whether the input data is already stored in the form of Tensors")

    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")

    parser.add_argument("--batch_size",
                        default=16,
                        type=int,
                        help="The batch size to feed the model")

    parser.add_argument('--seed_words', nargs='+')

    args = parser.parse_args()

    add_seed_word(args.seed_words)
    logger.info("seed words given by user are %s", str(seed_words))
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

    if not args.have_input_data:
        json_files = extract_data(args.filter_file)
        data = preprocess_data_to_df(json_files)

        abstracts = data["abstract"].to_list()
        logger.info("total abstracts: %d", len(abstracts))
        input_ids, attention_masks, _ = create_input_ids__attention_masks_tensor(
            data, tokenizer, args.max_seq_length)
        del data
    else:
        input_ids = torch.load(f"inputs/{args.data_dir}/input_ids.pt")
        attention_masks = torch.load(
            f"inputs/{args.data_dir}/attention_masks.pt")

    logger.info("%s", str(input_ids.shape))
    logger.info('Token IDs: %s', str(input_ids[0]))

    if args.model_path is None:
        model = BertModel.from_pretrained("bert-base-cased")
    else:
        configuration = BertConfig.from_json_file(
            f"{args.model_path}/config.json")
        model = BertModel.from_pretrained(
            f"{args.model_path}/pytorch_model.bin", config=configuration)
    model.cuda()

    tensor_dataset = TensorDataset(input_ids, attention_masks)

    batch_size = args.batch_size

    dataloader = DataLoader(tensor_dataset,
                            sampler=SequentialSampler(tensor_dataset),
                            batch_size=batch_size)

    device = torch.device("cuda")
    seed_val = 42

    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # Measure the total training time for the whole run.
    total_t0 = time.time()

    logger.info("")
    logger.info('Forward pass...')

    model.eval()

    token_to_embedding_map = defaultdict(list)
    seed_embeddings = defaultdict(list)
    # number of times a token is encountered: needed to maintain the average
    token_count = defaultdict(int)
    t0 = time.time()

    for step, batch in enumerate(dataloader):

        if step % 100 == 0:
            logger.info('======== Batch {:} / {:} ========'.format(
                step, len(dataloader)))

        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)

        embeddings, cls = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask)

        # move everything to cpu to save GPU space
        b_input_ids_np = b_input_ids.cpu().numpy()
        b_input_mask_np = b_input_mask.cpu().numpy()
        embeddings_np = embeddings.detach().cpu().numpy()
        cls_np = cls.detach().cpu().numpy()

        del b_input_ids
        del b_input_mask
        del embeddings
        del cls
        torch.cuda.empty_cache()

        for batch_number in range(len(b_input_ids_np)):
            tokens = tokenizer.convert_ids_to_tokens(
                b_input_ids_np[batch_number])
            for token, embedding in zip(tokens, embeddings_np[batch_number]):
                # add the seed word to the seed dict
                if token in seed_words:
                    if token not in seed_embeddings:
                        seed_embeddings[token] = embedding
                    else:
                        seed_embeddings[token] += embedding
                # every token including seed should also be added to token_to_embedding_map
                if token not in token_to_embedding_map and token not in stop_words:
                    token_to_embedding_map[token] = embedding
                    tokens_with_embeddings.add(token)
                elif token not in stop_words:
                    token_to_embedding_map[token] += embedding
                token_count[token] += 1

        if step % 1000 == 0 and step > 0:
            with open(
                    f'word_embeddings/{args.out_dir}/word_embeddings_averaged_{step}.pickle',
                    'wb') as handle:
                pickle.dump(token_to_embedding_map,
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
            del token_to_embedding_map
            token_to_embedding_map = defaultdict(list)
            logger.info(
                "Time to find embeddings for batches {} to {}: {:} (h:mm:ss)".
                format(max(0, step - 1000), step,
                       format_time(time.time() - t0)))
            t0 = time.time()

        del b_input_ids_np
        del b_input_mask_np
        del embeddings_np
        del cls_np

    # save the embeddings of the seed words
    for token, embedding in seed_embeddings.items():
        seed_embeddings[token] = embedding / (token_count[token] * 1.0)
    with open(
            f'word_embeddings/{args.out_dir}/seed_embeddings_averaged.pickle',
            'wb') as handle:
        pickle.dump(seed_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)
    del seed_embeddings

    # save the word embeddings
    with open(
            f'word_embeddings/{args.out_dir}/word_embeddings_averaged_{step}.pickle',
            'wb') as handle:
        pickle.dump(token_to_embedding_map,
                    handle,
                    protocol=pickle.HIGHEST_PROTOCOL)
    del token_to_embedding_map

    # save the number of times each token occurs
    with open(f'word_embeddings/{args.out_dir}/token_count.pickle',
              'wb') as handle:
        pickle.dump(token_count, handle, protocol=pickle.HIGHEST_PROTOCOL)
    del token_count

    logger.info(
        "Total time to complete the entire process: {:} (h:mm:ss)".format(
            format_time(time.time() - total_t0)))

    logger.info("\n")
    logger.info("Embeddings received!")
Example #4
File: CI_Model.py  Project: xiaotret/DLISR
    def feature_extracter_from_texts(self, mashup_api=None):
        """
        Features need to be extracted from the descriptions of both mashups and services; this is the full
        feature-extraction pipeline for the text of the right branch. If it is shared, it should be wrapped into a new model!
        :param mashup_api: None by default; only non-None for 'HDP'/'Bert'
        :return: a wrapped model, so it can be shared by mashup and api
        """
        if self.args.text_extracter_mode in fixed_vector_modes and mashup_api is not None:

            if self.args.text_extracter_mode == 'Bert':
                tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
                bertModel = TFBertModel.from_pretrained("bert-base-uncased")  # TF model, since the tokenizer call below returns tf tensors

                if mashup_api == 'mashup':
                    if self.mashup_text_feature_extracter is None:  # not computed yet
                        mashup_texts = get_iterable_values(
                            data_repository.get_md().mashup_df,
                            'final_description',
                            return_ele_type='str')
                        dense_mashup_features = bertModel(
                            tokenizer(mashup_texts, return_tensors='tf'))
                        self.mashup_text_feature_extracter = vector_feature_extracter_from_texts(
                            'mashup', dense_mashup_features)
                    return self.mashup_text_feature_extracter
                elif mashup_api == 'api':
                    if self.api_text_feature_extracter is None:
                        api_texts = get_iterable_values(
                            data_repository.get_md().api_df,
                            'final_description',
                            return_ele_type='str')
                        dense_api_features = bertModel(
                            tokenizer(api_texts, return_tensors='tf'))
                        self.api_text_feature_extracter = vector_feature_extracter_from_texts(
                            'api', dense_api_features)
                    return self.api_text_feature_extracter
                else:
                    raise TypeError('wrong mashup_api mode!')

            else:
                if self.gd is None:
                    self.gd = get_default_gd(
                        tag_times=0, mashup_only=False,
                        strict_train=True)  # process the text with gensim; tags are not added to the text
                    self.gd.model_pcs(self.args.text_extracter_mode)

                if mashup_api == 'mashup':
                    if self.mashup_text_feature_extracter is None:  # not computed yet
                        self.mashup_text_feature_extracter = vector_feature_extracter_from_texts(
                            'mashup', self.gd.dense_mashup_features)
                    return self.mashup_text_feature_extracter
                elif mashup_api == 'api':
                    if self.api_text_feature_extracter is None:
                        self.api_text_feature_extracter = vector_feature_extracter_from_texts(
                            'api', self.gd.dense_api_features)
                    return self.api_text_feature_extracter
                else:
                    raise TypeError('wrong mashup_api mode!')

        elif self.text_feature_extracter is None:  # not computed yet
            if 'trainable_bert' in self.args.text_extracter_mode.lower():
                self.text_feature_extracter = TFDistilBertModel.from_pretrained(
                    "distilbert-base-uncased")  # layer
                if self.args.frozen_bert:
                    self.text_feature_extracter.trainable = False
            else:
                text_input = Input(shape=(self.args.MAX_SEQUENCE_LENGTH, ),
                                   dtype='int32')
                text_embedding_layer = self.get_text_embedding_layer(
                )  # 参数还需设为外部输入!
                text_embedded_sequences = text_embedding_layer(
                    text_input)  # converts to 2D

                if self.args.text_extracter_mode in (
                        'inception', 'textCNN'):  # 2D to 3D; the third dimension is the channel
                    # print(text_embedded_sequences.shape)
                    text_embedded_sequences = Lambda(
                        lambda x: tf.expand_dims(x, axis=3))(
                            text_embedded_sequences)  # tf and keras tensors are different!
                    print(text_embedded_sequences.shape)

                if self.args.text_extracter_mode == 'inception':
                    x = inception_layer(
                        text_embedded_sequences, self.args.embedding_dim,
                        self.args.inception_channels,
                        self.args.inception_pooling)  # inception processing
                    print('built inception layer, done!')
                elif self.args.text_extracter_mode == 'textCNN':
                    x = textCNN_feature_extracter_from_texts(
                        text_embedded_sequences, self.args)
                elif self.args.text_extracter_mode == 'LSTM':
                    x = LSTM_feature_extracter_from_texts(
                        text_embedded_sequences, self.args)
                else:
                    raise TypeError('wrong extracter!')
                print('text feature after inception/textCNN/LSTM whole_model,',
                      x)  # inspect the module output features before the MLP transform

                for FC_unit_num in self.args.inception_fc_unit_nums:
                    x = Dense(FC_unit_num,
                              kernel_regularizer=l2(self.args.l2_reg))(
                                  x)  # , activation='relu'
                    if self.args.inception_MLP_BN:
                        x = BatchNormalization(scale=False)(x)
                    x = PReLU()(x)  #
                    if self.args.inception_MLP_dropout:
                        x = tf.keras.layers.Dropout(0.5)(x)
                self.text_feature_extracter = Model(
                    text_input, x, name='text_feature_extracter')
        return self.text_feature_extracter
Example #5
import json
import logging
import os
import sys

import torch
import torch.utils.data
import torch.utils.data.distributed
from transformers import BertTokenizer

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))

MAX_LEN = 64  # this is the max length of the sentence

print("Loading BERT tokenizer...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)


def model_fn(model_dir):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    loaded_model = torch.jit.load(os.path.join(model_dir, "traced_bert.pt"))
    return loaded_model.to(device)


def input_fn(request_body, request_content_type):
    """An input_fn that loads a pickled tensor"""
    if request_content_type == "application/json":
        sentence = json.loads(request_body)

        input_ids = []
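# The snippet is cut off after `input_ids = []`. The following is only a sketch of how a
# SageMaker-style input_fn typically continues, reusing the module-level tokenizer and
# MAX_LEN defined above; it is an illustration, not the original implementation.
def input_fn_sketch(request_body, request_content_type):
    """Deserialize a JSON request body into (input_ids, attention_mask) tensors."""
    if request_content_type == "application/json":
        sentence = json.loads(request_body)
        encoded = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        return encoded["input_ids"], encoded["attention_mask"]
    raise ValueError("Unsupported content type: {}".format(request_content_type))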
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MODEL_TYPES),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--summary",
        default=None,
        type=str,
        help="Model summary",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints and predictions will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        help="The input data dir. Should contain the .json files for the task."
        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--feature_dir",
        default=None,
        type=str,
        help="The input feature dir. Should contain the cached_features_file for the task."
    )
    parser.add_argument(
        "--train_file",
        default=None,
        type=str,
        help="The input training file. If a data dir is specified, will look for the file there"
        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="The input evaluation file. If a data dir is specified, will look for the file there"
        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--test_file",
        default=None,
        type=str,
        help="The input test file.",
    )
    parser.add_argument(
        "--test_prob_file",
        default=None,
        type=str,
        help="The output test_prob file.",
    )
    parser.add_argument(
        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )

    parser.add_argument(
        "--version_2_with_negative",
        action="store_true",
        help="If true, the SQuAD examples contain some that do not have an answer.",
    )
    parser.add_argument(
        "--null_score_diff_threshold",
        type=float,
        default=0.0,
        help="If null_score - best_non_null is greater than the threshold predict null.",
    )

    parser.add_argument(
        "--max_seq_length",
        default=512,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded.",
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help="When splitting up a long document into chunks, how much stride to take between chunks.",
    )
    parser.add_argument(
        "--max_query_length",
        default=32,
        type=int,
        help="The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.",
    )
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test", action="store_true", help="Whether to run test on the test set.")
    parser.add_argument("--do_merge", action="store_true", help="Whether to merge test prob.")
    parser.add_argument(
        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
    )
    parser.add_argument(
        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
    )
    parser.add_argument("--do_fgm", action="store_true", help="Whether to run Adv-FGM training.")
    parser.add_argument("--do_pgd", action="store_true", help="Whether to run Adv-PGD training.")
    parser.add_argument("--gc", action="store_true", help="Whether to run optimizer-gc training.")

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=32, type=int, help="Batch size per GPU/CPU for evaluation."
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_ratio", default=0.1, type=float, help="Linear warmup over warmup_ratio.")
    parser.add_argument(
        "--n_best_size",
        default=10,
        type=int,
        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
    )
    parser.add_argument(
        "--max_answer_length",
        default=32,
        type=int,
        help="The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.",
    )
    parser.add_argument(
        "--verbose_logging",
        action="store_true",
        help="If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.",
    )
    parser.add_argument(
        "--lang_id",
        default=0,
        type=int,
        help="language id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)",
    )
    parser.add_argument("--best_val_f1", type=float, default=0., help="best_val_f1")
    parser.add_argument("--best_val_step", type=int, default=0, help="best_val_step")
    parser.add_argument("--logging_ratio", type=float, default=0.1, help="Log every X updates ratio.")
    parser.add_argument("--save_ratio", type=float, default=0.1, help="Save checkpoint every X updates ratio.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")

    parser.add_argument("--threads", type=int, default=1, help="multiple threads for converting example to features")
    args = parser.parse_args()

    if args.doc_stride >= args.max_seq_length - args.max_query_length:
        logger.warning(
            "WARNING - You've set a doc stride which may be superior to the document length in some "
            "examples. This could result in errors when building features from the examples. Please reduce the doc "
            "stride or increase the maximum length to ensure the features are correctly built."
        )

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    config = BertConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer = BertTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    model = BertForQuestionAnswering.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
    # remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, set_type='train', output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Save the trained model and the tokenizer
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        if args.do_train:
            logger.info("Loading checkpoints saved during training for evaluation")
            checkpoints = [args.output_dir]
            if args.eval_all_checkpoints:
                checkpoints = list(
                    os.path.dirname(c)
                    for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
                )
                logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
        else:
            logger.info("Loading checkpoint %s for evaluation", args.model_name_or_path)
            checkpoints = [args.model_name_or_path]

        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            model = BertForQuestionAnswering.from_pretrained(checkpoint)
            model.to(args.device)

            # Evaluate
            result = evaluate(args, model, tokenizer, prefix='dev', step=global_step)

            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
            results.update(result)

    logger.info("Results: {}".format(results))

    if args.do_test and args.local_rank in [-1, 0]:
        if args.do_train:
            checkpoint = f'{args.output_dir}/checkpoint-{args.best_val_step}'
            model = BertForQuestionAnswering.from_pretrained(checkpoint)
            model.to(args.device)
            evaluate(args, model, tokenizer, prefix='test', step=args.best_val_step)
        else:
            global_step = args.model_name_or_path.split("-")[-1]
            model = BertForQuestionAnswering.from_pretrained(args.model_name_or_path)
            model.to(args.device)
            evaluate(args, model, tokenizer, prefix='test', step=global_step)
    if args.do_merge:
        merge(args, tokenizer, prefix="test")
cols_name = [
    'Date', 'Note', 'myr', 'uname', 'tuname', 'ADULT_CONTENT', 'HEALTH',
    'DRUGS_ALCOHOL_GAMBLING', 'RACE', 'VIOLENCE_CRIME', 'POLITICS', 'RELATION',
    'LOCATION'
]
label_cols = cols_name[5:]  # drop the first five metadata columns ('Date' through 'tuname')
sens_cols = [
    'ADULT_CONTENT', 'HEALTH', 'DRUGS_ALCOHOL_GAMBLING', 'RACE',
    'VIOLENCE_CRIME', 'POLITICS', 'RELATION', 'LOCATION', 'T'
]

personal_cols = ['A', 'E', 'I', 'P', 'T']
userfields = ['S', 'P', 'T', 'A']

bert_model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True)

saved_model = BertClassifier(TFBertModel.from_pretrained(bert_model_name),
                             len(label_cols))
saved_model.load_weights(MODEL_FILE)
time.sleep(5)
print("\n MODEL LOADED\n\n\n\n\n")

c2 = c3 = c4 = c5 = c6 = c7 = c8 = c9 = [0] * (BATCH - 1)  # note: chained assignment binds every name to the same list object
#===============================================================#
"""
Convert all letters to lower or upper case (common : lower case)
"""


def convert_letters(tokens, style="lower"):
Example #8
def main(args, f):
    # args = parse_arguments()
    set_seed(args.train_seed)
    if args.model in ['roberta', 'distilroberta']:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # preprocess data
    src_eval_loader, src_loader, tgt_all_loader, tgt_train_loader = get_all_dataloader(
        args, tokenizer)

    # load models
    if args.model == 'bert':
        src_encoder = BertEncoder()
        tgt_encoder = BertEncoder()
        src_classifier = BertClassifier()
    elif args.model == 'distilbert':
        src_encoder = DistilBertEncoder()
        tgt_encoder = DistilBertEncoder()
        src_classifier = BertClassifier()
    elif args.model == 'roberta':
        src_encoder = RobertaEncoder()
        tgt_encoder = RobertaEncoder()
        src_classifier = RobertaClassifier()
    else:
        src_encoder = DistilRobertaEncoder()
        tgt_encoder = DistilRobertaEncoder()
        src_classifier = RobertaClassifier()
    discriminator = Discriminator()

    # parallel models
    if torch.cuda.device_count() > 1:
        print('Let\'s use {} GPUs!'.format(torch.cuda.device_count()))
        src_encoder = nn.DataParallel(src_encoder)
        src_classifier = nn.DataParallel(src_classifier)
        tgt_encoder = nn.DataParallel(tgt_encoder)
        discriminator = nn.DataParallel(discriminator)

    if args.load:
        src_encoder = init_model(args,
                                 src_encoder,
                                 restore_path=param.src_encoder_path)
        src_classifier = init_model(args,
                                    src_classifier,
                                    restore_path=param.src_classifier_path)
        # tgt_encoder = init_model(args, tgt_encoder, restore_path=param.tgt_encoder_path)
        # discriminator = init_model(args, discriminator, restore_path=param.d_model_path)
    else:
        src_encoder = init_model(args, src_encoder)
        src_classifier = init_model(args, src_classifier)

    tgt_encoder = init_model(args, tgt_encoder)
    discriminator = init_model(args, discriminator)

    # train source model
    if args.pretrain:
        print("=== Training classifier for source domain ===")
        src_encoder, src_classifier = pretrain(args, src_encoder,
                                               src_classifier, src_loader)

        # save pretrained model
        save_model(args, src_encoder, param.src_encoder_path)
        save_model(args, src_classifier, param.src_classifier_path)

    # eval source model
    print("=== Evaluating classifier for source domain ===")
    evaluate(args, src_encoder, src_classifier, src_loader)
    src_acc = evaluate(args, src_encoder, src_classifier, tgt_all_loader)
    f.write(f'{args.src} -> {args.tgt}: No adapt acc on src data: {src_acc}\n')

    for params in src_encoder.parameters():
        params.requires_grad = False

    for params in src_classifier.parameters():
        params.requires_grad = False

    # adapt
    print("=== Adapt tgt encoder ===")
    tgt_encoder.load_state_dict(src_encoder.state_dict())
    if args.src_free:
        s_res_features = src_gmm(args, src_encoder, src_loader)
        src_loader = s_numpy_dataloader(s_res_features, args.batch_size)
        tgt_encoder = aad_adapt_src_free(args, src_encoder, tgt_encoder,
                                         discriminator, src_classifier,
                                         src_loader, tgt_train_loader,
                                         tgt_all_loader)
    else:
        tgt_encoder = aad_adapt(args, src_encoder, tgt_encoder, discriminator,
                                src_classifier, src_loader, tgt_train_loader,
                                tgt_all_loader)

    # save_model(args, tgt_encoder, param.tgt_encoder_path)

    # argument setting
    # print("=== Argument Setting ===")
    print(
        f"model_type: {args.model}; max_seq_len: {args.max_seq_length}; batch_size: {args.batch_size}; "
        f"pre_epochs: {args.pre_epochs}; num_epochs: {args.num_epochs}; adv weight: {args.alpha}; "
        f"KD weight: {args.beta}; temperature: {args.temperature}; src: {args.src}; tgt: {args.tgt}; "
        f'src_free: {args.src_free}; dp: {args.dp}; ent: {args.ent}')

    # eval target encoder on lambda0.1 set of target dataset
    print("=== Evaluating classifier for encoded target domain ===")
    print(">>> domain adaption <<<")
    tgt_acc = evaluate(args, tgt_encoder, src_classifier, tgt_all_loader)
    f.write(f'{args.src} -> {args.tgt}: DA acc on tgt data: {tgt_acc}\n')
    f.write(
        f"model_type: {args.model}; batch_size: {args.batch_size}; pre_epochs: {args.pre_epochs}; "
        f"num_epochs: {args.num_epochs}; src_free: {args.src_free}; src: {args.src}; "
        f"tgt: {args.tgt}; dp: {args.dp}; ent: {args.ent}\n\n")
Example #9
LEARNING_RATE = 2e-5
TRAIN_EPOCHS = 10

financial_news_fp = "data/financial_news_data_downsampled.csv"
df = pd.read_csv(financial_news_fp)
df_train, df_val_test = train_test_split(df, test_size=0.3, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_val_test, test_size=0.3, random_state=RANDOM_SEED)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
class_names = ["negative", "neutral", "positive"]
num_classes = len(df_train[LABEL_COL].unique())


tokenizer = BertTokenizer.from_pretrained(BERT_PRETRAINED_MODEL, do_lower_case=True)
# Example
sample_text = df_train[TEXT_COL].iloc[0]
tokens = tokenizer.tokenize(sample_text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(token_ids)

encoding = tokenizer.encode_plus(
    sample_text,
    max_length=MAX_LENGTH,
    add_special_tokens=True,
    return_token_type_ids=False,
    truncation="longest_first",
    padding="max_length",
    return_attention_mask=True,
)
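# To encode an entire split rather than a single sample, the same encode_plus call can be
# applied row by row; a brief sketch reusing the names above (tokenizer, MAX_LENGTH and
# TEXT_COL are assumed to be defined as in this snippet):
import torch


def encode_texts(texts, tokenizer, max_length):
    """Encode a list of texts into stacked input_id / attention_mask tensors."""
    input_ids, attention_masks = [], []
    for text in texts:
        enc = tokenizer.encode_plus(
            text,
            max_length=max_length,
            add_special_tokens=True,
            truncation="longest_first",
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt",
        )
        input_ids.append(enc["input_ids"])
        attention_masks.append(enc["attention_mask"])
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

# e.g. train_ids, train_masks = encode_texts(df_train[TEXT_COL], tokenizer, MAX_LENGTH)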
Example #10
def main():

    MODEL_CACHE = './model/bert-base-chinese'
    WORD_2_VECTOR_MODEL_DIR = './model/merge_sgns_bigram_char300.txt'

    WORD_FREQ_DICT = './dict/modern_chinese_word_freq.txt'

    EVAL_FILE_PATH = './dataset/annotation_data.csv'
    BERT_RES_PATH = './data/bert_ss_res.csv'
    # ERNIE_RES_PATH = './data/ernie_output.csv'
    VECTOR_RES_PATH = './data/vector_ss_res.csv'
    DICT_RES_PATH = './data/dict_ss_res.csv'
    HOWNET_RES_PATH = './data/hownet_ss_res.csv'
    HYBRID_RES_PATH = './data/hybrid_ss_res.csv'

    SUBSTITUTION_NUM = 10

    word_2_vector_model_dir = WORD_2_VECTOR_MODEL_DIR
    model_cache = MODEL_CACHE

    word_freq_dict = WORD_FREQ_DICT

    eval_file_path = EVAL_FILE_PATH

    bert_res_path = BERT_RES_PATH
    # ernie_res_path = ERNIE_RES_PATH
    vector_res_path = VECTOR_RES_PATH
    dict_res_path = DICT_RES_PATH
    hownet_res_path = HOWNET_RES_PATH
    hybrid_res_path = HYBRID_RES_PATH

    substitution_num = SUBSTITUTION_NUM

    print('loading models...')
    tokenizer = BertTokenizer.from_pretrained(model_cache)
    model = BertForMaskedLM.from_pretrained(model_cache)

    model.to('cuda')
    model.eval()
    print('loading embeddings...')
    model_word2vector = gensim.models.KeyedVectors.load_word2vec_format(
        word_2_vector_model_dir, binary=False)
    print('loading files...')
    word_freq_dict = read_dict(word_freq_dict)

    bert_res = read_ss_result(bert_res_path)
    vector_res = read_ss_result(vector_res_path)
    dict_res = read_ss_result(dict_res_path)
    hownet_res = read_ss_result(hownet_res_path)
    hybrid_res = read_ss_result(hybrid_res_path)

    row_lines, source_sentences, source_words = read_dataset(eval_file_path)

    for row_line, source_sentence, source_word, bert_subs, vector_subs, dict_subs, hownet_subs, hybrid_subs in zip(
            row_lines, source_sentences, source_words, bert_res, vector_res,
            dict_res, hownet_res, hybrid_res):
        # Running everything may take quite a while; it is recommended to comment out code blocks and run only the tests you need
        if bert_subs[0] != 'NULL':
            bert_pre_word, bert_ss_sorted = substitute_ranking(
                row_line, model_word2vector, model, tokenizer, source_sentence,
                source_word, bert_subs, word_freq_dict, substitution_num)
        else:
            bert_pre_word = 'NULL'
            bert_ss_sorted = ['NULL']
        if vector_subs[0] != 'NULL':
            vector_pre_word, vector_ss_sorted = substitute_ranking(
                row_line, model_word2vector, model, tokenizer, source_sentence,
                source_word, vector_subs, word_freq_dict, substitution_num)
        else:
            vector_pre_word = 'NULL'
            vector_ss_sorted = ['NULL']
        if dict_subs[0] != 'NULL':
            dict_pre_word, dict_ss_sorted = substitute_ranking(
                row_line, model_word2vector, model, tokenizer, source_sentence,
                source_word, dict_subs, word_freq_dict, substitution_num)
        else:
            dict_pre_word = 'NULL'
            dict_ss_sorted = ['NULL']
        if hownet_subs[0] != 'NULL':
            hownet_pre_word, hownet_ss_sorted = substitute_ranking(
                row_line, model_word2vector, model, tokenizer, source_sentence,
                source_word, hownet_subs, word_freq_dict, substitution_num)
        else:
            hownet_pre_word = 'NULL'
            hownet_ss_sorted = ['NULL']
        if hybrid_subs[0] != 'NULL':
            hybrid_pre_word, hybrid_ss_sorted = substitute_ranking(
                row_line, model_word2vector, model, tokenizer, source_sentence,
                source_word, hybrid_subs, word_freq_dict, substitution_num)
        else:
            hybrid_pre_word = 'NULL'
            hybrid_ss_sorted = ['NULL']

        save_result(row_line, bert_pre_word, bert_ss_sorted,
                    './test/data/nohownet/bert_sr_res_no_hownet.csv')
        save_result(row_line, vector_pre_word, vector_ss_sorted,
                    './test/data/nohownet/vector_sr_res_no_hownet.csv')
        save_result(row_line, dict_pre_word, dict_ss_sorted,
                    './test/data/nohownet/dict_sr_res_no_hownet.csv')
        save_result(row_line, hownet_pre_word, hownet_ss_sorted,
                    './test/data/nohownet/hownet_sr_res_no_hownet.csv')
        save_result(row_line, hybrid_pre_word, hybrid_ss_sorted,
                    './test/data/nohownet/hybrid_sr_res_no_hownet.csv')
Example #11
import json
import torch
from tqdm import tqdm
import argparse
import pickle
import random
import numpy as np

from transformers import BertTokenizer, BertForQuestionAnswering, BertConfig, BertPreTrainedModel
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese',
                                          do_lower_case=True)

import logging
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)


class Example():
    def __init__(self, qid, question_text, answer_text, context_text,
                 start_pos, title, answerable, answers):
        self.qid = qid
        self.question_text = question_text
        self.answer_text = answer_text
        self.context_text = context_text
        self.start_pos = start_pos
        self.title = title
        self.answerable = answerable
        self.answers = answers


class testExample():
    def __init__(
Example #12
    'max_len': 500,
    'dropout_rate': 0.2,
    'kernel_size': 5,
    'num_patience': 3,
    'lr': 3e-4,
    'max_word_len': 1000,
    'max_char_len': 10,
    'char_embed_size': 100,
    'cnn_filters': 300,
    'cnn_kernel_size': 5,
    'init_lr': 1e-4,
    'max_lr': 8e-4
}

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                               lowercase=True,
                                               add_special_tokens=True)

albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2',
                                                   lowercase=True,
                                                   add_special_tokens=True)

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base',
                                                     lowercase=True,
                                                     add_special_tokens=True)

xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased',
                                                 lowercase=True,
                                                 add_special_tokens=True)

def test(args, testfile, true_label, save_flag: bool, seed_val):

    device = util.get_device(device_no=args.device_no)
    model = torch.load(args.model_path, map_location=device)
    # seed_val = 2346610

    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # testfile = args.output_file
    # true_label = args.label
    truncation = args.truncation
    n_samples = None
    if "n_samples" in args:
        n_samples = args.n_samples

    # saves_dir = "saves/"
    # time = datetime.datetime.now()
    # saves_path = os.path.join(saves_dir, util.get_filename(time))
    # if save_flag:
    #     Path(saves_path).mkdir(parents=True, exist_ok=True)

    # log_path = os.path.join(saves_path, "testing.log")

    # logging.basicConfig(filename=log_path, filemode='w', format='%(name)s - %(levelname)s - %(message)s')
    # logger=logging.getLogger()
    # logger.setLevel(logging.DEBUG)

    # Load the BERT tokenizer.
    # logger.info('Loading BERT tokenizer...')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)
    max_len = 0
    reviews = []
    labels = []
    with open(testfile, "r") as fin:
        reviews = fin.readlines()

    reviews = [rev.lower() for rev in reviews]

    if n_samples is None:
        n_samples = len(reviews)

    indices = np.random.choice(np.arange(len(reviews)), size=n_samples)
    selected_reviews = [reviews[idx] for idx in indices]

    labels = [0 if true_label == "negative" else 1] * len(selected_reviews)
    # For every sentence...
    # for rev in selected_reviews:
    #     # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    #     input_ids = tokenizer.encode(rev, add_special_tokens=True)
    #     # Update the maximum sentence length.
    #     max_len = max(max_len, len(input_ids))

    # print('Max sentence length: ', max_len)

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids = []
    attention_masks = []

    # For every sentence...
    for rev in selected_reviews:
        # `encode_plus` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate the sentence to `max_length`
        #   (6) Create attention masks for [PAD] tokens.
        input_id = tokenizer.encode(rev, add_special_tokens=True)
        if len(input_id) > 512:
            if truncation == "tail-only":
                # tail-only truncation
                input_id = [tokenizer.cls_token_id] + input_id[-511:]
            elif truncation == "head-and-tail":
                # head-and-tail truncation
                input_id = [tokenizer.cls_token_id
                            ] + input_id[1:129] + input_id[-382:] + [
                                tokenizer.sep_token_id
                            ]
            else:
                # head-only truncation
                input_id = input_id[:511] + [tokenizer.sep_token_id]

            input_ids.append(torch.tensor(input_id).view(1, -1))
            attention_masks.append(
                torch.ones([1, len(input_id)], dtype=torch.long))
        else:
            encoded_dict = tokenizer.encode_plus(
                rev,  # Sentence to encode.
                add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
                max_length=512,  # Pad & truncate all sentences.
                pad_to_max_length=True,
                return_attention_mask=True,  # Construct attn. masks.
                return_tensors='pt',  # Return pytorch tensors.
            )

            # Add the encoded sentence to the list.
            input_ids.append(encoded_dict['input_ids'])

            # And its attention mask (simply differentiates padding from non-padding).
            attention_masks.append(encoded_dict['attention_mask'])

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    # Set the batch size.
    batch_size = 8

    # Create the DataLoader.
    prediction_data = TensorDataset(input_ids, attention_masks, labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data,
                                       sampler=prediction_sampler,
                                       batch_size=batch_size)
    print('Predicting labels for {:,} test sentences...'.format(
        len(input_ids)))

    # Put model in evaluation mode
    model.eval()

    # Tracking variables
    predictions, true_labels = [], []

    # Predict
    for batch in prediction_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Store predictions and true labels
        predictions.append(logits)
        true_labels.append(label_ids)

    print('    DONE.')
    return predictions, true_labels, reviews
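# test() returns per-batch logits and labels; a short illustrative helper (not part of the
# original script) for turning them into a single accuracy figure:
import numpy as np


def accuracy_from_batches(predictions, true_labels):
    """Flatten the per-batch logits/labels returned by test() and compute accuracy."""
    preds = np.concatenate([np.argmax(logits, axis=1) for logits in predictions])
    labels = np.concatenate(true_labels)
    return (preds == labels).mean()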
Example #14
    def _create_examples(self, lines):
        with torch.no_grad():
            if self.cfg.feature:
                fea_tokenizer = BertTokenizer.from_pretrained(
                    osp.join(self.cfg.pretrained_lm_path, 'bert-base'),
                    do_lower_case=True)
                feature = BertModel.from_pretrained(
                    osp.join(self.cfg.pretrained_lm_path, 'bert-base'))
                if self.cfg.pretrained_bert is not None:
                    print('loading feature ckpt from ',
                          self.cfg.pretrained_bert)
                    assert osp.exists(self.cfg.pretrained_bert)
                    if self.cfg.cuda:
                        feature = feature.cuda()
                        checkpoint = torch.load(self.cfg.pretrained_bert)
                    else:
                        checkpoint = torch.load(
                            self.cfg.pretrained_bert,
                            map_location=lambda storage, loc: storage)
                    feature.load_state_dict(checkpoint['net'])

            examples = []
            index2qid = []
            i = 0
            if self.cfg.test:
                lines = lines[:200]
            for line in tqdm(lines):
                data = dict()
                data['index'] = i
                i += 1

                data['qid'] = line['qID']
                index2qid.append(data['qid'])

                sentence = line['sentence']

                name1 = line['option1']
                name2 = line['option2']

                # data['sentence'] = line['sentence']
                # data['option1'] = line['option1']
                # data['option2'] = line['option2']

                conj = "_"
                idx = sentence.index(conj)
                context = sentence[:idx]
                option_str = "_ " + sentence[idx + len(conj):].strip()

                option1 = option_str.replace("_", name1)
                option2 = option_str.replace("_", name2)

                options = [{
                    'segment1': context,
                    'segment2': option1
                }, {
                    'segment1': context,
                    'segment2': option2
                }]

                # the test set has no answer key so use '1' as a dummy label
                data['label_ids'] = self.LABELS.index(line.get('answer', '1'))

                _, data['token_ids'], data['mask'], data[
                    'segment_ids'] = self.example_to_token_ids_segment_ids_label_ids(
                        options,
                        self.tokenizer,
                        cls_token_at_end=False,
                        cls_token=self.tokenizer.cls_token,
                        sep_token=self.tokenizer.sep_token,
                        sep_token_extra=False,
                        cls_token_segment_id=0,
                        pad_on_left=False,
                        pad_token=self.tokenizer.convert_tokens_to_ids(
                            [self.tokenizer.pad_token])[0],
                        pad_token_segment_id=0)

                if self.cfg.feature:
                    if self.cfg.model == 'bert':
                        input_ids = torch.Tensor(data['token_ids']).long()
                        input_mask = torch.Tensor(data['mask']).long()
                        segment_ids = torch.Tensor(data['segment_ids']).long()
                    else:
                        _, input_ids, segment_ids, input_mask = self.example_to_token_ids_segment_ids_label_ids(
                            options,
                            tokenizer=fea_tokenizer,
                            cls_token_at_end=False,
                            cls_token=fea_tokenizer.cls_token,
                            sep_token=fea_tokenizer.sep_token,
                            cls_token_segment_id=0,
                            pad_on_left=False,
                            pad_token=fea_tokenizer.convert_tokens_to_ids(
                                [fea_tokenizer.pad_token])[0],
                            pad_token_segment_id=0)
                        input_ids = torch.Tensor(input_ids).long()
                        input_mask = torch.Tensor(input_mask).long()
                        segment_ids = torch.Tensor(segment_ids).long()

                    if self.cfg.cuda:
                        input_ids = input_ids.cuda()
                        input_mask = input_mask.cuda()
                        segment_ids = segment_ids.cuda()

                    bert_outputs = feature(input_ids,
                                           attention_mask=input_mask,
                                           token_type_ids=segment_ids)

                    data['feature'] = bert_outputs[0].cpu().data
                    data['fea_mask'] = input_mask.cpu().data

                examples.append(data)

            torch.cuda.empty_cache()

        return examples, index2qid
Example #15
0
jon_folder = 'C:/Users/mmall/Documents/github/bertembeddings/data/jonathans/adjacent/'
random_model = False
# random_model = True

if random_model:
    # config = AutoConfig.from_pretrained(pretrained_weights, output_hidden_states=True,
    #                                 output_attentions=args.attention,
    #                                 cache_dir='pretrained_models')
    # model = AutoModel.from_config(config)
    model = BertModel(
        BertConfig(output_hidden_states=True, output_attentions=True))
else:
    model = BertModel.from_pretrained('bert-base-cased',
                                      output_hidden_states=True,
                                      output_attentions=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

lines = pkl.load(open(jon_folder + 'phrase_boundary_tree_dist.pkl', 'rb'))

#%%
max_num = 300
these_bounds = [0, 1, 2, 3, 4]

frob = []
nuc = []
inf = []
csim = []
avgdist = []
whichline = []
whichcond = []
whichswap = []
Example #16
0
# Copyright (c) 2019 Baidu.com, Inc. All Rights Reserved
#
"""
requirements:
Authors: daisongtai([email protected])
Date:    2019/5/29 6:38 PM
"""
from __future__ import print_function

import re

from transformers import BertTokenizer


max_seq_length = 500
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext-large', do_lower_case=True)

LHan = [
    [0x2E80, 0x2E99],  # Han # So  [26] CJK RADICAL REPEAT, CJK RADICAL RAP
    [0x2E9B, 0x2EF3],  # Han # So  [89] CJK RADICAL CHOKE, CJK RADICAL C-SIMPLIFIED TURTLE
    [0x2F00, 0x2FD5],  # Han # So [214] KANGXI RADICAL ONE, KANGXI RADICAL FLUTE
    0x3005,  # Han # Lm       IDEOGRAPHIC ITERATION MARK
    0x3007,  # Han # Nl       IDEOGRAPHIC NUMBER ZERO
    [0x3021, 0x3029],  # Han # Nl   [9] HANGZHOU NUMERAL ONE, HANGZHOU NUMERAL NINE
    [0x3038, 0x303A],  # Han # Nl   [3] HANGZHOU NUMERAL TEN, HANGZHOU NUMERAL THIRTY
    0x303B,  # Han # Lm       VERTICAL IDEOGRAPHIC ITERATION MARK
    [
        0x3400, 0x4DB5
Example #17
0
if os.path.exists(outDir):
    filelist = [f for f in os.listdir(outDir)]
    for f in filelist:
        os.remove(os.path.join(outDir, f))
else:
    os.makedirs(outDir)

device = torch.device("cuda" if (
    args.gpu and torch.cuda.is_available()) else "cpu")
print('Device', device)
n_gpu = torch.cuda.device_count()

model = BertForSequenceClassification.from_pretrained(
    args.model_name_or_path, num_labels=args.num_labels)
tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path,
                                          do_lower_case=True)

model.to(device)

train_inputs, train_labels, train_masks = readData(tokenizer,
                                                   args,
                                                   mode="train")
validation_inputs, validation_labels, validation_masks = readData(tokenizer,
                                                                  args,
                                                                  mode="dev")

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
Example #18
0
def main(args):
    warnings.filterwarnings("ignore")
    # Load documents
    with open('./data/docs_noun.json', 'r') as f:
        json_docs = json.load(f)
    # prepare the dataset
    with open('data/test_anno.json', 'r') as f:
        val_json = json.load(f)
    tmpdir = 'tmp'
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)

    # ------ Retrieve documents ------------------
    if os.path.exists(tmpdir + '/eval_dr.json'):
        with open(tmpdir + '/eval_dr.json', 'r') as f:
            val_json = json.load(f)
    else:
        document_retrieval(args, json_docs, val_json)
        with open(tmpdir + '/eval_dr.json', 'w') as f:
            json.dump(val_json, f)
    # Calculate recall
    rank = []
    for vidx, d in enumerate(val_json):
        reference = d['context'].split(' ')
        rank.append(99999)
        for i, didx in enumerate(d['dr_result']):
            hypothesis = json_docs[didx]['context'].split(' ')
            #if json_docs[didx]['context'] == d['context']:
            BLEUscore = nltk.translate.bleu_score.sentence_bleu([reference],
                                                                hypothesis,
                                                                weights=(0.5,
                                                                         0.5))
            if BLEUscore > 0.9:
                rank[-1] = i + 1
                break
    recall5 = sum([1 for x in rank if x <= 5])
    recall1 = sum([1 for x in rank if x <= 1])
    print('DR R@1', recall1 / len(val_json), 'R@5', recall5 / len(val_json))

    # ------ SS ---------------------------------
    # Make sure to pass do_lower_case=False when using the multilingual-cased model.
    # See https://github.com/google-research/bert/blob/master/multilingual.md
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',
                                              do_lower_case=False)
    if os.path.exists(tmpdir + '/eval_ss.json'):
        with open(tmpdir + '/eval_ss.json', 'r') as f:
            val_json = json.load(f)
    else:
        sentence_selection(args, json_docs, val_json, tokenizer)
        with open(tmpdir + '/eval_ss.json', 'w') as f:
            json.dump(val_json, f)
    # Calculate recall
    rank = []
    for vidx, d in enumerate(val_json):
        reference = d['reference'].split(' ')
        for i, (_, sent) in enumerate(d['ss_result']):
            hypothesis = sent.split(' ')
            #if sent == d['reference']:
            BLEUscore = nltk.translate.bleu_score.sentence_bleu([reference],
                                                                hypothesis,
                                                                weights=(0.5,
                                                                         0.5))
            if BLEUscore > 0.8:
                rank.append(i + 1)
                break
    recall5 = sum([1 for x in rank if x <= 5])
    recall1 = sum([1 for x in rank if x <= 1])
    print('SS R@1', recall1 / len(val_json), 'R@5', recall5 / len(val_json))

    # ------ RTE ------------------------------------
    if os.path.exists(tmpdir + '/eval_rte.pkl'):
        with open(tmpdir + '/eval_rte.pkl', 'rb') as f:
            val_json = pickle.load(f)
    else:
        rte(args, val_json, tokenizer)
        with open(tmpdir + '/eval_rte.pkl', 'wb') as f:
            pickle.dump(val_json, f)
    # Calculate accuracy
    name2label = {'TRUE': 0, 'FALSE': 1, 'NEI': 2}
    acc = []
    for vidx, d in enumerate(val_json):
        gt = name2label[d['True_False']]
        pred, norm = 0, 0
        if len(d['rte_result']) == 0:
            # No retrieved document in document retrieval
            acc.append(0)
            continue
        # Weight each candidate sentence's RTE logits by its sentence-selection score,
        # then take the argmax of the score-weighted average as the prediction.
        for rte_logit, sidx in d['rte_result']:
            pred += d['ss_result'][sidx][0] * rte_logit
            norm += d['ss_result'][sidx][0]
        pred = (pred / norm).argmax(0)
        acc.append(float(pred == gt))
    print('RTE Acc', sum(acc) / len(acc))
Example #19
0
from transformers import BertTokenizer

# cat orig.jsonl | python scripts/preprocess_jsonl.py $(TARGET) > processed.tsv
"""
Extract the sentence, topic, and S-ID from each line of a jsonl file and output a tsv
of the form
S-ID <tab> topic <tab> sentence
"""

MAX_TOKEN_LENGTH = 192 - 2  # maximum number of tokens BERTKNP can handle
MAX_BYTE_SIZE = 4096  # maximum sentence length in bytes that Juman++ can handle
JUMAN_COMMAND = '/mnt/violet/share/tool/juman++v2/bin/jumanpp'
BERTKNP_MODEL = '/mnt/berry/home/ueda/bertknp-0.2-20190901/pretrained_model'

jumanpp = Juman(command=JUMAN_COMMAND)
tokenizer = BertTokenizer.from_pretrained(BERTKNP_MODEL)


class Document(NamedTuple):
    did: str
    topic: str
    sentences: List[str]


def main():
    documents = []
    idx = 0
    for line in tqdm(sys.stdin.readlines()):
        input_obj = json.loads(line.strip())
        classes = [
            key for key, value in input_obj['classes'].items() if value == 1
Example #20
0
    early_stop = 20

    train_data = pd.read_csv('./data/train.csv')
    test_data = pd.read_csv('./data/test.csv')

    feature_cols = ['query', 'reply']
    label_cols = ['label']
    kf = KFold(n_splits=5)
    res_proba = np.zeros((len(test_data), 2))

    for tr_idx, val_idx in kf.split(train_data):
        train_x, train_y = train_data[feature_cols].loc[tr_idx], train_data[
            label_cols].loc[tr_idx]
        val_x, val_y = train_data[feature_cols].loc[val_idx], train_data[
            label_cols].loc[val_idx]
        tokenizer = BertTokenizer.from_pretrained(model_name)
        train_encodings = tokenizer(train_x['query'].tolist(),
                                    train_x['reply'].tolist(),
                                    truncation=True,
                                    padding=True,
                                    max_length=max_seq_len)
        val_encodings = tokenizer(val_x['query'].tolist(),
                                  val_x['reply'].tolist(),
                                  truncation=True,
                                  padding=True,
                                  max_length=max_seq_len)
        test_encodings = tokenizer(test_data['query'].tolist(),
                                   test_data['reply'].tolist(),
                                   truncation=True,
                                   padding=True,
                                   max_length=max_seq_len)
Example #21
0
import pandas as pd
from transformers import BertTokenizer
from abstractive_summarizer import AbstractiveSummarization
from hyper_parameters import h_parms
from configuration import config

model = AbstractiveSummarization(num_layers=config.num_layers,
                                 d_model=config.d_model,
                                 num_heads=config.num_heads,
                                 dff=config.dff,
                                 vocab_size=config.input_vocab_size,
                                 output_seq_len=config.summ_length,
                                 rate=h_parms.dropout_rate)

tokenizer = BertTokenizer.from_pretrained(config.pretrained_bert_model)


def create_dataframe(path, num_examples):
    df = pd.read_csv(path)
    df.columns = [
        i.capitalize() for i in df.columns
        if i.lower() in ['document', 'summary']
    ]
    assert len(df.columns) == 2, 'column names should be document and summary'
    df = df[:num_examples]
    assert not df.isnull().any().any(), 'dataset contains NaNs'
    return (df["Document"].values, df["Summary"].values)
Example #22
0
def load_tokenizer(args):
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    tokenizer.add_special_tokens({"additional_special_tokens": ADDITIONAL_SPECIAL_TOKENS})
    return tokenizer
Example #23
0
    "data_cache_dir": "/home/ubuntu/likun/huggingface_dataset",
    "train_size": 500,
    "val_size": 30,
    "test_size": 50,
    "max_length": 128,
    "shuffle": True,
}
pre_trained_model_name = 'bert-google-uncase-base'
run_name = "zero-shot-metric-learning-benchmark-topic-medium-changelabel"
# pre_trained_model_name = 'roberta-base'
logger.critical("Build pre-trained model {}".format(pre_trained_model_name))
base_pre_trained_model_path = '/home/ubuntu/likun/nlp_pretrained/{}'.format(
    pre_trained_model_name)
# trained_model_path = '/home/ubuntu/likun/nlp_save_kernels/zero-shot-metric-learning-benchmark-topic-small'
# tokenizer = AutoTokenizer.from_pretrained(trained_model_path)
tokenizer = BertTokenizer.from_pretrained(base_pre_trained_model_path)

from datasets.features import ClassLabel
from datasets.features import Features
yahoo_zsl_path = '/home/ubuntu/likun/nlp_data/zsl/BenchmarkingZeroShot/topic_yahoo'
fea = Features({
    "text":
    datasets.Value("string"),
    "label":
    ClassLabel(names_file=os.path.join(yahoo_zsl_path, 'classes.txt'))
})

download_config = datasets.DownloadConfig()
download_config.max_retries = 20
dataset = datasets.load_dataset('csv',
                                data_files={
Example #24
0
    def get_tokenizer(self):
        return BertTokenizer.from_pretrained('bert-base-uncased',
                                             cache_dir=HF_CACHE_DIR)
Example #25
0
def main():
    config = {
        'overwrite': True,
        'data_path': '../tcdata/nlp_round2_data',
        'data_cache_path':
        '../user_data/tmp_data/finetune_output/nezha_ngram_cv7_processed/data.pkl',
        'output_path':
        '../user_data/tmp_data/finetune_output/nezha_ngram_cv7_results',
        'model_path':
        '../user_data/tmp_data/pretrain_output/nezha_ngram_output/best_model_ckpt',
        'best_model_path': '',
        'batch_size': 64,  # 64
        'num_epochs': 3,  # 3
        'num_folds': 5,  # 7
        'cv': 'cv-',
        'max_seq_len': 32,
        'learning_rate': 2e-5,
        'eps': 0.1,
        'alpha': 0.3,
        'adv': 'fgm',
        'warmup_ratio': 0.1,
        'weight_decay': 0.01,
        'device': 'cuda:2',
        'logging_step': 500,  # 500
        'ema_start_step': 1500,  # 1500
        'ema_start': False,
        'seed': 20200409
    }

    if not torch.cuda.is_available():
        config['device'] = 'cpu'
    else:
        config['n_gpus'] = torch.cuda.device_count()
        config['batch_size'] *= config['n_gpus']

    if not os.path.exists(config['output_path']):
        os.makedirs((config['output_path']))

    tokenizer = BertTokenizer.from_pretrained(
        '../user_data/tmp_data/pretrain_output/nezha_ngram_output'
        '/nezha_ngram_tokenizer_and_config/vocab.txt')
    if not os.path.exists(config['data_cache_path']) or config['overwrite']:
        read_data(config, tokenizer, debug=False)

    collate_fn, test_dataloader, train_dev_data, eval_train_dataloader = load_data(
        config, tokenizer)

    # test_pred_df = pd.DataFrame(data={'id': range(len(test_dataloader.dataset) // 2),
    #                                   'fold1-probs': [0.0] * (len(test_dataloader.dataset) // 2),
    #                                   'fold1-logits0': [0.0] * (len(test_dataloader.dataset) // 2),
    #                                   'fold1-logits1': [0.0] * (len(test_dataloader.dataset) // 2),
    #                                   })
    # train_pred_df = pd.DataFrame(data={'id': range(len(train_dev_data['input_ids'])),
    #                                    'fold1-probs': [0.0] * len(train_dev_data['input_ids']),
    #                                    'fold1-logits0': [0.0] * len(train_dev_data['input_ids']),
    #                                    'fold1-logits1': [0.0] * len(train_dev_data['input_ids'])}
    #                              )

    fold = 0
    skf = StratifiedKFold(shuffle=True,
                          n_splits=config['num_folds'],
                          random_state=config['seed'])
    for train_idxs, dev_idxs in skf.split(X=train_dev_data['input_ids'],
                                          y=train_dev_data['labels']):
        fold += 1
        config['ema_start'] = False

        dev_dataloader, train_dataloader = load_cv_data(
            collate_fn, config, dev_idxs, train_dev_data, train_idxs, None,
            None)
        seed_everyone(config['seed'])

        if not config['best_model_path']:
            best_model_path = train(config, train_dataloader, dev_dataloader,
                                    fold)
        else:
            best_model_path = config['best_model_path']

        if best_model_path:
            print('\n>>> Loading best model ...')
            model = NeZhaForSequenceClassification.from_pretrained(
                best_model_path)
            model.to(config['device'])
            del model

        # train_pred_probs, train_pred_logits = predict(config, model, eval_train_dataloader, mode='valid')
        # train_pred_df.loc[:, f'fold{fold}-probs'] = train_pred_probs
        # train_pred_df.loc[:, f'fold{fold}-logits0'] = train_pred_logits[:, 0]
        # train_pred_df.loc[:, f'fold{fold}-logits1'] = train_pred_logits[:, 1]
        # test_pred_probs, test_pred_logits = predict(config, model, test_dataloader, mode='test')
        # test_pred_df.loc[:, f'fold{fold}-probs'] = test_pred_probs
        # test_pred_df.loc[:, f'fold{fold}-logits0'] = test_pred_logits[:, 0]
        # test_pred_df.loc[:, f'fold{fold}-logits1'] = test_pred_logits[:, 1]
        del train_dataloader, dev_dataloader
        gc.collect()
        torch.cuda.empty_cache()
Example #26
0
        if self.phase != "test":
            y = self.categories[idx]
            y = self.idx2onehot(y)
            X["y"] = torch.LongTensor(y)
        return X

    def pad(self, arr):
        return arr[:self.max_len] + [self.pad_token_id
                                     ] * (self.max_len - len(arr))

    def idx2onehot(self, y):
        onehot = np.zeros(self.n_outputs)
        onehot[y] = 1
        return onehot


if __name__ == "__main__":
    from transformers import BertTokenizer
    from pprint import pprint
    _root = "data/corona_nlp"
    _phases = ["train", "test", "dev"]
    _tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    _max_len = 25
    for _phase in _phases:
        dataset = CustomDataset(_root, _phase, _tokenizer, _max_len)

        for res in dataset:
            pprint(res)
            print(res['y'].size())
            break
Example #27
0
def train_ner_model(
    model_config_path, data_dir,
    logger_file_dir=None, labels_file=None
):
    # loading model config path
    if os.path.exists(model_config_path):
        with open(model_config_path, "r", encoding="utf-8") as reader:
            text = reader.read()
        model_config_dict = json.loads(text)
    else:
        print("model_config_path doesn't exist.")
        sys.exit()

    if os.path.exists(model_config_dict["final_model_saving_dir"]):
        output_model_file = os.path.join(
            model_config_dict["final_model_saving_dir"], "pytorch_model.bin")
        output_config_file = os.path.join(
            model_config_dict["final_model_saving_dir"], "bert_config.json")
        output_vocab_file = os.path.join(
            model_config_dict["final_model_saving_dir"], "vocab.txt")
    else:
        print("model_saving_dir doesn't exist.")
        sys.exit()

    if os.path.exists(logger_file_dir):
        logging.basicConfig(
            filename=os.path.join(logger_file_dir, "logs.txt"),
            filemode="w"
        )
        logger = logging.getLogger()
        logger.setLevel(logging.DEBUG)
    else:
        print("logger_file_path doesn't exist.")
        sys.exit()

    if os.path.exists(labels_file):
        print("Labels file exist")
    else:
        print("labels_file doesn't exist.")
        sys.exit()

    logger.info("Training configurations are given below ::")
    for key, val in model_config_dict.items():
        logger.info("{} == {}".format(key, val))

    logger.info("Started training model :::::::::::::::::::::")

    bert_config = BertConfig.from_json_file(model_config_dict["bert_config_path"])
    bert_tokenizer = BertTokenizer.from_pretrained(
        model_config_dict["bert_vocab_path"],
        config=bert_config,
        do_lower_case=model_config_dict["tokenizer_do_lower_case"]
    )
    # saving config and tokenizer vocab
    bert_tokenizer.save_vocabulary(output_vocab_file)
    bert_config.to_json_file(output_config_file)

    labels = get_labels(labels_file)
    logger.info("Labels for Ner are: {}".format(labels))

    label2idx = {l: i for i, l in enumerate(labels)}

    # preparing training data
    train_dataset = load_and_cache_examples(
        data_dir=data_dir,
        max_seq_length=model_config_dict["max_seq_length"],
        tokenizer=bert_tokenizer,
        label_map=label2idx,
        pad_token_label_id=label2idx["O"],
        mode="train", logger=logger
    )
    # preparing eval data
    eval_dataset = load_and_cache_examples(
        data_dir=data_dir,
        max_seq_length=model_config_dict["max_seq_length"],
        tokenizer=bert_tokenizer,
        label_map=label2idx,
        pad_token_label_id=label2idx["O"],
        mode="dev", logger=logger
    )
    logger.info("Training data and eval data loaded successfully.")

    if model_config_dict["model_type"] == "crf":
        model = BertCrfForNER.from_pretrained(
            model_config_dict["bert_model_path"],
            config=bert_config,
            pad_idx=bert_tokenizer.pad_token_id,
            sep_idx=bert_tokenizer.sep_token_id,
            num_labels=len(labels)
        )

    logger.info("{} model loaded successfully.".format(model_config_dict["model_type"]))

    # checking whether to finetune or not
    if model_config_dict["finetune"]:
        logger.info("Finetuning BERT.")
    else:
        for param in list(model.bert.parameters()):
            param.requires_grad = False
        logger.info("Freezing BERT's weights.")

    # preparing optimizer and scheduler
    # apply weight decay only to parameters that are not biases or LayerNorm weights
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p
                for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": model_config_dict.get("weight_decay", 0.01)
        },
        {
            "params": [
                p
                for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0
        }
    ]
    # total optimizer steps
    t_total = int((len(train_dataset) / model_config_dict["train_batch_size"]) * model_config_dict["num_epochs"])
    logger.info("t_total : {}".format(t_total))

    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=model_config_dict["learning_rate"],
        eps=model_config_dict["epsilon"]
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=model_config_dict["warmup_steps"],
        num_training_steps=t_total
    )
    logger.info("{}".format(count_parameters))

    model.to(DEVICE)

    best_eval_f1 = 0.0
    for epoch in range(model_config_dict["num_epochs"]):
        train_result = train_epoch(
            model=model, dataset=train_dataset,
            batch_size=model_config_dict["train_batch_size"],
            label_map=label2idx,
            max_grad_norm=model_config_dict["max_grad_norm"],
            optimizer=optimizer, scheduler=scheduler, device=DEVICE,
            sep_token_id=bert_tokenizer.sep_token_id
        )
        eval_result = eval_epoch(
            model=model, dataset=eval_dataset,
            batch_size=model_config_dict["validation_batch_size"],
            label_map=label2idx, device=DEVICE, sep_token_id=bert_tokenizer.sep_token_id,
            give_lists=False
        )
        print(f'Epoch: {epoch + 1}')
        print(f'Train Loss: {train_result["loss"]: .4f}| Train F1: {train_result["f1"]: .4f}')
        print(f'Eval Loss: {eval_result["loss"]: .4f}| Eval F1: {eval_result["f1"]: .4f}')
        logger.info(f'Epoch: {epoch + 1}')
        logger.info(f'Train Loss: {train_result["loss"]: .4f}| Train F1: {train_result["f1"]: .4f}')
        logger.info(f'Eval Loss: {eval_result["loss"]: .4f}| Eval F1: {eval_result["f1"]: .4f}')

        if best_eval_f1 < eval_result["f1"]:
            best_eval_f1 = eval_result["f1"]
            # saving model to disk
            model_to_save = model.module if hasattr(model, "module") else model
            torch.save(model_to_save.state_dict(), output_model_file)
            print("Saved a better model.")
            logger.info("Saved a beter model")
            del model_to_save

    # loading the best model and test results
    model.load_state_dict(torch.load(output_model_file))
    logger.info("Loaded best model successfully.")

    test_dataset, test_examples, test_features = load_and_cache_examples(
        data_dir=data_dir,
        max_seq_length=model_config_dict["max_seq_length"],
        tokenizer=bert_tokenizer,
        label_map=label2idx,
        pad_token_label_id=label2idx["O"],
        mode="test", logger=logger,
        return_features_and_examples=True
    )
    logger.info("Test data loaded successfully.")

    test_label_predictions = predictions_from_model(
        model=model, tokenizer=bert_tokenizer,
        dataset=test_dataset,
        batch_size=model_config_dict["validation_batch_size"],
        label2idx=label2idx, device=DEVICE
    )
    # restructure test_label_predictions with real labels
    aligned_predicted_labels, true_labels = align_predicted_labels_with_original_sentence_tokens(
        test_label_predictions, test_examples, test_features, max_seq_length=model_config_dict["max_seq_length"],
        num_special_tokens=model_config_dict["num_special_tokens"]
    )
    print("Test Results classification report...")
    print(classification_report(true_labels, aligned_predicted_labels))
    return aligned_predicted_labels, true_labels
Example #28
0
Note: I believe this model was trained on version 1 of SQuAD, since it's not outputting whether the question is "impossible" to answer from the text (which is part of the task in v2 of SQuAD).
"""

from transformers import BertForQuestionAnswering

model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

"""Load the tokenizer as well. 

Side note: Apparently the vocabulary of this model is identical to the one in bert-base-uncased, so you can load the tokenizer from `bert-base-uncased` and that works just as well (a quick check is sketched after the loading code below).
"""

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
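
"""As a quick sanity check of the side note above (a sketch, not part of the original notebook), we can load the `bert-base-uncased` tokenizer and compare the two vocabularies directly."""

base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print(tokenizer.get_vocab() == base_tokenizer.get_vocab())  # True if the vocabularies are identical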

"""## 3. Ask a Question

Now we're ready to feed in an example!

A QA example consists of a question and a passage of text containing the answer to that question.

Let's try an example using the text in this tutorial!
"""

question = "How many parameters does BERT-large have?"
answer_text = "BERT-large is really big... it has 24-layers and an embedding size of 1,024, for a total of 340M parameters! Altogether it is 1.34GB, so expect it to take a couple minutes to download to your Colab instance."

"""We'll need to run the BERT tokenizer against both the `question` and the `answer_text`. To feed these into BERT, we actually concatenate them together and place the special [SEP] token in between."""
Example #29
0
    os.path.join(
        args.data_path,
        "train_toy.csv" if args.toy in ["True", "toy"] else "train.csv"))
test_df = pd.read_csv(
    os.path.join(
        args.data_path,
        "test_toy.csv" if args.toy in ["True", "toy"] else "test.csv"))
submission = pd.read_csv(
    os.path.join(
        args.data_path,
        "sample_submission_toy.csv"
        if args.toy in ["True", "toy"] else "sample_submission.csv",
    ))

tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                          do_lower_case=("uncased"
                                                         in args.bert_model))

test_set = get_test_set(args, test_df, tokenizer)
test_loader = DataLoader(
    test_set,
    batch_sampler=BucketingSampler(test_set.lengths,
                                   batch_size=args.batch_size,
                                   maxlen=args.max_sequence_length),
    collate_fn=make_collate_fn(),
)

for fold, train_set, valid_set, train_fold_df, val_fold_df in cross_validation_split(
        args, train_df, tokenizer):

    print()
Example #30
0
def load_model_and_tokenizer():
    global model_path
    model = load_model(model_path).cpu()
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
    tokenizer.add_special_tokens({"additional_special_tokens": ['[BLANK]']})
    return model, tokenizer