) == 1, "the condition: diff. tags -> same value ?"
                    for t in _tags:
                        tags_values[TAGs.index(t)].append(_values[0])

                elif len(_tags) < len(_values):
                    assert len(
                        _tags
                    ) == 1, "the condition: diff. values -> same tag ?"
                    for v in _values:
                        tags_values[TAGs.index(_tags[0])].append(v)

        print("\r{}:[{}/{}]".format(mode, f_idx, len(files)), end='   \r')
# tags_values groups the values that share the same tag
print("Finished collecting")

tokenizer = BertJapaneseTokenizer.from_pretrained(pretrained_weights,
                                                  do_lower_case=True)

config = BertConfig.from_pretrained(pretrained_weights,
                                    output_hidden_states=True)

fine_tune_model = Model(config)
fine_tune_model.load(FINE_TUNE_BERT_MODEL_PATH)

model = fine_tune_model.bert_embedd

#config = BertConfig.from_pretrained(pretrained_weights, output_hidden_states=True)
#model = BertModel.from_pretrained(pretrained_weights, config=config)
#model.load_state_dict(torch.load(FINE_TUNE_BERT_MODEL_PATH))
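
# Sketch only: given a BERT-style encoder configured with output_hidden_states=True
# (either fine_tune_model.bert_embedd above or the commented-out BertModel
# alternative), this is one way a sentence vector could be pulled out of it.
# The [CLS]-of-last-layer pooling is illustrative, not the method used elsewhere
# in this script.
import torch

def embed_sentence(bert_model, text, max_length=128):
    enc = tokenizer.encode_plus(text, max_length=max_length,
                                pad_to_max_length=True, return_tensors='pt')
    bert_model.eval()
    with torch.no_grad():
        outputs = bert_model(input_ids=enc['input_ids'],
                             attention_mask=enc['attention_mask'])
    hidden_states = outputs[2]      # embedding output + one tensor per layer
    return hidden_states[-1][0, 0]  # [CLS] vector from the last layer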

Clusters = []
ALL_LAYERS = 12
def main():
    # model files check and download
    check_and_download_models(WEIGHT_PATH, MODEL_PATH, REMOTE_PATH)

    if args.arch == 'bert-base-cased' or args.arch == 'bert-base-uncased':
        tokenizer = BertTokenizer.from_pretrained(args.arch)
    else:
        tokenizer = BertJapaneseTokenizer.from_pretrained("cl-tohoku/" +
                                                          args.arch)

    net = ailia.Net(MODEL_PATH, WEIGHT_PATH, env_id=args.env_id)
    net.set_input_blob_shape((1, PADDING_LEN),
                             net.find_blob_index_by_name("token_type_ids"))
    net.set_input_blob_shape((1, PADDING_LEN),
                             net.find_blob_index_by_name("input_ids"))
    net.set_input_blob_shape((1, PADDING_LEN),
                             net.find_blob_index_by_name("attention_mask"))

    with codecs.open(args.input, 'r', 'utf-8', 'ignore') as f:
        s = f.readlines()

    for text in s:
        tokenized_text = tokenizer.tokenize(text)
        original_text_len = len(tokenized_text)

        # if not args.onnx:
        for j in range(len(tokenized_text), PADDING_LEN):
            tokenized_text.append('[PAD]')

        score = numpy.zeros((len(tokenized_text)))
        suggest = {}

        for i in range(0, len(tokenized_text)):
            masked_index = i

            if tokenized_text[masked_index] == '[PAD]':
                continue

            tokenized_text_saved = tokenized_text[masked_index]

            tokenized_text[masked_index] = '[MASK]'

            outputs = inference(net, tokenizer, tokenized_text, masked_index,
                                original_text_len)

            target_ids = tokenizer.convert_tokens_to_ids(
                [tokenized_text_saved])
            index = target_ids[0]
            score[masked_index] = outputs[0][0, masked_index][index]

            predictions = torch.from_numpy(outputs[0][0, masked_index]).topk(1)
            index = predictions.indices[0].item()
            top_token = tokenizer.convert_ids_to_tokens([index])[0]
            suggest[masked_index] = top_token

            tokenized_text[masked_index] = tokenized_text_saved

        fine_text = colorize(tokenized_text, score, suggest)
        print(fine_text)

    print('Script finished successfully.')
Example #3
from helper.image_helper import *

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

parser = argparse.ArgumentParser()
parser.add_argument('--path')
parser.add_argument('--log', action='store_true')
args = parser.parse_args()
path = args.path
log = args.log
task = path.split('/')[1]
text_model, image_model, fusion_method = path.split('/')[2].split('_')[:3]

tokenizer = BertJapaneseTokenizer.from_pretrained('bert-base-japanese-whole-word-masking')

def set_log():
    save_dir = os.path.join('log', task)
    os.makedirs(save_dir, exist_ok=True)
    log_format = '%(message)s'
    filename = os.path.join(
        save_dir, '{}_{}_{}_{}.log'.format(
            text_model, image_model, fusion_method,
            datetime.now().strftime('%Y%m%d%H%M')))
    logging.basicConfig(filename=filename, level=logging.DEBUG,
                        format=log_format)

def get_config(file_path):
    with open(file_path, 'r') as json_file:
        json_object = json.load(json_file)
    config = AttrDict(json_object)
    return config
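
# Minimal, self-contained check of get_config (the config keys here are
# illustrative only): write a tiny JSON file and read it back as an AttrDict,
# which allows attribute-style access.
import json
import tempfile

with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as tmp:
    json.dump({'batch_size': 32, 'lr': 2e-5}, tmp)
    tmp_path = tmp.name

sample_config = get_config(tmp_path)
print(sample_config.batch_size, sample_config.lr)  # 32 2e-05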
Example #4
def main():
    # model files check and download
    check_and_download_models(WEIGHT_PATH, MODEL_PATH, REMOTE_PATH)

    if args.arch == 'bert-base-cased' or args.arch == 'bert-base-uncased':
        tokenizer = BertTokenizer.from_pretrained(args.arch)
    else:
        tokenizer = BertJapaneseTokenizer.from_pretrained(
            'cl-tohoku/' + 'bert-base-japanese-whole-word-masking')
    text = args.input
    logger.info("Input text : " + text)

    tokenized_text = tokenizer.tokenize(text)
    logger.info("Tokenized text : " + str(tokenized_text))

    masked_index = -1
    for i in range(0, len(tokenized_text)):
        if tokenized_text[i] == '[MASK]':
            masked_index = i
            break
    if masked_index == -1:
        logger.info("[MASK] not found")
        sys.exit(1)

    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    logger.info("Indexed tokens : " + str(indexed_tokens))

    ailia_model = ailia.Net(MODEL_PATH, WEIGHT_PATH, env_id=args.env_id)

    indexed_tokens = numpy.array([indexed_tokens])  # add a batch dimension to match the (1, seq_len) masks below
    token_type_ids = numpy.zeros((1, len(tokenized_text)))
    attention_mask = numpy.zeros((1, len(tokenized_text)))
    attention_mask[:, 0:len(tokenized_text)] = 1

    inputs_onnx = {
        "token_type_ids": token_type_ids,
        "input_ids": indexed_tokens,
        "attention_mask": attention_mask,
    }

    logger.info("Predicting...")
    if args.benchmark:
        logger.info('BENCHMARK mode')
        for i in range(5):
            start = int(round(time.time() * 1000))
            outputs = ailia_model.predict(inputs_onnx)
            end = int(round(time.time() * 1000))
            logger.info("\tailia processing time {} ms".format(end - start))
    else:
        outputs = ailia_model.predict(inputs_onnx)

    predictions = torch.from_numpy(outputs[0][0,
                                              masked_index]).topk(NUM_PREDICT)

    logger.info("Predictions : ")
    for i, index_t in enumerate(predictions.indices):
        index = index_t.item()
        token = tokenizer.convert_ids_to_tokens([index])[0]
        logger.info(str(i) + " " + str(token))

    logger.info('Script finished successfully.')
import torchvision
from transformers import (
    BertConfig,
    BertJapaneseTokenizer,
    BertForMultipleChoice,
    AdamW,
    get_linear_schedule_with_warmup,
)
from textformatting import ssplit
import jaconv
from pyknp import Juman

config = BertConfig.from_json_file(
    "../PretrainedModel/KyotoUniv/bert_config.json")
tokenizer = BertJapaneseTokenizer.from_pretrained(
    "../PretrainedModel/KyotoUniv/vocab.txt",
    do_lower_case=False,
    do_basic_tokenize=False)

#Juman++
juman = Juman(jumanpp=True)

#Object detection
import cv2
import detectron2
from detectron2.utils.logger import setup_logger
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from PIL import Image
from PIL import ImageFile
Example #6
def train():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="multi-bert",
                        help="Path, url or short name of the model")
    parser.add_argument("--num_candidates",
                        type=int,
                        default=2,
                        help="Number of candidates for training")
    parser.add_argument("--max_turns",
                        type=int,
                        default=3,
                        help="Number of previous turns to keep in history")
    parser.add_argument("--train_batch_size",
                        type=int,
                        default=4,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size",
                        type=int,
                        default=4,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr",
                        type=float,
                        default=6.25e-5,
                        help="Learning rate")
    parser.add_argument("--lm_coef",
                        type=float,
                        default=1.0,
                        help="LM loss coefficient")
    parser.add_argument("--max_norm",
                        type=float,
                        default=1.0,
                        help="Clipping gradient norm")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=3,
                        help="Number of training epochs")
    parser.add_argument("--personality_permutations",
                        type=int,
                        default=1,
                        help="Number of permutations of personality sentences")
    parser.add_argument(
        "--eval_before_start",
        action='store_true',
        help="If true start with a first evaluation before training")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument(
        "--fp16",
        type=str,
        default="",
        help=
        "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="Local rank for distributed training (-1: not distributed)")
    parser.add_argument("--random_init",
                        action='store_true',
                        help="If true random initailze the model")
    parser.add_argument(
        "--train_lang",
        type=str,
        default="",
        help="Train a monolingual model; default: multilingual model")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", args.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    # Model
    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    model_path = 'bert-base-multilingual-cased'
    if args.train_lang in ["En", "It", "Jp",
                           "Zh"]:  # for Fr Ko Id we use MBERT
        model_path = LANG_2_MODEL[args.train_lang]

    tokenizer = BertTokenizer.from_pretrained(model_path)
    if args.train_lang == "Jp":
        tokenizer = BertJapaneseTokenizer.from_pretrained(model_path)
    model = Model2Model.from_pretrained(model_path)

    # tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    # if args.random_init:
    #     config = BertConfig.from_pretrained('bert-base-multilingual-cased')
    #     config.is_decoder = True
    #     bert_decoder = BertForMaskedLM(config)
    #     model = Model2Model.from_pretrained('bert-base-multilingual-cased', decoder_model=bert_decoder)
    # else:
    #     model = Model2Model.from_pretrained('bert-base-multilingual-cased')
    #     model_dict = model.state_dict()
    #     # initialize crossattention with selfattention
    #     model_dict.update({ name: model_dict[name.replace("crossattention", "attention")] for name in model_dict if "crossattention" in name })
    #     model.load_state_dict(model_dict)
    model.to(args.device)

    # Add special tokens if they are not already added
    add_special_tokens_(model, tokenizer)
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank,
                                        find_unused_parameters=True)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(batch[input_name].to(args.device)
                      for input_name in MODEL_INPUTS)

        #batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        encoder_mask, decoder_mask, encoder_input_ids, decoder_input_ids, lm_labels, token_type_ids, decoder_lang_id = batch
        model_kwargs = {
            "encoder_token_type_ids": token_type_ids,
            "decoder_token_type_ids": decoder_lang_id,
            "encoder_attention_mask": encoder_mask,
            "decoder_attention_mask": decoder_mask,
            "decoder_lm_labels": lm_labels
        }
        lm_loss, prediction_scores, *_ = model(
            encoder_input_ids=encoder_input_ids,
            decoder_input_ids=decoder_input_ids,
            **model_kwargs)

        loss = (lm_loss) / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(batch[input_name].to(args.device)
                          for input_name in MODEL_INPUTS)
            #batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
            encoder_mask, decoder_mask, encoder_input_ids, decoder_input_ids, lm_labels, token_type_ids, decoder_lang_id = batch
            logger.info(tokenizer.decode(encoder_input_ids[0, :].tolist()))
            # if we don't send labels to the model, it doesn't return losses
            model_kwargs = {
                "encoder_token_type_ids": token_type_ids,
                "decoder_token_type_ids": decoder_lang_id,
                "encoder_attention_mask": encoder_mask,
                "decoder_attention_mask": decoder_mask
            }

            lm_logits, *_ = model(encoder_input_ids=encoder_input_ids,
                                  decoder_input_ids=decoder_input_ids,
                                  **model_kwargs)
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, ), (lm_labels_flat_shifted, )

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr),
                                 (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-1),
             output_transform=lambda x: (x[0][0], x[1][0]))
    }
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.model_checkpoint)
        log_dir += "_lang_id"
        if args.random_init:
            log_dir = log_dir + "_random_init"
        tb_logger = TensorboardLogger(log_dir)

        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(
                                                       metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=3)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)
             })  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        if args.distributed:
            getattr(model.module, 'encoder', model).config.to_json_file(
                os.path.join(log_dir, CONFIG_NAME)
            )  # the config for encoder and decoder should be the same
        else:
            getattr(model, 'encoder', model).config.to_json_file(
                os.path.join(log_dir, CONFIG_NAME)
            )  # the config for encoder and decoder should be the same
        tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(
            checkpoint_handler._saved[-1][1][-1],
            os.path.join(log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
        dl = DataLoader(ds, batch_size=self.batch_size, shuffle=False)
        preds = []
        for batch in tqdm(dl):
            input_ids, attention_mask = batch["input_ids"], batch["attention_mask"]
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
            output = bert(input_ids=input_ids, attention_mask=attention_mask)
            output = output[0]
            output = output.to(cpu)
            preds.append(output.detach().clone().numpy())
        return np.concatenate(preds, axis=0)


MODEL_NAME = "bert-base-japanese-whole-word-masking"
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)
config = BertConfig.from_pretrained(MODEL_NAME)
bert = BertModel.from_pretrained(MODEL_NAME)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cpu = torch.device('cpu')
print(f"使用デバイス: {device}")
bert.to(device)


def tokenize(text, max_length=128):
    id_dict = tokenizer.encode_plus(str(text),
                                    max_length=max_length,
                                    pad_to_max_length=True,
                                    truncation=True)
    return id_dict["input_ids"], id_dict["attention_mask"]
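
# Usage sketch for the helper above: encode one sentence and check the padded
# length and the number of real (non-padding) tokens. The sample text is
# illustrative only.
sample_ids, sample_mask = tokenize("これはテスト用の文です。", max_length=32)
print(len(sample_ids), sum(sample_mask))  # 32, <number of real tokens>
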
    def __init__(self, df, max_length=128,
                 model_name="bert-base-japanese-whole-word-masking",
                 transforms=None):
        self.max_length = max_length
        self.df = df
        self.model_name = model_name
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)
        self.transforms = transforms
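
    # The rest of this Dataset class is not shown in the snippet. Below is a
    # minimal sketch of the methods such a dataset typically needs; the column
    # names ('text', 'label') and the tensor layout are assumptions, not taken
    # from the original code.
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        enc = self.tokenizer.encode_plus(str(row['text']),
                                         max_length=self.max_length,
                                         pad_to_max_length=True,
                                         truncation=True)
        item = {
            "input_ids": torch.tensor(enc["input_ids"]),
            "attention_mask": torch.tensor(enc["attention_mask"]),
        }
        if 'label' in row:
            item["label"] = torch.tensor(row['label'])
        return item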
Example #9
def build_tokenizer() -> BertJapaneseTokenizer:
    tokenizer = BertJapaneseTokenizer.from_pretrained(
        BERT_PRETRAINED_MODEL_NAME)
    return tokenizer
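
# Usage sketch: BERT_PRETRAINED_MODEL_NAME is defined elsewhere in the original
# module (assumed to name a Japanese BERT checkpoint, e.g. a cl-tohoku model);
# this only shows a round trip through the returned tokenizer.
tokenizer = build_tokenizer()
token_ids = tokenizer.encode("日本語の文をトークン化します。")
print(tokenizer.convert_ids_to_tokens(token_ids))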
Example #10
from torch.utils.data import TensorDataset, random_split
# Prepare the tokenizer
from transformers import BertJapaneseTokenizer
from my_module import path_manager as pm
import torch
import numpy as np
import pandas as pd
import statistics
import sys
# Fix the seed so that training is reproducible.
torch.manual_seed(0)

# Load the pretrained model
model_name = str(pm.get_abs_path(
    __file__, 'resources/BERT/BERT-base_mecab-ipadic-bpe-32k_whole-word-mask'))
# Set up the tokenizer
tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)
# Configure the CPU/GPU environment
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Threshold that decides which sentences are used for training
boundary = 0.2
# Load the labeled data (review sentences annotated with polarity scores)
df = pd.read_csv('sent_senti.csv')
# Remove reviews with no polarity.
df = df[df['score'] != 0]
# Keep rows whose absolute sentiment score is at or above the threshold
df = df[abs(df['score']) >= boundary]
# Add a label column: 1 for positive sentiment scores, 0 for negative
df['label'] = df['score'].apply(lambda x: 1 if 0 < x else 0)
# Add a column with the number of tokens in each sentence
df['length'] = df['content'].apply(lambda x: len(tokenizer.tokenize(x)))
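
# Sketch only: one way the filtered sentences could be packed into a
# TensorDataset and split with random_split (imported above). The 80/20 split
# and max_length=128 are illustrative, and a transformers version with callable
# tokenizers is assumed (otherwise use tokenizer.batch_encode_plus).
encoded = tokenizer(df['content'].tolist(), padding='max_length',
                    truncation=True, max_length=128, return_tensors='pt')
labels = torch.tensor(df['label'].tolist())
dataset = TensorDataset(encoded['input_ids'], encoded['attention_mask'], labels)
n_train = int(0.8 * len(dataset))
train_ds, valid_ds = random_split(dataset, [n_train, len(dataset) - n_train])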
Example #11
def main(args):
    train_input_dir: str = args.train_input_dir
    dev_input_dir: str = args.dev_input_dir
    train_qohs_filepath: str = args.train_qohs_filepath
    dev_qohs_filepath: str = args.dev_qohs_filepath
    bert_model_dir: str = args.bert_model_dir
    imagebert_checkpoint_filepath: str = args.imagebert_checkpoint_filepath
    roi_boxes_dir: str = args.roi_boxes_dir
    roi_features_dir: str = args.roi_features_dir
    max_num_rois: int = args.max_num_rois
    roi_features_dim: int = args.roi_features_dim
    train_batch_size: int = args.train_batch_size
    num_epochs: int = args.num_epochs
    lr: float = args.lr
    result_save_dir: str = args.result_save_dir
    train_logging_steps: int = args.train_logging_steps
    use_multi_gpus: bool = args.use_multi_gpus
    no_init_from_pretrained_bert: bool = args.no_init_from_pretrained_bert
    use_roi_seq_position: bool = args.use_roi_seq_position

    logger.info("Batch size: {}".format(train_batch_size))
    logger.info("Number of epochs: {}".format(num_epochs))
    logger.info("Learning rate: {}".format(lr))

    if use_roi_seq_position:
        logger.info("Using ascending values for the RoI sequence positions.")

    logger.info("{}から訓練用データセットを作成します。".format(train_input_dir))
    train_dataset=mf.create_dataset(train_input_dir,num_examples=-1,num_options=4)

    logger.info("{}からDev用データローダを作成します。".format(dev_input_dir))
    dev_dataset=mf.create_dataset(dev_input_dir,num_examples=-1,num_options=20)
    dev_dataloader=DataLoader(dev_dataset,batch_size=4,shuffle=False)

    logger.info("問題と選択肢ハッシュ値の辞書を作成します。")
    logger.info("train_qohs_filepath: {}\tdev_qohs_filepath: {}".format(train_qohs_filepath,dev_qohs_filepath))
    train_qohs=mf.load_question_option_hashes(train_qohs_filepath)
    dev_qohs=mf.load_question_option_hashes(dev_qohs_filepath)

    logger.info("RoI情報は以下のディレクトリから読み込まれます。")
    logger.info("roi_boxes_dir: {}\troi_features_dir: {}".format(roi_boxes_dir,roi_features_dir))
    if os.path.exists(roi_boxes_dir)==False:
        logger.warn("roi_boxes_dirは存在しません。")
    if os.path.exists(roi_features_dir)==False:
        logger.warn("roi_features_dirは存在しません。")

    logger.info("ImageBERTForMultipleChoiceモデルを作成します。")
    config=BertConfig.from_pretrained(bert_model_dir)
    classifier_model=ImageBertForMultipleChoice(config)
    if no_init_from_pretrained_bert:
        logger.info("ImageBERTのパラメータを事前学習済みのモデルから初期化しません。")
        tokenizer=BertJapaneseTokenizer.from_pretrained(bert_model_dir)
        classifier_model.imbert.set_sep_token_id(tokenizer.sep_token_id)
    else:
        classifier_model.setup_image_bert(bert_model_dir)
    classifier_model.to(device)

    if imagebert_checkpoint_filepath is not None:
        logger.info("{}からImageBERTのチェックポイントを読み込みます。".format(imagebert_checkpoint_filepath))
        parameters=torch.load(imagebert_checkpoint_filepath,map_location=device)
        parameters=fix_model_state_dict(parameters)
        classifier_model.load_state_dict(parameters,strict=False)

    if use_multi_gpus:
        logger.info("複数のGPUを使用します。")
        classifier_model=nn.DataParallel(classifier_model)
        torch.backends.cudnn.benchmark=True

    num_iterations = len(train_dataset) // train_batch_size
    total_steps = num_iterations * num_epochs

    optimizer = AdamW(classifier_model.parameters(), lr=lr, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )

    # Create the directory where results are saved.
    logger.info("Results will be saved to {}.".format(result_save_dir))
    os.makedirs(result_save_dir, exist_ok=True)

    # Training loop
    for epoch in range(num_epochs):
        logger.info("===== Epoch {}/{} =====".format(epoch, num_epochs - 1))

        # Training
        train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
        mean_loss=mf.train(
            classifier_model,
            train_qohs,
            roi_boxes_dir,
            roi_features_dir,
            optimizer,
            scheduler,
            train_dataloader,
            max_num_rois,
            roi_features_dim,
            use_roi_seq_position,
            device,
            logger,
            train_logging_steps
        )
        logger.info("訓練時の損失平均値: {}".format(mean_loss))

        #チェックポイントを保存する。
        checkpoint_filepath=os.path.join(result_save_dir,"checkpoint_{}.pt".format(epoch))
        torch.save(classifier_model.state_dict(),checkpoint_filepath)

        #評価
        result_save_filepath=os.path.join(result_save_dir,"result_eval_{}.txt".format(epoch))
        labels_save_filepath=os.path.join(result_save_dir,"labels_eval_{}.txt".format(epoch))
        logits_save_filepath=os.path.join(result_save_dir,"logits_eval_{}.txt".format(epoch))
        mf.evaluate_and_save_result(
            classifier_model,
            dev_qohs,
            roi_boxes_dir,
            roi_features_dir,
            dev_dataloader,
            max_num_rois,
            roi_features_dim,
            use_roi_seq_position,
            result_save_filepath,
            labels_save_filepath,
            logits_save_filepath,
            device,
            logger
        )
Example #12
def main():
    # omoshiro_tweets = load_omoshiro_tweets_from_json(tweet_filename)
    valid_dataset_size = 128
    batch_size = 8
    config_path = 'cl-tohoku/bert-base-japanese-whole-word-masking'

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    tokenizer = BertJapaneseTokenizer.from_pretrained(config_path)
    config = BertConfig.from_pretrained(config_path)

    pad_token_id = config.pad_token_id

    dataset = load_dataset(dataset_filename)

    train_dataset = dataset[:-valid_dataset_size]
    valid_dataset = dataset[-valid_dataset_size:]

    train_dataset, train_max_length = preprocess(train_dataset, tokenizer, config, batch_size=batch_size, device=device)
    train_dataset = normalize_dataset(train_dataset)
    valid_dataset, valid_max_length = preprocess(valid_dataset, tokenizer, config, batch_size=batch_size, device=device)

    valid_batches = mk_batches(dataset=valid_dataset, max_length=valid_max_length, batch_size=batch_size, device=device, pad=pad_token_id)

    print('Train dataset size is {}, Valid dataset size is {}'.format(len(train_dataset), len(valid_dataset)))

    model = BertPredictor(config_path=config_path, model_path=config_path)
    # model = Perceptron(vocab_size=tokenizer.vocab_size, hidden_size=128, device=device)

    model.to(device)

    criterion = torch.nn.CrossEntropyLoss(ignore_index=NEUTRAL)

    optimizer = optim.SGD(model.parameters(), lr=0.0001)

    for epoch in range(10):
        print('------ Epoch {} ------'.format(epoch + 1))

        train_batches = mk_batches(dataset=train_dataset, max_length=train_max_length, batch_size=batch_size, device=device, pad=pad_token_id)

        print('Train')
        model.train()
        accuracy = 0.0
        for i, batch in enumerate(train_batches):
            model.zero_grad()

            src = batch['src']
            tgt = batch['tgt']

            # output = [batch_size, vocab_size]
            output = model(src)

            loss = criterion(output, tgt)

            labels = torch.argmax(output, dim=-1)

            accuracy = ((labels == tgt).sum() + accuracy * i * batch_size) / ((i + 1) * batch_size) 

            loss.backward()
            optimizer.step()

            sys.stdout.write('\rLoss: {},  Accuracy: {}'.format(loss.item(), accuracy))
        
        # accuracy /= len(train_dataset)

        print('\nTrain accuracy {}'.format(accuracy))

        print('Validation')
        model.eval()
        with torch.no_grad():
            accuracy = 0.0
            for batch in valid_batches:
                src = batch['src']
                tgt = batch['tgt']

                output = model(src)

                labels = torch.argmax(output, dim=-1)

                accuracy += (labels == tgt).sum()
            
            accuracy /= valid_dataset_size
            print('Valid accuracy : {}'.format(accuracy))

    accuracy = 0.0
    for batch in valid_batches:
        accuracy += (JUN == batch['tgt']).sum()
    
    accuracy /= valid_dataset_size

    print('== JUN accuracy : {}'.format(accuracy))
Example #13
    @classmethod
    def from_pretrained(cls, model_name="BERT", normalizer="dict"):
        """Load a trained model.

        Loads a pretrained model and returns a Ner instance.
        If the pretrained model has not been cached yet, it is downloaded to
        ~/.cache. To change the download location, set the DEFAULT_CACHE_PATH
        environment variable.

        Args:
            model_name (str): Model name. Only BERT-based models are
                implemented in the current version.
            normalizer (str or callable): Normalization method, "dict" or "dnorm".

        Returns:
            Ner: a Ner instance.
        """

        assert model_name in ["BERT", "radiology"], \
            "Only BERT-based models are implemented"
        if model_name in ["BERT", "radiology"]:
            model_dir = DEFAULT_MODEL_PATH
            src_url = BERT_URL
            if model_name == "radiology":
                model_dir = RADIOLOGY_MODEL_PATH
                src_url = RADIOLOGY_URL
            base_model = BertModel.from_pretrained(
                "cl-tohoku/bert-base-japanese-char")
            basic_tokenizer = ListTokenizer()
            subword_tokenizer = BertJapaneseTokenizer.from_pretrained(
                "cl-tohoku/bert-base-japanese-char", do_word_tokenize=False)

        if not model_dir.parent.is_dir():
            logger.info("creating %s", str(model_dir.parent))
            model_dir.parent.mkdir()

        if not model_dir.is_dir():
            logger.info("creating %s", str(model_dir))
            model_dir.mkdir()

        if not (model_dir / "final.model").is_file():
            logger.info("not found %s", str(model_dir / "final.model"))
            download_fileobj(src_url + "/final.model",
                             model_dir / "final.model")
        if not (model_dir / "labels.txt").is_file():
            logger.info("not found %s", str(model_dir / "labels.txt"))
            download_fileobj(src_url + "/labels.txt", model_dir / "labels.txt")

        if isinstance(normalizer, str):
            if normalizer == "dnorm":
                logger.info("try %s normalizer", "dnorm")
                try:
                    from dnorm_j import DNorm

                    normalizer = DNorm.from_pretrained().normalize
                    logger.info("use %s normalizer", "dnorm")
                except Exception:
                    logger.warning("You did not install dnorm")
                    logger.warning("use %s normalizer", "Dict")
                    normalizer = DictNormalizer(DEFAULT_MEDNERJ_PATH /
                                                "norm_dic.csv").normalize
            else:
                logger.info("use %s normalizer", "Dict")
                normalizer = DictNormalizer(DEFAULT_MEDNERJ_PATH /
                                            "norm_dic.csv").normalize

        elif callable(normalizer):
            logger.info("use %s normalizer", "your original")
        else:
            raise TypeError("normalizer must be a str or a callable")

        ner = cls(
            base_model,
            basic_tokenizer,
            subword_tokenizer,
            model_dir=model_dir,
            normalizer=normalizer,
        )

        return ner
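
    # Usage sketch based on the docstring above (the import path is an
    # assumption about how the surrounding package exposes this class):
    #
    #     from medner_j import Ner
    #     ner = Ner.from_pretrained(model_name="BERT", normalizer="dict")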