) == 1, "the condition: diff. tags -> same value ?" for idx, t in enumerate(_tags): tags_values[TAGs.index(t)].append(_values[0]) elif len(_tags) < len(_values): assert len( _tags ) == 1, "the condition: diff. values -> same tag ?" for idx, v in enumerate(_values): tags_values[TAGs.index(_tags[0])].append(v) print("\r{}:[{}/{}]".format(mode, f_idx, len(files)), end=' \r') # tags_values: group the values having same tag print("Finish Collecting") tokenizer = BertJapaneseTokenizer.from_pretrained(pretrained_weights, do_lower_case=True) config = BertConfig.from_pretrained(pretrained_weights, output_hidden_states=True) fine_tune_model = Model(config) fine_tune_model.load(FINE_TUNE_BERT_MODEL_PATH) model = fine_tune_model.bert_embedd #config = BertConfig.from_pretrained(pretrained_weights, output_hidden_states=True) #model = BertModel.from_pretrained(pretrained_weights, config=config) #model.load_state_dict(torch.load(FINE_TUNE_BERT_MODEL_PATH)) Clusters = [] ALL_LAYERS = 12
def main():
    # model files check and download
    check_and_download_models(WEIGHT_PATH, MODEL_PATH, REMOTE_PATH)

    if args.arch == 'bert-base-cased' or args.arch == 'bert-base-uncased':
        tokenizer = BertTokenizer.from_pretrained(args.arch)
    else:
        tokenizer = BertJapaneseTokenizer.from_pretrained("cl-tohoku/" + args.arch)

    net = ailia.Net(MODEL_PATH, WEIGHT_PATH, env_id=args.env_id)
    net.set_input_blob_shape((1, PADDING_LEN), net.find_blob_index_by_name("token_type_ids"))
    net.set_input_blob_shape((1, PADDING_LEN), net.find_blob_index_by_name("input_ids"))
    net.set_input_blob_shape((1, PADDING_LEN), net.find_blob_index_by_name("attention_mask"))

    with codecs.open(args.input, 'r', 'utf-8', 'ignore') as f:
        s = f.readlines()

    for text in s:
        tokenized_text = tokenizer.tokenize(text)
        original_text_len = len(tokenized_text)
        # if not args.onnx:
        for j in range(len(tokenized_text), PADDING_LEN):
            tokenized_text.append('[PAD]')

        score = numpy.zeros((len(tokenized_text)))
        suggest = {}
        for i in range(0, len(tokenized_text)):
            masked_index = i
            if tokenized_text[masked_index] == '[PAD]':
                continue

            tokenized_text_saved = tokenized_text[masked_index]
            tokenized_text[masked_index] = '[MASK]'

            outputs = inference(net, tokenizer, tokenized_text, masked_index, original_text_len)

            target_ids = tokenizer.convert_tokens_to_ids([tokenized_text_saved])
            index = target_ids[0]
            score[masked_index] = outputs[0][0, masked_index][index]

            predictions = torch.from_numpy(outputs[0][0, masked_index]).topk(1)
            index = predictions.indices[0]
            top_token = tokenizer.convert_ids_to_tokens([index])[0]
            suggest[masked_index] = top_token

            tokenized_text[masked_index] = tokenized_text_saved

        fine_text = colorize(tokenized_text, score, suggest)
        print(fine_text)

    print('Script finished successfully.')
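
# --- Hedged sketch (assumption, not the original helper) ---
# main() above calls an inference() helper that is not shown in this excerpt. A plausible
# implementation, modeled on the dict-based ailia predict() call used in the next sample:
# build the three padded input arrays and run the network.
def inference(net, tokenizer, tokenized_text, masked_index, original_text_len):
    indexed_tokens = numpy.array(tokenizer.convert_tokens_to_ids(tokenized_text))
    token_type_ids = numpy.zeros((1, len(tokenized_text)))
    attention_mask = numpy.zeros((1, len(tokenized_text)))
    attention_mask[:, 0:original_text_len] = 1  # attend only to the real (non-PAD) tokens
    inputs_onnx = {
        "token_type_ids": token_type_ids,
        "input_ids": indexed_tokens,
        "attention_mask": attention_mask,
    }
    return net.predict(inputs_onnx)
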
from helper.image_helper import *

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

parser = argparse.ArgumentParser()
parser.add_argument('--path')
parser.add_argument('--log', action='store_true')
args = parser.parse_args()
path = args.path
log = args.log

task = path.split('/')[1]
text_model = path.split('/')[2].split('_')[0]
image_model = path.split('/')[2].split('_')[1]
fusion_method = path.split('/')[2].split('_')[2]

tokenizer = BertJapaneseTokenizer.from_pretrained('bert-base-japanese-whole-word-masking')


def set_log():
    save_dir = 'log/' + task
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    format = '%(message)s'
    filename = save_dir + '/' + text_model + '_' + image_model + '_' \
        + fusion_method + '_' + datetime.now().strftime('%Y%m%d%H%M') + '.log'
    logging.basicConfig(filename=filename, level=logging.DEBUG, format=format)


def get_config(file_path):
    config_file = file_path
    json_file = open(config_file, 'r')
    json_object = json.load(json_file)
    config = AttrDict(json_object)
def main():
    # model files check and download
    check_and_download_models(WEIGHT_PATH, MODEL_PATH, REMOTE_PATH)

    if args.arch == 'bert-base-cased' or args.arch == 'bert-base-uncased':
        tokenizer = BertTokenizer.from_pretrained(args.arch)
    else:
        tokenizer = BertJapaneseTokenizer.from_pretrained(
            'cl-tohoku/' + 'bert-base-japanese-whole-word-masking')

    text = args.input
    logger.info("Input text : " + text)
    tokenized_text = tokenizer.tokenize(text)
    logger.info("Tokenized text : " + str(tokenized_text))

    masked_index = -1
    for i in range(0, len(tokenized_text)):
        if tokenized_text[i] == '[MASK]':
            masked_index = i
            break
    if masked_index == -1:
        logger.info("[MASK] not found")
        sys.exit(1)

    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    logger.info("Indexed tokens : " + str(indexed_tokens))

    ailia_model = ailia.Net(MODEL_PATH, WEIGHT_PATH, env_id=args.env_id)

    indexed_tokens = numpy.array(indexed_tokens)
    token_type_ids = numpy.zeros((1, len(tokenized_text)))
    attention_mask = numpy.zeros((1, len(tokenized_text)))
    attention_mask[:, 0:len(tokenized_text)] = 1

    inputs_onnx = {
        "token_type_ids": token_type_ids,
        "input_ids": indexed_tokens,
        "attention_mask": attention_mask,
    }

    logger.info("Predicting...")
    if args.benchmark:
        logger.info('BENCHMARK mode')
        for i in range(5):
            start = int(round(time.time() * 1000))
            outputs = ailia_model.predict(inputs_onnx)
            end = int(round(time.time() * 1000))
            logger.info("\tailia processing time {} ms".format(end - start))
    else:
        outputs = ailia_model.predict(inputs_onnx)

    predictions = torch.from_numpy(outputs[0][0, masked_index]).topk(NUM_PREDICT)

    logger.info("Predictions : ")
    for i, index_t in enumerate(predictions.indices):
        index = index_t.item()
        token = tokenizer.convert_ids_to_tokens([index])[0]
        logger.info(str(i) + " " + str(token))

    logger.info('Script finished successfully.')
import torchvision
from transformers import (
    BertConfig,
    BertJapaneseTokenizer,
    BertForMultipleChoice,
    AdamW,
    get_linear_schedule_with_warmup,
)
from textformatting import ssplit
import jaconv
from pyknp import Juman

config = BertConfig.from_json_file(
    "../PretrainedModel/KyotoUniv/bert_config.json")
tokenizer = BertJapaneseTokenizer.from_pretrained(
    "../PretrainedModel/KyotoUniv/vocab.txt",
    do_lower_case=False,
    do_basic_tokenize=False)

# Juman++
juman = Juman(jumanpp=True)

# Object detection
import cv2
import detectron2
from detectron2.utils.logger import setup_logger
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg

from PIL import Image
from PIL import ImageFile
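
# --- Hedged example (assumption, not part of the original script) ---
# Because the tokenizer above is created with do_basic_tokenize=False, it expects text
# that has already been segmented by Juman++. A minimal sketch of that preprocessing;
# the helper name encode_with_juman is illustrative, not from the source.
def encode_with_juman(text: str):
    text = jaconv.h2z(text, ascii=True, digit=True)        # Juman++ prefers full-width characters
    result = juman.analysis(text)
    words = [mrph.midasi for mrph in result.mrph_list()]   # morpheme surface forms
    tokens = tokenizer.tokenize(" ".join(words))           # WordPiece over the pre-segmented text
    return tokenizer.convert_tokens_to_ids(tokens)
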
def train():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="multi-bert",
                        help="Path, url or short name of the model")
    parser.add_argument("--num_candidates", type=int, default=2,
                        help="Number of candidates for training")
    parser.add_argument("--max_turns", type=int, default=3,
                        help="Number of previous turns to keep in history")
    parser.add_argument("--train_batch_size", type=int, default=4,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=4,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate")
    parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs")
    parser.add_argument("--personality_permutations", type=int, default=1,
                        help="Number of permutations of personality sentences")
    parser.add_argument("--eval_before_start", action='store_true',
                        help="If true, start with a first evaluation before training")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="",
                        help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Local rank for distributed training (-1: not distributed)")
    parser.add_argument("--random_init", action='store_true',
                        help="If true, randomly initialize the model")
    parser.add_argument("--train_lang", type=str, default="",
                        help="Train a monolingual model; default: multilingual model")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", args.local_rank)
    # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    # Model
    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    model_path = 'bert-base-multilingual-cased'
    if args.train_lang in ["En", "It", "Jp", "Zh"]:  # for Fr Ko Id we use MBERT
        model_path = LANG_2_MODEL[args.train_lang]

    tokenizer = BertTokenizer.from_pretrained(model_path)
    if args.train_lang == "Jp":
        tokenizer = BertJapaneseTokenizer.from_pretrained(model_path)

    model = Model2Model.from_pretrained(model_path)

    # tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    # if args.random_init:
    #     config = BertConfig.from_pretrained('bert-base-multilingual-cased')
    #     config.is_decoder = True
    #     bert_decoder = BertForMaskedLM(config)
    #     model = Model2Model.from_pretrained('bert-base-multilingual-cased', decoder_model=bert_decoder)
    # else:
    #     model = Model2Model.from_pretrained('bert-base-multilingual-cased')
    # model_dict = model.state_dict()
    # # initialize crossattention with selfattention
    # model_dict.update({
    #     name: model_dict[name.replace("crossattention", "attention")]
    #     for name in model_dict if "crossattention" in name
    # })
    # model.load_state_dict(model_dict)

    model.to(args.device)
    # Add special tokens if they are not already added
    add_special_tokens_(model, tokenizer)
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed
    # (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank],
                                        output_device=args.local_rank,
                                        find_unused_parameters=True)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(batch[input_name].to(args.device)
                      for input_name in MODEL_INPUTS)
        # batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        encoder_mask, decoder_mask, encoder_input_ids, decoder_input_ids, lm_labels, token_type_ids, decoder_lang_id = batch
        model_kwargs = {
            "encoder_token_type_ids": token_type_ids,
            "decoder_token_type_ids": decoder_lang_id,
            "encoder_attention_mask": encoder_mask,
            "decoder_attention_mask": decoder_mask,
            "decoder_lm_labels": lm_labels
        }
        lm_loss, prediction_scores, *_ = model(
            encoder_input_ids=encoder_input_ids,
            decoder_input_ids=decoder_input_ids,
            **model_kwargs)
        loss = (lm_loss) / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()
    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(batch[input_name].to(args.device)
                          for input_name in MODEL_INPUTS)
            # batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
            encoder_mask, decoder_mask, encoder_input_ids, decoder_input_ids, lm_labels, token_type_ids, decoder_lang_id = batch
            logger.info(tokenizer.decode(encoder_input_ids[0, :].tolist()))
            # if we don't send labels to the model, it doesn't return losses
            model_kwargs = {
                "encoder_token_type_ids": token_type_ids,
                "decoder_token_type_ids": decoder_lang_id,
                "encoder_attention_mask": encoder_mask,
                "decoder_attention_mask": decoder_mask
            }
            lm_logits, *_ = model(encoder_input_ids=encoder_input_ids,
                                  decoder_input_ids=decoder_input_ids,
                                  **model_kwargs)
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, ), (lm_labels_flat_shifted, )

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1),
                    output_transform=lambda x: (x[0][0], x[1][0]))
    }
    metrics.update({
        "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model,
    # configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED,
            lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.model_checkpoint)
        log_dir += "_lang_id"
        if args.random_init:
            log_dir = log_dir + "_random_init"
        tb_logger = TensorboardLogger(log_dir)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(
            log_dir, 'checkpoint',
            save_interval=1, n_saved=3)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        if args.distributed:
            # the config for encoder and decoder should be the same
            getattr(model.module, 'encoder', model).config.to_json_file(
                os.path.join(log_dir, CONFIG_NAME))
        else:
            # the config for encoder and decoder should be the same
            getattr(model, 'encoder', model).config.to_json_file(
                os.path.join(log_dir, CONFIG_NAME))
        tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint
    # (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(
            checkpoint_handler._saved[-1][1][-1],
            os.path.join(log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
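

# --- Hedged example (assumption, not part of the original script) ---
# Sketch of re-loading the checkpoint renamed above. The original comment suggests
# from_pretrained-style re-loading; loading the raw state dict is shown here instead,
# assuming the ignite ModelCheckpoint saved a state dict. reload_checkpoint is an
# illustrative helper name, not from the source.
def reload_checkpoint(log_dir, model_path="bert-base-multilingual-cased"):
    reloaded = Model2Model.from_pretrained(model_path)
    state_dict = torch.load(os.path.join(log_dir, WEIGHTS_NAME), map_location="cpu")
    reloaded.load_state_dict(state_dict)
    return reloaded
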
    )
    dl = DataLoader(ds, batch_size=self.batch_size, shuffle=False)

    preds = []
    for batch in tqdm(dl):
        input_ids, attention_mask = batch["input_ids"], batch["attention_mask"]
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        output = bert(input_ids=input_ids, attention_mask=attention_mask)
        output = output[0]
        output = output.to(cpu)
        preds.append(output.detach().clone().numpy())
    return np.concatenate(preds, axis=0)


# MODEL_NAME = "bert-base-japanese-whole-word-masking"
MODEL_NAME = "bert-base-japanese-whole-word-masking"
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)
config = BertConfig.from_pretrained(MODEL_NAME)
bert = BertModel.from_pretrained(MODEL_NAME)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cpu = torch.device('cpu')
print(f"Device in use: {device}")
bert.to(device)


def tokenize(text, max_length=128):
    id_dict = tokenizer.encode_plus(str(text),
                                    max_length=max_length,
                                    pad_to_max_length=True,
                                    truncation=True)
    return id_dict["input_ids"], id_dict["attention_mask"]
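
# --- Hedged usage example (assumption, not part of the original script) ---
# How the tokenize() helper above can feed the batched BERT feature extractor;
# the example sentences are illustrative only.
example_texts = ["今日は良い天気です。", "明日は雨が降りそうです。"]
encoded = [tokenize(t) for t in example_texts]
input_ids = torch.tensor([ids for ids, _ in encoded])
attention_mask = torch.tensor([mask for _, mask in encoded])
with torch.no_grad():
    features = bert(input_ids=input_ids.to(device),
                    attention_mask=attention_mask.to(device))[0]
print(features.shape)  # (batch, max_length, hidden_size)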
def __init__(self, df, max_length=128,
             model_name="bert-base-japanese-whole-word-masking", transforms=None):
    self.max_length = max_length
    self.df = df
    self.model_name = model_name
    self.tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)
    self.transforms = transforms
def build_tokenizer() -> BertJapaneseTokenizer:
    tokenizer = BertJapaneseTokenizer.from_pretrained(
        BERT_PRETRAINED_MODEL_NAME)
    return tokenizer
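
# --- Hedged usage example (assumption, not part of the original module) ---
# BERT_PRETRAINED_MODEL_NAME is defined elsewhere in the project; any cl-tohoku Japanese
# BERT checkpoint name would work. A basic tokenize / convert-to-ids round trip:
tokenizer = build_tokenizer()
tokens = tokenizer.tokenize("吾輩は猫である。")
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens, token_ids)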
from torch.utils.data import TensorDataset, random_split

# Prepare the tokenizer
from transformers import BertJapaneseTokenizer
from my_module import path_manager as pm
import torch
import numpy as np
import pandas as pd  # needed for pd.read_csv below (missing in the original snippet)
import statistics
import sys

# Fix the random seed so that training is reproducible.
torch.manual_seed(0)

# Load the pretrained model
model_name = str(pm.get_abs_path(
    __file__, 'resources/BERT/BERT-base_mecab-ipadic-bpe-32k_whole-word-mask'))

# Set up the tokenizer
tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)

# CPU/GPU configuration
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Threshold that decides which sentences are used for training
boundary = 0.2

# Load the labeled data (review sentences annotated with polarity scores)
df = pd.read_csv('sent_senti.csv')

# Remove neutral (zero-polarity) reviews.
df = df[df['score'] != 0]

# Keep rows whose absolute sentiment score is at least the threshold
df = df[abs(df['score']) >= boundary]

# Add a label column: 1 for positive scores, 0 for negative scores
df['label'] = df['score'].apply(lambda x: 1 if 0 < x else 0)

# Add a column with the number of tokens in each sentence
df['length'] = df['content'].apply(lambda x: len(tokenizer.tokenize(x)))
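
# --- Hedged example (assumption): the rest of this script is not shown here. The imports
# above (TensorDataset, random_split) suggest a dataset split roughly like the following;
# the 80/20 ratio and max_length=128 are illustrative choices, not taken from the source.
input_ids = torch.tensor([
    tokenizer.encode(t, max_length=128, padding='max_length', truncation=True)
    for t in df['content']
])
labels = torch.tensor(df['label'].values)
dataset = TensorDataset(input_ids, labels)
n_train = int(0.8 * len(dataset))
train_dataset, valid_dataset = random_split(dataset, [n_train, len(dataset) - n_train])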
def main(args):
    train_input_dir: str = args.train_input_dir
    dev_input_dir: str = args.dev_input_dir
    train_qohs_filepath: str = args.train_qohs_filepath
    dev_qohs_filepath: str = args.dev_qohs_filepath
    bert_model_dir: str = args.bert_model_dir
    imagebert_checkpoint_filepath: str = args.imagebert_checkpoint_filepath
    roi_boxes_dir: str = args.roi_boxes_dir
    roi_features_dir: str = args.roi_features_dir
    max_num_rois: int = args.max_num_rois
    roi_features_dim: int = args.roi_features_dim
    train_batch_size: int = args.train_batch_size
    num_epochs: int = args.num_epochs
    lr: float = args.lr
    result_save_dir: str = args.result_save_dir
    train_logging_steps: int = args.train_logging_steps
    use_multi_gpus: bool = args.use_multi_gpus
    no_init_from_pretrained_bert: bool = args.no_init_from_pretrained_bert
    use_roi_seq_position: bool = args.use_roi_seq_position

    logger.info("Batch size: {}".format(train_batch_size))
    logger.info("Number of epochs: {}".format(num_epochs))
    logger.info("Learning rate: {}".format(lr))
    if use_roi_seq_position:
        logger.info("Using ascending values for the RoI sequence positions.")

    logger.info("Creating the training dataset from {}.".format(train_input_dir))
    train_dataset = mf.create_dataset(train_input_dir, num_examples=-1, num_options=4)

    logger.info("Creating the dev dataloader from {}.".format(dev_input_dir))
    dev_dataset = mf.create_dataset(dev_input_dir, num_examples=-1, num_options=20)
    dev_dataloader = DataLoader(dev_dataset, batch_size=4, shuffle=False)

    logger.info("Creating the question/option hash dictionaries.")
    logger.info("train_qohs_filepath: {}\tdev_qohs_filepath: {}".format(
        train_qohs_filepath, dev_qohs_filepath))
    train_qohs = mf.load_question_option_hashes(train_qohs_filepath)
    dev_qohs = mf.load_question_option_hashes(dev_qohs_filepath)

    logger.info("RoI data will be loaded from the following directories.")
    logger.info("roi_boxes_dir: {}\troi_features_dir: {}".format(roi_boxes_dir, roi_features_dir))
    if not os.path.exists(roi_boxes_dir):
        logger.warning("roi_boxes_dir does not exist.")
    if not os.path.exists(roi_features_dir):
        logger.warning("roi_features_dir does not exist.")

    logger.info("Creating the ImageBertForMultipleChoice model.")
    config = BertConfig.from_pretrained(bert_model_dir)
    classifier_model = ImageBertForMultipleChoice(config)
    if no_init_from_pretrained_bert:
        logger.info("Not initializing the ImageBERT parameters from a pretrained model.")
        tokenizer = BertJapaneseTokenizer.from_pretrained(bert_model_dir)
        classifier_model.imbert.set_sep_token_id(tokenizer.sep_token_id)
    else:
        classifier_model.setup_image_bert(bert_model_dir)
    classifier_model.to(device)

    if imagebert_checkpoint_filepath is not None:
        logger.info("Loading an ImageBERT checkpoint from {}.".format(imagebert_checkpoint_filepath))
        parameters = torch.load(imagebert_checkpoint_filepath, map_location=device)
        parameters = fix_model_state_dict(parameters)
        classifier_model.load_state_dict(parameters, strict=False)

    if use_multi_gpus:
        logger.info("Using multiple GPUs.")
        classifier_model = nn.DataParallel(classifier_model)
        torch.backends.cudnn.benchmark = True

    num_iterations = len(train_dataset) // train_batch_size
    total_steps = num_iterations * num_epochs

    optimizer = AdamW(classifier_model.parameters(), lr=lr, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    # Create the directory where results are saved.
    logger.info("Results will be saved to {}.".format(result_save_dir))
    os.makedirs(result_save_dir, exist_ok=True)

    # Training loop
    for epoch in range(num_epochs):
        logger.info("===== Epoch {}/{} =====".format(epoch, num_epochs - 1))

        # Training
        train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
        mean_loss = mf.train(
            classifier_model,
            train_qohs,
            roi_boxes_dir,
            roi_features_dir,
            optimizer,
            scheduler,
            train_dataloader,
            max_num_rois,
            roi_features_dim,
            use_roi_seq_position,
            device,
            logger,
            train_logging_steps
        )
        logger.info("Mean training loss: {}".format(mean_loss))

        # Save a checkpoint.
        checkpoint_filepath = os.path.join(result_save_dir, "checkpoint_{}.pt".format(epoch))
        torch.save(classifier_model.state_dict(), checkpoint_filepath)

        # Evaluation
        result_save_filepath = os.path.join(result_save_dir, "result_eval_{}.txt".format(epoch))
        labels_save_filepath = os.path.join(result_save_dir, "labels_eval_{}.txt".format(epoch))
        logits_save_filepath = os.path.join(result_save_dir, "logits_eval_{}.txt".format(epoch))
        mf.evaluate_and_save_result(
            classifier_model,
            dev_qohs,
            roi_boxes_dir,
            roi_features_dir,
            dev_dataloader,
            max_num_rois,
            roi_features_dim,
            use_roi_seq_position,
            result_save_filepath,
            labels_save_filepath,
            logits_save_filepath,
            device,
            logger
        )
def main():
    # omoshiro_tweets = load_omoshiro_tweets_from_json(tweet_filename)
    valid_dataset_size = 128
    batch_size = 8
    config_path = 'cl-tohoku/bert-base-japanese-whole-word-masking'

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    tokenizer = BertJapaneseTokenizer.from_pretrained(config_path)
    config = BertConfig.from_pretrained(config_path)
    pad_token_id = config.pad_token_id

    dataset = load_dataset(dataset_filename)
    train_dataset = dataset[:-128]
    valid_dataset = dataset[-128:]
    train_dataset, train_max_length = preprocess(train_dataset, tokenizer, config,
                                                 batch_size=batch_size, device=device)
    train_dataset = normalize_dataset(train_dataset)
    valid_dataset, valid_max_length = preprocess(valid_dataset, tokenizer, config,
                                                 batch_size=batch_size, device=device)
    valid_batches = mk_batches(dataset=valid_dataset, max_length=valid_max_length,
                               batch_size=batch_size, device=device, pad=pad_token_id)
    print('Train dataset size is {}, Valid dataset size is {}'.format(
        len(train_dataset), len(valid_dataset)))

    model = BertPredictor(config_path=config_path, model_path=config_path)
    # model = Perceptron(vocab_size=tokenizer.vocab_size, hidden_size=128, device=device)
    model.to(device)

    criterion = torch.nn.CrossEntropyLoss(ignore_index=NEUTRAL)
    optimizer = optim.SGD(model.parameters(), lr=0.0001)

    for epoch in range(10):
        print('------ Epoch {} ------'.format(epoch + 1))
        train_batches = mk_batches(dataset=train_dataset, max_length=train_max_length,
                                   batch_size=batch_size, device=device, pad=pad_token_id)

        print('Train')
        model.train()
        accuracy = 0.0
        for i, batch in enumerate(train_batches):
            model.zero_grad()
            src = batch['src']
            tgt = batch['tgt']
            # output = [batch_size, vocab_size]
            output = model(src)
            loss = criterion(output, tgt)
            labels = torch.argmax(output, dim=-1)
            accuracy = ((labels == tgt).sum() + accuracy * i * batch_size) / ((i + 1) * batch_size)
            loss.backward()
            optimizer.step()
            sys.stdout.write('\rLoss: {}, Accuracy: {}'.format(loss.item(), accuracy))
        # accuracy /= len(train_dataset)
        print('\nTrain accuracy {}'.format(accuracy))

        print('Validation')
        model.eval()
        with torch.no_grad():
            accuracy = 0.0
            for batch in valid_batches:
                src = batch['src']
                tgt = batch['tgt']
                output = model(src)
                labels = torch.argmax(output, dim=-1)
                accuracy += (labels == tgt).sum()
            accuracy /= valid_dataset_size
            print('Valid accuracy : {}'.format(accuracy))

            accuracy = 0.0
            for batch in valid_batches:
                accuracy += (JUN == batch['tgt']).sum()
            accuracy /= valid_dataset_size
            print('== JUN accuracy : {}'.format(accuracy))
def from_pretrained(cls, model_name="BERT", normalizer="dict"):
    """Load a trained model.

    Loads a trained model and returns a Ner instance.
    If the trained model is not cached yet, it is downloaded to ~/.cache.
    To use a different download location, set the DEFAULT_CACHE_PATH environment variable.

    Args:
        model_name (str): Model name. Only BERT is implemented in the current version.
        normalizer (str or callable): Normalization method, "dict" or "dnorm".

    Returns:
        Ner: a Ner instance
    """
    assert model_name in ["BERT", "radiology"], "Models other than BERT are not implemented"
    if model_name in ["BERT", "radiology"]:
        model_dir = DEFAULT_MODEL_PATH
        src_url = BERT_URL
        if model_name == "radiology":
            model_dir = RADIOLOGY_MODEL_PATH
            src_url = RADIOLOGY_URL

        base_model = BertModel.from_pretrained(
            "cl-tohoku/bert-base-japanese-char")
        basic_tokenizer = ListTokenizer()
        subword_tokenizer = BertJapaneseTokenizer.from_pretrained(
            "cl-tohoku/bert-base-japanese-char", do_word_tokenize=False)

        if not model_dir.parent.is_dir():
            logger.info("creating %s", str(model_dir.parent))
            model_dir.parent.mkdir()
        if not model_dir.is_dir():
            logger.info("creating %s", str(model_dir))
            model_dir.mkdir()
        if not (model_dir / "final.model").is_file():
            logger.info("not found %s", str(model_dir / "final.model"))
            download_fileobj(src_url + "/final.model", model_dir / "final.model")
        if not (model_dir / "labels.txt").is_file():
            logger.info("not found %s", str(model_dir / "labels.txt"))
            download_fileobj(src_url + "/labels.txt", model_dir / "labels.txt")

    if isinstance(normalizer, str):
        if normalizer == "dnorm":
            logger.info("try %s normalizer", "dnorm")
            try:
                from dnorm_j import DNorm
                normalizer = DNorm.from_pretrained().normalize
                logger.info("use %s normalizer", "dnorm")
            except:
                logger.warning("You did not install dnorm")
                logger.warning("use %s normalizer", "Dict")
                normalizer = DictNormalizer(DEFAULT_MEDNERJ_PATH / "norm_dic.csv").normalize
        else:
            logger.info("use %s normalizer", "Dict")
            normalizer = DictNormalizer(DEFAULT_MEDNERJ_PATH / "norm_dic.csv").normalize
    elif isinstance(normalizer, object):
        logger.info("use %s normalizer", "your original")
        normalizer = normalizer
    else:
        raise TypeError

    ner = cls(
        base_model,
        basic_tokenizer,
        subword_tokenizer,
        model_dir=model_dir,
        normalizer=normalizer,
    )
    return ner
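
# --- Hedged usage example (assumption, not part of the original module) ---
# The docstring above says this classmethod returns a Ner instance, so the class is
# assumed to be named Ner; the keyword values are the ones documented above.
ner = Ner.from_pretrained(model_name="BERT", normalizer="dict")
# ner = Ner.from_pretrained(model_name="radiology", normalizer="dnorm")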