import configparser
from pathlib import Path

import torch
from transformers import LongformerConfig, LongformerModel, LongformerTokenizer

def __init__(self, config_path):
    config = configparser.ConfigParser()
    config.read(config_path)

    # General settings: output directory and classifier threshold.
    self.save_dir = Path(config.get("general", "save_dir"))
    if not self.save_dir.exists():
        self.save_dir.mkdir(parents=True)
    self.clf_th = config.getfloat("general", "clf_th")

    self.mlp_model_path = config.get("model", "mlp")
    assert Path(self.mlp_model_path).exists()

    self.device = "cuda" if torch.cuda.is_available() else "cpu"

    # Longformer configuration and tokenizer.
    bert_config_path = config.get("bert", "config_path")
    assert Path(bert_config_path).exists()
    self.bert_config = LongformerConfig.from_json_file(bert_config_path)
    # Reserve two positions for the special tokens (<s> and </s>).
    self.max_seq_length = self.bert_config.max_position_embeddings - 2
    self.bert_tokenizer = LongformerTokenizer.from_pretrained(
        'allenai/longformer-base-4096')
    # bert_tokenizer_path = config.get("bert", "tokenizer_path")
    # assert Path(bert_tokenizer_path).exists()
    # self.bert_tokenizer = LongformerTokenizer.from_pretrained(bert_tokenizer_path)

    # Longformer encoder, loaded onto the selected device in eval mode.
    bert_model_path = config.get("bert", "model_path")
    assert Path(bert_model_path).exists()
    self.bert_model = LongformerModel.from_pretrained(
        bert_model_path, config=self.bert_config)
    self.bert_model.to(self.device)
    self.bert_model.eval()

    # Gold and target CoNLL datasets (ConllDataset is a project-local class).
    gold_dir = Path(config.get("data", "gold_dir"))
    assert gold_dir.exists()
    self.gold_dataset = ConllDataset(gold_dir)
    target_dir = Path(config.get("data", "target_dir"))
    assert target_dir.exists()
    self.target_dataset = ConllDataset(target_dir)
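# --- Usage sketch (assumption, not part of the original source) ---
# __init__ above reads an INI file via configparser; the snippet below writes
# a minimal config.ini with the exact sections and keys it looks up. Only the
# section/key names come from the code above; every path value here is a
# placeholder.
import configparser

_cfg = configparser.ConfigParser()
_cfg["general"] = {"save_dir": "out/", "clf_th": "0.5"}
_cfg["model"] = {"mlp": "models/mlp_classifier.pt"}
_cfg["bert"] = {
    "config_path": "config_files/longformer_base_4096/config.json",
    "model_path": "models/longformer_mimic",
}
_cfg["data"] = {"gold_dir": "data/gold_conll", "target_dir": "data/target_conll"}
with open("config.ini", "w") as f:
    _cfg.write(f)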
    '1',  # 32GB GPU with fp32
    '--gradient_accumulation_steps', '4',
    #'--evaluate_during_training',  # removed to reduce training time
    '--do_train',
])

#train_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_train_patients.txt'
#val_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_val_patients.txt'
# These are small files for testing.
train_fn = '/scratch/xl3119/capstone/data/sample/sample.txt'
val_fn = '/scratch/xl3119/capstone/data/sample/sample.txt'
training_args.train_datapath = train_fn
training_args.val_datapath = val_fn

##################### Use pretrained Longformer from transformers
init_config = LongformerConfig.from_json_file(
    'config_files/longformer_base_4096/config.json')
mimic_tokenizer = BertTokenizer.from_pretrained('mimic_tokenizer')
word_embeddings = np.loadtxt(
    join('/scratch/xl3119/capstone/wd_emb', "word_embedding_matrix.txt"))
longformer_model = LongformerForMaskedLM(init_config)
longformer_model = use_embeddings_fasttext(longformer_model, word_embeddings)
# longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

logger.info('Train and eval with Longformer pretrained ...')
pretrain_and_evaluate(training_args,
                      longformer_model,
                      mimic_tokenizer,
                      train_only=True,
                      eval_only=False,
                      model_path=None
                      # model_path=training_args.output_dir  # local path to the model, if the
                      # model to train has been instantiated from a local path
                      )
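# --- Sketch (assumption, not part of the original script) ---
# The repo-local helper use_embeddings_fasttext is not shown above; a minimal
# version consistent with its call site would copy the pretrained fastText
# word-embedding matrix into the model's input embedding layer. It assumes the
# matrix rows are ordered by the tokenizer's vocabulary ids and that the
# embedding width equals the model's hidden size.
import torch

def use_embeddings_fasttext(model, word_embeddings):
    emb = torch.as_tensor(word_embeddings, dtype=torch.float32)
    # Shape check: (vocab_size, hidden_size) must match the existing layer.
    assert emb.shape == model.get_input_embeddings().weight.shape
    with torch.no_grad():
        model.get_input_embeddings().weight.copy_(emb)
    return model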