def __init__(self, opt, bert_config=None):
    super(SANBertNetwork, self).__init__()
    self.dropout_list = nn.ModuleList()
    self.bert_config = BertConfig.from_dict(opt)
    self.bert = BertModel(self.bert_config)

    if opt.get('dump_feature', False):
        self.opt = opt
        return
    if opt['update_bert_opt'] > 0:
        for p in self.bert.parameters():
            p.requires_grad = False

    mem_size = self.bert_config.hidden_size
    self.decoder_opt = opt['answer_opt']
    self.scoring_list = nn.ModuleList()
    labels = [int(ls) for ls in opt['label_size'].split(',')]
    task_dropout_p = opt['tasks_dropout_p']
    self.bert_pooler = None

    for task, lab in enumerate(labels):
        decoder_opt = self.decoder_opt[task]
        dropout = DropoutWrapper(task_dropout_p[task], opt['vb_dropout'])
        self.dropout_list.append(dropout)
        if decoder_opt == 1:
            out_proj = SANClassifier(mem_size, mem_size, lab, opt, prefix='answer', dropout=dropout)
        else:
            out_proj = nn.Linear(self.bert_config.hidden_size, lab)
        self.scoring_list.append(out_proj)

    self.opt = opt
    self._my_init()
    self.set_embed(opt)
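# Illustrative only: a minimal `opt` dict covering the keys the constructor above reads.
# The concrete values are assumptions (standard BERT-base settings); BertConfig.from_dict(opt)
# additionally expects the usual BertConfig fields, which is why they appear here as well.
example_opt = {
    # standard BertConfig fields consumed by BertConfig.from_dict(opt)
    'vocab_size': 30522, 'hidden_size': 768, 'num_hidden_layers': 12,
    'num_attention_heads': 12, 'intermediate_size': 3072, 'hidden_act': 'gelu',
    'hidden_dropout_prob': 0.1, 'attention_probs_dropout_prob': 0.1,
    'max_position_embeddings': 512, 'type_vocab_size': 2, 'initializer_range': 0.02,
    # multi-task head options read by __init__
    'update_bert_opt': 0,            # > 0 freezes all BERT parameters
    'answer_opt': [1, 0],            # per-task decoder: 1 -> SANClassifier, otherwise nn.Linear
    'label_size': '3,2',             # comma-separated label counts, one entry per task
    'tasks_dropout_p': [0.1, 0.1],   # per-task dropout probabilities
    'vb_dropout': False,             # passed through to DropoutWrapper
}
# model = SANBertNetwork(example_opt)   # uncomment where SANBertNetwork is importable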
def __init__(self, opt, bert_config=None):
    super(SANBertNetwork, self).__init__()
    self.dropout_list = nn.ModuleList()
    self.encoder_type = opt['encoder_type']
    if opt['encoder_type'] == EncoderModelType.ROBERTA:
        from fairseq.models.roberta import RobertaModel
        self.bert = RobertaModel.from_pretrained(opt['init_checkpoint'])
        hidden_size = self.bert.args.encoder_embed_dim
        self.pooler = LinearPooler(hidden_size)
    else:
        self.bert_config = BertConfig.from_dict(opt)
        self.bert = BertModel(self.bert_config)
        hidden_size = self.bert_config.hidden_size

    if opt.get('dump_feature', False):
        self.opt = opt
        return
    if opt['update_bert_opt'] > 0:
        for p in self.bert.parameters():
            p.requires_grad = False

    self.decoder_opt = opt['answer_opt']
    self.task_types = opt["task_types"]
    self.scoring_list = nn.ModuleList()
    labels = [int(ls) for ls in opt['label_size'].split(',')]
    task_dropout_p = opt['tasks_dropout_p']

    for task, lab in enumerate(labels):
        decoder_opt = self.decoder_opt[task]
        task_type = self.task_types[task]
        dropout = DropoutWrapper(task_dropout_p[task], opt['vb_dropout'])
        self.dropout_list.append(dropout)
        if task_type == TaskType.Span:
            assert decoder_opt != 1
            out_proj = nn.Linear(hidden_size, 2)
        elif task_type == TaskType.SeqenceLabeling:
            out_proj = nn.Linear(hidden_size, lab)
        elif task_type == TaskType.MaskLM:
            if opt['encoder_type'] == EncoderModelType.ROBERTA:
                # TODO: xiaodl
                out_proj = MaskLmHeader(self.bert.embeddings.word_embeddings.weight)
            else:
                out_proj = MaskLmHeader(self.bert.embeddings.word_embeddings.weight)
        else:
            if decoder_opt == 1:
                out_proj = SANClassifier(hidden_size, hidden_size, lab, opt, prefix='answer', dropout=dropout)
            else:
                out_proj = nn.Linear(hidden_size, lab)
        self.scoring_list.append(out_proj)

    self.opt = opt
    self._my_init()
def __init__(self, opt, bert_config=None, use_parse=False, embedding_matrix=None, token2idx=None,
             stx_parse_dim=None, unked_words=None, use_generic_features=False, num_generic_features=None,
             use_domain_features=False, num_domain_features=None, feature_dim=None):
    super(SANBertNetwork, self).__init__()
    self.dropout_list = []
    self.bert_config = BertConfig.from_dict(opt)
    self.bert = BertModel(self.bert_config)
    if opt['update_bert_opt'] > 0:
        for p in self.bert.parameters():
            p.requires_grad = False
    mem_size = self.bert_config.hidden_size
    self.scoring_list = nn.ModuleList()
    labels = [int(ls) for ls in opt['label_size'].split(',')]
    task_dropout_p = opt['tasks_dropout_p']
    self.bert_pooler = None

    self.use_parse = use_parse
    self.stx_parse_dim = stx_parse_dim
    self.use_generic_features = use_generic_features
    self.use_domain_features = use_domain_features

    clf_dim = self.bert_config.hidden_size
    if self.use_parse:
        self.treelstm = BinaryTreeLSTM(self.stx_parse_dim, embedding_matrix.clone(), token2idx, unked_words=unked_words)
        parse_clf_dim = self.stx_parse_dim * 2
        clf_dim += parse_clf_dim
        self.parse_clf = nn.Linear(parse_clf_dim, labels[0])
    if self.use_generic_features:
        self.generic_feature_proj = nn.Linear(num_generic_features, num_generic_features * feature_dim)
        generic_feature_clf_dim = num_generic_features * feature_dim
        clf_dim += generic_feature_clf_dim
        self.generic_feature_clf = nn.Linear(generic_feature_clf_dim, labels[0])
    if self.use_domain_features:
        self.domain_feature_proj = nn.Linear(num_domain_features, num_domain_features * feature_dim)
        domain_feature_clf_dim = num_domain_features * feature_dim
        clf_dim += domain_feature_clf_dim
        self.domain_feature_clf = nn.Linear(domain_feature_clf_dim, labels[0])

    assert len(labels) == 1
    for task, lab in enumerate(labels):
        dropout = DropoutWrapper(task_dropout_p[task], opt['vb_dropout'])
        self.dropout_list.append(dropout)
        out_proj = nn.Linear(self.bert_config.hidden_size, lab)
        self.scoring_list.append(out_proj)

    self.opt = opt
    self._my_init()
    self.set_embed(opt)
    if embedding_matrix is not None and self.use_parse:
        # set again b/c self._my_init() overwrites it
        self.treelstm.embedding.weight = nn.Parameter(embedding_matrix)
def __init__(self, args, sample_datum, class_names=None, use_cuda=torch.cuda.is_available()):
    print("curr path:", args.run_dir)
    assert os.path.isdir(args.run_dir)
    if class_names is None:
        class_names = {}
    self.run_dir = args.run_dir

    device = torch.device("cuda" if use_cuda else "cpu")
    n_gpu = 0 if not use_cuda else torch.cuda.device_count()
    self.device = device
    self.n_gpu = n_gpu
    self.class_names = class_names

    task_weights, task_class_weights = None, {
        'tasks_binary_multilabel':
        torch.ones(len(self.class_names['tasks_binary_multilabel'])).to(self.device)
    }
    if args.task_weights_filepath:
        assert os.path.isfile(args.task_weights_filepath), "Task weights file does not exist!"
        assert args.regression_task_weight == 1, "Can't set both regression task weight and file!"
        assert not args.ablate, "Can't both use a file and an ablation code."
        with open(args.task_weights_filepath, mode='r') as f:
            task_weights = json.loads(f.read())
    elif args.regression_task_weight != 1:
        assert not os.path.isfile(args.task_weights_filepath), "Can't use both a file and a reg. weight!"
        assert not args.ablate, "Can't both use a reg. weight and an ablation code!"
        task_weights = {t: 1 for t in ALL_TASKS}
        task_weights['next_timepoint'] = args.regression_task_weight
    elif args.ablate:
        assert not os.path.isfile(args.task_weights_filepath), "Can't use both an ablation and a file!"
        assert args.regression_task_weight == 1, "Can't set both regression task weight and ablation!"
        print("Ablating!")
        task_weights, task_class_weights = self.ablate(args.ablate, post_init=False)
    else:
        task_weights, task_class_weights = self.ablate(None, post_init=False)

    self.add_cls_analog = False
    if args.do_add_cls_analog:
        assert args.modeltype.lower() not in ('cnn', 'gru', 'linear'), "CLS analog only works w/ BERT"
        self.add_cls_analog = True
        self.cls_embed = nn.Parameter(data=torch.randn(1, 1, args.hidden_size), requires_grad=True)
    else:
        self.cls_embed = None

    # No batch size as this is just accessed via dataset[#].
    ts_feat_dim, statics_feat_dim = sample_datum['ts'].shape[1], sample_datum['statics'].shape[0]
    pred_dim = sample_datum['next_timepoint'].shape

    config = {
        "attention_probs_dropout_prob": 0.1,
        "hidden_act": "gelu",
        "hidden_dropout_prob": args.hidden_dropout_prob,
        "pred_dim": pred_dim,
        "hidden_size": args.hidden_size,
        "initializer_range": 0.02,
        "intermediate_size": args.intermediate_size,
        "max_position_embeddings": args.max_seq_len + 1 if self.add_cls_analog else args.max_seq_len,
        "num_attention_heads": args.num_attention_heads,
        "num_hidden_layers": args.num_hidden_layers,
        "type_vocab_size": 2,
        "vocab_size": None,  # TODO(mmd): Omit this from config...
    }
    bert_config = BertConfig.from_dict(config)

    bert_config_filepath = os.path.join(args.run_dir, CONFIG_FILENAME)
    if not os.path.isfile(bert_config_filepath) or args.do_overwrite:
        bert_config.to_json_file(os.path.join(args.run_dir, 'bert_config.json'))

    # Default is the self-attention timeseries model; alternatives are CNN, GRU, and linear.
    if args.modeltype.lower() == 'cnn':
        model = CNN(
            bert_config,
            data_shape=[args.max_seq_len, args.hidden_size],
            use_cuda=torch.cuda.is_available(),
            conv_layers=list(args.num_filters),
            kernel_sizes=list(args.kernel_sizes),
            fc_layer_sizes=list(args.fc_layer_sizes),
            pooling_method=args.pooling_method,
            pooling_kernel_size=args.pooling_kernel_size,
            pooling_stride=args.pooling_stride,
            conv_layers_per_pool=args.conv_layers_per_pool,
            task_weights=task_weights,
            task_class_weights=task_class_weights,
        )
    elif args.modeltype.lower() == 'gru':
        model = GRUModel(
            bert_config,
            data_shape=[args.max_seq_len, args.hidden_size],
            use_cuda=torch.cuda.is_available(),
            hidden_dim=args.gru_hidden_layer_size,
            num_layers=args.gru_num_hidden,
            bidirectional=args.do_bidirectional,
            task_weights=task_weights,
            pooling_method=args.gru_pooling_method,
            task_class_weights=task_class_weights,
        )
    elif args.modeltype.lower() == 'linear':
        model = LinearModel(
            bert_config,
            data_shape=[args.max_seq_len, args.hidden_size],
            use_cuda=torch.cuda.is_available(),
            task_weights=task_weights,
            task_class_weights=task_class_weights,
        )
    else:
        model = SelfAttentionTimeseries(
            bert_config,
            use_cuda=torch.cuda.is_available(),
            task_weights=task_weights,
            task_class_weights=task_class_weights,
        )

    # TODO(mmd): Need to also load ts_projector.
    ts_projector = nn.Linear(ts_feat_dim, bert_config.hidden_size)
    statics_projector = nn.Linear(statics_feat_dim, bert_config.hidden_size)

    model.apply(weight_init)

    for m in (model, ts_projector, statics_projector, self.cls_embed):
        if m is None:
            continue
        m.to(device)
        if n_gpu > 1:
            # Note: rebinding the loop variable does not replace the outer references
            # (model, ts_projector, ...), so this DataParallel wrapper is effectively discarded.
            m = torch.nn.DataParallel(m).cuda()

    parameters = (list(model.parameters()) + list(ts_projector.parameters()) +
                  list(statics_projector.parameters()))
    if self.add_cls_analog:
        parameters += [self.cls_embed]

    if args.notes == 'integrate_note_bert':
        # initialize pretrained clinical note bert model
        cache_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_-1')
        model_location = BERT_MODEL_LOCATION
        note_embedding_model = BertModel.from_pretrained(
            model_location,
            cache_dir=cache_dir,
        )
        notes_projector = nn.Linear(768, bert_config.hidden_size)

        note_embedding_model.to(device)
        notes_projector.to(device)
        if n_gpu > 1:
            note_embedding_model = torch.nn.DataParallel(
                note_embedding_model,
                device_ids=list(range(torch.cuda.device_count()))).cuda()
            notes_projector = torch.nn.DataParallel(
                notes_projector,
                device_ids=list(range(torch.cuda.device_count()))).cuda()

        if args.do_train_note_bert:
            parameters = [
                {"params": parameters, "lr": args.learning_rate},
                {"params": note_embedding_model.parameters(), "lr": args.learning_rate / args.note_bert_lr_reduce},
                {"params": notes_projector.parameters(), "lr": args.learning_rate / args.note_bert_lr_reduce},
            ]
    else:
        note_embedding_model = None
        notes_projector = None

    self.bert_config = bert_config
    self.model = model
    self.ts_projector = ts_projector
    self.statics_projector = statics_projector
    self.notes_projector = notes_projector
    self.note_embedding_model = note_embedding_model
    self.parameters = parameters
    self.trainable_models = [self.model, self.ts_projector, self.statics_projector]
    if args.do_train_note_bert:
        self.trainable_models.extend([self.notes_projector, self.note_embedding_model])

    self.n_gpu = n_gpu
    self.device = device
    self.notes = args.notes
    self.run_dir = args.run_dir
    self.save_name = args.model_file_template.format(**args.to_dict())
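# Illustrative only (not part of the constructor above): `self.parameters` ends up either as a
# flat list of tensors or, in the note-BERT case, as a list of {"params": ..., "lr": ...} groups.
# torch.optim.Adam accepts both forms, so a single optimizer construction downstream can cover
# both cases. The module names below are made up for the demo.
import torch
import torch.nn as nn

demo_backbone = nn.Linear(4, 2)
demo_note_head = nn.Linear(2, 1)

flat_params = list(demo_backbone.parameters()) + list(demo_note_head.parameters())
grouped_params = [
    {"params": demo_backbone.parameters(), "lr": 1e-3},
    {"params": demo_note_head.parameters(), "lr": 1e-3 / 10},  # cf. note_bert_lr_reduce
]

opt_flat = torch.optim.Adam(flat_params, lr=1e-3)   # flat-list form
opt_grouped = torch.optim.Adam(grouped_params)      # per-group learning rates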
def __init__(self, args, sample_datum, class_names=None, verbose=False, use_cuda=torch.cuda.is_available()):
    print("curr path:", args.run_dir)
    assert os.path.isdir(args.run_dir)
    if class_names is None:
        class_names = {}
    self.run_dir = args.run_dir
    self.do_eicu = args.do_eicu
    self.debug = False

    device = torch.device("cuda" if use_cuda else "cpu")
    n_gpu = 0 if not use_cuda else torch.cuda.device_count()
    self.device = device
    self.n_gpu = n_gpu

    self.do_masked_imputation = args.do_masked_imputation
    self.do_fake_masked_imputation_shape = args.do_fake_masked_imputation_shape
    assert not (self.do_fake_masked_imputation_shape and self.do_masked_imputation), \
        "Can't fake and mask!"

    self.class_names = class_names
    task_weights, task_class_weights = None, {
        'tasks_binary_multilabel':
        torch.ones(len(self.class_names['tasks_binary_multilabel'])).to(self.device)
    }

    # We can handle zeroing out the regression task through the ablation interface.
    if args.regression_task_weight in (0, None):
        if args.ablate:
            if isinstance(args.ablate, str):
                args.ablate = [args.ablate]
            if 'next_timepoint' not in args.ablate:
                args.ablate.append('next_timepoint')
        else:
            args.ablate = ['next_timepoint']

    if args.task_weights_filepath:
        print("filepath")
        assert os.path.isfile(args.task_weights_filepath), "Task weights file does not exist!"
        assert args.regression_task_weight == 1, "Can't set both regression task weight and file!"
        assert not args.ablate, "Can't both use a file and an ablation code."
        with open(args.task_weights_filepath, mode='r') as f:
            task_weights = json.loads(f.read())
    elif args.regression_task_weight not in (1, 0, None):
        print("regression weight")
        assert not os.path.isfile(args.task_weights_filepath), "Can't use both a file and a reg. weight!"
        # assert not args.ablate, "Can't both use a reg. weight and an ablation code!"
        # this is commented out because now, by default, regression is turned off.
        task_weights = {t: 1 if t in ALL_TASKS_EICU or not self.do_eicu else 0 for t in ALL_TASKS}
        task_weights['next_timepoint'] = args.regression_task_weight
    elif args.ablate:
        assert not os.path.isfile(args.task_weights_filepath), "Can't use both an ablation and a file!"
        assert args.regression_task_weight in (None, 0, 1), \
            "Can't set both regression task weight and ablation!"
        if args.regression_task_weight in (None, 0):
            assert 'next_timepoint' in args.ablate or 'next_timepoint_info' in args.ablate, \
                "Must ablate the regression task with a weighting of 0! Should happen automatically."
        print("Ablating!")
        task_weights, task_class_weights = self.ablate(args.ablate, post_init=False)
    else:
        print("else")
        task_weights, task_class_weights = self.ablate(None, post_init=False)

    self.add_cls_analog = False
    if args.do_add_cls_analog:
        assert args.modeltype.lower() not in ('cnn', 'gru', 'linear'), "CLS analog only works w/ BERT"
        self.add_cls_analog = True
        self.cls_embed = nn.Parameter(data=torch.randn(1, 1, args.hidden_size), requires_grad=True)
    else:
        self.cls_embed = None

    # No batch size as this is just accessed via dataset[#].
    ts_feat_dim, statics_feat_dim = sample_datum['ts'].shape[1], sample_datum['statics'].shape[0]
    pred_dim = sample_datum['next_timepoint'].shape

    # For the is-masked bit.
    if self.do_masked_imputation or self.do_fake_masked_imputation_shape:
        ts_feat_dim += 1

    config = {
        "attention_probs_dropout_prob": 0.1,
        "hidden_act": "gelu",
        "hidden_dropout_prob": args.hidden_dropout_prob,
        "pred_dim": pred_dim,
        "hidden_size": args.hidden_size,
        "initializer_range": 0.02,
        "intermediate_size": args.intermediate_size,
        "max_position_embeddings": args.max_seq_len + 1 if self.add_cls_analog else args.max_seq_len,
        "num_attention_heads": args.num_attention_heads,
        "num_hidden_layers": args.num_hidden_layers,
        "type_vocab_size": 2,
        "vocab_size": None,  # TODO(mmd): Omit this from config...
    }
    bert_config = BertConfig.from_dict(config)

    bert_config_filepath = os.path.join(args.run_dir, CONFIG_FILENAME)
    if not os.path.isfile(bert_config_filepath) or args.do_overwrite:
        bert_config.to_json_file(os.path.join(args.run_dir, 'bert_config.json'))

    # Only the GRU timeseries encoder is supported in this version.
    assert args.modeltype.lower() == 'gru', "Only GRU is supported in this version."
    model = GRUModel(
        bert_config,
        data_shape=[args.max_seq_len, args.hidden_size],
        use_cuda=torch.cuda.is_available(),
        hidden_dim=args.gru_hidden_layer_size,
        num_layers=args.gru_num_hidden,
        bidirectional=args.do_bidirectional,
        task_weights=task_weights,
        pooling_method=args.gru_pooling_method,
        task_class_weights=task_class_weights,
        verbose=verbose,
        do_eicu=self.do_eicu,
    )

    # TODO(mmd): Need to also load ts_projector.
    ts_projector = nn.Linear(ts_feat_dim, bert_config.hidden_size)
    statics_projector = nn.Linear(statics_feat_dim, bert_config.hidden_size)

    model.apply(weight_init)

    for m in (model, ts_projector, statics_projector, self.cls_embed):
        if m is None:
            continue
        m.to(device)
        if n_gpu > 1:
            # Note: as in the constructor above, rebinding `m` does not wrap the outer references.
            m = torch.nn.DataParallel(m).cuda()

    parameters = (list(model.parameters()) + list(ts_projector.parameters()) +
                  list(statics_projector.parameters()))
    if self.add_cls_analog:
        parameters += [self.cls_embed]

    if args.notes == 'integrate_note_bert':
        # initialize pretrained clinical note bert model
        cache_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_-1')
        model_location = BERT_MODEL_LOCATION
        note_embedding_model = BertModel.from_pretrained(
            model_location,
            cache_dir=cache_dir,
        )
        notes_projector = nn.Linear(768, bert_config.hidden_size)

        note_embedding_model.to(device)
        notes_projector.to(device)
        if n_gpu > 1:
            note_embedding_model = torch.nn.DataParallel(
                note_embedding_model,
                device_ids=list(range(torch.cuda.device_count()))).cuda()
            notes_projector = torch.nn.DataParallel(
                notes_projector,
                device_ids=list(range(torch.cuda.device_count()))).cuda()

        if args.do_train_note_bert:
            parameters = [
                {"params": parameters, "lr": args.learning_rate},
                {"params": note_embedding_model.parameters(), "lr": args.learning_rate / args.note_bert_lr_reduce},
                {"params": notes_projector.parameters(), "lr": args.learning_rate / args.note_bert_lr_reduce},
            ]
    else:
        note_embedding_model = None
        notes_projector = None

    self.bert_config = bert_config
    self.model = model
    self.ts_projector = ts_projector
    self.statics_projector = statics_projector
    self.notes_projector = notes_projector
    self.note_embedding_model = note_embedding_model
    self.parameters = parameters
    self.trainable_models = [self.model, self.ts_projector, self.statics_projector]
    if args.do_train_note_bert:
        self.trainable_models.extend([self.notes_projector, self.note_embedding_model])

    self.n_gpu = n_gpu
    self.device = device
    self.notes = args.notes
    self.run_dir = args.run_dir
    self.save_name = args.model_file_template.format(**args.to_dict())
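# Illustrative only: how the dimensions work out when masked imputation is enabled. The
# constructor above adds one input feature (`ts_feat_dim += 1`) for the is-masked bit before
# building ts_projector; the mask indicator itself is produced outside this constructor, and
# the concatenation shown here is an assumption about how that extra column is supplied.
import torch
import torch.nn as nn

ts_feat_dim, hidden_size = 16, 32
ts_projector_demo = nn.Linear(ts_feat_dim + 1, hidden_size)   # +1 for the is-masked column
ts = torch.randn(8, 24, ts_feat_dim)                          # (batch, seq_len, features)
is_masked = torch.zeros(8, 24, 1)                             # the extra indicator column
projected = ts_projector_demo(torch.cat([ts, is_masked], dim=-1))
assert projected.shape == (8, 24, hidden_size)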
def create_task(args):
    task_name = "TACRED"

    bert_model = BertModel.from_pretrained(args.bert_model, cache_dir="./cache/")
    bert_output_dim = 768 if "base" in args.bert_model else 1024

    config = ENT_BERT_ENCODER_CONFIG
    if (
        args.ent_emb_file is not None
        or args.static_ent_emb_file is not None
        or args.type_emb_file is not None
        or args.rel_emb_file is not None
    ):
        config["num_hidden_layers"] = args.kg_encoder_layer
        output_size = ENT_BERT_ENCODER_CONFIG["hidden_size"]
    else:
        output_size = bert_output_dim
        ENT_BERT_ENCODER_CONFIG["hidden_size"] = output_size

    config = BertConfig.from_dict(config)
    logger.info(config)

    encoder = EntBertEncoder(
        config,
        bert_output_dim,
        output_size,
        args.ent_emb_file,
        args.static_ent_emb_file,
        args.type_emb_file,
        args.rel_emb_file,
        tanh=args.tanh,
        norm=args.norm,
    )

    task = EmmentalTask(
        name=task_name,
        module_pool=nn.ModuleDict(
            {
                "bert": bert_model,
                "encoder": encoder,
                f"{task_name}_pred_head": nn.Linear(output_size, len(LABEL_TO_ID.keys())),
            }
        ),
        task_flow=[
            {
                "name": "bert",
                "module": "bert",
                "inputs": [
                    ("_input_", "token_ids"),
                    ("_input_", "token_segments"),
                    ("_input_", "token_masks"),
                ],
            },
            {
                "name": "encoder",
                "module": "encoder",
                "inputs": [
                    ("bert", 0),
                    ("_input_", "token_ent_ids"),
                    ("_input_", "token_static_ent_ids"),
                    ("_input_", "token_type_ent_ids"),
                    ("_input_", "token_rel_ent_ids"),
                    ("_input_", "token_masks"),
                ],
            },
            {
                "name": f"{task_name}_pred_head",
                "module": f"{task_name}_pred_head",
                "inputs": [("encoder", 1)],
            },
        ],
        loss_func=partial(ce_loss, f"{task_name}_pred_head"),
        output_func=partial(output, f"{task_name}_pred_head"),
        scorer=Scorer(customize_metric_funcs={"tacred_scorer": tacred_scorer}),
    )

    return task
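# Illustrative only: how the returned task is typically wired into Emmental. This sketch
# assumes the standard Emmental API (emmental.init, EmmentalModel, EmmentalLearner); the
# dataloaders and configuration are project-specific and not shown here.
#
#   import emmental
#   from emmental.learner import EmmentalLearner
#   from emmental.model import EmmentalModel
#
#   emmental.init("./logs")
#   task = create_task(args)
#   model = EmmentalModel(name="TACRED", tasks=[task])
#   EmmentalLearner().learn(model, [train_dataloader, dev_dataloader])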