def test_bert_encoder_backward(gpu, default_implementation, sdfg_name):
    batch_size = 2
    seq_len = 512
    hidden_size = 768

    input = torch.randn([batch_size, seq_len, hidden_size])
    ptmodel = BertLayer(BertConfig(hidden_act="relu")).eval()

    dace_model = DaceModule(ptmodel,
                            cuda=gpu,
                            train=False,
                            backward=True,
                            sdfg_name=sdfg_name,
                            apply_strict=True)

    ptinput = torch.clone(input)
    ptinput.requires_grad = True
    ptmodel(ptinput)[0].sum().backward()

    dace_input = torch.clone(input)
    dace_input.requires_grad = True
    dace_model(dace_input).sum().backward()

    diff = np.abs(dace_input.grad.detach().numpy() -
                  ptinput.grad.detach().numpy())
    assert np.max(diff) < 1e-4
def test_bert_cf(sdfg_name):
    batch_size = 8
    seq_len = 512
    hidden_size = 768

    input = torch.randn([batch_size, seq_len, hidden_size])
    ptmodel = BertLayer(BertConfig()).eval()
    pt_outputs = ptmodel(input.clone())

    dace_model = DaceModule(ptmodel,
                            train=False,
                            sdfg_name=sdfg_name,
                            dummy_inputs=(input.clone(), ),
                            auto_optimize=False)

    # run again with constant folding
    dace_model.reset_sdfg()
    dace_model.prepend_post_onnx_hook(
        "cf",
        lambda onnx_model: onnx_model.sdfg.apply_transformations_repeated(
            [ConstantFolding, RedundantSecondArray],
            validate_all=True,
            strict=True))
    dace_outputs1 = dace_model(input.clone())

    diff = np.abs(dace_outputs1.detach().numpy() -
                  pt_outputs[0].detach().numpy())
    assert np.max(diff) < 1e-5
def __init__(self, device):
    super(Model, self).__init__()
    self.device = device
    self.num_labels = 2
    self.config = BertConfig.from_pretrained(
        './roberta_pretrain/bert_config.json')
    self.embeddings = BertEmbeddings(self.config)
    num_layers = 3
    self.layer = nn.ModuleList(
        [BertLayer(self.config) for _ in range(num_layers)])
    self.output = nn.Linear(self.config.hidden_size, self.num_labels)  # classification
def __init__(self, count, config, num_labels):
    super(HSUM, self).__init__()
    self.count = count
    self.num_labels = num_labels
    self.pre_layers = torch.nn.ModuleList()
    self.crf_layers = torch.nn.ModuleList()
    self.classifier = torch.nn.Linear(config.hidden_size, num_labels)
    for i in range(count):
        self.pre_layers.append(BertLayer(config))
        self.crf_layers.append(CRF(num_labels))
def __init__(self, count, config, num_labels):
    super(HSUM, self).__init__()
    self.count = count
    self.num_labels = num_labels
    self.pre_layers = torch.nn.ModuleList()
    self.loss_fct = torch.nn.ModuleList()
    self.pooler = BertPooler(config)
    self.classifier = torch.nn.Linear(config.hidden_size, num_labels)
    for i in range(count):
        self.pre_layers.append(BertLayer(config))
        self.loss_fct.append(torch.nn.CrossEntropyLoss(ignore_index=-1))
def __init__(self, bert, config, args):
    super(BertAttentiveKeywordsClassification, self).__init__()
    self.bert = bert
    self.hidden_size = config.hidden_size
    self.transformer = BertLayer(config)
    self.dropout = nn.Dropout(args.dropout)
    self.seq = nn.Sequential(
        nn.Dropout(args.dropout),
        nn.Linear(6 * config.hidden_size, config.hidden_size),
        nn.Dropout(args.dropout),
        nn.Linear(config.hidden_size, 2)
    )
def __init__(self, config):
    super(Net, self).__init__()
    self.config = config
    self.bert_dim = 768
    self.rel_num = self.config.rel_num
    self.max_len = self.config.max_len
    self.device = self.config.device
    self.lr = self.config.learning_rate
    self.id2rel = json.load(open(self.config.rel2id, encoding="utf8"))[0]
    self.bert_encoder = BertModel.from_pretrained("bert-base-chinese")
    self.conv = nn.Conv1d(in_channels=self.bert_dim,
                          out_channels=self.rel_num,
                          kernel_size=self.config.conv_kernel)
    self.pool = nn.MaxPool1d(self.config.pool_kernel)
    self.lstm = nn.LSTM(input_size=self.config.lstm_in,
                        hidden_size=self.config.lstm_out,
                        batch_first=True,
                        bidirectional=self.config.if_bidirectional)
    self.lstms = LSTMS(self.config)
    # self.w = nn.Linear(in_features=self.max_len - self.config.conv_kernel + 1,
    #                    out_features=128)
    self.w = nn.Linear(in_features=self.max_len - self.config.conv_kernel + 1,
                       out_features=384)
    self.linears = nn.Linear(in_features=self.config.lstm_out,
                             out_features=self.config.tag_num)
    self.rel2tag = nn.Linear(in_features=self.max_len - self.config.conv_kernel + 1,
                             out_features=1)
    self.softmax = nn.Softmax(-1)
    self.layernorm = nn.LayerNorm([self.config.max_len, self.config.lstm_in])
    self.matrix = Variable(torch.randn(128, self.bert_dim),
                           requires_grad=True).to(self.device)
    self.bertlayer1 = BertLayer(BertConfig(vocab_size=21128))
    self.bertlayer2 = BertLayer(BertConfig(vocab_size=21128))
def test_bert_encoder(gpu, default_implementation, sdfg_name):
    if not gpu and default_implementation == 'onnxruntime':
        pytest.skip("combination is tested below")

    batch_size = 8
    seq_len = 512
    hidden_size = 768

    input = torch.randn([batch_size, seq_len, hidden_size])
    ptmodel = BertLayer(BertConfig()).eval()
    pt_outputs = ptmodel(input.clone())

    dace_model = DaceModule(ptmodel,
                            cuda=gpu,
                            train=False,
                            sdfg_name=sdfg_name,
                            apply_strict=True,
                            dummy_inputs=(input.clone(), ))

    if gpu:
        for name, _ in dace_model.model.named_parameters():
            parameter_to_transient(dace_model, name)

    dace_outputs0 = dace_model(input.clone())

    diff = np.abs(dace_outputs0.detach().numpy() -
                  pt_outputs[0].detach().numpy())
    assert np.max(diff) < 1e-5

    if default_implementation == "pure":
        ort_nodes = [
            n for n, _ in dace_model.sdfg.all_nodes_recursive()
            if hasattr(n, "environments") and any("onnx" in e.lower()
                                                  for e in n.environments)
        ]
        if len(ort_nodes) > 0:
            assert False, f"expected pure graph, found ORT nodes: {ort_nodes}"

        # check that cuBLAS is being used
        if gpu:
            assert any(
                (hasattr(n, "environments") and "cuBLAS" in n.environments
                 or hasattr(n, "implementation") and n.implementation == "cuBLAS")
                for n, _ in dace_model.sdfg.all_nodes_recursive())
def __init__(self, bertconfig, config):
    super(BERT_Seq2SeqModel, self).__init__()
    self.encoder = BertModel.from_pretrained(config.model_path, config=bertconfig)
    self.num_labels = bertconfig.num_labels
    self.l2_reg_lambda = bertconfig.l2_reg_lambda
    self.dropout = nn.Dropout(bertconfig.hidden_dropout_prob)
    vocab_size = config.vocab_size
    self.ner_classifier = nn.Linear(config.enc_hidden_size, vocab_size)
    self.span_layer = BertLayer(config=bertconfig)
    self.w = nn.Parameter(torch.Tensor([0.5, 0.5]))
    self.gamma = nn.Parameter(torch.ones(1))
    dec_att_type = int(config.dec_att_type)
    self.rel_size = config.rel_size
    self.decoder = Decoder(config.dec_inp_size, config.dec_hidden_size, 1,
                           config.drop_rate, config.max_trg_len,
                           dec_att_type, self.rel_size)
    self.relation_embeddings = nn.Embedding(config.rel_size, config.dec_inp_size)
    self.dropout_di = nn.Dropout(config.drop_rate)
def __init__(self, config, **kwargs):
    super().__init__(config)
    self.config = config
    self.num_labels = config.num_labels
    self.cus_config = kwargs['cus_config']
    self.type = self.cus_config.type  # one of 'a', 'b', 'c', 'd'
    self.usr_embed = nn.Embedding(self.cus_config.num_usrs, self.cus_config.attr_dim)
    self.usr_embed.weight.requires_grad = True
    init.uniform_(self.usr_embed.weight, a=-0.25, b=0.25)
    self.prd_embed = nn.Embedding(self.cus_config.num_prds, self.cus_config.attr_dim)
    self.prd_embed.weight.requires_grad = True
    init.uniform_(self.prd_embed.weight, a=-0.25, b=0.25)  # initialize product embeddings
    if self.type not in ['b', 'a']:
        self.text = nn.Parameter(torch.Tensor(1, self.cus_config.attr_dim))
        # init.normal_(self.text)
        init.uniform_(self.text, a=-0.25, b=0.25)
        self.ATrans_decoder = nn.ModuleList([
            MAALayer(config, self.cus_config)
            for _ in range(self.cus_config.n_mmalayer)
        ])
        self.classifier = BERTClassificationHead(config)
    elif self.type == 'a':
        self.fusion = Fusion(self.config.hidden_size, self.cus_config.attr_dim)
        self.layer = nn.ModuleList(
            [BertLayer(config) for _ in range(self.cus_config.n_mmalayer)])
        self.classifier = BERTClassificationHead(config)
    else:
        self.classifier = BERTClassificationHeadWithAttribute(self.cus_config)
    self.bert = BertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.init_weights()
def test_bert_encoder(gpu, apply_strict):
    batch_size = 8
    seq_len = 512
    hidden_size = 768

    input = torch.randn([batch_size, seq_len, hidden_size])
    ptmodel = BertLayer(BertConfig()).eval()
    pt_outputs = ptmodel(input.clone())

    dace_model = DaceModule(ptmodel, cuda=gpu, train=False)
    dace_outputs0 = dace_model(input.clone())

    dace_model.dace_model.sdfg.apply_transformations_repeated(
        [ConstantFolding, RedundantSecondArray], validate_all=True)
    dace_outputs1 = dace_model(input.clone())

    diff = np.abs(dace_outputs0 - pt_outputs[0].detach().numpy())

    assert np.max(diff) < 1e-5
    assert np.allclose(dace_outputs1, dace_outputs0)
def __init__(self):
    super(Model, self).__init__()
    self.config = BertConfig.from_pretrained(
        './roberta_pretrain/bert_config.json')
    self.ques_encoder = BertModel.from_pretrained(
        './roberta_pretrain/pytorch_model.bin', config=self.config)
    self.context_encoder = BertModel.from_pretrained(
        './roberta_pretrain/pytorch_model.bin', config=self.config)
    self.basicblocks = nn.ModuleList()
    self.n_layers = 3
    trans_heads = 8
    trans_drop = 0.1
    bert_config = BertConfig(hidden_size=self.config.hidden_size,
                             num_attention_heads=trans_heads,
                             attention_probs_dropout_prob=trans_drop)
    for layer in range(self.n_layers):
        self.basicblocks.append(BertLayer(bert_config))
    self.num_labels = 2
    self.output = nn.Linear(self.config.hidden_size, self.num_labels)
def __init__(self, config):
    super().__init__()
    self.output_attentions = config.output_attentions
    self.output_hidden_states = config.output_hidden_states
    self.layer = nn.ModuleList(
        [BertLayer(config) for _ in range(config.num_hidden_layers)])
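The snippet above only builds the nn.ModuleList of BertLayer modules. Below is a minimal sketch of how such a stack is typically driven; the TinyEncoder wrapper, the config values, and the transformers 4.x import path are illustrative assumptions, not part of the source. Each BertLayer returns a tuple whose first element is the updated hidden states, which are fed to the next layer.

# Hypothetical wrapper for illustration; import path assumes transformers >= 4.x.
import torch
from torch import nn
from transformers import BertConfig
from transformers.models.bert.modeling_bert import BertLayer

class TinyEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layer = nn.ModuleList(
            [BertLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(self, hidden_states, attention_mask=None):
        for layer_module in self.layer:
            # BertLayer returns a tuple; index 0 holds the hidden states.
            hidden_states = layer_module(hidden_states, attention_mask)[0]
        return hidden_states

config = BertConfig(num_hidden_layers=2, hidden_size=64,
                    num_attention_heads=2, intermediate_size=128)
encoder = TinyEncoder(config).eval()
out = encoder(torch.randn(1, 16, 64))  # (batch, seq_len, hidden_size)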
def _compute_pytorch(
    model_names, batch_sizes, slice_sizes, dictionary, average_over, device,
    torchscript, fp16, no_speed, no_memory, verbose, num_hashes
):
    hidden_size = 64
    num_attention_heads = 2
    intermediate_size = 128
    chunk_length = 64
    num_hashes = num_hashes
    hidden_states = floats_tensor((1, 2 ** 16, hidden_size))

    for c, model_name in enumerate(model_names):
        print(f"{c + 1} / {len(model_names)}")
        dictionary[model_name] = {
            "bs": batch_sizes,
            "ss": slice_sizes,
            "results": {},
            "memory": {},
        }
        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
        dictionary[model_name]["memory"] = {i: {} for i in batch_sizes}

        for batch_size in batch_sizes:
            for slice_size in slice_sizes:
                num_buckets = int(2 * slice_size / chunk_length)
                if num_buckets > chunk_length:
                    factorized_num_buckets = num_buckets // 32
                    num_buckets = [32, factorized_num_buckets]

                bert_config = BertConfig(
                    hidden_size=hidden_size,
                    num_attention_heads=num_attention_heads,
                    intermediate_size=intermediate_size,
                    hidden_dropout_prob=0.0,
                    attention_probs_dropout_prob=0.0,
                )
                reformer_config = ReformerConfig(
                    hidden_size=hidden_size,
                    num_attention_heads=num_attention_heads,
                    intermediate_size=intermediate_size,
                    chunk_length=chunk_length,
                    num_hashes=num_hashes,
                    num_buckets=num_buckets,
                )
                layers = {
                    'ReformerLayer': ReformerLayer(reformer_config),
                    'BertLayer': BertLayer(bert_config),
                }
                model = layers[model_name]

                if fp16:
                    model.half()
                model.to(device)
                model.eval()

                if False:
                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
                else:
                    sequence = (
                        hidden_states[0, :slice_size, :]
                        .to(device=device)
                        .repeat(batch_size, 1, 1)
                    )
                    try:
                        if torchscript:
                            print("Tracing model with sequence size", sequence.shape)
                            inference = torch.jit.trace(model, sequence)
                            inference(sequence)
                        else:
                            inference = model
                            if model_name == "ReformerLayer":
                                inference(sequence, sequence)
                            else:
                                inference(sequence)

                        if not no_memory:
                            # model.add_memory_hooks()
                            # Forward method tracing (only for PyTorch models)
                            trace = start_memory_tracing("transformers")
                            if model_name == "ReformerLayer":
                                inference(sequence, sequence)
                            else:
                                inference(sequence)
                            summary = stop_memory_tracing(trace)
                            if verbose:
                                print_summary_statistics(summary)
                            dictionary[model_name]["memory"][batch_size][slice_size] = str(summary.total)
                        else:
                            dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"

                        if not no_speed:
                            print("Going through model with sequence of shape", sequence.shape)
                            if model_name == "ReformerLayer":
                                runtimes = timeit.repeat(
                                    lambda: inference(sequence, sequence),
                                    repeat=average_over,
                                    number=3,
                                )
                            else:
                                runtimes = timeit.repeat(
                                    lambda: inference(sequence),
                                    repeat=average_over,
                                    number=3,
                                )
                            average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                            dictionary[model_name]["results"][batch_size][slice_size] = average_time
                        else:
                            dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
                    except RuntimeError as e:
                        print("Doesn't fit on GPU.", e)
                        torch.cuda.empty_cache()
                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
                        dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"

    return dictionary
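A minimal sketch (not from the original script) of how _compute_pytorch could be invoked; the argument values below are illustrative assumptions, and the script's own imports (floats_tensor, BertConfig, ReformerConfig, ReformerLayer, BertLayer, timeit) are presumed to be in scope.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
results = _compute_pytorch(
    model_names=["BertLayer"],   # benchmark only the BERT layer in this sketch
    batch_sizes=[1, 2],
    slice_sizes=[64, 128],
    dictionary={},
    average_over=3,
    device=device,
    torchscript=False,
    fp16=False,
    no_speed=False,
    no_memory=True,              # skip the memory-tracing pass
    verbose=False,
    num_hashes=1,
)
print(results["BertLayer"]["results"])  # {batch_size: {slice_size: seconds}}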
def __init__(self):
    super(BertTokenSoftmaxClf, self).__init__()
    self.bert = BertLayer(BertConfig(hidden_act="relu")).eval()
    self.sm = nn.LogSoftmax(dim=-1)
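A sketch of a matching forward pass for the classifier above, assuming the usual BertLayer tuple return; this method is an illustration, not part of the snippet itself.

def forward(self, x):
    # BertLayer returns a tuple; element 0 is the transformed hidden states.
    return self.sm(self.bert(x)[0])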