def inference(model, rv, auxiliary_embs=None):
    pro_rv = Txtfile.process_sent(rv)
    rv_id = topic_encoder.word2idx(pro_rv)
    padded_inp, _ = seqPAD.pad_sequences([rv_id], pad_tok=margs.vocab.w2i[PADt])
    inputs = Data2tensor.idx2tensor(padded_inp, torch.long, topic_encoder.device)
    with torch.no_grad():
        model.eval()
        # inputs = [batch_size, sent_length]
        # auxiliary_embs = [batch_size, sent_length, aux_dim]
        emb_word = model.emb_layer(inputs, auxiliary_embs)
        # emb_word = [batch_size, sent_length, emb_dim]
        emb_sent = emb_word.mean(dim=1, keepdim=True)
        # emb_sent = [batch_size, 1, emb_dim]
        sent_length = emb_word.size(1)
        emb_sent_ex = emb_sent.expand(-1, sent_length, -1).contiguous()
        # emb_sent_ex = [batch_size, sent_length, emb_dim]
        alpha_score = model.attention(emb_word, emb_sent_ex)
        # alpha_score = [batch_size, sent_length, 1]
        alpha_norm = model.norm_attention(alpha_score)
        # alpha_norm = [batch_size, sent_length, 1]
        emb_attsent = torch.bmm(alpha_norm.transpose(1, 2), emb_word)
        # emb_attsent = [batch_size, 1, emb_dim]
        # alpha_norm.transpose(1, 2) = [batch_size, 1, sent_length] dot emb_word = [batch_size, sent_length, emb_dim]
        emb_topic = model.encoder(emb_attsent.squeeze(1))
        topic_class = model.norm_layer(emb_topic)
        # emb_topic = topic_class = [batch_size, nn_out_dim]
        label_prob, label_pred = topic_class.data.topk(topic_class.size(1))
    return label_prob, label_pred
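# For reference, a minimal self-contained sketch of the attention pooling performed above,
# assuming dot-product scores against the mean word embedding and softmax normalization
# (the real model.attention / model.norm_attention layers may be parameterized differently):
import torch
import torch.nn.functional as F

def attention_pool_sketch(emb_word):
    # emb_word = [batch_size, sent_length, emb_dim]
    emb_sent = emb_word.mean(dim=1, keepdim=True)                    # [batch_size, 1, emb_dim]
    alpha_score = torch.bmm(emb_word, emb_sent.transpose(1, 2))      # [batch_size, sent_length, 1]
    alpha_norm = F.softmax(alpha_score, dim=1)                       # [batch_size, sent_length, 1]
    emb_attsent = torch.bmm(alpha_norm.transpose(1, 2), emb_word)    # [batch_size, 1, emb_dim]
    return emb_attsent.squeeze(1)                                    # [batch_size, emb_dim]

# e.g. attention_pool_sketch(torch.randn(2, 7, 100)) -> tensor of shape [2, 100]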
def train_batch(self, train_data):
    clip_rate = self.args.clip
    chunk_size = self.args.batch_size * (self.args.neg_samples + 1)
    total_batch = self.args.vocab.nodocs // chunk_size
    prog = Progbar(target=total_batch)
    # set the model in train mode
    train_loss = []
    self.model.train()
    for i, inp_ids in enumerate(self.args.vocab.minibatches(train_data, batch_size=chunk_size)):
        padded_inp, _ = seqPAD.pad_sequences(inp_ids, pad_tok=self.args.vocab.w2i[PADt])
        data_tensor = Data2tensor.idx2tensor(padded_inp, torch.long, self.device)
        # shuffle data chunks
        perm_ids = torch.randperm(chunk_size)
        data_tensor = data_tensor[perm_ids]
        data_tensor = data_tensor.view(self.args.batch_size, self.args.neg_samples + 1, -1)
        # data_tensor = [batch_size, 1 + neg_samples, word_length]
        inp_tensor = data_tensor[:, 0, :]
        noise_tensor = data_tensor[:, 1:, :]
        self.model.zero_grad()
        emb_sent, trans_sent, emb_noise = self.model(inp_tensor, noise_tensor)
        batch_loss = self.model.batchHingeLoss(emb_sent, trans_sent, emb_noise)
        train_loss.append(batch_loss.item())
        batch_loss.backward()
        if clip_rate > 0:
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), clip_rate)
        self.optimizer.step()
        prog.update(i + 1, [("Train loss", batch_loss.item())])
    return np.mean(train_loss)
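# The exact batchHingeLoss implementation is not shown here; a common max-margin formulation
# over negative samples looks like the sketch below (the function name and margin value are
# assumptions, not the repository's definition):
import torch

def batch_hinge_loss_sketch(emb_sent, trans_sent, emb_noise, margin=1.0):
    # emb_sent   = [batch_size, emb_dim]                positive sentence embedding
    # trans_sent = [batch_size, emb_dim]                reconstructed/transformed embedding
    # emb_noise  = [batch_size, neg_samples, emb_dim]   negative-sample embeddings
    pos_score = torch.sum(trans_sent * emb_sent, dim=-1, keepdim=True)    # [batch_size, 1]
    neg_score = torch.bmm(emb_noise, trans_sent.unsqueeze(2)).squeeze(2)  # [batch_size, neg_samples]
    # push the positive score above every negative score by at least `margin`
    loss = torch.clamp(margin - pos_score + neg_score, min=0.0)
    return loss.sum(dim=1).mean()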
def predict(self, rv):
    pro_rv = Txtfile.process_sent(rv)
    rv_id = self.word2idx(pro_rv)
    padded_inp, _ = seqPAD.pad_sequences([rv_id], pad_tok=self.args.vocab.w2i[PADt])
    inputs = Data2tensor.idx2tensor(padded_inp, torch.long, self.device)
    self.model.eval()
    with torch.no_grad():
        label_prob, label_pred = self.model.inference(inputs)
    return label_prob, label_pred
    return label_prob, label_pred


if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, Csvfile

    filename = "/media/data/langID/small_scale/train.csv"
    vocab = Vocab(cl_th=None, cutoff=1, c_lower=False, c_norm=False)
    vocab.build([filename], firstline=False)
    word2idx = vocab.wd2idx(vocab.c2i)
    tag2idx = vocab.tag2idx(vocab.l2i)
    train_data = Csvfile(filename, firstline=False, word2idx=word2idx, tag2idx=tag2idx)

    train_iters = Vocab.minibatches(train_data, batch_size=10)
    data = []
    label_ids = []
    for words, labels in train_iters:
        data.append(words)
        label_ids.append(labels)
        word_ids, sequence_lengths = seqPAD.pad_sequences(words, pad_tok=0, wthres=1024, cthres=32)
        w_tensor = Data2tensor.idx2tensor(word_ids)
        y_tensor = Data2tensor.idx2tensor(labels)
        data_tensors = Data2tensor.sort_tensors(labels, word_ids, sequence_lengths)
        label_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors
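# Data2tensor.sort_tensors is used above to order a batch for packed RNN input; a minimal
# sketch of that idea, assuming tensor inputs and descending-length sorting (the real helper
# also handles character tensors and extra flags):
import torch

def sort_by_length_sketch(label_tensor, word_tensor, seq_lengths):
    # sort by descending sequence length, as required by pack_padded_sequence
    seq_lengths, sort_idx = seq_lengths.sort(descending=True)
    word_tensor = word_tensor[sort_idx]
    label_tensor = label_tensor[sort_idx]
    # index that restores the original batch order after the RNN pass
    _, recover_idx = sort_idx.sort()
    return label_tensor, word_tensor, seq_lengths, recover_idx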
    # embedding_matrix = create_embedding_matrix(vocab, ntoken, emb_size)
    # print(embedding_matrix[5])
    # embedding = nn.Embedding.from_pretrained(embedding_matrix)
    # input = torch.LongTensor([1])
    # print(embedding(input))

    train_data = Txtfile(data_files[0], firstline=False, source2idx=word2idx, label2idx=label2idx)
    # train_data = [sent[0] for sent in train_data]
    train_batch = vocab.minibatches_with_label(train_data, batch_size=batch_size)
    inpdata = []
    outdata = []
    for doc, label in train_batch:
        doc_pad_ids, doc_lengths = seqPAD.pad_sequences(doc, pad_tok=vocab.w2i[PAD])
        doc_tensor = Data2tensor.idx2tensor(doc_pad_ids, device)
        doc_lengths_tensor = Data2tensor.idx2tensor(doc_lengths, device)
        label_tensor = Data2tensor.idx2tensor(label, device)
        inpdata.append(doc_tensor)
        outdata.append(label_tensor)
        break

    # model = RNNModule(rec_type=rec_type, ntokens=ntoken, emb_size=emb_size, hidden_size=hidden_size,
    #                   nlayers=nlayers, dropout=dropout, bidirect=bidirect).to(device)
    # rec_output, rec_hidden, rec_output = model(input_tensor, input_lens_tensor)
    #
    # model = UniLSTMModel(rec_type=rec_type, ntokens=ntoken, emb_size=emb_size, hidden_size=hidden_size,
    #                      nlayers=nlayers, dropout=dropout, bidirect=False, nlabels=nlabels).to(device)
    # decoded_scores, rec_hidden, rec_output = model(input_tensor, input_lens_tensor)

    model = BiLSTMModel(rec_type=rec_type,
        return label_score


if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, CoNLLDataset

    train_file = '/media/data/NER/conll03/conll03/train.bmes'
    dev_file = '/media/data/NER/conll03/conll03/dev.bmes'
    test_file = '/media/data/NER/conll03/conll03/test.bmes'
    vocab = Vocab(cutoff=1, wl_th=None, cl_th=None, w_lower=False, w_norm=False, c_lower=False, c_norm=False)
    vocab.build([train_file, dev_file, test_file])
    word2idx = vocab.wd2idx(vocab_words=vocab.w2i, vocab_chars=vocab.c2i, allow_unk=True, start_end=True)
    tag2idx = vocab.tag2idx(vocab_tags=vocab.l2i, start_end=True)
    train_data = CoNLLDataset(train_file, word2idx=word2idx, tag2idx=tag2idx)

    train_iters = Vocab.minibatches(train_data, batch_size=10)
    data = []
    label_ids = []
    for words, labels in train_iters:
        char_ids, word_ids = zip(*words)
        data.append(words)
        word_ids, sequence_lengths = seqPAD.pad_sequences(word_ids, pad_tok=0, wthres=1024, cthres=32)
        char_ids, word_lengths = seqPAD.pad_sequences(char_ids, pad_tok=0, nlevels=2, wthres=1024, cthres=32)
        label_ids, label_lengths = seqPAD.pad_sequences(labels, pad_tok=0, wthres=1024, cthres=32)
        w_tensor = Data2tensor.idx2tensor(word_ids)
        c_tensor = Data2tensor.idx2tensor(char_ids)
        y_tensor = Data2tensor.idx2tensor(label_ids)
        data_tensor = Data2tensor.sort_tensors(label_ids, word_ids, sequence_lengths,
                                               char_ids, word_lengths, volatile_flag=False)
        (label_tensor, word_tensor, sequence_lengths, word_seq_recover,
         char_tensor, word_lengths, char_seq_recover) = data_tensor
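# seqPAD.pad_sequences(..., nlevels=2) above pads character ids at both the word level and the
# sentence level; a minimal sketch of that two-level padding over plain Python lists (the
# function name and the use of 0 as the length of padded-out words are assumptions):
def pad_chars_sketch(char_ids, pad_tok=0):
    # char_ids: list of sentences, each a list of words, each a list of character ids
    max_word_len = max(len(word) for sent in char_ids for word in sent)
    max_sent_len = max(len(sent) for sent in char_ids)
    padded, word_lengths = [], []
    for sent in char_ids:
        sent_pad = [word + [pad_tok] * (max_word_len - len(word)) for word in sent]
        lengths = [len(word) for word in sent]
        # pad the sentence itself with all-pad "words" up to the longest sentence
        sent_pad += [[pad_tok] * max_word_len] * (max_sent_len - len(sent))
        lengths += [0] * (max_sent_len - len(sent))
        padded.append(sent_pad)
        word_lengths.append(lengths)
    return padded, word_lengths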
                           unk_words=True, se_words=False)
    train_data = Txtfile(filename, firstline=False, word2idx=word2idx, limit=100000)

    batch_size = 8
    neg_sampling = 5
    no_chunks = batch_size * (neg_sampling + 1)
    train_iters = Vocab.minibatches(train_data, batch_size=no_chunks)
    data = []
    label = []
    for inp_ids in train_iters:
        padded_inp, _ = seqPAD.pad_sequences(inp_ids, pad_tok=vocab.w2i[PADt])
        data_tensor = Data2tensor.idx2tensor(padded_inp)
        # shuffle chunks
        perm_ids = torch.randperm(no_chunks)
        data_tensor = data_tensor[perm_ids]
        data_tensor = data_tensor.view(batch_size, neg_sampling + 1, -1)
        inp_tensor = data_tensor[:, 0, :]
        noise_tensor = data_tensor[:, 1:, :]
        break

    emb_size = len(vocab.w2i)
    emb_dim = 100
    pre_embs = None
    emb_drop_rate = 0.5
    emb_zero_padding = False
    grad_flag = True
    bptt = 10
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_files = ["../dataset/train.txt"]
    vocab = Vocab(wl_th=wl_th, cutoff=cutoff)
    vocab.build(data_files, firstline=False)
    word2idx = vocab.wd2idx(vocab.w2i)
    label2idx = vocab.tag2idx(vocab.l2i)
    train_data = Txtfile(data_files[0], firstline=False, source2idx=word2idx, label2idx=label2idx)
    # train_data = [sent[0] for sent in train_data]
    train_batch = vocab.minibatches(train_data, batch_size=batch_size)
    inpdata = []
    outdata = []
    for sent in train_batch:
        word_pad_ids, seq_lens = seqPAD.pad_sequences(sent, pad_tok=vocab.w2i[PAD])
        data_tensor = Data2tensor.idx2tensor(word_pad_ids)
        for i in range(0, data_tensor.size(1) - 1, bptt):
            data, target = vocab.bptt_batch(data_tensor, i, bptt)
            inpdata.append(data)
            outdata.append(target)
        break

    rnn_type = "GRU"
    ntoken = len(vocab.w2i)
    ninp = 32
    nhid = 64
    nlayers = 1
    dropout = 0.5
    tie_weights = False
    bidirect = False
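# vocab.bptt_batch above slices a padded document tensor into truncated-BPTT windows; a minimal
# sketch, assuming the standard language-model convention of the target being the input shifted
# by one token (only the call signature is taken from the call site, the body is an assumption):
import torch

def bptt_batch_sketch(data_tensor, i, bptt):
    # data_tensor = [batch_size, doc_length] of word ids
    seq_len = min(bptt, data_tensor.size(1) - 1 - i)
    data = data_tensor[:, i:i + seq_len]
    target = data_tensor[:, i + 1:i + 1 + seq_len]
    return data, target

# e.g. for ids = torch.randint(0, 100, (4, 25)) and bptt = 10:
# bptt_batch_sketch(ids, 0, 10) -> data of shape [4, 10], target of shape [4, 10]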