def train_leave_one_lecture_out(name='cv'):
    wapiti_home = global_params.wapiti_dir

    pattern_file = '../data/%s.pattern.txt'%name

    model_dir = '../data/%s/%s/model/%s/'%(course, system, name)
    fio.NewPath(model_dir)

    feature_dir = '../data/%s/%s/extraction/'%(course, system)
    feature_cv_dir = '../data/%s/%s/extraction/%s/'%(course, system, name)
    fio.NewPath(feature_cv_dir)

    outputdir = '../data/%s/%s/extraction/%s_output/'%(course, system, name)
    fio.NewPath(outputdir)

    lectures = annotation.Lectures
    dict = defaultdict(int)

    for i, lec in enumerate(lectures):
        train = [x for x in lectures if x != lec]
        test = [lec]

        train_filename = os.path.join(feature_cv_dir, 'train_%d.feature.crf'%i)
        model_file = os.path.join(model_dir, '%d.model'%i)

        print train_filename
        print model_file

        crf = CRF(wapiti_home)
        if not fio.IsExist(model_file):
        #if True:
            combine_files(feature_dir, train, train_filename)
            crf.train(train_filename, pattern_file, model_file)

        for q in ['q1', 'q2']:
            test_filename = os.path.join(feature_cv_dir, 'test_%d_%s.feature.crf'%(i, q))
            output_file = os.path.join(outputdir, 'test_%d_%s.out'%(i, q))

            dict['test_%d_%s'%(i, q)] = 1

            if empty == 'Y':
                test_filename_old = test_filename.replace('_Y', '_N')
                cmd = 'cp %s %s'%(test_filename_old, test_filename)
                os.system(cmd)
            else:
                if method == 'combine':
                    test_filename_old = test_filename.replace('_combine', '_A1')
                    cmd = 'cp %s %s'%(test_filename_old, test_filename)
                    os.system(cmd)
                else:
                    combine_files(feature_dir, test, test_filename, prompts=[q])

            crf.predict(test_filename, model_file, output_file)

        if debug:
            break

    file_util.save_dict2json(dict, class_index_dict_file)
def train_on_course(traincourse, name='all'):
    wapiti_home = global_params.wapiti_dir

    pattern_file = '../data/%s.pattern.txt'%name

    model_dir = '../data/%s/%s/model/%s/'%(course, system, name)
    fio.NewPath(model_dir)

    feature_dir = '../data/%s/%s/extraction/'%(traincourse, system)
    feature_cv_dir = '../data/%s/%s/extraction/%s/'%(traincourse, system, name)
    fio.NewPath(feature_cv_dir)

    outputdir = '../data/%s/%s/extraction/%s_output/'%(course, system, name)
    fio.NewPath(outputdir)

    if traincourse == 'IE256':
        lectures = [x for x in range(14, 26) if x != 22]
    else:
        lectures = [x for x in range(3, 27)]

    dict = defaultdict(int)

    train = [x for x in lectures]

    train_filename = os.path.join(feature_cv_dir, 'train.feature.crf')
    model_file = os.path.join(model_dir, '%s.model'%traincourse)

    print train_filename
    print model_file

    crf = CRF(wapiti_home)
    if not fio.IsExist(model_file):
    #if True:
        combine_files(feature_dir, train, train_filename)
        crf.train(train_filename, pattern_file, model_file)
def __init__(self, nwords, nchars, ntags, pretrained_list):
    super().__init__()

    # Create word embeddings
    pretrained_tensor = torch.FloatTensor(pretrained_list)
    self.word_embedding = torch.nn.Embedding.from_pretrained(
        pretrained_tensor, freeze=False)
    # Create input dropout parameter
    self.word_dropout = torch.nn.Dropout(1 - KEEP_PROB)
    # Create LSTM parameters
    self.lstm = torch.nn.LSTM(DIM_EMBEDDING + CHAR_LSTM_HIDDEN, LSTM_HIDDEN,
                              num_layers=LSTM_LAYER, batch_first=True,
                              bidirectional=True)
    # Create output dropout parameter
    self.lstm_output_dropout = torch.nn.Dropout(1 - KEEP_PROB)

    # Character-level LSTMs
    self.char_embedding = torch.nn.Embedding(nchars, CHAR_DIM_EMBEDDING)
    self.char_lstm = torch.nn.LSTM(CHAR_DIM_EMBEDDING, CHAR_LSTM_HIDDEN,
                                   num_layers=1, batch_first=True,
                                   bidirectional=False)

    # Create final matrix multiply parameters
    self.hidden_to_tag = torch.nn.Linear(LSTM_HIDDEN * 2, ntags + 2)

    self.crf = CRF(target_size=ntags)
class Net(nn.Module):
    def __init__(self, args):
        super().__init__()
        self.args = args
        self.wemb = Wemb(args)
        self.drop = nn.Dropout(args.dropout)
        odim = len(args.tag_stoi)
        if args.ner:
            self.crf = CRF(args.tag_stoi)
            odim = len(args.tag_stoi) + 2
        if not args.lstm:
            self.ffn = nn.Sequential(nn.Linear(300, 400), nn.ReLU(),
                                     nn.Dropout(args.dropout))
        else:
            self.lstm = nn.LSTM(input_size=300, hidden_size=200, num_layers=2,
                                bias=True, batch_first=True,
                                dropout=args.dropout, bidirectional=True)
        self.hid2tag = nn.Linear(400, odim)

    def forward(self, batch):
        mask = pad_sequence([torch.ones(len(x)) for x in batch], True, 0).byte().cuda()
        if self.args.fix:
            with torch.no_grad():
                x = self.wemb.eval()(batch)
        else:
            x = self.wemb(batch)
        x = self.drop(x)
        if not self.args.lstm:
            x = self.ffn(x)
        else:
            x = Lstm(self.lstm, x, mask.sum(-1))
        x = self.hid2tag(x)
        return x, mask

    def train_batch(self, batch, tags):
        x, mask = self.forward(batch)
        tag_ids = pad_sequence([
            torch.LongTensor([self.args.tag_stoi[t] for t in s]) for s in tags
        ], True, self.args.tag_stoi["<pad>"]).cuda()
        if not self.args.ner:
            loss = nn.functional.cross_entropy(x[mask], tag_ids[mask])
        else:
            loss = self.crf.neg_log_likelihood_loss(x, mask, tag_ids)
        return loss

    def test_batch(self, batch):
        x, mask = self.forward(batch)
        if not self.args.ner:
            path = x.max(-1)[1]
        else:
            _, path = self.crf._viterbi_decode(x, mask)
        path = [p[m].tolist() for p, m in zip(path, mask)]
        tags = [[self.args.tag_itos[i] for i in s] for s in path]
        return tags
def __init__(self, data):
    super(SeqModel, self).__init__()
    self.use_crf = data.use_crf
    print "build network..."
    print "use_char: ", data.use_char
    if data.use_char:
        print "char feature extractor: ", data.char_feature_extractor
    print "word feature extractor: ", data.word_feature_extractor
    print "use crf: ", self.use_crf

    self.gpu = data.HP_gpu
    self.average_batch = data.average_batch_loss
    ## add two more label for downlayer lstm, use original label size for CRF
    label_size = data.label_alphabet_size
    # data.label_alphabet_size += 2
    # self.word_hidden = WordSequence(data, False, True, data.use_char)

    # The linear layer that maps from hidden state space to tag space
    self.hidden2tag = nn.Linear(data.HP_hidden_dim, label_size + 2)

    if self.use_crf:
        self.crf = CRF(label_size, self.gpu)

    if torch.cuda.is_available():
        self.hidden2tag = self.hidden2tag.cuda(self.gpu)

    self.frozen = False
def extractPhraseFromCRFWithColor(phrasedir, systemdir):
    crf_reader = CRF()
    aligner = AlignPhraseAnnotation()

    lectures = annotation.Lectures
    for i, lec in enumerate(lectures):
        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'

            extracted_phrases = []
            extracted_colors = []

            crf_file = os.path.join(systemdir, 'extraction', 'all_output', 'test_%i_%s.out'%(i, prompt))
            for tokens, tags, color0, color1 in crf_reader.read_file_generator_index(crf_file, [0, -1, -4, -3]):
                phrases, phrase_colors = aligner.get_phrase_with_colors(tokens, tags, [color0, color1])

                for phrase, phrase_color in zip(phrases, phrase_colors):
                    extracted_phrases.append(phrase.lower())
                    extracted_colors.append(phrase_color)

            fio.SaveList(extracted_phrases, filename)

            filename = path + prompt + '.' + method + '.key.color'
            fio.SaveDict2Json(extracted_colors, filename)
def crfpp(self, msg):
    print 'crf++'
    crf = CRF()
    crf.create_file_input(msg)
    start_time = time.time()
    os.system('crf_test -m model crf.test.data > crf.result')
    total_time = time.time() - start_time  # return in seconds
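# --- Context note for the CRF++ wrappers in this collection (an assumption, not
# taken from the snippet above): `crf_test` is the CRF++ command-line tagger.
# It expects the test file (`crf.test.data` here, presumably written by
# `create_file_input`) in CRF++ format -- one token per line, whitespace- or
# tab-separated feature columns, and a blank line between sequences -- and
# prints each input line with the predicted tag appended as the last column,
# which the wrapper redirects into `crf.result`. A minimal, hypothetical test
# file might look like:
#
#   confidence  NN
#   in          IN
#   the         DT
#   pound       NN
#   .           .
#
#   (blank line separating the next sentence)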
def image_output(mask, realw, realh, key, channel_bindings, output_dir,
                 no_segmentation_images):
    ######################## COLOR GT #################################
    decoded = decode_segmap(np.array(mask, dtype=np.uint8))
    decoded = Image.fromarray(np.uint8(decoded * 255), 'RGBA')
    basewidth = int(realw)
    hsize = int(realh)
    decoded = decoded.resize((basewidth, hsize), Image.ANTIALIAS)
    decoded.save(os.path.join(output_dir, "{0}_Color_output.png".format(key)))

    if not no_segmentation_images:
        ######################## Primary root GT ###########################
        decoded1 = CRF.decode_channel(mask, [
            channel_bindings['segmentation']['Primary'],
            channel_bindings['heatmap']['Seed']
        ])
        decoded1 = Image.fromarray(decoded1)
        decoded1 = decoded1.resize((basewidth, hsize), Image.NEAREST)
        decoded1 = decoded1.convert('L')
        decoded1.save(os.path.join(output_dir, "{0}_C1.png".format(key)))

        ######################## Lat root GT ###########################
        decoded2 = CRF.decode_channel(
            mask, channel_bindings['segmentation']['Lateral'])
        decoded2 = Image.fromarray(decoded2)
        decoded2 = decoded2.resize((basewidth, hsize), Image.NEAREST)
        decoded2 = decoded2.convert('L')
        decoded2.save(os.path.join(output_dir, "{0}_C2.png".format(key)))
def __init__(self):
    super().__init__()
    self.bert = BertModel.from_pretrained('bert_base/')
    if args.bert_freeze:
        for param in self.bert.parameters():
            param.requires_grad = False
    self.lstm = BiLSTM(
        input_size=args.bert_hidden_size + args.cnn_output_size,
        hidden_size=args.rnn_hidden_size + args.cnn_output_size,
        num_layers=args.rnn_num_layers,
        num_dirs=args.rnn_num_dirs)
    self.lstm_dropout = nn.Dropout(p=args.rnn_dropout)
    self.cnn = CharCNN(embedding_num=len(CHAR_VOCAB),
                       embedding_dim=args.cnn_embedding_dim,
                       filters=eval(args.cnn_filters),
                       output_size=args.cnn_output_size)
    self.crf = CRF(target_size=len(VOCAB) + 2, use_cuda=args.crf_use_cuda)
    self.linear = nn.Linear(in_features=args.rnn_hidden_size + args.cnn_output_size,
                            out_features=len(VOCAB) + 2)
    self.attn = MultiHeadAttention(model_dim=args.rnn_hidden_size + args.cnn_output_size,
                                   num_heads=args.attn_num_heads,
                                   dropout=args.attn_dropout)
    self.feat_dropout = nn.Dropout(p=args.feat_dropout)
def __init__(self, config):
    super(BertNer, self).__init__(config)
    self.bert = BertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    self.crf = CRF(config.num_labels)
    self.init_weights()
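# --- Hedged usage sketch, not part of the BertNer snippet above. It assumes a
# pytorch-crf style interface (batch-first emissions, crf(emissions, tags, mask)
# returning the log-likelihood, crf.decode(emissions, mask) returning the best
# tag paths); the actual CRF class used by BertNer may differ.
def forward(self, input_ids, attention_mask=None, labels=None):
    outputs = self.bert(input_ids, attention_mask=attention_mask)
    sequence_output = self.dropout(outputs[0])      # (batch, seq_len, hidden_size)
    emissions = self.classifier(sequence_output)    # (batch, seq_len, num_labels)
    mask = attention_mask.bool() if attention_mask is not None else None
    if labels is not None:
        # negated CRF log-likelihood as the training loss
        return -self.crf(emissions, labels, mask=mask)
    # list of best tag-id sequences, one per input sentence
    return self.crf.decode(emissions, mask=mask)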
def __init__(self, word2id, char2id, tag2id, pretrain_embedding, embed_dim,
             char_embed_dim, n_hidden):
    super(LSTMTagger, self).__init__()
    self.word2id = word2id  # word dictionary built from the pre-trained embeddings
    self.char2id = char2id
    self.tag2id = tag2id
    self.word_num = len(word2id)
    self.char_num = len(char2id)
    self.tag_num = len(tag2id)
    self.embed_dim = embed_dim
    # load the pre-trained embedding matrix and keep it trainable
    self.embedding = torch.nn.Embedding.from_pretrained(
        torch.FloatTensor(pretrain_embedding), freeze=False)
    # self.pre_embedding = nn.Embedding(self.word_num, self.embedding_dim)
    self.clstm = CharLSTM(
        chrdim=self.char_num,
        embdim=embed_dim,
        char_embed=char_embed_dim,
    )
    self.wlstm = nn.LSTM(input_size=embed_dim + char_embed_dim,
                         hidden_size=n_hidden // 2,
                         num_layers=1,
                         batch_first=True,
                         bidirectional=True)
    self.out = nn.Linear(n_hidden, self.tag_num)
    self.crf = CRF(self.tag_num)
    self.drop = nn.Dropout()
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = BertForTokenClassification.from_pretrained(
            'bert-base-uncased', num_labels=len(tag_stoi) + 2)
        self.g2b = nn.Linear(300, 768)
        self.gate = nn.Linear(768, 1)
        self.crf = CRF(tag_stoi)

    def forward(self, inputs, wids, attention_mask, labels):
        b = self.net.bert.embeddings(input_ids=inputs)
        a = self.gate(b).sigmoid()
        g = self.g2b(wvec[wids].cuda())
        x = (1 - a) * b + a * g
        logits = self.net(inputs_embeds=x, attention_mask=attention_mask)[0]
        first_mask = labels != -100
        mask = lens2mask(first_mask.sum(-1)).cuda()
        logits = torch.zeros(*mask.shape, logits.shape[-1]).cuda().masked_scatter(
            mask[:, :, None], logits[first_mask])
        labels = torch.zeros(*mask.shape).long().cuda().masked_scatter(
            mask, labels[first_mask])
        return logits, mask, labels

    def train_batch(self, inputs, wids, attention_mask, labels):
        logits, mask, labels = self.forward(inputs, wids, attention_mask, labels)
        loss = self.crf.neg_log_likelihood_loss(logits, mask, labels)
        return loss

    def test_batch(self, inputs, wids, attention_mask, labels):
        logits, mask, labels = self.forward(inputs, wids, attention_mask, labels)
        _, path = self.crf._viterbi_decode(logits, mask)
        pred = [[tag_itos[i] for i in p[m]] for p, m in zip(path, mask)]
        return pred
def five_two():
    '''implement your experiments for question 5.2 here'''
    file = open('Q5_2.txt', 'w')
    crf_test = CRF(L=CHARS, F=321)
    W_F = np.load('W_F_{}.npy'.format(7), 'r')
    W_T = np.load('W_T_{}.npy'.format(7), 'r')
    crf_test.set_params(W_F, W_T)
    Y_gen = []
    X_gen = []
    samples_per_length = 50
    for length in range(1, 21):
        Y_gen.append(np.random.choice(CHARS, [samples_per_length, length]))
        X_gen.append(
            np.random.randint(2, size=(samples_per_length, length, 321)))
        t0 = time.time()
        for x, y in zip(X_gen[length - 1], Y_gen[length - 1]):
            predictions = crf_test.predict(x)
        t1 = time.time()
        print('Average time to predict ', samples_per_length,
              'samples of length ', length, 'is',
              (t1 - t0) / samples_per_length)
        file.write(
            str(length) + ',' + str((t1 - t0) / samples_per_length) + '\n')
    file.close()
def __init__(self, n_classes, n_features, **kwargs):
    self.__dict__.update(locals())
    CRF.__init__(self, **kwargs)

    self.n_classes = int(self.n_classes)
    self.n_features = int(self.n_features)
    self.n_parameters = int(self.n_classes * self.n_features + self.n_classes**2)
def __init__(self,
             hidden_size: int,
             output_size: int,
             num_layers: int = 1,
             bidirectional: bool = False,
             dropout_p: float = 0.1,
             device: str = "cpu",
             weights: Optional = None,
             num_embeddings: Optional = None,
             embedding_dim: Optional = None):
    super(NERTagger, self).__init__()
    if weights is not None:
        self.embedding = nn.Embedding.from_pretrained(weights, padding_idx=PAD_IDX)
    else:
        self.embedding = nn.Embedding(num_embeddings=num_embeddings,
                                      embedding_dim=embedding_dim,
                                      padding_idx=PAD_IDX)
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.num_layers = num_layers
    self.dropout_p = dropout_p
    self.bidirectional = bidirectional
    self.device = device
    self.dropout = nn.Dropout(p=dropout_p)
    self.lstm = nn.LSTM(input_size=self.embedding.embedding_dim,
                        hidden_size=hidden_size,
                        bidirectional=bidirectional,
                        num_layers=num_layers,
                        batch_first=True)
    if self.bidirectional:
        hidden_size = 2 * hidden_size
    self.crf = CRF(hidden_size, output_size, device=device)
def run_crf(epoch, score_map, bag_label, bag_index, co_exp_net_isoform,
            co_exp_net_lncRNA, training_size, testing_size, theta, sigma=10):
    bag_label = bag_label[0:training_size]
    bag_index = bag_index[0:training_size]
    positive_unary_energy = 1 - score_map
    crf_isoform = CRF(training_size, testing_size, positive_unary_energy,
                      co_exp_net_isoform, theta, bag_label, bag_index)
    crf_lncRNA = CRF(training_size, testing_size, positive_unary_energy,
                     co_exp_net_lncRNA, theta, bag_label, bag_index)

    label_update_i, pos_prob_crf_i, unary_potential_i, pairwise_potential_i = crf_isoform.inference(10)
    label_update_l, pos_prob_crf_l, unary_potential_l, pairwise_potential_l = crf_lncRNA.inference(10)

    label_update = label_update_i + label_update_l
    pos_prob_crf = pos_prob_crf_i + pos_prob_crf_l
    unary_potential = unary_potential_i + unary_potential_l
    pairwise_potential = pairwise_potential_i + pairwise_potential_l

    if epoch == 0:
        theta_prime_isoform = crf_isoform.parameter_learning(bag_label[0:training_size], theta, sigma)
        theta_prime_lncRNA = crf_lncRNA.parameter_learning(bag_label[0:training_size], theta, sigma)
    else:
        theta_prime_isoform = crf_isoform.parameter_learning(label_update, theta, sigma)
        theta_prime_lncRNA = crf_lncRNA.parameter_learning(label_update, theta, sigma)

    theta_prime = theta_prime_isoform + theta_prime_lncRNA

    return label_update, theta_prime, pos_prob_crf, unary_potential, pairwise_potential
def __init__(
    self,
    nwords,
    nchars,
    ntags,
    pretrained_list,
    run_name,
    exp_name,
    list_of_possible_tags,
    use_char=True,
    use_crf=False,
    class_weights=[],
    learning_rate=0.015,
    learning_decay_rate=0.05,
    weight_decay=1e-8,
):
    super().__init__()
    self.run_name = run_name
    self.exp_name = exp_name
    self.class_weights = torch.Tensor(class_weights)

    # Create word embeddings
    pretrained_tensor = torch.FloatTensor(pretrained_list)
    self.word_embedding = torch.nn.Embedding.from_pretrained(
        pretrained_tensor, freeze=False)
    self.list_of_possible_tags = list_of_possible_tags

    # Create input dropout parameter
    # self.word_dropout = torch.nn.Dropout(1 - KEEP_PROB)

    char_lstm_hidden = 0
    self.use_char = use_char
    if self.use_char:
        # Character-level LSTMs
        self.char_embedding = torch.nn.Embedding(nchars, CHAR_DIM_EMBEDDING)
        self.char_lstm = torch.nn.LSTM(
            CHAR_DIM_EMBEDDING,
            CHAR_LSTM_HIDDEN,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )
        char_lstm_hidden = CHAR_LSTM_HIDDEN

    # Create LSTM parameters
    self.lstm = torch.nn.LSTM(
        DIM_EMBEDDING + char_lstm_hidden,
        LSTM_HIDDEN,
        num_layers=LSTM_LAYER,
        batch_first=True,
        bidirectional=True,
    )
    # Create output dropout parameter
    self.lstm_output_dropout = torch.nn.Dropout(1 - KEEP_PROB)

    # Create final matrix multiply parameters
    self.hidden_to_tag = torch.nn.Linear(LSTM_HIDDEN * 2, ntags)
    self.ntags = ntags
    self.use_crf = use_crf
    if self.use_crf:
        self.crf = CRF(target_size=ntags)
def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim,
             batch_size, max_len):
    super(BiLSTM_CRF, self).__init__()
    self.embedding_dim = embedding_dim
    self.hidden_dim = hidden_dim
    self.vocab_size = vocab_size
    self.batch_size = batch_size
    self.tag_to_ix = tag_to_ix
    self.tagset_size = len(tag_to_ix)
    self.max_len = max_len
    self.crf = CRF(len(tag_to_ix), batch_first=True)

    self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                        num_layers=1, batch_first=True, bidirectional=True)

    # Maps the output of the LSTM into tag space.
    self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

    # Matrix of transition parameters. Entry i,j is the score of
    # transitioning *to* i *from* j.
    self.transitions = nn.Parameter(
        torch.randn(self.tagset_size, self.tagset_size))

    # These two statements enforce the constraint that we never transfer
    # to the start tag and we never transfer from the stop tag
    self.transitions.data[tag_to_ix["<START>"], :] = -10000
    self.transitions.data[:, tag_to_ix["<STOP>"]] = -10000

    self.hidden = self.init_hidden()
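# --- Hedged companion methods for the BiLSTM_CRF constructor above (not in the
# original snippet). The CRF(len(tag_to_ix), batch_first=True) call matches the
# pytorch-crf package, whose forward() returns the log-likelihood and whose
# decode() returns the Viterbi paths; method names below are illustrative
# assumptions, not the repository's own API.
def _emissions(self, sentences):
    embeds = self.word_embeds(sentences)   # (batch, seq_len, embedding_dim)
    lstm_out, _ = self.lstm(embeds)         # (batch, seq_len, hidden_dim)
    return self.hidden2tag(lstm_out)        # (batch, seq_len, tagset_size)

def neg_log_likelihood(self, sentences, tags, mask=None):
    # training loss: negated CRF log-likelihood of the gold tag sequences
    return -self.crf(self._emissions(sentences), tags, mask=mask)

def predict(self, sentences, mask=None):
    # returns a list of best tag-id sequences, one per sentence
    return self.crf.decode(self._emissions(sentences), mask=mask)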
def __init__(self, base_path, oov, num_labels, lstm_hidden_size=128,
             dropout=0.3, lm_flag=False):
    super(Bert_CRF, self).__init__()
    bert_config = BertConfig.from_json_file(
        os.path.join(base_path, 'config.json'))
    bert_config.num_labels = num_labels
    # hidden_states (tuple(torch.FloatTensor), optional, returned when config.output_hidden_states=True)
    bert_config.output_hidden_states = True
    bert_config.output_attentions = True
    self.bert = BertModel.from_pretrained(os.path.join(
        base_path, 'pytorch_model.bin'), config=bert_config)
    self.tokenizer = tokenizer
    self.oov = oov
    self._oov_embed()
    self.dropout = nn.Dropout(dropout)
    # LSTM: input_size = bert_config.hidden_size; the hidden size (second
    # argument) must line up with the first argument of the final Linear layer.
    # TODO: try a bidirectional LSTM
    self.lm_flag = lm_flag
    self.lstm = nn.LSTM(bert_config.hidden_size,
                        lstm_hidden_size,
                        num_layers=1,
                        bidirectional=True,
                        dropout=0.3,
                        batch_first=True)
    self.clf = nn.Linear(256, bert_config.num_labels + 2)
    self.layer_norm = nn.LayerNorm(lstm_hidden_size * 2)
    self.crf = CRF(target_size=bert_config.num_labels,
                   average_batch=True,
                   use_cuda=True)
def __init__(self, opt, tag2label):
    super(Bilstm_crf, self).__init__()
    self.embedding_length = opt.embedding_length
    self.hidden_size = opt.hidden_size
    self.output_size = len(tag2label)
    self.batch_size = opt.batch_size
    self.vocab_size = opt.vocab_size
    self.dropout = opt.dropout
    self.dropout_embed = nn.Dropout(opt.dropout)

    self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_length)
    self.word_embeddings.weight.data.copy_(torch.from_numpy(opt.embeddings))
    self.dropout_embed = nn.Dropout(opt.dropout)
    self.lstm = nn.LSTM(self.embedding_length, self.hidden_size,
                        bidirectional=True, dropout=opt.dropout)
    if self.lstm.bidirectional:
        self.label = nn.Linear(self.hidden_size * 2, self.output_size)
    else:
        self.label = nn.Linear(self.hidden_size, self.output_size)
    self.crf = CRF(self.output_size)
def do_infer(args):
    config = ConfigParser()
    config.read_file(args.config)

    model = CRF(config)

    reader = csv.reader(args.input, delimiter='\t')
    header = next(reader)
    assert all(w in header for w in ["id", "words", "lemmas", "pos_tags",
                                     "doc_char_begin", "doc_char_end", "gloss"]), \
        "Input doesn't have required annotations."
    Sentence = namedtuple('Sentence', header)

    def parse_input(row):
        sentence = Sentence(*row)
        words, lemmas, pos_tags = [parse_psql_array(arr) for arr in
                                   (sentence.words, sentence.lemmas, sentence.pos_tags)]
        return sentence._replace(words=words, lemmas=lemmas, pos_tags=pos_tags)

    writer = csv.writer(args.output, delimiter='\t')
    writer.writerow([
        'id', 'speaker_token_begin', 'speaker_token_end', 'cue_token_begin',
        'cue_token_end', 'content_token_begin', 'content_token_end',
        'content_tokens', 'speaker', 'cue', 'content'])

    for sentences in tqdm(grouper(map(parse_input, reader), args.batch_size)):
        conll = [zip(s.words, s.lemmas, s.pos_tags) for s in sentences]
        for sentence, tags in zip(sentences, model.infer(conll)):
            if "SPKR" not in tags or "CTNT" not in tags:
                continue
            writer.writerow([sentence.id, ] + extract_quote_entries(sentence, tags))
class BiLSTM_CRF(nn.Module):
    def __init__(self, data):
        super(BiLSTM_CRF, self).__init__()
        print "build batched lstmcrf..."
        self.gpu = data.HP_gpu
        label_size = data.label_alphabet_size
        data.label_alphabet_size += 2
        self.lstm = BiLSTM(data)
        self.crf = CRF(label_size, self.gpu)

    def neg_log_likelihood_loss(self, gaz_list, word_inputs, biword_inputs,
                                word_seq_lengths, char_inputs, char_seq_lengths,
                                char_seq_recover, batch_label, mask):
        outs = self.lstm.get_output_score(gaz_list, word_inputs, biword_inputs,
                                          word_seq_lengths, char_inputs,
                                          char_seq_lengths, char_seq_recover)
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        total_loss = self.crf.neg_log_likelihood_loss(outs, mask, batch_label)
        scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        return total_loss, tag_seq

    def forward(self, gaz_list, word_inputs, biword_inputs, word_seq_lengths,
                char_inputs, char_seq_lengths, char_seq_recover, mask):
        outs = self.lstm.get_output_score(gaz_list, word_inputs, biword_inputs,
                                          word_seq_lengths, char_inputs,
                                          char_seq_lengths, char_seq_recover)
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        return tag_seq

    def get_lstm_features(self, gaz_list, word_inputs, biword_inputs,
                          word_seq_lengths, char_inputs, char_seq_lengths,
                          char_seq_recover):
        return self.lstm.get_lstm_features(gaz_list, word_inputs, biword_inputs,
                                           word_seq_lengths, char_inputs,
                                           char_seq_lengths, char_seq_recover)
def __init__(self, words_num, embed_dim, hidden_dim, num_layers, out_class,
             word2idx, dropout=0.2, bi_direction=True):
    super(LSTMCRF, self).__init__()
    self.word2idx = word2idx
    self.bi_direction = bi_direction
    self.hidden_dim = hidden_dim
    self.embed_layer = nn.Embedding(words_num, embed_dim)
    if bi_direction:
        self.rnn = nn.LSTM(embed_dim, hidden_dim // 2,
                           num_layers=num_layers, bidirectional=True)
    else:
        self.rnn = nn.LSTM(embed_dim, hidden_dim,
                           num_layers=num_layers, bidirectional=False)
    self.fc = nn.Linear(hidden_dim, out_class)
    self.crf = CRF(out_class)
def __init__(self, data, circul_time, deepth):
    super(SeqModel_circulationBiLSTM, self).__init__()
    self.use_crf = data.use_crf
    self.use_trans = data.use_trans
    self.use_mapping = data.use_mapping
    print "build network..."
    print "use_char: ", data.use_char
    if data.use_char:
        print "char feature extractor: ", data.char_seq_feature
    print "use_trans: ", data.use_trans
    print "word feature extractor: ", data.word_feature_extractor
    print "use crf: ", self.use_crf

    self.gpu = data.gpu
    self.average_batch = data.average_batch_loss
    # add two more label for downlayer lstm, use original label size for CRF
    label_size = data.label_alphabet_size
    data.label_alphabet_size += 2
    self.word_hidden = WordSequence_circulationBiLSTM(data, circul_time, deepth)

    if self.use_crf:
        self.crf = CRF(label_size, self.gpu)
def __init__(self, dicts, config):
    super(EntityDetection, self).__init__()
    self.config = config
    self.embed = Embeddings(word_vec_size=config.d_embed, dicts=dicts)
    if self.config.rnn_type.lower() == 'gru':
        self.rnn = nn.GRU(input_size=config.d_embed,
                          hidden_size=config.d_hidden,
                          num_layers=config.n_layers,
                          dropout=config.dropout_prob,
                          bidirectional=config.birnn)
    else:
        self.rnn = nn.LSTM(input_size=config.d_embed,
                           hidden_size=config.d_hidden,
                           num_layers=config.n_layers,
                           dropout=config.dropout_prob,
                           bidirectional=config.birnn)
    self.dropout = nn.Dropout(p=config.dropout_prob)
    self.relu = nn.ReLU()
    seq_in_size = config.d_hidden
    if self.config.birnn:
        seq_in_size *= 2
    self.hidden2tag = nn.Sequential(
        nn.Linear(seq_in_size, seq_in_size),
        nn.BatchNorm1d(seq_in_size),
        self.relu,
        self.dropout,
        nn.Linear(seq_in_size, config.n_out)
    )
    self.crf = CRF(config.n_out)
class deepBiLSTM_CRF(nn.Module):
    def __init__(self, word_HPs, char_HPs, num_labels=None, drop_final=0.5):
        super(deepBiLSTM_CRF, self).__init__()
        [word_size, word_dim, word_pre_embs, word_hidden_dim,
         word_dropout, word_layers, word_bidirect] = word_HPs
        if char_HPs:
            [char_size, char_dim, char_pred_embs, char_hidden_dim,
             char_dropout, char_layers, char_bidirect] = char_HPs

        self.lstm = Deep_bisltm(word_HPs, char_HPs, num_labels, att=True)
        # add two more labels for CRF
        self.crf = CRF(num_labels + 2, use_cuda)
        ## add two more labels to learn hidden features for start and end transition
        self.hidden2tag = nn.Linear(2 * word_hidden_dim, num_labels + 2)
        self.dropfinal = nn.Dropout(drop_final)
        if use_cuda:
            self.hidden2tag = self.hidden2tag.cuda()
            self.dropfinal = self.dropfinal.cuda()

    def NLL_loss(self, label_score, mask_tensor, label_tensor):
        batch_loss = self.crf.neg_log_likelihood_loss(label_score, mask_tensor, label_tensor)
        return batch_loss

    def inference(self, label_score, mask_tensor):
        label_prob, label_pred = self.crf._viterbi_decode(label_score, mask_tensor)
        return label_prob, label_pred

    def forward(self, word_inputs, word_seq_lengths, char_inputs,
                char_seq_lengths, char_seq_recover):
        # (batch_size, sequence_len, hidden_dim)
        rnn_out = self.lstm.get_all_atthiddens(word_inputs, word_seq_lengths,
                                               char_inputs, char_seq_lengths,
                                               char_seq_recover)
        # (batch_size, sequence_len, num_labels+2)
        label_score = self.hidden2tag(rnn_out)
        label_score = self.dropfinal(label_score)
        return label_score
def __init__(self, char_init_embed, word_init_embed, pos_init_embed,
             spo_embed_dim, sentence_length, hidden_size, num_classes,
             dropout=0.3, id2words=None, encoding_type='bieso', weight=None):
    super().__init__()
    # self.Embedding = nn.Embedding(init_embed)
    # print(char_init_embed)
    self.char_embed = nn.Embedding(char_init_embed[0], char_init_embed[1])
    self.word_embed = nn.Embedding(word_init_embed[0], word_init_embed[1])
    # word2vec
    self.word_embed.weight.data.copy_(torch.from_numpy(weight))
    self.pos_embed = nn.Embedding(pos_init_embed[0], pos_init_embed[1])
    # spo embed size: 50
    self.embed_dim = (self.char_embed.embedding_dim + self.word_embed.embedding_dim
                      + self.pos_embed.embedding_dim + spo_embed_dim)
    # sentence length
    #self.sen_len = sentence_length
    #self.zeros = torch.zeros(self.sen_len, dtype=torch.long)
    self.norm1 = torch.nn.LayerNorm(self.embed_dim)
    self.Rnn = nn.LSTM(input_size=self.embed_dim, hidden_size=hidden_size,
                       num_layers=2, dropout=dropout, bidirectional=True,
                       batch_first=True)
    self.Linear1 = nn.Linear(hidden_size * 2, hidden_size * 2 // 3)
    self.norm2 = torch.nn.LayerNorm(hidden_size * 2 // 3)
    self.relu = torch.nn.LeakyReLU()
    self.drop = torch.nn.Dropout(dropout)
    self.Linear2 = nn.Linear(hidden_size * 2 // 3, num_classes)

    if id2words is None:
        self.Crf = CRF(num_classes, include_start_end_trans=False)
    else:
        self.Crf = CRF(num_classes, include_start_end_trans=False,
                       allowed_transitions=allowed_transitions(
                           id2words, encoding_type=encoding_type))
def __init__(self, vocab_tag, char_embed_size, num_hidden_layer, channel_size,
             kernel_size, dropout_rate=0.2):
    super(CharWordSeg, self).__init__()
    self.vocab_tag = vocab_tag
    self.char_embed_size = char_embed_size
    self.num_hidden_layer = num_hidden_layer
    self.channel_size = channel_size
    self.kernel_size = kernel_size
    self.dropout_rate = dropout_rate

    num_tags = len(self.vocab_tag['tag_to_index'])
    vocab_size = len(self.vocab_tag['token_to_index'])
    self.char_embedding = nn.Embedding(vocab_size, char_embed_size)
    self.dropout_embed = nn.Dropout(dropout_rate)
    self.glu_layers = nn.ModuleList(
        [ConvGLUBlock(in_channels=char_embed_size,
                      out_channels=channel_size,
                      kernel_size=kernel_size,
                      drop_out=0.2,
                      padding=1)] +
        [ConvGLUBlock(in_channels=channel_size,
                      out_channels=channel_size,
                      kernel_size=kernel_size,
                      drop_out=0.2,
                      padding=1) for _ in range(num_hidden_layer - 1)])
    self.hidden_to_tag = nn.Linear(char_embed_size, num_tags)
    self.crf_layer = CRF(num_tags, batch_first=True)
def __init__(self, label_size, input_dim):
    super(CRFDecoder, self).__init__()
    self.input_dim = input_dim
    self.linear = nn.Linear(in_features=input_dim, out_features=label_size)
    self.crf = CRF(label_size + 2)
    self.label_size = label_size
    self.init_weights()
def __init__(self, data, opt):
    super(SeqModel, self).__init__()
    self.gpu = opt.gpu
    ## add two more label for downlayer lstm, use original label size for CRF
    self.word_hidden = WordSequence(data, opt)
    self.crf = CRF(data.label_alphabet.size(), self.gpu)
def __init__(self, data):
    super(BiLSTM_CRF, self).__init__()
    print "build batched lstmcrf..."
    self.gpu = data.HP_gpu
    label_size = data.label_alphabet_size
    data.label_alphabet_size += 2
    self.lstm = BiLSTM(data)
    self.crf = CRF(label_size, self.gpu)
def __init__(self, utterance_encoder: PreTrainedModel,
             conversation_encoder: nn.RNNBase, n_classes: int) -> None:
    super(UtteranceClassificationHRNN, self).__init__()
    self.hrnn = HierarchicalRNN(utterance_encoder, conversation_encoder)
    self.output_layer = nn.Linear(
        in_features=conversation_encoder.hidden_size, out_features=n_classes)
    self.crf = CRF(n_classes)
def __init__(self, vocab_size, word_embed_dim, word_hidden_dim, alphabet_size,
             char_embedding_dim, char_hidden_dim, feature_extractor, tag_num,
             dropout, pretrain_embed=None, use_char=False, use_crf=False,
             use_gpu=False):
    super(NamedEntityRecog, self).__init__()
    self.use_crf = use_crf
    self.use_char = use_char
    self.drop = nn.Dropout(dropout)
    self.input_dim = word_embed_dim
    self.feature_extractor = feature_extractor

    self.embeds = nn.Embedding(vocab_size, word_embed_dim, padding_idx=0)
    if pretrain_embed is not None:
        self.embeds.weight.data.copy_(torch.from_numpy(pretrain_embed))
    else:
        self.embeds.weight.data.copy_(
            torch.from_numpy(self.random_embedding(vocab_size, word_embed_dim)))

    if self.use_char:
        self.input_dim += char_hidden_dim
        self.char_feature = CharCNN(alphabet_size, char_embedding_dim,
                                    char_hidden_dim, dropout)

    if feature_extractor == 'lstm':
        self.lstm = nn.LSTM(self.input_dim, word_hidden_dim,
                            batch_first=True, bidirectional=True)
    else:
        self.word2cnn = nn.Linear(self.input_dim, word_hidden_dim * 2)
        self.cnn_list = list()
        for _ in range(4):
            self.cnn_list.append(
                nn.Conv1d(word_hidden_dim * 2, word_hidden_dim * 2,
                          kernel_size=3, padding=1))
            self.cnn_list.append(nn.ReLU())
            self.cnn_list.append(nn.Dropout(dropout))
            self.cnn_list.append(nn.BatchNorm1d(word_hidden_dim * 2))
        self.cnn = nn.Sequential(*self.cnn_list)

    if self.use_crf:
        self.hidden2tag = nn.Linear(word_hidden_dim * 2, tag_num + 2)
        self.crf = CRF(tag_num, use_gpu)
    else:
        self.hidden2tag = nn.Linear(word_hidden_dim * 2, tag_num)
def do_train(args):
    # Load configuration
    config = ConfigParser()
    config.read_file(args.config)

    data = DataStore(config)
    # Create the CRF model.
    model = CRF(config)

    retrain_epochs = config["training"].getint("retrain_every")

    accuracy = []
    with EditShell(config) as shell:
        while data.has_next():
            conll = data.next()
            i = data.i()

            # if the data doesn't have tags, try to smart-tag them.
            if len(conll[0]) == DataStore.TAG_LABEL + 1:
                tags = [tok[DataStore.TAG_LABEL] for tok in conll]
            else:
                tags = model.infer(conll)

            try:
                #conll_display = ["{}/{}".format(token[0], token[2]) for token in conll]
                conll_display = ["{}".format(token[0]) for token in conll]
                # Create a copy of the list
                action = shell.run(conll_display, list(tags),
                                   metadata=render_progress(data, accuracy))

                if action.type == ":prev":
                    try:
                        data.rewind(2)  # move 2 indices back
                    except AttributeError:
                        data.rewind(1)
                elif action.type == ":goto":
                    doc_idx, = action.args
                    assert doc_idx >= 0
                    data.goto(doc_idx)
                elif action.type == "save":
                    _, tags_ = action.args
                    accuracy.append(score(tags, tags_))
                    data.update(conll, tags_)
                    if i % retrain_epochs == 0:
                        model.retrain()
            except QuitException:
                break
def main(): parser = OptionParser() parser.add_option("-d", dest="training_dir", help="training data directory") parser.add_option("-t", dest="test_dir", help="test data directory") parser.add_option("-f", dest="test_file", help="test data file") parser.add_option("-m", dest="model", help="model file") parser.add_option("-l", dest="regularity", type="int", help="regularity. 0=none, 1=L1, 2=L2 [2]", default=2) (options, args) = parser.parse_args() if not options.training_dir and not options.model: parser.error("need training data directory(-d) or model file(-m)") features = pg_features(["H", "B", "F"]) crf = CRF(features, options.regularity) print "features:", features.size() print "labels:", len(features.labels) if options.training_dir: texts, labels = load_dir(options.training_dir) fvs = [FeatureVector(features, x, y) for x, y in zip(texts, labels)] # initial parameter (pick up max in 10 random parameters) theta = sorted([crf.random_param() for i in range(10)], key=lambda t:crf.likelihood(fvs, t))[-1] # inference print "log likelihood (before inference):", crf.likelihood(fvs, theta) theta = crf.inference(fvs, theta) if options.model: f = open(options.model, 'w') f.write(pickle.dumps(theta)) f.close() else: f = open(options.model, 'r') theta = pickle.loads(f.read()) f.close() if features.size() != len(theta): raise ValueError, "model's length not equal feature's length." if options.test_dir: test_files = glob.glob(options.test_dir + '/*') elif options.test_file: test_files = [options.test_file] else: test_files = [] i = 0 for filename in test_files: print "========== test = ", i text, label = load_file(filename) pg_tagging(FeatureVector(features, text), text, label, crf, features, theta) i += 1
def test_cross_course(train, name='all'):
    wapiti_home = global_params.wapiti_dir

    pattern_file = '../data/%s.pattern.txt'%name

    model_dir = '../data/%s/%s/model/%s/'%(course, system, name)
    fio.NewPath(model_dir)

    feature_dir = '../data/%s/%s/extraction/'%(course, system)
    feature_cv_dir = '../data/%s/%s/extraction/%s/'%(course, system, name)
    fio.NewPath(feature_cv_dir)

    outputdir = '../data/%s/%s/extraction/%s_output/'%(course, system, name)
    fio.NewPath(outputdir)

    lectures = annotation.Lectures
    dict = defaultdict(int)

    for i, lec in enumerate(lectures):
        test = [lec]

        model_file = os.path.join(model_dir, '%s.model'%train)

        print model_file

        crf = CRF(wapiti_home)
        if not fio.IsExist(model_file):
            print "Model is not available"

        for q in ['q1', 'q2']:
            test_filename = os.path.join(feature_cv_dir, 'test_%d_%s.feature.crf'%(i, q))
            output_file = os.path.join(outputdir, 'test_%d_%s.out'%(i, q))

            dict['test_%d_%s'%(i, q)] = 1

            if method == 'combine':
                test_filename_old = test_filename.replace('_combine', '_A1')
                cmd = 'cp %s %s'%(test_filename_old, test_filename)
                os.system(cmd)
            else:
                combine_files(feature_dir, test, test_filename, prompts=[q])

            crf.predict(test_filename, model_file, output_file)

        if debug:
            break

    file_util.save_dict2json(dict, class_index_dict_file)
def extractPhraseFromCRF(phrasedir, systemdir):
    crf_reader = CRF()
    aligner = AlignPhraseAnnotation()

    lectures = annotation.Lectures
    for i, lec in enumerate(lectures):
        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            phrases = []

            crf_file = os.path.join(systemdir, 'extraction', 'all_output', 'test_%i_%s.out'%(i, prompt))
            for tokens, tags in crf_reader.read_file_generator(crf_file):
                for phrase in aligner.get_phrase(tokens, tags):
                    phrases.append(phrase.lower())

            fio.SaveList(phrases, filename)
def crfpp(self, msg):
    crf = CRF()
    fileUtil = FileUtil()
    crf.create_file_input(msg)
    os.system('crf_test -m ../model1 crf.test.data > crf.result')
    lst = fileUtil.read_file('crf.result')
    # lst = [a for a in lst if a != u'\n']
    # str_ans = reduce(lambda x,y:x+y, [a.split('\t')[0] for a in lst])
    # ans = reduce(lambda x,y:x+y, [a.split('\t')[3][:-1] for a in lst])
    # lst_col3 = [a.split('\t')[3][:-1] for a in lst]
    lst_col3, str_ans = self.process_ans(lst)
    lst_ans = [n for (n, e) in enumerate(lst_col3) if e == 'B']

    result_lst = []
    for i in range(len(lst_ans) - 1):
        a = lst_ans[i]
        b = lst_ans[i + 1]
        result_lst.append(str_ans[a:b])
    result_lst.append(str_ans[b:len(str_ans)])

    return result_lst
def crfpp(self, msg, model):
    crf = CRF()
    fileUtil = FileUtil()
    crf.create_file_input(msg)
    start_time = time.time()
    os.system('crf_test -m ' + model + ' crf.test.data > logs/out/crf.result')
    total_time = time.time() - start_time  # return in seconds
    lst = fileUtil.read_file('logs/out/crf.result')
    # lst = [a for a in lst if a != u'\n']
    # str_ans = reduce(lambda x,y:x+y, [a.split('\t')[0] for a in lst])
    # ans = reduce(lambda x,y:x+y, [a.split('\t')[3][:-1] for a in lst])
    # lst_col3 = [a.split('\t')[3][:-1] for a in lst]
    lst_col3, str_ans = self.process_ans(lst)
    lst_ans = [n for (n, e) in enumerate(lst_col3) if e == 'B']

    result_lst = []
    for i in range(len(lst_ans) - 1):
        a = lst_ans[i]
        b = lst_ans[i + 1]
        result_lst.append(str_ans[a:b])
    result_lst.append(str_ans[b:len(str_ans)])

    return total_time, result_lst
from crf import CRF
from features import *
import re, sys
import pickle

training_file = sys.argv[1]

if __name__ == '__main__':
    labels, obsrvs, word_sets, word_data, label_data = fit_dataset(training_file)
    crf = CRF(
        labels=list(labels),
        feature_functions=Membership.functions(labels, *word_sets.values()) +
        MatchRegex.functions(labels,
                             '^[^0-9a-zA-Z\-]+$',
                             '^[^0-9\-]+$',
                             '^[A-Z]+$',
                             '^-?[1-9][0-9]*\.[0-9]+$',
                             '^[1-9][0-9\.]+[a-z]+$',
                             '^[0-9]+$',
                             '^[A-Z][a-z]+$',
                             '^([A-Z][a-z]*)+$',
                             '^[^aeiouAEIOU]+$'))  # + [
    #     lambda yp, y, x_v, i, _y=_y, _x=_x:
    #         1 if i < len(x_v) and y == _y and x_v[i].lower() == _x else 0
    #     for _y in labels
    #     for _x in obsrvs
    # ])
    crf.train(word_data[:-5], label_data[:-5])
    pickle.dump(crf, open(sys.argv[2], 'wb'))
    for i in range(-5, 0):
        print word_data[i]
def main():
    def load_data(data):
        texts = []
        labels = []
        text = []
        data = "\n" + data + "\n"
        for line in data.split("\n"):
            line = line.strip()
            if len(line) == 0:
                if len(text) > 0:
                    texts.append(text)
                    labels.append(label)
                text = []
                label = []
            else:
                token, info, chunk = line.split()
                text.append((token, info))
                label.append(chunk)
        return (texts, labels)

    texts, labels = load_data("""
This DT B-NP
temblor-prone JJ I-NP
city NN I-NP
dispatched VBD B-VP
inspectors NNS B-NP
, , O
firefighters NNS B-NP
and CC O
other JJ B-NP
earthquake-trained JJ I-NP
personnel NNS I-NP
to TO B-VP
aid VB I-VP
San NNP B-NP
Francisco NNP I-NP
. . O
""")

    print texts, labels

    test_texts, test_labels = load_data("""
Rockwell NNP B-NP
said VBD B-VP
the DT B-NP
agreement NN I-NP
calls VBZ B-VP
for IN B-SBAR
it PRP B-NP
to TO B-VP
supply VB I-VP
200 CD B-NP
additional JJ I-NP
so-called JJ I-NP
shipsets NNS I-NP
for IN B-PP
the DT B-NP
planes NNS I-NP
. . O
""")

    features = Features(labels)
    tokens = dict([(i[0], 1) for x in texts for i in x]).keys()
    infos = dict([(i[1], 1) for x in texts for i in x]).keys()

    for label in features.labels:
        for token in tokens:
            features.add_feature(lambda x, y, l=label, t=token: 1 if y == l and x[0] == t else 0)
        for info in infos:
            features.add_feature(lambda x, y, l=label, i=info: 1 if y == l and x[1] == i else 0)
    features.add_feature_edge(lambda y_, y: 0)

    fvs = [FeatureVector(features, x, y) for x, y in zip(texts, labels)]
    fv = fvs[0]
    text_fv = FeatureVector(features, test_texts[0])  # text sequence without labels

    crf = CRF(features, 0)
    theta0 = crf.random_param()
    print "initial log likelihood:", crf.likelihood(fvs, theta0)

    print ">> Steepest Descent"
    theta = theta0.copy()
    eta = 0.5
    t = time.time()
    for i in range(20):
        theta += eta * crf.gradient_likelihood(fvs, theta)
        print i, "log likelihood:", crf.likelihood(fvs, theta)
        eta *= 0.95
    print "time = %.3f, relevant features = %d / %d" % (time.time() - t, (numpy.abs(theta) > 0.00001).sum(), theta.size)

    print ">> SGD"
    theta = theta0.copy()
    eta = 0.5
    t = time.time()
    for i in range(20):
        for fv in fvs:
            theta += eta * crf.gradient_likelihood([fv], theta)
        print i, "log likelihood:", crf.likelihood(fvs, theta)
        eta *= 0.95
    print "time = %.3f, relevant features = %d / %d" % (time.time() - t, (numpy.abs(theta) > 0.00001).sum(), theta.size)

    print ">> SGD + FOBOS L1"
    theta = theta0.copy()
    eta = 0.5
    lmd = 0.01
    t = time.time()
    for i in range(20):
        lmd_eta = lmd * eta
        for fv in fvs:
            theta += eta * crf.gradient_likelihood([fv], theta)
            theta = (theta > lmd_eta) * (theta - lmd_eta) + (theta < -lmd_eta) * (theta + lmd_eta)
        print i, "log likelihood:", crf.likelihood(fvs, theta)
        eta *= 0.95
    print "time = %.3f, relevant features = %d / %d" % (time.time() - t, (numpy.abs(theta) > 0.00001).sum(), theta.size)

    print ">> Steepest Descent + FOBOS L1"
    theta = theta0.copy()
    eta = 0.2
    lmd = 0.5
    t = time.time()
    for i in range(20):
        theta += eta * crf.gradient_likelihood(fvs, theta)
        lmd_eta = lmd * eta
        theta = (theta > lmd_eta) * (theta - lmd_eta) + (theta < -lmd_eta) * (theta + lmd_eta)
        print i, "log likelihood:", crf.likelihood(fvs, theta)
        eta *= 0.9
    print "time = %.3f, relevant features = %d / %d" % (time.time() - t, (numpy.abs(theta) > 0.00001).sum(), theta.size)
    #print theta

    print ">> BFGS"
    t = time.time()
    theta = crf.inference(fvs, theta0)
    print "log likelihood:", crf.likelihood(fvs, theta)
    print "time = %.3f, relevant features = %d / %d" % (time.time() - t, (numpy.abs(theta) > 0.00001).sum(), theta.size)
def main(): parser = OptionParser() parser.add_option("-d", dest="training_dir", help="training data directory") parser.add_option("-t", dest="test_dir", help="test data directory") parser.add_option("-f", dest="test_file", help="test data file") parser.add_option("-m", dest="model", help="model file") parser.add_option("-b", dest="body", action="store_true", help="output body") parser.add_option("-l", dest="regularity", type="int", help="regularity. 0=none, 1=L1, 2=L2 [2]", default=2) parser.add_option("--l1", dest="fobos_l1", action="store_true", help="FOBOS L1", default=False) (options, args) = parser.parse_args() if not options.training_dir and not options.model: parser.error("need training data directory(-d) or model file(-m)") theta = LABELS = None if options.model and os.path.isfile(options.model): with open(options.model, 'r') as f: LABELS, theta = pickle.loads(f.read()) if options.training_dir: texts, labels = load_dir(options.training_dir) if LABELS == None: LABELS = unique(flatten(labels)) features = wce_features(LABELS) crf = CRF(features, options.regularity) if options.training_dir: fvs = [FeatureVector(features, x, y) for x, y in zip(texts, labels)] # initial parameter (pick up max in 10 random parameters) if theta == None: theta = sorted([crf.random_param() for i in range(10)], key=lambda t:crf.likelihood(fvs, t))[-1] # inference print "features:", features.size() print "labels:", len(features.labels), features.labels print "log likelihood (before inference):", crf.likelihood(fvs, theta) if options.fobos_l1: eta = 0.000001 for i in range(0): for fv in fvs: theta += eta * crf.gradient_likelihood([fv], theta) print i, "log likelihood:", crf.likelihood(fvs, theta) eta *= 0.98 lmd = 1 while lmd < 200: for i in range(50): theta += eta * crf.gradient_likelihood(fvs, theta) lmd_eta = lmd * eta theta = (theta > lmd_eta) * (theta - lmd_eta) + (theta < -lmd_eta) * (theta + lmd_eta) if i % 10 == 5: print i, "log likelihood:", crf.likelihood(fvs, theta) #eta *= 0.95 import numpy print "%d : relevant features = %d / %d" % (lmd, (numpy.abs(theta) > 0.00001).sum(), theta.size) with open(options.model + str(lmd), 'w') as f: f.write(pickle.dumps((LABELS, theta))) lmd += 1 else: theta = crf.inference(fvs, theta) print "log likelihood (after inference):", crf.likelihood(fvs, theta) if options.model: with open(options.model, 'w') as f: f.write(pickle.dumps((LABELS, theta))) elif features.size() != len(theta): raise ValueError, "model's length not equal feature's length." 
    if options.test_dir:
        test_files = glob.glob(options.test_dir + '/*.htm*')
    elif options.test_file:
        test_files = [options.test_file]
    else:
        test_files = []

    for x in sorted(theta):
        print x,
    print

    corrects = blocks = 0
    for i, filename in enumerate(test_files):
        if not options.body:
            print "========== test = ", i
        text, label = load_file(filename)
        fv = FeatureVector(features, text)
        prob, ys = crf.tagging(fv, theta)
        tagged_label = features.id2label(ys)

        cor, blo = len(filter(lambda x: x[0] == x[1], zip(label, tagged_label))), len(label)
        corrects += cor
        blocks += blo
        print "log_likely = %.3f, rate = %d / %d" % (prob, cor, blo)

        if options.body:
            for x, l in zip(text, tagged_label):
                if l == "body":
                    print re.sub(r'\s+', ' ', re.sub(r'(?s)<[^>]+>', '', x.org_text)).strip()
        else:
            #wce_output_tagging(text, label, prob, tagged_label)
            map = CountDict()
            for x in zip(label, tagged_label):
                map[x] += 1
            for x in sorted(map):
                print x[0], " => ", x[1], " : ", map[x]

    if blocks > 0:
        print "total : %d / %d = %.3f%%" % (corrects, blocks, 100.0 * corrects / blocks)
def __init__(self, label_alphabet, feature_alphabet):
    self.label_alphabet = label_alphabet
    self.feature_alphabet = feature_alphabet
    CRF.__init__(self, len(self.label_alphabet), len(self.feature_alphabet))
def __call__(self, x):
    return self.label_alphabet.lookup_many(CRF.__call__(self, x))
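# --- Hedged usage sketch for the two wrapper methods above (the class name and
# variable names are assumptions; the wrapper itself is not shown in full).
# The subclass delegates decoding to the base CRF and maps the predicted label
# indices back to label strings through the alphabet:
#
#   tagger = AlphabetCRF(label_alphabet, feature_alphabet)
#   predicted_labels = tagger(feature_sequence)   # e.g. ['B-PER', 'I-PER', 'O', ...]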