def train_leave_one_lecture_out(name='cv'):
    wapiti_home = global_params.wapiti_dir
    
    pattern_file = '../data/%s.pattern.txt'%name
    model_dir = '../data/%s/%s/model/%s/'%(course, system, name)
    fio.NewPath(model_dir)
    
    feature_dir = '../data/%s/%s/extraction/'%(course, system)
    feature_cv_dir = '../data/%s/%s/extraction/%s/'%(course, system, name)
    fio.NewPath(feature_cv_dir)
    
    outputdir = '../data/%s/%s/extraction/%s_output/'%(course, system, name)
    fio.NewPath(outputdir)
    
    lectures = annotation.Lectures
    
    dict = defaultdict(int)
    
    for i, lec in enumerate(lectures):
        train = [x for x in lectures if x != lec]
        test = [lec]
        
        train_filename = os.path.join(feature_cv_dir, 'train_%d.feature.crf'%i)
        
        model_file = os.path.join(model_dir, '%d.model'%i)
        
        print train_filename
        print model_file
        
        crf = CRF(wapiti_home)
        if not fio.IsExist(model_file):
        #if True:
            combine_files(feature_dir, train, train_filename)
            crf.train(train_filename, pattern_file, model_file)
        
        for q in ['q1', 'q2']:
            
            test_filename = os.path.join(feature_cv_dir, 'test_%d_%s.feature.crf'%(i, q))
            output_file = os.path.join(outputdir, 'test_%d_%s.out'%(i, q))
            
            dict['test_%d_%s'%(i, q)] = 1
            
            if empty == 'Y':
                test_filename_old = test_filename.replace('_Y', '_N')
                cmd = 'cp %s %s'%(test_filename_old, test_filename)
                os.system(cmd)
            else:
                
                if method == 'combine':
                    test_filename_old = test_filename.replace('_combine', '_A1')
                    cmd = 'cp %s %s'%(test_filename_old, test_filename)
                    os.system(cmd)
                else:
                    combine_files(feature_dir, test, test_filename, prompts=[q])
            
            crf.predict(test_filename, model_file, output_file)
        
        if debug: break
    
    file_util.save_dict2json(dict, class_index_dict_file)
def train_on_course(traincourse, name='all'):
    wapiti_home = global_params.wapiti_dir
    
    pattern_file = '../data/%s.pattern.txt'%name
    model_dir = '../data/%s/%s/model/%s/'%(course, system, name)
    fio.NewPath(model_dir)
    
    feature_dir = '../data/%s/%s/extraction/'%(traincourse, system)
    feature_cv_dir = '../data/%s/%s/extraction/%s/'%(traincourse, system, name)
    fio.NewPath(feature_cv_dir)
    
    outputdir = '../data/%s/%s/extraction/%s_output/'%(course, system, name)
    fio.NewPath(outputdir)
    
    if traincourse == 'IE256':
        lectures = [x for x in range(14, 26) if x != 22]
    else:
        lectures = [x for x in range(3, 27)]
    
    dict = defaultdict(int)
    
    train = [x for x in lectures]
    
    train_filename = os.path.join(feature_cv_dir, 'train.feature.crf')
    
    model_file = os.path.join(model_dir, '%s.model'%traincourse)
    
    print train_filename
    print model_file
    
    crf = CRF(wapiti_home)
    if not fio.IsExist(model_file):
    #if True:
        combine_files(feature_dir, train, train_filename)
        crf.train(train_filename, pattern_file, model_file)
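
The CRF class used in the wapiti-based functions above (constructed as CRF(wapiti_home) and driven through train(train_filename, pattern_file, model_file) and predict(test_filename, model_file, output_file)) is not shown on this page. Purely as a hedged illustration, a minimal wrapper with the same call signatures might shell out to the standard wapiti command line roughly as follows; the class name, binary path, and flags are assumptions, not the project's actual implementation.

# Minimal sketch of a wapiti-backed wrapper (an assumption, not the original CRF class).
import os
import subprocess

class WapitiCRF(object):
    def __init__(self, wapiti_home):
        # assume the wapiti binary lives inside wapiti_home
        self.wapiti = os.path.join(wapiti_home, 'wapiti')

    def train(self, train_file, pattern_file, model_file):
        # wapiti train -p <pattern file> <train data> <model file>
        subprocess.check_call([self.wapiti, 'train', '-p', pattern_file,
                               train_file, model_file])

    def predict(self, test_file, model_file, output_file):
        # wapiti label -m <model file> <test data> <output file>
        subprocess.check_call([self.wapiti, 'label', '-m', model_file,
                               test_file, output_file])
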
Example #3
    def __init__(self, nwords, nchars, ntags, pretrained_list):
        super().__init__()

        # Create word embeddings
        pretrained_tensor = torch.FloatTensor(pretrained_list)
        self.word_embedding = torch.nn.Embedding.from_pretrained(
            pretrained_tensor, freeze=False)
        # Create input dropout parameter
        self.word_dropout = torch.nn.Dropout(1 - KEEP_PROB)
        # Create LSTM parameters
        self.lstm = torch.nn.LSTM(DIM_EMBEDDING + CHAR_LSTM_HIDDEN,
                                  LSTM_HIDDEN,
                                  num_layers=LSTM_LAYER,
                                  batch_first=True,
                                  bidirectional=True)
        # Create output dropout parameter
        self.lstm_output_dropout = torch.nn.Dropout(1 - KEEP_PROB)

        # Character-level LSTMs
        self.char_embedding = torch.nn.Embedding(nchars, CHAR_DIM_EMBEDDING)
        self.char_lstm = torch.nn.LSTM(CHAR_DIM_EMBEDDING,
                                       CHAR_LSTM_HIDDEN,
                                       num_layers=1,
                                       batch_first=True,
                                       bidirectional=False)

        # Create final matrix multiply parameters
        self.hidden_to_tag = torch.nn.Linear(LSTM_HIDDEN * 2, ntags + 2)

        self.crf = CRF(target_size=ntags)
Example #4
class Net(nn.Module):
    def __init__(self, args):
        super().__init__()
        self.args = args

        self.wemb = Wemb(args)
        self.drop = nn.Dropout(args.dropout)
        odim = len(args.tag_stoi)
        if args.ner:
            self.crf = CRF(args.tag_stoi)
            odim = len(args.tag_stoi) + 2
        if not args.lstm:
            self.ffn = nn.Sequential(nn.Linear(300, 400), nn.ReLU(),
                                     nn.Dropout(args.dropout))
        else:
            self.lstm = nn.LSTM(input_size=300,
                                hidden_size=200,
                                num_layers=2,
                                bias=True,
                                batch_first=True,
                                dropout=args.dropout,
                                bidirectional=True)
        self.hid2tag = nn.Linear(400, odim)

    def forward(self, batch):
        mask = pad_sequence([torch.ones(len(x)) for x in batch], True,
                            0).byte().cuda()
        if self.args.fix:
            with torch.no_grad():
                x = self.wemb.eval()(batch)
        else:
            x = self.wemb(batch)
        x = self.drop(x)
        if not self.args.lstm:
            x = self.ffn(x)
        else:
            x = Lstm(self.lstm, x, mask.sum(-1))
        x = self.hid2tag(x)
        return x, mask

    def train_batch(self, batch, tags):
        x, mask = self.forward(batch)
        tag_ids = pad_sequence([
            torch.LongTensor([self.args.tag_stoi[t] for t in s]) for s in tags
        ], True, self.args.tag_stoi["<pad>"]).cuda()
        if not self.args.ner:
            loss = nn.functional.cross_entropy(x[mask], tag_ids[mask])
        else:
            loss = self.crf.neg_log_likelihood_loss(x, mask, tag_ids)
        return loss

    def test_batch(self, batch):
        x, mask = self.forward(batch)
        if not self.args.ner:
            path = x.max(-1)[1]
        else:
            _, path = self.crf._viterbi_decode(x, mask)
        path = [p[m].tolist() for p, m in zip(path, mask)]
        tags = [[self.args.tag_itos[i] for i in s] for s in path]
        return tags
Example #5
    def __init__(self, data):
        super(SeqModel, self).__init__()
        self.use_crf = data.use_crf
        print "build network..."
        print "use_char: ", data.use_char
        if data.use_char:
            print "char feature extractor: ", data.char_feature_extractor
        print "word feature extractor: ", data.word_feature_extractor
        print "use crf: ", self.use_crf

        self.gpu = data.HP_gpu
        self.average_batch = data.average_batch_loss
        ## add two more labels for the down-layer LSTM; use the original label size for the CRF
        label_size = data.label_alphabet_size
        # data.label_alphabet_size += 2
        # self.word_hidden = WordSequence(data, False, True, data.use_char)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(data.HP_hidden_dim, label_size + 2)

        if self.use_crf:
            self.crf = CRF(label_size, self.gpu)

        if torch.cuda.is_available():
            self.hidden2tag = self.hidden2tag.cuda(self.gpu)

        self.frozen = False
def extractPhraseFromCRFWithColor(phrasedir, systemdir):
    crf_reader = CRF()
    aligner = AlignPhraseAnnotation()
    
    lectures = annotation.Lectures
    for i, lec in enumerate(lectures):
        path = phrasedir + str(lec)+ '/'
        fio.NewPath(path)
        
        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            extracted_phrases = []
            extracted_colors = []
            
            crf_file = os.path.join(systemdir, 'extraction', 'all_output', 'test_%i_%s.out'%(i, prompt))
            for tokens, tags, color0, color1 in crf_reader.read_file_generator_index(crf_file, [0, -1, -4, -3]):
                phrases, phrase_colors = aligner.get_phrase_with_colors(tokens, tags, [color0, color1])
                
                for phrase, phrase_color in zip(phrases, phrase_colors):
                    
                    extracted_phrases.append(phrase.lower())
                    extracted_colors.append(phrase_color)
            
            fio.SaveList(extracted_phrases, filename)
            
            filename = path + prompt + '.' + method + '.key.color'
            fio.SaveDict2Json(extracted_colors, filename)
Example #7
 def crfpp(self, msg):
     print 'crf++'
     crf = CRF()
     crf.create_file_input(msg)
     start_time = time.time()
     os.system('crf_test -m model crf.test.data > crf.result')
     total_time = time.time() - start_time  # elapsed time in seconds
Example #8
def image_output(mask, realw, realh, key, channel_bindings, output_dir,
                 no_segmentation_images):
    ######################## COLOR GT #################################
    decoded = decode_segmap(np.array(mask, dtype=np.uint8))
    decoded = Image.fromarray(np.uint8(decoded * 255), 'RGBA')
    basewidth = int(realw)
    hsize = int(realh)
    decoded = decoded.resize((basewidth, hsize), Image.ANTIALIAS)
    decoded.save(os.path.join(output_dir, "{0}_Color_output.png".format(key)))

    if not no_segmentation_images:
        ######################## Primary root GT ###########################
        decoded1 = CRF.decode_channel(mask, [
            channel_bindings['segmentation']['Primary'],
            channel_bindings['heatmap']['Seed']
        ])
        decoded1 = Image.fromarray(decoded1)
        decoded1 = decoded1.resize((basewidth, hsize), Image.NEAREST)
        decoded1 = decoded1.convert('L')
        decoded1.save(os.path.join(output_dir, "{0}_C1.png".format(key)))
        ######################## Lat root GT ###########################
        decoded2 = CRF.decode_channel(
            mask, channel_bindings['segmentation']['Lateral'])
        decoded2 = Image.fromarray(decoded2)
        decoded2 = decoded2.resize((basewidth, hsize), Image.NEAREST)
        decoded2 = decoded2.convert('L')
        decoded2.save(os.path.join(output_dir, "{0}_C2.png".format(key)))
Example #9
    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert_base/')

        if args.bert_freeze:
            for param in self.bert.parameters():
                param.requires_grad = False

        self.lstm = BiLSTM(
            input_size=args.bert_hidden_size + args.cnn_output_size,
            hidden_size=args.rnn_hidden_size + args.cnn_output_size,
            num_layers=args.rnn_num_layers,
            num_dirs=args.rnn_num_dirs)

        self.lstm_dropout = nn.Dropout(p=args.rnn_dropout)

        self.cnn = CharCNN(embedding_num=len(CHAR_VOCAB),
                           embedding_dim=args.cnn_embedding_dim,
                           filters=eval(args.cnn_filters),
                           output_size=args.cnn_output_size)

        self.crf = CRF(target_size=len(VOCAB) + 2, use_cuda=args.crf_use_cuda)

        self.linear = nn.Linear(in_features=args.rnn_hidden_size +
                                args.cnn_output_size,
                                out_features=len(VOCAB) + 2)

        self.attn = MultiHeadAttention(model_dim=args.rnn_hidden_size +
                                       args.cnn_output_size,
                                       num_heads=args.attn_num_heads,
                                       dropout=args.attn_dropout)

        self.feat_dropout = nn.Dropout(p=args.feat_dropout)
Example #10
 def __init__(self, config):
     super(BertNer, self).__init__(config)
     self.bert = BertModel(config)
     self.dropout = nn.Dropout(config.hidden_dropout_prob)
     self.classifier = nn.Linear(config.hidden_size, config.num_labels)
     self.crf = CRF(config.num_labels)
     self.init_weights()
Example #11
 def __init__(self, word2id, char2id, tag2id, pretrain_embedding, embed_dim,
              char_embed_dim, n_hidden):
     super(LSTMTagger, self).__init__()
     self.word2id = word2id  # word dictionary built from the pretrained embedding
     self.char2id = char2id
     self.tag2id = tag2id
     self.word_num = len(word2id)
     self.char_num = len(char2id)
     self.tag_num = len(tag2id)
     self.embed_dim = embed_dim
     self.embedding = torch.nn.Embedding.from_pretrained(
         torch.FloatTensor(pretrain_embedding),
         freeze=False)  # load the pretrained embedding matrix and keep it trainable
     #        self.pre_embedding = nn.Embedding(self.word_num,self.embedding_dim)
     self.clstm = CharLSTM(
         chrdim=self.char_num,
         embdim=embed_dim,
         char_embed=char_embed_dim,
     )
     self.wlstm = nn.LSTM(input_size=embed_dim + char_embed_dim,
                          hidden_size=n_hidden // 2,
                          num_layers=1,
                          batch_first=True,
                          bidirectional=True)
     self.out = nn.Linear(n_hidden, self.tag_num)
     self.crf = CRF(self.tag_num)
     self.drop = nn.Dropout()
Example #12
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(tag_stoi) + 2)
        self.g2b = nn.Linear(300, 768)
        self.gate = nn.Linear(768, 1)
        self.crf = CRF(tag_stoi)

    def forward(self, inputs, wids, attention_mask, labels):
        b = self.net.bert.embeddings(input_ids=inputs)
        a = self.gate(b).sigmoid()
        g = self.g2b(wvec[wids].cuda())
        x = (1 - a) * b + a * g

        logits = self.net(inputs_embeds=x, attention_mask=attention_mask)[0]
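        # labels of -100 mark positions to ignore (e.g. padding / non-first subword pieces);
        # the lines below keep only the retained positions and re-pack logits and labels into word-level tensors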
        first_mask = labels != -100
        mask = lens2mask(first_mask.sum(-1)).cuda()
        logits = torch.zeros(*mask.shape, logits.shape[-1]).cuda().masked_scatter(mask[:, :, None], logits[first_mask])
        labels = torch.zeros(*mask.shape).long().cuda().masked_scatter(mask, labels[first_mask])
        return logits, mask, labels

    def train_batch(self, inputs, wids, attention_mask, labels):
        logits, mask, labels = self.forward(inputs, wids, attention_mask, labels)
        loss = self.crf.neg_log_likelihood_loss(logits, mask, labels)
        return loss

    def test_batch(self, inputs, wids, attention_mask, labels):
        logits, mask, labels = self.forward(inputs, wids, attention_mask, labels)
        _, path = self.crf._viterbi_decode(logits, mask)
        pred = [[tag_itos[i] for i in p[m]] for p, m in zip(path, mask)]
        return pred
def five_two():
    '''implement your experiments for question 5.2 here'''

    file = open('Q5_2.txt', 'w')
    crf_test = CRF(L=CHARS, F=321)
    W_F = np.load('W_F_{}.npy'.format(7), 'r')
    W_T = np.load('W_T_{}.npy'.format(7), 'r')
    crf_test.set_params(W_F, W_T)

    Y_gen = []
    X_gen = []
    samples_per_length = 50

    for length in range(1, 21):
        Y_gen.append(np.random.choice(CHARS, [samples_per_length, length]))
        X_gen.append(
            np.random.randint(2, size=(samples_per_length, length, 321)))
        t0 = time.time()
        for x, y in zip(X_gen[length - 1], Y_gen[length - 1]):
            predictions = crf_test.predict(x)
        t1 = time.time()
        print('Average time to predict ', samples_per_length,
              'samples of length ', length, 'is',
              (t1 - t0) / samples_per_length)
        file.write(
            str(length) + ',' + str((t1 - t0) / samples_per_length) + '\n')
    file.close()

    pass
Example #14
 def __init__(self, n_classes, n_features, **kwargs):
     self.__dict__.update(locals())
     CRF.__init__(self, **kwargs)
     self.n_classes = int(self.n_classes)
     self.n_features = int(self.n_features)
     self.n_parameters = int(self.n_classes * self.n_features +
                             self.n_classes**2)
Example #15
   def __init__(self, hidden_size: int, output_size: int, num_layers: int=1, 
                bidirectional: bool=False, dropout_p: float=0.1,  
                device: str="cpu", weights: Optional=None, num_embeddings: Optional=None, 
                embedding_dim: Optional=None):
       super(NERTagger, self).__init__()
       if weights is not None:
           self.embedding = nn.Embedding.from_pretrained(weights, padding_idx=PAD_IDX)
       else:
           self.embedding = nn.Embedding(num_embeddings=num_embeddings, 
                                         embedding_dim=embedding_dim, 
                                         padding_idx=PAD_IDX)
 
       self.hidden_size = hidden_size
       self.output_size = output_size
       self.num_layers = num_layers
       self.dropout_p = dropout_p
       self.bidirectional = bidirectional
       self.device = device
       self.dropout = nn.Dropout(p=dropout_p)
       self.lstm = nn.LSTM(input_size=self.embedding.embedding_dim, 
                           hidden_size=hidden_size, 
                           bidirectional=bidirectional, 
                           num_layers=num_layers,
                           batch_first=True)
       if self.bidirectional:
           hidden_size = 2 * hidden_size
       self.crf = CRF(hidden_size, output_size, device=device)
Example #16
def run_crf(epoch, score_map, bag_label, bag_index, co_exp_net_isoform, co_exp_net_lncRNA, training_size, testing_size, theta, sigma = 10):
    bag_label = bag_label[0: training_size]
    bag_index = bag_index[0: training_size]
    positive_unary_energy = 1 - score_map

    crf_isoform = CRF(training_size, testing_size, positive_unary_energy, co_exp_net_isoform, theta, bag_label, bag_index)
    crf_lncRNA = CRF(training_size, testing_size, positive_unary_energy, co_exp_net_lncRNA, theta, bag_label, bag_index)
    
    label_update_i, pos_prob_crf_i, unary_potential_i, pairwise_potential_i = crf_isoform.inference(10)
    label_update_l, pos_prob_crf_l, unary_potential_l, pairwise_potential_l = crf_lncRNA.inference(10)
    
    label_update = label_update_i + label_update_l 
    pos_prob_crf = pos_prob_crf_i + pos_prob_crf_l
    unary_potential = unary_potential_i + unary_potential_l
    pairwise_potential = pairwise_potential_i + pairwise_potential_l
    
    if epoch == 0:
        theta_prime_isoform = crf_isoform.parameter_learning(bag_label[0:training_size], theta, sigma)
        theta_prime_lncRNA = crf_lncRNA.parameter_learning(bag_label[0:training_size], theta, sigma)
    else:
        theta_prime_isoform = crf_isoform.parameter_learning(label_update, theta, sigma)
        theta_prime_lncRNA = crf_lncRNA.parameter_learning(label_update, theta, sigma)
    
    theta_prime = theta_prime_isoform + theta_prime_lncRNA
    
    return label_update, theta_prime, pos_prob_crf, unary_potential, pairwise_potential
Example #17
File: tagger.py Project: ASAPP-H/clip2
    def __init__(
        self,
        nwords,
        nchars,
        ntags,
        pretrained_list,
        run_name,
        exp_name,
        list_of_possible_tags,
        use_char=True,
        use_crf=False,
        class_weights=[],
        learning_rate=0.015,
        learning_decay_rate=0.05,
        weight_decay=1e-8,
    ):
        super().__init__()

        self.run_name = run_name
        self.exp_name = exp_name
        self.class_weights = torch.Tensor(class_weights)
        # Create word embeddings
        pretrained_tensor = torch.FloatTensor(pretrained_list)
        self.word_embedding = torch.nn.Embedding.from_pretrained(
            pretrained_tensor, freeze=False)
        self.list_of_possible_tags = list_of_possible_tags
        # Create input dropout parameter
        # self.word_dropout = torch.nn.Dropout(1 - KEEP_PROB)
        char_lstm_hidden = 0
        self.use_char = use_char
        if self.use_char:
            # Character-level LSTMs
            self.char_embedding = torch.nn.Embedding(nchars,
                                                     CHAR_DIM_EMBEDDING)
            self.char_lstm = torch.nn.LSTM(
                CHAR_DIM_EMBEDDING,
                CHAR_LSTM_HIDDEN,
                num_layers=1,
                batch_first=True,
                bidirectional=True,
            )
            char_lstm_hidden = CHAR_LSTM_HIDDEN

        # Create LSTM parameters
        self.lstm = torch.nn.LSTM(
            DIM_EMBEDDING + char_lstm_hidden,
            LSTM_HIDDEN,
            num_layers=LSTM_LAYER,
            batch_first=True,
            bidirectional=True,
        )
        # Create output dropout parameter
        self.lstm_output_dropout = torch.nn.Dropout(1 - KEEP_PROB)

        # Create final matrix multiply parameters
        self.hidden_to_tag = torch.nn.Linear(LSTM_HIDDEN * 2, ntags)
        self.ntags = ntags
        self.use_crf = use_crf
        if self.use_crf:
            self.crf = CRF(target_size=ntags)
    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim,
                 batch_size, max_len):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.max_len = max_len
        self.crf = CRF(len(tag_to_ix), batch_first=True)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim // 2,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix["<START>"], :] = -10000
        self.transitions.data[:, tag_to_ix["<STOP>"]] = -10000

        self.hidden = self.init_hidden()
 def __init__(self,
              base_path,
              oov,
              num_labels,
              lstm_hidden_size=128,
              dropout=0.3,
              lm_flag=False):
     super(Bert_CRF, self).__init__()
     bert_config = BertConfig.from_json_file(
         os.path.join(base_path, 'config.json'))
     bert_config.num_labels = num_labels
     #hidden_states (tuple(torch.FloatTensor), optional, returned when config.output_hidden_states=True):
     bert_config.output_hidden_states = True
     bert_config.output_attentions = True
     self.bert = BertModel.from_pretrained(os.path.join(
         base_path, 'pytorch_model.bin'),
                                           config=bert_config)
     self.tokenizer = tokenizer
     self.oov = oov
     self._oov_embed()
     self.dropout = nn.Dropout(dropout)
      # lstm: input_size = bert_config.hidden_size; hidden_size (the second argument) must match the first argument of the Linear layer
      # try a bidirectional LSTM
     self.lm_flag = lm_flag
     self.lstm = nn.LSTM(bert_config.hidden_size,
                         lstm_hidden_size,
                         num_layers=1,
                         bidirectional=True,
                         dropout=0.3,
                         batch_first=True)
     self.clf = nn.Linear(256, bert_config.num_labels + 2)
     self.layer_norm = nn.LayerNorm(lstm_hidden_size * 2)
     self.crf = CRF(target_size=bert_config.num_labels,
                    average_batch=True,
                    use_cuda=True)
Example #20
def extractPhraseFromCRFWithColor(phrasedir, systemdir):
    crf_reader = CRF()
    aligner = AlignPhraseAnnotation()

    lectures = annotation.Lectures
    for i, lec in enumerate(lectures):
        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            extracted_phrases = []
            extracted_colors = []

            crf_file = os.path.join(systemdir, 'extraction', 'all_output',
                                    'test_%i_%s.out' % (i, prompt))
            for tokens, tags, color0, color1 in crf_reader.read_file_generator_index(
                    crf_file, [0, -1, -4, -3]):
                phrases, phrase_colors = aligner.get_phrase_with_colors(
                    tokens, tags, [color0, color1])

                for phrase, phrase_color in zip(phrases, phrase_colors):

                    extracted_phrases.append(phrase.lower())
                    extracted_colors.append(phrase_color)

            fio.SaveList(extracted_phrases, filename)

            filename = path + prompt + '.' + method + '.key.color'
            fio.SaveDict2Json(extracted_colors, filename)
Example #21
    def __init__(self, opt, tag2label):
        super(Bilstm_crf, self).__init__()

        self.embedding_length = opt.embedding_length
        self.hidden_size = opt.hidden_size
        self.output_size = len(tag2label)
        self.batch_size = opt.batch_size

        self.vocab_size = opt.vocab_size

        self.dropout = opt.dropout

        self.dropout_embed = nn.Dropout(opt.dropout)
        self.word_embeddings = nn.Embedding(self.vocab_size,
                                            self.embedding_length)
        self.word_embeddings.weight.data.copy_(torch.from_numpy(
            opt.embeddings))
        self.dropout_embed = nn.Dropout(opt.dropout)

        self.lstm = nn.LSTM(self.embedding_length,
                            self.hidden_size,
                            bidirectional=True,
                            dropout=opt.dropout)

        if self.lstm.bidirectional:
            self.label = nn.Linear(self.hidden_size * 2, self.output_size)
        else:
            self.label = nn.Linear(self.hidden_size, self.output_size)
        self.crf = CRF(self.output_size)
Example #22
def do_infer(args):
    config = ConfigParser()
    config.read_file(args.config)
    model = CRF(config)

    reader = csv.reader(args.input, delimiter='\t')
    header = next(reader)
    assert all(w in header for w in ["id", "words", "lemmas", "pos_tags", "doc_char_begin", "doc_char_end", "gloss"]), "Input doesn't have required annotations."
    Sentence = namedtuple('Sentence', header)

    def parse_input(row):
        sentence = Sentence(*row)
        words, lemmas, pos_tags = [parse_psql_array(arr) for arr in (sentence.words, sentence.lemmas, sentence.pos_tags)]
        return sentence._replace(words=words, lemmas=lemmas, pos_tags=pos_tags)

    writer = csv.writer(args.output, delimiter='\t')
    writer.writerow([
        'id',
        'speaker_token_begin', 'speaker_token_end',
        'cue_token_begin', 'cue_token_end',
        'content_token_begin', 'content_token_end', 'content_tokens',
        'speaker', 'cue', 'content'])

    for sentences in tqdm(grouper(map(parse_input, reader), args.batch_size)):
        conll = [zip(s.words, s.lemmas, s.pos_tags) for s in sentences]
        for sentence, tags in zip(sentences, model.infer(conll)):
            if "SPKR" not in tags or "CTNT" not in tags: continue
            writer.writerow([sentence.id,] + extract_quote_entries(sentence, tags))
Example #23
class BiLSTM_CRF(nn.Module):
    def __init__(self, data):
        super(BiLSTM_CRF, self).__init__()
        print "build batched lstmcrf..."
        self.gpu = data.HP_gpu
        label_size = data.label_alphabet_size
        data.label_alphabet_size += 2
        self.lstm = BiLSTM(data)
        self.crf = CRF(label_size, self.gpu)


    def neg_log_likelihood_loss(self, gaz_list, word_inputs, biword_inputs, word_seq_lengths,  char_inputs, char_seq_lengths, char_seq_recover, batch_label, mask):
        outs = self.lstm.get_output_score(gaz_list, word_inputs, biword_inputs, word_seq_lengths,  char_inputs, char_seq_lengths, char_seq_recover)
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        total_loss = self.crf.neg_log_likelihood_loss(outs, mask, batch_label)
        scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        return total_loss, tag_seq


    def forward(self, gaz_list, word_inputs, biword_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, mask):
        outs = self.lstm.get_output_score(gaz_list, word_inputs, biword_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover)
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        return tag_seq


    def get_lstm_features(self, gaz_list, word_inputs, biword_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover):
        return self.lstm.get_lstm_features(gaz_list, word_inputs, biword_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover)
        
Example #24
 def __init__(self,
              words_num,
              embed_dim,
              hidden_dim,
              num_layers,
              out_class,
              word2idx,
              dropout=0.2,
              bi_direction=True):
     super(LSTMCRF, self).__init__()
     self.word2idx = word2idx
     self.bi_direction = bi_direction
     self.hidden_dim = hidden_dim
     self.embed_layer = nn.Embedding(words_num, embed_dim)
     if bi_direction:
         self.rnn = nn.LSTM(embed_dim,
                            hidden_dim // 2,
                            num_layers=num_layers,
                            bidirectional=True)
     else:
         self.rnn = nn.LSTM(embed_dim,
                            hidden_dim,
                            num_layers=num_layers,
                            bidirectional=False)
     self.fc = nn.Linear(hidden_dim, out_class)
     self.crf = CRF(out_class)
Example #25
File: seqmodel.py Project: NLP1502/NLP
    def __init__(self, data, circul_time, deepth):
        super(SeqModel_circulationBiLSTM, self).__init__()
        self.use_crf = data.use_crf
        self.use_trans = data.use_trans
        self.use_mapping = data.use_mapping
        print "build network..."
        print "use_char: ", data.use_char
        if data.use_char:
            print "char feature extractor: ", data.char_seq_feature

        print "use_trans: ", data.use_trans
        print "word feature extractor: ", data.word_feature_extractor
        print "use crf: ", self.use_crf

        self.gpu = data.gpu
        self.average_batch = data.average_batch_loss
        # add two more labels for the down-layer LSTM; use the original label size for the CRF
        label_size = data.label_alphabet_size
        data.label_alphabet_size += 2

        self.word_hidden = WordSequence_circulationBiLSTM(
            data, circul_time, deepth)

        if self.use_crf:
            self.crf = CRF(label_size, self.gpu)
Example #26
File: model.py Project: ThisIsSoMe/KBQA
    def __init__(self, dicts, config):
        super(EntityDetection, self).__init__()
        self.config = config
        self.embed = Embeddings(word_vec_size=config.d_embed, dicts=dicts)
        if self.config.rnn_type.lower() == 'gru':
            self.rnn = nn.GRU(input_size=config.d_embed, hidden_size=config.d_hidden,
                              num_layers=config.n_layers, dropout=config.dropout_prob,
                              bidirectional=config.birnn)
        else:
            self.rnn = nn.LSTM(input_size=config.d_embed, hidden_size=config.d_hidden,
                               num_layers=config.n_layers, dropout=config.dropout_prob,
                               bidirectional=config.birnn)

        self.dropout = nn.Dropout(p=config.dropout_prob)
        self.relu = nn.ReLU()
        seq_in_size = config.d_hidden
        if self.config.birnn:
            seq_in_size *= 2

        self.hidden2tag = nn.Sequential(
                        nn.Linear(seq_in_size, seq_in_size),
                        nn.BatchNorm1d(seq_in_size),
                        self.relu,
                        self.dropout,
                        nn.Linear(seq_in_size, config.n_out)
        )
        self.crf=CRF(config.n_out)
Example #27
class deepBiLSTM_CRF(nn.Module):
    def __init__(self, word_HPs, char_HPs, num_labels=None, drop_final=0.5):
        super(deepBiLSTM_CRF, self).__init__()
        [word_size, word_dim, word_pre_embs, word_hidden_dim, word_dropout, word_layers, word_bidirect] = word_HPs
        if char_HPs:
            [char_size, char_dim, char_pred_embs, char_hidden_dim, char_dropout, char_layers, char_bidirect] = char_HPs
       
        self.lstm = Deep_bisltm(word_HPs, char_HPs, num_labels, att=True)
        # add two more labels for CRF
        self.crf = CRF(num_labels+2, use_cuda)
        ## add two more labels to learn hidden features for start and end transition 
        self.hidden2tag = nn.Linear(2*word_hidden_dim, num_labels+2)
        self.dropfinal = nn.Dropout(drop_final)
        if use_cuda:
            self.hidden2tag = self.hidden2tag.cuda()
            self.dropfinal = self.dropfinal.cuda()


    def NLL_loss(self, label_score, mask_tensor, label_tensor):
        batch_loss = self.crf.neg_log_likelihood_loss(label_score, mask_tensor, label_tensor)
        return batch_loss

    def inference(self, label_score, mask_tensor):
        label_prob, label_pred = self.crf._viterbi_decode(label_score, mask_tensor)
        return label_prob, label_pred

    def forward(self, word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover):
        # (batch_size,sequence_len,hidden_dim)
        rnn_out = self.lstm.get_all_atthiddens(word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover)
        # (batch_size,sequence_len,num_labels+2)
        label_score = self.hidden2tag(rnn_out)
        label_score = self.dropfinal(label_score)
        return label_score
Example #28
    def __init__(self, char_init_embed, word_init_embed, pos_init_embed, spo_embed_dim, sentence_length, 
        hidden_size, num_classes, dropout=0.3, id2words=None, encoding_type='bieso', weight=None):
        
        super().__init__()
        
        # self.Embedding = nn.Embedding(init_embed)
#         print(char_init_embed)
        self.char_embed = nn.Embedding(char_init_embed[0], char_init_embed[1])
        self.word_embed = nn.Embedding(word_init_embed[0], word_init_embed[1])
        # word2vec
        self.word_embed.weight.data.copy_(torch.from_numpy(weight))
        self.pos_embed = nn.Embedding(pos_init_embed[0], pos_init_embed[1])
        # spo embed size: 50
        self.embed_dim = self.char_embed.embedding_dim + self.word_embed.embedding_dim + self.pos_embed.embedding_dim + spo_embed_dim
        # sentence length
        #self.sen_len = sentence_length
        #self.zeros = torch.zeros(self.sen_len, dtype=torch.long)

        self.norm1 = torch.nn.LayerNorm(self.embed_dim)
        self.Rnn = nn.LSTM(input_size=self.embed_dim, hidden_size=hidden_size, num_layers=2,
                            dropout=dropout, bidirectional=True, batch_first=True)
        self.Linear1 = nn.Linear(hidden_size * 2, hidden_size * 2 // 3)
        self.norm2 = torch.nn.LayerNorm(hidden_size * 2 // 3)
        self.relu = torch.nn.LeakyReLU()
        self.drop = torch.nn.Dropout(dropout)
        self.Linear2 = nn.Linear(hidden_size * 2 // 3, num_classes)
        
        if id2words is None:
            self.Crf = CRF(num_classes, include_start_end_trans=False)
        else:
            self.Crf = CRF(num_classes, include_start_end_trans=False,
                            allowed_transitions=allowed_transitions(id2words, encoding_type=encoding_type))
Example #29
    def __init__(self,
                 vocab_tag,
                 char_embed_size,
                 num_hidden_layer,
                 channel_size,
                 kernel_size,
                 dropout_rate=0.2):
        super(CharWordSeg, self).__init__()
        self.vocab_tag = vocab_tag
        self.char_embed_size = char_embed_size
        self.num_hidden_layer = num_hidden_layer
        self.channel_size = channel_size
        self.kernel_size = kernel_size
        self.dropout_rate = dropout_rate

        num_tags = len(self.vocab_tag['tag_to_index'])
        vocab_size = len(self.vocab_tag['token_to_index'])
        self.char_embedding = nn.Embedding(vocab_size, char_embed_size)
        self.dropout_embed = nn.Dropout(dropout_rate)
        self.glu_layers = nn.ModuleList([
            ConvGLUBlock(in_channels=char_embed_size,
                         out_channels=channel_size,
                         kernel_size=kernel_size,
                         drop_out=0.2,
                         padding=1)
        ] + [
            ConvGLUBlock(in_channels=channel_size,
                         out_channels=channel_size,
                         kernel_size=kernel_size,
                         drop_out=0.2,
                         padding=1) for _ in range(num_hidden_layer - 1)
        ])
        self.hidden_to_tag = nn.Linear(char_embed_size, num_tags)
        self.crf_layer = CRF(num_tags, batch_first=True)
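
The constructor call CRF(num_tags, batch_first=True) in the example above matches the interface of the pytorch-crf package (torchcrf). Purely as an illustration of how such a layer is typically driven, and not code from the project above, a toy usage sketch might look like this:

# Usage sketch assuming the pytorch-crf package; the tensors are toy placeholders.
import torch
from torchcrf import CRF

num_tags, batch_size, seq_len = 5, 2, 7
crf = CRF(num_tags, batch_first=True)

emissions = torch.randn(batch_size, seq_len, num_tags)     # per-token tag scores from an encoder
tags = torch.randint(num_tags, (batch_size, seq_len))      # gold tag ids
mask = torch.ones(batch_size, seq_len, dtype=torch.bool)   # non-padding positions

loss = -crf(emissions, tags, mask=mask)        # forward() returns the log-likelihood
best_paths = crf.decode(emissions, mask=mask)  # Viterbi decoding, list of tag-id lists
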
Example #30
    def __init__(self, label_size, input_dim):
        super(CRFDecoder, self).__init__()
        self.input_dim = input_dim
        self.linear = nn.Linear(in_features=input_dim, out_features=label_size)
        self.crf = CRF(label_size + 2)
        self.label_size = label_size

        self.init_weights()
Example #31
    def __init__(self, data, opt):
        super(SeqModel, self).__init__()

        self.gpu = opt.gpu

        ## add two more labels for the down-layer LSTM; use the original label size for the CRF
        self.word_hidden = WordSequence(data, opt)
        self.crf = CRF(data.label_alphabet.size(), self.gpu)
Example #32
 def __init__(self, data):
     super(BiLSTM_CRF, self).__init__()
     print "build batched lstmcrf..."
     self.gpu = data.HP_gpu
     label_size = data.label_alphabet_size
     data.label_alphabet_size += 2
     self.lstm = BiLSTM(data)
     self.crf = CRF(label_size, self.gpu)
Example #33
 def __init__(self, utterance_encoder: PreTrainedModel,
              conversation_encoder: nn.RNNBase, n_classes: int) -> None:
     super(UtteranceClassificationHRNN, self).__init__()
     self.hrnn = HierarchicalRNN(utterance_encoder, conversation_encoder)
     self.output_layer = nn.Linear(
         in_features=conversation_encoder.hidden_size,
         out_features=n_classes)
     self.crf = CRF(n_classes)
Example #34
File: model.py Project: houking-can/NER
    def __init__(self,
                 vocab_size,
                 word_embed_dim,
                 word_hidden_dim,
                 alphabet_size,
                 char_embedding_dim,
                 char_hidden_dim,
                 feature_extractor,
                 tag_num,
                 dropout,
                 pretrain_embed=None,
                 use_char=False,
                 use_crf=False,
                 use_gpu=False):
        super(NamedEntityRecog, self).__init__()
        self.use_crf = use_crf
        self.use_char = use_char
        self.drop = nn.Dropout(dropout)
        self.input_dim = word_embed_dim
        self.feature_extractor = feature_extractor

        self.embeds = nn.Embedding(vocab_size, word_embed_dim, padding_idx=0)
        if pretrain_embed is not None:
            self.embeds.weight.data.copy_(torch.from_numpy(pretrain_embed))
        else:
            self.embeds.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(vocab_size, word_embed_dim)))

        if self.use_char:
            self.input_dim += char_hidden_dim
            self.char_feature = CharCNN(alphabet_size, char_embedding_dim,
                                        char_hidden_dim, dropout)

        if feature_extractor == 'lstm':
            self.lstm = nn.LSTM(self.input_dim,
                                word_hidden_dim,
                                batch_first=True,
                                bidirectional=True)
        else:
            self.word2cnn = nn.Linear(self.input_dim, word_hidden_dim * 2)
            self.cnn_list = list()
            for _ in range(4):
                self.cnn_list.append(
                    nn.Conv1d(word_hidden_dim * 2,
                              word_hidden_dim * 2,
                              kernel_size=3,
                              padding=1))
                self.cnn_list.append(nn.ReLU())
                self.cnn_list.append(nn.Dropout(dropout))
                self.cnn_list.append(nn.BatchNorm1d(word_hidden_dim * 2))
            self.cnn = nn.Sequential(*self.cnn_list)

        if self.use_crf:
            self.hidden2tag = nn.Linear(word_hidden_dim * 2, tag_num + 2)
            self.crf = CRF(tag_num, use_gpu)
        else:
            self.hidden2tag = nn.Linear(word_hidden_dim * 2, tag_num)
Example #35
def do_train(args):
    # Load configuration
    config = ConfigParser()
    config.read_file(args.config)

    data = DataStore(config)

    # Create the CRF model.
    model = CRF(config)

    retrain_epochs = config["training"].getint("retrain_every")

    accuracy = []

    with EditShell(config) as shell:
        while data.has_next():
            conll = data.next()
            i = data.i()

            # if the data doesn't have tags, try to smart-tag them.
            if len(conll[0]) == DataStore.TAG_LABEL+1:
                tags = [tok[DataStore.TAG_LABEL] for tok in conll]
            else:
                tags = model.infer(conll)

            try:
                #conll_display = ["{}/{}".format(token[0], token[2]) for token in conll]
                conll_display = ["{}".format(token[0]) for token in conll]

                # Create a copy of the list
                action = shell.run(conll_display, list(tags), metadata=render_progress(data, accuracy))

                if action.type == ":prev":
                    try:
                        data.rewind(2) # move 2 indices back
                    except AttributeError:
                        data.rewind(1)
                elif action.type == ":goto":
                    doc_idx, = action.args
                    assert doc_idx >= 0
                    data.goto(doc_idx)
                elif action.type == "save":
                    _, tags_ = action.args
                    accuracy.append(score(tags, tags_))

                    data.update(conll, tags_)

                    if i % retrain_epochs == 0:
                        model.retrain()

            except QuitException:
                break
Example #36
File: pg.py Project: 52nlp/iir
def main():
    parser = OptionParser()
    parser.add_option("-d", dest="training_dir", help="training data directory")
    parser.add_option("-t", dest="test_dir", help="test data directory")
    parser.add_option("-f", dest="test_file", help="test data file")
    parser.add_option("-m", dest="model", help="model file")
    parser.add_option("-l", dest="regularity", type="int", help="regularity. 0=none, 1=L1, 2=L2 [2]", default=2)
    (options, args) = parser.parse_args()
    if not options.training_dir and not options.model:
        parser.error("need training data directory(-d) or model file(-m)")

    features = pg_features(["H", "B", "F"])
    crf = CRF(features, options.regularity)
    print "features:", features.size()
    print "labels:", len(features.labels)

    if options.training_dir:
        texts, labels = load_dir(options.training_dir)
        fvs = [FeatureVector(features, x, y) for x, y in zip(texts, labels)]

        # initial parameter (pick up max in 10 random parameters)
        theta = sorted([crf.random_param() for i in range(10)], key=lambda t:crf.likelihood(fvs, t))[-1]

        # inference
        print "log likelihood (before inference):", crf.likelihood(fvs, theta)
        theta = crf.inference(fvs, theta)
        if options.model:
            f = open(options.model, 'w')
            f.write(pickle.dumps(theta))
            f.close()
    else:
        f = open(options.model, 'r')
        theta = pickle.loads(f.read())
        f.close()
        if features.size() != len(theta):
            raise ValueError, "model's length not equal feature's length."

    if options.test_dir:
        test_files = glob.glob(options.test_dir + '/*')
    elif options.test_file:
        test_files = [options.test_file]
    else:
        test_files = []

    i = 0
    for filename in test_files:
        print "========== test = ", i
        text, label = load_file(filename)
        pg_tagging(FeatureVector(features, text), text, label, crf, features, theta)
        i += 1
def test_cross_course(train, name='all'):
    wapiti_home = global_params.wapiti_dir
    
    pattern_file = '../data/%s.pattern.txt'%name
    model_dir = '../data/%s/%s/model/%s/'%(course, system, name)
    fio.NewPath(model_dir)
    
    feature_dir = '../data/%s/%s/extraction/'%(course, system)
    feature_cv_dir = '../data/%s/%s/extraction/%s/'%(course, system, name)
    fio.NewPath(feature_cv_dir)
    
    outputdir = '../data/%s/%s/extraction/%s_output/'%(course, system, name)
    fio.NewPath(outputdir)
    
    lectures = annotation.Lectures
    
    dict = defaultdict(int)
    
    for i, lec in enumerate(lectures):
        test = [lec]
        model_file = os.path.join(model_dir, '%s.model'%train)
        
        print model_file
        
        crf = CRF(wapiti_home)
        if not fio.IsExist(model_file):
            print "Model is not available"
            
        for q in ['q1', 'q2']:
            
            test_filename = os.path.join(feature_cv_dir, 'test_%d_%s.feature.crf'%(i, q))
            output_file = os.path.join(outputdir, 'test_%d_%s.out'%(i, q))
            
            dict['test_%d_%s'%(i, q)] = 1
            
            if method == 'combine':
                test_filename_old = test_filename.replace('_combine', '_A1')
                cmd = 'cp %s %s'%(test_filename_old, test_filename)
                os.system(cmd)
            else:
                combine_files(feature_dir, test, test_filename, prompts=[q])
        
            crf.predict(test_filename, model_file, output_file)
        
        if debug: break
    
    file_util.save_dict2json(dict, class_index_dict_file)
def extractPhraseFromCRF(phrasedir, systemdir):
    crf_reader = CRF()
    aligner = AlignPhraseAnnotation()
    
    lectures = annotation.Lectures
    for i, lec in enumerate(lectures):
        path = phrasedir + str(lec)+ '/'
        fio.NewPath(path)
        
        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            phrases = []
            
            crf_file = os.path.join(systemdir, 'extraction', 'all_output', 'test_%i_%s.out'%(i, prompt))
            for tokens, tags in crf_reader.read_file_generator(crf_file):
                for phrase in aligner.get_phrase(tokens, tags):
                    phrases.append(phrase.lower())
                    
            fio.SaveList(phrases, filename)
Example #39
    def crfpp(self, msg):
        crf = CRF()
        fileUtil = FileUtil()
        crf.create_file_input(msg)
        os.system('crf_test -m ../model1 crf.test.data > crf.result')

        lst = fileUtil.read_file('crf.result')
#         lst = [a for a in lst if a != u'\n']
#         str_ans = reduce(lambda x,y:x+y, [a.split('\t')[0] for a in lst])
         
        # ans = reduce(lambda x,y:x+y, [a.split('\t')[3][:-1] for a in lst])
#         lst_col3 = [a.split('\t')[3][:-1] for a in lst]
        lst_col3, str_ans = self.process_ans(lst)
        lst_ans = [n for (n, e) in enumerate(lst_col3) if e == 'B']
        result_lst = []
        for i in range(len(lst_ans)-1):
            a = lst_ans[i]
            b = lst_ans[i+1]
            result_lst.append(str_ans[a:b])
        result_lst.append(str_ans[b:len(str_ans)])
        return result_lst    
Example #40
    def crfpp(self, msg, model):
        crf = CRF()
        fileUtil = FileUtil()
        crf.create_file_input(msg)
        start_time = time.time()
        os.system('crf_test -m '+model+' crf.test.data > logs/out/crf.result')
        total_time = time.time() - start_time  # elapsed time in seconds

        lst = fileUtil.read_file('logs/out/crf.result')
#         lst = [a for a in lst if a != u'\n']
#         str_ans = reduce(lambda x,y:x+y, [a.split('\t')[0] for a in lst])
         
        # ans = reduce(lambda x,y:x+y, [a.split('\t')[3][:-1] for a in lst])
#         lst_col3 = [a.split('\t')[3][:-1] for a in lst]
        lst_col3, str_ans = self.process_ans(lst)
        lst_ans = [n for (n, e) in enumerate(lst_col3) if e == 'B']
        result_lst = []
        for i in range(len(lst_ans)-1):
            a = lst_ans[i]
            b = lst_ans[i+1]
            result_lst.append(str_ans[a:b])
        result_lst.append(str_ans[b:len(str_ans)])
        return total_time, result_lst
Example #41
File: train.py Project: Ambier/python-crf
from crf import CRF
from features import *
import re, sys
import pickle
training_file = sys.argv[1]

if __name__ == '__main__':
	labels,obsrvs,word_sets,word_data,label_data = fit_dataset(training_file)
	crf = CRF(
			labels=list(labels),
			feature_functions = Membership.functions(labels,*word_sets.values()) +
								MatchRegex.functions(labels,
									'^[^0-9a-zA-Z\-]+$',
									'^[^0-9\-]+$',
									'^[A-Z]+$',
									'^-?[1-9][0-9]*\.[0-9]+$',
									'^[1-9][0-9\.]+[a-z]+$',
									'^[0-9]+$',
									'^[A-Z][a-z]+$',
									'^([A-Z][a-z]*)+$',
									'^[^aeiouAEIOU]+$'
								))# + [
								#	lambda yp,y,x_v,i,_y=_y,_x=_x:
								#		1 if i < len(x_v) and y==_y and x_v[i].lower() ==_x else 0
								#	for _y in labels
								#	for _x in obsrvs
								#])
	crf.train(word_data[:-5],label_data[:-5])
	pickle.dump(crf,open(sys.argv[2],'wb'))
	for i in range(-5,0):
		print word_data[i]
Example #42
File: testcrf.py Project: 52nlp/iir
def main():
    def load_data(data):
        texts = []
        labels = []
        text = []
        data = "\n" + data + "\n"
        for line in data.split("\n"):
            line = line.strip()
            if len(line) == 0:
                if len(text)>0:
                    texts.append(text)
                    labels.append(label)
                text = []
                label = []
            else:
                token, info, chunk = line.split()
                text.append((token, info))
                label.append(chunk)
        return (texts, labels)

    texts, labels = load_data("""
    This DT B-NP
    temblor-prone JJ I-NP
    city NN I-NP
    dispatched VBD B-VP
    inspectors NNS B-NP
    , , O

    firefighters NNS B-NP
    and CC O
    other JJ B-NP
    earthquake-trained JJ I-NP
    personnel NNS I-NP
    to TO B-VP
    aid VB I-VP
    San NNP B-NP
    Francisco NNP I-NP
    . . O
    """)

    print texts, labels

    test_texts, test_labels = load_data("""
    Rockwell NNP B-NP
    said VBD B-VP
    the DT B-NP
    agreement NN I-NP
    calls VBZ B-VP
    for IN B-SBAR
    it PRP B-NP
    to TO B-VP
    supply VB I-VP
    200 CD B-NP
    additional JJ I-NP
    so-called JJ I-NP
    shipsets NNS I-NP
    for IN B-PP
    the DT B-NP
    planes NNS I-NP
    . . O
    """)

    features = Features(labels)
    tokens = dict([(i[0],1) for x in texts for i in x]).keys()
    infos = dict([(i[1],1) for x in texts for i in x]).keys()

    for label in features.labels:
        for token in tokens:
            features.add_feature( lambda x, y, l=label, t=token: 1 if y==l and x[0]==t else 0 )
        for info in infos:
            features.add_feature( lambda x, y, l=label, i=info: 1 if y==l and x[1]==i else 0 )
    features.add_feature_edge( lambda y_, y: 0 )

    fvs = [FeatureVector(features, x, y) for x, y in zip(texts, labels)]
    fv = fvs[0]
    text_fv = FeatureVector(features, test_texts[0]) # text sequence without labels


    crf = CRF(features, 0)
    theta0 = crf.random_param()
    print "initial log likelihood:", crf.likelihood(fvs, theta0)


    print ">> Steepest Descent"
    theta = theta0.copy()
    eta = 0.5
    t = time.time()
    for i in range(20):
        theta += eta * crf.gradient_likelihood(fvs, theta)
        print i, "log likelihood:", crf.likelihood(fvs, theta)
        eta *= 0.95
    print "time = %.3f, relevant features = %d / %d" % (time.time() - t, (numpy.abs(theta) > 0.00001).sum(), theta.size)

    print ">> SGD"
    theta = theta0.copy()
    eta = 0.5
    t = time.time()
    for i in range(20):
        for fv in fvs:
            theta += eta * crf.gradient_likelihood([fv], theta)
        print i, "log likelihood:", crf.likelihood(fvs, theta)
        eta *= 0.95
    print "time = %.3f, relevant features = %d / %d" % (time.time() - t, (numpy.abs(theta) > 0.00001).sum(), theta.size)

    print ">> SGD + FOBOS L1"
    theta = theta0.copy()
    eta = 0.5
    lmd = 0.01
    t = time.time()
    for i in range(20):
        lmd_eta = lmd * eta
        for fv in fvs:
            theta += eta * crf.gradient_likelihood([fv], theta)
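            # FOBOS L1 proximal step: soft-threshold each weight toward zero by lmd * eta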
            theta = (theta > lmd_eta) * (theta - lmd_eta) + (theta < -lmd_eta) * (theta + lmd_eta)
        print i, "log likelihood:", crf.likelihood(fvs, theta)
        eta *= 0.95
    print "time = %.3f, relevant features = %d / %d" % (time.time() - t, (numpy.abs(theta) > 0.00001).sum(), theta.size)

    print ">> Steepest Descent + FOBOS L1"
    theta = theta0.copy()
    eta = 0.2
    lmd = 0.5
    t = time.time()
    for i in range(20):
        theta += eta * crf.gradient_likelihood(fvs, theta)
        lmd_eta = lmd * eta
        theta = (theta > lmd_eta) * (theta - lmd_eta) + (theta < -lmd_eta) * (theta + lmd_eta)
        print i, "log likelihood:", crf.likelihood(fvs, theta)
        eta *= 0.9
    print "time = %.3f, relevant features = %d / %d" % (time.time() - t, (numpy.abs(theta) > 0.00001).sum(), theta.size)
    #print theta

    print ">> BFGS"
    t = time.time()
    theta = crf.inference(fvs, theta0)
    print "log likelihood:", crf.likelihood(fvs, theta)
    print "time = %.3f, relevant features = %d / %d" % (time.time() - t, (numpy.abs(theta) > 0.00001).sum(), theta.size)
Example #43
File: webextract.py Project: 52nlp/iir
def main():
    parser = OptionParser()
    parser.add_option("-d", dest="training_dir", help="training data directory")
    parser.add_option("-t", dest="test_dir", help="test data directory")
    parser.add_option("-f", dest="test_file", help="test data file")
    parser.add_option("-m", dest="model", help="model file")
    parser.add_option("-b", dest="body", action="store_true", help="output body")
    parser.add_option("-l", dest="regularity", type="int", help="regularity. 0=none, 1=L1, 2=L2 [2]", default=2)
    parser.add_option("--l1", dest="fobos_l1", action="store_true", help="FOBOS L1", default=False)
    (options, args) = parser.parse_args()
    if not options.training_dir and not options.model:
        parser.error("need training data directory(-d) or model file(-m)")

    theta = LABELS = None
    if options.model and os.path.isfile(options.model):
        with open(options.model, 'r') as f:
            LABELS, theta = pickle.loads(f.read())
    if options.training_dir:
        texts, labels = load_dir(options.training_dir)
        if LABELS == None:
            LABELS = unique(flatten(labels))

    features = wce_features(LABELS)
    crf = CRF(features, options.regularity)

    if options.training_dir:
        fvs = [FeatureVector(features, x, y) for x, y in zip(texts, labels)]

        # initial parameter (pick up max in 10 random parameters)
        if theta == None:
            theta = sorted([crf.random_param() for i in range(10)], key=lambda t:crf.likelihood(fvs, t))[-1]

        # inference
        print "features:", features.size()
        print "labels:", len(features.labels), features.labels
        print "log likelihood (before inference):", crf.likelihood(fvs, theta)
        if options.fobos_l1:
            eta = 0.000001
            for i in range(0):
                for fv in fvs:
                    theta += eta * crf.gradient_likelihood([fv], theta)
                    print i, "log likelihood:", crf.likelihood(fvs, theta)
                eta *= 0.98
            lmd = 1
            while lmd < 200:
                for i in range(50):
                    theta += eta * crf.gradient_likelihood(fvs, theta)
                    lmd_eta = lmd * eta
                    theta = (theta > lmd_eta) * (theta - lmd_eta) + (theta < -lmd_eta) * (theta + lmd_eta)
                    if i % 10 == 5: print i, "log likelihood:", crf.likelihood(fvs, theta)
                    #eta *= 0.95
                import numpy
                print "%d : relevant features = %d / %d" % (lmd, (numpy.abs(theta) > 0.00001).sum(), theta.size)
                with open(options.model + str(lmd), 'w') as f:
                    f.write(pickle.dumps((LABELS, theta)))
                lmd += 1
        else:
            theta = crf.inference(fvs, theta)
        print "log likelihood (after inference):", crf.likelihood(fvs, theta)
        if options.model:
            with open(options.model, 'w') as f:
                f.write(pickle.dumps((LABELS, theta)))
    elif features.size() != len(theta):
        raise ValueError, "model's length not equal feature's length."

    if options.test_dir:
        test_files = glob.glob(options.test_dir + '/*.htm*')
    elif options.test_file:
        test_files = [options.test_file]
    else:
        test_files = []

    for x in sorted(theta):
        print x,
    print

    corrects = blocks = 0
    for i, filename in enumerate(test_files):
        if not options.body: print "========== test = ", i
        text, label = load_file(filename)
        fv = FeatureVector(features, text)
        prob, ys = crf.tagging(fv, theta)
        tagged_label = features.id2label(ys)

        cor, blo = len(filter(lambda x:x[0]==x[1], zip(label, tagged_label))), len(label)
        corrects += cor
        blocks += blo
        print "log_likely = %.3f, rate = %d / %d" % (prob, cor, blo)

        if options.body:
            for x, l in zip(text, tagged_label):
                if l == "body": print re.sub(r'\s+', ' ', re.sub(r'(?s)<[^>]+>', '', x.org_text)).strip()
        else:
            #wce_output_tagging(text, label, prob, tagged_label)
            map = CountDict()
            for x in zip(label, tagged_label):
                map[x] += 1
            for x in sorted(map):
                print x[0], " => ", x[1], " : ", map[x]
    if blocks > 0:
        print "total : %d / %d = %.3f%%" % (corrects, blocks, 100.0 * corrects / blocks)
Example #44
 def __init__(self, label_alphabet, feature_alphabet):
     self.label_alphabet = label_alphabet
     self.feature_alphabet = feature_alphabet
     CRF.__init__(self, len(self.label_alphabet), len(self.feature_alphabet))
Example #45
 def __call__(self, x):
     return self.label_alphabet.lookup_many(CRF.__call__(self, x))