Example #1
 def __init__(self, hidden_size, word_emb_size):
     super(NZSigmoidLoss, self).__init__()
     require_type_lst = utils.get_ontoNotes_train_types()
     self.weight = nn.Parameter(
         torch.zeros(len(require_type_lst),
                     hidden_size * 2 + word_emb_size))
     utils.init_weight(self.weight)
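The weight matrix above holds one scoring row per OntoNotes type, with hidden_size * 2 + word_emb_size columns (presumably a bidirectional context vector concatenated with a word embedding). A minimal, self-contained shape sketch; the sizes and the type count below are made-up illustration values, not taken from the source:

# Hypothetical sizes chosen only to illustrate the parameter shape.
import torch
import torch.nn as nn

num_types, hidden_size, word_emb_size = 89, 100, 300
weight = nn.Parameter(torch.zeros(num_types, hidden_size * 2 + word_emb_size))
print(weight.shape)  # torch.Size([89, 500]) -- one row of type scores per label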
Example #2
def get_type_lst(depth, data):
    type_lst = None
    if data == 'onto':
        type_lst = utils.get_ontoNotes_train_types(depth)
    elif data == 'wiki':
        type_lst = utils.get_wiki_types(depth)
    elif data == 'bbn':
        type_lst = utils.get_bbn_types(depth)
    return type_lst
Example #3
def get_down_type_lst(depth, data):
    type_lst = []
    if data == 'onto':
        for i in range(1, depth+1):
            type_lst.extend(utils.get_ontoNotes_train_types(i))
    elif data == 'wiki':
        for i in range(1, depth + 1):
            type_lst.extend(utils.get_wiki_types(i))
    elif data == 'bbn':
        for i in range(1, depth + 1):
            type_lst.extend(utils.get_bbn_types(i))
    return type_lst
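get_type_lst returns only the types at a single depth, while get_down_type_lst accumulates every depth from 1 up to the requested one. A hedged usage sketch; the example type strings are assumptions about what the utils getters return, not values from the source:

# Hypothetical outputs (assumed shapes of the returned type paths):
# get_type_lst(2, 'onto')       -> depth-2 types only, e.g. ['/person/artist', ...]
# get_down_type_lst(2, 'onto')  -> depths 1 and 2 combined, e.g. ['/person', '/person/artist', ...]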
Example #4
def refine(labels, tier, maxDepth=3):
    # Merge a set of labels into one consistent type path: the first label fills
    # each depth slot; later labels only fill slots that are still empty.
    keep = [""] * maxDepth
    short2full = get_short2full_map(utils.get_ontoNotes_train_types())
    for l in labels:
        path = l.split('/')[1:]
        path = [short2full[k] for k in path]
        for i in range(len(path)):
            if keep[i] == "":
                keep[i] = path[i]
            elif keep[i] != path[i]:
                break

    return [l for l in keep if l != ""]
Example #5
def evaluate_all(my_arg, pr=True):
    emb = LoadEmbedding('res/onto_emb.txt')
    print('finish loading embedding')
    batch_size = 1000
    batch_getter = OntoNotesFGGetter('data/OntoNotes/test.json',
                                     utils.get_ontoNotes_train_types(),
                                     batch_size, True)
    print('finish loading test data')
    ctx_lstm = CtxLSTM(emb.get_emb_size())
    embedding_layer = EmbeddingLayer(emb)
    ctx_att = CtxAtt(fg_config['hidden_size'], emb.get_emb_size())
    sigmoid_loss = SigmoidLoss(fg_config['hidden_size'], emb.get_emb_size())

    if fg_config['USE_CUDA']:
        embedding_layer.cuda(fg_config['cuda_num'])
        ctx_lstm.cuda(fg_config['cuda_num'])
        ctx_att.cuda(fg_config['cuda_num'])
        sigmoid_loss.cuda(fg_config['cuda_num'])
    model_dir = 'et_model' + str(my_arg)
    embedding_layer.load_state_dict(
        torch.load(model_dir + '/embedding_layer.pkl'))
    ctx_lstm.load_state_dict(torch.load(model_dir + '/ctx_lstm.pkl'))
    ctx_att.load_state_dict(torch.load(model_dir + '/ctx_att.pkl'))
    sigmoid_loss.load_state_dict(torch.load(model_dir + '/sigmoid_loss.pkl'))
    embedding_layer.eval()
    ctx_lstm.eval()
    ctx_att.eval()
    sigmoid_loss.eval()
    ex_iterations = 0
    evaluator = BoundaryPerformance()
    for iteration, this_batch in enumerate(batch_getter):
        pred, label = evaluate_one(ex_iterations + iteration, embedding_layer,
                                   ctx_lstm, ctx_att, sigmoid_loss, this_batch)

        evaluator.evaluate(label, pred)
        if (iteration + 1) * batch_size % 100 == 0:
            print('{} sentences processed'.format(
                (iteration + 1) * batch_size))
            evaluator.get_performance()
    return evaluator.get_performance()
Example #6
 def __init__(self,
              hidden_size,
              word_emb_size,
              dropout_p=fg_config['dropout']):
     super(WARPLoss, self).__init__()
     require_type_lst = None
     if fg_config['data'] == 'onto':
         require_type_lst = utils.get_ontoNotes_train_types()
     elif fg_config['data'] == 'wiki':
         require_type_lst = utils.get_wiki_types()
     elif fg_config['data'] == 'bbn':
         require_type_lst = utils.get_bbn_types()
     num_labels = len(require_type_lst)
     self.weight = nn.Parameter(
         torch.zeros(hidden_size * 2 + word_emb_size, word_emb_size))
     utils.init_weight(self.weight)
     self.rank_weights = [1.0 / 1]
     for i in range(1, num_labels):
         # harmonic WARP rank weights: alpha(k) = sum_{j=1..k} 1/j
         self.rank_weights.append(self.rank_weights[i - 1] + 1.0 / (i + 1))
     self.trans = nn.Linear(hidden_size * 2 + word_emb_size, word_emb_size)
     utils.init_linear(self.trans)
     self.activate = nn.ReLU()
     self.dropout = nn.Dropout(dropout_p)
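The rank weights above follow the recurrence rank_weights[k] = rank_weights[k-1] + 1/(k+1); a small self-contained check, assuming the intended weighting is the usual WARP scheme where the k-th weight is the harmonic sum 1 + 1/2 + ... + 1/k:

# Quick sanity check that the recurrence reproduces harmonic partial sums.
num_labels = 5
rank_weights = [1.0]
for i in range(1, num_labels):
    rank_weights.append(rank_weights[i - 1] + 1.0 / (i + 1))

harmonic = [sum(1.0 / j for j in range(1, k + 2)) for k in range(num_labels)]
assert all(abs(a - b) < 1e-9 for a, b in zip(rank_weights, harmonic))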
Example #7
def main(my_arg):
    log_dir = 'et_logs' + str(my_arg)
    logger = Logger(log_dir)
    emb = LoadEmbedding('res/onto_embedding.txt')
    print('finish loading embedding')
    batch_getter = OntoNotesFGGetter('data/OntoNotes/train.json',
                                     utils.get_ontoNotes_train_types(),
                                     fg_config['batch_size'], True)
    # batch_getter = OntoNotesNZGetter('data/OntoNotes/train.json', utils.get_ontoNotes_train_types(),
    #                                  fg_config['batch_size'], True)
    print('finish loading train data')
    ctx_lstm = CtxLSTM(emb.get_emb_size())
    embedding_layer = EmbeddingLayer(emb)
    ctx_att = CtxAtt(fg_config['hidden_size'], emb.get_emb_size())
    sigmoid_loss = SigmoidLoss(fg_config['hidden_size'], emb.get_emb_size())
    warp_loss = WARPLoss(89)

    if fg_config['USE_CUDA']:
        embedding_layer.cuda(fg_config['cuda_num'])
        ctx_lstm.cuda(fg_config['cuda_num'])
        ctx_att.cuda(fg_config['cuda_num'])
        sigmoid_loss.cuda(fg_config['cuda_num'])

    ctx_lstm_opt = torch.optim.Adam(ctx_lstm.parameters())
    ctx_att_opt = torch.optim.Adam(ctx_att.parameters())
    sig_opt = torch.optim.Adam(sigmoid_loss.parameters())

    log_file = open('{}/log_file'.format(log_dir), 'w')
    f_max = 0
    low_epoch = 0
    ex_iterations = 0
    model_dir = 'et_model' + str(my_arg)
    time0 = time.time()
    for epoch in range(fg_config['max_epoch']):
        embedding_layer.train()
        ctx_lstm.train()
        ctx_att.train()
        sigmoid_loss.train()
        # f, p, r = evaluate_all(my_arg, False)
        for iteration, this_batch in enumerate(batch_getter):
            if (ex_iterations + iteration) % 100 == 0:
                print('epoch: {}, iteration: {}'.format(
                    epoch, ex_iterations + iteration))

            train_iteration(logger, ex_iterations + iteration, embedding_layer,
                            ctx_lstm, ctx_att, sigmoid_loss, ctx_lstm_opt,
                            ctx_att_opt, sig_opt, this_batch)
            if (ex_iterations + iteration) % 100 == 0:
                time1 = time.time()
                print('this iteration time: ', time1 - time0, '\n')
                time0 = time1
            if (ex_iterations + iteration) % fg_config['save_freq'] == 0:
                torch.save(embedding_layer.state_dict(),
                           model_dir + '/embedding_layer.pkl')
                torch.save(ctx_lstm.state_dict(), model_dir + '/ctx_lstm.pkl')
                torch.save(ctx_att.state_dict(), model_dir + '/ctx_att.pkl')
                torch.save(sigmoid_loss.state_dict(),
                           model_dir + '/sigmoid_loss.pkl')

        ex_iterations += iteration + 1
        batch_getter.reset()
        fg_config['use_dropout'] = False
        f, p, r = evaluate_all(my_arg, False)
        fg_config['use_dropout'] = True
        log_file.write('epoch: {} f: {} p: {} r: {}\n'.format(epoch, f, p, r))
        log_file.flush()
        if f >= f_max:
            f_max = f
            low_epoch = 0
            os.system('cp {}/embedding_layer.pkl {}/early_embedding_layer.pkl'.
                      format(model_dir, model_dir))
            os.system('cp {}/ctx_lstm.pkl {}/early_ctx_lstm.pkl'.format(
                model_dir, model_dir))
            os.system('cp {}/ctx_att.pkl {}/early_ctx_att.pkl'.format(
                model_dir, model_dir))
            os.system(
                'cp {}/sigmoid_loss.pkl {}/early_sigmoid_loss.pkl'.format(
                    model_dir, model_dir))

        else:
            low_epoch += 1
            log_file.write('low' + str(low_epoch) + '\n')
            log_file.flush()
        if low_epoch >= fg_config['early_stop']:
            break
    log_file.close()
Example #8
def evaluate_free(my_arg, pr=True):
    word_emb = LoadEmbedding('res/glove_840B_emb.txt')
    type_emb = LoadEmbedding('res/{}/zero_type_emb.txt'.format(fg_config['data']))
    print('finish loading embedding')
    batch_size = 100
    depth = None
    if fg_config['zero_shot']:
        depth = 2
    elif fg_config['no_zero'] == 'all':
        depth = None
    elif fg_config['no_zero'] == 'one':
        depth = 1

    type_lst = get_down_type_lst(depth, fg_config['data'])
    batch_getter = OntoNotesNZGetter('data/{}/test.json'.format(fg_config['data']),
                                     type_lst, batch_size, True, depth)
    print('finish loading test data')
    ctx_lstm = CtxLSTM(word_emb.get_emb_size())
    word_embedding_layer = EmbeddingLayer(word_emb)
    type_embedding_layer = EmbeddingLayer(type_emb)
    ctx_att = NZCtxAtt(fg_config['hidden_size'], word_emb.get_emb_size())
    warp_loss = WARPLoss(fg_config['hidden_size'], word_emb.get_emb_size())

    if fg_config['USE_CUDA']:
        word_embedding_layer.cuda(fg_config['cuda_num'])
        type_embedding_layer.cuda(fg_config['cuda_num'])
        ctx_lstm.cuda(fg_config['cuda_num'])
        ctx_att.cuda(fg_config['cuda_num'])
        warp_loss.cuda(fg_config['cuda_num'])
    model_dir = '{}/et_model{}'.format(fg_config['data'], str(my_arg))
    word_embedding_layer.load_state_dict(torch.load(model_dir+'/early_embedding_layer.pkl'))
    type_embedding_layer.load_state_dict(torch.load(model_dir+'/early_type_embedding_layer.pkl'))
    ctx_lstm.load_state_dict(torch.load(model_dir+'/early_ctx_lstm.pkl'))
    ctx_att.load_state_dict(torch.load(model_dir+'/early_ctx_att.pkl'))
    warp_loss.load_state_dict(torch.load(model_dir+'/early_sigmoid_loss.pkl'))
    word_embedding_layer.eval()
    type_embedding_layer.eval()
    ctx_lstm.eval()
    ctx_att.eval()
    warp_loss.eval()
    ex_iterations = 0
    evaluator = BoundaryPerformance()
    short2full = None
    if fg_config['data'] == 'onto':
        short2full = get_short2full_map(utils.get_ontoNotes_train_types())
    elif fg_config['data'] == 'wiki':
        short2full = get_short2full_map(utils.get_wiki_types())
        patch = utils.wiki_short2full_patch()
        short2full.update(patch)
    elif fg_config['data'] == 'bbn':
        short2full = get_short2full_map(utils.get_bbn_types())
    for iteration, this_batch in enumerate(batch_getter):
        pred, label = evaluate_one(ex_iterations + iteration, word_embedding_layer, type_embedding_layer,
                                   ctx_lstm, ctx_att, warp_loss, this_batch)

        # evaluator.evaluate(label, pred, type_lst, short2full)
        evaluator.evaluate(this_batch['types_str'], pred, type_lst, short2full)
        if (iteration+1)*batch_size % 100 == 0:
            print('{} sentences processed'.format((iteration+1)*batch_size))
            evaluator.get_performance()
    return evaluator.get_performance()
Example #9
                    src_tokens = []
                    targets = []
            elif blank_line == 1:
                if line.strip():
                    parts = line.strip().split(' ')
                    src_tokens.append(parts[0])
                    targets.append(parts[3])

        train_file.close()
        self.all_samples = all_samples
        self.sample_num = len(self.all_samples)


if __name__ == '__main__':
    # config['Tags'] = {'<PADDING>': 0, '<START>': 1, 'B': 2, 'I': 3, 'O': 4, 'E': 5, 'S': 6}
    # config['misc'] = False
    # config['bioes'] = True
    # config['use_gaz'] = False
    # onto_notes = OntoNotesGetter('data/OntoNotes/test.json', '/organization/company', 1, False)
    # p = ConllBatchGetter('data/conll2003/bioes_eng.train', 'PER', 1, True)
    # pernam_batch_getter = TrainConllBatchGetter('data/conll2003/bioes_eng.train', 'PER', 1, True)
    # pass
    batch_getter = OntoNotesFGGetter('data/OntoNotes/test.json',
                                     utils.get_ontoNotes_train_types(),
                                     fg_config['batch_size'], True)

    print('finish load')
    for this_batch in batch_getter:
        pass
    print('finish read')