Example #1
import numpy as np
from gensim import models

import preprocess


def getwv():
    # Build an embedding matrix aligned with the index-to-word mapping.
    i2v = preprocess.load_obj('index2vocab')
    model = models.Word2Vec.load('w2v200.model.bin')
    embedding = np.zeros((len(i2v), model.wv.vector_size))
    missing = 0  # words without a pretrained vector keep a zero row
    for i in range(len(i2v)):
        try:
            embedding[i] = model.wv[i2v[i]]
        except KeyError:
            print(i2v[i], 'not in w2v')
            missing += 1
    print(missing)
    return embedding
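A minimal usage sketch, not taken from the original code: the matrix returned by getwv() can initialise a torch nn.Embedding layer. freeze=False is an illustrative choice.

import torch
import torch.nn as nn

# Wrap the pretrained word2vec rows in an embedding layer that stays trainable.
embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(getwv()),
                                               freeze=False)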
Example #2
import os

import torch

from model import BiLSTM_CRF
from preprocess import load_obj


def get_tag(model, sentence, idx_to_tag):
    # Add a batch dimension and mask out padding (index 0) positions.
    sentence = sentence.unsqueeze(1)
    mask = sentence.ne(0)
    best_tag_ids = model.decode(sentence, mask)
    tags = [idx_to_tag[idx] for idx in best_tag_ids[0]]
    return tags


if __name__ == '__main__':
    # TEST_SENTENCE, CUR_MODEL and UNK are assumed to be defined earlier in
    # the original script.
    print(TEST_SENTENCE)
    data_dir = 'data/chinese/processed'
    word_to_idx = load_obj(os.path.join(data_dir, 'word_to_idx.pkl'))
    tag_to_idx = load_obj(os.path.join(data_dir, 'tag_to_idx.pkl'))

    idx_to_tag = {v: k for k, v in tag_to_idx.items()}

    # vocab size, tagset size, embedding dim, hidden dim, dropout
    model = BiLSTM_CRF(len(word_to_idx), len(tag_to_idx), 100, 200, 0.1)
    model.load_state_dict(
        torch.load(CUR_MODEL, map_location=torch.device('cuda')))
    model.eval()

    processed_sen = [i.split('/')[0] for i in TEST_SENTENCE.split()]
    sentence = torch.LongTensor(
        [word_to_idx.get(w, word_to_idx[UNK]) for w in processed_sen])
    best_tags = get_tag(model, sentence, idx_to_tag)
    print(' '.join(best_tags))
Example #3
import random

import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm

import preprocess
# EncoderRNN, BAttnDecoderRNN, VanillaDecoderRNN, getDataLoader and to_var are
# assumed to be provided by the project's own modules.


def main(args):
    teacher_forcing_ratio = 0.5
    schedule_sampling_ratio = 0.7
    num_layers = 2
    features, _ = preprocess.readfeat()
    #labels, v_size = preprocess.label2onehot_single_sentence(preprocess.readlabel())
    labels, v_size = preprocess.label2onehot(preprocess.readlabel(), limit=12)
    i2v = preprocess.load_obj('index2vocab')
    print('feature shape =', features.shape)
    print('label shape =', labels.shape)
    dataloader = getDataLoader(features,
                               labels,
                               single_sentence=False,
                               batch_size=128)
    encoder = EncoderRNN(4096, 512, num_layers=num_layers, bidirectional=True)
    #decoder = VanillaDecoderRNN(512, v_size, num_layers=1)
    decoder = BAttnDecoderRNN(512, v_size, num_layers=num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    print(encoder)
    print(decoder)
    c = nn.CrossEntropyLoss()
    #c = nn.NLLLoss()
    params = list(decoder.parameters()) + list(encoder.parameters())
    optimizer = torch.optim.Adam(params, lr=0.0003)
    epochs = 16
    for epoch in range(epochs):
        losses = []
        desc = 'Epoch [{}/{}]'.format(epoch + 1, epochs)
        train_loss = []
        for images, inputs, targets, lengths in tqdm(dataloader, desc=desc):
            images, inputs, targets = to_var(images), to_var(inputs), to_var(
                targets)
            batch_size, caption_len = inputs.size()[0], inputs.size()[1]
            loss = 0
            encoder.zero_grad()
            decoder.zero_grad()
            encoder_output, encoder_hidden = encoder(images)
            decoder_hidden = encoder_hidden[:num_layers]
            decoder_outputs = torch.autograd.Variable(
                torch.zeros(batch_size, caption_len, v_size))
            if torch.cuda.is_available():
                decoder_outputs = decoder_outputs.cuda()

            # use teacher forcing or not
            if args.learntype == 'teacher_forcing':
                use_teacher_forcing = random.random() < teacher_forcing_ratio
                if use_teacher_forcing:
                    for wordindex in range(caption_len):
                        inputword = inputs[:, wordindex]
                        output, decoder_hidden = decoder(
                            inputword, decoder_hidden, encoder_output)
                        decoder_outputs[:, wordindex, :] = output
                else:
                    inputword = inputs[:, 0]
                    for wordindex in range(caption_len):
                        output, decoder_hidden = decoder(
                            inputword, decoder_hidden, encoder_output)
                        maxkey = torch.argmax(output.data, dim=1)
                        inputword = to_var(maxkey)
                        decoder_outputs[:, wordindex, :] = output
            # scheduled sampling
            else:
                inputword = inputs[:, 0]
                for wordindex in range(caption_len):
                    output, decoder_hidden = decoder(inputword, decoder_hidden,
                                                     encoder_output)
                    decoder_outputs[:, wordindex, :] = output
                    if (random.random() < schedule_sampling_ratio
                            and wordindex < caption_len - 1):
                        inputword = inputs[:, wordindex + 1]
                    else:
                        maxkey = torch.argmax(output.data, dim=1)
                        inputword = to_var(maxkey)

            for i in range(batch_size):
                loss += c(decoder_outputs[i], targets[i])
            loss.backward()
            optimizer.step()
            losses.append(loss.item() / batch_size)
        if (epoch + 1) % 2 == 0:
            torch.save(encoder.state_dict(),
                       args.model + '/encoder_epoch{}.pt'.format(epoch + 1))
            torch.save(decoder.state_dict(),
                       args.model + '/decoder_epoch{}.pt'.format(epoch + 1))
        print('loss={:.4f}'.format(np.average(losses)))
Example #4
def tagging(model, sentence, ix_to_tag):
    sentence = sentence.unsqueeze(1)
    mask = sentence.ne(0)
    best_tag_ids = model.decode(sentence, mask)
    tags = [ix_to_tag[idx] for idx in best_tag_ids[0]]
    return tags


if __name__ == "__main__":

    if args.sentence is None:
        raise ValueError("Please input an sentence")
    if args.model is None:
        raise ValueError("Please specify model file path")

    data_dir = "data/msra/processed"
    word_to_ix = load_obj(os.path.join(data_dir, "word_to_ix.pkl"))
    tag_to_ix = load_obj(os.path.join(data_dir, "tag_to_ix.pkl"))

    ix_to_tag = {v: k for k, v in tag_to_ix.items()}

    # Load trained model
    model = BiLSTM_CRF(len(word_to_ix), len(tag_to_ix), 100, 200, 0.1)
    model.load_state_dict(
        torch.load(args.model, map_location=torch.device("cpu")))
    model.eval()

    # Predict
    sentence = torch.LongTensor(
        [word_to_ix.get(w, word_to_ix[UNK]) for w in args.sentence])
    best_tags = tagging(model, sentence, ix_to_tag)
    print(" ".join(best_tags))
Example #5
import torch as th

from preprocess import tokenize_message, pad_features, load_obj

# Load dictionaries
index_to_word = load_obj("data/index_to_word")
word_to_index = load_obj("data/word_to_index")


def predict(net, message, use_gpu, sequence_length=200):
    """
    net: pytorch RNN model (without encryption)
    message: string
        message to classify, given as plain text
    use_gpu: Boolean
        if True, use GPU computation
    sequence_length: int
        length of the padded sequences
    """
    if len(message) == 0:
        return None
    net.eval()

    # tokenize the message
    test_ints = tokenize_message(message, word_to_index)

    # pad tokenized sequence
    seq_length = sequence_length
    features = pad_features(test_ints, seq_length)
Example #6
import torch
from torch.nn.utils.rnn import pad_packed_sequence

import preprocess
# EncoderRNN, LAttnDecoderRNN (and the other decoder variants), getDataLoader,
# getwv and to_var are assumed to be provided by the project's own modules.


def main(args):
    fout = open(args.output, 'w')
    maxlen = 20
    num_layers = 1
    dim = 512
    quotes_set = set(['[', ']', '{', '}', '!', '?', '。'])
    sentences, v_size = preprocess.label2onehot(args.input)
    data = getDataLoader(sentences)
    v = preprocess.load_obj('vocabindex')
    i2v = preprocess.load_obj('index2vocab')
    print(sentences)
    v_size = len(v)
    print('sentences shape =', sentences.shape)
    we = getwv()
    encoder = EncoderRNN(v_size,
                         dim,
                         we,
                         num_layers=num_layers,
                         bidirectional=False)
    #decoder = VanillaDecoderRNN(dim, v_size, num_layers=num_layers)
    #decoder = BAttnDecoderRNN(dim, v_size, num_layers=num_layers)
    decoder = LAttnDecoderRNN(dim, v_size, we, num_layers=num_layers)
    encoder.load_state_dict(torch.load(args.encoder))
    decoder.load_state_dict(torch.load(args.decoder))
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    print(encoder)
    print(decoder)
    i = 0
    for sentence in sentences:
        print(sentence)
        s = to_var(torch.LongTensor(sentence).view(1, -1))
        encoder_hidden = encoder.initHidden(num_layers, 1)
        encoder_o, encoder_hidden = encoder(s, [s.size(1)], encoder_hidden)
        encoder_o = pad_packed_sequence(encoder_o, batch_first=True)[0]
        decoder_hidden = encoder_hidden[:num_layers]
        inputword = v['<start>']
        flag = True
        sentence = ''
        words = []
        length = 0
        while flag:
            inputword = to_var(torch.LongTensor([inputword]).view(1, -1))
            output, decoder_hidden = decoder(inputword, decoder_hidden,
                                             encoder_o)
            maxkey = torch.argmax(output[0].data).item()
            inputword = maxkey
            word = i2v[maxkey]
            length += 1
            if length > maxlen:
                flag = False
            if word == '<end>' or word == '<pad>':
                flag = False
            elif word == '<unk>':
                continue
            else:
                if word == '.' or word == '。':
                    #sentence = sentence[:-1]
                    #sentence += word
                    flag = False
                else:
                    if word in words and word in quotes_set:
                        continue
                    if (len(words) == 0 or word != words[-1]):
                        words.append(word)
        sentence = ' '.join(words)
        print(sentence)
        fout.write(sentence + '\n')
        i += 1

    fout.close()
Example #7
class NERDataset(Dataset):  # reconstructed from the super() call; assumed to subclass torch.utils.data.Dataset
    def __init__(self, dataset_pkl):
        super(NERDataset, self).__init__()
        self.dataset = load_obj(dataset_pkl)
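    # Not in the original snippet: a torch DataLoader (as in Example #8) also
    # needs __len__ and __getitem__. A minimal sketch, assuming self.dataset
    # is a sequence of (word_indices, tag_indices) pairs.
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.dataset[idx]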
Example #8
import os

import torch
import torch.optim as optim
from torch.utils.data import DataLoader
# BiLSTM_CRF, NERDataset, BatchPadding, evaluate, load_obj and the upper-case
# hyperparameters (DATASET, BATCH_SIZE, EPOCHS, ...) are assumed to be
# provided by the project's own modules and config.


def my_train():
    os.makedirs("model_result", exist_ok=True)
    torch.manual_seed(1)
    device = torch.device('cuda')

    data_dir = f"data/{DATASET}/processed"

    # Load datasets and vocabularies
    train_data = NERDataset(os.path.join(data_dir, "train.pkl"))
    test_data = NERDataset(os.path.join(data_dir, "test.pkl"))
    dev_data = NERDataset(os.path.join(data_dir, "dev.pkl"))

    word_to_idx = load_obj(os.path.join(data_dir, "word_to_idx.pkl"))
    tag_to_idx = load_obj(os.path.join(data_dir, "tag_to_idx.pkl"))

    idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}

    train_loader = DataLoader(
        train_data,
        batch_size=BATCH_SIZE,
        collate_fn=BatchPadding(),
        shuffle=True,
        num_workers=2,
        pin_memory=True,
    )
    dev_loader = DataLoader(
        dev_data,
        batch_size=BATCH_SIZE,
        collate_fn=BatchPadding(),
        shuffle=True,
        num_workers=2,
        pin_memory=True,
    )
    test_loader = DataLoader(
        test_data,
        batch_size=BATCH_SIZE,
        collate_fn=BatchPadding(),
        shuffle=True,
        num_workers=2,
        pin_memory=True,
    )

    # Build the model
    model = BiLSTM_CRF(len(word_to_idx), len(tag_to_idx), EMBEDDING_DIM,
                       HIDDEN_DIM, DROPOUT).to(device)
    print(model)
    optimizer = optim.Adam(model.parameters(), lr=LEARN_RATE)

    print("\n开始训练")
    f1_max = 0
    cur_patience = 0  # 用于避免过拟合
    for epoch in range(EPOCHS):
        model.train()
        for i, (seqs, tags, masks) in enumerate(train_loader, 1):
            optimizer.zero_grad()
            loss = model.loss(seqs.to(device), tags.to(device),
                              masks.to(device))
            loss.backward()
            optimizer.step()
            if i % LOG_INTERVAL == 0:
                print("epoch {}: {:.0f}%\t\tLoss: {:.6f}".format(
                    epoch, 100.0 * i / len(train_loader), loss.item()))
        dev_precision, dev_recall, dev_f1 = evaluate(model, dev_loader,
                                                     idx_to_tag)
        test_precision, test_recall, test_f1 = evaluate(
            model, test_loader, idx_to_tag)
        print(
            f"\ndev\tprecision: {dev_precision}, recall: {dev_recall}, f1: {dev_f1}"
        )
        print(
            f"test\tprecision: {test_precision}, recall: {test_recall}, f1: {test_f1}\n"
        )

        torch.save(model.state_dict(), f"model_result/{epoch}.pt")

        if dev_f1 > f1_max:  # track the best dev F1 to detect overfitting
            f1_max = dev_f1
            cur_patience = 0
            if dev_f1 > 0.9 and test_f1 > 0.9:
                break
        else:
            cur_patience += 1
            if cur_patience >= PATIENCE:  # dev F1 stayed below the best for several epochs; stop
                break
    print("Best dev F1: ", f1_max)
Example #9
import preprocess


def index2sentence(s):
    # Print the words corresponding to a tensor of vocabulary indices.
    v = preprocess.load_obj('index2vocab')
    for w in s:
        print(v[w.item()], end=' ')
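A hypothetical call, with placeholder indices chosen only for illustration:

import torch

index2sentence(torch.LongTensor([2, 15, 7]))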