Example #1
import tensorflow as tf
from model import NerModel
from utils import tokenize, read_vocab, format_result, build_embedding_matrix
import tensorflow_addons as tf_ad
from args_help import args
import json
import numpy as np

# For the test set: build the vocab dict, the tag dict, the padded text sequences, and the pretrained embedding matrix
vocab2id, id2vocab = read_vocab(args.vocab_file)
tag2id, id2tag = read_vocab(args.tag_file)
print(id2tag)
text_sequences, label_sequences, text_origin, label_origin = tokenize(
    args.test_path, vocab2id, tag2id)
# text_sequences has shape (159, 110)
embedded_matrix = build_embedding_matrix(args.pretrain_embedding_vec, vocab2id)

# print('Inspect the value and shape of text_sequences:')
# print(text_sequences.shape)
# print(type(text_sequences))

# Load the model
optimizer = tf.keras.optimizers.Adam(args.lr)
model = NerModel(hidden_num=args.hidden_num,
                 vocab_size=len(vocab2id),
                 label_size=len(tag2id),
                 embedding_size=args.embedding_size,
                 embedding_matrix=embedded_matrix)
# restore model
ckpt = tf.train.Checkpoint(optimizer=optimizer, model=model)
ckpt.restore(tf.train.latest_checkpoint(args.output_dir))
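# The snippet stops after restoring the checkpoint. A minimal inference sketch
# (an assumption, not part of the source): assuming NerModel returns
# (logits, text_lens, log_likelihood) and exposes CRF transition_params,
# decode the best tag sequence for each test sentence with tensorflow_addons.
logits, text_lens, _ = model(tf.constant(text_sequences), tf.constant(label_sequences))
for logit, text_len in zip(logits, text_lens):
    viterbi_path, _ = tf_ad.text.viterbi_decode(logit[:text_len], model.transition_params)
    print([id2tag[t] for t in viterbi_path])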
Example #2
import pickle
import utils
from utils import encoderRNN, MLP
import numpy as np
import random
import torch
import torch.nn as nn

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# word_list, encoded_labels, n_worker, encoded_cls, pad_sens_tensor,int_to_word = utils.load_movie_review()
# crowd_data_mask = utils.crowd_data_mask(encoded_cls)
# print(f"word_size:\t{len(word_list)}")
word_list, encoded_labels, n_worker, crowd_data_mask, pad_sens_tensor, int_to_word = utils.load_tweets10k()
# hidden, encoded_labels, crowd_data_mask, n_worker = utils.load_blue_birds()
# hidden, encoded_labels, crowd_data_mask, n_worker = utils.load_MS()

embedding_matrix = utils.build_embedding_matrix(int_to_word, word_dim=200)
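# word_size presumably tracks word_dim=200 above (the embedding dimension)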
word_size = 200
hidden_size = 10
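# the last axis of the crowd-annotation mask indexes the label classes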
n_label = crowd_data_mask.size(2)
#-------------------------------------------------------   PCA   -------------------------------------------------------
# U_mat, S_mat, V_mat = torch.pca_lowrank(hidden)
# encoded_reviews = torch.matmul(hidden, V_mat)
# print(encoded_reviews)
#------------------------------------------------mlp encoder------------------------------------------------------------
# with torch.no_grad():
#     mlp = MLP(hidden.size(1), hidden_size)
#     encoded_reviews = mlp(hidden.float())
# print(encoded_reviews)
#--------------------------------------------------AutoEncoder-----------------------------------------------------------
class AutoEncoder(nn.Module):
    def __init__(self, input_size, hidden_size):
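        # The source is truncated inside __init__. A minimal completion
        # sketch, assuming a single-layer encoder/decoder pair (hypothetical,
        # not necessarily the author's exact architecture):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Linear(input_size, hidden_size)
        self.decoder = nn.Linear(hidden_size, input_size)

    def forward(self, x):
        # compress to the hidden space, then reconstruct the input
        hidden = torch.relu(self.encoder(x))
        return self.decoder(hidden), hidden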
Example #3
    train_content_word, train_question_word, train_content_char, train_question_char, train_y = ds.get_dataset(
        './data/squad/train-v1.1.json',
        pickle_data_path=train_data_pickle_data_path)
    test_content_word, test_question_word, test_content_char, test_question_char, test_y = ds.get_dataset(
        './data/squad/dev-v1.1.json',
        pickle_data_path=test_data_pickle_data_path)

    print('dataset load done!')

    embedding_matrix_file = './data/glove/embedding_matrix'

    print(len(ds.wordset))
    print(train_content_word.shape, train_content_char.shape, train_y.shape)
    print(test_content_word.shape, test_content_char.shape, test_y.shape)

    embedding_matrix = build_embedding_matrix(
        ds.word2id, embed_dim=50, embedding_matrix_file=embedding_matrix_file)

    print('embedding_matrix shape {}'.format(embedding_matrix.shape))
    print('embedding done!')

    bidaf = BiDAF(clen=ds.max_clen,
                  qlen=ds.max_qlen,
                  glove_weight=embedding_matrix,
                  char_vocab_size=len(ds.char2id))
    bidaf.build_model()

    bidaf.model.fit([
        train_content_word, train_question_word, train_content_char,
        train_question_char
    ],
                    train_y,
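                    # the call is truncated in the source; a plausible closing
                    # with hypothetical hyperparameters and the loaded test
                    # split used for validation:
                    validation_data=([
                        test_content_word, test_question_word,
                        test_content_char, test_question_char
                    ], test_y),
                    batch_size=32,
                    epochs=10)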
Example #4
def train(args):
    utils.make_all_dirs(current_time)
    if args.load_var:
        all_utterances, labels, word_dict = read_data(load_var=args.load_var,
                                                      input_=None,
                                                      mode='train')
        dev_utterances, dev_labels, _ = read_data(load_var=args.load_var,
                                                  input_=None,
                                                  mode='dev')
    else:
        all_utterances, labels, word_dict = read_data(load_var=args.load_var, \
                input_=os.path.join(constant.data_path, "entangled_train.json"), mode='train')
        dev_utterances, dev_labels, _ = read_data(load_var=args.load_var, \
                input_=os.path.join(constant.data_path, "entangled_dev.json"), mode='dev')

    word_emb = build_embedding_matrix(word_dict, glove_loc=args.glove_loc, \
                    emb_loc=os.path.join(constant.save_input_path, "word_emb.pk"), load_emb=False)

    if args.save_input:
        utils.save_or_read_input(os.path.join(constant.save_input_path, "train_utterances.pk"), \
                                    rw='w', input_obj=all_utterances)
        utils.save_or_read_input(os.path.join(constant.save_input_path, "train_labels.pk"), \
                                    rw='w', input_obj=labels)
        utils.save_or_read_input(os.path.join(constant.save_input_path, "word_dict.pk"), \
                                    rw='w', input_obj=word_dict)
        utils.save_or_read_input(os.path.join(constant.save_input_path, "word_emb.pk"), \
                                    rw='w', input_obj=word_emb)
        utils.save_or_read_input(os.path.join(constant.save_input_path, "dev_utterances.pk"), \
                                    rw='w', input_obj=dev_utterances)
        utils.save_or_read_input(os.path.join(constant.save_input_path, "dev_labels.pk"), \
                                    rw='w', input_obj=dev_labels)

    train_dataloader = TrainDataLoader(all_utterances, labels, word_dict)
    if args.add_noise:
        noise_train_dataloader = TrainDataLoader(all_utterances,
                                                 labels,
                                                 word_dict,
                                                 add_noise=True)
    else:
        noise_train_dataloader = None
    dev_dataloader = TrainDataLoader(dev_utterances,
                                     dev_labels,
                                     word_dict,
                                     name='dev')

    logger_name = os.path.join(constant.log_path,
                               "{}.txt".format(current_time))
    LOG_FORMAT = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    logging.basicConfig(format=LOG_FORMAT,
                        level=logging.INFO,
                        filename=logger_name,
                        filemode='w')
    logger = logging.getLogger()
    global log_head
    log_head = log_head + "Training Model: {}; ".format(args.model)
    if args.add_noise:
        log_head += "Add Noise: True; "
    logger.info(log_head)

    if args.model == 'T':
        ensemble_model_bidirectional = EnsembleModel(word_dict,
                                                     word_emb=word_emb,
                                                     bidirectional=True)
    elif args.model == 'TS':
        ensemble_model_bidirectional = EnsembleModel(word_dict,
                                                     word_emb=None,
                                                     bidirectional=True)
    else:
        ensemble_model_bidirectional = None
    if args.model == 'TS':
        ensemble_model_bidirectional.load_state_dict(
            torch.load(args.model_path))
    ensemble_model = EnsembleModel(word_dict,
                                   word_emb=word_emb,
                                   bidirectional=False)

    if torch.cuda.is_available():
        ensemble_model.cuda()
        if args.model == 'T' or args.model == 'TS':
            ensemble_model_bidirectional.cuda()

    supervised_trainer = SupervisedTrainer(args, ensemble_model, teacher_model=ensemble_model_bidirectional, \
                                                logger=logger, current_time=current_time)

    supervised_trainer.train(train_dataloader, noise_train_dataloader,
                             dev_dataloader)
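# A hypothetical launcher sketch: flag names are inferred from the attributes
# train() reads above; this is not the author's actual CLI definition.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--load_var', action='store_true')
    parser.add_argument('--save_input', action='store_true')
    parser.add_argument('--add_noise', action='store_true')
    parser.add_argument('--model', default='T')  # 'T' and 'TS' enable the bidirectional teacher
    parser.add_argument('--model_path', default='teacher.pt')  # only read when --model TS
    parser.add_argument('--glove_loc', default='glove.840B.300d.txt')
    train(parser.parse_args())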
Example #5
    cards = pd.read_csv('processed_sets.csv', sep='\t')
    corpus = cards['text'].str.split().values
    tokenizer = Tokenizer(num_words=MAXLEN)
    tokenizer.fit_on_texts(corpus)
    # save for later
    fname = path.join('tmp', 'mtg_tokenizer_%s.pickle' % size)
    with open(fname, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Word2Vec
    print('building word2vec model with vec size %d...' % size)
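    # note: 'size' is the pre-4.0 gensim parameter name (renamed to vector_size in gensim 4.x)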
    model = Word2Vec(corpus, size=size, window=5, seed=1234)
    # save model
    print('writing model...')
    build_embedding_matrix(tokenizer.word_docs.keys(),
                           tokenizer.word_index,
                           size=size)
    vocab = list(model.wv.vocab)
    X = model[vocab]

    # dimensionality reduction
    print('visualizing model...')
    n_components = 2
    tsne = TSNE(n_components=n_components,
                init='pca',
                random_state=1234,
                method='exact')
    X_tsne = tsne.fit_transform(X)
    df = pd.DataFrame(X_tsne, index=vocab, columns=['x', 'y'])

    # visualize
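    # The source is truncated here. A minimal completion sketch, assuming
    # matplotlib for the scatter plot (the output file name is hypothetical):
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(figsize=(12, 12))
    ax.scatter(df['x'], df['y'], s=2)
    for word, row in df.iterrows():  # label each point with its vocabulary word
        ax.annotate(word, (row['x'], row['y']), fontsize=6)
    fig.savefig(path.join('tmp', 'mtg_w2v_tsne_%d.png' % size))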