Example #1
def prepare_data(data_path, w2v_path, vocab_path, make_vocab=True,
                elmo_w2v_path=None,
                elmo_pca=False):

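    # Read anchor/check sentence pairs, labels and sequence lengths from the training file.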
    [anchor, 
    check, 
    label, 
    anchor_len, 
    check_len] = data_utils.read_data(data_path, 
                    "train", 
                    cut_tool, 
                    data_clearner_api,
                    "tab")

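    # Optionally build a vocabulary from the training text and cache filtered pretrained (word2vec or ELMo) embeddings at vocab_path.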
    if make_vocab:
        dic = data_utils.make_dic(anchor+check)
        if not elmo_w2v_path:
            data_utils.read_pretrained_embedding(w2v_path, dic, vocab_path, min_freq=3)
        else:
            data_utils.read_pretrained_elmo_embedding(w2v_path, dic, 
                            vocab_path, min_freq=3,
                            elmo_embedding_path=elmo_w2v_path,
                            elmo_pca=elmo_pca)

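    # Pickles written under Python 2 need an explicit encoding when loaded under Python 3.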
    if sys.version_info < (3, ):
        embedding_info = pkl.load(open(os.path.join(vocab_path), "rb"))
    else:
        embedding_info = pkl.load(open(os.path.join(vocab_path), "rb"), 
                                encoding="iso-8859-1")

    return [anchor, check, label, anchor_len, check_len, embedding_info]
Example #2
def test(args, vocab_size):
    device = t.device('cuda') if args.use_gpu else t.device('cpu')
    beam_size = args.beam_size
    topk = args.topk
    rev_model = args.load_model_path
    # print(rev_model)
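    # Build the Seq2Seq model and move it to the chosen device.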
    model = Seq2Seq(embed_size=args.embed_size,
                    enc_dec_output_size=args.enc_dec_output_size,
                    attn_size=args.attn_size,
                    num_layers=args.num_layers,
                    bidirectional=args.bidirectional,
                    use_gpu=args.use_gpu,
                    vocab_size=vocab_size).to(device)

    assert rev_model is not None

    # Load the previously saved model
    rev_path = os.path.join(model_dir, rev_model)
    if os.path.exists(rev_path):
        print('read in model from', rev_path)
        model.load(load_path=rev_path)

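    # Build the test set and its DataLoader.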
    batch_size = args.batch_size
    test_set = Set(read_data(args.test_data_root))
    test_loader = Loader(test_set,
                         batch_size,
                         shuffle=False,
                         use_gpu=args.use_gpu,
                         num_workers=args.num_workers).loader

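    # Run inference without gradient tracking, logging loss, perplexity and decoded outputs per batch.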
    model.eval()
    with t.no_grad():
        recorder.epoch_start(0, 'test', len(test_set))
        for batch_id, batch in enumerate(test_loader):
            encoder_inputs, seq_len, decoder_inputs, weights = batch
            encoder_inputs = encoder_inputs.to(device)
            seq_len = seq_len.to(device)
            decoder_inputs = decoder_inputs.to(device)
            weights = weights.to(device)
            logits, output_symbols = model(
                encoder_inputs,
                seq_len,
                decoder_inputs[:, :-1],
                mode='test',
                max_len=args.max_len,
                beam_search=False if args.beam_size == 1 else True,
                beam_size=args.beam_size,
                topk=args.topk)

            nll_loss = compute_loss(logits, decoder_inputs[:, 1:], weights)
            ppl = perplexity(nll_loss)
            recorder.batch_end(batch_id, batch_size, nll_loss, ppl)
            recorder.log_text(encoder_inputs.tolist(),
                              decoder_inputs[:, 1:].tolist(),
                              output_symbols.tolist())
        recorder.epoch_end()
Example #3
def main(argv):
    # random.seed(21)  # So we have the same partition every time.

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Read in data from .pickle as a list of (features, label) tuples
    # each representing a zipcode datapoint.
    data_and_labels = data_utils.read_data()

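    # Count the datapoints whose label is 0.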
    need = np.sum([1 if label[1] == 0 else 0 for label in data_and_labels])
    print('need', need)

    # Oversample
    # data_and_labels = data_utils.oversample(data_and_labels)
    # data_and_labels = data_utils.undersample(data_and_labels)

    # Standardize the data.
    x_data = [x[0] for x in data_and_labels]
    y_data = [x[1] for x in data_and_labels]
    scaler = preprocessing.StandardScaler()
    x_data = scaler.fit_transform(x_data)

    # New dataset that is standardized.
    data = [(x_data[i], y_data[i]) for i in range(len(x_data))]

    # Separate 80/10/10 as train/val/test partition.
    data_size = len(data)
    random.shuffle(data)
    train_data = data[:(data_size // 10) * 8]
    val_data = data[(data_size // 10) * 8 : (data_size // 10) * 9]
    test_data = data[(data_size // 10) * 9 :]

    train_data = data_utils.oversample_train(train_data)

    print(len(train_data), 'training points.')
    print(len(val_data), 'validation points.')
    print(len(test_data), 'testing points.')

    input_dim = len(data[0][0]) # number of features
    output_dim = 2 # two classes: food desert and not food desert
    hidden_dim_list = [16, 36, 36, 24]

    model_nn = FoodDesertClassifier(input_dim, hidden_dim_list,
            output_dim).to(device)
    loss = optimize_nn(model_nn, train_data, val_data, test_data)
    
    eval_model_nn(model_nn, loss, test_data, "Testing")
Example #4
def prepare_data(data_path, w2v_path, vocab_path):
    import time

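    # Read the training pairs, build the vocabulary, and cache the filtered word2vec embeddings.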
    [anchor, check, label, anchor_len,
     check_len] = data_utils.read_data(data_path, "train", cut_tool,
                                       data_clearner_api, "tab")

    dic = data_utils.make_dic(anchor + check)
    data_utils.read_pretrained_embedding(w2v_path, dic, vocab_path, min_freq=3)

    if sys.version_info < (3, ):
        embedding_info = pkl.load(open(os.path.join(vocab_path), "rb"))
    else:
        embedding_info = pkl.load(open(os.path.join(vocab_path), "rb"),
                                  encoding="iso-8859-1")

    return [anchor, check, label, anchor_len, check_len, embedding_info]
Example #5
def train(args, vocab_size):
    # opt._parse(kwarg)
    print('enter train func')
    device = t.device('cuda') if args.use_gpu else t.device('cpu')
    model = Seq2Seq(embed_size=args.embed_size,
                    enc_dec_output_size=args.enc_dec_output_size,
                    attn_size=args.attn_size,
                    num_layers=args.num_layers,
                    bidirectional=args.bidirectional,
                    use_gpu=args.use_gpu,
                    vocab_size=vocab_size).to(device)

    print('Model structure')
    print(model)
    print('The model has %d parameters' % count_parameters(model))

    # Resume from a saved checkpoint if one is given and exists; otherwise start from scratch.
    start_epoch = 1
    last_epoch = -1
    if args.load_model_path is not None:
        rev_path = os.path.join(model_dir, args.load_model_path)
        if os.path.exists(rev_path):
            print('read in model from', rev_path)
            last_epoch = model.load(load_path=rev_path,
                                    return_list=['epoch'])[0]
            start_epoch = last_epoch + 1
    optimizer = Adam(model.parameters(), lr=args.lr)
    if args.scheduler_type == 'exponential':
        scheduler = lr_scheduler.ExponentialLR(optimizer,
                                               gamma=args.exponential_lr_decay,
                                               last_epoch=last_epoch)
    elif args.scheduler_type == 'step':
        scheduler = lr_scheduler.StepLR(optimizer,
                                        step_size=args.step_size,
                                        gamma=args.step_lr_decay)
    print('read in data')

    # Load the data
    batch_size = args.batch_size
    train_set = Set(read_data(args.train_data_root))
    valid_set = Set(read_data(args.valid_data_root))
    # Build the DataLoaders
    train_loader = Loader(train_set,
                          batch_size,
                          shuffle=True,
                          use_gpu=args.use_gpu,
                          num_workers=args.num_workers).loader
    valid_loader = Loader(valid_set,
                          batch_size,
                          shuffle=False,
                          use_gpu=args.use_gpu,
                          num_workers=args.num_workers).loader

    # Report the dataset sizes
    print('data scale:')
    print('train data:', len(train_set), "batch_nums:", len(train_loader))
    print('valid data:', len(valid_set), "batch_nums:", len(valid_loader))

    # train
    print('start training...')
    epochs = args.max_epoch

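    # Main loop: train for one epoch, step the LR scheduler, checkpoint every 5 epochs, then evaluate on the validation set.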
    for epoch in range(start_epoch, epochs + 1):
        model.train()
        # Record stats at the start of the epoch
        recorder.epoch_start(epoch, 'train', len(train_set))

        if args.scheduler_type is not None:
            print(epoch, 'lr={:.10f}'.format(scheduler.get_lr()[0]))
        for batch_id, batch in enumerate(train_loader):
            encoder_inputs, seq_len, decoder_inputs, weights = batch
            encoder_inputs = encoder_inputs.to(device)
            seq_len = seq_len.to(device)
            decoder_inputs = decoder_inputs.to(device)
            weights = weights.to(device)
            optimizer.zero_grad()

            # Third argument: the last token of the longest sentence is EOS_I and is not needed as input; dropping it saves a little computation
            logits, output_symbols = model(
                encoder_inputs,
                seq_len,
                decoder_inputs[:, :-1],
                mode='train',
                max_len=None,
                teacher_forcing_ratio=args.teacher_forcing_ratio)

            # print('train out',output_symbols)
            # Compute the loss
            nll_loss = compute_loss(logits, decoder_inputs[:, 1:], weights)
            # Compute the perplexity
            ppl = perplexity(nll_loss)
            # print(nll_loss.item(), ppl.item())
            # Backpropagate, then update the parameters
            nll_loss.backward()
            # Clip gradients as a small trick to mitigate exploding gradients
            nn.utils.clip_grad_norm_(model.parameters(),
                                     args.max_gradient_norm)
            optimizer.step()

            recorder.batch_end(batch_id, batch_size, nll_loss, ppl)

        if args.scheduler_type is not None:
            scheduler.step()
        recorder.epoch_end()
        # Save a model checkpoint every 5 epochs
        if epoch % 5 == 0:
            model.save(os.path.join(
                model_dir,
                f'{args.project}_{datetime.datetime.now().strftime("%y_%m_%d_%H:%M:%S")}_{nll_loss.item()}_{ppl.item()}'
            ),
                       epoch=epoch)
        # After each training epoch, compute loss and ppl on the validation set
        model.eval()
        with t.no_grad():
            recorder.epoch_start(epoch, 'eval', len(valid_set))
            for batch_id, batch in enumerate(valid_loader):
                encoder_inputs, seq_len, decoder_inputs, weights = batch
                encoder_inputs = encoder_inputs.to(device)
                seq_len = seq_len.to(device)
                decoder_inputs = decoder_inputs.to(device)
                weights = weights.to(device)
                logits, output_symbols = model(
                    encoder_inputs,
                    seq_len,
                    decoder_inputs[:, :-1],
                    mode='eval',
                    max_len=args.max_len,
                    beam_search=False if args.beam_size == 1 else True,
                    beam_size=args.beam_size,
                    topk=args.topk)
                # print('eval out: ', output_symbols)
                nll_loss = compute_loss(logits, decoder_inputs[:, 1:], weights)
                ppl = perplexity(nll_loss)
                recorder.batch_end(batch_id, batch_size, nll_loss, ppl)
                recorder.log_text(encoder_inputs.tolist(),
                                  decoder_inputs[:, 1:].tolist(),
                                  output_symbols.tolist())
            recorder.epoch_end()
Example #6
    parser.add_argument("--z_dim", type=int, default=32)
    parser.add_argument("--seq_len", type=int, default=10)

    parser.add_argument("--epochs", type=int, default=100)
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--prt_evry", type=int, default=1)
    parser.add_argument("--save_evry", type=int, default=10)
    parser.add_argument("--lr", type=float, default=1e-3)

    config = parser.parse_args()
    device = th.device('cuda' if th.cuda.is_available() else 'cpu')
    dir_name = mk_dir(config.data + 'experiment')

    print(config, "DEVICE", device)

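    # Load the piano-roll dataset and cut it into fixed-length training sequences.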
    data = read_data('data/pianorolls/{}.pkl'.format(config.data))

    train_data, test_data = data2seq(data=data,
                                     split='train',
                                     seq_len=config.seq_len)

    if config.model == "VRNN":
        model = VRNN(config, device)
    else:
        raise NotImplementedError(config.model)

    model.to(device)

    epoch = 0

    while (epoch < config.epochs):
Example #7
w2v_path = "/data/xuht/Chinese_w2v/sgns.merge.char/sgns.merge.char.pkl"
# vocab_path = "/data/xuht/duplicate_sentence/ChineseSTSCorpus/emb_mat.pkl"
vocab_path = "/data/xuht/duplicate_sentence/LCQMC/emb_mat.pkl"

data_clearner_api = data_clean.DataCleaner({})
cut_tool = data_utils.cut_tool_api()

import time

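# Read the training pairs, build a vocabulary, and cache the filtered embedding matrix at vocab_path.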
[train_anchor, 
train_check, 
train_label, 
train_anchor_len, 
train_check_len] = data_utils.read_data(train_data_path, 
                    "train", 
                    cut_tool, 
                    data_clearner_api,
                    "tab")
                
dic = data_utils.make_dic(train_anchor+train_check)
data_utils.read_pretrained_embedding(w2v_path, dic, vocab_path, min_freq=3)

if sys.version_info < (3, ):
    embedding_info = pkl.load(open(os.path.join(vocab_path), "rb"))
else:
    embedding_info = pkl.load(open(os.path.join(vocab_path), "rb"), 
                            encoding="iso-8859-1")

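# Unpack the vocabulary mappings and the pretrained embedding matrix.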
token2id = embedding_info["token2id"]
id2token = embedding_info["id2token"]
embedding_mat = embedding_info["embedding_matrix"]