Example #1
# Assumed imports; Data, MyDataset, CNNModel, and LSTMNet come from the
# project's own modules.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

class Trainer:  # hypothetical class name; the original snippet shows only __init__
    def __init__(self, args):
        self.args = args

        # Build the vocabulary, tensorize the raw text, and load
        # pretrained (e.g. GloVe) vectors into an embedding matrix.
        data = Data(args.train_path, args.val_path, args.glove_path)
        data.build_vocab()
        train_data, val_data = data.input2tensor()
        embedding_matrix = data.build_embedding_matrix(args.embed_type,
                                                       args.embed_dim)
        train_dataset = MyDataset(train_data, data.max_len)
        val_dataset = MyDataset(val_data, data.max_len)

        # Shuffle only the training set; keep the validation order fixed.
        self.train_dataloader = DataLoader(train_dataset,
                                           batch_size=args.batch_size,
                                           shuffle=True)
        self.val_dataloader = DataLoader(val_dataset,
                                         batch_size=args.batch_size,
                                         shuffle=False)

        # Pick the architecture and move it to the target device.
        if args.model_type == 'CNN':
            self.model = CNNModel(args, data.vocab_size,
                                  embedding_matrix).to(args.device)
        else:
            self.model = LSTMNet(args, data.vocab_size,
                                 embedding_matrix).to(args.device)

        self.loss_func = nn.CrossEntropyLoss()
        self.optim = torch.optim.Adam(self.model.parameters(),
                                      lr=args.learning_rate)

        if torch.cuda.is_available():
            print('cuda memory allocated:',
                  torch.cuda.memory_allocated(device=args.device.index))
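
The snippet shows only the constructor, so a driver has to be guessed at. Below is a minimal, hypothetical setup: the `Trainer` name and every field on `args` are inferred from the attribute accesses in the code above, not taken from the original project.

import argparse
import torch

# Hypothetical argument setup; field names mirror what __init__ reads
# (args.train_path, args.embed_dim, args.model_type, ...).
parser = argparse.ArgumentParser()
parser.add_argument('--train_path', default='data/train.txt')
parser.add_argument('--val_path', default='data/val.txt')
parser.add_argument('--glove_path', default='data/glove.6B.100d.txt')
parser.add_argument('--embed_type', default='glove')
parser.add_argument('--embed_dim', type=int, default=100)
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--learning_rate', type=float, default=1e-3)
parser.add_argument('--model_type', default='CNN')
args = parser.parse_args()
args.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

trainer = Trainer(args)  # assumes the class name used above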
Example #2
import numpy as np

# Data and get_train_data come from the project's own modules.
def load_data(train_path, val_path, glove_path):
    data = Data(train_path, val_path, glove_path)
    train_x_list, _, val_x_list, _ = data.split_sentence()
    data.build_vocab()
    orig_data = train_x_list + val_x_list
    train_data = get_train_data(data.vocab, orig_data)

    print("Number of data instances: {}".format(len(train_data)))

    # The +1 likely reserves an extra index (e.g. for padding).
    vocab_size = len(data.vocab) + 1
    print("Vocabulary size:", vocab_size)

    # Unigram frequencies raised to the 3/4 power, as in word2vec's
    # negative sampling, then normalized to a probability distribution.
    dist = np.array([v for k, v in data.word_freq.items()])
    dist = np.power(dist, 0.75)
    dist = dist / dist.sum()

    return train_data, data.vocab, vocab_size, dist
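
The `0.75` exponent is the classic word2vec negative-sampling heuristic: raising raw counts to the 3/4 power flattens the distribution so rare words are drawn more often than their frequency alone would allow. A self-contained sketch of how the returned `dist` could drive sampling; the toy frequencies and the sampling step are illustrative, not part of the original snippet:

import numpy as np

# Toy word frequencies standing in for data.word_freq (hypothetical values).
word_freq = {'the': 1000, 'cat': 50, 'sat': 30, 'zyzzyva': 1}

dist = np.array(list(word_freq.values()), dtype=np.float64)
dist = np.power(dist, 0.75)   # 3/4 smoothing, as in word2vec
dist = dist / dist.sum()      # normalize so probabilities sum to 1

# Draw 5 negative-sample indices; index i refers to the i-th key of word_freq.
neg_indices = np.random.choice(len(dist), size=5, p=dist)
print(neg_indices)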