Example #1
def train():
    conf = configuration.Config()
    tokenizer = tokenization.FullTokenizer(
        vocab_file=conf.file_config.vocab_file)

    model = models.TransformerEncoder(conf)
    model = model.to(device)

    if args.train:
        train_dataset = datasets.OnlineShopping(
            mode='train',
            config=conf,
            tokenizer=tokenizer,
            auto_padding=conf.train_config.auto_padding)

        logging.info("***** Running training *****")
        logging.info("  Num examples = %d", len(train_dataset))
        logging.info("  Total training steps: {}".format(
            train_dataset.num_steps))

        train_dataloader = DataLoader(
            train_dataset,
            batch_size=conf.train_config.train_batch_size,
            shuffle=True,
            collate_fn=collate_fn)

        run(config=conf,
            dataloader=train_dataloader,
            model=model,
            mode='train',
            total_steps=train_dataset.num_steps)

    if args.dev:
        dev_dataset = datasets.OnlineShopping(
            mode='dev',
            config=conf,
            tokenizer=tokenizer,
            auto_padding=conf.train_config.auto_padding)

        logging.info("***** Running validating *****")
        logging.info("  Num examples = %d", len(dev_dataset))
        logging.info("  Total validating steps: {}".format(
            dev_dataset.num_steps))

        dev_dataloader = DataLoader(
            dev_dataset,
            batch_size=conf.train_config.train_batch_size,
            collate_fn=collate_fn)

        run(config=conf, dataloader=dev_dataloader, model=model, mode='eval')
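
The collate_fn passed to both DataLoader calls is referenced but not defined in this excerpt. Below is a minimal sketch of what it might look like, assuming each dataset item is a (token_ids, label) pair, that labels may be absent in prediction mode, and that sequences are padded to the longest one in the batch; the padding value and names here are assumptions, not part of the original project.

import torch

def collate_fn(batch, pad_id=0):
    # Hypothetical collate function: pad every token-id sequence in the batch
    # to the length of the longest one and stack them into a single tensor.
    token_ids, labels = zip(*batch)
    max_len = max(len(ids) for ids in token_ids)
    padded = torch.full((len(batch), max_len), pad_id, dtype=torch.long)
    for i, ids in enumerate(token_ids):
        padded[i, :len(ids)] = torch.as_tensor(ids, dtype=torch.long)
    # Labels may be missing (e.g. in 'single_predict' mode); pass None through.
    label_tensor = None if labels[0] is None else torch.as_tensor(labels, dtype=torch.long)
    return padded, label_tensor
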
Example #2
def predict(texts):
    conf = configuration.Config()
    tokenizer = tokenization.FullTokenizer(
        vocab_file=conf.file_config.vocab_file)

    model = models.TransformerEncoder(conf)
    model = model.to(device)

    if os.path.exists(
            os.path.join(conf.train_config.model_dir,
                         conf.train_config.model_name)):
        logging.info(' *** Loading model ***')
        model.load_state_dict(
            torch.load(
                os.path.join(conf.train_config.model_dir,
                             conf.train_config.model_name)))
    else:
        logging.info(' *** No model available. *** ')
        return

    predict_dataset = datasets.OnlineShopping(mode='single_predict',
                                              config=conf,
                                              tokenizer=tokenizer,
                                              auto_padding=True,
                                              texts=texts)
    predict_dataloader = DataLoader(predict_dataset,
                                    batch_size=len(predict_dataset),
                                    collate_fn=collate_fn)
    model.eval()
    data = next(iter(predict_dataloader))
    text_ids, _ = [t.to(device) if t is not None else t for t in data]
    with torch.no_grad():
        logits = model(text_ids)
    probs, predictions = get_predictions(logits)

    labels = [
        predict_dataset.convert_label_id_to_value(prediction.item())
        for prediction in predictions
    ]
    return {
        text: {'result': label, 'probability': prob.item()}
        for text, label, prob in zip(texts, labels, probs)
    }
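
get_predictions is also not shown. A plausible sketch follows, assuming it turns the classifier logits into the probability of the predicted class and the predicted class index for each example; this is an assumption, not the project's actual helper.

import torch.nn.functional as F

def get_predictions(logits):
    # Softmax over the class dimension, then keep the most likely class per example.
    probs = F.softmax(logits, dim=-1)           # [batch_size, num_classes]
    max_probs, predictions = probs.max(dim=-1)  # both of shape [batch_size]
    return max_probs, predictions
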
Example #3
    def __init__(self,
                 config,
                 use_attention=True,
                 encoder=None,
                 decoder=None,
                 src_padding_idx=0,
                 tgt_padding_idx=0,
                 label_smoothing=0,
                 tgt_vocab=None):
        """
        Initialization of variables and functions
        :param config: configuration
        :param use_attention: use attention or not, consistent with seq2seq
        :param encoder: encoder
        :param decoder: decoder
        :param src_padding_idx: source padding index
        :param tgt_padding_idx: target padding index
        :param label_smoothing: ratio for label smoothing
        :param tgt_vocab: target vocabulary
        """
        super(tensor2tensor, self).__init__()

        self.config = config

        # pretrained encoder or not
        if encoder is not None:
            self.encoder = encoder
        else:
            self.encoder = models.TransformerEncoder(
                config, padding_idx=src_padding_idx)
        tgt_embedding = self.encoder.embedding if config.shared_vocab else None
        # pretrained decoder or not
        if decoder is not None:
            self.decoder = decoder
        else:
            self.decoder = models.TransformerDecoder(
                config,
                tgt_embedding=tgt_embedding,
                padding_idx=tgt_padding_idx)
        # log softmax should specify dimension explicitly
        self.log_softmax = nn.LogSoftmax(dim=-1)
        self.use_cuda = config.use_cuda
        self.config = config
        self.label_smoothing = label_smoothing
        if self.label_smoothing > 0:
            self.criterion = LabelSmoothingLoss(label_smoothing,
                                                config.tgt_vocab_size,
                                                ignore_index=tgt_padding_idx)
        else:
            self.criterion = nn.CrossEntropyLoss(ignore_index=utils.PAD)
        if config.use_cuda:
            self.criterion.cuda()
        self.compute_score = nn.Linear(config.hidden_size,
                                       config.tgt_vocab_size)

        # Use rl or not. Should specify a reward provider. Not available yet in this framework.
        # if config.rl:
        # self.bleu_scorer = bleu.Scorer(pad=0, eos=3, unk=1)
        # self.reward_provider = CTRRewardProvider(config.ctr_reward_provider_path)
        # self.tgt_vocab = tgt_vocab
        self.padding_idx = tgt_padding_idx
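
LabelSmoothingLoss is constructed above with (label_smoothing, tgt_vocab_size, ignore_index=tgt_padding_idx), but its body is not part of this excerpt. The sketch below is a generic label-smoothing criterion consistent with that call, operating on log-probabilities such as the output of self.log_softmax; it is not necessarily the project's own implementation.

import torch.nn as nn

class LabelSmoothingLoss(nn.Module):
    def __init__(self, label_smoothing, vocab_size, ignore_index=0):
        super(LabelSmoothingLoss, self).__init__()
        self.ignore_index = ignore_index
        self.confidence = 1.0 - label_smoothing
        # Spread the smoothing mass over the remaining classes
        # (excluding the true class and the padding symbol).
        self.smoothing_value = label_smoothing / (vocab_size - 2)

    def forward(self, log_probs, target):
        # log_probs: [batch, vocab_size] log-probabilities; target: [batch] class ids.
        model_prob = log_probs.new_full(log_probs.size(), self.smoothing_value)
        model_prob.scatter_(1, target.unsqueeze(1), self.confidence)
        model_prob.masked_fill_((target == self.ignore_index).unsqueeze(1), 0.0)
        return -(model_prob * log_probs).sum(dim=1).mean()
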
Example #4
    desc_encoder = models.CNNEncoder(len(DESC.vocab), args.emb_dim,
                                     args.filter_size, args.n_layers,
                                     args.dropout, device)

    code_pooler = models.EmbeddingPooler(args.emb_dim, args.pool_mode)

    desc_pooler = models.EmbeddingPooler(args.emb_dim, args.pool_mode)

elif args.model == 'transformer':

    code_pad_idx = CODE.vocab.stoi[CODE.pad_token]
    desc_pad_idx = DESC.vocab.stoi[DESC.pad_token]

    code_encoder = models.TransformerEncoder(len(CODE.vocab), args.emb_dim,
                                             args.hid_dim, args.n_layers,
                                             args.n_heads, args.dropout,
                                             code_pad_idx, device)

    desc_encoder = models.TransformerEncoder(len(DESC.vocab), args.emb_dim,
                                             args.hid_dim, args.n_layers,
                                             args.n_heads, args.dropout,
                                             desc_pad_idx, device)

    code_pooler = models.EmbeddingPooler(args.emb_dim, args.pool_mode)

    desc_pooler = models.EmbeddingPooler(args.emb_dim, args.pool_mode)

else:
    raise ValueError(f'Model {args.model} not valid!')

if args.model == 'transformer':
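
models.EmbeddingPooler is constructed in the example above with an embedding dimension and a pooling mode but is not defined in this excerpt. Here is a hedged sketch of such a pooler, assuming it reduces a [batch, seq_len, emb_dim] sequence of embeddings to one [batch, emb_dim] vector; the supported modes and the optional mask argument are assumptions.

import torch.nn as nn

class EmbeddingPooler(nn.Module):
    def __init__(self, emb_dim, pool_mode='mean'):
        super(EmbeddingPooler, self).__init__()
        # emb_dim is accepted to match the constructor calls above;
        # this simple sketch does not actually need it.
        assert pool_mode in ('mean', 'max'), 'unsupported pool mode'
        self.pool_mode = pool_mode

    def forward(self, embeddings, mask=None):
        # embeddings: [batch, seq_len, emb_dim]; mask: [batch, seq_len] of 0/1.
        if self.pool_mode == 'max':
            return embeddings.max(dim=1).values
        if mask is not None:
            # Exclude padding positions from the mean.
            mask = mask.unsqueeze(-1).float()
            return (embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
        return embeddings.mean(dim=1)
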
Example #5
valid_data = utils.batchify(valid_data, args.batch_size)
test_data = utils.batchify(test_data, args.batch_size)

print('train/valid/test shape',
      [x.shape for x in [train_data, valid_data, test_data]])

#data is [length, batch size]

device = torch.device('cuda')

if args.model == 'transformer':

    pad_idx = vocab[PAD_TOKEN]

    model = models.TransformerEncoder(len(vocab), args.emb_dim, args.hid_dim,
                                      args.n_layers, args.n_heads,
                                      args.dropout, pad_idx, device)

else:
    raise ValueError(f'Model {args.model} not valid!')

if args.model == 'transformer':

    model.apply(utils.initialize_transformer)

else:
    raise ValueError(f'Model {args.model} not valid!')

language_model = models.LanguageModel(model, args.emb_dim, len(vocab))

print(f'Language model parameters: {utils.count_parameters(language_model):,}')
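
utils.initialize_transformer and utils.count_parameters are referenced but not shown. The sketches below are plausible stand-ins, assuming a Xavier-uniform initialization applied module by module via model.apply and a standard trainable-parameter count; the exact initialization scheme is an assumption.

import torch.nn as nn

def initialize_transformer(module):
    # Called recursively by model.apply(); initialize weights per module type.
    if isinstance(module, nn.Linear):
        nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
        nn.init.xavier_uniform_(module.weight)

def count_parameters(model):
    # Number of trainable parameters, as printed above.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
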
Example #6
    def __init__(self,
                 config,
                 use_attention=True,
                 encoder=None,
                 decoder=None,
                 src_padding_idx=0,
                 tgt_padding_idx=0,
                 label_smoothing=0,
                 tgt_vocab=None):
        # 用来生成描述的target vocab是否固定
        """
        Initialization of variables and functions
        :param config: configuration
        :param use_attention: use attention or not, consistent with seq2seq
            (this is the decoder's attention over the encoder)
        :param encoder: encoder
        :param decoder: decoder
        :param src_padding_idx: source padding index
        :param tgt_padding_idx: target padding index
        :param label_smoothing: ratio for label smoothing
        :param tgt_vocab: target vocabulary
        """
        super(tensor2tensor, self).__init__()  # call the parent class constructor

        self.config = config

        # pretrained encoder or not
        if encoder is not None:
            self.encoder = encoder  # pretrained
        else:
            self.encoder = models.TransformerEncoder(
                config, padding_idx=src_padding_idx)
            if self.config.knowledge:
                # HACK: we use tgt_vocab for knowledge instead of src_vocab
                src_vocab_size = config.src_vocab_size
                config.src_vocab_size = config.tgt_vocab_size
                self.knowledge_encoder = models.TransformerEncoder(
                    config, padding_idx=src_padding_idx)
                config.src_vocab_size = src_vocab_size
                # the knowledge-graph input uses the same words as the generated descriptions (target vocab)
        tgt_embedding = self.encoder.embedding if config.shared_vocab else None
        # pretrained decoder or not
        if decoder is not None:
            self.decoder = decoder
        else:
            self.decoder = models.TransformerDecoder(
                config,
                tgt_embedding=tgt_embedding,
                padding_idx=tgt_padding_idx)
            # decide which embedding to use
        # log softmax should specify dimension explicitly
        self.log_softmax = nn.LogSoftmax(dim=-1)
        self.use_cuda = config.use_cuda
        self.config = config
        self.label_smoothing = label_smoothing
        if self.label_smoothing > 0:
            self.criterion = LabelSmoothingLoss(label_smoothing,
                                                config.tgt_vocab_size,
                                                ignore_index=tgt_padding_idx)
            # apply label smoothing
        else:
            self.criterion = nn.CrossEntropyLoss(ignore_index=utils.PAD)
        if config.use_cuda:
            self.criterion.cuda()
        self.compute_score = nn.Linear(config.hidden_size,
                                       config.tgt_vocab_size)
        # self.compute_score is the fully connected layer that projects hidden
        # states to target-vocabulary logits before the softmax

        # reinforcement learning is not used for now
        # Use rl or not. Should specify a reward provider. Not available yet in this framework.
        # if config.rl:
        # self.bleu_scorer = bleu.Scorer(pad=0, eos=3, unk=1)
        # self.reward_provider = CTRRewardProvider(config.ctr_reward_provider_path)
        # self.tgt_vocab = tgt_vocab
        self.padding_idx = tgt_padding_idx  # use the target-side padding index
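
To make the roles of compute_score, log_softmax, and criterion concrete, here is a hedged sketch of how a loss could be computed from decoder outputs using the attributes defined above; the function, its signature, and the tensor shapes are assumptions rather than code from the original class.

def compute_loss(model, decoder_outputs, targets):
    # decoder_outputs: [batch, tgt_len, hidden_size]; targets: [batch, tgt_len] token ids.
    scores = model.compute_score(decoder_outputs)  # [batch, tgt_len, tgt_vocab_size]
    scores = scores.view(-1, scores.size(-1))      # merge the batch and time dimensions
    targets = targets.contiguous().view(-1)
    if model.label_smoothing > 0:
        # The label-smoothing criterion is fed log-probabilities.
        return model.criterion(model.log_softmax(scores), targets)
    # nn.CrossEntropyLoss consumes raw logits directly.
    return model.criterion(scores, targets)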