Пример #1
0
    def __init__(self,
                 dnn_dims=[],
                 vocab_sizes=[],
                 model_type=ModelType.create_classification(),
                 model_arch=ModelArch.create_rnn(),
                 share_semantic_generator=False,
                 class_num=2,
                 share_embed=False,
                 is_infer=False):
        """
        Store the DSSM configuration and bind the network builders.

        :param dnn_dims: list of int, dimensions of each layer in the
                         semantic vector generator; needs at least two entries.
        :param vocab_sizes: sizes of the left and right vocabularies; must
                            contain exactly two entries.
        :param model_type: task type; wrapped with ``ModelType(...)``.
        :param model_arch: model architecture; wrapped with ``ModelArch(...)``.
        :param share_semantic_generator: bool, whether to share the semantic
                                         vector generator for left and right.
        :param class_num: number of categories.
        :param share_embed: bool, whether to share the embeddings between
                            left and right.
        :param is_infer: bool, whether the model is built for inference.
        """
        assert len(vocab_sizes) == 2, (
            "vocab sizes specify the sizes left and right inputs, dim is 2.")
        assert len(dnn_dims) > 1, "more than two layers is needed."

        self.dnn_dims = dnn_dims
        self.vocab_sizes = vocab_sizes
        self.share_semantic_generator = share_semantic_generator
        self.share_embed = share_embed
        self.model_type = ModelType(model_type)
        self.model_arch = ModelArch(model_arch)
        self.class_num = class_num
        self.is_infer = is_infer
        logger.warning("build DSSM model with config of %s, %s" %
                       (self.model_type, self.model_arch))
        logger.info("vocabulary sizes: %s" % str(self.vocab_sizes))

        # Maps the architecture name to the method that builds the sentence
        # vector for it.
        _model_arch = {
            "rnn": self.create_rnn,
            "cnn": self.create_cnn,
            "fc": self.create_fc,
        }

        def _model_arch_creater(emb, prefix=""):
            # Use the normalized self.model_arch rather than the raw argument:
            # a raw value (e.g. an int) does not stringify to one of the keys
            # above, which previously ended in calling None.
            sent_vec = _model_arch[str(self.model_arch)](emb, prefix)
            return self.create_dnn(sent_vec, prefix)

        self.model_arch_creater = _model_arch_creater
        # This variant always builds the classification model.
        self.model_type_creater = self._build_classification_model
Пример #2
0
    def infer(self, data_path, output_path,
            model_type=ModelType(ModelType.CLASSIFICATION_MODE),
            feature_dim=800,
            batch_size=100):
        """
        Run inference over ``data_path`` and write the predictions to a file.

        Each input sample produces one output line holding the
        space-separated values of its prediction.

        :param data_path: path handed to ``reader.test``.
        :param output_path: file that receives the predictions (overwritten).
        :param model_type: task type; only ``is_classification()`` is consulted
                           and forwarded to the reader.
        :param feature_dim: feature dimension; the reader is given
                            ``feature_dim + 1``.
        :param batch_size: number of samples per inference batch.
        """
        logger.info("infer data...")

        infer_batch = paddle.batch(
            reader.test(data_path, feature_dim + 1,
                        model_type.is_classification()),
            batch_size=batch_size)

        logger.warning('write predictions to %s' % output_path)
        # The context manager closes (and flushes) the output file even when
        # inference fails half-way; the original never closed it.
        with open(output_path, 'w') as output_f:
            for samples in infer_batch():
                res = self.inferer.infer(input=samples)
                predictions = [' '.join(map(str, x)) for x in res]
                assert len(samples) == len(predictions), (
                    "predict error, %d inputs, but %d predictions" %
                    (len(samples), len(predictions)))
                output_f.write('\n'.join(predictions) + '\n')
Пример #3
0
    def __init__(self, param_path,
            model_type=ModelType(ModelType.CLASSIFICATION_MODE),
            class_num=2,
            feature_dim=800,
            dnn_dims='256,128,64,32'):
        """
        Build the DNN network and load trained parameters for inference.

        :param param_path: path of the tar file holding the trained parameters.
        :param model_type: task type; selects the classification or the
                           regression output layer.
        :param class_num: number of categories (classification only).
        :param feature_dim: size of the dense input vector.
        :param dnn_dims: comma-separated layer sizes of the DNN.
        """
        logger.info("create DNN model")

        paddle.init(use_gpu=False, trainer_count=1)

        # network config
        input_layer = paddle.layer.data(
            name='input_layer',
            type=paddle.data_type.dense_vector(feature_dim))
        layer_dims = [int(i) for i in dnn_dims.split(',')]
        dnn = create_dnn(input_layer, layer_dims)
        prediction = None
        label = None
        cost = None
        # The label and cost layers are still declared, exactly as in the
        # original network definition, although only `prediction` is used.
        if model_type.is_classification():
            prediction = paddle.layer.fc(
                input=dnn, size=class_num, act=paddle.activation.Softmax())
            label = paddle.layer.data(
                name='label', type=paddle.data_type.integer_value(class_num))
            cost = paddle.layer.classification_cost(
                input=prediction, label=label)
        elif model_type.is_regression():
            prediction = paddle.layer.fc(
                input=dnn, size=1, act=paddle.activation.Linear())
            label = paddle.layer.data(
                name='label', type=paddle.data_type.dense_vector(1))
            cost = paddle.layer.mse_cost(input=prediction, label=label)

        # load parameter
        logger.info("load model parameters from %s" % param_path)
        # A tar archive is binary data: open it in binary mode and close the
        # handle once the parameters are loaded (the original leaked it).
        with open(param_path, 'rb') as param_file:
            self.parameters = paddle.parameters.Parameters.from_tar(param_file)
        self.inferer = paddle.inference.Inference(
            output_layer=prediction, parameters=self.parameters)
Пример #4
0
    def __init__(self,
                 dnn_layer_dims,
                 dnn_input_dim,
                 lr_input_dim,
                 model_type=ModelType.create_classification(),
                 is_infer=False):
        """
        Store the configuration and assemble the DNN and LR sub-models.

        @dnn_layer_dims: list of integer
            dims of each layer in dnn
        @dnn_input_dim: int
            size of dnn's input layer
        @lr_input_dim: int
            size of lr's input layer
        @model_type: ModelType
            selects the classification and/or regression output
        @is_infer: bool
            whether to build a infer model
        """
        (self.dnn_layer_dims,
         self.dnn_input_dim,
         self.lr_input_dim) = dnn_layer_dims, dnn_input_dim, lr_input_dim
        self.model_type, self.is_infer = model_type, is_infer

        # Input layers are declared before any sub-model is built.
        self._declare_input_layers()

        dnn_submodel = self._build_dnn_submodel_(self.dnn_layer_dims)
        self.dnn = dnn_submodel
        lr_submodel = self._build_lr_submodel_()
        self.lr = lr_submodel

        # model's prediction
        # TODO(superjom) rename it to prediction
        task = self.model_type
        if task.is_classification():
            self.model = self._build_classification_model(self.dnn, self.lr)
        # Checked independently of the branch above, not as an `elif`.
        if task.is_regression():
            self.model = self._build_regression_model(self.dnn, self.lr)
Пример #5
0
    def __init__(self,
                 dnn_layer_dims,
                 dnn_input_dim,
                 lr_input_dim,
                 model_type=ModelType.create_classification(),
                 is_infer=False):
        """
        Store the configuration and assemble the DNN and LR sub-models.

        @dnn_layer_dims: list of integer
            dimension of each DNN layer
        @dnn_input_dim: int
            size of the DNN input layer
        @lr_input_dim: int
            size of the LR input layer
        @model_type: ModelType
            selects the classification and/or regression output
        @is_infer: bool
            whether to build the inference model
        """
        (self.dnn_layer_dims,
         self.dnn_input_dim,
         self.lr_input_dim) = dnn_layer_dims, dnn_input_dim, lr_input_dim
        self.model_type, self.is_infer = model_type, is_infer

        # Input layers are declared before any sub-model is built.
        self._declare_input_layers()

        dnn_submodel = self._build_dnn_submodel_(self.dnn_layer_dims)
        self.dnn = dnn_submodel
        lr_submodel = self._build_lr_submodel_()
        self.lr = lr_submodel

        # Model prediction.
        task = self.model_type
        if task.is_classification():
            self.model = self._build_classification_model(self.dnn, self.lr)
        # Checked independently of the branch above, not as an `elif`.
        if task.is_regression():
            self.model = self._build_regression_model(self.dnn, self.lr)
Пример #6
0
    def __init__(self, train_path, test_path, source_dic_path, target_dic_path,
                 model_type):
        """
        Set up the dataset: remember the paths, load both dictionaries and
        select the record reader that matches the task.

        :param train_path: path of the training data.
        :param test_path: path of the test data.
        :param source_dic_path: dictionary file for the source side.
        :param target_dic_path: dictionary file for the target side.
        :param model_type: a ``ModelType`` instance (asserted below).
        """
        self.train_path, self.test_path = train_path, test_path
        self.source_dic_path, self.target_dic_path = (source_dic_path,
                                                      target_dic_path)
        self.model_type = ModelType(model_type)

        self.source_dic = load_dic(self.source_dic_path)
        self.target_dic = load_dic(self.target_dic_path)

        # One record parser per supported task mode.
        readers_by_mode = {}
        readers_by_mode[ModelType.CLASSIFICATION_MODE] = (
            self._read_classification_record)
        readers_by_mode[ModelType.REGRESSION_MODE] = (
            self._read_regression_record)
        readers_by_mode[ModelType.RANK_MODE] = self._read_rank_record

        assert isinstance(model_type, ModelType)
        mode = model_type.mode
        self.record_reader = readers_by_mode[mode]
        self.is_infer = False

        # Hard-coded dataset locations and sampling settings.
        self.train_data_csv = "/home/kesci/input/qichedashi/train_set.csv"
        self.dev_data_csv = "/home/kesci/input/qichedashi/final_round_dev_set.csv"
        self.test_data_csv = "/home/kesci/input/qichedashi/final_round_test_set.csv"
        self.NEG = 3
        self.train_samples = 200000
Пример #7
0
def train():
    """
    Train the CTR model on CPU with the settings from ``parse_args()``.

    Every 100 batches the training cost is logged. Every 1000 batches the
    model is evaluated on ``args.test_data_path`` (when given) and a
    gzip-compressed parameter checkpoint is written.
    """
    args = parse_args()
    # --model_type=0,1 classification regression
    args.model_type = ModelType(args.model_type)

    # Use the CPU only, with a single trainer thread.
    paddle.init(use_gpu=False, trainer_count=1)
    # Example values noted by the original author:
    #   dnn_input_dim: 61, lr_input_dim: 10040001
    dnn_input_dim, lr_input_dim = reader.load_data_meta(args.data_meta_file)

    # create ctr model.
    model = CTRmodel(dnn_layer_dims,
                     dnn_input_dim,
                     lr_input_dim,
                     model_type=args.model_type,
                     is_infer=False)

    params = paddle.parameters.create(model.train_cost)
    optimizer = paddle.optimizer.AdaGrad()  # learning-rate optimizer

    trainer = paddle.trainer.SGD(cost=model.train_cost,
                                 parameters=params,
                                 update_equation=optimizer)

    dataset = reader.Dataset()

    def __event_handler__(event):
        if not isinstance(event, paddle.event.EndIteration):
            return

        num_samples = event.batch_id * args.batch_size
        if event.batch_id % 100 == 0:
            logger.warning(
                "Pass %d, Samples %d, Cost %f, %s" %
                (event.pass_id, num_samples, event.cost, event.metrics))

        if event.batch_id % 1000 != 0:
            return

        if args.test_data_path:
            result = trainer.test(reader=paddle.batch(
                dataset.test(args.test_data_path),
                batch_size=args.batch_size),
                                  feeding=reader.feeding_index)
            logger.warning("Test %d-%d, Cost %f, %s" %
                           (event.pass_id, event.batch_id, result.cost,
                            result.metrics))
            path = "{}-pass-{}-batch-{}-test-{}.tar.gz".format(
                args.model_output_prefix, event.pass_id, event.batch_id,
                result.cost)
        else:
            # Without test data there is no test cost to embed in the file
            # name; the original referenced an unbound `result` here and
            # crashed instead of saving the checkpoint.
            path = "{}-pass-{}-batch-{}.tar.gz".format(
                args.model_output_prefix, event.pass_id, event.batch_id)

        with gzip.open(path, 'w') as f:
            trainer.save_parameter_to_tar(f)

    trainer.train(reader=paddle.batch(paddle.reader.shuffle(dataset.train(
        args.train_data_path),
                                                            buf_size=500),
                                      batch_size=args.batch_size),
                  feeding=reader.feeding_index,
                  event_handler=__event_handler__,
                  num_passes=args.num_passes)
Пример #8
0
 def __init__(self, param_path):
     """
     Build the CTR network in inference mode and load trained parameters.

     The data meta path and the model type are read from the module-level
     ``args``; the layer sizes come from the module-level ``dnn_layer_dims``.

     :param param_path: path of the gzip-compressed parameter tar file.
     """
     logger.info("create CTR model")
     dnn_input_dim, lr_input_dim = reader.load_data_meta(args.data_meta_path)
     # create the model
     self.ctr_model = network_conf.CTRmodel(
         dnn_layer_dims,
         dnn_input_dim,
         lr_input_dim,
         model_type=ModelType(args.model_type),
         is_infer=True)
     # load parameter
     logger.info("load model parameters from %s" % param_path)
     # Close the archive once the parameters are loaded; the original left
     # the gzip handle open.
     with gzip.open(param_path, 'r') as param_file:
         self.parameters = paddle.parameters.Parameters.from_tar(param_file)
     self.inferer = paddle.inference.Inference(
         output_layer=self.ctr_model.model,
         parameters=self.parameters, )
Пример #9
0
    def __init__(self, train_path, test_path, source_dic_path, target_dic_path,
                 model_type):
        """
        Set up the dataset: remember the paths, load both dictionaries and
        select the record reader that matches the task.

        :param train_path: path of the training data.
        :param test_path: path of the test data.
        :param source_dic_path: dictionary file for the source side.
        :param target_dic_path: dictionary file for the target side.
        :param model_type: a ``ModelType`` instance (asserted below).
        """
        self.train_path, self.test_path = train_path, test_path
        self.source_dic_path, self.target_dic_path = (source_dic_path,
                                                      target_dic_path)
        self.model_type = ModelType(model_type)

        self.source_dic = load_dic(self.source_dic_path)
        self.target_dic = load_dic(self.target_dic_path)

        # One record parser per supported task mode.
        readers_by_mode = {}
        readers_by_mode[ModelType.CLASSIFICATION_MODE] = (
            self._read_classification_record)
        readers_by_mode[ModelType.REGRESSION_MODE] = (
            self._read_regression_record)
        readers_by_mode[ModelType.RANK_MODE] = self._read_rank_record

        assert isinstance(model_type, ModelType)
        mode = model_type.mode
        self.record_reader = readers_by_mode[mode]
        self.is_infer = False
Пример #10
0
def train(data_path=None,
          model_type=ModelType.create_classification(),
          batch_size=100,
          num_passes=50,
          class_num=None,
          num_workers=1,
          use_gpu=False):
    """
    Train the DNN.

    The input dimension (``feature_dim``) and the checkpoint prefix
    (``args.model_output_prefix``) are read from module-level globals.

    :param data_path: data file, used for both training and per-pass testing.
    :param model_type: task type; selects the classification or the
                       regression output layer and is forwarded to the reader.
    :param batch_size: number of samples per batch.
    :param num_passes: number of training passes.
    :param class_num: number of categories (classification only).
    :param num_workers: trainer count passed to ``paddle.init``.
    :param use_gpu: whether to train on GPU.
    """
    paddle.init(use_gpu=use_gpu, trainer_count=num_workers)

    # Honor the `model_type` argument. The original ignored it and read the
    # global `args.model_type`, so callers could not choose the task.
    is_classification = model_type.is_classification()

    # network config
    input_layer = paddle.layer.data(
        name='input_layer', type=paddle.data_type.dense_vector(feature_dim))
    dnn = create_dnn(input_layer)
    prediction = None
    label = None
    cost = None
    if is_classification:
        prediction = paddle.layer.fc(
            input=dnn, size=class_num, act=paddle.activation.Softmax())
        label = paddle.layer.data(
            name='label', type=paddle.data_type.integer_value(class_num))
        cost = paddle.layer.classification_cost(input=prediction, label=label)
    elif model_type.is_regression():
        prediction = paddle.layer.fc(
            input=dnn, size=1, act=paddle.activation.Linear())
        label = paddle.layer.data(
            name='label', type=paddle.data_type.dense_vector(1))
        cost = paddle.layer.mse_cost(input=prediction, label=label)

    # create parameters
    parameters = paddle.parameters.create(cost)

    # create optimizer
    optimizer = paddle.optimizer.Momentum(momentum=0)

    trainer = paddle.trainer.SGD(
        cost=cost,
        extra_layers=paddle.evaluator.auc(input=prediction, label=label),
        parameters=parameters,
        update_equation=optimizer)

    feeding = {'input_layer': 0, 'label': 1}

    # event_handler to print training and testing info
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                # A single parenthesized argument prints the same text under
                # both Python 2 and Python 3.
                print("Pass %d, Batch %d, Cost %f, %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics))

        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(
                reader=paddle.batch(
                    reader.test(data_path, feature_dim + 1,
                                is_classification),
                    batch_size=batch_size),
                feeding=feeding)
            print("Test %d, Cost %f, %s" %
                  (event.pass_id, result.cost, result.metrics))

            model_desc = "{type}".format(type=str(model_type))
            # The checkpoint is a tar archive, so write it in binary mode.
            with open("%sdnn_%s_pass_%05d.tar" %
                      (args.model_output_prefix, model_desc,
                       event.pass_id), "wb") as f:
                parameters.to_tar(f)

    # training
    trainer.train(
        reader=paddle.batch(
            paddle.reader.shuffle(
                reader.train(data_path, feature_dim + 1, is_classification),
                buf_size=batch_size * 10),
            batch_size=batch_size),
        feeding=feeding,
        event_handler=event_handler,
        num_passes=num_passes)
Пример #11
0
    '-c',
    '--class_num',
    type=int,
    default=0,
    help="number of categories for classification task.")
def _str2bool(value):
    """
    Parse a command-line boolean.

    ``type=bool`` is a trap with argparse: ``bool("False")`` is ``True``
    because every non-empty string is truthy. A ``ValueError`` raised here is
    reported by argparse as an invalid argument value.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise ValueError("expected a boolean value, got %r" % value)


parser.add_argument(
    '--num_workers', type=int, default=1, help="num worker threads, default 1")
parser.add_argument(
    '--use_gpu',
    type=_str2bool,
    default=False,
    help="whether to use GPU devices (default: False)")

# arguments check.
args = parser.parse_args()
args.model_type = ModelType(args.model_type)
if args.model_type.is_classification():
    assert args.class_num > 1, "--class_num should be set in classification task."

# Module-level settings shared by the network and training code.
feature_dim = args.feature_dim
layer_dims = [int(i) for i in args.dnn_dims.split(',')]

def create_dnn(sent_vec):
    # if more than three layers, than a fc layer will be added.
    if len(layer_dims) > 1:
        _input_layer = sent_vec
        for id, dim in enumerate(layer_dims):
            name = "fc_%d_%d" % (id, dim)
            logger.info("create fc layer [%s] which dimention is %d" %
                            (name, dim))
            fc = paddle.layer.fc(
Пример #12
0
    def __init__(self,
                 dnn_dims=[],
                 vocab_sizes=[],
                 model_type=ModelType.create_classification(),
                 model_arch=ModelArch.create_cnn(),
                 share_semantic_generator=False,
                 class_num=None,
                 share_embed=False,
                 is_infer=False):
        """
        Store the DSSM configuration and bind the network builders.

        :param dnn_dims: The dimension of each layer in the semantic vector
                         generator. At least two entries are required.
        :type dnn_dims: list of int
        :param vocab_sizes: The size of left and right items.
        :type vocab_sizes: A list having 2 elements.
        :param model_type: The type of task to train the DSSM model. The value
                           should be "rank: 0", "regression: 1" or
                           "classification: 2".
        :type model_type: int
        :param model_arch: A value indicating the model architecture to use.
        :type model_arch: int
        :param share_semantic_generator: A flag indicating whether to share the
                                         semantic vector between the left and
                                         the right item.
        :type share_semantic_generator: bool
        :param share_embed: A flag indicating whether to share the embeddings
                            between the left and the right item.
        :type share_embed: bool
        :param class_num: The number of categories.
        :type class_num: int
        :param is_infer: Whether the model is built for inference.
        :type is_infer: bool
        """
        assert len(vocab_sizes) == 2, (
            "The vocab_sizes specifying the sizes left and right inputs. "
            "Its dimension should be 2.")
        assert len(dnn_dims) > 1, ("In the DNN model, more than two layers "
                                   "are needed.")

        self.dnn_dims = dnn_dims
        self.vocab_sizes = vocab_sizes
        self.share_semantic_generator = share_semantic_generator
        self.share_embed = share_embed
        self.model_type = ModelType(model_type)
        self.model_arch = ModelArch(model_arch)
        self.class_num = class_num
        self.is_infer = is_infer
        logger.warning("Build DSSM model with config of %s, %s" %
                       (self.model_type, self.model_arch))
        logger.info("The vocabulary size is : %s" % str(self.vocab_sizes))

        # bind model architecture
        _model_arch = {
            "cnn": self.create_cnn,
            "fc": self.create_fc,
            "rnn": self.create_rnn,
        }

        def _model_arch_creater(emb, prefix=""):
            # Use the normalized self.model_arch, mirroring the model-type
            # lookup below. The raw argument (e.g. an int) does not stringify
            # to one of the keys above, which previously ended in calling None.
            sent_vec = _model_arch[str(self.model_arch)](emb, prefix)
            return self.create_dnn(sent_vec, prefix)

        self.model_arch_creater = _model_arch_creater

        # bind model type
        _model_type = {
            "classification": self._build_classification_model,
            "rank": self._build_rank_model,
            "regression": self._build_regression_model,
        }
        print("model type: ", str(self.model_type))
        self.model_type_creater = _model_type[str(self.model_type)]
Пример #13
0
class DSSM(object):
    """
    Builder for a DSSM network on top of ``paddle.layer`` / ``paddle.networks``.

    Construct it with the configuration, then call the instance to build the
    network for the configured task (classification, regression or rank).
    """

    def __init__(self,
                 dnn_dims=[],
                 vocab_sizes=[],
                 model_type=ModelType.create_classification(),
                 model_arch=ModelArch.create_cnn(),
                 share_semantic_generator=False,
                 class_num=None,
                 share_embed=False,
                 is_infer=False):
        """
        :param dnn_dims: The dimension of each layer in the semantic vector
                         generator. At least two entries are required.
        :type dnn_dims: list of int
        :param vocab_sizes: The size of left and right items.
        :type vocab_sizes: A list having 2 elements.
        :param model_type: The type of task to train the DSSM model. The value
                           should be "rank: 0", "regression: 1" or
                           "classification: 2".
        :type model_type: int
        :param model_arch: A value indicating the model architecture to use.
        :type model_arch: int
        :param share_semantic_generator: A flag indicating whether to share the
                                         semantic vector between the left and
                                         the right item.
        :type share_semantic_generator: bool
        :param share_embed: A flag indicating whether to share the embeddings
                            between the left and the right item.
        :type share_embed: bool
        :param class_num: The number of categories.
        :type class_num: int
        :param is_infer: When True the model builders return only the
                         prediction layer instead of a
                         ``(cost, prediction, label)`` tuple.
        :type is_infer: bool
        """
        assert len(vocab_sizes) == 2, (
            "The vocab_sizes specifying the sizes left and right inputs. "
            "Its dimension should be 2.")
        assert len(dnn_dims) > 1, ("In the DNN model, more than two layers "
                                   "are needed.")

        self.dnn_dims = dnn_dims
        self.vocab_sizes = vocab_sizes
        self.share_semantic_generator = share_semantic_generator
        self.share_embed = share_embed
        self.model_type = ModelType(model_type)
        self.model_arch = ModelArch(model_arch)
        self.class_num = class_num
        self.is_infer = is_infer
        logger.warning("Build DSSM model with config of %s, %s" %
                       (self.model_type, self.model_arch))
        logger.info("The vocabulary size is : %s" % str(self.vocab_sizes))

        # bind model architecture
        _model_arch = {
            "cnn": self.create_cnn,
            "fc": self.create_fc,
            "rnn": self.create_rnn,
        }

        def _model_arch_creater(emb, prefix=""):
            # Use the normalized self.model_arch, mirroring the model-type
            # lookup below. The raw argument (e.g. an int) does not stringify
            # to one of the keys above, which previously ended in calling None.
            sent_vec = _model_arch[str(self.model_arch)](emb, prefix)
            return self.create_dnn(sent_vec, prefix)

        self.model_arch_creater = _model_arch_creater

        # bind model type
        _model_type = {
            "classification": self._build_classification_model,
            "rank": self._build_rank_model,
            "regression": self._build_regression_model,
        }
        print("model type: ", str(self.model_type))
        self.model_type_creater = _model_type[str(self.model_type)]

    def __call__(self):
        """Build the network for the configured task and return its result."""
        return self.model_type_creater()

    def create_embedding(self, input, prefix=""):
        """
        Create word embedding. The `prefix` is added in front of the name of
        embedding's learnable parameter.

        The embedding size is ``dnn_dims[0]``.
        """
        logger.info("Create embedding table [%s] whose dimention is %d. " %
                    (prefix, self.dnn_dims[0]))
        emb = paddle.layer.embedding(input=input,
                                     size=self.dnn_dims[0],
                                     param_attr=ParamAttr(name="%s_emb.w" %
                                                          prefix))
        return emb

    def create_fc(self, emb, prefix=""):
        """
        Max-pool the embedding and feed it through one fully connected layer
        of size ``dnn_dims[1]``.

        :param emb: The output of the embedding layer
        :type emb: paddle.layer
        :param prefix: A prefix will be added to the parameters' names.
        :type prefix: str
        """
        _input_layer = paddle.layer.pooling(input=emb,
                                            pooling_type=paddle.pooling.Max())
        fc = paddle.layer.fc(input=_input_layer,
                             size=self.dnn_dims[1],
                             param_attr=ParamAttr(name="%s_fc.w" % prefix),
                             bias_attr=ParamAttr(name="%s_fc.b" % prefix,
                                                 initial_std=0.))
        return fc

    def create_rnn(self, emb, prefix=""):
        """
        A GRU sentence vector learner: runs ``simple_gru`` of size
        ``dnn_dims[1]`` over the embedding and returns the last step.

        :param emb: The output of the embedding layer
        :type emb: paddle.layer
        :param prefix: A prefix will be added to the parameters' names.
        :type prefix: str
        """
        gru = paddle.networks.simple_gru(
            input=emb,
            size=self.dnn_dims[1],
            mixed_param_attr=ParamAttr(name="%s_gru_mixed.w" % prefix),
            mixed_bias_param_attr=ParamAttr(name="%s_gru_mixed.b" % prefix),
            gru_param_attr=ParamAttr(name="%s_gru.w" % prefix),
            gru_bias_attr=ParamAttr(name="%s_gru.b" % prefix))
        sent_vec = paddle.layer.last_seq(gru)
        return sent_vec

    def create_cnn(self, emb, prefix=""):
        """
        Concatenate two ``sequence_conv_pool`` branches (context widths 3 and
        4), each with hidden size ``dnn_dims[1]``.

        :param emb: The word embedding.
        :type emb: paddle.layer
        :param prefix: Accepted for interface parity with the other builders.
                       NOTE(review): it is currently not used; both branches
                       are named with the fixed prefix "cnn", so the parameter
                       names are identical on every call.
        :type prefix: str
        """
        def create_conv(context_len, hidden_size, prefix):
            key = "%s_%d_%d" % (prefix, context_len, hidden_size)
            conv = paddle.networks.sequence_conv_pool(
                input=emb,
                context_len=context_len,
                hidden_size=hidden_size,
                # set parameter attr for parameter sharing
                # NOTE(review): "contex_proj.w" has no "_" separator and a
                # typo; kept as-is because renaming a parameter presumably
                # breaks loading of previously saved models.
                context_proj_param_attr=ParamAttr(name=key + "contex_proj.w"),
                fc_param_attr=ParamAttr(name=key + "_fc.w"),
                fc_bias_attr=ParamAttr(name=key + "_fc.b"),
                pool_bias_attr=ParamAttr(name=key + "_pool.b"))
            return conv

        logger.info("create a sequence_conv_pool whose context width is 3.")
        conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
        logger.info("create a sequence_conv_pool whose context width is 4.")
        conv_4 = create_conv(4, self.dnn_dims[1], "cnn")

        return paddle.layer.concat(input=[conv_3, conv_4])

    def create_dnn(self, sent_vec, prefix):
        """
        Stack one Tanh fully connected layer per entry of ``dnn_dims[1:]`` on
        top of ``sent_vec`` and return the last layer.

        When ``dnn_dims`` has fewer than two entries no layer is added and
        ``sent_vec`` is returned unchanged (the original raised
        UnboundLocalError in that case).

        :param sent_vec: The sentence vector produced by the architecture
                         builder.
        :type sent_vec: paddle.layer
        :param prefix: A prefix will be added to the layers' names.
        :type prefix: str
        """
        _input_layer = sent_vec
        for idx, dim in enumerate(self.dnn_dims[1:]):
            name = "%s_fc_%d_%d" % (prefix, idx, dim)
            logger.info("create fc layer [%s] which dimention is %d" %
                        (name, dim))
            fc = paddle.layer.fc(input=_input_layer,
                                 size=dim,
                                 act=paddle.activation.Tanh(),
                                 param_attr=ParamAttr(name="%s.w" % name),
                                 bias_attr=ParamAttr(name="%s.b" % name,
                                                     initial_std=0.))
            _input_layer = fc
        return _input_layer

    def _build_classification_model(self):
        """Build the classification network; the model type must match."""
        logger.info("build classification model")
        assert self.model_type.is_classification()
        return self._build_classification_or_regression_model(
            is_classification=True)

    def _build_regression_model(self):
        """Build the regression network; the model type must match."""
        logger.info("build regression model")
        assert self.model_type.is_regression()
        return self._build_classification_or_regression_model(
            is_classification=False)

    def _build_rank_model(self):
        """
        Build a pairwise rank model.

        The model declares these inputs:
          - source sentence
          - left_target sentence
          - right_target sentence
          - label (training only), 1 if left_target should be sorted in front
            of right_target, otherwise 0.

        :return: ``(cost, None, label)`` when training; in inference mode the
                 cosine similarity of the source and the right target.
        """
        logger.info("build rank model")
        assert self.model_type.is_rank()
        source = paddle.layer.data(
            name="source_input",
            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
        left_target = paddle.layer.data(
            name="left_target_input",
            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
        right_target = paddle.layer.data(
            name="right_target_input",
            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
        if not self.is_infer:
            label = paddle.layer.data(name="label_input",
                                      type=paddle.data_type.integer_value(1))

        # Identical prefixes produce identical parameter names, which is how
        # the generators / embeddings are shared between the inputs.
        unshared = ["source", "target", "target"]
        prefixs = ["_"] * 3 if self.share_semantic_generator else unshared
        embed_prefixs = ["_"] * 3 if self.share_embed else unshared

        inputs = [source, left_target, right_target]
        word_vecs = [
            self.create_embedding(layer, prefix=embed_prefix)
            for layer, embed_prefix in zip(inputs, embed_prefixs)
        ]
        semantics = [
            self.model_arch_creater(word_vec, prefix=sem_prefix)
            for word_vec, sem_prefix in zip(word_vecs, prefixs)
        ]

        # The cosine similarity score of source and left_target.
        left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
        # The cosine similarity score of source and right target.
        right_score = paddle.layer.cos_sim(semantics[0], semantics[2])

        if not self.is_infer:
            # rank cost
            cost = paddle.layer.rank_cost(left_score, right_score, label=label)
            # prediction = left_score - right_score
            # but this operator is not supported currently.
            # so AUC will not used.
            return cost, None, label
        return right_score

    def _build_classification_or_regression_model(self, is_classification):
        """
        Build a classification/regression model.

        The classification/regression task expects 3 inputs:
          - source sentence
          - target sentence
          - label (a class id for classification, a 1-d dense vector for
            regression)

        :param is_classification: True for a softmax classifier over the
                                  concatenated semantic vectors, False for a
                                  cosine-similarity regression.
        :type is_classification: bool
        :return: ``(cost, prediction, label)`` when training; only the
                 prediction layer in inference mode.
        """
        if is_classification:
            assert self.class_num

        source = paddle.layer.data(
            name="source_input",
            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
        target = paddle.layer.data(
            name="target_input",
            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
        label = paddle.layer.data(
            name="label_input",
            type=paddle.data_type.integer_value(self.class_num)
            if is_classification else paddle.data_type.dense_vector(1))

        # Identical prefixes produce identical parameter names, which is how
        # the generators / embeddings are shared between the inputs.
        unshared = ["source", "target"]
        prefixs = ["_"] * 2 if self.share_semantic_generator else unshared
        embed_prefixs = ["_"] * 2 if self.share_embed else unshared

        word_vecs = [
            self.create_embedding(layer, prefix=embed_prefix)
            for layer, embed_prefix in zip([source, target], embed_prefixs)
        ]
        semantics = [
            self.model_arch_creater(word_vec, prefix=sem_prefix)
            for word_vec, sem_prefix in zip(word_vecs, prefixs)
        ]

        if is_classification:
            concated_vector = paddle.layer.concat(semantics)
            prediction = paddle.layer.fc(input=concated_vector,
                                         size=self.class_num,
                                         act=paddle.activation.Softmax())
            cost = paddle.layer.classification_cost(input=prediction,
                                                    label=label)
        else:
            prediction = paddle.layer.cos_sim(*semantics)
            cost = paddle.layer.square_error_cost(prediction, label)

        if not self.is_infer:
            return cost, prediction, label
        return prediction
Пример #14
0
def train(train_data_path=None,
          test_data_path=None,
          source_dic_path=None,
          target_dic_path=None,
          model_type=ModelType.create_classification(),
          model_arch=ModelArch.create_cnn(),
          batch_size=10,
          num_passes=10,
          share_semantic_generator=False,
          share_embed=False,
          class_num=None,
          num_workers=1,
          use_gpu=False):
    '''
    Train the DSSM.

    Builds the train/test readers, constructs the DSSM network and runs
    paddle's SGD trainer with an Adam update rule. The logging, testing and
    checkpoint intervals, as well as the checkpoint file prefix, are read
    from the module-level `args`; the layer sizes come from the module-level
    `layer_dims`.

    @train_data_path: str
        path of the training data. When it is empty, the train, test and
        both dictionary paths are ALL replaced by the built-in defaults,
        even if some of the other paths were given.
    @test_data_path: str
        path of the test data.
    @source_dic_path: str
        path of the source dictionary; its length is the source vocab size.
    @target_dic_path: str
        path of the target dictionary; its length is the target vocab size.
    @model_type: ModelType
        task type; queried through is_rank() / is_classification() /
        is_regression().
    @model_arch: ModelArch
        model architecture, forwarded to DSSM.
    @batch_size: int
        number of samples per batch for both readers.
    @num_passes: int
        number of passes handed to the trainer.
    @share_semantic_generator: bool
        forwarded to DSSM.
    @share_embed: bool
        forwarded to DSSM.
    @class_num: int
        forwarded to DSSM.
    @num_workers: int
        passed to paddle.init as trainer_count.
    @use_gpu: bool
        passed to paddle.init.
    '''
    default_train_path = './data/rank/train.txt'
    default_test_path = './data/rank/test.txt'
    default_dic_path = './data/vocab.txt'
    if not model_type.is_rank():
        # every non-rank task uses the classification sample data
        default_train_path = './data/classification/train.txt'
        default_test_path = './data/classification/test.txt'

    use_default_data = not train_data_path

    if use_default_data:
        train_data_path = default_train_path
        test_data_path = default_test_path
        source_dic_path = default_dic_path
        target_dic_path = default_dic_path

    dataset = reader.Dataset(
        train_path=train_data_path,
        test_path=test_data_path,
        source_dic_path=source_dic_path,
        target_dic_path=target_dic_path,
        model_type=model_type,
    )

    train_reader = paddle.batch(paddle.reader.shuffle(dataset.train,
                                                      buf_size=1000),
                                batch_size=batch_size)

    test_reader = paddle.batch(paddle.reader.shuffle(dataset.test,
                                                     buf_size=1000),
                               batch_size=batch_size)

    paddle.init(use_gpu=use_gpu, trainer_count=num_workers)

    cost, prediction, label = DSSM(
        dnn_dims=layer_dims,
        vocab_sizes=[
            len(load_dic(path)) for path in [source_dic_path, target_dic_path]
        ],
        model_type=model_type,
        model_arch=model_arch,
        share_semantic_generator=share_semantic_generator,
        class_num=class_num,
        share_embed=share_embed)()

    parameters = paddle.parameters.create(cost)

    adam_optimizer = paddle.optimizer.Adam(
        learning_rate=1e-3,
        regularization=paddle.optimizer.L2Regularization(rate=1e-3),
        model_average=paddle.optimizer.ModelAverage(average_window=0.5))

    # The AUC evaluator is only attached for non-rank tasks.
    trainer = paddle.trainer.SGD(
        cost=cost,
        extra_layers=paddle.evaluator.auc(input=prediction, label=label)
        if not model_type.is_rank() else None,
        parameters=parameters,
        update_equation=adam_optimizer)

    # Map every data layer name to its column in a reader sample.
    if model_type.is_classification() or model_type.is_regression():
        feeding = {'source_input': 0, 'target_input': 1, 'label_input': 2}
    else:
        feeding = {
            'source_input': 0,
            'left_target_input': 1,
            'right_target_input': 2,
            'label_input': 3
        }

    def _event_handler(event):
        '''
        Define batch handler: log, test and checkpoint at the intervals
        configured in the module-level `args`.
        '''
        if isinstance(event, paddle.event.EndIteration):
            # output train log
            if event.batch_id % args.num_batches_to_log == 0:
                logger.info(
                    "Pass %d, Batch %d, Cost %f, %s" %
                    (event.pass_id, event.batch_id, event.cost, event.metrics))

            # test model (only the classification task is evaluated)
            if event.batch_id > 0 and event.batch_id % args.num_batches_to_test == 0:
                if test_reader is not None and model_type.is_classification():
                    result = trainer.test(reader=test_reader, feeding=feeding)
                    logger.info("Test at Pass %d, %s" %
                                (event.pass_id, result.metrics))

            # save model
            if event.batch_id > 0 and event.batch_id % args.num_batches_to_save_model == 0:
                # NOTE(review): the description is taken from the global
                # `args`, not from the model_type/model_arch parameters of
                # train() -- confirm both always agree.
                model_desc = "{type}_{arch}".format(type=str(args.model_type),
                                                    arch=str(args.model_arch))
                model_path = "%sdssm_%s_pass_%05d.tar" % (
                    args.model_output_prefix, model_desc, event.pass_id)
                # The tar archive is binary data, so the file must be opened
                # in binary mode; text mode can corrupt it on some platforms.
                with open(model_path, "wb") as f:
                    parameters.to_tar(f)

    trainer.train(reader=train_reader,
                  event_handler=_event_handler,
                  feeding=feeding,
                  num_passes=num_passes)

    logger.info("Training has finished.")
Пример #15
0
    def __init__(self,
                 dnn_dims=[],
                 vocab_sizes=[],
                 model_type=ModelType.create_classification(),
                 model_arch=ModelArch.create_cnn(),
                 share_semantic_generator=False,
                 class_num=None,
                 share_embed=False,
                 is_infer=False):
        '''
        Store the network configuration and bind the builders that match
        the requested architecture and task type.

        @dnn_dims: list of int
            dimensions of each layer in semantic vector generator; at least
            two entries are required.
        @vocab_sizes: 2-d tuple
            size of both left and right items.
        @model_type:
            type of task; it is wrapped with ModelType(...) and its string
            form must be 'classification', 'rank' or 'regression'.
        @model_arch:
            model architecture; it is wrapped with ModelArch(...) and its
            string form must be 'cnn', 'fc' or 'rnn'.
        @share_semantic_generator: bool
            whether to share the semantic vector generator for both left and right.
        @share_embed: bool
            whether to share the embeddings between left and right.
        @class_num: int
            number of categories.
        @is_infer: bool
            whether the network is built for inference.
        '''
        assert len(
            vocab_sizes
        ) == 2, "vocab_sizes specify the sizes left and right inputs, and dim should be 2."
        assert len(dnn_dims) > 1, "more than two layers is needed."

        self.dnn_dims = dnn_dims
        self.vocab_sizes = vocab_sizes
        self.share_semantic_generator = share_semantic_generator
        self.share_embed = share_embed
        self.model_type = ModelType(model_type)
        self.model_arch = ModelArch(model_arch)
        self.class_num = class_num
        self.is_infer = is_infer
        logger.warning("build DSSM model with config of %s, %s" %
                       (self.model_type, self.model_arch))
        logger.info("vocabulary sizes: %s" % str(self.vocab_sizes))

        # bind model architecture
        _model_arch = {
            'cnn': self.create_cnn,
            'fc': self.create_fc,
            'rnn': self.create_rnn,
        }

        def _model_arch_creater(emb, prefix=''):
            # Look the encoder up through the normalized self.model_arch
            # (same as the task lookup below) instead of the raw argument,
            # and index the table so an unknown architecture raises a
            # KeyError naming it rather than "NoneType is not callable".
            sent_vec = _model_arch[str(self.model_arch)](emb, prefix)
            return self.create_dnn(sent_vec, prefix)

        self.model_arch_creater = _model_arch_creater

        # build model type
        _model_type = {
            'classification': self._build_classification_model,
            'rank': self._build_rank_model,
            'regression': self._build_regression_model,
        }
        # Reported through the logger like every other message here; the
        # former print statement was Python 2 only syntax.
        logger.info("model type: %s" % str(self.model_type))
        self.model_type_creater = _model_type[str(self.model_type)]
Пример #16
0
class DSSM(object):
    '''
    DSSM network builder.

    For every input an instance chains an embedding layer, a sentence
    encoder ('cnn', 'fc' or 'rnn') and a stack of fc layers, then adds the
    cost matching the task ('classification', 'regression' or 'rank').
    Calling the instance builds the network and returns its output layers.
    '''

    def __init__(self,
                 dnn_dims=[],
                 vocab_sizes=[],
                 model_type=ModelType.create_classification(),
                 model_arch=ModelArch.create_cnn(),
                 share_semantic_generator=False,
                 class_num=None,
                 share_embed=False,
                 is_infer=False):
        '''
        Store the network configuration and bind the builders that match
        the requested architecture and task type.

        @dnn_dims: list of int
            dimensions of each layer in semantic vector generator; at least
            two entries are required.
        @vocab_sizes: 2-d tuple
            size of both left and right items.
        @model_type:
            type of task; it is wrapped with ModelType(...) and its string
            form must be 'classification', 'rank' or 'regression'.
        @model_arch:
            model architecture; it is wrapped with ModelArch(...) and its
            string form must be 'cnn', 'fc' or 'rnn'.
        @share_semantic_generator: bool
            whether to share the semantic vector generator for both left and right.
        @share_embed: bool
            whether to share the embeddings between left and right.
        @class_num: int
            number of categories.
        @is_infer: bool
            whether the network is built for inference.
        '''
        assert len(
            vocab_sizes
        ) == 2, "vocab_sizes specify the sizes left and right inputs, and dim should be 2."
        assert len(dnn_dims) > 1, "more than two layers is needed."

        self.dnn_dims = dnn_dims
        self.vocab_sizes = vocab_sizes
        self.share_semantic_generator = share_semantic_generator
        self.share_embed = share_embed
        self.model_type = ModelType(model_type)
        self.model_arch = ModelArch(model_arch)
        self.class_num = class_num
        self.is_infer = is_infer
        logger.warning("build DSSM model with config of %s, %s" %
                       (self.model_type, self.model_arch))
        logger.info("vocabulary sizes: %s" % str(self.vocab_sizes))

        # bind model architecture
        _model_arch = {
            'cnn': self.create_cnn,
            'fc': self.create_fc,
            'rnn': self.create_rnn,
        }

        def _model_arch_creater(emb, prefix=''):
            # Look the encoder up through the normalized self.model_arch
            # (same as the task lookup below) instead of the raw argument,
            # and index the table so an unknown architecture raises a
            # KeyError naming it rather than "NoneType is not callable".
            sent_vec = _model_arch[str(self.model_arch)](emb, prefix)
            return self.create_dnn(sent_vec, prefix)

        self.model_arch_creater = _model_arch_creater

        # build model type
        _model_type = {
            'classification': self._build_classification_model,
            'rank': self._build_rank_model,
            'regression': self._build_regression_model,
        }
        # Reported through the logger like every other message here; the
        # former print statement was Python 2 only syntax.
        logger.info("model type: %s" % str(self.model_type))
        self.model_type_creater = _model_type[str(self.model_type)]

    def __call__(self):
        '''
        Build the network for the configured task and return whatever the
        task specific builder returns.
        '''
        return self.model_type_creater()

    def create_embedding(self, input, prefix=''):
        '''
        Create an embedding table whose name has a `prefix`.

        @input: paddle.layer
            data layer to embed.
        @prefix: str
            prefix of the parameter name ('<prefix>_emb.w'); calls that use
            the same prefix use the same parameter name.
        '''
        logger.info("create embedding table [%s] which dimention is %d" %
                    (prefix, self.dnn_dims[0]))
        emb = paddle.layer.embedding(
            input=input,
            size=self.dnn_dims[0],
            param_attr=ParamAttr(name='%s_emb.w' % prefix))
        return emb

    def create_fc(self, emb, prefix=''):
        '''
        A max-pooling layer followed by one fully connected layer.

        @emb: paddle.layer
            output of the embedding layer
        @prefix: str
            prefix of layers' names; currently not used by this method.
        '''
        _input_layer = paddle.layer.pooling(
            input=emb, pooling_type=paddle.pooling.Max())
        fc = paddle.layer.fc(input=_input_layer, size=self.dnn_dims[1])
        return fc

    def create_rnn(self, emb, prefix=''):
        '''
        A GRU sentence vector learner.

        @emb: paddle.layer
            output of the embedding layer
        @prefix: str
            prefix of layers' names; currently not used by this method.
        '''
        # the GRU size is fixed to 256, independent of dnn_dims
        gru = paddle.networks.simple_gru(input=emb, size=256)
        sent_vec = paddle.layer.last_seq(gru)
        return sent_vec

    def create_cnn(self, emb, prefix=''):
        '''
        Two sequence_conv_pool branches with context widths 3 and 4.

        @emb: paddle.layer
            output of the embedding layer
        @prefix: str
            prefix of layers' names, used to share parameters between more than one `cnn` parts.

        Returns the pair (conv_3, conv_4).
        '''

        def create_conv(context_len, hidden_size, prefix):
            key = "%s_%d_%d" % (prefix, context_len, hidden_size)
            conv = paddle.networks.sequence_conv_pool(
                input=emb,
                context_len=context_len,
                hidden_size=hidden_size,
                # set parameter attr for parameter sharing
                context_proj_param_attr=ParamAttr(name=key + 'contex_proj.w'),
                fc_param_attr=ParamAttr(name=key + '_fc.w'),
                fc_bias_attr=ParamAttr(name=key + '_fc.b'),
                pool_bias_attr=ParamAttr(name=key + '_pool.b'))
            return conv

        # NOTE(review): the literal "cnn" is used as the name prefix, so the
        # method's own `prefix` argument never reaches the parameter names
        # and every call produces the same names. Left as is because
        # renaming would invalidate existing checkpoints -- confirm whether
        # always-shared CNN parameters are intended.
        logger.info('create a sequence_conv_pool which context width is 3')
        conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
        logger.info('create a sequence_conv_pool which context width is 4')
        conv_4 = create_conv(4, self.dnn_dims[1], "cnn")

        return conv_3, conv_4

    def create_dnn(self, sent_vec, prefix):
        '''
        Stack one tanh fc layer per entry of dnn_dims[1:] on top of
        `sent_vec` and return the last layer.

        @sent_vec: paddle.layer
            output of the sentence encoder.
        @prefix: str
            prefix of the layer and parameter names.
        '''
        # Start from the encoder output so the result is always defined,
        # even when dnn_dims holds no further layer sizes.
        _input_layer = sent_vec
        for idx, dim in enumerate(self.dnn_dims[1:]):
            name = "%s_fc_%d_%d" % (prefix, idx, dim)
            logger.info("create fc layer [%s] which dimention is %d" %
                        (name, dim))
            _input_layer = paddle.layer.fc(
                name=name,
                input=_input_layer,
                size=dim,
                act=paddle.activation.Tanh(),
                param_attr=ParamAttr(name='%s.w' % name),
                bias_attr=ParamAttr(name='%s.b' % name))
        return _input_layer

    def _build_classification_model(self):
        '''
        Build the classification network; see
        _build_classification_or_regression_model for the return value.
        '''
        logger.info("build classification model")
        assert self.model_type.is_classification()
        return self._build_classification_or_regression_model(
            is_classification=True)

    def _build_regression_model(self):
        '''
        Build the regression network; see
        _build_classification_or_regression_model for the return value.
        '''
        logger.info("build regression model")
        assert self.model_type.is_regression()
        return self._build_classification_or_regression_model(
            is_classification=False)

    def _build_rank_model(self):
        '''
        Build a pairwise rank model, and the cost is returned.

        A pairwise rank model has 4 inputs:
          - source sentence
          - left_target sentence
          - right_target sentence
          - label, 1 if left_target should be sorted in front of right_target, otherwise 0.
            (the label layer is only created when not in inference mode)

        Returns (cost, None, label) when training and the source/right
        target score layer when inferring.
        '''
        logger.info("build rank model")
        assert self.model_type.is_rank()
        source = paddle.layer.data(
            name='source_input',
            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
        left_target = paddle.layer.data(
            name='left_target_input',
            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
        right_target = paddle.layer.data(
            name='right_target_input',
            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
        if not self.is_infer:
            label = paddle.layer.data(
                name='label_input', type=paddle.data_type.integer_value(1))

        prefixs = '_ _ _'.split(
        ) if self.share_semantic_generator else 'source left right'.split()
        # One prefix per input: three entries are needed in both cases,
        # otherwise the third input has no prefix when embeddings are shared.
        embed_prefixs = '_ _ _'.split(
        ) if self.share_embed else 'source target target'.split()

        data_layers = [source, left_target, right_target]
        word_vecs = [
            self.create_embedding(layer, prefix=embed_prefix)
            for layer, embed_prefix in zip(data_layers, embed_prefixs)
        ]

        semantics = [
            self.model_arch_creater(word_vec, prefix=sem_prefix)
            for word_vec, sem_prefix in zip(word_vecs, prefixs)
        ]

        # cossim score of source and left_target
        left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
        # cossim score of source and right target
        right_score = paddle.layer.cos_sim(semantics[0], semantics[2])

        if not self.is_infer:
            # rank cost
            cost = paddle.layer.rank_cost(left_score, right_score, label=label)
            # prediction = left_score - right_score
            # but this operator is not supported currently.
            # so AUC will not used.
            return cost, None, label
        # NOTE(review): only the right target score is exposed at inference
        # time -- confirm this is the intended output.
        return right_score

    def _build_classification_or_regression_model(self, is_classification):
        '''
        Build a classification/regression model, and the cost is returned.

        The model has 3 inputs:
          - source sentence
          - target sentence
          - label (an integer class id for classification, a 1-d dense
            vector for regression)

        @is_classification: bool
            True builds a softmax classifier over the concatenated semantic
            vectors; False builds a cosine similarity regressor.

        Returns (cost, prediction, label) when training and the prediction
        layer when inferring.
        '''
        if is_classification:
            # the label layer and the softmax size both need class_num
            assert self.class_num

        source = paddle.layer.data(
            name='source_input',
            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
        target = paddle.layer.data(
            name='target_input',
            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
        label = paddle.layer.data(
            name='label_input',
            type=paddle.data_type.integer_value(self.class_num)
            if is_classification else paddle.data_type.dense_vector(1))

        prefixs = '_ _'.split(
        ) if self.share_semantic_generator else 'left right'.split()
        embed_prefixs = '_ _'.split(
        ) if self.share_embed else 'left right'.split()

        word_vecs = [
            self.create_embedding(layer, prefix=embed_prefix)
            for layer, embed_prefix in zip([source, target], embed_prefixs)
        ]

        semantics = [
            self.model_arch_creater(word_vec, prefix=sem_prefix)
            for word_vec, sem_prefix in zip(word_vecs, prefixs)
        ]

        if is_classification:
            concated_vector = paddle.layer.concat(semantics)
            prediction = paddle.layer.fc(
                input=concated_vector,
                size=self.class_num,
                act=paddle.activation.Softmax())
            cost = paddle.layer.classification_cost(
                input=prediction, label=label)
        else:
            prediction = paddle.layer.cos_sim(*semantics)
            cost = paddle.layer.square_error_cost(prediction, label)

        if not self.is_infer:
            return cost, prediction, label
        return prediction