Example #1
    # fill the '%d' placeholder in the path template with the embedding dimension
    word2vec_file_path = config['word2vec_file_path'] % config['word_embedding_dim']

    print(model_file_path)
    print(result_file_path)
    print(train_cnn_feature_file_path)
    print(test_cnn_feature_file_path)
    print(word2vec_file_path)

    rand_embedding_cnn = WordEmbeddingCNN(
        rand_seed=seed,
        verbose=verbose,
        input_dim=feature_encoder.vocabulary_size + 1,
        word_embedding_dim=config['word_embedding_dim'],
        embedding_init_weight=feature_encoder.to_embedding_weight(word2vec_file_path),
        input_length=config['sentence_padding_length'],
        num_labels=len(label_to_index),
        conv_filter_type=config['conv_filter_type'],
        k=config['kmax_k'],
        embedding_dropout_rate=config['embedding_dropout_rate'],
        output_dropout_rate=config['output_dropout_rate'],
        nb_epoch=int(config['cnn_nb_epoch']),
        earlyStoping_patience=config['earlyStoping_patience'],
    )
    rand_embedding_cnn.print_model_descibe()

    if config['refresh_all_model'] or not os.path.exists(model_file_path):
        # train the model
        rand_embedding_cnn.fit((feature_encoder.train_padding_index, train_y),
                               (list(map(feature_encoder.transform_sentence, test_X)), test_y))
        # save the trained model to disk
        rand_embedding_cnn.save_model(model_file_path)
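
For reference, a minimal sketch of the config dict this snippet assumes; the keys come straight from the code above, while every value (and the filter-spec format) is purely illustrative:

    config = {
        'word_embedding_dim': 50,
        # path template: the '%d' placeholder is filled with word_embedding_dim
        'word2vec_file_path': 'w2v_model/vector%d.gem',
        'sentence_padding_length': 150,
        # filter spec format is assumed; compare l1_conv_filter_type in the later examples
        'conv_filter_type': [[100, 3, -1, 'valid'],
                             [100, 4, -1, 'valid'],
                             [100, 5, -1, 'valid']],
        'kmax_k': 1,
        'embedding_dropout_rate': 0.5,
        'output_dropout_rate': 0.5,
        'cnn_nb_epoch': 30,
        'earlyStoping_patience': 50,
        'refresh_all_model': False,
    }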
Example #2

    def __init__(self, feature_encoder, num_filter, num_labels, n_estimators,
                 word2vec_model_file_path, **kwargs):

        if kwargs.get('rand_weight', False):
            # CNN(rand) mode: leave the embedding weights to random initialization
            weight = None
        elif kwargs['dataset_flag'] == 0:
            if RFAndWordEmbeddingCnnMerge.train_data_weight is None:
                # training set
                RFAndWordEmbeddingCnnMerge.train_data_weight = feature_encoder.to_embedding_weight(
                    word2vec_model_file_path)
            weight = RFAndWordEmbeddingCnnMerge.train_data_weight
        else:
            # kwargs['dataset_flag'] > 0: validation/test set
            if RFAndWordEmbeddingCnnMerge.val_data_weight is None:
                RFAndWordEmbeddingCnnMerge.val_data_weight = feature_encoder.to_embedding_weight(
                    word2vec_model_file_path)
            weight = RFAndWordEmbeddingCnnMerge.val_data_weight
        self.static_w2v_cnn = WordEmbeddingCNN(
            rand_seed=1377,
            verbose=kwargs.get('verbose', 0),
            feature_encoder=feature_encoder,
            # optimizers='adadelta',
            optimizers='sgd',
            # only used in CNN(rand) mode, i.e. when no pretrained weights are given
            word_embedding_dim=50,
            # initialize the embedding layer from the pretrained w2v model
            embedding_init_weight=weight,
            # by default the embedding weights stay frozen during training
            embedding_weight_trainable=kwargs.get('embedding_weight_trainable',
                                                  False),
            num_labels=num_labels,
            l1_conv_filter_type=[
                [num_filter, 3, -1, 'valid', (-1, 1), 0.5, 'relu', 'none'],
                [num_filter, 4, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
                [num_filter, 5, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
            ],
            l2_conv_filter_type=[],
            full_connected_layer_units=[],
            embedding_dropout_rate=0.,
            nb_epoch=kwargs.get('nb_epoch', 25),
            batch_size=kwargs.get('batch_size', 32),
            earlyStoping_patience=30,
            lr=kwargs.get('lr', 1e-2),
            show_validate_accuracy=kwargs.get('verbose', 0) > 0,
            # output_regularizer=('l2', 0.5),
            output_constraints=('maxnorm', 3),
            # must be True so the middle-layer output can be extracted as features
            save_middle_output=True,
        )

        self.bow_randomforest = BowRandomForest(
            rand_seed=1377,
            verbose=kwargs.get('verbose', 0),
            feature_encoder=feature_encoder,
            # optimizers='adadelta',
            n_estimators=n_estimators,
            min_samples_leaf=1,
        )
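
The merge model couples the CNN (whose middle-layer output is exposed via save_middle_output=True) with a bag-of-words random forest. A hypothetical instantiation, assuming feature_encoder, num_labels and word2vec_file_path have been prepared as in Example #1:

    merge_model = RFAndWordEmbeddingCnnMerge(
        feature_encoder,
        num_filter=100,    # illustrative value
        num_labels=num_labels,
        n_estimators=500,  # illustrative value
        word2vec_model_file_path=word2vec_file_path,
        dataset_flag=0,    # 0 selects (and caches) the training-set embedding weights
        verbose=1,
    )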
Example #3
    def get_model(feature_encoder, num_filter, num_labels,
                  word2vec_model_file_path, **kwargs):
        """获取 CNN(w2v)模型

        Parameters
        ----------
        feature_encoder : FeatureEncoder
            特征编码器
        num_filter : int
        num_labels : int
        word2vec_model_file_path : str
        kwargs : dict
            - dataset_flag
            - rand_weight : (default,False)设置为 True 时,为 CNN(rand) 模型
            - verbose
            - embedding_weight_trainable

        Returns
        -------

        """
        if kwargs.get('rand_weight', False):
            # CNN(rand) mode: leave the embedding weights to random initialization
            weight = None
        elif kwargs['dataset_flag'] == 0:
            if WordEmbeddingCNNWithOneConv.train_data_weight is None:
                # training set
                WordEmbeddingCNNWithOneConv.train_data_weight = feature_encoder.to_embedding_weight(
                    word2vec_model_file_path)
            weight = WordEmbeddingCNNWithOneConv.train_data_weight
        else:
            # kwargs['dataset_flag'] > 0: validation/test set
            if WordEmbeddingCNNWithOneConv.val_data_weight is None:
                WordEmbeddingCNNWithOneConv.val_data_weight = feature_encoder.to_embedding_weight(
                    word2vec_model_file_path)
            weight = WordEmbeddingCNNWithOneConv.val_data_weight
        static_w2v_cnn = WordEmbeddingCNN(
            rand_seed=1377,
            verbose=kwargs.get('verbose', 0),
            feature_encoder=feature_encoder,
            # optimizers='adadelta',
            optimizers='sgd',
            # only used in CNN(rand) mode, i.e. when no pretrained weights are given
            word_embedding_dim=300,
            # initialize the embedding layer from the pretrained w2v model
            embedding_init_weight=weight,
            # by default the embedding weights stay frozen during training
            embedding_weight_trainable=kwargs.get('embedding_weight_trainable',
                                                  False),
            num_labels=num_labels,
            l1_conv_filter_type=[
                [num_filter, 3, -1, 'valid', (-1, 1), 0.5, 'relu', 'none'],
                [num_filter, 4, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
                [num_filter, 5, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
            ],
            l2_conv_filter_type=[],
            full_connected_layer_units=[],
            embedding_dropout_rate=0.,
            nb_epoch=kwargs.get('nb_epoch', 25),
            batch_size=kwargs.get('batch_size', 32),
            earlyStoping_patience=30,
            lr=kwargs.get('lr', 1e-2),
            show_validate_accuracy=kwargs.get('verbose', 0) > 0,
            # output_regularizer=('l2', 0.5),
            output_constraints=('maxnorm', 3),
            save_middle_output=kwargs.get('get_cnn_middle_layer_output',
                                          False),
        )
        return static_w2v_cnn
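
Combining the kwargs documented above, a hypothetical sketch of how get_model (assuming it is exposed as a staticmethod) can yield the three classic CNN variants; feature_encoder, num_labels and w2v_path are placeholders:

    # CNN(rand): randomly initialized embeddings, fine-tuned during training
    cnn_rand = WordEmbeddingCNNWithOneConv.get_model(
        feature_encoder, 100, num_labels, w2v_path,
        rand_weight=True, embedding_weight_trainable=True)

    # CNN(w2v, static): pretrained embeddings, frozen (the default)
    cnn_static = WordEmbeddingCNNWithOneConv.get_model(
        feature_encoder, 100, num_labels, w2v_path, dataset_flag=0)

    # CNN(w2v, non-static): pretrained embeddings, fine-tuned
    cnn_nonstatic = WordEmbeddingCNNWithOneConv.get_model(
        feature_encoder, 100, num_labels, w2v_path,
        dataset_flag=0, embedding_weight_trainable=True)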