word2vec_file_path = config['word2vec_file_path'] % config['word_embedding_dim']
print model_file_path
print result_file_path
print train_cnn_feature_file_path
print test_cnn_feature_file_path
print word2vec_file_path

rand_embedding_cnn = WordEmbeddingCNN(
    rand_seed=seed,
    verbose=verbose,
    input_dim=feature_encoder.vocabulary_size + 1,
    word_embedding_dim=config['word_embedding_dim'],
    embedding_init_weight=feature_encoder.to_embedding_weight(word2vec_file_path),
    input_length=config['sentence_padding_length'],
    num_labels=len(label_to_index),
    conv_filter_type=config['conv_filter_type'],
    k=config['kmax_k'],
    embedding_dropout_rate=config['embedding_dropout_rate'],
    output_dropout_rate=config['output_dropout_rate'],
    nb_epoch=int(config['cnn_nb_epoch']),
    earlyStoping_patience=config['earlyStoping_patience'],
)
rand_embedding_cnn.print_model_descibe()

if config['refresh_all_model'] or not os.path.exists(model_file_path):
    # train the model
    rand_embedding_cnn.fit((feature_encoder.train_padding_index, train_y),
                           (map(feature_encoder.transform_sentence, test_X), test_y))
    # save the model
    rand_embedding_cnn.save_model(model_file_path)
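
# A minimal, hypothetical sketch of the `config` entries read by the snippet
# above. The concrete values and the layout of 'conv_filter_type' are
# illustrative assumptions, not the project's actual configuration; note that
# 'word2vec_file_path' is expected to contain a %d placeholder that gets
# filled with the embedding dimension.
config = {
    'word_embedding_dim': 50,
    'word2vec_file_path': 'vector/w2v_dim%d.gensim',  # assumed path template
    'sentence_padding_length': 150,
    'conv_filter_type': [[100, 3, 50, 'valid'],
                         [100, 4, 50, 'valid'],
                         [100, 5, 50, 'valid']],      # assumed filter-spec format
    'kmax_k': 1,
    'embedding_dropout_rate': 0.5,
    'output_dropout_rate': 0.5,
    'cnn_nb_epoch': 30,
    'earlyStoping_patience': 50,
    'refresh_all_model': False,
}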
def __init__(self, feature_encoder, num_filter, num_labels, n_estimators,
             word2vec_model_file_path, **kwargs):
    if kwargs.get('rand_weight', False):  # CNN (rand) mode
        weight = None
    elif kwargs['dataset_flag'] == 0:
        if RFAndWordEmbeddingCnnMerge.train_data_weight is None:  # training set
            RFAndWordEmbeddingCnnMerge.train_data_weight = feature_encoder.to_embedding_weight(
                word2vec_model_file_path)
        weight = RFAndWordEmbeddingCnnMerge.train_data_weight
    else:  # kwargs['dataset_flag'] > 0
        if RFAndWordEmbeddingCnnMerge.val_data_weight is None:
            RFAndWordEmbeddingCnnMerge.val_data_weight = feature_encoder.to_embedding_weight(
                word2vec_model_file_path)
        weight = RFAndWordEmbeddingCnnMerge.val_data_weight
    # print(weight)

    self.static_w2v_cnn = WordEmbeddingCNN(
        rand_seed=1377,
        verbose=kwargs.get('verbose', 0),
        feature_encoder=feature_encoder,
        # optimizers='adadelta',
        optimizers='sgd',
        # only used in CNN (rand) mode
        word_embedding_dim=50,
        # initialize the embedding layer from the pre-trained w2v model
        embedding_init_weight=weight,
        # by default the embedding weights stay fixed during training
        embedding_weight_trainable=kwargs.get('embedding_weight_trainable', False),
        num_labels=num_labels,
        l1_conv_filter_type=[
            [num_filter, 3, -1, 'valid', (-1, 1), 0.5, 'relu', 'none'],
            [num_filter, 4, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
            [num_filter, 5, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
        ],
        l2_conv_filter_type=[],
        full_connected_layer_units=[],
        embedding_dropout_rate=0.,
        nb_epoch=kwargs.get('nb_epoch', 25),
        batch_size=kwargs.get('batch_size', 32),
        earlyStoping_patience=30,
        lr=kwargs.get('lr', 1e-2),
        show_validate_accuracy=True if kwargs.get('verbose', 0) > 0 else False,
        # output_regularizer=('l2', 0.5),
        output_constraints=('maxnorm', 3),
        # must be True so that intermediate layer outputs can be extracted as features
        save_middle_output=True,
    )

    self.bow_randomforest = BowRandomForest(
        rand_seed=1377,
        verbose=kwargs.get('verbose', 0),
        feature_encoder=feature_encoder,
        # optimizers='adadelta',
        n_estimators=n_estimators,
        min_samples_leaf=1,
    )
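
# A hypothetical instantiation sketch for the merged RF + CNN model defined
# above; `feature_encoder`, `num_labels`, and `word2vec_model_file_path` are
# assumed to be prepared elsewhere, and the hyper-parameter values are
# illustrative only.
merged_model = RFAndWordEmbeddingCnnMerge(
    feature_encoder=feature_encoder,
    num_filter=100,
    num_labels=num_labels,
    n_estimators=2000,
    word2vec_model_file_path=word2vec_model_file_path,
    dataset_flag=0,             # 0 selects the cached training-set embedding weight
    rand_weight=False,          # True would switch to the CNN (rand) variant
    embedding_weight_trainable=False,
    verbose=0,
)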
def get_model(feature_encoder, num_filter, num_labels, word2vec_model_file_path, **kwargs):
    # print(WordEmbeddingCNNWithOneConv.weight)
    """Get the CNN (w2v) model.

    Parameters
    ----------
    feature_encoder : FeatureEncoder
        feature encoder
    num_filter : int
    num_labels : int
    word2vec_model_file_path : str
    kwargs : dict
        - dataset_flag
        - rand_weight : (default False) when True, build the CNN (rand) model
        - verbose
        - embedding_weight_trainable

    Returns
    -------
    WordEmbeddingCNN

    """
    if kwargs.get('rand_weight', False):  # CNN (rand) mode
        weight = None
    elif kwargs['dataset_flag'] == 0:
        if WordEmbeddingCNNWithOneConv.train_data_weight is None:  # training set
            WordEmbeddingCNNWithOneConv.train_data_weight = feature_encoder.to_embedding_weight(
                word2vec_model_file_path)
        weight = WordEmbeddingCNNWithOneConv.train_data_weight
    else:  # kwargs['dataset_flag'] > 0
        if WordEmbeddingCNNWithOneConv.val_data_weight is None:
            WordEmbeddingCNNWithOneConv.val_data_weight = feature_encoder.to_embedding_weight(
                word2vec_model_file_path)
        weight = WordEmbeddingCNNWithOneConv.val_data_weight
    # print(weight)

    static_w2v_cnn = WordEmbeddingCNN(
        rand_seed=1377,
        verbose=kwargs.get('verbose', 0),
        feature_encoder=feature_encoder,
        # optimizers='adadelta',
        optimizers='sgd',
        # only used in CNN (rand) mode
        word_embedding_dim=300,
        # initialize the embedding layer from the pre-trained w2v model
        embedding_init_weight=weight,
        # by default the embedding weights stay fixed during training
        embedding_weight_trainable=kwargs.get('embedding_weight_trainable', False),
        num_labels=num_labels,
        l1_conv_filter_type=[
            [num_filter, 3, -1, 'valid', (-1, 1), 0.5, 'relu', 'none'],
            [num_filter, 4, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
            [num_filter, 5, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
        ],
        l2_conv_filter_type=[],
        full_connected_layer_units=[],
        embedding_dropout_rate=0.,
        nb_epoch=kwargs.get('nb_epoch', 25),
        batch_size=kwargs.get('batch_size', 32),
        earlyStoping_patience=30,
        lr=kwargs.get('lr', 1e-2),
        show_validate_accuracy=True if kwargs.get('verbose', 0) > 0 else False,
        # output_regularizer=('l2', 0.5),
        output_constraints=('maxnorm', 3),
        save_middle_output=kwargs.get('get_cnn_middle_layer_output', False),
    )
    # static_w2v_cnn.print_model_descibe()
    # quit()
    return static_w2v_cnn
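
# A hypothetical call sketch for get_model above, assuming it is exposed as a
# static method of WordEmbeddingCNNWithOneConv; the argument values are
# illustrative assumptions, and `feature_encoder`, `num_labels`, and
# `word2vec_model_file_path` are prepared as in the earlier snippets.
cnn_w2v = WordEmbeddingCNNWithOneConv.get_model(
    feature_encoder=feature_encoder,
    num_filter=100,
    num_labels=num_labels,
    word2vec_model_file_path=word2vec_model_file_path,
    dataset_flag=0,
    rand_weight=False,                    # True -> CNN (rand) variant
    embedding_weight_trainable=False,
    get_cnn_middle_layer_output=False,    # True -> keep middle-layer outputs as features
    verbose=1,
)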