def main():
    """Preprocess: build vocabularies from the training data and dump a
    pre-trained embedding matrix for every configured feature.

    Reads all paths/parameters from ``./config.yml``; writes one pickled
    ``float32`` embedding matrix per feature whose ``path_pre_train`` is set.
    """
    print('preprocessing...')  # fixed typo: was 'proprecessing...'
    # Load the configuration file.  safe_load avoids arbitrary object
    # construction — yaml.load without an explicit Loader is deprecated
    # and unsafe since PyYAML 5.1.
    with open('./config.yml') as file_config:
        config = yaml.safe_load(file_config)

    # Build the vocabularies: one per input feature, plus the label column.
    feature_names = config['model_params']['feature_names']
    columns = feature_names + ['label']
    min_counts_dict, path_vocs_dict = defaultdict(int), dict()
    for feature_name in feature_names:
        voc_params = config['data_params']['voc_params'][feature_name]
        min_counts_dict[feature_name] = voc_params['min_count']  # frequency filter
        path_vocs_dict[feature_name] = voc_params['path']        # where the voc pickle goes
    path_vocs_dict['label'] = \
        config['data_params']['voc_params']['label']['path']
    build_vocabulary(
        path_data=config['data_params']['path_train'],
        columns=columns,
        min_counts_dict=min_counts_dict,
        path_vocs_dict=path_vocs_dict)

    # Build an embedding matrix for each feature that has a pre-trained
    # embedding text file configured; features without one are skipped.
    for feature_name in feature_names:
        embed_params = config['model_params']['embed_params'][feature_name]
        path_pre_train = embed_params['path_pre_train']
        if not path_pre_train:
            continue
        path_pkl = embed_params['path']
        path_voc = config['data_params']['voc_params'][feature_name]['path']
        with open(path_voc, 'rb') as file_r:
            voc = pickle.load(file_r)  # token -> row index mapping
        embedding_dict, vec_dim = load_embed_from_txt(path_pre_train)
        # len(voc) + 1 rows: row 0 is presumably reserved for padding —
        # TODO confirm against the model's embedding lookup.
        embedding_matrix = np.zeros((len(voc) + 1, vec_dim), dtype='float32')
        for item in voc:
            if item in embedding_dict:
                embedding_matrix[voc[item], :] = embedding_dict[item]
            else:
                # Token missing from the pre-trained vectors: random init.
                embedding_matrix[voc[item], :] = np.random.uniform(
                    -0.25, 0.25, size=vec_dim)
        with open(path_pkl, 'wb') as file_w:
            pickle.dump(embedding_matrix, file_w)
    print('all done!')
def main():
    """Preprocess: build vocabularies (capturing their sizes), dump
    pre-trained embedding matrices, then rewrite ``./config1.yml`` with the
    resulting embedding shapes and class count.

    Default embedding dims when no pre-trained file exists: 64 for the
    first feature, 32 for the rest.
    """
    print('preprocessing...')  # fixed typo: was 'proprecessing...'
    # safe_load: yaml.load without an explicit Loader is deprecated/unsafe.
    with open('./config1.yml') as file_config:
        config = yaml.safe_load(file_config)

    # Build the vocabularies; also returns each voc's size and the max
    # sequence length observed in the training data.
    feature_names = config['model_params']['feature_names']
    columns = feature_names + ['label']
    min_counts_dict, path_vocs_dict = defaultdict(int), dict()
    for feature_name in feature_names:
        voc_params = config['data_params']['voc_params'][feature_name]
        min_counts_dict[feature_name] = voc_params['min_count']
        path_vocs_dict[feature_name] = voc_params['path']
    path_vocs_dict['label'] = \
        config['data_params']['voc_params']['label']['path']
    # sequence_length is currently unused below but kept for parity with
    # build_vocabulary's return contract.
    voc_sizes, sequence_length = build_vocabulary(
        path_data=config['data_params']['path_train'],
        columns=columns,
        min_counts_dict=min_counts_dict,
        path_vocs_dict=path_vocs_dict)

    # Build the embedding matrices, recording each feature's vector dim.
    feature_dim_dict = dict()
    for i, feature_name in enumerate(feature_names):
        embed_params = config['model_params']['embed_params'][feature_name]
        path_pre_train = embed_params['path_pre_train']
        if not path_pre_train:
            # No pre-trained vectors: fall back to the default dims
            # (64 for the first feature, 32 otherwise).
            feature_dim_dict[feature_name] = 64 if i == 0 else 32
            continue
        path_pkl = embed_params['path']
        path_voc = config['data_params']['voc_params'][feature_name]['path']
        with open(path_voc, 'rb') as file_r:
            voc = pickle.load(file_r)
        embedding_dict, vec_dim = load_embed_from_txt(path_pre_train)
        feature_dim_dict[feature_name] = vec_dim
        # len(voc) + 1 rows: row 0 presumably reserved for padding —
        # TODO confirm against the embedding-lookup code.
        embedding_matrix = np.zeros((len(voc) + 1, vec_dim), dtype='float32')
        for item in voc:
            if item in embedding_dict:
                embedding_matrix[voc[item], :] = embedding_dict[item]
            else:
                # Out-of-pretrained-vocabulary token: random init.
                embedding_matrix[voc[item], :] = np.random.uniform(
                    -0.25, 0.25, size=vec_dim)
        with open(path_pkl, 'wb') as file_w:
            pickle.dump(embedding_matrix, file_w)

    # Rewrite config with the shapes derived from the voc sizes.
    label_size = voc_sizes[-1]
    voc_sizes = voc_sizes[:-1]  # keep only the input-feature voc sizes
    config['model_params']['nb_classes'] = label_size + 1
    # (The original had identical if i == 0 / else branches here — collapsed.)
    for i, feature_name in enumerate(feature_names):
        config['model_params']['embed_params'][feature_name]['shape'] = \
            [voc_sizes[i] + 1, feature_dim_dict[feature_name]]
    # Persist the updated configuration.
    with codecs.open('./config1.yml', 'w', encoding='utf-8') as file_w:
        yaml.dump(config, file_w)
    print('all done!')
def main():
    """Preprocess for the lstm-crf model: build token/tag dictionaries,
    turn pre-trained word vectors into per-feature embedding matrices, and
    rewrite ``./config.yml`` with the resulting shapes, lengths and the
    list of actually-usable features.

    Conventions fixed by the rest of the project: feature columns are
    named ``f1``/``f2``, the output column ``label``, and the char-level
    feature ``char``.  Features without ``path_pre_train`` are dropped
    from ``feature_names`` in the written-back config.
    """
    logger.info('preprocessing...')
    useable = []  # features that actually have pre-trained embeddings
    # safe_load: yaml.load without an explicit Loader is deprecated/unsafe.
    with open('./config.yml', encoding="utf-8") as file_config:
        config = yaml.safe_load(file_config)

    # Collect min-count filters and voc output paths per input feature.
    feature_names = config['model_params']['feature_names']
    logger.info("feature_names: " + str(feature_names))
    columns = feature_names + ['label']
    min_counts_dict, path_vocs_dict = defaultdict(int), dict()
    for feature_name in feature_names:
        voc_params = config['data_params']['voc_params'][feature_name]
        min_counts_dict[feature_name] = voc_params['min_count']
        path_vocs_dict[feature_name] = voc_params['path']
    # Path for the label dictionary.
    path_vocs_dict['label'] = \
        config['data_params']['voc_params']['label']['path']
    logger.info("min_count: " + str(min_counts_dict))
    logger.info(path_vocs_dict)
    # Char-level feature; the 'char' key name is fixed by convention.
    min_counts_dict['char'] = \
        config['data_params']['voc_params']['char']['min_count']
    path_vocs_dict['char'] = \
        config['data_params']['voc_params']['char']['path']
    sequence_len_pt = config['model_params']['sequence_len_pt']  # sentence-length percentile
    use_char_feature = config['model_params']['use_char_feature']  # English char feature on/off
    word_len_pt = config['model_params']['word_len_pt']  # word-length percentile

    # NOTE(review): the generic build_vocabulary(...) call was replaced by
    # hard-coded dictionary builders over fixed data paths, with a fixed
    # sequence length of 128 — confirm these paths/length stay in sync
    # with the training setup.
    voc_sizes = [
        get_voc_dict(["../data/train.txt", "../data/test.txt"], 2),
        get_tag_dict("../data/lstm_crf/train.txt", 1)
    ]
    lengths = [128]
    logger.info(voc_sizes)
    if not use_char_feature:
        sequence_length = lengths[0]
    else:
        # With char features, build_vocabulary-style lengths would carry
        # (sequence_length, word_length) — lengths has only one entry here,
        # so this branch would fail; presumably use_char_feature is False
        # in the current config.  TODO confirm.
        sequence_length, word_length = lengths[:]

    # Build each feature's word2vec initialisation matrix.
    logger.info("get feature pre_train matrix...")
    feature_dim_dict = dict()  # per-feature embedding dim
    for i, feature_name in enumerate(feature_names):
        logger.info("feature: " + feature_name)
        embed_params = config['model_params']['embed_params'][feature_name]
        path_pre_train = embed_params['path_pre_train']
        if not path_pre_train:
            # No pre-trained vectors: the feature is unusable; mark its
            # dim as None instead of inventing a default.
            # (Original had identical if i == 0 / else branches — collapsed.)
            feature_dim_dict[feature_name] = None
            continue
        useable.append(feature_name)
        # Fixed output location for the pickled matrix.
        embed_params['path'] = \
            "../data/lstm_crf/%s_embed.mat.pkl" % feature_name
        path_voc = config['data_params']['voc_params'][feature_name]['path']
        with open(path_voc, 'rb') as file_r:
            voc = pickle.load(file_r)  # token -> row index, built from train data
        logger.info("编号词典:%s " % voc)
        logger.info("将构建的voc,与训练好的embedding结合整理出word2vec的初始化矩阵: " + feature_name)
        embedding_dict, vec_dim = load_embed_from_txt(path_pre_train)
        feature_dim_dict[feature_name] = vec_dim
        # len(voc) + 2 rows: row 0 for padding, row 1 for the unknown word.
        embedding_matrix = np.zeros((len(voc.keys()) + 2, vec_dim),
                                    dtype='float32')
        # Will raise KeyError if config['unknow_word'] is missing from the
        # pre-trained vectors — presumably guaranteed by the embed file.
        embedding_matrix[1, :] = embedding_dict[config["unknow_word"]]
        for item in voc:
            if item in embedding_dict:
                embedding_matrix[voc[item], :] = embedding_dict[item]
            else:
                # Token absent from the pre-trained vectors: random init.
                logger.info("训练好的embedding中没有找到的词汇:%s" % item)
                embedding_matrix[voc[item], :] = np.random.uniform(
                    -0.25, 0.25, size=vec_dim)
        with open(embed_params['path'], 'wb') as file_w:
            pickle.dump(embedding_matrix, file_w)

    # Rewrite config: feature shapes, class count, lengths.
    if use_char_feature:
        char_voc_size = voc_sizes.pop(0)
    label_size = voc_sizes[-1]
    voc_sizes = voc_sizes[:-1]  # keep only the input-feature voc sizes
    config['model_params']['nb_classes'] = label_size  # actual class count
    for i, feature_name in enumerate(feature_names):
        config['model_params']['embed_params'][feature_name]['shape'] = [
            voc_sizes[i], feature_dim_dict[feature_name]
        ]
    if use_char_feature:
        # Char embedding fixed at 16 dims (tune per task); char_embed.pkl
        # is supplied externally, not generated here.
        config['model_params']['embed_params']['char']['shape'] = [
            char_voc_size, 16
        ]
        config['model_params']['word_length'] = word_length
    config['model_params']['sequence_length'] = sequence_length
    config['model_params']['feature_names'] = useable  # only usable features
    # Persist the updated configuration.
    with codecs.open('./config.yml', 'w', encoding='utf-8') as file_w:
        yaml.dump(config, file_w)
    logger.info('preprocessing successfully!')
def main():
    """Preprocess with optional char features: build vocabularies
    (capturing sizes and lengths), dump pre-trained embedding matrices,
    and rewrite ``./config.yml`` with the resulting shapes and lengths.

    Default embedding dims when no pre-trained file exists: 64 for the
    first feature, 32 for the rest; the char embedding is fixed at 16.
    """
    print('preprocessing...')
    # safe_load: yaml.load without an explicit Loader is deprecated/unsafe.
    with open('./config.yml') as file_config:
        config = yaml.safe_load(file_config)

    # Collect min-count filters and voc output paths per input feature.
    feature_names = config['model_params']['feature_names']
    columns = feature_names + ['label']
    min_counts_dict, path_vocs_dict = defaultdict(int), dict()
    for feature_name in feature_names:
        voc_params = config['data_params']['voc_params'][feature_name]
        min_counts_dict[feature_name] = voc_params['min_count']
        path_vocs_dict[feature_name] = voc_params['path']
    path_vocs_dict['label'] = \
        config['data_params']['voc_params']['label']['path']
    # Char-level feature parameters.
    min_counts_dict['char'] = \
        config['data_params']['voc_params']['char']['min_count']
    path_vocs_dict['char'] = \
        config['data_params']['voc_params']['char']['path']
    sequence_len_pt = config['model_params']['sequence_len_pt']
    use_char_feature = config['model_params']['use_char_feature']
    word_len_pt = config['model_params']['word_len_pt']
    # Build the vocabularies; returns voc sizes plus the length stats.
    # (keyword 'use_char_featrue' matches build_vocabulary's misspelled
    # parameter name — do not "fix" here without changing the callee.)
    voc_sizes, lengths = build_vocabulary(
        path_data=config['data_params']['path_train'],
        columns=columns,
        min_counts_dict=min_counts_dict,
        path_vocs_dict=path_vocs_dict,
        sequence_len_pt=sequence_len_pt,
        use_char_featrue=use_char_feature,
        word_len_pt=word_len_pt)
    if not use_char_feature:
        sequence_length = lengths[0]
    else:
        sequence_length, word_length = lengths[:]

    # Build the embedding matrices, recording each feature's vector dim.
    feature_dim_dict = dict()
    for i, feature_name in enumerate(feature_names):
        embed_params = config['model_params']['embed_params'][feature_name]
        path_pre_train = embed_params['path_pre_train']
        if not path_pre_train:
            # No pre-trained vectors: default dims (64 first feature, 32 rest).
            feature_dim_dict[feature_name] = 64 if i == 0 else 32
            continue
        path_pkl = embed_params['path']
        path_voc = config['data_params']['voc_params'][feature_name]['path']
        with open(path_voc, 'rb') as file_r:
            voc = pickle.load(file_r)
        embedding_dict, vec_dim = load_embed_from_txt(path_pre_train)
        feature_dim_dict[feature_name] = vec_dim
        # len(voc) + 2 rows: presumably padding and unknown-word slots —
        # TODO confirm against the embedding-lookup code.
        embedding_matrix = np.zeros((len(voc) + 2, vec_dim), dtype='float32')
        for item in voc:
            if item in embedding_dict:
                embedding_matrix[voc[item], :] = embedding_dict[item]
            else:
                # Out-of-pretrained-vocabulary token: random init.
                embedding_matrix[voc[item], :] = np.random.uniform(
                    -0.25, 0.25, size=vec_dim)
        with open(path_pkl, 'wb') as file_w:
            pickle.dump(embedding_matrix, file_w)

    # Rewrite config: feature shapes, class count, lengths.
    if use_char_feature:
        char_voc_size = voc_sizes.pop(0)
    label_size = voc_sizes[-1]
    voc_sizes = voc_sizes[:-1]  # keep only the input-feature voc sizes
    config['model_params']['nb_classes'] = label_size + 1
    # (The original had identical if i == 0 / else branches here — collapsed.)
    for i, feature_name in enumerate(feature_names):
        config['model_params']['embed_params'][feature_name]['shape'] = \
            [voc_sizes[i], feature_dim_dict[feature_name]]
    if use_char_feature:
        # Char embedding fixed at 16 dims; tune per task.
        config['model_params']['embed_params']['char']['shape'] = \
            [char_voc_size, 16]
        config['model_params']['word_length'] = word_length
    config['model_params']['sequence_length'] = sequence_length
    # Persist the updated configuration.
    with codecs.open('./config.yml', 'w', encoding='utf-8') as file_w:
        yaml.dump(config, file_w)
    print('all done!')