def save_model_m0(model):
    if save_model:
        show_title("Saving the network model")
        file_name = model_file_path + model_file_prefix + 'm0.h5'
        print("Saving the original (untrained) model: {} → ".format(file_name), end='')
        model.save(file_name)
        print("Model saved successfully.")
def export_day_statistical_sequence(lst_data):
    from src.data.generate_data import generate_day_statistical_sequence
    show_title("Processing the data into 91-day sequences, 6 features per day (max, min, mean), 96-dimensional data")
    x_data = generate_day_statistical_sequence(lst_data)
    from src.base.config import x_data_file_name, base_data_type
    save_model_data(x_data, data_file_path + x_data_file_name, base_data_type)
    return x_data
def save_model_m2(history, model):
    if save_model:
        show_title("Saving the network model")
        file_name = model_file_path + model_file_prefix + 'm2.bin'
        print("Saving the second-stage trained model: {} → ".format(file_name), end='')
        model.save_weights(file_name)
        # Persist the training history alongside the weights
        import pickle
        with open(model_file_path + model_file_prefix + 'm2.pkl', 'wb') as f:
            pickle.dump(history.history, f)
        print("Model saved successfully.")
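# Hypothetical counterpart to save_model_m2, shown for illustration only (the
# original source does not include it): restore the second-stage weights and
# the pickled training history into an already-built model.
def load_model_m2(model):
    import pickle
    model.load_weights(model_file_path + model_file_prefix + 'm2.bin')
    with open(model_file_path + model_file_prefix + 'm2.pkl', 'rb') as f:
        history_dict = pickle.load(f)  # dict of per-epoch metrics from model.fit
    return model, history_dict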
def export_train_balance(x_train, y_train):
    show_title(f"Balancing class {label_name} for {config.train_data_type}")
    x_train_balance, y_train_balance = generate_balance_data(x_train, y_train)
    from src.base.config import train_balance_data_type
    from src.base.config import x_train_balance_file_name, y_train_balance_file_name
    save_model_data(x_train_balance, data_file_path + x_train_balance_file_name, train_balance_data_type)
    save_model_data(y_train_balance, data_file_path + y_train_balance_file_name, train_balance_data_type)
def main():
    from keras_preprocessing.sequence import pad_sequences
    from src.data.show_data import show_result, show_parameters
    # load_test_data was missing from the original import list but is used below
    from src.data.load_data import load_train_data, load_train_val_data, load_val_data, load_test_data

    show_title("Building the network model")
    show_parameters()
    model = construct_model()
    model.summary()
    save_model_m0(model)

    show_title("Loading and padding {}".format(train_data_type))
    x_train_val, y_train_val = load_train_val_data()
    x_train_val_seq = pad_sequences(x_train_val, maxlen=max_len, padding='post')
    x_val, y_val = load_val_data()
    x_val_seq = pad_sequences(x_val, maxlen=max_len, padding='post')

    show_title("Training the network model with a validation set")
    history = model.fit(x={'creative_id': x_train_val_seq}, y=y_train_val, epochs=epochs,
                        batch_size=batch_size, validation_data=(x_val_seq, y_val), verbose=2)
    save_model_m1(history, model)

    show_title("Loading and padding the test data set")
    x_test, y_test = load_test_data()
    x_test_seq = pad_sequences(x_test, maxlen=max_len, padding='post')
    results = model.evaluate({'creative_id': x_test_seq}, y_test, verbose=0)
    predictions = model.predict({'creative_id': x_test_seq}).squeeze()
    show_result(results, predictions, y_test)

    show_title("Training the network model without a validation set, with half the epochs")
    x_train, y_train = load_train_data()
    x_train_seq = pad_sequences(x_train, maxlen=max_len, padding='post')
    # history = model.fit({'creative_id': x_train_seq}, y_train, epochs=epochs, batch_size=batch_size,
    #                     validation_split=0.2, verbose=2)
    history = model.fit({'creative_id': x_train_seq}, y_train, epochs=epochs // 2,
                        batch_size=batch_size, verbose=2)
    save_model_m2(history, model)
    results = model.evaluate({'creative_id': x_test_seq}, y_test, verbose=0)
    predictions = model.predict({'creative_id': x_test_seq}).squeeze()
    show_result(results, predictions, y_test)
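# Minimal sketch (illustrative values only) of what pad_sequences with
# padding='post' does to the ragged lists above: short sequences are
# right-padded with 0, and sequences longer than maxlen are truncated from the
# front because the default is truncating='pre'.
from keras_preprocessing.sequence import pad_sequences
demo = pad_sequences([[3, 1, 4], [1, 5, 9, 2, 6, 5]], maxlen=5, padding='post')
# demo -> [[3, 1, 4, 0, 0],
#          [5, 9, 2, 6, 5]]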
def export_day_list_data():
    from src.data.load_data import load_original_data
    show_title("Loading the raw data")
    x_csv, y_csv = load_original_data()
    from src.data.generate_data import generate_day_list_data
    show_title("Exporting the untruncated list of each user's daily access data")
    lst_data, y_data = generate_day_list_data(x_csv, y_csv)
    save_model_data(lst_data, data_file_path + config.lst_data_file_name, config.base_data_type)
    save_model_data(y_data, data_file_path + config.y_data_file_name, config.base_data_type)
    return lst_data, y_data
def save_word2vec_data(x_creative_id, creative_id_window, file_path):
    """Save the data used to train word2vec.

    :param x_creative_id: the word lists (creative_id sequences) to pickle
    :param creative_id_window: creative_id window size, used in the file name
    :param file_path: directory the file is written to
    :return: None
    """
    import pickle
    file_name = file_path + 'creative_id_{0}'.format(creative_id_window)
    show_title("Saving data set: {0}".format(file_name))
    with open(file_name, 'wb') as f:
        pickle.dump(x_creative_id, f, -1)  # -1: use the highest pickle protocol
    print("Word2Vec data saved successfully.")
def train_word2vec_model_with_gensim(words_lists):
    from gensim.models import Word2Vec
    from src.base.config import embedding_size, embedding_window
    show_title(f"Training the word2vec({embedding_size}_{embedding_window}) model")
    # Note: `size` and `iter` are gensim 3.x parameter names; gensim 4.0
    # renamed them to `vector_size` and `epochs`.
    model = Word2Vec(
        words_lists,
        size=embedding_size,
        window=embedding_window,
        min_count=1,
        seed=config.seed,
        workers=8,
        sg=0,  # 0: CBOW; 1: Skip-Gram
        iter=20,
        sorted_vocab=False,
        batch_words=4096)
    return model
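# Usage sketch tying the word2vec helpers together. The pickle loading below is
# an assumption mirroring save_word2vec_data, not an API from this repo.
if __name__ == '__main__':
    import pickle
    from src.base.config import data_w2v_path, creative_id_window
    with open(data_w2v_path + 'creative_id_{0}'.format(creative_id_window), 'rb') as f:
        words_lists = pickle.load(f)
    model_w2v = train_word2vec_model_with_gensim(words_lists)
    save_word2vec_weights(model_w2v)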
def export_train_test_data(x_data, y_data):
    from sklearn.model_selection import train_test_split
    x_data = x_data[0:config.user_id_max]
    show_title("Splitting into training and test data sets")
    x_train, x_test, y_train, y_test = train_test_split(
        x_data, y_data, random_state=config.seed, stratify=y_data)
    from src.base.config import train_data_type
    from src.base.config import x_train_file_name, y_train_file_name
    save_model_data(x_train, data_file_path + x_train_file_name, train_data_type)
    save_model_data(y_train, data_file_path + y_train_file_name, train_data_type)
    from src.base.config import test_data_type
    from src.base.config import x_test_file_name, y_test_file_name
    save_model_data(x_test, data_file_path + x_test_file_name, test_data_type)
    save_model_data(y_test, data_file_path + y_test_file_name, test_data_type)
    return x_train, y_train
def export_val_data(x_train, y_train):
    from sklearn.model_selection import train_test_split
    show_title("Splitting into training and validation data sets")
    x_train_val, x_val, y_train_val, y_val = train_test_split(
        x_train, y_train, random_state=config.seed, stratify=y_train)
    from src.base.config import train_val_data_type
    from src.base.config import x_train_val_file_name, y_train_val_file_name
    save_model_data(x_train_val, data_file_path + x_train_val_file_name, train_val_data_type)
    save_model_data(y_train_val, data_file_path + y_train_val_file_name, train_val_data_type)
    from src.base.config import val_data_type
    from src.base.config import x_val_file_name, y_val_file_name
    save_model_data(x_val, data_file_path + x_val_file_name, val_data_type)
    save_model_data(y_val, data_file_path + y_val_file_name, val_data_type)
    return x_train_val, y_train_val
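# Quick sketch of the stratified split used above, with toy data: stratify
# keeps the label ratio identical on both sides, and when test_size is omitted
# train_test_split holds out 25% by default.
from sklearn.model_selection import train_test_split
y_demo = [0] * 8 + [1] * 4
x_demo = list(range(12))
x_tr, x_te, y_tr, y_te = train_test_split(x_demo, y_demo, random_state=42, stratify=y_demo)
assert sorted(y_te) == [0, 0, 1]  # 2:1 label ratio preserved in the 25% hold-out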
def main():
    show_title("Data cleaning started...")
    from src.data.export_data import export_base_data
    x_data, y_data = export_base_data()
    # from src.data.load_data import load_base_data
    # x_data, y_data = load_base_data()
    from src.data.export_data import export_train_test_data
    x_train, y_train = export_train_test_data(x_data, y_data)
    # from src.data.load_data import load_train_data
    # x_train, y_train = load_train_data()
    # export_train_balance(x_train, y_train)
    from src.data.export_data import export_val_data
    x_train_val, y_train_val = export_val_data(x_train, y_train)
    # from src.data.load_data import load_val_data
    # x_train_val, y_train_val = load_val_data()
    # export_val_balance(x_train_val, y_train_val)
    show_title("Data cleaning finished!")
def save_word2vec_weights(model_w2v):
    """Save the weights trained by word2vec.

    :param model_w2v: the trained gensim Word2Vec model
    :return: None
    """
    from src.base.tools import show_title, get_w2v_file_name
    file_name = get_w2v_file_name()
    show_title(f"Saving the word2vec model {file_name}")
    # Initialize the embedding weight matrix; index 0 is a placeholder and is
    # therefore not counted as model data. (Note: setting the weights at index 0
    # to 0.5 was tried and did not work well.)
    embedding_weights = np.zeros(
        (config.creative_id_window, config.embedding_size))
    # Map each trained word to its array index via ord(word)
    # (gensim 3.x: the vocabulary lives on model_w2v.wv, not on the model itself)
    for word, index in model_w2v.wv.vocab.items():
        try:
            embedding_weights[ord(word), :] = model_w2v.wv[word]
        except KeyError:
            print(f"Bad key: {word}")
    model_w2v.save(file_name)
    np.save(file_name, embedding_weights)  # np.save appends '.npy' to file_name
    print("Word2Vec model saved successfully.")
def train_multi_output():
    import gc
    from src.data.show_data import show_result, show_parameters
    show_title("Building the network model")
    show_parameters()
    model = construct_model_single_input()
    model.summary()

    from src.model.save_model import save_model_m0
    show_title("Saving the network model")
    save_model_m0(model)

    from src.data.load_data import load_train_val_data
    x_train_val, y_train_val = load_train_val_data()
    from src.base.config import day_feature_idx
    x_train_val = single_data_reshape(day_feature_idx, x_train_val, y_train_val.shape[0])
    from src.data.load_data import load_val_data
    x_val, y_val = load_val_data()
    x_val = single_data_reshape(day_feature_idx, x_val, y_val.shape[0])

    show_title("Training the network model with a validation set")
    history = model.fit(x_train_val, y_train_val, epochs=epochs, batch_size=batch_size,
                        validation_data=(x_val, y_val), verbose=2)
    del x_train_val, x_val, y_train_val, y_val
    gc.collect()
    from src.model.save_model import save_model_m1
    save_model_m1(history, model)

    from src.data.load_data import load_test_data
    show_title("Loading and padding the test data set")
    x_test, y_test = load_test_data()
    x_test = single_data_reshape(day_feature_idx, x_test, y_test.shape[0])
    results = model.evaluate(x_test, y_test, verbose=0)
    predictions = model.predict(x_test).squeeze()
    show_result(results, predictions, y_test)

    show_title("Training the network model without a validation set, with half the epochs")
    from src.data.load_data import load_train_data
    show_title("Loading and padding {}".format(train_data_type))
    x_train, y_train = load_train_data()
    x_train = single_data_reshape(day_feature_idx, x_train, y_train.shape[0])
    history = model.fit(x_train, y_train, epochs=epochs // 2, batch_size=batch_size, verbose=2)
    from src.model.save_model import save_model_m2
    save_model_m2(history, model)
    results = model.evaluate(x_test, y_test, verbose=0)
    predictions = model.predict(x_test).squeeze()
    show_result(results, predictions, y_test)
def export_day_fix_sequence(lst_data):
    show_title("Processing the data into 91-day sequences, with a fixed-length data sequence per day")
    pass  # not implemented yet
def export_w2v_data(lst_data):
    show_title("Exporting the data used for Word2Vec training")
    from src.data.generate_data import generate_w2v_data
    x_w2v = generate_w2v_data(lst_data)
    from src.base.config import data_w2v_path, w2v_file_name, w2v_data_type
    save_model_data(x_w2v, data_w2v_path + w2v_file_name, w2v_data_type)
def load_word2vec_weights():
    file_name = get_w2v_file_name()
    show_title("Loading the word2vec model {0}".format(file_name))
    embedding_weights = np.load(file_name + '.npy', allow_pickle=True)
    print("Word2Vec model loaded successfully.")
    return embedding_weights
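# Sketch of how the loaded matrix would typically seed a Keras Embedding layer.
# The layer parameters below are assumptions for illustration; this repo's
# actual construct_model() is not shown here.
from keras.layers import Embedding
embedding_weights = load_word2vec_weights()
embedding_layer = Embedding(
    input_dim=config.creative_id_window,  # vocabulary size = number of rows
    output_dim=config.embedding_size,     # embedding dimension = number of columns
    weights=[embedding_weights],          # initialize from the word2vec matrix
    trainable=False)                      # keep the pretrained vectors frozen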
def export_user_fix_sequence(lst_data):
    show_title("Processing the data into a gapless data list that keeps repetitions")
    pass  # not implemented yet