def load_condition_stacking_main():
    tn_conf = TrainConfigure()
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #                                               'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(char_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/char_embed.pkl')
    # 3-fold variant (kept for reference, superseded by the 5-fold setup below):
    # name = 'model/stack_condition_model.pkl'
    # model_dir = 'model/stack/'
    # n_fold = 3
    name = 'model/stack_condition_model5.pkl'
    model_dir = 'model/stack5/'
    n_fold = 5
    conf = conditionmodelbase.ModelConfigure()
    conf.PE = True  # match the configuration used at training time in stacking_main_condition()
    stk_model = stacking(n_fold, name=name, is_condition=True)
    # Models are registered in the same order used at training time in
    # stacking_main_condition(), so the stacked meta-features line up.
    stk_model.add_model(ConditionConvModel, {"conf": conf, "char_embed_matrix": char_embed_matrix,
                                             "term_embed_matrix": term_embed_matrix,
                                             "name": model_dir + 'conditionconvmodel_PE.h5'})
    stk_model.add_model(ConditionGatedConvModel, {"conf": conf, "char_embed_matrix": char_embed_matrix,
                                                  "term_embed_matrix": term_embed_matrix,
                                                  "name": model_dir + 'conditiongatedconvmodel_PE.h5'})
    stk_model.add_model(ConditionGatedDeepCNNModel, {"conf": conf, "char_embed_matrix": char_embed_matrix,
                                                     "term_embed_matrix": term_embed_matrix,
                                                     "name": model_dir + 'conditiongateddeepcnnmodel_PE.h5'})
    stk_model.add_model(ConditionDPCNNModel, {"conf": conf, "char_embed_matrix": char_embed_matrix,
                                              "term_embed_matrix": term_embed_matrix,
                                              "name": model_dir + 'conditiondpcnnmodel_PE.h5'})
    stk_model.load()
    return stk_model
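# A minimal usage sketch (assuming the pickled artifacts exist and that the
# loaded ensemble expects the same six-part input assembled in
# stacking_main_condition() at the bottom of this file):
#
#     stk_model = load_condition_stacking_main()
#     y_prob = stk_model.predict([x, xe, xterm, xe_term, xfeat, xtopic])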
def encode_main():
    lda_file = 'data/lda_vec_val.pkl'
    import data_utils100
    import data_utils200
    tn_conf = TrainConfigure()
    # First pass: base-model predictions on the validation set.
    val_conf = ValidConfigure()
    val_conf200 = data_utils200.ValidConfigure()
    val_conf100 = data_utils100.ValidConfigure()
    ys_val = predict(tn_conf, lda_file, val_conf, val_conf100, val_conf200)
    # Second pass: predictions on the training set itself, reusing the same
    # entry point with training configurations in the "validation" slots.
    val_conf = TrainConfigure()
    val_conf200 = data_utils200.TrainConfigure()
    val_conf100 = data_utils100.TrainConfigure()
    ys_train = predict(tn_conf, lda_file, val_conf, val_conf100, val_conf200)
    data_utils.pickle_dump((ys_train, ys_val), 'data/stack_y.pkl')
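# A minimal sketch of how the dumped stack features could feed a second-level
# classifier. The hstack layout of ys_train/ys_val and the LogisticRegression
# choice are assumptions for illustration, not part of the original pipeline:
def train_meta_classifier_sketch():
    from sklearn.linear_model import LogisticRegression
    ys_train, ys_val = data_utils.pickle_load('data/stack_y.pkl')
    X_train = np.hstack(ys_train)  # concatenate each base model's class probabilities
    X_val = np.hstack(ys_val)
    y_train = data_utils.pickle_load(TrainConfigure().char_file)['y']
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    return clf.predict_proba(X_val)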
def train_main_pe():
    print('load data')
    import data_utils, training_utils
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    print('loading embed ...')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #                                               'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(char_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/char_embed.pkl')
    print('load embed done.')
    # Position-embedding inputs: one row of positions 0..N-1 per sample.
    xe_char = np.array([list(range(600)) for _ in range(y.shape[0])])
    xe_term = np.array([list(range(300)) for _ in range(y.shape[0])])
    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xterm, xfeat, xe_char, xe_term], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('define model')
    model = HybridModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
                        NUM_FEAT=8, name='hybridmodel_PE.h5', PE=True)  # +37
    print('feat shape', xfeat.shape)
    print('train')
    model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
def prepare_tn_data(filename='data/topic_train.txt'):
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    xterm = data_utils.pickle_load(tn_conf.term_file)
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    # Invert the vocabulary so term indices map back to term strings.
    reverse_dict = {v: k for k, v in term_vocab_dict.items()}
    N = xterm.shape[0]
    with open(filename, 'w') as fout:
        for i in range(N):
            xi = xterm[i]
            # Skip padding (0) and OOV (1) indices.
            term_list = [reverse_dict[idx] for idx in xi
                         if idx != 0 and idx != 1 and idx in reverse_dict]
            fout.write(' '.join(term_list) + '\n')
    print('prepare data done.')
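# get_vec() below loads 'data/lda.dict' and 'data/LDA20.model'. A minimal
# sketch of how those artifacts could be built from the corpus written by
# prepare_tn_data() (train_lda_main is a hypothetical helper, not part of the
# original pipeline):
def train_lda_main(corpus_file='data/topic_train.txt', num_topics=20):
    with open(corpus_file, encoding='utf-8') as fin:
        docs = [line.split() for line in fin]
    dictionary = corpora.Dictionary(docs)
    dictionary.save('./data/lda.dict')
    bow_corpus = [dictionary.doc2bow(doc) for doc in docs]
    lda = gensim.models.LdaModel(bow_corpus, id2word=dictionary, num_topics=num_topics)
    lda.save('data/LDA20.model')
    print('LDA training done.')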
def get_vec(out_file, mode="train"):
    vec_len = 20
    dictionary = corpora.Dictionary.load('./data/lda.dict')
    lda = gensim.models.LdaModel.load('data/LDA20.model')
    if mode == "train":
        print('train')
        tn_conf = TrainConfigure()
    elif mode == "val":
        print('val')
        tn_conf = data_utils.ValidConfigure()
    else:
        print("test")
        tn_conf = data_utils.TestConfigure()
    # data_dict = data_utils.pickle_load(tn_conf.char_file)
    xterm = data_utils.pickle_load(tn_conf.term_file)
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    reverse_dict = {v: k for k, v in term_vocab_dict.items()}
    all_lda = []
    for xi in xterm:
        # Rebuild the token list, skipping padding (0) and OOV (1) indices.
        doc = [reverse_dict[idx] for idx in xi
               if idx != 0 and idx != 1 and idx in reverse_dict]
        doc_bow = dictionary.doc2bow(doc)
        # Densify the sparse (topic_id, probability) pairs into a fixed vector.
        lda_vec = np.zeros(vec_len)
        for topic_id, p in lda[doc_bow]:
            lda_vec[topic_id] = p
        all_lda.append(lda_vec)
    data_utils.pickle_dump(np.array(all_lda), out_file)
    print('done.')
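# Example calls producing the topic-vector pickles consumed elsewhere in this
# file ('data/lda_vec.pkl' for training, 'data/lda_vec_val.pkl' for
# validation); the test output path is an assumption:
#
#     get_vec('data/lda_vec.pkl', mode='train')
#     get_vec('data/lda_vec_val.pkl', mode='val')
#     get_vec('data/lda_vec_test.pkl', mode='test')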
def predict():
    """Ensemble by averaging the predicted class probabilities of all models."""
    print('load data')
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    print('loading embed ...')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #                                               'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(char_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/char_embed.pkl')
    print('load embed done.')
    val_conf = ValidConfigure()
    data_dict = data_utils.pickle_load(val_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    ids = data_dict['id']
    xterm = data_utils.pickle_load(val_conf.term_file)
    xfeat = data_utils.pickle_load(val_conf.feat_file)
    xfeat = scaler.transform(xfeat)
    print('feat shape', xfeat.shape)
    xtopic = data_utils.pickle_load('data/lda_vec_val.pkl')
    # Position-embedding inputs for the full-length (600 char / 300 term) models.
    xe = np.array([list(range(600)) for _ in range(y.shape[0])])
    xe_term = np.array([list(range(300)) for _ in range(y.shape[0])])
    import data_utils100
    val_conf100 = data_utils100.ValidConfigure()
    data_dict100 = data_utils.pickle_load(val_conf100.char_file)
    x100 = data_dict100['x']
    xterm100 = data_utils.pickle_load(val_conf100.term_file)
    xe100 = np.array([list(range(100)) for _ in range(y.shape[0])])
    xe_term100 = np.array([list(range(100)) for _ in range(y.shape[0])])
    import data_utils200
    val_conf200 = data_utils200.ValidConfigure()
    data_dict200 = data_utils.pickle_load(val_conf200.char_file)
    x200 = data_dict200['x']
    xterm200 = data_utils.pickle_load(val_conf200.term_file)
    xe200 = np.array([list(range(200)) for _ in range(y.shape[0])])
    xe_term200 = np.array([list(range(200)) for _ in range(y.shape[0])])
    ys = []
    print('define model')
    model = HybridDenseModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
                             NUM_FEAT=8, PE=True, name='hybriddensemodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    model = HybridDenseMAModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
                               NUM_FEAT=8, PE=True, name='hybriddensemodelma_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('dense model done.')
    model = HybridSEModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
                          NUM_FEAT=8, PE=True, name='hybridsemodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('se model done.')
    # print('start len 100 model')
    # model = HybridConvModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
    #                         MAX_LEN=100, MAX_LEN_TERM=100, NUM_FEAT=8, PE=True,
    #                         name='hybridconvmodel_n100.h5')
    # model.load_weights()
    # y = model.predict([x100, xe100, xterm100, xe_term100, xfeat, xtopic])
    # ys.append(y)
    # del model
    # print('hybrid conv model done.')
    #
    # model = HybridGatedDeepCNNModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
    #                                 MAX_LEN=100, MAX_LEN_TERM=100, NUM_FEAT=8, PE=True,
    #                                 name='hybridgateddeepcnnmodel_n100.h5')
    # model.load_weights()
    # y = model.predict([x100, xe100, xterm100, xe_term100, xfeat, xtopic])
    # ys.append(y)
    # del model
    # print('hybrid gated deep cnn model done.')
    #
    # model = HybridRCNNModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
    #                         MAX_LEN=100, MAX_LEN_TERM=100, NUM_FEAT=8, PE=True,
    #                         name='hybridrcnnmodel_n100.h5')
    # model.load_weights()
    # y = model.predict([x100, xe100, xterm100, xe_term100, xfeat, xtopic])
    # ys.append(y)
    # del model
    # print('hybrid RCNN model done.')
    print('start len 200 model')
    model = HybridConvModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
                            MAX_LEN=200, MAX_LEN_TERM=200, NUM_FEAT=8, PE=True,
                            name='hybridconvmodel_n200.h5')
    model.load_weights()
    y = model.predict([x200, xe200, xterm200, xe_term200, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid conv model done.')
    model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
                             MAX_LEN=200, MAX_LEN_TERM=200, NUM_FEAT=8, PE=True,
                             name='hybriddpcnnmodel_n200.h5')
    model.load_weights()
    y = model.predict([x200, xe200, xterm200, xe_term200, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid dpcnn model done.')
    model = HybridGatedConvTopicModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
                                      MAX_LEN=200, MAX_LEN_TERM=200, NUM_FEAT=8, PE=True,
                                      name='hybridgatedconvtopicmodel_n200.h5')
    model.load_weights()
    y = model.predict([x200, xe200, xterm200, xe_term200, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid gated conv topic model done.')
    model = HybridGatedDeepCNNModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
                                    MAX_LEN=200, MAX_LEN_TERM=200, NUM_FEAT=8, PE=True,
                                    name='hybridgateddeepcnnmodel_n200.h5')
    model.load_weights()
    y = model.predict([x200, xe200, xterm200, xe_term200, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid gated deep cnn model done.')
    model = HybridRCNNModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
                            MAX_LEN=200, MAX_LEN_TERM=200, NUM_FEAT=8, PE=True,
                            name='hybridrcnnmodel_n200.h5')
    model.load_weights()
    y = model.predict([x200, xe200, xterm200, xe_term200, xfeat, xtopic])
    ys.append(y)
    del model
    # This model is quite slow.
    model = ConditionAttModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
                              NUM_FEAT=8, PE=True, name='conditionattmodel_PE.h5', lr=0.001)
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    print('condition att model done.')
    model = ConditionConvModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
                               NUM_FEAT=8, PE=True, name='conditionconvmodel_PE.h5', lr=0.001)
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('condition conv model done.')
    model = ConditionDPCNNModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
                                NUM_FEAT=8, PE=True, name='conditiondpcnnmodel_PE.h5', lr=0.001)
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('condition dpcnn model done.')
    model = ConditionGatedConvModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
                                    NUM_FEAT=8, PE=True, name='conditiongatedconvmodel_PE.h5', lr=0.001)
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('condition gated conv model done.')
    model = ConditionGatedDeepCNNModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
                                       NUM_FEAT=8, PE=True, name='conditiongateddeepcnnmodel_PE.h5', lr=0.001)
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('condition gated deepcnn model done.')
    model = HybridAttModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
                           NUM_FEAT=8, PE=True, name='hybridattmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    print('hybrid att model done.')
    model = HybridConvModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
                            NUM_FEAT=8, PE=True, name='hybridconvmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid conv model done.')
    model = HybridDPCNNModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
                             NUM_FEAT=8, PE=True, name='hybriddpcnnmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid dpcnn model done.')
    model = HybridGatedDeepCNNModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
                                    NUM_FEAT=8, PE=True, name='hybridgateddeepcnnmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid gated deep cnn model done.')
    model = HybridRCNNModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
                            NUM_FEAT=8, PE=True, name='hybridrcnnmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    del model
    print('hybrid rcnn model done.')
    model = HybridGatedConvTopicModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
                                      NUM_FEAT=8, PE=True, name='hybridgatedconvtopicmodel_PE.h5')
    model.load_weights()
    y = model.predict([x, xe, xterm, xe_term, xfeat, xtopic])
    ys.append(y)
    print('hybrid gated conv topic done.')
    y = fasttextmodel.predict_char()
    ys.append(y)
    y = fasttextmodel.predict_term()
    ys.append(y)
    print(y.shape)
    print('fast text done.')
    # hybrid model
    # model = HybridModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix, NUM_FEAT=8)  # + 37
    # model.load_weights()
    # y = model.predict([x, xterm, xfeat])
    # ys.append(y)
    # print(y.shape)
    # print('hybrid model done.')
    labels = ['人类作者', '自动摘要', '机器作者', '机器翻译']
    y_pred = np.mean(ys, axis=0)
    y_pred = convert_y(y_pred)
    out_file = 'result.csv'
    with open(out_file, 'w', encoding='utf-8') as fout:
        for doc_id, yi in zip(ids, y_pred):
            fout.write('{},{}\n'.format(doc_id, labels[yi]))
    print('done.')
def predict2():
    """Ensemble by majority vote over each model's hard classification results."""
    print('load data')
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    print('loading embed ...')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #                                               'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(char_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/char_embed.pkl')
    print('load embed done.')
    val_conf = ValidConfigure()
    data_dict = data_utils.pickle_load(val_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    ids = data_dict['id']
    xterm = data_utils.pickle_load(val_conf.term_file)
    xfeat = data_utils.pickle_load(val_conf.feat_file)
    xfeat = scaler.transform(xfeat)
    xe = np.array([list(range(600)) for _ in range(y.shape[0])])
    ys = []
    print('define model')
    # hybrid model
    model = HybridModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
                        NUM_FEAT=8)  # + 37
    print('feat shape', xfeat.shape)
    model.load_weights()
    y = model.predict([x, xterm, xfeat])
    ys.append(convert_onehot(y))
    print('hybrid model done.')
    # CNN model (char)
    model = CharModel(embed_matrix=char_embed_matrix)
    model.load_weights()
    y = model.predict(x)
    ys.append(convert_onehot(y))
    print('char model done.')
    model = CharModel(embed_matrix=char_embed_matrix, name='charmodel_PE.h5', PE=True)
    model.load_weights()
    y = model.predict([x, xe])
    ys.append(convert_onehot(y))
    print('char model done.')
    model = CharModel(embed_matrix=char_embed_matrix, name='charmodel_PE_OE.h5', PE=True)
    model.load_weights()
    y = model.predict([x, xe])
    ys.append(convert_onehot(y))
    print('char model done.')
    # CNN (term)
    model = TermModel(embed_matrix=term_embed_matrix)
    model.load_weights()
    y = model.predict(xterm)
    ys.append(convert_onehot(y))
    print('term model done.')
    model = DeepCNNModel(embed_matrix=char_embed_matrix)
    model.load_weights()
    y = model.predict(x)
    ys.append(convert_onehot(y))
    print('deep cnn done.')
    # # attention model (char)
    # model = AttModel(MAX_LEN=600, name='charattmodel.h5', embed_matrix=char_embed_matrix)
    # model.load_weights()
    # y = model.predict(x)
    # ys.append(convert_onehot(y))
    # # attention model (term)
    # model = AttModel(MAX_LEN=300, embed_matrix=term_embed_matrix)
    # model.load_weights()
    # y = model.predict(xterm)
    # ys.append(convert_onehot(y))
    #
    # model = ConditionModel(embed_matrix=char_embed_matrix)
    # model.load_weights()
    # y = model.predict(x)
    # ys.append(convert_onehot(y))
    model = SSCharModel(embed_matrix=char_embed_matrix, name='sscharmodel_PE.h5', PE=True, train_embed=True)
    model.load_weights()
    y = model.predict([x, xe])
    ys.append(convert_onehot(y))
    model = SSCharModel(embed_matrix=char_embed_matrix, train_embed=True)
    model.load_weights()
    y = model.predict(x)
    ys.append(convert_onehot(y))
    model = GatedConvModel(embed_matrix=char_embed_matrix, name='gatedconvmodel_PE.h5', PE=True)
    model.load_weights()
    y = model.predict([x, xe])
    ys.append(convert_onehot(y))
    model = GatedConvModel(embed_matrix=char_embed_matrix, train_embed=True)
    model.load_weights()
    y = model.predict(x)
    ys.append(convert_onehot(y))
    model = GatedDeepCNNModel(embed_matrix=char_embed_matrix, name='gateddeepcnnmodel_PE.h5', PE=True, train_embed=True)
    model.load_weights()
    y = model.predict([x, xe])
    ys.append(convert_onehot(y))
    model = GatedDeepCNNModel(embed_matrix=char_embed_matrix, train_embed=True)
    model.load_weights()
    y = model.predict(x)
    ys.append(convert_onehot(y))
    labels = ['人类作者', '自动摘要', '机器作者', '机器翻译']
    # Averaging one-hot votes and taking the argmax implements majority voting.
    y_pred = np.mean(ys, axis=0)
    y_pred = convert_y(y_pred)
    out_file = 'result.csv'
    with open(out_file, 'w', encoding='utf-8') as fout:
        for doc_id, yi in zip(ids, y_pred):
            fout.write('{},{}\n'.format(doc_id, labels[yi]))
    print('done.')
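# For reference, plausible reconstructions of the two conversion helpers used
# above; their real definitions live elsewhere in the project, so these are
# assumptions inferred from the call sites:
#
#     def convert_onehot(y_prob):   # soft probabilities -> one-hot votes
#         one_hot = np.zeros_like(y_prob)
#         one_hot[np.arange(len(y_prob)), y_prob.argmax(axis=1)] = 1
#         return one_hot
#
#     def convert_y(y_prob):        # probabilities -> class indices
#         return y_prob.argmax(axis=1)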
    def load_weights(self, name=None):
        if name is None:
            save_path = self.name
        else:
            save_path = name
        self.model.load_weights(save_path)

    def predict(self, x):
        y_pred = self.model.predict(x, batch_size=512)
        return y_pred


if __name__ == '__main__':
    import sys
    tn_conf = TrainConfigure()
    if len(sys.argv) > 1 and sys.argv[1] == 'char':
        if len(sys.argv) > 2 and sys.argv[2] == 'pe':
            print('define char model with position embedding')
            print('load data')
            import data_utils, training_utils
            data_dict = data_utils.pickle_load(tn_conf.char_file)
            y = to_categorical(data_dict['y'])
            char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
            char_embed_matrix = data_utils.load_embedding(char_vocab_dict,
                                                          'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                          dump_path='data/char_embed.pkl')
            xe = np.array([list(range(600)) for _ in range(y.shape[0])])
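            # A sketch of how this branch plausibly continues, mirroring
            # train_main() below; the split and model arguments here are
            # assumptions, not the original code:
            x = data_dict['x']
            model = CharModel(embed_matrix=char_embed_matrix,
                              name='charmodel_PE.h5', PE=True)
            x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe], y, shuffle=False)
            x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
            model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)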
def train_main():
    print('load data')
    import data_utils, training_utils
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    print('loading embed ...')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #                                               'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(char_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/char_embed.pkl')
    print('load embed done.')
    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xterm, xfeat], y, shuffle=False)
    x_tn, y_tn, x_val, y_val = training_utils.split(x_tn, y_tn, shuffle=False)
    print('define model')
    model = HybridModel(char_embed_matrix=char_embed_matrix, term_embed_matrix=term_embed_matrix,
                        NUM_FEAT=8)  # +37
    print('feat shape', xfeat.shape)
    import sys
    if len(sys.argv) <= 1 or sys.argv[1] == 'train':
        print('train')
        model.train(x_tn, y_tn, x_val, y_val, x_ts, y_ts)
    if len(sys.argv) > 1 and sys.argv[1] == 'val':
        val_conf = ValidConfigure()
        data_dict = data_utils.pickle_load(val_conf.char_file)
        y = to_categorical(data_dict['y'])
        x = data_dict['x']
        ids = data_dict['id']
        xterm = data_utils.pickle_load(val_conf.term_file)
        xfeat = data_utils.pickle_load(val_conf.feat_file)
        xfeat = scaler.transform(xfeat)
        model.load_weights()
        model.test([x, xterm, xfeat], ids, val_conf.out_file)
    if len(sys.argv) > 1 and sys.argv[1] == 'error':
        # Error analysis on the held-out test split; texts are indexed from
        # the start of that split.
        start_index = y_tn.shape[0] + y_val.shape[0]
        texts = data_utils.load_all_text(tn_conf)
        model.load_weights()
        model.error_analysis(x_ts, y_ts, texts, start_index)
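# Typical invocations of train_main() above (the script name is illustrative):
#
#     python main.py train    # train the hybrid model
#     python main.py val      # write validation-set predictions
#     python main.py error    # error analysis on the held-out split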
def stacking_main_condition():
    print('load data')
    tn_conf = TrainConfigure()
    data_dict = data_utils.pickle_load(tn_conf.char_file)
    y = to_categorical(data_dict['y'])
    x = data_dict['x']
    xterm = data_utils.pickle_load(tn_conf.term_file)
    xfeat = data_utils.pickle_load(tn_conf.feat_file)
    # normalization
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(xfeat)
    data_utils.pickle_dump(scaler, tn_conf.feat_norm)
    xfeat = scaler.transform(xfeat)
    xe = np.array([list(range(600)) for _ in range(y.shape[0])])
    xe_term = np.array([list(range(300)) for _ in range(y.shape[0])])
    xtopic = data_utils.pickle_load('data/lda_vec.pkl')
    print('loading embed ...')
    term_vocab_dict = data_utils.pickle_load(tn_conf.term_dict)
    term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/term_embed.pkl')
    # term_embed_matrix = data_utils.load_embedding(term_vocab_dict,
    #                                               'data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
    #                                               dump_path='data/term_embed_ww.pkl')
    char_vocab_dict = data_utils.pickle_load(tn_conf.char_dict)
    char_embed_matrix = data_utils.load_embedding(char_vocab_dict,
                                                  'data/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
                                                  dump_path='data/char_embed.pkl')
    print('load embed done.')
    # 3-fold variant (kept for reference, superseded by the 5-fold setup below):
    # name = 'model/stack_condition_model.pkl'
    # model_dir = 'model/stack/'
    # n_fold = 3
    name = 'model/stack_condition_model5.pkl'
    model_dir = 'model/stack5/'
    n_fold = 5
    stk_model = stacking(n_fold, name=name, is_condition=True)
    conf = conditionmodelbase.ModelConfigure()
    conf.PE = True
    stk_model.add_model(ConditionConvModel, {"conf": conf, "char_embed_matrix": char_embed_matrix,
                                             "term_embed_matrix": term_embed_matrix,
                                             "name": model_dir + 'conditionconvmodel_PE.h5'})
    stk_model.add_model(ConditionGatedConvModel, {"conf": conf, "char_embed_matrix": char_embed_matrix,
                                                  "term_embed_matrix": term_embed_matrix,
                                                  "name": model_dir + 'conditiongatedconvmodel_PE.h5'})
    stk_model.add_model(ConditionGatedDeepCNNModel, {"conf": conf, "char_embed_matrix": char_embed_matrix,
                                                     "term_embed_matrix": term_embed_matrix,
                                                     "name": model_dir + 'conditiongateddeepcnnmodel_PE.h5'})
    conf.lr = 0.0005  # the DPCNN variant uses a lower learning rate
    stk_model.add_model(ConditionDPCNNModel, {"conf": conf, "char_embed_matrix": char_embed_matrix,
                                              "term_embed_matrix": term_embed_matrix,
                                              "name": model_dir + 'conditiondpcnnmodel_PE.h5'})
    # Subsample a small fraction for quick testing:
    # x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe, xterm, xe_term, xfeat, xtopic], y,
    #                                               split_ratio=0.005, shuffle=False)
    # x_tn, y_tn, x_ts, y_ts = training_utils.split(x_tn, y_tn, shuffle=False)
    x_tn, y_tn, x_ts, y_ts = training_utils.split([x, xe, xterm, xe_term, xfeat, xtopic], y,
                                                  split_ratio=0.95)
    stk_model.fit(x_tn, y_tn)
    # joblib.dump(stk_model, 'model/stack_model_3.pkl')
    y_pred = stk_model.predict(x_ts)
    # sklearn metrics expect (y_true, y_pred) order.
    acc = accuracy_score(training_utils.convert_y(y_ts), training_utils.convert_y(y_pred))
    print(acc)
    cnf_matrix = confusion_matrix(training_utils.convert_y(y_ts), training_utils.convert_y(y_pred))
    print(cnf_matrix)
    stk_model.save()
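# The stacking wrapper above rests on out-of-fold predictions: each base model
# is trained n_fold times and predicts only the fold it never saw, so the
# meta-features are unbiased. A self-contained sketch of that scheme with
# sklearn-style estimators (an illustration in the same spirit as the
# project's stacking class, not its actual implementation):
def oof_probabilities(make_model, X, y, n_fold=5):
    from sklearn.model_selection import KFold
    oof = None
    for tn_idx, ts_idx in KFold(n_splits=n_fold).split(X):
        model = make_model()               # fresh model per fold
        model.fit(X[tn_idx], y[tn_idx])
        proba = model.predict_proba(X[ts_idx])
        if oof is None:
            oof = np.zeros((len(X), proba.shape[1]))
        oof[ts_idx] = proba                # each row is predicted exactly once
    return oof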