def main(): """ Main function of test.py Arguments: modelname: String, name of the model datapath: The testing file subtask: String, "A" or "B" or "C" Outputs: subtask + [subtask]/result/[modelname]/res.pred """ modelname = args.modelname datapath = args.datapath subtask = args.subtask dm = DataManager(subtask) dm.load_tokenizer( os.path.join("subtask" + subtask, "models", modelname, "word2idx.pkl"), os.path.join("subtask" + subtask, "models", modelname, "idx2word.pkl")) dm.add_data("test", datapath) dm.to_sequence(40, 40) (test_Q, test_C), qidlist = dm.get_data("test") print("test_Q", test_Q[0:2]) print("test_C", test_C[0:2]) print("qidlist", qidlist[0:2]) model = load_model( os.path.join("subtask" + subtask, "models", modelname, "model.h5")) result = model.predict([test_Q, test_C], batch_size=128, verbose=1) print("result", result[0:2]) if subtask == "A": outputA(qidlist, result, modelname) elif subtask == "B": outputB(qidlist, result, modelname) elif subtask == "C": outputC(qidlist, result, modelname)
def new_process_xy(tokenpath, path2x, path2y):
    dm = DataManager()
    dm.add_data('seed', '0samples.csv')
    dm.add_data('truth', '0samples.csv')
    dm.tokenize(230000)      # vocab size
    dm.save_tokenizer(tokenpath)
    dm.to_sequence(1)        # max length
    dm.save_sequence(path2x)
    dm.tosave_label(path2y)
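# Illustrative call only (the three paths below are placeholders, not taken
# from the source): the helper writes the fitted tokenizer, the padded
# sequences (X) and the labels (Y) to the given locations.
new_process_xy('token.pkl', 'train_x.pkl', 'train_y.pkl')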
def main():
    path_pfx = ''
    max_len = 37

    dm = DataManager()
    dm.add_data('test', os.path.join(sys.argv[1]), False, True)
    print(len(dm.data['test'][0]))
    dm.preprocessing()
    dm.load_word2vec(os.path.join(path_pfx, 'model/word2vec'))
    # dm.load_tokenizer(os.path.join(path_pfx, 'token.pkl'))
    dm.to_sequence(max_len, use_pretrain=True)

    result = predict(dm.data['test'][0], path_pfx)
    write(sys.argv[2], result)
    print('finished')
def main(argv):
    filename = argv[1]
    output_path = argv[2]
    output_path = output_path.replace('\r', '')
    output_path = output_path.replace('\r\n', '')

    dm = DataManager()
    dm.add_data('test_data', filename, False)
    dm.load_tokenizer('./model/token_25k.pk')
    dm.to_sequence(40)

    model = load_model('./model/00017-0.82720.h5')
    model.summary()

    val_proba = model.predict(dm.data['test_data'])
    val_classes = [1 if value > 0.5 else 0 for value in val_proba]
    out = pd.DataFrame(val_classes, columns=['label'])
    out.to_csv(output_path, index_label='id')
def main():
    voc_size = None
    max_len = 39
    path_pfx = ''

    dm = DataManager()
    dm.add_data('train', sys.argv[1])
    # dm.add_data('semi', os.path.join(path_pfx, 'training_nolabel.txt'), False)
    # dm.add_data('test', os.path.join(path_pfx, 'testing_data.txt'), False, True)
    dm.preprocessing()
    dm.load_word2vec(os.path.join(path_pfx, 'model/word2vec'))
    # dm.load_embedding_matrix(os.path.join(path_pfx, 'word2vec.wv.vectors.npy'))
    dm.to_sequence(max_len, use_pretrain=True)
    # dm.to_bow()
    print(max_len)

    # emb_mat = dm.get_embedding_matrix()
    emb_mat = None
    train(dm, voc_size=voc_size, max_len=max_len, emb_mat=emb_mat)
def argument_parser(L):
    token = L[1]

    dm = DataManager()
    dm.add_data('data/data.csv')
    X = dm.get_data('data')
    Y = dm.get_data('label')
    data = X[0]
    label = Y[0]

    logpath = os.path.join('log')
    if not os.path.exists(logpath):
        os.makedirs(logpath)

    if token == 'LinR':
        MSE, MAE = train(data, label, token)
        with open('log/LinR.csv', 'w') as f:
            f.write('MSE,MAE\n')
            f.write('{},{}\n'.format(MSE, MAE))
    else:
        bin_size = int(L[2])
        acc, pre, rec, f_score = train(data, label, token, bin_size=bin_size)
        with open('log/' + token + '-bins-' + str(bin_size) + '.csv', 'w') as f:
            f.write('accuracy,precision,recall,f-score\n')
            f.write('{},{},{},{}\n'.format(acc, pre, rec, f_score))
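# The snippet above never shows how argument_parser is invoked. A plausible,
# hypothetical entry point, assuming the script simply forwards sys.argv
# (the 'LogR' token and the bin size in the example are placeholders):
if __name__ == '__main__':
    # L[1] selects the model ('LinR' for linear regression, any other token
    # for a binned classifier) and L[2] the bin size, e.g.:
    #   python train.py LogR 4
    argument_parser(sys.argv)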
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    elif args.action == 'train_corpus':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    else:
        raise Exception('Implement your testing parser')

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists('./model/token_25k.pk'):
        dm.save_tokenizer('./model/token_25k.pk')
    embedding_w = dm.get_vec_model('emb_1.npy', args.embedding_dim)

    # convert to sequences
    dm.to_sequence(args.max_length)

    # initial model
    print('initial model...')
    model = simpleRNN(args, embedding_w)
    model.summary()

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning : load an existing model and keep training')
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)
    elif args.action == 'test':
        print('Warning : testing without loading any model')

    # training
    if args.action == 'train_corpus':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        checkpoint = ModelCheckpoint(
            filepath='./model/' + '{epoch:05d}-{val_acc:.5f}.h5',
            verbose=1,
            save_best_only=True,
            save_weights_only=False,
            monitor='val_acc',
            mode='max')
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            verbose=1,
                            shuffle=True,
                            callbacks=[checkpoint, earlystopping])
        # plot_figure(history)

    # semi-supervised training
    elif args.action == 'semi':
        earlystopping = EarlyStopping(monitor='val_acc', patience=10,
                                      verbose=1, mode='max')
        checkpoint = ModelCheckpoint(
            filepath='./model/semi/' + '{epoch:05d}-{val_acc:.5f}.h5',
            verbose=1,
            save_best_only=True,
            save_weights_only=False,
            monitor='val_acc',
            mode='max')
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')

        # label the semi-data with the current model
        semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
        dm.clean_data()
        dm.add_data('train_data', train_path, True)
        dm.add_data('test_data', test_path, False)
        dm.to_sequence(args.max_length)
        semi_X, semi_Y = dm.get_semi_data('test_data', semi_pred,
                                          args.threshold, args.loss_function)
        semi_X = np.concatenate((semi_X, X))
        semi_Y = np.concatenate((semi_Y, Y))
        print('-- semi_data size: %d' % (len(semi_X)))
        model = simpleRNN(args, embedding_w)

        # train
        history = model.fit(semi_X, semi_Y,
                            validation_data=(X_val, Y_val),
                            epochs=40,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])
        plot_figure(history)
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'test':
        dm.add_data('test_data', test_path, False)
    else:
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.action == 'token':
        dm.tokenize()
    else:
        # read existing tokenizer
        dm.load_tokenizer(args.token)
    # else:
    #     # create tokenizer on new data
    #     dm.tokenize()
    dm.save_tokenizer(args.token)

    # convert to sequences
    if args.action != 'token':
        dm.to_sequence(args.max_length)

    # initial model
    if args.action != 'token':
        print('initial model...')
        model = simpleRNN(args)
        print(model.summary())

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning : load an existing model and keep training')
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)
    elif args.action == 'test':
        print('Warning : testing without loading any model')

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc', patience=11,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path, verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc', mode='max')
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

    # testing
    elif args.action == 'test':
        X = dm.get_data('test_data')[0]
        predict = model.predict(X)
        result = [['id', 'label']]
        for i in range(len(predict)):
            a = [i]
            if predict[i][0] > 0.5:
                a.append(1)
            else:
                a.append(0)
            # a.append(predict[i][0])  # test
            # a.append(predict[i])
            result.append(a)
        cout = csv.writer(open(args.result_path, 'w'))
        cout.writerows(result)
        # implement after ensuring the output format

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc', patience=11,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path, verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc', mode='max')
        # repeat 10 times
        # for i in range(10):
        # label the semi-data
        semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
        semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                          args.threshold, args.loss_function)
        semi_X = np.concatenate((semi_X, X))
        semi_Y = np.concatenate((semi_Y, Y))
        # print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
        # train
        history = model.fit(semi_X, semi_Y,
                            validation_data=(X_val, Y_val),
                            epochs=20,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])
        if os.path.exists(save_path):
            print('load model from %s' % save_path)
            model.load_weights(save_path)
        else:
            raise ValueError("Can't find the file %s" % save_path)
def main():
    # limit gpu memory usage
    train_path = argv[1]
    semi_path = argv[2]
    # K.set_session(get_session(gpu_fraction))

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if action == 'train':
        dm.add_data('train_data', train_path, True)
        # dm.add_data('semi_data', semi_path, False)
    elif action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    else:
        raise Exception('Implement your testing parser')

    # prepare tokenizer
    print('get Tokenizer...')
    if not os.path.exists(tokenizer_save_path):
        dm.tokenize(20000)
        dm.save_tokenizer(tokenizer_save_path)
    else:
        dm.load_tokenizer(tokenizer_save_path)

    # Word2Vec
    print('get Word2Vec...')
    data_dic = dm.get_data()
    tokenizer = dm.get_tokenizer()
    # vocab_size = len(tokenizer.word_index) + 1
    # data_list = data_dic['train_data'][2] + data_dic['semi_data'][1]
    # data_list = data_dic['train_data']
    # w2v_model = Word2Vec(data_list, size=256, min_count=5, iter=16, workers=16)
    # w2v_model.save(word2vec_save_path)
    # w2v_model = Word2Vec.load(word2vec_save_path)
    w2v_model = pk.load(open('emb.pkl', 'rb'))

    # convert to sequences
    dm.to_sequence(max_length)
    # dm.to_bow()

    # initial model
    print('initial model...')
    model = simpleRNN()
    print(model.summary())
    labelnum = []

    # training
    if action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', val_ratio)
        X = embedding_vector(X, w2v_model, tokenizer)
        X_val = embedding_vector(X_val, w2v_model, tokenizer)
        earlystopping = EarlyStopping(monitor='val_acc', patience=15,
                                      verbose=1, mode='max')
        checkpoint = ModelCheckpoint(filepath=model_save_path, verbose=1,
                                     save_best_only=True,
                                     monitor='val_acc', mode='max')
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=nb_epoch,
                            batch_size=batch_size,
                            callbacks=[checkpoint, earlystopping])

    # semi-supervised training
    elif action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', val_ratio)
        semi_all_X = dm.get_data()['semi_data'][0]
        X = embedding_vector(X, w2v_model, tokenizer)
        X_val = embedding_vector(X_val, w2v_model, tokenizer)
        semi_all_X = embedding_vector(semi_all_X, w2v_model, tokenizer)
        X = np.array(X)
        X_val = np.array(X_val)
        semi_all_X = np.array(semi_all_X)
        earlystopping = EarlyStopping(monitor='val_acc', patience=5,
                                      verbose=1, mode='max')
        checkpoint = ModelCheckpoint(filepath=model_save_path, verbose=1,
                                     save_best_only=True,
                                     monitor='val_acc', mode='max')
        # repeat 10 times
        for i in range(10):
            # label the semi-data
            semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
            semi_X, semi_Y = getsemidata(semi_all_X, semi_pred, threshold)
            labelnum.append(semi_X.shape)
            semi_X = np.concatenate((semi_X, X), axis=0)
            semi_Y = np.concatenate((semi_Y, Y), axis=0)
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X, semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=batch_size,
                                callbacks=[checkpoint, earlystopping])
            if os.path.exists(model_save_path):
                print('load model from %s' % model_save_path)
                # reload the best checkpoint saved during this iteration
                model = load_model(model_save_path)
            else:
                raise ValueError("Can't find the file %s" % model_save_path)
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    K.set_session(get_session(args.gpu_fraction))

    # save_path is used below for the word2vec model and checkpoints,
    # so it has to be defined here
    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)
        print('load_path:', load_path)

    ##### read data #####
    dm = DataManager()
    w2v_path = os.path.join(args.save_dir, 'word2vec')
    print(w2v_path)

    if args.action == 'train':
        print('Loading data...')
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
        dm.add_test_data('test_data', args.test_path)
        test_data = dm.get_test_data('test_data')
        train_data = dm.get_data('train_data')
        semi_data = dm.get_data('semi_data')
        all_text = np.concatenate((train_data[0], semi_data[0], test_data),
                                  axis=0)
        print('Number of all_text:', all_text.shape[0])
        # print('Text sample:', all_text[0])

        print('Converting texts to word sequences...')
        text2word = []
        with_filter = 0
        if with_filter:
            for text in all_text:
                text2word.append(text_to_word_sequence(
                    text,
                    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                    lower=True, split=" "))
        if not with_filter:
            for text in all_text:
                text2word.append(text_to_word_sequence(
                    text, filters='', lower=True, split=" "))
        print('Word sequence sample:', text2word[0])

        if os.path.exists(w2v_path):
            print('Loading w2v_model from %s' % w2v_path)
            word_vec = gensim.models.Word2Vec.load(w2v_path)
            print('Vocabulary size:', len(word_vec.wv.vocab))
        else:
            print('Building word2vec model...')
            word_vec = gensim.models.Word2Vec(text2word, size=128, min_count=15)
            print('Vocabulary size:', len(word_vec.wv.vocab))
            if not os.path.isdir(save_path):
                os.makedirs(save_path)
            if not os.path.exists(os.path.join(save_path, 'word2vec')):
                word_vec.save(os.path.join(save_path, 'word2vec'))

        print('Converting train_data to vectors...')
        index_data = []
        i = 0
        for line in train_data[0]:
            index_data.append([])
            for word in line.split():
                if word in word_vec.wv:
                    # print(word, word_vec.wv.vocab[word].index)
                    index_data[i].append(word_vec.wv.vocab[word].index)
            i += 1

        embedding_matrix = np.zeros((len(word_vec.wv.vocab), 128))
        for i in range(len(word_vec.wv.vocab)):
            embedding_vector = word_vec.wv[word_vec.wv.index2word[i]]
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        index_data = pad_sequences(index_data, args.max_length)
    else:
        if os.path.exists(w2v_path):
            print('Loading w2v_model from %s' % w2v_path)
            word_vec = gensim.models.Word2Vec.load(w2v_path)
            print('Vocabulary size:', len(word_vec.wv.vocab))
            embedding_matrix = np.zeros((len(word_vec.wv.vocab), 128))
            for i in range(len(word_vec.wv.vocab)):
                embedding_vector = word_vec.wv[word_vec.wv.index2word[i]]
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
        else:
            print('Can not load w2v model, please train a w2v model first!')

    # print('get Tokenizer...')
    # if args.load_model is not None:
    #     # read existing tokenizer
    #     dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    # else:
    #     # create tokenizer on new data
    #     dm.tokenize(args.vocab_size)
    #
    # if not os.path.isdir(save_path):
    #     os.makedirs(save_path)
    # if not os.path.exists(os.path.join(save_path, 'token.pk')):
    #     dm.save_tokenizer(os.path.join(save_path, 'token.pk'))
    #
    # mat_train_data = dm.tokenizer.texts_to_matrix(train_data[0], mode='count')
    # mat_test_data = dm.tokenizer.texts_to_matrix(test_data, mode='count')

    # convert to sequences
    # dm.to_sequence(args.max_length)

    # initial model
    print('initial model...')
    # model = bow_model(args, mat_train_data)
    model = simpleRNN(args, embedding_matrix)
    print(model.summary())

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning : load an existing model and keep training')
        # path = os.path.join(load_path, 'model.h5')
        if os.path.exists(load_path):
            print('load model from %s' % load_path)
            model.load_weights(load_path)
        else:
            raise ValueError("Can't find the file %s" % load_path)
    elif args.action == 'test':
        print('Warning : testing without loading any model')

    # training
    if args.action == 'train':
        # (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        X, X_val, Y, Y_val = train_test_split(index_data, train_data[1],
                                              test_size=0.33, random_state=42)
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path, verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc', mode='max')
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])
        print(history.history.keys())
        print('Val_acc:', history.history['val_acc'])
        print('Train_acc:', history.history['acc'])

    # testing
    elif args.action == 'test':
        dm.add_test_data('test_data', args.test_path)
        test_data = dm.get_test_data('test_data')
        # convert to vectors of word indices
        index_test_data = []
        i = 0
        for line in test_data:
            index_test_data.append([])
            for word in line.split():
                if word in word_vec.wv:
                    # print(word, word_vec.wv.vocab[word].index)
                    index_test_data[i].append(word_vec.wv.vocab[word].index)
            i += 1
        index_test_data = pad_sequences(index_test_data, args.max_length)

        if not os.path.exists(args.result_path):
            os.makedirs(args.result_path)
        csv_path = os.path.join(args.result_path, 'prediction.csv')
        print("Predicting testing data...")
        Y_pred = model.predict(index_test_data)
        Y_pred = np.round(Y_pred)
        print('Saving result csv to', csv_path)
        with open(csv_path, 'w') as f:
            f.write('id,label\n')
            for i, v in enumerate(Y_pred):
                f.write('%d,%d\n' % (i, v))

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path, verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc', mode='max')
        # repeat the self-labeling loop
        for i in range(5):
            # label the semi-data
            semi_pred = model.predict(semi_all_X, batch_size=2048, verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold, args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X, semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=256,
                                callbacks=[checkpoint, earlystopping])
            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    else:
        dm.add_data('test_data', test_path, False)
        # raise Exception('Implement your testing parser')

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)
    # dm.to_bow()

    # initial model
    print('initial model...')
    model = simpleRNN(args)
    print(model.summary())

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning : load an existing model and keep training')
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)
    elif args.action == 'test':
        print('Warning : testing without loading any model')

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path, verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc', mode='max')
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])
        plot(history, args.model)
        # plot_model(model, to_file='./img/structure.png')

    # testing
    elif args.action == 'test':
        X = dm.get_data('test_data')
        print('Predict testing data...')
        result = model.predict(X)
        print('Save result...')
        saveResult(result, args.result_path)
        # raise Exception('Implement your testing function')

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path, verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc', mode='max')
        # repeat 10 times
        for i in range(10):
            # label the semi-data
            semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold, args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X, semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])
            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    K.set_session(get_session(args.gpu_fraction))

    save_path = args.save_dir
    if args.load_model is not None:
        load_path = args.save_dir

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
        dm.add_test_data('test_data', test_path)
    else:
        dm.add_test_data('test_data', test_path)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)

    # initial model
    print('initial model...')
    model = simpleRNN(args)
    print(model.summary())

    if args.load_model is not None:
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path, verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc', mode='max')
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

    # testing
    elif args.action == 'test':
        print(model.summary())
        [test_x] = dm.get_data('test_data')
        classes = model.predict(test_x, batch_size=32)
        with open(args.output_path, "w", encoding='utf-8') as f:
            spamwriter = csv.writer(f, delimiter=',')
            spamwriter.writerow(['id', 'label'])
            for i in range(len(classes)):
                if classes[i][0] < 0.5:
                    result = 0
                else:
                    result = 1
                spamwriter.writerow([str(i), str(result)])

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        [test_x] = dm.get_data('test_data')
        semi_all_X = np.concatenate((semi_all_X, test_x), axis=0)
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path, verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc', mode='max')
        # repeat the self-labeling loop
        for i in range(16):
            # label the semi-data
            semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold, args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X, semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])
            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
def main():
    dm = DataManager()
    dm.add_data('train_data', train_path, True)
    dm.add_data('semi_data', semi_path, False)

    print('Get Tokenizer...')
    dm.load_tokenizer('./token/token.pk')
    embedding_mat = dm.to_sequence(40, action)

    print('Initial model...')
    if action == 'train':
        model = RNN(embedding_mat)
        print(model.summary())
    elif action == 'semi':
        model = load_model('./model/model1.hdf5')
        print(model.summary())

    if action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', 0.2)
        earlystopping = EarlyStopping(monitor='val_acc', patience=30,
                                      verbose=1, mode='max')
        checkpoint = ModelCheckpoint(filepath='./model/model.hdf5', verbose=1,
                                     save_best_only=True,
                                     monitor='val_acc', mode='max')
        model.fit(X, Y,
                  validation_data=(X_val, Y_val),
                  epochs=80,
                  batch_size=512,
                  callbacks=[checkpoint, earlystopping])

    elif action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', 0.2)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        checkpoint = ModelCheckpoint(filepath='./model/model_semi.hdf5',
                                     verbose=1, save_best_only=True,
                                     monitor='val_acc', mode='max')
        for i in range(10):
            semi_pred = model.predict(semi_all_X, batch_size=2048, verbose=1)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred, 0.1)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            model.fit(semi_X, semi_Y,
                      validation_data=(X_val, Y_val),
                      epochs=2,
                      batch_size=512,
                      callbacks=[checkpoint, earlystopping])
            print('load model from ./model/model_semi.hdf5')
            model = load_model('./model/model_semi.hdf5')
import sys
import keras
import _pickle as pk
import numpy as np
from keras.models import Model, Sequential, load_model
from util import DataManager

# argv settings
test_path = sys.argv[1]
output_path = sys.argv[2]
mode = sys.argv[3]

# load data
dm = DataManager()
dm.add_data('test_data', test_path, False)

if mode == 'private':
    # tokenizer
    dm.load_tokenizer('./token/token.pk')
    # load model
    model = load_model('./model/model1.hdf5')
elif mode == 'public':
    # tokenizer
    dm.load_tokenizer('./token/token_filter.pk')
    # load model
    model = load_model('./model/model2.hdf5')

dm.to_sequence(40, 'test')
test_all_x = dm.get_data('test_data')
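# The script above stops after fetching the test sequences and never uses
# output_path. The lines below are a hedged completion sketch, not part of
# the original source: they assume dm.get_data returns a one-element list
# (as the other scripts above do with [test_x] = dm.get_data(...)) and reuse
# the id,label CSV format with a 0.5 decision threshold seen elsewhere in
# this collection.
test_x = test_all_x[0]
pred = model.predict(test_x, batch_size=1024, verbose=1)
with open(output_path, 'w') as f:
    f.write('id,label\n')
    for i, p in enumerate(pred):
        f.write('%d,%d\n' % (i, 1 if p[0] > 0.5 else 0))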