import gc
import os

import numpy as np
from imblearn import FunctionSampler
from keras import backend as K
from keras.callbacks import (CSVLogger, EarlyStopping, ModelCheckpoint,
                             ReduceLROnPlateau, TensorBoard)
from sklearn.model_selection import StratifiedKFold

# Project-local names (PARAMS, PATH, NAME, util, DataGenerator, CyclicLR,
# load_embedding_matrix, load_lastest, generate_model, balance_dataset,
# process_x, process_y) are assumed to be importable from this package.


def train(lang='pt'):
    params = PARAMS.copy()
    initial_epoch = 0

    # Load the tokenized data and the pre-trained fastText embedding matrix.
    X, Y = util.get_X_Y(data_type='keras_tokenized_tri', lang=lang, file_type="dump")
    X = np.asarray(X)
    params['embedding_matrix'] = load_embedding_matrix(
        name="fasttext_sg_tri_8", tokenizer='keras_tokenized_tri',
        lang=lang, model_type="dump")
    params["vocab_size"] = params['embedding_matrix'].shape[0]
    params["embedding_dim"] = params['embedding_matrix'].shape[1]

    if not os.path.exists(PATH):
        os.makedirs(PATH)
    if not os.path.exists(PATH + 'log_dir'):
        os.makedirs(PATH + 'log_dir')
    #params["loss"] = util.focal_loss(gamma=5., alpha=1588)

    # Resume from the latest checkpoint when one exists, otherwise build anew.
    latest_model = load_lastest(lang=lang)
    if latest_model is None:
        model, params = generate_model(params)
    else:
        model = latest_model[0]
        initial_epoch = latest_model[1]
    print(model.metrics_names)

    # Rebalance batches through imbalanced-learn's FunctionSampler.
    params['sampler'] = FunctionSampler(func=balance_dataset,
                                        kw_args={'cut_off': 0.5, 'random_state': 42})
    data_generator = DataGenerator(X, Y, lang=lang,
                                   process_x=process_x, process_y=process_y,
                                   batch_size=PARAMS['batch_size'],
                                   sampler=params['sampler'])
    #data_generator.remove_reliable_0(pct=1.0)
    validation_data = data_generator.get_validation_data()
    print('data_generator.x: ', data_generator[0][0][0:5])
    print('data_generator.y: ', data_generator[0][1][0:5])
    #params["class_weights"] = data_generator.get_classes_weights()

    reduce_lr = ReduceLROnPlateau(monitor='val_categorical_accuracy',
                                  factor=0.2, patience=3, verbose=1)
    early_stopping = EarlyStopping(monitor='val_categorical_accuracy',
                                   min_delta=0.02, patience=10, verbose=1)
    csv_logger = CSVLogger(PATH + 'training.log', append=True)
    tensorboard_callback = TensorBoard(log_dir=PATH + 'log_dir',
                                       batch_size=params["batch_size"])
    model_checkpoint = ModelCheckpoint(
        filepath=PATH + 'weights-{epoch:03d}-{val_categorical_accuracy:.4f}-' + lang + '.hdf5',
        monitor='val_categorical_accuracy', verbose=1, mode='max')
    params["callbacks"] = [model_checkpoint, early_stopping, tensorboard_callback,
                           csv_logger, reduce_lr]

    model.fit_generator(data_generator,
                        epochs=params["epochs"],
                        verbose=1,
                        callbacks=params["callbacks"],
                        validation_data=validation_data,
                        #workers=7,
                        #use_multiprocessing=True,
                        class_weight=params["class_weights"],
                        initial_epoch=initial_epoch)
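
# `balance_dataset` is referenced above but not defined in this file. The
# sketch below shows one plausible FunctionSampler-compatible resampler,
# assuming `cut_off` caps class frequency by undersampling; the name
# `balance_dataset_sketch`, its body, and that reading of `cut_off` are
# illustrative assumptions, not the project's actual implementation.
# imblearn calls `func(X, y, **kw_args)` and expects `(X_res, y_res)` back.
def balance_dataset_sketch(X, y, cut_off=0.5, random_state=42):
    rng = np.random.RandomState(random_state)
    classes, counts = np.unique(y, return_counts=True)
    # Cap every class at `cut_off` times the size of the largest class.
    max_per_class = int(counts.max() * cut_off)
    keep = []
    for cls in classes:
        idx = np.where(y == cls)[0]
        if len(idx) > max_per_class:
            idx = rng.choice(idx, size=max_per_class, replace=False)
        keep.append(idx)
    keep = np.concatenate(keep)
    rng.shuffle(keep)
    return X[keep], y[keep]
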
def train_kfold(lang='pt'):
    """K-fold cross-validation variant of train()."""
    params = PARAMS.copy()
    initial_epoch = 0

    X, Y = util.get_X_Y(data_type='keras_tokenized_tri', lang=lang, file_type="dump")
    X = np.asarray(X)
    params['embedding_matrix'] = load_embedding_matrix(
        name="fasttext_sg_tri_8", tokenizer='keras_tokenized_tri',
        lang=lang, model_type="dump")
    params["vocab_size"] = params['embedding_matrix'].shape[0]
    params["embedding_dim"] = params['embedding_matrix'].shape[1]

    if not os.path.exists(PATH):
        os.makedirs(PATH)
    if not os.path.exists(PATH + 'log_dir'):
        os.makedirs(PATH + 'log_dir')

    kfold_count = 1
    skf = StratifiedKFold(n_splits=params['k-folds'], shuffle=True)
    for train_index, test_index in skf.split(X, Y):
        print("TRAIN:", train_index.shape, "TEST:", test_index.shape)
        X_train, Y_train = X[train_index], Y[train_index]
        X_test, Y_test = X[test_index], Y[test_index]

        # Fresh sampler and model for every fold.
        params['sampler'] = FunctionSampler(
            func=balance_dataset,
            kw_args={'cut_off': 0.5, 'random_state': np.random.randint(0, 100000)})
        model, params = generate_model(params)
        print(model.metrics_names)

        data_g_train = DataGenerator(X_train, Y_train, lang=lang,
                                     process_x=process_x, process_y=process_y,
                                     sampler=params['sampler'],
                                     batch_size=PARAMS['batch_size'],
                                     separate_val=False)
        data_g_val = DataGenerator(X_test, Y_test, lang=lang,
                                   process_x=process_x, process_y=process_y,
                                   batch_size=PARAMS['batch_size'],
                                   separate_val=False)
        print('data_generator.x: ', data_g_train[0][0][0:5])
        print('data_generator.y: ', data_g_train[0][1][0:5])
        #params["class_weights"] = data_g_train.get_classes_weights()

        reduce_lr = ReduceLROnPlateau(monitor='val_categorical_accuracy',
                                      factor=0.2, patience=3, verbose=1)
        early_stopping = EarlyStopping(monitor='val_categorical_accuracy',
                                       min_delta=0.02, patience=10, verbose=1)
        csv_logger = CSVLogger(PATH + 'training.log', append=True)
        tensorboard_callback = TensorBoard(log_dir=PATH + 'log_dir',
                                           batch_size=params["batch_size"])
        model_checkpoint = ModelCheckpoint(
            filepath=PATH + 'weights-{epoch:03d}-{val_categorical_accuracy:.4f}-' + lang + '.hdf5',
            monitor='val_categorical_accuracy', verbose=1, mode='max')
        clr = CyclicLR(base_lr=1e-3, max_lr=2e-3, step_size=300.,
                       mode='exp_range', gamma=0.99994)
        params["callbacks"] = [tensorboard_callback, csv_logger, clr]

        model.fit_generator(data_g_train,
                            epochs=params["epochs"],
                            verbose=1,
                            callbacks=params["callbacks"],
                            #workers=7,
                            #use_multiprocessing=True,
                            class_weight=params["class_weights"],
                            initial_epoch=initial_epoch)

        # Predict on the held-out fold in fixed-size batches.
        batch_val = 1000
        data_g_val.set_batch_size(batch_val)
        y_pred = np.zeros(Y_test.shape)
        y_val = np.zeros(Y_test.shape)
        for i, (x, y) in enumerate(data_g_val):
            y_pred[i * batch_val:(i + 1) * batch_val] = np.argmax(model.predict(x), axis=1)
            y_val[i * batch_val:(i + 1) * batch_val] = np.argmax(y, axis=1)

        result = util.evaluate(y_val, y_pred)
        print('Model ' + NAME + ' val score on ' + lang + ', k-fold-' + str(kfold_count) + ': ', result)
        model.save(PATH + 'weights-{epoch:03d}-kfold{fold}-{result:.4f}-{lang}.hdf5'.format(
            epoch=params["epochs"], result=result, lang=lang, fold=kfold_count))

        # Free memory before the next fold.
        del data_g_train, data_g_val
        del model
        del X_train, Y_train, X_test, Y_test
        K.clear_session()
        gc.collect()
        kfold_count += 1
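
# The k-fold variant drives the learning rate with CyclicLR instead of
# ReduceLROnPlateau. Assuming the widely used Keras CyclicLR callback
# (bckenstler/CLR), its 'exp_range' mode follows the rule sketched below;
# this is a reference reimplementation for illustration, not the callback
# used above.
def clr_exp_range(iteration, base_lr=1e-3, max_lr=2e-3,
                  step_size=300., gamma=0.99994):
    # A cycle spans 2 * step_size iterations; x is the distance from the
    # cycle midpoint, and gamma**iteration decays the triangle's amplitude.
    cycle = np.floor(1 + iteration / (2 * step_size))
    x = np.abs(iteration / step_size - 2 * cycle + 1)
    return base_lr + (max_lr - base_lr) * max(0., 1. - x) * gamma ** iteration

# First-cycle peak: clr_exp_range(300) is roughly 1.98e-3, already slightly
# below max_lr because of the exponential decay.

# Assumed entry point (not in the original file); 'pt' is the only language
# code the defaults above mention.
if __name__ == '__main__':
    train(lang='pt')
    train_kfold(lang='pt')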