def vectorize_jobs(df_jobs, vectorizer_path, tfidfs_path, debug=False):
    """Fit a ``Vectorizer`` on the job texts and persist both results.

    A fresh ``Vectorizer`` is fit on ``df_jobs['text']``; the fitted
    vectorizer is then saved to ``vectorizer_path`` and the transformed
    matrix is saved to ``tfidfs_path``.

    Args:
        df_jobs: Indexable container whose ``'text'`` entry holds the job
            texts (presumably a pandas DataFrame -- confirm with caller).
        vectorizer_path: Destination handed to ``save_vectorizer``.
        tfidfs_path: Destination handed to ``save_tfidfs``.
        debug: When truthy, print a progress line before each step.

    Returns:
        None. The results are only written through the vectorizer's
        save methods.
    """

    def report(template, **fields):
        # Format lazily so nothing is rendered unless debug output is on.
        if debug:
            print(template.format(**fields))

    # Progress numbering starts at 2/5; step 1/5 is not part of this function.
    report('[Job Vectorization 2/5] Initializing Vectorizer \n')
    tfidf_model = Vectorizer()

    report('[Job Vectorization 3/5] Tranforming/Vectorizing data \n')
    # Fit on the text column and transform it in a single pass.
    job_texts = df_jobs['text']
    job_matrix = tfidf_model.fit_transform(job_texts)

    report('[Job Vectorization 4/5] saving vectorizer to {path} \n',
           path=vectorizer_path)
    tfidf_model.save_vectorizer(vectorizer_path)

    report('[Job Vectorization 5/5] saving tfidf to {path} \n',
           path=tfidfs_path)
    tfidf_model.save_tfidfs(job_matrix, tfidfs_path)
def main():
    """Run the training job: load data, preprocess, fit, and save.

    The data returned by ``load_data_from_gcs`` is passed through
    ``preprocess``. Every column except ``"price"`` is vectorized with a
    ``Vectorizer`` and, together with the ``"price"`` column, passed to
    ``ModelMLP.fit``. Finally the vectorizer and the model are saved via
    their own save methods. Each stage runs inside a ``timer`` context.
    """
    # NOTE(review): the original source was flattened onto one line, so the
    # exact extent of each ``with timer(...)`` block is reconstructed --
    # confirm which statements belong inside each timer.

    # Load the training data.
    with timer("train data load"):
        frame = load_data_from_gcs()

    # Preprocessing.
    with timer("preprocess"):
        frame = preprocess(frame)

    vectorizer = Vectorizer()
    features = frame.drop(columns="price")
    prices = frame["price"]

    with timer("training"):
        features = vectorizer.fit_transform(features)

        # Training: hyper-parameters handed to ModelMLP.
        optimizer_params = dict(type='adam', lr=5e-5)
        base_params = dict(
            input_dropout=0.2,
            hidden_layers=3,
            hidden_units=256,
            hidden_activation='relu',
            hidden_dropout=0.2,
            batch_norm='before_act',
            optimizer=optimizer_params,
            batch_size=64,
        )
        model = ModelMLP(base_params)
        model.fit(features, prices)

    with timer("save model"):
        # Save the model and the preprocessing pipeline.
        vectorizer.save_vectorizer()
        model.save_model()