예제 #1
0
def vectorize_jobs(df_jobs, vectorizer_path, tfidfs_path, debug=False):
    """Fit a ``Vectorizer`` on the job texts and save both results.

    A new ``Vectorizer`` is created, ``fit_transform`` is run on
    ``df_jobs['text']``, the vectorizer is saved to ``vectorizer_path`` and
    the transformed output is saved to ``tfidfs_path``.

    Args:
        df_jobs: Object indexable by ``'text'`` (presumably a pandas
            DataFrame of job postings -- confirm against the caller).
        vectorizer_path: Destination handed to ``save_vectorizer``.
        tfidfs_path: Destination handed to ``save_tfidfs``.
        debug: When truthy, print a progress message before each step.
            Numbering starts at 2/5; step 1/5 is not reported here.

    Returns:
        None. The outputs are only written through the vectorizer's save
        methods.
    """

    def announce(template, **fields):
        # Render the message only when debug output is requested, so that
        # nothing is formatted or printed in the silent case.
        if debug:
            print(template.format(**fields))

    announce('[Job Vectorization 2/5] Initializing Vectorizer \n')
    tfidf_model = Vectorizer()

    announce('[Job Vectorization 3/5] Tranforming/Vectorizing data \n')
    job_texts = df_jobs['text']
    # Fit on the job texts and transform them in a single pass.
    job_tfidfs = tfidf_model.fit_transform(job_texts)

    announce('[Job Vectorization 4/5] saving vectorizer to {path} \n',
             path=vectorizer_path)
    tfidf_model.save_vectorizer(vectorizer_path)

    announce('[Job Vectorization 5/5] saving tfidf to {path} \n',
             path=tfidfs_path)
    tfidf_model.save_tfidfs(job_tfidfs, tfidfs_path)
예제 #2
0
def main():
    """Run the training pipeline end to end and save the fitted artefacts.

    Stages, each wrapped in a labelled ``timer`` context:

    1. ``train data load`` -- fetch the data with ``load_data_from_gcs()``.
    2. ``preprocess`` -- run ``preprocess`` and instantiate the ``Vectorizer``.
    3. ``training`` -- fit/transform the features, then fit a ``ModelMLP``
       against the ``"price"`` column.
    4. ``save model`` -- save the vectorizer and the model; both save
       methods are called without arguments, so the destinations are
       decided inside those methods.

    Returns:
        None.
    """
    # Load the training data.
    with timer("train data load"):
        dataset = load_data_from_gcs()

    # Preprocessing; the vectorizer is also created inside this timed stage.
    with timer("preprocess"):
        dataset = preprocess(dataset)
        feature_pipeline = Vectorizer()

    # Split the frame into feature columns and the "price" target.
    features = dataset.drop(columns="price")
    target = dataset["price"]

    with timer("training"):
        features = feature_pipeline.fit_transform(features)

        # Training: hyper-parameters handed to the MLP model.
        optimizer_config = {
            'type': 'adam',
            'lr': 5e-5
        }
        mlp_config = {
            'input_dropout': 0.2,
            'hidden_layers': 3,
            'hidden_units': 256,
            'hidden_activation': 'relu',
            'hidden_dropout': 0.2,
            'batch_norm': 'before_act',
            'optimizer': optimizer_config,
            'batch_size': 64,
        }

        mlp = ModelMLP(mlp_config)
        mlp.fit(features, target)

    # Save the model and the preprocessing pipeline.
    with timer("save model"):
        feature_pipeline.save_vectorizer()
        mlp.save_model()