Exemplo n.º 1
0
def full_cycle(metric):
    csv_file_name = Constants.generate_file_name(metric, 'csv',
                                                 Constants.RESULTS_FOLDER,
                                                 None, None, False)
    json_file_name = Constants.generate_file_name(metric, 'json',
                                                  Constants.RESULTS_FOLDER,
                                                  None, None, False)
    print(json_file_name)
    print(csv_file_name)

    properties = Constants.get_properties_copy()
    results = evaluate_topic_model(metric)
    print(results)
    results.update(properties)

    ETLUtils.write_row_to_csv(csv_file_name, results)
    ETLUtils.write_row_to_json(json_file_name, results)
Exemplo n.º 2
0
def full_cycle(metric):
    csv_file_name = Constants.generate_file_name(
        metric, 'csv', Constants.RESULTS_FOLDER, None,
        None, False)
    json_file_name = Constants.generate_file_name(
        metric, 'json', Constants.RESULTS_FOLDER, None,
        None, False)
    print(json_file_name)
    print(csv_file_name)

    properties = Constants.get_properties_copy()
    results = evaluate_topic_model(metric)
    print(results)
    results.update(properties)

    ETLUtils.write_row_to_csv(csv_file_name, results)
    ETLUtils.write_row_to_json(json_file_name, results)
Exemplo n.º 3
0
def full_cycle():

    plant_random_seeds()
    my_records = load_records()
    preprocess_records(my_records)
    x_matrix, y_vector = transform(my_records)
    count_specific_generic(my_records)

    # Error estimation
    error_estimation_results = []
    best_classifier = None
    best_score = 0.0
    for classifier, params in PARAM_GRID_MAP.items():
        # print('Classifier: %s' % classifier)
        cv = StratifiedKFold(Constants.CROSS_VALIDATION_NUM_FOLDS)
        score = error_estimation(x_matrix, y_vector, params, cv,
                                 SCORE_METRIC).mean()
        error_estimation_results.append({
            'classifier':
            classifier,
            'accuracy':
            score,
            Constants.BUSINESS_TYPE_FIELD:
            Constants.ITEM_TYPE
        })
        print('%s score: %f' % (classifier, score))

        if score > best_score:
            best_score = score
            best_classifier = classifier

    # Model selection
    cv = StratifiedKFold(Constants.CROSS_VALIDATION_NUM_FOLDS)
    grid_search_cv = model_selection(x_matrix, y_vector,
                                     PARAM_GRID_MAP[best_classifier], cv,
                                     SCORE_METRIC)
    # best_model = grid_search_cv.best_estimator_.get_params()['classifier']
    # features_importance = best_model.coef_
    print('%s: %f' % (SCORE_METRIC, grid_search_cv.best_score_))
    print('best params', grid_search_cv.best_params_)

    # for key, value in grid_search_cv.best_params_.items():
    #     print(key, value)

    # print('best estimator', grid_search_cv.best_estimator_)
    # print('features importance', features_importance)

    # csv_file_name = Constants.generate_file_name(
    #     'classifier_results', 'csv', Constants.RESULTS_FOLDER, None,
    #     None, False)
    # json_file_name = Constants.generate_file_name(
    #     'classifier_results', 'json', Constants.RESULTS_FOLDER, None,
    #     None, False)
    csv_file_name2 = Constants.RESULTS_FOLDER + 'classifier_results.csv'
    json_file_name2 = Constants.RESULTS_FOLDER + 'classifier_results.json'

    # results = get_scores(final_grid_search_cv.cv_results_)
    # csv_file = '/Users/fpena/tmp/' + Constants.ITEM_TYPE + '_new_reviews_classifier_results.csv'
    # ETLUtils.save_csv_file(
    #     csv_file_name, error_estimation_results,
    #     error_estimation_results[0].keys())
    # ETLUtils.save_json_file(json_file_name, error_estimation_results)

    for result in error_estimation_results:
        ETLUtils.write_row_to_csv(csv_file_name2, result)
        ETLUtils.write_row_to_json(json_file_name2, result)
    #
    # print(csv_file)

    best_hyperparams_file_name = Constants.generate_file_name(
        'best_hyperparameters', 'json', Constants.CACHE_FOLDER, None, None,
        False)
    save_parameters(best_hyperparams_file_name, grid_search_cv.best_params_)
Exemplo n.º 4
0
def full_cycle():

    plant_random_seeds()
    my_records = load_records()
    preprocess_records(my_records)
    x_matrix, y_vector = transform(my_records)
    count_specific_generic(my_records)

    # Error estimation
    error_estimation_results = []
    best_classifier = None
    best_score = 0.0
    for classifier, params in PARAM_GRID_MAP.items():
        # print('Classifier: %s' % classifier)
        cv = StratifiedKFold(Constants.CROSS_VALIDATION_NUM_FOLDS)
        score = error_estimation(x_matrix, y_vector, params, cv, SCORE_METRIC).mean()
        error_estimation_results.append(
            {
                'classifier': classifier,
                'accuracy': score,
                Constants.BUSINESS_TYPE_FIELD: Constants.ITEM_TYPE
            }
        )
        print('%s score: %f' % (classifier, score))

        if score > best_score:
            best_score = score
            best_classifier = classifier

    # Model selection
    cv = StratifiedKFold(Constants.CROSS_VALIDATION_NUM_FOLDS)
    grid_search_cv = model_selection(
        x_matrix, y_vector, PARAM_GRID_MAP[best_classifier], cv, SCORE_METRIC)
    # best_model = grid_search_cv.best_estimator_.get_params()['classifier']
    # features_importance = best_model.coef_
    print('%s: %f' % (SCORE_METRIC, grid_search_cv.best_score_))
    print('best params', grid_search_cv.best_params_)

    # for key, value in grid_search_cv.best_params_.items():
    #     print(key, value)

    # print('best estimator', grid_search_cv.best_estimator_)
    # print('features importance', features_importance)

    # csv_file_name = Constants.generate_file_name(
    #     'classifier_results', 'csv', Constants.RESULTS_FOLDER, None,
    #     None, False)
    # json_file_name = Constants.generate_file_name(
    #     'classifier_results', 'json', Constants.RESULTS_FOLDER, None,
    #     None, False)
    csv_file_name2 = Constants.RESULTS_FOLDER + 'classifier_results.csv'
    json_file_name2 = Constants.RESULTS_FOLDER + 'classifier_results.json'


    # results = get_scores(final_grid_search_cv.cv_results_)
    # csv_file = '/Users/fpena/tmp/' + Constants.ITEM_TYPE + '_new_reviews_classifier_results.csv'
    # ETLUtils.save_csv_file(
    #     csv_file_name, error_estimation_results,
    #     error_estimation_results[0].keys())
    # ETLUtils.save_json_file(json_file_name, error_estimation_results)

    for result in error_estimation_results:
        ETLUtils.write_row_to_csv(
            csv_file_name2, result)
        ETLUtils.write_row_to_json(json_file_name2, result)
    #
    # print(csv_file)

    best_hyperparams_file_name = Constants.generate_file_name(
        'best_hyperparameters', 'json', Constants.CACHE_FOLDER, None,
        None, False)
    save_parameters(best_hyperparams_file_name, grid_search_cv.best_params_)