Example No. 1
def norm_tsne2(x, y):
    """Normalize the features, embed them with 2-component t-SNE and save the result."""
    # Skip the computation when the processed dataset already exists on disk.
    if data_loading.dataframe_already_exists(constants.OUTPUT_DATA_PROC_PATH,
                                             DATA_TSNE2):
        return None
    y = y.reset_index(drop=True)
    norm_df = _normalize(x)
    tsne_df = _tsne(norm_df, 2)
    joined_df = pd.concat((norm_df, tsne_df, y),
                          axis=1)
    assert norm_df.shape[0] == tsne_df.shape[0] == joined_df.shape[0]
    data_loading.save_data(joined_df, constants.OUTPUT_DATA_PROC_PATH,
                           DATA_TSNE2)
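
The private helpers _normalize and _tsne are not part of this snippet. A minimal sketch of what they might look like, assuming scikit-learn's StandardScaler and TSNE (the project's actual helpers may differ), is:

import pandas as pd
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler


def _normalize(x):
    # Scale every feature to zero mean and unit variance, keeping the original
    # column names so the concatenation in norm_tsne2 stays readable.
    scaled = StandardScaler().fit_transform(x)
    return pd.DataFrame(scaled, columns=x.columns)


def _tsne(norm_df, n_components):
    # Embed the normalized features into n_components t-SNE dimensions and
    # label the resulting columns tsne_0, tsne_1, ...
    embedded = TSNE(n_components=n_components).fit_transform(norm_df)
    return pd.DataFrame(embedded,
                        columns=['tsne_{}'.format(i) for i in range(n_components)])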
Example No. 2
def norm_pca3(x, y):
    """Normalize the features, project them onto 3 principal components and save the result."""
    # Skip the computation when the processed dataset already exists on disk.
    if data_loading.dataframe_already_exists(constants.OUTPUT_DATA_PROC_PATH,
                                             DATA_PCA3):
        return None
    y = y.reset_index(drop=True)
    norm_df = _normalize(x)
    pca_df = _pca(norm_df, 3)
    joined_df = pd.concat((norm_df, pca_df, y),
                          axis=1)
    assert norm_df.shape[0] == pca_df.shape[0] == joined_df.shape[0]
    data_loading.save_data(joined_df, constants.OUTPUT_DATA_PROC_PATH,
                           DATA_PCA3)
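
Likewise, the _pca helper used here is not shown. A minimal sketch, assuming scikit-learn's PCA as a hypothetical stand-in for the project's real helper, could be:

import pandas as pd
from sklearn.decomposition import PCA


def _pca(norm_df, n_components):
    # Project the normalized features onto the first n_components principal
    # components and label the resulting columns pca_0, pca_1, ...
    projected = PCA(n_components=n_components).fit_transform(norm_df)
    return pd.DataFrame(projected,
                        columns=['pca_{}'.format(i) for i in range(n_components)])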
Example No. 3
def main():
    # Set the random seed for the entire project.
    du.set_random_seed(0)
    # Rationale: ensure reproducibility of the results.

    # Flush previous runs.
    constants.flush_project_results(constants.TMP_PATH, constants.OUTPUT_PATH)
    # Rationale: provide a clean state for the project to run in and enforce
    # reproducibility of the results.

    # Load, save and split data.
    dataframe = data_loading.load_data(constants.DATA_PATH)
    data_loading.save_data(dataframe, constants.TMP_PATH)
    x_train, x_test, y_train, y_test = data_loading.train_test_split(dataframe)
    # Rationale: *Loading*: load the data in the main module and pass it as the
    # first argument to every other function that operates on the data set,
    # saving precious time on repeated data loading. *Saving*: for big data
    # sets, writing the data in a fast-read format (such as HDF5) saves time.

    # Load and combine data processing pipelines.
    # TODO:
    data_processing_pipelines = None

    # Perform exploratory data analyses.
    data_exploration.main(dataframe)
    # Rationale: conduct exploratory data analyses.

    # Perform grid search.
    persistent_grid_object = sku.grid_search.PersistentGrid.load_from_path(
        persistent_grid_path=constants.PERSITENT_GRID_PATH,
        dataset_path=constants.DATA_PATH)
    # Iteration over processed data sets may occur here since they are model
    # dependent.
    grid_search.main(dataframe, constants.MODELS, data_processing_pipelines,
                     constants.GRIDS, persistent_grid_object)
    best_grids = grid_search.get_best_grids(  # noqa
        constants.MODELS, data_processing_pipelines, persistent_grid_object)
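
The call du.set_random_seed(0) suggests a small utility that seeds every random number generator the project relies on. A minimal sketch of such a utility, assuming only the standard library and NumPy need seeding (the real du module may cover more libraries), is:

import random

import numpy as np


def set_random_seed(seed):
    # Seed both the standard-library and the NumPy generators so that data
    # splits, shuffles and model initializations are reproducible across runs.
    random.seed(seed)
    np.random.seed(seed)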
Example No. 4
def main():
    # Filter warnings that pollute the project stdout.
    filter_warnings()
    # Rationale: produce cleaner results.

    # Set the random seed for the entire project.
    du.set_random_seed(0)
    # Rationale: ensure reproducibility of the results.

    # Flush previous runs.
    # constants.flush_project_results(constants.TMP_PATH,
    #                                 constants.OUTPUT_PATH)
    # Rationale: provide a clean state for the project to run in and enforce
    # reproducibility of the results.

    # Download, load and save data.
    data_loading.main()
    dataframe = data_loading.load_data(constants.DATASET_PATH,
                                       constants.TMP_PATH)
    data_loading.save_data(dataframe, constants.TMP_PATH,
                           constants.DATASET_PATH)
    # Rationale: *Loading*: load the data in the main module and pass it as the
    # first argument to every other function that operates on the data set,
    # saving precious time on repeated data loading. *Saving*: for big data
    # sets, writing the data in a fast-read format (such as HDF5) saves time.

    # Load and combine data processing pipelines.
    data_processing.main(dataframe, nan_strategy='drop')
    # Rationale: prepare data to be fed into the models.
    # Different algorithms make use of different data structures. For instance,
    # XGBoost allows NaNs, whereas most data transformations do not.

    # Perform exploratory data analyses.
    data_exploration.main(dataframe)
    # Rationale: conduct exploratory data analyses.

    # Data split.
    # Removed.
    # Rationale: module 'models' should execute this.

    # Perform grid search.
    # Iteration over processed data sets may occur here since they are model
    # dependent.
    grid_search.main(constants.MODELS, constants.GRIDS)
    best_combination_of_datasets_and_grids = (
        grid_search.dict_of_best_datasets_and_grids(constants.MODELS,
                                                    constants.GRIDS))
    best_datasets = best_combination_of_datasets_and_grids['best_datasets']
    best_grids = best_combination_of_datasets_and_grids['best_grids']
    # Rationale: perform grid search as part of machine learning best
    # practices.

    # Summary of what was executed so far:
    # 1) Setting of the random seed for reproducibility.
    # 2) Flushing of intermediate results for a clean run.
    # 3) Data loading and data saving.
    # 4) Conduction of exploratory data analyses.
    # 5) Grid search for the best model hyperparameters.
    # To conclude our project we need the grand finale: model selection and
    # evaluation/comparison.
    models.main(constants.MODELS, best_datasets, best_grids,
                constants.MODEL_FITTING_PARAMETERS)
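
The rationale comments recommend persisting the dataset in a fast-read format such as HDF5. A minimal sketch of a save_data/load_data pair built on pandas' HDF5 support (the names and signatures are illustrative, not the project's actual data_loading API, and PyTables must be installed) could look like:

import os

import pandas as pd


def save_data(dataframe, output_path, basename='dataset.h5'):
    # Write the dataframe to HDF5 once so that later runs reload it quickly
    # instead of re-parsing the original (possibly large) source files.
    dataframe.to_hdf(os.path.join(output_path, basename), key='data', mode='w')


def load_data(output_path, basename='dataset.h5'):
    # Read the previously saved HDF5 file back into a dataframe.
    return pd.read_hdf(os.path.join(output_path, basename), key='data')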
Example No. 5
def no_transform(dataframe):
    """Save the dataframe untouched so an unprocessed ('vanilla') dataset is also available."""
    # Skip the save when the vanilla dataset already exists on disk.
    if data_loading.dataframe_already_exists(constants.OUTPUT_DATA_PROC_PATH,
                                             DATA_VANILLA):
        return None
    data_loading.save_data(dataframe, constants.OUTPUT_DATA_PROC_PATH,
                           DATA_VANILLA)
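
Examples 1, 2 and 5 all guard their work with data_loading.dataframe_already_exists, a cache check that skips recomputation when the processed dataset has already been written. A minimal sketch of such a check, assuming one file per processed dataset (the real helper may also validate the file's contents), is:

import os


def dataframe_already_exists(output_path, basename):
    # Treat the presence of the expected output file as proof that the
    # processed dataset was computed and saved in a previous run.
    return os.path.isfile(os.path.join(output_path, basename))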