Example #1
def main():
    """
    Initialize the world from the parameter file, run it for the configured
    number of ticks, plot the result, and write the result to a CSV file.
    """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    parameter_file = os.path.join(dir_path, sys.argv[1])
    with open(parameter_file) as file:
        parameters = json.load(file)
        world = World(parameters)

        # run for the specified number of ticks
        ticks = parameters["ticks"]
        for i in range(ticks):
            world.run(i)

        # draw a plot if matplotlib has been installed
        draw_plot(world.result())

        # output the result to a csv file
        output_filename = 'result.csv'
        if len(sys.argv) == 3:
            output_filename = sys.argv[2]
        world.output_csv(os.path.join(dir_path, output_filename))
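
# A minimal sketch (assumed, not part of the original) of how draw_plot could
# honour the "if matplotlib has been installed" comment above: plotting is
# skipped silently when the optional dependency is missing.
def draw_plot(result):
    try:
        import matplotlib.pyplot as plt  # optional dependency
    except ImportError:
        return  # no matplotlib: skip plotting, the CSV output still works
    plt.plot(result)
    plt.xlabel("tick")
    plt.ylabel("result")
    plt.show()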
Example #2
def main():

    parser = argparse.ArgumentParser(description='Omega integrals')

    parser.add_argument('-p', '--process', type=str,
                        choices=["omega11", "omega12", "omega13", "omega22", "omegas"],
                        default="omegas",
                        help='Comma-separated names of omega integrals whose regression is performed')

    parser.add_argument('-a', '--algorithm', type=str,
                        choices=['DT', 'RF', 'ET', 'GP', 'KN', 'SVM', 'KR', 'GB', 'HGB', 'MLP'],
                        default='DT',
                        help='transport algorithm')

    parser.add_argument('-l', '--load_model', type=str2bool,
                        nargs='?',
                        choices=[False, True],
                        default=False,
                        const=True,
                        help='Load saved model')

    args = parser.parse_args()

    process = args.process.split(',')
    print("Process: ", colored(process[0], 'green'))

    algorithm = args.algorithm.split(',')
    print("Algorithm: ", colored(algorithm[0],'blue'))

    load_model = args.load_model
    print("Load: ", colored(load_model,'magenta'))

    src_dir = "."
    print("SRC: ", colored(src_dir,'yellow'))

    output_dir = src_dir+"/.."
    print("OUTPUT: ", colored(output_dir,'red'))

    n_jobs = 2

    # Import database
    with open('../data/omega_integrals_encoded.txt') as f:
        lines = (line for line in f if not line.startswith('#'))
        dataset = np.loadtxt(lines, skiprows=1)
    print(dataset.shape)

    x = dataset[:,0:3] # c, d, T
    y = dataset[:,3:]  # Ω(1,1), Ω(1,2), Ω(1,3), Ω(2,2)
    print(x.shape)
    print(y.shape)

    print("### Phase 1: PRE_PROCESSING ###")
    ########################################

    # 1.0) create directory tree
    model, scaler, figure = utils.mk_tree(process[0], algorithm[0], output_dir)

    # 1.1) train/test split dataset
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.75, test_size=0.25, random_state=69)

    # 1.2) scale data and save scalers
    sc_x = StandardScaler()
    sc_y = StandardScaler()

    sc_x.fit(x_train)
    x_train = sc_x.transform(x_train)
    x_test  = sc_x.transform(x_test)

    sc_y.fit(y_train)
    y_train = sc_y.transform(y_train)
    y_test  = sc_y.transform(y_test)

    print('Training Features Shape:', x_train.shape)
    print('Training Labels Shape:',   y_train.shape)
    print('Testing Features Shape:',  x_test.shape)
    print('Testing Labels Shape:',    y_test.shape)

    dump(sc_x, open(scaler+"/scaler_x_"+process[0]+'.pkl', 'wb'))
    dump(sc_y, open(scaler+"/scaler_y_"+process[0]+'.pkl', 'wb'))

    print("### Phase 2: PROCESSING ###")
    ####################################

    # 2.0) estimator selection
    if (algorithm[0] == 'DT'):
        est, hyper_params = estimators.est_DT()

    elif (algorithm[0] == 'ET'):
        est, hyper_params = estimators.est_ET()

    elif (algorithm[0] == 'SVM'):
        est, hyper_params = estimators.est_SVM()

    elif (algorithm[0] == 'KR'):
        est, hyper_params = estimators.est_KR()

    elif (algorithm[0] == 'KN'):
        est, hyper_params = estimators.est_KN()

    elif (algorithm[0] == 'MLP'):
        est, hyper_params = estimators.est_MLP()

    elif (algorithm[0] == 'GB'):
        est, hyper_params = estimators.est_GB()

    elif (algorithm[0] == 'HGB'):
        est, hyper_params = estimators.est_HGB()

    elif (algorithm[0] == 'RF'):
        est, hyper_params = estimators.est_RF()

    elif (algorithm[0] == 'GP'):
        # 'GP' is listed in --algorithm choices but had no branch; the duplicated
        # 'GB' branch here is assumed to be a typo for 'GP'.
        est, hyper_params = estimators.est_GP()

    else:
        print("Algorithm not implemented ...")
        return

    # 2.1) search for best hyper-parameters combination
    # Exhaustive search over specified parameter values for the estimator
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    gs = GridSearchCV(est, cv=3, param_grid=hyper_params, verbose=2, n_jobs=n_jobs, scoring='r2',
                      refit=True, pre_dispatch='n_jobs', error_score=np.nan, return_train_score=True)

    # Randomized search on hyper parameters
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV
    # class sklearn.model_selection.RandomizedSearchCV(estimator, param_distributions, *, n_iter=10, scoring=None, n_jobs=None, refit=True,
    #                                                  cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score=nan,
    #                                                  return_train_score=False)
    #gs = RandomizedSearchCV(est, cv=10, n_iter=10, param_distributions=hyper_params, verbose=2, n_jobs=n_jobs, scoring='r2',
    #                        refit=True, pre_dispatch='n_jobs', error_score=np.nan, return_train_score=True)

    # 2.2) training
    utils.fit(x_train, y_train, gs)

    # 2.3) prediction
    y_regr = utils.predict(x_test, gs)

    print("### Phase 3: POST-PROCESSING ###")
    #########################################

    # 3.0) save best hyper-parameters
    results = pd.DataFrame(gs.cv_results_)
    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
    #compression_opts = dict(method='zip', archive_name='GridSearchCV_results.csv')
    #results.to_csv('GridSearchCV_results.zip', index=False, compression=compression_opts)
    results.to_csv(model+"/../"+"GridSearchCV_results.csv", index=False, sep='\t', encoding='utf-8')

    # results print screen
    print("Best: %f using %s" % (gs.best_score_, gs.best_params_))
    means  = gs.cv_results_['mean_test_score']
    stds   = gs.cv_results_['std_test_score']
    params = gs.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

    # 3.1) compute score metrics
    utils.scores(sc_x, sc_y, x_train, y_train, x_test, y_test, model, gs)

    # 3.2) back to original values (unscaling)
    x_test_dim = sc_x.inverse_transform(x_test)
    y_test_dim = sc_y.inverse_transform(y_test)
    y_regr_dim = sc_y.inverse_transform(y_regr)

    # 3.3) make plots
    utils.draw_plot(x_test_dim, y_test_dim, y_regr_dim, figure, process[0], algorithm[0])

    # 3.4) save model to disk
    dump(gs, model+"/model_"+process[0]+".sav")
Example #3
    x, y = load_merge_mnist()
    # x = pca(x, no_dims=300).real
    item = [
        "T-shirt/top", "Trouser", "Pullover", "Dress", "Coat", "Sandal",
        "Shirt", "Sneaker", "Bag", "Ankle boot"
    ]

    # UMAP run
    # run_umap(x=x, y=y, item=item, n_neighbors_list=[5])
    # run_umap(x=x, y=y, item=item, n_neighbors_list=[2,5,10,20,50])
    # run_umap2(x=x, y=y, item=item, min_dist_list=[0.1,0.05, 0.01])
    x_umap = umap_.UMAP(n_neighbors=5,
                        min_dist=0.3,
                        metric='correlation',
                        verbose=True).fit_transform(x)
    draw_plot(x_umap, y, item, "umap_result")
    # t-SNE run
    # x_tse = run_tsne(x)
    # draw_plot(x_tse, y, item, "tsne_result")

    # CIFAR 10 (60000, 3072), 163MB
    # http://www.cs.toronto.edu/~kriz/cifar.html
    # x2, y2 = load_merge_cifar()
    # item2 = ["airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck"]

    # UMAP run
    # run_umap(x=x2, y=y2, item=item2, n_neighbors_list=[5,20,50,100,200])
    # x_umap2 = umap_.UMAP(n_neighbors=5, min_dist=0.3, metric='correlation', verbose=True).fit_transform(x2)
    # draw_plot(x_umap2, y2, item2, "umap_result2")
    # # t-SNE run
    # x_tse2 = run_tsne(x2)
Example #4
def app():
    players = pd.read_csv('similar_players/data/players.csv')
    players = players.drop(columns=['Unnamed: 0'])
    players = players[players['Pos'] != 'GK']

    st.title("Doppelgängers")
    st.markdown("""
    #### Pick a player from Europe's Top 5 Leagues and we'll find you their footballing Doppelgängers
    """)

    player_name = st.text_input("Player Name").title()
    season_input = st.selectbox("Season", ('2019/20', '2020/21'))
    if season_input == '2019/20':
        season = 2020
    else:
        season = 2021

    number = int(
        st.number_input("Number of Players",
                        min_value=5,
                        max_value=20,
                        value=5,
                        step=1))
    if player_name in players['Player'].unique():
        if st.checkbox('Include same player'):
            table = find_similar_players(player_name,
                                         season,
                                         n_players=number,
                                         return_same_player=True)
        else:
            table = find_similar_players(player_name, season, n_players=number)
        table_index = table.set_index('Player')
        info_table = table[['Nation', 'Pos', 'Age', 'Squad', 'Comp', 'Season']]

        st.dataframe(table_index)
        st.text("Players listed in order of similarity")

        position = players[players['Player'] == player_name]['Pos'].mode()[0]

        plot = draw_plot(player_name, season, position, table)
        st.write(plot)
        st.write("Definitions")
        if position == 'DF':
            st.text("CrsPA:     Crosses into the Penalty Area Per 90 Mins")
            st.text("xA:        Expected Assists Per 90 Mins")
            st.text("Err:       Errors Leading to Goals Per 90 Mins")
            st.text("Press:     Pressures Per 90 Mins")
            st.text("Tkl%:      Percentage of Dribblers Tackled")
            st.text("Int:       Number of Interceptions Per 90 Mins")
            st.text(
                "Blocks:    Number of Passes and Shots Blocked Per 90 Mins")
            st.text(
                "Prog:      Forward carries of over 5 metres (excl. those in defensive 3rd) Per 90 Mins"
            )
        if position == 'DFMF':
            st.text("CrsPA:     Crosses into the Penalty Area Per 90 Mins")
            st.text("xA:        Expected Assists Per 90 Mins")
            st.text(
                "PrgRatio:  Percentage of passing distance that is forward")
            st.text("Passes:    Passes Per 90 Mins")
            st.text(
                "Prog:      Forward carries of over 5 metres (excl. those in defensive 3rd) Per 90 Mins"
            )
            st.text("Int:       Number of Interceptions Per 90 Mins")
            st.text("Tkl%:      Percentage of Dribblers Tackled")
            st.text("Press:     Pressures Per 90 Mins")
        if position == 'MF':
            st.text(
                "PrgRatio:      Percentage of passing distance that is forward"
            )
            st.text("xA:            Expected Assists Per 90 Mins")
            st.text("Passes:        Passes Per 90 Mins")
            st.text(
                "Prog:          Forward carries of over 5 metres (excl. those in defensive 3rd) Per 90 Mins"
            )
            st.text(
                "PrgRatioDrib:  Percentage of dribbling distance that is forward"
            )
            st.text("Tkl:           Number of Tackles Attempted Per 90 Mins")
            st.text("Press:         Pressures Per 90 Mins")
            st.text("npxG:          Non-Penalty Expected Goals Per 90 Mins")
        if position == 'MFFW' or position == 'FWMF':
            st.text("npxG:          Non-Penalty Expected Goals Per 90 Mins")
            st.text("npxG/Sh:       Non-Penalty Expected Goals Per Shot")
            st.text(
                "PrgRatio:      Percentage of Passing Distance that is Forward"
            )
            st.text("KP:            Key Passes Per 90 Mins")
            st.text("xA:            Expected Assists Per 90 Mins")
            st.text(
                "Prog:          Forward Carries of Over 5 Metres (excl. those in defensive 3rd) Per 90 Mins"
            )
            st.text(
                "Final3rdDrib:  Number of Dribbles into the Final 3rd Per 90 Mins"
            )
            st.text("Press:         Pressures Per 90 Mins")
        if position == 'FW':
            st.text("npxG:      Non-Penalty Expected Goals Per 90 Mins")
            st.text("npxG/Sh:   Non-Penalty Expected Goals Per Shot")
            st.text("SoT%:      Shots on Target Percentage")
            st.text("Sh/90:     Shots Per 90 Mins")
            st.text("Passes:    Passes Per 90 Mins")
            st.text("xA:        Expected Assists Per 90 Mins")
            st.text(
                "Prog:      Forward Carries of Over 5 Metres (excl. those in defensive 3rd) Per 90 Mins"
            )
            st.text("Press:     Pressures Per 90 Mins")
        if position == 'DFFW' or position == 'FWDF':
            st.text("npxG:      Non-Penalty Expected Goals Per 90 Mins")
            st.text("xA:        Expected Assists Per 90 Mins")
            st.text("CrsPA:     Crosses into the Penalty Area Per 90 Mins")
            st.text("KP:        Key Passes Per 90 Mins")
            st.text("#Pl:       Number of Players Dribbled Past Per 90 Mins")
            st.text(
                "Prog:      Forward Carries of Over 5 Metres (excl. those in defensive 3rd) Per 90 Mins"
            )
            st.text("Tkl:       Number of Tackles Attempted Per 90 Mins")
            st.text("Press:     Pressures Per 90 Mins")
    else:
        st.warning(
            "Please choose a valid player name. For a full list of players "
            "available, look in the index")

    if st.button('Click here to learn how this works!'):
        st.write(
            "The similarity rankings are based on the output of a K-Nearest "
            "Neighbours model. This is a model that calculates the distances "
            "between each player based on all the different metrics in the "
            "interactive table above, which are all given equal weighting. It "
            "can then rank the other players by distance and select the "
            "'Nearest Neighbours' of the chosen player.")
        st.write(
            """The data comes from [FBRef](https://fbref.com/), and includes all
        players in Europe's top five leagues who have played at least five 90s
        in that season.""")
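
# A minimal sketch (assumed; the real find_similar_players lives elsewhere) of
# the K-Nearest-Neighbours lookup described in the explainer above. The column
# names and feature selection are illustrative only.
def find_similar_players_sketch(player_name, season, players, n_players=5):
    from sklearn.neighbors import NearestNeighbors
    from sklearn.preprocessing import StandardScaler

    season_df = players[players['Season'] == season].reset_index(drop=True)
    # use every numeric metric with equal weighting, as described above
    features = season_df.select_dtypes('number').drop(columns=['Season'], errors='ignore')
    scaled = StandardScaler().fit_transform(features)

    knn = NearestNeighbors(n_neighbors=n_players + 1).fit(scaled)
    idx = season_df.index[season_df['Player'] == player_name][0]
    _, neighbours = knn.kneighbors(scaled[idx].reshape(1, -1))
    return season_df.iloc[neighbours[0][1:]]  # drop the queried player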
Example #5
=> The initialization strategy outlined above is applied 30 different times.
    For each initialization, the code should then apply the K-means algorithm
    until it converges.
"""
all_converges = apply_k_means(data, k=10, iterations=30)
"""
=> A plot of the number of images clustered together for each digit in the
  best clustering result.
  The x-axis should show the digit number (0, 1, …, 9) while the y-axis
  should show the count. 
  When the images of one digit are clustered in different clusters,
  use the count of the cluster that has the majority of images.
"""
all_organizations = np.zeros((30, 10))
for i in range(len(all_converges)):
    all_organizations[i] = calc_classification(all_converges[i][0])
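
# A hypothetical sketch of the per-digit count computed above (the real
# calc_classification is defined elsewhere): for each digit, count the images
# that end up in that digit's majority cluster.
def calc_classification_sketch(cluster_assignments, labels, n_digits=10):
    counts = np.zeros(n_digits)
    for digit in range(n_digits):
        clusters_of_digit = np.asarray(cluster_assignments)[np.asarray(labels) == digit]
        if clusters_of_digit.size:
            # size of the largest (majority) cluster for this digit
            counts[digit] = np.bincount(clusters_of_digit).max()
    return counts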
"""
=> Determine which of the 30 outputs is the best clustering result.

index_of_best : index of best clustering iteration
    chosen as the iteration whose per-digit counts sum to the maximum, where
    the counts are computed against the known (prior) digit labels of the data
"""
index_of_best = np.argmax([sum(all_organizations[i]) for i in range(30)])

x = np.arange(0, 10, step=1)
y = all_organizations[index_of_best]
plot_label = "Clustering " + str(index_of_best)
draw_plot(x, y, plot_label, img_path=count_img_path)
Example #6
def update_parameters(x, y):
    # Reconstructed header (assumed): one gradient-descent step for the
    # hypothesis h(x) = theta0 + theta1 * x, averaging the error over the data.
    global theta0, theta1
    size = len(x)
    deviation0 = 0.0
    deviation1 = 0.0
    for i in range(size):
        deviation0 += (h(x[i]) - y[i])
        deviation1 += ((h(x[i]) - y[i]) * x[i])

    deviation0 = (deviation0 * L_RATE) / size
    deviation1 = (deviation1 * L_RATE) / size

    theta0 -= deviation0
    theta1 -= deviation1


iterations = 10000
cost_list = [[], []]
x, y = generate_points(50)
for i in range(iterations):
    update_parameters(x, y)
    if (i % 100 == 0):
        cost_list[0].append(i)
        cost_list[1].append(cost_func(x, y))

# Draw plot of cost vs number of iterations
# iterations, cost = cost_list
# draw_plot(plt, iterations, 'Iterations', cost, 'Cost', 'line')
# plt.show()

# To show the fitted line and data points
draw_plot(plt, x, 'Size of house', y, 'Cost of house', 'dot')
xs = [0, max(x)]
ys = [h(xs[0]), h(xs[1])]
draw_line(plt, xs, ys)
plt.show()
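
# The hypothesis h and cost_func used above are assumed to be the standard
# simple-linear-regression ones; a hypothetical sketch (not the original code):
def h_sketch(x_i):
    # prediction of the fitted line for a single input
    return theta0 + theta1 * x_i

def cost_func_sketch(x, y):
    # mean squared error over the dataset (1/(2m) convention)
    m = len(x)
    return sum((h_sketch(x[i]) - y[i]) ** 2 for i in range(m)) / (2 * m)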
Example #7
def main():

    parser = argparse.ArgumentParser(description='reaction rates regression')

    parser.add_argument(
        '-p',
        '--process',
        type=str,
        choices=['DR', 'VT', 'VV', 'VV2', 'ZR'],
        default='DR,VT,VV,VV2,ZR',
        help='Comma-separated names of properties whose regression is performed'
    )

    parser.add_argument('-a',
                        '--algorithm',
                        type=str,
                        choices=[
                            'DT', 'RF', 'ET', 'GP', 'KN', 'SVM', 'KR', 'GB',
                            'HGB', 'MLP'
                        ],
                        default='DT',
                        help='regression algorithm')

    args = parser.parse_args()

    process = args.process.split(',')
    directory = process[0] + '/data/processes'
    path = directory + "/*.csv"
    print("Process: ", colored(process[0], 'green'))

    algorithm = args.algorithm.split(',')
    print("Algorithm: ", colored(algorithm[0], 'blue'))

    parent_dir = "."
    print("PWD: ", colored(parent_dir, 'yellow'))

    n_jobs = 2

    for f in glob.glob(path):
        #print("{bcolors.OKGREEN}f{bcolors.ENDC}")
        print(colored(f, 'red'))
        dataset_k = pd.read_csv(f, delimiter=",").to_numpy()
        dataset_T = pd.read_csv(parent_dir + "/" + process[0] +
                                "/data/Temperatures.csv").to_numpy()

        x = dataset_T.reshape(-1, 1)
        y = dataset_k

        print("### Phase 1: PRE_PROCESSING ###")
        ########################################
        '''
        https://stackoverflow.com/questions/50565937/how-to-normalize-the-train-and-test-data-using-minmaxscaler-sklearn
        https://towardsdatascience.com/6-amateur-mistakes-ive-made-working-with-train-test-splits-916fabb421bb
        https://www.analyticsvidhya.com/blog/2020/04/feature-scaling-machine-learning-normalization-standardization/
        https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02

        You should fit the MinMaxScaler using the training data and
        then apply the scaler on the testing data before the prediction.

        In summary:

        Step 1: fit the scaler on the TRAINING data
        Step 2: use the scaler to transform the TRAINING data
        Step 3: use the transformed training data to fit the predictive model
        Step 4: use the scaler to transform the TEST data
        Step 5: predict using the trained model (step 3) and the transformed TEST data (step 4).

        data = datasets.load_iris()
        X    = data.data
        y    = data.target

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)

        model = SVC()
        model.fit(X_train_scaled, y_train)

        X_test_scaled = scaler.transform(X_test)
        y_pred = model.predict(X_test_scaled)

        '''
        data, dir, proc, model, scaler, figure, outfile = utils.mk_tree(
            f, parent_dir, process[0], algorithm[0])

        # Train/test split dataset
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            train_size=0.75,
                                                            test_size=0.25,
                                                            random_state=69)

        # Define scalers: they can be modified to investigate the effect of scalers
        ##############################################################################
        input_scaler = None  #MinMaxScaler(feature_range=(-1,1))
        output_scaler = None  #StandardScaler()
        ##############################################################################

        # Scale None and/or inputs and/or outputs
        x_train, x_test, y_train, y_test = utils.scale_dataset(
            x_train, x_test, y_train, y_test, input_scaler, output_scaler)
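
        # utils.scale_dataset is assumed to apply each scaler only when it is
        # not None, fitting on the training split only; a sketch of that
        # behaviour (hypothetical, not the original helper):
        #
        #   def scale_dataset(x_train, x_test, y_train, y_test, in_sc, out_sc):
        #       if in_sc is not None:
        #           x_train = in_sc.fit_transform(x_train)
        #           x_test = in_sc.transform(x_test)
        #       if out_sc is not None:
        #           y_train = out_sc.fit_transform(y_train)
        #           y_test = out_sc.transform(y_test)
        #       return x_train, x_test, y_train, y_test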

        print('Training Features Shape:', x_train.shape)
        print('Training Labels Shape:', y_train.shape)
        print('Testing Features Shape:', x_test.shape)
        print('Testing Labels Shape:', y_test.shape)

        # Save scalers (they may be useful)
        dump(input_scaler, open(scaler + "/scaler_x_MO_" + data + '.pkl',
                                'wb'))
        dump(output_scaler, open(scaler + "/scaler_y_MO_" + data + '.pkl',
                                 'wb'))

        if (algorithm[0] == 'DT'):
            est, hyper_params = estimators.est_DT()

        elif (algorithm[0] == 'ET'):
            est, hyper_params = estimators.est_ET()

        elif (algorithm[0] == 'SVM'):
            est, hyper_params = estimators.est_SVM()

        elif (algorithm[0] == 'KR'):
            est, hyper_params = estimators.est_KR()

        elif (algorithm[0] == 'KN'):
            est, hyper_params = estimators.est_KN()

        elif (algorithm[0] == 'MLP'):
            est, hyper_params = estimators.est_MLP()

        elif (algorithm[0] == 'GB'):
            est, hyper_params = estimators.est_GB()

        elif (algorithm[0] == 'HGB'):
            est, hyper_params = estimators.est_HGB()

        elif (algorithm[0] == 'RF'):
            est, hyper_params = estimators.est_RF()

        elif (algorithm[0] == 'GP'):
            # 'GP' is listed in --algorithm choices but had no branch; the duplicated
            # 'GB' branch here is assumed to be a typo for 'GP'.
            est, hyper_params = estimators.est_GP()

        else:
            print("Algorithm not implemented ...")
            continue

        # https://github.com/ray-project/tune-sklearn
        # https://docs.ray.io/en/latest/tune/api_docs/sklearn.html#tune-sklearn-docs
        # class ray.tune.sklearn.TuneGridSearchCV(estimator, param_grid, early_stopping=None, scoring=None,
        # n_jobs=None, cv=5, refit=True, verbose=0, error_score='raise', return_train_score=False,
        # local_dir='~/ray_results', max_iters=1, use_gpu=False, loggers=None, pipeline_auto_early_stop=True,
        # stopper=None, time_budget_s=None, sk_n_jobs=None)
        #scheduler = MedianStoppingRule(grace_period=10.0)
        #gs = TuneGridSearchCV(est, cv=10, param_grid=hyper_params, verbose=2, n_jobs=n_jobs, scoring='r2',
        #                  refit=True, error_score=np.nan, return_train_score=True)
        #tune_search = TuneSearchCV(clf, parameter_grid, search_optimization="hyperopt", n_trials=3, early_stopping=scheduler, max_iters=10)
        #tune_search.fit(x_train, y_train)

        # Exhaustive search over specified parameter values for the estimator
        # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
        gs = GridSearchCV(est,
                          cv=5,
                          param_grid=hyper_params,
                          verbose=2,
                          n_jobs=n_jobs,
                          scoring='r2',
                          refit=True,
                          pre_dispatch='n_jobs',
                          error_score=np.nan,
                          return_train_score=True)

        # Randomized search on hyper parameters
        # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV
        # class sklearn.model_selection.RandomizedSearchCV(estimator, param_distributions, *, n_iter=10, scoring=None, n_jobs=None, refit=True,
        #                                                  cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score=nan,
        #                                                  return_train_score=False)
        #gs = RandomizedSearchCV(est, cv=10, n_iter=10, param_distributions=hyper_params, verbose=2, n_jobs=n_jobs, scoring='r2',
        #                        refit=True, pre_dispatch='n_jobs', error_score=np.nan, return_train_score=True)

        # Training
        utils.fit(x_train, y_train, gs, outfile)

        results = pd.DataFrame(gs.cv_results_)
        # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
        #compression_opts = dict(method='zip', archive_name='GridSearchCV_results.csv')
        #results.to_csv('GridSearchCV_results.zip', index=False, compression=compression_opts)
        results.to_csv(model + "/../" + "GridSearchCV_results.csv",
                       index=False,
                       sep='\t',
                       encoding='utf-8')

        #plt.figure(figsize=(12, 4))
        #for score in ['mean_test_recall', 'mean_test_precision', 'mean_test_min_both']:
        #    plt.plot([_[1] for _ in results['param_class_weight']], results[score], label=score)
        #plt.legend();

        #plt.figure(figsize=(12, 4))
        #for score in ['mean_train_recall', 'mean_train_precision', 'mean_test_min_both']:
        #    plt.scatter(x=[_[1] for _ in results['param_class_weight']], y=results[score.replace('test', 'train')], label=score)
        #plt.legend();

        # summarize results
        print("Best: %f using %s" % (gs.best_score_, gs.best_params_))
        means = gs.cv_results_['mean_test_score']
        stds = gs.cv_results_['std_test_score']
        params = gs.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, param))

        # Perform prediction
        y_regr = utils.predict(x_test, gs, outfile)

        # Compute the scores
        utils.scores(input_scaler, output_scaler, x_train, y_train, x_test,
                     y_test, model, gs, outfile)

        # Transform back
        x_train, x_test, y_train, y_test, y_regr = utils.scale_back_dataset(
            x_train, x_test, y_train, y_test, y_regr, input_scaler,
            output_scaler)

        # Make figures
        utils.draw_plot(x_test, y_test, y_regr, figure, data)

        # save the model to disk
        dump(gs, model + "/model_MO_" + data + '.sav')
Example #8
def main():

    parser = argparse.ArgumentParser(description='relaxation terms regression')

    #    parser.add_argument('-p', '--process', type=str,
    #                        choices=["shear", "bulk", "conductivity", "thermal_diffusion", "mass_diffusion"],
    #                        default="shear,bulk,conductivity,thermal_diffusion,mass_diffusion",
    #                        help='Comma-separated names of transport properties whose regression is performed')

    parser.add_argument('-a',
                        '--algorithm',
                        type=str,
                        choices=[
                            'DT', 'RF', 'ET', 'GP', 'KN', 'SVM', 'KR', 'GB',
                            'HGB', 'MLP'
                        ],
                        default='DT',
                        help='regression algorithm')

    args = parser.parse_args()

    #    process   = args.process.split(',')
    #    print("Process: ", colored(process[0], 'green'))

    algorithm = args.algorithm.split(',')
    print("Algorithm: ", colored(algorithm[0], 'blue'))

    src_dir = "."
    print("SRC: ", colored(src_dir, 'yellow'))

    output_dir = src_dir + "/.."
    print("OUTPUT: ", colored(output_dir, 'red'))

    n_jobs = 2

    # Import database
    dataset = np.loadtxt("../data/transposed_reshaped_data.txt")
    #   with open('../data/TCs_air5.txt') as f:
    #       lines = (line for line in f if not line.startswith('#'))
    #       dataset = np.loadtxt(lines, skiprows=1)

    print(dataset.shape)

    #    if (process[0] == "shear"):
    #        x = dataset[:,0:7] # T, P, x_N2, x_O2, x_NO, x_N, x_O
    #        y = dataset[:,7:8] # shear viscosity
    #    elif (process[0] == "bulk"):
    #        x = dataset[:,0:7] # T, P, x_N2, x_O2, x_NO, x_N, x_O
    #        y = dataset[:,8:9] # bulk viscosity
    #    elif (process[0] == "conductivity"):
    #        x = dataset[:,0:7] # T, P, x_N2, x_O2, x_NO, x_N, x_O
    #        y = dataset[:,9:10]# thermal conductivity
    #    elif (process[0] == "thermal_diffusion"):
    #        x = dataset[:,0:7] # T, P, x_N2, x_O2, x_NO, x_N, x_O
    #        y = dataset[:,10:] # thermal diffusion, D_Ti
    #    elif (process[0] == "mass_diffusion"):
    #        x = dataset[:,0:7] # T, P, x_N2, x_O2, x_NO, x_N, x_O
    #        y = dataset[:,:]   # mass diffusion TODO

    x = dataset[:, 0:50]  # ni_n[47], na_n[1], V, T
    y = dataset[:, 50:]  # RD_mol[47], RD_at[1]

    print(x.shape)
    print(y.shape)

    print("### Phase 1: PRE_PROCESSING ###")
    ########################################

    # 1.0) create directory tree
    model, scaler, figure = utils.mk_tree(algorithm[0], output_dir)

    # 1.1) train/test split dataset
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        train_size=0.75,
                                                        test_size=0.25,
                                                        random_state=69)

    # 1.2) scale data and save scalers
    sc_x = StandardScaler()
    sc_y = StandardScaler()

    sc_x.fit(x_train)
    x_train = sc_x.transform(x_train)
    x_test = sc_x.transform(x_test)

    sc_y.fit(y_train)
    y_train = sc_y.transform(y_train)
    y_test = sc_y.transform(y_test)

    print('Training Features Shape:', x_train.shape)
    print('Training Labels Shape:', y_train.shape)
    print('Testing Features Shape:', x_test.shape)
    print('Testing Labels Shape:', y_test.shape)

    dump(sc_x, open(scaler + "/scaler_x.pkl", 'wb'))
    dump(sc_y, open(scaler + "/scaler_y.pkl", 'wb'))

    print("### Phase 2: PROCESSING ###")
    ####################################

    # 2.0) estimator selection
    if (algorithm[0] == 'DT'):
        est, hyper_params = estimators.est_DT()

    elif (algorithm[0] == 'ET'):
        est, hyper_params = estimators.est_ET()

    elif (algorithm[0] == 'SVM'):
        est, hyper_params = estimators.est_SVM()

    elif (algorithm[0] == 'KR'):
        est, hyper_params = estimators.est_KR()

    elif (algorithm[0] == 'KN'):
        est, hyper_params = estimators.est_KN()

    elif (algorithm[0] == 'MLP'):
        est, hyper_params = estimators.est_MLP()

    elif (algorithm[0] == 'GB'):
        est, hyper_params = estimators.est_GB()

    elif (algorithm[0] == 'HGB'):
        est, hyper_params = estimators.est_HGB()

    elif (algorithm[0] == 'RF'):
        est, hyper_params = estimators.est_RF()

    elif (algorithm[0] == 'GP'):
        # 'GP' is listed in --algorithm choices but had no branch; the duplicated
        # 'GB' branch here is assumed to be a typo for 'GP'.
        est, hyper_params = estimators.est_GP()

    else:
        print("Algorithm not implemented ...")
        return

    # 2.1) search for best hyper-parameters combination
    # Exhaustive search over specified parameter values for the estimator
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    gs = GridSearchCV(est,
                      cv=10,
                      param_grid=hyper_params,
                      verbose=2,
                      n_jobs=n_jobs,
                      scoring='r2',
                      refit=True,
                      pre_dispatch='n_jobs',
                      error_score=np.nan,
                      return_train_score=True)

    # Randomized search on hyper parameters
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV
    # class sklearn.model_selection.RandomizedSearchCV(estimator, param_distributions, *, n_iter=10, scoring=None, n_jobs=None, refit=True,
    #                                                  cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score=nan,
    #                                                  return_train_score=False)
    #gs = RandomizedSearchCV(est, cv=10, n_iter=10, param_distributions=hyper_params, verbose=2, n_jobs=n_jobs, scoring='r2',
    #                        refit=True, pre_dispatch='n_jobs', error_score=np.nan, return_train_score=True)

    # 2.2) training
    utils.fit(x_train, y_train, gs)

    # 2.3) prediction
    y_regr = utils.predict(x_test, gs)

    print("### Phase 3: POST-PROCESSING ###")
    #########################################

    # 3.0) save best hyper-parameters
    results = pd.DataFrame(gs.cv_results_)
    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
    #compression_opts = dict(method='zip', archive_name='GridSearchCV_results.csv')
    #results.to_csv('GridSearchCV_results.zip', index=False, compression=compression_opts)
    results.to_csv(model + "/../" + "GridSearchCV_results.csv",
                   index=False,
                   sep='\t',
                   encoding='utf-8')

    # results print screen
    print("Best: %f using %s" % (gs.best_score_, gs.best_params_))
    means = gs.cv_results_['mean_test_score']
    stds = gs.cv_results_['std_test_score']
    params = gs.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

    # 3.1) compute score metrics
    utils.scores(sc_x, sc_y, x_train, y_train, x_test, y_test, model, gs)

    # 3.2) back to original values (unscaling)
    x_test_dim = sc_x.inverse_transform(x_test)
    y_test_dim = sc_y.inverse_transform(y_test)
    y_regr_dim = sc_y.inverse_transform(y_regr)

    # 3.3) make plots
    utils.draw_plot(x_test_dim, y_test_dim, y_regr_dim, figure)

    # 3.4) save model to disk
    dump(gs, model + "/model.sav")