Пример #1
0
    def _check_coords_for_distance_weighting(self, coords, check_radius, check_weights, X, y_mean):
        """
        Return the positions in ``coords`` that are safe for distance weighting.

        A throwaway ``RadiusNeighborsRegressor`` is fitted on ``(X, y_mean)``
        and asked to predict each coordinate on its own; coordinates whose
        prediction raises ``ZeroDivisionError`` are left out of the result.

        :param coords: indexable collection of query points
        :param check_radius: value passed as ``radius`` to the regressor
        :param check_weights: value passed as ``weights`` to the regressor
        :param X: training samples the regressor is fitted on
        :param y_mean: training targets the regressor is fitted on
        :return: list of integer positions in ``coords`` that predicted cleanly
        """
        valid_inds = []
        # Nothing to probe: return before fitting, as the per-coordinate
        # version never touched the regressor for empty input either.
        if len(coords) == 0:
            return valid_inds

        # The fitted model does not depend on the coordinate being probed, so
        # build and fit it once instead of once per coordinate.
        temp = RadiusNeighborsRegressor(radius=check_radius, weights=check_weights)
        temp.fit(X, y_mean)

        # ``range`` instead of ``xrange`` so the method also runs on Python 3.
        for coord in range(len(coords)):
            try:
                temp.predict([coords[coord]])
            except ZeroDivisionError:
                continue
            valid_inds.append(coord)
        return valid_inds
Пример #2
0
def get_author_list_with_pruning_method(feature_list, author_list, qp, radius):
    """
        Look up the training points that lie within ``radius`` of a query point.

        feature_list - stylometric feature vectors, one per paragraph
        author_list - author of each paragraph, aligned with feature_list
        qp - the query point(s), mostly representing a document
        radius - neighbourhood radius handed to the regressor

        The neighbourhood is meant to prune the training set: data points too
        far from the query point can be dropped before the (expensive)
        Hausdorff distance is calculated, which speeds up the process.

        Returns whatever ``radius_neighbors(qp, return_distance=True)`` yields
        for a brute-force, Euclidean (p=2) RadiusNeighborsRegressor fitted on
        the given features and authors.

        Please refer to the following link for more information
        http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.RadiusNeighborsRegressor.html#sklearn.neighbors.RadiusNeighborsRegressor
    """
    pruner = RadiusNeighborsRegressor(radius=radius, algorithm='brute', p=2)
    pruner.fit(feature_list, author_list)
    neighbourhood = pruner.radius_neighbors(qp, return_distance=True)
    return neighbourhood
Пример #3
0
    def _check_coords_for_distance_weighting(self, coords, check_radius,
                                             check_weights, X, y_mean):
        """
        Return the positions in ``coords`` that are safe for distance weighting.

        A throwaway ``RadiusNeighborsRegressor`` is fitted on ``(X, y_mean)``
        and asked to predict each coordinate on its own; coordinates whose
        prediction raises ``ZeroDivisionError`` are left out of the result.

        :param coords: indexable collection of query points
        :param check_radius: value passed as ``radius`` to the regressor
        :param check_weights: value passed as ``weights`` to the regressor
        :param X: training samples the regressor is fitted on
        :param y_mean: training targets the regressor is fitted on
        :return: list of integer positions in ``coords`` that predicted cleanly
        """
        valid_inds = []
        # Nothing to probe: return before fitting, as the per-coordinate
        # version never touched the regressor for empty input either.
        if len(coords) == 0:
            return valid_inds

        # The fitted model does not depend on the coordinate being probed, so
        # build and fit it once instead of once per coordinate.
        temp = RadiusNeighborsRegressor(radius=check_radius,
                                        weights=check_weights)
        temp.fit(X, y_mean)

        # ``range`` instead of ``xrange`` so the method also runs on Python 3.
        for coord in range(len(coords)):
            try:
                temp.predict([coords[coord]])
            except ZeroDivisionError:
                continue
            valid_inds.append(coord)
        return valid_inds
Пример #4
0
    def predict(self):
        """
         Fit a radius-neighbours regressor and score it on the test split.

         A scikit-learn (https://scikit-learn.org) RadiusNeighborsRegressor is
         created with the radius read from ``get_ohe_config().rnr_radius`` and
         trained on ``self.X_train`` / ``self.y_train``.

         The trained regressor then predicts ``self.X_test``; the predictions
         are compared with ``self.y_test`` via
         ``OneHotPredictor.get_accuracy``.

         The resulting accuracy is stored in ``self.acc`` and returned.
         """
        radius = get_ohe_config().rnr_radius
        regressor = RadiusNeighborsRegressor(radius=radius)
        regressor.fit(self.X_train, self.y_train)

        predicted = list(regressor.predict(self.X_test))
        self.acc = OneHotPredictor.get_accuracy(predicted, self.y_test)
        return self.acc
Пример #5
0
def compare_multiple_stacks(folder):
    """
    Plot, for each of three channels, one smoothed curve per image stack.

    Every entry of ``folder`` is loaded with ``load_images``. For each channel
    index 0..2 and each stack, the plain mean over axis 0 is compared with the
    mean of the 2-sigma-clipped stack; the relative difference (times 1E3) is
    smoothed against the plain mean with a uniform radius-50 regressor and
    drawn as one labelled line. One figure is shown per channel.

    :param folder: directory whose entries are each passed to ``load_images``
    """
    stacks = [
        load_images(os.path.join(folder, entry))
        for entry in tqdm.tqdm(os.listdir(folder))
    ]
    all_data = np.array(stacks)
    print(all_data.shape)

    stack_count = all_data.shape[0]
    stride = 1  # sampling step over the flattened pixels; 1 keeps them all

    for channel in range(3):
        for stack_number in range(stack_count):
            channel_stack = all_data[stack_number][:, :, :, channel]

            plain_mean = np.mean(channel_stack, axis=0)
            clipped_stack = astropy.stats.sigma_clip(channel_stack,
                                                     sigma=2,
                                                     axis=0)
            clipped_mean = np.mean(clipped_stack, axis=0)

            # relative difference between the two means, scaled by 1E3
            ratio_image = (plain_mean / clipped_mean - 1) * 1E3
            ratio_samples = ratio_image.flatten()[::stride]
            brightness_samples = plain_mean.flatten()[::stride]

            smoother = RadiusNeighborsRegressor(radius=50, weights='uniform')
            smoother.fit(np.expand_dims(brightness_samples, axis=1),
                         ratio_samples.flatten())

            # evaluate the smoothed curve 200 units inside the observed range
            lower = np.min(brightness_samples) + 200
            upper = np.max(brightness_samples) + 1 - 200
            x = np.arange(lower, upper, 10)
            smoothed = smoother.predict(np.expand_dims(x, axis=1))
            plt.plot(x, smoothed, label=str(stack_number))

        plt.legend()
        plt.grid(True)
        plt.show()
Пример #6
0
def compare_error_vs_brightness(folder):
    """
    Show, per channel, how mean and sigma-clipped mean differ with brightness.

    The stack returned by ``load_images(folder)`` is split along its last
    axis. For every channel the relative difference (times 1E3) between the
    plain mean and the 2-sigma-clipped mean over axis 0 is scattered against
    the plain mean, together with two red envelope curves and a green
    distance-weighted radius-50 regression line. One figure is shown per
    channel.

    :param folder: location passed to ``load_images``
    """
    data = load_images(folder)

    for channel in range(data.shape[3]):
        channel_stack = data[:, :, :, channel]
        frame_count = channel_stack.shape[0]

        img_mean = np.mean(channel_stack, axis=0)
        clipped_stack = astropy.stats.sigma_clip(channel_stack,
                                                 sigma=2,
                                                 axis=0)
        img_sigma_clip = np.mean(clipped_stack, axis=0)
        img_sigma_ratio = (img_mean / img_sigma_clip - 1) * 1E3

        x = np.arange(np.min(img_mean), np.max(img_mean) + 1)

        # channel 1 uses half the offset of the other channels
        if channel == 1:
            bit_flip_change = 128
        else:
            bit_flip_change = 256

        # envelope: ratio obtained when the summed value moves by the offset
        y_top = ((frame_count * x) /
                 (frame_count * x - bit_flip_change) - 1) * 1E3
        y_bottom = ((frame_count * x) /
                    (frame_count * x + bit_flip_change) - 1) * 1E3
        plt.plot(x, y_top, 'r')
        plt.plot(x, y_bottom, 'r')

        flat_mean = img_mean.flatten()
        flat_ratio = img_sigma_ratio.flatten()
        plt.scatter(flat_mean, flat_ratio, alpha=0.1, color='black', s=1)

        rnr = RadiusNeighborsRegressor(radius=50, weights='distance')
        rnr.fit(np.expand_dims(img_mean.flatten(), axis=1),
                img_sigma_ratio.flatten())

        # the regression line is evaluated on the same grid as the envelope
        x = np.arange(np.min(img_mean), np.max(img_mean) + 1)
        plt.plot(x, rnr.predict(np.expand_dims(x, axis=1)), 'g')

        plt.grid(True)
        plt.show()
def grid_points_2d(mesh, cell_size=10):
    """
    Drape a regular grid over ``mesh`` and interpolate an elevation per cell.

    A grid is built with ``vtk_Voxel.from_mesh(mesh, cell_size, 2)`` and the
    third coordinate of ``mesh.points`` is regressed onto the grid cell
    centres from the first two coordinates, using a distance-weighted
    ``RadiusNeighborsRegressor``. The search radius starts at
    ``0.75 * cell_size`` and grows by 50% per pass until every cell has a
    value; cells that already have a value keep it.

    :param mesh: object exposing ``points`` as an (n, 3)-indexable array
    :param cell_size: grid cell size handed to ``vtk_Voxel.from_mesh``
    :return: the extracted surface with its point z-coordinates set to the
        interpolated 'Elevation' values
    """
    grid = vtk_Voxel.from_mesh(mesh, cell_size, 2)

    cells = grid.cell_centers().points

    radius = cell_size * 0.5
    tmat = np.full(cells.shape[0], np.nan)
    print("sample min", np.min(mesh.points[:, 2]), "max",
          np.max(mesh.points[:, 2]))
    while np.any(np.isnan(tmat)):
        # keep increasing radius until all cells have values
        radius *= 1.5
        print("RadiusNeighborsRegressor =", radius, "m")
        # Keyword arguments: ``weights`` is keyword-only in current
        # scikit-learn, so the positional form raises a TypeError there.
        neigh = RadiusNeighborsRegressor(radius=radius, weights='distance')
        neigh.fit(mesh.points[:, :2], mesh.points[:, 2])
        rmat = neigh.predict(cells[:, :2])
        # only fill the cells that are still missing a value
        np.putmask(tmat, np.isnan(tmat), rmat)
    print("regression min", np.min(tmat), "max", np.max(tmat))
    grid.cell_arrays['Elevation'] = tmat
    surf = grid.extract_surface()
    # presumably converts cell data to point data - TODO confirm against the
    # mesh library in use
    surf = surf.ctp()
    surf.points[:, 2] = surf.point_arrays['Elevation']

    return surf
def powerproduction():
    """
    Predict power output for the wind speed submitted in a POST form.

    For POST requests the 'speed' form field is parsed as a float, the rows of
    "powerproduction.csv" with non-zero power are sorted by speed, and a
    distance-weighted radius-neighbours regressor (radius 1.7, p=2) fitted on
    speed -> power predicts the value for the submitted speed.

    :return: ``{'value': <prediction>}`` for POST requests, otherwise None
    """
    if fl.request.method != "POST":
        return None

    speed = float(fl.request.form['speed'])

    # load the data set, discard zero-power rows and order by speed
    frame = pd.read_csv("powerproduction.csv")
    frame = frame[frame.power != 0]
    frame = frame.sort_values('speed')

    # scikit-learn expects a 2-D feature matrix, hence the reshape
    speeds = frame['speed'].to_numpy().reshape(-1, 1)
    powers = frame['power'].to_numpy()

    model = RadiusNeighborsRegressor(radius=1.7, weights='distance', p = 2)
    model.fit(speeds, powers)

    prediction = model.predict([[speed]])
    return {'value': prediction[0]}
Пример #9
0
def process_data(input_true, input_reco, segment_label, group_label):
    """
    Match reconstructed voxels to true voxels and build blurred energy targets.

    arguments are Nx5 from processing data
    input_true: energy depositions
    input_reco: charge depositions
    segment_label: fivetypes label
    group_label: particle instance

    In every array the first three columns are compared as the voxel
    position, column -2 (equivalently column 3 for five columns) is compared
    as the batch id and the last column carries the value (energy, charge,
    label or group id).

    The purpose is to find the M reco voxels that also exist in input_true
    (i.e. non-ghost voxels) and to set target energies for them:

    * if the module-level flag ADD_MISSING_ENERGY is set, the energy of every
      true voxel that was not reconstructed is added to the nearest
      reconstructed voxel of the same group (or, for voxels without a group
      entry, to the nearest reconstructed voxel of the same batch);
    * if the module-level flag BLUR_ENERGY is set, energies are smoothed per
      group with a radius-3 RadiusNeighborsRegressor and rescaled so that the
      group's total energy is preserved.

    returns None when no reco voxel matches a true voxel, otherwise a tuple:
    element 0: [M rows] columns of input_reco + 5 one-hot segment columns
               + 1 blurred energy target column
    element 1: [size M] group label of voxel
    element 2: [size M] indices in input_true of voxels that have been reconstructed
    element 3: [size Mx5] input_true intersection with reco, where the last element in each row is blurred energy

    NOTE(review): an earlier version of this docstring described element 0 as
    Mx12 with 6 one-hot columns (fivetypes + ghost); the code allocates 5
    one-hot columns - confirm which is intended.
    """
    chosen_indices = []
    chosen_reco_indices = []

    current_batch = 0
    current_batch_selection = np.where(input_true[:, -2] == current_batch)[0]
    current_input_true = input_true[current_batch_selection]
    for r in range(len(input_reco)):
        row = input_reco[r]
        b = row[-2]
        if b != current_batch:
            current_batch = b
            current_batch_selection = np.where(
                input_true[:, -2] == current_batch)[0]
            # Refresh the cached rows together with the index selection;
            # otherwise every later batch is matched against the rows of the
            # first batch and mapped back through the wrong indices.
            current_input_true = input_true[current_batch_selection]
        pos = row[:3]
        # cheap pre-filter on the first two coordinates before the exact test
        region_selection = np.where((current_input_true[:, 0] == pos[0])
                                    & (current_input_true[:, 1] == pos[1]))[0]
        input_true_region = current_input_true[region_selection]
        for i in range(len(input_true_region)):
            row2 = input_true_region[i]
            pos2 = row2[:3]
            if np.array_equal(pos, pos2):
                # translate the region-local index back to an input_true index
                chosen_indices.append(
                    current_batch_selection[region_selection[i]])
                chosen_reco_indices.append(r)
                break

    if len(chosen_indices) == 0:
        return None

    chosen_indices = np.array(chosen_indices)
    chosen_reco_indices = np.array(chosen_reco_indices)

    # true voxels without / with a reconstructed counterpart (both are copies)
    lost_data = np.delete(input_true, chosen_indices, axis=0)
    found_data = input_true[chosen_indices]

    # find where the chosen indices are in the group data
    # (rows stay filled with -1 where no entry applies)
    lost_group_data = -np.ones((len(lost_data), len(lost_data[0])))
    ungrouped_data = -np.ones((len(lost_data), len(lost_data[0])))
    found_group_data = -np.ones((len(found_data), len(found_data[0])))
    for i in range(len(lost_data)):
        row = lost_data[i]
        filter0 = group_label[np.where(group_label[:, -2] == row[-2])]
        filter1 = filter0[np.where(filter0[:, 0] == row[0])]
        filter2 = filter1[np.where(filter1[:, 1] == row[1])]
        filter3 = filter2[np.where(filter2[:, 2] == row[2])]
        if len(filter3) == 0:
            ungrouped_data[i] = row
        else:
            g = filter3[0]
            lost_group_data[i] = g
    for i in range(len(found_data)):
        row = found_data[i]
        filter0 = group_label[np.where(group_label[:, -2] == row[-2])]
        filter1 = filter0[np.where(filter0[:, 0] == row[0])]
        filter2 = filter1[np.where(filter1[:, 1] == row[1])]
        filter3 = filter2[np.where(filter2[:, 2] == row[2])]
        # raises IndexError if a found voxel has no entry in group_label
        g = filter3[0]
        found_group_data[i] = g

    if ADD_MISSING_ENERGY:
        batches = np.unique(input_true[:, 3])
        for b in batches:
            # nearest neighbor assignment within group
            found_groups = np.unique(
                found_group_data[np.where(found_group_data[:, 3] == b)][:, -1])
            lost_batch_mask = lost_group_data[:, 3] == b
            found_batch_mask = found_group_data[:, 3] == b
            for g in found_groups:
                lost_selection = np.where(lost_batch_mask
                                          & (lost_group_data[:, -1] == g))[0]
                found_selection = np.where(found_batch_mask
                                           & (found_group_data[:, -1] == g))[0]
                ldata = lost_data[lost_selection]
                fdata = found_data[found_selection]
                lost_positions = ldata[:, :3]
                found_positions = fdata[:, :3]
                distances = distance_matrix(lost_positions, found_positions)
                closest_points = np.argmin(distances, axis=1)
                closest_energies = ldata[:, -1]
                for i in range(len(closest_points)):
                    # found_data[<int>] is a row view, so this updates in place
                    found_data[found_selection[
                        closest_points[i]]][-1] += closest_energies[i]

            # associated ungrouped voxels with nearest voxels, regardless of group
            lost_ungrouped = np.where((ungrouped_data[:, 3] == b))[0]
            if len(lost_ungrouped) > 0:
                found_selection = np.where(found_batch_mask)[0]
                ldata = lost_data[lost_ungrouped]
                fdata = found_data[found_selection]
                lost_positions = ldata[:, :3]
                found_positions = fdata[:, :3]
                distances = distance_matrix(lost_positions, found_positions)
                closest_points = np.argmin(distances, axis=1)
                closest_energies = ldata[:, -1]
                for i in range(len(closest_points)):
                    found_data[found_selection[
                        closest_points[i]]][-1] += closest_energies[i]

    if BLUR_ENERGY:
        blur_kernel = 3
        for g in np.unique(found_group_data[:, -1]):
            inds = np.where(found_group_data[:, -1] == g)
            selection = found_data[inds]
            total_energy = np.sum(selection[:, -1])

            coords = selection[:, :3]
            energies = selection[:, -1]
            neigh = RadiusNeighborsRegressor(radius=blur_kernel)
            neigh.fit(coords, energies)
            selection[:, -1] = neigh.predict(coords)
            # rescale so the group's summed energy is unchanged by the blur
            selection[:, -1] *= total_energy / np.sum(selection[:, -1])
            found_data[inds, -1] = selection[:, -1]

    segment_indices = segment_label[chosen_indices, -1].astype(int)
    segment_one_hot = np.zeros((len(segment_indices), 5))
    segment_one_hot[np.arange(len(segment_indices)), segment_indices] = 1
    out = np.concatenate((input_reco[chosen_reco_indices], segment_one_hot,
                          np.expand_dims(found_data[:, -1], axis=1)),
                         axis=1)
    return np.array(out), found_group_data[:, -1], chosen_indices, found_data
Пример #10
0
    ax.set_xticks(list(ax.get_xticks()) + [best_radius])
    ax.plot(radii, mae_rnn, c='orange', linewidth=2)
    fig.savefig('rnn_param.png')

    return best_radius


# Fit three regressors on the same two features ('temperatura', 'vacuo') to
# predict 'energia', then prepare a bar chart comparing their errors.

# k-nearest-neighbours, distance weighted; n_neighbors comes from
# get_best_knn_n_neighbors(1, 100) - presumably a search over that range,
# verify against the helper.
knn_regressor = KNeighborsRegressor(n_neighbors=get_best_knn_n_neighbors(
    1, 100),
                                    weights='distance')
knn_regressor.fit(train_df[['temperatura', 'vacuo']], train_df[['energia']])

# radius-neighbours, distance weighted; radius comes from
# get_best_rnn_radius(1.7, 3.0, 0.05) - presumably (start, stop, step) of the
# search, verify against the helper.
rnn_regressor = RadiusNeighborsRegressor(radius=get_best_rnn_radius(
    1.7, 3.0, 0.05),
                                         weights='distance')
rnn_regressor.fit(train_df[['temperatura', 'vacuo']], train_df[['energia']])

# ordinary least squares baseline
lr_regressor = LinearRegression()
lr_regressor.fit(train_df[['temperatura', 'vacuo']], train_df[['energia']])

# predictions of each model on the held-out frame
energia_knn = knn_regressor.predict(test_df[['temperatura', 'vacuo']])
energia_rnn = rnn_regressor.predict(test_df[['temperatura', 'vacuo']])
energia_lr = lr_regressor.predict(test_df[['temperatura', 'vacuo']])

# figure set-up for the comparison chart
fig, ax = plt.subplots()
ax.set_title('Evaluation of regression algorithms')
ax.set_ylabel('Mean absolute error')
ax.set_ylim(0, 5)
ax.set_yticks(np.arange(0, 5, 1.5))

rects = ax.bar(x=['kNN', 'rNN', 'LR'],
Пример #11
0
print(code_optimisation())

##############################
# Building the model
# ++++++++++++++++++

# The ONNX file is only generated when it does not exist yet.
filename = "onnx_to_profile.onnx"

if not os.path.exists(filename):
    print(f"Generate a graph for {filename!r}.")
    # 1000 random samples with 10 float64 features; the target is the row sum
    X = numpy.random.randn(1000, 10).astype(numpy.float64)
    y = X.sum(axis=1).reshape((-1, 1))

    model = RadiusNeighborsRegressor()
    model.fit(X, y)
    # convert the fitted model to ONNX with the 'optim': 'cdist' option
    onx = to_onnx(model, X, options={'optim': 'cdist'})

    with open(filename, "wb") as f:
        f.write(onx.SerializeToString())

#####################################
# Functions
# +++++++++
#
# We need to generate random inputs to test the graph.


def random_input(typ, shape, batch):
    if typ == 'tensor(double)':
        dtype = numpy.float64
Пример #12
0
y_pred = np.zeros(len(y), dtype=y.dtype) # where we'll accumulate predictions

clf = RadiusNeighborsRegressor(radius=15)




# CV Loop
for train_index, test_index in kf:
    # for each iteration of the for loop we'll do a test train split
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    t = StandardScaler()
    X_train = t.fit_transform(X_train)
    clf.fit(X_train, y_train) # Train clf_1 on the training data

    X_test = t.transform(X_test)
    y_pred[test_index] = clf.predict(X_test) # Predict clf_1 using the test and store in y_pred

r2_score(y, y_pred)
rmse = sqrt(mean_squared_error(y, y_pred))

print "RadiusNeighborsRegressor RMSE: " , rmse


### Prediction ###
result = clf.predict(test_feats)
result = np.asarray(result)
np.savetxt("result.csv", result, delimiter=",")
Пример #13
0
def process_data(data):
    """
    Match reconstructed voxels to true voxels and build blurred energy targets.

    data: mapping with the keys 'input_true', 'input_reco', 'segment_label'
        and 'group_label'. In every array the first three columns are
        compared as the voxel position, column -2 (also addressed as column 3)
        as the batch id, and the last column carries the value.

    If the module-level flag ADD_MISSING_ENERGY is set, the energy of true
    voxels without a reconstructed counterpart is added to the nearest
    reconstructed voxel (same group, or same batch for ungrouped voxels).
    If the module-level flag BLUR_ENERGY is set, energies are smoothed per
    group with a radius-3 RadiusNeighborsRegressor and rescaled so the group
    total is preserved.

    returns: None when no reco voxel matches a true voxel, otherwise
        (input, group)
        input: intersection between reco and true - the input_reco columns,
            5 one-hot segment columns and the adjusted energy deposition
        group: group label of every row of input
    """
    input_true = data['input_true']
    input_reco = data['input_reco']
    segment_label = data['segment_label']
    group_label = data['group_label']

    chosen_indices = []
    chosen_reco_indices = []

    current_batch = 0
    current_batch_selection = np.where(input_true[:, -2] == current_batch)[0]
    current_input_true = input_true[current_batch_selection]
    for r in range(len(input_reco)):
        row = input_reco[r]
        b = row[-2]
        if b != current_batch:
            current_batch = b
            current_batch_selection = np.where(
                input_true[:, -2] == current_batch)[0]
            # Refresh the cached rows together with the index selection;
            # otherwise every later batch is matched against the rows of the
            # first batch and mapped back through the wrong indices.
            current_input_true = input_true[current_batch_selection]
        pos = row[:3]
        # cheap pre-filter on the first two coordinates before the exact test
        region_selection = np.where((current_input_true[:, 0] == pos[0])
                                    & (current_input_true[:, 1] == pos[1]))[0]
        input_true_region = current_input_true[region_selection]
        for i in range(len(input_true_region)):
            row2 = input_true_region[i]
            pos2 = row2[:3]
            if np.array_equal(pos, pos2):
                # translate the region-local index back to an input_true index
                chosen_indices.append(
                    current_batch_selection[region_selection[i]])
                chosen_reco_indices.append(r)
                break

    if len(chosen_indices) == 0:
        return None

    chosen_indices = np.array(chosen_indices)
    chosen_reco_indices = np.array(chosen_reco_indices)

    # true voxels without / with a reconstructed counterpart (both are copies)
    lost_data = np.delete(input_true, chosen_indices, axis=0)
    found_data = input_true[chosen_indices]

    # find where the chosen indices are in the group data
    # (rows stay filled with -1 where no entry applies)
    lost_group_data = -np.ones((len(lost_data), len(lost_data[0])))
    ungrouped_data = -np.ones((len(lost_data), len(lost_data[0])))
    found_group_data = -np.ones((len(found_data), len(found_data[0])))
    for i in range(len(lost_data)):
        row = lost_data[i]
        filter0 = group_label[np.where(group_label[:, -2] == row[-2])]
        filter1 = filter0[np.where(filter0[:, 0] == row[0])]
        filter2 = filter1[np.where(filter1[:, 1] == row[1])]
        filter3 = filter2[np.where(filter2[:, 2] == row[2])]
        if len(filter3) == 0:
            ungrouped_data[i] = row
        else:
            g = filter3[0]
            lost_group_data[i] = g
    for i in range(len(found_data)):
        row = found_data[i]
        filter0 = group_label[np.where(group_label[:, -2] == row[-2])]
        filter1 = filter0[np.where(filter0[:, 0] == row[0])]
        filter2 = filter1[np.where(filter1[:, 1] == row[1])]
        filter3 = filter2[np.where(filter2[:, 2] == row[2])]
        # raises IndexError if a found voxel has no entry in group_label
        g = filter3[0]
        found_group_data[i] = g

    if ADD_MISSING_ENERGY:
        batches = np.unique(input_true[:, 3])
        for b in batches:
            # nearest neighbor assignment within group
            found_groups = np.unique(
                found_group_data[np.where(found_group_data[:, 3] == b)][:, -1])
            lost_batch_mask = lost_group_data[:, 3] == b
            found_batch_mask = found_group_data[:, 3] == b
            for g in found_groups:
                lost_selection = np.where(lost_batch_mask
                                          & (lost_group_data[:, -1] == g))[0]
                found_selection = np.where(found_batch_mask
                                           & (found_group_data[:, -1] == g))[0]
                ldata = lost_data[lost_selection]
                fdata = found_data[found_selection]
                lost_positions = ldata[:, :3]
                found_positions = fdata[:, :3]
                distances = distance_matrix(lost_positions, found_positions)
                closest_points = np.argmin(distances, axis=1)
                closest_energies = ldata[:, -1]
                for i in range(len(closest_points)):
                    # found_data[<int>] is a row view, so this updates in place
                    found_data[found_selection[
                        closest_points[i]]][-1] += closest_energies[i]

            # associated ungrouped voxels with nearest voxels, regardless of group
            lost_ungrouped = np.where((ungrouped_data[:, 3] == b))[0]
            if len(lost_ungrouped) > 0:
                found_selection = np.where(found_batch_mask)[0]
                ldata = lost_data[lost_ungrouped]
                fdata = found_data[found_selection]
                lost_positions = ldata[:, :3]
                found_positions = fdata[:, :3]
                distances = distance_matrix(lost_positions, found_positions)
                closest_points = np.argmin(distances, axis=1)
                closest_energies = ldata[:, -1]
                for i in range(len(closest_points)):
                    found_data[found_selection[
                        closest_points[i]]][-1] += closest_energies[i]

    if BLUR_ENERGY:
        blur_kernel = 3
        for g in np.unique(found_group_data[:, -1]):
            inds = np.where(found_group_data[:, -1] == g)
            selection = found_data[inds]
            total_energy = np.sum(selection[:, -1])

            coords = selection[:, :3]
            energies = selection[:, -1]
            neigh = RadiusNeighborsRegressor(radius=blur_kernel)
            neigh.fit(coords, energies)
            selection[:, -1] = neigh.predict(coords)
            # rescale so the group's summed energy is unchanged by the blur
            selection[:, -1] *= total_energy / np.sum(selection[:, -1])
            found_data[inds, -1] = selection[:, -1]

    segment_indices = segment_label[chosen_indices, -1].astype(int)
    segment_one_hot = np.zeros((len(segment_indices), 5))
    segment_one_hot[np.arange(len(segment_indices)), segment_indices] = 1
    out = np.concatenate((input_reco[chosen_reco_indices], segment_one_hot,
                          np.expand_dims(found_data[:, -1], axis=1)),
                         axis=1)
    return np.array(out), found_group_data[:, -1]
Пример #14
0
# Notebook-style comparison of k-nearest-neighbours and radius-neighbours
# regression on x_train / y_train (defined outside this fragment). Bare
# expressions such as ``y_predict_knn[0:10]`` only display a value when run
# as notebook cells.
from sklearn.neighbors import KNeighborsRegressor

# uniform-weight kNN with 6 neighbours
KNN_reg = KNeighborsRegressor(n_neighbors=6, weights='uniform')

KNN_reg.fit(x_train, y_train)

y_predict_knn = KNN_reg.predict(x_test)

# peek at the first ten predictions
y_predict_knn[0:10]

from sklearn.neighbors import RadiusNeighborsRegressor

# radius taken from the spread of the training features
# NOTE(review): assumes x_train.std() yields a single number (e.g. a numpy
# array); for a pandas DataFrame it would be per-column - confirm.
RNN_reg = RadiusNeighborsRegressor(radius=x_train.std())

RNN_reg.fit(x_train, y_train)

y_predict_rnn = RNN_reg.predict(x_test)

y_predict_rnn[0:10]

# second attempt with the default radius; this rebinds RNN_reg, while
# y_predict_rnn still holds the predictions of the first model
RNN_reg = RadiusNeighborsRegressor()

RNN_reg.fit(x_train, y_train)

RNN_reg.predict(x_test)

from sklearn.metrics import mean_absolute_error, mean_squared_error

# error of the kNN predictions on the test targets
mean_absolute_error(y_test, y_predict_knn)
Пример #15
0
# Model selection scratchpad: the commented-out lines are alternatives that
# were tried; exactly one ``reg`` assignment is active.
# reg = GradientBoostingRegressor()
# reg = HistGradientBoostingRegressor()
# kernel = DotProduct() + WhiteKernel()
# reg = GaussianProcessRegressor(kernel=kernel, random_state=0) #awful
# reg = LogisticRegression()
# reg = Ridge(alpha=1.0)    # not good
# reg = BayesianRidge()     # not good
# reg = PoissonRegressor()  # not good
# reg = TweedieRegressor()  # not good
# reg = GammaRegressor()    # not good
# reg = MLPRegressor(random_state=0, max_iter=500)  # not good
# reg = DecisionTreeRegressor() # not too great

# reg = KNeighborsRegressor(n_neighbors=5, algorithm="auto", weights="uniform", leaf_size=30)
reg = RadiusNeighborsRegressor(radius=4.3)

# reg = SVR(C=60, gamma='auto')
# print(cross_val_score(reg, X, y, cv=10))

reg.fit(X_train, y_train)
# accuracy = reg.score(X_test, y_test)
# print(accuracy)

# predict over the full feature matrix X (not only X_test) so that the
# predictions can be stored as a column of df
# NOTE(review): assumes X has one row per row of df - confirm
predictions = reg.predict(X)

df['Prediction'] = predictions
# keep only the rows labelled "2015-03-13" and plot actual vs predicted
df = df.loc["2015-03-13"]
df['Total_Feeder'].plot()
df['Prediction'].plot()
plt.show()
Пример #16
0
# Python 2 script fragment (print statements): reports the fitted linear
# model ``lin3``, compares two neighbour-based regressors and builds a 2013
# projection table.
print "Intercept: ", lin3.intercept_
# one coefficient per feature column name in threeYrXcol
for k, v in enumerate(lin3.coef_[0]):
	print threeYrXcol[k], ": ", v

# KNeighborsRegressor
kn3 = KNReg(weights='uniform')
#kn3.fit(df_3avg[threeYrXcol].values, df_3avg[threeYrycol].values)
kn3.fit(X_train, y_train)
print "Train: ", kn3.score(X_train, y_train)
print "Test: ", kn3.score(X_test, y_test)
# print kn3.score(df_3avg[threeYrXcol].values, df_3avg[threeYrycol].values)

# RadiusNeighborsRegressor
rn3 = RNReg(radius=7.0)
#rn3.fit(df_3avg[threeYrXcol].values, df_3avg[threeYrycol].values)
rn3.fit(X_train, y_train)
print "Train: ", rn3.score(X_train, y_train)
print "Test: ", rn3.score(X_test, y_test)
# score on the complete df_3avg data set (training and test rows together)
print rn3.score(df_3avg[threeYrXcol].values, df_3avg[threeYrycol].values)

# Test 2010/11/12 stats and 2013 projections against 2013 actuals
y=2013
y3 = [y-1,y-2,y-3]
# teams present in both the most recent (y-1) and the oldest (y-3) year
tms_include = np.intersect1d(df[df.Year == y3[0]].Team.values, df[df.Year == y3[2]].Team.values)
# left: per-team mean of Xvar over the three years ('_3yr_avg');
# right: per-team mean of Xvar for year y-1 only ('_yr3')
df2012 = pd.merge(df[(df.Year.isin(y3)) & (df.Team.isin(tms_include))].groupby('Team')[Xvar].mean(), df[(df.Year == y3[0]) & (df.Team.isin(tms_include))].groupby('Team')[Xvar].mean(), how='left',left_index=True, right_index=True, suffixes=['_3yr_avg','_yr3'])
df2012['f2013'] = lin3.predict(df2012.values)
# NOTE(review): DataFrame.sort is the old pandas API (later replaced by
# sort_values) - this fragment needs a matching pandas version
df2012.sort('f_yr3', ascending=False, inplace=True)
df2012['rnk_2012'] = range(1,df2012.shape[0]+1)
df2012.sort('f2013', ascending=False, inplace=True)
df2012['rnk_2013'] = range(1,df2012.shape[0]+1)
#df2012.to_csv('f2013_projection_3yrs.csv', headers=True,index=True)
Пример #17
0
# Command-line script: fit a distance-weighted radius-neighbours regressor on
# (lat, lon) -> length * 15 and write predictions next to the actual values.
#
# Usage: <script> TRAIN.h5 TEST.h5 OUTPUT.csv
import sys

import pandas as pd
import numpy as np
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn import cross_validation

# Read the training and test data
df = pd.read_hdf(sys.argv[1])
tdf = pd.read_hdf(sys.argv[2])

# Convert to the numpy arrays used by scikit-learn.
# ``.values`` replaces ``as_matrix``, which newer pandas no longer provides.
X_train = df[['lat', 'lon']].values
y_train = (df.length.values)*15
X_test = tdf[['lat', 'lon']].values
y_test = (tdf.length.values)*15

# index labels of the test rows, used as the key of the result table
id_test = tdf.index.to_series().values

# Initialise the model
model = RadiusNeighborsRegressor(radius=0.0005, weights='distance')

# Training
model.fit(X_train, y_train)

# Prediction
y_try = model.predict(X_test)

# Write the results
resdf = pd.DataFrame({'idx': id_test, 'predict': (y_try), 'actual': (y_test)}).set_index('idx')

resdf.to_csv(sys.argv[3])
Пример #18
0
class Model:
    """
        Train several scikit-learn regressors and compare their predictions.

        Every ``*_Build`` method fits one estimator and stores it on the
        instance; the matching ``*_Predict`` method returns that estimator's
        predictions. ``Algorithm_compare`` runs all of them, plots each
        result against the true labels and prints error metrics.
    """
    def __init__(self):
        pass

    def NN_Build(self, train_x, train_y):
        """
            fit the neural network (one hidden layer of 50 units)
        """
        self.NN = MLPRegressor(hidden_layer_sizes=(50,), activation='relu', solver='adam',
                               alpha=0.0001, batch_size='auto', learning_rate='adaptive', learning_rate_init=0.001)
        self.NN.fit(train_x, train_y)

    def NN_Predict(self, test):
        """
            neural network
            predict the result with the input data
        """
        pre_result = self.NN.predict(test)
        return pre_result

    def DT_Build(self, train_x, train_y):
        """
            decision tree model
        """
        self.DT = tree.DecisionTreeRegressor()
        self.DT.fit(train_x, train_y)

    def DT_Predict(self, test):
        """
            predict the result with the input data
        """
        pre_result = self.DT.predict(test)
        return pre_result

    def SVM_Build(self, train_x, train_y):
        """fit the support vector regressor"""
        self.clf = svm.SVR()
        self.clf.fit(train_x, train_y)

    def SVM_Predict(self, test):
        """predict with the support vector regressor"""
        pre_result = self.clf.predict(test)
        return pre_result

    def KNN_Build(self, train_x, train_y):
        """fit the k-nearest-neighbours regressor (k=2)"""
        self.kneigh = KNeighborsRegressor(n_neighbors=2)
        self.kneigh.fit(train_x, train_y)

    def KNN_Predict(self, test):
        """predict with the k-nearest-neighbours regressor"""
        pre_result = self.kneigh.predict(test)
        return pre_result

    def RNN_Build(self, train_x, train_y):
        """fit the radius-neighbours regressor (radius=1.0)"""
        self.rneigh = RadiusNeighborsRegressor(radius=1.0)
        self.rneigh.fit(train_x, train_y)

    def RNN_Predict(self, test):
        """predict with the radius-neighbours regressor"""
        # must use the radius-neighbours estimator; using ``self.kneigh``
        # here silently returned the kNN predictions a second time
        pre_result = self.rneigh.predict(test)
        return pre_result

    def pre_plot(self, train_y, pre_result, start, end, ti):
        """
            compare the predict_result with the true label

            plots the slices [start:end] of both sequences in one figure
            titled ``ti``
        """
        train_y = pd.DataFrame(train_y[start:end], columns=['train_y'])
        pre_result = pd.DataFrame(pre_result[start:end], columns=['pre_result'])
        result = pd.concat([train_y, pre_result], axis=1)
        result.plot(title=ti)
        plt.show()

    def Get_MSE(self, test_y, pre_result):
        """
            compute MSE between label & pre_result
            MSE = sum(pow(test_y-pre_result,2)) / len(test_y)

            prints the value and also returns it
        """
        diff = [pow(test_y[i] - pre_result[i], 2) for i in range(len(test_y))]
        result = np.sum(diff) / len(test_y)
        # %-formatting yields the same text as the Python 2 print statement
        # and also works as a Python 3 print call
        print('MSE:  %s' % (result,))
        return result

    def Algorithm_compare(self, train_x, train_y, test_x, test_y):
        """plot the results predicted by the models"""
        self.NN_Build(train_x, train_y)
        self.DT_Build(train_x, train_y)
        self.SVM_Build(train_x, train_y)
        self.KNN_Build(train_x, train_y)
        self.RNN_Build(train_x, train_y)
        self.NN_result = self.NN_Predict(test_x)
        self.DT_result = self.DT_Predict(test_x)
        self.SVM_result = self.SVM_Predict(test_x)
        self.KNN_result = self.KNN_Predict(test_x)
        self.RNN_result = self.RNN_Predict(test_x)

        # plot and evaluate every model in the same order as before
        for title, predicted in (('NN_result', self.NN_result),
                                 ('DT_result', self.DT_result),
                                 ('SVM_result', self.SVM_result),
                                 ('KNN_result', self.KNN_result),
                                 ('RNN_result', self.RNN_result)):
            self.pre_plot(test_y, predicted, 0, len(predicted), title)
            self.Evaluate(test_y, predicted)

    def Evaluate(self, test_y, pre_result):
        """print mean squared error, mean absolute error and explained variance"""
        MSE = mean_squared_error(test_y, pre_result)
        MAE = mean_absolute_error(test_y, pre_result)
        EVS = explained_variance_score(test_y, pre_result)
        print('MSE:  %s' % (MSE,))
        print('MAE:  %s' % (MAE,))
        print('EVS:  %s' % (EVS,))
Пример #19
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from functions import errors

# Load the forest-fires table and discard the two calendar columns.
data = pd.read_csv("forestfires.csv")
data = data.drop(labels=['month', 'day'], axis=1)

# 'area' is the regression target; every remaining column is a feature.
y = data.area
x = data.drop(labels=['area'], axis=1)

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=10)

# Radius-neighbours regression with the estimator's default settings.
reg = RadiusNeighborsRegressor()
reg.fit(x_train, y_train)
y_predict = reg.predict(x_test)

# Print each prediction next to the value actually observed.
for predicted, actual in zip(y_predict, y_test.ravel()):
    print("pred %s act %s" % (predicted, actual))

errors(y_test, y_predict)
Пример #20
0
def mydist(x, y):
    """Count the mismatching entries of x and y at positions 0, 1 and 2.

    Each of the three positions contributes 0.0 when the two entries
    compare equal and 1.0 otherwise, so the result is a float between
    0.0 and 3.0. Entries beyond index 2 are ignored.

    NOTE(review): presumably the three positions are the assignment id,
    the week day and the time of a record - confirm against the caller.
    """
    mismatches = 0.
    # Same evaluation order as before: position 0, then 2, then 1.
    for position in (0, 2, 1):
        if not x[position] == y[position]:
            mismatches += 1.
    return mismatches

#dist = neighbors.DistanceMetric.get_metric('pyfunc', func=distance)

# Build the feature table from the four columns listed below and keep only
# the first 1000 rows of the preprocessed data.
preprocessing = fp.feature_preprocessing()
preprocessing.full_preprocess(used_columns=['ASS_ID', 'WEEK_DAY', 'TIME', 'CSPL_RECEIVED_CALLS'])
data = preprocessing.data[:1000]

# 'CSPL_RECEIVED_CALLS' is the regression target; the rest are features.
Y = data['CSPL_RECEIVED_CALLS']
X = data.drop(['CSPL_RECEIVED_CALLS'], axis=1)

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y, test_size=0.1, random_state=0)

# Hand the distance callable straight to `metric`. The former spelling
# (metric='pyfunc' plus an extra func= keyword) depended on the estimator
# accepting arbitrary keyword arguments, which current scikit-learn rejects.
# mydist returns 0, 1, 2 or 3, so radius=0.5 keeps only exact matches.
neigh = RadiusNeighborsRegressor(radius=0.5, metric=mydist, algorithm='auto')
print('fitting...')
neigh.fit(X_train, y_train)
print('fitted')
#error = neigh.score(X_test, y_test)

#print(error)

y_pred = neigh.predict(X_test)

Пример #21
0
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

# --- k-nearest-neighbours baseline ------------------------------------------
print("KNN ...")
knnreg = KNeighborsRegressor(n_neighbors=1)
knnreg.fit(data, Y)
# NOTE(review): the model is scored on the same data it was fitted on, so
# this is a training error, not a generalisation estimate.
y_KNNpred = knnreg.predict(data)
trainrms = sqrt(mean_squared_error(Y, y_KNNpred))
print("KNN PCA : trainrms {}".format(trainrms))

# --- radius-neighbours regressor, distance-weighted -------------------------
print("Rad ...")
from sklearn.neighbors import RadiusNeighborsRegressor
radreg = RadiusNeighborsRegressor(weights='distance', radius=10.3)
radreg.fit(data, Y)
# Scored on the training data as well (see the note above).
y_radpred = radreg.predict(data)
trainrms = sqrt(mean_squared_error(Y, y_radpred))
print("Rad PCA : trainrms {}".format(trainrms))

#=============================================================================
# end Feature selection
#=============================================================================

# Instantiate the random forest regressor. The Gaussian-process kernel below
# is left disabled.
#kernel = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))
print("RF ...")
RFregr = RandomForestRegressor(n_estimators=301,
                               random_state=0,
                               oob_score=True)
Пример #22
0
forest = RandomForestRegressor(n_estimators=100, n_jobs=2, oob_score=True)
adaboost = AdaBoostRegressor()
nb = GaussianNB()
rd = RidgeClassifierCV()
kf = KFold(report.shape[0], n_folds=5)

# gdc, rgr, lr and et are estimators created elsewhere in the script.
for train_index, test_index in kf:
    # KFold yields row *positions*, so rows are selected positionally with
    # .iloc. The removed .ix indexer switched to label lookup whenever the
    # index was integer-typed, which picks the wrong rows unless the index
    # happens to be 0..n-1.
    train_rows, test_rows = list(train_index), list(test_index)
    X_train, X_test = variables.iloc[train_rows], variables.iloc[test_rows]
    y_train = report['survey_participant'].iloc[train_rows]
    y_test = report['survey_participant'].iloc[test_rows]

    # Fit every model on this fold, in the same order as before. adaboost,
    # rgr and nb are fitted but not scored below.
    for model in (forest, adaboost, gdc, rd, rgr, nb, lr, et):
        model.fit(X_train, y_train)

    # Report the mean squared error of each scored model on the held-out fold.
    for label, model in (('GDC', gdc), ('RD', rd), ('ET', et), ('LR', lr),
                         ('RFRegressor', forest)):
        y_hat = list(model.predict(X_test))
        mse = sum((y_hat - y_test) ** 2) / float(len(y_test))
        # Same "label value" text as the old Python 2 print statement, but
        # valid on Python 3 too.
        print('%s %s' % (label, mse))
   
Пример #23
0
# Read training dataset
df = pd.read_csv(TRAINING_DATASET, header=None)  # no header row: read from the first line

columns = len(df.columns)
rows = len(df.index)

print('Training dataset: {:,} x {:,}'.format(rows, columns))

# The last column is the target and every column before it is a feature.
# With header=None the column labels are 0..columns-1, so the old label slice
# .ix[:, :columns-2] was inclusive; the positional equivalent is :columns-1.
df_y = df.iloc[:, columns - 1]
df_x = df.iloc[:, :columns - 1]

X = np.array(df_x)
Y = np.array(df_y)

neigh = RadiusNeighborsRegressor(radius=KNN_RADIUS)
neigh.fit(X, Y)

# Read Test dataset
testFiles = [file for file in os.listdir(TEST_DATASET_DIRECTORY) if 'test' in str(file)]
print('Number of test files: {}'.format(len(testFiles)))

TEST_Y_ALL = np.array([])
TEST_Y_ALL_PREDICTED = np.array([])
for file in testFiles:
    df = pd.read_csv(os.path.join(TEST_DATASET_DIRECTORY, file), header=None)  # no header row: read from the first line
    # Same column layout as the training file.
    df_y = df.iloc[:, columns - 1]
    df_x = df.iloc[:, :columns - 1]

    X = np.array(df_x)
    Y = np.array(df_y)
    # NOTE(review): the visible loop body ends here - X and Y are rebuilt for
    # each file, but nothing is predicted and TEST_Y_ALL / TEST_Y_ALL_PREDICTED
    # are never updated. The snippet looks truncated.
	
from data.transformed_data import *
from data.raw_data import data_dir
from sklearn.neighbors import RadiusNeighborsRegressor

# Fit a radius-neighbours regressor with the library defaults.
regressor = RadiusNeighborsRegressor()
regressor.fit(train_x, train_y)

# Score on the validation split with the project's RMSLE helper.
validation_rmse = RMSLE(validation_y, regressor.predict(validation_x))
print('RadiusNeighborsRegressor rmse:{}'.format(validation_rmse))

# Predict for the test rows, undo the log1p scale with expm1 and floor the
# result at zero.
predict = regressor.predict(test[col])
test['visitors'] = np.expm1(predict)
test['visitors'] = test['visitors'].clip(lower=0.)

# Write the 'id' and 'visitors' columns as the submission file.
submission_path = data_dir + 'submission_radius_neighbors_regressor.csv'
test[['id', 'visitors']].to_csv(submission_path, index=False)
Пример #25
0
# Remove intermediate objects from the module namespace, in the same order
# as before (presumably to release memory - confirm). A missing name still
# raises KeyError.
for _stale_name in ('profilesDF', 'profiles', 'profilesLSo', 'profilesLS',
                    'row', 'tmpLS', 'tmpAGE', 'profsTOlikes', 'i', 'tmpIND'):
    del globals()[_stale_name]
del _stale_name

# Seed NumPy's global random generator. seed() returns None, so myRand is
# None as well.
seed = 7
myRand = np.random.seed(seed)

# Hold out 1500 samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    likesMAT, consARR, test_size=1500)

# The neighbourhood radius is the first command-line argument.
myRAD = float(sys.argv[1])
radNN = RadiusNeighborsRegressor(radius=myRAD)

#radNN.fit(likesMAT, consARR)
radNN.fit(X_train, y_train)

y_pred = radNN.predict(X_test)

# Root-mean-squared error on the held-out samples.
import math
held_out_mse = metrics.mean_squared_error(y_test, y_pred)
myRMSE = math.sqrt(held_out_mse)
print("cons, Radius neighbors:  ", str(myRAD), " ", myRMSE)

# joblib.dump(radNN, "/Users/jamster/radNN-A-cons.xz", compress=9)

# impRadNN = joblib.load("/Users/jamster/radNN-A-cons.xz")
Пример #26
0
# Add noise to every fifth target value. The number of random draws is taken
# from the slice itself instead of a hard-coded 8, so any length of y works.
y[::5] += 3 * (0.5 - np.random.rand(len(y[::5])))

# Build the models
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
svr_lin = SVR(kernel='linear', C=1e3)
svr_poly = SVR(kernel='poly', C=1e3, degree=2)
knng = KNeighborsRegressor(n_neighbors=6, weights='uniform')
rng = RadiusNeighborsRegressor(radius=1.0, weights='uniform')
# The default split criterion is squared error. It was called 'mse' before
# scikit-learn 1.0 and 'squared_error' afterwards, so the version-specific
# name is omitted and the default is used.
dtr = DecisionTreeRegressor()
abr = AdaBoostRegressor(n_estimators=50)
rfr = RandomForestRegressor(n_estimators=50)

# Fit every model on the same data, in the original order.
for _model in (svr_rbf, svr_lin, svr_poly, knng, rng, dtr, abr, rfr):
    _model.fit(X, y)

# Support vector regression
y_rbf = svr_rbf.predict(X)
y_lin = svr_lin.predict(X)
y_poly = svr_poly.predict(X)

# Neighbour-based regression (k-nearest and radius)
y_knng = knng.predict(X)
y_rng = rng.predict(X)

# Decision tree regression
y_dtr = dtr.predict(X)

# ensemble