import numpy as np
from sklearn.neighbors import KNeighborsRegressor


def fit_estimator(sup_x, sup_y, unsup_x, n_neighbors=3, metric='minkowski'):
    """Semi-supervised regression via kNN self-training: unlabeled points
    get pseudo-labels from their neighbors, refined over a few rounds."""
    knn = KNeighborsRegressor(n_neighbors=n_neighbors, metric=metric)
    knn.fit(sup_x, sup_y)
    dist, idx = knn.kneighbors(unsup_x)
    label_l = sup_y
    # Initial pseudo-labels: mean of the supervised neighbors' targets.
    label_u = np.zeros((len(unsup_x),))
    for t in range(len(unsup_x)):
        label_u[t] = np.mean(np.array([sup_y[j] for j in idx[t]]).flatten())
    y = np.hstack((label_l, label_u))
    # Refine the pseudo-labels for a fixed number of self-training rounds.
    for _ in range(5):
        label_u = np.zeros((len(unsup_x),))
        for t in range(len(unsup_x)):
            label_u[t] = np.mean(np.array([y[j] for j in idx[t]]).flatten())
        y = np.hstack((label_l, label_u))
    return label_u
import numpy as np
from sklearn.neighbors import KNeighborsRegressor


def chooseSample(S, c, k2, k1):
    xs = S[S['Class'] == c]
    # iterate over each sample of class c
    for row in xs.iterrows():
        d1 = row[1]
        d1 = d1.drop('Class')
        y = S['Class']
        X = S.drop('Class', axis=1)
        # get the k2 nearest neighbours using the Mahalanobis distance metric
        cov = np.cov(X, rowvar=False)
        knn = KNeighborsRegressor(n_neighbors=k2, metric="mahalanobis",
                                  metric_params=dict(V=cov))
        knn.fit(X, y)
        dist, idx = knn.kneighbors([d1])
        # Count how many of the k2 neighbours share class c (assumed intent;
        # the original took len() of the (dist, idx) tuple, which is always 2).
        neighbour_length = int((y.iloc[idx[0]] == c).sum())
        if neighbour_length >= k1:
            S['weight'] = neighbour_length / k2
            return S
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.neighbors import KNeighborsRegressor


class RegressionSMOTE(BaseEstimator):
    def __init__(self, n, k, sigma, **kwargs):
        self.n = n
        self.k = k
        self.sigma = sigma
        # The seed must be an int or None; the original default of
        # np.random.random() is a float and makes np.random.seed() fail.
        self.random_state = kwargs.get('random_state', None)

    def fit(self, X, y):
        self.knn = KNeighborsRegressor(n_neighbors=self.k,
                                       weights='distance').fit(X, y)

    def transform(self, X, y):
        np.random.seed(self.random_state)
        ix = np.random.choice(len(X), self.n)
        nn = self.knn.kneighbors(X[ix], return_distance=False)
        newY = self.knn.predict(X[ix])
        nni = np.random.choice(self.k, self.n)
        ix2 = np.array([n[i] for n, i in zip(nn, nni)])
        # Interpolate between each sampled point and one of its neighbours,
        # then add jitter scaled by sigma.
        dif = X[ix] - X[ix2]
        gap = np.random.rand(self.n, 1)
        newX = X[ix] + dif * gap
        newX = newX + np.random.rand(*newX.shape) * self.sigma
        return newX, newY

    def fit_transform(self, X, y):
        self.fit(X, y)
        return self.transform(X, y)
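# A minimal usage sketch for RegressionSMOTE above; the synthetic data and
# hyper-parameter values are assumptions, not part of the original source.
import numpy as np

X_demo = np.random.rand(100, 4)
y_demo = X_demo.sum(axis=1)
smote = RegressionSMOTE(n=20, k=5, sigma=0.01, random_state=0)
newX, newY = smote.fit_transform(X_demo, y_demo)
print(newX.shape, newY.shape)  # (20, 4) (20,)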
def test_regression(self):
    X, y = load_boston(return_X_y=True)
    n_examples = len(y)
    n_train = int(0.75 * n_examples)
    np.random.seed(987321)
    train_X, train_y = X[:n_train, :], y[:n_train]
    test_X, test_y = X[n_train:, :], y[n_train:]
    np.random.seed(None)

    # sklearn reference implementation
    sk = KNeighborsRegressor(n_neighbors=3)
    sk.fit(train_X, train_y)
    dists_sk, idx_sk = sk.kneighbors(test_X)
    y_pred_sk = sk.predict(test_X)

    # Implementation under test
    myknn = KNearestNeighborsRegressor(k=3)
    myknn.fit(train_X, train_y)
    nearest_neighbors_mine, idx, dists = myknn.get_k_nearest_neighbors(
        test_X, return_idx=True, return_distances=True)
    y_pred = myknn.predict(test_X)

    # Distances under test are squared, hence the sqrt before comparing.
    self.assertTrue(np.allclose(dists_sk, np.sqrt(dists)))
    self.assertTrue(np.all(idx_sk == idx))
    self.assertTrue(np.allclose(y_pred, y_pred_sk))
def run_kNeighbors(distances, loadings, test_vars,
                   weightings=('uniform',), k_list=(3,)):
    """
    Run k-nearest neighbors using precomputed distances to create an
    ontological mapping

    Args:
        distances: square distance matrix to pass to KNeighborsRegressor
        loadings: loading matrix for training
        test_vars: variables to reconstruct
        weightings: (optional) list of weightings to pass to KNeighbors
        k_list: list of k values to pass to KNeighbors as n_neighbors
                (the original default, (3), was an int and not iterable)
    """
    train_distances = distances.loc[loadings.index, loadings.index]
    test_distances = distances.loc[test_vars, loadings.index]
    to_return = pd.DataFrame()
    for weighting in weightings:
        for k in k_list:
            clf = KNeighborsRegressor(metric='precomputed', n_neighbors=k,
                                      weights=weighting)
            clf.fit(train_distances, loadings)
            out = clf.predict(test_distances)
            out = pd.DataFrame(out, columns=loadings.columns)
            out['var'] = test_vars
            out['k'] = k
            out['weighting'] = weighting
            # add neighbors and distances
            neighbors = clf.kneighbors(test_distances)
            out['distances'] = tuple(neighbors[0])
            out['neighbors'] = tuple(test_distances.columns[neighbors[1]])
            to_return = pd.concat([to_return, out], sort=False)
    return to_return
import numpy as np
from scipy import stats
from sklearn.neighbors import KNeighborsRegressor
from tqdm import tqdm


class QuantileKNN():
    def __init__(self, n_neighbors=50):
        self.n_neighbors = n_neighbors
        # keep estimator in memory
        self.reg = None

    def fit(self, X_train, y_train):
        self.neigh = KNeighborsRegressor(n_neighbors=self.n_neighbors)
        self.neigh.fit(X_train, y_train)
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, x):
        def get_quantiles(element, indices, array):
            # Fit a normal distribution to the neighbors' targets and read
            # off its 1%..99% quantiles.
            quantiles = np.arange(1, 100) / 100.0
            temp = array[indices]
            dist = stats.norm(np.mean(temp), np.std(temp))
            quant = []
            for quantile in quantiles:
                quant.append(dist.ppf(quantile))
            return quant

        predictions = []
        for element in tqdm(x):
            indices = self.neigh.kneighbors([element],
                                            return_distance=False)
            predictions.append(get_quantiles(element, indices,
                                             self.y_train))
        return predictions
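# A minimal usage sketch for QuantileKNN above; the synthetic data and the
# n_neighbors value are assumptions, not part of the original source.
import numpy as np

rng = np.random.RandomState(0)
X_train_demo = rng.rand(200, 3)
y_train_demo = X_train_demo @ np.array([1.0, 2.0, 3.0]) + rng.randn(200) * 0.1
qknn = QuantileKNN(n_neighbors=20)
qknn.fit(X_train_demo, y_train_demo)
quantile_preds = qknn.predict(X_train_demo[:2])
print(len(quantile_preds), len(quantile_preds[0]))  # 2 99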
def find(movies):
    X, y, genre_mean = similar_movie(movies, movies_info_final,
                                     cosine_similarities_total)
    neigh = KNeighborsRegressor(n_neighbors=10)
    neigh.fit(X, y)
    movies_index = neigh.kneighbors(np.reshape(genre_mean, (1, -1)))[1][0]
    ret = [y[i] for i in movies_index]
    return ret
from numpy import argmax, zeros
from sklearn.neighbors import KNeighborsRegressor


class kNN():
    '''
    kNN regressor over a fixed-size ring buffer of examples
    -------------
    '''

    def __init__(self, N_i, N_o, k=5, n=20):
        # note: N_o=1 assumed for now
        self.N_i = N_i
        self.n = n
        self.i = 0
        self.k = k
        self.X = zeros((self.n, N_i))
        self.y = zeros((self.n))
        self.h = KNeighborsRegressor(n_neighbors=k, weights='distance')
        self.c = 0

    def predict(self, x):
        '''
        Predict
        --------------
        '''
        if self.c < 1.:
            print("[Warning!] No training examples!")
            return 0.0
        elif self.c <= self.k:
            # Not enough stored examples for a full k-neighbor query; fall
            # back to the single nearest stored example.
            dist, ind = self.h.kneighbors(self.X[0:self.c], n_neighbors=1)
            i_max = argmax(ind)
            return self.y[i_max]
        return self.h.predict(x)

    def update(self, x, y):
        '''
        Update
        --------------
        '''
        self.X[self.i, :] = x
        self.y[self.i] = y
        self.i = (self.i + 1) % self.n
        if self.c < self.n:
            self.c = self.c + 1
        self.h.fit(self.X[0:self.c, :], self.y[0:self.c])
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import KNeighborsRegressor


def knn(neighbors=1):
    model = KNeighborsRegressor(neighbors)
    X = np.array([[-1, -1.5], [-2, -1.5], [-3, -2], [1, 1], [2, 1], [3, 3]])
    random_y_values = np.array([2, 3, 4, 5, 4, 1])
    to_predict = [0, 0]
    model.fit(X, random_y_values)
    dist, ind = model.kneighbors([to_predict])

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 7))

    # Left panel: the query point, its neighborhood radius and the data.
    circle = plt.Circle(to_predict, max(dist[0]), color='g', alpha=.2)
    axes[0].add_artist(circle)
    axes[0].plot(to_predict[0], to_predict[1], 'x', color='g', mew=3)
    axes[0].scatter(X[:, 0], X[:, 1], color='black')
    closest_points = X[ind[0]]
    axes[0].set_title('Distances')
    x_coords = closest_points.transpose()[0]
    y_coords = closest_points.transpose()[1]
    axes[0].scatter(x_coords, y_coords, color='r')
    for i in range(len(random_y_values)):
        position = X[i]
        axes[0].text(position[0] - .05, position[1] + .07,
                     str(random_y_values[i]))

    # Right panel: the target values of the k closest points and their mean.
    num_points = len(ind[0])
    axes[1].set_xlim([0, 7])
    axes[1].set_ylim([0, 6])
    values = []
    for i in range(num_points):
        value = random_y_values[ind[0][i]]
        axes[1].vlines(x=i + 1, ymin=0, ymax=value, color='r',
                       linewidths=15)
        values.append(value)
    axes[1].hlines(y=np.mean(values), xmin=0, xmax=12, linestyles='dashed',
                   linewidths=2, color='g')
    axes[1].set_title('Values of k closest')
    axes[1].set_xlabel('k')
    axes[1].set_ylabel('Value')
    plt.show()
    print('Predicted Value: ', np.mean(values))
import numpy as np
from sklearn.neighbors import KNeighborsRegressor


def mach_learn(listid, d, k):
    predictions = {}
    for j in range(len(listid)):  # loop over the targets
        T = listid[j]  # target
        X = []
        Y = []
        X_test = []
        for z in range(len(listid)):
            if z != j:  # loop to define the space
                sop = d[listid[z]][0]  # sp score of coordinate z
                listina = []
                x_test = []
                for c in range(len(listid)):  # fill the training vectors
                    if c != j:
                        listina.append(d[listid[z]][1][c])
                        x_test.append(float(d[T][1][c]))
                X.append(listina)  # coordinates in the n-dimensional space
                Y.append(sop)  # tc/sop score associated with each point
                X_test.append(x_test)
        X = np.asarray(X, dtype='float')
        Y = np.asarray(Y)
        neigh = KNeighborsRegressor(n_neighbors=int(k), weights='distance')
        neigh.fit(X, Y)  # train, leaving target T out
        # All rows of X_test are identical (x_test does not depend on z), so
        # take the first prediction; the original called float() on the whole
        # prediction array, which fails for more than one row.
        pred = neigh.predict(X_test)[0]
        dist = neigh.kneighbors(X_test)
        dis = dist[0][0][0]  # distance to the closest neighbour
        predictions[T] = [float(pred), float(dis)]
    return predictions
import numpy as np
from sklearn.neighbors import KNeighborsRegressor


def localLinearRegression(trainingCoordinates, trainingResponses,
                          testCoordinates, neighborNumber,
                          trainingWeights=None):
    sampleNumber = testCoordinates.shape[0]
    dim = testCoordinates.shape[1]
    if trainingWeights is None:
        trainingWeights = np.ones(trainingCoordinates.shape[0])
    testResponses = np.zeros(sampleNumber)
    knnreg = KNeighborsRegressor(n_neighbors=neighborNumber,
                                 weights='uniform', algorithm='kd_tree',
                                 leaf_size=40, p=2)
    knnreg.fit(trainingCoordinates, trainingResponses)
    for sample in range(sampleNumber):
        # find the k nearest neighbors of this test point
        dist, ni = knnreg.kneighbors(
            testCoordinates[sample, :].reshape(1, -1), neighborNumber)
        # design matrix with an intercept column, restricted to the neighbors
        X = np.concatenate((np.ones(neighborNumber).reshape(-1, 1),
                            trainingCoordinates[ni.squeeze(), :]), axis=1)
        y = trainingResponses[ni.squeeze()]
        w = np.diag(trainingWeights[ni.squeeze()])
        # weighted normal equations: (X^T W X) beta = X^T W y
        rhs = y.dot(w.dot(X))
        lhs = (X.T).dot(w.dot(X))
        try:
            coefficients = np.linalg.solve(lhs, rhs)
        except np.linalg.LinAlgError:
            break
        testResponses[sample] = (np.concatenate(
            (np.ones(1), testCoordinates[sample, :]),
            axis=0).reshape(1, -1)).dot(coefficients.reshape(-1, 1))
    return testResponses, coefficients, ni
import numpy as np
from sklearn.neighbors import KNeighborsRegressor


def test_knn_regression():
    datafile_viper = '../data_viper/viper.pkl'
    viper = loadfile(datafile_viper)

    model = KNeighborsRegressor(n_neighbors=5, weights='uniform',
                                metric='euclidean')
    model.fit(viper.train_feat, viper.train_y)

    n_test = len(viper.test_feat)
    y_pred = np.zeros(n_test)
    for i, feat in zip(np.arange(n_test), viper.test_feat):
        # kneighbors expects a 2D array, hence the reshape
        dist, ind = model.kneighbors(feat.reshape(1, -1))
        # Gaussian-kernel-weighted average of the neighbors' targets
        y_pred[i] = ((viper.train_y[ind] * np.exp(-dist**2)).sum()
                     / (np.exp(-dist**2)).sum())
    # y_pred = model.predict(viper.test_feat)

    print('testing error {}'.format(abs_error(y_pred, viper.test_y)))
def knn_manip(inp, processed, adhoc):
    """
    Finds the most similar players to a given input using KNNRegressor

    :param inp: A dictionary of player attribute-value pairs
    :param processed: Dataset containing player skill attributes
    :param adhoc: Dataset containing player personality attributes
    :return: returns name, link to photos and positions of similar players
    """
    predict_config = config_text['predict']
    df = pd.DataFrame(inp, index=[0])
    df['Simple_Position'] = df.apply(simple_position, axis=1)
    my_cols_list = predict_config['pos_list']
    df = df.reindex(columns=[*df.columns.tolist(), *my_cols_list],
                    fill_value=0)
    req_simp = df['Simple_Position'].values[0]
    col_name = 'Simple_Position_' + req_simp
    df[col_name] = 1
    df.drop(labels=['Position', 'Simple_Position'], axis=1, inplace=True)

    # Find neighbors from processed data
    features_list = predict_config['features_list'] + [
        col for col in processed.columns if col.startswith('Simple_')
    ]
    position_data = processed.loc[processed[col_name] == 1, :]
    y_train = position_data['Value']
    X_train = position_data[features_list]
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    df = df[features_list]
    df = scaler.transform(df)
    regressor = KNeighborsRegressor(n_neighbors=5)
    regressor.fit(X_train, y_train)
    nneighbors = position_data.iloc[regressor.kneighbors(df)[1][0], :]
    nneighbor_id = nneighbors['ID'].tolist()
    nname = adhoc.loc[adhoc['ID'].isin(nneighbor_id), 'Name'].tolist()
    nid = adhoc.loc[adhoc['ID'].isin(nneighbor_id), 'Photo'].tolist()
    npos = adhoc.loc[adhoc['ID'].isin(nneighbor_id), 'Position'].tolist()
    return nname, nid, npos
import pickle

import sklearn.model_selection
from sklearn.neighbors import KNeighborsRegressor


def trainData(n, p, b, totalTrades, K, X, y):
    x_train = y_train = 0  # initialise model values
    best = 0  # the best test data set to be used in the future
    # divide the data into test and training data, looping n times until the
    # best-scoring split is found
    for _ in range(n):
        x_train, x_test, y_train, y_test = \
            sklearn.model_selection.train_test_split(X, y, test_size=0.1)
        # use a K-neighbours regression model with K neighbours
        modelTest = KNeighborsRegressor(n_neighbors=K)
        modelTest.fit(x_train, y_train)  # fit the model
        if totalTrades <= 19:
            with open("tradeModel_{0}_{1}.pickle".format(b, p), "wb") as f:
                pickle.dump(modelTest, f)  # save the model
            break
        acc = modelTest.score(x_test, y_test)  # accuracy of predictions
        # if accuracy is higher than the best recorded accuracy
        if acc > best:
            best = acc  # new accuracy is the best accuracy
            with open("tradeModel_{0}_{1}.pickle".format(b, p), "wb") as f:
                pickle.dump(modelTest, f)  # save the model

    # use a K-neighbours regression model with totalTrades neighbours
    modelTrade = KNeighborsRegressor(n_neighbors=totalTrades)
    modelTrade.fit(x_train, y_train)  # fit the model
    # kneighbors() returns a (distances, indices) tuple; the original looped
    # over len() of the tuple itself, so summing all neighbour distances is
    # the assumed intent here.
    distances, indices = modelTrade.kneighbors(n_neighbors=totalTrades)
    tradeDistance = distances.sum()  # total distance of all trades
    average = tradeDistance / totalTrades  # average distance of all trades
    with open("tradeDistance_{0}_{1}.pickle".format(b, p), "wb") as f:
        pickle.dump(average, f)  # save the average distance for that model
def select(self, X, y):
    if self.distance_threshold is None:
        from sklearn.neighbors import KNeighborsRegressor
        nn = KNeighborsRegressor(n_neighbors=self.k + 1)
        nn.fit(X, y)
        dist, ind = nn.kneighbors(X)
        self.distance_threshold = np.max(np.min(dist[:, 1:], 1))
    from sklearn.neighbors import RadiusNeighborsRegressor
    self.nn = RadiusNeighborsRegressor(radius=self.distance_threshold)
    Xcand, ycand, corner_response = self._candidates(X, y)
    Xcorner, ycorner = self._nonmax_supress(Xcand, ycand, corner_response)
    idx = np.where(np.isin(X, Xcorner))
    return idx, Xcorner, ycorner
def prob_programs_above_perf_threshold_knn(
        program_feature_vectors: List[ProgramFeatureVector],
        regressor: KNeighborsRegressor,
        perf_threshold: Reward) -> List[float]:
    dist, ind = regressor.kneighbors(np.array(program_feature_vectors))
    probs = [
        1 - mlca.helpers.probability.cdf(
            perf_threshold,
            np.mean(regressor._y[ind[i]]),
            np.std(regressor._y[ind[i]]))
        for i in tqdm(range(len(program_feature_vectors)),
                      "prob_programs_above_perf_threshold")
    ]
    return probs
import pandas as pd
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsRegressor


def atemp_impute(X, y, X_missing):
    scaler = preprocessing.StandardScaler()
    X2 = scaler.fit_transform(X)
    nn = KNeighborsRegressor(1)
    nn.fit(X2, y)
    X_missing2 = scaler.transform(X_missing)
    dist, ind = nn.kneighbors(X=X_missing2, n_neighbors=1,
                              return_distance=True)
    res = X.iloc[ind[:, 0]].copy()
    res['dist'] = dist[:, 0]  # dist is (n, 1); take the single column
    res['y'] = y[ind[:, 0]]
    res.index = X_missing.index
    res = pd.concat([res, X_missing], axis=1)
    return res

# Example inputs (left commented in the source):
# X = tmp[['temp', 'humidity', 'windspeed']]
# y = tmp.temp_diff
# X_missing = data.loc[data.temp_diff < -10, ['temp', 'humidity', 'windspeed']]
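# A minimal usage sketch for atemp_impute above, with small synthetic frames
# standing in for the commented-out `tmp`/`data` variables; all values here
# are assumptions, not part of the original source.
import numpy as np
import pandas as pd

X_demo = pd.DataFrame({'temp': [10.0, 20.0, 30.0],
                       'humidity': [40.0, 50.0, 60.0]})
y_demo = np.array([1.0, 2.0, 3.0])
X_missing_demo = pd.DataFrame({'temp': [21.0], 'humidity': [51.0]})
print(atemp_impute(X_demo, y_demo, X_missing_demo))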
def knn(data, pred_length, D_window=14, max_k=7):
    if pred_length + D_window >= len(data):
        print('ERROR: pred_length or D_window too long')
        return None
    ret_ypred = []
    for h in range(4):
        train_feature, train_label = get_train_set(data, h, D_window,
                                                   pred_length)
        e_LOO_arr = np.zeros(max_k)
        for k in range(2, max_k + 1):
            model = KNeighborsRegressor(n_neighbors=k, weights='uniform',
                                        algorithm='auto')
            model.fit(train_feature, train_label)
            # get the k nearest neighbours
            dist_list, index_list = model.kneighbors([data[0 - D_window:]])
            k_neighbor_label = []
            for i in index_list[0]:
                k_neighbor_label.append(train_label[i])
            # prediction based on the k nearest neighbours
            ypred = model.predict([data[0 - D_window:]])
            ypred = np.asarray(list(map(round, ypred[0])))
            # compute e_LOO (leave-one-out error)
            e_LOO_arr[k - 1] = LOO(k_neighbor_label, ypred, k)
        # take the k with the smallest e_LOO
        k_min = np.argmin(e_LOO_arr[1:]) + 2
        model = KNeighborsRegressor(n_neighbors=k_min, weights='uniform',
                                    algorithm='auto')
        model.fit(train_feature, train_label)
        ypred = model.predict([data[0 - D_window:]])
        ret_ypred += list(map(round, ypred[0]))
    return np.asarray(ret_ypred)
def MIMO_KNN_LOO_May(data):
    code = data[0]
    data = list(map(float, data[1:]))
    D_window = 14
    max_k = 7
    pred_May = []
    for h in range(4):
        train_feature, train_label = get_train_set(data, h, D_window)
        e_LOO_arr = np.zeros(max_k)
        for k in range(2, max_k + 1):
            model = KNeighborsRegressor(n_neighbors=k, weights='uniform',
                                        algorithm='auto')
            model.fit(train_feature, train_label)
            # get the k nearest neighbours
            dist_list, index_list = model.kneighbors([data[0 - D_window:]])
            k_neighbor_label = []
            for i in index_list[0]:
                k_neighbor_label.append(train_label[i])
            # prediction based on the k nearest neighbours
            ypred = model.predict([data[0 - D_window:]])
            ypred = np.asarray(list(map(round, ypred[0])))
            # compute e_LOO (leave-one-out error)
            e_LOO_arr[k - 1] = LOO(k_neighbor_label, ypred, k)
        # take the k with the smallest e_LOO and predict with it
        k_min = np.argmin(e_LOO_arr[1:]) + 2
        model = KNeighborsRegressor(n_neighbors=k_min, weights='uniform',
                                    algorithm='auto')
        model.fit(train_feature, train_label)
        ypred = model.predict([data[0 - D_window:]])
        ypred = list(map(round, ypred[0]))
        pred_May = pred_May + ypred
    print(pred_May)
    # replace the stored prediction for series `code` with the new one
    change_pred(code, pred_May)
import numpy as np
from sklearn.neighbors import KNeighborsRegressor


def baseline_predict(sup_x, sup_y, unsup_x):
    """A 1-NN regressor baseline with the Euclidean distance measure.

    sup_x : array-like
        labeled data matrix of shape [n_samples, n_features]
    sup_y : array-like
        label vector of shape [n_samples, ]
    unsup_x : array-like
        unlabeled data matrix of shape [n_samples, n_features]
    """
    knn = KNeighborsRegressor(n_neighbors=1)
    knn.fit(sup_x, sup_y)
    dist, idx = knn.kneighbors(unsup_x)
    baseline_prediction = np.zeros((len(unsup_x), 1))
    for t in range(len(unsup_x)):
        baseline_prediction[t] = np.mean(
            np.array([sup_y[i] for i in idx[t]]).flatten())
    return baseline_prediction
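# A minimal usage sketch for baseline_predict above; the synthetic arrays
# are assumptions, not part of the original source.
import numpy as np

sup_x_demo = np.random.rand(30, 2)
sup_y_demo = sup_x_demo.sum(axis=1)
unsup_x_demo = np.random.rand(5, 2)
print(baseline_predict(sup_x_demo, sup_y_demo, unsup_x_demo).ravel())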
def MIMO_KNN_LOO_test(data):
    code = data[0]
    data = list(map(float, data[1:]))
    train_data = data[:90]
    test_data = data[90:]
    # train one model per time segment; the 4 segments span 7, 7, 7 and
    # 9 days respectively
    D_window = 14
    max_k = 7
    for h in range(4):
        train_feature, train_label = get_train_set(train_data, h, D_window)
        y_label = get_test_label(test_data, h)
        e_LOO_arr = np.zeros(max_k)
        for k in range(2, max_k + 1):
            model = KNeighborsRegressor(n_neighbors=k, weights='uniform',
                                        algorithm='auto')
            model.fit(train_feature, train_label)
            # get the k nearest neighbours
            dist_list, index_list = model.kneighbors(
                [train_data[0 - D_window:]])
            k_neighbor_label = []
            for i in index_list[0]:
                k_neighbor_label.append(train_label[i])
            # prediction based on the k nearest neighbours
            ypred = model.predict([train_data[0 - D_window:]])
            ypred = np.asarray(list(map(round, ypred[0])))
            rmse = np.sqrt(((ypred - y_label) ** 2).mean())
            print(code, ' h=', h, ' k=', k, ' rmse=', rmse)
            # compute e_LOO (leave-one-out error)
            e_LOO_arr[k - 1] = LOO(k_neighbor_label, ypred, k)
        # take the k with the smallest e_LOO
        k_min = np.argmin(e_LOO_arr[1:]) + 2
        print('k_min=', k_min)
def find_revenue_potential(self, amenity_x, scaling=True):
    """
    Finds the revenue potential for an amenity

    returns rev_pot: float
    """
    # Dataframe to find neighbors from, made from a 'query property'
    query_df = self.my_property[[
        'bedrooms', 'bathrooms', 'accommodates', 'latitude', 'longitude'
    ]].astype('float')
    # Comp dataframe to find neighbors in
    df = self.comps[[
        'rev_pot', 'bedrooms', 'bathrooms', 'accommodates', 'latitude',
        'longitude', amenity_x
    ]].astype('float')
    # Create two dataframes: properties that have the amenity, and those
    # that don't
    w_amenity = df[df[amenity_x].astype('bool')].drop(columns=[amenity_x])
    w_out_amenity = df[~df[amenity_x].astype('bool')].drop(
        columns=[amenity_x])
    # Split and scale df
    X_w, y_w, w_predict = self.create_test_df(query_df, w_amenity)
    X_w_out, y_w_out, w_out_predict = self.create_test_df(
        query_df, w_out_amenity)
    # Check to make sure the dataframes have enough neighbors
    k = min(X_w_out.shape[0], X_w.shape[0])
    # Don't return results if there aren't enough neighbors
    if k < 3:
        return 0
    # Initialize & fit neighbors
    kn_w = KNeighborsRegressor(n_neighbors=k, weights='distance',
                               n_jobs=-1)
    kn_w_out = KNeighborsRegressor(n_neighbors=k, weights='distance',
                                   n_jobs=-1)
    kn_w.fit(X_w, y_w)
    kn_w_out.fit(X_w_out, y_w_out)
    # Find neighbors and their similarity (distance) to the query
    w_distance, w_neighbors = kn_w.kneighbors(w_predict)
    w_neighbors_df = w_amenity.iloc[w_neighbors.flatten()]
    w_out_distance, w_out_neighbors = kn_w_out.kneighbors(w_out_predict)
    w_out_neighbors_df = w_out_amenity.iloc[w_out_neighbors.flatten()]
    # Predict revenue for nearby properties with and without the amenity
    w_revenue = kn_w.predict(w_predict)[0]
    w_out_revenue = kn_w_out.predict(w_out_predict)[0]
    rev_pot = w_revenue - w_out_revenue  # calculate the potential upside
    if self.verbose:
        print(f"with amenity {w_amenity['accommodates'].mean()}")
        print(f"without amenity {w_out_amenity['accommodates'].mean()}")
        print(f'Average yearly revenue with {amenity_x}: $ {w_revenue:.0f}'
              f' \n Without: $ {w_out_revenue:.0f}'
              f' \n Yearly revenue potential : $ {rev_pot:.0f}')
    return rev_pot
# In[45]:

knn = KNeighborsRegressor(n_neighbors=5, weights='distance',
                          metric='cosine')

# In[46]:

knn.fit(X_train, y_train)

# In[47]:

X_train.iloc[0]

# In[48]:

knn.kneighbors(X_test.iloc[0:1])[0]

# In[49]:

knn.kneighbors(X_test.iloc[0:1])[1][0].tolist()

# In[50]:

X_test.iloc[0:1]

# In[51]:

X_train.iloc[knn.kneighbors(X_test.iloc[0:1])[1][0].tolist()]
Training-set r2: 0.9804899950518966
Test-set r2: 0.9746459963987609
'''

'''
A problem with the KNR model
# Let's predict the weight of a perch that is 50 cm long and weighs 1.5 kg
'''
prd = knr.predict([[50]])
print('Predicted weight for a 50 cm perch: ', prd)  # [1033.33333333]

prd = knr.predict([[100]])
print('Predicted weight for a 100 cm perch: ', prd)  # [1033.33333333]

'''
Find the neighbours of the 50 cm perch and draw a scatter plot
'''
import matplotlib.pyplot as plt

distances, indexes = knr.kneighbors([[50]])

# scatter plot of the training set
plt.scatter(train_input, train_target, c='b')
# re-plot only the neighbour samples from the training set
plt.scatter(train_input[indexes], train_target[indexes], marker='D', c='y')
plt.scatter(50, 1033, marker='^', c='r')
#plt.show()

'''
The limitation of k-nearest neighbours: once a new sample falls outside the
range of the training set, the model can predict a badly wrong value
==> linear regression (LinearRegressor): an algorithm that, for a single
    feature, learns a straight line
'''
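# A minimal sketch of the limitation described above, on assumed toy data:
# kNN regression saturates outside the training range, while linear
# regression keeps extrapolating.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

lengths = np.arange(10, 45).reshape(-1, 1).astype(float)
weights_demo = (lengths.ravel() ** 2) * 0.5
knn_demo = KNeighborsRegressor(n_neighbors=3).fit(lengths, weights_demo)
lin_demo = LinearRegression().fit(lengths, weights_demo)
print(knn_demo.predict([[50.0]]), knn_demo.predict([[100.0]]))  # identical
print(lin_demo.predict([[50.0]]), lin_demo.predict([[100.0]]))  # extrapolates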
def main():
    spotify_client = SpotifyClient(authorization_token, user_id)

    # Scale key
    # scaler = MinMaxScaler()
    # df_kaggle['scaled_key'] = scaler.fit_transform(df_kaggle[['key']])

    # Ask users for their preferences
    query_genre = input("Which genre?\n>")
    query_pop = get_choice(df=df_kaggle, column="popularity_binned")
    query_decade = get_choice(df=df_kaggle, column="decades")
    query_duration = input("Duration of the playlist?\n> ")
    print(f" You selected {query_pop} {query_genre} tracks from the "
          f"{query_decade} decade for a total duration of "
          f"{query_duration} minutes")

    # Convert to the right types
    query_genre = str(query_genre)
    query_pop = str(query_pop)
    # Handle the decade format
    if int(query_decade[1]) > 1:
        query_decade = query_decade.replace("'", "19")
    else:
        query_decade = query_decade.replace("'", "20")
    query_duration = int(query_duration)

    # Filter the dataset accordingly
    # [0:3] for 1920s / [1:3] for (1920 - 1930]
    filtered_results = df_kaggle[
        df_kaggle['genres'].str.contains(query_genre)
        & (df_kaggle['year'].str.contains(query_decade[0:3]))
        & (df_kaggle['popularity_binned'] == query_pop)]

    # Get the features of 1 random seed track from filtered_results
    seed = filtered_results.sample(1)
    tempo = seed['scaled_tempo'].iat[0]
    # loudness = seed['scaled_loudness'].iat[0]
    da = seed['danceability'].iat[0]
    energy = seed['energy'].iat[0]
    # key = seed['scaled_key'].iat[0]
    # valence = seed['valence'].iat[0]

    # Train the model
    features_names = ['scaled_tempo', 'danceability', 'energy']
    # 'scaled_loudness', 'valence', 'scaled_key'
    X = filtered_results[features_names]
    y = filtered_results['track_id']
    model = KNeighborsRegressor(algorithm='kd_tree', n_jobs=-1).fit(X, y)

    # Get the model output for k: distances & indices
    k = 100
    knn_out = model.kneighbors([[tempo, da, energy]], n_neighbors=k)
    ind = knn_out[1][0].tolist()  # get indices
    recs = filtered_results.iloc[ind]  # recommendations df

    # Filter on popularity after modeling
    # refiltered_results = recs[recs['popularity_binned'] == query_pop]

    # Filter recommendations based on the user's preferred duration
    filtered_duration = recs[recs['duration_min'].cumsum() <= query_duration]
    recommended_playlist = filtered_duration.reset_index(drop=True)

    # Sort the playlist by tempo: ascending first half, descending second
    recommended_playlist.sort_values(by=['scaled_tempo'])
    split_threshold = round(len(recommended_playlist) / 2)
    asc_playlist = recommended_playlist.iloc[0:split_threshold].sort_values(
        by=['scaled_tempo'], ascending=True)
    desc_playlist = recommended_playlist.iloc[split_threshold:].sort_values(
        by=['scaled_tempo'], ascending=False)
    sorted_playlist = pd.concat([asc_playlist, desc_playlist])
    sorted_playlist = sorted_playlist.reset_index(drop=True)
    recommended_tracks = sorted_playlist[['track_name', 'track_id',
                                          'artists']]

    # Get the playlist name from the user and create an empty playlist
    playlist_name = str(input("\nWhat's the playlist name? "))
    playlist = spotify_client.create_playlist(playlist_name)
    playlist_id = playlist.playlist_id

    # Populate the playlist with the recommended tracks
    tracks_id = sorted_playlist['track_id'].tolist()
    # sp.playlist_add_items(playlist_id, tracks_id, position=None)
    track_uris = [create_spotify_uri(track) for track in tracks_id]
    response = requests.post(
        url=f"https://api.spotify.com/v1/playlists/{playlist_id}/tracks",
        data=json.dumps(track_uris),
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {authorization_token}"
        })
    response = response.json()
    print('Your playlist was successfully added to your spotify account')
def _find_accuracy_num_homes(appliance, num_homes, start_seed, end_seed,
                             feature="Monthly"):
    if appliance == "hvac":
        start, stop = 5, 11
    else:
        start, stop = 1, 13

    out = {}
    out_overall = {}
    appliance_df = test_df.loc[test_df[
        ['%s_%d' % (appliance, month) for month in range(start, stop)]
    ].dropna().index]
    for random_seed in range(start_seed, end_seed):
        out_overall[random_seed] = {}
        rs = ShuffleSplit(len(appliance_df), n_iter=1, train_size=num_homes,
                          test_size=len(appliance_df) - num_homes,
                          random_state=random_seed)
        for train, test in rs:
            train_homes = appliance_df.index.values[train]
            test_homes = appliance_df.index.values[test]
            train_homes_df = appliance_df.loc[train_homes]
            test_homes_df = appliance_df.loc[test_homes]
            # Now, we need to do cross validation on the train homes
            l = LeaveOneOut(len(train_homes))
            for cv_train, cv_test in l:
                cv_train_home = appliance_df.loc[train_homes[cv_train]]
                cv_test_home = appliance_df.loc[train_homes[cv_test]]
                test_home_name = cv_test_home.index.values[0]
                Y = cv_train_home[['%s_%d' % (appliance, i)
                                   for i in range(start, stop)]].sum(
                    axis=1).values
                forest = ExtraTreesRegressor(n_estimators=250,
                                             random_state=0)
                forest.fit(cv_train_home[feature_map[feature]], Y)
                importances = forest.feature_importances_
                indices = np.argsort(importances)[::-1]
                # Now varying K and top-N features
                out[test_home_name] = {}
                for K in range(K_min, K_max):
                    out[test_home_name][K] = {}
                    for top_n in range(F_min, F_max):
                        out[test_home_name][K][top_n] = []
                        top_n_features = cv_train_home[
                            feature_map[feature]].columns[indices][:top_n]
                        # Now fitting KNN on this
                        for month in range(start, stop):
                            clf = KNeighborsRegressor(n_neighbors=K)
                            clf.fit(cv_train_home[top_n_features],
                                    cv_train_home['%s_%d' % (appliance,
                                                             month)])
                            dist, ind = clf.kneighbors(
                                cv_test_home[top_n_features])
                            nghbrs = cv_train_home.index.values[
                                ind].flatten()
                            proportion = cv_train_home.loc[nghbrs][
                                '%s_%d' % (appliance, month)].div(
                                cv_train_home.loc[nghbrs][
                                    '%s_%d' % ("aggregate", month)])
                            mean_prop = proportion.mean()
                            out[test_home_name][K][top_n].append(
                                cv_test_home['%s_%d' % ("aggregate", month)]
                                * mean_prop)
            accur = {}
            for K in range(K_min, K_max):
                accur[K] = {}
                for top_n in range(F_min, F_max):
                    temp = {}
                    for h in out:
                        pred = pd.DataFrame(out[h][K][top_n]).T
                        pred.index = [h]
                        pred.columns = ['%s_%d' % (appliance, i)
                                        for i in range(start, stop)]
                        gt = appliance_df.loc[h][
                            ['%s_%d' % (appliance, i)
                             for i in range(start, stop)]]
                        error = (pred - gt).abs().div(gt).mul(100)
                        mean_error = error.mean().mean()
                        a = 100 - mean_error
                        if a < 0:
                            a = 0
                        temp[h] = a
                    ac = pd.Series(temp).mean()
                    accur[K][top_n] = ac
            accur_df = pd.DataFrame(accur)
            accur_max = accur_df.max().max()
            max_ac_df = accur_df[accur_df == accur_max]
            F_best = cv_train_home[feature_map[feature]].columns[indices][
                :max_ac_df.mean(axis=1).dropna().index.values[0]].tolist()
            K_best = max_ac_df.mean().dropna().index.values[0]
            # Now predicting for the test homes
            pred_test = {}
            gt_test = {}
            for month in range(start, stop):
                clf = KNeighborsRegressor(n_neighbors=K_best)
                clf.fit(train_homes_df[F_best],
                        train_homes_df['%s_%d' % (appliance, month)])
                pred_test[month] = clf.predict(test_homes_df[F_best])
                gt_test[month] = test_homes_df['%s_%d' % (appliance, month)]
            pred_df = pd.DataFrame(pred_test)
            pred_df.index = test_homes_df.index
            gt_df = pd.DataFrame(gt_test)
            error = (gt_df - pred_df).abs().div(gt_df).mul(100)
            accuracy_test = 100 - error
            accuracy_test[accuracy_test < 0] = 0
            out_overall[random_seed] = accuracy_test.mean().mean()
    return pd.Series(out_overall)
class wolse:
    def __init__(self, p1, p2, p3):
        df = pd.read_csv('realdata.csv')
        # print(len(df.values))
        # df = df[df['계약년도'] == 2019]  # contract-year filter
        df['구동'] = df['자치구명'] + df['법정동명']  # district + neighbourhood
        '''
        temp = df['구동']
        temp = temp.unique()
        temp = pd.DataFrame(temp)
        temp.to_excel('구동.xlsx')
        '''
        rent1 = df[df['전월세구분'] == '준월세']  # keep semi-monthly rents only
        self.origin = rent1
        a = rent1[['임대면적', '건축년도']]  # rental area, construction year
        a = rent1[['임대면적']]  # rental area only
        # tempindex = a.index
        # a = scale(a)
        # a = pd.DataFrame(a, columns=['임대면적'], index=tempindex)
        b = pd.get_dummies(rent1['구동'])
        self.simple_rent1 = a.join(b)
        c = pd.get_dummies(rent1['임대건물명'])  # building-type dummies
        self.simple_rent1 = self.simple_rent1.join(c)
        floor = []
        for f in rent1['층'].values:  # floor number; -1 marks basement
            if f == -1:
                floor.append(-1)
            else:
                floor.append(0)
        self.simple_rent1['floor'] = floor
        # targets: 보증금 = deposit, 임대료 = monthly rent
        self.X_train, self.X_test, self.Y_train, self.Y_test = \
            train_test_split(self.simple_rent1, rent1['보증금'],
                             random_state=42)
        self.X_train2, self.X_test2, self.Y_train2, self.Y_test2 = \
            train_test_split(self.simple_rent1, rent1['임대료'],
                             random_state=42)
        self.knn = KNeighborsRegressor(n_neighbors=3)
        self.knn2 = KNeighborsRegressor(n_neighbors=3)
        self.lr = LinearRegression()
        self.tlist = []
        self.train()
        self.getinput2(p1, p2, p3)
        self.predict()

    def train(self):
        self.knn.fit(self.X_train, self.Y_train)
        self.knn2.fit(self.X_train2, self.Y_train2)
        self.lr.fit(self.X_train, self.Y_train)

    def getinput(self, tstr):
        self.tlist = []
        zero = np.zeros(len(self.simple_rent1.columns))
        arr = tstr.split(" ")
        t = pd.DataFrame([zero], columns=self.simple_rent1.columns)
        # t['임대면적'] = float(arr[0]) / 3.305785
        t['임대면적'] = float(arr[0])
        t[arr[1]] = 1.0
        if arr[2] == '아파트':  # apartment
            t['아파트'] = 1.0
        elif arr[2] == '오피스텔':  # officetel
            t['오피스텔'] = 1.0
        else:  # multi-household / row house
            t['다세대/연립'] = 1.0
        self.tlist.append(t)

    def getinput2(self, a, b, c):
        # a: neighbourhood, b: area (in pyeong), c: building type
        self.tlist = []
        zero = np.zeros(len(self.simple_rent1.columns))
        t = pd.DataFrame([zero], columns=self.simple_rent1.columns)
        t['임대면적'] = float(b) * 3.305785  # pyeong -> square metres
        t[a] = 1.0
        if c == '아파트':
            t['아파트'] = 1.0
        elif c == '오피스텔':
            t['오피스텔'] = 1.0
        else:
            t['다세대/연립'] = 1.0
        self.tlist.append(t)

    def predict(self):
        for i in self.tlist:
            ind = self.knn.kneighbors(i, n_neighbors=3,
                                      return_distance=False)
            print(ind[0])
            ind2 = self.knn2.kneighbors(i, n_neighbors=3,
                                        return_distance=False)
            print(ind2[0])
            indices = list(ind[0])
            indices2 = list(ind2[0])
            # format each neighbour as "deposit/monthly rent"
            y_tr = pd.DataFrame(self.Y_train)
            zlist = list(map(lambda i: y_tr.iloc[i, :], indices))
            z = pd.DataFrame(zlist[1])
            rz1 = str(int(z.loc['보증금', :]))
            y_tr2 = pd.DataFrame(self.Y_train2)
            z2list = list(map(lambda i: y_tr2.iloc[i, :], indices2))
            z2 = pd.DataFrame(z2list[1])
            rz2 = str(int(z2.loc['임대료', :]))
            r1 = rz1 + "/" + rz2
            z = pd.DataFrame(zlist[2])
            z2 = pd.DataFrame(z2list[2])
            rz1 = str(int(z.loc['보증금', :]))
            rz2 = str(int(z2.loc['임대료', :]))
            r2 = rz1 + "/" + rz2
            z = pd.DataFrame(zlist[0])
            z2 = pd.DataFrame(z2list[0])
            rz1 = str(int(z.loc['보증금', :]))
            rz2 = str(int(z2.loc['임대료', :]))
            r3 = rz1 + "/" + rz2
            self.resultstr = r1 + '\n' + r2 + '\n' + r3
            return str(self.resultstr)
def _find_accuracy(home, appliance, feature="Monthly"):
    np.random.seed(42)
    appliance_df = df.loc[all_homes[appliance]]
    if appliance == "hvac":
        start, stop = 5, 11
    else:
        start, stop = 1, 13

    test_homes = [home]
    train_homes = appliance_df[~appliance_df.index.isin([home])].index
    all_home_appliance = deepcopy(all_homes)
    all_home_appliance[appliance] = train_homes

    # Cross validation on the inner loop to find the best features and K
    train_size = len(train_homes)
    l = LeaveOneOut(train_size)
    out = OrderedDict()
    for cv_train, cv_test in l:
        cv_train_home = appliance_df.loc[train_homes[cv_train]]
        cv_test_home = appliance_df.loc[train_homes[cv_test]]
        test_home_name = cv_test_home.index.values[0]
        out[test_home_name] = {}
        # Sum up energy across start..stop to get the Y used to learn the
        # optimum features
        Y = cv_train_home[['%s_%d' % (appliance, i)
                           for i in range(start, stop)]].sum(axis=1).values
        forest = ExtraTreesRegressor(n_estimators=250, random_state=0)
        forest.fit(cv_train_home[feature_map[feature]], Y)
        importances = forest.feature_importances_
        indices = np.argsort(importances)[::-1]
        # Now varying K and top-N features
        for K in range(K_min, K_max):
            out[test_home_name][K] = {}
            for top_n in range(F_min, F_max):
                out[test_home_name][K][top_n] = []
                top_n_features = cv_train_home[
                    feature_map[feature]].columns[indices][:top_n]
                # Now fitting KNN on this
                for month in range(start, stop):
                    clf = KNeighborsRegressor(n_neighbors=K)
                    clf.fit(cv_train_home[top_n_features],
                            cv_train_home['%s_%d' % (appliance, month)])
                    out[test_home_name][K][top_n].append(
                        clf.predict(cv_test_home[top_n_features]))

    # Now, find the (K, top_n) combination that gave the best accuracy on
    # the CV test homes
    accur = {}
    for K in range(K_min, K_max):
        accur[K] = {}
        for top_n in range(F_min, F_max):
            temp = {}
            for h in out:
                pred = pd.DataFrame(out[h][K][top_n]).T
                pred.index = [h]
                pred.columns = ['%s_%d' % (appliance, i)
                                for i in range(start, stop)]
                gt = appliance_df.loc[h][['%s_%d' % (appliance, i)
                                          for i in range(start, stop)]]
                error = (pred - gt).abs().div(gt).mul(100)
                mean_error = error.mean().mean()
                a = 100 - mean_error
                if a < 0:
                    a = 0
                temp[h] = a
            ac = pd.Series(temp).mean()
            accur[K][top_n] = ac

    accur_df = pd.DataFrame(accur)
    accur_max = accur_df.max().max()
    max_ac_df = accur_df[accur_df == accur_max]
    F_best = cv_train_home[feature_map[feature]].columns[indices][
        :max_ac_df.mean(axis=1).dropna().index.values[0]].tolist()
    K_best = max_ac_df.mean().dropna().index.values[0]

    # Now predicting for the test home
    train_overall = appliance_df.loc[
        appliance_df[~appliance_df.index.isin([home])].index]
    test_overall = appliance_df[appliance_df.index.isin([home])]
    pred_test = {}
    gt_test = {}
    for month in range(start, stop):
        clf = KNeighborsRegressor(n_neighbors=K_best)
        clf.fit(train_overall[F_best],
                train_overall['%s_%d' % (appliance, month)])
        pred_test[month] = clf.predict(test_overall[F_best])
        neighbours = train_overall.index[
            clf.kneighbors(test_overall[F_best])[1]]
        print(month, neighbours)
        gt_test[month] = test_overall['%s_%d' % (appliance, month)]

    json.dump({'f': F_best, 'k': K_best, 'accuracy': accur_max},
              open(os.path.expanduser(
                  "~/main-out-new-larger/%s_%s_%d.json"
                  % (appliance, feature, home)), "w"))
    print(F_best, K_best, accur_max)
    pred_df = pd.DataFrame(pred_test)
    pred_df.index = [home]
    # gt_df = pd.DataFrame(gt_test)
    # error = (gt_df - pred_df).abs().div(gt_df).mul(100)
    # accuracy_test = 100 - error
    # accuracy_test[accuracy_test < 0] = 0
    # return accuracy_test.squeeze()
    return pred_df
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors._base import _get_weights  # private sklearn helper


class kradius(BaseEstimator):
    def __init__(self, metric="euclidean", weights="uniform", n_neighbors=3,
                 radius=1.0, n_jobs=1):
        self.radius = radius
        self.metric = metric
        self.weights = weights
        self.n_neighbors = n_neighbors
        self.n_jobs = n_jobs

    def fit(self, X, y):
        self.knn_model = KNeighborsRegressor(
            n_neighbors=self.n_neighbors,
            n_jobs=self.n_jobs,
            weights=self.weights,
            metric=self.metric,
        )
        self.knn_model.fit(X, y)
        return self

    def predict(self, X):
        # no need to do distance filtering if we take only 1 neighbor
        if self.n_neighbors > 1:
            dists, inds = self.knn_model.kneighbors(
                X, n_neighbors=self.n_neighbors)
            # drop neighbors whose distance is too big;
            # we always keep the closest point (first value)
            inds = [
                np.array([
                    index for distance, index in zip(dist, ind)
                    if distance <= self.radius or distance == dist[0]
                ]) for dist, ind in zip(dists, inds)
            ]
            dists = [
                np.array([
                    distance for distance in dist
                    if distance <= self.radius or distance == dist[0]
                ]) for dist in dists
            ]
            weights = _get_weights(dists, self.weights)
            _y = self.knn_model._y
            # Track whether to flatten the output before reshaping; the
            # original re-checked _y.ndim after the reshape, so the final
            # ravel never fired and 1D targets came back as (n, 1).
            ravel_output = _y.ndim == 1
            if ravel_output:
                _y = _y.reshape((-1, 1))
            if weights is None:
                y_pred = np.array(
                    [np.mean(_y[ind, :], axis=0) for ind in inds])
            else:
                y_pred = np.empty((X.shape[0], _y.shape[1]),
                                  dtype=np.float64)
                for k in range(X.shape[0]):
                    for j in range(_y.shape[1]):
                        y_pred[k, j] = np.sum(_y[inds[k], j] * weights[k]
                                              / sum(weights[k]))
            if ravel_output:
                y_pred = y_pred.ravel()
            return y_pred
        else:
            return self.knn_model.predict(X)
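# A minimal usage sketch for the kradius estimator above; the synthetic data
# and the radius value are assumptions, not part of the original source.
import numpy as np

rng = np.random.RandomState(42)
X_demo = rng.rand(50, 2)
y_demo = X_demo[:, 0] + X_demo[:, 1]
est = kradius(n_neighbors=5, radius=0.2).fit(X_demo, y_demo)
print(est.predict(X_demo[:3]))  # neighbors beyond radius 0.2 are ignored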
Xtrn_KNN = std_scale.transform(Xtrn_KNN)
Xtst_KNN = std_scale.transform(Xtst_KNN)

print('Running KNN')
# the best hyper-parameter for this case is 20
reg = KNeighborsRegressor(n_neighbors=20, weights='distance',
                          algorithm='brute', leaf_size=30, p=2,
                          metric='minkowski', metric_params=None,
                          n_jobs=None)
reg.fit(Xtrn_KNN, ytrn_KNN)
ypreds_knn = reg.predict(Xtst_KNN)
top_nns_dist, top_nns_inds = reg.kneighbors(Xtst_KNN,
                                            n_neighbors=Xtrn_KNN.shape[0])

print('Saving files for KNN')
np.save(save_dir + 'KNN_preds.npy', ypreds_knn)
np.save(save_dir + 'KNN_dists.npy', top_nns_dist)
np.save(save_dir + 'KNN_inds.npy', top_nns_inds)
print()

############# Get results for Lasso ##################################
# transpose and split the data for Lasso
print('Slicing and Standardizing Data for Lasso')
data_Lasso = np.transpose(data)
Xdata_Lasso = data_Lasso[:, beta_trn_inds]
ydata_Lasso = data_Lasso[:, beta_tst_inds]
Xtrn_Lasso = Xdata_Lasso[Xgenes, :]
Xtst_Lasso = Xdata_Lasso[ygenes, :]
def DetectCurrentFace(hebi, Group):
    import sys

    import numpy as np
    import scipy.io as scio
    from sklearn.neighbors import KNeighborsRegressor

    ### This was used for testing purposes only
    # import hebi  # for the Hebi motors
    # from time import sleep
    #
    # # Need to look into XML formatting for Hebi Gains
    # # sio.loadmat('defaultGains.mat')
    #
    # lookup = hebi.Lookup()  # Get table of all Hebi motors
    # sleep(2)  # gives the Lookup process time to discover modules
    #
    # # Displays the Hebi modules found on the network
    # print('Modules found on the network:')
    # for entry in lookup.entrylist:
    #     print('{0} | {1}'.format(entry.family, entry.name))
    # print('\n')
    #
    # var = raw_input('Were any modules found? [y/N]: \n')
    # if var in ('y', 'Y'):
    #     print('\nYay!\n')
    # else:
    #     print('\nNONE FOUND!\n')
    #     sys.exit()
    #
    # Group = lookup.get_group_from_family('*')
    # infoTable = Group.request_info()
    ### End of the testing-only block

    # training data gathered from MATLAB
    trainingData = scio.loadmat('IMUTrainingRutgers.mat')
    # np.float was removed from recent NumPy; plain float is equivalent here
    labels = float(trainingData['labs'][0][0][0])
    for i in range(1, len(trainingData['labs'])):
        labels = np.append(labels, float(trainingData['labs'][i][0][0]))

    # Create and fit the KNN model
    knn = KNeighborsRegressor(n_neighbors=10)
    knn.fit(trainingData['trainingData'], labels)

    fbk = hebi.GroupFeedback(Group.size)
    Group.feedback_frequency = 200.0
    fbk = Group.get_next_feedback(reuse_fbk=fbk)
    # if fbk.size != trainingData['nbMotors'][0][0]:
    #     print('Something is wrong with the number of connected motors!')
    #     return 0

    accel = fbk.accelerometer.reshape(1, -1)
    # the 10 training lines that most closely match, in variable "n"
    [d, n] = knn.kneighbors(accel, 10)
    # label values predicted for those neighbours
    predicted_lines = np.asanyarray(labels[n[0]], dtype=int)
    # count each instance of face numbers
    counts = np.bincount(predicted_lines)
    # the face with the most instances is our prediction
    face = np.argmax(counts)
    return face
pdate = dateutil.parser.parse(row[0])
ptime = pdate.hour * 3600 + pdate.minute * 60 + pdate.second
tdistance = get_distance(row[3], row[2], row[5], row[4])
plat = row[3]
plon = row[2]
dlat = row[5]
dlon = row[4]
ttime = row[1]
test_data.append([ttime, ptime, tdistance, plat, plon, dlat, dlon])

test_data = np.asmatrix(test_data)
test_data[:, [1, 2, 3, 4, 5, 6]] = \
    (test_data[:, [1, 2, 3, 4, 5, 6]] - mean) / std

optimal_k = 0
optimal = None
for k in range(5, 21):
    neigh = KNeighborsRegressor(n_neighbors=k)
    neigh.fit(train_data[:, [1, 2, 3, 4, 5, 6]], train_data[:, 0])
    dist, ind = neigh.kneighbors(test_data[:, [1, 2, 3, 4, 5, 6]])
    result = []
    for row in range(ind.shape[0]):
        e = []
        for i in range(ind.shape[1]):
            e.append(train_data[ind[row, i], 0])
        result.append(e)
    result = np.asmatrix(result)
    # predict with the median of the k neighbours' targets
    median = np.median(result, axis=1)
    mae = mean_absolute_error(median, test_data[:, 0])
    print(mae)
    # keep the k with the lowest MAE (assumed intent; the original compared
    # with `>` against a never-updated `optimal`, so it kept the last k)
    if optimal is None or mae < optimal:
        optimal = mae
        optimal_k = k
print(optimal_k)
# Use a nearest neighbors algorithm to augment the data to expand the train set
if data_augmentation:
    print("Performing data augmentation")
    text_tfidf = text_tfidf.toarray()
    augmented_train_tfidf = list(text_tfidf.copy())
    augmented_train_labels = list(train_val_labels.copy())
    knn = KNeighborsRegressor(n_neighbors=4,
                              weights='distance').fit(text_tfidf,
                                                      train_val_labels)
    shuffled_indexes = list(range(len(augmented_train_tfidf)))
    np.random.shuffle(shuffled_indexes)
    # Augment 20% of the train data and add it to the original set
    for index in shuffled_indexes[0:int(len(augmented_train_tfidf) / 5)]:
        datapoint_text = np.reshape(augmented_train_tfidf[index], (1, -1))
        datapoint_label = augmented_train_labels[index]
        neighbor = knn.kneighbors(datapoint_text, return_distance=False)
        # pick one of the 3 non-self neighbors at random
        random_neighbor = np.random.randint(1, 4)
        difference = text_tfidf[neighbor[0][random_neighbor]] \
            - datapoint_text
        gap = np.random.rand(1)[0]
        # SMOTE-style interpolation between the point and its neighbor
        new_point = datapoint_text + difference * gap
        augmented_train_tfidf = np.append(augmented_train_tfidf, new_point,
                                          axis=0)
        augmented_train_labels.append(datapoint_label)
    text_tfidf = sparse.csr_matrix(augmented_train_tfidf)
    train_val_labels = augmented_train_labels

# Initialize a Logistic Regression classifier and fit it on the tf-idf
# training data
print("Training the model")
classifier = SGDClassifier(loss='log',
def album_recommender(album_dataset, recommender_output_path):
    recommender_dataset = album_dataset
    non_numerical_cols = [
        'artists', 'album_cover', 'album_name', 'album_id',
        'first_track_id', 'album'
    ]
    X = recommender_dataset.drop(columns=non_numerical_cols).copy()
    y_tempo = X['tempo']

    # Instantiate MinMaxScaler() and fit/transform each audio-feature column
    minmax = MinMaxScaler()
    for col in ['danceability', 'energy', 'speechiness', 'acousticness',
                'instrumentalness', 'liveness', 'valence', 'tempo',
                'loudness', 'key']:
        X[col] = minmax.fit(X[[col]]).transform(X[[col]])

    # Instantiate and train the audio-feature model
    knn_tempo = KNeighborsRegressor().fit(X, y_tempo)
    # Use the model's kneighbors method to get, for every album, the albums
    # nearest to it; returns a (distances, indices) tuple
    knn_recommended_tempo = knn_tempo.kneighbors(X, n_neighbors=5)
    # Grab the indexes of the recommended albums, dropping each album itself
    suggested_album_indexes_tempo = knn_recommended_tempo[1][:, 1:]
    # Turn the KNN results into a dict keyed by row index, then a dataframe
    suggested_album_dict = dict(enumerate(suggested_album_indexes_tempo))
    suggested_album_index_df = pd.DataFrame(
        suggested_album_dict.values(), index=suggested_album_dict.keys())
    # Concatenate to the main dataframe & rename columns
    recommender_dataset_with_X = pd.concat([X, suggested_album_index_df],
                                           axis=1)
    recommender_dataset_with_X.rename(columns={
        0: 'rec_album_1',
        1: 'rec_album_2',
        2: 'rec_album_3',
        3: 'rec_album_4'
    }, inplace=True)

    # Index/id-matching dictionaries for the album metadata
    album_image_dictionary = dict(recommender_dataset["album_cover"])
    album_id_dictionary = dict(recommender_dataset['album_id'])
    album_artist_dictionary = dict(recommender_dataset['artists'])
    album_name_dictionary = dict(recommender_dataset['album_name'])

    # New columns for the suggested album cover images (urls), artists and
    # names, then replace each rec column's row index with the album_id
    for i in range(1, 5):
        rec_col = recommender_dataset_with_X['rec_album_%d' % i]
        recommender_dataset_with_X['rec_album_image_%d' % i] = \
            rec_col.map(album_image_dictionary)
        recommender_dataset_with_X['rec_album_artist_%d' % i] = \
            rec_col.map(album_artist_dictionary)
        recommender_dataset_with_X['rec_album_name_%d' % i] = \
            rec_col.map(album_name_dictionary)
        recommender_dataset_with_X['rec_album_%d' % i] = \
            rec_col.map(album_id_dictionary)

    # Concatenate the original dataset with the suggested-albums dataset
    recommender_dataset = pd.concat(
        [recommender_dataset, recommender_dataset_with_X], axis=1)
    # Write the result as csv to a file path
    recommender_dataset.to_csv(recommender_output_path, index=False)
## predicting
file_path = os.path.join(data_path, "python_training.csv")
df = pd.read_csv(file_path, sep=';')

# filtering outliers (> 12k euros per sqm)
df = df.loc[df['pricesqm'] < 12000]

file_path = os.path.join(data_path, "python_to_predict.csv")
X = pd.read_csv(file_path, sep=';')
X_ = processing(X)
X_ = feature_processing.transform(X_)

estimate = int(model.predict(X_)[0])
kn = model.kneighbors(X_, n_neighbors=10, return_distance=True)
ids = [df['id'].iloc[x] for x in kn[1][0]]
distances = [x for x in kn[0][0]]
estimates = [float(df.iloc[int(x)]['pricesqm']) for x in kn[1][0]]

ids.insert(0, int(X['id']))
# average distance from the 10 nearest --> confidence index
distances.insert(0, np.mean(distances))
estimates.insert(0, estimate)

result = {'id': ids, 'estimate': estimates, 'distance': distances}
result = pd.DataFrame(data=result)
def _find_accuracy_num_homes(appliance, num_homes, start_seed, end_seed,
                             feature="Monthly"):
    if appliance == "hvac":
        start, stop = 5, 11
    else:
        start, stop = 1, 13

    out = {}
    out_overall = {}
    # We need to find homes that have all the features
    appliance_df = df.loc[df[
        ['%s_%d' % (appliance, month) for month in range(start, stop)]
    ].dropna().index]
    all_homes = appliance_df.index
    kf = KFold(len(all_homes), n_folds=5)
    for cv_loop_index, (train_index, test_index) in enumerate(kf):
        out_overall[cv_loop_index] = {}
        train_df = appliance_df.loc[all_homes[train_index]]
        test_df = appliance_df.loc[all_homes[test_index]]
        print("TRAINING>>>")
        # Now, for each random seed, we'll pick `num_homes` homes from the
        # train set, do CV on that to pick the best features and then
        # predict for the test homes
        error_df_list = {}
        # 1. Choose `num_homes` from the train set, multiple times
        for random_seed in range(start_seed, end_seed):
            print("Random seed:", random_seed)
            train_subset_homes_idx = np.random.choice(len(train_df),
                                                      num_homes,
                                                      replace=False)
            train_subset_homes = train_df.loc[
                train_df.index[train_subset_homes_idx]]
            # 2. Now, on this small subset of homes, do a round of CV to
            # learn the optimum features
            l = LeaveOneOut(len(train_subset_homes_idx))
            for cv_train, cv_test in l:
                cv_train_home = appliance_df.loc[
                    train_subset_homes.index[cv_train]]
                cv_test_home = appliance_df.loc[
                    train_subset_homes.index[cv_test]]
                test_home_name = cv_test_home.index.values[0]
                Y = cv_train_home[['%s_%d' % (appliance, i)
                                   for i in range(start, stop)]].sum(
                    axis=1).values
                forest = ExtraTreesRegressor(n_estimators=250,
                                             random_state=0)
                forest.fit(cv_train_home[feature_map[feature]], Y)
                importances = forest.feature_importances_
                indices = np.argsort(importances)[::-1]
                # Now varying K and top-N features
                out[test_home_name] = {}
                for K in range(K_min, K_max):
                    out[test_home_name][K] = {}
                    for top_n in range(F_min, F_max):
                        out[test_home_name][K][top_n] = []
                        top_n_features = cv_train_home[
                            feature_map[feature]].columns[indices][:top_n]
                        # Now fitting KNN on this
                        for month in range(start, stop):
                            clf = KNeighborsRegressor(n_neighbors=K)
                            clf.fit(cv_train_home[top_n_features],
                                    cv_train_home['%s_%d' % (appliance,
                                                             month)])
                            dist, ind = clf.kneighbors(
                                cv_test_home[top_n_features])
                            nghbrs = cv_train_home.index.values[
                                ind].flatten()
                            proportion = cv_train_home.loc[nghbrs][
                                '%s_%d' % (appliance, month)].div(
                                df_unnormalised.loc[nghbrs][
                                    '%s_%d' % ("aggregate", month)])
                            mean_prop = proportion.mean()
                            out[test_home_name][K][top_n].append(
                                df_unnormalised.loc[cv_test_home.index][
                                    '%s_%d' % ("aggregate", month)]
                                * mean_prop)
            accur = {}
            # We want the F, K combination that minimised the median (over
            # homes) and mean (over months) error
            for K in range(K_min, K_max):
                accur[K] = {}
                for top_n in range(F_min, F_max):
                    accur[K][top_n] = {}
                    for h in out:
                        pred = pd.DataFrame(out[h][K][top_n]).T
                        pred.index = [h]
                        pred.columns = ['%s_%d' % (appliance, i)
                                        for i in range(start, stop)]
                        gt = appliance_df.loc[h][
                            ['%s_%d' % (appliance, i)
                             for i in range(start, stop)]]
                        error = (pred - gt).abs().div(gt).mul(
                            100).squeeze()
                        accur[K][top_n][h] = error
                    accur[K][top_n] = pd.DataFrame(
                        accur[K][top_n]).T.median().mean()
            accur_df = pd.DataFrame(accur)
            accur_min = accur_df.min().min()
            min_ac_df = accur_df[accur_df == accur_min]
            F_best = cv_train_home[feature_map[feature]].columns[indices][
                :min_ac_df.mean(axis=1).dropna().index.values[0]].tolist()
            K_best = min_ac_df.mean().dropna().index.values[0]
            # Now predicting for the test homes
            pred_test = {}
            gt_test = {}
            for month in range(start, stop):
                clf = KNeighborsRegressor(n_neighbors=K_best)
                clf.fit(train_subset_homes[F_best],
                        train_subset_homes['%s_%d' % (appliance, month)])
                dist, ind = clf.kneighbors(test_df[F_best])
                # truncate to K_best neighbours (the original sliced with
                # the leaked CV loop variable K)
                nghbrs = train_subset_homes[F_best].index.values[
                    ind].flatten()[:K_best]
                nr = train_subset_homes.loc[nghbrs][
                    '%s_%d' % (appliance, month)]
                dr = df_unnormalised.loc[nghbrs][
                    '%s_%d' % ("aggregate", month)]
                nr.name = dr.name
                proportion = nr.div(dr)
                mean_prop = proportion.mean()
                pred_test[month] = df_unnormalised.loc[test_df.index][
                    '%s_%d' % ("aggregate", month)] * mean_prop
                gt_test[month] = test_df['%s_%d' % (appliance, month)]
            pred_df = pd.DataFrame(pred_test)
            pred_df.index = test_df.index
            gt_df = pd.DataFrame(gt_test)
            error = (gt_df - pred_df).abs().div(gt_df).mul(100)
            out_overall[cv_loop_index][random_seed] = error

    errors = {}
    for random_seed in range(start_seed, end_seed):
        temp_list = []
        for cv_loop_index in range(len(kf)):
            temp_list.append(out_overall[cv_loop_index][random_seed])
        errors[random_seed] = pd.concat(temp_list)
    return errors
def fit(self, L, U, maxIt=1000, poolSize=100, wSize=10, **kwargs):
    # Initialize the labeled training set
    L = Data(np.copy(L.X), np.copy(L.y))
    # Select a pool of unlabeled data
    UpoolIndexs = np.random.choice(len(U), poolSize, replace=False)
    Upool = [U[i] for i in UpoolIndexs]
    # Create the regressor and train it on the labeled data
    model = self.learner(**kwargs)
    model.fit(L.X, L.y)
    # repeat for maxIt rounds
    for i in range(maxIt):
        print(i)
        UpoolYs = model.predict(Upool)
        # get the neighbors of each unlabeled point, as indices into L
        kNN = KNeighborsRegressor(n_neighbors=self.k)
        kNN.fit(L.X, L.y)
        UpoolNDistances = [sum(ns) for ns in kNN.kneighbors(Upool)[0]]
        # W holds the wSize pool points with the smallest total
        # neighbor distance
        W = heapq.nsmallest(wSize, [(k, t) for t, k in
                                    enumerate(UpoolNDistances)])
        W = [w[1] for w in W]
        Wpool = [Upool[r] for r in W]
        WNeighbors = kNN.kneighbors(Wpool, return_distance=False)
        RMSEs = []
        newX = []
        newY = []
        for r in range(wSize):
            neighborsIndexs = WNeighbors[r]
            neighbors = [L.X[n] for n in neighborsIndexs]
            neighborsYs = model.predict(neighbors)
            avgY = sum(neighborsYs) / float(self.k)
            x = Upool[W[r]]
            newX.append(x)
            newY.append(avgY)
        for x, y in zip(newX, newY):
            # L combined with the candidate point and its pseudo-label
            altL = Union(L, x, y)
            # create a model based on this altL
            altModel = self.learner(**kwargs)
            altModel.fit(altL.X, altL.y)
            altY = altModel.predict(newX)
            rmse = mean_squared_error(newY, altY)
            RMSEs.append(rmse)
        # keep the candidate whose addition gives the lowest error
        lowest = sorted(RMSEs)[0]
        index = W[RMSEs.index(lowest)]
        bestX = Upool[index]
        bestY = UpoolYs[index]
        L = Union(L, bestX, bestY)
        uIndex = U.tolist().index(bestX.tolist())
        U = np.delete(U, (uIndex), axis=0)
        model.fit(L.X, L.y)
        UpoolIndexs = np.random.choice(len(U), poolSize, replace=False)
        Upool = [U[i] for i in UpoolIndexs]
    print(L.X)
    print(L.y)
    self.model = model