Example #1
def fit_estimator(sup_x, sup_y, unsup_x, n_neighbors=3, metric='minkowski'):
    """Provide ssl methods according to self-training

    """
    knn = KNeighborsRegressor(n_neighbors=n_neighbors, metric=metric)

    knn.fit(sup_x, sup_y)

    [dist, idx] = knn.kneighbors(unsup_x)

    label_l = sup_y

    label_u = np.zeros((len(unsup_x), ))
    for t in range(0, len(unsup_x)):
        label_u[t] = np.mean(np.array([sup_y[i] for i in idx[t]]).flatten())

    y = np.hstack((label_l, label_u))
    [dist, idx] = knn.kneighbors(unsup_x)

    # a few self-training refinement iterations
    for _ in range(5):
        label_u = np.zeros((len(unsup_x), ))
        for t in range(0, len(unsup_x)):
            label_u[t] = np.mean(np.array([y[j] for j in idx[t]]).flatten())
        y = np.hstack((label_l, label_u))

    return label_u
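A minimal way to call the function above (the imports and toy arrays are assumptions added for illustration, not part of the original snippet):

import numpy as np
from sklearn.neighbors import KNeighborsRegressor

rng = np.random.RandomState(0)
sup_x = rng.rand(20, 2)
sup_y = sup_x.sum(axis=1)
unsup_x = rng.rand(10, 2)

pseudo_labels = fit_estimator(sup_x, sup_y, unsup_x, n_neighbors=3)
print(pseudo_labels.shape)  # (10,)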
def chooseSample(S, c, k2, k1):
    xs = S[S['Class'] == c]
    #iterate over each sample of class c
    for row in xs.iterrows():
        d1 = row[1]
        d1 = d1.drop('Class')
        y = S['Class']
        X = S.drop('Class', axis=1)

        #get k2 nearest neighbour using mahalanobis distance metric
        cov = np.cov(X, rowvar=False)
        knn = KNeighborsRegressor(n_neighbors=k2,
                                  metric="mahalanobis",
                                  metric_params=dict(V=cov))
        knn.fit(X, y)

        # kneighbors expects a 2-D input, so wrap the single sample in a list
        d1 = [d1]

        #print(str(d1))
        neighbour = knn.kneighbors(d1)
        #print("length:" + str(len(neighbour)))
        neighbour_length = len(neighbour)
        if (neighbour_length >= k1):
            S['weight'] = neighbour_length / k2

    return S
Example #3
class RegressionSMOTE(BaseEstimator):
    def __init__(self, n, k, sigma, **kwargs):
        self.n = n
        self.k = k
        self.sigma = sigma
        self.random_state = kwargs.get('random_state', np.random.randint(2**31 - 1))  # np.random.seed needs an int

    def fit(self, X, y):
        self.knn = KNeighborsRegressor(n_neighbors=self.k, weights='distance').fit(X, y)
        return self

    def transform(self, X, y):
        np.random.seed(self.random_state)

        ix = np.random.choice(len(X), self.n)
        nn = self.knn.kneighbors(X[ix], return_distance=False)
        newY = self.knn.predict(X[ix])
        nni = np.random.choice(self.k, self.n)
        ix2 = np.array([n[i] for n, i in zip(nn, nni)])

        dif = X[ix] - X[ix2]
        gap = np.random.rand(self.n, 1)
        newX = X[ix] + dif * gap
        newX = newX + np.random.rand(*newX.shape) * self.sigma

        return newX, newY

    def fit_transform(self, X, y):
        self.fit(X, y)
        return self.transform(X, y)
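A minimal usage sketch (the toy data and the random_state value are assumptions; the class above also needs numpy, BaseEstimator and KNeighborsRegressor imported before it is defined):

import numpy as np

X = np.random.rand(50, 3)
y = X.sum(axis=1)

smote = RegressionSMOTE(n=20, k=5, sigma=0.01, random_state=0)
new_X, new_y = smote.fit_transform(X, y)
print(new_X.shape, new_y.shape)  # (20, 3) (20,)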
Example #4
    def test_regression(self):
        X, y = load_boston(return_X_y=True)

        n_examples = len(y)
        n_train = int(0.75 * n_examples)

        np.random.seed(987321)

        train_X, train_y = X[:n_train, :], y[:n_train]
        test_X, test_y = X[n_train:, :], y[n_train:]

        np.random.seed(None)

        # sklearn
        sk = KNeighborsRegressor(n_neighbors=3)
        sk.fit(train_X, train_y)

        dists_sk, idx_sk = sk.kneighbors(test_X)
        y_pred_sk = sk.predict(test_X)

        # Mine
        myknn = KNearestNeighborsRegressor(k=3)

        myknn.fit(train_X, train_y)

        nearest_neighbors_mine, idx, dists = myknn.get_k_nearest_neighbors(
            test_X, return_idx=True, return_distances=True)

        y_pred = myknn.predict(test_X)

        self.assertTrue(np.allclose(dists_sk, np.sqrt(dists)))
        self.assertTrue(np.all(idx_sk == idx))
        self.assertTrue(np.allclose(y_pred, y_pred_sk))
def run_kNeighbors(distances, loadings, test_vars, 
                   weightings=('uniform',), k_list=(3,)):
    """
    Run k-nearest neighbors using precomputed distances to create an ontological mapping
    
    Args:
        distances: square distance matrix to pass to KNeighborsRegressors
        loadings: loading matrix for training
        test_vars: variable to reconstruct
        weightings: (optional) list of weightings to pass to KNeighbors
        k_list: list of k values to pass to KNeighbors as n_neighbors
    """
    train_distances = distances.loc[loadings.index, loadings.index]
    test_distances = distances.loc[test_vars, loadings.index]
    to_return = pd.DataFrame()
    for weighting in weightings:
        for k in k_list:
            clf = KNeighborsRegressor(metric='precomputed', n_neighbors=k, weights=weighting)
            clf.fit(train_distances, loadings)
            out = clf.predict(test_distances)
            out = pd.DataFrame(out, columns=loadings.columns)
            out['var'] = test_vars
            out['k'] = k
            out['weighting'] = weighting
            # add neighbors and distances
            neighbors = clf.kneighbors(test_distances)
            out['distances'] = tuple(neighbors[0])
            out['neighbors'] = tuple(test_distances.columns[neighbors[1]])
            to_return = pd.concat([to_return, out], sort=False)
    return to_return
def run_kNeighbors(distances,
                   loadings,
                   test_vars,
                   weightings=('uniform', ),
                   k_list=(3, )):
    """
    Run k-nearest neighbors using precomputed distances to create an ontological mapping
    
    Args:
        distances: square distance matrix to pass to KNeighborsRegressors
        loadings: loading matrix for training
        test_vars: variable to reconstruct
        weightings: (optional) list of weightings to pass to KNeighbors
        k_list: list of k values to pass to KNeighbors as n_neighbors
    """
    train_distances = distances.loc[loadings.index, loadings.index]
    test_distances = distances.loc[test_vars, loadings.index]
    to_return = pd.DataFrame()
    for weighting in weightings:
        for k in k_list:
            clf = KNeighborsRegressor(metric='precomputed',
                                      n_neighbors=k,
                                      weights=weighting)
            clf.fit(train_distances, loadings)
            out = clf.predict(test_distances)
            out = pd.DataFrame(out, columns=loadings.columns)
            out['var'] = test_vars
            out['k'] = k
            out['weighting'] = weighting
            # add neighbors and distances
            neighbors = clf.kneighbors(test_distances)
            out['distances'] = tuple(neighbors[0])
            out['neighbors'] = tuple(test_distances.columns[neighbors[1]])
            to_return = pd.concat([to_return, out], sort=False)
    return to_return
Example #7
class QuantileKNN():
    
    def __init__(self, n_neighbors = 50):
        self.n_neighbors = n_neighbors
        
        #keep estimator in memory 
        self.reg = None
        
    def fit(self,X_train,y_train):
        self.neigh=KNeighborsRegressor(n_neighbors=self.n_neighbors)
        self.neigh.fit(X_train,y_train)
        self.X_train=X_train
        self.y_train=y_train
        
    def predict(self,x):
        def get_quantiles(element,indices,array):

            quantiles = np.arange(1,100)/100.0
            temp=array[indices]
            
            dist = stats.norm(np.mean(temp),np.std(temp))
            quant=[]
            for quantile in quantiles :
                quant.append(dist.ppf(quantile))
            
            return quant

        predictions_gbr=[]
        for element in tqdm(x):
            indices=self.neigh.kneighbors([element], return_distance=False)
            predictions_gbr.append(get_quantiles(element,indices,self.y_train))

        return predictions_gbr
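A small usage sketch (the toy data is an assumption; the class above additionally relies on numpy, scipy.stats as stats, tqdm and KNeighborsRegressor being in scope):

import numpy as np

X_train = np.random.rand(200, 2)
y_train = X_train[:, 0] * 10 + np.random.randn(200)
X_query = np.random.rand(3, 2)

qknn = QuantileKNN(n_neighbors=50)
qknn.fit(X_train, y_train)
quantile_curves = qknn.predict(X_query)  # one list of 99 quantiles per query row
print(len(quantile_curves), len(quantile_curves[0]))  # 3 99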
Example #8
def find(movies):
    X, y, genre_mean = similar_movie(movies, movies_info_final,
                                     cosine_similarities_total)
    neigh = KNeighborsRegressor(n_neighbors=10)
    neigh.fit(X, y)
    movies_index = neigh.kneighbors(np.reshape(genre_mean, (1, -1)))[1][0]
    ret = [y[i] for i in movies_index]
    return ret
Example #9
class kNN():
    '''
        kNN regressor
        -------------
    '''

    def __init__(self,N_i,N_o,k=5,n=20):
        # note: N_o=1 assumed for now
        self.N_i = N_i
        self.n = n
        self.i = 0
        self.k = k
        self.X = zeros((self.n,N_i))
        self.y = zeros((self.n))
        self.h = KNeighborsRegressor(n_neighbors=k, weights='distance')
        self.c = 0
        #self.error_rate = 0

    def predict(self,x):
        '''
            Predict
            --------------
        '''

        if self.c < 1.:
            print "[Warning!] No training examples!"
            return 0.0
        elif self.c <= self.k:
            dist,ind = self.h.kneighbors(self.X[0:self.c],n_neighbors=1)
            i_max = argmax(ind)
            return self.y[i_max]

        return self.h.predict(x.reshape(1, -1))

#    def samples_X(self):
#        ''' return samples of the WEIGHTS '''
#        if self.c <= 0:
#            return self.X[0,:]
#        return self.X[0:self.c,:]

    def update(self, x, y):
        '''
            Update
            --------------
        '''
        self.X[self.i,:] = x
        self.y[self.i] = y

        #self.error_rate = (y - self.predict(x))**2

        self.i = (self.i + 1) % self.n

        if self.c < self.n:
            self.c = self.c + 1

        self.h.fit(self.X[0:self.c,:], self.y[0:self.c])
Example #10
def knn(neighbors=1):

    model = KNeighborsRegressor(neighbors)

    X = np.array([[-1, -1.5], [-2, -1.5], [-3, -2], [1, 1], [2, 1], [3, 3]])
    random_y_values = np.array([2, 3, 4, 5, 4, 1])
    to_predict = [0, 0]

    model.fit(X, random_y_values)
    dist, ind = model.kneighbors([to_predict])

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 7))

    circle = plt.Circle(to_predict, max(dist[0]), color='g', alpha=.2)
    axes[0].add_artist(circle)

    axes[0].plot(to_predict[0], to_predict[1], 'x', color='g', mew=3)

    axes[0].scatter(X[:, 0], X[:, 1], color='black')

    closest_points = X[ind[0]]

    axes[0].set_title('Distances')
    x_coords = closest_points.transpose()[0]
    y_coords = closest_points.transpose()[1]
    axes[0].scatter(x_coords, y_coords, color='r')

    for i in range(len(random_y_values)):
        position = X[i]
        axes[0].text(position[0] - .05, position[1] + .07,
                     str(random_y_values[i]))

    num_points = len(ind[0])
    axes[1].set_xlim([0, 7])
    axes[1].set_ylim([0, 6])

    values = []
    for i in range(num_points):
        value = random_y_values[ind[0][i]]
        axes[1].vlines(x=i + 1, ymin=0, ymax=value, color='r', linewidths=15)
        values.append(value)
    axes[1].hlines(y=np.mean(values),
                   xmin=0,
                   xmax=12,
                   linestyles='dashed',
                   linewidths=2,
                   color='g')

    axes[1].set_title('Values of k closest')
    axes[1].set_xlabel('k')
    axes[1].set_ylabel('Value')
    plt.show()
    print('Predicted Value: ', np.mean(values))
Example #11
def mach_learn(listid, d, k):
    predictions = {}
    # for i in range(len(listid)):
    # 	#forloop for the reference
    for j in range(len(listid)):
        #for loop over the target
        # if i !=j:
        # R=listid[i]#reference
        T = listid[j]  #target
        # rsp=d[T][0]#real sp of the target with respect to the reference
        X = []
        Y = []
        X_test = []
        # psop=[]
        for z in range(len(listid)):
            # if z!=i and z!=j:
            if z != j:
                #for loop to define the space
                sop = d[listid[z]][
                    0]  #the sp score of the coord z with respect to the reference
                listina = []
                x_test = []
                for c in range(len(listid)):
                    #for loop to fill the training vectors
                    # if c!=i and c!=j:
                    if c != j:
                        listina.append(d[listid[z]][1][c])
                        x_test.append(float(d[T][1][c]))
                X.append(
                    listina
                )  # X will be the coordinates in the n-dimensional space
                Y.append(
            sop)  # Y will be the tc/sop score associated with each point
        X_test.append(x_test)
        X = np.asarray(X, dtype='float')
        Y = np.asarray(Y)
        #X_test=np.asarray(X_test)
        neigh = KNeighborsRegressor(n_neighbors=int(k), weights='distance')
        neigh.fit(
            X, Y
        )  #training using R as reference to compute the sp and T to be predicted
        pred = neigh.predict(X_test)  #prediction of T with R as reference
        dist = neigh.kneighbors(X_test)
        dis = dist[0][0][0]
        # num=dist[1][0][0]
        #dis=sum(dist[0][0])/float(k)
        #predictions[(R,T)]=[float(pred), float(dis), int(num)]
        # predictions[(R,T)]=[float(pred), float(dis)]
        predictions[T] = [float(pred), float(dis)]
    return predictions
Example #12
def localLinearRegression(trainingCoordinates, trainingResponses, testCoordinates, neighborNumber, trainingWeights=None):
    
    sampleNumber = testCoordinates.shape[0]
    dim=testCoordinates.shape[1]

    if trainingWeights is None:
        trainingWeights=np.ones(trainingCoordinates.shape[0])

    testResponses = np.zeros(sampleNumber)

    knnreg = KNeighborsRegressor(n_neighbors=neighborNumber, weights='uniform', algorithm='kd_tree', leaf_size=40, p=2)
    knnreg.fit(trainingCoordinates, trainingResponses)

    for sample in range(0, sampleNumber):
        # find kNN

        [dist, ni] = knnreg.kneighbors(testCoordinates[sample,:].reshape(1,-1), neighborNumber)

        X=np.concatenate((np.ones(neighborNumber).reshape(-1,1),trainingCoordinates[ni.squeeze(),:]),axis=1)
        y=trainingResponses[ni.squeeze()]
        w=np.diag(trainingWeights[ni.squeeze()])


        rhs=np.zeros(dim+1)
        
        rhs=y.dot(w.dot(X))
        #for m in range(0, dim+1):
        #    for k in range(0,neighborNumber):
        #        rhs[m]+=X[k,m]*y[k]*w[k]

        lhs=np.zeros((dim+1,dim+1))
        
        lhs=(X.T).dot(w.dot(X))
        #for m in range(0,dim+1):
        #    for n in range(0,dim+1):
        #        for k in range(0,neighborNumber):
        #            lhs[m,n]+=X[k,m]*X[k,n]*w[k]

        try:
            coefficients = np.linalg.solve(lhs, rhs)
        except np.linalg.LinAlgError:
            # singular local system; stop early
            break

        testResponses[sample] = np.concatenate((np.ones(1), testCoordinates[sample, :])).dot(coefficients)

    return testResponses,coefficients,ni
Example #13
def test_knn_regression():

	datafile_viper = '../data_viper/viper.pkl'
	viper = loadfile(datafile_viper)

	from sklearn.neighbors import KNeighborsRegressor
	model = KNeighborsRegressor(n_neighbors=5, weights='uniform', metric='euclidean')
	model.fit(viper.train_feat, viper.train_y)

	n_test = len(viper.test_feat)
	y_pred = np.zeros(n_test)
	for i, feat in zip(np.arange(n_test), viper.test_feat):
		dist, ind = model.kneighbors(feat.reshape(1, -1))
		y_pred[i] = (viper.train_y[ind]*np.exp(-dist**2)).sum()/(np.exp(-dist**2)).sum()
	
	# y_pred = model.predict(viper.test_feat)
	print('testing error {}'.format(abs_error(y_pred, viper.test_y)))
def knn_manip(inp, processed, adhoc):
    """
	Finds the most similar players to a given input using KNNRegressor
	:param inp: A dictionary of player attribute-value pairs
	:param processed: Dataset containing player skill attributes
	:param adhoc: Dataset containing player personality attributes
	:return: returns name, link to photos and positions of similar players
	"""
    predict_config = config_text['predict']
    df = pd.DataFrame(inp, index=[0])
    df['Simple_Position'] = df.apply(simple_position, axis=1)
    my_cols_list = predict_config['pos_list']
    df = df.reindex(columns=[*df.columns.tolist(), *my_cols_list],
                    fill_value=0)
    req_simp = df['Simple_Position'].values[0]
    col_name = 'Simple_Position_' + req_simp
    df[col_name] = 1
    df.drop(labels=['Position', 'Simple_Position'], axis=1, inplace=True)

    # Find neighbors from processed data

    features_list = predict_config['features_list'] + [
        col for col in processed.columns if col.startswith('Simple_')
    ]
    position_data = processed.loc[processed[col_name] == 1, :]
    y_train = position_data['Value']
    X_train = position_data[features_list]

    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)

    df = df[features_list]
    df = scaler.transform(df)

    regressor = KNeighborsRegressor(n_neighbors=5)
    regressor.fit(X_train, y_train)
    nneighbors = position_data.iloc[regressor.kneighbors(df)[1][0], :]
    nneighbor_id = nneighbors['ID'].tolist()

    nname = adhoc.loc[adhoc['ID'].isin(nneighbor_id), 'Name'].tolist()
    nid = adhoc.loc[adhoc['ID'].isin(nneighbor_id), 'Photo'].tolist()
    npos = adhoc.loc[adhoc['ID'].isin(nneighbor_id), 'Position'].tolist()

    return nname, nid, npos
Example #15
def trainData(n, p, b, totalTrades, K, X, y):
    x_train = y_train = 0  # initialise model values
    best = 0  # the best test data set to be used in the future

    # divide the data into test data and practice data and loop over 100 times until best data set is found
    for _ in range(n):
        x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
            X, y, test_size=0.1)  # split data
        modelTest = KNeighborsRegressor(
            n_neighbors=K
        )  # use K-Neighbours Regression model for K neighbours

        modelTest.fit(x_train, y_train)  # fit the model

        if totalTrades <= 19:
            with open("tradeModel_{0}_{1}.pickle".format(b, p), "wb") as f:
                pickle.dump(modelTest, f)  # save the test data set
            break
        acc = modelTest.score(x_test, y_test)  # accuracy of predictions

        # if accuracy is higher than best recorded accuracy
        if acc > best:
            best = acc  # new accuracy is best accuracy
            with open("tradeModel_{0}_{1}.pickle".format(b, p), "wb") as f:
                pickle.dump(modelTest, f)  # save the test data set

    modelTrade = KNeighborsRegressor(
        n_neighbors=totalTrades
    )  # use K-Neighbours Regression model for totalTrades neighbours
    modelTrade.fit(x_train, y_train)  # fit the model
    trades = modelTrade.kneighbors(
        n_neighbors=totalTrades
    )  # get the distance of each trade in the KNN model
    tradeDistance = 0  # initiate variable to store total distances of all trades

    # loop over all trades and add their distance into tradeDistance
    for u in range(totalTrades):
        for v in range(len(trades)):
            tradeDistance += trades[0][u][v]

    average = tradeDistance / totalTrades  # get the average distance of all trades
    with open("tradeDistance_{0}_{1}.pickle".format(b, p), "wb") as f:
        pickle.dump(average, f)  # save the average distance for that model
Example #16
    def select(self, X, y):
        if self.distance_threshold is None:
            from sklearn.neighbors import KNeighborsRegressor

            nn = KNeighborsRegressor(n_neighbors=self.k + 1)
            nn.fit(X, y)
            dist, ind = nn.kneighbors(X)

            self.distance_threshold = np.max(np.min(dist[:, 1:], 1))

        from sklearn.neighbors import RadiusNeighborsRegressor
        self.nn = RadiusNeighborsRegressor(radius=self.distance_threshold)

        Xcand, ycand, corner_response = self._candidates(X, y)

        Xcorner, ycorner = self._nonmax_supress(Xcand, ycand, corner_response)

        idx = np.where(np.isin(X, Xcorner))
        return idx, Xcorner, ycorner
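The distance-threshold heuristic from the first half of this method can be illustrated on its own (the data and names below are mine, not from the original class):

import numpy as np
from sklearn.neighbors import KNeighborsRegressor

X = np.random.rand(100, 2)
y = np.random.rand(100)
k = 5

nn = KNeighborsRegressor(n_neighbors=k + 1)
nn.fit(X, y)
dist, ind = nn.kneighbors(X)  # first column is each point's distance to itself (0)

# radius that covers every training point's nearest true neighbour
distance_threshold = np.max(np.min(dist[:, 1:], 1))
print(distance_threshold)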
def prob_programs_above_perf_threshold_knn(
    program_feature_vectors: List[ProgramFeatureVector], 
    regressor: KNeighborsRegressor, 
    perf_threshold: Reward) -> List[float]:
  
  dist, ind = regressor.kneighbors(
    np.array(program_feature_vectors)
  )

  probs = [
    1 - mlca.helpers.probability.cdf(
      perf_threshold, 
      np.mean(regressor._y[ind[i]]), 
      np.std(regressor._y[ind[i]])
    )
    for i in tqdm(range(len(program_feature_vectors)), "prob_programs_above_perf_threshold")
  ]

  return probs
Example #18
def atemp_impute(X, y, X_missing):
    scaler = preprocessing.StandardScaler()
    X2 = scaler.fit_transform(X)
    nn = KNeighborsRegressor(1)
    nn.fit(X2, y)
    X_missing2 = scaler.transform(X_missing)
    dist, ind = nn.kneighbors(X=X_missing2,
                              n_neighbors=1,
                              return_distance=True)
    res = X.iloc[ind[:, 0]].copy()
    res['dist'] = dist
    res['y'] = y[ind[:, 0]]
    res.index = X_missing.index
    res = pd.concat([res, X_missing], axis=1)
    return res


#X = tmp[['temp','humidity', 'windspeed']]
#y = tmp.temp_diff
#X_missing =  data.loc[data.temp_diff < -10, ['temp','humidity', 'windspeed']]
Example #19
def knn(data, pred_length, D_window=14, max_k=7):
    if pred_length + D_window >= len(data):
        print('ERROR: pred_length or D_window too long')
        return None

    ret_ypred = []
    for h in range(4):
        train_feature, train_label = get_train_set(data, h, D_window,
                                                   pred_length)

        e_LOO_arr = np.zeros(max_k)
        for k in range(2, max_k + 1):
            model = KNeighborsRegressor(n_neighbors=k,
                                        weights='uniform',
                                        algorithm='auto')
            model.fit(train_feature, train_label)

            # get the k nearest neighbors
            dist_list, index_list = model.kneighbors([data[0 - D_window:]])
            k_neighbor_label = []
            for i in index_list[0]:
                k_neighbor_label.append(train_label[i])

            # prediction based on the k nearest neighbors
            ypred = model.predict([data[0 - D_window:]])
            ypred = np.asarray(list(map(round, ypred[0])))

            # compute e_LOO
            e_LOO_arr[k - 1] = LOO(k_neighbor_label, ypred, k)

        # pick the k with the smallest e_LOO
        k_min = np.argmin(e_LOO_arr[1:]) + 2
        model = KNeighborsRegressor(n_neighbors=k_min,
                                    weights='uniform',
                                    algorithm='auto')
        model.fit(train_feature, train_label)
        ypred = model.predict([data[0 - D_window:]])
        ret_ypred += list(map(round, ypred[0]))

    return np.asarray(ret_ypred)
Example #20
def MIMO_KNN_LOO_May(data):
    code = data[0]
    data = list(map(float, data[1:]))

    D_window = 14
    max_k = 7
    pred_May = []
    for h in range(4):
        train_feature, train_label = get_train_set(data, h, D_window)
        e_LOO_arr = np.zeros(max_k)
        for k in range(2, max_k + 1):
            model = KNeighborsRegressor(n_neighbors=k, weights='uniform', algorithm='auto')
            model.fit(train_feature, train_label)

            # get the k nearest neighbors
            dist_list, index_list = model.kneighbors([data[0 - D_window:]])
            k_neighbor_label = []
            for i in index_list[0]:
                k_neighbor_label.append(train_label[i])

            # prediction based on the k nearest neighbors
            ypred = model.predict([data[0 - D_window:]])
            ypred = np.asarray(list(map(round, ypred[0])))

            # compute e_LOO
            e_LOO_arr[k - 1] = LOO(k_neighbor_label, ypred, k)

        # pick the k with the smallest e_LOO
        k_min = np.argmin(e_LOO_arr[1:]) + 2

        # set k = k_min and make the prediction
        model = KNeighborsRegressor(n_neighbors=k_min, weights='uniform', algorithm='auto')
        model.fit(train_feature, train_label)
        ypred = model.predict([data[0 - D_window:]])
        ypred = list(map(round, ypred[0]))
        pred_May = pred_May + ypred

    print(pred_May)
    # replace the predicted values stored for this code in the file
    change_pred(code, pred_May)
Example #21
def baseline_predict(sup_x, sup_y, unsup_x):
    """This is a  1NN regressor with euclidean distance measure.

        sup_x : array-like
            laeled data matrix with [n_samples, n_features]

        y : array-like
            label  matrix with [n_sample, ]

        unsup_x: array-like
            unlabeled data matrix with [n_samples, n_features]
    """

    knn = KNeighborsRegressor(n_neighbors=1)
    knn.fit(sup_x, sup_y)
    [dist, idx] = knn.kneighbors(unsup_x)

    baseline_prediction = np.zeros((len(unsup_x), 1))
    for t in range(0, len(unsup_x)):
        baseline_prediction[t] = np.mean(
            np.array([sup_y[i] for i in idx[t]]).flatten())

    return baseline_prediction
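A minimal usage sketch (imports and toy arrays are assumptions added for illustration):

import numpy as np
from sklearn.neighbors import KNeighborsRegressor

sup_x = np.array([[0.0], [1.0], [2.0], [3.0]])
sup_y = np.array([0.0, 1.0, 2.0, 3.0])
unsup_x = np.array([[0.2], [2.7]])

print(baseline_predict(sup_x, sup_y, unsup_x).ravel())  # [0. 3.]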
Example #22
def MIMO_KNN_LOO_test(data):
    code = data[0]
    data = list(map(float, data[1:]))
    train_data = data[:90]
    test_data = data[90:]

    # train a separate model for each of the 4 periods (7, 7, 7 and 9 days long)
    D_window = 14
    max_k = 7
    for h in range(4):
        train_feature, train_label = get_train_set(train_data, h, D_window)
        y_label = get_test_label(test_data, h)

        e_LOO_arr = np.zeros(max_k)
        for k in range(2, max_k + 1):
            model = KNeighborsRegressor(n_neighbors=k, weights='uniform', algorithm='auto')
            model.fit(train_feature, train_label)

            # get the k nearest neighbors
            dist_list, index_list = model.kneighbors([train_data[0 - D_window:]])
            k_neighbor_label = []
            for i in index_list[0]:
                k_neighbor_label.append(train_label[i])

            # prediction based on the k nearest neighbors
            ypred = model.predict([train_data[0-D_window:]])
            ypred = np.asarray(list(map(round, ypred[0])))
            rmse = np.sqrt(((ypred - y_label) ** 2).mean())
            print(code, '  h=', h, '  k=', k, '  rmse=', rmse)

            # compute e_LOO
            e_LOO_arr[k-1] = LOO(k_neighbor_label, ypred, k)

        # pick the k with the smallest e_LOO
        k_min = np.argmin(e_LOO_arr[1:]) + 2
        print('k_min=', k_min)
Example #23
    def find_revenue_potential(self, amenity_x, scaling=True):
        """ Finds the revenue potential for an amenity
        returns rev_pot: float
        """

        # Dataframe to find neighbors from, made from a 'query property'
        query_df = self.my_property[[
            'bedrooms', 'bathrooms', 'accommodates', 'latitude', 'longitude'
        ]].astype('float')

        # Comp dataframe to find neighbors in
        df = self.comps[[
            'rev_pot', 'bedrooms', 'bathrooms', 'accommodates', 'latitude',
            'longitude', amenity_x
        ]].astype('float')
        # Creates two dataframes, one for properties that have the amenity and one for those that don't
        w_amenity = df[df[amenity_x].astype('bool')].drop(columns=[amenity_x])
        w_out_amenity = df[~df[amenity_x].astype('bool')].drop(
            columns=[amenity_x])

        # Split and scale df
        X_w, y_w, w_predict = self.create_test_df(query_df, w_amenity)

        X_w_out, y_w_out, w_out_predict = self.create_test_df(
            query_df, w_out_amenity)

        # Check to make sure dataframes have enough neighbors
        k = min(X_w_out.shape[0], X_w.shape[0])
        # Don't return results if there's not enough neighbors
        if k < 3:
            return 0

        # Initialize & fit neighbors
        kn_w = KNeighborsRegressor(n_neighbors=k,
                                   weights='distance',
                                   n_jobs=-1)
        kn_w_out = KNeighborsRegressor(n_neighbors=k,
                                       weights='distance',
                                       n_jobs=-1)
        kn_w.fit(X_w, y_w)
        kn_w_out.fit(X_w_out, y_w_out)

        # find neighbors and their similarity(distance) to the query
        w_distance, w_neighbors = kn_w.kneighbors(w_predict)
        w_neighbors_df = w_amenity.iloc[w_neighbors.flatten()]

        w_out_distance, w_out_neighbors = kn_w_out.kneighbors(w_out_predict)
        w_out_neighbors_df = w_out_amenity.iloc[w_out_neighbors.flatten()]

        # predict revenue for nearby properties with and without the amenity
        w_revenue = kn_w.predict(w_predict)[0]
        w_out_revenue = kn_w_out.predict(w_out_predict)[0]
        rev_pot = w_revenue - w_out_revenue  # calculate the potential upside

        if self.verbose:
            print(f"with amenity {w_amenity['accommodates'].mean()}")
            print(f"without amenity {w_out_amenity['accommodates'].mean()}")
            print(
                f'Average yearly revenue with {amenity_x}: $ {w_revenue:.0f} \n Without: $ {w_out_revenue:.0f} \n Yearly revenue potential :  $ { rev_pot:.0f}'
            )
        return rev_pot
Example #24
# In[45]:

knn = KNeighborsRegressor(n_neighbors=5, weights='distance', metric='cosine')

# In[46]:

knn.fit(X_train, y_train)

# In[47]:

X_train.iloc[0]

# In[48]:

knn.kneighbors(X_test.iloc[0:1])[0]

# In[49]:

knn.kneighbors(X_test.iloc[0:1])[1][0].tolist()

# In[ ]:

# In[50]:

X_test.iloc[0:1]

# In[51]:

X_train.iloc[knn.kneighbors(X_test.iloc[0:1])[1][0].tolist()]
Example #25
r2 on the training set: 0.9804899950518966
r2 on the test set: 0.9746459963987609
'''
'''
An issue with the KNR model
# Let's predict the weight of a perch that is 50 cm long and weighs 1.5 kg
'''
prd = knr.predict([[50]])
print('predicted weight of a 50 cm perch: ', prd) # [1033.33333333]

prd = knr.predict([[100]])
print('predicted weight of a 100 cm perch: ', prd) # [1033.33333333]
'''
Find the neighbors of the 50 cm perch and draw a scatter plot
'''
import matplotlib.pyplot as plt

distances, indexes = knr.kneighbors([[50]])
# draw a scatter plot of the training set
plt.scatter(train_input, train_target, c='b')

# re-draw only the neighbor samples from the training set
plt.scatter(train_input[indexes], train_target[indexes], marker='D', c='y')
plt.scatter(50, 1033, marker='^', c='r')
#plt.show()
'''
Limitation of k-nearest neighbors: when a new sample falls outside the range of the training set, the predicted value can be badly wrong
==> linear regression (LinearRegressor): an algorithm that learns a straight line when there is a single feature
'''
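A self-contained sketch of the limitation described above (the toy length/weight arrays are invented for illustration, not the original perch data):

import numpy as np
from sklearn.neighbors import KNeighborsRegressor

lengths = np.arange(10, 45, dtype=float).reshape(-1, 1)  # training range: 10-44
weights = lengths.ravel() ** 2 / 2.0                     # made-up target values

knr = KNeighborsRegressor(n_neighbors=3).fit(lengths, weights)

# any query beyond the training range gets the same answer:
# the mean target of the 3 largest training samples
print(knr.predict([[50]]), knr.predict([[100]]))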

Example #26
def main():
    spotify_client = SpotifyClient(authorization_token, user_id)

    #Scale key
    # scaler = MinMaxScaler()
    # df_kaggle['scaled_key'] = scaler.fit_transform(df_kaggle[['key']])

    #Asking users for their preferences
    query_genre = input("Which genre?\n>")
    query_pop = get_choice(df=df_kaggle, column="popularity_binned")
    query_decade = get_choice(df=df_kaggle, column="decades")
    query_duration = input("Duration of the playlist?\n> ")
    print(
        f" You selected {query_pop} {query_genre} tracks from the {query_decade} decade for a total duration of {query_duration} minutes"
    )

    #Converting to the right type
    query_genre = str(query_genre)
    query_pop = str(query_pop)
    # query_decade = str(query_decade)
    #Handle decade format
    if int(query_decade[1]) > 1:
        query_decade = query_decade.replace("'", "19")
    else:
        query_decade = query_decade.replace("'", "20")
    query_duration = int(query_duration)

    #Filtering the dataset accordingly
    filtered_results = df_kaggle[
        df_kaggle['genres'].str.contains(query_genre) &
        (df_kaggle['year'].str.contains(query_decade[0:3])) &
        (df_kaggle['popularity_binned']
         == query_pop)]  # [0:3] for 1920s / [1:3] for (1920 -1930]

    #Get features of 1 random seed track from the filtered_results'
    seed = filtered_results.sample(1)
    tempo = seed['scaled_tempo'].iat[0]
    # loudness = seed['scaled_loudness'].iat[0]
    da = seed['danceability'].iat[0]
    energy = seed['energy'].iat[0]
    # key = seed['scaled_key'].iat[0]
    # valence = seed['valence'].iat[0]

    #Training the model
    features_names = ['scaled_tempo', 'danceability',
                      'energy']  # 'scaled_loudness' , 'valence' , 'scaled_key'
    X = filtered_results[features_names]
    y = filtered_results['track_id']
    model = KNeighborsRegressor(algorithm='kd_tree', n_jobs=-1).fit(X, y)

    #Get model output for k: distances & indices
    knn_out, k = [], 100
    knn_out = model.kneighbors([[tempo, da, energy]],
                               n_neighbors=k)  # loudness, ,valence ,key
    ind = knn_out[1][0].tolist()  # get indices
    recs = filtered_results.iloc[ind]  # recommendations df

    #Filter on popularity after modeling
    # refiltered_results = recs[recs['popularity_binned'] == query_pop]

    #Filter recommendations based on user's preferred duration
    # filtered_duration = refiltered_results[refiltered_results['duration_min'].cumsum() <= query_duration]
    filtered_duration = recs[recs['duration_min'].cumsum() <= query_duration]
    recommended_playlist = filtered_duration.reset_index(drop=True)

    #sorting the playlist by tempo
    recommended_playlist.sort_values(by=['scaled_tempo'])
    split_threshold = round(len(recommended_playlist) / 2)
    asc_playlist = recommended_playlist.iloc[0:split_threshold].sort_values(
        by=['scaled_tempo'], ascending=True)
    desc_playlist = recommended_playlist.iloc[split_threshold:].sort_values(
        by=['scaled_tempo'], ascending=False)
    frames = [asc_playlist, desc_playlist]
    sorted_playlist = pd.concat(frames)
    sorted_playlist = sorted_playlist.reset_index(drop=True)

    recommended_tracks = sorted_playlist[['track_name', 'track_id', 'artists']]

    # get playlist name from user and create empty playlist
    playlist_name = input("\nWhat's the playlist name? ")
    playlist_name = str(playlist_name)
    playlist = spotify_client.create_playlist(playlist_name)
    playlist_id = playlist.playlist_id

    # populate playlist with recommended tracks
    tracks_id = sorted_playlist['track_id'].tolist()
    #sp.playlist_add_items(playlist_id, tracks_id, position=None)
    track_uris = [create_spotify_uri(track) for track in tracks_id]
    response = requests.post(
        url=f"https://api.spotify.com/v1/playlists/{playlist_id}/tracks",
        data=json.dumps(track_uris),
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {authorization_token}"
        })
    response = response.json()
    print('Your playlist was successfully added to your spotify account')
def _find_accuracy_num_homes(appliance, num_homes, start_seed, end_seed, feature="Monthly"):
    if appliance=="hvac":
        start, stop=5, 11
    else:
        start, stop=1, 13
    out = {}
    out_overall={}
    appliance_df = test_df.ix[test_df[['%s_%d' %(appliance,month) for month in range(start,stop)]].dropna().index]


    for random_seed in range(start_seed, end_seed):
        out_overall[random_seed] = {}
        rs = ShuffleSplit(len(appliance_df), n_iter=1,
                          train_size=num_homes,
                          test_size=len(appliance_df)-num_homes,
                          random_state=random_seed)

        for train, test in rs:
            train_homes = appliance_df.index.values[train]
            test_homes = appliance_df.index.values[test]
            train_homes_df = appliance_df.ix[train_homes]
            test_homes_df = appliance_df.ix[test_homes]

            # Now, we need to do cross validation on train homes
            l = LeaveOneOut(len(train_homes))
            for cv_train, cv_test in l:
                cv_train_home =appliance_df.ix[train_homes[cv_train]]
                cv_test_home = appliance_df.ix[train_homes[cv_test]]
                test_home_name = cv_test_home.index.values[0]
                Y = cv_train_home[['%s_%d' %(appliance, i) for i in range(start, stop)]].sum(axis=1).values
                forest = ExtraTreesRegressor(n_estimators=250,
                                      random_state=0)
                forest.fit(cv_train_home[feature_map[feature]], Y)
                importances = forest.feature_importances_
                indices = np.argsort(importances)[::-1]

                # Now varying K and top-N features
                out[test_home_name] ={}
                for K in range(K_min, K_max):
                    out[test_home_name][K]={}
                    for top_n in range(F_min,F_max):
                        out[test_home_name][K][top_n]=[]
                        top_n_features = cv_train_home[feature_map[feature]].columns[indices][:top_n]

                        # Now fitting KNN on this
                        for month in range(start, stop):
                            clf = KNeighborsRegressor(n_neighbors=K)
                            clf.fit(cv_train_home[top_n_features], cv_train_home['%s_%d' %(appliance, month)])
                            dist, ind = clf.kneighbors(cv_test_home[top_n_features])
                            nghbrs = cv_train_home.index.values[ind].flatten()
                            proportion = cv_train_home.ix[nghbrs]['%s_%d' %(appliance, month)].div(cv_train_home.ix[nghbrs]['%s_%d' %("aggregate", month)])
                            mean_prop = proportion.mean()
                            out[test_home_name][K][top_n].append(cv_test_home['%s_%d' %("aggregate", month)]*mean_prop)

            accur = {}

            for K in range(K_min, K_max):
                accur[K] = {}
                for top_n in range(F_min, F_max):
                    temp = {}
                    for h in out.keys():
                        pred = pd.DataFrame(out[h][K][top_n]).T
                        #all_but_h = [x for x in out.keys() if x!=h]
                        pred.index = [h]
                        pred.columns = [['%s_%d' %(appliance, i) for i in range(start, stop)]]
                        gt = appliance_df.ix[h][['%s_%d' %(appliance, i) for i in range(start, stop)]]
                        error = (pred-gt).abs().div(gt).mul(100)
                        mean_error = error.mean().mean()
                        a = 100-mean_error
                        if a<0:
                            a=0
                        temp[h]=a
                    ac = pd.Series(temp).mean()

                    accur[K][top_n] = ac

            accur_df = pd.DataFrame(accur)
            accur_max = accur_df.max().max()
            max_ac_df = accur_df[accur_df==accur_max]
            F_best = cv_train_home[feature_map[feature]].columns[indices][:max_ac_df.mean(axis=1).dropna().index.values[0]].tolist()
            K_best = max_ac_df.mean().dropna().index.values[0]

        # Now predicting for test home


        pred_test = {}
        gt_test = {}
        for month in range(start, stop):
            clf = KNeighborsRegressor(n_neighbors=K_best)
            clf.fit(train_homes_df[F_best], train_homes_df['%s_%d' %(appliance, month)])
            pred_test[month] = clf.predict(test_homes_df[F_best])
            gt_test[month] = test_homes_df['%s_%d' %(appliance, month)]


        #json.dump({'f':F_best, 'k':K_best,'accuracy':accur_max},open("../sensitivity-new/%s_%s_%d.json" %(appliance,feature, home),"w") )

        pred_df = pd.DataFrame(pred_test)
        pred_df.index = test_homes_df.index
        gt_df = pd.DataFrame(gt_test)
        error = (gt_df-pred_df).abs().div(gt_df).mul(100)
        accuracy_test = 100-error
        accuracy_test[accuracy_test<0]=0
        out_overall[random_seed]=accuracy_test.mean().mean()


    return pd.Series(out_overall)
Example #28
class wolse:
    def __init__(self, p1, p2, p3):

        df = pd.read_csv('realdata.csv')
        #print(len(df.values))
        #df=df[df['계약년도']==2019]

        df['구동'] = df['자치구명'] + df['법정동명']
        '''
        temp=df['구동']
        temp=temp.unique()
        temp=pd.DataFrame(temp)
        temp.to_excel('구동.xlsx')
        '''

        rent1 = df[df['전월세구분'] == '준월세']

        self.origin = rent1

        a = rent1[['임대면적', '건축년도']]
        a = rent1[['임대면적']]

        #tempindex=a.index
        #a=scale(a)
        #a=pd.DataFrame(a,columns=['임대면적'],index=tempindex)

        b = pd.get_dummies(rent1['구동'])
        self.simple_rent1 = a.join(b)
        c = pd.get_dummies(rent1['임대건물명'])
        self.simple_rent1 = self.simple_rent1.join(c)

        floor = []
        for f in rent1['층'].values:

            if f == -1:
                floor.append(-1)
            else:
                floor.append(0)

        self.simple_rent1['floor'] = floor

        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            self.simple_rent1, rent1['보증금'], random_state=42)
        self.X_train2, self.X_test2, self.Y_train2, self.Y_test2 = train_test_split(
            self.simple_rent1, rent1['임대료'], random_state=42)

        self.knn = KNeighborsRegressor(n_neighbors=3)
        self.knn2 = KNeighborsRegressor(n_neighbors=3)
        self.lr = LinearRegression()

        self.tlist = []

        self.train()

        self.getinput2(p1, p2, p3)
        self.predict()
        '''
        while True:
            
            print("input >> ")
            tstr=input()
            if tstr=="-1":
                break
            self.getinput(tstr)
            self.predict()
        '''

    def train(self):

        self.knn.fit(self.X_train, self.Y_train)
        self.knn2.fit(self.X_train2, self.Y_train2)

        self.lr.fit(self.X_train, self.Y_train)

    def getinput(self, tstr):
        self.tlist = []
        zero = np.zeros(len(self.simple_rent1.columns))

        arr = tstr.split(" ")
        t = pd.DataFrame([zero], columns=self.simple_rent1.columns)
        #t['임대면적']=float(arr[0])/3.305785
        t['임대면적'] = float(arr[0])

        t[arr[1]] = 1.0

        if arr[2] == '아파트':
            t['아파트'] = 1.0
        elif arr[2] == '오피스텔':
            t['오피스텔'] = 1.0
        else:
            t['다세대/연립'] = 1.0
        #t['건축년도']=2017
        #t['보증금']=6

        self.tlist.append(t)

    def getinput2(self, a, b, c):
        #a: neighborhood
        #b: floor area
        #c: building type

        self.tlist = []
        zero = np.zeros(len(self.simple_rent1.columns))

        #arr=tstr.split(" ")
        t = pd.DataFrame([zero], columns=self.simple_rent1.columns)
        t['임대면적'] = float(b) * 3.305785
        #t['임대면적']=float(b)

        t[a] = 1.0

        if c == '아파트':
            t['아파트'] = 1.0
        elif c == '오피스텔':
            t['오피스텔'] = 1.0
        else:
            t['다세대/연립'] = 1.0

        self.tlist.append(t)

    def predict(self):

        for i in self.tlist:

            ind = self.knn.kneighbors(i, n_neighbors=3, return_distance=False)
            print(ind[0])
            ind2 = self.knn2.kneighbors(i,
                                        n_neighbors=3,
                                        return_distance=False)
            print(ind2[0])

            indices = list(ind[0])
            indices2 = list(ind2[0])

            y_tr = pd.DataFrame(self.Y_train)

            zlist = list(map(lambda i: y_tr.iloc[i, :], indices))

            z = pd.DataFrame(zlist[1])

            rz1 = str(int(z.loc['보증금', :]))

            y_tr2 = pd.DataFrame(self.Y_train2)

            z2list = list(map(lambda i: y_tr2.iloc[i, :], indices2))

            z2 = pd.DataFrame(z2list[1])

            rz2 = str(int(z2.loc['임대료', :]))

            r1 = rz1 + "/" + rz2

            z = pd.DataFrame(zlist[2])
            z2 = pd.DataFrame(z2list[2])

            rz1 = str(int(z.loc['보증금', :]))
            rz2 = str(int(z2.loc['임대료', :]))

            r2 = rz1 + "/" + rz2

            z = pd.DataFrame(zlist[0])
            z2 = pd.DataFrame(z2list[0])

            rz1 = str(int(z.loc['보증금', :]))
            rz2 = str(int(z2.loc['임대료', :]))

            r3 = rz1 + "/" + rz2

            self.resultstr = r1 + '\n' + r2 + '\n' + r3
            return str(self.resultstr)
def _find_accuracy(home, appliance, feature="Monthly"):
    np.random.seed(42)
    appliance_df = df.ix[all_homes[appliance]]
    if appliance=="hvac":
        start, stop=5, 11
    else:
        start, stop=1, 13

    test_homes = [home]
    train_homes = appliance_df[~appliance_df.index.isin([home])].index
    all_home_appliance = deepcopy(all_homes)
    all_home_appliance[appliance] = train_homes

    # Cross validation on inner loop to find best feature, K
    train_size = len(train_homes)
    l = LeaveOneOut(train_size)
    out = OrderedDict()
    for cv_train, cv_test in l:

        cv_train_home=appliance_df.ix[train_homes[cv_train]]
        cv_test_home = appliance_df.ix[train_homes[cv_test]]
        test_home_name = cv_test_home.index.values[0]
        #print cv_test_home
        out[test_home_name]={}


        # Summing up energy across start to stop to get Y to learn optimum feature on
        Y = cv_train_home[['%s_%d' %(appliance, i) for i in range(start, stop)]].sum(axis=1).values
        forest = ExtraTreesRegressor(n_estimators=250,
                              random_state=0)
        forest.fit(cv_train_home[feature_map[feature]], Y)
        importances = forest.feature_importances_
        indices = np.argsort(importances)[::-1]

        # Now varying K and top-N features

        for K in range(K_min, K_max):
            out[test_home_name][K]={}
            for top_n in range(F_min,F_max):
                out[test_home_name][K][top_n]=[]
                top_n_features = cv_train_home[feature_map[feature]].columns[indices][:top_n]

                # Now fitting KNN on this
                for month in range(start, stop):
                    clf = KNeighborsRegressor(n_neighbors=K)
                    clf.fit(cv_train_home[top_n_features], cv_train_home['%s_%d' %(appliance, month)])
                    out[test_home_name][K][top_n].append(clf.predict(cv_test_home[top_n_features]))

        # Now, finding the (K, top_n) combination that gave us best accuracy on CV test homes
    accur = {}

    for K in range(K_min, K_max):
        accur[K] = {}
        for top_n in range(F_min, F_max):
            temp = {}
            for h in out.keys():
                pred = pd.DataFrame(out[h][K][top_n]).T
                #all_but_h = [x for x in out.keys() if x!=h]
                pred.index = [h]
                pred.columns = [['%s_%d' %(appliance, i) for i in range(start, stop)]]
                gt = appliance_df.ix[h][['%s_%d' %(appliance, i) for i in range(start, stop)]]
                error = (pred-gt).abs().div(gt).mul(100)
                mean_error = error.mean().mean()
                a = 100-mean_error
                if a<0:
                    a=0
                temp[h]=a
            ac = pd.Series(temp).mean()

            accur[K][top_n] = ac

    accur_df = pd.DataFrame(accur)
    accur_max = accur_df.max().max()
    max_ac_df = accur_df[accur_df==accur_max]
    F_best = cv_train_home[feature_map[feature]].columns[indices][:max_ac_df.mean(axis=1).dropna().index.values[0]].tolist()
    K_best = max_ac_df.mean().dropna().index.values[0]

    # Now predicting for test home
    train_overall = appliance_df.ix[appliance_df[~appliance_df.index.isin([home])].index]
    test_overall = appliance_df[appliance_df.index.isin([home])]
    pred_test = {}
    gt_test = {}
    for month in range(start, stop):
        clf = KNeighborsRegressor(n_neighbors=K_best)
        clf.fit(train_overall[F_best], train_overall['%s_%d' %(appliance, month)])
        pred_test[month] = clf.predict(test_overall[F_best])
        neighbours = train_overall.index[clf.kneighbors(test_overall[F_best])[1]]
        print(month, neighbours)
        gt_test[month] = test_overall['%s_%d' %(appliance, month)]


    json.dump({'f':F_best, 'k':K_best,'accuracy':accur_max},open(os.path.expanduser("~/main-out-new-larger/%s_%s_%d.json" %(appliance,feature, home)),"w") )
    print(F_best, K_best, accur_max)
    pred_df = pd.DataFrame(pred_test)
    pred_df.index = [home]
    #gt_df = pd.DataFrame(gt_test)
    #print pred_df, gt_df
    #error = (gt_df-pred_df).abs().div(gt_df).mul(100)
    #print error
    #accuracy_test = 100-error
    #accuracy_test[accuracy_test<0]=0

    #return accuracy_test.squeeze()
    return pred_df
class kradius(BaseEstimator):
    def __init__(self,
                 metric="euclidean",
                 weights="uniform",
                 n_neighbors=3,
                 radius=1.0,
                 n_jobs=1):
        self.radius = radius
        self.metric = metric
        self.weights = weights
        self.n_neighbors = n_neighbors
        self.n_jobs = n_jobs

    def fit(self, X, y):
        self.knn_model = KNeighborsRegressor(
            n_neighbors=self.n_neighbors,
            n_jobs=self.n_jobs,
            weights=self.weights,
            metric=self.metric,
        )
        self.knn_model.fit(X, y)
        return self

    def predict(self, X):

        # no need to do distance filtering if we take only 1 neighbor
        if self.n_neighbors > 1:

            dists, inds = self.knn_model.kneighbors(
                X, n_neighbors=self.n_neighbors)
            # dropping value where distance too big
            # we always keep the closest point (first value)

            inds = [
                np.array([
                    index for distance, index in zip(dist, ind)
                    if distance <= self.radius or distance == dist[0]
                ]) for dist, ind in zip(dists, inds)
            ]
            dists = [
                np.array([
                    distance for distance in dist
                    if distance <= self.radius or distance == dist[0]
                ]) for dist in dists
            ]

            weights = _get_weights(dists, self.weights)

            _y = self.knn_model._y
            y_was_1d = _y.ndim == 1
            if y_was_1d:
                _y = _y.reshape((-1, 1))

            if weights is None:
                y_pred = np.array(
                    [np.mean(_y[ind, :], axis=0) for ind in inds])

            else:
                y_pred = np.empty((X.shape[0], _y.shape[1]), dtype=np.float64)

                for k in range(X.shape[0]):
                    for j in range(_y.shape[1]):
                        y_pred[k, j] = np.sum(_y[inds[k], j] * weights[k] /
                                              sum(weights[k]))

            if y_was_1d:
                y_pred = y_pred.ravel()

            return y_pred

        else:
            return self.knn_model.predict(X)
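A usage sketch for the class above (the toy data is mine; the class itself assumes numpy, BaseEstimator, KNeighborsRegressor and sklearn's private _get_weights helper are in scope, e.g. from sklearn.neighbors._base import _get_weights, whose location can vary between sklearn versions):

import numpy as np
from sklearn.neighbors._base import _get_weights  # private helper used inside kradius.predict

X = np.random.rand(200, 2)
y = X[:, 0] + X[:, 1]

model = kradius(n_neighbors=5, radius=0.1).fit(X, y)
print(model.predict(X[:3]))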
Example #31
    Xtrn_KNN = std_scale.transform(Xtrn_KNN)
    Xtst_KNN = std_scale.transform(Xtst_KNN)

    print('Running KNN')
    # the best HP for this case is 20
    reg = KNeighborsRegressor(n_neighbors=20,
                              weights='distance',
                              algorithm='brute',
                              leaf_size=30,
                              p=2,
                              metric='minkowski',
                              metric_params=None,
                              n_jobs=None)
    reg.fit(Xtrn_KNN, ytrn_KNN)
    ypreds_knn = reg.predict(Xtst_KNN)
    top_nns_dist, top_nns_inds = reg.kneighbors(Xtst_KNN,
                                                n_neighbors=Xtrn_KNN.shape[0])

    print('Saving files for KNN')
    np.save(save_dir + 'KNN_preds.npy', ypreds_knn)
    np.save(save_dir + 'KNN_dists.npy', top_nns_dist)
    np.save(save_dir + 'KNN_inds.npy', top_nns_inds)
    print()

############# Get results for Lasso ##################################
# transpose and split the data for Lasso
print('Slicing and Standardizing Data for Lasso')
data_Lasso = np.transpose(data)
Xdata_Lasso = data_Lasso[:, beta_trn_inds]
ydata_Lasso = data_Lasso[:, beta_tst_inds]
Xtrn_Lasso = Xdata_Lasso[Xgenes, :]
Xtst_Lasso = Xdata_Lasso[ygenes, :]
def DetectCurrentFace(hebi, Group):

    import scipy.io as scio
    import sys
    import numpy as np

    ### This was used for testing purposes only

    # import hebi  # for the Hebi motors
    # from time import sleep
    #
    # # Need to look into XML formatting for Hebi Gains
    # # sio.loadmat('defaultGains.mat')
    #
    # lookup = hebi.Lookup()  # Get table of all Hebi motors
    # sleep(2)  # gives the Lookup process time to discover modules
    #
    # # Displays the Hebi modules found on the network
    # print('Modules found on the network:')
    #
    # for entry in lookup.entrylist:
    #     print('{0} | {1}'.format(entry.family, entry.name))
    #
    # # print('\n')
    #
    # var = raw_input('Were any modules found? [y/N]: \n')
    # if var == 'y':
    #     print('\nYay!\n')
    # elif var == 'Y':
    #     print('\nYay!\n')
    # else:
    #     print('\nNONE FOUND!\n')
    #     sys.exit()
    #
    # Group = lookup.get_group_from_family('*')
    # infoTable = Group.request_info()

    ### This was used for testing purposes only

    trainingData = scio.loadmat(
        'IMUTrainingRutgers.mat')  # training data gathered from MATLAB

    labels = float(trainingData['labs'][0][0][0])

    for i in range(1, len(trainingData['labs'])):
        labels = np.append(labels, float(trainingData['labs'][i][0][0]))

    # Create KNN model
    from sklearn.neighbors import KNeighborsRegressor
    knn = KNeighborsRegressor(n_neighbors=10)
    # Fit the model
    knn.fit(trainingData['trainingData'], labels)

    fbk = hebi.GroupFeedback(Group.size)
    Group.feedback_frequency = 200.0
    fbk = Group.get_next_feedback(reuse_fbk=fbk)

    # if(fbk.size != trainingData['nbMotors'][0][0]):
    #     print('Something is wrong with the number of connected motors!')
    #     return 0

    accel = fbk.accelerometer.reshape(1, -1)
    [d, n] = knn.kneighbors(
        accel, 10)  # give the lines which most closely match in variable "n"
    predicted_lines = np.asanyarray(
        labels[n[0]],
        dtype=int)  # obtains the label values which were predicted in "n"
    counts = np.bincount(
        predicted_lines)  # counts each instance of face numbers
    face = np.argmax(
        counts
    )  # finds the face with the highest number of instances [THIS IS OUR PREDICTION]

    return face
     pdate = dateutil.parser.parse(row[0])
     ptime = pdate.hour*3600 + pdate.minute*60 + pdate.second
     tdistance = get_distance(row[3], row[2], row[5], row[4])
     plat = row[3]
     plon = row[2]
     dlat = row[5]
     dlon = row[4]
     ttime = row[1]
     test_data.append([ttime, ptime, tdistance, plat, plon, dlat, dlon])
 test_data = np.asmatrix(test_data)
 test_data[:,[1,2,3,4,5,6]] = (test_data[:,[1,2,3,4,5,6]] - mean) / std
 
 optimal_k = 0
 optimal = float('inf')
 for k in range(5, 21):
     neigh = KNeighborsRegressor(n_neighbors=k)
     neigh.fit(train_data[:,[1,2,3,4,5,6]], train_data[:,0])
     dist, ind = neigh.kneighbors(test_data[:, [1,2,3,4,5,6]])
     result = []
     for row in range(ind.shape[0]):
         e = []
         for i in range(ind.shape[1]):
             e.append(train_data[ind[row,i], 0])
         result.append(e)
     result = np.asmatrix(result)
     median = np.median(result, axis=1)
     mae = mean_absolute_error(median, test_data[:,0])
     print(mae)
     if mae < optimal:
         optimal = mae
         optimal_k = k
 print(optimal_k)
# Use a nearest neighbors algorithm to augment the data to expand the train set
if (data_augmentation):
    print("Performing data augmentation")
    text_tfidf = text_tfidf.toarray()
    augmented_train_tfidf = list(text_tfidf.copy())
    augmented_train_labels = list(train_val_labels.copy())
    knn = KNeighborsRegressor(n_neighbors=4, weights='distance').fit(text_tfidf, train_val_labels)
    shuffled_indexes = list(range(len(augmented_train_tfidf)))
    np.random.shuffle(shuffled_indexes)

    # Augment 20% of the train data and add it to the original set
    for index in shuffled_indexes[0:int(len(augmented_train_tfidf) / 5)]:
        datapoint_text = np.reshape(augmented_train_tfidf[index], (1, -1))
        datapoint_label = augmented_train_labels[index]
        neighbor = knn.kneighbors(datapoint_text, return_distance=False)
        random_neighbor = np.random.randint(1, 4)
        difference = text_tfidf[neighbor[0][random_neighbor]] - datapoint_text
        gap = np.random.rand(1)[0]
        new_point = datapoint_text + difference * gap
        augmented_train_tfidf = np.append(augmented_train_tfidf,
                                          new_point,
                                          axis=0)
        augmented_train_labels.append(datapoint_label)

    text_tfidf = sparse.csr_matrix(augmented_train_tfidf)
    train_val_labels = augmented_train_labels

# Initialize Logistic Regression classifier and fit it on the tf-idf training data
print("Training the model")
classifier = SGDClassifier(loss='log',
Example #35
def album_recommender(album_dataset, recommender_output_path):

    recommender_dataset = album_dataset

    non_numerical_cols = [
        'artists', 'album_cover', 'album_name', 'album_id', 'first_track_id',
        'album'
    ]

    X = recommender_dataset.drop(columns=non_numerical_cols).copy()
    y_tempo = X['tempo']

    #Instantiate MinMaxScaler() and fit/transform each column
    minmax = MinMaxScaler()
    X['danceability'] = minmax.fit(X[["danceability"
                                      ]]).transform(X[["danceability"]])
    X['energy'] = minmax.fit(X[["energy"]]).transform(X[["energy"]])
    X['speechiness'] = minmax.fit(X[["speechiness"
                                     ]]).transform(X[["speechiness"]])
    X['acousticness'] = minmax.fit(X[["acousticness"
                                      ]]).transform(X[["acousticness"]])
    X['instrumentalness'] = minmax.fit(X[["instrumentalness"
                                          ]]).transform(X[["instrumentalness"
                                                           ]])
    X['liveness'] = minmax.fit(X[["liveness"]]).transform(X[["liveness"]])
    X['valence'] = minmax.fit(X[["valence"]]).transform(X[["valence"]])
    X['tempo'] = minmax.fit(X[["tempo"]]).transform(X[["tempo"]])
    X['loudness'] = minmax.fit(X[["loudness"]]).transform(X[["loudness"]])
    X['key'] = minmax.fit(X[["key"]]).transform(X[["key"]])

    # Instantiate and train the audio feature model
    knn_tempo = KNeighborsRegressor().fit(X, y_tempo)

    # Use the model's kneighbors method to find the 5 nearest albums to each album; returns a (distances, indexes) tuple
    knn_recommended_tempo = knn_tempo.kneighbors(X, n_neighbors=5)

    # Grab the indexes of the recommended albums, dropping column 0 (each album's nearest neighbour is itself)
    suggested_album_indexes_tempo = knn_recommended_tempo[1][:, 1:]

    # Turn the KNN results into values in a dictionary with keys matching indexes
    suggested_album_dict = dict(enumerate(suggested_album_indexes_tempo))

    # Turn the KNN dictionary results into a dataframe
    suggested_album_index_df = pd.DataFrame(suggested_album_dict.values(),
                                            index=suggested_album_dict.keys())

    # Concatenate to the main dataframe & rename columns
    recommender_dataset_with_X = pd.concat([X, suggested_album_index_df],
                                           axis=1)
    recommender_dataset_with_X.rename(columns={
        0: 'rec_album_1',
        1: 'rec_album_2',
        2: 'rec_album_3',
        3: 'rec_album_4'
    },
                                      inplace=True)

    # Turn the album metadata columns into index/value lookup dictionaries
    album_image_dictionary = dict(recommender_dataset["album_cover"])
    album_id_dictionary = dict(recommender_dataset['album_id'])
    album_artist_dictionary = dict(recommender_dataset['artists'])
    album_name_dictionary = dict(recommender_dataset['album_name'])

    # Create new columns for the suggested album cover image (urls)
    recommender_dataset_with_X[
        "rec_album_image_1"] = recommender_dataset_with_X.rec_album_1.map(
            album_image_dictionary)
    recommender_dataset_with_X[
        "rec_album_image_2"] = recommender_dataset_with_X.rec_album_2.map(
            album_image_dictionary)
    recommender_dataset_with_X[
        "rec_album_image_3"] = recommender_dataset_with_X.rec_album_3.map(
            album_image_dictionary)
    recommender_dataset_with_X[
        "rec_album_image_4"] = recommender_dataset_with_X.rec_album_4.map(
            album_image_dictionary)

    # Create new columns for the suggested album artist
    recommender_dataset_with_X[
        "rec_album_artist_1"] = recommender_dataset_with_X.rec_album_1.map(
            album_artist_dictionary)
    recommender_dataset_with_X[
        "rec_album_artist_2"] = recommender_dataset_with_X.rec_album_2.map(
            album_artist_dictionary)
    recommender_dataset_with_X[
        "rec_album_artist_3"] = recommender_dataset_with_X.rec_album_3.map(
            album_artist_dictionary)
    recommender_dataset_with_X[
        "rec_album_artist_4"] = recommender_dataset_with_X.rec_album_4.map(
            album_artist_dictionary)

    # Create new columns for the suggested album name
    recommender_dataset_with_X[
        "rec_album_name_1"] = recommender_dataset_with_X.rec_album_1.map(
            album_name_dictionary)
    recommender_dataset_with_X[
        "rec_album_name_2"] = recommender_dataset_with_X.rec_album_2.map(
            album_name_dictionary)
    recommender_dataset_with_X[
        "rec_album_name_3"] = recommender_dataset_with_X.rec_album_3.map(
            album_name_dictionary)
    recommender_dataset_with_X[
        "rec_album_name_4"] = recommender_dataset_with_X.rec_album_4.map(
            album_name_dictionary)

    # Assign the respective album_id in each column
    recommender_dataset_with_X.rec_album_1 = recommender_dataset_with_X.rec_album_1.map(
        album_id_dictionary)
    recommender_dataset_with_X.rec_album_2 = recommender_dataset_with_X.rec_album_2.map(
        album_id_dictionary)
    recommender_dataset_with_X.rec_album_3 = recommender_dataset_with_X.rec_album_3.map(
        album_id_dictionary)
    recommender_dataset_with_X.rec_album_4 = recommender_dataset_with_X.rec_album_4.map(
        album_id_dictionary)
    # Concatenate the suggested album dataframe to the main dataset
    # recommender_dataset_with_X = pd.concat([recommender_dataset_with_X, suggested_album_id_df], axis=1)
    # recommender_dataset_with_X.rename(columns={0:'suggested_album_id'}, inplace=True)

    # Concatenate the original dataset with the suggested albums dataset
    recommender_dataset = pd.concat(
        [recommender_dataset, recommender_dataset_with_X], axis=1)
    # Write result as csv to a file path

    recommender_dataset.to_csv(recommender_output_path, index=False)
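# A minimal usage sketch; the DataFrame variable and output path below are
# hypothetical, not part of the original example.
albums_df = pd.read_csv("albums_with_audio_features.csv")
album_recommender(albums_df, "album_recommendations.csv")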
Example #36
0

## predicting

file_path = os.path.join(data_path, "python_training.csv")
df = pd.read_csv(file_path, sep=';')
# filtering outliers (price per sqm > 12,000)
df = df.loc[df['pricesqm']<12000]

file_path = os.path.join(data_path, "python_to_predict.csv")
X = pd.read_csv(file_path, sep=';')
X_ = processing(X)
X_ = feature_processing.transform(X_)

estimate = int(model.predict(X_))
kn = model.kneighbors(X_, n_neighbors=10, return_distance=True)

ids = [df['id'].iloc[x] for x in kn[1][0]]
distances = [x for x in kn[0][0]]
estimates = [float(df.iloc[int(x)]['pricesqm']) for x in kn[1][0]]

ids.insert(0,int(X['id']))
distances.insert(0,np.mean(distances)) # average distance from 10 nearest --> confidence index
estimates.insert(0, estimate)

result = {'id': ids,
          'estimate': estimates,
          'distance': distances
         }
result = pd.DataFrame(data=result)
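# The comment above treats the mean distance to the 10 nearest listings as a
# confidence index: the closer the neighbours, the more reliable the estimate.
# A minimal sketch of one possible way to turn it into a bounded score (an
# assumption, not part of the original pipeline):
confidence = 1.0 / (1.0 + result.loc[0, 'distance'])  # nearer neighbours -> score closer to 1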
def _find_accuracy_num_homes(appliance, num_homes, start_seed, end_seed, feature="Monthly"):
    if appliance=="hvac":
        start, stop=5, 11
    else:
        start, stop=1, 13
    out = {}
    out_overall={}
    # We need to find homes that have all the features
    appliance_df = df.ix[df[['%s_%d' %(appliance,month) for month in range(start,stop)]].dropna().index]
    all_homes = appliance_df.index
    kf = KFold(len(all_homes), n_folds=5)
    for cv_loop_index, (train_index, test_index) in enumerate(kf):
        out_overall[cv_loop_index] = {}
        train_df = appliance_df.ix[all_homes[train_index]]
        test_df = appliance_df.ix[all_homes[test_index]]
        #print train_df.index
        print "TRAINING>>>"

        #Now, for each random seed, we'll pick up `num_homes` homes from the train set
        # Do CV on that to pick up best features and then predict for the test homes
        error_df_list = {}
        #1. Multiple times we will choose `num_homes` from the train set
        for random_seed in range(start_seed, end_seed):
            print "Random seed:", random_seed



            #out_overall[random_seed] = {}
            train_subset_homes_idx = np.random.choice(len(train_df), num_homes, replace=False)
            train_subset_homes = train_df.ix[train_df.index[train_subset_homes_idx]]
            #print train_subset_homes
            #2. Now, on this small subset of homes, we will do a round of CV to learn optimum
            # features

            l = LeaveOneOut(len(train_subset_homes_idx))
            for cv_train, cv_test in l:
                cv_train_home =appliance_df.ix[train_subset_homes.index[cv_train]]
                cv_test_home = appliance_df.ix[train_subset_homes.index[cv_test]]
                test_home_name = cv_test_home.index.values[0]
                Y = cv_train_home[['%s_%d' %(appliance, i) for i in range(start, stop)]].sum(axis=1).values
                forest = ExtraTreesRegressor(n_estimators=250,
                                      random_state=0)
                forest.fit(cv_train_home[feature_map[feature]], Y)
                importances = forest.feature_importances_
                indices = np.argsort(importances)[::-1]

                # Now varying K and top-N features
                out[test_home_name] ={}
                for K in range(K_min, K_max):
                    out[test_home_name][K]={}
                    for top_n in range(F_min,F_max):
                        out[test_home_name][K][top_n]=[]
                        top_n_features = cv_train_home[feature_map[feature]].columns[indices][:top_n]

                        # Now fitting KNN on this
                        for month in range(start, stop):
                            clf = KNeighborsRegressor(n_neighbors=K)
                            clf.fit(cv_train_home[top_n_features], cv_train_home['%s_%d' %(appliance, month)])
                            dist, ind = clf.kneighbors(cv_test_home[top_n_features])
                            nghbrs = cv_train_home.index.values[ind].flatten()
                            proportion = cv_train_home.ix[nghbrs]['%s_%d' %(appliance, month)].div(df_unnormalised.ix[nghbrs]['%s_%d' %("aggregate", month)])
                            mean_prop = proportion.mean()
                            out[test_home_name][K][top_n].append(df_unnormalised.ix[cv_test_home.index]['%s_%d' %("aggregate", month)]*mean_prop)

            accur = {}

            # We want to find the F, K combination that minimised the median (over homes)
            # and mean over months error
            for K in range(K_min, K_max):
                accur[K] = {}
                for top_n in range(F_min, F_max):
                    accur[K][top_n]={}
                    temp = {}
                    for h in out.iterkeys():
                        pred = pd.DataFrame(out[h][K][top_n]).T
                        pred.index = [h]
                        pred.columns = [['%s_%d' %(appliance, i) for i in range(start, stop)]]
                        gt = appliance_df.ix[h][['%s_%d' %(appliance, i) for i in range(start, stop)]]
                        error = (pred-gt).abs().div(gt).mul(100).squeeze()
                        accur[K][top_n][h]=error

                    accur[K][top_n] = pd.DataFrame(accur[K][top_n]).T.median().mean()

            accur_df = pd.DataFrame(accur)
            accur_min = accur_df.min().min()
            min_ac_df = accur_df[accur_df==accur_min]
            F_best = cv_train_home[feature_map[feature]].columns[indices][:min_ac_df.mean(axis=1).dropna().index.values[0]].tolist()
            K_best = min_ac_df.mean().dropna().index.values[0]

            # Now predicting for test home


            pred_test = {}
            gt_test = {}
            for month in range(start, stop):
                clf = KNeighborsRegressor(n_neighbors=K_best)
                clf.fit(train_subset_homes[F_best], train_subset_homes['%s_%d' %(appliance, month)])
                dist, ind = clf.kneighbors(test_df[F_best])
                nghbrs = train_subset_homes[F_best].index.values[ind].flatten()[:K]
                nr = train_subset_homes.ix[nghbrs]['%s_%d' %(appliance, month)]
                dr = df_unnormalised.ix[nghbrs]['%s_%d' %("aggregate", month)]
                nr.name = dr.name
                proportion =nr.div(dr)
                mean_prop = proportion.mean()
                pred_test[month] =df_unnormalised.ix[test_df.index]['%s_%d' %("aggregate", month)]*mean_prop
                gt_test[month] = test_df['%s_%d' %(appliance, month)]


            #json.dump({'f':F_best, 'k':K_best,'accuracy':accur_max},open("../sensitivity-new/%s_%s_%d.json" %(appliance,feature, home),"w") )

            pred_df = pd.DataFrame(pred_test)
            pred_df.index = test_df.index
            gt_df = pd.DataFrame(gt_test)
            error = (gt_df-pred_df).abs().div(gt_df).mul(100)
            out_overall[cv_loop_index][random_seed] = error

    errors = {}
    for random_seed in range(start_seed, end_seed):
        temp_list = []
        for cv_loop_index in range(len(kf)):
            temp_list.append(out_overall[cv_loop_index][random_seed])
        errors[random_seed] = pd.concat(temp_list)


    return errors
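# A minimal usage sketch with hypothetical arguments; it assumes the globals
# used above (df, df_unnormalised, feature_map, K_min, K_max, F_min, F_max)
# are already defined.
errors = _find_accuracy_num_homes("hvac", num_homes=20, start_seed=0, end_seed=5)
# Per random seed: median error over homes, then mean over months
summary = {seed: err.median().mean() for seed, err in errors.items()}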
    def fit(self, L, U, maxIt=1000, poolSize=100, wSize=10, **kwargs):
        
        # Initialize Training Sets
        L = Data(np.copy(L.X), np.copy(L.y))
        
        # Select pool of unlabeled data
        UpoolIndexs = np.random.choice(len(U), poolSize, replace=False)
        Upool = [U[i] for i in UpoolIndexs]
        
        # Create the regressor
        model = self.learner(**kwargs)
        
        # train regressors on labeled data
        model.fit(L.X, L.y)
        
        # repeat for max_it rounds
        for i in range(maxIt):
            print i
            # keep list of changes to Ls
            pi = []
                
            UpoolYs = model.predict(Upool)
            # get the neighbors of each unlabeled point, as indexes into the original lists
            kNN = KNeighborsRegressor(n_neighbors=self.k)
            kNN.fit(L.X, L.y)
            
            UpoolNDistances = [sum(ns) for ns in kNN.kneighbors(Upool)[0]]
            W = heapq.nsmallest(wSize, [(k, t) for t, k in enumerate(UpoolNDistances)])
            W = [w[1] for w in W]
            Wpool = [Upool[r] for r in W]
            WNeighbors = kNN.kneighbors(Wpool, return_distance=False)
            RMSEs = []
            newX = []
            newY = []
            for r in range(wSize):

                neighborsIndexs = WNeighbors[r]
                neighbors = [L.X[n] for n in neighborsIndexs]
                
                neighborsYs = model.predict(neighbors)
                avgY = sum(neighborsYs)/float(self.k)
                x = Upool[W[r]]
                newX.append(x)
                newY.append(avgY)
            
            
            for x, y in zip(newX, newY):
                # L augmented with this candidate point and its predicted label
                altL = Union(L, x, y)
                
                # create a model based on this altL
                altModel = self.learner(**kwargs)
                altModel.fit(altL.X, altL.y)
                
                altY = altModel.predict(newX)
                
                rmse = np.sqrt(mean_squared_error(newY, altY))  # root-mean-squared error
                
                RMSEs.append(rmse)
                
            sortedErrors = sorted(RMSEs)
            lowest = sortedErrors[0]
            index = W[RMSEs.index(lowest)]
            bestX = Upool[index]
            bestY = UpoolYs[index]
            
            L = Union(L, bestX, bestY)
            
            uIndex = U.tolist().index(bestX.tolist())
            m, n = U.shape
            U = np.delete(U, (uIndex), axis=0)
            
            
            model.fit(L.X, L.y)
            UpoolIndexs = np.random.choice(len(U), poolSize, replace=False)
            Upool = [U[i] for i in UpoolIndexs]
        
        #print kNNs[0].predict(U)
        print L.X
        print L.y
        self.model = model
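# `Data` and `Union` above are project helpers not shown in this snippet. A
# minimal sketch of what they might look like (an assumption, not the original
# implementation): `Data` bundles a feature matrix with its labels, and
# `Union` returns a new Data with one extra labeled example appended.
class Data(object):
    def __init__(self, X, y):
        self.X = X
        self.y = y

def Union(L, x, y):
    return Data(np.vstack([L.X, x]), np.append(L.y, y))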