Example #1
# Assumed import: in the source project, SKLModel presumably aliases sklearn's KNeighborsRegressor.
from sklearn.neighbors import KNeighborsRegressor as SKLModel

class KNeighborsRegressorImpl():
    def __init__(self,
                 n_neighbors=5,
                 weights='uniform',
                 algorithm='auto',
                 leaf_size=30,
                 p=2,
                 metric='minkowski',
                 metric_params=None,
                 n_jobs=None):
        self._hyperparams = {
            'n_neighbors': n_neighbors,
            'weights': weights,
            'algorithm': algorithm,
            'leaf_size': leaf_size,
            'p': p,
            'metric': metric,
            'metric_params': metric_params,
            'n_jobs': n_jobs
        }

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)
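
A minimal usage sketch for the wrapper above, assuming the SKLModel alias noted in the import comment; the toy data is purely illustrative.

import numpy as np

# Toy regression data (illustrative only).
X = np.array([[0.0], [1.0], [2.0], [3.0], [4.0]])
y = np.array([0.0, 1.1, 1.9, 3.2, 3.9])

# The wrapper is fitted and queried exactly like the underlying sklearn estimator.
reg = KNeighborsRegressorImpl(n_neighbors=2, weights='distance')
reg.fit(X, y)
print(reg.predict(np.array([[1.5], [3.5]])))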
Example #2
def skcv_knn(coordinates, Xdata, Ydata, number_of_folds, dzradii, k_neighbors, visualization):
    print("Starting skcv-knn analysis...")
    # Calculate sorted pairwise distance matrix and indexes
    performanceTable = np.zeros([len(dzradii),2])
    data_distances, data_distance_indexes = distanceMatrix(coordinates)
    folds = random_folds(len(Ydata), number_of_folds)
    for rind, dzradius in enumerate(dzradii):
        print("Analysis ongoing, dead zone radius: " + str(dzradius) + "m / " + str(dzradii[len(dzradii)-1]) + "m")
        # Calculate dead zone folds
        dz_folds = dzfolds(dzradius, folds, data_distances, data_distance_indexes)
        # Initialize performance variables   
        P = np.zeros(Ydata.shape)
        for fold_id, dz_fold in enumerate(dz_folds):
            X_tr = np.delete(Xdata, dz_fold, axis=0)
            Y_tr = np.delete(Ydata, dz_fold, axis=0)
            learner = KNeighborsRegressor(n_neighbors=k_neighbors)
            learner.fit(X_tr, Y_tr)
            preds = learner.predict(Xdata[dz_fold])
            if preds.ndim == 0:
                P[folds[fold_id]] = preds         
            else:
                P[folds[fold_id]] = preds[0:len(folds[fold_id])]
            if visualization: # Check for visualization
                testcoords = coordinates[folds[fold_id],:]
                dzcoords = coordinates[dz_fold, :]
                visualize_skcv(coordinates, testcoords, dzcoords, dzradius)                
        perf = cindex(Ydata, P)
        performanceTable[rind,0] = dzradius
        performanceTable[rind, 1] = perf
        plotRes_skcv(performanceTable, rind, number_of_folds, "K-nn")
    print("Analysis done.")
    return performanceTable
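
A hedged call sketch for skcv_knn with synthetic inputs; distanceMatrix, random_folds, dzfolds, cindex, visualize_skcv and plotRes_skcv are helpers from the surrounding project and must already be importable, and the radii below are arbitrary illustrative values.

import numpy as np

# Synthetic spatial data: 2-D coordinates, two features, one continuous target.
rng = np.random.default_rng(0)
coordinates = rng.uniform(0, 1000, size=(200, 2))   # metres
Xdata = rng.normal(size=(200, 2))
Ydata = Xdata[:, 0] + 0.1 * rng.normal(size=200)

# Scan a few dead-zone radii, in the same units as the coordinates.
performance = skcv_knn(coordinates, Xdata, Ydata,
                       number_of_folds=10, dzradii=[0, 50, 100, 200],
                       k_neighbors=5, visualization=False)
print(performance)   # column 0: radius, column 1: concordance index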
Example #3
def evalOne(parameters):
    
    all_obs = []
    all_pred = []
#     all_obs_train = []
#     all_pred_train = []

    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(location, "location", data, all_features, "target")
        model = KNeighborsRegressor(weights=parameters["weights"],
                                    n_neighbors=parameters["neighbors"],
                                    p=parameters["p"])
        model.fit(trainX, trainY)
#         train_prediction = model.predict(trainX)
        prediction = model.predict(testX)
        all_obs.extend(testY)
        all_pred.extend(prediction)
#         all_obs_train.extend(trainY)
#         all_pred_train.extend(train_prediction)

    return rmseEval(all_obs, all_pred)[1] 
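
evalOne depends on module-level locations, data and all_features plus the project's splitDataForXValidation and rmseEval helpers; under that assumption (and assuming rmseEval returns the RMSE at index 1, as the return statement suggests), a small grid search over its parameter dictionary could look like this sketch.

from itertools import product

# Scan a small grid and keep the parameter set with the lowest RMSE.
best_rmse, best_params = float("inf"), None
for weights, neighbors, p in product(["uniform", "distance"], [3, 5, 11, 23], [1, 2]):
    params = {"weights": weights, "neighbors": neighbors, "p": p}
    rmse = evalOne(params)
    if rmse < best_rmse:
        best_rmse, best_params = rmse, params
print(best_params, best_rmse)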
Example #4
def trainNearestNeighbors(data, columns, targetColumn, parameters):
    
    modelColumns = []
    for column in columns:
        if column != targetColumn:
            modelColumns.append(column)
            
    modelData = []
    
    for i in range(0, len(data[targetColumn])):
        record = []
        for column in modelColumns:
            record.append(data[column][i])

        modelData.append(record)
        
    model = KNeighborsRegressor(weights=parameters["weights"],
                                n_neighbors=parameters["neighbors"],
                                p=parameters["p"])

    model.fit(modelData, data[targetColumn])
    
    return NearestNeighborsModel(model, modelColumns)
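
A usage sketch under the assumption that NearestNeighborsModel (defined elsewhere in the project) simply bundles the fitted estimator with its feature columns; the column-oriented dictionary mirrors the data layout the function iterates over.

# Column-oriented data: one list of values per column, including the target.
data = {
    "temperature": [10.0, 12.0, 15.0, 9.0, 11.0],
    "humidity": [70.0, 65.0, 60.0, 80.0, 75.0],
    "target": [3.1, 3.4, 4.0, 2.8, 3.0],
}
parameters = {"weights": "distance", "neighbors": 2, "p": 2}

wrapped = trainNearestNeighbors(data, list(data.keys()), "target", parameters)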
Example #5
def lposcv_knn(coordinates, Xdata, Ydata, dzradii, k, visualization):
    print("Starting lposcv-knn analysis...")
    # Calculate sorted pairwise distance matrix and indexes
    performanceTable = np.zeros([len(dzradii), 2])
    data_distances, data_distance_indexes = distanceMatrix(coordinates)
    negcount = len(np.where(Ydata == 0)[0])
    folds = lpo_folds(Ydata, negcount)
    for rind, dzradius in enumerate(dzradii):
        print("Analysis ongoing, dead zone radius: " + str(dzradius) + "m / " +
              str(dzradii[len(dzradii) - 1]) + "m")
        # Calculate dead zone folds
        dz_folds = dzfolds(dzradius, folds, data_distances,
                           data_distance_indexes)
        perfs = []
        for dz_fold in dz_folds:
            X_tr = np.delete(Xdata, dz_fold, axis=0)
            Y_tr = np.delete(Ydata, dz_fold, axis=0)
            learner = KNeighborsRegressor(n_neighbors=k)
            learner.fit(X_tr, Y_tr)
            preds = learner.predict(Xdata[dz_fold])
            if preds[0] > preds[1]:
                perfs.append(1.)
            elif preds[0] == preds[1]:
                perfs.append(0.5)
            else:
                perfs.append(0.)
            if visualization:  # Check for visualization
                testcoords = coordinates[dz_fold[0:2], :]
                dzcoords = coordinates[dz_fold, :]
                visualize_lposcv(coordinates, testcoords, dzcoords, dzradius)
        perf = np.mean(perfs)
        performanceTable[rind, 0] = dzradius
        performanceTable[rind, 1] = perf
        plotRes_lposcv(performanceTable, rind, len(folds), 'knn')
    print("Analysis done.")
    return performanceTable
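
The same caveats as in Example #2 apply (project helpers, illustrative radii); the difference is a binary target, leave-pair-out folds from lpo_folds, and a per-pair score of 1 / 0.5 / 0 depending on whether the positive case is ranked above the negative one.

import numpy as np

# Synthetic binary target over spatial points (illustrative only).
rng = np.random.default_rng(1)
coordinates = rng.uniform(0, 1000, size=(100, 2))
Xdata = rng.normal(size=(100, 2))
Ydata = (Xdata[:, 0] > 0).astype(float)

performance = lposcv_knn(coordinates, Xdata, Ydata,
                         dzradii=[0, 100, 250], k=3, visualization=False)
print(performance)   # column 0: radius, column 1: mean pairwise ranking score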
Example #6
# load the data
data = {}
columns = []
loadData("/data/york_hour_2013.csv", ["timestamp", "atc"], data, columns)

all_features = deepcopy(columns)
all_features.remove("target")
all_features.remove("location")

output = open(OUTPUT_DATA_FILE, 'w')
output.write("location,observation,prediction\n")

for location in locations:
    trainX, testX, trainY, testY = splitDataForXValidation(
        location, "location", data, all_features, "target")
    model = KNeighborsRegressor(weights="distance", n_neighbors=23, p=2.0)

    model.fit(trainX, trainY)
    prediction = model.predict(testX)

    for i in range(0, len(testY)):
        output.write(str(location))
        output.write(",")
        output.write(str(testY[i]))
        output.write(",")
        output.write(str(prediction[i]))
        output.write("\n")

output.close()
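
A possible follow-up, assuming pandas is available: read the file written above back in and summarise the error per location from the saved observation/prediction pairs.

import pandas as pd

# Per-location RMSE computed from the CSV produced by the loop above.
df = pd.read_csv(OUTPUT_DATA_FILE)
rmse_per_location = (
    df.assign(sq_err=(df["observation"] - df["prediction"]) ** 2)
      .groupby("location")["sq_err"]
      .mean()
      .pow(0.5)
)
print(rmse_per_location)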
Example #7
def predict_personality(test_path):
    #train_path = '/Users/lyulan/TCSS555/Data/Train'
    train_path = '/data/train/'
    print('train folder is:', train_path)
    
    # prepare training data
    personality_dic = {} 
    profile_path = train_path + '/Profile/Profile.csv'
    with open(profile_path) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            userid = row['userid']
            personality = []
            personality.append(float(row['ope']))
            personality.append(float(row['con']))
            personality.append(float(row['ext']))
            personality.append(float(row['agr']))
            personality.append(float(row['neu']))
            personality_dic[userid] = personality
    #print personality_dic

    train_userids = []
    train_texts = []
    train_personalities = []
    #train_filepath = train_path + '/Text'
    train_filepath = train_path + '/text'
    train_files = [f for f in listdir(train_filepath) if f.endswith(".txt")]
    for n in train_files:
        f = io.open(train_filepath + '/' + n, 'r', encoding='latin-1')
        text = f.read()
        userid = n.replace('.txt', '')
        f.close()
        # get rid of users without personality data
        if userid in personality_dic:
            train_userids.append(userid)
            train_texts.append(text)
            train_personalities.append(personality_dic[userid])
    
    # prepare testing data
    test_userids = []
    test_texts = []
    #test_filepath = test_path + '/Text'
    test_filepath = test_path + '/text'
    test_files = [f for f in listdir(test_filepath) if f.endswith(".txt")]
    for n in test_files:
        f = io.open(test_filepath + '/' + n, 'r', encoding='latin-1')
        text = f.read()
        userid = n.replace('.txt', '')
        f.close()
        test_userids.append(userid)
        test_texts.append(text)
        
        
    # extract features from training data using tfidf_vectorizer
    tfidf_vectorizer = TfidfVectorizer(min_df=0.005, max_df=0.3, ngram_range=(1,3), 
                                       stop_words='english',use_idf=True, tokenizer=tokenize)
    trainX = tfidf_vectorizer.fit_transform(train_texts)
    print("Training data: n_samples: %d, n_features: %d" % trainX.shape)
    # extract features from testing data using the same tfidf_vectorizer
    testX = tfidf_vectorizer.transform(test_texts)
    print("Testing data: n_samples: %d, n_features: %d" % testX.shape)
    # get feature names extracted by tfidf_vectorizer
    feature_names = tfidf_vectorizer.get_feature_names()
    print(feature_names)


    result = {}
    result['userid'] = test_userids
    
    for x in range(0, 5):
        y_train = np.array(train_personalities)[:,x]
        
        # reduce features by SelectKBest
        nkb_features = 50
        skb = SelectKBest(score_func=f_regression, k=nkb_features)
        X_train = skb.fit_transform(trainX, y_train)
        X_test = skb.transform(testX)
        
        # train using KNNRegressor
        knn = KNeighborsRegressor(n_neighbors=500)
        knn.fit(X_train, y_train)
        predicted = knn.predict(X_test)
        
        if x==0:
            result['ope'] = predicted
        elif x==1:
            result['con'] = predicted
        elif x==2:
            result['ext'] = predicted
        elif x==3:
            result['agr'] = predicted
        else:
            result['neu'] = predicted
    
    result_df = pd.DataFrame(result, columns=['userid', 'ope', 'con', 'ext', 'agr', 'neu'])
    return result_df
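
A short call sketch; the test directory path is hypothetical and only needs to contain a text/ subfolder with one <userid>.txt file per user, as the function expects.

# Predict the five traits for a held-out folder and save them to disk.
predictions = predict_personality('/data/test/')          # hypothetical path
predictions.to_csv('personality_predictions.csv', index=False)
print(predictions.head())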
Example #8
from ex30.ex30_lib_graph import plot2
# The private module path sklearn.neighbors.regression was removed in recent scikit-learn
# releases; the public import below is the equivalent, supported form.
from sklearn.neighbors import KNeighborsRegressor

OUTPUT_PNG_FILE = '/experiments/ex30/ex30_knn.png'

X = [[float(x)] for x in range(0, 24)]
Y = [
    12.0, 13.0, 13.0, 13.0, 28.0, 31.0, 38.0, 60.0, 85.0, 80.0, 64.0, 60.0,
    59.0, 58.0, 65.0, 70.0, 80.0, 90.0, 110.0, 100.0, 85.0, 65.0, 45.0, 20.0
]

X2 = [[float(x) / 10.0] for x in range(0, 231)]

model = KNeighborsRegressor(n_neighbors=1, p=1)
model.fit(X, Y)
Y_pred = model.predict(X2)

print(str(Y_pred))

plot2(Y, Y_pred, OUTPUT_PNG_FILE, "Observed pollution concentration levels",
      "Predicted pollution concentration levels by NNR")