def classify(test_X):
    """Classify each sample by majority vote among its k nearest neighbours."""
    test_X = to_ndarray(test_X)
    response_Y = []
    for neighbor_indices in nn.kneighbors(test_X, return_distance=False):
        # Look up the training labels of the returned neighbours.
        neighbor_labels = [train_Y[index] for index in neighbor_indices]
        # Majority vote: pick the label that occurs most often among the neighbours.
        best_label = max(set(neighbor_labels), key=neighbor_labels.count)
        response_Y.append(best_label)
    return to_ndarray(response_Y, dtype=int)
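# Hedged usage sketch for the k-NN classifier above. It assumes the module-level
# globals `nn` (a fitted sklearn NearestNeighbors), `train_Y` (labels aligned with
# the rows `nn` was fitted on) and the module's `to_ndarray` helper; the toy data
# below is hypothetical, not from the original module.
import numpy as np
from sklearn.neighbors import NearestNeighbors

train_X = np.array([[0.0, 0.0], [0.1, 0.2], [5.0, 5.1], [5.2, 4.9]])
train_Y = [0, 0, 1, 1]
nn = NearestNeighbors(n_neighbors=3).fit(train_X)

print(classify([[0.05, 0.1], [5.1, 5.0]]))  # expected: [0 1]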
def compute_relevance_for_most_popular(actuals: pd.DataFrame, k: int):
    """
    Computes the binary relevance vector for each user, meaning 0 if bad
    recommendation, 1 if good recommendation.

    :param actuals: n_ratings x [user, item, rating, timestamp] matrix with actual ratings
    :param k: number of items to recommend
    :return: a list of lists, where each inner list is a binary relevance vector for a user
    """
    np_actuals = utils.to_ndarray(actuals)
    mp_items = baselines.most_popular(np_actuals)
    # List of users.
    users = actuals['user'].unique()
    res = []
    # Iterate over users to build the binary relevance vectors.
    for user in users:
        data = actuals[actuals['user'] == user]  # ratings for this user
        # Only the items the user has rated can be evaluated.
        items_rated = list(data['item'])
        # TODO: I think we should just take the k first elements of the recommended
        # list and then call "not-rated" items false positives.
        # Take the k most popular items which the user has rated.
        k_mp_items = [i for i in mp_items if i in items_rated][:k]
        if len(k_mp_items) > 0:
            relevance = compute_single_relevance_for_index_predictions(data, k_mp_items)
            res.append(relevance)
    return res
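# The relevance vectors produced above typically feed into a ranking metric.
# A minimal, self-contained sketch (a hypothetical helper, not part of the
# original module) computing mean precision@k from such vectors:
def mean_precision_at_k(relevance_vectors, k):
    """Average fraction of relevant items in the top-k of each user's list."""
    return sum(sum(rel[:k]) / k for rel in relevance_vectors) / len(relevance_vectors)

# e.g. mean_precision_at_k([[1, 0, 1], [0, 0, 1]], k=3) -> 0.5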
def ced(img_file, sigma, t, T, all=False):
    """Canny edge detection pipeline; returns all intermediate images if `all` is set."""
    img = to_ndarray(img_file)
    if not all:
        # Avoid copies, just run all steps in place:
        img = gs_filter(img, sigma)
        img, D = gradient_intensity(img)
        img = suppression(img, D)
        img, weak = threshold(img, t, T)
        img = tracking(img, weak)
        return [img]
    else:
        # Make copies, step by step, so every stage can be inspected:
        img1 = gs_filter(img, sigma)
        img2, D = gradient_intensity(img1)
        img3 = suppression(copy(img2), D)
        img4, weak = threshold(copy(img3), t, T)
        img5 = tracking(copy(img4), weak)
        return [to_ndarray(img_file), img1, img2, img3, img4, img5]
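# Hedged usage sketch: the parameter values and the image path are illustrative
# only, and `ced` plus its stage functions are assumed to live in this module.
# `t` and `T` are the low and high hysteresis thresholds; `sigma` controls the
# width of the Gaussian blur.
import matplotlib.pyplot as plt

stages = ced('input.png', sigma=1.4, t=20, T=40, all=True)
titles = ['original', 'gaussian', 'gradient', 'suppression', 'threshold', 'tracking']
for ax, img, title in zip(plt.subplots(1, 6, figsize=(18, 3))[1], stages, titles):
    ax.imshow(img, cmap='gray')
    ax.set_title(title)
plt.show()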
def most_popular(data: pd.DataFrame) -> np.ndarray:
    """
    Computes the popularity of each item as its average rating and returns
    the list of per-item average ratings.

    :param data: data in form of (user, item, rating, timestamp) tuples
    :return: list of average ratings for each item
    """
    R = utils.to_ndarray(data)
    # Return the mean of each column (one column per item).
    return R.mean(axis=0)
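# The per-item averages are typically turned into a ranked recommendation list.
# A small sketch (the data here is made up; np.argsort is standard NumPy):
import numpy as np

avg_ratings = np.array([3.2, 4.5, 1.0, 4.1])
# Item indices sorted from most to least popular: [1, 3, 0, 2]
ranking = np.argsort(avg_ratings)[::-1]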
def embed(self, param, unknown_embedding=None):
    if unknown_embedding is not None:
        # The fallback vector must match the embedding dimensionality.
        assert unknown_embedding.shape == self.values.shape[1:]
    if type(param) is str:
        try:
            return self.values[self.word_indices[param]]
        except KeyError:
            # Out-of-vocabulary word: fall back if a substitute was given.
            if unknown_embedding is not None:
                return unknown_embedding
            else:
                raise
    else:
        # Recurse into nested sequences and flatten the embeddings into one array.
        rec = partial(self.embed, unknown_embedding=unknown_embedding)
        return to_ndarray(chain.from_iterable(map(rec, param)))
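# Hedged usage sketch: the embedding object `emb`, its `values` matrix
# (vocab_size x dim) and its `word_indices` mapping are assumed from the
# surrounding module; the names below are hypothetical.
import numpy as np

vector = emb.embed('hello')                     # embedding of a single word
unk = np.zeros(emb.values.shape[1])
flat = emb.embed(['hello', 'wrld'], unknown_embedding=unk)  # OOV words map to `unk`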
def __setitem__(self, key, value):
    """
    x.__setitem__(key, value) <==> x[key] = value

    Sets values based on `key`.  All the functionality of
    ``ndarray.__setitem__()`` is supported (including fancy indexing),
    plus a special support for expressions:

    Parameters
    ----------
    key : string
        The corresponding ctable column name will be set to `value`.  If
        not a column name, it will be interpreted as a boolean expression
        (computed via `ctable.eval`) and the rows where these values are
        true will be set to `value`.

    See Also
    --------
    ctable.eval

    """
    # First, convert value into a structured array
    value = utils.to_ndarray(value, self.dtype)
    # Check if key is a condition actually
    if type(key) is bytes:
        # Convert key into a boolean array
        # key = self.eval(key)
        # The method below is faster (specially for large ctables)
        rowval = 0
        for nrow in self.where(key, outcols=["nrow__"]):
            nrow = nrow[0]
            if len(value) == 1:
                # Broadcast the single given row to every matching position.
                for name in self.names:
                    self.cols[name][nrow] = value[name]
            else:
                # Assign matching rows from `value` one by one.
                for name in self.names:
                    self.cols[name][nrow] = value[name][rowval]
                rowval += 1
        return
    # Then, modify the rows
    for name in self.names:
        self.cols[name][key] = value[name]
    return
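# Hedged usage sketch for the method above, assuming the surrounding class is a
# bcolz-style ctable (the data is made up, and the bytes key matches the
# expression branch checked above):
import numpy as np
import bcolz

ct = bcolz.ctable([np.arange(5), np.arange(5) * 2.0], names=['f0', 'f1'])
# Plain indexed assignment, as with ndarrays:
ct[1] = (10, 20.0)
# Boolean-expression key: rows where the condition holds are overwritten.
ct[b'f0 > 2'] = (-1, -1.0)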
def __init__(self, data: pd.DataFrame, K: int, epochs=100, alpha=0.002, beta=0.02):
    """
    Perform matrix factorization to predict empty entries in a matrix.

    Arguments
    - data (dataframe) : user-item interactions
    - K (int)          : number of latent dimensions
    - epochs (int)     : number of iterations
    - alpha (float)    : learning rate
    - beta (float)     : regularization parameter
    """
    self.R = utils.to_ndarray(data)
    self.n_users, self.n_items = self.R.shape
    self.global_avg = data['rating'].mean()
    self.K = K
    self.epochs = epochs
    self.alpha = alpha
    self.beta = beta
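# The constructor above only stores data and hyperparameters. A hedged sketch
# of the per-rating SGD step such a class typically runs (this is the standard
# biased-MF update, not necessarily the original module's exact code; P and Q
# are the user and item factor matrices, b_u and b_i the bias vectors, mu the
# global average):
def sgd_step(P, Q, b_u, b_i, mu, u, i, r, alpha, beta):
    """One biased-MF gradient step for the observed rating r of user u on item i."""
    e = r - (mu + b_u[u] + b_i[i] + P[u] @ Q[i])
    p_u = P[u].copy()  # keep old value so Q's update uses pre-step P[u]
    P[u] += alpha * (e * Q[i] - beta * P[u])
    Q[i] += alpha * (e * p_u - beta * Q[i])
    b_u[u] += alpha * (e - beta * b_u[u])
    b_i[i] += alpha * (e - beta * b_i[i])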
def classify(test_X):
    test_X = to_ndarray(test_X)
    return lgr.predict(test_X)
def classify(test_X):
    test_X = to_ndarray(test_X)
    return forest.predict(test_X)
def load_json(filepath):
    # Use a context manager so the file handle is closed after reading.
    with open(filepath, 'r') as f:
        path_feature_map = json.load(f)
    return to_ndarray(path_feature_map)
import glob
import json
import sys

import utils
from similarity import search_k_nearest

dicts = []
for path in glob.glob('./jsonfiles/*.json'):
    with open(path, 'r') as f:
        dicts.append(json.load(f))

path_feature_map = utils.merge_multiple_dicts(dicts)
path_feature_map = utils.to_ndarray(path_feature_map)

# TODO: Show error messages
key = sys.argv[1]
keys = path_feature_map.keys()
print(keys)

query = path_feature_map[key]
k_nearest = search_k_nearest(path_feature_map, query)
for filepath, distance in k_nearest:
    print("{}\n{:>3e}\n".format(filepath, distance))