def test_point_correct_length():
    """
    Check that a point of the wrong length is rejected.

    The point vector should be length k, the number of columns of the
    input dataframe. A four-element point is passed here and the call is
    expected to raise.
    (assumes the module-level `df` fixture does not have four columns --
    TODO confirm against the fixture definition)
    """
    try:
        get_all_distances([1, 2, 3, 4], df)
    except Exception:
        # Any exception counts as the expected rejection of the input.
        return
    # Reaching this line means the invalid input was silently accepted,
    # which is exactly the situation this test has to report.
    raise AssertionError(
        "get_all_distances should raise when the point length does not "
        "match the number of dataframe columns")
def test_metric_input():
    """
    Check that an unsupported metric name is rejected.

    metric should be a string and one of 'cosine', 'euclidean' or
    'manhattan'. "cityblock" is none of these, so the call is expected
    to raise.
    """
    try:
        get_all_distances(ref_vec, df, metric="cityblock")
    except Exception:
        # Any exception counts as the expected rejection of the metric.
        return
    # Without this, the test would pass even if the bad metric were
    # accepted.
    raise AssertionError(
        "get_all_distances should raise for an unsupported metric")
def test_second_arg_df():
    """
    Test that if the second argument isn't a data frame, an exception
    should be thrown.

    A plain list is passed in place of the dataframe.
    """
    try:
        get_all_distances(ref_vec, [1, 2, 3])
    except Exception:
        # Any exception counts as the expected rejection of the input.
        return
    # Without this, the test would pass even if the list were accepted.
    raise AssertionError(
        "get_all_distances should raise when the second argument is not "
        "a data frame")
def test_second_arg_list():
    """
    Test that if the first argument isn't a list, an exception should be
    thrown.

    `df` is passed as the first argument in place of a list. Despite the
    function name, it is the first argument that is exercised here.
    """
    try:
        get_all_distances(df, df)
    except Exception:
        # Any exception counts as the expected rejection of the input.
        return
    # Without this, the test would pass even if the non-list point were
    # accepted.
    raise AssertionError(
        "get_all_distances should raise when the first argument is not "
        "a list")
def test_point_types():
    """
    Test for error if input 'point' contains items other than numerics.

    A list of strings is passed as the point and the call is expected to
    raise.
    """
    try:
        get_all_distances(["a", "b", "c"], df)
    except Exception:
        # Any exception counts as the expected rejection of the input.
        return
    # Without this, the test would pass even if the string values were
    # accepted.
    raise AssertionError(
        "get_all_distances should raise when the point contains "
        "non-numeric items")
def test_manhattan():
    """
    Check the manhattan distances against known values.

    Each distance is rounded to two decimals before the comparison.
    """
    expected = [7, 6, 5]
    distances = get_all_distances(ref_vec, df, metric="manhattan")
    rounded = list(map(lambda value: round(value, 2), distances))
    assert rounded == expected
def test_cosine():
    """
    Check the cosine distances against known values.

    Each distance is rounded to two decimals before the comparison.
    """
    expected = [0.83, 0.32, 0.45]
    distances = get_all_distances(ref_vec, df, metric="cosine")
    rounded = list(map(lambda value: round(value, 2), distances))
    assert rounded == expected
def test_euclidean():
    """
    Check the euclidean distances against known values.

    Each distance is rounded to two decimals before the comparison.
    """
    expected = [5, 4.47, 5]
    distances = get_all_distances(ref_vec, df, metric="euclidean")
    rounded = list(map(lambda value: round(value, 2), distances))
    assert rounded == expected
def filter_distances(point: list, data: pd.DataFrame, threshold: float,
                     metric: str = "euclidean") -> list:
    """
    Find the rows of a dataframe lying within a threshold distance of a
    given observation.

    Parameters
    ----------
    point : list
        Values defining the single observation distances are measured
        from. Forwarded unchanged to `get_all_distances`.
    data : dataframe
        Dataframe holding the observations to measure against. Forwarded
        unchanged to `get_all_distances`.
    threshold : float
        The maximum distance an observation may have and still be
        selected. Must be a non-negative float.
    metric : string
        Type of distance metric to use in distance calculations.
        Forwarded unchanged to `get_all_distances`.

    Returns
    -------
    list
        Positions, within the distances returned by `get_all_distances`,
        of the observations whose distance is less than or equal to
        `threshold`.

    Raises
    ------
    Exception
        If `threshold` is not a float, or is negative.
    """
    # Guard clauses: the type is verified first so that the numeric
    # comparison below is always well defined.
    if not isinstance(threshold, float):
        raise Exception(
            "The threshold argument should be a single number (float)")
    if threshold < 0:
        raise Exception("The threshold argument should be non-negative")

    # The helper performs the actual distance computation.
    all_distances = get_all_distances(point, data, metric)

    return [
        position
        for position, distance in enumerate(all_distances)
        if distance <= threshold
    ]
def get_closest(point, data, top_k, metric="euclidean"):
    """
    Returns indices of the top k rows in a dataframe that are closest to
    a given observation based on a specified distance metric.

    Parameters
    ----------
    point : list
        Values defining a single observation to compute distances for.
        Must contain only ints and/or floats.
    data : dataframe
        Dataframe containing values of all observations to calculate
        distances from point.
    top_k : int
        The number of closest observations to return indices for. Must
        be a non-negative integer.
    metric : string
        Type of distance metric to use in distance calculations. One of
        "euclidean", "cosine" or "manhattan".

    Returns
    -------
    list
        Positions, within the distances returned by `get_all_distances`,
        of the closest observations, ordered from smallest distance to
        largest. Contains at most `top_k` entries; fewer when fewer
        distances are available.

    Raises
    ------
    Exception
        If any argument fails the validation described above.

    Warns
    -----
    UserWarning
        If `top_k` is larger than `len(data)`.
    """
    # Check inputs are valid and as expected
    if not isinstance(data, pd.DataFrame):
        raise Exception("The data argument should be a pandas dataframe")

    if not isinstance(metric, str):
        raise Exception("The 'metric' argument should be a string")

    # The type is checked before the comparison so that a non-numeric
    # top_k (e.g. a string or None) produces this message instead of a
    # raw TypeError from the `<` operator.
    if not isinstance(top_k, int) or top_k < 0:
        raise Exception("The top_k argument should be a non-negative integer")

    if not isinstance(point, list):
        raise Exception("The point argument should be a list")

    if not all(isinstance(x, (int, float)) for x in point):
        raise Exception("The point argument should contain only numerics")

    supported_dist = ["euclidean", "cosine", "manhattan"]
    if metric not in supported_dist:
        raise Exception(
            "The 'metric' argument is not a supported distance metric")

    if top_k > len(data):
        warnings.warn(
            "Warning: Note that since top_k is larger than the number of "
            "points in the dataframe, fewer than top_k indices will be "
            "returned."
        )

    # Call helper function to compute distances
    distances = get_all_distances(point, data, metric)

    # Sort distances in ascending order (smallest distances first)
    # and return indices in that order
    dist_index_sorted = list(np.argsort(distances))

    # Slicing past the end of a list simply returns the whole list, so
    # no separate branch is needed when top_k exceeds the list length.
    return dist_index_sorted[:top_k]
def test_output_type():
    """
    Check that the value returned by get_all_distances is exactly a list.
    """
    distances = get_all_distances(ref_vec, df)
    assert type(distances) is list
def test_output_length():
    """
    Check that the number of distances returned equals the number of
    rows in the input dataframe.
    """
    distances = get_all_distances(ref_vec, df)
    row_count = df.shape[0]
    assert len(distances) == row_count
def filter_distances(point, data, threshold, metric="euclidean"):
    """
    Returns indices of rows in a dataframe that are within a given
    threshold distance from a given observation based on a specified
    distance metric.

    Parameters
    ----------
    point : list
        Values defining a single observation to compute distances for.
        Must contain only ints and/or floats.
    data : dataframe
        Dataframe containing values of all observations to calculate
        distances from point.
    threshold : float
        The maximum distance of observations to return indices for.
        Must be a non-negative float.
    metric : string
        Type of distance metric to use in distance calculations. One of
        "euclidean", "cosine" or "manhattan".

    Returns
    -------
    list
        Positions, within the distances returned by `get_all_distances`,
        of the observations with distance less than or equal to
        `threshold` from `point`.

    Raises
    ------
    Exception
        If any argument fails the validation described above.
    """
    # Check inputs are valid and as expected
    if not isinstance(data, pd.DataFrame):
        raise Exception("The data argument should be a pandas dataframe")

    if not isinstance(metric, str):
        raise Exception("The metric argument should be a string")

    # The type is checked before the comparison so that a non-numeric
    # threshold (e.g. a string or None) produces this message instead of
    # a raw TypeError from the `<` operator.
    if not isinstance(threshold, float) or threshold < 0:
        raise Exception(
            "The threshold argument should be a non-negative float")

    if not isinstance(point, list):
        raise Exception("The point argument should be a list")

    if not all(isinstance(x, (int, float)) for x in point):
        raise Exception("The point argument should contain only numerics")

    supported_metric = ["euclidean", "cosine", "manhattan"]
    if metric not in supported_metric:
        raise Exception(
            "The metric argument is not a supported distance metric")

    # Call helper function to compute distances
    distances = get_all_distances(point, data, metric)

    # Keep the position of every distance that does not exceed the
    # threshold (the comparison is inclusive).
    return [i for i, d in enumerate(distances) if d <= threshold]