예제 #1
0
def test_point_correct_length():
    """
    Point vector should be length k, the number of columns of the input dataframe.

    Calls `get_all_distances` with a four-element point and expects it to
    raise (presumably `df` does not have four columns -- confirm against
    the fixture). The test fails if the call returns normally.
    """
    try:
        get_all_distances([1, 2, 3, 4], df)
    except Exception:
        # Any exception counts as the expected rejection of the input.
        pass
    else:
        # Without this branch the test would pass even when nothing is raised.
        raise AssertionError(
            "get_all_distances did not raise for a point of the wrong length")
예제 #2
0
def test_metric_input():
    """
    metric should be a string and one of 'cosine', 'euclidean' or 'manhattan'.

    Passes the unsupported metric name "cityblock" and expects
    `get_all_distances` to raise. The test fails if the call returns normally.
    """
    try:
        get_all_distances(ref_vec, df, metric="cityblock")
    except Exception:
        # Any exception counts as the expected rejection of the metric.
        pass
    else:
        # Without this branch the test would pass even when nothing is raised.
        raise AssertionError(
            "get_all_distances did not raise for an unsupported metric")
예제 #3
0
def test_second_arg_df():
    """
    Test that if the second argument isn't a data frame, an exception should be thrown.

    Passes a plain list as the second argument. The test fails if
    `get_all_distances` returns normally instead of raising.
    """
    try:
        get_all_distances(ref_vec, [1, 2, 3])
    except Exception:
        # Any exception counts as the expected rejection of the input.
        pass
    else:
        # Without this branch the test would pass even when nothing is raised.
        raise AssertionError(
            "get_all_distances did not raise for a non-dataframe second argument")
예제 #4
0
def test_second_arg_list():
    """
    Test that if the first argument isn't a list, an exception should be thrown.

    Passes `df` as the first argument in place of a list. The test fails if
    `get_all_distances` returns normally instead of raising.
    """
    try:
        get_all_distances(df, df)
    except Exception:
        # Any exception counts as the expected rejection of the input.
        pass
    else:
        # Without this branch the test would pass even when nothing is raised.
        raise AssertionError(
            "get_all_distances did not raise for a non-list first argument")
예제 #5
0
def test_point_types():
    """
    Test for error if input 'point' contains items other than numerics.

    Passes a list of strings as the point. The test fails if
    `get_all_distances` returns normally instead of raising.
    """
    try:
        get_all_distances(["a", "b", "c"], df)
    except Exception:
        # Any exception counts as the expected rejection of the input.
        pass
    else:
        # Without this branch the test would pass even when nothing is raised.
        raise AssertionError(
            "get_all_distances did not raise for a non-numeric point")
예제 #6
0
def test_manhattan():
    """
    Check the manhattan distances returned for `ref_vec` and `df`
    against the expected values, after rounding to two decimals.
    """
    distances = get_all_distances(ref_vec, df, metric="manhattan")
    rounded = list(map(lambda value: round(value, 2), distances))

    assert rounded == [7, 6, 5]
예제 #7
0
def test_cosine():
    """
    Check the cosine distances returned for `ref_vec` and `df`
    against the expected values, after rounding to two decimals.
    """
    distances = get_all_distances(ref_vec, df, metric="cosine")
    rounded = list(map(lambda value: round(value, 2), distances))

    assert rounded == [0.83, 0.32, 0.45]
예제 #8
0
def test_euclidean():
    """
    Check the euclidean distances returned for `ref_vec` and `df`
    against the expected values, after rounding to two decimals.
    """
    distances = get_all_distances(ref_vec, df, metric="euclidean")
    rounded = list(map(lambda value: round(value, 2), distances))

    assert rounded == [5, 4.47, 5]
예제 #9
0
def filter_distances(point: list, data: pd.DataFrame, threshold: float, metric: str="euclidean") -> list:
    """
    Find the observations that lie within a threshold distance of a point.

    Distances are obtained from `get_all_distances` using the requested
    metric; the positions of every distance that does not exceed
    `threshold` are returned.

    Parameters
    ----------
    point : list
        Values defining the single observation that distances are
        measured from.

    data : dataframe
        Dataframe holding the observations to compare against `point`.

    threshold : float
        Largest distance (inclusive) an observation may have and still
        be returned. Must be a non-negative float.

    metric: string
        Name of the distance metric passed through to
        `get_all_distances`.

    Returns
    -------
    list
        Positions, within the computed distances, of the observations
        whose distance from `point` is at most `threshold`.

    Raises
    ------
    Exception
        If `threshold` is not a float, or is negative.
    """
    # Validate the type first so the numeric comparison below is safe
    if not isinstance(threshold, float):
        raise Exception("The threshold argument should be a single number (float)")

    if threshold < 0:
        raise Exception("The threshold argument should be non-negative")

    # Delegate the distance computation to the shared helper
    all_dists = get_all_distances(point, data, metric)

    return [position for position, dist in enumerate(all_dists) if dist <= threshold]
예제 #10
0
def get_closest(point, data, top_k, metric="euclidean"):
    """
    Returns indices of the top k rows in a dataframe
    that are closest to a given observation based on
    a specified distance metric.

    Parameters
    ----------
    point : list
        Values defining a single observation
        to compute distances for. Must contain only
        ints and/or floats.

    data : dataframe
        Dataframe containing values of all
        observations to calculate distances
        from point.

    top_k : int
        The number of closest observations to
        return indices for. Must be a non-negative int.

    metric: string
        Type of distance metric to use in distance
        calculations. One of "euclidean", "cosine"
        or "manhattan".

    Returns
    -------
    list
        Positional indices (as produced by `np.argsort`) of the
        closest k observations, ordered from smallest to largest
        distance. If `top_k` exceeds the number of distances, all
        positions are returned.

    Raises
    ------
    Exception
        If any argument fails the validation described above.

    Warns
    -----
    UserWarning
        If `top_k` is larger than `len(data)`.
    """
    # Check inputs are valid and as expected
    if not isinstance(data, pd.DataFrame):
        raise Exception("The data argument should be a pandas dataframe")

    if not isinstance(metric, str):
        raise Exception("The 'metric' argument should be a string")

    # The type check must come first: comparing a non-numeric top_k with 0
    # would raise a TypeError instead of the intended validation message.
    if not isinstance(top_k, int) or top_k < 0:
        raise Exception("The top_k argument should be a non-negative integer")

    if not isinstance(point, list):
        raise Exception("The point argument should be a list")

    if not all(isinstance(x, (int, float)) for x in point):
        raise Exception("The point argument should contain only numerics")

    supported_dist = ("euclidean", "cosine", "manhattan")
    if metric not in supported_dist:
        raise Exception(
            "The 'metric' argument is not a supported distance metric")

    if top_k > len(data):
        warnings.warn(
            "Warning: Note that since top_k is larger than the number of points in the dataframe, fewer than top_k indices will be returned."
        )

    # Call helper function to compute distances
    distances = get_all_distances(point, data, metric)

    # Sort distances in ascending order (smallest distances first)
    # and keep the first top_k positions. Slicing past the end of a
    # list is safe, so no separate length check is needed.
    return list(np.argsort(distances))[:top_k]
예제 #11
0
def test_output_type():
    """
    Check that `get_all_distances` returns exactly a `list`
    when called with `ref_vec` and `df`.
    """
    result = get_all_distances(ref_vec, df)
    assert type(result) is list
예제 #12
0
def test_output_length():
    """
    Check that the number of distances returned equals the
    number of rows (`df.shape[0]`) of the input dataframe.
    """
    distances = get_all_distances(ref_vec, df)
    row_count = df.shape[0]
    assert len(distances) == row_count
예제 #13
0
def filter_distances(point, data, threshold, metric="euclidean"):
    """
    Returns indices of rows in a dataframe that are
    within a given threshold distance from a given
    observation based on a specified distance metric.

    Parameters
    ----------
    point : list
        Values defining a single observation
        to compute distances for. Must contain only
        ints and/or floats.

    data : dataframe
        Dataframe containing values of all
        observations to calculate distances
        from point.

    threshold : float
        The maximum distance (inclusive) of observations
        to return indices for. Must be a non-negative float.

    metric: string
        Type of distance metric to use in distance
        calculations. One of "euclidean", "cosine"
        or "manhattan".

    Returns
    -------
    list
        Positions, within the computed distances, of the
        observations with distance less than or equal to
        `threshold` from `point`.

    Raises
    ------
    Exception
        If any argument fails the validation described above.
    """

    # Check inputs are valid and as expected
    if not isinstance(data, pd.DataFrame):
        raise Exception("The data argument should be a pandas dataframe")

    if not isinstance(metric, str):
        raise Exception("The metric argument should be a string")

    # The type check must come first: comparing a non-numeric threshold
    # with 0 would raise a TypeError instead of the intended message.
    if not isinstance(threshold, float) or threshold < 0:
        raise Exception(
            "The threshold argument should be a non-negative float")

    if not isinstance(point, list):
        raise Exception("The point argument should be a list")

    if not all(isinstance(x, (int, float)) for x in point):
        raise Exception("The point argument should contain only numerics")

    supported_metric = ("euclidean", "cosine", "manhattan")
    if metric not in supported_metric:
        raise Exception(
            "The metric argument is not a supported distance metric")

    # Call helper function to compute distances
    distances = get_all_distances(point, data, metric)

    # Keep the position of every distance that does not exceed the threshold
    return [i for i, d in enumerate(distances) if d <= threshold]