def find_neighbors_df(
    self,
    df: pd.DataFrame,
    unique_id_column: AnyStr,
    feature_columns: List[AnyStr],
    index_array_ids: np.array,
    num_neighbors: int = 5,
    **kwargs,
) -> pd.DataFrame:
    """Find nearest neighbors of each row in a raw pandas DataFrame.

    Args:
        df: Input DataFrame with one unique id column and the feature column(s)
        unique_id_column: Name of the column holding unique record identifiers
        feature_columns: Name(s) of the column(s) holding the vector features
        index_array_ids: Original ids of the arrays used to build the index;
            used to translate positional neighbor indices back to real ids
        num_neighbors: Number of nearest neighbors to retrieve per input row
        **kwargs: Ignored; accepted so callers may pass a superset of params

    Returns:
        DataFrame with one row per (input id, neighbor) pair, holding the
        input id, the neighbor id and the distance between them

    Raises:
        ValueError: If the feature dimensionality of `df` does not match the
            dimensionality of the loaded index
    """
    output_df = pd.DataFrame()
    output_df[self.INPUT_COLUMN_NAME] = df[unique_id_column]
    data_loader = DataLoader(unique_id_column, feature_columns)
    # Only the arrays are needed here; real ids come from `index_array_ids`
    # (the original code bound an unused `array_ids` local)
    (_, arrays) = data_loader.convert_df_to_arrays(df, verbose=False)
    if arrays.shape[1] != self.num_dimensions:
        raise ValueError(
            "Incompatible number of dimensions: "
            + f"{self.num_dimensions} in index, {arrays.shape[1]} in feature column(s)"
        )
    # One list of (positional_index, distance) pairs per input row
    output_df["index_distance_pairs"] = self.find_neighbors_array(arrays, num_neighbors)
    # Expand to one row per (input, neighbor) pair
    output_df = output_df.explode("index_distance_pairs")
    output_df[self.NEIGHBOR_COLUMN_NAME] = output_df["index_distance_pairs"].apply(
        lambda pair: int(pair[0])
    )
    output_df[self.DISTANCE_COLUMN_NAME] = output_df["index_distance_pairs"].apply(
        lambda pair: float(pair[1])
    )
    # Translate positional indices back to the original array ids.
    # Values are already ints, so the former redundant `.astype(int)` is dropped.
    output_df[self.NEIGHBOR_COLUMN_NAME] = output_df[self.NEIGHBOR_COLUMN_NAME].apply(
        lambda i: index_array_ids[i]
    )
    del output_df["index_distance_pairs"]
    return output_df
def test_find_neighbors_df():
    """Build an Annoy index from embeddings, query it, and verify known neighbors."""
    params = {
        'unique_id_column': 'images',
        'feature_columns': ['prediction'],
        'algorithm': 'annoy',
        'expert': True,
        'annoy_metric': 'angular',
        'annoy_num_trees': 10
    }
    index_config = {
        'algorithm': 'annoy',
        'num_dimensions': 2048,
        'annoy_metric': 'angular',
        'annoy_num_trees': 10,
        'feature_columns': ['prediction'],
        'expert': True
    }
    # Load data into array format for indexing
    columns = [params["unique_id_column"]] + params["feature_columns"]
    input_df = pd.read_csv('./tests/resources/caltech_embeddings.csv')[columns]
    loader = DataLoader(params["unique_id_column"], params["feature_columns"])
    (array_ids, arrays) = loader.convert_df_to_arrays(input_df)
    searcher = NearestNeighborSearch(num_dimensions=arrays.shape[1], **params)
    with NamedTemporaryFile() as tmp:
        searcher.build_save_index(arrays=arrays, index_path=tmp.name)
        # Reload the saved index through a fresh instance to mimic a real lookup
        params = {
            'unique_id_column': 'images',
            'feature_columns': ['prediction'],
            'num_neighbors': 5
        }
        searcher = NearestNeighborSearch(**index_config)
        searcher.load_index(tmp.name)
        # Find nearest neighbors in input dataset
        df = searcher.find_neighbors_df(input_df, **params, index_array_ids=array_ids)
    actual = sorted(list(df[df['input_id'] == '34719_ostrich.jpg']['neighbor_id']))
    expected = [
        '107505_ostrich.jpg', '185189_ostrich.jpg', '213657_ostrich.jpg',
        '229350_ostrich.jpg', '34719_ostrich.jpg'
    ]
    assert len(actual) == len(expected)
    for actual_item, expected_item in zip(actual, expected):
        assert actual_item == expected_item
def test_build_save_index():
    """Check that building an index from embeddings writes an index file to disk."""
    params = {
        'unique_id_column': 'images',
        'feature_columns': ['prediction'],
        'algorithm': 'annoy',
        'expert': True,
        'annoy_metric': 'angular',
        'annoy_num_trees': 10
    }
    # Load data into array format for indexing
    columns = [params["unique_id_column"]] + params["feature_columns"]
    embeddings_df = pd.read_csv('./tests/resources/caltech_embeddings.csv')
    # Restrict to selected columns
    embeddings_df = embeddings_df[columns]
    loader = DataLoader(params["unique_id_column"], params["feature_columns"])
    (array_ids, arrays) = loader.convert_df_to_arrays(embeddings_df)
    searcher = NearestNeighborSearch(num_dimensions=arrays.shape[1], **params)
    with NamedTemporaryFile() as tmp:
        searcher.build_save_index(arrays=arrays, index_path=tmp.name)
        assert os.path.isfile(tmp.name)
"""Indexing recipe: build a nearest-neighbor index from a dataset of embeddings
and persist the index, arrays, ids and config to an output folder."""
import os
from tempfile import NamedTemporaryFile
from dku_param_loading import load_indexing_recipe_params
from data_loader import DataLoader
from nearest_neighbor.base import NearestNeighborSearch
from dku_io_utils import save_array_to_folder

# Load parameters
params = load_indexing_recipe_params()

# Load data into array format for indexing
columns = [params["unique_id_column"]] + params["feature_columns"]
# infer_with_pandas=False keeps the dataset's declared column types
input_df = params["input_dataset"].get_dataframe(columns=columns, infer_with_pandas=False)
data_loader = DataLoader(params["unique_id_column"], params["feature_columns"])
(array_ids, arrays) = data_loader.convert_df_to_arrays(input_df)

# Build index and save index file to output folder
nearest_neighbor = NearestNeighborSearch(num_dimensions=arrays.shape[1], **params)
with NamedTemporaryFile() as tmp:
    # The index library writes to tmp.name on disk; the file object itself
    # is then streamed to the managed folder.
    nearest_neighbor.build_save_index(arrays=arrays, index_path=tmp.name)
    index_file_path = os.path.join(params["folder_partition_root"], nearest_neighbor.INDEX_FILE_NAME)
    # NOTE(review): assumes upload_stream reads `tmp` from position 0 — confirm,
    # since build_save_index wrote via the path rather than this file object.
    params["index_folder"].upload_stream(index_file_path, tmp)

# Save arrays and indexing config to guarantee reproducibility
array_ids_file_path = os.path.join(params["folder_partition_root"], nearest_neighbor.ARRAY_IDS_FILE_NAME)
arrays_file_path = os.path.join(params["folder_partition_root"], nearest_neighbor.ARRAYS_FILE_NAME)
config_file_path = os.path.join(params["folder_partition_root"], nearest_neighbor.CONFIG_FILE_NAME)
save_array_to_folder(array=array_ids, path=array_ids_file_path, folder=params["index_folder"])
save_array_to_folder(array=arrays, path=arrays_file_path, folder=params["index_folder"])
# Merge the index's own config with the recipe params it should reproduce.
# NOTE(review): `config_file_path` is computed and `config` built, but no write of
# `config` is visible in this chunk — presumably the script continues past this view.
config = {**nearest_neighbor.get_config(), **{k: v for k, v in params.items() if k in {"feature_columns", "expert"}}}