def get_max_keyword_similarity(self) -> precalculated_dict_type:
    """
    Calculates a dictionary of the maximum keyword-similarity cost for all
    subsets.

    :return: The Dictionary with frozen subsets as keys and the corresponding
        cost value as values.
    """
    # Optionally work on normalized copies of the query/data.
    if self.normalize_data:
        query, data = normalize_data(self.query, self.data)[:2]
    else:
        query, data = self.query, self.data

    split_ss = split_subsets(self.list_of_subsets,
                             self.max_number_of_concurrent_processes,
                             self.rebalance_subsets)

    # Evaluate every chunk of subsets sequentially (a ProcessPoolExecutor
    # variant existed but is intentionally disabled).
    results = [get_max_keyword_similarity(self.cost_function, query, chunk)
               for chunk in split_ss]

    # Flatten the per-chunk results into one {frozenset(subset): cost} map.
    result_dict: precalculated_dict_type = dict()
    for result_list in results:
        for entry in result_list:
            result_dict[frozenset(entry[1])] = entry[0]
    return result_dict
def get_max_inter_dataset_distance(self) -> precalculated_dict_type:
    """
    Calculates a dictionary of the maximum inter-dataset cost for all subsets.

    :return: The Dictionary with frozen subsets as keys and the corresponding
        cost value as values.
    """
    if self.normalize_data:
        # Keep the extents so normalized subsets can be mapped back to the
        # original coordinate space before being used as dictionary keys.
        norm_data = normalize_data(self.query, self.data)
        denorm_x_max = norm_data[2]
        denorm_x_min = norm_data[3]
        denorm_y_max = norm_data[4]
        denorm_y_min = norm_data[5]

    split_ss = split_subsets(self.list_of_subsets,
                             self.max_number_of_concurrent_processes,
                             self.rebalance_subsets)

    # Evaluate each chunk sequentially (a ProcessPoolExecutor variant existed
    # but is intentionally disabled).
    results = []
    # BUG FIX: the progress counter was never incremented and always printed
    # "Subset 1"; enumerate gives the real 1-based chunk number.
    for count, subset in enumerate(split_ss, start=1):
        print('++++ Subset ', count, ' - Length: ', len(subset))
        results.append(get_max_inter_dataset_distances(self.cost_function, subset))

    result_dict: precalculated_dict_type = dict()
    for result_list in results:
        for subset in result_list:
            if self.normalize_data:
                # Denormalize the subset's coordinates so keys match the
                # caller's original (non-normalized) POIs.
                denormalized_result = denormalize_result_data(
                    [(0.0, subset[1])], denorm_x_max, denorm_x_min,
                    denorm_y_max, denorm_y_min)
                dict_key = denormalized_result[0][1]
            else:
                dict_key = subset[1]
            result_dict[frozenset(dict_key)] = subset[0]
    return result_dict
def test_normalize_data(self):
    """Normalization maps data into the unit square and returns the extents."""
    query = KeywordCoordinate(2, 1, ['family', 'food', 'outdoor'])
    data = [KeywordCoordinate(0, 0, ['family']),
            KeywordCoordinate(3, 2, ['food']),
            KeywordCoordinate(1, 5, ['outdoor'])]

    norm_query, norm_data, max_x, min_x, max_y, min_y = mt.normalize_data(
        query, data)

    # Query location scaled relative to the data extents.
    self.assertAlmostEqual(norm_query.coordinates.x, 0.66, delta=0.01)
    self.assertAlmostEqual(norm_query.coordinates.y, 0.20, delta=0.01)

    # Every data point scaled into [0, 1] on both axes.
    expected_coords = [(0.0, 0.0), (1.0, 0.4), (0.33, 1.0)]
    for kwc, (exp_x, exp_y) in zip(norm_data, expected_coords):
        self.assertAlmostEqual(kwc.coordinates.x, exp_x, delta=0.01)
        self.assertAlmostEqual(kwc.coordinates.y, exp_y, delta=0.01)

    # Extents are returned for later denormalization.
    self.assertEqual(max_x, 3)
    self.assertEqual(min_x, 0)
    self.assertEqual(max_y, 5)
    self.assertEqual(min_y, 0)
def preprocess_input(self):
    """
    Precomputes POI distances, filters the data down to heuristic candidates,
    optionally normalizes them and enumerates all candidate subsets.

    :return: Tuple ``(list_of_subsets, query)`` where ``query`` is the
        (possibly normalized) query point.
    """
    # Pairwise geographic distances between every pair of POIs; each row is
    # keyed by the "x,y" string of its POI.
    distances = []
    for kwc in self.data:
        row = [str(kwc.coordinates.x) + ',' + str(kwc.coordinates.y)]
        for kwc2 in self.data:
            row.append(geographic_distance(kwc.coordinates, kwc2.coordinates))
        distances.append(row)

    # Distance from the query location to every POI.
    distances_to_query = []
    for kwc in self.data:
        distances_to_query.append([
            str(kwc.coordinates.x) + ',' + str(kwc.coordinates.y),
            geographic_distance(self.query.coordinates, kwc.coordinates)])

    # NOTE(review): this frame is built but not read again in this method —
    # presumably kept for parity with other preprocessing paths; confirm.
    geographic_distances = pd.DataFrame(distances)
    geographic_distances.set_index(0, inplace=True)
    geographic_distances.columns = geographic_distances.index.tolist()

    distances_to_query_df = pd.DataFrame(distances_to_query)
    distances_to_query_df.set_index(0, inplace=True)
    distances_to_query_df.columns = ['Distances2Query']

    # Reduce the data set to the heuristic candidate POIs.
    dataAux = self.get_all_candidates_heuristic(self.data, distances_to_query_df)

    if self.normalize_data:
        # Keep the extents on self so results can be denormalized later.
        query, data, self.denormalize_max_x, self.denormalize_min_x, \
            self.denormalize_max_y, self.denormalize_min_y = normalize_data(
                self.query, dataAux)
    else:
        query = self.query
        # BUG FIX: previously read the non-existent attribute self.dataAux,
        # raising AttributeError whenever normalization was disabled; the
        # candidate list is the local dataAux computed above.
        data = dataAux

    list_of_subsets = self.get_all_subsets(data)
    print('List of subsets length: ', len(list_of_subsets))
    return list_of_subsets, query