def __init__(self,
             query: KeywordCoordinate,
             data: dataset_type,
             cost_function: CostFunction,
             normalize: bool = True,
             result_length: int = 10,
             max_subset_size: int = math.inf,
             max_number_of_concurrent_processes: int = 5,
             rebalance_subsets: bool = True,
             _map: str = 'London',
             update: bool = True):
    """
    Constructs a new NaiveSolver object.

    All arguments are forwarded to the parent constructor. Afterwards the input is preprocessed
    (unless the first query keyword is '0') and the time spent on that step is printed.

    :param query: The query for which to solve for
    :param data: The data for which to solve for
    :param cost_function: The cost function to determine subset costs
    :param normalize: If the data should be normalized before being processed. The data will be denormalized before
    being returned.
    :param result_length: The size of the results (Top-N)
    :param max_subset_size: The maximum size of any subset used to calculate the solution
    :param max_number_of_concurrent_processes: Forwarded unchanged to the parent constructor
    :param rebalance_subsets: If the passed subsets should be rearranged to better distribute the workload among
    the processes
    :param _map: Forwarded to the parent constructor as ``_map``
    :param update: Forwarded to the parent constructor as ``update_dataset``
    """
    log = logging.getLogger(__name__)
    creating_message = 'creating with query {}, data {}, cost function {}, normalization {} and result length {}'
    log.debug(creating_message.format(query, dataset_comprehension(data), cost_function, normalize, result_length))

    super().__init__(query,
                     data,
                     cost_function,
                     normalize,
                     result_length,
                     max_subset_size,
                     max_number_of_concurrent_processes,
                     rebalance_subsets,
                     _map=_map,
                     update_dataset=update)

    preprocessing_started = time.time()
    first_keyword = self.query.keywords[0]
    # The original comment reads "Only for precalculate distance": a first keyword of '0'
    # skips preprocessing, so list_of_subsets / normalised_query are not assigned here.
    if first_keyword != '0':
        preprocessed = self.preprocess_input()
        self.list_of_subsets, self.normalised_query = preprocessed
    preprocessing_elapsed = time.time() - preprocessing_started
    print('NaiveSolver initialization - preprocess_input() --> ', preprocessing_elapsed)

    created_message = 'created with query {}, data {}, cost function {}, normalization {} and result length {}'
    log.debug(created_message.format(self.query, dataset_comprehension(self.data), self.cost_function,
                                     self.normalize_data, self.result_length))
def get_minimum_for_query(self, query: KeywordCoordinate, dataset: dataset_type) -> float:
    """
    Calculates the minimum query-dataset distance cost.

    If ``self.precalculated_query_dataset_dict`` is set and holds a value under ``frozenset(dataset)``,
    that value is returned directly. Otherwise the distance between the query and every element of the
    dataset is computed with ``self.distance_metric`` and the smallest one is returned.

    :param query: The query
    :param dataset: The dataset
    :return: Minimum query-dataset distance cost. When no distance could be determined (e.g. an empty
    dataset) the legacy placeholder value 99999999 is returned.
    """
    logger = logging.getLogger(__name__)
    logger.debug(
        'finding minimum distance for query {} and dataset {}'.format(query, dataset_comprehension(dataset)))
    if self.precalculated_query_dataset_dict is not None:
        precalculated_result = self.precalculated_query_dataset_dict.get(frozenset(dataset))
        if precalculated_result is not None:
            return precalculated_result
        logger.debug(
            'could not find the minimum precalculated query-dataset value in the given precalculated set. '
            'This suggests an erroneous or a wrong dict has been passed into the CostFunction.')
    else:
        logger.debug('No precalculated query-dataset dict found')

    # Start from +inf rather than a finite magic number, so that distances larger than the old
    # sentinel (99999999) are no longer silently capped to it.
    current_minimum = float('inf')
    for element in dataset:
        current_value = self.distance_metric(query.coordinates, element.coordinates)
        if current_value < current_minimum:
            current_minimum = current_value
    if current_minimum == float('inf'):
        # Nothing comparable was found; keep the value callers received before this fix.
        current_minimum = 99999999
    logger.debug('found minimum distance for query and dataset of {}'.format(current_minimum))
    return current_minimum
def get_minimum_for_dataset(self, dataset: dataset_type, denormalized_dataset: dataset_type = None) -> float:
    """
    Calculates the minimum inter-dataset distance cost.

    If ``self.precalculated_inter_dataset_dict`` is set, it is queried first; the lookup key is built from
    ``denormalized_dataset`` when given, otherwise from ``dataset``. Without a precalculated value the
    distance of every unordered pair of elements is computed with ``self.distance_metric``.

    :param dataset: The dataset.
    :param denormalized_dataset: The denormalized dataset. This is used for the matching of precalculated values.
    :return: Minimum inter-dataset distance cost. 0.0 for datasets with at most one element.
    """
    logger = logging.getLogger(__name__)
    logger.debug('finding minimum distance for dataset {}'.format(dataset_comprehension(dataset)))
    if self.precalculated_inter_dataset_dict is not None:
        logger.debug('querying precalculated set')
        dataset_key = denormalized_dataset if denormalized_dataset is not None else dataset
        precalculated_result = self.precalculated_inter_dataset_dict.get(frozenset(dataset_key))
        if precalculated_result is not None:
            logger.debug('found precalculated value {}'.format(precalculated_result))
            return precalculated_result
        logger.debug(
            'could not find the minimum precalculated inter-dataset value in the given precalculated set. '
            'This suggests an erroneous or a wrong dict has been passed into the CostFunction.')
    else:
        # Reported through the logger (as the sibling query method does) instead of print(),
        # which wrote to stdout on every call.
        logger.debug('No precalculated inter-dataset dict found')

    if len(dataset) <= 1:
        logger.debug('Dataset of size 1 returning inter-dataset distance of 0.0')
        return 0.0

    # Start from +inf rather than a finite magic number, so that distances larger than the old
    # sentinel (9999999.9) are no longer silently capped to it.
    current_minimum: float = float('inf')
    for position, first in enumerate(dataset):
        # Only pair each element with the ones after it: every unordered pair is visited once.
        for second in dataset[position + 1:]:
            current_value = self.distance_metric(first.coordinates, second.coordinates)
            if current_value < current_minimum:
                current_minimum = current_value
    if current_minimum == float('inf'):
        # Nothing comparable was found; keep the value callers received before this fix.
        current_minimum = 9999999.9
    logger.debug('found minimum distance for dataset of {}'.format(current_minimum))
    return current_minimum
def generate(self, data_size: int) -> dataset_type:
    """
    Generates a dataset with a given size.

    Each entry gets random integer coordinates within the configured physical bounds and a random
    number (between ``self.keywords_min`` and ``self.keywords_max``) of distinct keywords drawn from
    ``self.possible_keywords``.

    :param data_size: The size of the dataset
    :return: The dataset
    """
    log = logging.getLogger(__name__)
    log.debug('generating dataset of size {}'.format(data_size))
    generated: dataset_type = []
    for _ in range(data_size):
        # Per-entry pool: picked keywords are removed so an entry never repeats a keyword.
        keyword_pool = self.possible_keywords.copy()
        picked_keywords: keyword_dataset_type = []
        x_position = random.randint(self.physical_min_x, self.physical_max_x)
        y_position = random.randint(self.physical_min_y, self.physical_max_y)
        keyword_count = random.randint(self.keywords_min, self.keywords_max)
        for _ in range(keyword_count):
            try:
                picked = random.choice(keyword_pool)
            except IndexError:
                # The pool ran out before the requested number of keywords was reached.
                break
            keyword_pool.remove(picked)
            picked_keywords.append(picked)
        generated.append(KeywordCoordinate(x_position, y_position, picked_keywords))
    log.debug('generated dataset {}'.format(dataset_comprehension(generated)))
    return generated
def get_maximum_keyword_distance(self, query: KeywordCoordinate, dataset: dataset_type) -> float:
    """
    Calculates the maximum keyword distance.

    A value stored in ``self.precalculated_keyword_similarity_dict`` under ``frozenset(dataset)`` is
    returned directly when available. Otherwise ``self.similarity_metric`` is evaluated between the
    query keywords and the keywords of every dataset element, and the largest value is returned.
    How the metric is called depends on its ``__name__``:

    * ``combined_cosine_similarity``: called with a combined keyword vector of query and dataset.
    * ``word2vec_cosine_similarity``: called once per query keyword with ``self.model``; the mean of
      those values is the element's value.
    * anything else: called with the two keyword lists only.

    :param query: The query
    :param dataset: The dataset
    :return: Maximum distance between the keywords (0 if no element exceeds 0)
    """
    log = logging.getLogger(__name__)
    log.debug(
        'finding maximum similarity for query {} and dataset {}'.format(query, dataset_comprehension(dataset)))

    if self.precalculated_keyword_similarity_dict is None:
        log.debug('No precalculated keyword-similarity dict found')
    else:
        cached_value = self.precalculated_keyword_similarity_dict.get(frozenset(dataset))
        if cached_value is not None:
            return cached_value
        log.debug(
            'could not find the maximum precalculated keyword similarity value in the given precalculated set. '
            'This suggests an erroneous or a wrong dict has been passed into the CostFunction.')

    metric_name = self.similarity_metric.__name__
    uses_combined_vector = metric_name == 'combined_cosine_similarity'
    uses_latent_factors = (not uses_combined_vector) and metric_name == 'word2vec_cosine_similarity'
    if uses_combined_vector:
        # Built once for the whole dataset and reused for every element.
        combined_keyword_vector: keyword_dataset_type = create_combined_keyword_vector(query, dataset)

    largest_value = 0
    for candidate in dataset:
        if uses_combined_vector:
            candidate_value = self.similarity_metric(query.keywords, candidate.keywords, combined_keyword_vector)
        elif uses_latent_factors:
            # Compare each query keyword on its own against the candidate, then average.
            per_keyword_values = [
                self.similarity_metric([query_keyword], candidate.keywords, self.model)
                for query_keyword in query.keywords
            ]
            candidate_value = mean(per_keyword_values)
        else:
            candidate_value = self.similarity_metric(query.keywords, candidate.keywords)
        if candidate_value > largest_value:
            largest_value = candidate_value
    return largest_value
def solve(self) -> solution_list:
    """
    Implements the solution algorithm.

    Every subset in ``self.list_of_subsets`` is scored with ``self.get_cost_for_subset`` against
    ``self.normalised_query``. The individual cost components are recorded in the instance's
    ``*_list`` attributes, the subsets are ordered by ascending cost, cut to ``self.result_length``
    and passed through ``denormalize_result_data``. The elapsed time is stored in ``self.total_time``.

    :return: A list with tuples. Every tuple contains a cost and the corresponding subset of KeywordCoordinates.
    """
    started_at = time.time()
    log = logging.getLogger(__name__)
    log.info(
        'solving for query {} and dataset {} using cost function {} and result length {}'.format(
            self.query, dataset_comprehension(self.data), self.cost_function, self.result_length))

    # Single-process evaluation. The original source carried a commented-out ProcessPoolExecutor
    # variant here, marked as not working on Windows 10.
    scored_subsets: solution_list = []
    for candidate_subset in self.list_of_subsets:
        cost, query_distance, inter_distance, semantic_distance = self.get_cost_for_subset(
            self.normalised_query, candidate_subset)
        self.query_distance_list.append(query_distance)
        self.inter_distance_list.append(inter_distance)
        self.semantic_distance_list.append(semantic_distance)
        self.cost_list.append(cost)
        scored_subsets.append((cost, candidate_subset))

    # Cheapest subsets first; only the Top-N are kept.
    best_subsets = sorted(scored_subsets, key=lambda scored: scored[0])[:self.result_length]
    denormalized_result_list = denormalize_result_data(best_subsets,
                                                       self.denormalize_max_x,
                                                       self.denormalize_min_x,
                                                       self.denormalize_max_y,
                                                       self.denormalize_min_y)
    log.info('solved for {} with length {}'.format(
        result_list_comprehension(denormalized_result_list), self.result_length))
    self.total_time = time.time() - started_at
    return denormalized_result_list
def normalize_data(
    query: KeywordCoordinate, dataset: dataset_type
) -> typing.Tuple[KeywordCoordinate, typing.List[KeywordCoordinate], float, float, float, float]:
    """
    Calculates the normalized query, dataset and parameters to undo this normalization.

    Query and dataset are deep-copied, so the passed objects are left untouched. The x and y coordinates
    of the copies are min-max scaled over the combined set (dataset plus query). If all x values (or all
    y values) are identical, that axis has no extent and its normalized coordinate is set to 0.0 instead
    of dividing by zero.

    :param query: The query
    :param dataset: The dataset
    :return: A tuple with: the normalized query, the normalized dataset, the denormalization parameter max_x,
    the denormalization parameter min_x, the denormalization parameter max_y and the denormalization parameter min_y,
    """
    logger = logging.getLogger(__name__ + '.normalize_data')
    logger.debug('calculation for query {} and dataset {}'.format(query, dataset_comprehension(dataset)))

    data = copy.deepcopy(dataset)
    # Change by Ramon (2020-09-03): the query is appended as a single KeywordCoordinate
    # (an earlier variant appended query[0]); it goes last so it can be split off again below.
    data.append(copy.deepcopy(query))

    list_of_x = [kwc.coordinates.x for kwc in data]
    list_of_y = [kwc.coordinates.y for kwc in data]
    min_x, max_x = min(list_of_x), max(list_of_x)
    min_y, max_y = min(list_of_y), max(list_of_y)
    range_x = max_x - min_x
    range_y = max_y - min_y

    for kwc in data:
        # A zero range means every point shares this coordinate; map it to 0.0 rather than
        # raising ZeroDivisionError.
        new_x = (kwc.coordinates.x - min_x) / range_x if range_x else 0.0
        new_y = (kwc.coordinates.y - min_y) / range_y if range_y else 0.0
        kwc.coordinates.x = new_x
        kwc.coordinates.y = new_y

    normalized_query = data[-1]
    normalized_dataset = data[:-1]
    logger.debug('calculated query {} and dataset {}'.format(
        normalized_query, dataset_comprehension(normalized_dataset)))
    return (normalized_query, normalized_dataset, max_x, min_x, max_y, min_y)
def find_subsets(input_set: dataset_type, subset_size: int):
    """
    Calculates all the subsets of an input dataset and a given size.

    When the requested size exceeds the number of input elements, subsets of size 0 are built
    instead, i.e. the result contains only the empty tuple.

    :param input_set: The input dataset
    :param subset_size: The subset size
    :return: A set of all the subsets (each subset is a tuple)
    """
    log = logging.getLogger(__name__ + '.find_subsets')
    log.debug('finding all subsets of length {} in set {}'.format(subset_size, dataset_comprehension(input_set)))
    # Oversized requests fall back to size 0.
    effective_size = 0 if subset_size > len(input_set) else subset_size
    subsets = set(itertools.combinations(input_set, effective_size))
    log.debug('found {}'.format(sets_of_set_comprehension(subsets)))
    return subsets
def create_combined_keyword_vector(query: KeywordCoordinate, dataset: dataset_type) -> keyword_dataset_type:
    """
    Creates a combined keyword vector of the query and the dataset. This vector contains all the keywords that
    appear in either the query or dataset.

    Duplicates are removed by passing the keywords through a set, so the order of the returned
    list is not defined.

    :param query: The query
    :param dataset: The dataset
    :return: A list with all the unique keywords in the query and dataset
    """
    log = logging.getLogger(__name__ + '.create_combined_keyword_vector')
    log.debug('calculating for query {} and dataset {}'.format(query, dataset_comprehension(dataset)))
    # Query keywords first, then the keywords of every dataset entry in order.
    all_keywords: keyword_dataset_type = list(query.keywords)
    all_keywords.extend(keyword for entry in dataset for keyword in entry.keywords)
    unique_keywords = list(set(all_keywords))
    return unique_keywords
def solve(self, query: KeywordCoordinate, dataset: dataset_type, denormalized_dataset: dataset_type = None):
    """
    Solves the Type4 cost function.

    The cost is the sum of three terms:

    * query term: ``(alpha * (sum of distance(query, element) ** phi_1) ** (1 / phi_1)) ** phi_2``
    * dataset term: ``(beta * maximum inter-dataset distance) ** phi_2``
    * keyword term: ``((omega * maximum keyword distance) ** phi_2) ** (1 / phi_2)``

    Threshold filtering is not applied here; it only existed as commented-out code in the original.

    :param query: The query
    :param dataset: The dataset
    :param denormalized_dataset: The denormalized dataset. This is used for the matching of precalculated values.
    :return: A tuple with the cost, the maximum query distance, the maximum inter-dataset distance and the
    maximum keyword distance for the given query and dataset
    """
    log = logging.getLogger(__name__)
    log.debug('solving for query {} and dataset {}'.format(query, dataset_comprehension(dataset)))

    # TODO does this type of threshold filtering make sense for the unified function?
    # The maximum query distance is only reported back; the cost uses the phi_1 aggregate below.
    query_distance = self.get_maximum_for_query(query, dataset)
    log.debug('solved query distance for {}'.format(query_distance))

    if denormalized_dataset is None:
        dataset_distance = self.get_maximum_for_dataset(dataset)
    else:
        dataset_distance = self.get_maximum_for_dataset(dataset, denormalized_dataset)
    log.debug('solved dataset distance for {}'.format(dataset_distance))

    keyword_similarity = self.get_maximum_keyword_distance(query, dataset)
    log.debug('solved keyword similarity for {}'.format(keyword_similarity))

    # Aggregate the query-to-element distances with exponent phi_1.
    powered_distance_sum: float = 0.0
    for element in dataset:
        powered_distance_sum += self.distance_metric(query.coordinates, element.coordinates)**self.phi_1
    aggregated_query_distance = powered_distance_sum**(1 / self.phi_1)

    query_term = (self.alpha * aggregated_query_distance)**self.phi_2
    dataset_term: float = (self.beta * dataset_distance)**self.phi_2
    # NOTE(review): the 1 / phi_2 root is applied to the keyword term only, not to the sum of the
    # three terms - confirm this is the intended formula.
    keyword_term: float = ((self.omega * keyword_similarity)**self.phi_2)**(1 / self.phi_2)

    solution = query_term + dataset_term + keyword_term
    log.debug('solved with a cost of {}'.format(solution))
    return solution, query_distance, dataset_distance, keyword_similarity
def solve(self, query: KeywordCoordinate, dataset: dataset_type, denormalized_dataset: dataset_type = None):
    """
    Solves the Type3 cost function.

    The cost is the weighted sum ``alpha * minimum query distance + beta * maximum inter-dataset
    distance + omega * maximum keyword distance``. Threshold filtering is not applied here; it only
    existed as commented-out code in the original.

    :param query: The query
    :param dataset: The dataset
    :param denormalized_dataset: The denormalized dataset. This is used for the matching of precalculated values.
    :return: A tuple with the cost, the minimum query distance, the maximum inter-dataset distance and the
    maximum keyword distance for the given query and dataset
    """
    log = logging.getLogger(__name__)
    log.debug('solving for query {} and dataset {}'.format(query, dataset_comprehension(dataset)))

    query_distance = self.get_minimum_for_query(query, dataset)
    log.debug('solved query distance for {}'.format(query_distance))

    if denormalized_dataset is None:
        dataset_distance = self.get_maximum_for_dataset(dataset)
    else:
        dataset_distance = self.get_maximum_for_dataset(dataset, denormalized_dataset)
    log.debug('solved dataset distance for {}'.format(dataset_distance))

    keyword_similarity = self.get_maximum_keyword_distance(query, dataset)
    log.debug('solved keyword similarity for {}'.format(keyword_similarity))

    weighted_query = self.alpha * query_distance
    weighted_dataset = self.beta * dataset_distance
    weighted_keywords = self.omega * keyword_similarity
    solution = weighted_query + weighted_dataset + weighted_keywords
    log.debug('solved with a cost of {}'.format(solution))
    return solution, query_distance, dataset_distance, keyword_similarity
def get_maximum_for_dataset(self, dataset: dataset_type, denormalized_dataset: dataset_type = None) -> float:
    """
    Calculates the maximum inter-dataset distance cost.

    The distance of every unordered pair of dataset elements is computed with ``self.distance_metric``
    and the largest one is returned. The lookup in a precalculated dict is disabled in this method
    (it only existed as commented-out code), so ``denormalized_dataset`` is currently not used.

    :param dataset: The dataset.
    :param denormalized_dataset: The denormalized dataset. Intended for the matching of precalculated values.
    :return: Maximum inter-dataset distance cost (0.0 if there is no pair with a larger distance).
    """
    log = logging.getLogger(__name__)
    log.debug('finding maximum distance for dataset {}'.format(dataset_comprehension(dataset)))

    largest_distance: float = 0.0
    element_count = len(dataset)
    for first_index in range(element_count):
        # Pair each element only with the ones after it so every pair is measured once.
        for second_index in range(first_index + 1, element_count):
            pair_distance = self.distance_metric(dataset[first_index].coordinates,
                                                 dataset[second_index].coordinates)
            if pair_distance > largest_distance:
                largest_distance = pair_distance
    log.debug('found maximum distance for dataset of {}'.format(largest_distance))
    return largest_distance
def generate_pickle(self,
                    data_size: int,
                    file_name: str,
                    file_allow_overwrite: bool = False,
                    file_only_overwrite_dot_pickle_files: bool = True,
                    pickle_protocol_version: int = 4) -> dataset_type:
    """
    Generates a new dataset, writes it as pickle and returns the generated dataset.

    The dataset comes from ``self.generate``; writing is delegated to ``write_pickle``, which
    receives the file arguments unchanged.

    :param data_size: The size of the dataset to generate
    :param file_name: The name of the file
    :param file_allow_overwrite: If files are allowed to be overwritten
    :param file_only_overwrite_dot_pickle_files: If the name of the file has to end with .pickle
    :param pickle_protocol_version: The protocol version of the pickle format
    :return: The generated data which has been written to disk
    """
    log = logging.getLogger(__name__)
    log.debug('generating dataset of size {}'.format(data_size))
    generated_data = self.generate(data_size)
    log.debug('generated dataset {}'.format(dataset_comprehension(generated_data)))
    write_pickle(generated_data,
                 file_name,
                 file_allow_overwrite,
                 file_only_overwrite_dot_pickle_files,
                 pickle_protocol_version)
    return generated_data
def __str__(self):
    """
    Builds a readable one-line summary of the object.

    :return: The class name followed by the query, the dataset (rendered by ``dataset_comprehension``),
    the cost function and the result length
    """
    template = '{}(query: {}, dataset: {}, cost function: {}, result length {})'
    class_name = type(self).__name__
    dataset_text = dataset_comprehension(self.data)
    return template.format(class_name, self.query, dataset_text, self.cost_function, self.result_length)
def __init__(self,
             query: KeywordCoordinate,
             data: dataset_type,
             cost_function: CostFunction,
             normalize: bool = True,
             result_length: int = 10,
             max_subset_size: int = math.inf,
             max_number_of_concurrent_processes: int = mp.cpu_count(),
             rebalance_subsets: bool = True,
             RADIUS: float = 2000.0,
             semantic_filtering: bool = True,
             _map: str = 'London',
             update_dataset: bool = True):
    """
    Constructs a new Solver object. The Solver class should never be directly instantiated. Instead use a class
    that inherits from the Solver class and implements the solve() method.

    Besides storing its arguments, the constructor initialises the statistics attributes (cost and
    distance lists, timings, subset counters), loads ``poi_queries_similarities.csv`` from the
    ``files`` directory two levels above this module and, if ``update_dataset`` is set, replaces the
    data with the result of ``self.get_promising_poi_data``.

    :param query: The query for which to solve for
    :param data: The data for which to solve for
    :param cost_function: The cost function to determine subset costs
    :param normalize: If the data should be normalized before being processed. The data will be denormalized before
    being returned.
    :param result_length: The size of the results (Top-N)
    :param max_subset_size: The maximum size of any subset used to calculate the solution
    :param max_number_of_concurrent_processes: Stored on the instance; defaults to the CPU count determined when
    the class is defined
    :param rebalance_subsets: If the passed subsets should be rearranged to better distribute the workload among
    the processes
    :param RADIUS: Stored as ``self.RADIUS`` (presumably the radius used for the radius filtering - confirm unit
    against the filtering code)
    :param semantic_filtering: Stored as ``self.semantic_filtering``
    :param _map: Stored as ``self.map``
    :param update_dataset: If the data should be replaced by ``self.get_promising_poi_data(...)``
    """
    self.query: KeywordCoordinate = query
    self.data: dataset_type = data
    self.cost_function: CostFunction = cost_function
    self.result_length = result_length
    self.normalize_data = normalize
    self.denormalize_max_x: float = 0.0
    self.denormalize_min_x: float = 0.0
    self.denormalize_max_y: float = 0.0
    self.denormalize_min_y: float = 0.0
    self.max_subset_size = max_subset_size
    self.max_number_of_concurrent_processes = max_number_of_concurrent_processes
    self.rebalance_subsets = rebalance_subsets
    self.RADIUS: float = RADIUS
    self.semantic_filtering = semantic_filtering
    self.SEMANTIC_THRESHOLD: float = 0.5
    self.map = _map

    # Number of subsets of exactly max_subset_size elements: n! / (k! * (n - k)!).
    # math.factorial rejects math.inf (the default) and negative values, so sizes outside
    # 0..len(data) are handled explicitly instead of crashing the constructor.
    # NOTE(review): 0.0 follows the "subsets of exactly this size" reading - confirm that this
    # is the statistic wanted for an unbounded max_subset_size.
    data_count = len(self.data)
    if 0 <= max_subset_size <= data_count:
        self.max_number_of_subsets = math.factorial(data_count) / (
            math.factorial(max_subset_size) * math.factorial(data_count - max_subset_size))
    else:
        self.max_number_of_subsets = 0.0

    self.number_of_subsets_after_radius_filtering: int = 0
    self.number_of_subsets_after_semantic_filtering: int = 0
    self.cost_list = []
    self.query_distance_list = []
    self.inter_distance_list = []
    self.semantic_distance_list = []
    self.time_4_radius_filtering: float = 0.0
    self.time_4_build_subsets: float = 0.0
    self.total_time: float = 0.0

    # The similarities file is located relative to this module, not to the working directory.
    similarities_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), '..', '..', 'files', 'poi_queries_similarities.csv')
    self.df_poi_queries_similarities = pd.read_csv(similarities_path, index_col='poi_name', encoding='utf-8')
    if update_dataset:
        self.data = self.get_promising_poi_data(self.data, self.df_poi_queries_similarities)

    logging.getLogger(__name__).debug(
        'created with query {}, data {}, cost function {}, normalization {} and result length {}'.format(
            self.query, dataset_comprehension(self.data), self.cost_function, self.normalize_data,
            self.result_length))