class Task6:
    def __init__(self):
        """
        Method Explanation:
            Initializes all the variables for the analysis task.
        """
        self.util = Util()
        self.data_extractor = DataExtractor()
        # location id <-> title mappings (ids are 1-based and stringly keyed
        # in location_id_to_title_map — see print_k_latent_semantics).
        self.location_id_to_title_map = self.data_extractor.location_mapping()
        self.location_title_to_id_map = self.data_extractor.location_title_to_id_mapping()
        self.location_list = list(self.location_title_to_id_map.values())  # List of location ids
        self.LOCATION_COUNT = len(self.location_list)  # constant
        # Count of unique terms seen so far; doubles as the index that the
        # next new term will receive in the global dictionary.
        self.global_term_dictionary_current_index = 0
        # term -> global index
        self.global_term_dictionary = dict()
        # global index -> term (reverse of the above)
        self.global_term_index_dictionary = dict()
        # location id -> {term: {"TF": int, "DF": int, "TFIDF": float}}
        self.location_dictionary = dict()
        # location-location similarity, filled by construct_similarity_matrix
        self.similarity_matrix = numpy.zeros(
            (self.LOCATION_COUNT, self.LOCATION_COUNT))

    def construct_vocabulary(self):
        """
        Method Explanation:
            . Constructs a global term vocabulary.
            . Constructs a location based term vocabulary.
        """
        file_path = constants.TEXT_DESCRIPTORS_DIR_PATH + "devset_textTermsPerPOI.txt"
        with open(file_path, encoding="utf-8") as f:
            lines = [line.rstrip("\n") for line in f]

        for line in lines:
            words = line.split()
            # The location title is every leading word before the first
            # quoted term; the on-disk title joins them with underscores.
            title_words = []
            while "\"" not in words[0]:
                title_words.append(words.pop(0))
            location_title = "_".join(title_words)
            location_id = self.location_title_to_id_map[location_title]
            location_terms = self.location_dictionary.setdefault(location_id, {})

            # The remaining tokens come in groups of four: "term" TF DF TFIDF.
            for index, word in enumerate(words):
                position = index % 4
                if position == 0:  # the term itself
                    current_word = word.strip('\"')
                    # NOTE: membership test, not truthy .get() — the term at
                    # index 0 would otherwise be re-indexed on every
                    # re-occurrence, corrupting the vocabulary.
                    if current_word not in self.global_term_dictionary:
                        self.global_term_dictionary[current_word] = \
                            self.global_term_dictionary_current_index
                        self.global_term_index_dictionary[
                            self.global_term_dictionary_current_index] = current_word
                        self.global_term_dictionary_current_index += 1
                    if current_word not in location_terms:
                        location_terms[current_word] = {"TF": 0, "DF": 0, "TFIDF": 0}
                elif position == 1:  # TF
                    location_terms[current_word]["TF"] = int(word)
                elif position == 2:  # DF
                    location_terms[current_word]["DF"] = int(word)
                else:  # position == 3: TFIDF
                    location_terms[current_word]["TFIDF"] = float(word)

    def construct_similarity_matrix(self, model):
        """
        Method Explanation:
            . Goes over every location as a potential query location, compares
              its textual descriptors with every other location as a potential
              target location.
            . The comparison is based on the Cosine Similarity scores of one of
              the model vectors (TF/DF/TFIDF) defined by the <model> parameter.
        Inputs:
            <model> - Has three possible values -- TF, DF, TFIDF. Corresponds
                to which model score to consider for computing the Cosine
                Similarity between the textual descriptors.
        """
        vocabulary_size = self.global_term_dictionary_current_index

        def build_model_vector(location_id):
            # <model> score of every vocabulary term for this location
            # (0 for terms the location does not contain).
            vector = [0] * vocabulary_size
            for term, scores in self.location_dictionary[location_id].items():
                vector[self.global_term_dictionary[term]] = scores[model]
            return vector

        # Build each location's vector exactly once. The previous version
        # rebuilt target vectors inside the query loop (O(L^2 * T)) and cached
        # them inside location_dictionary under the model-name key, which
        # polluted the term data and broke repeated calls with other models.
        model_vectors = {
            location_id: build_model_vector(location_id)
            for location_id in self.location_dictionary
        }

        for query_location_id in self.location_list:
            query_vector = model_vectors[query_location_id]
            for target_location_id, target_vector in model_vectors.items():
                if target_location_id == query_location_id:
                    # A location is maximally similar to itself.
                    similarity = 1
                else:
                    similarity = self.util.cosine_similarity(
                        query_vector, target_vector)
                # Location ids are 1-based; matrix indices are 0-based.
                self.similarity_matrix[query_location_id - 1][
                    target_location_id - 1] = similarity

    def print_k_latent_semantics(self, k):
        """
        Method Explanation:
            . Applies a Singular Valued Decomposition on the similarity matrix
              and prints the first k latent semantics determined by the k
              parameter.
            . The output is in the form of location-weight pairs for each
              semantic sorted in the decreasing order of weights.
        Input:
            . <k> for considering only the k latent semantics post SVD
        """
        U, S, Vt = numpy.linalg.svd(self.similarity_matrix)
        # Project the similarity matrix onto the first k left singular
        # vectors; each row of the transpose is one latent semantic.
        concept_mapping = self.similarity_matrix.dot(U[:, :k]).transpose()

        # { <semantic number>: [{"Location Name": <>, "Weight": <>}, ...], ... }
        semantic_data_dict = {}
        print("")
        for arr_index, arr in enumerate(concept_mapping):
            current_key = arr_index + 1
            # Sort the latent semantic based on the weight of the feature.
            semantic_data_dict[current_key] = sorted(
                ({
                    "Location Name": self.location_id_to_title_map[str(index + 1)],
                    "Weight": element,
                } for index, element in enumerate(arr)),
                key=itemgetter("Weight"),
                reverse=True)

            # Print location name-weight pairs in decreasing order of weights.
            print("Latent Semantic: ", current_key)
            for data in semantic_data_dict[current_key]:
                print("\tLocation Name: ", data["Location Name"],
                      " | Weight: ", data["Weight"])
            print("")

    def runner(self):
        """
        Method Explanation:
            Entry point: reads k, builds the vocabulary and the TFIDF-based
            similarity matrix, then prints the top k latent semantics.
        """
        k = int(input("Enter the k value: "))
        the_model = "TFIDF"
        self.construct_vocabulary()
        self.construct_similarity_matrix(the_model)
        self.print_k_latent_semantics(k)
def runner(self): k = input('Enter the k value: ') k = int(k) util = Util() data_extractor = DataExtractor() location_id_to_title_map = data_extractor.location_mapping() location_title_to_id_map = data_extractor.location_title_to_id_mapping( ) location_list = list(location_id_to_title_map.values()) LOCATION_COUNT = len(location_list) # constant MODEL_COUNT = len(constants.MODELS) MAX_SCORE = (LOCATION_COUNT - 1) * MODEL_COUNT FILE_PATH_PREFIX = constants.PROCESSED_VISUAL_DESCRIPTORS_DIR_PATH # '../dataset/visual_descriptors/processed/' # constant # { # 1: {'CM': [{'location_id': 1, 'distance': 0}, {'location_id':2, 'distance': 0.45}, ...], 'CN': [...], ... }, # 2: {'CM': [...], 'CN': [...], ...}, # ... , # <query_location>: { # <model>: [{'location_id': <location_id>, 'distance': <distance>}, {'location_id': <location_id>, 'distance': <distance>}], # <model>: [...], # ... # } # } global_location_distance_data_dict = {} # { # 1: {1: 0, 2: 0.54, 3: 0.43, ...}, # 2: { 1: 0.45, 2: 0, ...}, # ... , # <query_location>: { <target_location>: <distance>, <target_location>: <distance>, ...} # } location_wise_distance_data_dict = {} similarity_matrix = numpy.zeros((LOCATION_COUNT, LOCATION_COUNT)) print('Starting...') # Go over every location as a potential query location for query_location in location_list: query_location_files = data_extractor.get_all_files_prefixed_with( query_location) query_location_id = location_title_to_id_map[query_location] if not global_location_distance_data_dict.get(query_location_id): global_location_distance_data_dict[query_location_id] = {} if not location_wise_distance_data_dict.get(query_location_id): location_wise_distance_data_dict[query_location_id] = {} print('Query Location: ', query_location) # Go over every model file in the query location for query_model_file in query_location_files: query_model_name_with_csv = query_model_file.split(" ")[ 1] # CM.csv, CN.csv, <modelName>.csv, ... 
query_model = query_model_name_with_csv.split(".")[ 0] # CM, CN, CN3x3, <modelName>, ... query_file_path = FILE_PATH_PREFIX + query_model_file query_model_df = pd.read_csv(query_file_path, header=None) del query_model_df[0] query_model_df = query_model_df.reset_index(drop=True) query_model_df_row_count = query_model_df.shape[0] if not global_location_distance_data_dict.get( query_location_id).get(query_model): global_location_distance_data_dict[query_location_id][ query_model] = [] print('\tQuery Model: ', query_model) # Go over every location as a potential target location for which we will compute the distance to from the query location for target_location in location_list: target_location_id = location_title_to_id_map[ target_location] # If query location == target location, distance = 0 if query_location == target_location: distance = 0 global_location_distance_data_dict[query_location_id][ query_model].append({ 'location_id': target_location_id, 'distance': 0 }) else: # Find the corresponding model file of the query location in the target location target_model_file_path = FILE_PATH_PREFIX + target_location + " " + query_model + ".csv" target_model_df = pd.read_csv(target_model_file_path, header=None) target_model_df_copy = target_model_df.copy() del target_model_df[0] target_model_df = target_model_df.reset_index( drop=True) target_model_df_row_count = target_model_df.shape[0] target_model_df_column_count = target_model_df.shape[1] # Calculate the distance between the query location's model file and the target location's corresponding model file distance = self.get_the_distance_value( query_model_df, target_model_df, query_model_df_row_count, query_model, util) global_location_distance_data_dict[query_location_id][ query_model].append({ 'location_id': target_location_id, 'distance': distance }) # Set distance temporarily as 0 in the location_wise_distance_data_dict for this location if not location_wise_distance_data_dict.get( 
query_location_id).get(target_location_id): location_wise_distance_data_dict[query_location_id][ target_location_id] = 0 # At this state, we have gone over every target location with the corresponding model file from the query location. # Sort the model based location list of distances based on distance from the location sorted_list = sorted( global_location_distance_data_dict[query_location_id] [query_model], key=lambda k: k['distance']) global_location_distance_data_dict[query_location_id][ query_model].clear() global_location_distance_data_dict[query_location_id][ query_model] = sorted_list # Repeat the loop, do it for every model file of the query location location_data_dict = global_location_distance_data_dict[ query_location_id] # Compute the ranking of similar locations for the query location for curr_model, distance_list in location_data_dict.items(): for index, curr_location_distance_data in enumerate( distance_list): curr_location_id = curr_location_distance_data[ 'location_id'] curr_val = location_wise_distance_data_dict[ query_location_id][curr_location_id] location_wise_distance_data_dict[query_location_id][ curr_location_id] = curr_val + index for l_id, dist in location_wise_distance_data_dict[ query_location_id].items(): similarity_matrix[query_location_id - 1][l_id - 1] = dist # Add this to similarity matrix print(similarity_matrix) # Generate CSVs of the current similarity matrix (given by distances derived from the ranks of individual models) # df = pd.DataFrame(similarity_matrix) # loc_list = [] # for i in range(1,31): # loc_list.append(location_id_to_title_map[str(i)]) # # Generate the distance datrix as CSV # df.to_csv('./generated_data/distance_matrix_vd_minmax.csv', encoding='utf-8', header=None, index=False) # df.to_csv('./generated_data/distance_matrix_vd_minmax_descriptive.csv', encoding='utf-8', header=loc_list, index=loc_list) # Convert distance score to similarity score converted_similarity_matrix = similarity_matrix for row in 
range(len(converted_similarity_matrix)): for col in range(len(converted_similarity_matrix[0])): # In the dev set case, it scales distance score that ranges from 0-290 in the computation to a similarity score ranging from 0-1 converted_similarity_matrix[row][col] = ( (float)(MAX_SCORE - converted_similarity_matrix[row][col]) / MAX_SCORE) # Generate the similarity matrix as CSV if needed # df = pd.DataFrame(converted_similarity_matrix) # df.to_csv('./generated_data/similarity_matrix_vd_minmax.csv', encoding='utf-8', header=None, index=False) # df.to_csv('./generated_data/similarity_matrix_vd_descriptive.csv', encoding='utf-8') # Apply SVD on the data U, S, Vt = numpy.linalg.svd(converted_similarity_matrix) # { # <location_id>: [{'Location Name': <>, 'Weight': <>}, {'Location Name': <>, 'Weight': <>}, ...], # <location_id>: [{'Location Name': <>, 'Weight': <>}, {'Location Name': <>, 'Weight': <>}, ...], # ... # } semantic_data_dict = {} for arr_index, arr in enumerate(Vt[:k, :]): if not semantic_data_dict.get(arr_index + 1): semantic_data_dict[arr_index + 1] = [] for index, element in enumerate(arr): semantic_data_dict[arr_index + 1].append({ 'Location Name': location_id_to_title_map[str(index + 1)], 'Weight': element }) # Sort the list based on the weight attribute sorted_list = sorted(semantic_data_dict[arr_index + 1], key=itemgetter('Weight'), reverse=True) semantic_data_dict[arr_index + 1].clear() semantic_data_dict[arr_index + 1] = sorted_list # Print the latent semantic as location name-weight pairs sorted in decreasing order of weights print('Latent Semantic: ', arr_index + 1) for idx, data in enumerate(sorted_list): print('\tLocation Name: ', semantic_data_dict[arr_index + 1][idx]['Location Name'], '| Weight: ', semantic_data_dict[arr_index + 1][idx]['Weight'])