class Task6:
    def __init__(self):
        """
        Method Explanation:
            Initializes all the variables for the analysis task.
        """
        self.util = Util()
        self.data_extractor = DataExtractor()
        # location id <-> title mappings (ids are 1-based and stringly keyed
        # in location_id_to_title_map — see print_k_latent_semantics).
        self.location_id_to_title_map = self.data_extractor.location_mapping()
        self.location_title_to_id_map = self.data_extractor.location_title_to_id_mapping()
        self.location_list = list(self.location_title_to_id_map.values())  # List of location ids
        self.LOCATION_COUNT = len(self.location_list)  # constant
        # Count of unique terms seen so far; doubles as the index that the
        # next new term will receive in the global dictionary.
        self.global_term_dictionary_current_index = 0
        # term -> global index
        self.global_term_dictionary = dict()
        # global index -> term (reverse of the above)
        self.global_term_index_dictionary = dict()
        # location id -> {term: {"TF": int, "DF": int, "TFIDF": float}}
        self.location_dictionary = dict()
        # location-location similarity, filled by construct_similarity_matrix
        self.similarity_matrix = numpy.zeros(
            (self.LOCATION_COUNT, self.LOCATION_COUNT))

    def construct_vocabulary(self):
        """
        Method Explanation:
            . Constructs a global term vocabulary.
            . Constructs a location based term vocabulary.
        """
        file_path = constants.TEXT_DESCRIPTORS_DIR_PATH + "devset_textTermsPerPOI.txt"
        with open(file_path, encoding="utf-8") as f:
            lines = [line.rstrip("\n") for line in f]

        for line in lines:
            words = line.split()
            # The location title is every leading word before the first
            # quoted term; the on-disk title joins them with underscores.
            title_words = []
            while "\"" not in words[0]:
                title_words.append(words.pop(0))
            location_title = "_".join(title_words)
            location_id = self.location_title_to_id_map[location_title]
            location_terms = self.location_dictionary.setdefault(location_id, {})

            # The remaining tokens come in groups of four: "term" TF DF TFIDF.
            for index, word in enumerate(words):
                position = index % 4
                if position == 0:  # the term itself
                    current_word = word.strip('\"')
                    # NOTE: membership test, not truthy .get() — the term at
                    # index 0 would otherwise be re-indexed on every
                    # re-occurrence, corrupting the vocabulary.
                    if current_word not in self.global_term_dictionary:
                        self.global_term_dictionary[current_word] = \
                            self.global_term_dictionary_current_index
                        self.global_term_index_dictionary[
                            self.global_term_dictionary_current_index] = current_word
                        self.global_term_dictionary_current_index += 1
                    if current_word not in location_terms:
                        location_terms[current_word] = {"TF": 0, "DF": 0, "TFIDF": 0}
                elif position == 1:  # TF
                    location_terms[current_word]["TF"] = int(word)
                elif position == 2:  # DF
                    location_terms[current_word]["DF"] = int(word)
                else:  # position == 3: TFIDF
                    location_terms[current_word]["TFIDF"] = float(word)

    def construct_similarity_matrix(self, model):
        """
        Method Explanation:
            . Goes over every location as a potential query location, compares
              its textual descriptors with every other location as a potential
              target location.
            . The comparison is based on the Cosine Similarity scores of one of
              the model vectors (TF/DF/TFIDF) defined by the <model> parameter.
        Inputs:
            <model> - Has three possible values -- TF, DF, TFIDF. Corresponds
                to which model score to consider for computing the Cosine
                Similarity between the textual descriptors.
        """
        vocabulary_size = self.global_term_dictionary_current_index

        def build_model_vector(location_id):
            # <model> score of every vocabulary term for this location
            # (0 for terms the location does not contain).
            vector = [0] * vocabulary_size
            for term, scores in self.location_dictionary[location_id].items():
                vector[self.global_term_dictionary[term]] = scores[model]
            return vector

        # Build each location's vector exactly once. The previous version
        # rebuilt target vectors inside the query loop (O(L^2 * T)) and cached
        # them inside location_dictionary under the model-name key, which
        # polluted the term data and broke repeated calls with other models.
        model_vectors = {
            location_id: build_model_vector(location_id)
            for location_id in self.location_dictionary
        }

        for query_location_id in self.location_list:
            query_vector = model_vectors[query_location_id]
            for target_location_id, target_vector in model_vectors.items():
                if target_location_id == query_location_id:
                    # A location is maximally similar to itself.
                    similarity = 1
                else:
                    similarity = self.util.cosine_similarity(
                        query_vector, target_vector)
                # Location ids are 1-based; matrix indices are 0-based.
                self.similarity_matrix[query_location_id - 1][
                    target_location_id - 1] = similarity

    def print_k_latent_semantics(self, k):
        """
        Method Explanation:
            . Applies a Singular Valued Decomposition on the similarity matrix
              and prints the first k latent semantics determined by the k
              parameter.
            . The output is in the form of location-weight pairs for each
              semantic sorted in the decreasing order of weights.
        Input:
            . <k> for considering only the k latent semantics post SVD
        """
        U, S, Vt = numpy.linalg.svd(self.similarity_matrix)
        # Project the similarity matrix onto the first k left singular
        # vectors; each row of the transpose is one latent semantic.
        concept_mapping = self.similarity_matrix.dot(U[:, :k]).transpose()

        # { <semantic number>: [{"Location Name": <>, "Weight": <>}, ...], ... }
        semantic_data_dict = {}
        print("")
        for arr_index, arr in enumerate(concept_mapping):
            current_key = arr_index + 1
            # Sort the latent semantic based on the weight of the feature.
            semantic_data_dict[current_key] = sorted(
                ({
                    "Location Name": self.location_id_to_title_map[str(index + 1)],
                    "Weight": element,
                } for index, element in enumerate(arr)),
                key=itemgetter("Weight"),
                reverse=True)

            # Print location name-weight pairs in decreasing order of weights.
            print("Latent Semantic: ", current_key)
            for data in semantic_data_dict[current_key]:
                print("\tLocation Name: ", data["Location Name"],
                      " | Weight: ", data["Weight"])
            print("")

    def runner(self):
        """
        Method Explanation:
            Entry point: reads k, builds the vocabulary and the TFIDF-based
            similarity matrix, then prints the top k latent semantics.
        """
        k = int(input("Enter the k value: "))
        the_model = "TFIDF"
        self.construct_vocabulary()
        self.construct_similarity_matrix(the_model)
        self.print_k_latent_semantics(k)
def runner(self): k = input('Enter the k value: ') k = int(k) util = Util() data_extractor = DataExtractor() location_id_to_title_map = data_extractor.location_mapping() location_title_to_id_map = data_extractor.location_title_to_id_mapping( ) location_list = list(location_id_to_title_map.values()) LOCATION_COUNT = len(location_list) # constant MODEL_COUNT = len(constants.MODELS) MAX_SCORE = (LOCATION_COUNT - 1) * MODEL_COUNT FILE_PATH_PREFIX = constants.PROCESSED_VISUAL_DESCRIPTORS_DIR_PATH # '../dataset/visual_descriptors/processed/' # constant # { # 1: {'CM': [{'location_id': 1, 'distance': 0}, {'location_id':2, 'distance': 0.45}, ...], 'CN': [...], ... }, # 2: {'CM': [...], 'CN': [...], ...}, # ... , # <query_location>: { # <model>: [{'location_id': <location_id>, 'distance': <distance>}, {'location_id': <location_id>, 'distance': <distance>}], # <model>: [...], # ... # } # } global_location_distance_data_dict = {} # { # 1: {1: 0, 2: 0.54, 3: 0.43, ...}, # 2: { 1: 0.45, 2: 0, ...}, # ... , # <query_location>: { <target_location>: <distance>, <target_location>: <distance>, ...} # } location_wise_distance_data_dict = {} similarity_matrix = numpy.zeros((LOCATION_COUNT, LOCATION_COUNT)) print('Starting...') # Go over every location as a potential query location for query_location in location_list: query_location_files = data_extractor.get_all_files_prefixed_with( query_location) query_location_id = location_title_to_id_map[query_location] if not global_location_distance_data_dict.get(query_location_id): global_location_distance_data_dict[query_location_id] = {} if not location_wise_distance_data_dict.get(query_location_id): location_wise_distance_data_dict[query_location_id] = {} print('Query Location: ', query_location) # Go over every model file in the query location for query_model_file in query_location_files: query_model_name_with_csv = query_model_file.split(" ")[ 1] # CM.csv, CN.csv, <modelName>.csv, ... 
query_model = query_model_name_with_csv.split(".")[ 0] # CM, CN, CN3x3, <modelName>, ... query_file_path = FILE_PATH_PREFIX + query_model_file query_model_df = pd.read_csv(query_file_path, header=None) del query_model_df[0] query_model_df = query_model_df.reset_index(drop=True) query_model_df_row_count = query_model_df.shape[0] if not global_location_distance_data_dict.get( query_location_id).get(query_model): global_location_distance_data_dict[query_location_id][ query_model] = [] print('\tQuery Model: ', query_model) # Go over every location as a potential target location for which we will compute the distance to from the query location for target_location in location_list: target_location_id = location_title_to_id_map[ target_location] # If query location == target location, distance = 0 if query_location == target_location: distance = 0 global_location_distance_data_dict[query_location_id][ query_model].append({ 'location_id': target_location_id, 'distance': 0 }) else: # Find the corresponding model file of the query location in the target location target_model_file_path = FILE_PATH_PREFIX + target_location + " " + query_model + ".csv" target_model_df = pd.read_csv(target_model_file_path, header=None) target_model_df_copy = target_model_df.copy() del target_model_df[0] target_model_df = target_model_df.reset_index( drop=True) target_model_df_row_count = target_model_df.shape[0] target_model_df_column_count = target_model_df.shape[1] # Calculate the distance between the query location's model file and the target location's corresponding model file distance = self.get_the_distance_value( query_model_df, target_model_df, query_model_df_row_count, query_model, util) global_location_distance_data_dict[query_location_id][ query_model].append({ 'location_id': target_location_id, 'distance': distance }) # Set distance temporarily as 0 in the location_wise_distance_data_dict for this location if not location_wise_distance_data_dict.get( 
query_location_id).get(target_location_id): location_wise_distance_data_dict[query_location_id][ target_location_id] = 0 # At this state, we have gone over every target location with the corresponding model file from the query location. # Sort the model based location list of distances based on distance from the location sorted_list = sorted( global_location_distance_data_dict[query_location_id] [query_model], key=lambda k: k['distance']) global_location_distance_data_dict[query_location_id][ query_model].clear() global_location_distance_data_dict[query_location_id][ query_model] = sorted_list # Repeat the loop, do it for every model file of the query location location_data_dict = global_location_distance_data_dict[ query_location_id] # Compute the ranking of similar locations for the query location for curr_model, distance_list in location_data_dict.items(): for index, curr_location_distance_data in enumerate( distance_list): curr_location_id = curr_location_distance_data[ 'location_id'] curr_val = location_wise_distance_data_dict[ query_location_id][curr_location_id] location_wise_distance_data_dict[query_location_id][ curr_location_id] = curr_val + index for l_id, dist in location_wise_distance_data_dict[ query_location_id].items(): similarity_matrix[query_location_id - 1][l_id - 1] = dist # Add this to similarity matrix print(similarity_matrix) # Generate CSVs of the current similarity matrix (given by distances derived from the ranks of individual models) # df = pd.DataFrame(similarity_matrix) # loc_list = [] # for i in range(1,31): # loc_list.append(location_id_to_title_map[str(i)]) # # Generate the distance datrix as CSV # df.to_csv('./generated_data/distance_matrix_vd_minmax.csv', encoding='utf-8', header=None, index=False) # df.to_csv('./generated_data/distance_matrix_vd_minmax_descriptive.csv', encoding='utf-8', header=loc_list, index=loc_list) # Convert distance score to similarity score converted_similarity_matrix = similarity_matrix for row in 
range(len(converted_similarity_matrix)): for col in range(len(converted_similarity_matrix[0])): # In the dev set case, it scales distance score that ranges from 0-290 in the computation to a similarity score ranging from 0-1 converted_similarity_matrix[row][col] = ( (float)(MAX_SCORE - converted_similarity_matrix[row][col]) / MAX_SCORE) # Generate the similarity matrix as CSV if needed # df = pd.DataFrame(converted_similarity_matrix) # df.to_csv('./generated_data/similarity_matrix_vd_minmax.csv', encoding='utf-8', header=None, index=False) # df.to_csv('./generated_data/similarity_matrix_vd_descriptive.csv', encoding='utf-8') # Apply SVD on the data U, S, Vt = numpy.linalg.svd(converted_similarity_matrix) # { # <location_id>: [{'Location Name': <>, 'Weight': <>}, {'Location Name': <>, 'Weight': <>}, ...], # <location_id>: [{'Location Name': <>, 'Weight': <>}, {'Location Name': <>, 'Weight': <>}, ...], # ... # } semantic_data_dict = {} for arr_index, arr in enumerate(Vt[:k, :]): if not semantic_data_dict.get(arr_index + 1): semantic_data_dict[arr_index + 1] = [] for index, element in enumerate(arr): semantic_data_dict[arr_index + 1].append({ 'Location Name': location_id_to_title_map[str(index + 1)], 'Weight': element }) # Sort the list based on the weight attribute sorted_list = sorted(semantic_data_dict[arr_index + 1], key=itemgetter('Weight'), reverse=True) semantic_data_dict[arr_index + 1].clear() semantic_data_dict[arr_index + 1] = sorted_list # Print the latent semantic as location name-weight pairs sorted in decreasing order of weights print('Latent Semantic: ', arr_index + 1) for idx, data in enumerate(sorted_list): print('\tLocation Name: ', semantic_data_dict[arr_index + 1][idx]['Location Name'], '| Weight: ', semantic_data_dict[arr_index + 1][idx]['Weight'])