def test_precalculated_max_inter_dataset(self):
    """Injecting precalculated max inter-dataset distances into the cost
    function must not change the solver's results."""
    query = KeywordCoordinate(0, 0, ['family', 'food', 'outdoor'])
    kwc1 = KeywordCoordinate(1, 1, ['family', 'food', 'outdoor'])
    kwc2 = KeywordCoordinate(3, 3, ['food', 'family'])
    kwc3 = KeywordCoordinate(2, 2, ['outdoor'])
    data = [kwc1, kwc2, kwc3]
    cf = Type1(euclidean_distance, combined_cosine_similarity, 0.3, 0.3, 0.4,
               disable_thresholds=True)
    ns = NaiveSolver(query, data, cf)
    pre_id = ns.get_max_inter_dataset_distance()
    result = ns.solve()
    cf.precalculated_inter_dataset_dict = pre_id
    result_pre = ns.solve()
    # Compare costs and returned KeywordCoordinates pairwise; zip() replaces
    # the index-based loops and the extra length check keeps zip() honest.
    self.assertEqual(len(result), len(result_pre))
    for (cost, kwcs), (cost_pre, kwcs_pre) in zip(result, result_pre):
        self.assertAlmostEqual(cost, cost_pre, delta=0.01)
        for kwc, kwc_pre in zip(list(kwcs), list(kwcs_pre)):
            self.assertAlmostEqual(kwc.coordinates.x, kwc_pre.coordinates.x)
            self.assertAlmostEqual(kwc.coordinates.y, kwc_pre.coordinates.y)
            self.assertListEqual(kwc.keywords, kwc_pre.keywords)
def test_complex_precalculations(self):
    """Injecting all three precalculated lookup tables (query-dataset,
    inter-dataset, keyword similarity) must not change the results."""
    query = KeywordCoordinate(5, 6, ['culture'])
    kwc1 = KeywordCoordinate(2, 1, ['family', 'rest', 'indoor'])
    kwc2 = KeywordCoordinate(0, 2, ['science', 'culture', 'history'])
    kwc3 = KeywordCoordinate(0, 0, ['food', 'outdoor', 'sports'])
    data = [kwc1, kwc2, kwc3]
    cf = Type1(euclidean_distance, combined_cosine_similarity, 0.3, 0.3, 0.4,
               disable_thresholds=True)
    ns = NaiveSolver(query, data, cf, result_length=100)
    result = ns.solve()
    # Precalculated tables are fetched after the first solve, then injected.
    cf.precalculated_query_dataset_dict = ns.get_query_dataset_distance()
    cf.precalculated_inter_dataset_dict = ns.get_inter_dataset_distance()
    cf.precalculated_keyword_similarity_dict = ns.get_keyword_similarity()
    result_pre = ns.solve()
    # zip() replaces the index-based loops; the length check keeps it honest.
    self.assertEqual(len(result), len(result_pre))
    for (cost, kwcs), (cost_pre, kwcs_pre) in zip(result, result_pre):
        self.assertAlmostEqual(cost, cost_pre, delta=0.01)
        for kwc, kwc_pre in zip(list(kwcs), list(kwcs_pre)):
            self.assertAlmostEqual(kwc.coordinates.x, kwc_pre.coordinates.x)
            self.assertAlmostEqual(kwc.coordinates.y, kwc_pre.coordinates.y)
            self.assertListEqual(kwc.keywords, kwc_pre.keywords)
def test_get_min_inter_dataset_distance(self):
    """One minimum-distance entry must exist for every non-empty subset."""
    query = KeywordCoordinate(0, 0, ['family', 'food', 'outdoor'])
    kwc1 = KeywordCoordinate(1, 1, ['family', 'food', 'outdoor'])
    kwc2 = KeywordCoordinate(2, 2, ['food'])
    kwc3 = KeywordCoordinate(3, 3, ['outdoor'])
    data = [kwc1, kwc2, kwc3]
    cf = CostFunction(euclidean_distance, combined_cosine_similarity, 0.3, 0.3, 0.4)
    so = Solver(query, data, cf, normalize=False)
    # Expected minimum inter-dataset distance per subset.
    expected = {
        frozenset([kwc1]): 0.0,
        frozenset([kwc2]): 0.0,
        frozenset([kwc3]): 0.0,
        frozenset([kwc1, kwc2]): 1.41,
        frozenset([kwc1, kwc3]): 2.83,
        frozenset([kwc2, kwc3]): 1.41,
        frozenset([kwc1, kwc2, kwc3]): 1.41,
    }
    result = so.get_min_inter_dataset_distance()
    self.assertEqual(len(result), 7)
    for subset, distance in expected.items():
        self.assertAlmostEqual(result.get(subset), distance, delta=0.01)
def test_threshold6(self):
    """With the last threshold set to 0.4, Type3 must report infinite cost."""
    cost_function = Type3(euclidean_distance, separated_cosine_similarity,
                          0.25, 0.25, 0.5, math.inf, math.inf, 0.4)
    query = KeywordCoordinate(0, 0, ['keyword1', 'keyword2', 'keyword3'])
    data = [
        KeywordCoordinate(0, 0, ['keyword1']),
        KeywordCoordinate(0, 0, ['keyword2']),
    ]
    self.assertAlmostEqual(cost_function.solve(query, data), math.inf, delta=0.01)
def test_find_subsets4(self):
    """A 4-element superset has exactly one subset of size 4 (itself)."""
    superset = [KeywordCoordinate(i, i, [str(i)]) for i in range(4)]
    subsets = mt.find_subsets(superset, 4)
    self.assertEqual(len(subsets), 1)
    for subset in subsets:
        self.assertEqual(len(subset), 4)
def test_threshold4(self):
    """One tight threshold (0.1) must force an infinite Type1 cost here."""
    cost_function = Type1(euclidean_distance, separated_cosine_similarity,
                          0.0, 0.3, 0.7, math.inf, 0.1, math.inf)
    query = KeywordCoordinate(0, 0, ['keyword1', 'keyword2', 'keyword3'])
    data = [
        KeywordCoordinate(0.1, 0.1, ['keyword1', 'keyword2', 'keyword3']),
        KeywordCoordinate(0.2, 0.2, ['keyword1', 'keyword2', 'keyword3']),
    ]
    self.assertAlmostEqual(cost_function.solve(query, data), math.inf, delta=0.01)
def test_create_combined_keyword_vector1(self):
    """The combined vector must contain only keywords from the query and the
    dataset, deduplicated to 4 entries here."""
    kwv1 = ['kw1', 'kw2', 'kw3']
    kwv2 = ['kw4', 'kw2', 'kw3']
    kwc1 = KeywordCoordinate(0, 0, kwv1)
    kwc2 = KeywordCoordinate(0, 0, kwv2)
    kwc3 = KeywordCoordinate(0, 0, kwv2)
    kwc_list = [kwc2, kwc3]
    result = mt.create_combined_keyword_vector(kwc1, kwc_list)
    combined_list = kwv1 + kwv2
    for element in result:
        # assertIn gives a clearer failure message than assertTrue(x in y).
        self.assertIn(element, combined_list)
    self.assertEqual(len(result), 4)
def test_denormalize(self):
    """Denormalizing [0, 1] coordinates restores the original ranges."""
    any_cost = 0.0  # the cost value is irrelevant for denormalization
    pois = [
        KeywordCoordinate(0.0, 0.0, ['family']),
        KeywordCoordinate(1.0, 0.4, ['food']),
        KeywordCoordinate(0.33, 1.0, ['outdoor']),
    ]
    result = mt.denormalize_result_data([(any_cost, pois)], 3, 0, 5, 0)
    expected = [(0.0, 0.0), (3.0, 2.0), (1.0, 5.0)]
    for poi, (exp_x, exp_y) in zip(result[0][1], expected):
        self.assertAlmostEqual(poi.coordinates.x, exp_x, delta=0.02)
        self.assertAlmostEqual(poi.coordinates.y, exp_y, delta=0.02)
def test_solve(self):
    """The naive solver's single best result for this setup costs ~0.42."""
    query = KeywordCoordinate(0, 0, ['family', 'food', 'outdoor'])
    data = [
        KeywordCoordinate(1, 1, ['family', 'food', 'outdoor']),
        KeywordCoordinate(3, 3, ['food']),
        KeywordCoordinate(2, 2, ['outdoor']),
    ]
    cost_function = Type1(euclidean_distance, separated_cosine_similarity,
                          0.3, 0.3, 0.4, disable_thresholds=True)
    solver = NaiveSolver(query, data, cost_function, normalize=False, result_length=1)
    best_cost = solver.solve()[0][0]
    self.assertAlmostEqual(best_cost, 0.42, delta=0.01)
def generate(self, data_size: int) -> dataset_type:
    """
    Generates a dataset with a given size.

    :param data_size: The size of the dataset
    :return: The dataset
    """
    logger = logging.getLogger(__name__)
    # Lazy %-style args avoid formatting when debug logging is disabled.
    logger.debug('generating dataset of size %s', data_size)
    dataset: dataset_type = []
    for _ in range(data_size):
        current_x = random.randint(self.physical_min_x, self.physical_max_x)
        current_y = random.randint(self.physical_min_y, self.physical_max_y)
        number_of_keywords = random.randint(self.keywords_min, self.keywords_max)
        # Sample keywords without replacement; capping at the pool size
        # replaces the old copy/choice/remove loop that relied on an
        # IndexError to stop when the pool ran out.
        sample_size = min(number_of_keywords, len(self.possible_keywords))
        current_keywords: keyword_dataset_type = random.sample(
            self.possible_keywords, sample_size)
        dataset.append(KeywordCoordinate(current_x, current_y, current_keywords))
    logger.debug('generated dataset %s', dataset_comprehension(dataset))
    return dataset
def test_instantiation(self):
    """A KeywordCoordinate exposes the coordinates and keywords it was built with."""
    keywords = ['keyword 1', 'kw2', '3']
    kwc = KeywordCoordinate(3, 8, keywords)
    self.assertEqual(kwc.coordinates.x, 3)
    self.assertEqual(kwc.coordinates.y, 8)
    self.assertEqual(kwc.keywords, keywords)
def test_instantiation(self):
    # The Solver must store the query, data and cost function exactly as
    # given, preserving dataset order and the individual keyword lists.
    query_keywords = ['family', 'food', 'outdoor']
    kwc1_keywords = ['family', 'food', 'outdoor']
    kwc2_keywords = ['food']
    kwc3_keywords = ['outdoor']
    query = KeywordCoordinate(0, 0, query_keywords)
    kwc1 = KeywordCoordinate(1, 1, kwc1_keywords)
    kwc2 = KeywordCoordinate(2, 2, kwc2_keywords)
    kwc3 = KeywordCoordinate(3, 3, kwc3_keywords)
    data = [kwc1, kwc2, kwc3]
    cf = CostFunction(euclidean_distance, separated_cosine_similarity, 0.3, 0.3, 0.4)
    so = Solver(query, data, cf, normalize=False, result_length=10)
    self.assertAlmostEqual(so.query.coordinates.x, 0, delta=0.01)
    self.assertAlmostEqual(so.query.coordinates.y, 0, delta=0.01)
    self.assertListEqual(so.data, data)
    self.assertAlmostEqual(so.data[0].coordinates.x, 1, delta=0.01)
    self.assertAlmostEqual(so.data[0].coordinates.y, 1, delta=0.01)
    self.assertListEqual(so.data[0].keywords, kwc1_keywords)
    for index in range(len(so.data[0].keywords)):
        self.assertEqual(so.data[0].keywords[index], kwc1_keywords[index])
    self.assertAlmostEqual(so.data[1].coordinates.x, 2, delta=0.01)
    self.assertAlmostEqual(so.data[1].coordinates.y, 2, delta=0.01)
    self.assertListEqual(so.data[1].keywords, kwc2_keywords)
    for index in range(len(so.data[1].keywords)):
        self.assertEqual(so.data[1].keywords[index], kwc2_keywords[index])
    self.assertAlmostEqual(so.data[2].coordinates.x, 3, delta=0.01)
    self.assertAlmostEqual(so.data[2].coordinates.y, 3, delta=0.01)
    self.assertListEqual(so.data[2].keywords, kwc3_keywords)
    for index in range(len(so.data[2].keywords)):
        self.assertEqual(so.data[2].keywords[index], kwc3_keywords[index])
    # NOTE(review): comparing the `__get__` descriptor wrappers effectively
    # asserts the stored metric is the same underlying function object; a
    # direct `assertIs(so.cost_function.distance_metric, euclidean_distance)`
    # would be clearer — confirm CostFunction stores the plain callable
    # before simplifying.
    self.assertEqual(euclidean_distance.__get__, so.cost_function.distance_metric.__get__)
    self.assertEqual(separated_cosine_similarity.__get__, so.cost_function.similarity_metric.__get__)
    self.assertAlmostEqual(so.cost_function.alpha, 0.3, delta=0.01)
    self.assertAlmostEqual(so.cost_function.beta, 0.3, delta=0.01)
    self.assertAlmostEqual(so.cost_function.omega, 0.4, delta=0.01)
    self.assertEqual(so.normalize_data, False)
    self.assertEqual(so.result_length, 10)
    # With normalize=False the denormalization bounds keep their 0.0 defaults.
    self.assertAlmostEqual(so.denormalize_max_x, 0.0, delta=0.01)
    self.assertAlmostEqual(so.denormalize_min_x, 0.0, delta=0.01)
    self.assertAlmostEqual(so.denormalize_max_y, 0.0, delta=0.01)
    self.assertAlmostEqual(so.denormalize_min_y, 0.0, delta=0.01)
def test_get_maximum_keyword_distance4(self):
    """Identical keyword sets everywhere yield a maximum keyword distance of 0."""
    keywords = ['food', 'fun', 'outdoor']
    origin = 0  # coordinates are irrelevant for the keyword distance
    query = KeywordCoordinate(origin, origin, list(keywords))
    dataset: dataset_type = [
        KeywordCoordinate(origin, origin, list(keywords)) for _ in range(3)
    ]
    cf = CostFunction(euclidean_distance, separated_cosine_similarity, 0.3, 0.3, 0.4)
    result = cf.get_maximum_keyword_distance(query, dataset)
    self.assertAlmostEqual(result, 0.0, delta=0.01)
def test_normalize_data(self):
    """Coordinates are scaled into [0, 1] and the original extrema returned."""
    query = KeywordCoordinate(2, 1, ['family', 'food', 'outdoor'])
    data = [
        KeywordCoordinate(0, 0, ['family']),
        KeywordCoordinate(3, 2, ['food']),
        KeywordCoordinate(1, 5, ['outdoor']),
    ]
    norm_query, norm_data, max_x, min_x, max_y, min_y = mt.normalize_data(query, data)
    self.assertAlmostEqual(norm_query.coordinates.x, 0.66, delta=0.01)
    self.assertAlmostEqual(norm_query.coordinates.y, 0.20, delta=0.01)
    expected = [(0.0, 0.0), (1.0, 0.4), (0.33, 1.0)]
    for kwc, (exp_x, exp_y) in zip(norm_data, expected):
        self.assertAlmostEqual(kwc.coordinates.x, exp_x, delta=0.01)
        self.assertAlmostEqual(kwc.coordinates.y, exp_y, delta=0.01)
    self.assertEqual(max_x, 3)
    self.assertEqual(min_x, 0)
    self.assertEqual(max_y, 5)
    self.assertEqual(min_y, 0)
def test_write_and_read_data(self):
    # Round-trip test: a pickled dataset must load back with identical
    # coordinates and keywords, then the file is cleaned up.
    kwc1 = KeywordCoordinate(1, 1, ['1'])
    kwc2 = KeywordCoordinate(2, 2, ['2'])
    kwc3 = KeywordCoordinate(3, 3, ['3'])
    data = [kwc1, kwc2, kwc3]
    file_name = 'test/test.pickle'
    write_pickle(data, file_name, True)
    loaded_result = load_pickle(file_name)
    self.assertEqual(len(loaded_result), 3)
    for index in range(len(loaded_result)):
        self.assertAlmostEqual(loaded_result[index].coordinates.x, data[index].coordinates.x)
        self.assertAlmostEqual(loaded_result[index].coordinates.y, data[index].coordinates.y)
        self.assertListEqual(loaded_result[index].keywords, data[index].keywords)
    # NOTE(review): the concatenation below has no separator before
    # '../../../', producing a component like '<dir>../..' that only resolves
    # to the intended file because abspath() collapses it — confirm against
    # write_pickle's actual output location before relying on this cleanup.
    os.remove(
        os.path.abspath(
            os.path.dirname(os.path.abspath(__file__)) + '../../../' + file_name))
def test_precalculated_word2vec(self):
    """With a word2vec similarity metric, injecting all precalculated lookup
    tables must not change the solver's results."""
    query = KeywordCoordinate(0, 0, ['family', 'food', 'outdoor'])
    kwc1 = KeywordCoordinate(1, 1, ['family', 'food', 'outdoor'])
    kwc2 = KeywordCoordinate(3, 3, ['food', 'family'])
    kwc3 = KeywordCoordinate(2, 2, ['outdoor'])
    data = [kwc1, kwc2, kwc3]
    # Restrict the word2vec model to the keywords actually used.
    model = calculate_model_subset(query, data, load_word2vec_model())
    cf = Type3(euclidean_distance, word2vec_cosine_similarity, 0.3, 0.3, 0.4,
               disable_thresholds=True, model=model)
    ns = NaiveSolver(query, data, cf)
    result = ns.solve()
    cf.precalculated_query_dataset_dict = ns.get_query_dataset_distance()
    cf.precalculated_inter_dataset_dict = ns.get_inter_dataset_distance()
    cf.precalculated_keyword_similarity_dict = ns.get_keyword_similarity()
    result_pre = ns.solve()
    # zip() replaces the index-based loops; the length check keeps it honest.
    self.assertEqual(len(result), len(result_pre))
    for (cost, kwcs), (cost_pre, kwcs_pre) in zip(result, result_pre):
        self.assertAlmostEqual(cost, cost_pre, delta=0.01)
        for kwc, kwc_pre in zip(list(kwcs), list(kwcs_pre)):
            self.assertAlmostEqual(kwc.coordinates.x, kwc_pre.coordinates.x)
            self.assertAlmostEqual(kwc.coordinates.y, kwc_pre.coordinates.y)
            self.assertListEqual(kwc.keywords, kwc_pre.keywords)
def test_get_minimum_for_query4(self):
    """Minimum manhattan distance from the origin query to any POI is 16."""
    no_keywords = ['']  # keywords are irrelevant for the distance metric
    query = KeywordCoordinate(0, 0, no_keywords)
    dataset: dataset_type = [
        KeywordCoordinate(p, p, no_keywords) for p in (8, 9, 13, 24, 35)
    ]
    cf = CostFunction(manhattan_distance, separated_cosine_similarity, 0.3, 0.3, 0.4)
    self.assertAlmostEqual(cf.get_minimum_for_query(query, dataset), 16.0, delta=0.01)
def test_get_minimum_for_dataset2(self):
    """Minimum manhattan distance between evenly spaced POIs is 2."""
    no_keywords = ['']  # keywords are irrelevant for the distance metric
    dataset: dataset_type = [
        KeywordCoordinate(p, p, no_keywords) for p in range(5, 11)
    ]
    cf = CostFunction(manhattan_distance, separated_cosine_similarity, 0.3, 0.3, 0.4)
    self.assertAlmostEqual(cf.get_minimum_for_dataset(dataset), 2.0, delta=0.01)
def test_get_maximum_for_dataset3(self):
    """Maximum euclidean distance in this dataset is between (6,6) and (35,35)."""
    no_keywords = ['']  # keywords are irrelevant for the distance metric
    dataset: dataset_type = [
        KeywordCoordinate(p, p, no_keywords) for p in (6, 8, 9, 13, 24, 35)
    ]
    cf = CostFunction(euclidean_distance, separated_cosine_similarity, 0.3, 0.3, 0.4)
    self.assertAlmostEqual(cf.get_maximum_for_dataset(dataset), 41.01, delta=0.01)
def test_get_maximum_for_dataset1(self):
    """Maximum euclidean distance between (0,0) and (5,5) is 5*sqrt(2) ~ 7.07."""
    no_keywords = ['']  # keywords are irrelevant for the distance metric
    dataset: dataset_type = [
        KeywordCoordinate(p, p, no_keywords) for p in range(6)
    ]
    cf = CostFunction(euclidean_distance, separated_cosine_similarity, 0.3, 0.3, 0.4)
    self.assertAlmostEqual(cf.get_maximum_for_dataset(dataset), 7.07, delta=0.01)
def test_get_minimum_for_dataset3(self):
    """Minimum euclidean distance is between (13,13) and (20,20): 7*sqrt(2)."""
    no_keywords = ['']  # keywords are irrelevant for the distance metric
    dataset: dataset_type = [
        KeywordCoordinate(p, p, no_keywords) for p in (0, 13, 20, 800, 9000, 10000)
    ]
    cf = CostFunction(euclidean_distance, separated_cosine_similarity, 0.3, 0.3, 0.4)
    self.assertAlmostEqual(cf.get_minimum_for_dataset(dataset), 9.9, delta=0.01)
def test_solve4(self):
    """Type3 with weights (1, 0, 0) and manhattan distance costs 2.0 here."""
    cost_function = Type3(manhattan_distance, separated_cosine_similarity, 1, 0, 0,
                          disable_thresholds=True)
    query = KeywordCoordinate(0, 0, ['food', 'fun', 'outdoor', 'family'])
    data = [
        KeywordCoordinate(1, 1, ['food', 'fun', 'outdoor']),
        KeywordCoordinate(2, 2, ['food', 'fun']),
        KeywordCoordinate(3, 3, ['food']),
        KeywordCoordinate(4, 4, ['food']),
        KeywordCoordinate(5, 5, ['food']),
    ]
    self.assertAlmostEqual(cost_function.solve(query, data), 2.0, delta=0.01)
def test_solve9(self):
    """Type1 with weights (0, 0, 1) yields cost 1.0 for this dataset."""
    cost_function = Type1(euclidean_distance, separated_cosine_similarity, 0, 0, 1,
                          disable_thresholds=True)
    full_match = ['food', 'fun', 'outdoor', 'family']
    no_match = ['this_is_not_a_match']
    query = KeywordCoordinate(0, 0, ['food', 'fun', 'outdoor', 'family'])
    data = [
        KeywordCoordinate(1, 1, list(full_match)),
        KeywordCoordinate(2, 2, list(full_match)),
        KeywordCoordinate(3, 3, no_match),
        KeywordCoordinate(4, 4, no_match),
        KeywordCoordinate(5, 5, no_match),
    ]
    self.assertAlmostEqual(cost_function.solve(query, data), 1.0, delta=0.01)
def load_csv(file_name: str, x_coordinate_index: int, y_coordinate_index: int, keywords_index: int,
             keywords_delimiter: str = ' ', max_read_length: int = -1, delimiter: str = ',',
             newline: str = '', quotechar: str = '"', path_relative_to_project_root: bool = True,
             query_load: bool = False) -> dataset_type:
    """
    Loads a csv file.

    :param file_name: The file name of the csv file. The file is usually in the project folder. Otherwise use the path_relative_to_project_root flag.
    :param x_coordinate_index: The index of the x coordinates
    :param y_coordinate_index: The index of the y coordinates
    :param keywords_index: The index of the keywords
    :param keywords_delimiter: The delimiter of the keywords
    :param max_read_length: The maximum number of lines to read
    :param delimiter: The csv cell delimiter
    :param newline: The newline delimiter
    :param quotechar: The quotechar symbol
    :param path_relative_to_project_root: The flag if the file name is relative to the project folder
    :param query_load: If True, parse the file with the plain csv reader as a query; otherwise run the pandas/TF-IDF POI pipeline
    :return: The dataset of the csv
    """
    dataset: dataset_type = []
    if query_load:
        # NOTE(review): max_read_length is adjusted here and in the except
        # branch below, but no statement in this loop ever checks it to stop
        # reading — it appears to be dead as a limit; confirm intent.
        max_read_length -= 1  # because the length doesn't start counting at 0
        if path_relative_to_project_root:
            file_path = os.path.abspath(
                os.path.dirname(os.path.abspath(__file__)) + '/../../files/' + file_name)
        else:
            file_path = file_name
        with open(file_path, mode='rt', newline=newline, encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile, delimiter=delimiter, quotechar=quotechar)
            for row in reader:
                try:
                    current_coordinate_x = float(row[x_coordinate_index])
                    current_coordinate_y = float(row[y_coordinate_index])
                except:
                    # NOTE(review): bare except silently skips any malformed
                    # row (missing column, non-numeric value) — consider
                    # narrowing to (IndexError, ValueError).
                    print('----- Failure -----')
                    if max_read_length > 0:
                        max_read_length += 1
                    continue
                raw_keyword_list = row[keywords_index].split(keywords_delimiter)
                current_POI_name = 'Query point'  # Query has no POI name
                current_keywords: keyword_dataset_type = []
                for keyword in raw_keyword_list:
                    stripped_keyword = keyword.strip()
                    if len(stripped_keyword) > 0:
                        current_keywords.append(stripped_keyword)
                current_keyword_coordinate = KeywordCoordinate(
                    current_POI_name, current_coordinate_x, current_coordinate_y, current_keywords)
                dataset.append(current_keyword_coordinate)
    else:
        # POI pipeline: read the raw POI csv, compute per-POI top-N keywords
        # with TF-IDF, and persist the spacy-encoded keyword vectors.
        # NOTE(review): `error_bad_lines` is deprecated/removed in newer
        # pandas (use on_bad_lines='skip') — pin or migrate.
        df = pd.read_csv(os.path.abspath(
            os.path.dirname(os.path.abspath(__file__)) + '/../../files/' + file_name),
            delimiter=';', error_bad_lines=False, encoding="utf-8")
        # Calculates topN keywords using TF-IDF
        # Removes rows with NaN values
        df.dropna(inplace=True)
        reviews = df['keywords_all']
        df['keyword lists IDF'] = reviews.apply(lambda x: reviews2OneString(x))
        # remove POIs with no reviews or NaN values
        df = df[df['keyword lists IDF'].str.len() != 0]
        # nlp = spacy.load('en_core_web_lg')
        nlp = en_core_web_lg.load()
        df['keyword lists IDF'] = df['keyword lists IDF'].apply(
            lambda x: pre_process(x, nlp))
        docs = df['keyword lists IDF'].tolist()
        # Let's compute IDF:
        # 1. Create a vocabulary of words,
        # 2. Ignore words that appear in 85% of documents,
        # 3. Eliminate stop words
        cv = CountVectorizer(max_df=0.85, stop_words='english')
        word_count_vector = cv.fit_transform(docs)
        # Let's compute IDF (test = IDF dataset)
        tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
        tfidf_transformer.fit(word_count_vector)
        # Computing TF-IDF and Extracting Keywords
        # Get the whole vocabulary (all reviews for all POIs) in a list
        docs = df['keyword lists IDF'].tolist()
        # NOTE(review): get_feature_names() was removed in scikit-learn 1.2
        # (use get_feature_names_out()) — pin or migrate.
        feature_names = cv.get_feature_names()
        df['Top-Keywords-TFIDF'] = reviews.apply(lambda x: get_topN_keywords(
            x, 10, tfidf_transformer, cv, feature_names))
        df_poi_keywords = pd.DataFrame(
            columns=['poi_name', 'nlp_keywords_encoded'])
        for index, row in df.iterrows():
            current_POI_name = row.get('name')
            # Watch out if we want to use the coordinates for something else
            try:
                current_coordinate_x = float(row.get('lat'))
                current_coordinate_y = float(row.get('lng'))
            except:
                # NOTE(review): bare except — narrow to (TypeError, ValueError).
                print("ERROR --> Coordinates")
                continue
            # Collect the top-N TF-IDF keywords for this POI.
            current_keywords = []
            for k in row['Top-Keywords-TFIDF'].keys():
                current_keywords.append(k)
            # Join the keywords into one string for spacy encoding.
            element_string = ''
            for kw in current_keywords:
                element_string = element_string + ' ' + kw
            nlp_element = nlp(element_string)
            new_row = {
                'poi_name': current_POI_name,
                'nlp_keywords_encoded': nlp_element
            }
            # NOTE(review): DataFrame.append was removed in pandas 2.0 —
            # collect rows in a list and build the frame once instead.
            df_poi_keywords = df_poi_keywords.append(new_row, ignore_index=True)
            current_keyword_coordinate = KeywordCoordinate(
                current_POI_name, current_coordinate_x, current_coordinate_y, current_keywords)
            dataset.append(current_keyword_coordinate)
        df_poi_keywords.set_index('poi_name', inplace=True)
        df_poi_keywords.to_csv(os.path.dirname(os.path.abspath(__file__)) +
                               '/../../files/' + 'poi_keywords_encoded.csv',
                               encoding='utf-8')
    print(np.shape(dataset))
    return dataset