def main(argv):
    start_time = time.time()

    # Config
    file_name_data = 'dataset.pickle'
    file_name_query = 'query.pickle'
    file_name_word2vec_model = 'model.pickle'
    target_file_name = 'precalculated_query_dataset_keyword_similarities_word2vec.pickle'
    # max_subset_size = int(argv[0])
    max_subset_size = 2
    # cost_function = Type3(euclidean_distance, word2vec_cosine_similarity, 0.2, 0.1, 0.7,
    #                       model=load_word2vec_model(file_name_word2vec_model))
    # cost_function = Type2(euclidean_distance, word2vec_cosine_similarity, 0, 0, 1,
    #                       model=load_word2vec_model(file_name_word2vec_model))
    cost_function = Type1(euclidean_distance, word2vec_cosine_similarity, 0.2, 0.1, 0.7,
                          model=load_word2vec_model(file_name_word2vec_model))
    file_allow_overwrite = True

    # Code
    data = load_pickle(file_name_data)
    query = load_pickle(file_name_query)
    solver = NaiveSolver(query, data, cost_function, max_subset_size=max_subset_size)
    precalculated_query_dataset_distances = solver.get_keyword_similarity()
    write_pickle(precalculated_query_dataset_distances, target_file_name,
                 file_allow_overwrite=file_allow_overwrite)
    print("--- %s seconds ---" % (time.time() - start_time))
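# For reference, a minimal sketch of the word2vec-based keyword similarity that
# the cost functions above rely on. This is an illustrative assumption, not the
# repo's actual word2vec_cosine_similarity implementation: it averages the word
# vectors of each keyword set and compares the averages by cosine similarity.
# The model is assumed to be a plain dict mapping words to numpy vectors, in
# line with the gensim-to-dict conversion script further down.
import numpy as np

def word2vec_cosine_similarity_sketch(keywords_a, keywords_b, model):
    vecs_a = [model[kw] for kw in keywords_a if kw in model]
    vecs_b = [model[kw] for kw in keywords_b if kw in model]
    if not vecs_a or not vecs_b:
        return 1.0  # treat missing vectors as maximally dissimilar
    vector_a = np.mean(vecs_a, axis=0)
    vector_b = np.mean(vecs_b, axis=0)
    cosine = np.dot(vector_a, vector_b) / (np.linalg.norm(vector_a) * np.linalg.norm(vector_b))
    # Return a distance-like value: 0 for identical directions, larger is worse.
    return 1 - cosine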
def main(argv):
    start_time = time.time()

    # Config
    # csv_file_name = 'London.csv'
    csv_file_name = argv[0]
    print(csv_file_name)
    data_target_name = 'dataset.pickle'
    x_index = 5  # Column indices start at 0
    y_index = 6
    keyword_index = 9
    max_read_length = -1  # -1 to disable
    keyword_delimiter = ' '
    csv_delimiter = ';'
    csv_quotechar = '"'
    file_allow_overwrite = True

    # Code
    print('Loading CSV', csv_file_name)
    data = load_csv(file_name=csv_file_name, x_coordinate_index=x_index, y_coordinate_index=y_index,
                    keywords_index=keyword_index, keywords_delimiter=keyword_delimiter,
                    delimiter=csv_delimiter, quotechar=csv_quotechar, max_read_length=max_read_length)
    if len(data) > 0:
        print('Example Datapoint:', data[0].coordinates.x, data[0].coordinates.y, data[0].keywords)
        write_pickle(data=data, file_name=data_target_name, file_allow_overwrite=file_allow_overwrite)
    else:
        print('Could not load any data.')
    print("--- %s seconds ---" % (time.time() - start_time))
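# A hypothetical input row matching the configuration above: semicolon-delimited,
# x coordinate in column 5, y coordinate in column 6, and space-delimited keywords
# quoted in column 9. The other columns are placeholders; their meaning is an
# assumption for illustration only.
# id;name;addr;city;zip;51.5074;-0.1278;type;rating;"food outdoor culture"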
def main(argv):
    start_time = time.time()

    # Config
    csv_file_name = 'user_queries.csv'
    data_target_name = 'query.pickle'
    x_index = 0
    y_index = 1
    keyword_index = 2
    # query_index = 16
    # Let's take a query from the query file
    query_index = int(argv[0])
    keyword_delimiter = ' '
    csv_delimiter = ';'
    csv_quotechar = '"'
    file_allow_overwrite = True

    # Code
    if query_index <= 0:
        print('The query row has to be positive.')
    else:
        print('Loading CSV', csv_file_name)
        data = load_csv(file_name=csv_file_name, x_coordinate_index=x_index,
                        y_coordinate_index=y_index, keywords_index=keyword_index,
                        keywords_delimiter=keyword_delimiter, delimiter=csv_delimiter,
                        quotechar=csv_quotechar, max_read_length=query_index + 1, query_load=True)
        if len(data) > 0:
            query = data[query_index - 1]
            print('Query Datapoint:', query.coordinates.x, query.coordinates.y, query.keywords)
            write_pickle(data=query, file_name=data_target_name,
                         file_allow_overwrite=file_allow_overwrite)

            # Score every POI against each query keyword with spaCy word vectors.
            nlp = en_core_web_lg.load()
            files_dir = os.path.dirname(os.path.abspath(__file__)) + '/../files/'
            df_poi_encoded = pd.read_csv(files_dir + 'poi_keywords_encoded.csv',
                                         index_col='poi_name', encoding='utf-8')
            # print(df_poi_encoded)
            for kw in query.keywords:
                df_poi_encoded[kw] = df_poi_encoded.apply(
                    lambda row: nlp(row['nlp_keywords_encoded']).similarity(nlp(kw)), axis=1)
            df_poi_encoded.to_csv(files_dir + 'poi_queries_similarities.csv', encoding='utf-8')
        else:
            print('Could not load any data.')
    print("--- %s seconds ---" % (time.time() - start_time))
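# Note: en_core_web_lg is a separately installed spaCy model package. If it is
# missing, it can be downloaded once via the spaCy CLI:
#   python -m spacy download en_core_web_lg
# Doc.similarity above returns the cosine similarity of the averaged token
# vectors of the two documents.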
def main(argv):
    start_time = time.time()

    # Config
    file_name_data = 'dataset.pickle'
    query_file_name = 'query.pickle'
    file_name_word2vec_model = 'model.pickle'
    target_file_name = 'precalculated_inter_dataset_distances.pickle'
    # max_subset_size = 3  # Changed
    max_subset_size = 2  # int(argv[0])
    # cost_function = Type1(euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7)
    # cost_function = Type2(euclidean_distance, word2vec_cosine_similarity, 0, 0, 1,
    #                       model=load_word2vec_model(file_name_word2vec_model))
    # Use a separate timer per step so start_time keeps measuring the total runtime.
    step_start_time = time.time()
    cost_function = Type1(euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7,
                          model=load_word2vec_model(file_name_word2vec_model))
    print('Cost function initialization --> ', time.time() - step_start_time)
    file_allow_overwrite = True

    # Code
    step_start_time = time.time()
    data = load_pickle(file_name_data)
    print('Load data pickle --> ', time.time() - step_start_time)
    # query = KeywordCoordinate("", 0, 0, ['0'])
    query = load_pickle(query_file_name)
    step_start_time = time.time()
    solver = NaiveSolver(query, data, cost_function, max_subset_size=max_subset_size)
    print('Solver initialization --> ', time.time() - step_start_time)
    precalculated_inter_dataset_distances = solver.get_inter_dataset_distance()
    write_pickle(precalculated_inter_dataset_distances, target_file_name,
                 file_allow_overwrite=file_allow_overwrite)
    print("--- %s seconds ---" % (time.time() - start_time))
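# A minimal sketch of what a weighted cost function of this shape plausibly
# combines. This is an assumption inferred from the constructor arguments and
# the solver's get_keyword_similarity / get_inter_dataset_distance methods, not
# the repo's actual Type1 implementation: a query-to-subset distance term, an
# inter-subset distance term, and a keyword similarity term, weighted
# 0.2 / 0.1 / 0.7 above.
def sketch_cost(query, subset, distance_metric, similarity_metric,
                alpha=0.2, beta=0.1, omega=0.7):
    # Worst-case spatial distance from the query to any element of the subset.
    spatial_cost = max(distance_metric(query.coordinates, kwc.coordinates) for kwc in subset)
    # Worst-case spatial distance between any two elements of the subset.
    inter_cost = max(distance_metric(a.coordinates, b.coordinates)
                     for a in subset for b in subset)
    # Keyword dissimilarity between the query and the subset's combined keywords.
    keyword_cost = similarity_metric(query.keywords,
                                     [kw for kwc in subset for kw in kwc.keywords])
    return alpha * spatial_cost + beta * inter_cost + omega * keyword_cost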
def main(argv):
    start_time = time.time()

    # Config
    # These files should be in the root directory of the project.
    word2vec_model_name = 'model.pickle'
    model_pickle_file_name = 'word2vec_model.pickle'
    query_file_name = 'query.pickle'
    data_file_name = 'dataset.pickle'
    file_allow_overwrite = True

    # Code - you shouldn't have to make any changes to this
    model = load_word2vec_model(word2vec_model_name)
    query = load_pickle(query_file_name)
    data = load_pickle(data_file_name)
    shrunk_model = calculate_model_subset(query, data, model)
    write_pickle(shrunk_model, model_pickle_file_name, file_allow_overwrite=file_allow_overwrite)
    print("--- %s seconds ---" % (time.time() - start_time))
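# A minimal sketch of what calculate_model_subset plausibly does (an assumption,
# not the repo's implementation): keep only the vectors for keywords that
# actually occur in the query or the dataset, so the pickled model stays small.
def calculate_model_subset_sketch(query, data, model):
    needed = set(query.keywords)
    for kwc in data:
        needed.update(kwc.keywords)
    # model is assumed to map words to vectors, as elsewhere in these scripts.
    return {word: vector for word, vector in model.items() if word in needed}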
def test_write_and_read_data(self):
    kwc1 = KeywordCoordinate(1, 1, ['1'])
    kwc2 = KeywordCoordinate(2, 2, ['2'])
    kwc3 = KeywordCoordinate(3, 3, ['3'])
    data = [kwc1, kwc2, kwc3]
    file_name = 'test/test.pickle'
    write_pickle(data, file_name, True)
    loaded_result = load_pickle(file_name)
    self.assertEqual(len(loaded_result), 3)
    for index in range(len(loaded_result)):
        self.assertAlmostEqual(loaded_result[index].coordinates.x, data[index].coordinates.x)
        self.assertAlmostEqual(loaded_result[index].coordinates.y, data[index].coordinates.y)
        self.assertListEqual(loaded_result[index].keywords, data[index].keywords)
    # Clean up the pickle written relative to the project root.
    os.remove(os.path.abspath(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../..', file_name)))
def generate_pickle(self, data_size: int, file_name: str, file_allow_overwrite: bool = False,
                    file_only_overwrite_dot_pickle_files: bool = True,
                    pickle_protocol_version: int = 4) -> dataset_type:
    """
    Generates a new dataset, writes it as a pickle and returns the generated dataset.
    :param data_size: The number of elements in the dataset
    :param file_name: The name of the file
    :param file_allow_overwrite: If files are allowed to be overwritten
    :param file_only_overwrite_dot_pickle_files: If the name of the file has to end with .pickle
    :param pickle_protocol_version: The protocol version of the pickle format
    :return: The generated data which has been written to disk
    """
    logger = logging.getLogger(__name__)
    logger.debug('generating dataset of size {}'.format(data_size))
    data = self.generate(data_size)
    logger.debug('generated dataset {}'.format(dataset_comprehension(data)))
    write_pickle(data, file_name, file_allow_overwrite,
                 file_only_overwrite_dot_pickle_files, pickle_protocol_version)
    return data
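# Usage sketch, mirroring the generator configuration used in the query script
# at the end of this section (the keyword list and dataset size are illustrative):
# dg = DataGenerator(['family', 'food', 'outdoor'])
# dataset = dg.generate_pickle(100, 'dataset.pickle', file_allow_overwrite=True)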
    y_index = 6
    keyword_index = 9
    max_read_length = -1  # -1 to disable
    keyword_delimiter = ' '
    csv_delimiter = ';'
    csv_quotechar = '"'
    file_allow_overwrite = True

    # Code
    print('Loading CSV', csv_file_name)
    data = load_csv(file_name=csv_file_name, x_coordinate_index=x_index, y_coordinate_index=y_index,
                    keywords_index=keyword_index, keywords_delimiter=keyword_delimiter,
                    delimiter=csv_delimiter, quotechar=csv_quotechar, max_read_length=max_read_length)
    if len(data) > 0:
        print('Example Datapoint:', data[0].coordinates.x, data[0].coordinates.y, data[0].keywords)
        write_pickle(data=data, file_name=data_target_name, file_allow_overwrite=file_allow_overwrite)
    else:
        print('Could not load any data.')
    print("--- %s seconds ---" % (time.time() - start_time))

    word2vec_model_name = 'model.pickle'
    word2vec_model = load_word2vec_model(word2vec_model_name)
    for i in range(13, 79):
        print('++++++++++++++++++')
        print('Query --> ', i)
        print('++++++++++++++++++')
        iteration_start_time = time.time()
import os
import sys

# import word2vec
# from gensim.models import Word2Vec
from gensim.models import KeyedVectors

sys.path.append("..")
from src.utils.data_handler import write_pickle

if __name__ == '__main__':
    # Config
    # Both files should be in the root directory of the project.
    # word2vec_model_name = 'model.bin'
    word2vec_model_name = 'model_test2.bin'
    model_pickle_file_name = 'word2vec_model.pickle'
    word2vec_model_path = os.path.abspath(
        os.path.abspath(os.path.dirname(__file__)) + '/../files/' + word2vec_model_name)

    # Code - you shouldn't have to make any changes to this
    keyed_vectors = KeyedVectors.load(word2vec_model_path, mmap='r')
    print(len(keyed_vectors.wv.vocab))
    # Convert the gensim model into a plain dict of word -> vector so the rest
    # of the pipeline only has to unpickle a dict.
    result_dict = dict()
    for word in keyed_vectors.wv.vocab:
        result_dict[word] = keyed_vectors.wv.get_vector(word)
    write_pickle(result_dict, model_pickle_file_name)
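# Compatibility note: in gensim >= 4.0 the vocab attribute was removed from
# KeyedVectors in favor of key_to_index. Under 4.x, and assuming the loaded
# object exposes the KeyedVectors API directly, the loop above would read:
# for word in keyed_vectors.key_to_index:
#     result_dict[word] = keyed_vectors.get_vector(word)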
from src.utils.data_generator import DataGenerator
from src.utils.data_handler import write_pickle

if __name__ == '__main__':
    # Config
    data_target_name = 'query.pickle'
    possible_keywords = ['family', 'food', 'outdoor', 'rest', 'indoor', 'sports',
                         'science', 'culture', 'history']
    file_allow_overwrite = False

    # Code
    dg = DataGenerator(possible_keywords)
    generated_query = dg.generate(1)
    write_pickle(generated_query, file_name=data_target_name,
                 file_allow_overwrite=file_allow_overwrite)