def main(argv):
    start_time = time.time()

    # Config
    file_name_data = 'dataset.pickle'
    file_name_query = 'query.pickle'
    file_name_word2vec_model = 'model.pickle'
    target_file_name = 'precalculated_query_dataset_keyword_similarities_word2vec.pickle'
    # max_subset_size = int(argv[0])
    max_subset_size = 2
    # cost_function = Type3(euclidean_distance, word2vec_cosine_similarity, 0.2, 0.1, 0.7,
    #                       model=load_word2vec_model(file_name_word2vec_model))
    # cost_function = Type2(euclidean_distance, word2vec_cosine_similarity, 0, 0, 1,
    #                       model=load_word2vec_model(file_name_word2vec_model))
    cost_function = Type1(euclidean_distance, word2vec_cosine_similarity, 0.2, 0.1, 0.7,
                          model=load_word2vec_model(file_name_word2vec_model))
    file_allow_overwrite = True

    # Code
    data = load_pickle(file_name_data)
    query = load_pickle(file_name_query)
    solver = NaiveSolver(query, data, cost_function, max_subset_size=max_subset_size)
    # The solver exposes the keyword similarities it computed so they can be
    # pickled and reused as precalculated values in later runs.
    precalculated_keyword_similarities = solver.get_keyword_similarity()
    write_pickle(precalculated_keyword_similarities, target_file_name,
                 file_allow_overwrite=file_allow_overwrite)
    print("--- %s seconds ---" % (time.time() - start_time))
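# A minimal, self-contained sketch of the cosine similarity that a
# word2vec-based keyword measure such as word2vec_cosine_similarity computes.
# The function name and signature below are illustrative assumptions, not the
# project's actual implementation.
import numpy as np


def cosine_similarity_sketch(vec_a: np.ndarray, vec_b: np.ndarray) -> float:
    # Dot product divided by the product of the Euclidean norms; 1.0 means
    # the two keyword vectors point in the same direction.
    return float(np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)))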
def main(argv):
    start_time = time.time()

    # Config
    file_name_data = 'dataset.pickle'
    query_file_name = 'query.pickle'
    file_name_word2vec_model = 'model.pickle'
    target_file_name = 'precalculated_inter_dataset_distances.pickle'
    # max_subset_size = 3  # Changed
    max_subset_size = 2  # int(argv[0])
    # cost_function = Type1(euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7)
    # cost_function = Type2(euclidean_distance, word2vec_cosine_similarity, 0, 0, 1,
    #                       model=load_word2vec_model(file_name_word2vec_model))
    # Use a separate timer for per-section measurements so start_time keeps
    # measuring the overall runtime for the final print.
    section_start = time.time()
    cost_function = Type1(euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7,
                          model=load_word2vec_model(file_name_word2vec_model))
    print('Cost function initialization --> ', time.time() - section_start)
    file_allow_overwrite = True

    # Code
    section_start = time.time()
    data = load_pickle(file_name_data)
    print('Load data pickle --> ', time.time() - section_start)
    # query = KeywordCoordinate("", 0, 0, ['0'])
    query = load_pickle(query_file_name)
    section_start = time.time()
    solver = NaiveSolver(query, data, cost_function, max_subset_size=max_subset_size)
    print('Solver initialization --> ', time.time() - section_start)
    precalculated_inter_dataset_distances = solver.get_inter_dataset_distance()
    write_pickle(precalculated_inter_dataset_distances, target_file_name,
                 file_allow_overwrite=file_allow_overwrite)
    print("--- %s seconds ---" % (time.time() - start_time))
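# Illustrative sketch of the kind of lookup table this script produces:
# pairwise Euclidean distances between dataset points, keyed by index pair,
# so a solver can look distances up instead of recomputing them per subset.
# The dict layout is an assumption; the project's actual structure may differ.
import math
from itertools import combinations


def pairwise_distances_sketch(points):
    # points: sequence of objects exposing .coordinates.x and .coordinates.y
    dists = {}
    for (i, a), (j, b) in combinations(enumerate(points), 2):
        dists[(i, j)] = math.hypot(a.coordinates.x - b.coordinates.x,
                                   a.coordinates.y - b.coordinates.y)
    return dists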
def main(argv):
    start_time = time.time()

    # Config
    # Both files should be in the root directory of the project.
    word2vec_model_name = 'model.pickle'
    model_pickle_file_name = 'word2vec_model.pickle'
    query_file_name = 'query.pickle'
    data_file_name = 'dataset.pickle'
    file_allow_overwrite = True

    # Code - you shouldn't have to make any changes to this
    model = load_word2vec_model(word2vec_model_name)
    query = load_pickle(query_file_name)
    data = load_pickle(data_file_name)
    shrunk_model = calculate_model_subset(query, data, model)
    write_pickle(shrunk_model, model_pickle_file_name, file_allow_overwrite=file_allow_overwrite)
    print("--- %s seconds ---" % (time.time() - start_time))
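# Conceptual sketch of what a model-shrinking step like calculate_model_subset
# does: keep only the vectors for keywords that actually occur in the query or
# the dataset, so the pickled model stays small. Treating the model as a plain
# keyword-to-vector mapping is an assumption; the real implementation may
# operate on a gensim model instead.
def shrink_model_sketch(query, data, model):
    needed = set(query.keywords)
    for kwc in data:
        needed.update(kwc.keywords)
    return {kw: vec for kw, vec in model.items() if kw in needed}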
def test_complex_generated_word2vec_precalculations(self):
    possible_keywords = [
        'family', 'food', 'outdoor', 'rest', 'indoor', 'sports', 'science', 'culture', 'history'
    ]
    dg = DataGenerator(possible_keywords)
    query: KeywordCoordinate = dg.generate(1)[0]
    data = dg.generate(5)
    model = calculate_model_subset(query, data, load_word2vec_model())
    cf = Type1(euclidean_distance, word2vec_cosine_similarity, 0.3, 0.3, 0.4,
               disable_thresholds=True, model=model)
    ns = NaiveSolver(query, data, cf, result_length=100)
    result = ns.solve()
    pre_qd = ns.get_query_dataset_distance()
    pre_id = ns.get_inter_dataset_distance()
    pre_ks = ns.get_keyword_similarity()
    cf.precalculated_query_dataset_dict = pre_qd
    cf.precalculated_inter_dataset_dict = pre_id
    cf.precalculated_keyword_similarity_dict = pre_ks
    result_pre = ns.solve()
    for index in range(len(result)):
        self.assertAlmostEqual(result[index][0], result_pre[index][0], delta=0.01)
        key_list = list(result[index][1])
        key_list_pre = list(result_pre[index][1])
        for list_index in range(len(key_list)):
            self.assertAlmostEqual(key_list[list_index].coordinates.x,
                                   key_list_pre[list_index].coordinates.x)
            self.assertAlmostEqual(key_list[list_index].coordinates.y,
                                   key_list_pre[list_index].coordinates.y)
            self.assertListEqual(key_list[list_index].keywords,
                                 key_list_pre[list_index].keywords)
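# The pattern the precalculation tests exercise, in miniature: solve once from
# scratch, harvest the solver's precalculated lookup tables, inject them into
# the cost function, solve again, and expect near-identical results. A sketch
# only; the method and attribute names follow the test above.
def precalculation_roundtrip(ns, cf):
    baseline = ns.solve()
    cf.precalculated_query_dataset_dict = ns.get_query_dataset_distance()
    cf.precalculated_inter_dataset_dict = ns.get_inter_dataset_distance()
    cf.precalculated_keyword_similarity_dict = ns.get_keyword_similarity()
    return baseline, ns.solve()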
def test_precalculated_word2vec(self):
    query = KeywordCoordinate(0, 0, ['family', 'food', 'outdoor'])
    kwc1 = KeywordCoordinate(1, 1, ['family', 'food', 'outdoor'])
    kwc2 = KeywordCoordinate(3, 3, ['food', 'family'])
    kwc3 = KeywordCoordinate(2, 2, ['outdoor'])
    data = [kwc1, kwc2, kwc3]
    model = calculate_model_subset(query, data, load_word2vec_model())
    cf = Type3(euclidean_distance, word2vec_cosine_similarity, 0.3, 0.3, 0.4,
               disable_thresholds=True, model=model)
    ns = NaiveSolver(query, data, cf)
    result = ns.solve()
    pre_qd = ns.get_query_dataset_distance()
    pre_id = ns.get_inter_dataset_distance()
    pre_ks = ns.get_keyword_similarity()
    cf.precalculated_query_dataset_dict = pre_qd
    cf.precalculated_inter_dataset_dict = pre_id
    cf.precalculated_keyword_similarity_dict = pre_ks
    result_pre = ns.solve()
    for index in range(len(result)):
        self.assertAlmostEqual(result[index][0], result_pre[index][0], delta=0.01)
        key_list = list(result[index][1])
        key_list_pre = list(result_pre[index][1])
        for list_index in range(len(key_list)):
            self.assertAlmostEqual(key_list[list_index].coordinates.x,
                                   key_list_pre[list_index].coordinates.x)
            self.assertAlmostEqual(key_list[list_index].coordinates.y,
                                   key_list_pre[list_index].coordinates.y)
            self.assertListEqual(key_list[list_index].keywords,
                                 key_list_pre[list_index].keywords)
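# For orientation, the shape of the objects these tests compare, inferred from
# the attribute accesses above. This is an assumed sketch, not the project's
# actual KeywordCoordinate definition.
class CoordinateSketch:
    def __init__(self, x: float, y: float):
        self.x = x
        self.y = y


class KeywordCoordinateSketch:
    def __init__(self, x: float, y: float, keywords: list):
        self.coordinates = CoordinateSketch(x, y)  # .coordinates.x / .coordinates.y
        self.keywords = keywords                   # list of keyword strings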
def main(argv):
    start_time = time.time()

    # Evaluator, instantiate it first for logging purposes
    ev = Evaluator()

    query: KeywordCoordinate = load_pickle('query.pickle')
    print('Query:', query)
    data: dataset_type = load_pickle('dataset.pickle')
    # print('Data:', dataset_comprehension(data))

    # Let's filter out by user radius
    # dataAux = sorted(data, key=lambda x: geographic_distance(x.coordinates, query.coordinates))
    # distances = [geographic_distance(x.coordinates, query.coordinates) >= RADIUS for x in dataAux]
    # print('------ Distances: ', distances)

    # Load precalculated values and models
    precalculated_inter_dataset_distances = load_pickle(
        'precalculated_inter_dataset_distances.pickle')
    precalculated_query_dataset_distances = load_pickle(
        'precalculated_query_dataset_distances.pickle')
    precalculated_query_dataset_keyword_similarities = load_pickle(
        'precalculated_query_dataset_keyword_similarities.pickle')
    # **** ONLY FOR word2vec model executions
    precalculated_query_dataset_keyword_similarities_word2vec = load_pickle(
        'precalculated_query_dataset_keyword_similarities_word2vec.pickle')
    word2vec_model = load_word2vec_model('word2vec_model.pickle')
    # ****

    # Define the CostFunctions. For all possible parameters refer to the documentation.
    cf1 = Type1(euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7)
    cf2 = Type2(euclidean_distance, word2vec_cosine_similarity, 0.2, 0.1, 0.7,
                model=word2vec_model)
    cf3 = Type1(
        euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=precalculated_query_dataset_keyword_similarities)
    cf4 = Type2(
        euclidean_distance, word2vec_cosine_similarity, 0, 0, 1.0,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=precalculated_query_dataset_keyword_similarities_word2vec,
        model=word2vec_model)
    cf5 = Type3(
        euclidean_distance, word2vec_cosine_similarity, 0.1, 0.1, 0.8,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=precalculated_query_dataset_keyword_similarities_word2vec,
        model=word2vec_model)
    cf6 = Type1(
        euclidean_distance, word2vec_cosine_similarity, 0.2, 0.1, 0.7,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=precalculated_query_dataset_keyword_similarities,
        model=word2vec_model)

    map_name = argv[0]
    # map_name = 'London_mini'

    # Choose which Solvers to use. For all possible parameters refer to the documentation.
    max_number_of_processes = mp.cpu_count()
    ns1 = NaiveSolver(query, data, cf2, result_length=5, max_subset_size=3,
                      max_number_of_concurrent_processes=max_number_of_processes,
                      _map=map_name)
    # ns2 = NaiveSolver(query, data, cf5, result_length=5, max_subset_size=3,
    #                   max_number_of_concurrent_processes=max_number_of_processes, _map=map_name)
    # ns3 = NaiveSolver(query, data, cf3, result_length=5, max_subset_size=6,
    #                   max_number_of_concurrent_processes=max_number_of_processes)
    # ns4 = NaiveSolver(query, data, cf6, result_length=5, max_subset_size=3,
    #                   max_number_of_concurrent_processes=max_number_of_processes, _map=map_name)

    # Add Solvers to Evaluator
    ev.add_solver(ns1)
    # ev.add_solver(ns2)
    # ev.add_solver(ns4)

    # Only for debugging: calculate and print the physical distances between
    # items in the dataset and the query location.
    # distances = [geographic_distance(x.coordinates, query.coordinates) for x in data]
    # print('------ Distances: ', distances)

    # Run Evaluator and fetch results
    ev.evaluate()
    results = ev.get_results()
    timings = ev.get_timings()
    write_csv(map_name, results, timings)
    print('*** Solution -', solution_list_comprehension(results))
    # print('*** Timing -', timing_list_comprehension(timings))

    # Plot the query point and the top-5 result subsets on a Google map.
    initialLat = []
    initialLon = []
    keywords = []
    gmap = gmplot.GoogleMapPlotter(query.coordinates.x, query.coordinates.y, 14)
    colors = ['red', 'blue', 'green', 'purple', 'orange']
    # Third dimension is the order of the solution (best: 0, second best: 1, ...)
    for i in range(5):
        lats = []
        lons = []
        for kwc in results[0][0][i][1]:
            lats.append(kwc.coordinates.x)
            lons.append(kwc.coordinates.y)
            keywords.append(kwc.keywords)
        for j in range(len(lats)):
            gmap.marker(lats[j], lons[j], color=colors[i])
        gmap.polygon(lats, lons, color='cornflowerblue', edge_width=7)
    # initialLat.append(query.coordinates.x)
    # initialLon.append(query.coordinates.y)
    # gmap.scatter(initialLat, initialLon, '#00FF00', size=70, marker=False)
    # gmap.scatter(lats, lons, '#FF0000', size=50, marker=False)
    # gmap.plot(lats, lons, 'cornflowerblue', edge_width=3.0)
    # gmap.polygon(lats, lons, color='cornflowerblue', edge_width=10)
    # gmap.scatter(lats, lons, color='#3B0B39', size=40, marker=False)

    # Your Google API key
    # gmap.apikey = "API_Key"

    # Save the map to HTML
    # gmap.scatter(lats, lons, '#FF0000', size=40, marker=True)
    gmap.marker(query.coordinates.x, query.coordinates.y, color='cornflowerblue',
                title='Query point')
    gmap.draw(r"graphic_results.html")

    print("--- %s seconds ---" % (time.time() - start_time))
# Code
print('Loading CSV', csv_file_name)
data = load_csv(file_name=csv_file_name, x_coordinate_index=x_index, y_coordinate_index=y_index,
                keywords_index=keyword_index, keywords_delimiter=keyword_delimiter,
                delimiter=csv_delimiter, quotechar=csv_quotechar, max_read_length=max_read_length)
if len(data) > 0:
    print('Example Datapoint:', data[0].coordinates.x, data[0].coordinates.y, data[0].keywords)
    write_pickle(data=data, file_name=data_target_name, file_allow_overwrite=file_allow_overwrite)
else:
    print('Could not load any data.')
print("--- %s seconds ---" % (time.time() - start_time))

word2vec_model_name = 'model.pickle'
word2vec_model = load_word2vec_model(word2vec_model_name)

for i in range(13, 79):
    print('++++++++++++++++++')
    print('Query --> ', i)
    print('++++++++++++++++++')
    iteration_start_time = time.time()

    # ***********************
    # preprocess_csv_query.py
    # ***********************
    in_start_time = time.time()

    # Config