def evaluation_on_feature_prediction_all_users(feature_name):
    """Run the per-user prediction evaluation for ``feature_name`` over every
    user and aggregate the classifier scores.

    feature_name -- one of "day", "location" or "applaunch"; selects which
                    per-user evaluation function (eodpou/eolpou/eoapou) is run.

    Side effects: writes two dictionaries to a log file named after the
    feature -- the scores combined over all users (mixed_evaluations) and the
    per-user detail (detailed_evaluations).

    Raises Exception when feature_name is not one of the supported values.
    """
    # scores of the different predictors combined over all the users
    mixed_evaluations = {}
    # per-user score details of the different predictors
    detailed_evaluations = {}
    # number of users already integrated, used for the running averages
    # (renamed from `iter`, which shadowed the builtin)
    users_done = 0
    for user_id in MDataExtractor.users_ids_list():
        print("user " + str(user_id) + " on working")
        if feature_name == "day":
            evaluations, classes = eodpou(user_id)
            file_name = "evaluation_day_prediction"
        elif feature_name == "location":
            evaluations, classes = eolpou(user_id)
            file_name = "evaluation_location_prediction"
        elif feature_name == "applaunch":
            evaluations, classes = eoapou(user_id)
            file_name = "evaluation_applaunch_prediction"
        else:
            # fixed grammar of the original message
            raise Exception("NOT IMPLEMENTED FEATURE EXCEPTION: the feature "
                            + str(feature_name)
                            + " is not implemented or does not exist")

        if not detailed_evaluations:
            # first user: create one entry per classifier in both dictionaries
            for classifier_name in evaluations:
                mixed_evaluations[classifier_name] = {
                    "correct predictions": 0,
                    "total predictions": 0,
                    "accuracy": 0,
                    "average accuracy": 0,
                    "average macro_accuracy": 0,
                }
                detailed_evaluations[classifier_name] = {}

        # integrate the evaluations of the current user
        for classifier_name, evaluation_results in evaluations.items():
            # evaluation_results = [good_predictions, total_predictions,
            #   accuracy, macro_average_acc_by_class,
            #   accuracy_class1, ..., accuracy_classn]
            mixed = mixed_evaluations[classifier_name]
            mixed["correct predictions"] += evaluation_results[0]
            mixed["total predictions"] += evaluation_results[1]
            mixed["accuracy"] = (mixed["correct predictions"] * 1.0) \
                / mixed["total predictions"]
            # incremental (running) means over the users seen so far
            mixed["average accuracy"] = (
                (mixed["average accuracy"] * users_done)
                + evaluation_results[2]) / ((users_done + 1) * 1.0)
            mixed["average macro_accuracy"] = (
                (mixed["average macro_accuracy"] * users_done)
                + evaluation_results[3]) / ((users_done + 1) * 1.0)
            # update the detailed evaluations
            detailed_evaluations[classifier_name]["user " + str(user_id)] = {
                "correct predictions": evaluation_results[0],
                "total predictions": evaluation_results[1],
                "accuracy": evaluation_results[2],
                "macro_accuracy": evaluation_results[3],
                "accuracy by class": evaluation_results[4:],
            }
        users_done += 1
        print("user " + str(user_id) + " extracted")

    # write the dictionaries into a log file
    out = LogsFileWriter.open(file_name)
    LogsFileWriter.write("predictions on the classes "
                         + JsonUtils.dict_as_json_str(classes) + "\n", out)
    LogsFileWriter.write("Total scores :\n", out)
    LogsFileWriter.write(JsonUtils.dict_as_json_str(mixed_evaluations), out)
    LogsFileWriter.write("detailed scores :\n", out)
    LogsFileWriter.write(JsonUtils.dict_as_json_str(detailed_evaluations), out)
    LogsFileWriter.close(out)
def compare(reference_transformation, user_id):
    """Compare feature importances and ranks across transformations.

    Rebuilds the module-level ``labels_importance`` and
    ``labels_importance_rank`` dictionaries from ``transformation_vectors``,
    sorts them by the scores of ``reference_transformation``, logs them, and
    draws comparison plots.

    NOTE(review): depends on module globals ``transformation_vectors``,
    ``rows_labels`` and ``file_name`` being set by the caller -- confirm
    against the enclosing module.
    """
    global labels_importance
    global labels_importance_rank
    #global labels_importance_derivative
    # row index into the per-transformation score/rank vectors; advances in
    # lockstep with rows_labels
    index = 0
    transformations = transformation_vectors.keys()
    for label in rows_labels:
        labels_importance[label] = {}
        labels_importance_rank[label] = {}
        for transformation in transformations:
            # vector [0] holds the scores, vector [1] holds the ranks
            labels_importance[label][transformation]=transformation_vectors[transformation][0][index]
            labels_importance_rank[label][transformation]= transformation_vectors[transformation][1][index]
            #labels_importance_derivative[label][transformation]= transformation_vectors[transformation][2][index]
        index +=1
    # sort the dictionaries per presence rate: the most frequent feature at
    # the beginning (scores descending, ranks ascending)
    labels_importance = collections.OrderedDict(sorted(labels_importance.items(), key=lambda x: x[1][reference_transformation], reverse = True))
    #labels_importance_derivative = collections.OrderedDict(sorted(labels_importance_derivative.items(), key=lambda x: x[1][reference_transformation], reverse = True))
    labels_importance_rank = collections.OrderedDict(sorted(labels_importance_rank.items(), key=lambda x: x[1][reference_transformation]))
    print JsonUtils.dict_as_json_str(labels_importance)
    print JsonUtils.dict_as_json_str(labels_importance_rank)
    #print np.shape(data_matrix)
    # write the dictionaries into files
    out = LogsFileWriter.open(file_name)
    LogsFileWriter.write(JsonUtils.dict_as_json_str(labels_importance),out)
    LogsFileWriter.write(JsonUtils.dict_as_json_str(labels_importance_rank),out)
    LogsFileWriter.close(out)
    # plot the records importance vs different transformation scores:
    # one score list / rank list / derivative list per transformation
    importances_list = []
    importances_legends = []
    ranks_list = []
    ranks_legends = []
    importances_derivatives_list = []
    importances_derivatives_legends = []
    for transformation in transformations:
        importance_list = [importance[transformation] for importance in labels_importance.values()]
        importances_list.append(importance_list)
        importances_legends.append(transformation)
        rank_list = [rank[transformation] for rank in labels_importance_rank.values()]
        ranks_list.append(rank_list)
        ranks_legends.append(transformation)
        # first-order discrete derivative of the score curve
        # (one element shorter than importance_list)
        importance_derivative_list = np.diff(np.asarray(importance_list), 1).tolist()
        importances_derivatives_list.append(importance_derivative_list)
        importances_derivatives_legends.append(transformation)
    # horizontal y=0 baseline appended for the derivative plot
    importances_derivatives_list.append([0]*len(importances_derivatives_list[0]))
    importances_derivatives_legends.append("y=0")
    PlotlibDrawer.plot_1(labels_importance.keys(), [percentage["presence_percentage"] for percentage in labels_importance.values()], "features rank", "% records", "presence rate of the features in the records", 10)
    PlotlibDrawer.plot_2(labels_importance.keys(), importances_list, importances_legends, "features rank", "features scores", "comparison of different transformation scores "+str(user_id), 11)
    PlotlibDrawer.plot_2(labels_importance_rank.keys(), ranks_list, ranks_legends, "features initial rank", "features rank after transformation", "comparison of different transformation ranks "+str(user_id), 11)
    PlotlibDrawer.plot_2(labels_importance.keys(), importances_derivatives_list, importances_derivatives_legends, "features initial rank", "features scores derivative", "comparison of different transformation scores derivative "+str(user_id), 11)
#!/usr/bin/env python
"""Compute the features presence rate for every user and write it to a log.

Scans the json data directory for each user's validated log file, computes
the per-user features presence rate and appends all results into one log
file ("features_presence_rate").
"""
import sys
import pprint as pp
import os.path

sys.path.insert(0, "/home/dehajjik/workspace/src/utils")
from features_presence_rate_one_user import features_presence_rate_one_user
from logs_file_writer import LogsFileWriter

json_data_dir = "/speech/dbwork/mul/reco1/AppPrediction/SonyLogging/Logs/from_TKY/pulled_from_TKY/mixs_launcher_logs/json/"

# array containing the path to the validated json data for each user
users_json_files_array = [json_data_dir + x + "/all/all_in_one_validated_log.json"
                          for x in os.listdir(json_data_dir)]
pp.pprint(users_json_files_array)

# accumulate the per-user reports in a list and join once at the end
# (the original repeated string concatenation is quadratic)
report_parts = []
user_number = 1
for json_file in users_json_files_array:
    if os.path.isfile(json_file):
        report_parts.append("\n \n \n user " + str(user_number) + "\n"
                            + features_presence_rate_one_user(json_file))
        print("user " + str(user_number) + " extracted")
        user_number += 1
features_presence_rate = ''.join(report_parts)

# write the results to the log file
stream = LogsFileWriter.open("features_presence_rate")
LogsFileWriter.write(features_presence_rate, stream)
LogsFileWriter.close(stream)
"""List the different event types and their occurrence counts in the dataset.

Writes one event list per user plus the aggregated, frequency-sorted list of
all users to the "events_list" log file.
"""
import sys          # BUG FIX: sys.path was used below without importing sys
import json         # BUG FIX: json was used below without an explicit import
import collections

sys.path.insert(0, "/home/dehajjik/workspace/src/utils")
from data_utils import *
from list_events_one_user import list_events_one_user as leou
from logs_file_writer import LogsFileWriter

stream = LogsFileWriter.open("events_list")
LogsFileWriter.write(
    "different events types and their number of occurrences in the dataset \n\n",
    stream)

# aggregated occurrence counts over every user
events_list = collections.Counter()
for user_id in DataExtractor.users_ids_list():
    current_list = collections.Counter(leou(user_id))
    events_list += current_list
    LogsFileWriter.write("\n\n user " + str(user_id) + " event list : \n", stream)
    # json.dumps already returns a string; the original str() wrapper was redundant
    LogsFileWriter.write(json.dumps(current_list, indent=4), stream)
    print("user " + str(user_id) + " extracted")

# sort the events by decreasing number of occurrences
events_list = collections.OrderedDict(
    sorted(events_list.items(), key=lambda t: t[1], reverse=True))
LogsFileWriter.write("\n\n all users notification list : \n", stream)
LogsFileWriter.write(json.dumps(events_list, indent=4), stream)
LogsFileWriter.close(stream)
def evaluation_on_feature_prediction_all_users(feature_name):
    """Evaluate the ``feature_name`` predictors on every user and aggregate.

    feature_name -- "day", "location" or "applaunch"; picks the per-user
                    evaluation routine (eodpou/eolpou/eoapou).

    Side effects: logs the combined scores (over all users) and the per-user
    detailed scores to a feature-specific log file.

    Raises Exception for an unsupported feature_name.
    """
    # scores of the different predictors combined over all the users
    mixed_evaluations = {}
    # per-user score details of the different predictors
    detailed_evaluations = {}
    # count of users folded into the running averages
    # (renamed from `iter`, which shadowed the builtin)
    users_done = 0
    for user_id in MDataExtractor.users_ids_list():
        print("user " + str(user_id) + " on working")
        if feature_name == "day":
            evaluations, classes = eodpou(user_id)
            file_name = "evaluation_day_prediction"
        elif feature_name == "location":
            evaluations, classes = eolpou(user_id)
            file_name = "evaluation_location_prediction"
        elif feature_name == "applaunch":
            evaluations, classes = eoapou(user_id)
            file_name = "evaluation_applaunch_prediction"
        else:
            # fixed grammar of the original message
            raise Exception("NOT IMPLEMENTED FEATURE EXCEPTION: the feature "
                            + str(feature_name)
                            + " is not implemented or does not exist")

        if not detailed_evaluations:
            # first user: instantiate one entry per classifier in both dicts
            for classifier_name in evaluations:
                mixed_evaluations[classifier_name] = {
                    "correct predictions": 0,
                    "total predictions": 0,
                    "accuracy": 0,
                    "average accuracy": 0,
                    "average macro_accuracy": 0,
                }
                detailed_evaluations[classifier_name] = {}

        # integrate the evaluations of the current user
        for classifier_name, evaluation_results in evaluations.items():
            # evaluation_results = [good_predictions, total_predictions,
            #   accuracy, macro_average_acc_by_class,
            #   accuracy_class1, ..., accuracy_classn]
            mixed = mixed_evaluations[classifier_name]
            mixed["correct predictions"] += evaluation_results[0]
            mixed["total predictions"] += evaluation_results[1]
            mixed["accuracy"] = (mixed["correct predictions"] * 1.0) \
                / mixed["total predictions"]
            # running means over the users processed so far
            mixed["average accuracy"] = (
                (mixed["average accuracy"] * users_done)
                + evaluation_results[2]) / ((users_done + 1) * 1.0)
            mixed["average macro_accuracy"] = (
                (mixed["average macro_accuracy"] * users_done)
                + evaluation_results[3]) / ((users_done + 1) * 1.0)
            # update the detailed evaluations
            detailed_evaluations[classifier_name]["user " + str(user_id)] = {
                "correct predictions": evaluation_results[0],
                "total predictions": evaluation_results[1],
                "accuracy": evaluation_results[2],
                "macro_accuracy": evaluation_results[3],
                "accuracy by class": evaluation_results[4:],
            }
        users_done += 1
        print("user " + str(user_id) + " extracted")

    # write the dictionaries into a log file
    out = LogsFileWriter.open(file_name)
    LogsFileWriter.write("predictions on the classes "
                         + JsonUtils.dict_as_json_str(classes) + "\n", out)
    LogsFileWriter.write("Total scores :\n", out)
    LogsFileWriter.write(JsonUtils.dict_as_json_str(mixed_evaluations), out)
    LogsFileWriter.write("detailed scores :\n", out)
    LogsFileWriter.write(JsonUtils.dict_as_json_str(detailed_evaluations), out)
    LogsFileWriter.close(out)
def location_distribution_per_hour_one_user(user_id, option):
    """Plot the hourly distribution of a user's k most frequent locations.

    user_id -- user whose location data is loaded through DataExtractor
    option  -- filtering option (one of the module-level ``options``: week-end
               days / week days / all days); when None the module-level
               ``hardcoded_option`` is used.

    Side effects: draws the top-k Pr[location|hour] distribution with
    PlotlibDrawer and logs the metadata of the top-k location clusters.

    NOTE(review): relies on module globals ``options``, ``hardcoded_option``
    and ``k`` -- confirm they are defined in the enclosing module.
    """
    # identity test against the None sentinel instead of `!= None`
    if option is not None:
        current_option = option
    else:
        current_option = hardcoded_option

    complete_data = DataExtractor.load_json_data(user_id)
    # location realizations ordered by their date key
    location_data = collections.OrderedDict(
        sorted(
            DataExtractor.data(DataExtractor.location_name,
                               complete_data).items()))
    location_metadata = DataExtractor.metadata(DataExtractor.location_name,
                                               complete_data)

    # counts per hour-of-day (24 bins) for each location
    location_count = {}
    # total number of marked hours (visits) for each location
    location_freq = {}
    # guarantee that the same date is not marked more than once when several
    # realizations map to the same hour
    most_recent_date = None
    most_recent_location = None
    iteration = 0  # renamed from `iter`, which shadowed the builtin
    for date, location_feature in location_data.iteritems():
        print("iteration " + str(iteration) + " from " + str(len(location_data)))
        iteration += 1
        location = location_feature[DataExtractor.location_attribute]
        if location not in location_count:
            location_count[location] = np.zeros(24)
            location_freq[location] = 0.0
        start_date = DataExtractor.start_date_of_realization(date)
        end_date = DataExtractor.end_date_of_realization(date)
        # one datetime per hour covered between start date and end date
        hours_covered = DateTimeUtils.hours_between(start_date, end_date)
        # increment by 1 the concerned hours for this location
        for datetime in hours_covered:
            if datetime != most_recent_date or location != most_recent_location:
                # either a new date to mark or a new location
                if ((current_option == options[0]
                     and DateTimeUtils.is_week_end_day(datetime))
                        or (current_option == options[1]
                            and not DateTimeUtils.is_week_end_day(datetime))
                        or (current_option == options[2])):
                    # fits the option filter
                    location_count[location][datetime.hour] += 1.0
                    location_freq[location] += 1.0
                    most_recent_date = datetime
                    most_recent_location = location

    # order the locations by decreasing frequency
    location_freq = collections.OrderedDict(
        sorted(location_freq.items(), key=lambda tup: tup[1], reverse=True))
    ordered_locations = location_freq.keys()

    # stack the per-location hourly counts into an (n_locations, 24) matrix,
    # rows ordered by decreasing frequency (enumerate replaces the manual
    # `idex` counter)
    np_matrix = np.zeros((len(ordered_locations), 24))
    for row, location in enumerate(ordered_locations):
        np_matrix[row, :] = location_count[location]

    # normalize each hour column to get Pr[location|hour]; keep the top-k rows
    # NOTE(review): an hour with no visit at all yields a 0/0 division here --
    # confirm the data always covers every hour
    column_sums = np_matrix.sum(axis=0)
    location_dist = np_matrix[0:k, :] / column_sums[np.newaxis, :]

    ld_title = ("top-" + str(k) + " distribution of locations by hour "
                + str(current_option) + " for user " + str(user_id))
    ld_x = "hours"
    ld_y = "Pr[location|hour]"
    # legend: cluster id with its radius (r) and center (c) from the metadata
    legends = [
        str(location) + ", r="
        + str(location_metadata[DataExtractor.location_attribute][str(location)][
            DataExtractor.location_metaradius])
        + ", c="
        + str(location_metadata[DataExtractor.location_attribute][str(location)][
            DataExtractor.location_metacenter])
        for location in ordered_locations[0:k]
    ]
    PlotlibDrawer.plot_np(None, location_dist, legends, ld_x, ld_y, ld_title, 0)

    # log the content of the k most frequent location clusters
    stream = LogsFileWriter.open("most_frequent_location_clusters_"
                                 + str(current_option) + "_user" + str(user_id))
    for location in ordered_locations[0:k]:
        LogsFileWriter.write(
            "location " + str(location) + "\n"
            + JsonUtils.dict_as_json_str(location_metadata[
                DataExtractor.location_attribute][str(location)])
            + "\n\n\n", stream)
    LogsFileWriter.close(stream)
# NOTE(review): fragment -- the enclosing function and user loop are not
# visible in this chunk. `user_id`, `iter`, `file_name`, `mixed_evaluations`,
# `detailed_evaluations` and `epou` are all defined by the surrounding
# (unseen) code; the aggregation pattern mirrors
# evaluation_on_feature_prediction_all_users, but with perplexity scores.
# per-classifier perplexity scores for the current user
evaluations = epou(user_id)
if len(detailed_evaluations.keys()) == 0:
    # first user: instantiate one entry per classifier in both result dicts
    for classifier_name, evaluation_results in evaluations.iteritems():
        mixed_evaluations[classifier_name] = {"average perplexity": 0}
        detailed_evaluations[classifier_name] = {}
# integrate the evaluations of the current user
for classifier_name, perplexity in evaluations.iteritems():
    # running mean of perplexity over the `iter` users seen so far
    mixed_evaluations[classifier_name]["average perplexity"] = (
        (mixed_evaluations[classifier_name]["average perplexity"] * iter)
        + perplexity) / ((iter + 1) * 1.0)
    # update the detailed evaluations
    detailed_evaluations[classifier_name]["user " + str(user_id)] = {
        "perplexity": perplexity
    }
iter += 1
print("user " + str(user_id) + " extracted")
# write the dictionaries into files
out = LogsFileWriter.open(file_name)
LogsFileWriter.write("perplexity \n", out)
LogsFileWriter.write("Total scores :\n", out)
LogsFileWriter.write(JsonUtils.dict_as_json_str(mixed_evaluations), out)
LogsFileWriter.write("detailed scores :\n", out)
LogsFileWriter.write(JsonUtils.dict_as_json_str(detailed_evaluations), out)
LogsFileWriter.close(out)
def location_distribution_per_hour_one_user(user_id, option):
    """Plot the hourly distribution of a user's k most frequent locations.

    user_id -- user whose location data is loaded through DataExtractor
    option  -- filtering option (one of the module-level ``options``: week-end
               days / week days / all days); when None the module-level
               ``hardcoded_option`` is used.

    Side effects: draws the top-k Pr[location|hour] distribution with
    PlotlibDrawer and logs the metadata of the top-k location clusters.

    NOTE(review): relies on module globals ``options``, ``hardcoded_option``
    and ``k`` -- confirm they are defined in the enclosing module.
    """
    # identity test against the None sentinel instead of `!= None`
    if option is not None:
        current_option = option
    else:
        current_option = hardcoded_option

    complete_data = DataExtractor.load_json_data(user_id)
    # location realizations ordered by their date key
    location_data = collections.OrderedDict(
        sorted(
            DataExtractor.data(DataExtractor.location_name,
                               complete_data).items()))
    location_metadata = DataExtractor.metadata(DataExtractor.location_name,
                                               complete_data)

    # counts per hour-of-day (24 bins) for each location
    location_count = {}
    # total number of marked hours (visits) for each location
    location_freq = {}
    # guarantee that the same date is not marked more than once when several
    # realizations map to the same hour
    most_recent_date = None
    most_recent_location = None
    iteration = 0  # renamed from `iter`, which shadowed the builtin
    for date, location_feature in location_data.iteritems():
        print("iteration " + str(iteration) + " from " + str(len(location_data)))
        iteration += 1
        location = location_feature[DataExtractor.location_attribute]
        if location not in location_count:
            location_count[location] = np.zeros(24)
            location_freq[location] = 0.0
        start_date = DataExtractor.start_date_of_realization(date)
        end_date = DataExtractor.end_date_of_realization(date)
        # one datetime per hour covered between start date and end date
        hours_covered = DateTimeUtils.hours_between(start_date, end_date)
        # increment by 1 the concerned hours for this location
        for datetime in hours_covered:
            if datetime != most_recent_date or location != most_recent_location:
                # either a new date to mark or a new location
                if ((current_option == options[0]
                     and DateTimeUtils.is_week_end_day(datetime))
                        or (current_option == options[1]
                            and not DateTimeUtils.is_week_end_day(datetime))
                        or (current_option == options[2])):
                    # fits the option filter
                    location_count[location][datetime.hour] += 1.0
                    location_freq[location] += 1.0
                    most_recent_date = datetime
                    most_recent_location = location

    # order the locations by decreasing frequency
    location_freq = collections.OrderedDict(
        sorted(location_freq.items(), key=lambda tup: tup[1], reverse=True))
    ordered_locations = location_freq.keys()

    # stack the per-location hourly counts into an (n_locations, 24) matrix,
    # rows ordered by decreasing frequency (enumerate replaces the manual
    # `idex` counter)
    np_matrix = np.zeros((len(ordered_locations), 24))
    for row, location in enumerate(ordered_locations):
        np_matrix[row, :] = location_count[location]

    # normalize each hour column to get Pr[location|hour]; keep the top-k rows
    # NOTE(review): an hour with no visit at all yields a 0/0 division here --
    # confirm the data always covers every hour
    column_sums = np_matrix.sum(axis=0)
    location_dist = np_matrix[0:k, :] / column_sums[np.newaxis, :]

    ld_title = ("top-" + str(k) + " distribution of locations by hour "
                + str(current_option) + " for user " + str(user_id))
    ld_x = "hours"
    ld_y = "Pr[location|hour]"
    # legend: cluster id with its radius (r) and center (c) from the metadata
    legends = [
        str(location) + ", r="
        + str(location_metadata[DataExtractor.location_attribute][str(location)][
            DataExtractor.location_metaradius])
        + ", c="
        + str(location_metadata[DataExtractor.location_attribute][str(location)][
            DataExtractor.location_metacenter])
        for location in ordered_locations[0:k]
    ]
    PlotlibDrawer.plot_np(None, location_dist, legends, ld_x, ld_y, ld_title, 0)

    # log the content of the k most frequent location clusters
    stream = LogsFileWriter.open("most_frequent_location_clusters_"
                                 + str(current_option) + "_user" + str(user_id))
    for location in ordered_locations[0:k]:
        LogsFileWriter.write(
            "location " + str(location) + "\n"
            + JsonUtils.dict_as_json_str(location_metadata[
                DataExtractor.location_attribute][str(location)])
            + "\n\n\n", stream)
    LogsFileWriter.close(stream)