def compute_ghcm_mdt_one_user(user_id):
    file_name = "ghcm_mdt_user_" + str(user_id)
    print "loading matrix user " + str(user_id) + "..."
    [rfvdata, featuresnames, valuesnames, recordsdates] = RVFDataExtractor.load_rvf(user_id)
    print "values " + JsonUtils.dict_as_json_str(valuesnames)
    print "data " + JsonUtils.dict_as_json_str(rfvdata[0])
    vocab_size = [len(valuesnames[f]) for f in range(len(valuesnames))]
    print "user " + str(user_id) + " has " + str(len(featuresnames)) + " features and " + str(len(rfvdata)) + " records"
    print "features names"
    print featuresnames

    for k in [10, 20, 30]:
        # compute the ghcm_mdt decomposition for k topics
        ghcm_mdt_comp = GHCM_MDTComputer(rfvdata, k, vocab_size)
        print "computing GHCM_MDT for user " + str(user_id) + "..."
        ghcm_mdt_comp.compute()
        print "constructing interpretable output for user " + str(user_id) + "..."
        # disp_m is assumed to be defined at module level
        ghcm_mdt_comp.construct_rows_interpretable_output(featuresnames, valuesnames, disp_m)
        r_output = ghcm_mdt_comp.rows_interpretable_output

        # write the result
        print "writing GHCM_MDT result for user " + str(user_id) + "..."
        JsonLogsFileWriter.write(r_output, file_name)
def test_time_variances_for_array_feature(data, array_feature):
    time_variances_number = {}
    for record_id in data:
        if array_feature in data[record_id]:
            max_time = 0
            min_time = sys.maxint
            feature = data[record_id][array_feature]
            for entry in feature:
                current_time = long(entry['createDate'])
                if current_time >= max_time:
                    max_time = current_time
                if current_time <= min_time:
                    min_time = current_time
            time_variance = max_time - min_time
            if time_variance not in time_variances_number:
                time_variances_number[time_variance] = 0
            time_variances_number[time_variance] += 1
    time_variances_number = collections.OrderedDict(sorted(time_variances_number.items(), reverse=True))
    print "time variances distribution for " + array_feature
    print JsonUtils.dict_as_json_str(time_variances_number)
    print "\n \n"
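# A minimal driver sketch for the function above, assuming the data has been loaded via
# DataExtractor.load_json_data as elsewhere in this codebase; the feature names below are
# taken from the known_features lists used in other snippets, not prescribed by this one.
data = DataExtractor.load_json_data(1)
for array_feature in ["wifiAps", "notifications"]:
    test_time_variances_for_array_feature(data, array_feature)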
def weather_and_operating_features_extraction_one_user(user_id):
    out_path_prefix = "/home/dehajjik/workspace/logs/"
    data = DataExtractor.load_json_data(user_id)
    known_features = ["activityRecognitionResult", "androidActivityRecognitionResult", "appLaunch",
                      "battery", "bluetooth", "event", "notifications", "headsetPlug", "location",
                      "networkInfo", "sensor", "settingInfo", "telephony", "telephony2", "wifiAps",
                      "wifiConnectedAp", "launcherLayouts", "predictors", "neighboringCellInfo2",
                      "neighboringCellInfo"]

    # filter_notifications already adds to the records the attributes createDateTimeZone and rcreateDateTimeZone
    # check that the data is really ordered by date!!
    data = DataOperations.filter_notifications(data)

    # the data is sorted; notifications are filtered and annotated with the timezone date
    for record_id in data:
        record = data[record_id]
        for key, value in record.iteritems():
            if key not in known_features:
                if key not in other_features:  # other_features: module-level accumulator dict
                    other_features[key] = []
                if len(other_features[key]) < 100:
                    other_features[key].append(value)

    t = time.strftime("%Y%m%d") + time.strftime("%H%M%S")
    JsonUtils.save_json_data(out_path_prefix + "/" + t + "extra_features_u" + str(user_id), other_features)
    return other_features
def test_time_variances_in_one_record(data):
    time_variances_number = {}
    time_variances_feature_min = {}
    time_variances_feature_max = {}
    for record_id in data:
        max_time = 0
        min_time = sys.maxint
        max_feature = ""
        min_feature = ""
        record = data[record_id]
        for feature, value in record.iteritems():
            try:
                current_time = long(value['createDate'])
                if current_time > max_time:
                    max_time = current_time
                    max_feature = feature
                if current_time < min_time:
                    min_time = current_time
                    min_feature = feature
            except TypeError:
                # it is an array feature
                for entry in value:
                    current_time = long(entry['createDate'])
                    if current_time >= max_time:
                        max_time = current_time
                        max_feature = feature
                    if current_time <= min_time:
                        min_time = current_time
                        min_feature = feature
        time_variance = max_time - min_time
        if time_variance not in time_variances_number:
            time_variances_number[time_variance] = 0
        time_variances_number[time_variance] += 1
        if max_feature not in time_variances_feature_max:
            time_variances_feature_max[max_feature] = 0
        time_variances_feature_max[max_feature] += 1
        if min_feature not in time_variances_feature_min:
            time_variances_feature_min[min_feature] = 0
        time_variances_feature_min[min_feature] += 1
    time_variances_number = collections.OrderedDict(sorted(time_variances_number.items(), reverse=True))
    print "time variances distribution"
    print JsonUtils.dict_as_json_str(time_variances_number)
    print "\n \n"
def filter_notifications_one_user(user_id):
    out_path_prefix = "/speech/dbwork/mul/students/dehajjik/notifications_filtered/"
    data = DataExtractor.load_json_data(user_id)
    data = DataOperations.filter_notifications(data)
    JsonUtils.save_json_data(out_path_prefix + str(DataExtractor.user_long_ids[user_id]) + "/all/all_in_one_validated_log", data)
    return data
def transform(self):
    sony_activity_counts = self.count_sony_activity_realizations()
    android_activity_counts = self.count_android_activity_realizations()
    print "Sony activities duration (in minutes):"
    print JsonUtils.dict_as_json_str(sony_activity_counts)
    print "Android activities duration (in minutes):"
    print JsonUtils.dict_as_json_str(android_activity_counts)
    #self.exclusive_sony_activity_transform_one()
    self.exclusive_android_activity_transform_one()
def extract_realizations_in_time_range_soft_version_optimized_for_sorted_data_copy_verbose(feature_realizations_sorted_copy, realization_key):
    target_realizations = []
    target_time_range = realization_key
    for current_time_range in feature_realizations_sorted_copy.keys():
        realization = feature_realizations_sorted_copy[current_time_range]
        if DataOperations.is_ended_before_the_start_of(current_time_range, target_time_range):
            # the current realization happened before the target time; as we assume that the target
            # times given as input to this method are increasing, we just remove this entry
            del feature_realizations_sorted_copy[current_time_range]
        elif DataOperations.does_date_overlaps(target_time_range, current_time_range):
            # the current realization overlaps with the target time, so we select it and extend the
            # target time to include the time range of the selected realization (the soft version property)
            print "gps " + JsonUtils.dict_as_json_str(realization) + " that occurred at time " + current_time_range + " included in the selection"
            target_realizations.append(realization)
            # as the target times are strictly increasing, delete this entry: it will never match another target time
            del feature_realizations_sorted_copy[current_time_range]
            target_time_range = DataOperations.union_of_date_intervals(target_time_range, current_time_range)
        elif DataOperations.is_ended_before_the_start_of(target_time_range, current_time_range):
            # the current time range started after the end of the target one, so we will not meet
            # any more realizations in the target time range
            #print current_time_range + ": CAUSED STOP LOOP \n"
            break
    #print "the selected realizations are \n" + JsonUtils.dict_as_json_str(target_realizations) + "\n\n\n\n"
    return target_realizations
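# A minimal usage sketch, assuming feature realizations keyed by "start->end" time-range strings
# as in the rest of this codebase. feature_data and sorted_targets are hypothetical names: the
# OrderedDict copy preserves the chronological order the function relies on, and is consumed
# destructively across successive, strictly increasing target ranges.
sorted_copy = collections.OrderedDict(sorted(feature_data.items()))
for target_range in sorted_targets:
    selected = extract_realizations_in_time_range_soft_version_optimized_for_sorted_data_copy_verbose(
        sorted_copy, target_range)
    print str(len(selected)) + " realizations selected for " + target_range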
def extract_specific_date_time_one_user(user_id):
    '''specific_date_times = {
        1: [datetime.datetime(year=2014, month=8, day=19, hour=16), datetime.datetime(year=2014, month=8, day=27, hour=15),
            datetime.datetime(year=2014, month=9, day=5, hour=18), datetime.datetime(year=2014, month=10, day=12, hour=15),
            datetime.datetime(year=2014, month=9, day=1, hour=1)],
        2: [datetime.datetime(year=2014, month=9, day=25, hour=7), datetime.datetime(year=2014, month=12, day=8, hour=6),
            datetime.datetime(year=2014, month=9, day=25, hour=1)],
        3: [datetime.datetime(year=2014, month=9, day=13, hour=0), datetime.datetime(year=2014, month=9, day=25, hour=17)],
        4: [datetime.datetime(year=2014, month=9, day=5, hour=14), datetime.datetime(year=2015, month=1, day=8, hour=11),
            datetime.datetime(year=2014, month=9, day=2, hour=13)],
        5: [datetime.datetime(year=2014, month=9, day=22, hour=18), datetime.datetime(year=2015, month=1, day=5, hour=13),
            datetime.datetime(year=2014, month=12, day=29, hour=13)],
        6: [datetime.datetime(year=2014, month=10, day=26, hour=3), datetime.datetime(year=2014, month=11, day=4, hour=8)],
        7: [datetime.datetime(year=2014, month=7, day=28, hour=10)]}'''
    specific_date_times = {
        1: [datetime.datetime(year=2014, month=10, day=12, hour=14),
            datetime.datetime(year=2014, month=10, day=12, hour=22)],
        2: [],
        3: [],
        4: [datetime.datetime(year=2014, month=12, day=9, hour=15)],
        5: [datetime.datetime(year=2014, month=12, day=5, hour=12),
            datetime.datetime(year=2014, month=12, day=16, hour=9)],
        6: [],
        7: []
    }
    out_path_prefix = "/home/dehajjik/workspace/resources/"
    data = DataExtractor.load_json_data(user_id)

    # for each specific date and hour, write the data that occurred at that time to a file, in JSON format
    for specific_dt in specific_date_times[user_id]:
        selected_data = DataExtractor.select_records_by_date_and_hour(data, specific_dt)

        # annotate the records with readable dates and sort the notifications chronologically
        for record in selected_data:
            record = DataOperations.annotate_with_readable_date_no_timezone(record)
            record = DataOperations.order_chronologically_notifications_and_annotate(record)

        # sort the records chronologically
        selected_data = DataOperations.order_chronologically_and_annotate(selected_data)
        JsonUtils.save_json_data(out_path_prefix + "u" + str(user_id) + " d" + specific_dt.strftime('%Y-%m-%d %H'), selected_data)
def output_sample(user_id):
    specific_date_times = {
        1: [datetime.datetime(year=2014, month=8, day=19, hour=16),
            datetime.datetime(year=2014, month=8, day=27, hour=15),
            datetime.datetime(year=2014, month=9, day=5, hour=18),
            datetime.datetime(year=2014, month=10, day=12, hour=15),
            datetime.datetime(year=2014, month=9, day=1, hour=1)],
        2: [datetime.datetime(year=2014, month=9, day=25, hour=7),
            datetime.datetime(year=2014, month=12, day=8, hour=6),
            datetime.datetime(year=2014, month=9, day=25, hour=1)],
        3: [datetime.datetime(year=2014, month=9, day=25, hour=17)],
        4: [datetime.datetime(year=2014, month=9, day=5, hour=14),
            datetime.datetime(year=2015, month=1, day=8, hour=11),
            datetime.datetime(year=2014, month=9, day=2, hour=13)],
        5: [datetime.datetime(year=2014, month=9, day=22, hour=18),
            datetime.datetime(year=2015, month=1, day=5, hour=13),
            datetime.datetime(year=2014, month=12, day=29, hour=13)],
        6: [datetime.datetime(year=2014, month=10, day=26, hour=3),
            datetime.datetime(year=2014, month=11, day=4, hour=8)],
        7: [datetime.datetime(year=2014, month=7, day=28, hour=10)]
    }
    out_path_prefix = "/home/dehajjik/workspace/resources/filtered_notifs/"
    data = DataExtractor.load_json_data(user_id)

    # for each specific date and hour, write the data that occurred at that time to a file, in JSON format
    for specific_dt in specific_date_times[user_id]:
        selected_data = DataExtractor.select_records_by_date_and_hour(data, specific_dt)
        selected_data = DataOperations.order_chronologically_and_annotate(selected_data)
        #selected_data = DataOperations.annotate(selected_data)
        JsonUtils.save_json_data(out_path_prefix + "u" + str(user_id) + " d" + specific_dt.strftime('%Y-%m-%d %H'), selected_data)
        print(str(json.dumps(selected_data.keys(), indent=4)))
def transform_to_categorized_data_one_user(user_id):
    out_path_prefix = "/speech/dbwork/mul/students/dehajjik/categorized_data/"
    data_key = "data"
    metadata_key = "metadata"

    print "loading data for user " + str(user_id)
    nontransformed_data = DataExtractor.load_json_data(user_id)
    #nontransformed_data = JsonUtils.load_json_data("/home/dehajjik/workspace/resources/sample_data_for_location_categorization_test.json")

    # the transformers responsible for the feature categorization of the data
    feature_transformers = {
        LocationTransformer.transformed_feature_name: LocationTransformer(nontransformed_data),
        NotificationTransformer.transformed_feature_name: NotificationTransformer(nontransformed_data),
        ApplaunchTransformer.transformed_feature_name: ApplaunchTransformer(nontransformed_data),
        BatteryTransformer.transformed_feature_name: BatteryTransformer(nontransformed_data),
        HeadsetTransformer.transformed_feature_name: HeadsetTransformer(nontransformed_data),
        BluetoothPairedTransformer.transformed_feature_name: BluetoothPairedTransformer(nontransformed_data),
        BluetoothSeenTransformer.transformed_feature_name: BluetoothSeenTransformer(nontransformed_data),
        ActivityTransformer.transformed_feature_name: ActivityTransformer(nontransformed_data)
    }

    # the features that we want to transform
    selected_features = [
        LocationTransformer.transformed_feature_name,
        NotificationTransformer.transformed_feature_name,
        ApplaunchTransformer.transformed_feature_name,
        #BatteryTransformer.transformed_feature_name,
        #HeadsetTransformer.transformed_feature_name,
        BluetoothPairedTransformer.transformed_feature_name,
        #BluetoothSeenTransformer.transformed_feature_name,
        ActivityTransformer.transformed_feature_name
    ]
    #selected_features = [ActivityTransformer.transformed_feature_name]

    categorized_data = {data_key: {}, metadata_key: {}}
    for feature in selected_features:
        feature_transformers[feature].transform()
        if (feature_transformers[feature].transformed_feature_data != {None: None}
                and feature_transformers[feature].transformed_feature_metadata != {None: None}):
            categorized_data[data_key][feature] = feature_transformers[feature].transformed_feature_data
            categorized_data[metadata_key][feature] = feature_transformers[feature].transformed_feature_metadata

    JsonUtils.save_json_data(out_path_prefix + str(DataExtractor.user_long_ids[user_id]) + "/all/all_in_one_validated_log", categorized_data)
    return categorized_data
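# The transformers above all share an implicit contract: a transformed_feature_name class
# attribute, a constructor taking the raw data, a transform() method, and the result dicts
# transformed_feature_data / transformed_feature_metadata, with {None: None} as the "empty"
# sentinel. A minimal sketch of that implied interface, as a hypothetical base class that
# does not exist in the original code:
class FeatureTransformerBase(object):
    transformed_feature_name = "abstract"

    def __init__(self, nontransformed_data):
        self.nontransformed_data = nontransformed_data
        # {None: None} is the sentinel meaning "nothing could be transformed"
        self.transformed_feature_data = {None: None}
        self.transformed_feature_metadata = {None: None}

    def transform(self):
        raise NotImplementedError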
def print_times_for_specific_locations(data):
    accuracies = {}
    big = 0
    small = 0
    for record_id in data:
        if "location" in data[record_id]:
            accuracy = data[record_id]["location"]["accuracy"]
            if accuracy not in accuracies:
                accuracies[accuracy] = 0
            accuracies[accuracy] += 1
            if accuracy > 200:
                big += 1
            else:
                small += 1
    accuracies = collections.OrderedDict(sorted(accuracies.items(), reverse=True))
    print "accuracies for location are:"
    print JsonUtils.dict_as_json_str(accuracies)
    print "there are " + str(big) + " accuracies bigger than 200 meters out of a total of " + str(big + small)
    print "\n \n"
def write(content, file_suffix):
    t = time.strftime("%Y%m%d") + time.strftime("%H%M%S")
    log_file_name = LogsFileWriter.log_dir + t + file_suffix
    JsonUtils.save_json_data(log_file_name, content)
"seq":[23] } }, "wifiAps": {"2015-01-01 00:55:00 , 1420073700000->2015-01-01 01:10:01 , 1420074601000":[ {"ssid":"w1", "seq":[2,3,4]} ], "2015-01-01 01:10:01 , 1420074601000->2015-01-01 02:00:00 , 1420077600000":[ {"ssid":"w1", "seq":[5,6,7,8]}, {"ssid":"w2", "seq":[5,6,7,8]}, ], "2015-01-01 03:00:01 , 1420081201000->2015-01-01 03:05:00 , 1420081500000":[ {"ssid":"w3", "seq":[9]} ], "2015-01-01 04:05:00 , 1420085100000->2015-01-01 05:00:00 , 1420088400000":[ {"ssid":"w2", "seq":[10,11]}, {"ssid":"w3", "seq":[10,11]} ], "2015-01-01 19:15:00 , 1420139700000->2015-01-01 21:00:00 , 1420146000000":[ {"ssid":"w4", "seq": [21,22]} ] } } JsonUtils.save_json_data( "/home/dehajjik/workspace/resources/sample_data_for_location_categorization_test", data)
def transform_to_matrix_one_user(user_id):
    print "loading data for user " + str(user_id)
    categorized_data = DataExtractor.load_json_data(user_id)
    data = DataExtractor.complete_data(categorized_data)
    metadata = DataExtractor.complete_metadata(categorized_data)

    # order the data by the alphabetic name of the features
    print "ordering data " + str(user_id)
    data = collections.OrderedDict(sorted(data.items()))

    # get the first date and the last date
    print "getting first date and last date"
    end_date = date_min
    start_date = datetime.now()
    for feature, feature_data in data.iteritems():
        feature_data = collections.OrderedDict(sorted(feature_data.items()))
        begin_date = DataExtractor.start_date_of_realization(feature_data.keys()[0])
        if begin_date < start_date:
            start_date = begin_date
        last_date = DataExtractor.start_date_of_realization(feature_data.keys()[len(feature_data.keys()) - 1])
        if last_date > end_date:
            end_date = last_date
        data[feature] = feature_data

    # construct the data matrix
    # I- construct the matrices of all the features
    print "constructing the matrices"
    rows = 0
    transformers = {}
    for feature, feature_data in data.iteritems():
        if feature == "location":
            transformers[feature] = MatrixLocationFeatureTransformer(feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)
        elif feature == "bluetoothSeen" or feature == "bluetoothPaired":
            transformers[feature] = MatrixBleutoothFeatureTransformer(feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)
        else:
            transformers[feature] = MatrixFeatureTransformer(feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)
        if feature in features_importance_score_one:
            transformers[feature].let_importance_scores_to_1 = True
        transformers[feature].transform()
        rows += transformers[feature].nbdimentions

    # construct the time feature
    transformers[MatrixTimeFeatureTransformer.feature_name] = MatrixTimeFeatureTransformer(start_date, end_date, coocurring_precision)
    transformers[MatrixTimeFeatureTransformer.feature_name].transform()
    rows += transformers[MatrixTimeFeatureTransformer.feature_name].nbdimentions
    columns = transformers[MatrixTimeFeatureTransformer.feature_name].nbtimeslots

    # II- concatenate all the matrices of each feature into one big matrix (do the same for the labels vector)
    print "regrouping the matrices"
    data_matrix = np.zeros((columns, rows))
    labels_vector = [""] * rows
    dimentions_importance_score = np.zeros(rows)
    transformers = collections.OrderedDict(sorted(transformers.items()))
    begin_row_index = 0
    end_row_index = 0
    for feature, feature_transformer in transformers.iteritems():
        end_row_index = begin_row_index + feature_transformer.nbdimentions
        data_matrix[:, begin_row_index:end_row_index] = feature_transformer.matrix_data
        labels_vector[begin_row_index:end_row_index] = feature_transformer.labels_vector
        dimentions_importance_score[begin_row_index:end_row_index] = feature_transformer.realization_importance_score
        begin_row_index = end_row_index

    '''
    The matrix contains a lot of feature vectors that contain 0 in all the features except the
    time features. Those vectors correspond to the times where no record has been made.
    We want to eliminate those timestamps and their corresponding times
    '''
    time_vector = transformers.values()[0].time_vector
    [data_matrix, time_vector] = eliminate_empty_records(data_matrix, time_vector)
    data_matrix = np.transpose(data_matrix)

    print "the labels are:"
    print JsonUtils.dict_as_json_str(labels_vector)
    print "first date of observation " + str(start_date)
    print "last date of observation " + str(end_date)
    print "dimension of the labels (features) vector: " + str(len(labels_vector))
    print "dimension of the time vector: " + str(len(time_vector))
    print "dimension of the resulting matrix (features, time): " + str(data_matrix.shape)
    print "the number of non zero values is: " + str(np.count_nonzero(data_matrix)) + "/" + str(np.size(data_matrix))
    print "the number of negative values in the matrix is: " + str(np.size(ma.masked_array(data_matrix, mask=(data_matrix >= 0)).compressed()))
    print "the data matrix printed:"
    print Numpy.str(data_matrix)

    # write the matrix data
    MDataExtractor.save_matrix(user_id, data_matrix)
    # write the labels vector, then the time vector and the importance scores
    MDataExtractor.save_labels_vector(user_id, labels_vector)
    MDataExtractor.save_time_vector(user_id, time_vector)
    MDataExtractor.save_importance_scores(user_id, dimentions_importance_score)
def clean_and_write_data_one_user(user_id):
    # empty the global accumulators
    global clean_data
    clean_data = {}
    global last_realization_val
    last_realization_val = {}
    global last_realization_key
    last_realization_key = {}
    '''
    For each feature, we want to know the distribution of the time differences observed between
    two realizations. For that reason we store in time_variances, for each feature, the number of
    times that the difference between two realizations was x minutes. Note that the maximum time
    variance allowed is timeout_in_minutes.
    '''
    global time_variances
    time_variances = {}

    out_path_prefix = "/speech/dbwork/mul/students/dehajjik/clean_data/"
    data = DataExtractor.load_json_data(user_id)
    DataOperations.print_times_for_specific_locations(data)

    # filter_notifications already adds to the records the attributes createDateTimeZone and rcreateDateTimeZone
    # check that the data is really ordered by date!!
    data = DataOperations.filter_notifications(data)

    # the data is sorted; notifications are filtered and annotated with the timezone date
    for record_id in data:
        record = data[record_id]
        event_type = record['event']['type']
        for key, value in record.iteritems():
            # test if it has one blacklisted value; if it is the case, ignore it
            do_ignore = False
            is_array_attribute = False
            if key in blacklisted_values:
                for attribute, black_values in blacklisted_values[key].iteritems():
                    try:
                        if value[attribute] in black_values:
                            # the current realization contains one blacklisted value, so we need to ignore it
                            do_ignore = True
                    except TypeError:
                        # the feature we have is an array feature
                        is_array_attribute = True
                        break
                if is_array_attribute:
                    # the feature is an array feature, so we go through all the elements and
                    # remove the blacklisted ones if they exist
                    temp_value = []
                    for one_value in value:
                        do_remove = False
                        for attribute, black_values in blacklisted_values[key].iteritems():
                            if one_value[attribute] in black_values:
                                do_remove = True
                        if not do_remove:
                            temp_value.append(one_value)
                    if len(temp_value) >= 1:
                        value = temp_value
                    else:
                        # all the values were removed, so ignore this entry
                        do_ignore = True
            if not do_ignore:
                if key == "activityRecognitionResult":
                    activityRecognitionResult_update(value, event_type)
                elif key == "androidActivityRecognitionResult":
                    androidActivityRecognitionResult_update(value, event_type)
                elif key == "appLaunch":
                    appLaunch_update(value, event_type)
                elif key == "battery":
                    battery_update(value, event_type)
                elif key == "bluetooth":
                    bluetooth_update(value, event_type)
                elif key == "event":
                    event_update(value, event_type)
                elif key == "notifications":
                    notifications_update(value, event_type)
                elif key == "headsetPlug":
                    headsetPlug_update(value, event_type)
                elif key == "location":
                    location_update(value, event_type)
                elif key == "networkInfo":
                    networkInfo_update(value, event_type)
                elif key == "sensor":
                    sensor_update(value, event_type)
                elif key == "settingInfo":
                    settingInfo_update(value, event_type)
                elif key == "telephony" or key == "telephony2":
                    telephony_update(value, event_type)
                elif key == "wifiAps":
                    wifiApps_update(value, event_type)
                elif key == "wifiConnectedAp":
                    wifiConnectedApp_update(value, event_type)
                '''elif key not in blacklisted_features:
                    print key + "\n"'''

    order_data()
    # do some sanity checks to verify that the transformation went well
    do_sanity_check(data, clean_data)
    JsonUtils.save_json_data(out_path_prefix + str(DataExtractor.user_long_ids[user_id]) + "/all/all_in_one_validated_log", clean_data)
    return clean_data
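# blacklisted_values is a module-level global not defined in this snippet. From the way it is
# indexed above (feature name, then attribute, then a list of banned values), its expected shape
# is a nested dict. A hypothetical illustration; the feature/attribute names are real, the banned
# values are made up:
blacklisted_values = {
    "location": {"provider": ["invalid_provider"]},                # drop fixes from this provider
    "notifications": {"packageName": ["com.example.spammy.app"]},  # drop entries from this package
}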
def load_json_data(user_id):
    data = JsonUtils.load_json_data(DataExtractor.user_json_path[user_id])
    return data
import sys
import time
import json
import collections

sys.path.insert(0, "/home/dehajjik/workspace/src/utils")
from json_utils import JsonUtils
from data_utils import *

other_features = {}


def weather_and_operating_features_extraction_one_user(user_id):
    out_path_prefix = "/home/dehajjik/workspace/logs/"
    data = DataExtractor.load_json_data(user_id)
    known_features = ["activityRecognitionResult", "androidActivityRecognitionResult", "appLaunch",
                      "battery", "bluetooth", "event", "notifications", "headsetPlug", "location",
                      "networkInfo", "sensor", "settingInfo", "telephony", "telephony2", "wifiAps",
                      "wifiConnectedAp", "launcherLayouts", "predictors"]

    # filter_notifications already adds to the records the attributes createDateTimeZone and rcreateDateTimeZone
    # check that the data is really ordered by date!!
    data = DataOperations.filter_notifications(data)

    # the data is sorted; notifications are filtered and annotated with the timezone date
    for record_id in data:
        record = data[record_id]
        for key, value in record.iteritems():
            if key not in known_features:
                if key not in other_features:
                    other_features[key] = []
                other_features[key].append(record)
                print key + "\n"

    t = time.strftime("%Y%m%d") + time.strftime("%H%M%S")
    JsonUtils.save_json_data(out_path_prefix + str(DataExtractor.user_long_ids[user_id]) + "/" + t + "extra_features_u" + str(user_id), other_features)
    return other_features
def compare(reference_transformation, user_id):
    global labels_importance
    global labels_importance_rank
    #global labels_importance_derivative

    index = 0
    transformations = transformation_vectors.keys()
    for label in rows_labels:
        labels_importance[label] = {}
        labels_importance_rank[label] = {}
        for transformation in transformations:
            labels_importance[label][transformation] = transformation_vectors[transformation][0][index]
            labels_importance_rank[label][transformation] = transformation_vectors[transformation][1][index]
            #labels_importance_derivative[label][transformation] = transformation_vectors[transformation][2][index]
        index += 1

    # sort the dictionaries by presence rate, with the most frequent feature at the beginning
    labels_importance = collections.OrderedDict(sorted(labels_importance.items(), key=lambda x: x[1][reference_transformation], reverse=True))
    #labels_importance_derivative = collections.OrderedDict(sorted(labels_importance_derivative.items(), key=lambda x: x[1][reference_transformation], reverse=True))
    labels_importance_rank = collections.OrderedDict(sorted(labels_importance_rank.items(), key=lambda x: x[1][reference_transformation]))
    print JsonUtils.dict_as_json_str(labels_importance)
    print JsonUtils.dict_as_json_str(labels_importance_rank)
    #print np.shape(data_matrix)

    # write the dictionaries into files
    out = LogsFileWriter.open(file_name)
    LogsFileWriter.write(JsonUtils.dict_as_json_str(labels_importance), out)
    LogsFileWriter.write(JsonUtils.dict_as_json_str(labels_importance_rank), out)
    LogsFileWriter.close(out)

    # plot the records importance vs the different transformation scores
    importances_list = []
    importances_legends = []
    ranks_list = []
    ranks_legends = []
    importances_derivatives_list = []
    importances_derivatives_legends = []
    for transformation in transformations:
        importance_list = [importance[transformation] for importance in labels_importance.values()]
        importances_list.append(importance_list)
        importances_legends.append(transformation)
        rank_list = [rank[transformation] for rank in labels_importance_rank.values()]
        ranks_list.append(rank_list)
        ranks_legends.append(transformation)
        importance_derivative_list = np.diff(np.asarray(importance_list), 1).tolist()
        importances_derivatives_list.append(importance_derivative_list)
        importances_derivatives_legends.append(transformation)
    importances_derivatives_list.append([0] * len(importances_derivatives_list[0]))
    importances_derivatives_legends.append("y=0")

    PlotlibDrawer.plot_1(labels_importance.keys(), [percentage["presence_percentage"] for percentage in labels_importance.values()], "features rank", "% records", "presence rate of the features in the records", 10)
    PlotlibDrawer.plot_2(labels_importance.keys(), importances_list, importances_legends, "features rank", "features scores", "comparison of different transformation scores " + str(user_id), 11)
    PlotlibDrawer.plot_2(labels_importance_rank.keys(), ranks_list, ranks_legends, "features initial rank", "features rank after transformation", "comparison of different transformation ranks " + str(user_id), 11)
    PlotlibDrawer.plot_2(labels_importance.keys(), importances_derivatives_list, importances_derivatives_legends, "features initial rank", "features scores derivative", "comparison of different transformation scores derivative " + str(user_id), 11)
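# transformation_vectors and rows_labels are globals assumed by compare(). From the indexing
# above, each entry maps a transformation name to a (scores, ranks) pair of per-label vectors,
# aligned with rows_labels; "presence_percentage" must be one of the keys, since plot_1 reads it.
# A hypothetical illustration, with made-up labels and numbers:
rows_labels = ["location", "appLaunch", "battery"]
transformation_vectors = {
    "presence_percentage": ([40.0, 25.0, 5.0], [1, 2, 3]),  # scores, then ranks, per label
    "svd": ([0.9, 0.5, 0.1], [1, 2, 3]),
}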
from shop import Shop
from json_utils import JsonUtils

if __name__ == "__main__":
    utils = JsonUtils()
    # read json objects from "test_goods.json"
    goods_to_store = utils.read_json("test_goods.json")
    keys = list(goods_to_store.keys())

    shop = Shop()
    # add the goods that we've read to the shop
    for key in keys:
        shop.add_good(goods_to_store[key])

    # get json objects of shop goods from each existing category
    food_json_obj = shop.get_goods("Food")
    tools_json_obj = shop.get_goods("Tools")
    electronics_json_obj = shop.get_goods("Electronics")
    books_json_obj = shop.get_goods("Books")

    # print jsons of goods
    print("Food goods: ")
    for json in food_json_obj:
        print(json)
    print("*" * 80)

    print("Tools goods: ")