Example #1
def compute_ghcm_mdt_one_user(user_id):
    file_name = "ghcm_mdt_user_" + str(user_id)
    print "loading matrix user " + str(user_id) + "..."
    [rfvdata, featuresnames, valuesnames,
     recordsdates] = RVFDataExtractor.load_rvf(user_id)
    print "values" + JsonUtils.dict_as_json_str(valuesnames)
    print "data" + JsonUtils.dict_as_json_str(rfvdata[0])
    vocab_size = [len(valuesnames[f]) for f in range(len(valuesnames.keys()))]
    print "user " + str(user_id) + " has " + str(
        len(featuresnames)) + " features and " + str(len(rfvdata)) + " records"
    print "features names"
    print featuresnames

    print "values" + JsonUtils.dict_as_json_str(valuesnames)
    for k in [10, 20, 30]:
        #compute the ghcm_mdt
        ghcm_mdt_comp = GHCM_MDTComputer(rfvdata, k, vocab_size)
        print "computing SVD for user " + str(user_id) + "..."
        ghcm_mdt_comp.compute()

        print "constructing interpretable output for user " + str(
            user_id) + "..."
        ghcm_mdt_comp.construct_rows_interpretable_output(
            featuresnames, valuesnames, disp_m)
        r_output = ghcm_mdt_comp.rows_interpretable_output

        #write the result
        print "writing SVD result for user " + str(user_id) + "..."
        JsonLogsFileWriter.write(r_output, file_name)
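
The snippet leaves disp_m (the display size passed to construct_rows_interpretable_output) as a module-level global, and no driver is shown. A minimal driver sketch, assuming a value for disp_m and the 1..7 user-id range used in later examples:

# sketch only: disp_m's value and the user-id range are assumptions, not shown in the snippet
disp_m = 10

if __name__ == "__main__":
    for uid in range(1, 8):
        compute_ghcm_mdt_one_user(uid)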
Example #2
	def test_time_variances_for_array_feature(data, array_feature):
		time_variances_number = {}
		time_variances_feature_min = {}
		time_variances_feature_max = {}

		for record_id in data:
			if array_feature in data[record_id]:
				max_time = 0
				min_time = sys.maxint
				feature = data[record_id][array_feature]
				for entry in feature:
					current_time = long(entry['createDate'])
					if current_time >= max_time:
						max_time = current_time

					if current_time <= min_time:
						min_time = current_time

				time_variance = max_time - min_time
				if time_variance not in time_variances_number:
					time_variances_number[time_variance] = 0

				time_variances_number[time_variance] += 1

		time_variances_number = collections.OrderedDict(sorted(time_variances_number.items(),reverse=True))
		
		print "time variances distribution for "+array_feature
		print JsonUtils.dict_as_json_str(time_variances_number)
		print "\n \n"
Example #5
def weather_and_operating_features_extraction_one_user(user_id):
    out_path_prefix = "/home/dehajjik/workspace/logs/"
    data = DataExtractor.load_json_data(user_id)
    known_features = ([
        "activityRecognitionResult", "androidActivityRecognitionResult",
        "appLaunch", "battery", "bluetooth", "event", "notifications",
        "headsetPlug", "location", "networkInfo", "sensor", "settingInfo",
        "telephony", "telephony2", "wifiAps", "wifiConnectedAp",
        "launcherLayouts", "predictors", "neighboringCellInfo2",
        "neighboringCellInfo"
    ])

    #filter_notifications already adds the createDateTimeZone and rcreateDateTimeZone attributes to the records
    #check that the data is really ordered by date!
    data = DataOperations.filter_notifications(data)

    #the data is sorted; notifications are filtered and annotated with the timezone date
    for record_id in data:
        record = data[record_id]
        for key, value in record.iteritems():
            if key not in known_features:
                if key not in other_features:
                    other_features[key] = []
                if len(other_features[key]) < 100:
                    other_features[key].append(value)

    t = time.strftime("%Y%m%d%H%M%S")
    JsonUtils.save_json_data(
        out_path_prefix + "/" + t + "extra_features_u" + str(user_id),
        other_features)
    return other_features
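
This function reads and fills a module-level other_features accumulator that the snippet does not show; the import block in a later example declares it. A self-contained run would need roughly this scaffold (a sketch mirroring that block):

# module-level scaffold assumed by the function above
import sys
import time

sys.path.insert(0, "/home/dehajjik/workspace/src/utils")
from json_utils import JsonUtils
from data_utils import *

other_features = {}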
Example #6
	def test_time_variances_in_one_record(data):
		time_variances_number = {}
		time_variances_feature_min = {}
		time_variances_feature_max = {}

		for record_id in data:
			max_time = 0
			min_time = sys.maxint
			max_feature = ""
			min_feature = ""
			record = data[record_id]

			for feature, value in record.iteritems():
				try:
					current_time = long(value['createDate'])

					if current_time > max_time:
						max_time = current_time
						max_feature = feature

					if current_time < min_time:
						min_time = current_time
						min_feature = feature

				except TypeError:
					#it is an array feature: take the min/max over its entries
					for entry in value:
						current_time = long(entry['createDate'])
						if current_time >= max_time:
							max_time = current_time
							max_feature = feature

						if current_time <= min_time:
							min_time = current_time
							min_feature = feature

			time_variance = max_time - min_time
			if time_variance not in time_variances_number:
				time_variances_number[time_variance] = 0
				
			time_variances_number[time_variance] +=1
			
			if max_feature not in time_variances_feature_max:
				time_variances_feature_max[max_feature] = 0
			time_variances_feature_max[max_feature] += 1
			
			if min_feature not in time_variances_feature_min:
				time_variances_feature_min[min_feature] = 0
			time_variances_feature_min[min_feature] += 1
		
		time_variances_number = collections.OrderedDict(sorted(time_variances_number.items(),reverse=True))
		
		print "time variances distribution "
		print JsonUtils.dict_as_json_str(time_variances_number)
		print "\n \n"
def filter_notifications_one_user(user_id):
    out_path_prefix = "/speech/dbwork/mul/students/dehajjik/notifications_filtered/"
    data = DataExtractor.load_json_data(user_id)
    data = DataOperations.filter_notifications(data)

    JsonUtils.save_json_data(
        out_path_prefix + str(DataExtractor.user_long_ids[user_id]) + "/all/all_in_one_validated_log", data
    )
    return data
	def transform(self):
		
		sony_activity_counts = self.count_sony_activity_realizations()
		android_activity_counts = self.count_android_activity_realizations()
		
		print "Sony activities duration (in minutes) :"
		print JsonUtils.dict_as_json_str(sony_activity_counts)
		
		print "Android activities duration (in minutes) : "
		print JsonUtils.dict_as_json_str(android_activity_counts)
		
		
		#self.exclusive_sony_activity_transform_one()
		self.exclusive_android_activity_transform_one()
	def extract_realizations_in_time_range_soft_version_optimized_for_sorted_data_copy_verbose(feature_realizations_sorted_copy, realization_key):
		target_realizations = []
		target_time_range = realization_key
			
		for current_time_range in feature_realizations_sorted_copy.keys():
			realization = feature_realizations_sorted_copy[current_time_range]
			
			if DataOperations.is_ended_before_the_start_of(current_time_range, target_time_range):
				#the current realization ended before the target time; since the target times given as input to this method are assumed to be increasing,
				#we can simply remove this entry
				del feature_realizations_sorted_copy[current_time_range]
				
			elif DataOperations.does_date_overlaps(target_time_range , current_time_range):
				#the current realization's time overlaps the target time, so we select it and extend the target time to include its range (to satisfy the soft version property)
				print "gps "+JsonUtils.dict_as_json_str(realization)+" that occurred at time "+current_time_range+" included in the selection"
				target_realizations.append(realization)
				#as the target times are strictly increasing, we delete this entry because it will never match another target time
				del feature_realizations_sorted_copy[current_time_range]
				target_time_range = DataOperations.union_of_date_intervals(target_time_range , current_time_range)
				
			elif DataOperations.is_ended_before_the_start_of(target_time_range , current_time_range):
				#if the current time range started after the finish of the target one, it means that we will not meet any realizations in the target time range any more
				#print current_time_range+ ": CAUSED STOP LOOP \n"
				break
				
			
		#print "the selected realizations are \n"+JsonUtils.dict_as_json_str(target_realizations)+"\n\n\n\n"
		#print "\n\n\n\n"
		return target_realizations
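
The DataOperations predicates used above are not shown on this page. Given the "date , millis->date , millis" key format visible in the sample-data example further down, a plausible sketch of them is (an assumption, not the project's actual implementation):

# sketch only: assumes time-range keys shaped like
# "2015-01-01 00:55:00 , 1420073700000->2015-01-01 01:10:01 , 1420074601000"
def _bounds_ms(time_range):
	start, end = time_range.split("->")
	return long(start.split(" , ")[1]), long(end.split(" , ")[1])

def is_ended_before_the_start_of(a, b):  # hypothetical stand-in
	return _bounds_ms(a)[1] < _bounds_ms(b)[0]

def does_date_overlaps(a, b):  # hypothetical stand-in
	a_start, a_end = _bounds_ms(a)
	b_start, b_end = _bounds_ms(b)
	return a_start <= b_end and b_start <= a_end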
def extract_specific_date_time_one_user(user_id):
    '''specific_date_times = {1: [datetime.datetime(year=2014, month=8, day=19, hour=16), datetime.datetime(year=2014, month=8, day=27, hour=15), datetime.datetime(year=2014, month=9, day=5, hour=18), datetime.datetime(year=2014, month=10, day=12, hour=15), datetime.datetime(year=2014, month=9, day=1, hour=1)],
						2: [datetime.datetime(year=2014, month=9, day=25, hour=7),datetime.datetime(year=2014, month=12, day=8, hour=6), datetime.datetime(year=2014, month=9, day=25, hour=1)],
						3: [datetime.datetime(year=2014, month=9, day=13, hour=0), datetime.datetime(year=2014, month=9, day=25, hour=17)],
						4: [datetime.datetime(year=2014, month=9, day=5, hour=14), datetime.datetime(year=2015, month=1, day=8, hour=11), datetime.datetime(year=2014, month=9, day=2, hour=13)],
						5: [datetime.datetime(year=2014, month=9, day=22, hour=18), datetime.datetime(year=2015, month=1, day=5, hour=13), datetime.datetime(year=2014, month=12, day=29, hour=13)],
						6: [datetime.datetime(year=2014, month=10, day=26, hour=3), datetime.datetime(year=2014, month=11, day=4, hour=8)],
						7: [datetime.datetime(year=2014, month=7, day=28, hour=10)]}'''

    specific_date_times = {
        1: [
            datetime.datetime(year=2014, month=10, day=12, hour=14),
            datetime.datetime(year=2014, month=10, day=12, hour=22)
        ],
        2: [],
        3: [],
        4: [datetime.datetime(year=2014, month=12, day=9, hour=15)],
        5: [
            datetime.datetime(year=2014, month=12, day=5, hour=12),
            datetime.datetime(year=2014, month=12, day=16, hour=9)
        ],
        6: [],
        7: []
    }

    out_path_prefix = "/home/dehajjik/workspace/resources/"

    data = DataExtractor.load_json_data(user_id)

    #for each specific date and hour, write the data that occurred at that time to a file, in JSON format
    for specific_dt in specific_date_times[user_id]:
        selected_data = DataExtractor.select_records_by_date_and_hour(
            data, specific_dt)

        #annotate the records with readable dates and sort the notifications chronologically
        for record in selected_data:
            record = DataOperations.annotate_with_readable_date_no_timezone(
                record)
            record = DataOperations.order_chronologically_notifications_and_annotate(
                record)

        #sort the records chronologically
        selected_data = DataOperations.order_chronologically_and_annotate(
            selected_data)
        JsonUtils.save_json_data(
            out_path_prefix + "u" + str(user_id) + " d" +
            specific_dt.strftime('%Y-%m-%d %H'), selected_data)
def output_sample(user_id):
    specific_date_times = {
        1: [
            datetime.datetime(year=2014, month=8, day=19, hour=16),
            datetime.datetime(year=2014, month=8, day=27, hour=15),
            datetime.datetime(year=2014, month=9, day=5, hour=18),
            datetime.datetime(year=2014, month=10, day=12, hour=15),
            datetime.datetime(year=2014, month=9, day=1, hour=1)
        ],
        2: [
            datetime.datetime(year=2014, month=9, day=25, hour=7),
            datetime.datetime(year=2014, month=12, day=8, hour=6),
            datetime.datetime(year=2014, month=9, day=25, hour=1)
        ],
        3: [datetime.datetime(year=2014, month=9, day=25, hour=17)],
        4: [
            datetime.datetime(year=2014, month=9, day=5, hour=14),
            datetime.datetime(year=2015, month=1, day=8, hour=11),
            datetime.datetime(year=2014, month=9, day=2, hour=13)
        ],
        5: [
            datetime.datetime(year=2014, month=9, day=22, hour=18),
            datetime.datetime(year=2015, month=1, day=5, hour=13),
            datetime.datetime(year=2014, month=12, day=29, hour=13)
        ],
        6: [
            datetime.datetime(year=2014, month=10, day=26, hour=3),
            datetime.datetime(year=2014, month=11, day=4, hour=8)
        ],
        7: [datetime.datetime(year=2014, month=7, day=28, hour=10)]
    }

    out_path_prefix = "/home/dehajjik/workspace/resources/filtered_notifs/"

    data = DataExtractor.load_json_data(user_id)

    #for each specific date and hour, write the data that occurred at that time to a file, in JSON format
    for specific_dt in specific_date_times[user_id]:
        selected_data = DataExtractor.select_records_by_date_and_hour(
            data, specific_dt)
        selected_data = DataOperations.order_chronologically_and_annotate(
            selected_data)
        #selected_data = DataOperations.annotate(selected_data)
        JsonUtils.save_json_data(
            out_path_prefix + "u" + str(user_id) + " d" +
            specific_dt.strftime('%Y-%m-%d %H'), selected_data)
        print(str(json.dumps(selected_data.keys(), indent=4)))
def transform_to_categorized_data_one_user(user_id):
	out_path_prefix = "/speech/dbwork/mul/students/dehajjik/categorized_data/"
	data_key = "data"
	metadata_key = "metadata"
	
	print "loading data for user "+str(user_id)
	nontransformed_data = DataExtractor.load_json_data(user_id)
	#nontransformed_data = JsonUtils.load_json_data("/home/dehajjik/workspace/resources/sample_data_for_location_categorization_test.json")
	
	
	#the transformers responsible for categorizing the features of the data
	feature_transformers = {LocationTransformer.transformed_feature_name: LocationTransformer(nontransformed_data),
							NotificationTransformer.transformed_feature_name : NotificationTransformer(nontransformed_data),
							ApplaunchTransformer.transformed_feature_name : ApplaunchTransformer(nontransformed_data),
							BatteryTransformer.transformed_feature_name: BatteryTransformer(nontransformed_data),
							HeadsetTransformer.transformed_feature_name: HeadsetTransformer(nontransformed_data),
							BluetoothPairedTransformer.transformed_feature_name: BluetoothPairedTransformer(nontransformed_data),
							BluetoothSeenTransformer.transformed_feature_name: BluetoothSeenTransformer(nontransformed_data),
							ActivityTransformer.transformed_feature_name : ActivityTransformer(nontransformed_data)}
	
	#the features that we want to transform
	selected_features = [LocationTransformer.transformed_feature_name,
						NotificationTransformer.transformed_feature_name,
						ApplaunchTransformer.transformed_feature_name,
						#BatteryTransformer.transformed_feature_name,
						#HeadsetTransformer.transformed_feature_name,
						BluetoothPairedTransformer.transformed_feature_name,
						#BluetoothSeenTransformer.transformed_feature_name,
						ActivityTransformer.transformed_feature_name]
						
	#selected_features = [ActivityTransformer.transformed_feature_name]
	
	
	categorized_data = {}
	categorized_data[data_key]={}
	categorized_data[metadata_key]={}
	
	
	for feature in selected_features:
		feature_transformers[feature].transform()
		if feature_transformers[feature].transformed_feature_data != {None:None} and feature_transformers[feature].transformed_feature_metadata != {None:None}:
			categorized_data[data_key][feature]= feature_transformers[feature].transformed_feature_data
			categorized_data[metadata_key][feature] = feature_transformers[feature].transformed_feature_metadata
	
	JsonUtils.save_json_data(out_path_prefix+str(DataExtractor.user_long_ids[user_id])+"/all/all_in_one_validated_log", categorized_data)
	return categorized_data
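
The saved file groups everything under the two top-level keys built above; schematically (a sketch, with empty dicts standing in for each transformer's output, and feature keys taken from the transformers' transformed_feature_name values):

# shape of the saved categorized_data (sketch)
categorized_data = {
	"data": {"location": {}, "notification": {}},      # transformed_feature_data per feature
	"metadata": {"location": {}, "notification": {}},  # transformed_feature_metadata per feature
}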
Example #15
	def print_times_for_specific_locations(data):
		accuracies = {}
		big = 0 
		small = 0
		for record_id in data:
			if "location" in data[record_id]:
				accuracy = data[record_id]["location"]["accuracy"]
				
				if accuracy not in accuracies:
					accuracies[accuracy] = 0
				
				accuracies[accuracy] += 1

				if accuracy > 200:
					big += 1
				else:
					small += 1

		
		accuracies = collections.OrderedDict(sorted(accuracies.items(),reverse=True))
		print "accuracies for location are : "
		print JsonUtils.dict_as_json_str(accuracies)
		print "there is "+str(big)+" accuracies bigger than 200 meters from a total of "+str(big+small)
		print "\n \n"
def write(content, file_suffix):
    t = time.strftime("%Y%m%d%H%M%S")
    log_file_name = LogsFileWriter.log_dir + t + file_suffix
    JsonUtils.save_json_data(log_file_name, content)
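
A minimal usage sketch (the "my_log" suffix is hypothetical; the helper prefixes it with a YYYYmmddHHMMSS timestamp under LogsFileWriter.log_dir):

# writes <LogsFileWriter.log_dir><timestamp>my_log via JsonUtils.save_json_data
write({"status": "ok"}, "my_log")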
Example #17
																								   "seq":[23]
																								}
					},
		"wifiAps": {"2015-01-01 00:55:00 , 1420073700000->2015-01-01 01:10:01 , 1420074601000":[ {"ssid":"w1",
																								   "seq":[2,3,4]}
																								],
					"2015-01-01 01:10:01 , 1420074601000->2015-01-01 02:00:00 , 1420077600000":[ {"ssid":"w1",
																								  "seq":[5,6,7,8]},
																								{"ssid":"w2",
																								  "seq":[5,6,7,8]},
																								
																								],
					"2015-01-01 03:00:01 , 1420081201000->2015-01-01 03:05:00 , 1420081500000":[ {"ssid":"w3",
																								   "seq":[9]}
																								],
					"2015-01-01 04:05:00 , 1420085100000->2015-01-01 05:00:00 , 1420088400000":[ {"ssid":"w2",
																								   "seq":[10,11]},
																								  {"ssid":"w3",
																								  "seq":[10,11]}
																								],
					"2015-01-01 19:15:00 , 1420139700000->2015-01-01 21:00:00 , 1420146000000":[ {"ssid":"w4",
																								   "seq": [21,22]}
																								]
					
					}
					
}
					
					
JsonUtils.save_json_data( "/home/dehajjik/workspace/resources/sample_data_for_location_categorization_test", data)
def transform_to_matrix_one_user(user_id):
	print "loading data for user "+str(user_id)
	categorized_data = DataExtractor.load_json_data(user_id)
	data = DataExtractor.complete_data(categorized_data)
	metadata = DataExtractor.complete_metadata(categorized_data)
	
	#order the data by the alphabetic name of the features
	print "ordering data "+str(user_id)
	data = collections.OrderedDict(sorted(data.items()))
	
	#get the first date and the last date
	print "getting first date and last date "
	end_date = date_min
	start_date = datetime.now()
	for feature, feature_data in data.iteritems():
		feature_data = collections.OrderedDict(sorted(feature_data.items()))
		begin_date = DataExtractor.start_date_of_realization(feature_data.keys()[0])
		if begin_date < start_date:
			start_date = begin_date
			
		last_date = DataExtractor.start_date_of_realization(feature_data.keys()[len(feature_data.keys())-1])
		if last_date > end_date:
			end_date = last_date
		
		data[feature] = feature_data
	
	#construct the data matrix
	#I- construct the matrices of all the features
	print "constructing the matrixes "
	rows = 0
	
	transformers = {} 
	for feature in data:
		if feature == "location":
			transformers[feature] = MatrixLocationFeatureTransformer(feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)
		elif feature == "bluetoothSeen" or feature == "bluetoothPaired":
			transformers[feature] = MatrixBleutoothFeatureTransformer(feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)
		else:
			transformers[feature] = MatrixFeatureTransformer(feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)
			
		if feature in features_importance_score_one:
			transformers[feature].let_importance_scores_to_1 = True
		
		transformers[feature].transform()
		rows += transformers[feature].nbdimentions
	
	#construct the time feature
	transformers[MatrixTimeFeatureTransformer.feature_name] = MatrixTimeFeatureTransformer(start_date, end_date, coocurring_precision)
	transformers[MatrixTimeFeatureTransformer.feature_name].transform()
	rows +=  transformers[MatrixTimeFeatureTransformer.feature_name].nbdimentions
	columns = transformers[MatrixTimeFeatureTransformer.feature_name].nbtimeslots
	
	#II-concatenate all the matrices of each feature into one big matrix (do the same for the labels vector)
	print "regrouping the matrixes "
	data_matrix = np.zeros((columns, rows))
	labels_vector = [""]* rows
	dimentions_importance_score = np.zeros(rows)
	transformers = collections.OrderedDict(sorted(transformers.items()))
	
	begin_row_index = 0
	end_row_index = 0
	for feature, feature_transformer in transformers.iteritems():
		end_row_index = begin_row_index + feature_transformer.nbdimentions
		data_matrix[:, begin_row_index:end_row_index] = feature_transformer.matrix_data
		labels_vector[begin_row_index:end_row_index] = feature_transformer.labels_vector
		dimentions_importance_score[begin_row_index:end_row_index] = feature_transformer.realization_importance_score
		begin_row_index = end_row_index
	
	'''
	The matrix contains many feature vectors that are zero everywhere except in the time features.
	Those vectors correspond to time slots in which no record was made.
	We want to eliminate those empty records and their corresponding timestamps.
	'''
	time_vector = transformers.values()[0].time_vector
	[data_matrix, time_vector] = eliminate_empty_records(data_matrix, time_vector)
	data_matrix = np.transpose(data_matrix)
	
	print "the labels are : "
	print JsonUtils.dict_as_json_str(labels_vector)
	
	
	print "first date of observation "+str(start_date)
	print "first date of observation "+str(end_date)
	print "dimension of the labels (features) vector : "+str(len(labels_vector))
	print "dimension of the time vector : "+str(len(time_vector))
	print "dimension of the resulted matrix (features, time) "+str(data_matrix.shape)
	print "the number of non zeros values is : "+str(np.count_nonzero(data_matrix))+"/"+str(np.size(data_matrix))
	print "the number of negative values in the matrix is : "+str(np.size(ma.masked_array(data_matrix, mask=(data_matrix>=0)).compressed()))
	print "the data matrix printed : "
	print Numpy.str(data_matrix)
	
	#write the matrix data
	MDataExtractor.save_matrix(user_id, data_matrix)
	
	#write the labels vector, then the time vector and the importance scores
	MDataExtractor.save_labels_vector(user_id, labels_vector)
	MDataExtractor.save_time_vector(user_id, time_vector)
	MDataExtractor.save_importance_scores(user_id, dimentions_importance_score)
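
eliminate_empty_records is not shown on this page. Based on the docstring above, a plausible sketch (an assumption; the hypothetical time_dim_start parameter marks where the time-feature dimensions begin):

import numpy as np

# sketch only: drop time slots whose non-time dimensions are all zero;
# at the call site data_matrix is (timeslots x dimensions)
def eliminate_empty_records(data_matrix, time_vector, time_dim_start=None):
	non_time = data_matrix if time_dim_start is None else data_matrix[:, :time_dim_start]
	keep = np.any(non_time != 0, axis=1)
	kept_times = [t for t, k in zip(time_vector, keep) if k]
	return [data_matrix[keep, :], kept_times]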
Example #19
def clean_and_write_data_one_user(user_id):
	#empty the lists
	global clean_data
	clean_data = {}
	global last_realization_val
	last_realization_val = {}
	global last_realization_key
	last_realization_key = {}
	
	
	'''
	For each feature, we want to know the distribution of the time differences observed between two consecutive realizations.
	For that reason we store in time_variances, for each feature, the number of times that the difference between two realizations was x minutes.
	Note that the maximum time variance allowed is timeout_in_minutes.
	'''
	global time_variances
	time_variances = {}
	
	out_path_prefix = "/speech/dbwork/mul/students/dehajjik/clean_data/"
	data = DataExtractor.load_json_data(user_id)
	DataOperations.print_times_for_specific_locations(data)
	#filter_notifications already adds the createDateTimeZone and rcreateDateTimeZone attributes to the records
	#check that the data is really ordered by date!
	data = DataOperations.filter_notifications(data)
	
	
	
	#the data is sorted and notification is filtered and annotated with the timezone date.
	for record_id in data:
		record = data[record_id]
		event_type = record['event']['type']
		for key, value in record.iteritems():
			#test if it has one black listed value, if it is the case ignore it
			do_ignore = False
			is_array_attribute = False
			if key in blacklisted_values:
				for attribute, black_values in blacklisted_values[key].iteritems():
					try:
						if value[attribute] in black_values:
							#the current realization contains one blacklisted value, so we need to ignore it
							do_ignore = True
					except TypeError:
						#the feature we have is an array feature
						is_array_attribute = True
						break
						
			if is_array_attribute:
				#the feature is an array feature, so we go through all the elements and remove the blacklisted ones if they exist
				temp_value = []
				do_remove = False
				for one_value in value:
					for attribute, black_values in blacklisted_values[key].iteritems():
						if one_value[attribute] in black_values:
							do_remove = True
							
					if not do_remove:
						temp_value.append(one_value)
						
				if len(temp_value)>=1:
					value = temp_value
				else:
					#all the values were removed so ignore this entry
					do_ignore = True
					
			
			if not do_ignore:
				if key == "activityRecognitionResult":
					activityRecognitionResult_update(value , event_type)
				elif key == "androidActivityRecognitionResult":
					androidActivityRecognitionResult_update(value, event_type)
				elif key == "appLaunch":
					appLaunch_update(value , event_type)
				elif key == "battery":
					battery_update(value, event_type)
				elif key == "bluetooth":
					bluetooth_update(value, event_type)
				elif key == "event":
					event_update(value, event_type)
				elif key == "notifications":
					notifications_update(value, event_type)
				elif key == "headsetPlug":
					headsetPlug_update(value, event_type)
				elif key == "location":
					location_update(value, event_type)
				elif key == "networkInfo":
					networkInfo_update(value, event_type)
				elif key == "sensor":
					sensor_update(value, event_type)
				elif key == "settingInfo":
					settingInfo_update(value, event_type)
				elif key == "telephony" or key=="telephony2":
					telephony_update(value, event_type)
				elif key == "wifiAps":
					wifiApps_update(value , event_type)
				elif key == "wifiConnectedAp":
					wifiConnectedApp_update(value, event_type)
				#elif key not in blacklisted_features:
				#	print key+"\n"

	order_data()
	#do some tests to see that the transformation went well
	do_sanity_check(data,clean_data)
	JsonUtils.save_json_data(out_path_prefix+str(DataExtractor.user_long_ids[user_id])+"/all/all_in_one_validated_log", clean_data)
		
	return clean_data
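
The long elif chain above maps feature keys to updater functions; an equivalent dispatch-table sketch (listing only a few of the updaters, as a design alternative rather than the author's code):

# sketch only: dispatch table covering part of the elif chain above
updaters = {
	"appLaunch": appLaunch_update,
	"battery": battery_update,
	"location": location_update,
	"wifiAps": wifiApps_update,
}

def dispatch_update(key, value, event_type):
	updater = updaters.get(key)
	if updater is not None:
		updater(value, event_type)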
	def load_json_data(user_id):
		data = JsonUtils.load_json_data(DataExtractor.user_json_path[user_id])
		return data
import sys
import time
import json
import collections
sys.path.insert(0, "/home/dehajjik/workspace/src/utils")
from json_utils import JsonUtils
from data_utils import *
other_features = {}
def weather_and_operating_features_extraction_one_user(user_id):
	out_path_prefix = "/home/dehajjik/workspace/logs/"
	data = DataExtractor.load_json_data(user_id)
	known_features = (["activityRecognitionResult", "androidActivityRecognitionResult", "appLaunch", "battery", "bluetooth", "event", "notifications", "headsetPlug", "location", "networkInfo", "sensor", "settingInfo",
						"telephony", "telephony2", "wifiAps", "wifiConnectedAp", "launcherLayouts", "predictors" ])
	
	#filter_notifications already adds the createDateTimeZone and rcreateDateTimeZone attributes to the records
	#check that the data is really ordered by date!
	data = DataOperations.filter_notifications(data)
	
	#the data is sorted and notification is filtered and annotated with the timezone date.
	for record_id in data:
		record = data[record_id]
		for key, value in record.iteritems():
			if key not in known_features:
				if key not in other_features:
					other_features[key]=[]
				other_features[key].append(record)
				print key+"\n"
	
	
	t = time.strftime("%Y%m%d%H%M%S")
	JsonUtils.save_json_data(out_path_prefix+str(DataExtractor.user_long_ids[user_id])+"/"+t+"extra_features_u"+str(user_id), other_features)
	return other_features
Example #23
def compare(reference_transformation, user_id):
	global labels_importance
	global labels_importance_rank
	#global labels_importance_derivative
	index = 0
	transformations = transformation_vectors.keys()
	for label in rows_labels:
		labels_importance[label] = {}
		labels_importance_rank[label] = {}
		for transformation in transformations:
			labels_importance[label][transformation]=transformation_vectors[transformation][0][index]
			labels_importance_rank[label][transformation]= transformation_vectors[transformation][1][index]
			#labels_importance_derivative[label][transformation]= transformation_vectors[transformation][2][index]
		
		index +=1
		
		
	#sort the dictionaries per presence rate. The most frequent feature at the beginning
	labels_importance = collections.OrderedDict(sorted(labels_importance.items(), key=lambda x: x[1][reference_transformation], reverse = True))
	#labels_importance_derivative = collections.OrderedDict(sorted(labels_importance_derivative.items(), key=lambda x: x[1][reference_transformation], reverse = True))
	labels_importance_rank = collections.OrderedDict(sorted(labels_importance_rank.items(), key=lambda x: x[1][reference_transformation]))
	
	
	print JsonUtils.dict_as_json_str(labels_importance)
	
	print JsonUtils.dict_as_json_str(labels_importance_rank)
	#print np.shape(data_matrix)
	
	#write the dictionaries into files
	out = LogsFileWriter.open(file_name)
	LogsFileWriter.write(JsonUtils.dict_as_json_str(labels_importance),out)
	LogsFileWriter.write(JsonUtils.dict_as_json_str(labels_importance_rank),out)
	LogsFileWriter.close(out)
	
	
	#plot the records importance vs different transformation scores
	importances_list = []
	importances_legends = []
	ranks_list = []
	ranks_legends = []
	importances_derivatives_list = []
	importances_derivatives_legends = []
	for transformation in transformations:
		importance_list = [importance[transformation] for importance in labels_importance.values()]
		importances_list.append(importance_list)
		importances_legends.append(transformation)
		
		rank_list = [rank[transformation] for rank in labels_importance_rank.values()]
		ranks_list.append(rank_list)
		ranks_legends.append(transformation)
		
		importance_derivative_list = np.diff(np.asarray(importance_list), 1).tolist()
		importances_derivatives_list.append(importance_derivative_list)
		importances_derivatives_legends.append(transformation)
		
		
	importances_derivatives_list.append([0]*len(importances_derivatives_list[0]))
	importances_derivatives_legends.append("y=0")
	PlotlibDrawer.plot_1(labels_importance.keys(), [percentage["presence_percentage"] for percentage in labels_importance.values()], "features rank", "% records", "presence rate of the features in the records", 10)
	PlotlibDrawer.plot_2(labels_importance.keys(), importances_list, importances_legends, "features rank", "features scores", "comparison of different transformation scores "+str(user_id), 11)
	PlotlibDrawer.plot_2(labels_importance_rank.keys(), ranks_list, ranks_legends, "features initial rank", "features rank after transformation", "comparison of different transformation ranks "+str(user_id), 11)
	PlotlibDrawer.plot_2(labels_importance.keys(), importances_derivatives_list, importances_derivatives_legends, "features initial rank", "features scores derivative", "comparison of different transformation scores derivative "+str(user_id), 11)
Example #24
from shop import Shop
from json_utils import JsonUtils

if __name__ == "__main__":

    utils = JsonUtils()

    # read json objects from "test_goods.json"
    goods_to_store = utils.read_json("test_goods.json")

    keys = list(goods_to_store.keys())

    shop = Shop()

    # add goods that we've read to shop
    for key in keys:
        shop.add_good(goods_to_store[key])

    # get json objects of shop goods from each existing category
    food_json_obj = shop.get_goods("Food")
    tools_json_obj = shop.get_goods("Tools")
    electronics_json_obj = shop.get_goods("Electronics")
    books_json_obj = shop.get_goods("Books")

    # print jsons of goods
    print("Food goods: ")
    for json in food_json_obj:
        print(json)
    print("*" * 80)

    print("Tools goods: ")