示例#1
0
def mapTestCategoricalFeatures():
    """Apply ``testMapper`` to the test data and save the result as text.

    A Spark context is obtained from ``ex.configureSpark()``, TEST_FILE is
    loaded through ``ex.getDeviceRDD``, every record is passed through
    ``testMapper``, and the mapped RDD is written to TEST_DIR.
    """
    context = ex.configureSpark()
    mapped = ex.getDeviceRDD(context, TEST_FILE).map(testMapper)
    mapped.saveAsTextFile(TEST_DIR)
示例#2
0
def saveExtractedData(spark_contxt):
    """Transform DATA_FILE with ``device_cookieMapper`` and save it as text.

    Args:
        spark_contxt: Spark context handed to ``ex.getDeviceRDD`` to load
            DATA_FILE.

    The transformed records are written to OUTPUT_DIR.
    """
    # Per the original comment the mapper replaces the device type in each
    # line; its definition is not visible here -- confirm before relying on it.
    transformed = ex.getDeviceRDD(spark_contxt, DATA_FILE).map(device_cookieMapper)
    transformed.saveAsTextFile(OUTPUT_DIR)
示例#3
0
def joinRDD(output_path=None):
    """Join device and cookie records on their first CSV field and save them.

    DEVICES_FILE and COOKIES_FILE are both loaded through
    ``explore.getDeviceRDD``. Each line is keyed by the text before its first
    comma (the drawbridge handle, according to the original comments), the
    two keyed RDDs are joined on that key, and the joined pairs are written
    with ``saveAsTextFile``.

    Args:
        output_path: Destination directory for the joined RDD. The previous
            implementation called ``saveAsTextFile()`` with no argument,
            which always failed because the path is mandatory; the caller
            must now supply it.

    Raises:
        TypeError: If ``output_path`` is not supplied. Raised before any
            Spark object is created.
    """
    if output_path is None:
        # Same exception type the argument-less saveAsTextFile() call used to
        # raise, but with an actionable message and without starting Spark.
        raise TypeError("joinRDD() requires an output_path for saveAsTextFile()")

    spark_context = explore.configureSpark()
    devices = explore.getDeviceRDD(spark_context, DEVICES_FILE)
    cookies = explore.getDeviceRDD(spark_context, COOKIES_FILE)

    def key_by_first_field(line):
        # maxsplit=1: only the first field is needed as the join key.
        return (line.split(",", 1)[0], line)

    # (key, device_line) and (key, cookie_line) pairs.
    device_pairs = devices.map(key_by_first_field)
    cookie_pairs = cookies.map(key_by_first_field)

    # Join yields (key, (device_line, cookie_line)) for keys present in both.
    joined = device_pairs.join(cookie_pairs)
    joined.saveAsTextFile(output_path)
示例#4
0
def joinRDD(output_path=None):
    """Join device and cookie records on their first CSV field and save them.

    DEVICES_FILE and COOKIES_FILE are both loaded through
    ``explore.getDeviceRDD``. Each line is keyed by the text before its first
    comma (the drawbridge handle, according to the original comments), the
    two keyed RDDs are joined on that key, and the joined pairs are written
    with ``saveAsTextFile``.

    Args:
        output_path: Destination directory for the joined RDD. The previous
            implementation called ``saveAsTextFile()`` with no argument,
            which always failed because the path is mandatory; the caller
            must now supply it.

    Raises:
        TypeError: If ``output_path`` is not supplied. Raised before any
            Spark object is created.
    """
    if output_path is None:
        # Same exception type the argument-less saveAsTextFile() call used to
        # raise, but with an actionable message and without starting Spark.
        raise TypeError("joinRDD() requires an output_path for saveAsTextFile()")

    spark_context = explore.configureSpark()
    devices = explore.getDeviceRDD(spark_context, DEVICES_FILE)
    cookies = explore.getDeviceRDD(spark_context, COOKIES_FILE)

    def key_by_first_field(line):
        # maxsplit=1: only the first field is needed as the join key.
        return (line.split(",", 1)[0], line)

    # (key, device_line) and (key, cookie_line) pairs.
    device_pairs = devices.map(key_by_first_field)
    cookie_pairs = cookies.map(key_by_first_field)

    # Join yields (key, (device_line, cookie_line)) for keys present in both.
    joined = device_pairs.join(cookie_pairs)
    joined.saveAsTextFile(output_path)
示例#5
0
def _assign_indices(values, mapping):
    """Store ``str(value) -> str(position)`` in *mapping* for each value.

    *mapping* is updated in place and is not cleared first, so entries that
    were already present stay unless a value with the same string form
    overwrites them.
    """
    for position, value in enumerate(values):
        mapping[str(value)] = str(position)


def mapCategoricalFeatures():
    """Build categorical-to-numeric lookup tables from the device data file.

    DATA_FILE is loaded through ``ex.getDeviceRDD`` and each line is split on
    commas. For every categorical column the distinct values are collected
    to the driver, stored in a module-level list, and each value is mapped
    to its position in that list (both as strings) in the matching
    module-level dictionary.

    Columns read: 3 (device type), 4 (device OS), 7 (device country),
    5 (computer OS type), 6 (browser version).

    Side effects: rebinds the globals DEVICE_TYPES, DEVICE_OS,
    DEVICE_COUNTRY, COMP_OS_TYPES and BROWSER_VERSION; updates the
    corresponding ``*_DICT`` dictionaries in place; prints the computer-OS
    counts and all five dictionaries to stdout.
    """
    global DEVICE_TYPES, DEVICE_OS, DEVICE_COUNTRY
    global COMP_OS_TYPES, BROWSER_VERSION

    # Spark context comes from the helper module imported as ``ex``.
    spark_contxt = ex.configureSpark()
    dataRDD = ex.getDeviceRDD(spark_contxt, DATA_FILE)
    data_fields = dataRDD.map(lambda line: line.split(","))

    # Mark the raw-line RDD for caching; every distinct() job below derives
    # from it.
    dataRDD.persist()

    # Device type.
    DEVICE_TYPES = data_fields.map(lambda field: field[3]).distinct().collect()
    _assign_indices(DEVICE_TYPES, DEVICE_TYPES_DICT)

    # Device OS.
    DEVICE_OS = data_fields.map(lambda field: field[4]).distinct().collect()
    _assign_indices(DEVICE_OS, DEVICE_OS_DICT)

    # Device country.
    DEVICE_COUNTRY = data_fields.map(lambda field: field[7]).distinct().collect()
    _assign_indices(DEVICE_COUNTRY, DEVICE_COUNTRY_DICT)

    # Computer OS type. The distinct count is taken from the collected list
    # rather than a separate count() action, which would run the distinct
    # job a second time.
    COMP_OS_TYPES = data_fields.map(lambda field: field[5]).distinct().collect()
    # Two spaces before the value reproduce the output of the original
    # comma-separated print statement.
    print("Distinct COMP OS  %s" % len(COMP_OS_TYPES))
    _assign_indices(COMP_OS_TYPES, COMP_OS_TYPES_DICT)
    print("COMP OS dctionary size  %s" % len(COMP_OS_TYPES_DICT))

    # Browser version.
    BROWSER_VERSION = data_fields.map(lambda field: field[6]).distinct().collect()
    _assign_indices(BROWSER_VERSION, BROWSER_VERSION_DICT)

    # NOTE: mappings for the anonymous_c1 / anonymous_c2 features were
    # disabled in the original (kept only as an inert string literal) and
    # are intentionally not built here.

    print(DEVICE_TYPES_DICT)
    print(DEVICE_OS_DICT)
    print(DEVICE_COUNTRY_DICT)
    print(COMP_OS_TYPES_DICT)
    print(BROWSER_VERSION_DICT)
示例#6
0
def _assign_indices(values, mapping):
    """Store ``str(value) -> str(position)`` in *mapping* for each value.

    *mapping* is updated in place and is not cleared first, so entries that
    were already present stay unless a value with the same string form
    overwrites them.
    """
    for position, value in enumerate(values):
        mapping[str(value)] = str(position)


def _pickle_dict(mapping, relative_path):
    """Serialize *mapping* with pickle to ``PATH + relative_path``."""
    with open(PATH + relative_path, 'wb') as handle:
        pickle.dump(mapping, handle)


def mapCategoricalFeatures():
    """Build, pickle and apply categorical-to-numeric lookup tables.

    Cookie data (COOKIE_FILE_RAW, via ``ex.getCookieRDD``) and device data
    (DEVICE_FILE_RAW, via ``ex.getDeviceRDD``) are split on commas. For each
    categorical column the distinct values are collected to the driver,
    stored in a module-level list, mapped to their list position (both as
    strings) in the matching module-level dictionary, and that dictionary is
    pickled under ``PATH + "dictionary/"`` so it can be reloaded later.

    Columns read from the cookie data: 2 (computer OS type), 3 (browser
    version), 4 (cookie country). Columns read from the device data:
    3 (device type), 4 (device OS).

    Side effects: rebinds the globals COMP_OS_TYPES, BROWSER_VERSION,
    COOKIE_COUNTRY, DEVICE_TYPES and DEVICE_OS; updates the corresponding
    ``*_DICT`` dictionaries in place; writes five pickle files; finally
    calls ``saveExtractedData`` with the same Spark context.
    """
    global COMP_OS_TYPES, BROWSER_VERSION, COOKIE_COUNTRY
    global DEVICE_TYPES, DEVICE_OS

    # Spark context comes from the helper module imported as ``ex``.
    spark_contxt = ex.configureSpark()

    # ---- cookie data ----
    cookieRDD = ex.getCookieRDD(spark_contxt, COOKIE_FILE_RAW)
    cookie_fields = cookieRDD.map(lambda line: line.split(","))
    # The split rows feed three separate distinct() jobs; keep them on disk.
    cookie_fields.persist(StorageLevel.DISK_ONLY)

    COMP_OS_TYPES = cookie_fields.map(lambda field: field[2]).distinct().collect()
    _assign_indices(COMP_OS_TYPES, COMP_OS_TYPES_DICT)
    _pickle_dict(COMP_OS_TYPES_DICT, "dictionary/comp-os-type-dict.pickle")

    BROWSER_VERSION = cookie_fields.map(lambda field: field[3]).distinct().collect()
    _assign_indices(BROWSER_VERSION, BROWSER_VERSION_DICT)
    _pickle_dict(BROWSER_VERSION_DICT, "dictionary/browser-version-dict.pickle")

    COOKIE_COUNTRY = cookie_fields.map(lambda field: field[4]).distinct().collect()
    _assign_indices(COOKIE_COUNTRY, COOKIE_COUNTRY_DICT)
    _pickle_dict(COOKIE_COUNTRY_DICT, "dictionary/cookie-country-dict.pickle")

    # ---- device data ----
    deviceRDD = ex.getDeviceRDD(spark_contxt, DEVICE_FILE_RAW)
    device_fields = deviceRDD.map(lambda line: line.split(','))
    device_fields.persist(StorageLevel.DISK_ONLY)

    DEVICE_TYPES = device_fields.map(lambda field: field[3]).distinct().collect()
    _assign_indices(DEVICE_TYPES, DEVICE_TYPES_DICT)
    _pickle_dict(DEVICE_TYPES_DICT, "dictionary/dev-type-dict.pickle")

    DEVICE_OS = device_fields.map(lambda field: field[4]).distinct().collect()
    _assign_indices(DEVICE_OS, DEVICE_OS_DICT)
    _pickle_dict(DEVICE_OS_DICT, "dictionary/device-os-dict.pickle")

    # Runs only after all dictionaries above have been filled and pickled.
    saveExtractedData(spark_contxt)