예제 #1
0
def mapCookieCategoricalFeatures():
    """Index three categorical cookie columns and pickle each mapping.

    The cookie file is loaded through ``ex.getCookieRDD`` and every record
    is split on commas. For columns 2, 3 and 4 the distinct values are
    collected into the module globals ``COMP_OS_TYPES``,
    ``BROWSER_VERSION`` and ``COOKIE_COUNTRY``. Each value is then mapped
    to its position in that list (both stored as strings) inside the
    matching ``*_DICT`` module-level dictionary, and the dictionary is
    pickled to a file with a relative path (resolved against the current
    working directory).

    Returns nothing; the results are the updated globals and three files.

    NOTE(review): the index given to a value follows the order in which
    ``collect()`` returns the distinct values. Confirm that consumers
    always load the pickled mapping rather than assuming a fixed order.
    """

    def _distinct_column(fields_rdd, column):
        """Return the distinct values found at ``column`` of every record."""
        return fields_rdd.map(
            lambda field: field[column]).distinct().collect()

    def _index_and_dump(values, mapping, pickle_path):
        """Map each value to its list position and pickle ``mapping``."""
        for index, value in enumerate(values):
            mapping[str(value)] = str(index)
        with open(pickle_path, 'wb') as f:
            pickle.dump(mapping, f)

    global COMP_OS_TYPES, BROWSER_VERSION, COOKIE_COUNTRY

    # get spark context from import specified
    spark_contxt = ex.configureSpark()
    # presumably an RDD of comma-separated text lines -- verify against
    # ex.getCookieRDD
    dataRDD = ex.getCookieRDD(spark_contxt, COOKIE_FILE_RAW)
    data_fields = dataRDD.map(lambda line: line.split(","))
    # Persist the split records: this is the RDD scanned by the three jobs
    # below, so caching it avoids re-splitting every line for each column.
    data_fields.persist(StorageLevel.DISK_ONLY)

    COMP_OS_TYPES = _distinct_column(data_fields, 2)
    _index_and_dump(
        COMP_OS_TYPES, COMP_OS_TYPES_DICT, "comp-os-type-dict.pickle")

    BROWSER_VERSION = _distinct_column(data_fields, 3)
    _index_and_dump(
        BROWSER_VERSION, BROWSER_VERSION_DICT, "browser-version-dict.pickle")

    COOKIE_COUNTRY = _distinct_column(data_fields, 4)
    _index_and_dump(
        COOKIE_COUNTRY, COOKIE_COUNTRY_DICT, "cookie-country-dict.pickle")
예제 #2
0
def mapCookieCategoricalFeatures():
    """Build and pickle value-to-index maps for cookie columns 2, 3 and 4.

    Steps:

    1. Obtain a Spark context from ``ex.configureSpark`` and the cookie
       RDD from ``ex.getCookieRDD``; split every record on ``","``.
    2. For each of the three columns, collect the distinct values into a
       module global (``COMP_OS_TYPES``, ``BROWSER_VERSION``,
       ``COOKIE_COUNTRY``).
    3. Store ``str(value) -> str(position)`` in the matching module-level
       ``*_DICT`` dictionary and pickle that dictionary to a file in the
       current working directory.

    The function returns ``None``.

    NOTE(review): positions come from the order ``collect()`` yields the
    distinct values; verify nothing downstream relies on that order being
    the same between runs.
    """

    def _save_index_mapping(values, mapping, filename):
        """Fill ``mapping`` with value -> position and pickle it."""
        for position, value in enumerate(values):
            mapping[str(value)] = str(position)
        with open(filename, 'wb') as f:
            pickle.dump(mapping, f)

    global COMP_OS_TYPES, BROWSER_VERSION, COOKIE_COUNTRY

    # get spark context from import specified
    spark_contxt = ex.configureSpark()
    # get cookie RDD (assumed to hold one comma-separated record per
    # element -- TODO confirm in ex.getCookieRDD)
    dataRDD = ex.getCookieRDD(spark_contxt, COOKIE_FILE_RAW)
    # retrieve the individual fields of every record
    data_fields = dataRDD.map(lambda line: line.split(","))
    # Cache the split records rather than the raw lines: data_fields is
    # what the three distinct() jobs below actually read.
    data_fields.persist(StorageLevel.DISK_ONLY)

    COMP_OS_TYPES = data_fields.map(lambda field: field[2]).distinct().collect()
    _save_index_mapping(
        COMP_OS_TYPES, COMP_OS_TYPES_DICT, "comp-os-type-dict.pickle")

    BROWSER_VERSION = data_fields.map(lambda field: field[3]).distinct().collect()
    _save_index_mapping(
        BROWSER_VERSION, BROWSER_VERSION_DICT, "browser-version-dict.pickle")

    COOKIE_COUNTRY = data_fields.map(lambda field: field[4]).distinct().collect()
    _save_index_mapping(
        COOKIE_COUNTRY, COOKIE_COUNTRY_DICT, "cookie-country-dict.pickle")
예제 #3
0
def mapCategoricalFeatures():
    """Index categorical cookie and device columns, then save extracted data.

    Cookie records (from ``ex.getCookieRDD``) and device records (from
    ``ex.getDeviceRDD``) are split on commas and persisted to disk. For
    each column listed below, the distinct values are collected into a
    module global, each value is mapped to its list position (both as
    strings) in the matching module-level ``*_DICT`` dictionary, and the
    dictionary is pickled under ``PATH + "dictionary/"``:

    ========  ======  =================  ==========================
    source    column  global             pickle file
    ========  ======  =================  ==========================
    cookie    2       COMP_OS_TYPES      comp-os-type-dict.pickle
    cookie    3       BROWSER_VERSION    browser-version-dict.pickle
    cookie    4       COOKIE_COUNTRY     cookie-country-dict.pickle
    device    3       DEVICE_TYPES       dev-type-dict.pickle
    device    4       DEVICE_OS          device-os-dict.pickle
    ========  ======  =================  ==========================

    Finally ``saveExtractedData`` is called with the Spark context.
    The function returns ``None``.

    NOTE(review): ``PATH`` is concatenated as-is, so it presumably ends
    with a path separator and the ``dictionary`` directory must already
    exist -- confirm where ``PATH`` is defined.
    NOTE(review): positions follow the order ``collect()`` returns the
    distinct values; confirm consumers always read the pickled mapping.
    """

    def _distinct_column(fields_rdd, column):
        """Return the distinct values found at ``column`` of every record."""
        return fields_rdd.map(lambda field: field[column]).distinct().collect()

    def _index_and_dump(values, mapping, pickle_path):
        """Map each value to its list position and pickle ``mapping``.

        The pickled file can be deserialized later to reuse the mapping.
        """
        for index, value in enumerate(values):
            mapping[str(value)] = str(index)
        with open(pickle_path, 'wb') as f:
            pickle.dump(mapping, f)

    global COMP_OS_TYPES, BROWSER_VERSION, COOKIE_COUNTRY
    global DEVICE_TYPES, DEVICE_OS

    # get spark context from import specified
    spark_contxt = ex.configureSpark()

    # --- cookie features ---
    cookieRDD = ex.getCookieRDD(spark_contxt, COOKIE_FILE_RAW)
    cookie_fields = cookieRDD.map(lambda line: line.split(","))
    # persisted because three separate jobs below scan the same records
    cookie_fields.persist(StorageLevel.DISK_ONLY)

    COMP_OS_TYPES = _distinct_column(cookie_fields, 2)
    _index_and_dump(
        COMP_OS_TYPES, COMP_OS_TYPES_DICT,
        PATH + "dictionary/comp-os-type-dict.pickle")

    BROWSER_VERSION = _distinct_column(cookie_fields, 3)
    _index_and_dump(
        BROWSER_VERSION, BROWSER_VERSION_DICT,
        PATH + "dictionary/browser-version-dict.pickle")

    COOKIE_COUNTRY = _distinct_column(cookie_fields, 4)
    _index_and_dump(
        COOKIE_COUNTRY, COOKIE_COUNTRY_DICT,
        PATH + "dictionary/cookie-country-dict.pickle")

    # --- device features ---
    deviceRDD = ex.getDeviceRDD(spark_contxt, DEVICE_FILE_RAW)
    device_fields = deviceRDD.map(lambda line: line.split(','))
    # persisted because two separate jobs below scan the same records
    device_fields.persist(StorageLevel.DISK_ONLY)

    DEVICE_TYPES = _distinct_column(device_fields, 3)
    _index_and_dump(
        DEVICE_TYPES, DEVICE_TYPES_DICT,
        PATH + "dictionary/dev-type-dict.pickle")

    DEVICE_OS = _distinct_column(device_fields, 4)
    _index_and_dump(
        DEVICE_OS, DEVICE_OS_DICT,
        PATH + "dictionary/device-os-dict.pickle")

    saveExtractedData(spark_contxt)