def mapCookieCategoricalFeatures():
    """Build numeric-index lookup tables for the categorical cookie features
    (computer os type, browser version, cookie country) and pickle each table.

    Side effects:
        * rebinds globals COMP_OS_TYPES, BROWSER_VERSION, COOKIE_COUNTRY to
          the distinct raw values collected from the cookie data
        * fills COMP_OS_TYPES_DICT, BROWSER_VERSION_DICT, COOKIE_COUNTRY_DICT
          with str(value) -> str(running index) mappings
        * writes the three dictionaries as pickle files in the current
          working directory

    NOTE(review): a second definition of this same function appears later in
    the file and shadows this one — confirm which version is intended.
    """
    # Get spark context from the imported helper module.
    spark_contxt = ex.configureSpark()
    # Get cookie RDD from the raw cookie file.
    dataRDD = ex.getCookieRDD(spark_contxt, COOKIE_FILE_RAW)
    # Split each CSV line into its fields.
    data_fields = dataRDD.map(lambda line: line.split(","))
    # FIX: persist the derived RDD that is traversed three times below.
    # The original persisted dataRDD, so every distinct() re-ran the split;
    # this also matches what mapCategoricalFeatures does.
    data_fields.persist(StorageLevel.DISK_ONLY)

    def _index_distinct(column_rdd, target_dict):
        # Collect a column's distinct values and fill target_dict with
        # str(value) -> str(running index), preserving the original
        # string-typed storage. Returns the collected value list.
        values = column_rdd.distinct().collect()
        for index, value in enumerate(values):
            target_dict[str(value)] = str(index)
        return values

    global COMP_OS_TYPES
    COMP_OS_TYPES = _index_distinct(
        data_fields.map(lambda field: field[2]), COMP_OS_TYPES_DICT)
    # Serialize COMP_OS_TYPES_DICT with pickle.
    with open("comp-os-type-dict.pickle", 'wb') as f:
        pickle.dump(COMP_OS_TYPES_DICT, f)

    global BROWSER_VERSION
    BROWSER_VERSION = _index_distinct(
        data_fields.map(lambda field: field[3]), BROWSER_VERSION_DICT)
    # Serialize BROWSER_VERSION_DICT with pickle.
    with open("browser-version-dict.pickle", 'wb') as f:
        pickle.dump(BROWSER_VERSION_DICT, f)

    global COOKIE_COUNTRY
    COOKIE_COUNTRY = _index_distinct(
        data_fields.map(lambda field: field[4]), COOKIE_COUNTRY_DICT)
    # Serialize COOKIE_COUNTRY_DICT with pickle.
    with open("cookie-country-dict.pickle", "wb") as f:
        pickle.dump(COOKIE_COUNTRY_DICT, f)
def mapCookieCategoricalFeatures():
    """Build numeric-index lookup tables for the categorical cookie features
    (computer os type, browser version, cookie country) and pickle each table.

    Side effects:
        * rebinds globals COMP_OS_TYPES, BROWSER_VERSION, COOKIE_COUNTRY to
          the distinct raw values collected from the cookie data
        * fills COMP_OS_TYPES_DICT, BROWSER_VERSION_DICT, COOKIE_COUNTRY_DICT
          with str(value) -> str(running index) mappings
        * writes the three dictionaries as pickle files in the current
          working directory

    NOTE(review): this is a duplicate definition — an earlier function of the
    same name exists in this file and is shadowed by this one. Confirm which
    version is intended and delete the other.
    """
    # Get spark context from the imported helper module.
    spark_contxt = ex.configureSpark()
    # Get cookie RDD from the raw cookie file.
    dataRDD = ex.getCookieRDD(spark_contxt, COOKIE_FILE_RAW)
    # Split each CSV line into its fields.
    data_fields = dataRDD.map(lambda line: line.split(","))
    # FIX: persist the derived RDD that is traversed three times below.
    # The original persisted dataRDD, so every distinct() re-ran the split;
    # this also matches what mapCategoricalFeatures does.
    data_fields.persist(StorageLevel.DISK_ONLY)

    def _index_distinct(column_rdd, target_dict):
        # Collect a column's distinct values and fill target_dict with
        # str(value) -> str(running index), preserving the original
        # string-typed storage. Returns the collected value list.
        values = column_rdd.distinct().collect()
        for index, value in enumerate(values):
            target_dict[str(value)] = str(index)
        return values

    global COMP_OS_TYPES
    COMP_OS_TYPES = _index_distinct(
        data_fields.map(lambda field: field[2]), COMP_OS_TYPES_DICT)
    # Serialize COMP_OS_TYPES_DICT with pickle.
    with open("comp-os-type-dict.pickle", 'wb') as f:
        pickle.dump(COMP_OS_TYPES_DICT, f)

    global BROWSER_VERSION
    BROWSER_VERSION = _index_distinct(
        data_fields.map(lambda field: field[3]), BROWSER_VERSION_DICT)
    # Serialize BROWSER_VERSION_DICT with pickle.
    with open("browser-version-dict.pickle", 'wb') as f:
        pickle.dump(BROWSER_VERSION_DICT, f)

    global COOKIE_COUNTRY
    COOKIE_COUNTRY = _index_distinct(
        data_fields.map(lambda field: field[4]), COOKIE_COUNTRY_DICT)
    # Serialize COOKIE_COUNTRY_DICT with pickle.
    with open("cookie-country-dict.pickle", "wb") as f:
        pickle.dump(COOKIE_COUNTRY_DICT, f)
def mapCategoricalFeatures():
    """Build and pickle numeric-index lookup tables for every categorical
    feature of both cookies (os type, browser version, country) and devices
    (device type, device os), then hand off to saveExtractedData.

    Side effects:
        * rebinds globals COMP_OS_TYPES, BROWSER_VERSION, COOKIE_COUNTRY,
          DEVICE_TYPES, DEVICE_OS to the distinct values collected from the
          raw data
        * fills the corresponding *_DICT globals with
          str(value) -> str(running index) mappings
        * writes each dictionary as a pickle file under PATH + "dictionary/"
          (serialized now, deserialized later for reuse)
        * calls saveExtractedData(spark_contxt) as the final step
    """
    # Get spark context from the imported helper module.
    spark_contxt = ex.configureSpark()

    def _index_distinct(column_rdd, target_dict):
        # Collect a column's distinct values and fill target_dict with
        # str(value) -> str(running index), preserving the original
        # string-typed storage. Returns the collected value list.
        values = column_rdd.distinct().collect()
        for index, value in enumerate(values):
            target_dict[str(value)] = str(index)
        return values

    def _pickle_dict(mapping, file_path):
        # Serialize one lookup dictionary to disk with pickle.
        with open(file_path, 'wb') as f:
            pickle.dump(mapping, f)

    # --- cookie features -------------------------------------------------
    cookieRDD = ex.getCookieRDD(spark_contxt, COOKIE_FILE_RAW)
    # Split each CSV line into its fields and persist: the derived RDD is
    # traversed once per feature column below.
    cookie_fields = cookieRDD.map(lambda line: line.split(","))
    cookie_fields.persist(StorageLevel.DISK_ONLY)

    global COMP_OS_TYPES
    COMP_OS_TYPES = _index_distinct(
        cookie_fields.map(lambda field: field[2]), COMP_OS_TYPES_DICT)
    _pickle_dict(COMP_OS_TYPES_DICT, PATH + "dictionary/comp-os-type-dict.pickle")

    global BROWSER_VERSION
    BROWSER_VERSION = _index_distinct(
        cookie_fields.map(lambda field: field[3]), BROWSER_VERSION_DICT)
    _pickle_dict(BROWSER_VERSION_DICT, PATH + "dictionary/browser-version-dict.pickle")

    global COOKIE_COUNTRY
    COOKIE_COUNTRY = _index_distinct(
        cookie_fields.map(lambda field: field[4]), COOKIE_COUNTRY_DICT)
    _pickle_dict(COOKIE_COUNTRY_DICT, PATH + "dictionary/cookie-country-dict.pickle")

    # --- device features -------------------------------------------------
    deviceRDD = ex.getDeviceRDD(spark_contxt, DEVICE_FILE_RAW)
    device_fields = deviceRDD.map(lambda line: line.split(','))
    device_fields.persist(StorageLevel.DISK_ONLY)

    global DEVICE_TYPES
    DEVICE_TYPES = _index_distinct(
        device_fields.map(lambda field: field[3]), DEVICE_TYPES_DICT)
    _pickle_dict(DEVICE_TYPES_DICT, PATH + "dictionary/dev-type-dict.pickle")

    global DEVICE_OS
    DEVICE_OS = _index_distinct(
        device_fields.map(lambda field: field[4]), DEVICE_OS_DICT)
    _pickle_dict(DEVICE_OS_DICT, PATH + "dictionary/device-os-dict.pickle")

    # Final step: persist the extracted data via the helper module.
    saveExtractedData(spark_contxt)