def t2(loaded_data):
    """Build the "dummy" task-2 feature matrix and save train/test splits.

    Reads pre-loaded columns from ``loaded_data.csv_data_dict``, derives
    per-session features, stacks them via ``loaded_data.to_numpy``, splits
    and balances, and writes
    ``/data/numpy_conversions/data10/t2DUMMY_{train,test}.npy``.

    NOTE(review): this ``def t2`` is shadowed by a later ``def t2`` in the
    same module, so it is only reachable if called before the redefinition
    — confirm which version is intended to be live.
    """
    #print("processing t1")
    save_dir = "/data/numpy_conversions/data10/"
    # Alternatives kept from a previous experiment:
    #page_encoded_data = PageEncode(loaded_data.csv_data_dict["combined_pagelocation"]).run()
    #event_time = NormalizeTime(loaded_data.csv_data_dict["combined_eventtimestamp"]).run()
    # Average and spread of event timestamps (presumably per session — TODO confirm).
    avg_time, avg_std = GetAvgTime(
        loaded_data.csv_data_dict["combined_eventtimestamp"]).run()
    avg_time = avg_time.tolist()
    avg_std = avg_std.tolist()
    # Four login-time features derived from session-start hour/minute/weekday.
    login1, login2, login3, login4 = LoginTime(
        loaded_data.csv_data_dict["sessionstarttime_hour"],
        loaded_data.csv_data_dict["sessionstarttime_minute"],
        loaded_data.csv_data_dict["sessionstarttime_weekday"]).run()
    # NOTE: local name `os` shadows the stdlib module inside this function.
    os, browser = ProcessUserAgents(
        loaded_data.csv_data_dict["os"],
        loaded_data.csv_data_dict["browser"]).run()
    os = os.tolist()
    browser = browser.tolist()
    # Column order defines the final feature layout — keep in sync with consumers.
    res_np = loaded_data.to_numpy([
        loaded_data.csv_data_dict["data"],
        os,
        browser,
        login1,
        login2,
        login3,
        login4,
        avg_time,
        avg_std,
        loaded_data.csv_data_dict["target"]])
    train, test = split_and_balance_data(res_np)
    filename = save_dir + "t2DUMMY_"
    np.save(filename + "train.npy", train)
    np.save(filename + "test.npy", test)
def t2(loaded_data):
    """Build the task-2 feature matrix and save train/test splits to data9.

    Combines the raw "data" column with averaged event-time, OS/browser and
    two login-time features, then writes
    ``/data/numpy_conversions/data9/t2_{train,test}.npy``.

    Fix: removed an unused local ``t = time.time()`` (its value was never
    read; no timing was ever reported).
    """
    print("processing t2")
    save_dir = "/data/numpy_conversions/data9/"
    # NOTE(review): elsewhere in this module GetAvgTime(...).run() is
    # unpacked into a (avg_time, avg_std) pair; here its result is used as a
    # single array — confirm which return contract this version of the
    # helper has.
    processed_time = GetAvgTime(
        loaded_data.csv_data_dict["combined_eventtimestamp"]).run()
    processed_time_list = processed_time.tolist()
    # Two login-time features from session-start hour/minute.
    login1, login2 = LoginTime(
        loaded_data.csv_data_dict["sessionstarttime_hour"],
        loaded_data.csv_data_dict["sessionstarttime_minute"]).run()
    # NOTE: local name `os` shadows the stdlib module inside this function.
    os, browser = ProcessUserAgents(
        loaded_data.csv_data_dict["os"],
        loaded_data.csv_data_dict["browser"]).run()
    os = os.tolist()
    browser = browser.tolist()
    # Column order defines the final feature layout — keep in sync with consumers.
    res_np = loaded_data.to_numpy([
        loaded_data.csv_data_dict["data"],
        processed_time_list,
        os,
        browser,
        login1,
        login2,
        loaded_data.csv_data_dict["target"]])
    train, test = split_and_balance_data(res_np)
    filename = save_dir + "t2_"
    np.save(filename + "train.npy", train)
    np.save(filename + "test.npy", test)
def features(data_dir, file_name, percent_to_read):
    """Load a CSV slice and return the averaged-time feature matrix.

    Args:
        data_dir: directory containing the input file.
        file_name: CSV file to load.
        percent_to_read: fraction/percentage of rows the DataLoader reads.

    Returns:
        The numpy matrix produced by ``dl.to_numpy`` (data, processed time,
        OS/browser, two login-time features, target).

    Fix: removed three unused ``t = time.time()`` locals whose values were
    never read (no timing was ever reported), and the stale commented-out
    ProcessTime alternative.
    """
    dl = DataLoader(data_dir, file_name, features_to_use=[
        "sessionstarttime_weekday",
        "sessionstarttime_hour",
        "sessionstarttime_minute",
        "data",
        "os",
        "browser",
        "combined_pagelocation",
        "combined_eventtimestamp",
        "target"], percent_to_read=percent_to_read)
    # NOTE(review): elsewhere GetAvgTime(...).run() is unpacked into a pair;
    # here it is used as one array — confirm the helper's return contract.
    processed_time = GetAvgTime(dl.csv_data_dict["combined_eventtimestamp"]).run()
    processed_time_list = processed_time.tolist()
    login1, login2 = LoginTime(
        dl.csv_data_dict["sessionstarttime_hour"],
        dl.csv_data_dict["sessionstarttime_minute"]).run()
    # NOTE: local name `os` shadows the stdlib module inside this function.
    os, browser = ProcessUserAgents(
        dl.csv_data_dict["os"],
        dl.csv_data_dict["browser"]).run()
    os = os.tolist()
    browser = browser.tolist()
    # Column order defines the final feature layout — keep in sync with consumers.
    res_np = dl.to_numpy([
        dl.csv_data_dict["data"],
        processed_time_list,
        os,
        browser,
        login1,
        login2,
        dl.csv_data_dict["target"]])
    return res_np
def t1(loaded_data):
    """Assemble the task-1 feature matrix and write balanced splits to data9.

    Encodes page locations and normalized event times, adds OS/browser and
    two login-time features plus the target column, then saves
    ``/data/numpy_conversions/data9/t1_{train,test}.npy``.
    """
    print("processing t1")
    out_dir = "/data/numpy_conversions/data9/"
    cols = loaded_data.csv_data_dict

    encoded_pages = PageEncode(cols["combined_pagelocation"]).run()
    normalized_times = NormalizeTime(cols["combined_eventtimestamp"]).run()
    login_a, login_b = LoginTime(cols["sessionstarttime_hour"],
                                 cols["sessionstarttime_minute"]).run()
    # Local names shadow-free here; ProcessUserAgents yields two arrays.
    ua_os, ua_browser = ProcessUserAgents(cols["os"], cols["browser"]).run()

    # Column order defines the final feature layout — keep in sync with consumers.
    feature_columns = [
        encoded_pages,
        normalized_times,
        ua_os.tolist(),
        ua_browser.tolist(),
        login_a,
        login_b,
        cols["target"],
    ]
    train, test = split_and_balance_data(loaded_data.to_numpy(feature_columns))

    prefix = out_dir + "t1_"
    np.save(prefix + "train.npy", train)
    np.save(prefix + "test.npy", test)
def tbow(loaded_data, task):
    """Build bag-of-words feature matrices for task 1 (regression) or 2 (classification).

    Derives BOW page features, normalized event times (cut_off=10),
    average-time stats, OS/browser and four login-time features, then saves
    unbalanced train/test splits under
    ``/data/numpy_conversions/task1large/``. Branch is selected by ``task``
    (1 = regression target as-is, 2 = target run through ConvertClass).
    """
    #print("processing t1")
    save_dir = "/data/numpy_conversions/task1large/"
    avg_time, avg_std = GetAvgTime(
        loaded_data.csv_data_dict["all_eventtimestamp"]).run()
    avg_time = avg_time.tolist()
    avg_std = avg_std.tolist()
    event_time = NormalizeTime(loaded_data.csv_data_dict["all_eventtimestamp"], cut_off=10).run()
    # Four login-time features from session-start hour/minute/weekday.
    login1, login2, login3, login4 = LoginTime(
        loaded_data.csv_data_dict["sessionstarttime_hour"],
        loaded_data.csv_data_dict["sessionstarttime_minute"],
        loaded_data.csv_data_dict["sessionstarttime_weekday"]).run()
    # NOTE: local name `os` shadows the stdlib module inside this function.
    os, browser = ProcessUserAgents(
        loaded_data.csv_data_dict["os"],
        loaded_data.csv_data_dict["browser"]).run()
    os = os.tolist()
    browser = browser.tolist()
    bow = BOW(loaded_data.csv_data_dict["combined_pagelocation"]).run()
    bow = bow.tolist()
    if task == 1:
        # Regression variant: raw target, unbalanced split.
        reg_np = loaded_data.to_numpy([
            bow,
            event_time,
            os,
            browser,
            login1,
            login2,
            login3,
            login4,
            avg_time,
            avg_std,
            loaded_data.csv_data_dict["target"]])
        train, test = split_and_balance_data(reg_np, False)
        filename = save_dir + "t1_bow_reg_unbalanced_"
        np.save(filename + "train.npy", train)
        np.save(filename + "test.npy", test)
        #train, test, reg_np = None
    if task == 2:
        # Classification variant: target converted to class labels.
        class_np = loaded_data.to_numpy([
            bow,
            event_time,
            os,
            browser,
            login1,
            login2,
            login3,
            login4,
            avg_time,
            avg_std,
            ConvertClass(loaded_data.csv_data_dict["target"]).run()])
        train, test = split_and_balance_data_class(class_np, False)
        filename = save_dir + "t2_bow_class_unbalanced_"
        np.save(filename + "train.npy", train)
        np.save(filename + "test.npy", test)
"""
def t1(loaded_data):
    """Build the large combined task-1 feature matrix and save splits to data10.

    Stacks page encodings, raw data, BOW features, PCA'd and discretized
    event times, normalized times, average-time stats, OS/browser and four
    login-time features plus the target, then writes
    ``/data/numpy_conversions/data10/t1bow_{train,test}.npy``.

    NOTE(review): this ``def t1`` shadows the earlier ``def t1`` in this
    module — confirm which one is intended to be live.
    """
    save_dir = "/data/numpy_conversions/data10/"
    bow = BOW(loaded_data.csv_data_dict["combined_pagelocation"]).run()
    bow = bow.tolist()
    page_encoded_data = PageEncode(
        loaded_data.csv_data_dict["combined_pagelocation"]).run()
    event_time = NormalizeTime(
        loaded_data.csv_data_dict["combined_eventtimestamp"]).run()
    avg_time, avg_std = GetAvgTime(
        loaded_data.csv_data_dict["combined_eventtimestamp"]).run()
    avg_time = avg_time.tolist()
    avg_std = avg_std.tolist()
    # 170 is presumably the PCA component count passed to ProcessTime — TODO confirm.
    time_pca = ProcessTime(
        loaded_data.csv_data_dict["combined_pagelocation"],
        loaded_data.csv_data_dict["combined_eventtimestamp"], 170).run()
    time_discrete = TimeDiscretize(
        loaded_data.csv_data_dict["combined_eventtimestamp"]).run()
    # Debug prints left in place (likely "here"); they change stdout only.
    print("her")
    print(time_pca.shape)
    # Four login-time features from session-start hour/minute/weekday.
    login1, login2, login3, login4 = LoginTime(
        loaded_data.csv_data_dict["sessionstarttime_hour"],
        loaded_data.csv_data_dict["sessionstarttime_minute"],
        loaded_data.csv_data_dict["sessionstarttime_weekday"]).run()
    # NOTE: local name `os` shadows the stdlib module inside this function.
    os, browser = ProcessUserAgents(
        loaded_data.csv_data_dict["os"],
        loaded_data.csv_data_dict["browser"]).run()
    os = os.tolist()
    browser = browser.tolist()
    # Column order defines the final feature layout — keep in sync with consumers.
    res_np = loaded_data.to_numpy([
        page_encoded_data,
        loaded_data.csv_data_dict["data"],
        bow,
        time_pca,
        time_discrete,
        event_time,
        os,
        browser,
        login1,
        login2,
        login3,
        login4,
        avg_time,
        avg_std,
        loaded_data.csv_data_dict["target"]])
    train, test = split_and_balance_data(res_np)
    filename = save_dir + "t1bow_"
    np.save(filename + "train.npy", train)
    np.save(filename + "test.npy", test)
def tword2vec(loaded_data): #print("processing t1") save_dir = "/data/numpy_conversions/finalreptask3/" loaded_data.csv_data_dict["all_eventtimestamp"] avg_time, avg_std = GetAvgTime(loaded_data.csv_data_dict["all_eventtimestamp"]).run() avg_time = avg_time.tolist() avg_std = avg_std.tolist() event_time = NormalizeTime(loaded_data.csv_data_dict["all_eventtimestamp"],cut_off = 10).run() login1,login2,login3,login4 = LoginTime(loaded_data.csv_data_dict["sessionstarttime_hour"],\ loaded_data.csv_data_dict["sessionstarttime_minute"],loaded_data.csv_data_dict["sessionstarttime_weekday"]).run() os, browser = ProcessUserAgents(loaded_data.csv_data_dict["os"], \ loaded_data.csv_data_dict["browser"]).run() os = os.tolist() browser = browser.tolist() reg_np = loaded_data.to_numpy([loaded_data.csv_data_dict["data"], \ event_time, \ os, \ browser, \ login1, \ login2, \ login3, \ login4, \ avg_time, \ avg_std, \ loaded_data.csv_data_dict["target"]]) #train, test = split_and_balance_data(reg_np,False) #filename = save_dir + "t3_word2vec_unbalanced_" #np.save(filename + "train.npy", train) #np.save(filename + "test.npy", test) train, test = split_and_balance_data_class(reg_np,True) filename = save_dir + "t3_word2vec_balanced_" np.save(filename + "train.npy", train) np.save(filename + "test.npy", test) train, test = split_and_balance_data_class(reg_np,False) filename = save_dir + "t3_word2vec_unbalanced_" np.save(filename + "train.npy", train) np.save(filename + "test.npy", test) """
def features(data_dir, file_name, percent_to_read):
    """Load a CSV slice and return the page-encoded feature matrix.

    Args:
        data_dir: directory containing the input file.
        file_name: CSV file to load.
        percent_to_read: fraction/percentage of rows the DataLoader reads.

    Returns:
        The numpy matrix produced by ``dl.to_numpy`` (page encodings,
        normalized event times, OS/browser, two login-time features, target).
    """
    dl = DataLoader(data_dir, file_name, features_to_use=
        ["sessionstarttime_weekday",
         "sessionstarttime_hour",
         "sessionstarttime_minute",
         "data",
         "os",
         "browser",
         "combined_pagelocation",
         "combined_eventtimestamp",
         "target"], percent_to_read=percent_to_read)
    page_encoded_data = PageEncode(
        dl.csv_data_dict["combined_pagelocation"]).run()
    event_time = NormalizeTime(
        dl.csv_data_dict["combined_eventtimestamp"]).run()
    login1, login2 = LoginTime(
        dl.csv_data_dict["sessionstarttime_hour"],
        dl.csv_data_dict["sessionstarttime_minute"]).run()
    # NOTE: local name `os` shadows the stdlib module inside this function.
    os, browser = ProcessUserAgents(
        dl.csv_data_dict["os"],
        dl.csv_data_dict["browser"]).run()
    os = os.tolist()
    browser = browser.tolist()
    # Column order defines the final feature layout — keep in sync with consumers.
    res_np = dl.to_numpy([
        page_encoded_data,
        event_time,
        os,
        browser,
        login1,
        login2,
        dl.csv_data_dict["target"]])
    # NOTE(review): b1 is never used and the balanced result is discarded —
    # kept only in case BalanceDataset.run() has side effects; confirm and
    # either use or delete.
    b1 = BalanceDataset(res_np, {"target": -1}, "not classification").run()
    return res_np
def t2(loaded_data):
    """Build the task-2 bag-of-words feature matrix and save splits to data10.

    Stacks BOW page features, OS/browser, four login-time features and
    average-time stats plus the target, then writes
    ``t2classbowunbalanced_{train,test}.npy`` (or ``t2classbow_`` when the
    ``balanced`` toggle is flipped) under
    ``/data/numpy_conversions/data10/``.

    Fix: the ``balanced`` toggle previously only selected the output
    filename while the split call hard-coded ``False``; the flag is now
    passed through so the split and the filename cannot drift apart.
    Behavior is unchanged (``balanced`` is False).
    """
    save_dir = "/data/numpy_conversions/data10/"
    avg_time, avg_std = GetAvgTime(
        loaded_data.csv_data_dict["combined_eventtimestamp"]).run()
    avg_time = avg_time.tolist()
    avg_std = avg_std.tolist()
    # Four login-time features from session-start hour/minute/weekday.
    login1, login2, login3, login4 = LoginTime(
        loaded_data.csv_data_dict["sessionstarttime_hour"],
        loaded_data.csv_data_dict["sessionstarttime_minute"],
        loaded_data.csv_data_dict["sessionstarttime_weekday"]).run()
    # NOTE: local name `os` shadows the stdlib module inside this function.
    os, browser = ProcessUserAgents(
        loaded_data.csv_data_dict["os"],
        loaded_data.csv_data_dict["browser"]).run()
    os = os.tolist()
    browser = browser.tolist()
    bow = BOW(loaded_data.csv_data_dict["combined_pagelocation"]).run()
    bow = bow.tolist()
    # Column order defines the final feature layout — keep in sync with consumers.
    res_np = loaded_data.to_numpy([
        bow,
        os,
        browser,
        login1,
        login2,
        login3,
        login4,
        avg_time,
        avg_std,
        loaded_data.csv_data_dict["target"]])
    balanced = False
    train, test = split_and_balance_data(res_np, balanced)
    if balanced:
        filename = save_dir + "t2classbow_"
    else:
        filename = save_dir + "t2classbowunbalanced_"
    np.save(filename + "train.npy", train)
    np.save(filename + "test.npy", test)
def features(data_dir, file_name, percent_to_read):
    """Load a CSV slice, build tokenized-page features, and save splits.

    Args:
        data_dir: directory containing the input file.
        file_name: CSV file to load.
        percent_to_read: fraction/percentage of rows the DataLoader reads.

    Side effects:
        Writes ``tokenized_train.npy`` / ``tokenized_test.npy`` under
        ``/data/numpy_conversions/data10/``. Returns None.

    Fix: removed an unused local ``t = time.time()`` whose value was never
    read (no timing was ever reported).
    """
    dl = DataLoader(data_dir, file_name, features_to_use=[
        "sessionstarttime_weekday",
        "sessionstarttime_hour",
        "sessionstarttime_minute",
        "data",
        "os",
        "browser",
        "combined_pagelocation",
        "combined_eventtimestamp",
        "target"], percent_to_read=percent_to_read)
    processed_time = NormalizeTime(
        dl.csv_data_dict["combined_eventtimestamp"]).run()
    tokens = TokenizePages(dl.csv_data_dict["combined_pagelocation"]).run()
    login1, login2 = LoginTime(
        dl.csv_data_dict["sessionstarttime_hour"],
        dl.csv_data_dict["sessionstarttime_minute"]).run()
    # NOTE: local name `os` shadows the stdlib module inside this function.
    os, browser = ProcessUserAgents(
        dl.csv_data_dict["os"],
        dl.csv_data_dict["browser"]).run()
    os = os.tolist()
    browser = browser.tolist()
    # Column order defines the final feature layout — keep in sync with consumers.
    res_np = dl.to_numpy([
        tokens,
        processed_time,
        os,
        browser,
        login1,
        login2,
        dl.csv_data_dict["target"]])
    train, test = split_and_balance_data(res_np)
    save_dir = "/data/numpy_conversions/data10/"
    np.save(save_dir + "tokenized_train.npy", train)
    np.save(save_dir + "tokenized_test.npy", test)
def tbow(loaded_data): save_dir = "/data/numpy_conversions/finalreptask3/" avg_time, avg_std = GetAvgTime(loaded_data.csv_data_dict["all_eventtimestamp"]).run() avg_time = avg_time.tolist() avg_std = avg_std.tolist() event_time = NormalizeTime(loaded_data.csv_data_dict["all_eventtimestamp"],cut_off = 10).run() login1,login2,login3,login4 = LoginTime(loaded_data.csv_data_dict["sessionstarttime_hour"],\ loaded_data.csv_data_dict["sessionstarttime_minute"],loaded_data.csv_data_dict["sessionstarttime_weekday"]).run() os, browser = ProcessUserAgents(loaded_data.csv_data_dict["os"], \ loaded_data.csv_data_dict["browser"]).run() os = os.tolist() browser = browser.tolist() clicks = loaded_data.csv_data_dict["combined_pagelocation2"] bow = BOW(clicks,json_name="task3_new").run() #bow = bow.tolist() reg_np = loaded_data.to_numpy([bow, \ event_time, \ os, \ browser, \ login1, \ login2, \ login3, \ login4, \ avg_time, \ avg_std, \ loaded_data.csv_data_dict["target"]]) train, test d= split_and_balance_data_class(reg_np,False) filename = save_dir + "t3_bow_unbalanced_" np.save(filename + "train.npy", train) np.save(filename + "test.npy", test) train, test = split_and_balance_data_class(reg_np,True) filename = save_dir + "t3_bow_balanced_" np.save(filename + "train.npy", train) np.save(filename + "test.npy", test) """ class_np = loaded_data.to_numpy([bow, \ os, \ browser, \ login1, \ login2, \ login3, \ login4, \ avg_time, \ avg_std, \ ConvertClass(loaded_data.csv_data_dict["target"]).run()]) """ """ train, test = split_and_balance_data_class(class_np,False) filename = save_dir + "t2_bow_class_unbalanced_" np.save(filename + "train.npy", train) np.save(filename + "test.npy", test) """ """