Example #1
def t2(loaded_data):
    #print("processing t1")
    save_dir = "/data/numpy_conversions/data10/"
    #page_encoded_data = PageEncode(loaded_data.csv_data_dict["combined_pagelocation"]).run()
    #event_time = NormalizeTime(loaded_data.csv_data_dict["combined_eventtimestamp"]).run()

    avg_time, avg_std = GetAvgTime(
        loaded_data.csv_data_dict["combined_eventtimestamp"]).run()
    avg_time = avg_time.tolist()
    avg_std = avg_std.tolist()

    login1,login2,login3,login4 = LoginTime(loaded_data.csv_data_dict["sessionstarttime_hour"],\
             loaded_data.csv_data_dict["sessionstarttime_minute"],loaded_data.csv_data_dict["sessionstarttime_weekday"]).run()

    os, browser = ProcessUserAgents(loaded_data.csv_data_dict["os"], \
        loaded_data.csv_data_dict["browser"]).run()
    os = os.tolist()
    browser = browser.tolist()

    res_np = loaded_data.to_numpy([loaded_data.csv_data_dict["data"], \
        os, \
        browser, \
        login1, \
        login2, \
        login3, \
        login4, \
        avg_time, \
        avg_std, \
        loaded_data.csv_data_dict["target"]])
    train, test = split_and_balance_data(res_np)
    filename = save_dir + "t2DUMMY_"
    np.save(filename + "train.npy", train)
    np.save(filename + "test.npy", test)
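
The arrays written by t2() can be read back with np.load. A minimal usage sketch (the paths are taken from this example; allow_pickle=True is only needed if to_numpy produced an object array, which is an assumption here):

import numpy as np

# Hypothetical usage: reload the train/test arrays saved above.
train = np.load("/data/numpy_conversions/data10/t2DUMMY_train.npy", allow_pickle=True)
test = np.load("/data/numpy_conversions/data10/t2DUMMY_test.npy", allow_pickle=True)
print(train.shape, test.shape)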
Example #2
def t2(loaded_data):
    print("processing t2")
    save_dir = "/data/numpy_conversions/data9/"
    processed_time = GetAvgTime(
        loaded_data.csv_data_dict["combined_eventtimestamp"]).run()

    processed_time_list = processed_time.tolist()

    login1, login2 = LoginTime(loaded_data.csv_data_dict["sessionstarttime_hour"],\
                               loaded_data.csv_data_dict["sessionstarttime_minute"]).run()

    os, browser = ProcessUserAgents(loaded_data.csv_data_dict["os"], \
        loaded_data.csv_data_dict["browser"]).run()
    os = os.tolist()
    browser = browser.tolist()
    t = time.time()
    res_np = loaded_data.to_numpy([loaded_data.csv_data_dict["data"], \
        processed_time_list, \
        os, \
        browser, \
        login1, \
        login2, \
        loaded_data.csv_data_dict["target"]])

    train, test = split_and_balance_data(res_np)
    filename = save_dir + "t2_"
    np.save(filename + "train.npy", train)
    np.save(filename + "test.npy", test)
Example #3
File: data9.py Project: z130110/project_
def features(data_dir, file_name, percent_to_read):
    dl = DataLoader(data_dir, file_name, features_to_use= \
        ["sessionstarttime_weekday", \
        "sessionstarttime_hour", \
         "sessionstarttime_minute", \
        "data", \
        "os", \
        "browser", \
        "combined_pagelocation", \
        "combined_eventtimestamp", \
        "target"],percent_to_read=percent_to_read)

    t = time.time()
    processed_time = GetAvgTime(dl.csv_data_dict["combined_eventtimestamp"]).run()
        
    #processed_time = ProcessTime(dl.csv_data_dict["combined_pagelocation"], \
    #    dl.csv_data_dict["combined_eventtimestamp"]).run()
    processed_time_list = processed_time.tolist()
    t = time.time()
    
    login1, login2 = LoginTime(dl.csv_data_dict["sessionstarttime_hour"], dl.csv_data_dict["sessionstarttime_minute"]).run()

    os, browser = ProcessUserAgents(dl.csv_data_dict["os"], \
        dl.csv_data_dict["browser"]).run()
    os = os.tolist()
    browser = browser.tolist()
    t = time.time()
    res_np = dl.to_numpy([dl.csv_data_dict["data"], \
        processed_time_list, \
        os, \
        browser, \
        login1, \
        login2, \
        dl.csv_data_dict["target"]])
    return res_np
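
Unlike the t1/t2 variants elsewhere in this listing, features() only returns the stacked array. A minimal, hypothetical caller that splits and saves the result the same way as the other examples (the data directory, file name, percentage, and the split_and_balance_data helper are assumptions taken from the surrounding code):

import numpy as np

# Hypothetical usage of features(): build the array, then split and persist it.
res_np = features("/data/csv/", "sessions.csv", percent_to_read=100)
train, test = split_and_balance_data(res_np)
save_dir = "/data/numpy_conversions/data9/"
np.save(save_dir + "features_train.npy", train)
np.save(save_dir + "features_test.npy", test)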
Example #4
def t1(loaded_data):
    print("processing t1")
    save_dir = "/data/numpy_conversions/data9/"
    page_encoded_data = PageEncode(
        loaded_data.csv_data_dict["combined_pagelocation"]).run()
    event_time = NormalizeTime(
        loaded_data.csv_data_dict["combined_eventtimestamp"]).run()

    login1, login2 = LoginTime(loaded_data.csv_data_dict["sessionstarttime_hour"],\
                               loaded_data.csv_data_dict["sessionstarttime_minute"]).run()

    os, browser = ProcessUserAgents(loaded_data.csv_data_dict["os"], \
        loaded_data.csv_data_dict["browser"]).run()
    os = os.tolist()
    browser = browser.tolist()

    res_np = loaded_data.to_numpy([page_encoded_data, \
        event_time, \
        os, \
        browser, \
        login1, \
        login2, \
        loaded_data.csv_data_dict["target"]])
    train, test = split_and_balance_data(res_np)
    filename = save_dir + "t1_"
    np.save(filename + "train.npy", train)
    np.save(filename + "test.npy", test)
Example #5
def tbow(loaded_data, task):
    #print("processing t1")
    save_dir = "/data/numpy_conversions/task1large/"

    avg_time, avg_std = GetAvgTime(
        loaded_data.csv_data_dict["all_eventtimestamp"]).run()
    avg_time = avg_time.tolist()
    avg_std = avg_std.tolist()
    event_time = NormalizeTime(loaded_data.csv_data_dict["all_eventtimestamp"],
                               cut_off=10).run()

    login1,login2,login3,login4 = LoginTime(loaded_data.csv_data_dict["sessionstarttime_hour"],\
             loaded_data.csv_data_dict["sessionstarttime_minute"],loaded_data.csv_data_dict["sessionstarttime_weekday"]).run()

    os, browser = ProcessUserAgents(loaded_data.csv_data_dict["os"], \
        loaded_data.csv_data_dict["browser"]).run()
    os = os.tolist()
    browser = browser.tolist()

    bow = BOW(loaded_data.csv_data_dict["combined_pagelocation"]).run()
    bow = bow.tolist()
    if task == 1:

        reg_np = loaded_data.to_numpy([bow, \
            event_time, \
            os, \
            browser, \
            login1, \
            login2, \
            login3, \
            login4, \
            avg_time, \
            avg_std, \
            loaded_data.csv_data_dict["target"]])
        train, test = split_and_balance_data(reg_np, False)
        filename = save_dir + "t1_bow_reg_unbalanced_"
        np.save(filename + "train.npy", train)
        np.save(filename + "test.npy", test)

        #train, test, reg_np = None
    if task == 2:
        class_np = loaded_data.to_numpy([bow, \
            event_time, \
            os, \
            browser, \
            login1, \
            login2, \
            login3, \
            login4, \
            avg_time, \
            avg_std, \
            ConvertClass(loaded_data.csv_data_dict["target"]).run()])
        train, test = split_and_balance_data_class(class_np, False)
        filename = save_dir + "t2_bow_class_unbalanced_"
        np.save(filename + "train.npy", train)
        np.save(filename + "test.npy", test)
    """
Example #6
def t1(loaded_data):
    save_dir = "/data/numpy_conversions/data10/"
    bow = BOW(loaded_data.csv_data_dict["combined_pagelocation"]).run()
    bow = bow.tolist()
    page_encoded_data = PageEncode(
        loaded_data.csv_data_dict["combined_pagelocation"]).run()
    event_time = NormalizeTime(
        loaded_data.csv_data_dict["combined_eventtimestamp"]).run()

    avg_time, avg_std = GetAvgTime(
        loaded_data.csv_data_dict["combined_eventtimestamp"]).run()
    avg_time = avg_time.tolist()
    avg_std = avg_std.tolist()

    time_pca =\
        ProcessTime(loaded_data.csv_data_dict["combined_pagelocation"],loaded_data.csv_data_dict["combined_eventtimestamp"],170).run()
    time_discrete = TimeDiscretize(
        loaded_data.csv_data_dict["combined_eventtimestamp"]).run()
    print("her")
    print(time_pca.shape)

    login1,login2,login3,login4 = LoginTime(loaded_data.csv_data_dict["sessionstarttime_hour"],\
             loaded_data.csv_data_dict["sessionstarttime_minute"],loaded_data.csv_data_dict["sessionstarttime_weekday"]).run()

    os, browser = ProcessUserAgents(loaded_data.csv_data_dict["os"], \
        loaded_data.csv_data_dict["browser"]).run()
    os = os.tolist()
    browser = browser.tolist()

    res_np = loaded_data.to_numpy([page_encoded_data, \
        loaded_data.csv_data_dict["data"],\
        bow,
        time_pca,\
        time_discrete,\
        event_time, \
        os, \
        browser, \
        login1, \
        login2, \
        login3, \
        login4, \
        avg_time, \
        avg_std, \
        loaded_data.csv_data_dict["target"]])
    train, test = split_and_balance_data(res_np)
    filename = save_dir + "t1bow_"
    np.save(filename + "train.npy", train)
    np.save(filename + "test.npy", test)
Example #7
def tword2vec(loaded_data):
    #print("processing t1")
    save_dir = "/data/numpy_conversions/finalreptask3/"
    loaded_data.csv_data_dict["all_eventtimestamp"]
    avg_time, avg_std = GetAvgTime(loaded_data.csv_data_dict["all_eventtimestamp"]).run()
    avg_time = avg_time.tolist()
    avg_std = avg_std.tolist()
    event_time = NormalizeTime(loaded_data.csv_data_dict["all_eventtimestamp"],cut_off = 10).run()
    
    login1,login2,login3,login4 = LoginTime(loaded_data.csv_data_dict["sessionstarttime_hour"],\
             loaded_data.csv_data_dict["sessionstarttime_minute"],loaded_data.csv_data_dict["sessionstarttime_weekday"]).run()
    
    os, browser = ProcessUserAgents(loaded_data.csv_data_dict["os"], \
        loaded_data.csv_data_dict["browser"]).run()
    os = os.tolist()
    browser = browser.tolist()

    reg_np = loaded_data.to_numpy([loaded_data.csv_data_dict["data"], \
        event_time, \
        os, \
        browser, \
        login1, \
        login2, \
        login3, \
        login4, \
        avg_time, \
        avg_std, \
        loaded_data.csv_data_dict["target"]])
    
    #train, test = split_and_balance_data(reg_np,False)
    #filename = save_dir + "t3_word2vec_unbalanced_"
    #np.save(filename + "train.npy", train)
    #np.save(filename + "test.npy", test)
    
    train, test = split_and_balance_data_class(reg_np,True)
    filename = save_dir + "t3_word2vec_balanced_"
    np.save(filename + "train.npy", train)
    np.save(filename + "test.npy", test)
    
    
    train, test = split_and_balance_data_class(reg_np,False)
    filename = save_dir + "t3_word2vec_unbalanced_"
    np.save(filename + "train.npy", train)
    np.save(filename + "test.npy", test)
    """
Example #8
def features(data_dir, file_name, percent_to_read):
    dl = DataLoader(data_dir, file_name, features_to_use= \
        ["sessionstarttime_weekday", \
        "sessionstarttime_hour", \
         "sessionstarttime_minute", \
        "data", \
        "os", \
        "browser", \
        "combined_pagelocation", \
        "combined_eventtimestamp", \
        "target"],percent_to_read=percent_to_read)

    page_encoded_data = PageEncode(
        dl.csv_data_dict["combined_pagelocation"]).run()
    #print(page_encoded_data)
    #print(list(map(len, page_encoded_data)))

    event_time = NormalizeTime(
        dl.csv_data_dict["combined_eventtimestamp"]).run()
    #print(event_time)

    login1, login2 = LoginTime(
        dl.csv_data_dict["sessionstarttime_hour"],
        dl.csv_data_dict["sessionstarttime_minute"]).run()

    os, browser = ProcessUserAgents(dl.csv_data_dict["os"], \
        dl.csv_data_dict["browser"]).run()
    os = os.tolist()
    browser = browser.tolist()

    res_np = dl.to_numpy([page_encoded_data, \
        event_time, \
        os, \
        browser, \
        login1, \
        login2, \
        dl.csv_data_dict["target"]])

    b1 = BalanceDataset(res_np, {"target": -1}, "not classification").run()
    return res_np
Example #9
def t2(loaded_data):
    #print("processing t1")
    save_dir = "/data/numpy_conversions/data10/"

    avg_time, avg_std = GetAvgTime(
        loaded_data.csv_data_dict["combined_eventtimestamp"]).run()
    avg_time = avg_time.tolist()
    avg_std = avg_std.tolist()

    login1,login2,login3,login4 = LoginTime(loaded_data.csv_data_dict["sessionstarttime_hour"],\
             loaded_data.csv_data_dict["sessionstarttime_minute"],loaded_data.csv_data_dict["sessionstarttime_weekday"]).run()

    os, browser = ProcessUserAgents(loaded_data.csv_data_dict["os"], \
        loaded_data.csv_data_dict["browser"]).run()
    os = os.tolist()
    browser = browser.tolist()

    bow = BOW(loaded_data.csv_data_dict["combined_pagelocation"]).run()
    bow = bow.tolist()

    res_np = loaded_data.to_numpy([bow, \
        os, \
        browser, \
        login1, \
        login2, \
        login3, \
        login4, \
        avg_time, \
        avg_std, \
        loaded_data.csv_data_dict["target"]])
    balanced = False
    train, test = split_and_balance_data(res_np, False)
    if balanced:
        filename = save_dir + "t2classbow_"
    else:
        filename = save_dir + "t2classbowunbalanced_"
    #filename = save_dir + "t2classbowunbalanced_"
    np.save(filename + "train.npy", train)
    np.save(filename + "test.npy", test)
Example #10
def features(data_dir, file_name, percent_to_read):
    dl = DataLoader(data_dir, file_name, features_to_use= \
        ["sessionstarttime_weekday", \
        "sessionstarttime_hour", \
         "sessionstarttime_minute", \
        "data", \
        "os", \
        "browser", \
        "combined_pagelocation", \
        "combined_eventtimestamp", \
        "target"],percent_to_read=percent_to_read)

    t = time.time()
    processed_time = NormalizeTime(
        dl.csv_data_dict["combined_eventtimestamp"]).run()

    tokens = TokenizePages(dl.csv_data_dict["combined_pagelocation"]).run()
    login1, login2 = LoginTime(
        dl.csv_data_dict["sessionstarttime_hour"],
        dl.csv_data_dict["sessionstarttime_minute"]).run()

    os, browser = ProcessUserAgents(dl.csv_data_dict["os"], \
        dl.csv_data_dict["browser"]).run()
    os = os.tolist()
    browser = browser.tolist()
    res_np = dl.to_numpy([
        tokens, \
        processed_time, \
        os, \
        browser, \
        login1, \
        login2, \
        dl.csv_data_dict["target"]])

    train, test = split_and_balance_data(res_np)
    save_dir = "/data/numpy_conversions/data10/"
    np.save(save_dir + "tokenized_train.npy", train)
    np.save(save_dir + "tokenized_test.npy", test)
Example #11
def tbow(loaded_data):
    save_dir = "/data/numpy_conversions/finalreptask3/"
    
    avg_time, avg_std = GetAvgTime(loaded_data.csv_data_dict["all_eventtimestamp"]).run()
    avg_time = avg_time.tolist()
    avg_std = avg_std.tolist()
    event_time = NormalizeTime(loaded_data.csv_data_dict["all_eventtimestamp"],cut_off = 10).run()

    
    login1,login2,login3,login4 = LoginTime(loaded_data.csv_data_dict["sessionstarttime_hour"],\
             loaded_data.csv_data_dict["sessionstarttime_minute"],loaded_data.csv_data_dict["sessionstarttime_weekday"]).run()
    
    os, browser = ProcessUserAgents(loaded_data.csv_data_dict["os"], \
        loaded_data.csv_data_dict["browser"]).run()
    os = os.tolist()
    browser = browser.tolist()
    
    
    clicks = loaded_data.csv_data_dict["combined_pagelocation2"]
    bow = BOW(clicks,json_name="task3_new").run()
    #bow = bow.tolist()
    
    reg_np = loaded_data.to_numpy([bow, \
        event_time, \
        os, \
        browser, \
        login1, \
        login2, \
        login3, \
        login4, \
        avg_time, \
        avg_std, \
        loaded_data.csv_data_dict["target"]])

    train, test = split_and_balance_data_class(reg_np, False)
    filename = save_dir + "t3_bow_unbalanced_"
    np.save(filename + "train.npy", train)
    np.save(filename + "test.npy", test)
    
    train, test = split_and_balance_data_class(reg_np,True)
    filename = save_dir + "t3_bow_balanced_"
    np.save(filename + "train.npy", train)
    np.save(filename + "test.npy", test)

    """
    class_np = loaded_data.to_numpy([bow, \
        os, \
        browser, \
        login1, \
        login2, \
        login3, \
        login4, \
        avg_time, \
        avg_std, \
        ConvertClass(loaded_data.csv_data_dict["target"]).run()])
    
    """
    """
    train, test = split_and_balance_data_class(class_np,False)
    filename = save_dir + "t2_bow_class_unbalanced_"
    
    np.save(filename + "train.npy", train)
    np.save(filename + "test.npy", test)
    """
    """