Example #1
File: build.py Project: ririgi/valkyrie
def nullDailyDesc():

    s = time.time()
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            data = pd.read_csv(os.path.join(data_dir, f), nrows=100)

            data = utils.addWeekDay(data, f)

            df = stats.getNullDescription(data)

            df = pd.melt(df,
                         id_vars=["colnames"],
                         value_vars=["num_null", "not_null"])
            df["filter_type"] = "day"
            df["filter_value"] = data.iloc[0]["day"]
            df["week"] = data.iloc[0]["week"]

            df = df[[
                "filter_type", "filter_value", "colnames", "variable", "value",
                "week"
            ]]
            print(df.head())

            utils.toCSV(df, "../../results/daily_null_description.csv")

    e = time.time()
    print("Runtime nullDailyDesc: ",
          time.strftime("%H:%M:%S", time.gmtime(e - s)))
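The loop relies on a helper, utils.addWeekDay, that this listing never shows. A minimal sketch of what it might do, assuming date-stamped filenames like "2019-09-03.csv" (both the filename format and the field values are guesses, not the project's actual code):

import datetime as dt
import pandas as pd

def addWeekDay(data: pd.DataFrame, filename: str) -> pd.DataFrame:
    # Hypothetical: parse the date out of a "YYYY-MM-DD.csv" filename.
    date = dt.datetime.strptime(filename[:10], "%Y-%m-%d").date()
    data["day"] = date.strftime("%A").lower()  # e.g. "tuesday"
    data["week"] = date.isocalendar()[1]       # ISO week number
    return data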
Example #2
File: build.py Project: ririgi/valkyrie
def count_is_logged_in():

    s = time.time()

    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            data = pd.read_csv(os.path.join(data_dir, f), nrows=200000)

            data = utils.addWeekDay(data, f)
            data = utils.addIsLoggedIn(data)

            new_df = pd.DataFrame(index=[data.iloc[0]["day"]],
                                  data=data["isloggedin"].sum(),
                                  columns=['isloggedin'])
            new_df.index.name = 'day'
            new_df["week"] = data.iloc[0]["week"]

            utils.toCSV(new_df.reset_index(),
                        "../../results/daily_logged_in.csv")

    data = utils.readCSV("../../results/daily_logged_in.csv")
    data = data.groupby("week")["isloggedin"].sum().to_frame()
    utils.toCSV(data.reset_index(), "../../results/weekly_logged_in.csv")

    e = time.time()
    print("Runtime count_is_logged_in: ",
          time.strftime("%H:%M:%S", time.gmtime(e - s)))
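Every pass of the loop writes to the same ../../results/daily_logged_in.csv, which is then re-read for the weekly rollup. That only works if utils.toCSV appends instead of overwriting; a minimal sketch under that assumption (not the project's actual helper):

import os
import pandas as pd

def toCSV(df: pd.DataFrame, path: str) -> None:
    # Hypothetical: append rows, writing the header only on the first write.
    df.to_csv(path, mode="a", header=not os.path.exists(path), index=False)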
Example #3
def getSessionDuration(filename):
    s = time.time()

    usecols = [
        "bigdatasessionid", "pagetitle", "sessionstarttimestamp",
        "sessionendtimestamp"
    ]
    data = readCSV(filename, usecols)
    # print(len(data))
    data = data.drop(data.loc[data.sessionendtimestamp ==
                              "NaN/NaN/NaN NaN:NaN:NaN.NaN"].index)
    # print(len(data))
    data.sessionendtimestamp = pd.to_datetime(data.sessionendtimestamp)

    data = (data
            .groupby(["bigdatasessionid", "pagetitle"])
            .agg({"sessionstarttimestamp": "min",
                  "sessionendtimestamp": "max"})
            .reset_index())

    data.loc[:, "session_duration"] = (
        data.sessionendtimestamp -
        data.sessionstarttimestamp) / np.timedelta64(1, 's')
    data = getDateInfo(data)
    data = add_id(data)

    toCSV(data, "session_information.csv", filename)

    e = time.time()
    print("Runtime getSessionDuration: ",
          time.strftime("%H:%M:%S", time.gmtime(e - s)), "\n")
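A toy run of the aggregation pattern above: per (bigdatasessionid, pagetitle) pair the earliest start and the latest end bound the session, and dividing the timedelta by np.timedelta64(1, 's') converts it to seconds as a float.

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "bigdatasessionid": ["s1", "s1"],
    "pagetitle": ["home", "home"],
    "sessionstarttimestamp": pd.to_datetime(
        ["2019-09-01 10:00:00", "2019-09-01 10:00:30"]),
    "sessionendtimestamp": pd.to_datetime(
        ["2019-09-01 10:01:00", "2019-09-01 10:02:00"]),
})
agg = (df.groupby(["bigdatasessionid", "pagetitle"])
         .agg({"sessionstarttimestamp": "min", "sessionendtimestamp": "max"})
         .reset_index())
agg["session_duration"] = (agg.sessionendtimestamp -
                           agg.sessionstarttimestamp) / np.timedelta64(1, "s")
print(agg.session_duration.iloc[0])  # 120.0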
Example #4
def getReferralInformation(filename):

	s = time.time()

	cols = ["bigdatasessionid", "pagetitle", "currentwebpage", "previouswebpage"]
	
	data = readCSV(filename, cols=cols)

	print(len(data))

	data.dropna(subset=["currentwebpage"], inplace=True)

	print(len(data))

	data["referraltype"] = data[["currentwebpage", "previouswebpage"]].apply(lambda x: get_referraltype(x), axis = 1)

	data = add_id(data)

	print(len(data))
	data.dropna(subset=["ID"], inplace=True)

	print(len(data))
	
	toCSV(data, "referral_information.csv", filename)

	e = time.time()
	print("Runtime getReferralInformation: ", time.strftime("%H:%M:%S", time.gmtime(e-s)), "\n")
Example #5
def getReadingDuration(filename):

    s = time.time()

    cols = [
        "bigdatasessionid", "pagetitle", "articlecontentamount",
        "readingduration", "viewpageduration", "pagedepth"
    ]

    data = readCSV(filename, cols)

    data = data.groupby(["bigdatasessionid",
                         "pagetitle"])["articlecontentamount",
                                       "readingduration", "viewpageduration",
                                       "pagedepth"].agg(['sum'])

    data.columns = data.columns.to_flat_index()

    data.columns = [col[0] for col in data.columns]

    data = add_id(data)

    #outfile = "../results/"+filename[-6:-4]+"/reading_duration.csv"
    #toCSV(data, outfile, filename)

    toCSV(data, "reading_information.csv", filename)

    e = time.time()
    print("Runtime getReadingDuration: ",
          time.strftime("%H:%M:%S", time.gmtime(e - s)), "\n")
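add_id shows up in most of these examples without a definition. A hypothetical sketch, assuming the ID is derived from the session id and page title (the project's real key may differ); the reset_index covers inputs that still carry the groupby index, as here:

import pandas as pd

def add_id(data: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical: expose groupby keys as columns, then derive a row ID.
    if "bigdatasessionid" not in data.columns:
        data = data.reset_index()
    data["ID"] = data.bigdatasessionid.map(str) + "_" + data.pagetitle.map(str)
    return data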
Example #6
File: build.py Project: ririgi/valkyrie
def summarize():

    s = time.time()

    count_is_logged_in()

    data = summarizeDailyWeekly("day")
    utils.toCSV(data, "../../results/daily_summary.csv")

    data = summarizeDailyWeekly("week")
    utils.toCSV(data, "../../results/weekly_summary.csv")

    e = time.time()
    print("Runtime summarize: ", time.strftime("%H:%M:%S", time.gmtime(e - s)))
Example #7
def getInformation(filename, mode):
	s = time.time()

	if mode == "content":
		usecols = readCSVAsArray("../data/content_information.csv")
		drop_cols = ["pagetitle", "videourl"]
	else:
		usecols = readCSVAsArray("../data/device_information.csv")
		drop_cols = usecols[:-1]

	data = readCSV(filename, cols=usecols)
	# data = data.drop_duplicates(subset=drop_cols)
	data = add_id(data)

	toCSV(data, mode + "_information.csv", filename)

	e = time.time()
	print("Runtime getInformation_", mode, ": ", time.strftime("%H:%M:%S", time.gmtime(e-s)), "\n")
Example #8
def colWeeklyStats():
    start_time = time.time()

    order_cols = [
        "filter_type", "filter_value", "data_type", "colnames", "variable",
        "value"
    ]

    cols_dict = {}
    cols_dict["filter_type"] = "week"
    cols_dict["data_type"] = "cardinal"

    cardinal_cols = readCSVAsArray("../data/cardinal_columns.csv")

    source_path = "../../dummy/09"
    data_files = os.listdir(source_path)

    dates = organized_files_weekly(data_files)

    for week, group in dates.groupby("week")["filename"]:
        print(week)

        total_df = pd.DataFrame()

        for i, file in enumerate(group.values):
            print(i, file)

            file_path = os.path.join(source_path, file)

            data = readCSV(file_path, cardinal_cols)

            total_df = pd.concat([total_df, data])

        res = cardinal_week_column_description(total_df, cardinal_cols)

        cols_dict["filter_value"] = week

        res = add_additional_cols(res, cols_dict)

        toCSV(res[order_cols],
              "../results/column_MeanMedianMode_weekly_description.csv")

    print("RUNTIME: ", time.time() - start_time)
Example #9
File: build.py Project: ririgi/valkyrie
def nullDesc():

    s = time.time()

    nullDailyDesc()
    nullWeeklyDesc()

    dfs = []
    daily = utils.readCSV("../../results/daily_null_description.csv")
    weekly = utils.readCSV("../../results/weekly_null_description.csv")

    dfs.append(daily.drop("week", axis=1))
    dfs.append(weekly)

    data = utils.concatDF(dfs)

    utils.toCSV(data, "../../results/null_description.csv")

    e = time.time()
    print("Runtime nullDesc: ", time.strftime("%H:%M:%S", time.gmtime(e - s)))
Example #10
File: build.py Project: ririgi/valkyrie
def nullWeeklyDesc():

    s = time.time()

    data = utils.readCSV("../../results/daily_null_description.csv")
    data = data.groupby(["week", "colnames", "variable"]).sum().reset_index()

    data.drop("filter_value", axis=1, inplace=True)

    data["filter_type"] = "week"
    data.rename(columns={"week": "filter_value"}, inplace=True)

    data = data[[
        "filter_type", "filter_value", "colnames", "variable", "value"
    ]]

    utils.toCSV(data, "../../results/weekly_null_description.csv")

    e = time.time()
    print("Runtime nullWeeklyDesc: ",
          time.strftime("%H:%M:%S", time.gmtime(e - s)))
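A toy run of the rollup above: daily null counts are summed per (week, colnames, variable), and the week number then takes over as filter_value.

import pandas as pd

daily = pd.DataFrame({
    "filter_value": ["monday", "tuesday"],
    "colnames": ["pagetitle", "pagetitle"],
    "variable": ["num_null", "num_null"],
    "value": [3, 5],
    "week": [36, 36],
})
weekly = (daily
          .groupby(["week", "colnames", "variable"], as_index=False)["value"]
          .sum())
weekly["filter_type"] = "week"
weekly = weekly.rename(columns={"week": "filter_value"})
print(weekly[["filter_type", "filter_value", "colnames", "variable", "value"]])
#   filter_type  filter_value   colnames  variable  value
# 0        week            36  pagetitle  num_null      8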
Example #11
def getUserInformation(filename):
	s = time.time()

	usecols = ["bigdatasessionid", "pagetitle", "gigyaid", "fingerprintid", "bigdatacookieid"]
	data = readCSV(filename, usecols)
	# data = data.drop_duplicates(subset=usecols[:-2])

	data.loc[:, "userid"] = data.gigyaid
	data.loc[data.userid.isnull(), "userid"] = data.fingerprintid.map(str) + "_" + data.bigdatacookieid.map(str)

	data.loc[:, "usertype"] = "NOTLOGGEDIN"
	data.loc[~data.gigyaid.isnull(), "usertype"] = "LOGGEDIN"

	data.drop_duplicates(inplace=True)

	data = add_id(data)
	
	toCSV(data, "user_information.csv", filename)

	e = time.time()
	print("Runtime getUserInformation: ", time.strftime("%H:%M:%S", time.gmtime(e-s)), "\n")
	
Example #12
def colDailyDesc(data_type):
    start_time = time.time()

    order_cols = [
        "published_date", "filter_type", "filter_value", "data_type",
        "colnames", "variable", "value"
    ]

    cols_dict = {}
    cols_dict["filter_type"] = "day"
    cols_dict["data_type"] = data_type

    cols = readCSVAsArray("../data/" + data_type + "_columns.csv")

    source_path = "../../dummy/09"
    data_files = os.listdir(source_path)  # Change to use real data

    for i, file in enumerate(data_files):
        print("Calculating ", i, file)

        file_path = os.path.join(source_path, file)

        data = readCSV(file_path, cols)

        if data_type == "categorical":
            res = categorical_day_column_description(data, cols)

        else:
            res = cardinal_day_column_description(data, cols)

        cols_dict["published_date"] = convert_string_date(file)
        cols_dict["filter_value"] = get_day(file)

        res = add_additional_cols(res, cols_dict)

        toCSV(res[order_cols],
              "../results/column_" + data_type + "_daily_description.csv")

    print("RUNTIME: ", time.time() - start_time)
Example #13
def colWeeklyDesc(data_type):
    start_time = time.time()

    order_cols = [
        "filter_type", "filter_value", "data_type", "colnames", "variable",
        "value"
    ]

    cols_dict = {}
    cols_dict["filter_type"] = "week"
    cols_dict["data_type"] = data_type

    file = "../results/column_" + data_type + "_daily_description.csv"
    data = pd.read_csv(file, sep=",", parse_dates=["published_date"])

    data.loc[:, "filter_value"] = data.published_date.apply(
        lambda x: (x + dt.timedelta(days=1)).week)

    if data_type == "categorical":

        res = (data
               .groupby(["filter_value", "colnames", "variable"])["value"]
               .sum()
               .reset_index())

    elif data_type == "cardinal":

        res = get_weekly_cardinal(data, "max")
        res = pd.concat([res, get_weekly_cardinal(data, "min")])

    res = add_additional_cols(res, cols_dict)

    toCSV(res[order_cols],
          "../results/column_" + data_type + "_weekly_description.csv")