def nullDailyDesc():
    s = time.time()
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            data = pd.read_csv(os.path.join(data_dir, f), nrows=100)
            data = utils.addWeekDay(data, f)
            # Per-column null/non-null counts, reshaped to long format.
            df = stats.getNullDescription(data)
            df = pd.melt(df, id_vars=["colnames"],
                         value_vars=["num_null", "not_null"])
            df["filter_type"] = "day"
            df["filter_value"] = data.iloc[0]["day"]
            df["week"] = data.iloc[0]["week"]
            df = df[["filter_type", "filter_value", "colnames",
                     "variable", "value", "week"]]
            print(df.head())
            utils.toCSV(df, "../../results/daily_null_description.csv")
    e = time.time()
    print("Runtime nullDailyDesc: ",
          time.strftime("%H:%M:%S", time.gmtime(e - s)))
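# For reference, a minimal sketch of the shape stats.getNullDescription is
# assumed to return (the real helper lives in the project's stats module):
# one row per column with its null and non-null counts, which is what the
# pd.melt call above reshapes into long format.
def _null_description_sketch(data):
    return pd.DataFrame({
        "colnames": data.columns,
        "num_null": data.isnull().sum().values,
        "not_null": data.notnull().sum().values,
    })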
def count_is_logged_in():
    s = time.time()
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            data = pd.read_csv(os.path.join(data_dir, f), nrows=200000)
            data = utils.addWeekDay(data, f)
            data = utils.addIsLoggedIn(data)
            # One row per day: total logged-in events plus the week number.
            new_df = pd.DataFrame(index=[data.iloc[0]["day"]],
                                  data=data["isloggedin"].sum(),
                                  columns=["isloggedin"])
            new_df.index.name = "day"
            new_df["week"] = data.iloc[0]["week"]
            utils.toCSV(new_df.reset_index(),
                        "../../results/daily_logged_in.csv")
    # Roll the daily counts up to weekly totals.
    data = utils.readCSV("../../results/daily_logged_in.csv")
    data = data.groupby("week")["isloggedin"].sum().to_frame()
    utils.toCSV(data.reset_index(), "../../results/weekly_logged_in.csv")
    e = time.time()
    print("Runtime count_is_logged_in: ",
          time.strftime("%H:%M:%S", time.gmtime(e - s)))
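# utils.addIsLoggedIn comes from the project's utils module. Based on how
# getUserInformation below flags LOGGEDIN users via a non-null gigyaid, a
# plausible sketch (an assumption, not the actual utils implementation):
def _add_is_logged_in_sketch(data):
    data = data.copy()
    data["isloggedin"] = data["gigyaid"].notnull().astype(int)
    return data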
def getSessionDuration(filename):
    s = time.time()
    usecols = ["bigdatasessionid", "pagetitle",
               "sessionstarttimestamp", "sessionendtimestamp"]
    data = readCSV(filename, usecols)
    # Drop rows whose end timestamp was never filled in.
    data = data.drop(data.loc[data.sessionendtimestamp ==
                              "NaN/NaN/NaN NaN:NaN:NaN.NaN"].index)
    data.sessionendtimestamp = data.sessionendtimestamp.astype(
        "datetime64[ns]")
    # One row per (session, page): earliest start and latest end.
    data = data \
        .groupby(["bigdatasessionid", "pagetitle"]) \
        .agg({"sessionstarttimestamp": "min",
              "sessionendtimestamp": "max"}) \
        .reset_index()
    data.loc[:, "session_duration"] = (
        data.sessionendtimestamp - data.sessionstarttimestamp
    ) / np.timedelta64(1, "s")
    data = getDateInfo(data)
    data = add_id(data)
    toCSV(data, "session_information.csv", filename)
    e = time.time()
    print("Runtime getSessionDuration: ",
          time.strftime("%H:%M:%S", time.gmtime(e - s)), "\n")
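# add_id is shared by several extraction functions here but defined elsewhere
# in the repo. Judging from the commented-out dropna on "ID" in
# getReferralInformation below, it attaches an "ID" column; a hypothetical
# sketch of that behaviour (the real key derivation may differ):
def _add_id_sketch(data):
    data = data.copy()
    data["ID"] = (data["bigdatasessionid"].map(str) + "_" +
                  data["pagetitle"].map(str))
    return data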
def getReferralInformation(filename):
    s = time.time()
    cols = ["bigdatasessionid", "pagetitle",
            "currentwebpage", "previouswebpage"]
    data = readCSV(filename, cols=cols)
    # Rows without a current page carry no referral information.
    data.dropna(subset=["currentwebpage"], inplace=True)
    data["referraltype"] = data[["currentwebpage", "previouswebpage"]] \
        .apply(get_referraltype, axis=1)
    data = add_id(data)
    # data.dropna(subset=["ID"], inplace=True)
    toCSV(data, "referral_information.csv", filename)
    e = time.time()
    print("Runtime getReferralInformation: ",
          time.strftime("%H:%M:%S", time.gmtime(e - s)), "\n")
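# get_referraltype is defined elsewhere; it receives a row holding
# (currentwebpage, previouswebpage) and returns a referral label. A hedged
# sketch of the kind of classification implied by the call above (the
# category names here are assumptions, not the repo's actual labels):
from urllib.parse import urlparse

def _get_referraltype_sketch(row):
    current, previous = row["currentwebpage"], row["previouswebpage"]
    if pd.isnull(previous):
        return "DIRECT"    # no referring page recorded
    if urlparse(previous).netloc == urlparse(current).netloc:
        return "INTERNAL"  # navigation within the same site
    return "EXTERNAL"      # arrived from another domain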
def getReadingDuration(filename):
    s = time.time()
    metrics = ["articlecontentamount", "readingduration",
               "viewpageduration", "pagedepth"]
    cols = ["bigdatasessionid", "pagetitle"] + metrics
    data = readCSV(filename, cols)
    # Total engagement metrics per (session, page).
    data = data.groupby(["bigdatasessionid", "pagetitle"])[metrics].agg("sum")
    data = add_id(data)
    toCSV(data, "reading_information.csv", filename)
    e = time.time()
    print("Runtime getReadingDuration: ",
          time.strftime("%H:%M:%S", time.gmtime(e - s)), "\n")
def summarize():
    s = time.time()
    count_is_logged_in()
    data = summarizeDailyWeekly("day")
    utils.toCSV(data, "../../results/daily_summary.csv")
    data = summarizeDailyWeekly("week")
    utils.toCSV(data, "../../results/weekly_summary.csv")
    e = time.time()
    print("Runtime summarize: ",
          time.strftime("%H:%M:%S", time.gmtime(e - s)))
def getInformation(filename, mode):
    s = time.time()
    if mode == "content":
        usecols = readCSVAsArray("../data/content_information.csv")
        drop_cols = ["pagetitle", "videourl"]
    else:
        usecols = readCSVAsArray("../data/device_information.csv")
        drop_cols = usecols[:-1]
    data = readCSV(filename, cols=usecols)
    # data = data.drop_duplicates(subset=drop_cols)
    data = add_id(data)
    toCSV(data, mode + "_information.csv", filename)
    e = time.time()
    print("Runtime getInformation_", mode, ": ",
          time.strftime("%H:%M:%S", time.gmtime(e - s)), "\n")
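# Example invocation for a single raw dump (paths are illustrative only):
#   getInformation("../../dummy/09/2019-09-01.csv", "content")
#   getInformation("../../dummy/09/2019-09-01.csv", "device")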
def colWeeklyStats():
    start_time = time.time()
    order_cols = ["filter_type", "filter_value", "data_type",
                  "colnames", "variable", "value"]
    cols_dict = {"filter_type": "week", "data_type": "cardinal"}
    cardinal_cols = readCSVAsArray("../data/cardinal_columns.csv")
    source_path = "../../dummy/09"
    data_files = os.listdir(source_path)
    dates = organized_files_weekly(data_files)
    for week, group in dates.groupby("week")["filename"]:
        print(week)
        # Stack all files belonging to this week into one frame.
        frames = []
        for i, file in enumerate(group.values):
            print(i, file)
            file_path = os.path.join(source_path, file)
            frames.append(readCSV(file_path, cardinal_cols))
        total_df = pd.concat(frames)
        res = cardinal_week_column_description(total_df, cardinal_cols)
        cols_dict["filter_value"] = week
        res = add_additional_cols(res, cols_dict)
        toCSV(res[order_cols],
              "../results/column_MeanMedianMode_weekly_description.csv")
    print("RUNTIME: ", time.time() - start_time)
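# organized_files_weekly lives elsewhere in the repo; colWeeklyStats expects
# it to return a DataFrame with "filename" and "week" columns. A minimal
# sketch, assuming filenames embed an ISO date such as "2019-09-01.csv"
# (the real naming scheme may differ):
def _organized_files_weekly_sketch(data_files):
    dates = pd.DataFrame({"filename": data_files})
    parsed = pd.to_datetime(dates.filename.str[:10], format="%Y-%m-%d")
    dates["week"] = parsed.dt.isocalendar().week
    return dates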
def nullDesc():
    s = time.time()
    nullDailyDesc()
    nullWeeklyDesc()
    dfs = []
    daily = utils.readCSV("../../results/daily_null_description.csv")
    weekly = utils.readCSV("../../results/weekly_null_description.csv")
    dfs.append(daily.drop("week", axis=1))
    dfs.append(weekly)
    data = utils.concatDF(dfs)
    utils.toCSV(data, "../../results/null_description.csv")
    e = time.time()
    print("Runtime nullDesc: ",
          time.strftime("%H:%M:%S", time.gmtime(e - s)))
def nullWeeklyDesc():
    s = time.time()
    data = utils.readCSV("../../results/daily_null_description.csv")
    # Sum the daily null counts up to week level.
    data = data.groupby(["week", "colnames", "variable"]).sum().reset_index()
    data.drop("filter_value", axis=1, inplace=True)
    data["filter_type"] = "week"
    data.rename(columns={"week": "filter_value"}, inplace=True)
    data = data[["filter_type", "filter_value", "colnames",
                 "variable", "value"]]
    utils.toCSV(data, "../../results/weekly_null_description.csv")
    e = time.time()
    print("Runtime nullWeeklyDesc: ",
          time.strftime("%H:%M:%S", time.gmtime(e - s)))
def getUserInformation(filename):
    s = time.time()
    usecols = ["bigdatasessionid", "pagetitle", "gigyaid",
               "fingerprintid", "bigdatacookieid"]
    data = readCSV(filename, usecols)
    # data = data.drop_duplicates(subset=usecols[:-2])
    # Logged-in users are identified by gigyaid; anonymous users fall back
    # to a fingerprint + cookie composite key.
    data.loc[:, "userid"] = data.gigyaid
    data.loc[data.userid.isnull(), "userid"] = (
        data.fingerprintid.map(str) + "_" + data.bigdatacookieid.map(str))
    data.loc[:, "usertype"] = "NOTLOGGEDIN"
    data.loc[~data.gigyaid.isnull(), "usertype"] = "LOGGEDIN"
    data.drop_duplicates(inplace=True)
    data = add_id(data)
    toCSV(data, "user_information.csv", filename)
    e = time.time()
    print("Runtime getUserInformation: ",
          time.strftime("%H:%M:%S", time.gmtime(e - s)), "\n")
def colDailyDesc(data_type):
    start_time = time.time()
    order_cols = ["published_date", "filter_type", "filter_value",
                  "data_type", "colnames", "variable", "value"]
    cols_dict = {"filter_type": "day", "data_type": data_type}
    cols = readCSVAsArray("../data/" + data_type + "_columns.csv")
    source_path = "../../dummy/09"
    data_files = os.listdir(source_path)  # Change to use real data
    for i, file in enumerate(data_files):
        print("Calculating ", i, file)
        file_path = os.path.join(source_path, file)
        data = readCSV(file_path, cols)
        if data_type == "categorical":
            res = categorical_day_column_description(data, cols)
        else:
            res = cardinal_day_column_description(data, cols)
        cols_dict["published_date"] = convert_string_date(file)
        cols_dict["filter_value"] = get_day(file)
        res = add_additional_cols(res, cols_dict)
        toCSV(res[order_cols],
              "../results/column_" + data_type + "_daily_description.csv")
    print("RUNTIME: ", time.time() - start_time)
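# convert_string_date and get_day are helpers defined elsewhere that pull the
# publication date and the weekday out of a data filename. Hedged sketches,
# again assuming a "YYYY-MM-DD.csv" naming scheme:
def _convert_string_date_sketch(filename):
    return pd.to_datetime(filename[:10], format="%Y-%m-%d")

def _get_day_sketch(filename):
    return _convert_string_date_sketch(filename).day_name()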
def colWeeklyDesc(data_type):
    start_time = time.time()
    order_cols = ["filter_type", "filter_value", "data_type",
                  "colnames", "variable", "value"]
    cols_dict = {"filter_type": "week", "data_type": data_type}
    file = "../results/column_" + data_type + "_daily_description.csv"
    data = pd.read_csv(file, sep=",", parse_dates=["published_date"])
    # Shift by one day before taking the week number, presumably so that
    # weeks start on Sunday rather than Monday.
    data.loc[:, "filter_value"] = data.published_date.apply(
        lambda x: (x + dt.timedelta(days=1)).week)
    if data_type == "categorical":
        # Weekly categorical counts are the sums of the daily counts.
        res = data \
            .groupby(["filter_value", "colnames", "variable"])["value"] \
            .sum() \
            .reset_index()
    elif data_type == "cardinal":
        res = pd.concat([get_weekly_cardinal(data, "max"),
                         get_weekly_cardinal(data, "min")])
    res = add_additional_cols(res, cols_dict)
    toCSV(res[order_cols],
          "../results/column_" + data_type + "_weekly_description.csv")
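# get_weekly_cardinal is defined elsewhere; given the daily cardinal
# description and a statistic name, it reduces the daily values to one weekly
# row per column. A hedged sketch consistent with how its result is used
# above (the repo's actual aggregation rules may differ):
def _get_weekly_cardinal_sketch(data, stat):
    # Keep the daily rows for this statistic (e.g. "max"), then take the
    # corresponding extreme across the days of each week.
    subset = data.loc[data.variable == stat]
    return (subset
            .groupby(["filter_value", "colnames", "variable"])["value"]
            .agg(stat)
            .reset_index())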