def get_vod_id(streamer):
    # Fetch VODs created after 2019-07-01 that have no chatlog yet, then
    # download each missing chatlog with tcd and record its path in the DB.
    query = (
        "SELECT v.id, v.duration FROM streamer s "
        "JOIN vod v ON s.id = v.streamer_id "
        f"WHERE s.display_name = '{streamer}' "
        "AND v.created_at > '2019-07-01T00:00:00Z' "
        "AND v.chatlog IS NULL;"
    )
    postgres = tp.Postgres()
    records = np.array(postgres.rawselect(query))
    postgres.close()
    try:
        vodtimes = np.array(
            [[int(record[0]), int(parse_duration(record[1]))] for record in records]
        )
    except (TypeError, ValueError, IndexError):  # missing or malformed durations
        return 0
    # vodid = vodtimes[np.argmax(vodtimes[:, 1])][0]  # alternative: longest VOD only
    for vodtime in vodtimes:
        vodid = vodtime[0]
        query = f"SELECT chatlog FROM vod WHERE id = {vodid};"
        postgres = tp.Postgres()
        records = np.array(postgres.rawselect(query))
        postgres.close()
        if records[0, 0] is None:
            output = "/Users/cchang5/PycharmProjects/talent_program/chatlog"
            command = f"tcd --video {vodid} --format irc --output {output}"
            subprocess.run(command, shell=True)
            query = f"UPDATE vod SET chatlog = '{output}/{vodid}.log' WHERE id = {vodid};"
            postgres = tp.Postgres()
            postgres.rawsql(query)
            postgres.close()
        else:
            print("chatlog already downloaded")
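
# The lookups above interpolate values straight into SQL strings. A minimal
# sketch of the same SELECT with parameter binding, assuming a plain DB-API
# connection (e.g. psycopg2); the tp.Postgres wrapper's internals are not
# shown in this module, so this is illustrative only and not called by the
# pipeline.
def _example_select_unlogged_vods(conn, streamer):
    query = (
        "SELECT v.id, v.duration FROM streamer s "
        "JOIN vod v ON s.id = v.streamer_id "
        "WHERE s.display_name = %s "
        "AND v.created_at > %s AND v.chatlog IS NULL;"
    )
    with conn.cursor() as cur:
        # the driver quotes and escapes the parameters itself
        cur.execute(query, (streamer, "2019-07-01T00:00:00Z"))
        return cur.fetchall()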
def timeofday(features_dict, bins):
    # Time-of-day feature: per-day session activity, binned into `bins` parts
    # of the day. Loads a cached pickle when available, else regenerates.
    try:
        with open(f"./features/timeofday_{bins}bins.pickle", "rb") as file:
            preload = pickle.load(file)
        print(f"Preloading timeofday {bins} bins feature")
        for key in preload:
            features_dict[key].extend(preload[key])
    except (OSError, EOFError, pickle.UnpicklingError):
        print(f"Generating timeofday {bins} bins feature")
        columns = [f"day_part{idx}" for idx in range(bins)]
        features_dict["columns"].extend(columns)
        # set up dict to pickle
        for_pickle = make_empty_dict()
        for_pickle["columns"].extend(columns)
        streamers = chatlog.get_display_name()
        for streamer in streamers:
            try:
                result, days = sch.daily_schedule(streamer, False)
                result_list = np.array(result.values).flatten()
            except Exception:
                print("Unexpected error:", sys.exc_info())
                freshday = sch.make_freshday()
                result_list = np.zeros(len(freshday))
                days = 0
            binsize = int(np.ceil(len(result_list) / bins))
            # Pad with NaN up to a multiple of binsize; (-size) % binsize is
            # zero when the length already divides evenly, so no spurious
            # all-NaN bin is appended. Then take the max within each bin.
            result_binned = np.nanmax(
                np.pad(
                    result_list.astype(float),
                    (0, (-result_list.size) % binsize),
                    mode="constant",
                    constant_values=np.nan,
                ).reshape(-1, binsize),
                axis=1,
            )
            # number of sessions per day during this part of the day
            # (days == 0 on the fallback path, which yields NaNs here)
            result_norm = result_binned / days
            features_dict[streamer].extend(result_norm)
            for_pickle[streamer].extend(result_norm)
        with open(f"./features/timeofday_{bins}bins.pickle", "wb") as file:
            pickle.dump(for_pickle, file)
    if False:  # disabled: write the preloaded features to SQL
        for streamer in preload:
            if streamer == "columns":
                continue
            query = f"SELECT id FROM streamer WHERE display_name='{streamer}'"
            postgres = tp.Postgres()
            streamer_id = np.array(postgres.rawselect(query))[0, 0]
            postgres.close()
            hourlyQ = ", ".join([str(i) for i in preload[streamer]])
            coltag = [f"day_part{idx}" for idx in range(bins)]
            coltagQ = ", ".join(coltag)
            query = (
                f"INSERT INTO hourly_proba (streamer_id, {coltagQ}) "
                f"VALUES ({streamer_id}, {hourlyQ});"
            )
            postgres = tp.Postgres()
            postgres.rawsql(query)
            postgres.close()
    return features_dict
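
# Worked example (synthetic input) of the pad/reshape/nanmax binning used in
# timeofday(): a length-5 array split into 2 bins pads to length 6 with NaN,
# reshapes to (2, 3), and takes the per-row nanmax. Not called by the pipeline.
def _example_nanmax_binning():
    result_list = np.array([1.0, 3.0, 2.0, 0.0, 5.0])
    bins = 2
    binsize = int(np.ceil(len(result_list) / bins))  # 3
    pad = (-result_list.size) % binsize  # 1; zero when the length divides evenly
    padded = np.pad(result_list, (0, pad), mode="constant", constant_values=np.nan)
    binned = np.nanmax(padded.reshape(-1, binsize), axis=1)
    assert np.allclose(binned, [3.0, 5.0])
    return binned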
def chat_frequency():
    # Parse each downloaded chatlog into an array of per-line timestamps
    # (seconds from stream start) and insert it into the chatlog table.
    streamers = chatlog.get_display_name()
    for streamer in streamers:
        # get streamer_id
        query = f"SELECT id FROM streamer WHERE display_name='{streamer}';"
        postgres = tp.Postgres()
        streamer_id = np.array(postgres.rawselect(query))[0, 0]
        postgres.close()
        # get chatlogs that have not been parsed yet
        # query = f"SELECT id, chatlog FROM vod WHERE streamer_id='{streamer_id}' AND created_at > '2019-07-01T00:00:00Z'"
        query = (
            "SELECT v.id, v.chatlog FROM vod v "
            "LEFT JOIN chatlog c ON v.id = c.vod_id "
            f"WHERE v.streamer_id='{streamer_id}' "
            "AND v.created_at > '2019-07-01T00:00:00Z' "
            "AND timearray IS NULL;"
        )
        postgres = tp.Postgres()
        logdirs = np.array(postgres.rawselect(query))
        postgres.close()
        # strip out the time stamps
        for logdir in logdirs:
            vod_id = logdir[0]
            # skip missing or bad paths
            if not check_logdir(logdir[1]):
                continue
            timearray = []
            with open(logdir[1], "r") as chat:
                for line in chat:
                    # lines look like "[H:MM:SS] user: message" and roll over
                    # to "[1 day, H:MM:SS] ..." past 24 hours
                    timestamp = line[1:].split("]")[0].split(":")
                    minute = int(timestamp[1])
                    second = int(timestamp[2])
                    try:
                        hour = int(timestamp[0])
                        tdelta = int(
                            datetime.timedelta(
                                hours=hour, minutes=minute, seconds=second
                            ).total_seconds()
                        )
                    except ValueError:  # the "N day, H" form
                        dayhour = timestamp[0].split(" day, ")
                        day = int(dayhour[0])
                        hour = int(dayhour[1])
                        tdelta = int(
                            datetime.timedelta(
                                days=day, hours=hour, minutes=minute, seconds=second
                            ).total_seconds()
                        )
                    timearray.append(tdelta)
            if len(timearray) == 0:
                timearray = [0]
            query = (
                "INSERT INTO chatlog (streamer_id, vod_id, timearray) "
                f"VALUES ('{streamer_id}', '{vod_id}', ARRAY{timearray});"
            )
            postgres = tp.Postgres()
            postgres.rawsql(query)
            postgres.close()
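
# Hedged sketch of the timestamp parsing in chat_frequency(), assuming the log
# format implied by the two branches above: "[H:MM:SS] ..." normally and
# "[1 day, H:MM:SS] ..." once a stream passes 24 hours. Not called by the
# pipeline.
def _example_parse_log_timestamp(line):
    stamp = line[1:].split("]")[0].split(":")
    minute, second = int(stamp[1]), int(stamp[2])
    try:
        day, hour = 0, int(stamp[0])
    except ValueError:  # e.g. stamp[0] == "1 day, 0"
        day_part, hour_part = stamp[0].split(" day, ")
        day, hour = int(day_part), int(hour_part)
    return int(
        datetime.timedelta(
            days=day, hours=hour, minutes=minute, seconds=second
        ).total_seconds()
    )

# _example_parse_log_timestamp("[1 day, 0:12:34] viewer: hello") == 87154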
def summonerlevel(features_dict):
    query = "SELECT display_name, summonerlevel FROM streamer;"
    postgres = tp.Postgres()
    records = postgres.rawselect(query)
    postgres.close()
    records_dict = {record[0]: [record[1]] for record in records}
    records_dict["columns"] = ["summonerlevel"]
    for display_name in features_dict:
        features_dict[display_name].extend(records_dict[display_name])
    return features_dict
def label_data(features_dict, success_list=["partner"]):
    # Binary label: 1 if the streamer's broadcaster_type is in success_list.
    query = "SELECT display_name, broadcaster_type FROM streamer;"
    postgres = tp.Postgres()
    records = postgres.rawselect(query)
    postgres.close()
    records_dict = {record[0]: record[1] for record in records}
    for display_name in features_dict:
        if display_name == "columns":
            features_dict[display_name].extend(["label"])
            continue
        if records_dict[display_name] in success_list:
            features_dict[display_name].extend([1])
        else:
            features_dict[display_name].extend([0])
    return features_dict
def tier_rank(features_dict):
    query = "SELECT display_name, tier, rank FROM streamer;"
    postgres = tp.Postgres()
    records = postgres.rawselect(query)
    postgres.close()
    # load percentile of tier/rank from the lookup table
    with open("./lookup/ranktier.json", "r") as file:
        percentile = json.load(file)
    records_dict = {
        record[0]: [percentile[record[1]][record[2]]] for record in records
    }
    records_dict["columns"] = ["ranktier"]
    # push into features
    for display_name in features_dict:
        features_dict[display_name].extend(records_dict[display_name])
    return features_dict
def get_display_name():
    query = "SELECT DISTINCT display_name FROM streamer;"
    postgres = tp.Postgres()
    records = np.array(postgres.rawselect(query)).flatten()
    postgres.close()
    return records
def tarray_feature(features_dict, binsize=60):
    # Build an "excitement" feature from the chatlog time arrays: the hourly
    # rate of bins whose chat volume exceeds a cut above the median.
    try:
        raise FileNotFoundError  # preload disabled; remove to re-enable caching
        with open("./features/excitement.pickle", "rb") as file:
            excitement_dict = pickle.load(file)
        print("Preloading excitement feature")
    except (OSError, EOFError, pickle.UnpicklingError):
        excitement_dict = dict()
        excitement_dict["columns"] = ["excitement"]
        streamers = chatlog.get_display_name()
        # for streamer in streamers:
        for streamer in ["Metaphor"]:  # single streamer while debugging
            # get streamer_id
            query = f"SELECT id FROM streamer WHERE display_name='{streamer}';"
            postgres = tp.Postgres()
            streamer_id = np.array(postgres.rawselect(query))[0, 0]
            postgres.close()
            query = f"SELECT timearray FROM chatlog WHERE streamer_id={streamer_id};"
            postgres = tp.Postgres()
            records = postgres.rawselect(query)
            postgres.close()
            # concatenate all VODs into one running timeline, then count chat
            # lines per binsize-second bin
            concat_chat = [0]
            for record in records:
                concat_chat.extend(np.array(record[0]) + concat_chat[-1])
            binned_chat = np.array(concat_chat) // binsize
            binned_counter = np.bincount(binned_chat)
            bins = len(binned_counter)
            sort_counter = np.sort(binned_counter)
            median = np.median(sort_counter)
            # count bins whose chat volume clears the cut above the median
            excite_count = 0
            for event in sort_counter:
                if event > transformed_cut(median):
                    excite_count += 1
            excitements_per_hour = excite_count / (bins * binsize) * 3600.0
            excitement_dict[streamer] = [excitements_per_hour]
            # FIGURE
            fig = plt.figure(figsize=(7, 4))
            ax = plt.axes([0.15, 0.15, 0.8, 0.8])
            n, bin_edges, patches = ax.hist(binned_chat, bins=bins, color="#6441A4")
            ax.axhline(transformed_cut(median), color="red")
            ax.set_xlim([0, max(bin_edges)])
            ax.set_ylim([0, 100])
            ax.set_xlabel("Minutes in stream", fontsize=16)
            ax.set_ylabel("Lines of chat / minute", fontsize=16)
            plt.tick_params(axis="both", labelsize=16)
            plt.savefig("./features/chathistory.png", dpi=300, transparent=False)
            plt.show()
            # END FIGURE
        # file = open("./features/excitement.pickle", "wb")
        # pickle.dump(excitement_dict, file)
        # file.close()
    for key in features_dict:
        features_dict[key].extend(excitement_dict[key])
    if False:  # disabled: push excitement values into the database
        for streamer in excitement_dict:
            if streamer == "columns":
                continue
            query = f"SELECT id FROM streamer WHERE display_name='{streamer}';"
            postgres = tp.Postgres()
            streamer_id = np.array(postgres.rawselect(query))[0, 0]
            postgres.close()
            excite = excitement_dict[streamer][0]
            query = (
                "INSERT INTO excitement (streamer_id, excite) "
                f"VALUES ({streamer_id}, {excite});"
            )
            postgres = tp.Postgres()
            postgres.rawsql(query)
            postgres.close()
    return features_dict
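
# Minimal sketch of the excitement metric computed in tarray_feature(), on a
# synthetic timearray. transformed_cut() lives elsewhere in the project, so a
# hypothetical fixed cut stands in for it here; not called by the pipeline.
def _example_excitement_rate(timearray, binsize=60, cut=5.0):
    # chat lines per binsize-second bin
    binned_counter = np.bincount(np.array(timearray) // binsize)
    # bins whose volume clears the cut, converted to an hourly rate
    excite_count = int(np.sum(binned_counter > cut))
    total_seconds = len(binned_counter) * binsize
    return excite_count / total_seconds * 3600.0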
def sausage(X, y):
    # Elastic-net logistic regression (l1_ratio=1 reduces to pure L1/lasso).
    model = LogisticRegression(
        penalty="elasticnet", solver="saga", l1_ratio=1, max_iter=10000
    )
    # single train/test split
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
    model.fit(train_X, train_y)
    prediction = np.array(model.predict_proba(val_X))
    result = pd.DataFrame(
        prediction[:, 1], index=val_X.index, columns=["predict_proba"]
    )
    result["label"] = val_y
    sorted_recall = np.sort(result[result["label"] == 1]["predict_proba"])
    print("sorted recall (10%):", sorted_recall[int(0.1 * len(sorted_recall))])
    pred_y = np.array(model.predict(val_X))
    print(confusion_matrix(val_y, pred_y))
    print("score:", model.score(train_X, train_y))
    print("coefficients")
    print(model.coef_)
    if True:  # ROC curve
        THRESHOLD_LIST = [0.1]
        probs = model.predict_proba(val_X)[:, 1]
        fpr, tpr, thresholds = roc_curve(val_y, probs)
        fig = plt.figure(figsize=(7, 7))
        ax = plt.axes([0.15, 0.15, 0.8, 0.8])
        # mark the operating point at each threshold
        for THRESHOLD in THRESHOLD_LIST:
            pred_y = np.where(model.predict_proba(val_X)[:, 1] > THRESHOLD, 1, 0)
            cmatrix = confusion_matrix(val_y, pred_y)
            FPR = cmatrix[0, 1] / (cmatrix[0, 1] + cmatrix[0, 0])
            TPR = cmatrix[1, 1] / (cmatrix[1, 1] + cmatrix[1, 0])
            ax.errorbar(x=[FPR], xerr=[0.05], y=[TPR], yerr=[0.05], color="red")
        # no-skill diagonal
        ax.errorbar(x=[0, 1], y=[0, 1], linestyle="--", color="k")
        # ROC curve for the model
        ax.errorbar(x=fpr, y=tpr, marker=".")
        ax.set_xlabel("False Positive Rate", fontsize=16)
        ax.set_ylabel("True Positive Rate", fontsize=16)
        ax.set_xlim([0, 1])
        ax.set_ylim([0, 1])
        plt.tick_params(axis="both", labelsize=16)
        plt.savefig("./validation/ROC.png", dpi=300, transparent=False)
        # plt.show()
    # cross validation over a grid of decision thresholds
    threshlist = np.linspace(0, 1, 51)
    gvscore_list, score, model, accuracy, confusion = stratcrossvalid(
        X, y, THRESHOLD_LIST=threshlist
    )
    for key in [0.2]:  # report the confusion matrix at this threshold
        print(key)
        cmatrix = np.array(confusion[key][-1])
        print(cmatrix)
        print("FPR:", cmatrix[0, 1] / (cmatrix[0, 1] + cmatrix[0, 0]))
        print("TPR:", cmatrix[1, 1] / (cmatrix[1, 1] + cmatrix[1, 0]))
    predict_proba = model.predict_proba(X)
    if False:  # disabled: push predictions into the database
        for idx, display_name in enumerate(X.index):
            query = f"SELECT id FROM streamer WHERE display_name='{display_name}';"
            postgres = tp.Postgres()
            streamer_id = np.array(postgres.rawselect(query))[0, 0]
            postgres.close()
            proba = predict_proba[idx][1]
            query = (
                "INSERT INTO prediction (streamer_id, proba) "
                f"VALUES({streamer_id}, {proba});"
            )
            postgres = tp.Postgres()
            postgres.rawsql(query)
            postgres.close()
    # cross-validation results
    print("Threshold:", threshlist)
    print("Xvalid recall:", gvscore_list)
    print("Xvalid accuracy:", np.mean(accuracy), np.std(accuracy, ddof=1))
    print("coeff:", model.coef_)
    if True:  # recall vs. threshold
        fig = plt.figure(figsize=(7, 4))
        ax = plt.axes([0.15, 0.15, 0.8, 0.8])
        ax.errorbar(
            x=threshlist,
            y=[i.mean for i in gvscore_list],
            yerr=[i.sdev for i in gvscore_list],
        )
        plt.draw()
        plt.show()
    # precision-recall curve
    fig = plt.figure("precision recall", figsize=(7, 7))
    ax = plt.axes([0.15, 0.15, 0.8, 0.8])
    precision, recall, _ = precision_recall_curve(val_y, probs)
    # older matplotlib versions lack the `step` kwarg on fill_between
    step_kwargs = (
        {"step": "post"} if "step" in signature(plt.fill_between).parameters else {}
    )
    ax.step(recall, precision, color="#6441A4", where="post")
    ax.fill_between(recall, precision, color="#6441A4", **step_kwargs)
    ax.set_xlabel("Recall", fontsize=16)
    ax.set_ylabel("Precision", fontsize=16)
    plt.tick_params(axis="both", labelsize=16)
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.savefig("./validation/precision_recall.png", dpi=300, transparent=False)
    plt.show()