Example #1
def get_vod_id(streamer):
    query = f"SELECT v.id, v.duration FROM streamer s JOIN vod v ON s.id = v.streamer_id where s.display_name = '{streamer}' and v.created_at > '2019-07-01T00:00:00Z' and  v.chatlog is NULL;"
    postgres = tp.Postgres()
    records = np.array(postgres.rawselect(query))
    postgres.close()
    try:
        vodtimes = np.array([[int(record[0]),
                              int(parse_duration(record[1]))]
                             for record in records])
    except Exception:
        # no matching VODs or an unparseable duration: signal failure
        return 0
    # vodid = vodtimes[np.argmax(vodtimes[:, 1])][0]
    for vodtime in vodtimes:
        vodid = vodtime[0]
        query = f"SELECT chatlog FROM vod where id = {vodid};"
        postgres = tp.Postgres()
        records = np.array(postgres.rawselect(query))
        postgres.close()
        if records[0, 0] is None:
            output = f"/Users/cchang5/PycharmProjects/talent_program/chatlog"
            command = f"tcd --video {vodid} --format irc --output {output}"
            subprocess.run(command, shell=True)
            query = (
                f"UPDATE vod SET chatlog = '{output}/{vodid}.log' WHERE id = {vodid};"
            )
            postgres = tp.Postgres()
            postgres.rawsql(query)
            postgres.close()
        else:
            print("chatlog already downloaded")
Example #2
def timeofday(features_dict, bins):
    try:
        with open(f"./features/timeofday_{bins}bins.pickle", "rb") as file:
            preload = pickle.load(file)
        print(f"Preloading timeofday {bins} bins feature")
        for key in preload:
            features_dict[key].extend(preload[key])
    except (FileNotFoundError, EOFError):
        # no cached pickle yet: build the feature from scratch
        print(f"Generating timeofday {bins} bins feature")
        columns = [f"day_part{bin}" for bin in range(bins)]
        features_dict["columns"].extend(columns)
        # set up dict to pickle
        for_pickle = make_empty_dict()
        for_pickle["columns"].extend(columns)
        # end
        streamers = chatlog.get_display_name()
        for streamer in streamers:
            try:
                result, days = sch.daily_schedule(streamer, False)
                result_list = np.array(result.values).flatten()
            except Exception:
                print("Unexpected error:", sys.exc_info())
                # streamer has no schedule: fall back to an empty day, with
                # days = 1 so the normalization below stays finite (0/1 = 0)
                freshday = sch.make_freshday()
                result_list = np.zeros(len(freshday))
                days = 1
            binsize = int(np.ceil(len(result_list) / bins))
            # pad the tail with NaN so the length divides evenly into bins,
            # then take the max within each bin; -size % binsize is 0 when
            # no padding is needed
            result_binned = np.nanmax(
                np.pad(
                    result_list.astype(float),
                    (0, -result_list.size % binsize),
                    mode="constant",
                    constant_values=np.nan,
                ).reshape(-1, binsize),
                axis=1,
            )
            # number of sessions per day during time of day
            result_norm = result_binned / days
            features_dict[streamer].extend(result_norm)
            for_pickle[streamer].extend(result_norm)
        file = open(f"./features/timeofday_{bins}bins.pickle", "wb")
        pickle.dump(for_pickle, file)
        file.close()
    # disabled one-time SQL export; it reads `preload` (only defined on the
    # preload path) and hardcodes 24 day_part columns
    if False:  # write to sql
        for streamer in preload:
            if streamer == "columns":
                continue
            query = f"SELECT id FROM streamer WHERE display_name='{streamer}'"
            postgres = tp.Postgres()
            streamer_id = np.array(postgres.rawselect(query))[0, 0]
            postgres.close()
            hourlyQ = ", ".join([str(i) for i in preload[streamer]])
            coltag = ["day_part%s" % idx for idx in range(24)]
            coltagQ = ", ".join(coltag)
            query = f"INSERT INTO hourly_proba (streamer_id, {coltagQ}) VALUES ({streamer_id}, {hourlyQ});"
            postgres = tp.Postgres()
            postgres.rawsql(query)
            postgres.close()

    return features_dict
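
The pad-and-reshape reduction above is the heart of the feature and is easier to test in isolation. A minimal self-contained sketch of the same idea (bin_max is a name introduced here for illustration):

import numpy as np

def bin_max(values, bins):
    # split `values` into `bins` nearly equal chunks and take each chunk's
    # max, padding the tail with NaN so the reshape is always legal
    values = np.asarray(values, dtype=float)
    binsize = int(np.ceil(len(values) / bins))
    padded = np.pad(values, (0, -values.size % binsize),
                    constant_values=np.nan)
    return np.nanmax(padded.reshape(-1, binsize), axis=1)

# bin_max(range(10), 3) -> array([3., 7., 9.])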
Example #3
def tarray_feature(features_dict, binsize=60):
    # make features out of timearray of chatlogs
    excitement_dict = dict()
    excitement_dict["columns"] = ["excitement"]
    streamers = chatlog.get_display_name()
    # for streamer in streamers:
    for streamer in ["Metaphor"]:  # debug: restricted to a single streamer
        # get streamer_id
        query = f"SELECT id FROM streamer WHERE display_name='{streamer}';"
        postgres = tp.Postgres()
        streamer_id = np.array(postgres.rawselect(query))[0, 0]
        postgres.close()
        query = f"SELECT timearray FROM chatlog WHERE streamer_id={streamer_id};"
        postgres = tp.Postgres()
        records = postgres.rawselect(query)
        postgres.close()
        """
        for record in records[:1]:
            binned_chat = np.array(record[0])//binsize
            binned_counter = np.bincount(binned_chat)
            bins = len(binned_counter)
            fig = plt.figure(figsize=(7,4))
            ax = plt.axes([0.15, 0.15, 0.8, 0.8])
            ax.hist(binned_chat, bins=bins)
            plt.show()
        """
        concat_chat = [0]
        for record in records:
            concat_chat.extend(np.array(record[0]) + concat_chat[-1])
        binned_chat = np.array(concat_chat) // binsize
        binned_counter = np.bincount(binned_chat)
        bins = len(binned_counter)
        sort_counter = np.sort(binned_counter)
        median = np.median(sort_counter)
        # count "exciting" minutes: bins whose chat rate clears a cutoff
        # derived from the median (transformed_cut is a project-local helper)
        cutoff = transformed_cut(median)
        excite_count = int(np.sum(sort_counter > cutoff))
        # normalize to excitement events per hour of stream time
        excitements_per_hour = excite_count / (bins * binsize) * 3600.
        excitement_dict[streamer] = [excitements_per_hour]
        # FIGURE
        fig = plt.figure(figsize=(7, 4))
        ax = plt.axes([0.15, 0.15, 0.8, 0.8])
        ax.hist(binned_chat, bins=bins)
        ax.axhline(cutoff, color="red")
        ax.set_ylim([0, 100])
        ax.set_xlim([0, max(binned_chat)])
        ax.set_xlabel("Minutes in stream", fontsize=16)
        ax.set_ylabel("Lines of chat / minute", fontsize=16)
        plt.savefig("./plots/chat_frequency.png", dpi=300, transparent=False)
        #plt.show()
        # END FIGURE
    for key in features_dict:
        # assumes every key in features_dict has a matching entry in
        # excitement_dict; with the debug loop above only "Metaphor" does
        features_dict[key].extend(excitement_dict[key])
    return features_dict
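
transformed_cut is a project-local helper that is not reproduced in these examples. A stand-in with the same call signature, purely for experimentation (the 4x-median rule and the floor of 10 are assumptions, not the project's actual cutoff):

def transformed_cut(median):
    # hypothetical cutoff: a minute counts as "exciting" when its line count
    # is well above the typical minute; the floor keeps near-silent streams
    # from flagging every minute
    return max(4 * median, 10)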
Example #4
def chat_frequency():
    # creates timearray and inserts into database
    streamers = chatlog.get_display_name()
    for streamer in streamers:
        # get streamer_id
        query = f"SELECT id FROM streamer WHERE display_name='{streamer}';"
        postgres = tp.Postgres()
        streamer_id = np.array(postgres.rawselect(query))[0, 0]
        postgres.close()
        # get chatlog list
        #query = f"SELECT id, chatlog FROM vod WHERE streamer_id='{streamer_id}' AND created_at > '2019-07-01T00:00:00Z'"
        query = f"SELECT v.id, v.chatlog FROM vod v LEFT JOIN chatlog c ON v.id = c.vod_id WHERE v.streamer_id='{streamer_id}' AND v.created_at > '2019-07-01T00:00:00Z' AND timearray IS null;"
        postgres = tp.Postgres()
        logdirs = np.array(postgres.rawselect(query))
        postgres.close()
        # strip for time stamps
        for logdir in logdirs:
            vod_id = logdir[0]
            # check if directory is good
            if not check_logdir(logdir[1]):
                continue
            # if good path
            timearray = []
            with open(logdir[1], "r") as chat:
                for line in chat:
                    timestamp = line[1:].split("]")[0].split(":")
                    minute = int(timestamp[1])
                    second = int(timestamp[2])
                    try:
                        hour = int(timestamp[0])
                        tdelta = int(
                            datetime.timedelta(hours=hour,
                                               minutes=minute,
                                               seconds=second).total_seconds())
                    except:
                        dayhour = timestamp[0].split(" day, ")
                        day = int(dayhour[0])
                        hour = int(dayhour[1])
                        tdelta = int(
                            datetime.timedelta(days=day,
                                               hours=hour,
                                               minutes=minute,
                                               seconds=second).total_seconds())

                    timearray.append(tdelta)
            if len(timearray) == 0:
                timearray = [0]
            query = f"INSERT INTO chatlog (streamer_id, vod_id, timearray) VALUES ('{streamer_id}', '{vod_id}', ARRAY{timearray});"
            postgres = tp.Postgres()
            postgres.rawsql(query)
            postgres.close()
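
The timestamp arithmetic above is easiest to verify in isolation. A minimal sketch of the same parsing as a pure function (the name and sample lines are illustrative):

import datetime

def irc_offset_seconds(line):
    # "[0:12:34] user: hi"        -> 754
    # "[1 day, 2:03:04] user: hi" -> 93784
    stamp = line[1:].split("]")[0]
    head, minute, second = stamp.rsplit(":", 2)
    days = 0
    if "day" in head:
        day_part, head = head.replace(" days,", " day,").split(" day, ")
        days = int(day_part)
    delta = datetime.timedelta(days=days, hours=int(head),
                               minutes=int(minute), seconds=int(second))
    return int(delta.total_seconds())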
Example #5
def summonerlevel(features_dict):
    query = "SELECT display_name, summonerlevel FROM streamer;"
    postgres = tp.Postgres()
    records = postgres.rawselect(query)
    postgres.close()
    records_dict = {record[0]: [record[1]] for record in records}
    records_dict["columns"] = ["summonerlevel"]
    for display_name in features_dict:
        features_dict[display_name].extend(records_dict[display_name])
    return features_dict
Example #6
def label_data(features_dict, success_list=["partner"]):
    query = "SELECT display_name, broadcaster_type FROM streamer;"
    postgres = tp.Postgres()
    records = postgres.rawselect(query)
    postgres.close()
    records_dict = {record[0]: record[1] for record in records}
    records_dict["columns"] = "label"
    for display_name in features_dict:
        if display_name in ["columns"]:
            features_dict[display_name].extend(["label"])
            continue
        if records_dict[display_name] in success_list:
            features_dict[display_name].extend([1])
        else:
            features_dict[display_name].extend([0])
    return features_dict
Example #7
def tier_rank(features_dict):
    query = "SELECT display_name, tier, rank FROM streamer;"
    postgres = tp.Postgres()
    records = postgres.rawselect(query)
    postgres.close()
    # load percentile of tier rank
    with open("./lookup/ranktier.json", "r") as file:
        percentile = json.load(file)
    records_dict = {
        record[0]: [percentile[record[1]][record[2]]]
        for record in records
    }
    records_dict["columns"] = ["ranktier"]
    # push into features
    for display_name in features_dict:
        features_dict[display_name].extend(records_dict[display_name])
    return features_dict
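
The lookup indexes percentile[tier][rank], i.e. percentile[record[1]][record[2]]. The JSON file itself is not shown; a plausible shape, with illustrative values only:

# ./lookup/ranktier.json (values illustrative):
# {
#     "GOLD":     {"IV": 0.55, "III": 0.61, "II": 0.66, "I": 0.70},
#     "PLATINUM": {"IV": 0.80, "III": 0.84, "II": 0.87, "I": 0.90}
# }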
Example #8
def get_display_name():
    query = "SELECT DISTINCT display_name FROM streamer;"
    postgres = tp.Postgres()
    records = np.array(postgres.rawselect(query)).flatten()
    postgres.close()
    return records
Example #9
def tarray_feature(features_dict, binsize=60):
    # make features out of timearray of chatlogs
    try:
        a = asdf  # deliberate NameError: skip the cache and always regenerate
        with open("./features/excitement.pickle", "rb") as file:
            excitement_dict = pickle.load(file)
        print("Preloading excitement feature")
    except (NameError, FileNotFoundError, EOFError):
        excitement_dict = dict()
        excitement_dict["columns"] = ["excitement"]
        streamers = chatlog.get_display_name()
        # for streamer in streamers:
        for streamer in ["Metaphor"]:  # debug: restricted to a single streamer
            # get streamer_id
            query = f"SELECT id FROM streamer WHERE display_name='{streamer}';"
            postgres = tp.Postgres()
            streamer_id = np.array(postgres.rawselect(query))[0, 0]
            postgres.close()
            query = f"SELECT timearray FROM chatlog WHERE streamer_id={streamer_id};"
            postgres = tp.Postgres()
            records = postgres.rawselect(query)
            postgres.close()
            """
            for record in records[:1]:
                binned_chat = np.array(record[0])//binsize
                binned_counter = np.bincount(binned_chat)
                bins = len(binned_counter)
                fig = plt.figure(figsize=(7,4))
                ax = plt.axes([0.15, 0.15, 0.8, 0.8])
                ax.hist(binned_chat, bins=bins)
                plt.show()
            """
            concat_chat = [0]
            for record in records:
                concat_chat.extend(np.array(record[0]) + concat_chat[-1])
            binned_chat = np.array(concat_chat) // binsize
            binned_counter = np.bincount(binned_chat)
            bins = len(binned_counter)
            sort_counter = np.sort(binned_counter)
            median = np.median(sort_counter)
            #print("median:", sort_counter[bins//2])
            #print("middle 90%:", sort_counter[int(bins*0.95)], sort_counter[int(bins*0.05)])
            # count "exciting" minutes: bins whose chat rate clears the cutoff
            # (transformed_cut is a project-local helper)
            cutoff = transformed_cut(median)
            excite_count = int(np.sum(sort_counter > cutoff))
            #if sort_counter[bins//2] == 0:
            #    excite_count = 0
            #print("cutoff:", cutoff)
            #print(excite_count)
            excitements_per_hour = excite_count / (bins * binsize) * 3600.
            excitement_dict[streamer] = [excitements_per_hour]
            # FIGURE
            fig = plt.figure(figsize=(7, 4))
            ax = plt.axes([0.15, 0.15, 0.8, 0.8])
            counts, bin_edges, patches = ax.hist(binned_chat, bins=bins,
                                                 color="#6441A4")
            ax.axhline(cutoff, color="red")
            ax.set_xlim([0, bin_edges.max()])
            ax.set_ylim([0, 100])
            ax.set_xlabel("Minutes in stream", fontsize=16)
            ax.set_ylabel("Lines of chat / minute", fontsize=16)
            plt.tick_params(axis="both", labelsize=16)
            plt.savefig(f"./features/chathistory.png",
                        dpi=300,
                        transparent=False)
            plt.show()
            # END FIGURE
            #file = open(f"./features/excitement.pickle", "wb")
            #pickle.dump(excitement_dict, file)
            #file.close()
    for key in features_dict:
        features_dict[key].extend(excitement_dict[key])
    if False:  # push into database
        for streamer in excitement_dict:
            if streamer == "columns":
                continue
            postgres = tp.Postgres()
            query = f"SELECT id FROM streamer WHERE display_name='{streamer}';"
            streamer_id = np.array(postgres.rawselect(query))[0, 0]
            postgres.close()
            excite = excitement_dict[streamer][0]
            query = f"INSERT INTO excitement (streamer_id, excite) VALUES ({streamer_id}, {excite});"
            postgres = tp.Postgres()
            postgres.rawsql(query)
            postgres.close()
    return features_dict
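
Both versions of tarray_feature stitch per-VOD timestamp arrays into one timeline by offsetting each array with the running end time. The same logic isolated as a helper (the name is introduced here for illustration):

import numpy as np

def stitch_timearrays(timearrays):
    # each VOD's timestamps start at 0; shift every array by the running
    # end time so all chat events sit on one continuous timeline
    stitched = [0]
    for arr in timearrays:
        stitched.extend(np.asarray(arr) + stitched[-1])
    return np.array(stitched)

# stitch_timearrays([[10, 20], [5, 15]]) -> array([ 0, 10, 20, 25, 35])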
Example #10
def sausage(X, y):
    # elasticnet with l1_ratio=1 is a pure L1 (lasso) penalty; saga is the
    # only sklearn solver that supports elasticnet
    model = LogisticRegression(penalty="elasticnet",
                               solver="saga",
                               l1_ratio=1,
                               max_iter=10000)

    # single test train split
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
    model.fit(train_X, train_y)
    prediction = np.array(model.predict_proba(val_X))
    # for idx in range(len(prediction)):
    #    print(X.index[idx], prediction[idx][0], y[idx])
    result = pd.DataFrame(prediction[:, 1],
                          index=val_X.index,
                          columns=["predict_proba"])
    result["label"] = val_y

    # 10th-percentile predicted probability among true positives: a threshold
    # at this value still recalls ~90% of partners in this split
    sorted_recall = np.sort(result[result["label"] == 1]["predict_proba"])
    print("sorted recall (10%):", sorted_recall[int(0.1 * len(sorted_recall))])

    pred_y = np.array(model.predict(val_X))
    print(confusion_matrix(val_y, pred_y))

    print("score:", model.score(train_X, train_y))

    print("coefficients")
    print(model.coef_)

    if True:  # ROC curve
        THRESHOLD_LIST = [0.1]
        probs = model.predict_proba(val_X)[:, 1]
        fpr, tpr, thresholds = roc_curve(val_y, probs)
        # plot no skill
        fig = plt.figure(figsize=(7, 7))
        ax = plt.axes([0.15, 0.15, 0.8, 0.8])
        for THRESHOLD in THRESHOLD_LIST:
            pred_y = np.where(
                model.predict_proba(val_X)[:, 1] > THRESHOLD, 1, 0)
            cmatrix = confusion_matrix(val_y, pred_y)
            FPR = cmatrix[0, 1] / (cmatrix[0, 1] + cmatrix[0, 0])
            TPR = cmatrix[1, 1] / (cmatrix[1, 1] + cmatrix[1, 0])
            # ax.axhline(TPR, color="red")
            # ax.axvline(FPR, color="red")
            ax.errorbar(x=[FPR],
                        xerr=[0.05],
                        y=[TPR],
                        yerr=[0.05],
                        color="red")
        ax.errorbar(x=[0, 1], y=[0, 1], linestyle="--", color="k")
        # plot the roc curve for the model
        ax.errorbar(x=fpr, y=tpr, marker=".")
        ax.set_xlabel("False Positive Rate", fontsize=16)
        ax.set_ylabel("True Positive Rate", fontsize=16)
        ax.set_xlim([0, 1])
        ax.set_ylim([0, 1])
        plt.tick_params(axis="both", labelsize=16)
        # show the plot
        plt.savefig(f"./validation/ROC.png", dpi=300, transparent=False)
        # plt.show()

    # cross validation
    threshlist = np.linspace(0, 1, 51)
    # stratcrossvalid is a project-local helper: stratified k-fold CV that
    # returns per-threshold recall as objects with .mean and .sdev
    gvscore_list, score, model, accuracy, confusion = stratcrossvalid(
        X, y, THRESHOLD_LIST=threshlist)
    for key in [0.2]:  # report one threshold; iterate `confusion` for all
        print(key)
        cmatrix = np.array(confusion[key][-1])
        print(cmatrix)
        print("FPR:", cmatrix[0, 1] / (cmatrix[0, 1] + cmatrix[0, 0]))
        print("TPR:", cmatrix[1, 1] / (cmatrix[1, 1] + cmatrix[1, 0]))
    predict_proba = model.predict_proba(X)
    if False:
        for idx, display_name in enumerate(X.index):
            query = f"SELECT id FROM streamer WHERE display_name='{display_name}';"
            postgres = tp.Postgres()
            streamer_id = np.array(postgres.rawselect(query))[0, 0]
            postgres.close()
            proba = predict_proba[idx][1]
            query = f"INSERT INTO prediction (streamer_id, proba) VALUES({streamer_id}, {proba});"
            postgres = tp.Postgres()
            postgres.rawsql(query)
            postgres.close()

    # x-valid results
    print("Threshold:", threshlist)
    print("Xvalid recall:", gvscore_list)
    print("Xvalid accuracy:", np.mean(accuracy), np.std(accuracy, ddof=1))
    print("coeff:", model.coef_)

    if True:
        fig = plt.figure(figsize=(7, 4))
        ax = plt.axes([0.15, 0.15, 0.8, 0.8])
        ax.errorbar(
            x=threshlist,
            y=[i.mean for i in gvscore_list],
            yerr=[i.sdev for i in gvscore_list],
        )
        plt.draw()
        plt.show()
    """
    scores = cross_val_score(model, X, y, cv=cv, scoring="recall")
    print(scores)
    print("Recall: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    scores = cross_val_score(model, X, y, cv=cv, scoring="f1")
    print(scores)
    print("f1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    """

    # sns.distplot(result["predict_proba"])
    # plt.show()

    # Precision-recall curve; `probs` comes from the ROC block above
    fig = plt.figure("precision recall", figsize=(7, 7))
    ax = plt.axes([0.15, 0.15, 0.8, 0.8])
    precision, recall, _ = precision_recall_curve(val_y, probs)
    # older matplotlib lacks the `step` kwarg on fill_between: feature-detect
    step_kwargs = ({
        "step": "post"
    } if "step" in signature(plt.fill_between).parameters else {})
    ax.step(recall, precision, color="#6441A4", where="post")
    ax.fill_between(recall, precision, color="#6441A4", **step_kwargs)

    ax.set_xlabel("Recall", fontsize=16)
    ax.set_ylabel("Precision", fontsize=16)
    plt.tick_params(axis="both", labelsize=16)
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.savefig(f"./validation/precision_recall.png",
                dpi=300,
                transparent=False)
    plt.show()
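
For reference, the imports this last example needs (a minimal set; tp and stratcrossvalid are project-local and not reproduced here):

from inspect import signature

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve
from sklearn.model_selection import train_test_split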