target = float([wp30, re30][is_re])
                    true_engagement.append(target)
                    random_guess = 0.5
                    if is_re:
                        guess_engagement.append(random_guess)
                    else:
                        guess_engagement.append(to_watch_percentage(engagement_map, duration, random_guess, lookup_keys=split_keys))

    # Summarise how the constant-guess baseline scores against the true target.
    target_label = ['watch percentage', 'relative engagement'][is_re]
    print('>>> Predict {0} on duration...'.format(target_label))
    mae_score = mean_absolute_error(true_engagement, guess_engagement)
    print('>>> MAE on test set: {0:.4f}'.format(mae_score))
    r2_value = r2_score(true_engagement, guess_engagement)
    print('>>> R2 on test set: {0:.4f}'.format(r2_value))
    print('=' * 79)

    timer.stop()

    # Key every per-video series by its video id, then optionally pickle them.
    to_write = True
    true_result_dict = dict(zip(test_vids, true_engagement))
    predict_result_dict = dict(zip(test_vids, guess_engagement))
    test_duration_dict = dict(zip(test_vids, test_duration))
    if to_write:
        print('>>> Prepare to write to pickle file...')
        print('>>> Number of videos in final test result dict: {0}'.format(len(test_vids)))
        # File names are prefixed with 'wp' or 're' depending on the target.
        target_prefix = ['wp', 're'][is_re]
        true_path = os.path.join(output_dir, '{0}_true_predictor.p'.format(target_prefix))
        write_dict_to_pickle(dict=true_result_dict, path=true_path)
        predict_path = os.path.join(output_dir, '{0}_duration_predictor.p'.format(target_prefix))
        write_dict_to_pickle(dict=predict_result_dict, path=predict_path)
        # The duration lookup is only written when no such file exists yet.
        duration_path = os.path.join(output_dir, 'test_duration.p')
        if not os.path.exists(duration_path):
            write_dict_to_pickle(dict=test_duration_dict, path=duration_path)

    # Collect training rows from every file found beneath train_loc; each
    # file's rows (whatever _load_data returns) are appended to one flat list.
    train_matrix = []
    print('>>> Start to load training dataset...')
    for subdir, _, files in os.walk(train_loc):
        for f in files:
            train_matrix.extend(_load_data(os.path.join(subdir, f)))
    # convert the accumulated list into a numpy array for the regressor
    train_matrix = np.array(train_matrix)

    # Same procedure for the test set beneath test_loc.
    test_matrix = []
    print('>>> Start to load test dataset...')
    for subdir, _, files in os.walk(test_loc):
        for f in files:
            test_matrix.extend(_load_data(os.path.join(subdir, f)))
    test_matrix = np.array(test_matrix)

    print('>>> Finish loading all data!\n')

    # predict test data from customized ridge regressor; predict_from_sparse
    # is handed the two vectorizer callables and returns the predictions
    # together with the matching video ids
    # NOTE(review): the vectorizers presumably expand the sparse rows into
    # dense feature vectors -- confirm against RidgeRegressor
    test_yhat, test_vids = RidgeRegressor(train_matrix, test_matrix).predict_from_sparse(vectorize_train_data,
                                                                                         vectorize_test_data)

    # get running time; the [:-3] is applied to the whole formatted message
    # and clips its last three characters (the final three microsecond digits
    # whenever the timedelta string has a fractional part)
    print('\n>>> Total running time: {0}'.format(str(datetime.timedelta(seconds=time.time() - start_time)))[:-3])

    # write to pickle file: map each video id to its predicted value
    to_write = True
    predict_result_dict = {vid: pred for vid, pred in zip(test_vids, test_yhat)}
    if to_write:
        print('>>> Prepare to write to pickle file...')
        print('>>> Number of videos in final test result dict: {0}'.format(len(predict_result_dict)))
        # output location is a fixed path relative to the working directory
        write_dict_to_pickle(dict=predict_result_dict, path='./output/sparse_topic_predictor.p')
# Example no. 3
# 0
    # Collect test rows from every file found beneath test_loc; is_re is
    # forwarded to _load_data.
    # NOTE(review): is_re presumably selects relative engagement over watch
    # percentage as the target -- confirm in _load_data
    test_matrix = []
    for subdir, _, files in os.walk(test_loc):
        for f in files:
            test_matrix.extend(_load_data(os.path.join(subdir, f), is_re))

    print('>>> Finish loading all data!')

    # predict test data from customized ridge regressor; test_matrix is passed
    # as a plain list here (no np.array conversion in this span), and
    # predict_from_sparse returns the predictions with their video ids
    test_yhat, test_vids = RidgeRegressor(train_matrix,
                                          test_matrix).predict_from_sparse(
                                              vectorize_train_data,
                                              vectorize_test_data)

    timer.stop()

    # write to pickle file: map each video id to its predicted value
    to_write = True
    predict_result_dict = {
        vid: pred
        for vid, pred in zip(test_vids, test_yhat)
    }
    if to_write:
        print('>>> Prepare to write to pickle file...')
        print('>>> Number of videos in final test result dict: {0}'.format(
            len(predict_result_dict)))
        # file name is prefixed with 'wp' or 're' according to is_re
        write_dict_to_pickle(dict=predict_result_dict,
                             path=os.path.join(
                                 output_dir,
                                 '{0}_sparse_context_topic_predictor.p'.format(
                                     ['wp', 're'][is_re])))
        for f in files:
            with open(os.path.join(subdir, f), 'r') as fin:
                # read header
                fin.readline()
                for line in fin:
                    vid, _, duration, dump = line.rstrip().split('\t', 3)
                    test_vids.append(vid)
                    duration = int(duration)
                    wp30 = float(dump.split('\t')[7])
                    true_wp.append(wp30)
                    random_guess = 0.5
                    guess_wp.append(to_watch_percentage(engagement_map, duration, random_guess, lookup_keys=lookup_durations))

    # Summarise how the constant-guess baseline scores on watch percentage.
    print('>>> Predict watch percentage on duration...')
    mae_score = mean_absolute_error(true_wp, guess_wp)
    print('>>> MAE on test set: {0:.4f}'.format(mae_score))
    r2_value = r2_score(true_wp, guess_wp)
    print('>>> R2 on test set: {0:.4f}'.format(r2_value))
    print('=' * 79)

    # Report elapsed wall-clock time; the trailing [:-3] clips the last three
    # characters of the formatted message.
    elapsed = datetime.timedelta(seconds=time.time() - start_time)
    print('\n>>> Total running time: {0}'.format(str(elapsed))[:-3])

    # Key truth and prediction by video id, then optionally pickle both.
    to_write = True
    true_result_dict = dict(zip(test_vids, true_wp))
    predict_result_dict = dict(zip(test_vids, guess_wp))
    if to_write:
        print('>>> Prepare to write to pickle file...')
        print('>>> Number of videos in final test result dict: {0}'.format(len(test_vids)))
        for payload, target_path in ((true_result_dict, './output/true_predictor.p'),
                                     (predict_result_dict, './output/duration_predictor.p')):
            write_dict_to_pickle(dict=payload, path=target_path)
                        if detect_lang in lang_dict:
                            row[3+category_cnt + lang_dict[detect_lang]] = 1
                        if topics == '' or topics == 'NA':
                            row[3 + category_cnt + lang_cnt + topic_dict['NA']] = 1
                        else:
                            topics = topics.split(',')
                            for topic in topics:
                                if topic in topic_dict:
                                    row[3 + category_cnt + lang_cnt + topic_dict[topic]] = 1
                                else:
                                    row[3 + category_cnt + lang_cnt + topic_dict['NA']] = 1
                        row[-1] = float(re30)
                        test_matrix.append(row)
                        test_vids.append(vid)
                test_matrix = np.array(test_matrix)

                # predict test data from customized ridge regressor
                test_yhat = RidgeRegressor(train_matrix, test_matrix, verbose=False).predict()

                predict_result_dict.update({vid: pred for vid, pred in zip(test_vids, test_yhat)})

    # Report elapsed wall-clock time; the trailing [:-3] clips the last three
    # characters of the formatted message.
    elapsed = datetime.timedelta(seconds=time.time() - start_time)
    print('\n>>> Total running time: {0}'.format(str(elapsed))[:-3])

    # Optionally pickle the accumulated predictions; k is part of the file name.
    to_write = True
    if to_write:
        print('>>> Prepare to write to pickle file...')
        n_predictions = len(predict_result_dict)
        print('>>> Number of videos in final test result dict: {0}'.format(n_predictions))
        output_path = './output/csp_predictor_{0}.p'.format(k)
        write_dict_to_pickle(dict=predict_result_dict, path=output_path)
# Example no. 6
# 0
                            row[1] = 1
                        if category in category_dict:
                            row[2 + category_dict[category]] = 1
                        if detect_lang in lang_dict:
                            row[2 + category_cnt + lang_dict[detect_lang]] = 1
                        if not (topics == '' or topics == 'NA'):
                            topics = topics.split(',')
                            for topic in topics:
                                if topic in topic_dict:
                                    row[2 + category_cnt + lang_cnt + topic_dict[topic]] = 1
                        target = [wp30, re30][is_re]
                        row[-1] = float(target)
                        test_matrix.append(row)
                        test_vids.append(vid)
                    test_matrix = np.array(test_matrix)

                    # predict test data from customized ridge regressor
                    test_yhat = RidgeRegressor(train_matrix, test_matrix, verbose=False).predict()

                    predict_result_dict.update({vid: pred for vid, pred in zip(test_vids, test_yhat)})

    timer.stop()

    # Optionally pickle the accumulated predictions; the file name carries the
    # 'wp'/'re' prefix chosen by is_re, followed by k.
    to_write = True
    if to_write:
        print('>>> Prepare to write to pickle file...')
        n_predictions = len(predict_result_dict)
        print('>>> Number of videos in final test result dict: {0}'.format(n_predictions))
        file_name = '{0}_csp_predictor_{1}.p'.format(['wp', 're'][is_re], k)
        output_path = os.path.join(output_dir, file_name)
        write_dict_to_pickle(dict=predict_result_dict, path=output_path)
    # Collect training rows from every file beneath train_loc. _load_data
    # returns a (matrix, vids) pair (see the test loop below); only the matrix
    # part is needed for training.
    for subdir, _, files in os.walk(train_loc):
        for f in files:
            train_matrix.extend(_load_data(os.path.join(subdir, f))[0])
    # convert the accumulated list into a numpy array for the regressor
    train_matrix = np.array(train_matrix)

    print('>>> Start to load test dataset...')
    # For the test set keep both the rows and the video ids, in matching order.
    test_matrix = []
    test_vids = []
    for subdir, _, files in os.walk(test_loc):
        for f in files:
            matrix, vids = _load_data(os.path.join(subdir, f))
            test_matrix.extend(matrix)
            test_vids.extend(vids)
    test_matrix = np.array(test_matrix)

    print('>>> Finish loading all data!\n')

    # predict test data from customized ridge regressor
    test_yhat = RidgeRegressor(train_matrix, test_matrix).predict()

    # get running time; the [:-3] is applied to the whole formatted message
    # and clips its last three characters (the final three microsecond digits
    # whenever the timedelta string has a fractional part)
    print('\n>>> Total running time: {0}'.format(str(datetime.timedelta(seconds=time.time() - start_time)))[:-3])

    # write to pickle file: map each video id to its predicted value
    to_write = True
    predict_result_dict = {vid: pred for vid, pred in zip(test_vids, test_yhat)}
    if to_write:
        print('>>> Prepare to write to pickle file...')
        print('>>> Number of videos in final test result dict: {0}'.format(len(predict_result_dict)))
        # output location is a fixed path relative to the working directory
        write_dict_to_pickle(dict=predict_result_dict, path='./output/content_predictor.p')
    # Collect test rows and their video ids from every file beneath test_loc;
    # is_re is forwarded to _load_data, which returns a (matrix, vids) pair.
    # NOTE(review): is_re presumably selects relative engagement over watch
    # percentage as the target -- confirm in _load_data
    for subdir, _, files in os.walk(test_loc):
        for f in files:
            matrix, vids = _load_data(os.path.join(subdir, f), is_re)
            test_matrix.extend(matrix)
            test_vids.extend(vids)
    # convert the accumulated list into a numpy array for the regressor
    test_matrix = np.array(test_matrix)

    print('>>> Finish loading all data!')

    # predict test data from customized ridge regressor
    # NOTE(review): show_params=True presumably makes the regressor report its
    # fitted parameters -- confirm in RidgeRegressor.predict
    test_yhat = RidgeRegressor(train_matrix,
                               test_matrix).predict(show_params=True)

    timer.stop()

    # write to pickle file: map each video id to its predicted value
    to_write = True
    predict_result_dict = {
        vid: pred
        for vid, pred in zip(test_vids, test_yhat)
    }
    if to_write:
        print('>>> Prepare to write to pickle file...')
        print('>>> Number of videos in final test result dict: {0}'.format(
            len(predict_result_dict)))
        # file name is prefixed with 'wp' or 're' according to is_re
        write_dict_to_pickle(dict=predict_result_dict,
                             path=os.path.join(
                                 output_dir,
                                 '{0}_reputation_predictor.p'.format(
                                     ['wp', 're'][is_re])))