Пример #1
0
    def run_predict_cv(self) -> None:
        """クロスバリデーションで学習した各foldのモデルの平均により、テストデータの予測を行う

        あらかじめrun_train_cvを実行しておく必要がある
        """

        logger.info(f'{self.run_name} - start prediction cv')
        X_test = self.X_test
        preds = []

        show_feature_importance = 'LGBM' in str(self.model_cls)
        if show_feature_importance:
            feature_importances = pd.DataFrame()

        # 各foldのモデルで予測を行う
        for i_fold in range(self.cv.n_splits):
            logger.info(f'{self.run_name} - start prediction fold:{i_fold}')
            model = self.build_model(i_fold)
            model.load_model()
            pred = model.predict(X_test)
            preds.append(pred)
            logger.info(f'{self.run_name} - end prediction fold:{i_fold}')
            if show_feature_importance:
                feature_importances = pd.concat(
                    [feature_importances,
                     model.feature_importance(X_test)],
                    axis=0)

        # 予測の平均値を出力する
        pred_avg = np.mean(preds, axis=0)

        # 予測結果の保存
        Data.dump(pred_avg, f'../output/pred/{self.run_name}-test.pkl')

        logger.info(f'{self.run_name} - end prediction cv')

        # 特徴量の重要度
        if show_feature_importance:
            japanize_matplotlib.japanize()
            aggs = feature_importances.groupby('Feature').mean().sort_values(
                by="importance", ascending=False)
            cols = aggs[:10].index
            pd.DataFrame(aggs.index).to_csv(
                f'../output/importance/{self.run_name}-fi.csv', index=False)

            best_features = feature_importances.loc[
                feature_importances.Feature.isin(cols)]
            plt.figure(figsize=(5, 7))
            sns.barplot(x="importance",
                        y="Feature",
                        data=best_features.sort_values(by="importance",
                                                       ascending=False))
            plt.title('LightGBM Features (averaged over folds)')
            plt.tight_layout()
            plt.savefig(f'../output/importance/{self.run_name}-fi.png')
            plt.show()

            # mlflow
            mlflow.start_run(run_id=self.run_id)
            log_artifact(f'../output/importance/{self.run_name}-fi.png')
            mlflow.end_run()
Пример #2
0
import japanize_matplotlib
import warnings
from collections import defaultdict
from matplotlib_venn import venn2  # venn図を作成する用
from pandas_profiling import ProfileReport  # profile report を作る用
import pandas_profiling as pdp
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
japanize_matplotlib.japanize()
%matplotlib inline


# 警告が鬱陶しい時はこれを記述
warnings.filterwarnings('ignore')
Пример #3
0
def main():
    """Tweet the number of subscribers with text or an image."""
    # get arguments
    args = get_arguments()

    # set logger
    if args.verbose > 1:
        from logging import DEBUG
        basicConfig(
            level=DEBUG,
            stream=sys.stdout,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    elif args.verbose == 1:
        from logging import INFO
        basicConfig(
            level=INFO,
            stream=sys.stdout,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    else:
        from logging import WARN
        basicConfig(
            level=WARN,
            stream=sys.stdout,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    logger = getLogger(__name__)

    # load config
    with open(args.conf, mode="r") as f:
        config = yaml.load(f, Loader=yaml.SafeLoader)

    # get API keys
    api_keys = config.get("api_keys", "")
    try:
        DEVELOPER_KEY = api_keys["DEVELOPER_KEY"]
        API_KEY = api_keys["API_KEY"]
        API_KEY_SECRET = api_keys["API_KEY_SECRET"]
        ACCESS_TOKEN = api_keys["ACCESS_TOKEN"]
        ACCESS_TOKEN_SECRET = api_keys["ACCESS_TOKEN_SECRET"]
    except KeyError:
        logger.error("API keys must be specified.")
        sys.exit(1)
    if not all(api_keys.values()):
        logger.error("API keys must be specified.")
        sys.exit(1)

    # load channel info
    # chinfo: (chid, name, dirname)
    chinfo = pd.read_csv(config["chinfo"])

    # check log dirctory existance
    log_root_path = config.get("log_root_path", "subscribers_log")
    if not os.path.exists(log_root_path):
        os.makedirs(log_root_path)
        logger.info(f"Create {log_root_path}")
    log_path_list = [
        os.path.join(log_root_path, dirname) for dirname in chinfo["dirname"]
    ]
    chinfo["log_path"] = log_path_list
    for log_path in log_path_list:
        if not os.path.exists(log_path):
            os.mkdir(log_path)
            logger.info(f"Create {log_path}")

    # get current time
    datetime_now = datetime.datetime.now()
    # set timestamp (YYYY-MM-DD_hh:mm:ss)
    timestamp = datetime_now.strftime("%Y-%m-%d_%H:%M:%S")

    # launch a session
    YOUTUBE_API_SERVICE_NAME = "youtube"
    YOUTUBE_API_VERSION = "v3"
    youtube = build(YOUTUBE_API_SERVICE_NAME,
                    YOUTUBE_API_VERSION,
                    developerKey=DEVELOPER_KEY,
                    cache_discovery=False)
    twitter = OAuth1Session(API_KEY, API_KEY_SECRET, ACCESS_TOKEN,
                            ACCESS_TOKEN_SECRET)

    # get the number of subscribers
    chids = ",".join(chinfo["chid"])
    chid_subsc_list = get_subscribers(youtube=youtube, chids=chids)

    # output logs of the number of subscribers
    output_log(datetime_now, chid_subsc_list, chinfo=chinfo)

    # tweet the number of subscribers
    tweet_type = config.get(
        "tweet_type", "image") if not args.tweet_type else args.tweet_type
    prev_subsc_dict = check_prev_subsc(chid_subsc_list, chinfo=chinfo)
    # FIXME: "text" doesn't work
    #        status_list should be divided considering the word limit.
    if tweet_type == "text":
        status_list = []
        nmax = max([len(name) for name in chinfo["name"]])
        for chid, subsc in chid_subsc_list:
            prev_subsc = prev_subsc_dict.get(chid)
            # NOTE: not sophisticated
            name = chinfo[chinfo["chid"] == chid]["name"].values[0]
            if prev_subsc is not None:
                diff = subsc - prev_subsc
                if diff != 0:
                    status_list.append(
                        f"{form_name(name, nmax=nmax):s}: {subsc:_=8,d} ({subsc-prev_subsc:+})"
                    )
                else:
                    status_list.append(
                        f"{form_name(name, nmax=nmax):s}: {subsc:_=8,d} (0)")
            else:
                status_list.append(
                    f"{form_name(name, nmax=nmax):s}: {subsc:_=8,d}")
        exit(1)

        tweet_id = update(twitter=twitter, status="")
        update(twitter=twitter, status="", tweet_id=tweet_id)
    elif tweet_type == "image":
        table = []
        for chid, subsc in chid_subsc_list:
            prev_subsc = prev_subsc_dict.get(chid)
            # NOTE: not sophisticated
            name = chinfo[chinfo["chid"] == chid]["name"].values[0]
            if prev_subsc is not None:
                diff = subsc - prev_subsc
                if diff != 0:
                    table.append([f"{name:s}", f"{subsc:,d}", f"{diff:+,d}"])
                else:
                    table.append([f"{name:s}", f"{subsc:,d}", "0"])
            else:
                table.append([f"{name:s}", f"{subsc:,d}", "-"])
        logger.debug(table)
        # NOTE: japanize_matplotlib is used for matplotlib to display japanese,
        #       but it might be better to set fonts on my own.
        #       reference: https://github.com/uehara1414/japanize-matplotlib
        japanize_matplotlib.japanize()

        # generate a table image
        fig, ax = plt.subplots(1, 1, figsize=(4.0, 4.0))
        ax.axis("off")
        ax.axis("tight")
        colors = [['#D8D8D8'
                   for _ in row] if i % 2 == 1 else ["w" for _ in row]
                  for i, row in enumerate(table)]

        table = ax.table(
            cellText=table,
            colLabels=["名前", "登録者数", "前日比"],
            loc="center",
            cellColours=colors,
        )

        # table settings
        # Reference:
        #   https://matplotlib.org/stable/api/table_api.html#matplotlib.table.Table
        #   https://matplotlib.org/stable/api/table_api.html#matplotlib.table.Cell
        table.auto_set_font_size(False)
        for key, cell in table.get_celld().items():
            row, column = key
            if row == 0:
                cell.set_facecolor('#2F2F2F')
                cell.set_text_props(color='w')
            elif row > 0:
                cell.set_text_props(horizontalalignment="center")
            cell.set_width(0.8 * cell.get_width())
            cell.set_height(1.2 * cell.get_height())
        table.auto_set_column_width([0])
        plt.tight_layout()
        plt.savefig("table.png", format="png", dpi=400)
        plt.close()

        # upload an image and tweet
        with open("table.png", "rb") as media:
            media_ids = upload(twitter=twitter, media=media)
        update(twitter=twitter, status=timestamp, media_ids=media_ids)
        os.remove("table.png")
    else:
        logger.error(f"tweet_type \"{tweet_type}\" is not supported.")
        sys.exit(1)