def create_by_information_gain(raw_rdd: RDD, threshold: float) -> None:
    """
    Extract stop words (unneeded words) based on information gain (IG).

    For each category ck, the information gain of each word tj is defined as

    IG(tj, ck) = p(tj, ck)(log(p(tj, ck)) - log(p(tj)) - log(p(ck)))
                    + p(!tj, ck)(log(p(!tj, ck)) - log(p(!tj)) - log(p(ck)))

    Args:
        raw_rdd: RDD whose rows can be indexed with the 'category' key
            (presumably each row also carries its words -- confirm with
            the caller).
        threshold: presumably the IG cut-off used to select stop words; it
            is not referenced by the steps below -- TODO confirm.
    """

    # Group the rows by category: rdd(key, rows)
    category_group = raw_rdd.groupBy(
        # row(category, words)
        lambda row: row['category']
    )
    # Number of rows in each category group. Only the count is kept; the
    # category key itself is dropped by this map.
    category_hist = category_group.map(
        # Fixed: the body previously referenced the misspelled name
        # `key_and_ros`, which raised NameError once the map was evaluated.
        lambda key_and_rows: len(key_and_rows[1])  # length of rows
    )