Example #1
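Example #1 trains the semi-supervised NPYCRF word-segmentation model: it builds labeled and unlabeled corpora, splits each into train/dev sets, alternates Gibbs sampling of the NPYLM with SGD updates of the CRF, and writes the dictionary and both models into args.working_directory. The extract starts at main(); the imports, the build_corpus helper, and the argparse setup that defines args are not shown. A minimal sketch of the assumed preamble follows (the module alias nlp and the flag names are assumptions, not taken from the original):

import os
import time
import argparse
from tabulate import tabulate
import npycrf as nlp  # assumption: the NPYCRF extension module, under the alias used in main()

# Assumption: `args` comes from an argparse parser defined at module level; only the
# attributes read in main() are required. The flag names below are hypothetical.
parser = argparse.ArgumentParser()
parser.add_argument("--working-directory", type=str, required=True)
parser.add_argument("--epochs", type=int, default=100)
# ... remaining flags (training files, CRF feature windows, priors, etc.) omitted
args = parser.parse_args()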
def main():
    assert args.working_directory is not None
    try:
        os.mkdir(args.working_directory)
    except FileExistsError:
        # The working directory already exists; nothing to do
        pass

    # Prepare the text data used for training
    corpus_l = build_corpus(args.train_filename_l,
                            args.train_directory_l,
                            supervised=True)
    corpus_u = build_corpus(args.train_filename_u,
                            args.train_directory_u,
                            supervised=False)

    # Dictionary
    dictionary = nlp.dictionary()

    # Build the datasets, splitting each corpus into training and development sets
    # The dictionary is updated at the same time
    dataset_l = nlp.dataset(corpus_l, dictionary, args.train_dev_split,
                            args.seed)  # labeled (supervised)
    dataset_u = nlp.dataset(corpus_u, dictionary, args.train_dev_split,
                            args.seed)  # unlabeled (unsupervised)

    # Save the dictionary
    dictionary.save(os.path.join(args.working_directory, "char.dict"))

    # Sanity check of the dataset sizes
    size_train_l = dataset_l.get_size_train()
    size_dev_l = dataset_l.get_size_dev()
    size_train_u = dataset_u.get_size_train()
    size_dev_u = dataset_u.get_size_dev()
    table = [
        ["Labeled", size_train_l, size_dev_l, size_train_l + size_dev_l],
        ["Unlabeled", size_train_u, size_dev_u, size_train_u + size_dev_u],
        [
            "Total", size_train_u + size_train_l, size_dev_u + size_dev_l,
            size_train_u + size_train_l + size_dev_u + size_dev_l
        ],
    ]
    print(tabulate(table, headers=["Data", "Train", "Dev", "Total"]))

    num_character_ids = dictionary.get_num_characters()

    # Models
    crf = nlp.crf(
        dataset_labeled=dataset_l,
        num_character_ids=num_character_ids,
        feature_x_unigram_start=args.crf_feature_x_unigram_start,
        feature_x_unigram_end=args.crf_feature_x_unigram_end,
        feature_x_bigram_start=args.crf_feature_x_bigram_start,
        feature_x_bigram_end=args.crf_feature_x_bigram_end,
        feature_x_identical_1_start=args.crf_feature_x_identical_1_start,
        feature_x_identical_1_end=args.crf_feature_x_identical_1_end,
        feature_x_identical_2_start=args.crf_feature_x_identical_2_start,
        feature_x_identical_2_end=args.crf_feature_x_identical_2_end,
        initial_lambda_0=args.crf_lambda_0,
        sigma=args.crf_prior_sigma)

    npylm = nlp.npylm(max_word_length=args.max_word_length,
                      g0=1.0 / num_character_ids,
                      initial_lambda_a=args.lambda_a,
                      initial_lambda_b=args.lambda_b,
                      vpylm_beta_stop=args.vpylm_beta_stop,
                      vpylm_beta_pass=args.vpylm_beta_pass)

    npycrf = nlp.npycrf(npylm=npylm, crf=crf)

    num_features = crf.get_num_features()
    print(
        tabulate([["#characters", num_character_ids],
                  ["#features", num_features]]))

    # Prepare for training
    trainer = nlp.trainer(dataset_labeled=dataset_l,
                          dataset_unlabeled=dataset_u,
                          dictionary=dictionary,
                          npycrf=npycrf,
                          crf_regularization_constant=1.0)

    # Check whether any word strings collide on the same word ID (hash collision)
    # It is a waste of time, so it only needs to be done once
    # It consumes a large amount of memory
    if False:
        print("Checking for hash collisions ...")
        num_checked_words = trainer.detect_hash_collision(args.max_word_length)
        print("No collisions found (total number of words: {})".format(num_checked_words))

    learning_rate = args.crf_learning_rate
    batchsize = 32
    start = time.time()

    # Initialization
    trainer.add_labeled_data_to_npylm()  # add the labeled data to the NPYLM
    trainer.sgd(learning_rate, batchsize, pure_crf=True)  # optimize the CRF alone, without the NPYLM

    print("Iteration {} / {} - {:.3f} sec".format(0, args.epochs,
                                                  time.time() - start))

    for epoch in range(1, args.epochs + 1):
        start = time.time()

        # Training
        ## Gibbs-sample the NPYLM parameters
        ## It is probably better to include the labeled data as well
        trainer.gibbs(include_labeled_data=True)
        ## Optimize the CRF
        trainer.sgd(learning_rate, batchsize)

        # Hyperparameter sampling
        ## Update the HPYLM and VPYLM hyperparameters
        trainer.sample_hpylm_vpylm_hyperparameters()
        ## Update the parameter λ of the Poisson distribution over word lengths
        trainer.sample_npylm_lambda()

        # This estimate is more accurate when started after a few iterations
        if epoch > 3:
            # Estimate P(k|VPYLM), the probability that the VPYLM generates a word of length k
            trainer.update_p_k_given_vpylm()

        # Logging
        print("Iteration {} / {} - {:.3f} sec".format(epoch, args.epochs,
                                                      time.time() - start))

        # log_likelihood_l = trainer.compute_log_likelihood_labeled_dev()
        # log_likelihood_u = trainer.compute_log_likelihood_unlabeled_dev()
        # table = [
        # 	["Labeled", log_likelihood_l],
        # 	["Unlabeled", log_likelihood_u]
        # ]
        # print(tabulate(table, headers=["Log-likelihood", "Dev"]))

        trainer.print_segmentation_labeled_dev(10)  # print segmentations of randomly chosen dev sentences
        trainer.print_segmentation_unlabeled_dev(10)  # print segmentations of randomly chosen dev sentences

        precision, recall = trainer.compute_precision_and_recall_labeled_dev()
        # Avoid division by zero when both precision and recall are 0
        denominator = precision + recall
        f_measure = 2 * precision * recall / denominator if denominator > 0 else 0.0
        print(
            tabulate([["Labeled", precision, recall, f_measure]],
                     headers=["Precision", "Recall", "F-measure"]))

        # Save the models
        npylm.save(os.path.join(args.working_directory, "npylm.model"))
        crf.save(os.path.join(args.working_directory, "crf.model"))

        # trainer.print_p_k_vpylm()
        print("lambda_0:", crf.get_lambda_0())
Example #2
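Example #2 is the companion inference script: it reloads char.dict, crf.model, and npylm.model saved by Example #1, segments test sentences with the Viterbi algorithm, and prints the NPYCRF and NPYLM segmentations next to a MeCab (+NEologd) reference. The imports are again omitted from the extract; a sketch of what it relies on (the nlp alias is an assumption, as are the flags behind the argparse attributes):

import os
import re
import codecs
import MeCab
from tabulate import tabulate
import npycrf as nlp  # assumption: the same extension module as in Example #1

# Assumption: `args` is again built with argparse and provides working_directory,
# test_filename, test_directory, and neologd_path.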
def main():
    assert args.working_directory is not None
    try:
        os.mkdir(args.working_directory)
    except FileExistsError:
        # The working directory already exists; nothing to do
        pass

    # Load the dictionary
    dictionary = nlp.dictionary(
        os.path.join(args.working_directory, "char.dict"))

    # Load the models
    crf = nlp.crf(os.path.join(args.working_directory, "crf.model"))
    npylm = nlp.npylm(os.path.join(args.working_directory, "npylm.model"))
    npycrf = nlp.npycrf(npylm=npylm, crf=crf)

    num_features = crf.get_num_features()
    num_character_ids = dictionary.get_num_characters()
    print(
        tabulate([["#characters", num_character_ids],
                  ["#features", num_features]]))

    # Find the most likely segmentation with the Viterbi algorithm
    assert args.test_filename is not None or args.test_directory is not None
    sentence_list = []

    def preprocess(sentence):
        # Replace digit sequences (including decimal points and commas) with "#" and trim whitespace
        sentence = re.sub(r"[0-9.,]+", "#", sentence)
        sentence = sentence.strip()
        return sentence

    if args.test_filename is not None:
        with codecs.open(args.test_filename, "r", "utf-8") as f:
            for sentence_str in f:
                sentence_str = preprocess(sentence_str)
                sentence_list.append(sentence_str)

    if args.test_directory is not None:
        for filename in os.listdir(args.test_directory):
            with codecs.open(os.path.join(args.test_directory, filename), "r",
                             "utf-8") as f:
                for sentence_str in f:
                    sentence_str = preprocess(sentence_str)
                    sentence_list.append(sentence_str)

    # The reference segmentation is produced by MeCab (with the NEologd dictionary if given)
    tagger = MeCab.Tagger() if args.neologd_path is None else MeCab.Tagger(
        "-d " + args.neologd_path)
    tagger.parse("")  # parse an empty string once to work around a MeCab binding bug
    for sentence_str in sentence_list:
        sentence_str = sentence_str.strip()
        m = tagger.parseToNode(sentence_str)  # morphological analysis
        words_true = []
        while m:
            word = m.surface
            if len(word) > 0:
                words_true.append(word)
            m = m.next
        if len(words_true) > 0:
            words_npycrf = npycrf.parse(sentence_str, dictionary)
            words_npylm = npylm.parse(sentence_str, dictionary)
            print("MeCab+NEologd")
            print(" / ".join(words_true))
            print("NPYCRF")
            print(" / ".join(words_npycrf))
            print("NPYLM")
            print(" / ".join(words_npylm))
            print()