Example #1
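
# NOTE: the original snippet omits its imports and module-level settings; the
# block below is a minimal sketch of them.  The module paths for the
# project-local classes are assumptions (only proposal_util appears in the
# source), and the globals listed at the end must be defined elsewhere.
import os
import pickle
import unicodedata

import numpy as np
import chainer
from chainer import optimizers, serializers
from nltk.tokenize import word_tokenize   # used when args.lang == 'en'

# from seq2seq import Seq2Seq, MultiTaskSeq2Seq                     # path assumed
# from util import ConvCorpus, remove_extra_padding, parse_ja_text  # path assumed
# from proposal_util import ProposalConvCorpus

# args, batchsize, n_epoch, feature_num, hidden_num, label_num, label_embed,
# data_file, LABEL_NUM, TOPIC_NUM and xp (numpy or cupy, depending on args.gpu)
# are assumed to be set at module level before these functions run.
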
def main():

    ###########################
    #### create dictionary ####
    ###########################

    if os.path.exists('./data/corpus/dictionary.dict'):
        if args.lang == 'ja':
            corpus = ProposalConvCorpus(file_path=None,
                                        batch_size=batchsize,
                                        size_filter=True)
        else:
            corpus = ConvCorpus(file_path=None,
                                batch_size=batchsize,
                                size_filter=True)
        corpus.load(load_dir='./data/corpus/')
    else:
        if args.lang == 'ja':
            corpus = ProposalConvCorpus(file_path=data_file,
                                        batch_size=batchsize,
                                        size_filter=True)
        else:
            corpus = ConvCorpus(file_path=data_file,
                                batch_size=batchsize,
                                size_filter=True)
        corpus.save(save_dir='./data/corpus/')
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('Emotion size: ', len(corpus.emotion_set))

    # find word_threshold (the ID boundary between general and emotional words)
    ma = 0
    mi = 999999
    for word in corpus.emotion_set:
        wid = corpus.dic.token2id[word]
        if wid > ma:
            ma = wid
        if wid < mi:
            mi = wid
    # print(corpus.dic.token2id['<start>'], corpus.dic.token2id['<eos>'], corpus.dic.token2id['happy'], mi, ma)
    word_threshold = mi
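    # word_threshold is the smallest dictionary ID among the emotion words; it is
    # passed to model.decode() as word_th, presumably so the decoder can split the
    # vocabulary into general IDs (< threshold) and emotional IDs (>= threshold).
    # This assumes the emotion words occupy a contiguous ID range after the
    # general vocabulary.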

    ######################
    #### create model ####
    ######################

    model = Seq2Seq(all_vocab_size=len(corpus.dic.token2id),
                    emotion_vocab_size=len(corpus.emotion_set),
                    feature_num=feature_num,
                    hidden_num=hidden_num,
                    batch_size=batchsize,
                    label_num=label_num,
                    label_embed_num=label_embed,
                    gpu_flg=args.gpu)

    if args.gpu >= 0:
        model.to_gpu()
    optimizer = optimizers.Adam(alpha=0.001)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001))

    ##########################
    #### create ID corpus ####
    ##########################

    input_mat = []
    output_mat = []
    input_mat_rev = []
    label_mat = []
    max_input_ren = max_output_ren = 0
    print('start making corpus matrix...')
    for input_text, output_text in zip(corpus.rough_posts, corpus.rough_cmnts):

        # append an <eos> tag to the end of the output (the input is reversed further below)
        output_text.append(corpus.dic.token2id["<eos>"])

        # update max sentence length
        max_input_ren = max(max_input_ren, len(input_text))
        max_output_ren = max(max_output_ren, len(output_text))

        # make a list of lists
        input_mat.append(input_text)
        output_mat.append(output_text)

        # make label lists  TODO: 3-class classification
        n_num = p_num = 0
        for word in output_text:
            if corpus.dic[word] in corpus.neg_words:
                n_num += 1
            if corpus.dic[word] in corpus.pos_words:
                p_num += 1
        if (n_num + p_num) == 0:
            label_mat.append([1 for _ in range(len(output_text))])
        elif n_num <= p_num:
            label_mat.append([2 for _ in range(len(output_text))])
        elif n_num > p_num:
            label_mat.append([0 for _ in range(len(output_text))])
        else:
            raise ValueError

    # make reverse corpus
    for input_text in input_mat:
        input_mat_rev.append(input_text[::-1])
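    # the reversed inputs are fed to the encoder together with the originals
    # (model.encode(input_batch, input_batch_rev) below), so each post is read
    # in both directions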

    # padding (insert padding at the head of the input and at the tail of the output)
    print('start padding...')
    for li in input_mat:
        insert_num = max_input_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])
    for li in output_mat:
        insert_num = max_output_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])
    for li in input_mat_rev:
        insert_num = max_input_ren - len(li)
        for _ in range(insert_num):
            li.insert(0, corpus.dic.token2id['<pad>'])
    for li in label_mat:
        insert_num = max_output_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])
    if len(output_mat) != len(label_mat):
        print('Output matrix and label matrix should have the same dimension.')
        raise ValueError

    # create batch matrix
    print('transpose...')
    input_mat = np.array(input_mat, dtype=np.int32).T
    input_mat_rev = np.array(input_mat_rev, dtype=np.int32).T
    output_mat = np.array(output_mat, dtype=np.int32).T
    label_mat = np.array(label_mat, dtype=np.int32).T
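    # after transposing, each matrix has shape (max_sentence_len, n_pairs): rows
    # are time steps and columns are examples, so a mini-batch is a column slice
    # such as input_mat[:, perm[i:i + batchsize]]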

    # separate corpus into Train and Test  TODO: split into training and test data for the experiments
    print('split train and test...')
    train_input_mat = input_mat
    train_output_mat = output_mat
    train_input_mat_rev = input_mat_rev
    train_label_mat = label_mat

    #############################
    #### train seq2seq model ####
    #############################

    accum_loss = 0
    train_loss_data = []
    print('start training...')
    for num, epoch in enumerate(range(n_epoch)):
        total_loss = 0
        batch_num = 0
        perm = np.random.permutation(len(corpus.rough_posts))

        # for training
        for i in range(0, len(corpus.rough_posts), batchsize):

            # select batch data
            input_batch = remove_extra_padding(
                train_input_mat[:, perm[i:i + batchsize]], reverse_flg=False)
            input_batch_rev = remove_extra_padding(
                train_input_mat_rev[:, perm[i:i + batchsize]],
                reverse_flg=True)
            output_batch = remove_extra_padding(
                train_output_mat[:, perm[i:i + batchsize]], reverse_flg=False)
            label_batch = remove_extra_padding(
                train_label_mat[:, perm[i:i + batchsize]], reverse_flg=False)

            # Encode a sentence
            model.initialize(
                batch_size=input_batch.shape[1])  # initialize cell
            model.encode(input_batch, input_batch_rev,
                         train=True)  # encode (output: hidden Variable)

            # Decode from encoded context
            input_ids = xp.array(
                [corpus.dic.token2id["<start>"] for _ in range(input_batch.shape[1])])
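            # decode one step per output token; the gold token of the previous step
            # is fed back as the next decoder input (teacher forcing)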
            for w_ids, l_ids in zip(output_batch, label_batch):
                loss, predict_mat = model.decode(input_ids,
                                                 w_ids,
                                                 label_id=l_ids,
                                                 word_th=word_threshold,
                                                 train=True)
                input_ids = w_ids
                accum_loss += loss

            # learn model
            model.cleargrads()  # initialize all grad to zero
            accum_loss.backward()  # back propagation
            optimizer.update()
            total_loss += float(accum_loss.data)
            batch_num += 1
            print('Epoch: ', num, 'Batch_num', batch_num,
                  'batch loss: {:.2f}'.format(float(accum_loss.data)))
            accum_loss = 0

        train_loss_data.append(float(total_loss / batch_num))

        # save model and optimizer
        print('-----', epoch + 1, ' times -----')
        print('save the model and optimizer')
        serializers.save_hdf5('data/' + str(epoch) + '.model', model)
        serializers.save_hdf5('data/' + str(epoch) + '.state', optimizer)

    # save loss data
    with open('./data/loss_train_data.pkl', 'wb') as f:
        pickle.dump(train_loss_data, f)
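

# The 3-class sentiment rule used above (and repeated in the functions below)
# can be summarised by the following sketch.  The helper name is hypothetical
# and the function is not called anywhere in the original code.
def sentiment_label_of(word_ids, corpus):
    """Return 0 (negative), 1 (neutral) or 2 (positive) for a token-ID sequence."""
    n_num = sum(1 for w_id in word_ids if corpus.dic[w_id] in corpus.neg_words)
    p_num = sum(1 for w_id in word_ids if corpus.dic[w_id] in corpus.pos_words)
    if n_num + p_num == 0:
        return 1                       # no sentiment words -> neutral
    return 2 if p_num >= n_num else 0  # ties count as positive, otherwise negative

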
def test_run(data_path, model_path, n_show=80):
    """
    Test function.
    The input is training data.
    The output should be the sentence that was the reference response in the training phase.
    :return:
    """

    # call dictionary class
    if args.lang == 'en':
        corpus = ConvCorpus(file_path=None)
    elif args.lang == 'ja':
        corpus = ProposalConvCorpus(file_path=None)
    else:
        print(
            'You gave a wrong argument to this system. Check your language argument.'
        )
        raise ValueError
    corpus.load(load_dir=data_path)
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('')

    # rebuild seq2seq model
    model = Seq2Seq(all_vocab_size=len(corpus.dic.token2id),
                    emotion_vocab_size=len(corpus.emotion_set),
                    feature_num=args.feature_num,
                    hidden_num=args.hidden_num,
                    label_num=args.label_num,
                    label_embed_num=args.label_embed,
                    batch_size=1,
                    gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)

    # run an interpreter
    for num, input_sentence in enumerate(corpus.rough_posts):
        id_sequence = input_sentence.copy()
        input_sentence_rev = input_sentence[::-1]

        # determine the sentiment label  TODO: 3-class classification
        n_num = p_num = 0
        for word in corpus.rough_cmnts[num]:
            if corpus.dic[word] in corpus.neg_words:
                n_num += 1
            if corpus.dic[word] in corpus.pos_words:
                p_num += 1
        if (n_num + p_num) == 0:
            label_id = 1
        elif n_num <= p_num:
            label_id = 2
        elif n_num > p_num:
            label_id = 0
        else:
            raise ValueError

        # generate an output
        model.initialize(batch_size=1)  # initialize cell
        sentence = model.generate(input_sentence,
                                  input_sentence_rev,
                                  sentence_limit=len(input_sentence) + 20,
                                  label_id=label_id,
                                  word2id=corpus.dic.token2id,
                                  id2word=corpus.dic)
        print("teacher : ",
              " ".join([corpus.dic[w_id] for w_id in id_sequence]), label_id)
        print("correct :",
              " ".join([corpus.dic[w_id] for w_id in corpus.rough_cmnts[num]]))
        print("-> ", sentence)
        print('')

        if num == n_show:
            break
def interpreter(data_path, model_path):
    """
    Run this function if you want to talk to the seq2seq model.
    Type "exit" to end the conversation.
    :param data_path: path of the corpus the model was trained on
    :param model_path: path of the trained model
    :return:
    """
    # call dictionary class
    if args.lang == 'en':
        corpus = ConvCorpus(file_path=None)
    elif args.lang == 'ja':
        corpus = ProposalConvCorpus(file_path=None)
    else:
        print(
            'You gave a wrong argument to this system. Check your language argument.'
        )
        raise ValueError
    corpus.load(load_dir=data_path)
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('')

    # rebuild seq2seq model
    model = Seq2Seq(all_vocab_size=len(corpus.dic.token2id),
                    emotion_vocab_size=len(corpus.emotion_set),
                    feature_num=args.feature_num,
                    hidden_num=args.hidden_num,
                    label_num=args.label_num,
                    label_embed_num=args.label_embed,
                    batch_size=1,
                    gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)
    label_index = list(range(LABEL_NUM))

    # run conversation system
    print('The system is ready to run, please talk to me!')
    print('( If you want to end a talk, please type "exit". )')
    print('')
    while True:
        print('>> ', end='')
        sentence = input()
        if sentence == 'exit':
            print('See you again!')
            break

        # check a sentiment tag
        input_vocab = sentence.split(' ')
        label_id = input_vocab.pop(-1)
        label_false_flg = 1
        for index in label_index:
            if label_id == str(index):
                label_id = index  # TODO: note the label indices; currently 3-class classification (0, 1, 2)
                label_false_flg = 0
                break
        if label_false_flg:
            print('caution: you did not set a valid tag!')
            input_vocab = sentence.split(' ')
            label_id = -1

        if args.lang == 'en':
            input_vocab = [
                unicodedata.normalize('NFKC', word.lower())
                for word in word_tokenize(sentence)
            ]
        elif args.lang == 'ja':
            input_vocab = [
                unicodedata.normalize('NFKC', word.lower())
                for word in parse_ja_text(sentence)
            ]

        input_vocab_rev = input_vocab[::-1]

        # convert word into ID
        input_sentence = [
            corpus.dic.token2id[word] for word in input_vocab
            if corpus.dic.token2id.get(word) is not None
        ]
        input_sentence_rev = [
            corpus.dic.token2id[word] for word in input_vocab_rev
            if corpus.dic.token2id.get(word) is not None
        ]

        model.initialize(batch_size=1)
        if args.beam_search:
            hypotheses = model.beam_search(
                model.initial_state_function,
                model.generate_function,
                X=input_sentence,
                X_rev=input_sentence_rev,
                start_id=corpus.dic.token2id['<start>'],
                end_id=corpus.dic.token2id['<eos>'],
                label_id=label_id)
            for hypothesis in hypotheses:
                generated_indices = hypothesis.to_sequence_of_values()
                generated_tokens = [corpus.dic[i] for i in generated_indices]
                print("--> ", " ".join(generated_tokens))
        else:
            sentence = model.generate(input_sentence,
                                      input_sentence_rev,
                                      sentence_limit=len(input_sentence) + 20,
                                      label_id=label_id,
                                      word2id=corpus.dic.token2id,
                                      id2word=corpus.dic)
        print("-> ", sentence)
        print('')
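
# Usage note (inferred from the parsing above): the last space-separated token of
# the typed sentence is read as the sentiment tag (0 = negative, 1 = neutral,
# 2 = positive); without a valid tag, label_id falls back to -1.
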
def test_run(data_path, model_path, n_show=80):
    """
    Test function.
    The input is training data.
    The output should be the sentence that was the reference response in the training phase.
    :return:
    """

    # call dictionary class
    if args.lang == 'en':
        corpus = ConvCorpus(file_path=None)
    elif args.lang == 'ja':
        corpus = ProposalConvCorpus(file_path=None)
    else:
        print(
            'You gave a wrong argument to this system. Check your language argument.'
        )
        raise ValueError
    corpus.load(load_dir=data_path)
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('')

    # rebuild seq2seq model
    model = MultiTaskSeq2Seq(all_vocab_size=len(corpus.dic.token2id),
                             emotion_vocab_size=len(corpus.emotion_set),
                             feature_num=args.feature_num,
                             hidden_num=args.hidden_num,
                             label_num=args.label_num,
                             label_embed_num=args.label_embed,
                             batch_size=1,
                             gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)

    # run an interpreter
    for num, input_sentence in enumerate(corpus.fine_posts):
        id_sequence = input_sentence.copy()
        input_sentence_rev = input_sentence[::-1]

        # determine the topic and sentiment labels  TODO: 3-class classification
        n_num = p_num = 0
        topic_label_id = correct_emo_label = -1
        for index, w_id in enumerate(corpus.fine_cmnts[num]):
            # the topic label is assumed to be attached at the beginning of the comment
            if index == 0:
                if w_id == corpus.dic.token2id['__label__0']:
                    topic_label_id = 0
                elif w_id == corpus.dic.token2id['__label__1']:
                    topic_label_id = 1
                else:
                    print('no label error: ', w_id)
                    raise ValueError
            # check whether the word is positive or negative
            else:
                if corpus.dic[w_id] in corpus.neg_words:
                    n_num += 1
                if corpus.dic[w_id] in corpus.pos_words:
                    p_num += 1
        if (n_num + p_num) == 0:
            correct_emo_label = 1
        elif n_num <= p_num:
            correct_emo_label = 2
        elif n_num > p_num:
            correct_emo_label = 0
        else:
            raise ValueError

        # generate an output
        print("input : ", " ".join([corpus.dic[w_id] for w_id in id_sequence]))
        print("train emotion label: ", correct_emo_label)
        print(
            "correct :", " ".join([
                corpus.dic[w_id]
                for index, w_id in enumerate(corpus.fine_cmnts[num])
                if index != 0
            ]))
        print(input_sentence)
        print(input_sentence_rev)
        for emo_label in range(LABEL_NUM):
            model.initialize(batch_size=1)
            sentence = model.generate(input_sentence,
                                      input_sentence_rev,
                                      sentence_limit=len(input_sentence) + 20,
                                      emo_label_id=emo_label,
                                      topic_label_id=topic_label_id,
                                      word2id=corpus.dic.token2id,
                                      id2word=corpus.dic)
            if emo_label == 0:
                print("neg -> ", sentence)
            elif emo_label == 1:
                print("neu -> ", sentence)
            else:
                print("pos -> ", sentence)
        print('')

        if num == n_show:
            break
def fixed_interpreter(data_path, model_path):
    """
    A variant that does not take an emotion tag from the user; the topic label
    is given as a binary value (0 or 1).
    :param data_path: path of the corpus the model was trained on
    :param model_path: path of the trained model
    :return:
    """
    # call dictionary class
    if args.lang == 'en':
        corpus = ConvCorpus(file_path=None)
    elif args.lang == 'ja':
        corpus = ProposalConvCorpus(file_path=None)
    else:
        print(
            'You gave a wrong argument to this system. Check your language argument.'
        )
        raise ValueError
    corpus.load(load_dir=data_path)
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('')

    # rebuild seq2seq model
    model = MultiTaskSeq2Seq(all_vocab_size=len(corpus.dic.token2id),
                             emotion_vocab_size=len(corpus.emotion_set),
                             feature_num=args.feature_num,
                             hidden_num=args.hidden_num,
                             label_num=args.label_num,
                             label_embed_num=args.label_embed,
                             batch_size=1,
                             gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)
    emo_label_index = list(range(LABEL_NUM))
    topic_label_index = list(range(TOPIC_NUM))

    # run conversation system
    print('The system is ready to run, please talk to me!')
    print('( If you want to end a talk, please type "exit". )')
    print('')
    while True:
        print('>> ', end='')
        sentence = input()
        if sentence == 'exit':
            print('See you again!')
            break

        # check a topic tag
        input_vocab = sentence.split(' ')
        topic_label_id = input_vocab.pop(-1)
        label_false_flg = 1
        for index in topic_label_index:
            if topic_label_id == str(index):
                topic_label_id = index  # TODO: note the label indices; the topic label here is binary (0, 1)
                label_false_flg = 0
                break
        if label_false_flg:
            print('caution: you did not set a valid tag!')
            input_vocab = sentence.split(' ')
            topic_label_id = -1

        if args.lang == 'en':
            input_vocab = [
                unicodedata.normalize('NFKC', word.lower())
                for word in word_tokenize(sentence)
            ]
        elif args.lang == 'ja':
            input_vocab = [
                unicodedata.normalize('NFKC', word.lower())
                for word in parse_ja_text(sentence)
            ]

        input_vocab.pop(-1)
        input_vocab_rev = input_vocab[::-1]

        # convert word into ID
        input_sentence = [
            corpus.dic.token2id[word] for word in input_vocab
            if corpus.dic.token2id.get(word) is not None
        ]
        input_sentence_rev = [
            corpus.dic.token2id[word] for word in input_vocab_rev
            if corpus.dic.token2id.get(word) is not None
        ]

        model.initialize(batch_size=1)
        for emo_label in range(LABEL_NUM):
            sentence = model.generate(input_sentence,
                                      input_sentence_rev,
                                      sentence_limit=len(input_sentence) + 20,
                                      emo_label_id=emo_label,
                                      topic_label_id=topic_label_id,
                                      word2id=corpus.dic.token2id,
                                      id2word=corpus.dic)
            if emo_label == 0:
                print("neg -> ", sentence)
            elif emo_label == 1:
                print("neu -> ", sentence)
            elif emo_label == 2:
                print("pos -> ", sentence)
            else:
                raise ValueError
        print('')
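
# Usage note (inferred from the parsing above): here the trailing token of the
# typed sentence is read as the topic label (0 or 1, i.e. __label__0 / __label__1),
# the emotion tag is not typed by the user, and one reply is generated per
# emotion label (neg / neu / pos).
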
def check_corpus():
    """
    Collects corpus statistics used in the discussion chapter.
    Counts the numbers of positive, negative and neutral sentences.
    :return:
    """
    from proposal_util import ProposalConvCorpus

    DATA_DIR = './proposal_model/data/corpus/'

    # call dictionary class
    corpus = ProposalConvCorpus(file_path=None)
    corpus.load(load_dir=DATA_DIR)
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('')

    # check data
    emotion_freq = {
        'baseball': {
            'pos': 0,
            'neg': 0,
            'neu': 0
        },
        'pokemongo': {
            'pos': 0,
            'neg': 0,
            'neu': 0
        }
    }
    for num, input_sentence in enumerate(corpus.fine_posts):
        n_num = p_num = 0
        topic_label = ''
        for index, w_id in enumerate(corpus.fine_cmnts[num]):
            # the topic label is assumed to be attached at the beginning of the comment
            if index == 0:
                if w_id == corpus.dic.token2id['__label__0']:
                    topic_label = 'baseball'
                elif w_id == corpus.dic.token2id['__label__1']:
                    topic_label = 'pokemongo'
                else:
                    print('no label error: ', w_id)
                    raise ValueError
            # check whether the word is positive or negative
            else:
                if corpus.dic[w_id] in corpus.neg_words:
                    n_num += 1
                if corpus.dic[w_id] in corpus.pos_words:
                    p_num += 1
        if (n_num + p_num) == 0:
            emotion_freq[topic_label]['neu'] += 1
        elif n_num <= p_num:
            emotion_freq[topic_label]['pos'] += 1
        elif n_num > p_num:
            emotion_freq[topic_label]['neg'] += 1
        else:
            raise ValueError
        if num % 10000 == 0:
            print(num, 'end...')
    print(emotion_freq)
    for topic in emotion_freq:
        total = 0
        for tag in emotion_freq[topic]:
            total += emotion_freq[topic][tag]
        print(topic, ':', total)

    emotion_freq = {'pos': 0, 'neg': 0, 'neu': 0}
    for num, input_sentence in enumerate(corpus.rough_posts):
        n_num = p_num = 0
        for index, w_id in enumerate(corpus.rough_cmnts[num]):
            if corpus.dic[w_id] in corpus.neg_words:
                n_num += 1
            if corpus.dic[w_id] in corpus.pos_words:
                p_num += 1
        if (n_num + p_num) == 0:
            emotion_freq['neu'] += 1
        elif n_num <= p_num:
            emotion_freq['pos'] += 1
        elif n_num > p_num:
            emotion_freq['neg'] += 1
        else:
            raise ValueError
        if num % 100000 == 0:
            print(num, 'end...')

    print(emotion_freq)
    total = 0
    for tag in emotion_freq:
        total += emotion_freq[tag]
    print('rough texts: ', total)
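

# A minimal command-line entry-point sketch.  The original file does not show its
# argument parsing or the constants LABEL_NUM / TOPIC_NUM, so every option name,
# default value and path below is an assumption for illustration only.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='emotion-conditioned seq2seq (sketch)')
    parser.add_argument('--lang', default='ja', choices=['ja', 'en'])
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--feature_num', type=int, default=256)
    parser.add_argument('--hidden_num', type=int, default=512)
    parser.add_argument('--label_num', type=int, default=3)
    parser.add_argument('--label_embed', type=int, default=32)
    parser.add_argument('--beam_search', action='store_true')
    args = parser.parse_args()

    LABEL_NUM = 3   # negative / neutral / positive (assumed)
    TOPIC_NUM = 2   # __label__0 / __label__1 (assumed)

    # e.g. print corpus statistics; main(), test_run(), interpreter() or
    # fixed_interpreter() would be dispatched here in the full script
    check_corpus()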