Example #1
def show_heatmap(data_path, model_path):
    import seaborn
    import matplotlib
    import matplotlib.pyplot as plt
    seaborn.set()
    matplotlib.rc('font', family='sans-serif')

    # call dictionary class
    if args.lang == 'en':
        corpus = ConvCorpus(file_path=None)
        corpus.load(load_dir=data_path)
    elif args.lang == 'ja':
        corpus = JaConvCorpus(file_path=None)
        corpus.load(load_dir=data_path)
    else:
        print(
            'Invalid language argument. Set --lang to "en" or "ja".'
        )
        raise ValueError
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('')

    # rebuild seq2seq model
    model = Seq2Seq(len(corpus.dic.token2id),
                    feature_num=args.feature_num,
                    hidden_num=args.hidden_num,
                    label_num=args.label_num,
                    label_embed_num=args.label_embed,
                    batch_size=1,
                    gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)

    # sentiment matrix in the decoder
    sentiment_mat = model.dec.le.W.data
    cmap = seaborn.diverging_palette(
        220, 10, as_cmap=True)  # Generate a custom diverging colormap
    seaborn.heatmap(sentiment_mat,
                    cmap=cmap,
                    center=0,
                    linewidths=.5,
                    xticklabels=False
                    )  # square=True, cbar_kws={"orientation": "horizontal"})
    plt.xlabel("Dimension (=" + str(sentiment_mat.shape[1]) + ")")
    plt.ylabel("Sentiment")
    plt.savefig('./data/sentiment_matrix.png')

    # encoder embedding matrix
    plt.figure()  # start a new figure so this heatmap is not drawn over the previous one
    encode_mat = model.enc.xe.W.data
    seaborn.heatmap(encode_mat,
                    cmap=cmap,
                    center=0,
                    linewidths=.5,
                    xticklabels=False
                    )  # square=True, cbar_kws={"orientation": "horizontal"})
    plt.xlabel("Dimension (=" + str(encode_mat.shape[1]) + ")")
    plt.ylabel("Word ID")
    plt.savefig('./data/encode_matrix.png')  # save to a separate file so the sentiment heatmap is not overwritten
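The function above reads a module-level `args` object. A minimal sketch of how it might be driven from the command line, assuming the same argument names the example uses (`lang`, `feature_num`, `hidden_num`, `label_num`, `label_embed`, `gpu`); the default values and file paths below are placeholders only.

import argparse

parser = argparse.ArgumentParser(description='visualize decoder sentiment / encoder embedding matrices')
parser.add_argument('--lang', default='en', choices=['en', 'ja'])
parser.add_argument('--feature_num', type=int, default=256)    # placeholder default
parser.add_argument('--hidden_num', type=int, default=512)     # placeholder default
parser.add_argument('--label_num', type=int, default=2)        # placeholder default
parser.add_argument('--label_embed', type=int, default=64)     # placeholder default
parser.add_argument('--gpu', type=int, default=-1)             # -1 means CPU
args = parser.parse_args()

if __name__ == '__main__':
    # paths are placeholders; point them at your own saved corpus and trained model
    show_heatmap(data_path='./data/corpus/', model_path='./data/9.model')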
Example #2
def main():

    ###########################
    #### create dictionary ####
    ###########################

    if os.path.exists('./data/corpus/dictionary.dict'):
        if args.lang == 'ja':
            corpus = JaConvCorpus(file_path=None, batch_size=batchsize, size_filter=True)
        else:
            corpus = ConvCorpus(file_path=None, batch_size=batchsize)
        corpus.load(load_dir='./data/corpus/')
    else:
        if args.lang == 'ja':
            corpus = JaConvCorpus(file_path=data_file, batch_size=batchsize, size_filter=True)
        else:
            corpus = ConvCorpus(file_path=data_file, batch_size=batchsize)
        corpus.save(save_dir='./data/corpus/')
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))

    ######################
    #### create model ####
    ######################

    model = Seq2Seq(len(corpus.dic.token2id), feature_num=feature_num,
                    hidden_num=hidden_num, label_num=label_num,
                    label_embed_num=label_embed_num,
                    batch_size=batchsize, gpu_flg=args.gpu)
    if args.gpu >= 0:
        model.to_gpu()
    optimizer = optimizers.Adam(alpha=0.001)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(5))
    # optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001))

    ##########################
    #### create ID corpus ####
    ##########################

    input_mat = []
    output_mat = []
    label_mat = []
    max_input_ren = max_output_ren = 0

    for input_text, output_text in zip(corpus.posts, corpus.cmnts):

        # convert to list
        # input_text.reverse()                                  # encode words in a reverse order
        # input_text.insert(0, corpus.dic.token2id["<eos>"])
        output_text.append(corpus.dic.token2id["<eos>"])

        # update max sentence length
        max_input_ren = max(max_input_ren, len(input_text))
        max_output_ren = max(max_output_ren, len(output_text))

        # build a list of lists (TODO: needs refactoring)
        label = input_text.pop(-1)
        if label == corpus.dic.token2id["__label__1"]:
            label_mat.append([0 for _ in range(len(output_text))])
        elif label == corpus.dic.token2id["__label__2"]:
            label_mat.append([1 for _ in range(len(output_text))])
        else:
            print('label error!: ', label)
            raise ValueError
        input_mat.append(input_text)
        output_mat.append(output_text)

    # padding
    for li in input_mat:
        insert_num = max_input_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])             # when keeping the original input order
            # li.insert(0, corpus.dic.token2id['<pad>'])        # when reversing the input order
    for li in output_mat:
        insert_num = max_output_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])
    for li in label_mat:
        insert_num = max_output_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])

    # create batch matrix
    input_mat = np.array(input_mat, dtype=np.int32).T
    output_mat = np.array(output_mat, dtype=np.int32).T
    label_mat = np.array(label_mat, dtype=np.int32).T

    # separate corpus into Train and Test
    perm = np.random.permutation(len(corpus.posts))
    test_input_mat = input_mat[:, perm[0:0 + testsize]]
    test_output_mat = output_mat[:, perm[0:0 + testsize]]
    test_label_mat = label_mat[:, perm[0:0 + testsize]]
    train_input_mat = input_mat[:, perm[testsize:]]
    train_output_mat = output_mat[:, perm[testsize:]]
    train_label_mat = label_mat[:, perm[testsize:]]

    list_of_references = []
    for text_ndarray in test_output_mat.T:
        reference = text_ndarray.tolist()
        references = [[w_id for w_id in reference if w_id != -1]]
        list_of_references.append(references)

    #############################
    #### train seq2seq model ####
    #############################

    accum_loss = 0
    train_loss_data = []
    test_loss_data = []
    bleu_score_data = []
    wer_score_data = []
    for num, epoch in enumerate(range(n_epoch)):
        total_loss = test_loss = 0
        batch_num = 0
        perm = np.random.permutation(len(corpus.posts) - testsize)

        # for training
        for i in range(0, len(corpus.posts) - testsize, batchsize):

            # select batch data
            input_batch = train_input_mat[:, perm[i:i + batchsize]]
            output_batch = train_output_mat[:, perm[i:i + batchsize]]
            label_batch = train_label_mat[:, perm[i:i + batchsize]]

            # Encode a sentence
            model.initialize()                     # initialize cell
            model.encode(input_batch, train=True)  # encode (output: hidden Variable)

            # Decode from encoded context
            next_ids = xp.array([corpus.dic.token2id["<eos>"] for _ in range(batchsize)])
            accum_loss = 0
            for w_ids, l_ids in zip(output_batch, label_batch):
                loss, predict_mat = model.decode(next_ids, w_ids, l_ids, train=True)
                next_ids = w_ids
                accum_loss += loss

            # learn model
            model.cleargrads()     # initialize all grad to zero
            accum_loss.backward()  # back propagation
            optimizer.update()
            total_loss += float(accum_loss.data)
            batch_num += 1
            print('Epoch: ', num, 'Batch_num', batch_num, 'batch loss: {:.2f}'.format(float(accum_loss.data)))

        # for testing
        list_of_hypotheses = []
        for i in range(0, testsize, batchsize):

            # select test batch data
            input_batch = test_input_mat[:, i:i + batchsize]
            output_batch = test_output_mat[:, i:i + batchsize]
            label_batch = test_label_mat[:, i:i + batchsize]

            # Encode a sentence
            model.initialize()                     # initialize cell
            model.encode(input_batch, train=True)  # encode (output: hidden Variable)

            # Decode from encoded context
            next_ids = xp.array([corpus.dic.token2id["<start>"] for _ in range(batchsize)])
            if args.gpu >= 0:
                hypotheses = [cuda.to_cpu(next_ids)]
            else:
                hypotheses = [next_ids]
            for w_ids, l_ids in zip(output_batch, label_batch):
                loss, predict_mat = model.decode(next_ids, w_ids, l_ids, train=True)
                next_ids = xp.argmax(predict_mat.data, axis=1)
                test_loss += loss.data
                if args.gpu >= 0:
                    hypotheses.append(cuda.to_cpu(next_ids))
                else:
                    hypotheses.append(next_ids)

            # collect hypotheses for calculating BLEU score
            hypotheses = np.array(hypotheses).T
            for hypothesis in hypotheses:
                text_list = hypothesis.tolist()
                list_of_hypotheses.append([w_id for w_id in text_list if w_id != -1])

        # calculate BLEU score from test (develop) data
        bleu_score = nltk.translate.bleu_score.corpus_bleu(list_of_references, list_of_hypotheses,
                                                           weights=(0.25, 0.25, 0.25, 0.25))
        bleu_score_data.append(bleu_score)
        print('Epoch: ', num, 'BLEU SCORE: ', bleu_score)

        # calculate WER score from test (develop) data
        wer_score = 0
        for index, references in enumerate(list_of_references):
            wer_score += wer(references[0], list_of_hypotheses[index])
        wer_score /= len(list_of_references)
        wer_score_data.append(wer_score)
        print('Epoch: ', num, 'WER SCORE: ', wer_score)

        # save model and optimizer
        if (epoch + 1) % 10 == 0:
            print('-----', epoch + 1, ' times -----')
            print('save the model and optimizer')
            serializers.save_hdf5('data/' + str(epoch) + '.model', model)
            serializers.save_hdf5('data/' + str(epoch) + '.state', optimizer)

        # display the on-going status
        print('Epoch: ', num,
              'Train loss: {:.2f}'.format(total_loss),
              'Test loss: {:.2f}'.format(float(test_loss)))
        train_loss_data.append(float(total_loss / batch_num))
        test_loss_data.append(float(test_loss))

        # evaluate a test loss
        check_loss = test_loss_data[-10:]           # check out the last 10 loss data
        end_flg = [j for j in range(len(check_loss) - 1) if check_loss[j] < check_loss[j + 1]]
        if len(end_flg) > 9:
            print('The model is probably over-fitting, so training stops early...')
            break

    # save loss data
    with open('./data/loss_train_data.pkl', 'wb') as f:
        pickle.dump(train_loss_data, f)
    with open('./data/loss_test_data.pkl', 'wb') as f:
        pickle.dump(test_loss_data, f)
    with open('./data/bleu_score_data.pkl', 'wb') as f:
        pickle.dump(bleu_score_data, f)
    with open('./data/wer_score_data.pkl', 'wb') as f:
        pickle.dump(wer_score_data, f)
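As a side note, the pad-and-transpose step in the "create ID corpus" section turns a list of variable-length ID sequences into a time-major (max_len, batch_size) matrix that the training loop consumes row by row. A self-contained toy illustration (the pad ID 0 and the sequences are made up):

import numpy as np

PAD = 0                                             # toy <pad> id; the real code uses corpus.dic.token2id['<pad>']
batch = [[5, 7, 2], [3, 2], [9, 4, 6, 2]]           # toy ID sequences, each ending with a toy <eos>=2
max_len = max(len(s) for s in batch)
padded = [s + [PAD] * (max_len - len(s)) for s in batch]
mat = np.array(padded, dtype=np.int32).T            # shape: (max_len, batch_size)
print(mat.shape)                                    # (4, 3)
for w_ids in mat:                                   # the training loop iterates row by row,
    print(w_ids)                                    # feeding one time step at a time to decode()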
Example #3
def main():

    ###########################
    #### create dictionary ####
    ###########################

    if os.path.exists('./data/corpus/dictionary.dict'):
        if args.lang == 'ja':
            corpus = JaConvCorpus(file_path=None,
                                  batch_size=batchsize,
                                  size_filter=True)
        else:
            corpus = ConvCorpus(file_path=None,
                                batch_size=batchsize,
                                size_filter=True)
        corpus.load(load_dir='./data/corpus/')
    else:
        if args.lang == 'ja':
            corpus = JaConvCorpus(file_path=data_file,
                                  batch_size=batchsize,
                                  size_filter=True)
        else:
            corpus = ConvCorpus(file_path=data_file,
                                batch_size=batchsize,
                                size_filter=True)
        corpus.save(save_dir='./data/corpus/')
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))

    ######################
    #### create model ####
    ######################

    model = Seq2Seq(vocab_size=len(corpus.dic.token2id),
                    feature_num=feature_num,
                    hidden_num=hidden_num,
                    batch_size=batchsize,
                    gpu_flg=args.gpu)
    if args.gpu >= 0:
        model.to_gpu()
    optimizer = optimizers.Adam(alpha=0.001)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(5))

    ##########################
    #### create ID corpus ####
    ##########################

    input_mat = []
    output_mat = []
    input_mat_rev = []
    # output_wp_mat = []

    max_input_ren = max_output_ren = 0
    for input_text, output_text in zip(corpus.posts, corpus.cmnts):

        output_text.append(corpus.dic.token2id["<eos>"])

        # update max sentence length
        max_input_ren = max(max_input_ren, len(input_text))
        max_output_ren = max(max_output_ren, len(output_text))

        input_mat.append(input_text)
        output_mat.append(output_text)

        # # create word prediction matrix
        # wp = []
        # for wid in output_text:
        #     if wid not in wp:
        #         wp.append(wid)
        # output_wp_mat.append(wp)

    # make reverse corpus
    for input_text in input_mat:
        input_mat_rev.append(input_text[::-1])

    # padding
    for li in input_mat:
        insert_num = max_input_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])
    for li in output_mat:
        insert_num = max_output_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])
    for li in input_mat_rev:
        insert_num = max_input_ren - len(li)
        for _ in range(insert_num):
            li.insert(0, corpus.dic.token2id['<pad>'])

    # create batch matrix
    input_mat = np.array(input_mat, dtype=np.int32).T
    input_mat_rev = np.array(input_mat_rev, dtype=np.int32).T
    output_mat = np.array(output_mat, dtype=np.int32).T

    # separate corpus into Train and Test
    perm = np.random.permutation(len(corpus.posts))
    test_input_mat = input_mat[:, perm[0:0 + testsize]]
    test_output_mat = output_mat[:, perm[0:0 + testsize]]
    test_input_mat_rev = input_mat_rev[:, perm[0:0 + testsize]]
    train_input_mat = input_mat[:, perm[testsize:]]
    train_output_mat = output_mat[:, perm[testsize:]]
    train_input_mat_rev = input_mat_rev[:, perm[testsize:]]

    # train_output_wp_mat = []
    # for index in perm[testsize:]:
    #     train_output_wp_mat.append(output_wp_mat[index])

    #############################
    #### train seq2seq model ####
    #############################

    accum_loss = 0
    train_loss_data = []
    for num, epoch in enumerate(range(n_epoch)):
        total_loss = 0
        batch_num = 0
        perm = np.random.permutation(len(corpus.posts) - testsize)

        # for training
        for i in range(0, len(corpus.posts) - testsize, batchsize):

            # select batch data
            input_batch = remove_extra_padding(
                train_input_mat[:, perm[i:i + batchsize]], reverse_flg=False)
            input_batch_rev = remove_extra_padding(
                train_input_mat_rev[:, perm[i:i + batchsize]],
                reverse_flg=True)
            output_batch = remove_extra_padding(
                train_output_mat[:, perm[i:i + batchsize]], reverse_flg=False)
            # output_wp_batch = []
            # for index in perm[i:i + batchsize]:
            #     output_wp_batch.append(train_output_wp_mat[index])
            # output_wp_batch = create_wp_batch(vocab_size=len(corpus.dic.token2id),
            #                                   wp_lists=output_wp_batch)

            # Encode a sentence
            model.initialize(batch_size=input_batch.shape[1])
            model.encode(input_batch, input_batch_rev, train=True)

            # Decode from encoded context
            end_batch = xp.array([
                corpus.dic.token2id["<start>"]
                for _ in range(input_batch.shape[1])
            ])
            first_words = output_batch[0]
            loss, predict_mat = model.decode(end_batch,
                                             first_words,
                                             train=True)
            next_ids = first_words
            accum_loss += loss
            for w_ids in output_batch[1:]:
                loss, predict_mat = model.decode(next_ids, w_ids, train=True)
                next_ids = w_ids
                accum_loss += loss

            # learn model
            model.cleargrads()  # initialize all grad to zero
            accum_loss.backward()  # back propagation
            optimizer.update()
            total_loss += float(accum_loss.data)
            batch_num += 1
            print('Epoch: ', num, 'Batch_num', batch_num,
                  'batch loss: {:.2f}'.format(float(accum_loss.data)))
            accum_loss = 0

        train_loss_data.append(float(total_loss / batch_num))

        # save model and optimizer
        if (epoch + 1) % 5 == 0:
            print('-----', epoch + 1, ' times -----')
            print('save the model and optimizer')
            serializers.save_hdf5('data/' + str(epoch) + '.model', model)
            serializers.save_hdf5('data/' + str(epoch) + '.state', optimizer)

    # save loss data
    with open('./data/loss_train_data.pkl', 'wb') as f:
        pickle.dump(train_loss_data, f)
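This example calls a remove_extra_padding helper that is not shown in the listing. As a rough guess at its behaviour (dropping the time steps that are entirely <pad> from the padded side of a time-major batch), a sketch might look like this; the real implementation may differ:

import numpy as np

PAD_ID = 0  # placeholder; the real code would look this up via corpus.dic.token2id['<pad>']

def remove_extra_padding(batch_mat, reverse_flg=False, pad_id=PAD_ID):
    """Drop time steps that are <pad> for every sequence in the batch.

    batch_mat is time-major, shape (max_len, batch_size). Forward-padded
    batches (reverse_flg=False) are trimmed from the end; reverse-padded
    batches (reverse_flg=True) are trimmed from the front.
    """
    keep = ~np.all(batch_mat == pad_id, axis=1)      # rows with at least one real token
    if reverse_flg:
        first = np.argmax(keep)                      # index of the first non-all-pad row
        return batch_mat[first:]
    last = len(keep) - np.argmax(keep[::-1])         # one past the last non-all-pad row
    return batch_mat[:last]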
Example #4
def main():

    ###########################
    #### create dictionary ####
    ###########################

    if os.path.exists(CORPUS_DIR + 'dictionary.dict'):
        corpus = JaConvCorpus(create_flg=False,
                              batch_size=batchsize,
                              size_filter=True)
        corpus.load(load_dir=CORPUS_DIR)
    else:
        corpus = JaConvCorpus(create_flg=True,
                              batch_size=batchsize,
                              size_filter=True)
        corpus.save(save_dir=CORPUS_DIR)
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('Emotion size: ', len(corpus.emotion_set))

    # search word_threshold (general or emotional)
    ma = 0
    mi = 999999
    for word in corpus.emotion_set:
        wid = corpus.dic.token2id[word]
        if wid > ma:
            ma = wid
        if wid < mi:
            mi = wid
    word_threshold = mi

    ######################
    #### create model ####
    ######################

    model = PreTrainSeq2Seq(all_vocab_size=len(corpus.dic.token2id),
                            emotion_vocab_size=len(corpus.emotion_set),
                            feature_num=feature_num,
                            hidden_num=hidden_num,
                            batch_size=batchsize,
                            label_num=label_num,
                            label_embed_num=label_embed,
                            gpu_flg=args.gpu)

    if args.gpu >= 0:
        model.to_gpu()
    optimizer = optimizers.Adam(alpha=0.001)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001))

    ##########################
    #### create ID corpus ####
    ##########################

    input_mat = []
    output_mat = []
    input_mat_rev = []
    label_mat = []
    max_input_ren = max_output_ren = 0
    print('start making corpus matrix...')
    for input_text, output_text in zip(corpus.rough_posts, corpus.rough_cmnts):

        # add an eos tag (the input side is reversed later, when building input_mat_rev)
        output_text.append(corpus.dic.token2id["<eos>"])  # append <eos> to the end of the output

        # update max sentence length
        max_input_ren = max(max_input_ren, len(input_text))
        max_output_ren = max(max_output_ren, len(output_text))

        # make a list of lists
        input_mat.append(input_text)
        output_mat.append(output_text)

        # make label lists  TODO: currently fixed to 3-class (pos, neg, neu); make the number of classes configurable
        n_num = p_num = 0
        for word in output_text:
            if corpus.dic[word] in corpus.neg_words:
                n_num += 1
            if corpus.dic[word] in corpus.pos_words:
                p_num += 1
        if (n_num + p_num) == 0:
            label_mat.append([1 for _ in range(len(output_text))])
        elif n_num <= p_num:
            label_mat.append([2 for _ in range(len(output_text))])
        elif n_num > p_num:
            label_mat.append([0 for _ in range(len(output_text))])
        else:
            raise ValueError

    # make reverse corpus
    for input_text in input_mat:
        input_mat_rev.append(input_text[::-1])

    # padding (insert <pad> at the head of inputs and the tail of outputs)
    print('start padding...')
    for li in input_mat:
        insert_num = max_input_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])
    for li in output_mat:
        insert_num = max_output_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])
    for li in input_mat_rev:
        insert_num = max_input_ren - len(li)
        for _ in range(insert_num):
            li.insert(0, corpus.dic.token2id['<pad>'])
    for li in label_mat:
        insert_num = max_output_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])
    if len(output_mat) != len(label_mat):
        print('Output matrix and label matrix should have the same dimension.')
        raise ValueError

    # create batch matrix
    print('transpose...')
    input_mat = np.array(input_mat, dtype=np.int32).T
    input_mat_rev = np.array(input_mat_rev, dtype=np.int32).T
    output_mat = np.array(output_mat, dtype=np.int32).T
    label_mat = np.array(label_mat, dtype=np.int32).T

    # separate corpus into Train and Test  TODO: split into training and test data for the real experiments
    print('split train and test...')
    train_input_mat = input_mat
    train_output_mat = output_mat
    train_input_mat_rev = input_mat_rev
    train_label_mat = label_mat

    #############################
    #### train seq2seq model ####
    #############################

    accum_loss = 0
    train_loss_data = []
    print('start training...')
    for num, epoch in enumerate(range(n_epoch)):
        total_loss = 0
        batch_num = 0
        perm = np.random.permutation(len(corpus.rough_posts))

        # for training
        for i in range(0, len(corpus.rough_posts), batchsize):

            # select batch data
            input_batch = remove_extra_padding(
                train_input_mat[:, perm[i:i + batchsize]], reverse_flg=False)
            input_batch_rev = remove_extra_padding(
                train_input_mat_rev[:, perm[i:i + batchsize]],
                reverse_flg=True)
            output_batch = remove_extra_padding(
                train_output_mat[:, perm[i:i + batchsize]], reverse_flg=False)
            label_batch = remove_extra_padding(
                train_label_mat[:, perm[i:i + batchsize]], reverse_flg=False)

            # Encode a sentence
            model.initialize(
                batch_size=input_batch.shape[1])  # initialize cell
            model.encode(input_batch, input_batch_rev,
                         train=True)  # encode (output: hidden Variable)

            # Decode from encoded context
            input_ids = xp.array([
                corpus.dic.token2id["<start>"]
                for _ in range(input_batch.shape[1])
            ])
            for w_ids, l_ids in zip(output_batch, label_batch):
                loss, predict_mat = model.decode(input_ids,
                                                 w_ids,
                                                 label_id=l_ids,
                                                 word_th=word_threshold,
                                                 train=True)
                input_ids = w_ids
                accum_loss += loss

            # learn model
            model.cleargrads()  # initialize all grad to zero
            accum_loss.backward()  # back propagation
            optimizer.update()
            total_loss += float(accum_loss.data)
            batch_num += 1
            print('Epoch: ', num, 'Batch_num', batch_num,
                  'batch loss: {:.2f}'.format(float(accum_loss.data)))
            accum_loss = 0

        train_loss_data.append(float(total_loss / batch_num))

        # save model and optimizer
        print('-----', epoch + 1, ' times -----')
        print('save the model and optimizer')
        serializers.save_hdf5('../data/seq2seq/' + str(epoch) + '_rough.model',
                              model)
        serializers.save_hdf5('../data/seq2seq/' + str(epoch) + '_rough.state',
                              optimizer)

    # save loss data
    with open('./data/loss_train_data.pkl', 'wb') as f:
        pickle.dump(train_loss_data, f)
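The labelling loop above maps each response to 0 (negative), 1 (neutral) or 2 (positive) by counting sentiment words. The same rule, isolated as a small helper with toy word sets (the real code draws the sets from corpus.pos_words / corpus.neg_words):

def sentiment_label(tokens, pos_words, neg_words):
    """Return 0 (negative), 1 (neutral) or 2 (positive) for one response,
    following the counting rule used in the loop above."""
    n_num = sum(1 for w in tokens if w in neg_words)
    p_num = sum(1 for w in tokens if w in pos_words)
    if (n_num + p_num) == 0:
        return 1                      # no sentiment words -> neutral
    return 2 if p_num >= n_num else 0

# toy usage; in the example the tokens are word IDs mapped back through corpus.dic
print(sentiment_label(['this', 'is', 'great', '<eos>'], {'great'}, {'awful'}))  # 2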
Example #5
def interpreter(data_path, model_path):
    """
    Run this function, if you want to talk to seq2seq model.
    if you type "exit", finish to talk.
    :param data_path: the path of corpus you made model learn
    :param model_path: the path of model you made learn
    :return:
    """
    # call dictionary class
    if args.lang == 'en':
        corpus = ConvCorpus(file_path=None)
        corpus.load(load_dir=data_path)
    elif args.lang == 'ja':
        corpus = JaConvCorpus(file_path=None)
        corpus.load(load_dir=data_path)
    else:
        print(
            'Invalid language argument. Set --lang to "en" or "ja".'
        )
        raise ValueError
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('')

    # rebuild seq2seq model
    model = Seq2Seq(len(corpus.dic.token2id),
                    feature_num=args.feature_num,
                    hidden_num=args.hidden_num,
                    label_num=args.label_num,
                    label_embed_num=args.label_embed,
                    batch_size=1,
                    gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)

    # run conversation system
    print('The system is ready to run, please talk to me!')
    print('( If you want to end a talk, please type "exit". )')
    print('')
    while True:
        print('>> ', end='')
        sentence = input()
        if sentence == 'exit':
            print('See you again!')
            break

        # convert to a list
        if args.lang == 'en':
            input_vocab = [
                unicodedata.normalize('NFKC', word.lower())
                for word in word_tokenize(sentence)
            ]
        elif args.lang == 'ja':
            input_vocab = parse_ja_text(sentence)
        else:
            print("Sorry, but your language is not supported...")
            raise ValueError

        # check a sentiment tag
        label_id = -1
        if len(input_vocab) == 0:
            print('caution: you did not enter any words!')
            pass
        elif input_vocab[-1] == '2':
            del input_vocab[-1]
            label_id = 1
        elif input_vocab[-1] == '1':
            del input_vocab[-1]
            label_id = 0
        else:
            print('caution: you did not set a sentiment tag!')
            break

        # input_vocab.reverse()
        # input_vocab.insert(0, "<eos>")

        # convert word into ID
        input_sentence = [
            corpus.dic.token2id[word] for word in input_vocab
            if corpus.dic.token2id.get(word) is not None
        ]

        model.initialize()  # initialize cell
        sentence = model.generate(input_sentence,
                                  sentence_limit=len(input_sentence) + 30,
                                  word2id=corpus.dic.token2id,
                                  id2word=corpus.dic,
                                  label_id=label_id)
        print("-> ", sentence)
        print('')
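The word-to-ID conversion above silently drops out-of-vocabulary tokens. A tiny standalone illustration with a made-up dictionary (NLTK's word_tokenize needs the 'punkt' tokenizer data downloaded):

import unicodedata
from nltk import word_tokenize

token2id = {'hello': 4, 'how': 7, 'are': 8, 'you': 9, '?': 3}   # toy dictionary
sentence = 'Hello, how are you?'

input_vocab = [unicodedata.normalize('NFKC', w.lower()) for w in word_tokenize(sentence)]
input_sentence = [token2id[w] for w in input_vocab if token2id.get(w) is not None]
print(input_sentence)   # ',' is out of vocabulary and silently dropped -> [4, 7, 8, 9, 3]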
Example #6
def calculate_embedding_vectors(data_path, model_path):

    # call dictionary class
    if args.lang == 'en':
        corpus = ConvCorpus(file_path=None)
        corpus.load(load_dir=data_path)
    elif args.lang == 'ja':
        corpus = JaConvCorpus(file_path=None)
        corpus.load(load_dir=data_path)
    else:
        print(
            'Invalid language argument. Set --lang to "en" or "ja".'
        )
        raise ValueError
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('')

    # rebuild seq2seq model
    model = Seq2Seq(len(corpus.dic.token2id),
                    feature_num=args.feature_num,
                    hidden_num=args.hidden_num,
                    label_num=args.label_num,
                    label_embed_num=args.label_embed,
                    batch_size=1,
                    gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)

    # get embedding vectors
    embed_mat = model.dec.ye.W.data
    sentiment_mat = model.dec.le.W.data
    neg_vec = np.array([sentiment_mat[0, :]])
    pos_vec = np.array([sentiment_mat[1, :]])

    # calculate cos similarity
    neg_sim_dic = {}
    pos_sim_dic = {}
    for i in range(embed_mat.shape[0]):
        word_vec = np.array([embed_mat[i, :]])
        neg_sim_dic[i] = cosine_similarity(word_vec, neg_vec)
        pos_sim_dic[i] = cosine_similarity(word_vec, pos_vec)

        # if cosine_similarity(word_vec, pos_vec) > cosine_similarity(word_vec, neg_vec):
        #     print('pos: ', corpus.dic[i])
        # elif cosine_similarity(word_vec, pos_vec) < cosine_similarity(word_vec, neg_vec):
        #     print('neg: ', corpus.dic[i])
        # else:
        #     print('???: ', corpus.dic[i])
        #     raise ValueError

    # sort in descending order
    neg_ordered = collections.OrderedDict(
        sorted(neg_sim_dic.items(), key=lambda x: x[1], reverse=True))
    pos_ordered = collections.OrderedDict(
        sorted(pos_sim_dic.items(), key=lambda x: x[1], reverse=True))

    # show TOP50 words
    print('------- Words most similar to the NEGATIVE tag --------')
    for index, w_index in enumerate(neg_ordered):
        print(corpus.dic[w_index], ': ', neg_ordered[w_index][0, 0])
        if index == 49:
            break
    print('------- Words most similar to the POSITIVE tag --------')
    for index, w_index in enumerate(pos_ordered):
        print(corpus.dic[w_index], ': ', pos_ordered[w_index][0, 0])
        if index == 49:
            break
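The ranking above compares each embedding row against a sentiment vector with scikit-learn's cosine_similarity, which expects 2-D arrays and returns a 2-D array (hence the [0, 0] indexing). A compact toy version:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

embed_mat = np.array([[1.0, 0.0], [0.7, 0.7], [-1.0, 0.1]])   # toy word embeddings
pos_vec = np.array([[1.0, 0.2]])                              # toy "positive" direction

sims = {i: cosine_similarity(embed_mat[i:i + 1], pos_vec)[0, 0] for i in range(embed_mat.shape[0])}
for w_index, score in sorted(sims.items(), key=lambda x: x[1], reverse=True):
    print(w_index, ':', round(score, 3))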
Example #7
def interpreter(data_path, model_path):
    """
    Run this function, if you want to talk to seq2seq model.
    if you type "exit", finish to talk.
    :param data_path: the path of corpus you made model learn
    :param model_path: the path of model you made learn
    :return:
    """
    # call dictionary class
    if args.lang == 'en':
        corpus = ConvCorpus(file_path=None)
        corpus.load(load_dir=data_path)
    elif args.lang == 'ja':
        corpus = JaConvCorpus(file_path=None)
        corpus.load(load_dir=data_path)
    else:
        print(
            'Invalid language argument. Set --lang to "en" or "ja".'
        )
        raise ValueError
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('')

    # rebuild seq2seq model
    model = Seq2Seq(len(corpus.dic.token2id),
                    feature_num=args.feature_num,
                    hidden_num=args.hidden_num,
                    batch_size=1,
                    gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)

    # run conversation system
    print('The system is ready to run, please talk to me!')
    print('( If you want to end a talk, please type "exit". )')
    print('')
    while True:
        print('>> ', end='')
        sentence = input()
        if sentence == 'exit':
            print('See you again!')
            break

        if args.lang == 'en':
            input_vocab = [
                unicodedata.normalize('NFKC', word.lower())
                for word in word_tokenize(sentence)
            ]
        elif args.lang == 'ja':
            input_vocab = [
                unicodedata.normalize('NFKC', word.lower())
                for word in parse_ja_text(sentence)
            ]
        else:
            print("Sorry, but your language is not supported...")
            raise ValueError

        input_vocab_rev = input_vocab[::-1]

        # convert word into ID
        input_sentence = [
            corpus.dic.token2id[word] for word in input_vocab
            if corpus.dic.token2id.get(word) is not None
        ]
        input_sentence_rev = [
            corpus.dic.token2id[word] for word in input_vocab_rev
            if corpus.dic.token2id.get(word) is not None
        ]

        model.initialize(batch_size=1)  # initialize cell
        sentence = model.generate(input_sentence,
                                  input_sentence_rev,
                                  sentence_limit=len(input_sentence) + 30,
                                  word2id=corpus.dic.token2id,
                                  id2word=corpus.dic)
        print("-> ", sentence)
        print('')
def interpreter(data_path, model_path):
    """
    Run this function, if you want to talk to seq2seq model.
    if you type "exit", finish to talk.
    :param data_path: the path of corpus you made model learn
    :param model_path: the path of model you made learn
    :return:
    """
    # call dictionary class
    corpus = JaConvCorpus(create_flg=False)
    corpus.load(load_dir=data_path)
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('')

    # rebuild seq2seq model
    model = FineTuneSeq2Seq(all_vocab_size=len(corpus.dic.token2id), emotion_vocab_size=len(corpus.emotion_set),
                            feature_num=args.feature_num, hidden_num=args.hidden_num,
                            label_num=args.label_num, label_embed_num=args.label_embed, batch_size=1, gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)
    emo_label_index = [index for index in range(args.label_num)]
    topic_label_index = [index for index in range(args.topic_num)]

    # run conversation system
    print('The system is ready to run, please talk to me!')
    print('( If you want to end a talk, please type "exit". )')
    print('')
    while True:
        print('>> ', end='')
        sentence = input()
        if sentence == 'exit':
            print('See you again!')
            break

        # check a sentiment tag
        input_vocab = sentence.split(' ')
        emo_label_id = input_vocab.pop(-1)
        topic_label_id = input_vocab.pop(-1)
        label_false_flg = 1

        for index in emo_label_index:
            if emo_label_id == str(index):
                emo_label_id = index               # TODO: note the label indexing; currently 3-class (0, 1, 2)
                label_false_flg = 0
                break
        if label_false_flg:
            print('caution: you did not set a valid tag! (emotion label)')
            emo_label_id = -1

        # check a topic tag                        # TODO: this should be decided by tweet2vec's prediction, not specified by the user
        label_false_flg = 1
        for index in topic_label_index:
            if topic_label_id == str(index):
                topic_label_id = index             # TODO: note the label indexing; currently 3-class (0, 1, 2)
                label_false_flg = 0
                break
        if label_false_flg:
            print('caution: you did not set a valid tag! (topic label)')
            topic_label_id = -1

        # parse only the remaining words (the two label tokens were already removed above)
        input_vocab = [unicodedata.normalize('NFKC', word.lower()) for word in parse_ja_text(' '.join(input_vocab))]
        input_vocab_rev = input_vocab[::-1]

        # convert word into ID
        input_sentence = [corpus.dic.token2id[word] for word in input_vocab
                          if corpus.dic.token2id.get(word) is not None]
        input_sentence_rev = [corpus.dic.token2id[word] for word in input_vocab_rev
                              if corpus.dic.token2id.get(word) is not None]

        model.initialize(batch_size=1)
        if args.beam_search:
            hypotheses = model.beam_search(model.initial_state_function, model.generate_function,
                                           input_sentence, input_sentence_rev, start_id=corpus.dic.token2id['<start>'],
                                           end_id=corpus.dic.token2id['<eos>'], emo_label_id=emo_label_id,
                                           topic_label_id=topic_label_id)
            for hypothesis in hypotheses:
                generated_indices = hypothesis.to_sequence_of_values()
                generated_tokens = [corpus.dic[i] for i in generated_indices]
                print("--> ", " ".join(generated_tokens))
        else:
            sentence = model.generate(input_sentence, input_sentence_rev, sentence_limit=len(input_sentence) + 20,
                                      emo_label_id=emo_label_id, topic_label_id=topic_label_id,
                                      word2id=corpus.dic.token2id, id2word=corpus.dic)
            print("-> ", sentence)
        print('')
def output_file(data_path, model_path):
    """
    :param data_path: the path of corpus you made model learn
    :param model_path: the path of model you made learn
    :return:
    """
    # call dictionary class
    corpus = JaConvCorpus(create_flg=False)
    corpus.load(load_dir=data_path)
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('')

    # rebuild seq2seq model
    model = FineTuneSeq2Seq(all_vocab_size=len(corpus.dic.token2id), emotion_vocab_size=len(corpus.emotion_set),
                            feature_num=args.feature_num, hidden_num=args.hidden_num,
                            label_num=args.label_num, label_embed_num=args.label_embed, batch_size=1, gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)
    emo_label_index = [index for index in range(args.label_num)]
    topic_label_index = [index for index in range(args.topic_num)]

    # run conversation system
    r_label = re.compile(r"(__label__)([0-9]+)")
    pattern = r"(.+?)(\t)(.+?)(\n|\r\n)"
    r = re.compile(pattern)
    for line in open(T2V_OUTPUT, 'r', encoding='utf-8'):
        m = r.search(line)
        if m is not None:
            topic_label = m.group(1)
            sentence = m.group(3)

            # check a topic tag
            label_info = r_label.search(topic_label)
            if int(label_info.group(2)) < len(topic_label_index):
                topic_label_id = int(label_info.group(2))
            else:
                print('The domain label exceeds the number of domains.')
                raise ValueError

            # parse text by mecab
            input_vocab = [unicodedata.normalize('NFKC', word.lower()) for word in parse_ja_text(sentence)]
            input_vocab_rev = input_vocab[::-1]

            # convert word into ID
            input_sentence = [corpus.dic.token2id[word] for word in input_vocab
                              if corpus.dic.token2id.get(word) is not None]
            input_sentence_rev = [corpus.dic.token2id[word] for word in input_vocab_rev
                                  if corpus.dic.token2id.get(word) is not None]

            print("input -> ", sentence, "domain:", topic_label_id)
            model.initialize(batch_size=1)
            for emo_label in range(LABEL_NUM):
                sentence = model.generate(input_sentence, input_sentence_rev, sentence_limit=len(input_sentence) + 20,
                                          emo_label_id=emo_label, topic_label_id=topic_label_id,
                                          word2id=corpus.dic.token2id, id2word=corpus.dic)
                if emo_label == 0:
                    print("neg -> ", sentence)
                elif emo_label == 1:
                    print("neu -> ", sentence)
                elif emo_label == 2:
                    print("pos -> ", sentence)
                else:
                    raise ValueError
            print('')
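For reference, the two regular expressions in output_file() expect each line of T2V_OUTPUT to carry a "__label__<n>" topic tag, a tab, and the raw sentence, terminated by a newline. A quick standalone check with an illustrative line:

import re

r = re.compile(r"(.+?)(\t)(.+?)(\n|\r\n)")
r_label = re.compile(r"(__label__)([0-9]+)")

line = "__label__2\tこれはテストの文です\n"          # illustrative line only
m = r.search(line)
topic_label, sentence = m.group(1), m.group(3)      # tag field and sentence field
topic_label_id = int(r_label.search(topic_label).group(2))
print(topic_label_id, sentence)                     # 2 これはテストの文です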