Example #1
import os
import pickle

import numpy as np
import chainer
from chainer import optimizers, serializers

# NOTE: ConvCorpus, ProposalConvCorpus, Seq2Seq and remove_extra_padding come
# from the project's own modules, and args, batchsize, data_file, feature_num,
# hidden_num, label_num, label_embed, n_epoch and xp (numpy or cupy, depending
# on args.gpu) are assumed to be set up at module level before main() runs.


def main():

    ###########################
    #### create dictionary ####
    ###########################

    if os.path.exists('./data/corpus/dictionary.dict'):
        if args.lang == 'ja':
            corpus = ProposalConvCorpus(file_path=None,
                                        batch_size=batchsize,
                                        size_filter=True)
        else:
            corpus = ConvCorpus(file_path=None,
                                batch_size=batchsize,
                                size_filter=True)
        corpus.load(load_dir='./data/corpus/')
    else:
        if args.lang == 'ja':
            corpus = ProposalConvCorpus(file_path=data_file,
                                        batch_size=batchsize,
                                        size_filter=True)
        else:
            corpus = ConvCorpus(file_path=data_file,
                                batch_size=batchsize,
                                size_filter=True)
        corpus.save(save_dir='./data/corpus/')
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('Emotion size: ', len(corpus.emotion_set))

    # search word_threshold (the boundary between general and emotional word IDs)
    emotion_ids = [corpus.dic.token2id[word] for word in corpus.emotion_set]
    mi, ma = min(emotion_ids), max(emotion_ids)
    # print(corpus.dic.token2id['<start>'], corpus.dic.token2id['<eos>'], corpus.dic.token2id['happy'], mi, ma)
    word_threshold = mi
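    # NOTE(assumption): word_threshold is later passed to model.decode() as
    # word_th, which suggests the dictionary assigns the emotional words a
    # contiguous ID range starting at mi, above the general vocabulary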

    ######################
    #### create model ####
    ######################

    model = Seq2Seq(all_vocab_size=len(corpus.dic.token2id),
                    emotion_vocab_size=len(corpus.emotion_set),
                    feature_num=feature_num,
                    hidden_num=hidden_num,
                    batch_size=batchsize,
                    label_num=label_num,
                    label_embed_num=label_embed,
                    gpu_flg=args.gpu)

    if args.gpu >= 0:
        model.to_gpu()
    optimizer = optimizers.Adam(alpha=0.001)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001))

    ##########################
    #### create ID corpus ####
    ##########################
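    # four parallel lists of token-ID lists: forward inputs, outputs,
    # reversed inputs, and per-token polarity labels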

    input_mat = []
    output_mat = []
    input_mat_rev = []
    label_mat = []
    max_input_len = max_output_len = 0
    print('start making corpus matrix...')
    for input_text, output_text in zip(corpus.rough_posts, corpus.rough_cmnts):

        # add an <eos> tag to the end of the output
        output_text.append(corpus.dic.token2id["<eos>"])

        # update max sentence length
        max_input_len = max(max_input_len, len(input_text))
        max_output_len = max(max_output_len, len(output_text))

        # make a list of lists
        input_mat.append(input_text)
        output_mat.append(output_text)

        # make label lists TODO: 3-class classification
        # label meaning: 0 = negative, 1 = neutral, 2 = positive
        n_num = p_num = 0
        for word in output_text:
            if corpus.dic[word] in corpus.neg_words:
                n_num += 1
            if corpus.dic[word] in corpus.pos_words:
                p_num += 1
        if (n_num + p_num) == 0:
            label_mat.append([1] * len(output_text))   # neutral
        elif n_num <= p_num:
            label_mat.append([2] * len(output_text))   # positive
        else:
            label_mat.append([0] * len(output_text))   # negative

    # make a reversed copy of each input (the encoder consumes both directions)
    for input_text in input_mat:
        input_mat_rev.append(input_text[::-1])

    # padding (pad the beginning of inputs and the end of outputs)
    print('start padding...')
    pad_id = corpus.dic.token2id['<pad>']
    for li in input_mat:
        li.extend([pad_id] * (max_input_len - len(li)))
    for li in output_mat:
        li.extend([pad_id] * (max_output_len - len(li)))
    for li in input_mat_rev:
        li[:0] = [pad_id] * (max_input_len - len(li))
    for li in label_mat:
        # labels are padded with the <pad> id as well
        li.extend([pad_id] * (max_output_len - len(li)))
    if len(output_mat) != len(label_mat):
        raise ValueError('output matrix and label matrix must have the same length')

    # create batch matrix
    print('transpose...')
    input_mat = np.array(input_mat, dtype=np.int32).T
    input_mat_rev = np.array(input_mat_rev, dtype=np.int32).T
    output_mat = np.array(output_mat, dtype=np.int32).T
    label_mat = np.array(label_mat, dtype=np.int32).T
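    # each matrix now has shape (max_len, n_examples): row t holds the t-th
    # token of every example, so iterating over rows walks one time step at a
    # time across the whole batch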

    # separate corpus into train and test TODO: split into training and test data when running experiments
    print('split train and test...')
    train_input_mat = input_mat
    train_output_mat = output_mat
    train_input_mat_rev = input_mat_rev
    train_label_mat = label_mat

    #############################
    #### train seq2seq model ####
    #############################

    accum_loss = 0
    train_loss_data = []
    print('start training...')
    for epoch in range(n_epoch):
        total_loss = 0
        batch_num = 0
        perm = np.random.permutation(len(corpus.rough_posts))

        # for training
        for i in range(0, len(corpus.rough_posts), batchsize):

            # select batch data
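            # NOTE(assumption): remove_extra_padding is a project helper that
            # presumably trims time steps consisting only of <pad> tokens,
            # with reverse_flg selecting which end of the batch gets trimmed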
            input_batch = remove_extra_padding(
                train_input_mat[:, perm[i:i + batchsize]], reverse_flg=False)
            input_batch_rev = remove_extra_padding(
                train_input_mat_rev[:, perm[i:i + batchsize]],
                reverse_flg=True)
            output_batch = remove_extra_padding(
                train_output_mat[:, perm[i:i + batchsize]], reverse_flg=False)
            label_batch = remove_extra_padding(
                train_label_mat[:, perm[i:i + batchsize]], reverse_flg=False)

            # Encode a sentence
            model.initialize(
                batch_size=input_batch.shape[1])  # initialize cell
            model.encode(input_batch, input_batch_rev,
                         train=True)  # encode (output: hidden Variable)

            # Decode from encoded context
            input_ids = xp.array(
                [corpus.dic.token2id["<start>"]
                 for _ in range(input_batch.shape[1])],
                dtype=xp.int32)  # int32 ids; the last batch may be smaller than batchsize
            for w_ids, l_ids in zip(output_batch, label_batch):
                loss, predict_mat = model.decode(input_ids,
                                                 w_ids,
                                                 label_id=l_ids,
                                                 word_th=word_threshold,
                                                 train=True)
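                # teacher forcing: the gold token, not the model's own
                # prediction, becomes the next decoder input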
                input_ids = w_ids
                accum_loss += loss

            # learn model
            model.cleargrads()  # initialize all grad to zero
            accum_loss.backward()  # back propagation
            optimizer.update()
            total_loss += float(accum_loss.data)
            batch_num += 1
            print('Epoch:', epoch, 'Batch_num:', batch_num,
                  'batch loss: {:.2f}'.format(float(accum_loss.data)))
            accum_loss = 0

        train_loss_data.append(float(total_loss / batch_num))

        # save model and optimizer
        print('----- epoch', epoch + 1, 'finished -----')
        print('save the model and optimizer')
        serializers.save_hdf5('data/' + str(epoch) + '.model', model)
        serializers.save_hdf5('data/' + str(epoch) + '.state', optimizer)

    # save loss data
    with open('./data/loss_train_data.pkl', 'wb') as f:
        pickle.dump(train_loss_data, f)


def check_corpus():
    """
    考察の章で使用するコーパス情報を取得する関数.
    ポジティブ文,ネガティブ文,ニュートラル文の個数を数える.
    :return:
    """
    from proposal_util import ProposalConvCorpus

    DATA_DIR = './proposal_model/data/corpus/'

    # call dictionary class
    corpus = ProposalConvCorpus(file_path=None)
    corpus.load(load_dir=DATA_DIR)
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('')

    # check data
    emotion_freq = {
        'baseball': {
            'pos': 0,
            'neg': 0,
            'neu': 0
        },
        'pokemongo': {
            'pos': 0,
            'neg': 0,
            'neu': 0
        }
    }
    for num, input_sentence in enumerate(corpus.fine_posts):
        n_num = p_num = 0
        topic_label = ''
        for index, w_id in enumerate(corpus.fine_cmnts[num]):
            # the first token of each comment is assumed to be a topic label
            if index == 0:
                if w_id == corpus.dic.token2id['__label__0']:
                    topic_label = 'baseball'
                elif w_id == corpus.dic.token2id['__label__1']:
                    topic_label = 'pokemongo'
                else:
                    print('no label error: ', w_id)
                    raise ValueError
            # judge whether the word is positive or negative
            else:
                if corpus.dic[w_id] in corpus.neg_words:
                    n_num += 1
                if corpus.dic[w_id] in corpus.pos_words:
                    p_num += 1
        if (n_num + p_num) == 0:
            emotion_freq[topic_label]['neu'] += 1
        elif n_num <= p_num:
            emotion_freq[topic_label]['pos'] += 1
        else:  # n_num > p_num
            emotion_freq[topic_label]['neg'] += 1
        if num % 10000 == 0:
            print(num, 'end...')
    print(emotion_freq)
    for topic in emotion_freq:
        print(topic, ':', sum(emotion_freq[topic].values()))

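    # second pass: the same polarity counting over the rough corpus,
    # which carries no topic labels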
    emotion_freq = {'pos': 0, 'neg': 0, 'neu': 0}
    for num, input_sentence in enumerate(corpus.rough_posts):
        n_num = p_num = 0
        for index, w_id in enumerate(corpus.rough_cmnts[num]):
            if corpus.dic[w_id] in corpus.neg_words:
                n_num += 1
            if corpus.dic[w_id] in corpus.pos_words:
                p_num += 1
        if (n_num + p_num) == 0:
            emotion_freq['neu'] += 1
        elif n_num <= p_num:
            emotion_freq['pos'] += 1
        else:  # n_num > p_num
            emotion_freq['neg'] += 1
        if num % 100000 == 0:
            print(num, 'end...')

    print(emotion_freq)
    print('rough texts: ', sum(emotion_freq.values()))
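

if __name__ == '__main__':
    # minimal entry point, assuming the module-level globals noted at the top
    # (args, batchsize, data_file, ...) have been configured, e.g. via argparse
    main()
    # check_corpus()  # run instead to print corpus polarity statistics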