def main():
    ###########################
    #### create dictionary ####
    ###########################
    if os.path.exists('./data/corpus/dictionary.dict'):
        if args.lang == 'ja':
            corpus = ProposalConvCorpus(file_path=None, batch_size=batchsize, size_filter=True)
        else:
            corpus = ConvCorpus(file_path=None, batch_size=batchsize, size_filter=True)
        corpus.load(load_dir='./data/corpus/')
    else:
        if args.lang == 'ja':
            corpus = ProposalConvCorpus(file_path=data_file, batch_size=batchsize, size_filter=True)
        else:
            corpus = ConvCorpus(file_path=data_file, batch_size=batchsize, size_filter=True)
        corpus.save(save_dir='./data/corpus/')
    print('Vocabulary Size (number of words):', len(corpus.dic.token2id))
    print('Emotion size:', len(corpus.emotion_set))

    # search the word-ID threshold separating general words from emotional words
    ma = 0
    mi = 999999
    for word in corpus.emotion_set:
        wid = corpus.dic.token2id[word]
        if wid > ma:
            ma = wid
        if wid < mi:
            mi = wid
    # print(corpus.dic.token2id['<start>'], corpus.dic.token2id['<eos>'], corpus.dic.token2id['happy'], mi, ma)
    word_threshold = mi

    ######################
    #### create model ####
    ######################
    model = Seq2Seq(all_vocab_size=len(corpus.dic.token2id), emotion_vocab_size=len(corpus.emotion_set),
                    feature_num=feature_num, hidden_num=hidden_num, batch_size=batchsize,
                    label_num=label_num, label_embed_num=label_embed, gpu_flg=args.gpu)
    if args.gpu >= 0:
        model.to_gpu()
    optimizer = optimizers.Adam(alpha=0.001)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001))

    ##########################
    #### create ID corpus ####
    ##########################
    input_mat = []
    output_mat = []
    input_mat_rev = []
    label_mat = []
    max_input_ren = max_output_ren = 0
    print('start making corpus matrix...')
    for input_text, output_text in zip(corpus.rough_posts, corpus.rough_cmnts):

        # append an eos tag to the end of each output sentence
        output_text.append(corpus.dic.token2id["<eos>"])

        # update the maximum sentence lengths
        max_input_ren = max(max_input_ren, len(input_text))
        max_output_ren = max(max_output_ren, len(output_text))

        # make a list of lists
        input_mat.append(input_text)
        output_mat.append(output_text)

        # make label lists  TODO: three-class sentiment classification
        n_num = p_num = 0
        for word in output_text:
            if corpus.dic[word] in corpus.neg_words:
                n_num += 1
            if corpus.dic[word] in corpus.pos_words:
                p_num += 1
        if (n_num + p_num) == 0:
            label_mat.append([1 for _ in range(len(output_text))])      # neutral
        elif n_num <= p_num:
            label_mat.append([2 for _ in range(len(output_text))])      # positive
        elif n_num > p_num:
            label_mat.append([0 for _ in range(len(output_text))])      # negative
        else:
            raise ValueError

    # make the reversed input corpus
    for input_text in input_mat:
        input_mat_rev.append(input_text[::-1])

    # padding (pad the head of each input and the tail of each output)
    print('start padding...')
    for li in input_mat:
        insert_num = max_input_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])
    for li in output_mat:
        insert_num = max_output_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])
    for li in input_mat_rev:
        insert_num = max_input_ren - len(li)
        for _ in range(insert_num):
            li.insert(0, corpus.dic.token2id['<pad>'])
    for li in label_mat:
        insert_num = max_output_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])
    if len(output_mat) != len(label_mat):
        print('Output matrix and label matrix should have the same dimension.')
        raise ValueError

    # create batch matrices
    print('transpose...')
    input_mat = np.array(input_mat, dtype=np.int32).T
    input_mat_rev = np.array(input_mat_rev, dtype=np.int32).T
    output_mat = np.array(output_mat, dtype=np.int32).T
    label_mat = np.array(label_mat, dtype=np.int32).T

    # separate the corpus into train and test  TODO: split into training and test data for the experiments
    print('split train and test...')
    train_input_mat = input_mat
    train_output_mat = output_mat
    train_input_mat_rev = input_mat_rev
    train_label_mat = label_mat

    #############################
    #### train seq2seq model ####
    #############################
    accum_loss = 0
    train_loss_data = []
    print('start training...')
    for num, epoch in enumerate(range(n_epoch)):
        total_loss = 0
        batch_num = 0
        perm = np.random.permutation(len(corpus.rough_posts))

        # for training
        for i in range(0, len(corpus.rough_posts), batchsize):

            # select batch data
            input_batch = remove_extra_padding(train_input_mat[:, perm[i:i + batchsize]], reverse_flg=False)
            input_batch_rev = remove_extra_padding(train_input_mat_rev[:, perm[i:i + batchsize]], reverse_flg=True)
            output_batch = remove_extra_padding(train_output_mat[:, perm[i:i + batchsize]], reverse_flg=False)
            label_batch = remove_extra_padding(train_label_mat[:, perm[i:i + batchsize]], reverse_flg=False)

            # encode a sentence
            model.initialize(batch_size=input_batch.shape[1])       # initialize cell
            model.encode(input_batch, input_batch_rev, train=True)  # encode (output: hidden Variable)

            # decode from the encoded context
            # use the actual batch width so the last (possibly smaller) batch also works
            input_ids = xp.array([corpus.dic.token2id["<start>"] for _ in range(input_batch.shape[1])])
            for w_ids, l_ids in zip(output_batch, label_batch):
                loss, predict_mat = model.decode(input_ids, w_ids, label_id=l_ids,
                                                 word_th=word_threshold, train=True)
                input_ids = w_ids
                accum_loss += loss

            # update the model
            model.cleargrads()          # initialize all gradients to zero
            accum_loss.backward()       # back propagation
            optimizer.update()
            total_loss += float(accum_loss.data)
            batch_num += 1
            print('Epoch:', num, 'Batch_num:', batch_num,
                  'batch loss: {:.2f}'.format(float(accum_loss.data)))
            accum_loss = 0

        train_loss_data.append(float(total_loss / batch_num))

        # save the model and optimizer
        print('-----', epoch + 1, ' times -----')
        print('save the model and optimizer')
        serializers.save_hdf5('data/' + str(epoch) + '.model', model)
        serializers.save_hdf5('data/' + str(epoch) + '.state', optimizer)

    # save loss data
    with open('./data/loss_train_data.pkl', 'wb') as f:
        pickle.dump(train_loss_data, f)
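
# The three-class sentiment rule used above (neutral when no sentiment words appear,
# positive when positive words are at least as frequent as negative ones, negative
# otherwise) is repeated in several functions below. A minimal sketch of a helper that
# could factor it out is given here; the name `sentiment_label` and its use are
# hypothetical additions for illustration, not part of the original training code.
def sentiment_label(word_ids, dic, pos_words, neg_words):
    """Return 0 (negative), 1 (neutral) or 2 (positive) for a list of word IDs."""
    n_num = sum(1 for w_id in word_ids if dic[w_id] in neg_words)
    p_num = sum(1 for w_id in word_ids if dic[w_id] in pos_words)
    if (n_num + p_num) == 0:
        return 1                            # neutral: no sentiment words at all
    return 2 if n_num <= p_num else 0       # positive if pos >= neg, otherwise negative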
def test_run(data_path, model_path, n_show=80):
    """
    Test function. The input is training data, and the output should reproduce
    the correct (reference) sentences from the training phase.
    :return:
    """
    # call dictionary class
    if args.lang == 'en':
        corpus = ConvCorpus(file_path=None)
    elif args.lang == 'ja':
        corpus = ProposalConvCorpus(file_path=None)
    else:
        print('You gave a wrong argument to this system. Check your language argument.')
        raise ValueError
    corpus.load(load_dir=data_path)
    print('Vocabulary Size (number of words):', len(corpus.dic.token2id))
    print('')

    # rebuild seq2seq model
    model = Seq2Seq(all_vocab_size=len(corpus.dic.token2id), emotion_vocab_size=len(corpus.emotion_set),
                    feature_num=args.feature_num, hidden_num=args.hidden_num,
                    label_num=args.label_num, label_embed_num=args.label_embed,
                    batch_size=1, gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)

    # run an interpreter
    for num, input_sentence in enumerate(corpus.rough_posts):
        id_sequence = input_sentence.copy()
        input_sentence_rev = input_sentence[::-1]

        # make a label  TODO: three-class sentiment classification
        n_num = p_num = 0
        for word in corpus.rough_cmnts[num]:
            if corpus.dic[word] in corpus.neg_words:
                n_num += 1
            if corpus.dic[word] in corpus.pos_words:
                p_num += 1
        if (n_num + p_num) == 0:
            label_id = 1        # neutral
        elif n_num <= p_num:
            label_id = 2        # positive
        elif n_num > p_num:
            label_id = 0        # negative
        else:
            raise ValueError

        # generate an output
        model.initialize(batch_size=1)      # initialize cell
        sentence = model.generate(input_sentence, input_sentence_rev,
                                  sentence_limit=len(input_sentence) + 20,
                                  label_id=label_id, word2id=corpus.dic.token2id, id2word=corpus.dic)
        print("teacher :", " ".join([corpus.dic[w_id] for w_id in id_sequence]), label_id)
        print("correct :", " ".join([corpus.dic[w_id] for w_id in corpus.rough_cmnts[num]]))
        print("-> ", sentence)
        print('')

        if num == n_show:
            break
def interpreter(data_path, model_path):
    """
    Run this function if you want to talk to the seq2seq model.
    Type "exit" to finish the conversation.
    :param data_path: the path of the corpus the model was trained on
    :param model_path: the path of the trained model
    :return:
    """
    # call dictionary class
    if args.lang == 'en':
        corpus = ConvCorpus(file_path=None)
    elif args.lang == 'ja':
        corpus = ProposalConvCorpus(file_path=None)
    else:
        print('You gave a wrong argument to this system. Check your language argument.')
        raise ValueError
    corpus.load(load_dir=data_path)
    print('Vocabulary Size (number of words):', len(corpus.dic.token2id))
    print('')

    # rebuild seq2seq model
    model = Seq2Seq(all_vocab_size=len(corpus.dic.token2id), emotion_vocab_size=len(corpus.emotion_set),
                    feature_num=args.feature_num, hidden_num=args.hidden_num,
                    label_num=args.label_num, label_embed_num=args.label_embed,
                    batch_size=1, gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)
    label_index = [index for index in range(LABEL_NUM)]

    # run conversation system
    print('The system is ready to run, please talk to me!')
    print('( If you want to end a talk, please type "exit". )')
    print('')
    while True:
        print('>> ', end='')
        sentence = input()
        if sentence == 'exit':
            print('See you again!')
            break

        # check a sentiment tag (the last token of the input is expected to be a label index)
        input_vocab = sentence.split(' ')
        label_id = input_vocab.pop(-1)
        label_false_flg = 1
        for index in label_index:
            if label_id == str(index):
                label_id = index    # TODO: be careful with label indices; currently three classes (0, 1, 2)
                label_false_flg = 0
                break
        if label_false_flg:
            print('caution: you did not set any valid tag!')
            input_vocab = sentence.split(' ')
            label_id = -1

        if args.lang == 'en':
            input_vocab = [unicodedata.normalize('NFKC', word.lower()) for word in word_tokenize(sentence)]
        elif args.lang == 'ja':
            input_vocab = [unicodedata.normalize('NFKC', word.lower()) for word in parse_ja_text(sentence)]
        input_vocab_rev = input_vocab[::-1]

        # convert words into IDs (unknown words are dropped)
        input_sentence = [corpus.dic.token2id[word] for word in input_vocab
                          if corpus.dic.token2id.get(word) is not None]
        input_sentence_rev = [corpus.dic.token2id[word] for word in input_vocab_rev
                              if corpus.dic.token2id.get(word) is not None]

        model.initialize(batch_size=1)
        if args.beam_search:
            hypotheses = model.beam_search(model.initial_state_function, model.generate_function,
                                           X=input_sentence, X_rev=input_sentence_rev,
                                           start_id=corpus.dic.token2id['<start>'],
                                           end_id=corpus.dic.token2id['<eos>'], label_id=label_id)
            for hypothesis in hypotheses:
                generated_indices = hypothesis.to_sequence_of_values()
                generated_tokens = [corpus.dic[i] for i in generated_indices]
                print("--> ", " ".join(generated_tokens))
        else:
            sentence = model.generate(input_sentence, input_sentence_rev,
                                      sentence_limit=len(input_sentence) + 20,
                                      label_id=label_id, word2id=corpus.dic.token2id, id2word=corpus.dic)
            print("-> ", sentence)
        print('')
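
# Example interaction with interpreter() (illustrative only; the actual reply depends
# on the trained model): the last token of the typed input is parsed as the sentiment
# label index (0 = negative, 1 = neutral, 2 = positive).
#
#   >> i am very happy today 2
#   ->  <the model's positive reply>
#
# If the trailing token is not a valid label index, a warning is printed and the
# label defaults to -1.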
def test_run(data_path, model_path, n_show=80):
    """
    Test function. The input is training data, and the output should reproduce
    the correct (reference) sentences from the training phase.
    :return:
    """
    # call dictionary class
    if args.lang == 'en':
        corpus = ConvCorpus(file_path=None)
    elif args.lang == 'ja':
        corpus = ProposalConvCorpus(file_path=None)
    else:
        print('You gave a wrong argument to this system. Check your language argument.')
        raise ValueError
    corpus.load(load_dir=data_path)
    print('Vocabulary Size (number of words):', len(corpus.dic.token2id))
    print('')

    # rebuild seq2seq model
    model = MultiTaskSeq2Seq(all_vocab_size=len(corpus.dic.token2id), emotion_vocab_size=len(corpus.emotion_set),
                             feature_num=args.feature_num, hidden_num=args.hidden_num,
                             label_num=args.label_num, label_embed_num=args.label_embed,
                             batch_size=1, gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)

    # run an interpreter
    for num, input_sentence in enumerate(corpus.fine_posts):
        id_sequence = input_sentence.copy()
        input_sentence_rev = input_sentence[::-1]

        # make labels  TODO: three-class sentiment classification
        n_num = p_num = 0
        topic_label_id = correct_emo_label = -1
        for index, w_id in enumerate(corpus.fine_cmnts[num]):
            # the first token of each comment is assumed to be a topic label
            if index == 0:
                if w_id == corpus.dic.token2id['__label__0']:
                    topic_label_id = 0
                elif w_id == corpus.dic.token2id['__label__1']:
                    topic_label_id = 1
                else:
                    print('no label error: ', w_id)
                    raise ValueError
            # count positive / negative words
            else:
                if corpus.dic[w_id] in corpus.neg_words:
                    n_num += 1
                if corpus.dic[w_id] in corpus.pos_words:
                    p_num += 1
        if (n_num + p_num) == 0:
            correct_emo_label = 1       # neutral
        elif n_num <= p_num:
            correct_emo_label = 2       # positive
        elif n_num > p_num:
            correct_emo_label = 0       # negative
        else:
            raise ValueError

        # generate an output for each emotion label
        print("input : ", " ".join([corpus.dic[w_id] for w_id in id_sequence]))
        print("train emotion label: ", correct_emo_label)
        print("correct :", " ".join([corpus.dic[w_id] for index, w_id in enumerate(corpus.fine_cmnts[num])
                                     if index != 0]))
        print(input_sentence)
        print(input_sentence_rev)
        for emo_label in range(LABEL_NUM):
            model.initialize(batch_size=1)
            sentence = model.generate(input_sentence, input_sentence_rev,
                                      sentence_limit=len(input_sentence) + 20,
                                      emo_label_id=emo_label, topic_label_id=topic_label_id,
                                      word2id=corpus.dic.token2id, id2word=corpus.dic)
            if emo_label == 0:
                print("neg -> ", sentence)
            elif emo_label == 1:
                print("neu -> ", sentence)
            else:
                print("pos -> ", sentence)
        print('')

        if num == n_show:
            break
def fixed_interpreter(data_path, model_path):
    """
    Interpreter without an emotion tag as input; only a binary topic label (0 or 1)
    is given at the end of the input sentence.
    :param data_path: the path of the corpus the model was trained on
    :param model_path: the path of the trained model
    :return:
    """
    # call dictionary class
    if args.lang == 'en':
        corpus = ConvCorpus(file_path=None)
    elif args.lang == 'ja':
        corpus = ProposalConvCorpus(file_path=None)
    else:
        print('You gave a wrong argument to this system. Check your language argument.')
        raise ValueError
    corpus.load(load_dir=data_path)
    print('Vocabulary Size (number of words):', len(corpus.dic.token2id))
    print('')

    # rebuild seq2seq model
    model = MultiTaskSeq2Seq(all_vocab_size=len(corpus.dic.token2id), emotion_vocab_size=len(corpus.emotion_set),
                             feature_num=args.feature_num, hidden_num=args.hidden_num,
                             label_num=args.label_num, label_embed_num=args.label_embed,
                             batch_size=1, gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)
    emo_label_index = [index for index in range(LABEL_NUM)]
    topic_label_index = [index for index in range(TOPIC_NUM)]

    # run conversation system
    print('The system is ready to run, please talk to me!')
    print('( If you want to end a talk, please type "exit". )')
    print('')
    while True:
        print('>> ', end='')
        sentence = input()
        if sentence == 'exit':
            print('See you again!')
            break

        # check a topic tag (the last token of the input is expected to be a topic label index)
        input_vocab = sentence.split(' ')
        topic_label_id = input_vocab.pop(-1)
        label_false_flg = 1
        for index in topic_label_index:
            if topic_label_id == str(index):
                topic_label_id = index      # TODO: be careful with label indices
                label_false_flg = 0
                break
        if label_false_flg:
            print('caution: you did not set any valid tag!')
            input_vocab = sentence.split(' ')
            topic_label_id = -1

        if args.lang == 'en':
            input_vocab = [unicodedata.normalize('NFKC', word.lower()) for word in word_tokenize(sentence)]
        elif args.lang == 'ja':
            input_vocab = [unicodedata.normalize('NFKC', word.lower()) for word in parse_ja_text(sentence)]
        input_vocab.pop(-1)     # drop the topic-label token from the tokenized input
        input_vocab_rev = input_vocab[::-1]

        # convert words into IDs (unknown words are dropped)
        input_sentence = [corpus.dic.token2id[word] for word in input_vocab
                          if corpus.dic.token2id.get(word) is not None]
        input_sentence_rev = [corpus.dic.token2id[word] for word in input_vocab_rev
                              if corpus.dic.token2id.get(word) is not None]

        for emo_label in range(LABEL_NUM):
            model.initialize(batch_size=1)      # re-initialize the decoder state for each emotion label
            sentence = model.generate(input_sentence, input_sentence_rev,
                                      sentence_limit=len(input_sentence) + 20,
                                      emo_label_id=emo_label, topic_label_id=topic_label_id,
                                      word2id=corpus.dic.token2id, id2word=corpus.dic)
            if emo_label == 0:
                print("neg -> ", sentence)
            elif emo_label == 1:
                print("neu -> ", sentence)
            elif emo_label == 2:
                print("pos -> ", sentence)
            else:
                raise ValueError
        print('')
def check_corpus():
    """
    Collect corpus statistics used in the discussion chapter.
    Counts the numbers of positive, negative, and neutral sentences.
    :return:
    """
    from proposal_util import ProposalConvCorpus
    DATA_DIR = './proposal_model/data/corpus/'

    # call dictionary class
    corpus = ProposalConvCorpus(file_path=None)
    corpus.load(load_dir=DATA_DIR)
    print('Vocabulary Size (number of words):', len(corpus.dic.token2id))
    print('')

    # check the fine-grained (topic-labeled) data
    emotion_freq = {'baseball': {'pos': 0, 'neg': 0, 'neu': 0},
                    'pokemongo': {'pos': 0, 'neg': 0, 'neu': 0}}
    for num, input_sentence in enumerate(corpus.fine_posts):
        n_num = p_num = 0
        topic_label = ''
        for index, w_id in enumerate(corpus.fine_cmnts[num]):
            # the first token of each comment is assumed to be a topic label
            if index == 0:
                if w_id == corpus.dic.token2id['__label__0']:
                    topic_label = 'baseball'
                elif w_id == corpus.dic.token2id['__label__1']:
                    topic_label = 'pokemongo'
                else:
                    print('no label error: ', w_id)
                    raise ValueError
            # count positive / negative words
            else:
                if corpus.dic[w_id] in corpus.neg_words:
                    n_num += 1
                if corpus.dic[w_id] in corpus.pos_words:
                    p_num += 1
        if (n_num + p_num) == 0:
            emotion_freq[topic_label]['neu'] += 1
        elif n_num <= p_num:
            emotion_freq[topic_label]['pos'] += 1
        elif n_num > p_num:
            emotion_freq[topic_label]['neg'] += 1
        else:
            raise ValueError
        if num % 10000 == 0:
            print(num, 'end...')
    print(emotion_freq)
    for topic in emotion_freq:
        total = 0
        for tag in emotion_freq[topic]:
            total += emotion_freq[topic][tag]
        print(topic, ':', total)

    # check the rough (unlabeled) data
    emotion_freq = {'pos': 0, 'neg': 0, 'neu': 0}
    for num, input_sentence in enumerate(corpus.rough_posts):
        n_num = p_num = 0
        for index, w_id in enumerate(corpus.rough_cmnts[num]):
            if corpus.dic[w_id] in corpus.neg_words:
                n_num += 1
            if corpus.dic[w_id] in corpus.pos_words:
                p_num += 1
        if (n_num + p_num) == 0:
            emotion_freq['neu'] += 1
        elif n_num <= p_num:
            emotion_freq['pos'] += 1
        elif n_num > p_num:
            emotion_freq['neg'] += 1
        else:
            raise ValueError
        if num % 100000 == 0:
            print(num, 'end...')
    print(emotion_freq)
    total = 0
    for tag in emotion_freq:
        total += emotion_freq[tag]
    print('rough texts: ', total)
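
# The module-level names used above (args, batchsize, n_epoch, feature_num, hidden_num,
# label_num, label_embed, data_file, LABEL_NUM, TOPIC_NUM, xp) are defined elsewhere in
# the original script, whose entry point is not shown here. A minimal sketch of how such
# an entry point might look is given below, kept commented out so it does not clash with
# the real one; the flag defaults and the call sequence are assumptions, not the original
# configuration.
#
# if __name__ == '__main__':
#     import argparse
#     parser = argparse.ArgumentParser(description='emotional seq2seq training / evaluation')
#     parser.add_argument('--lang', default='ja', choices=['ja', 'en'])
#     parser.add_argument('--gpu', default=-1, type=int)
#     parser.add_argument('--feature_num', default=256, type=int)
#     parser.add_argument('--hidden_num', default=512, type=int)
#     parser.add_argument('--label_num', default=3, type=int)
#     parser.add_argument('--label_embed', default=100, type=int)
#     parser.add_argument('--beam_search', action='store_true')
#     args = parser.parse_args()
#     main()                                            # train the model
#     # test_run('./data/corpus/', 'data/0.model')      # then evaluate a saved epoch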