def preprocess_text(self, name_csv, input_text_path, output_text_path, input_image_path):
    print("\nStarting text preprocessing..")
    data = pd.read_csv(name_csv, delimiter=';')  # read the csv
    # Preprocess the descriptions
    mapping_name_description = load_descriptions(data)
    clean_descriptions(mapping_name_description)
    # vocabulary = to_vocabulary(mapping_name_description)
    # print("Vocabulary length: ", len(vocabulary))
    save_descriptions(mapping_name_description, '../clean_dataset.csv')
    tokenizer = create_tokenizer(mapping_name_description)
    vocab_size = len(tokenizer.word_index) + 1
    max_len = max_length(mapping_name_description)
    create_sequences(tokenizer, max_len, mapping_name_description, vocab_size,
                     input_image_path, input_text_path, output_text_path)
    print("Text preprocessing finished..")
from keras.callbacks import ModelCheckpoint

filename = 'dataset/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))
# photo features
train_features = load_photo_features('features.pkl', train)
print('Photos: train=%d' % len(train_features))
# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
max_length = max_length(train_descriptions)  # note: reuses the helper's name, shadowing max_length() from here on
print('Description Length: %d' % max_length)
# prepare sequences
X1train, X2train, ytrain = create_sequences(tokenizer, max_length, train_descriptions, train_features)

# dev dataset
# load test set
filename = 'dataset/Flickr_8k.devImages.txt'
test = load_set(filename)
print('Dataset: %d' % len(test))
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))
test_features = load_photo_features('features.pkl', test)
print('Photos: test=%d' % len(test_features))
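The two snippets above rely on `create_tokenizer` and `max_length` helpers that are not defined here. The following is a minimal sketch of what they might look like, assuming the descriptions are kept in a dict mapping an image id to a list of cleaned caption strings; the actual helpers in the source project may differ.

from keras.preprocessing.text import Tokenizer

def create_tokenizer(descriptions):
    # Fit a Keras Tokenizer on every caption string in the mapping (hypothetical helper).
    lines = [d for caption_list in descriptions.values() for d in caption_list]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

def max_length(descriptions):
    # Number of tokens in the longest caption (hypothetical helper).
    lines = [d for caption_list in descriptions.values() for d in caption_list]
    return max(len(d.split()) for d in lines)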
parser.add_argument('--model', default="Seq2Seq", type=str, help='choose a model: Seq2Seq')
args = parser.parse_args()

if __name__ == '__main__':
    dataset = 'Couplets'  # dataset
    model_name = args.model  # Seq2Seq
    x = import_module('models.' + model_name)  # dynamically import the config/module matching the chosen model
    config = x.Config(dataset)  # parameter initialisation happens in the model's Config __init__
    start_time = time.time()
    print("Loading data...")
    input_tensor, target_tensor, input_tokenizer, targ_tokenizer = load_dataset(config.train_path, config.num_samples)
    # compute the maximum length (max_length) of the target and input tensors
    max_length_targ, max_length_input = max_length(target_tensor), max_length(input_tensor)
    # split into training and validation sets with an 80/20 ratio
    input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
        input_tensor, target_tensor, test_size=0.2)
    # show the lengths
    print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))
    BUFFER_SIZE = len(input_tensor_train)
    config.steps_per_epoch = len(input_tensor_train) // config.batch_size
    vocab_input_size = len(input_tokenizer.word_index) + 1
    vocab_targ_size = len(targ_tokenizer.word_index) + 1
    config.num_encoder_tokens = vocab_input_size
vocab_inp_size = len(inp_lang.word2idx)
vocab_tar_size = len(targ_lang.word2idx)

optimizer = tf.train.AdamOptimizer()

EPOCHS = 1000000

input_tensor = [[
    inp_lang.word2idx[token] for token in tokenize_sentence(question)
] for question in questions]
target_tensor = [[
    targ_lang.word2idx[token] for token in tokenize_sentence(answer)
] for answer in answers]

# Calculate max_length of input and output tensor
# Here, we'll set those to the longest sentence in the dataset
max_length_inp, max_length_tar = utils.max_length(input_tensor), utils.max_length(target_tensor)

# Padding the input and output tensor to the maximum length
input_tensor = tf.keras.preprocessing.sequence.pad_sequences(
    input_tensor, maxlen=max_length_inp, padding='post', value=EMPTY_IDX)
target_tensor = tf.keras.preprocessing.sequence.pad_sequences(
    target_tensor, maxlen=max_length_tar, padding='post', value=EMPTY_IDX)

# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2)

BUFFER_SIZE = len(input_tensor_train)
dataset = tf.data.Dataset.from_tensor_slices(
    (input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
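Here `utils.max_length` is applied to lists of token-id sequences rather than to raw strings. It is not shown in the snippet, so the one-liner below is only a sketch of the assumed behaviour (length of the longest sequence):

def max_length(tensors):
    # Assumed behaviour of utils.max_length: length of the longest token-id sequence.
    return max(len(t) for t in tensors)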
BATCH_SIZE = 64
embedding_dim = 256
units = 1024

questions, answers = data.load_conv_text()

inp_lang = LanguageIndex(questions)
targ_lang = LanguageIndex(answers)

input_tensor = [[
    inp_lang.word2idx[token] for token in tokenize_sentence(question)
] for question in questions]
target_tensor = [[
    targ_lang.word2idx[token] for token in tokenize_sentence(answer)
] for answer in answers]

max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)

model = load_trained_model(BATCH_SIZE, embedding_dim, units, tf.train.AdamOptimizer())


def generate_answer(sentence, model, inp_lang, targ_lang, max_length_inp, max_length_tar):
    inputs = [inp_lang.word2idx[i] for i in tokenize_sentence(sentence)]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
def main():
    tf.enable_eager_execution()

    questions1, answers1 = data.load_conv_text()
    # questions2, answers2 = data.load_opensubtitles_text()
    questions = list(questions1)
    answers = list(answers1)

    inp_lang, targ_lang = LanguageIndex(questions), LanguageIndex(answers)

    input_tensor = [[inp_lang.word2idx[token] for token in tokenize_sentence(question)]
                    for question in questions]
    target_tensor = [[targ_lang.word2idx[token] for token in tokenize_sentence(answer)]
                     for answer in answers]

    max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)

    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(
        input_tensor, maxlen=max_length_inp, padding='post')
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(
        target_tensor, maxlen=max_length_tar, padding='post')

    BUFFER_SIZE = len(input_tensor)
    dataset = tf.data.Dataset.from_tensor_slices(
        (input_tensor, target_tensor)).shuffle(BUFFER_SIZE)
    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

    model: encoder_decoder.Seq2Seq = load_trained_model(
        BATCH_SIZE, EMBEDDING_DIM, UNITS, tf.train.AdamOptimizer())

    # sentimental_words = ["absolutely","abundant","accept","acclaimed","accomplishment","achievement","action","active","activist","acumen","adjust","admire","adopt","adorable","adored","adventure","affirmation","affirmative","affluent","agree","airy","alive","alliance","ally","alter","amaze","amity","animated","answer","appreciation","approve","aptitude","artistic","assertive","astonish","astounding","astute","attractive","authentic","basic","beaming","beautiful","believe","benefactor","benefit","bighearted","blessed","bliss","bloom","bountiful","bounty","brave","bright","brilliant","bubbly","bunch","burgeon","calm","care","celebrate","certain","change","character","charitable","charming","cheer","cherish","clarity","classy","clean","clever","closeness","commend","companionship","complete","comradeship","confident","connect","connected","constant","content","conviction","copious","core","coupled","courageous","creative","cuddle","cultivate","cure","curious","cute","dazzling","delight","direct","discover","distinguished","divine","donate","each","day","eager","earnest","easy","ecstasy","effervescent","efficient","effortless","electrifying","elegance","embrace","encompassing","encourage","endorse","energized","energy","enjoy","enormous","enthuse","enthusiastic","entirely","essence","established","esteem","everyday","everyone","excited","exciting","exhilarating","expand","explore","express","exquisite","exultant","faith","familiar","family","famous","feat","fit","flourish","fortunate","fortune","freedom","fresh","friendship","full","funny","gather","generous","genius","genuine","give","glad","glow","good","gorgeous","grace","graceful","gratitude","green","grin","group","grow","handsome","happy","harmony","healed","healing","healthful","healthy","heart","hearty","heavenly","helpful","here","highest","good","hold","holy","honest","honor","hug","i","affirm","i","allow","i","am","willing","i","am.","i","can","i","choose","i","create","i","follow","i","know","i","know,","without","a","doubt","i","make","i","realize","i","take","action","i","trust","idea","ideal","imaginative","increase","incredible","independent","ingenious","innate","innovate","inspire","instantaneous","instinct","intellectual","intelligence","intuitive","inventive","joined","jovial","joy","jubilation","keen","key","kind","kiss","knowledge","laugh","leader","learn","legendary","let","go","light","lively","love","loveliness","lucidity","lucrative","luminous","maintain","marvelous","master","meaningful","meditate","mend","metamorphosis","mind-blowing","miracle","mission","modify","motivate","moving","natural","nature","nourish","nourished","novel","now","nurture","nutritious","one","open","openhanded","optimistic","paradise","party","peace","perfect","phenomenon","pleasure","plenteous","plentiful","plenty","plethora","poise","polish","popular","positive","powerful","prepared","pretty","principle","productive","project","prominent","prosperous","protect","proud","purpose","quest","quick","quiet","ready","recognize","refinement","refresh","rejoice","rejuvenate","relax","reliance","rely","remarkable","renew","renowned","replenish","resolution","resound","resources","respect","restore","revere","revolutionize","rewarding","rich","robust","rousing","safe","secure","see","sensation","serenity","shift","shine","show","silence","simple","sincerity","smart","smile","smooth","solution","soul","sparkling","spirit","spirited","spiritual","splendid","spontaneous","still","stir","strong","style","success","sunny","support","sure","surprise","sustain","synchronized","team","thankful","therapeutic","thorough","thrilled","thrive","today","together","tranquil","transform","triumph","trust","truth","unity","unusual","unwavering","upbeat","value","vary","venerate","venture","very","vibrant","victory","vigorous","vision","visualize","vital","vivacious","voyage","wealthy","welcome","well","whole","wholesome","willing","wonder","wonderful","wondrous","xanadu","yes","yippee","young","youth","youthful","zeal","zest","zing","zip"]
    sentimental_words = ["good", "excellent", "well"]
    targ_lang_embd = get_GloVe_embeddings(targ_lang.vocab, EMBEDDING_DIM)
    sentimental_words_embd = get_GloVe_embeddings(sentimental_words, EMBEDDING_DIM)
    sim_scores = np.dot(sentimental_words_embd, np.transpose(targ_lang_embd))
    print(sim_scores.shape)
    # max_prob_ids = np.argmax(sim_scores, axis=1)
    # print(max_prob_ids)
    # print(targ_lang.word2idx)
    # print(targ_lang.idx2word(max_prob_ids[1]))

    optimizer = tf.train.AdamOptimizer()
    checkpoint_dir = './training_checkpoints'
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, seq2seq=model)

    for episode in range(EPISODES):
        # Start of Episode
        start = time.time()
        total_loss = 0
        for (batch, (inp, targ)) in enumerate(dataset):
            with tf.GradientTape() as tape:
                hidden = tf.zeros((BATCH_SIZE, UNITS))
                enc_hidden = model.encoder(inp, hidden)
                dec_hidden = enc_hidden
                dec_input = tf.expand_dims(
                    [targ_lang.word2idx[BEGIN_TAG]] * BATCH_SIZE, 1)
                loss = 0     # loss for decoder
                pg_loss = 0  # loss for semantic
                result = ''
                for t in range(1, targ.shape[1]):
                    actions = []
                    probs = []
                    rewards = []
                    predictions, dec_hidden = model.decoder(dec_input, dec_hidden)
                    '''
                    predicted_id = tf.argmax(predictions[0]).numpy()
                    if targ_lang.idx2word[predicted_id] == END_TAG:
                        print("result: ", result)
                    else:
                        result += ' ' + targ_lang.idx2word[predicted_id]
                    '''
                    # using teacher forcing
                    dec_input = tf.expand_dims(targ[:, t], 1)
                    for ps in predictions:
                        # action = tf.distributions.Categorical(ps).sample(1)[0]
                        top_k_indices = tf.nn.top_k(ps, TOP_K).indices.numpy()
                        action = np.random.choice(top_k_indices, 1)[0]
                        actions.append(action)
                        prob = ps.numpy()[action]
                        probs.append(prob)
                        reward = np.max(sim_scores[1:, action])
                        print(targ_lang.idx2word[action], reward)
                        # print(targ_lang.idx2word[action], reward)
                        rewards.append(reward)
                    # normalize reward
                    reward_mean = np.mean(rewards)
                    reward_std = np.std(rewards)
                    norm_rewards = [(r - reward_mean) / reward_std for r in rewards]
                    if targ_lang.idx2word[actions[0]] == END_TAG:
                        print("result: ", result)
                    else:
                        result += ' ' + targ_lang.idx2word[actions[0]]
                    onehot_labels = tf.keras.utils.to_categorical(
                        y=actions, num_classes=len(targ_lang.word2idx))
                    norm_rewards = tf.convert_to_tensor(norm_rewards, dtype="float32")
                    # print(onehot_labels.shape)
                    # print(predictions.shape)
                    loss += model.loss_function(targ[:, t], predictions)
                    # print("------")
                    # print(loss)
                    # print(probs)
                    # pg_loss_cross = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=onehot_labels, logits=targ[:, t]))
                    pg_loss_cross = model.loss_function(targ[:, t], predictions)
                    # pg_loss_cross = tf.reduce_mean(pg_loss_cross * norm_rewards)
                    pg_loss_cross = tf.reduce_mean(pg_loss_cross * rewards)
                    # print(pg_loss_cross)
                    # print("------")
                    # print(pg_loss_cross)
                    pg_loss += pg_loss_cross
            # End of Episode
            # Update policy
            batch_loss = ((loss + pg_loss) / int(targ.shape[1]))
            total_loss += batch_loss
            variables = model.encoder.variables + model.decoder.variables
            gradients = tape.gradient(loss, variables)
            optimizer.apply_gradients(zip(gradients, variables))
            if batch % 10 == 0:
                print('batch {} training loss {:.4f}'.format(batch, total_loss.numpy()))
        # saving (checkpoint) the model every 100 epochs
        # if (episode + 1) % 100 == 0:
        #     checkpoint.save(file_prefix=checkpoint_prefix)
        print('Time taken for {} episode {} sec\n'.format(episode, time.time() - start))
# print('len(train_input): {}'.format(len(train_input)))
# print('len(train_input[0]: {}'.format(len(train_input[0])))
# print('---------------------------------------------------------------')

train_output = []
# print(train_x)
for i, s in train_x.iterrows():
    # print(s['dfa'])
    if s['dfa'] == target_dfa:
        train_output.append(1)
    else:
        train_output.append(0)

(test_x, test_y) = utils.load_test_data()
data_width = max(utils.max_length(train_y), utils.max_length(test_y))
# print('data_width: {}'.format(data_width))
# exit()
# print(train_y[0])
# print(test_x)
print("Testing and training data loaded")

# Data's dimensions are unknown rows, with an unknown length, with one
# transition out from the current state.
data = tf.placeholder(dtype=tf.float32, shape=[None, None, 1])
print('Data has been chunked.')

# The resulting target has a completely unknown shape at this time and is
# not specified.
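`data_width` is the longest transition sequence across the train and test splits, while the placeholder expects batches shaped `[rows, timesteps, 1]`. Below is one possible way to pad the ragged sequences to that width before feeding them; the `pad_to_width` helper and the zero pad value are assumptions, not part of the original code.

import numpy as np

def pad_to_width(sequences, width, pad_value=0.0):
    # Right-pad each variable-length sequence to `width` and add the trailing
    # feature dimension so the batch matches the [None, None, 1] placeholder.
    batch = np.full((len(sequences), width), pad_value, dtype=np.float32)
    for i, seq in enumerate(sequences):
        batch[i, :len(seq)] = seq
    return batch[..., np.newaxis]

# e.g. session.run(..., feed_dict={data: pad_to_width(train_y, data_width)})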
def main():
    input_lang, output_lang, pairs, data1, data2 = read_langs("eng", "fra", True)
    input_tensor = [[input_lang.word2index[s] for s in es.split(' ')] for es in data1]
    target_tensor = [[output_lang.word2index[s] for s in es.split(' ')] for es in data2]
    max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)
    input_tensor = [pad_sequences(x, max_length_inp) for x in input_tensor]
    target_tensor = [pad_sequences(x, max_length_tar) for x in target_tensor]
    print(len(target_tensor))

    input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
        input_tensor, target_tensor, test_size=0.2)
    # Show length
    print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

    BUFFER_SIZE = len(input_tensor_train)
    BATCH_SIZE = 64
    N_BATCH = BUFFER_SIZE // BATCH_SIZE
    embedding_dim = 256
    units = 1024
    vocab_inp_size = len(input_lang.word2index)
    vocab_tar_size = len(output_lang.word2index)

    train_dataset = MyData(input_tensor_train, target_tensor_train)
    val_dataset = MyData(input_tensor_val, target_tensor_val)
    dataset = DataLoader(train_dataset, batch_size=BATCH_SIZE, drop_last=True, shuffle=True)

    device = torch.device("cpu")
    encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
    decoder = Decoder(vocab_tar_size, embedding_dim, units, units, BATCH_SIZE)
    encoder.to(device)
    decoder.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=0.001)

    EPOCHS = 10
    for epoch in range(EPOCHS):
        start = time.time()
        encoder.train()
        decoder.train()
        total_loss = 0
        for (batch, (inp, targ, inp_len)) in enumerate(dataset):
            loss = 0
            xs, ys, lens = sort_batch(inp, targ, inp_len)
            enc_output, enc_hidden = encoder(xs.to(device), lens, device)
            dec_hidden = enc_hidden
            dec_input = torch.tensor([[output_lang.word2index['<sos>']]] * BATCH_SIZE)
            for t in range(1, ys.size(1)):
                predictions, dec_hidden, _ = decoder(dec_input.to(device),
                                                     dec_hidden.to(device),
                                                     enc_output.to(device))
                loss += loss_function(criterion, ys[:, t].to(device), predictions.to(device))
                # loss += loss_
                dec_input = ys[:, t].unsqueeze(1)
            batch_loss = (loss / int(ys.size(1)))
            total_loss += batch_loss
            optimizer.zero_grad()
            loss.backward()
            ### UPDATE MODEL PARAMETERS
            optimizer.step()
            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.detach().item()))
        ### TODO: Save checkpoint for model
        print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / N_BATCH))
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
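The training loop above also assumes a `sort_batch` helper that is not shown. A minimal sketch, assuming it only reorders each batch by descending source length (the order a packed-sequence RNN encoder expects):

def sort_batch(X, y, lengths):
    # Sort the batch by descending input length (hypothetical helper);
    # pack_padded_sequence requires lengths in decreasing order.
    lengths, idx = lengths.sort(dim=0, descending=True)
    return X[idx], y[idx], lengths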
                       ] for sen in train['en'].values.tolist()]
input_tensor_val = [[input_lang_valid.w2i[word] for word in sen.split(' ')]
                    for sen in valid['de'].values.tolist()]
target_tensor_val = [[target_lang_valid.w2i[word] for word in sen.split(' ')]
                     for sen in valid['en'].values.tolist()]
input_tensor_test = [[input_lang_test.w2i[word] for word in sen.split(' ')]
                     for sen in test['de'].values.tolist()]
target_tensor_test = [[target_lang_test.w2i[word] for word in sen.split(' ')]
                      for sen in test['en'].values.tolist()]

""" (2) max_length """
max_len_inp_train = max_length(input_tensor_train)
max_len_inp_valid = max_length(input_tensor_val)
max_len_inp_test = max_length(input_tensor_test)
max_len_tgt_train = max_length(target_tensor_train)
max_len_tgt_valid = max_length(target_tensor_val)
max_len_tgt_test = max_length(target_tensor_test)

""" (3) padding sequence """
input_tensor_train = [
    pad_sequence(x, max_len_inp_train) for x in input_tensor_train
]
target_tensor_train = [
    pad_sequence(x, max_len_tgt_train) for x in target_tensor_train