Example #1
    def preprocess_text(self, name_csv, input_text_path, output_text_path,
                        input_image_path):

        print("\nStarting text preprocessing..")

        data = pd.read_csv(name_csv, delimiter=';')  # read the CSV

        # Description preprocessing
        mapping_name_description = load_descriptions(data)
        clean_descriptions(mapping_name_description)
        #vocabulary = to_vocabulary(mapping_name_description)
        #print("Vocabulary length: ", len(vocabulary))
        save_descriptions(mapping_name_description, '../clean_dataset.csv')

        tokenizer = create_tokenizer(mapping_name_description)
        vocab_size = len(tokenizer.word_index) + 1
        max_len = max_length(mapping_name_description)
        create_sequences(tokenizer, max_len, mapping_name_description,
                         vocab_size, input_image_path, input_text_path,
                         output_text_path)

        print("Text preprocessing finished..")
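
create_tokenizer and max_length here are project helpers rather than library calls. A minimal sketch of the usual pattern, assuming mapping_name_description maps each image name to a list of caption strings (an assumption, not code from this project):

from keras.preprocessing.text import Tokenizer

def create_tokenizer(descriptions):
    # Fit a Keras Tokenizer on every caption string in the mapping.
    lines = [d for caps in descriptions.values() for d in caps]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

def max_length(descriptions):
    # Number of words in the longest caption.
    return max(len(d.split()) for caps in descriptions.values() for d in caps)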
Example #2
from keras.callbacks import ModelCheckpoint

filename = 'dataset/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))
# photo features
train_features = load_photo_features('features.pkl', train)
print('Photos: train=%d' % len(train_features))
# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# note: this rebinds the name max_length from the helper function to an int,
# so the function itself can no longer be called after this point
max_length = max_length(train_descriptions)
print('Description Length: %d' % max_length)
# prepare sequences
X1train, X2train, ytrain = create_sequences(tokenizer, max_length,
                                            train_descriptions, train_features)

# dev dataset

# load test set
filename = 'dataset/Flickr_8k.devImages.txt'
test = load_set(filename)
print('Dataset: %d' % len(test))
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))
test_features = load_photo_features('features.pkl', test)
print('Photos: test=%d' % len(test_features))
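
The ModelCheckpoint imported at the top of this example is used further down in the original script, beyond what is shown here. A typical continuation, sketched from the usual Keras pattern (define_model is a hypothetical model-building helper, not part of the shown code):

X1test, X2test, ytest = create_sequences(tokenizer, max_length,
                                         test_descriptions, test_features)
model = define_model(vocab_size, max_length)  # hypothetical helper
filepath = 'model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min')
model.fit([X1train, X2train], ytrain, epochs=20, verbose=2,
          callbacks=[checkpoint],
          validation_data=([X1test, X2test], ytest))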
Example #3
parser.add_argument('--model',default="Seq2Seq", type=str, help='choose a model: Seq2Seq')
args = parser.parse_args()


if __name__ == '__main__':
    dataset = 'Couplets'  # dataset name
    model_name = args.model  # Seq2Seq

    x = import_module('models.' + model_name)  # dynamically import the configuration module for the chosen model
    config = x.Config(dataset)  # run that model's Config.__init__ to initialize its parameters
    start_time = time.time()
    print("Loading data...")
    input_tensor, target_tensor, input_tokenizer, targ_tokenizer = load_dataset(config.train_path, config.num_samples)

    # compute the maximum lengths (max_length) of the target and input tensors
    max_length_targ, max_length_input = max_length(target_tensor), max_length(input_tensor)

    # split into training and validation sets with an 80/20 ratio
    input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor,
                                                                                                    target_tensor,
                                                                                                    test_size=0.2)
    # show the lengths
    print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

    BUFFER_SIZE = len(input_tensor_train)

    config.steps_per_epoch = len(input_tensor_train) // config.batch_size

    vocab_input_size = len(input_tokenizer.word_index) + 1
    vocab_targ_size = len(targ_tokenizer.word_index) + 1
    config.num_encoder_tokens = vocab_input_size
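
In this and the following seq2seq examples, max_length is a small project helper, not a library function. A minimal sketch of the usual definition, inferred from how it is called here (an assumption):

def max_length(tensor):
    # Length of the longest token-id sequence in the list.
    return max(len(t) for t in tensor)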
Example #4
    vocab_inp_size = len(inp_lang.word2idx)
    vocab_tar_size = len(targ_lang.word2idx)

    optimizer = tf.train.AdamOptimizer()
    EPOCHS = 1000000

    input_tensor = [[
        inp_lang.word2idx[token] for token in tokenize_sentence(question)
    ] for question in questions]
    target_tensor = [[
        targ_lang.word2idx[token] for token in tokenize_sentence(answer)
    ] for answer in answers]
    # Calculate max_length of input and output tensor
    # Here, we'll set those to the longest sentence in the dataset
    max_length_inp, max_length_tar = utils.max_length(
        input_tensor), utils.max_length(target_tensor)

    # Padding the input and output tensor to the maximum length
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(
        input_tensor, maxlen=max_length_inp, padding='post', value=EMPTY_IDX)

    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(
        target_tensor, maxlen=max_length_tar, padding='post', value=EMPTY_IDX)

    # Creating training and validation sets using an 80-20 split
    input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
        input_tensor, target_tensor, test_size=0.2)

    BUFFER_SIZE = len(input_tensor_train)
    dataset = tf.data.Dataset.from_tensor_slices(
        (input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
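
    # The snippet stops mid-pipeline; as in Example #6 below, the shuffled
    # dataset would normally be batched next (BATCH_SIZE is an assumption,
    # presumably defined earlier in the original file):
    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)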
Example #5
BATCH_SIZE = 64
embedding_dim = 256
units = 1024

questions, answers = data.load_conv_text()
inp_lang = LanguageIndex(questions)
targ_lang = LanguageIndex(answers)

input_tensor = [[
    inp_lang.word2idx[token] for token in tokenize_sentence(question)
] for question in questions]
target_tensor = [[
    targ_lang.word2idx[token] for token in tokenize_sentence(answer)
] for answer in answers]

max_length_inp, max_length_tar = max_length(input_tensor), max_length(
    target_tensor)

model = load_trained_model(BATCH_SIZE, embedding_dim, units,
                           tf.train.AdamOptimizer())


def generate_answer(sentence, model, inp_lang, targ_lang, max_length_inp,
                    max_length_tar):
    inputs = [inp_lang.word2idx[i] for i in tokenize_sentence(sentence)]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''
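
    # The snippet is cut off here. A typical greedy-decoding continuation
    # (a sketch only, assuming the encoder/decoder call signatures used in
    # Example #6 below and the BEGIN_TAG/END_TAG markers defined there):
    hidden = tf.zeros((1, units))
    enc_hidden = model.encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word2idx[BEGIN_TAG]], 0)
    for _ in range(max_length_tar):
        predictions, dec_hidden = model.decoder(dec_input, dec_hidden)
        predicted_id = int(tf.argmax(predictions[0]).numpy())
        if targ_lang.idx2word[predicted_id] == END_TAG:
            break
        result += targ_lang.idx2word[predicted_id] + ' '
        dec_input = tf.expand_dims([predicted_id], 0)
    return result.strip()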
Example #6
def main():
    tf.enable_eager_execution()

    questions1, answers1 = data.load_conv_text()
    # questions2, answers2 = data.load_opensubtitles_text()

    questions = list(questions1)
    answers = list(answers1)

    inp_lang, targ_lang = LanguageIndex(questions), LanguageIndex(answers)

    input_tensor = [[inp_lang.word2idx[token]
                     for token in tokenize_sentence(question)] for question in questions]
    target_tensor = [[targ_lang.word2idx[token]
                      for token in tokenize_sentence(answer)] for answer in answers]
    max_length_inp, max_length_tar = max_length(
        input_tensor), max_length(target_tensor)
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor,
                                                                 maxlen=max_length_inp,
                                                                 padding='post')
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor,
                                                                  maxlen=max_length_tar,
                                                                  padding='post')
    BUFFER_SIZE = len(input_tensor)
    dataset = tf.data.Dataset.from_tensor_slices(
        (input_tensor, target_tensor)).shuffle(BUFFER_SIZE)
    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

    model: encoder_decoder.Seq2Seq = load_trained_model(
        BATCH_SIZE, EMBEDDING_DIM, UNITS, tf.train.AdamOptimizer())

    # sentimental_words = ["absolutely","abundant","accept","acclaimed","accomplishment","achievement","action","active","activist","acumen","adjust","admire","adopt","adorable","adored","adventure","affirmation","affirmative","affluent","agree","airy","alive","alliance","ally","alter","amaze","amity","animated","answer","appreciation","approve","aptitude","artistic","assertive","astonish","astounding","astute","attractive","authentic","basic","beaming","beautiful","believe","benefactor","benefit","bighearted","blessed","bliss","bloom","bountiful","bounty","brave","bright","brilliant","bubbly","bunch","burgeon","calm","care","celebrate","certain","change","character","charitable","charming","cheer","cherish","clarity","classy","clean","clever","closeness","commend","companionship","complete","comradeship","confident","connect","connected","constant","content","conviction","copious","core","coupled","courageous","creative","cuddle","cultivate","cure","curious","cute","dazzling","delight","direct","discover","distinguished","divine","donate","each","day","eager","earnest","easy","ecstasy","effervescent","efficient","effortless","electrifying","elegance","embrace","encompassing","encourage","endorse","energized","energy","enjoy","enormous","enthuse","enthusiastic","entirely","essence","established","esteem","everyday","everyone","excited","exciting","exhilarating","expand","explore","express","exquisite","exultant","faith","familiar","family","famous","feat","fit","flourish","fortunate","fortune","freedom","fresh","friendship","full","funny","gather","generous","genius","genuine","give","glad","glow","good","gorgeous","grace","graceful","gratitude","green","grin","group","grow","handsome","happy","harmony","healed","healing","healthful","healthy","heart","hearty","heavenly","helpful","here","highest","good","hold","holy","honest","honor","hug","i","affirm","i","allow","i","am","willing","i","am.","i","can","i","choose","i","create","i","follow","i","know","i","know,","without","a","doubt","i","make","i","realize","i","take","action","i","trust","idea","ideal","imaginative","increase","incredible","independent","ingenious","innate","innovate","inspire","instantaneous","instinct","intellectual","intelligence","intuitive","inventive","joined","jovial","joy","jubilation","keen","key","kind","kiss","knowledge","laugh","leader","learn","legendary","let","go","light","lively","love","loveliness","lucidity","lucrative","luminous","maintain","marvelous","master","meaningful","meditate","mend","metamorphosis","mind-blowing","miracle","mission","modify","motivate","moving","natural","nature","nourish","nourished","novel","now","nurture","nutritious","one","open","openhanded","optimistic","paradise","party","peace","perfect","phenomenon","pleasure","plenteous","plentiful","plenty","plethora","poise","polish","popular","positive","powerful","prepared","pretty","principle","productive","project","prominent","prosperous","protect","proud","purpose","quest","quick","quiet","ready","recognize","refinement","refresh","rejoice","rejuvenate","relax","reliance","rely","remarkable","renew","renowned","replenish","resolution","resound","resources","respect","restore","revere","revolutionize","rewarding","rich","robust","rousing","safe","secure","see","sensation","serenity","shift","shine","show","silence","simple","sincerity","smart","smile","smooth","solution","soul","sparkling","spirit","spirited","spiritual","splendid","spontaneous","still","stir","strong","style","success","sunny","support","sure","surprise","sustain","synchronized","team","thankful","therapeutic","thorough","thrilled","thrive","today","together","tranquil","transform","triumph","trust","truth","unity","unusual","unwavering","upbeat","value","vary","venerate","venture","very","vibrant","victory","vigorous","vision","visualize","vital","vivacious","voyage","wealthy","welcome","well","whole","wholesome","willing","wonder","wonderful","wondrous","xanadu","yes","yippee","young","youth","youthful","zeal","zest","zing","zip"]
    
    sentimental_words = ["good", "excellent", "well"]
    targ_lang_embd = get_GloVe_embeddings(targ_lang.vocab, EMBEDDING_DIM)
    sentimental_words_embd = get_GloVe_embeddings(
        sentimental_words, EMBEDDING_DIM)
    sim_scores = np.dot(sentimental_words_embd, np.transpose(targ_lang_embd))
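    # sim_scores[i, j] is the dot-product similarity between sentimental word i
    # and target-vocabulary word j; it supplies the per-token reward used in
    # the training loop below.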
    print(sim_scores.shape)
    #max_prob_ids = np.argmax(sim_scores, axis=1)
    # print(max_prob_ids)
    # print(targ_lang.word2idx)
    # print(targ_lang.idx2word(max_prob_ids[1]))

    optimizer = tf.train.AdamOptimizer()

    checkpoint_dir = './training_checkpoints'
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, seq2seq=model)

    for episode in range(EPISODES):

        # Start of Episode
        start = time.time()
        total_loss = 0
        for (batch, (inp, targ)) in enumerate(dataset):
            with tf.GradientTape() as tape:

                hidden = tf.zeros((BATCH_SIZE, UNITS))
                enc_hidden = model.encoder(inp, hidden)
                dec_hidden = enc_hidden
                dec_input = tf.expand_dims(
                    [targ_lang.word2idx[BEGIN_TAG]] * BATCH_SIZE, 1)

                loss = 0  # loss for decoder
                pg_loss = 0  # loss for semantic

                result = ''
                for t in range(1, targ.shape[1]):
                    actions = []
                    probs = []
                    rewards = []
                    predictions, dec_hidden = model.decoder(
                        dec_input, dec_hidden)
                    '''
                    predicted_id = tf.argmax(predictions[0]).numpy()
                    if targ_lang.idx2word[predicted_id] == END_TAG:
                        print("result: ", result)
                    else:
                        result += ' ' + targ_lang.idx2word[predicted_id]
                    '''
                    # using teacher forcing
                    dec_input = tf.expand_dims(targ[:, t], 1)
                    for ps in predictions:
                        # action = tf.distributions.Categorical(ps).sample(1)[0]
                        top_k_indices = tf.nn.top_k(ps, TOP_K).indices.numpy()
                        action = np.random.choice(top_k_indices, 1)[0]
                        actions.append(action)
                        prob = ps.numpy()[action]
                        probs.append(prob)
                        reward = np.max(sim_scores[1:, action])
                        print(targ_lang.idx2word[action], reward)
                        # print(targ_lang.idx2word[action], reward)
                        rewards.append(reward)

                        # normalize reward
                        reward_mean = np.mean(rewards)
                        reward_std = np.std(rewards)
                        norm_rewards = [(r - reward_mean) /
                                        reward_std for r in rewards]

                    if targ_lang.idx2word[actions[0]] == END_TAG:
                        print("result: ", result)
                    else:
                        result += ' ' + targ_lang.idx2word[actions[0]]

                    onehot_labels = tf.keras.utils.to_categorical(
                        y=actions, num_classes=len(targ_lang.word2idx))
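                    # note: onehot_labels is not used by the active code below;
                    # the loss terms use targ[:, t] and the sampled rewards.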

                    norm_rewards = tf.convert_to_tensor(
                        norm_rewards, dtype="float32")
                    # print(onehot_labels.shape)
                    # print(predictions.shape)
                    loss += model.loss_function(targ[:, t], predictions)
                    # print("------")
                    # print(loss)
                    # print(probs)
                    #pg_loss_cross = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=onehot_labels, logits=targ[:, t]))
                    # (the second argument was missing in the original snippet;
                    # `predictions` is assumed here to make the call valid)
                    pg_loss_cross = model.loss_function(
                        targ[:, t], predictions)
                    # pg_loss_cross = tf.reduce_mean(
                    #     pg_loss_cross * norm_rewards)
                    pg_loss_cross = tf.reduce_mean(
                        pg_loss_cross * rewards)
                    # print(pg_loss_cross)
                    # print("------")
                    # print(pg_loss_cross)
                    pg_loss += pg_loss_cross
                # End of Episode
                # Update policy
                batch_loss = ((loss + pg_loss) / int(targ.shape[1]))
                total_loss += batch_loss
                variables = model.encoder.variables + model.decoder.variables
                # note: gradients are taken w.r.t. the decoder loss only;
                # pg_loss contributes to the reported batch_loss but not to
                # the parameter update
                gradients = tape.gradient(loss, variables)
                optimizer.apply_gradients(zip(gradients, variables))
                if batch % 10 == 0:
                    print('batch {} training loss {:.4f}'.format(
                        batch, total_loss.numpy()))

        # saving (checkpoint) the model every 100 episodes
        #if (episode + 1) % 100 == 0:
            #checkpoint.save(file_prefix=checkpoint_prefix)

        print('Time taken for episode {}: {} sec\n'.format(
            episode, time.time() - start))
Example #7
# print('len(train_input): {}'.format(len(train_input)))
# print('len(train_input[0]: {}'.format(len(train_input[0])))
# print('---------------------------------------------------------------')

train_output = []
# print(train_x)
for i, s in train_x.iterrows():
    # print(s['dfa'])
    if s['dfa'] == target_dfa:
        train_output.append(1)
    else:
        train_output.append(0)

(test_x, test_y) = utils.load_test_data()

data_width = max(utils.max_length(train_y), utils.max_length(test_y))
# print('data_width: {}'.format(data_width))
# exit()

# print(train_y[0])
# print(test_x)
print("Testing and training data loaded")

# The data has an unknown number of rows, each of unknown length, with one
# transition out of the current state.
data = tf.placeholder(dtype=tf.float32, shape=[None, None, 1])

print('Data has been chunked.')

# The resulting target has a completely unknown shape at this time and is
# not specified.
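
# The snippet is cut off after this comment; presumably the next statement
# declares the target placeholder without a fixed shape, e.g. (an assumption):
#     target = tf.placeholder(dtype=tf.float32, shape=None)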
Example #8
def main():
    input_lang, output_lang, pairs, data1, data2 = read_langs("eng", "fra", True)
    input_tensor = [[input_lang.word2index[s] for s in es.split(' ')] for es in data1]
    target_tensor = [[output_lang.word2index[s] for s in es.split(' ')] for es in data2]
    max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)

    input_tensor = [pad_sequences(x, max_length_inp) for x in input_tensor]
    target_tensor = [pad_sequences(x, max_length_tar) for x in target_tensor]
    print(len(target_tensor))

    input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor,
                                                                                                    target_tensor,
                                                                                                    test_size=0.2)

    # Show length
    print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

    BUFFER_SIZE = len(input_tensor_train)
    BATCH_SIZE = 64
    N_BATCH = BUFFER_SIZE // BATCH_SIZE
    embedding_dim = 256
    units = 1024
    vocab_inp_size = len(input_lang.word2index)
    vocab_tar_size = len(output_lang.word2index)

    train_dataset = MyData(input_tensor_train, target_tensor_train)
    val_dataset = MyData(input_tensor_val, target_tensor_val)

    dataset = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                         drop_last=True,
                         shuffle=True)

    device = torch.device("cpu")

    encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
    decoder = Decoder(vocab_tar_size, embedding_dim, units, units, BATCH_SIZE)

    encoder.to(device)
    decoder.to(device)

    criterion = nn.CrossEntropyLoss()

    optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()),
                           lr=0.001)

    EPOCHS = 10

    for epoch in range(EPOCHS):
        start = time.time()  # use time.time() consistently with the elapsed-time print below

        encoder.train()
        decoder.train()

        total_loss = 0

        for (batch, (inp, targ, inp_len)) in enumerate(dataset):
            loss = 0

            xs, ys, lens = sort_batch(inp, targ, inp_len)
            enc_output, enc_hidden = encoder(xs.to(device), lens, device)
            dec_hidden = enc_hidden
            dec_input = torch.tensor([[output_lang.word2index['<sos>']]] * BATCH_SIZE)

            for t in range(1, ys.size(1)):
                predictions, dec_hidden, _ = decoder(dec_input.to(device),
                                                     dec_hidden.to(device),
                                                     enc_output.to(device))
                loss += loss_function(criterion, ys[:, t].to(device), predictions.to(device))
                # loss += loss_
                dec_input = ys[:, t].unsqueeze(1)

            batch_loss = (loss / int(ys.size(1)))
            total_loss += batch_loss

            optimizer.zero_grad()

            loss.backward()

            ### UPDATE MODEL PARAMETERS
            optimizer.step()

            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                             batch,
                                                             batch_loss.detach().item()))

        ### TODO: Save checkpoint for model
        print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                            total_loss / N_BATCH))
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
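
sort_batch is a project helper that orders each batch by descending input length, as required for packed RNN sequences. A minimal sketch of the usual implementation (an assumption, not code from this repository):

def sort_batch(x, y, lengths):
    # Sort inputs, targets and lengths by descending input length.
    lengths, indices = lengths.sort(dim=0, descending=True)
    return x[indices], y[indices], lengths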
Example #9
# (the first lines of this example are cut off; following the pattern of the
# valid/test blocks below, they presumably built the training tensors as
# follows: the names input_lang_train / target_lang_train are assumptions)
input_tensor_train = [[input_lang_train.w2i[word] for word in sen.split(' ')]
                      for sen in train['de'].values.tolist()]
target_tensor_train = [[target_lang_train.w2i[word] for word in sen.split(' ')]
                       for sen in train['en'].values.tolist()]

input_tensor_val = [[input_lang_valid.w2i[word] for word in sen.split(' ')]
                    for sen in valid['de'].values.tolist()]
target_tensor_val = [[target_lang_valid.w2i[word] for word in sen.split(' ')]
                     for sen in valid['en'].values.tolist()]

input_tensor_test = [[input_lang_test.w2i[word] for word in sen.split(' ')]
                     for sen in test['de'].values.tolist()]
target_tensor_test = [[target_lang_test.w2i[word] for word in sen.split(' ')]
                      for sen in test['en'].values.tolist()]
"""
(2) max_length
"""

max_len_inp_train = max_length(input_tensor_train)
max_len_inp_valid = max_length(input_tensor_val)
max_len_inp_test = max_length(input_tensor_test)

max_len_tgt_train = max_length(target_tensor_train)
max_len_tgt_valid = max_length(target_tensor_val)
max_len_tgt_test = max_length(target_tensor_test)
"""
(3) padding sequence
"""

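
pad_sequence here is a small project helper; a minimal sketch of the usual definition, inferred from how it is called below (an assumption):

def pad_sequence(x, max_len):
    # Right-pad a token-id list with zeros up to max_len.
    return x + [0] * (max_len - len(x))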
input_tensor_train = [
    pad_sequence(x, max_len_inp_train) for x in input_tensor_train
]
target_tensor_train = [
    pad_sequence(x, max_len_tgt_train) for x in target_tensor_train