Example #1
def get_prob_dist(lst):
    fdist = FreqDist(lst)
    fdist_lst = fdist.most_common()
    [vals, freqs] = unzip_lst(fdist_lst)
    freq_total = sum(freqs)
    probs = [freq / freq_total for freq in freqs]
    return (vals, probs)
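
# --- Usage sketch (not part of the original listing; assumes
# `from nltk import FreqDist` and the project's unzip_lst helper are available) ---
#
#   (vals, probs) = get_prob_dist(["a", "b", "a", "c", "a"])
#   # vals  -> ["a", "b", "c"]
#   # probs -> [0.6, 0.2, 0.2]
#   # The distribution can then be sampled, e.g. with numpy.random.choice(vals, p=probs).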

    # Fragment of the reorganization step (the enclosing block and the
    # split_triple_lst call that produces neutral_kept_lst_polite /
    # neutral_popped_lst_polite are not shown in this excerpt):
    triple_lsts[1] = neutral_kept_lst_polite
    triple_lsts[0] = triple_lsts[0] + neutral_popped_lst_polite

    (neutral_kept_lst_rude, neutral_popped_lst_rude) = split_triple_lst(triple_lsts[1], bad_indices)
    triple_lsts[1] = neutral_kept_lst_rude
    triple_lsts[2] = triple_lsts[2] + neutral_popped_lst_rude
    
    print([len(triple_lst) for triple_lst in triple_lsts])
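
    # --- Hypothetical sketch of split_triple_lst (an assumption: it splits a
    # list of triples into those kept and those at the given indices) ---
    #
    #   def split_triple_lst(triple_lst, indices):
    #       index_set = set(indices)
    #       popped = [triple for (i, triple) in enumerate(triple_lst) if i in index_set]
    #       kept = [triple for (i, triple) in enumerate(triple_lst) if i not in index_set]
    #       return (kept, popped)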


if reorganize:
    # Store reorganized utterances
    for (filename, triple_lst) in zip(filenames[13:], triple_lsts):
        dump_pickle(data_path + filename, triple_lst)

unzipped_triples = [unzip_lst(triple_lst) for triple_lst in triple_lsts]
[[polite_sources, polite_targets, _],
 [neutral_sources, neutral_targets, _],
 [rude_sources, rude_targets, _]] = unzipped_triples

sources_lst = [polite_sources, neutral_sources, rude_sources]
targets_lst = [polite_targets, neutral_targets, rude_targets]

labeled_sources_lst = [prepend(sources, label)
                       for (sources, label)
                       in zip(sources_lst, labels)]

# Combine the three parts to make the training dataset
labeled_source_train = labeled_sources_lst[0] + labeled_sources_lst[1] + labeled_sources_lst[2]
labeled_target_train = polite_targets + neutral_targets + rude_targets
[labeled_source_train, labeled_target_train] = shuffle(labeled_source_train, labeled_target_train)
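
# --- Hypothetical helper sketches (assumptions): prepend() and shuffle() are
# used above but not defined in this excerpt. Minimal versions consistent with
# those calls could look like this (they would need to be defined before the
# code above runs):

import random

def prepend(sources, label):
    # Prepend the politeness label token to every source sequence
    return [[label] + source for source in sources]

def shuffle(sources, targets):
    # Shuffle two parallel lists in unison
    paired = list(zip(sources, targets))
    random.shuffle(paired)
    (shuffled_sources, shuffled_targets) = zip(*paired)
    return [list(shuffled_sources), list(shuffled_targets)]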
Example #3
def build_classifier(inputs, seq_lengths, reuse):

    max_seq_length = tf.reduce_max(seq_lengths)
    keep_prob = tf.convert_to_tensor(1.0)  # since we are doing inference only

    # Get the mask of all valid tokens (Assuming that unk_token == pad_token == 0)
    valid_mask = get_valid_mask(inputs)

    # Embedding layer
    with tf.variable_scope("embedding", reuse=reuse):
        embedding_unk = tf.get_variable("embedding_unk",
                                        shape=[1, embedding_size])
        embedding_politeness_original = tf.get_variable(
            "embedding_politeness_original",
            shape=[shared_vocab_size_politeness, embedding_size])
        embedding_politeness_new = tf.get_variable(
            "embedding_politeness_new",
            shape=[new_vocab_size_politeness, embedding_size])
        embeddings = [
            embedding_unk, embedding_politeness_original,
            embedding_politeness_new
        ]
        embedding = tf.concat(embeddings, axis=0)
        embedded_inputs = tf.nn.embedding_lookup(embedding, inputs)

    with tf.variable_scope("lstm", reuse=reuse):
        cell_fw = lstm(embedding_size, hidden_size_classifier, keep_prob,
                       reuse)
        cell_bw = lstm(embedding_size, hidden_size_classifier, keep_prob,
                       reuse)

        (outputs, final_state) = tf.nn.bidirectional_dynamic_rnn(
            cell_fw,
            cell_bw,
            embedded_inputs,
            sequence_length=seq_lengths,
            dtype=tf.float32,
            swap_memory=True)

    # H's shape: batch_size_per_gpu * max_seq_length * (2 * hidden_size_classifier)
    H = tf.concat(outputs, axis=2)

    # CNN + maxpooling layer
    with tf.variable_scope("CNN_maxpooling", reuse=reuse):
        # Expand H to 4-D (add a channel dimension) for the CNN
        H_expanded = tf.expand_dims(H, axis=-1)
        pooled_outputs = []

        for (j, filter_size) in enumerate(filter_sizes):
            with tf.variable_scope("filter_%d" %
                                   j):  # sub-scope inherits the reuse flag
                # CNN layer
                filter_shape = [
                    filter_size, 2 * hidden_size_classifier, 1, num_filters
                ]
                W_conv = tf.get_variable(
                    "W_conv",
                    shape=filter_shape,
                    initializer=tf.contrib.layers.xavier_initializer_conv2d())
                b_conv = tf.get_variable(
                    "b_conv",
                    shape=[num_filters],
                    initializer=tf.constant_initializer(0.1))
                conv = tf.nn.conv2d(H_expanded,
                                    W_conv,
                                    strides=[1, 1, 1, 1],
                                    padding="VALID")
                output_conv = tf.nn.relu(tf.nn.bias_add(conv, b_conv))
                # maxpooling layer
                maxpooled_lst = []  # max-pooled results for each example in the batch
                for k in range(batch_size_per_gpu):
                    sequence_conv = output_conv[k, :seq_lengths[k], 0, :]
                    maxpooled = tf.reduce_max(sequence_conv, axis=0)
                    maxpooled_lst.append(maxpooled)
                batch_maxpooled = tf.stack(maxpooled_lst, axis=0)
                pooled_outputs.append(batch_maxpooled)
        h_maxpool = tf.concat(pooled_outputs, axis=1)
        h_maxpool_dropout = tf.nn.dropout(h_maxpool, keep_prob=keep_prob)

    # output layer (fully connected)
    with tf.variable_scope("output", reuse=reuse):
        num_filters_total = num_filters * len(filter_sizes)
        W_out = tf.get_variable(
            "W_out",
            shape=[num_filters_total, num_classes],
            initializer=tf.contrib.layers.xavier_initializer())
        b_out = tf.get_variable("b_out",
                                shape=[num_classes],
                                initializer=tf.constant_initializer(0.1))
        logits = tf.nn.xw_plus_b(h_maxpool_dropout,
                                 weights=W_out,
                                 biases=b_out)
        scores = tf.nn.softmax(logits, axis=-1)
        politeness_scores = scores[:, 1]

    optimizer = tf.train.AdamOptimizer(0.001)

    if credit_assignment:
        # Note: currently the tf.gather has to appear after computing gradients,
        # otherwise tf.gradients returns None!
        credit_weights_lst = []
        for j in range(batch_size_per_gpu):
            #         embedding_grads = tf.convert_to_tensor(
            #             tf.gradients(politeness_scores[j], embedding)[0])

            [embedding_grads, _] = unzip_lst(
                optimizer.compute_gradients(politeness_scores[j],
                                            var_list=embeddings))

            # Display warning message if some element in embedding_grads is "None"
            for grad in embedding_grads:
                if grad is None:
                    print(
                        "Warning: one of the credit assignment embedding grads is None: ",
                        grad)

            embedding_grads_concat = tf.concat(embedding_grads, axis=0)
            gathered_embedding_grads = tf.gather(embedding_grads_concat,
                                                 inputs[j, :])
            normed_embedding_grads = tf.norm(gathered_embedding_grads, axis=1)
            credit_weights = softmax_with_mask(normed_embedding_grads,
                                               valid_mask[j, :])
            credit_weights_lst.append(credit_weights)

        stacked_credit_weights = tf.stack(credit_weights_lst, axis=0)
    else:
        stacked_credit_weights = tf.zeros_like(inputs)

    return (politeness_scores, stacked_credit_weights)
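
# --- Hypothetical helper sketches (assumptions): lstm(), get_valid_mask() and
# softmax_with_mask() are used above but not defined in this excerpt. Minimal
# versions consistent with those calls (and with the comment
# "unk_token == pad_token == 0") might look like this:

def lstm(input_size, hidden_size, keep_prob, reuse):
    # A single LSTM cell with output dropout; input_size is unused here and is
    # kept only to match the call signature above
    cell = tf.nn.rnn_cell.LSTMCell(hidden_size, reuse=reuse)
    return tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=keep_prob)

def get_valid_mask(inputs):
    # 1.0 for real tokens, 0.0 for pad/unk positions (both use id 0)
    return tf.cast(tf.not_equal(inputs, 0), tf.float32)

def softmax_with_mask(logits, mask, eps=1e-8):
    # Softmax restricted to the valid (unmasked) positions
    exp_logits = tf.exp(logits - tf.reduce_max(logits)) * mask
    return exp_logits / (tf.reduce_sum(exp_logits) + eps)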
Example #4
if infer_only:
    [source_test,
     target_test] = zip_remove_duplicates_unzip([source_test, target_test])
else:
    [source_train,
     target_train] = zip_remove_duplicates_unzip([source_train, target_train])

print(len(source_train))

# Also eliminate empty sources/targets, as well as nested lists (lists of lists)
no_empty = [
    [source, target] for (source, target) in zip(source_train, target_train)
    if (source != [] and target != [] and not isinstance(source[0], list)
        and not isinstance(target[0], list))
]
[source_train, target_train] = unzip_lst(no_empty)

print(len(source_train))

if pretrain:
    source_threshold = 32
else:
    source_threshold = 32 * 2

zipped_lst = [(source, target)
              for (source, target) in zip(source_train, target_train)
              if (len(source) <= source_threshold and len(source) >= 5
                  and len(target) <= 32)]
#         and source.count(0) == 0
#         and target.count(0) == 0
print(len(zipped_lst))
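
# --- Hypothetical helper sketches (assumed semantics of zip_lsts/unzip_lst,
# based on how they are called throughout these listings) ---

def zip_lsts(lsts):
    # [[a1, a2, ...], [b1, b2, ...]] -> [(a1, b1), (a2, b2), ...]
    return list(zip(*lsts))

def unzip_lst(zipped):
    # [(a1, b1), (a2, b2), ...] -> [[a1, a2, ...], [b1, b2, ...]]
    return [list(lst) for lst in zip(*zipped)]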
Example #5
def run_vhred(model, sess, mode, epoch):
    training_flag = (mode == "train")
    norm_dialogues = norm_data_dict[mode]
    adv_dialogues = adv_data_dict[mode]

    generator = DataGenerator(norm_dialogues,
                              adv_dialogues=adv_dialogues,
                              feed_both_examples=feed_both_examples,
                              is_training=training_flag,
                              batch_size=batch_size,
                              max_dialogue_length=max_dialogue_length)
    batch = generator.batch_generator()
    print("Initialized data generator.")

    responses = []
    total_loss = 0.0
    adv_total_loss = 0.0
    total_num_tokens = 0.0
    batch_counter = 0
    if mode != "train":
        source_lst = []
        target_lst = []
        dialogue_indices_lst = []
        start_turn_indices_lst = []

    if use_max_margin:
        avg_margin = 0.0

    while True:
        next_batch = next(batch)
        if next_batch is None:
            break

        # if it's os, we always set start_turn_indices to 0
        (dialogue_indices, start_turn_indices, examples,
         turn_lengths_lst) = next_batch

        feed_dict_seqs = {
            model.dialogue: examples,
            model.turn_length: turn_lengths_lst,
            model.start_turn_index: start_turn_indices,
            model.start_tokens: [start_token] * batch_size
        }

        if mode == "train":
            if use_max_margin:
                fetches = [
                    model.batch_total_loss,
                    model.batch_num_tokens,
                    model.apply_gradients_op,
                    model.avg_margin_loss,  # testing
                    model.global_step
                ]
            else:
                fetches = [
                    model.batch_total_loss, model.batch_num_tokens,
                    model.apply_gradients_op, model.global_step
                ]
            feed_dict = {
                model.keep_prob: 1 - dropout_rate,
                model.is_training: training_flag
            }

            result = sess.run(fetches,
                              feed_dict={
                                  **feed_dict_seqs,
                                  **feed_dict
                              })

            if use_max_margin:
                avg_margin = (avg_margin * batch_counter +
                              result[-2]) / (batch_counter + 1)
                print(
                    "Avg margin (should keep decreasing, i.e. growing in absolute value, over time):",
                    avg_margin)

            if feed_both_examples:
                (loss, adv_loss) = result[0]
            else:
                loss = result[0]

            average_log_perplexity = loss / result[1]
            total_loss += loss
            total_num_tokens += result[1]
            print("Epoch (%s) %d, Batch %d, Global step %d:" %
                  (mode, epoch, batch_counter, result[-1]))
            print("Perplexity: %.2f" % exp(average_log_perplexity))
            print("Perplexity so far:", exp(total_loss / total_num_tokens))

            if feed_both_examples:
                adv_average_log_perplexity = adv_loss / result[1]
                adv_total_loss += adv_loss
                print("Adv-perplexity: %.2f" % exp(adv_average_log_perplexity))
                print("Adv-perplexity so far:",
                      exp(adv_total_loss / total_num_tokens))
        else:
            (source, target) = get_source_and_target(examples)
            source_lst.extend(source)
            target_lst.extend(target)

            dialogue_indices_lst.extend(dialogue_indices)
            start_turn_indices_lst.extend(start_turn_indices)

            feed_dict = {
                model.keep_prob: 1.0,
                model.is_training: training_flag
            }
            (ids, lengths) = sess.run(
                [model.batch_sample_ids_beam, model.batch_final_lengths_beam],
                feed_dict={
                    **feed_dict_seqs,
                    **feed_dict
                })

            batch_responses = [
                response[:length]
                for (response, length) in zip(ids.tolist(), lengths.tolist())
            ]
            responses.extend(batch_responses)
            print("Finished testing batch %d" % batch_counter)

        batch_counter += 1

    if mode == "train":
        epoch_perplexity = total_loss / total_num_tokens
        print("Epoch (%s) %d average perplexity: %.2f" %
              (mode, epoch, exp(epoch_perplexity)))

        if force_store_point == "":
            store_ckpt = os.path.join(ckpt_path, f"{model_extra_str}_{epoch}")
        else:
            store_ckpt = force_store_point
        saver_seq2seq.save(sess, store_ckpt)
        print(f"Checkpoint saved for epoch {epoch}.")
    else:
        zipped = zip_lsts([
            dialogue_indices_lst, start_turn_indices_lst, source_lst,
            target_lst, responses
        ])
        zipped.sort(key=lambda x: x[:2])  # sort by dialogue index & start turn index
        zipped_responses = zip_lsts(unzip_lst(zipped)[2:])
        return zipped_responses
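
# --- Usage sketch (assumptions: `model`, `sess` and the surrounding config
# globals are already set up; `num_epochs` and `start_epoch` are hypothetical
# names) ---
#
#   for epoch in range(start_epoch, start_epoch + num_epochs):
#       run_vhred(model, sess, "train", epoch)
#       zipped_responses = run_vhred(model, sess, "valid", epoch)
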
def run_seq2seq(sess, mode, epoch, feed_score=1.0):
    """see if we need to append end_token"""    
    is_training = (mode == "train")
    
    if is_training:
        (source_lst, target_lst, score_lst) = unzip_lst(LFT_examples)        
    else:
        (source_lst, target_lst) = data_dict[mode]
        score_lst = [feed_score] * len(source_lst)
    
#     source_lst = source_lst[:batch_size * 2]
#     target_lst = target_lst[:batch_size * 2]
#     score_lst = score_lst[:batch_size * 2]
    
    num_examples = len(source_lst)
    assert num_examples >= batch_size
    num_batches = num_examples // batch_size
    
    keep_prob = (1 - dropout_rate) if is_training else 1.0
    start_tokens = [start_token] * batch_size
    
    total_loss = 0.0
    num_tokens = 0
    zipped_lst = []
    for i in range(num_batches):
        start = i * batch_size
        end = start + batch_size
        
        sources = source_lst[start:end]
        source_lengths = list(map(len, sources))
        targets = target_lst[start:end]
        target_lengths = list(map(len, targets))
        
        scores = score_lst[start:end]
        
        feed_dict = {
            model.source: pad(sources, source_lengths),
            model.source_length: source_lengths,
            model.target: pad(targets, target_lengths),
            model.target_length: target_lengths,
            model.start_tokens: start_tokens,
            model.keep_prob: keep_prob,
            model.is_training: is_training,
            model.score: scores}
        
        if is_training:
            fetches = [model.batch_total_loss, model.batch_num_tokens, model.apply_gradients_op]
        else:
            fetches = [model.batch_sample_ids_beam, model.batch_final_lengths_beam]
        
        result = sess.run(fetches, feed_dict=feed_dict)
        
        if is_training:
            total_loss += result[0]
            num_tokens += result[1]
            print("Epoch (%s) %d Batch %d perplexity: %.2f" % 
                  (mode, epoch, i, exp(result[0] / result[1])))
            print("Perplexity so far:", exp(total_loss / num_tokens))
        else:
            print("Finished testing batch %d" % i)
            responses = [response[:length] 
                         for (response, length) 
                         in zip(result[0].tolist(), result[1].tolist())]
            zipped = zip_lsts([sources, targets, responses])
            zipped_lst.extend(zipped)
                    
    if is_training:
        print("Epoch (%s) %d average perplexity: %.2f" % 
              (mode, epoch, exp(total_loss / num_tokens)))
        if not get_PPL:
            saver_seq2seq.save(sess, "%sseq2seq_RL%s_%d" % (ckpt_path, extra_str, epoch))
            print("Checkpoint saved for epoch %d." % epoch)
                    
    return zipped_lst
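
# --- Hypothetical helper sketch (an assumption): pad() right-pads every
# sequence in the batch to the longest length, using pad id 0 as in the
# classifier listing above.

def pad(sequences, lengths, pad_token=0):
    max_length = max(lengths)
    return [seq + [pad_token] * (max_length - len(seq)) for seq in sequences]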
def zip_remove_duplicates_unzip(lsts):
    zipped = zip_lsts(lsts)
    zipped_without_duplicates = remove_duplicates(zipped)    
    unzipped = unzip_lst(zipped_without_duplicates)
    return unzipped
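
# --- Usage sketch plus an assumed remove_duplicates() (not from the original
# source). zip_remove_duplicates_unzip keeps parallel lists aligned while
# dropping repeated pairs, e.g.:
#
#   [source_train, target_train] = zip_remove_duplicates_unzip(
#       [source_train, target_train])
#
# A minimal remove_duplicates() consistent with this usage (assumption:
# order-preserving, with inner lists converted to tuples so items are hashable):

def remove_duplicates(items):
    seen = set()
    result = []
    for item in items:
        key = tuple(tuple(x) if isinstance(x, list) else x for x in item)
        if key not in seen:
            seen.add(key)
            result.append(item)
    return result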
            mode = "valid"
            score_range = [1.0]
            zipped = run_seq2seq(sess, mode, i + start_epoch, feed_score=score_range[0])
            
        if infer_only and not get_PPL and (i + start_epoch - 1) % 5 == 0: # for getting perplexity of test data, use train branch
            print("Inferring on test set...")
            mode = "test"

            responses_lst = []
            source_lst = []
            target_lst = []
            score_range = list(np.arange(0.0, 1.1, 0.5))
            for score in score_range:
                zipped_responses = run_seq2seq(
                    sess, mode, i + start_epoch, feed_score=score)
                (source_lst, target_lst, responses) = unzip_lst(zipped_responses)
                responses_lst.append(responses)
            num_responses = len(responses_lst[0])    

            zipped = zip_lsts([source_lst, target_lst] + responses_lst)
        
        flattened = [decode2string(index2token, sent, end_token=end_token, remove_END_TOKEN=True) 
                     for tp in zipped for sent in tp]

        # now we mark sentences that are generated by our model
        num_lines = len(score_range) + 2
        marked_G = [("G: " + sent)
                    if k % num_lines == 1 else sent
                    for (k, sent) in enumerate(flattened)]

        marked_M = [("M: " + sent) 

# Fragment: data loading and vocabulary setup (from a separate part of the script)
shared_vocab_size_politeness = len(shared_vocab_politeness)
shared_vocab_size_movie = len(shared_vocab_movie)
source_train = data[5] + data[7]
target_train = data[6] + data[8]
[source_train,
 target_train] = zip_remove_duplicates_unzip([source_train, target_train])
assert len(source_train) == len(target_train)
source_test = data[9]
target_test = data[10]
assert len(source_test) == len(target_test)
embedding_word2vec_politeness = data[11]
embedding_word2vec_movie = data[12]

# Load all the polite utterances
polite_lst = remove_duplicates(
    unzip_lst(load_pickle("../data/polite_movie_target.pkl"))[0])
print("Loaded %d polite examples!" % len(polite_lst))

special_tokens = ["UNK_TOKEN", "START_TOKEN", "END_TOKEN"]
vocab_size = len(vocab)

# Index vocabulary
index2token = {i: token for (i, token) in enumerate(vocab)}
token2index = {token: i for (i, token) in enumerate(vocab)}

[unk_token, start_token,
 end_token] = [token2index[token] for token in special_tokens]
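
# --- Usage sketch (an assumption: decode2string(), used earlier in this
# listing, maps a list of token ids back to a whitespace-joined string and
# strips the END_TOKEN) ---
#
#   ids = [token2index.get(tok, unk_token) for tok in ["hello", "world"]] + [end_token]
#   text = decode2string(index2token, ids, end_token=end_token, remove_END_TOKEN=True)
#   # text -> "hello world"   (assuming both words are in the vocabulary)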