Example 1
def main():
    usage = "%prog persona_dir"
    parser = OptionParser(usage=usage)
    #parser.add_option('-a', dest='alpha', default=0.1,
    #                  help='smoothing parameter for multinomial distribution: default=%default')
    parser.add_option('-b', dest='beta', default=0.000001,
                      help='Regularization strength: default=%default')
    parser.add_option('-d', dest='hidden_dim', default=50,
                      help='Hidden node dimension: default=%default')
    parser.add_option('--edge_dim', dest='edge_dim', default=3,
                      help='Edge vector dimension: default=%default')
    parser.add_option('--pos_dim', dest='pos_dim', default=3,
                      help='POS vector dimension: default=%default')
    parser.add_option('-e', dest='epochs', default=20,
                      help='Number of epochs: default=%default')
    parser.add_option('-i', dest='iter_display', default=4000,
                      help='Number of iterations between output: default=%default')
    parser.add_option('-o', dest='optimization', default='adagrad',
                      help='Optimization method [sgd|sgdm|adagrad]: default=%default')
    parser.add_option('-l', dest='learning_rate', default=0.2,
                      help='Initial learning rate: default=%default')
    parser.add_option('--emb_lr', dest='emb_lr', default=0.1,
                      help='Learning rate for embeddings (not for sgd): default=%default')
    parser.add_option('--decay', dest='decay', default=1.00,
                      help='Learning rate decay: default=%default')
    parser.add_option('--momentum', dest='momentum', default=0.5,
                      help='Momentum parameter (sgdm only): default=%default')
    parser.add_option('-s', dest='seed', default=42,
                      help='Random seed: default=%default')
    parser.add_option('--glove_file', dest='glove_file', default='',
                      help='Location of glove file: default=do not load')
    parser.add_option('--word2vec_file', dest='word2vec_file', default='',
                      help='Location of word2vec file: default=do not load')
    parser.add_option('--n_dev', dest='n_dev', default=5000,
                      help='Number of entities to hold out as a dev set: default=%default')
    parser.add_option('--no_eval', action="store_true", dest="no_eval", default=False,
                      help='Skip the evaluation between epochs: default=%default')
    parser.add_option('--min_df', dest='min_df', default=2,
                      help='Minimum document frequency for input and output vocabs: default=%default')
    parser.add_option('--latent_dim', dest='latent_dim', default=100,
                      help='Dimension of the latent vector: default=%default')


    (options, args) = parser.parse_args()
    assert len(args) > 0
    persona_dir = args[0]

    seed = int(options.seed)
    n_epochs = int(options.epochs)
    #alpha = float(options.alpha)
    beta = float(options.beta)
    lr = float(options.learning_rate)
    emb_lr = float(options.emb_lr)
    iter_display = int(options.iter_display)
    opti_method = options.optimization
    lr_decay = float(options.decay)
    momentum = float(options.momentum)
    glove_file = options.glove_file
    word2vec_file = options.word2vec_file
    no_eval = options.no_eval
    n_dev = int(options.n_dev)
    min_df = int(options.min_df)

    if seed > 0:
        np.random.seed(seed)
        random.seed(seed)

    dh = int(options.hidden_dim)
    de = int(options.edge_dim)
    dt = int(options.pos_dim)
    dx = 300
    dz = int(options.latent_dim)

    np.__config__.show()

    # load input data
    input_filename = os.path.join(persona_dir, 'rnn_data.json')
    with codecs.open(input_filename, 'r', encoding='utf-8') as input_file:
        orig_data = json.load(input_file)

    n_entities = len(orig_data)
    print "Loaded", n_entities, "entities"

    tuple_type_set = set()
    data = []
    mention_id = 0
    n_tuples = 0
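    # each record in orig_data is [doc_id, entity_id, appearances]; each appearance is a list of
    # tuples (tuple_id, role_type, word, arc, head, pos, head_phrase), and the words from all
    # appearances are concatenated into one sequence per entity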
    for d_i, d in enumerate(orig_data):
        words = []
        doc_id = d[0]
        entity_id = d[1]
        appearances = d[2]
        for tuples in appearances:
            n_tuples += len(tuples)
            for t in tuples:
                tuple_id = t[0]
                #orig_index = t[1]
                role_type = t[1]
                word = t[2]
                arc = t[3]
                head = t[4]
                pos = t[5]
                head_phrase = t[6]

                words.append(word)
            #words.append(tuples[0][5])
        words = [START] + words + [END]
        item = {'doc_id': doc_id, 'entity_id': entity_id, 'words': words}
        data.append(item)


    print len(data), "entities"
    print n_tuples, "tuples"

    print np.bincount([len(d['words']) for d in data])

    print "Building vocabulary"
    #attribute_counts = Counter()
    #agent_counts = Counter()
    #patient_counts = Counter()
    word_counts = Counter()
    for d in data:
        #attribute_counts.update(d['attributes'])
        #agent_counts.update(d['agent_roles'])
        #patient_counts.update(d['patient_roles'])
        word_counts.update(d['words'])

    n_entities = len(data)
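    # hold out a dev set of entities; note that np.random.choice samples with replacement here,
    # so the dev set may contain fewer than n_dev unique entities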
    train_set = set(range(n_entities))
    dev_set = set(np.random.choice(n_entities, n_dev).tolist())
    train_set = train_set - dev_set

    print "train:", len(train_set), " dev", len(dev_set)

    # pad sentences with start and end tokens, and adjust target index accordingly
    train_data = [data[i] for i in train_set]
    dev_data = [data[i] for i in dev_set]

    #attribute_counts.update([UNK])
    #agent_counts.update([UNK])
    #patient_counts.update([UNK])
    #attribute_vocab = [w for w, c in attribute_counts]
    #agent_vocab = [w for w, c in agent_counts]
    #patient_vocab = [w for w, c in patient_counts]
    word_counts.update([UNK])
    vocab = [w for w, c in word_counts.items()]

    #print "Attribute vocab size =", len(attribute_vocab)
    #print "Agent vocab size =", len(agent_vocab)
    #print "Patient vocab size =", len(patient_vocab)
    #attribute_vocab.sort()
    #agent_vocab.sort()
    #patient_vocab.sort()
    vocab_size = len(vocab)
    print "full vocab size =", vocab_size
    vocab.sort()

    #attribute_vocab_index = dict(zip(attribute_vocab, range(len(attribute_vocab))))
    #agent_vocab_index = dict(zip(agent_vocab, range(len(agent_vocab))))
    #patient_vocab_index = dict(zip(patient_vocab, range(len(patient_vocab))))
    vocab_index = dict(zip(vocab, range(vocab_size)))

    print "Indexing words"
    for t in data:
        t['idxs'] = [vocab_index[w] if w in vocab_index else vocab_index[UNK] for w in t['words']]

    initial_embeddings = None
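    # optionally initialize the embedding matrix from pre-trained GloVe or word2vec vectors;
    # words without a pre-trained vector get small uniform random values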
    if glove_file != '':
        initial_embeddings = np.zeros([vocab_size, dx], dtype=np.float32)
        print "Loading glove vectors"
        glove_vocab, glove_embeddings = load_vectors.load_glove_vectors(glove_file, vocab)
        glove_index = dict(zip(glove_vocab, range(len(glove_vocab))))

        for w_i, w in enumerate(vocab):
            if w in glove_index:
                initial_embeddings[w_i, :] = glove_embeddings[glove_index[w], :]
            else:
                initial_embeddings[w_i, :] = 0.05 * np.random.uniform(-1.0, 1.0, (1, dx))

        print len(list(set(vocab) - set(glove_vocab))), "words in training vocabulary with no glove vector"
        #if not no_eval:
        #    print len(list(dev_vocab - set(glove_vocab))), "words in dev vocabulary with no glove vector"
        #    print len(list(test_vocab - set(glove_vocab))), "words in test vocabulary with no glove vector"

    elif word2vec_file != '':
        # load pre-trained word vectors
        print "Loading pre-trained word vectors"
        vectors = gensim.models.Word2Vec.load_word2vec_format(word2vec_file, binary=True)

        word2vec_vocab = set()

        print "Preparing word vectors"
        initial_embeddings = np.zeros([vocab_size, dx], dtype=np.float32)

        for v_i, v in enumerate(vocab):
            if v in vectors:
                initial_embeddings[v_i, :] = vectors[v]
                word2vec_vocab.add(v)
            else:
                initial_embeddings[v_i, :] = 0.05 * np.random.uniform(-1.0, 1.0, (1, dx))

        print len(list(set(vocab) - word2vec_vocab)), "words in training vocabulary with no word2vec vector"
        #if not no_eval:
        #    print len(list(dev_vocab - word2vec_vocab)), "words in dev vocabulary with no word2vec vector"
        #    print len(list(test_vocab - word2vec_vocab)), "words in test vocabulary with no word2vec vector"

    # create the LSTM
    theano_seed = np.random.randint(2 ** 30)

    #tnn = TreeRNN(word_vocab_size, edge_vocab_size, pos_vocab_size, dh, dx, de, dt, beta=beta,
    #             initial_embeddings=initial_embeddings, initial_edge_embeddings=initial_edge_embeddings,
    #             update=opti_method, momentum=momentum, seed=theano_seed)  # create generator

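    # LinearAutoencoder encodes each word sequence into a dz-dimensional latent code and
    # reconstructs it; train() also returns mu, log_sigma and a KL term, which suggests a
    # variational (VAE-style) objective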
    LAE = LinearAutoencoder(vocab_size, dh, dx, dz, beta=beta, initial_embeddings=initial_embeddings,
                            update=opti_method, momentum=momentum, seed=theano_seed)  # create TreeAutoencoder


    best_dev_log_loss = 1e6
    print "Training"
    for epoch in range(n_epochs):
        sum_log_loss = 0
        sum_loss = 0
        mistakes = 0
        pos_mistakes = 0
        n_items = 0
        keys = range(len(train_data))
        random.shuffle(keys)

        for k_i, k in enumerate(keys):
            d = train_data[k]

            word_idxs = d['idxs']
            seq_length = len(word_idxs)
            n_words = seq_length - 2
            n_items += n_words

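            # treat each entity as a batch of size 1: a (seq_length, 1) column of word indices
            # with an all-ones mask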
            idxs_array = np.array(np.reshape(word_idxs, (seq_length, 1)), dtype=np.int32)
            mask = np.ones((seq_length, 1), dtype=np.int32)

            pred_y, p_y_given_x, log_loss, loss, mu, log_sigma, KLD = LAE.train(idxs_array, mask)

            sum_log_loss += log_loss
            sum_loss += loss
            mistakes += np.sum(pred_y != idxs_array[1:n_words+1, :])

            if k_i % iter_display == 0 and k_i > 0:
                #print [word_vocab[w] for w in word_idxs]
                #   print np.max(h_p), np.mean(h_p), np.array(h_p)
                denom = float(n_items)
                #print d['word'], d['edge'], d['output'], output_vocab[true_y], output_vocab[pred_y], p_y_given_x[true_y], p_y_given_x[pred_y]
                print '%d\t%d\t%.4f\t%.4f\t%.4f\t%.6f' % \
                      (epoch, k_i, sum_log_loss/denom, sum_loss/denom, mistakes/denom, KLD), ' '.join([vocab[i] for i in word_idxs]), ' '.join([vocab[i] for i in pred_y[:, 0]])

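                # compare the reconstruction of the full input with "blind" decoding, which
                # appears to generate from the latent code seeded only with the START token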
                start_idxs = vocab_index[START] * np.ones((1,), dtype=np.int32)
                pred_y, pred_y_blind = LAE.predict_both(idxs_array, mask, start_idxs)
                print ' '.join([vocab[i] for i in pred_y[:, 0]])
                print ' '.join([vocab[i] for i in pred_y_blind[:, 0]])

                #print mu
                #print log_sigma
                #KLD = T.mean(0.5 * T.sum(1 + log_sigma - mu**2 - T.exp(log_sigma), axis=1))
                #print 0.5 * np.sum(1 + log_sigma - mu ** 2 - np.exp(log_sigma), axis=1)

            if k_i % 50000 == 0 and k_i > 0:
                if not no_eval:
                    zo_loss, dev_log_loss_words = autoencoder_evaluate(dev_data, LAE, vocab, print_examples=True)
                    print "Dev evaluation at", k_i, ":", zo_loss, dev_log_loss_words
                    dev_log_loss = dev_log_loss_words
                    if best_dev_log_loss > dev_log_loss:
                        best_dev_log_loss = dev_log_loss
                        print "New best dev log loss =", dev_log_loss
                        #print "Saving vectors"
                        #output_basename = os.path.join(persona_dir, 'vectors_auto')
                        #save_vectors(data, LAE, n_tuples, dh, output_basename=output_basename)
                        #phi = np.array(tae.get_phi())[0]
                        #save_phi(phi, edge_vocab, word_vocab, output_basename)



        if not no_eval:
            zo_loss, dev_log_loss_words = autoencoder_evaluate(dev_data, LAE, vocab, print_examples=True)
            print "Dev evaluation after epoch", epoch, ":", zo_loss, dev_log_loss_words
            dev_log_loss = dev_log_loss_words
            if best_dev_log_loss > dev_log_loss:
                best_dev_log_loss = dev_log_loss
                print "New best dev log loss =", dev_log_loss
                output_basename = os.path.join(persona_dir, 'vectors_auto_' + str(epoch))
                #print "Saving vectors to", output_basename
                #save_vectors(data, LAE, n_tuples, dh, output_basename=output_basename)
                #phi = np.array(tae.get_phi())[0]
                #save_phi(phi, edge_vocab, word_vocab, output_basename)

        lr *= lr_decay
Example 2
def main():
    usage = "%prog input_filename"
    parser = OptionParser(usage=usage)
    parser.add_option('-a', dest='alpha', default=0.000001,
                      help='Regularization strength: default=%default')
    parser.add_option('-d', dest='hidden_dim', default=100,
                      help='Hidden node dimension: default=%default')
    parser.add_option('-e', dest='epochs', default=20,
                      help='Number of epochs: default=%default')
    parser.add_option('-i', dest='iter_display', default=4000,
                      help='Number of iterations between output: default=%default')
    parser.add_option('-o', dest='optimization', default='adagrad',
                      help='Optimization method [sgd|sgdm|adagrad]: default=%default')
    parser.add_option('-l', dest='learning_rate', default=0.05,
                      help='Initial learning rate: default=%default')
    parser.add_option('--emb_lr', dest='emb_lr', default=0.01,
                      help='Learning rate for embeddings (not for sgd): default=%default')
    parser.add_option('--decay', dest='decay', default=1.00,
                      help='Learning rate decay: default=%default')
    parser.add_option('--momentum', dest='momentum', default=0.5,
                      help='Momentum parameter (sgdm only): default=%default')
    parser.add_option('-s', dest='seed', default=42,
                      help='Random seed: default=%default')
    parser.add_option('--glove_file', dest='glove_file', default='',
                      help='Location of glove file: default=do not load')
    parser.add_option('--word2vec_file', dest='word2vec_file', default='',
                      help='Location of word2vec file: default=do not load')
    parser.add_option('--n_dev', dest='n_dev', default=4000,
                      help='Number of sentences to use as a dev set: default=%default')
    parser.add_option('--min_wf', dest='min_wf', default=5,
                      help='Exclude words that occur less than this number of times: default=%default')
    parser.add_option('--no_eval', action="store_true", dest="no_eval", default=False,
                      help='Skip the evaluation between epochs: default=%default')
    #parser.add_option('--drop_x', action="store_true", dest="drop_x", default=False,
    #                  help='Add dropout to the input layer: default=%default')


    (options, args) = parser.parse_args()
    input_filename = args[0]

    #lr = 0.002  # 0.05 / batch_size=25
    #alpha = 0.000002  # 10^-4 / batch_size=25 / 2.0

    load_all_word_vectors = False

    seed = int(options.seed)
    n_epochs = int(options.epochs)
    alpha = float(options.alpha)
    lr = float(options.learning_rate)
    emb_lr = float(options.emb_lr)
    iter_display = int(options.iter_display)
    opti_method = options.optimization
    lr_decay = float(options.decay)
    momentum = float(options.momentum)
    glove_file = options.glove_file
    word2vec_file = options.word2vec_file
    no_eval = options.no_eval
    n_dev = int(options.n_dev)
    min_wf = int(options.min_wf)
    #drop_x = int(options.drop_x)

    if seed > 0:
        np.random.seed(seed)
        random.seed(seed)

    dh = int(options.hidden_dim)
    dx = 300

    np.__config__.show()
    print THEANO_FLAGS

    # load sentences
    with codecs.open(input_filename, 'r', encoding='utf-8') as input_file:
        data = json.load(input_file)

    n_sentences = len(data)
    print "Loaded", n_sentences, "sentences"
    train_set = set(range(n_sentences))
    dev_set = set(np.random.choice(n_sentences, n_dev).tolist())
    train_set = train_set - dev_set

    # pad sentences with start and end tokens, and adjust target index accordingly
    train_data = [{'target': data[i][0] + 1, 'words': ['__START__'] + data[i][1].split() + ['__END__']} for i in train_set]
    dev_data = [{'target': data[i][0] + 1, 'words': ['__START__'] + data[i][1].split() + ['__END__']} for i in dev_set]

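    # each item's 'target' is the index of the target word within the sentence (shifted by +1
    # for the __START__ pad); target_vocab counts which words appear as targets in training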
    print "Building vocabulary"
    vocab = Counter()
    target_vocab = Counter()
    for d in train_data:
        vocab.update(d['words'])
        target_vocab.update([d['words'][d['target']]])

    print "Filtering vocabulary"
    vocab = [v for v, c in vocab.items() if c >= min_wf]
    vocab.append('__UNK__')
    target_vocab = [v for v, c in target_vocab.items() if c > 1]
    target_vocab.append('__UNK__')

    vocab = list(vocab)
    vocab.sort()
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))
    print "Size of full vocab =", vocab_size
    target_vocab = list(target_vocab)
    target_vocab.sort()
    target_vocab_size = len(target_vocab)
    target_vocab_index = dict(zip(target_vocab, range(target_vocab_size)))

    print "Size of target vocab =", target_vocab_size
    nc = target_vocab_size

    if glove_file != '':
        initial_embeddings = np.zeros([vocab_size, dx], dtype=np.float32)
        print "Loading glove vectors"
        glove_vocab, glove_embeddings = load_vectors.load_glove_vectors(glove_file, vocab)
        glove_index = dict(zip(glove_vocab, range(len(glove_vocab))))

        for w_i, w in enumerate(vocab):
            if w in glove_index:
                initial_embeddings[w_i, :] = glove_embeddings[glove_index[w], :]
            else:
                initial_embeddings[w_i, :] = 0.05 * np.random.uniform(-1.0, 1.0, (1, dx))

        print len(list(set(vocab) - set(glove_vocab))), "words in training vocabulary with no glove vector"
        #if not no_eval:
        #    print len(list(dev_vocab - set(glove_vocab))), "words in dev vocabulary with no glove vector"
        #    print len(list(test_vocab - set(glove_vocab))), "words in test vocabulary with no glove vector"

    elif word2vec_file != '':
        # load pre-trained word vectors
        print "Loading pre-trained word vectors"
        vectors = gensim.models.Word2Vec.load_word2vec_format(word2vec_file, binary=True)

        word2vec_vocab = set()

        print "Preparing word vectors"
        initial_embeddings = np.zeros([vocab_size, dx], dtype=np.float32)

        for v, i in vocab_index.items():
            if v == '_':
                initial_embeddings[i, :] = np.zeros(dx)
            elif v in vectors:
                initial_embeddings[i, :] = vectors[v]
                word2vec_vocab.add(v)
            else:
                initial_embeddings[i, :] = 0.05 * np.random.uniform(-1.0, 1.0, (1, dx))

        print len(list(set(vocab) - word2vec_vocab)), "words in training vocabulary with no word2vec vector"
        #if not no_eval:
        #    print len(list(dev_vocab - word2vec_vocab)), "words in dev vocabulary with no word2vec vector"
        #    print len(list(test_vocab - word2vec_vocab)), "words in test vocabulary with no word2vec vector"

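    # note: this block overwrites any embeddings loaded above with a small 2-dimensional debug
    # array, apparently matching the literal 2 passed to ConstituencyTreeLSTM below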
    initial_embeddings = np.zeros([vocab_size, 2])
    for i in range(40):
        initial_embeddings[i, :] = i


    print "Indexing words"
    for t in train_data:
        t['idxs'] = [vocab_index[w] if w in vocab_index else vocab_index['__UNK__'] for w in t['words']]
        target_word = t['words'][t['target']]
        if target_word in target_vocab_index:
            t['target_idx'] = target_vocab_index[target_word]
        else:
            t['target_idx'] = target_vocab_index['__UNK__']
    for t in dev_data:
        t['idxs'] = [vocab_index[w] if w in vocab_index else vocab_index['__UNK__'] for w in t['words']]
        target_word = t['words'][t['target']]
        if target_word in target_vocab_index:
            t['target_idx'] = target_vocab_index[target_word]
        else:
            t['target_idx'] = target_vocab_index['__UNK__']

    # create the LSTM
    theano_seed = np.random.randint(2 ** 30)
    ctreeLSTM = ConstituencyTreeLSTM(vocab_size, dh, 2, nc, initial_embeddings=initial_embeddings, alpha=alpha,
                                     update=opti_method, seed=theano_seed, momentum=momentum)

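    # record (length, index) pairs so the first epoch can process sentences shortest-first;
    # later epochs shuffle the training data instead (see the epoch loop below)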
    sent_lengths = [(len(t['words']), key) for key, t in enumerate(train_data)]
    sent_lengths.sort()

    if not no_eval:
        print "Pre-training evaluation"
        #train_z_o_loss, train_log_loss = evaluate(dev_data, ctreeLSTM, vocab_index, drop_x)
        print "Dev evaluation:", evaluate(dev_data, ctreeLSTM, vocab_index)

        #test_z_o_loss, test_log_loss = evaluate(test_root_trees, ctreeLSTM, vocab_index, drop_x)
        #print ('epoch=%d\ttrain_0/1=%.3f\ttrain_log=%.3f\tdev_0/1=%.3f\tdev_log=%.3f\ttest_0/1=%.3f\ttest_log=%.3f') % \
        #      (-1, train_z_o_loss, train_log_loss, valid_z_o_loss, valid_log_loss, test_z_o_loss, test_log_loss)

    print "Training"
    for epoch in range(n_epochs):
        sum_log_loss = 0
        sum_loss = 0
        mistakes = 0
        pred0 = 0
        pred1 = 0
        if epoch == 0:
            print "sorting trees"
            keys = [key for length, key in sent_lengths]
        else:
            keys = range(len(train_data))
            random.shuffle(keys)
        print "epoch\titems\tloss\tl+reg\terrs\tpredict 1"
        for k_i, t_i in enumerate(keys):
            t = train_data[t_i]
            words = t['words']
            idxs = t['idxs']
            target = t['target']
            value = t['target_idx']
            #idxs = [vocab_index[w] for w in words]


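            # the prints and sys.exit() below look like temporary debugging code;
            # as written, training stops after the first example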
            counter = np.array(np.arange(0, len(idxs)), dtype=np.int32)
            print counter
            #pred_y, p_y_given_x, shape = ctreeLSTM.predict_prob(idxs, left_mask, right_mask, counter)

            pred_y, p_y_given_x, log_loss, loss, c = ctreeLSTM.train(counter, target, value, lr, emb_lr, 1)
            c = np.array(c)
            print c
            sys.exit()
            sum_log_loss += log_loss
            sum_loss += loss
            if pred_y != value:
                mistakes += 1
            if k_i % iter_display == 0:
                d = float(k_i+1)
                print ' '.join(words[1:-1]), ':', target_vocab[value], target_vocab[pred_y], p_y_given_x[value], p_y_given_x[pred_y]
                print '%d\t%d\t%.4f\t%.4f\t%.4f' % \
                      (epoch, k_i, sum_log_loss/d, sum_loss/d, mistakes/d)

        if not no_eval:
            #train_z_o_loss, train_log_loss = evaluate(train_root_trees, ctreeLSTM, vocab_index, drop_x)
            print "Dev evaluation:", evaluate(dev_data, ctreeLSTM, vocab_index)
            print "Saving results"
            output_filename = 'vectors' + str(epoch) + '.json'
            print "Train evaluation:", evaluate(train_data, ctreeLSTM, vocab_index, save_vectors=True, output_filename=output_filename)
            #test_z_o_loss, test_log_loss = evaluate(test_root_trees, ctreeLSTM, vocab_index, drop_x)
            #print ('epoch=%d\ttrain_0/1=%.3f\ttrain_log=%.3f\tdev_0/1=%.3f\tdev_log=%.3f\ttest_0/1=%.3f\ttest_log=%.3f') % \
            #      (epoch, train_z_o_loss, train_log_loss, valid_z_o_loss, valid_log_loss, test_z_o_loss, test_log_loss)

        lr *= lr_decay