# Module-level imports assumed by the function excerpts below; create(),
# convertTweetsToVec(), readTweetsOfficial() and readToks2() are project-local
# helpers, and tokenize_tweets is a project-local module.
import random

import numpy as np
import tensorflow as tf

import tokenize_tweets


def deep_test():
    sess = tf.Session()

    start_dim = 50000

    x = tf.placeholder("float", [None, start_dim])
    autoencoder = create(
        x, [500]
    )  # Dimensionality of the hidden layers. To start with, only use 1 hidden layer.

    tokens, vects, norm_tweets = convertTweetsToVec('all', start_dim)
    tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
        tokenize_tweets.FILEDEV, 'windows-1252', 2)
    vects_dev, norm_tweets_dev = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, tweets_dev)
    devbatch = []
    for v in vects_dev:
        devbatch.append(v)

    # Add ops to save and restore all the variables.
    saver = tf.train.Saver()

    # Restore variables from disk.
    saver.restore(sess, "model.ckpt")
    print("Model restored.")

    decoded = sess.run(autoencoder['decoded'],
                       feed_dict={x: devbatch})  # apply to dev
    encoded = sess.run(autoencoder['encoded'],
                       feed_dict={x: devbatch})  # apply to dev

    sampnr = 12  # which ones of the dev samples to display for sanity check
    print("\noriginal", labels_dev[sampnr],
          norm_tweets_dev[sampnr])  # print "\noriginal", norm_tweets[2]
    print(vects_dev[sampnr])

    dec_tweet = []
    for n, r in enumerate(decoded[sampnr]):  # reconstruct a bag of words from the decoded sample
        if r > 0.1:
            dec_tweet.append(tokens[n])

    print(" cost", sess.run(autoencoder['cost'], feed_dict={x: devbatch}))
    #print i, " original", batch[0]
    print(
        " encoded",
        encoded[sampnr])  # latent representation of input, feed this to SVM(s)
    print(" decoded", decoded[sampnr])
    print(" decoded bow", dec_tweet)
def extractFeaturesAutoencoder(autoencodermodel,
                               tweets_train,
                               targets_train,
                               labels_train,
                               tweets_dev,
                               targets_dev,
                               labels_dev,
                               cross_features='false',
                               usephrasemodel=False):
    sess = tf.Session()

    start_dim = 50000

    x = tf.placeholder("float", [None, start_dim])
    autoencoder = create(
        x, [100]
    )  # Dimensionality of the hidden layers. To start with, only use 1 hidden layer.

    tokens = readToks2(start_dim, usephrasemodel)

    # read dev data and convert to vectors
    #tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2)
    vects_train, norm_tweets_train = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, tweets_train, filtering=True)
    vects_train_targets, norm_train_targets = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens,
        targets_train)  # optimise runtime with more code later

    # read dev data and convert to vectors
    #tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2)
    vects_dev, norm_tweets_dev = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, tweets_dev, filtering=True)
    vects_dev_targets, norm_dev_targets = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, targets_dev)

    # Add ops to save and restore all the variables.
    saver = tf.train.Saver()

    # Restore variables from disk.
    saver.restore(sess, autoencodermodel)
    print("Model restored.")

    # apply autoencoder to train and dev data
    encoded_train = sess.run(autoencoder['encoded'],
                             feed_dict={x: vects_train})  # apply to tweets
    encoded_train_target = sess.run(autoencoder['encoded'],
                                    feed_dict={x: vects_train_targets})  # apply to target

    encoded_dev = sess.run(autoencoder['encoded'],
                           feed_dict={x: vects_dev})  # apply to tweets
    encoded_dev_target = sess.run(autoencoder['encoded'],
                                  feed_dict={x: vects_dev_targets})  # apply to target

    # decoder is just for sanity check, we don't really need that
    #decoded_dev = sess.run(autoencoder['decoded'], feed_dict={x: vects_dev})  # apply to tweets
    #decoded_dev_target = sess.run(autoencoder['decoded'], feed_dict={x: vects_dev_targets})  # apply to target

    print("cost train tweets",
          sess.run(autoencoder['cost'], feed_dict={x: vects_train}))
    print("cost train target",
          sess.run(autoencoder['cost'], feed_dict={x: vects_train_targets}))

    print("cost dev tweets",
          sess.run(autoencoder['cost'], feed_dict={x: vects_dev}))
    print("cost dev target",
          sess.run(autoencoder['cost'], feed_dict={x: vects_dev_targets}))

    features_train = []
    features_dev = []
    if cross_features == "true":
        # flatten the outer product of tweet and target encodings into one feature vector
        for i, enc in enumerate(encoded_train_target):
            features_train.append(np.outer(encoded_train[i], enc).flatten())
        for i, enc in enumerate(encoded_dev_target):
            features_dev.append(np.outer(encoded_dev[i], enc).flatten())
    elif cross_features == "added":
        # concatenate tweet and target encodings
        for i, enc in enumerate(encoded_train_target):
            features_train.append(np.append(encoded_train[i], enc))
        for i, enc in enumerate(encoded_dev_target):
            features_dev.append(np.append(encoded_dev[i], enc))
    else:
        # use the tweet encoding alone
        features_train = encoded_train
        features_dev = encoded_dev

    print("Features extracted!")

    return features_train, labels_train, features_dev, labels_dev
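

# The "feed this to SVM(s)" comments above indicate the encoded features are
# meant for a downstream SVM classifier.  A minimal sketch of that step using
# scikit-learn's LinearSVC; the classifier choice and hyperparameters are
# assumptions, not part of this codebase.
def trainSVMOnAutoencoderFeatures(autoencodermodel, tweets_train, targets_train,
                                  labels_train, tweets_dev, targets_dev,
                                  labels_dev):
    from sklearn.svm import LinearSVC
    feats_train, y_train, feats_dev, y_dev = extractFeaturesAutoencoder(
        autoencodermodel, tweets_train, targets_train, labels_train,
        tweets_dev, targets_dev, labels_dev, cross_features='added')
    clf = LinearSVC(C=1.0)  # hypothetical hyperparameter
    clf.fit(feats_train, y_train)
    print("dev accuracy", clf.score(feats_dev, y_dev))
    return clf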
def deep(modelname, layers, phrasem=True, useDev=True):
    sess = tf.Session()

    #load and convert tweets
    tokens, vects, norm_tweets = convertTweetsToVec('all',
                                                    50000,
                                                    phrasemodel=phrasem)

    start_dim = 50000  # dimensionality of the input; the full vocabulary is ~129887 tokens without singletons, so keep this as big as possible but throw singletons away
    x = tf.placeholder("float", [None, start_dim])
    print("Creating autoencoder")
    autoencoder = create(
        x, layers
    )  # Dimensionality of the hidden layers. To start with, only use 1 hidden layer.
    print("Creating Adam")
    train_step = tf.train.AdamOptimizer(0.1).minimize(autoencoder['cost'])

    print("Initialising all variables")
    init = tf.initialize_all_variables()  # tf.global_variables_initializer() in newer TF 1.x releases
    sess.run(init)

    print("Converting official training data to vectors")
    tweets_train, targets_train, labels_train = readTweetsOfficial(
        tokenize_tweets.FILETRAIN)
    tweets_trump, targets_trump, labels_trump = readTweetsOfficial(
        tokenize_tweets.FILETRUMP, 'utf-8', 1)
    vects_train, norm_tweets_train = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, tweets_train, filtering=True)
    vects_trump, norm_tweets_trump = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, tweets_trump, filtering=True)
    for v in vects_train:
        vects.append(v)
    for v in vects_trump:
        vects.append(v)

    tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
        tokenize_tweets.FILEDEV)
    vects_dev, norm_tweets_dev = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, tweets_dev, filtering=True)

    devbatch = []
    if not useDev:
        for v in vects_dev:
            devbatch.append(v)
    else:
        for v in vects_dev:
            vects.append(v)
        tweets_test, targets_test, labels_test = readTweetsOfficial(
            tokenize_tweets.FILETEST)
        vects_test, norm_tweets_test = tokenize_tweets.convertTweetsOfficialToVec(
            start_dim, tokens, tweets_test, filtering=True)
        for v in vects_test:
            devbatch.append(v)

    # start training
    sampnr = 12  # which ones of the dev samples to display for sanity check
    print("\noriginal", labels_dev[sampnr],
          norm_tweets_dev[sampnr])  # print "\noriginal", norm_tweets[2]
    print(vects[sampnr])

    # Add ops to save and restore all the variables.
    saver = tf.train.Saver()

    cost = 1.0
    # train until the reconstruction cost on the held-out batch drops below 0.01
    #for i in range(2000):
    i = 0
    while cost > 0.01:
        # sample a mini-batch of 500 tweets
        batch = []
        for j in range(500):
            num = random.randint(0, len(vects) - 1)
            batch.append(vects[num])
        sess.run(train_step, feed_dict={x: np.array(batch)})
        if i % 100 == 0:
            decoded = sess.run(autoencoder['decoded'],
                               feed_dict={x: devbatch})  # apply to dev
            encoded = sess.run(autoencoder['encoded'],
                               feed_dict={x: devbatch})  # apply to dev

            #dec_tweet = []
            #n = 0
            #for r in decoded[sampnr]:  # display first result
            #    if r > 0.1:
            #        dec_tweet.append(tokens[n])
            #    n+=1

            cost = sess.run(autoencoder['cost'], feed_dict={x: devbatch})
            print(i, " cost", cost)
            #print i, " original", batch[0]
            #print i, " encoded", encoded[sampnr] # latent representation of input, feed this to SVM(s)
            print(i, " decoded", decoded[sampnr])
            #print i, " decoded bow", dec_tweet

            save_path = saver.save(
                sess, modelname.replace(".ckpt", "_it" + str(i) + ".ckpt"))
            print("Model saved in file: %s" % save_path)
        i += 1
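

# End-to-end usage sketch (an assumed entry point, not part of the original
# file): train the autoencoder with deep(), then point
# extractFeaturesAutoencoder() at one of the checkpoints it saved.  The
# checkpoint name "model_it500.ckpt" is hypothetical; deep() writes one
# checkpoint every 100 iterations.
if __name__ == '__main__':
    deep("model.ckpt", [100], phrasem=True, useDev=True)
    tweets_train, targets_train, labels_train = readTweetsOfficial(
        tokenize_tweets.FILETRAIN)
    tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
        tokenize_tweets.FILEDEV)
    features_train, labels_train, features_dev, labels_dev = extractFeaturesAutoencoder(
        "model_it500.ckpt", tweets_train, targets_train, labels_train,
        tweets_dev, targets_dev, labels_dev, cross_features='added')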