def deep_test(model_path="model.ckpt"):
    """Sanity-check a saved autoencoder against the official dev set.

    Restores the model from *model_path*, runs encoder/decoder over the dev
    tweets, and prints one sample's original text, latent representation,
    reconstruction, and reconstructed bag-of-words.

    :param model_path: checkpoint file to restore. Defaults to the
        previously hard-coded "model.ckpt", so existing callers are unaffected.
    """
    start_dim = 50000  # input dimensionality (bag-of-words vocabulary size)
    x = tf.placeholder("float", [None, start_dim])
    # Dimensionality of the hidden layers. To start with, only use 1 hidden layer.
    autoencoder = create(x, [500])

    tokens, vects, norm_tweets = convertTweetsToVec('all', start_dim)
    tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
        tokenize_tweets.FILEDEV, 'windows-1252', 2)
    vects_dev, norm_tweets_dev = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, tweets_dev)
    devbatch = list(vects_dev)

    # Add ops to save and restore all the variables.
    saver = tf.train.Saver()

    # Use a context manager so the session is closed deterministically
    # (the original leaked it).
    with tf.Session() as sess:
        # Restore variables from disk.
        saver.restore(sess, model_path)
        print("Model restored.")

        decoded = sess.run(autoencoder['decoded'], feed_dict={x: devbatch})  # apply to dev
        encoded = sess.run(autoencoder['encoded'], feed_dict={x: devbatch})  # apply to dev

        sampnr = 12  # which one of the dev samples to display for sanity check
        print("\noriginal", labels_dev[sampnr], norm_tweets_dev[sampnr])
        print(vects_dev[sampnr])

        # Reconstruct a bag-of-words from the decoder output: any token whose
        # activation exceeds 0.1 is considered "present".
        dec_tweet = [tokens[n] for n, r in enumerate(decoded[sampnr]) if r > 0.1]

        print(" cost", sess.run(autoencoder['cost'], feed_dict={x: devbatch}))
        # latent representation of input, feed this to SVM(s)
        print(" encoded", encoded[sampnr])
        print(" decoded", decoded[sampnr])
        print(" decoded bow", dec_tweet)
def extractFeaturesAutoencoder(autoencodermodel, tweets_train, targets_train,
                               labels_train, tweets_dev, targets_dev,
                               labels_dev, cross_features='false',
                               usephrasemodel=False):
    """Encode train/dev tweets and targets with a saved autoencoder.

    NOTE(review): this function is defined twice in this file with identical
    bodies; the later definition shadows this one — consider removing a copy.

    :param autoencodermodel: path to the autoencoder checkpoint to restore
    :param cross_features: "true" = outer product of tweet and target
        encodings, "added" = concatenation, anything else = tweet encoding only
    :param usephrasemodel: forwarded to readToks2 when loading the vocabulary
    :return: (features_train, labels_train, features_dev, labels_dev)
    """
    start_dim = 50000  # input dimensionality (bag-of-words vocabulary size)
    x = tf.placeholder("float", [None, start_dim])
    # Dimensionality of the hidden layers. To start with, only use 1 hidden layer.
    autoencoder = create(x, [100])
    tokens = readToks2(start_dim, usephrasemodel)

    # Convert training tweets and their stance targets to vectors.
    vects_train, norm_tweets_train = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, tweets_train, filtering=True)
    vects_train_targets, norm_train_targets = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, targets_train)

    # Convert dev tweets and their stance targets to vectors.
    vects_dev, norm_tweets_dev = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, tweets_dev, filtering=True)
    vects_dev_targets, norm_dev_targets = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, targets_dev)

    # Add ops to save and restore all the variables.
    saver = tf.train.Saver()

    # Context-managed session — the original leaked it.
    with tf.Session() as sess:
        # Restore variables from disk.
        saver.restore(sess, autoencodermodel)
        print("Model restored.")

        # Apply autoencoder to train and dev data (tweets and targets).
        encoded_train = sess.run(autoencoder['encoded'], feed_dict={x: vects_train})
        encoded_train_target = sess.run(autoencoder['encoded'], feed_dict={x: vects_train_targets})
        encoded_dev = sess.run(autoencoder['encoded'], feed_dict={x: vects_dev})
        encoded_dev_target = sess.run(autoencoder['encoded'], feed_dict={x: vects_dev_targets})

        print("cost train tweets", sess.run(autoencoder['cost'], feed_dict={x: vects_train}))
        print("cost train target", sess.run(autoencoder['cost'], feed_dict={x: vects_train_targets}))
        print("cost dev tweets", sess.run(autoencoder['cost'], feed_dict={x: vects_dev}))
        print("cost dev target", sess.run(autoencoder['cost'], feed_dict={x: vects_dev_targets}))

    if cross_features == "true":
        # Flattened outer product of tweet encoding and target encoding.
        features_train = [list(np.outer(t, tt).ravel())
                          for t, tt in zip(encoded_train, encoded_train_target)]
        features_dev = [list(np.outer(d, dt).ravel())
                        for d, dt in zip(encoded_dev, encoded_dev_target)]
    elif cross_features == "added":
        # Concatenate tweet encoding with target encoding.
        features_train = [np.append(t, tt)
                          for t, tt in zip(encoded_train, encoded_train_target)]
        features_dev = [np.append(d, dt)
                        for d, dt in zip(encoded_dev, encoded_dev_target)]
    else:
        # Use the tweet encodings alone.
        features_train = encoded_train
        features_dev = encoded_dev

    print("Features extracted!")
    return features_train, labels_train, features_dev, labels_dev
def extractFeaturesAutoencoder(autoencodermodel, tweets_train, targets_train,
                               labels_train, tweets_dev, targets_dev,
                               labels_dev, cross_features='false',
                               usephrasemodel=False):
    """Encode train/dev tweets and targets with a saved autoencoder.

    NOTE(review): this is a byte-for-byte duplicate of the definition that
    appears earlier in this file; being later, it is the copy that actually
    takes effect. One of the two should be deleted.

    :param autoencodermodel: path to the autoencoder checkpoint to restore
    :param cross_features: "true" = outer product of tweet and target
        encodings, "added" = concatenation, anything else = tweet encoding only
    :param usephrasemodel: forwarded to readToks2 when loading the vocabulary
    :return: (features_train, labels_train, features_dev, labels_dev)
    """
    start_dim = 50000  # input dimensionality (bag-of-words vocabulary size)
    x = tf.placeholder("float", [None, start_dim])
    # Dimensionality of the hidden layers. To start with, only use 1 hidden layer.
    autoencoder = create(x, [100])
    tokens = readToks2(start_dim, usephrasemodel)

    # Convert training data (tweets and stance targets) to vectors.
    vects_train, norm_tweets_train = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, tweets_train, filtering=True)
    vects_train_targets, norm_train_targets = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, targets_train)

    # Convert dev data (tweets and stance targets) to vectors.
    vects_dev, norm_tweets_dev = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, tweets_dev, filtering=True)
    vects_dev_targets, norm_dev_targets = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, targets_dev)

    # Add ops to save and restore all the variables.
    saver = tf.train.Saver()

    # Context-managed session so it is always closed (the original leaked it).
    with tf.Session() as sess:
        # Restore variables from disk.
        saver.restore(sess, autoencodermodel)
        print("Model restored.")

        # Apply the encoder to all four vector sets.
        encoded_train = sess.run(autoencoder['encoded'], feed_dict={x: vects_train})
        encoded_train_target = sess.run(autoencoder['encoded'], feed_dict={x: vects_train_targets})
        encoded_dev = sess.run(autoencoder['encoded'], feed_dict={x: vects_dev})
        encoded_dev_target = sess.run(autoencoder['encoded'], feed_dict={x: vects_dev_targets})

        print("cost train tweets", sess.run(autoencoder['cost'], feed_dict={x: vects_train}))
        print("cost train target", sess.run(autoencoder['cost'], feed_dict={x: vects_train_targets}))
        print("cost dev tweets", sess.run(autoencoder['cost'], feed_dict={x: vects_dev}))
        print("cost dev target", sess.run(autoencoder['cost'], feed_dict={x: vects_dev_targets}))

    if cross_features == "true":
        # Flattened outer product of tweet and target encodings.
        features_train = [list(np.outer(t, tt).ravel())
                          for t, tt in zip(encoded_train, encoded_train_target)]
        features_dev = [list(np.outer(d, dt).ravel())
                        for d, dt in zip(encoded_dev, encoded_dev_target)]
    elif cross_features == "added":
        # Tweet encoding concatenated with target encoding.
        features_train = [np.append(t, tt)
                          for t, tt in zip(encoded_train, encoded_train_target)]
        features_dev = [np.append(d, dt)
                        for d, dt in zip(encoded_dev, encoded_dev_target)]
    else:
        # Tweet encodings only.
        features_train = encoded_train
        features_dev = encoded_dev

    print("Features extracted!")
    return features_train, labels_train, features_dev, labels_dev
def deep(modelname, layers, phrasem=True, useDev=True):
    """Train an autoencoder on tweet bag-of-words vectors, checkpointing as it goes.

    :param modelname: checkpoint path; "_it<step>" is spliced in before ".ckpt"
        at each save
    :param layers: hidden-layer sizes passed to create()
    :param phrasem: use the phrase model when converting tweets to vectors
    :param useDev: if True, fold the dev set into training data and evaluate
        on the test set; if False, hold the dev set out for evaluation
    """
    start_dim = 50000  # input dimensionality; keep as big as possible, singletons thrown away
    # Load and convert tweets.
    tokens, vects, norm_tweets = convertTweetsToVec('all', start_dim, phrasemodel=phrasem)

    x = tf.placeholder("float", [None, start_dim])
    print("Creating autoencoder")
    # Dimensionality of the hidden layers. To start with, only use 1 hidden layer.
    autoencoder = create(x, layers)
    print("Creating Adam")
    train_step = tf.train.AdamOptimizer(0.1).minimize(autoencoder['cost'])
    print("Initialising all variables")
    init = tf.initialize_all_variables()

    print("Converting official training data to vectors")
    tweets_train, targets_train, labels_train = readTweetsOfficial(
        tokenize_tweets.FILETRAIN)
    tweets_trump, targets_trump, labels_trump = readTweetsOfficial(
        tokenize_tweets.FILETRUMP, 'utf-8', 1)
    vects_train, norm_tweets_train = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, tweets_train, filtering=True)
    vects_trump, norm_tweets_trump = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, tweets_trump, filtering=True)
    vects.extend(vects_train)
    vects.extend(vects_trump)

    tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILEDEV)
    vects_dev, norm_tweets_dev = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, tweets_dev, filtering=True)

    if not useDev:
        # Hold the dev set out and evaluate on it.
        devbatch = list(vects_dev)
    else:
        # Train on the dev set too; evaluate on the test set instead.
        vects.extend(vects_dev)
        tweets_test, targets_test, labels_test = readTweetsOfficial(
            tokenize_tweets.FILETEST)
        vects_test, norm_tweets_test = tokenize_tweets.convertTweetsOfficialToVec(
            start_dim, tokens, tweets_test, filtering=True)
        devbatch = list(vects_test)

    sampnr = 12  # which one of the dev samples to display for sanity check
    print("\noriginal", labels_dev[sampnr], norm_tweets_dev[sampnr])
    print(vects[sampnr])

    # Add ops to save and restore all the variables.
    saver = tf.train.Saver()

    # Context-managed session so it is always closed (the original leaked it).
    with tf.Session() as sess:
        sess.run(init)
        cost = 1.0
        i = 0
        # Train until reconstruction cost on the eval batch drops below 0.01.
        # NOTE(review): there is no iteration cap, so this can loop forever if
        # the cost plateaus above the threshold — consider adding a max-step
        # bound.
        while cost > 0.01:
            # Sample a training batch of 500 vectors with replacement.
            batch = [vects[random.randint(0, len(vects) - 1)] for _ in range(500)]
            sess.run(train_step, feed_dict={x: np.array(batch)})
            if i % 100 == 0:
                # Every 100 steps: evaluate, report, and checkpoint.
                decoded = sess.run(autoencoder['decoded'], feed_dict={x: devbatch})
                cost = sess.run(autoencoder['cost'], feed_dict={x: devbatch})
                print(i, " cost", cost)
                print(i, " decoded", decoded[sampnr])
                save_path = saver.save(
                    sess, modelname.replace(".ckpt", "_it" + str(i) + ".ckpt"))
                print("Model saved in file: %s" % save_path)
            i += 1