def deep_test():
    """Restore a trained autoencoder from "model.ckpt" and sanity-check it on the dev set.

    Rebuilds the same 50000-dim bag-of-words input pipeline used in training,
    restores the single-hidden-layer (500-unit) model, then encodes/decodes the
    official dev tweets and prints one sample (index 12) plus the overall
    reconstruction cost.
    """
    start_dim = 50000  # input dimensionality (bag-of-words vocabulary size)

    # Use a context manager so the session is always closed; the original
    # leaked the Session handle.
    with tf.Session() as sess:
        x = tf.placeholder("float", [None, start_dim])
        # Dimensionality of the hidden layers. To start with, only use 1 hidden layer.
        autoencoder = create(x, [500])

        tokens, vects, norm_tweets = convertTweetsToVec('all', start_dim)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
            tokenize_tweets.FILEDEV, 'windows-1252', 2)
        vects_dev, norm_tweets_dev = tokenize_tweets.convertTweetsOfficialToVec(
            start_dim, tokens, tweets_dev)
        devbatch = list(vects_dev)  # plain copy instead of an append loop

        # Add ops to save and restore all the variables, then restore from disk.
        saver = tf.train.Saver()
        saver.restore(sess, "model.ckpt")
        print("Model restored.")

        decoded = sess.run(autoencoder['decoded'], feed_dict={x: devbatch})  # apply to dev
        encoded = sess.run(autoencoder['encoded'], feed_dict={x: devbatch})  # apply to dev

        sampnr = 12  # which one of the dev samples to display for sanity check
        print("\noriginal", labels_dev[sampnr], norm_tweets_dev[sampnr])
        print(vects_dev[sampnr])

        # Reconstruct a bag-of-words from the decoded activations: a token is
        # considered "present" when its activation exceeds 0.1.
        dec_tweet = [tokens[n] for n, r in enumerate(decoded[sampnr]) if r > 0.1]

        print(" cost", sess.run(autoencoder['cost'], feed_dict={x: devbatch}))
        print(" encoded", encoded[sampnr])  # latent representation of input, feed this to SVM(s)
        print(" decoded", decoded[sampnr])
        print(" decoded bow", dec_tweet)
def deep(modelname, layers, phrasem=True, useDev=True):
    """Train an autoencoder on tweet bag-of-words vectors and checkpoint it.

    Args:
        modelname: checkpoint path; "_it<i>" is spliced in before ".ckpt"
            on every save.
        layers: sizes of the hidden layers, forwarded to create().
        phrasem: forwarded to convertTweetsToVec() as ``phrasemodel``.
        useDev: if True, the dev set is added to the training data and the
            test set is held out for cost monitoring; if False, the dev set
            itself is held out.

    Trains with Adam (lr 0.1) on random batches of 500 vectors until the
    cost on the held-out batch drops to 0.01 or below, evaluating and saving
    a checkpoint every 100 iterations.
    """
    # Dimensionality of the input. Keep as big as possible, but throw
    # singletons away (~129887 tokens without singletons).
    start_dim = 50000
    # Load and convert tweets.
    tokens, vects, norm_tweets = convertTweetsToVec(
        'all', start_dim, phrasemodel=phrasem)

    # Context manager ensures the session is released; the original leaked it.
    with tf.Session() as sess:
        x = tf.placeholder("float", [None, start_dim])
        print("Creating autoencoder")
        # Dimensionality of the hidden layers. To start with, only use 1 hidden layer.
        autoencoder = create(x, layers)
        print("Creating Adam")
        train_step = tf.train.AdamOptimizer(0.1).minimize(autoencoder['cost'])
        print("Initialising all variables")
        sess.run(tf.initialize_all_variables())

        print("Converting official training data to vectors")
        tweets_train, targets_train, labels_train = readTweetsOfficial(
            tokenize_tweets.FILETRAIN)
        tweets_trump, targets_trump, labels_trump = readTweetsOfficial(
            tokenize_tweets.FILETRUMP, 'utf-8', 1)
        vects_train, norm_tweets_train = tokenize_tweets.convertTweetsOfficialToVec(
            start_dim, tokens, tweets_train, filtering=True)
        vects_trump, norm_tweets_trump = tokenize_tweets.convertTweetsOfficialToVec(
            start_dim, tokens, tweets_trump, filtering=True)
        vects.extend(vects_train)  # extend() instead of per-item append loops
        vects.extend(vects_trump)

        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
            tokenize_tweets.FILEDEV)
        vects_dev, norm_tweets_dev = tokenize_tweets.convertTweetsOfficialToVec(
            start_dim, tokens, tweets_dev, filtering=True)

        if not useDev:
            # Hold the dev set out for cost monitoring.
            devbatch = list(vects_dev)
        else:
            # Train on the dev data too; monitor cost on the test set instead.
            vects.extend(vects_dev)
            tweets_test, targets_test, labels_test = readTweetsOfficial(
                tokenize_tweets.FILETEST)
            vects_test, norm_tweets_test = tokenize_tweets.convertTweetsOfficialToVec(
                start_dim, tokens, tweets_test, filtering=True)
            devbatch = list(vects_test)

        # Start training.
        sampnr = 12  # which one of the dev samples to display for sanity check
        print("\noriginal", labels_dev[sampnr], norm_tweets_dev[sampnr])
        print(vects[sampnr])

        # Add ops to save and restore all the variables.
        saver = tf.train.Saver()

        cost = 1.0
        i = 0
        while cost > 0.01:
            # Sample a random training batch of 500 vectors (with replacement).
            # NOTE: a stale comment in the original said "batch of 100".
            batch = [vects[random.randint(0, len(vects) - 1)]
                     for _ in range(500)]
            sess.run(train_step, feed_dict={x: np.array(batch)})
            if i % 100 == 0:
                # Evaluate on the held-out batch and checkpoint.
                decoded = sess.run(autoencoder['decoded'], feed_dict={x: devbatch})
                # Latent representation of input; could be fed to SVM(s).
                encoded = sess.run(autoencoder['encoded'], feed_dict={x: devbatch})
                cost = sess.run(autoencoder['cost'], feed_dict={x: devbatch})
                print(i, " cost", cost)
                print(i, " decoded", decoded[sampnr])
                save_path = saver.save(
                    sess, modelname.replace(".ckpt", "_it" + str(i) + ".ckpt"))
                print("Model saved in file: %s" % save_path)
            i += 1