print("Iter " + str(epoch) + ", Minibatch Loss= " + \ "{:.6f}".format(mse)) # train_accuracy = test_all(sess, accuracy, kdm.train, fc7, word_vec, output_layer, correct_prediction, loss, kdm, epoch, datasetType="Train") # validation_accuracy = test_all(sess, accuracy, kdm.validation, fc7, word_vec, output_layer, correct_prediction, loss, kdm, epoch, datasetType="Validation") # if (validation_accuracy > Placeholders.best_accuracy_so_far): # Placeholders.best_accuracy_so_far = validation_accuracy # test_all(sess, accuracy, kdm.test, fc7, word_vec, output_layer, correct_prediction, loss, kdm, epoch) # elif (train_accuracy > 70): # test_all(sess, accuracy, kdm.test, fc7, word_vec, output_layer, correct_prediction, loss, kdm, epoch) # test_all(sess, accuracy, kdm.test, fc7, word_vec, output_layer, correct_prediction, loss, kdm) save_path = saver.save(sess, self.model_filename) print("Model saved in file: %s" % save_path) return accuracy def test(self, tweets): pass datafolder = 'data/classification_data/' exports_folder = 'data/exports/' fileName = 'Dataset_z_1024_tweets.json' embedding_generator = word2vec.word2vec() noise_remover = NoiseRemover() #fileName = 'junk.json' filepath = os.path.join(datafolder, fileName) dataset = TweetDataSet(datafolder, fileName, vectorizer=embedding_generator, transformer=noise_remover) classifier = lstm_classifier(embedding_generator) classifier.train(dataset, True)
import os
import time

from subcluster_similarity_experiment import SubclusterSimilarityExperiment
from data_processors.noise_remover import NoiseRemover
from parameters import Parameters
# TweetLDADataSet and DataLoader are imported elsewhere in the project
# (their import lines are not part of this fragment).

datafolder = 'data/exports/'
exports_folder = 'data/exports/sub-clusters'
fileName = 'guided_LDA_0.25_Dataset_z_1045_tweets.json_20180709-194323.csv'
# fileName = 'junk.json'
experiment_datafolder = time.strftime("%Y%m%d-%H%M%S")  # timestamped folder name for this run
filepath = os.path.join(datafolder, fileName)
dataset = TweetLDADataSet(datafolder, fileName)
dataloader = DataLoader(dataset, batch_size=1)
pre_processor = NoiseRemover()


class Baseline_Subcluster_Similarity_Experiment(SubclusterSimilarityExperiment):
    parameters = Parameters()

    def __init__(self, name="Baseline subcluster similarity experiment", eps=0,
                 cluster_number=99, experiment_folder="", timestamp="",
                 positive_training_data=[], negative_training_data=[]):
        self.timestamp = timestamp
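# A minimal usage sketch (not in the original fragment): constructing the
# baseline experiment with the module-level values defined above. All keyword
# values here are illustrative; eps=0.25 merely mirrors the threshold in the
# input file name.
baseline = Baseline_Subcluster_Similarity_Experiment(
    eps=0.25,
    cluster_number=3,
    experiment_folder=exports_folder,
    timestamp=experiment_datafolder,
)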
import os

from nltk.corpus import stopwords
from guidedlda import guidedlda, utils
# TweetDataSet, DataLoader, NoiseRemover, and Tweet are imported elsewhere in
# the project (their import lines are not part of this fragment).

stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

datafolder = 'data/classification_data/'
exports_folder = 'data/exports/'
fileName = 'Dataset_z_41_tweets.json'
# fileName = 'junk.json'
filepath = os.path.join(datafolder, fileName)
dataset = TweetDataSet(datafolder, fileName)
dataloader = DataLoader(dataset, batch_size=1)
pre_processor = NoiseRemover()

tweets = []
tweets_dict = set()  # clean texts already seen, used to drop duplicate tweets
sentence_vectors = []
for data in dataloader:
    tweet = Tweet(data)
    clean_text = pre_processor.process_tweet(tweet.tweet_text)
    # Skip tweets whose cleaned text has already been seen.
    if clean_text not in tweets_dict:
        tweets_dict.add(clean_text)
    else:
        continue
    tweet.set_clean_text(clean_text)
    # Keep only tweets with more than five words after cleaning.
    if tweet.get_word_count() > 5:
        tweets.append(tweet)
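# A minimal sketch of handing the de-duplicated tweets to GuidedLDA. Building
# the document-term matrix with sklearn's CountVectorizer and the seed words
# below are illustrative assumptions; only GuidedLDA(...) and fit(...) follow
# guidedlda's documented API. The clean-text accessor is also assumed, to
# mirror set_clean_text() above.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

texts = [t.clean_text for t in tweets]  # assumed attribute set by set_clean_text()
vectorizer = CountVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform(texts).toarray().astype(np.int64)  # GuidedLDA expects integer counts
word2id = vectorizer.vocabulary_

seed_topic_list = [['flood', 'rain'], ['fire', 'smoke']]  # hypothetical seed words
seed_topics = {word2id[w]: t_id
               for t_id, words in enumerate(seed_topic_list)
               for w in words if w in word2id}

model = guidedlda.GuidedLDA(n_topics=5, n_iter=100, random_state=7, refresh=20)
model.fit(X, seed_topics=seed_topics, seed_confidence=0.25)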