Example #1
                        print("Iter " + str(epoch) + ", Minibatch Loss= " + \
                              "{:.6f}".format(mse))
                        # train_accuracy = test_all(sess, accuracy, kdm.train, fc7, word_vec, output_layer, correct_prediction, loss, kdm, epoch, datasetType="Train")
                        # validation_accuracy = test_all(sess, accuracy, kdm.validation, fc7, word_vec, output_layer, correct_prediction, loss, kdm, epoch, datasetType="Validation")
                        # if (validation_accuracy > Placeholders.best_accuracy_so_far):
                        #     Placeholders.best_accuracy_so_far = validation_accuracy
                        #     test_all(sess, accuracy, kdm.test, fc7, word_vec, output_layer, correct_prediction, loss, kdm, epoch)
                        # elif (train_accuracy > 70):
                        #     test_all(sess, accuracy, kdm.test, fc7, word_vec, output_layer, correct_prediction, loss, kdm, epoch)
                # test_all(sess, accuracy, kdm.test, fc7, word_vec, output_layer, correct_prediction, loss, kdm)
                save_path = saver.save(sess, self.model_filename)
                print("Model saved in file: %s" % save_path)
        return accuracy

    def test(self, tweets):
        # Evaluation is not implemented in this example.
        pass
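The train method above checkpoints the session with tf.train.Saver, while test is left unimplemented. A minimal sketch of reloading such a checkpoint in TensorFlow 1.x (the checkpoint path and tensor name below are placeholders, not from the source):

import tensorflow as tf

# Placeholder path; the real one is self.model_filename used above.
saver = tf.train.import_meta_graph('model.ckpt.meta')  # rebuild the saved graph

with tf.Session() as sess:
    saver.restore(sess, 'model.ckpt')  # load the trained variable values
    graph = tf.get_default_graph()
    # Tensors can then be looked up by name for inference, e.g.:
    # output_layer = graph.get_tensor_by_name('output_layer:0')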


import os

# TweetDataSet, word2vec, NoiseRemover and lstm_classifier are project-local
# modules; their imports are omitted in this excerpt.
datafolder = 'data/classification_data/'
exports_folder = 'data/exports/'
fileName = 'Dataset_z_1024_tweets.json'
embedding_generator = word2vec.word2vec()
noise_remover = NoiseRemover()
#fileName = 'junk.json'
filepath = os.path.join(datafolder, fileName)
dataset = TweetDataSet(datafolder,
                       fileName,
                       vectorizer=embedding_generator,
                       transformer=noise_remover)
classifier = lstm_classifier(embedding_generator)
classifier.train(dataset, True)
Example #2

import os
import time
from subcluster_similarity_experiment import SubclusterSimilarityExperiment
from data_processors.noise_remover import NoiseRemover
from parameters import Parameters
# TweetLDADataSet is a project-local class and DataLoader is presumably
# torch.utils.data.DataLoader; their imports are omitted in this excerpt.

datafolder = 'data/exports/'
exports_folder = 'data/exports/sub-clusters'
fileName = 'guided_LDA_0.25_Dataset_z_1045_tweets.json_20180709-194323.csv'

#fileName = 'junk.json'
experiment_datafolder = time.strftime("%Y%m%d-%H%M%S")
filepath = os.path.join(datafolder, fileName)

dataset = TweetLDADataSet(datafolder, fileName)
dataloader = DataLoader(dataset, batch_size=1)
pre_processor = NoiseRemover()


class Baseline_Subcluster_Similarity_Experiment(SubclusterSimilarityExperiment):
    # Class-level attribute, shared by all instances.
    parameters = Parameters()

    def __init__(self,
                 name="Baseline subcluster similarity experiment",
                 eps=0,
                 cluster_number=99,
                 experiment_folder="",
                 timestamp="",
                 positive_training_data=None,
                 negative_training_data=None):
        # Use None instead of mutable [] defaults; a shared default list
        # would leak state between instances (see the note below).
        positive_training_data = positive_training_data or []
        negative_training_data = negative_training_data or []
        self.timestamp = timestamp
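The original signature defaulted positive_training_data and negative_training_data to []. In Python, a default list is created once at function definition and shared across all calls, so data appended under one call leaks into the next; hence the None defaults above. A short illustration of the pitfall:

def append_item(item, bucket=[]):  # the default list is created only once
    bucket.append(item)
    return bucket

print(append_item(1))  # [1]
print(append_item(2))  # [1, 2] -- state from the first call leaks through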
Example #3
import os

from nltk.corpus import stopwords
from guidedlda import guidedlda, utils
# TweetDataSet, Tweet, DataLoader and NoiseRemover are used below; their
# imports are omitted in this excerpt.

stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

datafolder = 'data/classification_data/'
exports_folder = 'data/exports/'
fileName = 'Dataset_z_41_tweets.json'

# fileName = 'junk.json'
filepath = os.path.join(datafolder, fileName)

dataset = TweetDataSet(datafolder, fileName)
dataloader = DataLoader(dataset, batch_size=1)
pre_processor = NoiseRemover()

tweets = []
seen_texts = set()  # cleaned texts already encountered, for deduplication
sentence_vectors = []

for data in dataloader:
    tweet = Tweet(data)
    clean_text = pre_processor.process_tweet(tweet.tweet_text)
    if clean_text in seen_texts:  # skip duplicate tweets
        continue
    seen_texts.add(clean_text)
    tweet.set_clean_text(clean_text)
    if tweet.get_word_count() > 5:  # keep only tweets with enough content
        tweets.append(tweet)
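The loop above only collects the deduplicated tweets; the guidedlda import suggests the next step is fitting a seeded topic model. A minimal sketch of that step, assuming the cleaned text is reachable as tweet.clean_text (implied by set_clean_text, but the attribute name is a guess) and using scikit-learn's CountVectorizer; the seed words and hyperparameters are illustrative only:

from sklearn.feature_extraction.text import CountVectorizer

# Document-term matrix over the cleaned, deduplicated tweets.
texts = [tweet.clean_text for tweet in tweets]  # attribute name assumed
vectorizer = CountVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform(texts)
word2id = vectorizer.vocabulary_

# Map illustrative seed words to topic ids (real seeds are domain-specific).
seed_topic_list = [['flood', 'rain', 'storm'],
                   ['earthquake', 'damage', 'rescue']]
seed_topics = {word2id[word]: topic_id
               for topic_id, words in enumerate(seed_topic_list)
               for word in words if word in word2id}

model = guidedlda.GuidedLDA(n_topics=5, n_iter=100, random_state=7, refresh=20)
model.fit(X, seed_topics=seed_topics, seed_confidence=0.15)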