def previous_main():
    """Earlier pipeline entry point, kept for reference.

    Loads the tweet dataset, cleans it, builds a TF-IDF model over the
    first 400 tweets, reduces the document/term table with LSA, scales
    the features into [0, 1] and hands the result to ``BayesTest``
    together with the gold-solution class labels.
    """
    DEBUGMODE = 0
    numFeatures = 100

    path_dataset_dav_windows = 'Dati/training_set_text.csv'
    path_class_csv = 'Dati/training_set_features.csv'
    path_model_file = 'Dati/model.dat'

    cleaner = TweetsCleaner.TweetsCleaner()
    loader = DatasetLoader.DatasetLoader()
    model = VectorModel.VectorModel()
    # NOTE(review): the original also instantiated BayesanClassificator and
    # ClassifierEvaluation here but never used them; the dead locals were
    # removed.

    tweets_dataset = loader.LoadTweets(path_dataset_dav_windows)
    tweets_cleaned = cleaner.ProcessDatasetDict(tweets_dataset)
    features_dataset = loader.LoadFeatures(path_class_csv, 400)

    # Map the feature vector to a dict keyed by IdDoc whose value is the
    # class label (1: neutral, 2: positive, 3: negative, 4: mixed).
    classes_dataset = loader.createClasses(features_dataset)

    # Build the TF-IDF model over the first 400 cleaned tweets, as
    # (index, phrase) tuples.
    all_phrases = list(tweets_cleaned.values())[:400]
    phrases_tuples = [(idx, phrase) for idx, phrase in enumerate(all_phrases)]

    # With DEBUGMODE == 0 the first branch is always taken: the TF-IDF
    # model is rebuilt and persisted on every run. The else branch only
    # triggers when DEBUGMODE is truthy AND the model file already exists.
    if not DEBUGMODE or not os.path.exists(path_model_file):
        tfidf = model.get_tfidf(phrases_tuples)
        model.persist_tfidf(tfidf, path_model_file)
    else:
        tfidf = model.deserialize_tfidf(path_model_file)

    doc_index = model.get_doc_index(tfidf)

    # Class labels for the gold solution.
    labels = numpy.array(list(classes_dataset.values()))

    # Apply LSA, then scale the reduced features into [0, 1].
    reduced = model.LSA(model.get_doc_index_table(doc_index), numFeatures)
    reduced = loader.NormalizeDataset(reduced)

    BayesTest(reduced, labels)
import pickle

import ClassifierEvaluation
import DatasetLoader  # BUG FIX: used below but was never imported (NameError)
import TweetsCleaner
import VectorModel

if __name__ == "__main__":
    DEBUGMODE = 1

    path_dataset_dav_windows = 'Dati/training_set_text.csv'
    path_class_csv = 'Dati/training_set_features.csv'
    path_model_file = 'Dati/model.dat'

    cleaner = TweetsCleaner.TweetsCleaner()
    loader = DatasetLoader.DatasetLoader()
    model = VectorModel.VectorModel()
    evaluator = ClassifierEvaluation.ClassifierEvaluation()

    tweets_dataset = loader.LoadTweets(path_dataset_dav_windows)
    tweets_cleaned = cleaner.ProcessDatasetDict(tweets_dataset)
    features_dataset = loader.LoadFeatures(path_class_csv)

    # Map the feature vector to a dict keyed by IdDoc whose value is the
    # class label (1: neutral, 2: positive, 3: negative, 4: mixed).
    classes_dataset = loader.createClasses(features_dataset)

    # Collect the cleaned phrases for the TF-IDF model.
    # NOTE(review): the visible script ends here; the TF-IDF/LSA steps
    # presumably follow as in previous_main — confirm against the full file.
    all_phrases = list(tweets_cleaned.values())
#!/usr/bin/env python __author__ = "Tom Kocmi" import logging import VectorModel import Cons import generateRules import new_fixes import time logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO) start = time.time() # for counting the time model = VectorModel.getVectorModel() # this will load existing model # in case if you want to generate new model, put True in the bracket prefixes, suffixes = new_fixes.generateFixes(model.vocab.keys()) rules = GenerateRules.generate(prefixes, suffixes, model) print rules # generate prefixes and suffixes from the vocabulary of the model # if the _fixes already exist it will load them from file instead of generating new ones. # In case that you want to forse it to generate, put True as a second parameter # experiments with the model, the words must be in the dictionary # print model.most_similar(positive=['winston', 'love'], negative=['war']) # print model.doesnt_match("winston julia brother goldstein".split()) # print model.similarity("winston", "julia") print "Time: " + str(time.time() - start)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__ = "Tom Kocmi"

import logging
import time
import pickle
import Queue

import VectorModel
import Cons, Fixes, GenerateRules

start = time.time()  # for counting the time
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)

model = VectorModel.getVectorModel()

# BUG FIX: `vocabulary` was used below, but every line that defined it was
# commented out, so the script crashed with a NameError. Regenerate the
# downsampled vocabulary here; swap in the pickle branches below to cache
# or reuse a previously saved vocabulary instead.
vocabulary = Fixes.downsampleVocabulary(model, Cons.MAXWORDS4AFFIXES)
# with open("models/vocabulary2.data", 'w') as f:
#     pickle.dump(vocabulary, f)
# with open("models/vocabulary.data", 'r') as f:
#     vocabulary = pickle.load(f)

prefixes, suffixes = Fixes.generateFixes(vocabulary)

# rules = GenerateRules.generate(prefixes, suffixes, model, vocabulary)
# with open("models/rules6.data", 'w') as f:
#     pickle.dump(rules, f)