def previous_main():
    """Run the earlier version of the classification pipeline.

    Steps, in order:

    1. Load the tweets and clean them.
    2. Load the features and derive the per-document class mapping.
    3. Build the TF-IDF model from the first ``num_docs`` cleaned tweets
       (or, in debug mode, reload it from ``path_model_file`` if present).
    4. Reduce the document table with ``model.LSA`` to ``numFeatures``
       components and normalize it with ``loader.NormalizeDataset``.
    5. Pass the reduced data and the class labels to ``BayesTest``.

    Takes no arguments and returns nothing; the result of ``BayesTest`` is
    discarded.
    """
    DEBUGMODE = 0
    numFeatures = 100
    # Single source for the document cap that was previously written as a
    # literal 400 in two places (feature loading and phrase slicing).
    # NOTE(review): presumably LoadFeatures' second argument is a row limit
    # that must match the number of phrases -- confirm against DatasetLoader.
    num_docs = 400

    path_dataset_dav_windows = 'Dati/training_set_text.csv'
    path_class_csv = 'Dati/training_set_features.csv'
    path_model_file = 'Dati/model.dat'

    cleaner = TweetsCleaner.TweetsCleaner()
    loader = DatasetLoader.DatasetLoader()
    model = VectorModel.VectorModel()
    # NOTE(review): classificator and evaluator are never used below; the
    # constructor calls are kept in case they have side effects -- confirm
    # and remove if not.
    classificator = BayesanClassificator.BayesanClassificator()
    evaluator = ClassifierEvaluation.ClassifierEvaluation()

    tweets_dataset = loader.LoadTweets(path_dataset_dav_windows)
    tweets_cleaned = cleaner.ProcessDatasetDict(tweets_dataset)
    features_dataset = loader.LoadFeatures(path_class_csv, num_docs)

    # Turn the feature vector into a dictionary keyed by IdDoc whose value is
    # the corresponding class (1: neutral, 2: positive, 3: negative, 4: mixed).
    classes_dataset = loader.createClasses(features_dataset)

    # Build the TF-IDF model.
    all_phrases = list(tweets_cleaned.values())[:num_docs]
    # Pair every phrase with its positional index: (0, phrase0), (1, phrase1), ...
    phrases_tuples = list(enumerate(all_phrases))

    # Outside debug mode the model is always rebuilt and persisted; the file
    # on disk is only reused when DEBUGMODE is set and the file exists.
    if not DEBUGMODE or not os.path.exists(path_model_file):
        tfidf = model.get_tfidf(phrases_tuples)
        model.persist_tfidf(tfidf, path_model_file)
    else:
        tfidf = model.deserialize_tfidf(path_model_file)

    doc_index = model.get_doc_index(tfidf)

    # Class labels used as the gold solution.
    # NOTE(review): labels follow the iteration order of classes_dataset while
    # documents follow tweets_cleaned; alignment between the two is assumed.
    labels = numpy.array(list(classes_dataset.values()))

    # Apply LSA.
    reduced = model.LSA(model.get_doc_index_table(doc_index), numFeatures)

    # Scale into [0, 1].
    reduced = loader.NormalizeDataset(reduced)

    BayesTest(reduced, labels)
import TweetsCleaner import VectorModel import ClassifierEvaluation import pickle if __name__ == "__main__": DEBUGMODE = 1 path_dataset_dav_windows = 'Dati/training_set_text.csv' path_class_csv = 'Dati/training_set_features.csv' path_model_file = 'Dati/model.dat' cleaner = TweetsCleaner.TweetsCleaner() loader = DatasetLoader.DatasetLoader() model = VectorModel.VectorModel() evaluator = ClassifierEvaluation.ClassifierEvaluation() tweets_dataset = loader.LoadTweets(path_dataset_dav_windows) tweets_cleaned = cleaner.ProcessDatasetDict(tweets_dataset) features_dataset = loader.LoadFeatures(path_class_csv) """ Trasforma il vettore delle features in un dizionario con chiave IdDoc e valore la classe corrispondente (1 : neutra, 2: positiva, 3: negativa, 4: mista """ classes_dataset = loader.createClasses(features_dataset) """ Genero il Modello TF-IDF """ all_phrases = list(tweets_cleaned.values())