def import_set():
    with open(get_file_path('interim\\training.pkl'), 'rb') as f:
        training = pickle.load(f)
    with open(get_file_path('interim\\testing.pkl'), 'rb') as f:
        testing = pickle.load(f)
    return training, testing
def import_neutral_negative():
    with open(get_file_path("negative.txt"), 'r') as f:
        neg = f.read()
    with open(get_file_path("neutral.txt"), 'r') as f:
        neu = f.read()
    return neg.split(), neu.split()
def export_to_dict():
    # Build a {word: polarity} map from the opinion lexicon files and
    # write it out as an importable Python module.
    with open(get_file_path("positive-words.txt"), "r") as f:
        positive = f.read()
    with open(get_file_path("negative-words.txt"), "r") as f:
        negative = f.read()
    words = dict()
    for line in positive.split():
        words[line] = 1
    for line in negative.split():
        words[line] = -1
    with open("words.py", "w") as f:
        f.write("WORD_SENTIMENT=" + str(words))
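# Usage sketch (an assumption, not project code): once export_to_dict() has
# generated words.py, the WORD_SENTIMENT map can score a token list by
# summing word polarities. simple_score is a hypothetical helper.
from words import WORD_SENTIMENT

def simple_score(tokens):
    # Unknown words contribute 0; lexicon words contribute +1 or -1.
    return sum(WORD_SENTIMENT.get(t, 0) for t in tokens)

print(simple_score("great phone but terrible battery".split()))  # e.g. 0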
def import_dataset(dataset_name):
    reviews_list = list()
    with open(get_file_path("raw\\" + dataset_name + ".json"), encoding="utf8") as json_file:
        for line in json_file:
            sample = json.loads(line)
            reviews_list.append(sample)
    return reviews_list
def create_lexicon(corpus, name):
    with open(get_file_path("new_positive.txt"), "r") as f:
        positive = f.read()
    with open(get_file_path("new_negative.txt"), "r") as f:
        negative = f.read()
    print("Cooccurrence matrix")
    d = cooccurrence_matrix(corpus)
    print("Sorting vocab")
    vocab = get_sorted_vocab(d)
    print("Cosine matrix")
    cm = cosine_similarity_matrix(vocab, d)
    print(datetime.now())
    print("Propagation")
    prop = graph_propagation(cm, vocab, positive.split(), negative.split(), 2)
    print(datetime.now())
    # Rank the propagated scores from most positive to most negative.
    final = sorted(prop.items(), key=itemgetter(1), reverse=True)
    print(datetime.now())
    print("Saving")
    save_lexicon_results(final, name)
    print(datetime.now())
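# Usage sketch, not project code: create_lexicon is assumed here to take a
# tokenized corpus (the exact format cooccurrence_matrix expects is not shown
# in this file) plus a name for the saved results. The pipeline it runs is:
# co-occurrence counts -> cosine-similarity graph -> label propagation from
# the seed words in new_positive.txt / new_negative.txt.
toy_corpus = [
    ["great", "battery", "life"],
    ["the", "battery", "is", "awful"],
]
create_lexicon(toy_corpus, "toy_lexicon")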
def prepare_dataframe():
    with open(get_file_path("dataframe.pkl"), 'rb') as f:
        df = pickle.load(f)
    new_df = pd.DataFrame(
        index=[
            'reviews_Automotive',
            'reviews_Cell_Phones_and_Accessories',
            'reviews_Video_Games',
            'reviews_Movies_and_TV'
        ],
        columns=list(range(10))
    )
    # For each category, store the mean of every score column (0-9).
    for file in get_file_names():
        df_temp = df[df.categories == file]
        for i in range(10):
            new_df.at[file, i] = df_temp[i].mean()
    return new_df
def plot_explained_variance(file_name):
    # The input file holds whitespace-separated per-component explained-
    # variance values; plot the cumulative sum against the component count.
    with open(get_file_path(file_name), "r") as file:
        values = file.read().split()
    total = 0.0
    data = list()
    data_y = list()
    for i, num in enumerate(values, start=1):
        total += float(num)
        data.append(total)
        data_y.append(i)
    plt.plot(data, data_y)
    plt.ylabel('Number of Components')
    plt.xlabel('Explained Variance')
    plt.title('Bag Of Nouns SVD Components Explained Variance')
    plt.show()
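# Sketch of producing the input file (assumptions: scikit-learn is the SVD
# backend and the file holds whitespace-separated per-component explained-
# variance ratios; "variance_example.txt" is an illustrative name).
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

docs = ["the quick brown fox", "lazy dogs sleep all day", "the fox jumps over dogs"]
X = CountVectorizer().fit_transform(docs)
svd = TruncatedSVD(n_components=2).fit(X)
with open(get_file_path("variance_example.txt"), "w") as f:
    f.write(" ".join(str(v) for v in svd.explained_variance_ratio_))
plot_explained_variance("variance_example.txt")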
def import_and_divide():
    files = get_file_names()
    training = list()
    testing = list()
    for file in files:
        with open(get_file_path('interim\\sample_' + file + '.pkl'), 'rb') as f:
            lines = pickle.load(f)
        # Draw 70,000 reviews per category without replacement for training;
        # whatever remains goes to the testing split.
        t = choice(lines, size=70000, replace=False)
        for l in tqdm(t):
            lines.remove(l)
            l['category'] = file
            training.append(l)
        for l in lines:
            l['category'] = file
            testing.append(l)
    export_training_testing(training, testing)
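# Alternative split sketch (not the project's code): sampling an index set
# with numpy avoids the O(n^2) list.remove() loop above on large
# per-category samples. split_by_indices is a hypothetical helper.
import numpy as np

def split_by_indices(lines, n_train, seed=0):
    rng = np.random.default_rng(seed)
    train_idx = set(rng.choice(len(lines), size=n_train, replace=False).tolist())
    training = [l for i, l in enumerate(lines) if i in train_idx]
    testing = [l for i, l in enumerate(lines) if i not in train_idx]
    return training, testing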
from features.explore import bag_of_words, tf_idf, bigrm, only_nouns
from src.data.import_dataset import import_cleaned_training_set
from visualization.visualize import display_features
from features.normalize import lemmatize, letters_only, lower_only, remove_contractions, remove_stopwords
from src.utils.utils import get_file_path
from nltk.corpus import stopwords

# NLTK stopword lists are keyed by lowercase language names.
print(type(stopwords.words('english')))
with open(get_file_path("stopwords.txt"), "w") as file:
    for word in stopwords.words('english'):
        file.write("%s\n" % word)
def export_to_txt(scores, name):
    with open(get_file_path(name + ".txt"), "w") as file:
        for t in scores:
            file.write(' '.join(str(s) for s in t) + '\n')
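# Usage sketch with made-up scores: each element of scores is an iterable,
# written as one space-separated line (here to model_scores.txt).
export_to_txt([("svm", 0.91), ("naive_bayes", 0.84)], "model_scores")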
def import_cleaned_testing_set():
    with open(get_file_path('processed\\testing.pkl'), 'rb') as file:
        testing = pickle.load(file)
    return testing
def save_lexicon_results(results, name):
    with open(get_file_path(name + '.pkl'), 'wb') as f:
        pickle.dump(results, f, pickle.HIGHEST_PROTOCOL)
def export_nouns_adj_adv(data, filename):
    with open(get_file_path('processed\\' + filename + '.pkl'), 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
def import_tagged_words(name):
    with open(get_file_path('processed\\tagged_words_' + name + '.pkl'), 'rb') as file:
        return pickle.load(file)
def read_pickle(folder, file):
    with open(get_file_path(folder + '\\' + file + '.pkl'), 'rb') as lines:
        return pickle.load(lines)
def export_training_testing(training, testing):
    with open(get_file_path("interim\\training.pkl"), "wb") as file:
        pickle.dump(training, file, pickle.HIGHEST_PROTOCOL)
    with open(get_file_path("interim\\testing.pkl"), "wb") as file:
        pickle.dump(testing, file, pickle.HIGHEST_PROTOCOL)
def import_lexicon_set():
    with open(get_file_path('interim\\lexicon_dataset_smaller.pkl'), 'rb') as f:
        return pickle.load(f)
def write_new_pickle(review_list, name):
    with open(get_file_path("interim\\" + name + ".pkl"), "wb") as f:
        pickle.dump(review_list, f)
def export_scores(scores):
    with open(get_file_path("scores.txt"), "w") as file:
        file.write(scores)
def read_pickle_files(file):
    with open(get_file_path('raw\\' + file + '.pkl'), 'rb') as lines:
        return pickle.load(lines)
def export_comments(set_to_save):
    with open(get_file_path("processed\\comments.pkl"), "wb") as file:
        pickle.dump(set_to_save, file, pickle.HIGHEST_PROTOCOL)
def export_dataset(set_to_save, name):
    with open(get_file_path("processed\\" + name + ".pkl"), "wb") as file:
        pickle.dump(set_to_save, file, pickle.HIGHEST_PROTOCOL)
from features.normalize import lemmatize
from src.utils.utils import get_file_path

with open(get_file_path("positive-words.txt"), "r") as f:
    positive = f.read().replace("\n", " ")
with open(get_file_path("negative-words.txt"), "r") as f:
    negative = f.read().replace("\n", " ")

# Lemmatize both lexicons, then deduplicate and sort the surviving words.
pos = sorted(set(lemmatize(positive).split()))
neg = sorted(set(lemmatize(negative).split()))

with open(get_file_path("positive_lemmatized.txt"), 'w') as new_positive:
    for word in pos:
        new_positive.write("%s\n" % word)
with open(get_file_path("negative_lemmatized.txt"), "w") as new_negative:
    for word in neg:
        new_negative.write("%s\n" % word)
def get_stopwords():
    with open(get_file_path('stopwords.txt'), 'r') as f:
        return f.read()
def get_file_path_test():
    file_path = utils.get_file_path(__file__)
    print(file_path)
def export_sampled_datasets(train, file):
    with open(get_file_path("raw\\" + file + ".pkl"), "wb") as f:
        dump(train, f)