main.py
import config
import logging
from DocumentBank import DocumentBank, Movie
from parseReview import HtmlReviewParser, AmazonReviewsParser
from time import time, strftime
import os
import logger
import utils
from random import shuffle
import json
from shutil import copyfile
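

# End-to-end pipeline: parse Amazon movie reviews into a DocumentBank,
# extract topics from a training split, train per-topic classifiers,
# classify a held-out split, and write a JSON report (plus the run log)
# into reports/.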


def date():
    # Timestamp used for report file names and the start/end dates in reports
    return strftime('%Y-%m-%d-%H:%M:%S')


def write_report(filename, report):
    # NOTE: assumes the reports/ directory already exists
    with open('reports/' + filename + '.json', 'w') as outfile:
        json.dump(report, outfile, sort_keys=True, indent=2)
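

# Legacy ingestion path for per-file HTML reviews; not called from main()
# below, which reads the Amazon JSON dump instead.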
def append_html_reviews_to_bank(bank, reviews_path, max_reviews):
    logging.info('Starting to parse reviews')
    # Counters are only for debugging and logging purposes
    t0 = time()
    success = 0
    failed = 0
    partial_success = 0
    progress = 0
    files_names = os.listdir(reviews_path)[:max_reviews]
    files_number = len(files_names)
    for i, file_name in enumerate(files_names):
        # Log progress every 10%
        new_progress = int(i / files_number * 100)
        if new_progress != progress and new_progress % 10 == 0:
            logging.info('Progress: %i%%' % new_progress)
            progress = new_progress
        # Do the work
        with open(os.path.join(reviews_path, file_name), encoding='latin-1') as file:
            try:
                doc = HtmlReviewParser.parse(file.read())
                bank.add_document(doc.pop('review'), doc)
                if doc['rating'] != '?':
                    success += 1
                else:
                    # Rating parsing often fails, so a '?' rating counts as a partial success
                    partial_success += 1
            except Exception as e:
                failed += 1
                logging.debug('Failed on %s : %s' % (file_name, str(e)))
    logging.info('Tried %i documents, %i%% full success, %i partial, %i failed, in %is.' %
                 (files_number,
                  int(success / files_number * 100),
                  partial_success,
                  failed,
                  int(time() - t0)))
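

# Main experiment: extract topics from a training split, then classify the
# held-out movies against them.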
def main():
    # Initialization
    start_date = date()
    logger.initialize('.')
    bank = DocumentBank(document_class=Movie)

    # Fetching stop words
    logging.info('Fetching stop words')
    stop_words = utils.stop_words(config.LANGUAGE_STOP_WORDS_PATH)
    stop_words.extend(utils.stop_words(config.PROJECT_STOP_WORDS_PATH))
    logging.info('Fetched %i stop words' % len(stop_words))

    n_movies = config.maxsize if config.READ_ALL_THEN_SHUFFLE else config.MOVIES_TO_CLASSIFY + config.MOVIES_TO_ANALYZE

    # Read reviews from disk
    n_reviews, movies_reviews = AmazonReviewsParser.from_json(config.AMAZON_REVIEWS_FILE,
                                                              meta=config.METADATA_FILE,
                                                              max_movies=n_movies)
    movies = [Movie(movie_id, movie['title'], [{
                  'userID': review['reviewer_id'],
                  'rating': review['score'],
                  'review': review['review']
              } for review in movie['reviews']])
              for movie_id, movie in movies_reviews.items()]

    # Shuffle the movies so that the held-out movies to classify aren't biased
    shuffle(movies)

    # Split: the first MOVIES_TO_ANALYZE go into the bank, the last MOVIES_TO_CLASSIFY are held out
    movies_to_analyze = movies[:config.MOVIES_TO_ANALYZE]
    movies_to_classify = movies[-config.MOVIES_TO_CLASSIFY:]
    logging.info('Analyzing %i movies' % len(movies_to_analyze))
    bank.add_documents([movie.serialize() for movie in movies_to_analyze])

    # First vectorize the dataset
    bank.vectorize(stop_words=stop_words, max_features=config.MAX_FEATURES)
    # Then extract topics and assign them to movies in the dataset
    training_counter = bank.topic_extraction({'rank': config.N_TOPICS, 'beta': config.BETA},
                                             n_words=config.N_TOP_WORDS)
    # Train the classifiers on the assigned topics
    bank.train_classifiers_fullset(n_jobs=config.N_JOBS,
                                   min_amount_relevant=int(config.MIN_RELEVANCE * len(movies_to_analyze)))

    # Retrieve results; topic id -1 collects the movies that got no topic
    topics = bank.shelf['topics']
    classification_counter = dict((i, []) for i in range(-1, config.N_TOPICS))
    for movie in movies_to_classify:
        movie_topics = [topics[topic_id] for topic_id in
                        bank.classify_document(movie.full_text())]
        for topic in movie_topics:
            classification_counter[topic.id].append({
                'id': movie.id,
                'title': movie.title
            })
        if len(movie_topics):
            logging.info('Topics for document: %s: %s' % (movie.title, str(movie_topics)))
        else:
            classification_counter[-1].append(movie.title)
    for topic in classification_counter.keys():
        logging.info('Topic #%i: %i movies assigned' % (topic, len(classification_counter[topic])))
    logging.info('Managed to classify %i%% of the documents.' %
                 int((len(movies_to_classify) - len(classification_counter[-1])) / len(movies_to_classify) * 100))

    # Write results to JSON
    report_filename = date()
    write_report(report_filename, {
        'start_date': start_date,
        'end_date': date(),
        'params': {
            'max_reviews': config.MAX_REVIEWS,
            'max_features': config.MAX_FEATURES,
            'min_relevance': config.MIN_RELEVANCE,
            'n_topics': config.N_TOPICS,
            'n_reviews': n_reviews,
            'n_movies': len(movies),
            'n_movies_training': len(movies_to_analyze),
            'n_movies_classify': len(movies_to_classify),
            'beta': config.BETA,
        },
        'results': [{
            'topic': topics[topic_id].top_words,
            'training_movies_in_topic': training_counter[topic_id],
            'classification_movies_in_topic': classification_counter[topic_id]
        } for topic_id in topics]
    })
    bank.close()
    copyfile('./all.log', './reports/%s.log' % report_filename)


if __name__ == '__main__':
    main()
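
# Usage: python main.py
# All tunables (paths, N_TOPICS, BETA, MAX_FEATURES, ...) come from the
# config module.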