def main():
    # dataset = 'hotel'
    dataset = 'restaurant'
    my_folder = '/Users/fpena/UCC/Thesis/datasets/context/'
    my_training_records_file =\
        my_folder + 'classified_' + dataset + '_reviews.json'
    my_training_reviews_file =\
        my_folder + 'classified_' + dataset + '_reviews.pkl'
    my_training_records = ETLUtils.load_json_file(my_training_records_file)

    with open(my_training_reviews_file, 'rb') as read_file:
        my_training_reviews = pickle.load(read_file)

    classifier = ReviewsClassifier()
    classifier.train(my_training_records, my_training_reviews)

    my_input_records_file =\
        my_folder + 'yelp_training_set_review_' + dataset + 's_shuffled.json'
    my_input_reviews_file =\
        my_folder + 'reviews_' + dataset + '_shuffled.pkl'
    my_output_records_file =\
        my_folder + 'yelp_training_set_review_' + dataset +\
        's_shuffled_tagged.json'

    with open(my_input_reviews_file, 'rb') as read_file:
        my_input_reviews = pickle.load(read_file)

    my_input_records = ETLUtils.load_json_file(my_input_records_file)

    my_output_records =\
        classifier.label_json_reviews(my_input_records, my_input_reviews)

    ETLUtils.save_json_file(my_output_records_file, my_output_records)
def export_records_to_predict(self, records_file):
    if self.records_to_predict is None:
        self.records_to_predict = self.get_records_to_predict()
    ETLUtils.save_json_file(records_file, self.records_to_predict)
    with open(records_file + '.pkl', 'wb') as write_file:
        pickle.dump(
            self.items_to_predict, write_file, pickle.HIGHEST_PROTOCOL)
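
# export_records_to_predict writes two artifacts: the JSON records and a
# pickle sidecar with the items to predict. A minimal round-trip sketch,
# assuming ETLUtils.load_json_file mirrors save_json_file (records_file is
# the same path passed to the method above):
import pickle

records_to_predict = ETLUtils.load_json_file(records_file)
with open(records_file + '.pkl', 'rb') as read_file:
    items_to_predict = pickle.load(read_file)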
def export_records(self):
    print('%s: export records' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    self.dictionary.save(Constants.DICTIONARY_FILE)
    ETLUtils.save_json_file(
        Constants.FULL_PROCESSED_RECORDS_FILE, self.records)
    self.drop_unnecessary_fields()
    ETLUtils.save_json_file(Constants.PROCESSED_RECORDS_FILE, self.records)
def export_records(self):
    print('%s: exporting transformed records' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    records_to_export = []
    desired_fields = [
        Constants.USER_INTEGER_ID_FIELD,
        Constants.ITEM_INTEGER_ID_FIELD,
        Constants.RATING_FIELD,
        Constants.CONTEXT_FIELD,
    ]
    for record in self.records:
        new_record = {field: record[field] for field in desired_fields}
        records_to_export.append(new_record)

    file_name = Constants.generate_file_name(
        'recsys_formatted_context_records', 'json', Constants.CACHE_FOLDER,
        None, None, True, True, uses_carskit=False, normalize_topics=True,
        format_context=True)
    ETLUtils.save_json_file(file_name, records_to_export)
def count_frequencies(self):
    """
    Counts the number of reviews each user and each item has, and stores
    the results in two separate files: one for the users and one for the
    items. Note that the integer IDs are used, not the original user and
    item IDs.
    """
    print('%s: count frequencies' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    user_frequency_map = ETLUtils.count_frequency(
        self.records, Constants.USER_INTEGER_ID_FIELD)
    item_frequency_map = ETLUtils.count_frequency(
        self.records, Constants.ITEM_INTEGER_ID_FIELD)

    user_frequency_file = Constants.generate_file_name(
        'user_frequency_map', 'json', Constants.CACHE_FOLDER, None,
        None, False)
    item_frequency_file = Constants.generate_file_name(
        'item_frequency_map', 'json', Constants.CACHE_FOLDER, None,
        None, False)

    ETLUtils.save_json_file(user_frequency_file, [user_frequency_map])
    ETLUtils.save_json_file(item_frequency_file, [item_frequency_map])
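
# ETLUtils.count_frequency is not shown in this section; a minimal sketch of
# what it presumably does (an assumption: it returns a map from each distinct
# field value to the number of records carrying it):
from collections import Counter

def count_frequency(records, field):
    # e.g. for USER_INTEGER_ID_FIELD: {user_id: number_of_reviews_by_user}
    return dict(Counter(record[field] for record in records))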
def train_topic_model(self, cycle_index, fold_index):
    context_extractor = topic_model_creator.create_topic_model(
        self.train_records, cycle_index, fold_index)
    self.context_rich_topics = context_extractor.context_rich_topics

    topics_file_path = Constants.generate_file_name(
        'context_topics', 'json', Constants.CACHE_FOLDER, cycle_index,
        fold_index, True)
    ETLUtils.save_json_file(
        topics_file_path, [dict(self.context_rich_topics)])
    print('Trained Context Extractor: %s' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    return context_extractor
def lemmatize_records(self):
    if os.path.exists(Constants.LEMMATIZED_RECORDS_FILE):
        print('Records were already lemmatized')
        self.records = \
            ETLUtils.load_json_file(Constants.LEMMATIZED_RECORDS_FILE)
        return

    if Constants.DOCUMENT_LEVEL == 'review':
        self.records = self.lemmatize_reviews(self.records)
    elif Constants.DOCUMENT_LEVEL == 'sentence' or \
            isinstance(Constants.DOCUMENT_LEVEL, (int, long)):
        self.records = self.lemmatize_sentences(self.records)

    ETLUtils.save_json_file(Constants.LEMMATIZED_RECORDS_FILE, self.records)
def test():
    document_term_matrix = NmfTopicExtractor.load_document_term_matrix()

    results = []

    # my_list = range(2, 31)
    my_list = range(2, 61)

    for i in my_list:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: i})

        topic_model = NmfTopicExtractor()
        topic_model.load_trained_data()

        document_topic_matrix = topic_model.document_topic_matrix
        topic_term_matrix = topic_model.topic_term_matrix

        divergence = calculate_divergence(
            document_term_matrix, document_topic_matrix, topic_term_matrix)

        result = {
            'num_topics': Constants.TOPIC_MODEL_NUM_TOPICS,
            'divergence': divergence,
            Constants.TOPIC_MODEL_TYPE_FIELD: 'ensemble',
            Constants.BUSINESS_TYPE_FIELD: Constants.ITEM_TYPE,
        }
        results.append(result)

        print('Num topics: %d, Divergence: %f' %
              (Constants.TOPIC_MODEL_NUM_TOPICS, divergence))

    for result in results:
        print('%d %f' % (result['num_topics'], result['divergence']))

    prefix = Constants.RESULTS_FOLDER + Constants.ITEM_TYPE + \
        '_topic_model_divergence'
    csv_file_path = prefix + '.csv'
    json_file_path = prefix + '.json'
    headers = sorted(results[0].keys())
    ETLUtils.save_csv_file(csv_file_path, results, headers)
    ETLUtils.save_json_file(json_file_path, results)
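
# calculate_divergence is referenced above but not defined in this section.
# A minimal stand-in under the assumption that it measures how well the
# factorization W * H reconstructs the document-term matrix X, using the
# generalized Kullback-Leibler (I-)divergence minimized by NMF's KL
# variant; the author's actual metric may differ.
import numpy as np

def calculate_divergence(document_term_matrix, document_topic_matrix,
                         topic_term_matrix):
    x = np.asarray(document_term_matrix, dtype=float)
    wh = np.dot(np.asarray(document_topic_matrix, dtype=float),
                np.asarray(topic_term_matrix, dtype=float))
    eps = 1e-12  # avoid log(0) and division by zero
    return float(np.sum(x * np.log((x + eps) / (wh + eps)) - x + wh))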
def tag_reviews_language(self):
    print('%s: tag reviews language' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if os.path.exists(Constants.LANGUAGE_RECORDS_FILE):
        print('Records have already been tagged with language field')
        self.records = \
            ETLUtils.load_json_file(Constants.LANGUAGE_RECORDS_FILE)
        return

    DetectorFactory.seed = 0
    for record in self.records:
        try:
            language = langdetect.detect(record[Constants.TEXT_FIELD])
        except LangDetectException:
            language = 'unknown'
        record[Constants.LANGUAGE_FIELD] = language

    ETLUtils.save_json_file(Constants.LANGUAGE_RECORDS_FILE, self.records)
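
# langdetect's predictions are only deterministic across runs when the
# factory seed is pinned, which is why DetectorFactory.seed is set above.
# A standalone sketch of the same pattern (langdetect is the real library;
# the detect_language helper name is ours):
import langdetect
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

DetectorFactory.seed = 0  # make detection deterministic across runs

def detect_language(text):
    try:
        return langdetect.detect(text)  # e.g. 'en', 'es'
    except LangDetectException:  # raised on empty or undetectable text
        return 'unknown'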
def load_all_reviews():
    city_files = [
        '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/Chicago_review.xml',
        '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/Dublin_review.xml',
        '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/Hong kong_review.xml',
        '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/London_review.xml',
        '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/New York_review.xml',
        '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/Singapore_review.xml',
    ]

    all_reviews = []
    for city_file in city_files:
        city_reviews = load_reviews(city_file)
        all_reviews.extend(city_reviews)

    ETLUtils.save_json_file(
        '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/all_reviews.json',
        all_reviews)
    cleaned_reviews = clean_reviews(all_reviews)
    ETLUtils.save_json_file(
        '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/cleaned_reviews.json',
        cleaned_reviews)

    return all_reviews
def label_json_reviews(self, input_file, output_file, reviews=None):
    records = ETLUtils.load_json_file(input_file)

    if reviews is None:
        reviews = []
        for record in records:
            reviews.append(Review(record["text"]))

    if len(records) != len(reviews):
        msg = "The size of the records and reviews arrays must be the same"
        raise ValueError(msg)

    predicted_classes = self.predict(reviews)

    for record, predicted_class in zip(records, predicted_classes):
        if predicted_class:
            label = "specific"
        else:
            label = "generic"
        record["predicted_class"] = label

    ETLUtils.save_json_file(output_file, records)
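
# A hedged usage sketch for label_json_reviews; the file names below are
# hypothetical, and the classifier is assumed to have been trained as in
# main() above.
classifier = ReviewsClassifier()
classifier.train(my_training_records, my_training_reviews)
classifier.label_json_reviews(
    my_folder + 'reviews_to_label.json',  # records with a "text" field
    my_folder + 'reviews_labeled.json')   # same records + "predicted_class"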
def find_reviews_topics(self, context_extractor, cycle_index, fold_index):
    print('find topics: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    train_records_file_path = Constants.generate_file_name(
        'context_train_records', 'json', Constants.CACHE_FOLDER,
        cycle_index, fold_index, Constants.USE_CONTEXT)

    if os.path.exists(train_records_file_path):
        self.train_records = \
            ETLUtils.load_json_file(train_records_file_path)
    else:
        context_extractor.find_contextual_topics(self.train_records)
        ETLUtils.save_json_file(train_records_file_path, self.train_records)

    context_extractor.find_contextual_topics(
        self.important_records, Constants.TEXT_SAMPLING_PROPORTION)

    self.context_topics_map = {}
    for record in self.important_records:
        self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
            record[Constants.CONTEXT_TOPICS_FIELD]

    self.important_records = None
    gc.collect()
def separate_recsys_topic_model_records(self):
    print('%s: separate_recsys_topic_model_records' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    num_records = len(self.records)
    # // keeps the slice index an integer under both Python 2 and 3
    topic_model_records = self.records[:num_records // 2]

    if not Constants.USE_CONTEXT:
        recsys_records = self.records[num_records // 2:]
        file_name = Constants.generate_file_name(
            'recsys_contextual_records', 'json', Constants.CACHE_FOLDER,
            None, None, False, True)
        print('Records without context file: %s' % file_name)

        for record in recsys_records:
            record[Constants.CONTEXT_TOPICS_FIELD] = {'na': 1.0}
        ETLUtils.save_json_file(file_name, recsys_records)
        return

    topic_model_creator.train_topic_model(topic_model_records)

    if os.path.exists(Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE):
        print('Recsys topic records have already been generated')
        recsys_records = ETLUtils.load_json_file(
            Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)
    else:
        recsys_records = self.records[num_records // 2:]
        self.find_topic_distribution(recsys_records)
        ETLUtils.save_json_file(
            Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE, recsys_records)

    if os.path.exists(Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE):
        print('Recsys contextual records have already been generated')
        print(Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE)
        recsys_records = ETLUtils.load_json_file(
            Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE)
    else:
        self.update_context_topics(recsys_records)
        ETLUtils.save_json_file(
            Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE,
            recsys_records)

    context_transformer = ContextTransformer(recsys_records)
    context_transformer.load_data()
    context_transformer.transform_records()
    context_transformer.export_records()
            if review['business_id'] in business_ids:
                filtered_reviews.append(review)
        return filtered_reviews

    @staticmethod
    def sort_records(records, field, reverse=False):
        return sorted(records, key=itemgetter(field), reverse=reverse)


start = time.time()

review_etl = ReviewETL()
my_business_file = \
    "/Users/fpena/tmp/yelp_training_set/yelp_training_set_business.json"
my_reviews_file = \
    "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review.json"
my_business_ids = BusinessETL.get_business_ids(my_business_file, 'Hotels')
my_reviews = ETLUtils.load_json_file(my_reviews_file)
# print(len(ReviewETL.filter_reviews_by_business(
#     my_reviews, my_business_ids, 'text')))
my_hotel_reviews = ReviewETL.filter_reviews_by_business_slow(
    my_reviews, my_business_ids)
my_hotels_file = \
    "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review_hotels.json"
ETLUtils.save_json_file(my_hotels_file, my_hotel_reviews)
# my_sorted_reviews = ReviewETL.sort_records(my_reviews, 'business_id')
# print(len(my_sorted_reviews))
# main()

end = time.time()
total_time = end - start
print("Total time = %f seconds" % total_time)
    return reviews


def extract_fields(reviews):
    """
    Modifies the given list of reviews in order to extract the values
    contained in the ratings field into top-level fields. For instance, a
    review of the form
    {'user_id': 'U1', 'offering_id': 'I1',
     'ratings': {'cleanliness': 4.0, 'location': 5.0}}
    would become
    {'user_id': 'U1', 'offering_id': 'I1',
     'ratings': {'cleanliness': 4.0, 'location': 5.0},
     'cleanliness_rating': 4.0, 'location_rating': 5.0}

    :param reviews: a list of reviews
    """
    for review in reviews:
        review['offering_id'] = review['business_id']
        review['overall_rating'] = review['stars']


my_reviews = pre_process_reviews()
filtered_reviews_file = '/Users/fpena/UCC/Thesis/datasets/' \
    'yelp_phoenix_academic_dataset/filtered_reviews.json'
ETLUtils.save_json_file(filtered_reviews_file, my_reviews)
# print('Num reviews', len(my_reviews))
print(my_reviews[0])
print(my_reviews[1])
# print(my_reviews[2])
# print(my_reviews[3])
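
# The body of extract_fields above only aliases 'business_id' and 'stars';
# a minimal sketch of the ratings flattening its docstring describes
# (extract_rating_fields is a hypothetical helper name):
def extract_rating_fields(reviews):
    for review in reviews:
        # promote each nested sub-rating to a top-level '<aspect>_rating'
        for aspect, value in review.get('ratings', {}).items():
            review[aspect + '_rating'] = value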
def transform_manually_labeled_reviews():
    full_records = ETLUtils.load_json_file(
        Constants.DATASET_FOLDER +
        'yelp_training_set_review_restaurants_shuffled_tagged.json')
    records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    print('total records: %d' % len(records))

    new_records = []

    for record in records:
        sentence_index = record['sentence_index']
        if sentence_index > 0:
            continue
        record['predicted_class'] = record['sentence_type']
        new_records.append(record)

    # count = 0
    # for new_record in new_records:
    #     internal_count = 0
    #     for full_record in full_records:
    #         if full_record['text'].startswith(new_record['text']):
    #             # print(full_record['text'])
    #             internal_count += 1
    #             count += 1
    #             print('internal count: %d\treview_id: %s' %
    #                   (internal_count, full_record['review_id']))
    #
    #     if internal_count > 1:
    #         print('internal count: %d\treview_id: %s' %
    #               (internal_count, new_record['text']))
    # print('count: %d' % count)

    index = 0
    for new_record in new_records:
        while True:
            full_record = full_records[index]
            if full_record['text'].startswith(new_record['text']):
                new_record[Constants.USER_ID_FIELD] = full_record['user_id']
                new_record[Constants.ITEM_ID_FIELD] = \
                    full_record['business_id']
                new_record[Constants.REVIEW_ID_FIELD] = \
                    full_record['review_id']
                new_record[Constants.RATING_FIELD] = full_record['stars']
                break
            index += 1
        index += 1

    print('index: %d' % index)

    for new_record in new_records:
        for full_record in full_records:
            if new_record['review_id'] == full_record['review_id']:
                print('%s ====' % new_record['text'])
                print(full_record['text'])
                print('******************\n******************\n'
                      '******************\n******************')
                break

    # reviews_preprocessor = ReviewsPreprocessor()
    # new_records = reviews_preprocessor.lemmatize_sentences(new_records)
    # reviews_preprocessor.records = new_records
    # reviews_preprocessor.build_bag_of_words()
    # reviews_preprocessor.drop_unnecessary_fields()

    new_classified_records_file = Constants.DATASET_FOLDER + 'classified_' + \
        Constants.ITEM_TYPE + '_reviews_first_sentences.json'
    print(new_records[0])
    ETLUtils.save_json_file(new_classified_records_file, new_records)
def export_records(self):
    print('%s: export records' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    ETLUtils.save_json_file(
        Constants.FULL_PROCESSED_RECORDS_FILE, self.records)
    self.drop_unnecessary_fields()
    ETLUtils.save_json_file(Constants.PROCESSED_RECORDS_FILE, self.records)