Example #1
def main():
    # dataset = 'hotel'
    dataset = 'restaurant'
    my_folder = '/Users/fpena/UCC/Thesis/datasets/context/'
    my_training_records_file =\
        my_folder + 'classified_' + dataset + '_reviews.json'
    my_training_reviews_file =\
        my_folder + 'classified_' + dataset + '_reviews.pkl'
    my_training_records = ETLUtils.load_json_file(my_training_records_file)

    with open(my_training_reviews_file, 'rb') as read_file:
        my_training_reviews = pickle.load(read_file)

    classifier = ReviewsClassifier()
    classifier.train(my_training_records, my_training_reviews)

    my_input_records_file =\
        my_folder + 'yelp_training_set_review_' + dataset + 's_shuffled.json'
    my_input_reviews_file =\
        my_folder + 'reviews_' + dataset + '_shuffled.pkl'
    my_output_records_file =\
        my_folder + 'yelp_training_set_review_' + dataset +\
        's_shuffled_tagged.json'

    with open(my_input_reviews_file, 'rb') as read_file:
        my_input_reviews = pickle.load(read_file)

    my_input_records = ETLUtils.load_json_file(my_input_records_file)

    my_output_records =\
        classifier.label_json_reviews(my_input_records, my_input_reviews)

    ETLUtils.save_json_file(my_output_records_file, my_output_records)
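Every example on this page goes through ETLUtils.save_json_file (and most also through ETLUtils.load_json_file). The project's actual implementation is not reproduced here; a minimal sketch consistent with how the examples call these helpers, assuming newline-delimited JSON (the format of the Yelp dataset files they read), could look like this:

import json


class ETLUtils(object):

    @staticmethod
    def load_json_file(file_path):
        # Assumes one JSON record per line (newline-delimited JSON).
        with open(file_path) as json_file:
            return [json.loads(line) for line in json_file]

    @staticmethod
    def save_json_file(file_path, records):
        # Writes one JSON record per line, mirroring load_json_file.
        with open(file_path, 'w') as json_file:
            for record in records:
                json_file.write(json.dumps(record) + '\n')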
Example #2
    def export_records_to_predict(self, records_file):
        if self.records_to_predict is None:
            self.records_to_predict = self.get_records_to_predict()
        ETLUtils.save_json_file(records_file, self.records_to_predict)
        with open(records_file + '.pkl', 'wb') as write_file:
            pickle.dump(
                self.items_to_predict, write_file, pickle.HIGHEST_PROTOCOL)
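The method writes the data twice: a JSON copy of the records for inspection and a pickle copy of items_to_predict (same base name plus '.pkl') for fast reloading. A standalone illustration of the pickle round trip, with a hypothetical file name:

import pickle

items_to_predict = [{'item_id': 'I1'}, {'item_id': 'I2'}]

with open('records.json.pkl', 'wb') as write_file:
    pickle.dump(items_to_predict, write_file, pickle.HIGHEST_PROTOCOL)

with open('records.json.pkl', 'rb') as read_file:
    assert pickle.load(read_file) == items_to_predict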
Example #3
    def export_records(self):
        print('%s: export records' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.dictionary.save(Constants.DICTIONARY_FILE)
        ETLUtils.save_json_file(
            Constants.FULL_PROCESSED_RECORDS_FILE, self.records)
        self.drop_unnecessary_fields()
        ETLUtils.save_json_file(Constants.PROCESSED_RECORDS_FILE, self.records)
Example #4
    def export_records(self):
        print('%s: exporting transformed records' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        records_to_export = []
        desired_fields = [
            Constants.USER_INTEGER_ID_FIELD,
            Constants.ITEM_INTEGER_ID_FIELD,
            Constants.RATING_FIELD,
            Constants.CONTEXT_FIELD,
        ]

        for record in self.records:
            new_record = {field: record[field] for field in desired_fields}
            records_to_export.append(new_record)

        file_name = Constants.generate_file_name(
            'recsys_formatted_context_records',
            'json',
            Constants.CACHE_FOLDER,
            None,
            None,
            True,
            True,
            uses_carskit=False,
            normalize_topics=True,
            format_context=True)
        ETLUtils.save_json_file(file_name, records_to_export)
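The export loop projects each record onto the desired fields with a dict comprehension before saving. The same idiom standalone, with made-up field names:

desired_fields = ['user_id', 'rating']
record = {'user_id': 'U1', 'rating': 5.0, 'text': 'Great place'}
new_record = {field: record[field] for field in desired_fields}
print(new_record)  # {'user_id': 'U1', 'rating': 5.0}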
Example #5
    def count_frequencies(self):
        """
        Counts the number of reviews each user and item has and stores the
        results in two separate files, one for the users and another for the
        items. Note that the integer IDs are used, not the original user and
        item IDs.
        """
        print('%s: count frequencies' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        user_frequency_map = ETLUtils.count_frequency(
            self.records, Constants.USER_INTEGER_ID_FIELD)
        item_frequency_map = ETLUtils.count_frequency(
            self.records, Constants.ITEM_INTEGER_ID_FIELD)

        user_frequency_file = Constants.generate_file_name(
            'user_frequency_map', 'json', Constants.CACHE_FOLDER, None, None,
            False
        )
        item_frequency_file = Constants.generate_file_name(
            'item_frequency_map', 'json', Constants.CACHE_FOLDER, None, None,
            False
        )

        ETLUtils.save_json_file(user_frequency_file, [user_frequency_map])
        ETLUtils.save_json_file(item_frequency_file, [item_frequency_map])
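ETLUtils.count_frequency is not shown on this page; judging from how it is called, it maps each distinct value of the given field to the number of records carrying that value. A hypothetical sketch (the project's real code may differ):

from collections import Counter


def count_frequency(records, field):
    # {value of `field`: number of records with that value}
    return dict(Counter(record[field] for record in records))


records = [{'user_id': 1}, {'user_id': 1}, {'user_id': 2}]
print(count_frequency(records, 'user_id'))  # {1: 2, 2: 1}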
Example #6
    def train_topic_model(self, cycle_index, fold_index):

        context_extractor = topic_model_creator.create_topic_model(
            self.train_records, cycle_index, fold_index)
        self.context_rich_topics = context_extractor.context_rich_topics

        topics_file_path = Constants.generate_file_name(
            'context_topics', 'json', Constants.CACHE_FOLDER, cycle_index,
            fold_index, True)
        ETLUtils.save_json_file(topics_file_path,
                                [dict(self.context_rich_topics)])
        print('Trained Context Extractor: %s' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        return context_extractor
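context_rich_topics appears to be an iterable of (topic, weight) pairs, so dict() turns it into a mapping before it is wrapped in a list and saved. Note that JSON serialization renders integer topic IDs as string keys:

import json

context_rich_topics = [(0, 0.83), (4, 0.61)]
print(json.dumps([dict(context_rich_topics)]))  # [{"0": 0.83, "4": 0.61}]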
Example #7
    def lemmatize_records(self):

        if os.path.exists(Constants.LEMMATIZED_RECORDS_FILE):
            print('Records were already lemmatized')
            self.records = \
                ETLUtils.load_json_file(Constants.LEMMATIZED_RECORDS_FILE)
            return

        if Constants.DOCUMENT_LEVEL == 'review':
            self.records = self.lemmatize_reviews(self.records)
        elif Constants.DOCUMENT_LEVEL == 'sentence' or\
                isinstance(Constants.DOCUMENT_LEVEL, (int, long)):
            self.records = self.lemmatize_sentences(self.records)

        ETLUtils.save_json_file(Constants.LEMMATIZED_RECORDS_FILE, self.records)
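The isinstance check uses long, which only exists in Python 2, so this code targets Python 2. If it ever had to run on Python 3 as well, a common compatibility shim is:

try:
    integer_types = (int, long)  # Python 2
except NameError:
    integer_types = (int,)       # Python 3: int is arbitrary precision

# then: isinstance(Constants.DOCUMENT_LEVEL, integer_types)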
Example #8
def test():
    document_term_matrix = NmfTopicExtractor.load_document_term_matrix()

    results = []

    # my_list = range(2, 31)
    my_list = range(2, 61)

    for i in my_list:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: i})
        topic_model = NmfTopicExtractor()
        topic_model.load_trained_data()

        document_topic_matrix = topic_model.document_topic_matrix
        topic_term_matrix = topic_model.topic_term_matrix

        divergence = calculate_divergence(document_term_matrix,
                                          document_topic_matrix,
                                          topic_term_matrix)

        result = {
            'num_topics': Constants.TOPIC_MODEL_NUM_TOPICS,
            'divergence': divergence,
            Constants.TOPIC_MODEL_TYPE_FIELD: 'ensemble',
            Constants.BUSINESS_TYPE_FIELD: Constants.ITEM_TYPE
        }

        results.append(result)

        print('Num topics: %d, Divergence: %f' %
              (Constants.TOPIC_MODEL_NUM_TOPICS, divergence))

    for result in results:
        print('%d %f' % (result['num_topics'], result['divergence']))

    prefix = Constants.RESULTS_FOLDER + Constants.ITEM_TYPE +\
        '_topic_model_divergence'
    csv_file_path = prefix + '.csv'
    json_file_path = prefix + '.json'
    headers = sorted(results[0].keys())
    ETLUtils.save_csv_file(csv_file_path, results, headers)
    ETLUtils.save_json_file(json_file_path, results)
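ETLUtils.save_csv_file is also project-specific; given the (path, records, headers) call above, a plausible sketch on top of the standard csv module (the real implementation may differ):

import csv


def save_csv_file(file_path, records, headers):
    with open(file_path, 'w') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=headers)
        writer.writeheader()
        writer.writerows(records)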
Example #9
    def tag_reviews_language(self):

        print('%s: tag reviews language' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        if os.path.exists(Constants.LANGUAGE_RECORDS_FILE):
            print('Records have already been tagged with language field')
            self.records = \
                ETLUtils.load_json_file(Constants.LANGUAGE_RECORDS_FILE)
            return

        DetectorFactory.seed = 0

        for record in self.records:
            try:
                language = langdetect.detect(record[Constants.TEXT_FIELD])
            except LangDetectException:
                language = 'unknown'
            record[Constants.LANGUAGE_FIELD] = language

        ETLUtils.save_json_file(Constants.LANGUAGE_RECORDS_FILE, self.records)
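langdetect's detection is non-deterministic for short or ambiguous texts, which is why the example pins DetectorFactory.seed. A minimal standalone use of the same pattern:

import langdetect
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

DetectorFactory.seed = 0

for text in ('The food was fantastic', 'La comida era fantastica', '!!!'):
    try:
        print(langdetect.detect(text))  # e.g. 'en', 'es'
    except LangDetectException:
        print('unknown')  # raised when the text has no detectable features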
Example #10
def load_all_reviews():
    city_files = [
        '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/Chicago_review.xml',
        '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/Dublin_review.xml',
        '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/Hong kong_review.xml',
        '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/London_review.xml',
        '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/New York_review.xml',
        '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/Singapore_review.xml'
    ]

    all_reviews = []

    for city_file in city_files:
        city_reviews = load_reviews(city_file)
        all_reviews.extend(city_reviews)

    ETLUtils.save_json_file('/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/all_reviews.json', all_reviews)

    cleaned_reviews = clean_reviews(all_reviews)
    ETLUtils.save_json_file('/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/cleaned_reviews.json', cleaned_reviews)

    return all_reviews
Example #11
    def label_json_reviews(self, input_file, output_file, reviews=None):

        records = ETLUtils.load_json_file(input_file)

        if reviews is None:
            reviews = []
            for record in records:
                reviews.append(Review(record["text"]))

        if len(records) != len(reviews):
            msg = "The size of the records and reviews arrays must be the same"
            raise ValueError(msg)
        predicted_classes = self.predict(reviews)

        for record, predicted_class in zip(records, predicted_classes):
            if predicted_class:
                label = "specific"
            else:
                label = "generic"

            record["predicted_class"] = label

        ETLUtils.save_json_file(output_file, records)
Example #12
    def find_reviews_topics(self, context_extractor, cycle_index, fold_index):
        print('find topics: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        train_records_file_path = Constants.generate_file_name(
            'context_train_records', 'json', Constants.CACHE_FOLDER,
            cycle_index, fold_index, Constants.USE_CONTEXT)

        if os.path.exists(train_records_file_path):
            self.train_records = \
                ETLUtils.load_json_file(train_records_file_path)
        else:
            context_extractor.find_contextual_topics(self.train_records)
            ETLUtils.save_json_file(train_records_file_path,
                                    self.train_records)
        context_extractor.find_contextual_topics(
            self.important_records, Constants.TEXT_SAMPLING_PROPORTION)

        self.context_topics_map = {}
        for record in self.important_records:
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
                record[Constants.CONTEXT_TOPICS_FIELD]

        self.important_records = None
        gc.collect()
Example #13
    def separate_recsys_topic_model_records(self):

        print('%s: separate_recsys_topic_model_records' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        num_records = len(self.records)
        topic_model_records = self.records[:num_records // 2]

        if not Constants.USE_CONTEXT:
            recsys_records = self.records[num_records // 2:]

            file_name = \
                Constants.generate_file_name(
                    'recsys_contextual_records', 'json', Constants.CACHE_FOLDER,
                    None, None, False, True)

            print('Records without context file: %s' % file_name)

            for record in recsys_records:
                record[Constants.CONTEXT_TOPICS_FIELD] = {'na': 1.0}

            ETLUtils.save_json_file(file_name, recsys_records)
            return

        topic_model_creator.train_topic_model(topic_model_records)

        if os.path.exists(Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE):
            print('Recsys topic records have already been generated')
            recsys_records = ETLUtils.load_json_file(
                Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)
        else:
            recsys_records = self.records[num_records // 2:]
            self.find_topic_distribution(recsys_records)
            ETLUtils.save_json_file(
                Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE, recsys_records)

        if os.path.exists(Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE):
            print('Recsys contextual records have already been generated')
            print(Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE)
            recsys_records = ETLUtils.load_json_file(
                Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE)
        else:
            self.update_context_topics(recsys_records)
            ETLUtils.save_json_file(
                Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE,
                recsys_records
            )

        context_transformer = ContextTransformer(recsys_records)
        context_transformer.load_data()
        context_transformer.transform_records()
        context_transformer.export_records()
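The records are split in half: the first half trains the topic model and the second half feeds the recommender. The slicing relies on integer floor division (//, which behaves the same on Python 2 and 3):

records = list(range(7))
topic_model_records = records[:len(records) // 2]  # [0, 1, 2]
recsys_records = records[len(records) // 2:]       # [3, 4, 5, 6]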
Example #14
            if review['business_id'] in business_ids:
                filtered_reviews.append(review)

        return filtered_reviews

    @staticmethod
    def sort_records(records, field, reverse=False):
        return sorted(records, key=itemgetter(field), reverse=reverse)



start = time.time()

review_etl = ReviewETL()
my_business_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_business.json"
my_reviews_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review.json"
my_business_ids = BusinessETL.get_business_ids(my_business_file, 'Hotels')
my_reviews = ETLUtils.load_json_file(my_reviews_file)
# print(len(ReviewETL.filter_reviews_by_business(my_reviews, my_business_ids, 'text')))
my_restaurant_reviews = ReviewETL.filter_reviews_by_business_slow(my_reviews, my_business_ids)
my_restaurants_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review_hotels.json"
ETLUtils.save_json_file(my_restaurants_file, my_restaurant_reviews)
# my_sorted_reviews = ReviewETL.sort_records(my_reviews, 'business_id')
# print(len(my_sorted_reviews))


# main()
end = time.time()
total_time = end - start
print("Total time = %f seconds" % total_time)
Example #15
    return reviews

def extract_fields(reviews):
    """
    Modifies the given list of reviews by extracting the values contained
    in the ratings field into top-level fields. For instance, a review of
    the form
    {'user_id': 'U1', 'offering_id': 'I1',
    'ratings': {'cleanliness': 4.0, 'location': 5.0}}
    would become:

    {'user_id': 'U1', 'offering_id': 'I1',
    'ratings': {'cleanliness': 4.0, 'location': 5.0},
    'cleanliness_rating': 4.0, 'location_rating': 5.0}

    :param reviews: a list of reviews.
    """

    for review in reviews:
        review['offering_id'] = review['business_id']
        review['overall_rating'] = review['stars']


my_reviews = pre_process_reviews()
filtered_reviews_file = '/Users/fpena/UCC/Thesis/datasets/yelp_phoenix_academic_dataset/filtered_reviews.json'
ETLUtils.save_json_file(filtered_reviews_file, my_reviews)
# print('Num reviews', len(my_reviews))
print(my_reviews[0])
print(my_reviews[1])
# print(my_reviews[2])
# print(my_reviews[3])
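Note that the body of extract_fields only copies offering_id and overall_rating; the flattening of the nested ratings dict promised by its docstring never happens. A sketch of what the docstring describes:

def flatten_ratings(reviews):
    # Copy each entry of the nested 'ratings' dict into a top-level
    # '<key>_rating' field, e.g. ratings['cleanliness'] ->
    # review['cleanliness_rating'].
    for review in reviews:
        for key, value in review.get('ratings', {}).items():
            review[key + '_rating'] = value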
Example #16
File: main.py  Project: swarnamd/yelp
def transform_manually_labeled_reviews():

    full_records = ETLUtils.load_json_file(Constants.DATASET_FOLDER + 'yelp_training_set_review_restaurants_shuffled_tagged.json')

    records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    print('total records: %d' % len(records))

    new_records = []

    for record in records:

        sentence_index = record['sentence_index']

        if sentence_index > 0:
            continue
        record['predicted_class'] = record['sentence_type']
        new_records.append(record)

    # count = 0
    # for new_record in new_records:
    #     internal_count = 0
    #     for full_record in full_records:
    #         if full_record['text'].startswith(new_record['text']):
    #             # print(full_record['text'])
    #             internal_count += 1
    #             count += 1
    #             print('internal count: %d\treview_id: %s' % (internal_count, full_record['review_id']))
    #
    #             if internal_count > 1:
    #                 print('internal count: %d\treview_id: %s' % (internal_count, new_record['text']))

    # print('count: %d' % count)

    index = 0

    for new_record in new_records:

        while True:

            full_record = full_records[index]

            if full_record['text'].startswith(new_record['text']):
                new_record[Constants.USER_ID_FIELD] = full_record['user_id']
                new_record[Constants.ITEM_ID_FIELD] = full_record['business_id']
                new_record[Constants.REVIEW_ID_FIELD] = full_record['review_id']
                new_record[Constants.RATING_FIELD] = full_record['stars']
                break
            index += 1
        index += 1

    print('index: %d' % index)

    for new_record in new_records:

        for full_record in full_records:
            if new_record['review_id'] == full_record['review_id']:
                print('%s ====' % new_record['text'])
                print(full_record['text'])
                print('******************\n******************\n******************\n******************')
                break

    # reviews_preprocessor = ReviewsPreprocessor()
    # new_records = reviews_preprocessor.lemmatize_sentences(new_records)
    # reviews_preprocessor.records = new_records
    # reviews_preprocessor.build_bag_of_words()
    # reviews_preprocessor.drop_unnecessary_fields()

    new_classified_records_file = Constants.DATASET_FOLDER + 'classified_' + \
        Constants.ITEM_TYPE + '_reviews_first_sentences.json'

    print(new_records[0])

    ETLUtils.save_json_file(new_classified_records_file, new_records)
Example #17
    def export_records(self):
        print('%s: export records' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        ETLUtils.save_json_file(
            Constants.FULL_PROCESSED_RECORDS_FILE, self.records)
        self.drop_unnecessary_fields()
        ETLUtils.save_json_file(Constants.PROCESSED_RECORDS_FILE, self.records)