def update_labeled_reviews_records():
    """
    Keep only the classified records whose review has an agreed label and
    overwrite their Constants.SPECIFIC field with that label.

    The agreed labels come from compare_records(), which is used here as a
    mapping from review ID to a label code ('s' or 'g'). The codes are
    translated to 'yes' and 'no' respectively before being stored.

    The record counts before and after filtering, the label mapping and the
    IDs without an agreed label are printed. Nothing is returned.
    """
    label_by_review_id = compare_records()
    labeled_ids = set(label_by_review_id.keys())

    records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    loaded_ids = {item[Constants.REVIEW_ID_FIELD] for item in records}

    # IDs present in the classified file but lacking an agreed label.
    unlabeled_ids = loaded_ids - labeled_ids

    print('number of records before: %d' % len(records))

    print(label_by_review_id)
    print(unlabeled_ids)

    # Drop every record whose review never received an agreed label.
    records = ETLUtils.filter_out_records(
        records, Constants.REVIEW_ID_FIELD, unlabeled_ids)

    # Translate the label code of each surviving record and store it.
    code_to_value = {'s': 'yes', 'g': 'no'}
    for item in records:
        label_code = label_by_review_id[item[Constants.REVIEW_ID_FIELD]]
        item[Constants.SPECIFIC] = code_to_value[label_code]

    print('number of records after: %d' % len(records))
def update_labeled_reviews_records():
    """
    Relabel the classified records using the labels agreed on for each review.

    compare_records() supplies the agreed labels, keyed by review ID. Records
    loaded from Constants.CLASSIFIED_RECORDS_FILE whose review ID is missing
    from that mapping are filtered out; the remaining ones get their
    Constants.SPECIFIC field set to 'yes' (label 's') or 'no' (label 'g').

    Progress information is printed; the function returns nothing.
    """
    agreed_labels = compare_records()
    ids_with_agreement = set(agreed_labels.keys())
    all_records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    ids_in_file = {
        entry[Constants.REVIEW_ID_FIELD] for entry in all_records
    }
    ids_without_agreement = ids_in_file.difference(ids_with_agreement)

    print('number of records before: %d' % len(all_records))

    print(agreed_labels)
    print(ids_without_agreement)
    specific_value_for = {'s': 'yes', 'g': 'no'}

    # Records without an agreed label cannot be relabeled, so discard them.
    kept_records = ETLUtils.filter_out_records(
        all_records, Constants.REVIEW_ID_FIELD, ids_without_agreement)

    # Write the translated label into each remaining record.
    for entry in kept_records:
        agreed_label = agreed_labels[entry[Constants.REVIEW_ID_FIELD]]
        entry[Constants.SPECIFIC] = specific_value_for[agreed_label]

    print('number of records after: %d' % len(kept_records))
示例#3
0
    def test_filter_out_records(self):
        """
        filter_out_records must drop the records whose 'offering_id' is in
        the given list of values and keep the rest in their original order.
        """
        excluded_field = 'offering_id'
        excluded_values = [1, 3, 5]

        # Offerings 2 and 4 are expected to survive for both users.
        expected = [
            {'user_id': user, 'offering_id': offering, 'overall_rating': 7.0}
            for user in ('U1', 'U2')
            for offering in (2, 4)
        ]

        actual = ETLUtils.filter_out_records(
            reviews_matrix_5_short, excluded_field, excluded_values)

        self.assertEqual(expected, actual)
def foo():
    """
    Small demonstration of ETLUtils.filter_out_records.

    Builds ten records with 'column1' values 0..9, asks for the records with
    odd values to be filtered out, and prints the input, the values to remove
    and the result.
    """
    records = [{'column1': value} for value in range(10)]
    print(records)

    odd_values = set(range(1, 10, 2))
    print(odd_values)

    remaining = ETLUtils.filter_out_records(records, 'column1', odd_values)
    print(remaining)
def foo():
    """
    Exercise ETLUtils.filter_out_records on a tiny in-memory data set.

    Ten single-field records ('column1' from 0 to 9) are created and the odd
    'column1' values are passed as the values to filter out. Each step is
    printed so the effect of the call can be inspected.
    """
    sample = [dict(column1=number) for number in range(10)]
    print(sample)
    values_to_drop = set(range(1, 10, 2))
    print(values_to_drop)

    filtered = ETLUtils.filter_out_records(sample, 'column1', values_to_drop)
    print(filtered)
示例#6
0
    def remove_reviews_from_classifier_training_set(self):
        """
        Drop from self.records every record whose review ID also appears in
        the classified records file (Constants.CLASSIFIED_RECORDS_FILE), i.e.
        the records used as the training set of the reviews classifier.
        """
        training_records = ETLUtils.load_json_file(
            Constants.CLASSIFIED_RECORDS_FILE)

        # Collect the IDs once so they can be handed to the filter as a set.
        training_ids = {
            item[Constants.REVIEW_ID_FIELD] for item in training_records
        }

        self.records = ETLUtils.filter_out_records(
            self.records, Constants.REVIEW_ID_FIELD, training_ids)
示例#7
0
    def test_filter_out_records(self):
        """
        Records whose 'offering_id' is 1, 3 or 5 must be removed by
        filter_out_records; only offerings 2 and 4 remain for each user.
        """
        field_name = 'offering_id'
        unwanted = [1, 3, 5]

        rating = 7.0
        surviving_pairs = [('U1', 2), ('U1', 4), ('U2', 2), ('U2', 4)]
        expected_records = [
            {'user_id': user_id, 'offering_id': offering_id,
             'overall_rating': rating}
            for user_id, offering_id in surviving_pairs
        ]

        filtered_records = ETLUtils.filter_out_records(
            reviews_matrix_5_short, field_name, unwanted)

        self.assertEqual(expected_records, filtered_records)
示例#8
0
    def clean_reviews(self):
        """
        Remove dirty records from self.records and report how many were
        dropped.

        The cleaning shown here only applies when Constants.ITEM_TYPE is
        'fourcity_hotel': ETLUtils.filter_out_records is asked to drop the
        records whose Constants.USER_ID_FIELD value is '' or 'CATID_'. The
        number and percentage of removed records are then printed.
        """
        print('%s: clean reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        initial_length = len(self.records)

        if Constants.ITEM_TYPE == 'fourcity_hotel':
            self.records = ETLUtils.filter_out_records(
                self.records, Constants.USER_ID_FIELD, ['', 'CATID_'])
            final_length = len(self.records)
            removed_records_count = initial_length - final_length

            # With no records to begin with, nothing can have been removed;
            # report 0% instead of dividing by zero.
            if initial_length:
                percentage = \
                    removed_records_count / float(initial_length) * 100
            else:
                percentage = 0.0

            msg = "A total of %d (%f%%) records were removed because they " \
                  "were dirty" % (removed_records_count, percentage)
            print(msg)