def update_labeled_reviews_records():
    """Relabel the classified records with the labels from compare_records().

    Loads the records stored in Constants.CLASSIFIED_RECORDS_FILE, filters
    out (via ETLUtils.filter_out_records) those whose review id is not a key
    of the map returned by compare_records(), and overwrites the
    Constants.SPECIFIC field of each remaining record with 'yes' for an 's'
    label or 'no' for a 'g' label.

    The record counts, the label map and the ids being dropped are printed
    along the way. The updated records are neither returned nor written back
    by this function.

    Raises:
        KeyError: if a label in the map is neither 's' nor 'g'.
    """
    label_by_review_id = compare_records()
    labeled_ids = set(label_by_review_id.keys())

    records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    record_ids = {entry[Constants.REVIEW_ID_FIELD] for entry in records}
    unlabeled_ids = record_ids - labeled_ids

    print('number of records before: %d' % len(records))
    print(label_by_review_id)
    print(unlabeled_ids)

    # Drop every record for which compare_records() produced no label
    records = ETLUtils.filter_out_records(
        records, Constants.REVIEW_ID_FIELD, unlabeled_ids)

    # Translate the short labels into the values stored in the records
    label_to_specific = {'s': 'yes', 'g': 'no'}
    for entry in records:
        label = label_by_review_id[entry[Constants.REVIEW_ID_FIELD]]
        entry[Constants.SPECIFIC] = label_to_specific[label]

    print('number of records after: %d' % len(records))
def update_labeled_reviews_records():
    """Apply the labels returned by compare_records() to the classified records.

    The records are read from Constants.CLASSIFIED_RECORDS_FILE. Records
    whose review id is missing from the label map are filtered out with
    ETLUtils.filter_out_records; the rest get their Constants.SPECIFIC field
    set to 'yes' when the label is 's' and to 'no' when it is 'g'.

    Progress information (record counts, the label map, the ids that are
    dropped) is printed. Nothing is returned and nothing is saved here.

    Raises:
        KeyError: if a label in the map is neither 's' nor 'g'.
    """
    labels = compare_records()
    ids_with_label = set(labels.keys())

    loaded_records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    loaded_ids = {
        item[Constants.REVIEW_ID_FIELD] for item in loaded_records
    }
    ids_without_label = loaded_ids - ids_with_label

    print('number of records before: %d' % len(loaded_records))
    print(labels)
    print(ids_without_label)

    # Only records that received a label are kept
    kept_records = ETLUtils.filter_out_records(
        loaded_records, Constants.REVIEW_ID_FIELD, ids_without_label)

    # Write the long form of each label into its record
    long_form = {'s': 'yes', 'g': 'no'}
    for item in kept_records:
        short_label = labels[item[Constants.REVIEW_ID_FIELD]]
        item[Constants.SPECIFIC] = long_form[short_label]

    print('number of records after: %d' % len(kept_records))
def test_filter_out_records(self):
    """Check that filter_out_records drops records with offering_id 1, 3 or 5.

    Only the records of users U1 and U2 with offering ids 2 and 4 are
    expected to remain from reviews_matrix_5_short, in that order.
    """
    # Same four records as a literal list: U1/2, U1/4, U2/2, U2/4
    expected_result = [
        {'user_id': user, 'offering_id': offering, 'overall_rating': 7.0}
        for user in ('U1', 'U2')
        for offering in (2, 4)
    ]

    actual_result = ETLUtils.filter_out_records(
        reviews_matrix_5_short, 'offering_id', [1, 3, 5])

    self.assertEqual(expected_result, actual_result)
def foo():
    """Print a small demonstration of ETLUtils.filter_out_records.

    Builds ten records of the form {'column1': i} for i in 0..9, prints them,
    prints the set of odd values 1..9, and prints what
    ETLUtils.filter_out_records returns for those records, the 'column1'
    field and that set.
    """
    sample_records = [{'column1': number} for number in range(10)]
    print(sample_records)

    odd_values = set(range(1, 10, 2))
    print(odd_values)

    print(ETLUtils.filter_out_records(sample_records, 'column1', odd_values))
def foo():
    """Show ETLUtils.filter_out_records at work on ten one-field records.

    The records {'column1': 0} .. {'column1': 9}, the set of values to remove
    (the odd numbers from 1 to 9) and the filtered result are each printed.
    """
    records = [dict(column1=index) for index in range(10)]
    print(records)

    values_to_drop = set(range(1, 10, 2))
    print(values_to_drop)

    remaining = ETLUtils.filter_out_records(
        records, 'column1', values_to_drop)
    print(remaining)
def remove_reviews_from_classifier_training_set(self):
    """
    Filters out of self.records the reviews that belong to the training set
    of the reviews classifier.

    The training set is read from Constants.CLASSIFIED_RECORDS_FILE and the
    reviews are matched on Constants.REVIEW_ID_FIELD.
    """
    training_records = ETLUtils.load_json_file(
        Constants.CLASSIFIED_RECORDS_FILE)

    # Collect the ids of every review in the training set
    training_ids = set()
    for training_record in training_records:
        training_ids.add(training_record[Constants.REVIEW_ID_FIELD])

    self.records = ETLUtils.filter_out_records(
        self.records, Constants.REVIEW_ID_FIELD, training_ids)
def test_filter_out_records(self):
    """Verify filter_out_records removes the records with the given values.

    Filtering reviews_matrix_5_short on 'offering_id' with the values
    1, 3 and 5 must leave the offering 2 and 4 records of users U1 and U2.
    """
    excluded_offerings = [1, 3, 5]

    # Expected survivors in order: U1/2, U1/4, U2/2, U2/4
    expected_result = []
    for user in ('U1', 'U2'):
        for offering in (2, 4):
            expected_result.append({
                'user_id': user,
                'offering_id': offering,
                'overall_rating': 7.0,
            })

    actual_result = ETLUtils.filter_out_records(
        reviews_matrix_5_short, 'offering_id', excluded_offerings)

    self.assertEqual(expected_result, actual_result)
def clean_reviews(self):
    """Remove dirty records from self.records and print how many were dropped.

    When Constants.ITEM_TYPE is 'fourcity_hotel', self.records is replaced
    by the result of ETLUtils.filter_out_records for the
    Constants.USER_ID_FIELD field and the values '' and 'CATID_'. For any
    other item type the records are left untouched.

    A timestamped start line and a summary with the number and percentage
    of removed records are printed. When there are no records to begin with,
    the percentage is reported as 0 instead of raising ZeroDivisionError.
    """
    print('%s: clean reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    initial_length = len(self.records)

    if Constants.ITEM_TYPE == 'fourcity_hotel':
        self.records = ETLUtils.filter_out_records(
            self.records, Constants.USER_ID_FIELD, ['', 'CATID_'])

    final_length = len(self.records)
    removed_records_count = initial_length - final_length

    # With no input records nothing can have been removed; skip the division
    # so an empty data set does not crash the cleaning step.
    if initial_length:
        percentage = removed_records_count / float(initial_length) * 100
    else:
        percentage = 0.0

    msg = "A total of %d (%f%%) records were removed because they " \
          "were dirty" % (removed_records_count, percentage)
    print(msg)