Python ETLUtils.filter_out_records 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: etl

클래스/타입: ETLUtils

메소드/함수: filter_out_records

hotexamples.com에서의 예제들: 8

Python ETLUtils.filter_out_records - 8개의 예제를 찾았습니다. 아래는 오픈 소스 프로젝트에서 추출한 Python etl.ETLUtils.filter_out_records의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

load_json_file(30)

filter_records(23)

save_json_file(13)

save_csv_file(12)

split_train_test(7)

drop_fields(6)

load_csv_file(6)

filter_out_records(5)

select_fields(4)

split_train_test_copy(4)

add_transpose_list_column(2)

write_row_to_csv(2)

write_row_to_json(2)

count_frequency(1)

keep_fields(1)

search_sentences(1)

예제 #1

파일 보기

파일: labeled_reviews_comparator.py 프로젝트: swarnamd/yelp

def update_labeled_reviews_records():
    """Relabel the classified records with the labels from compare_records().

    Loads the records stored in ``Constants.CLASSIFIED_RECORDS_FILE``, filters
    out the ones whose review ID is not a key of the mapping returned by
    ``compare_records()``, and stores the translated label ('s' -> 'yes',
    'g' -> 'no') in the ``Constants.SPECIFIC`` field of each remaining record.
    Record counts and the intermediate collections are printed along the way.

    The relabelled list is only bound to a local name: nothing in this
    function returns it or writes it back to disk. A label other than 's' or
    'g' makes the translation lookup fail with ``KeyError``.
    """
    label_by_review_id = compare_records()
    labelled_review_ids = set(label_by_review_id.keys())
    records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    loaded_review_ids = {
        record[Constants.REVIEW_ID_FIELD] for record in records
    }
    # IDs present in the classifier file that have no agreed label.
    unlabelled_review_ids = loaded_review_ids - labelled_review_ids

    print('number of records before: %d' % len(records))

    print(label_by_review_id)
    print(unlabelled_review_ids)

    # Only records with an agreed label can be updated, so the rest are
    # discarded before relabelling.
    records = ETLUtils.filter_out_records(
        records, Constants.REVIEW_ID_FIELD, unlabelled_review_ids)

    # Translate the one-letter label into the value kept in the record.
    translation = {'s': 'yes', 'g': 'no'}
    for record in records:
        agreed_label = label_by_review_id[record[Constants.REVIEW_ID_FIELD]]
        record[Constants.SPECIFIC] = translation[agreed_label]

    print('number of records after: %d' % len(records))

예제 #2

파일 보기

파일: labeled_reviews_comparator.py 프로젝트: melqkiades/yelp

def update_labeled_reviews_records():
    """Overwrite the 'specific' label of the classified records.

    The mapping returned by ``compare_records()`` supplies one label per
    review ID. Records loaded from ``Constants.CLASSIFIED_RECORDS_FILE`` whose
    review ID has no entry in that mapping are filtered out; every other
    record gets ``Constants.SPECIFIC`` set to 'yes' for label 's' and 'no'
    for label 'g'. The record counts before and after filtering are printed,
    together with the mapping and the set of discarded IDs.

    The updated records are not returned or saved by this function.
    """
    reviews_label_map = compare_records()
    ids_with_label = set(reviews_label_map.keys())

    loaded_records = ETLUtils.load_json_file(
        Constants.CLASSIFIED_RECORDS_FILE)
    loaded_ids = set(
        entry[Constants.REVIEW_ID_FIELD] for entry in loaded_records)
    ids_without_label = loaded_ids.difference(ids_with_label)

    print('number of records before: %d' % len(loaded_records))

    print(reviews_label_map)
    print(ids_without_label)

    # Drop the records for which no label was agreed on.
    kept_records = ETLUtils.filter_out_records(
        loaded_records, Constants.REVIEW_ID_FIELD, ids_without_label)

    # Apply the agreed label to each record that is left.
    yes_no_by_label = {'s': 'yes', 'g': 'no'}
    for entry in kept_records:
        label = reviews_label_map[entry[Constants.REVIEW_ID_FIELD]]
        entry[Constants.SPECIFIC] = yes_no_by_label[label]

    print('number of records after: %d' % len(kept_records))

예제 #3

파일 보기

    def test_filter_out_records(self):
        """Records whose offering_id is 1, 3 or 5 must be filtered out."""
        excluded_offering_ids = [1, 3, 5]

        # The surviving records: offerings 2 and 4 for each user, grouped by
        # user, all with the same rating.
        expected_result = [
            {'user_id': user, 'offering_id': offering, 'overall_rating': 7.0}
            for user in ('U1', 'U2')
            for offering in (2, 4)
        ]

        actual_result = ETLUtils.filter_out_records(
            reviews_matrix_5_short, 'offering_id', excluded_offering_ids)

        self.assertEqual(expected_result, actual_result)

예제 #4

파일 보기

파일: labeled_reviews_comparator.py 프로젝트: swarnamd/yelp

def foo():
    """Demonstrate ETLUtils.filter_out_records on ten one-field records.

    Builds records with 'column1' set to 0..9, asks filter_out_records to
    remove the ones whose 'column1' is odd, and prints the input records, the
    values to remove and the filtered result.
    """
    sample_records = [{'column1': number} for number in range(10)]
    print(sample_records)

    odd_numbers = {number for number in range(1, 10) if number % 2 == 1}
    print(odd_numbers)

    print(ETLUtils.filter_out_records(sample_records, 'column1', odd_numbers))

예제 #5

파일 보기

파일: labeled_reviews_comparator.py 프로젝트: melqkiades/yelp

def foo():
    """Small manual check of ETLUtils.filter_out_records.

    Prints ten records ({'column1': 0} .. {'column1': 9}), the set of odd
    values 1, 3, 5, 7, 9, and finally what filter_out_records returns when
    asked to remove the records whose 'column1' is in that set.
    """
    records = list(map(lambda value: {'column1': value}, range(10)))
    print(records)

    values_to_drop = set(range(10)[1::2])
    print(values_to_drop)

    remaining = ETLUtils.filter_out_records(
        records, 'column1', values_to_drop)
    print(remaining)

예제 #6

파일 보기

    def remove_reviews_from_classifier_training_set(self):
        """
        Filters out of self.records every record whose review ID also appears
        in the records loaded from Constants.CLASSIFIED_RECORDS_FILE, i.e. the
        training set of the reviews classifier
        """
        training_records = ETLUtils.load_json_file(
            Constants.CLASSIFIED_RECORDS_FILE)

        # Collect the IDs once so the filter receives a set of unique values.
        training_review_ids = set(
            training_record[Constants.REVIEW_ID_FIELD]
            for training_record in training_records
        )

        self.records = ETLUtils.filter_out_records(
            self.records, Constants.REVIEW_ID_FIELD, training_review_ids)

예제 #7

파일 보기

파일: test_etl_utils.py 프로젝트: antoine-tran/yelp

    def test_filter_out_records(self):
        """filter_out_records drops the records with offering_id 1, 3 or 5."""

        def review(user_id, offering_id):
            # Every expected record carries the same rating.
            return {
                'user_id': user_id,
                'offering_id': offering_id,
                'overall_rating': 7.0,
            }

        expected_result = [
            review('U1', 2),
            review('U1', 4),
            review('U2', 2),
            review('U2', 4),
        ]

        actual_result = ETLUtils.filter_out_records(
            reviews_matrix_5_short, 'offering_id', [1, 3, 5])

        self.assertEqual(expected_result, actual_result)

예제 #8

파일 보기

    def clean_reviews(self):
        """Remove dirty records from self.records and report how many went.

        A timestamped progress line is always printed. The filtering itself
        only happens when ``Constants.ITEM_TYPE`` is ``'fourcity_hotel'``: the
        records whose ``Constants.USER_ID_FIELD`` is ``''`` or ``'CATID_'`` are
        filtered out of ``self.records`` and a summary with the number and
        percentage of removed records is printed.
        """
        print('%s: clean reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        initial_length = len(self.records)

        if Constants.ITEM_TYPE == 'fourcity_hotel':
            self.records = ETLUtils.filter_out_records(
                self.records, Constants.USER_ID_FIELD, ['', 'CATID_'])
            final_length = len(self.records)
            removed_records_count = initial_length - final_length

            # With no records to begin with there is nothing to divide by;
            # report 0% instead of raising ZeroDivisionError.
            if initial_length:
                # float() keeps this a true division under Python 2 as well.
                percentage = \
                    removed_records_count / float(initial_length) * 100
            else:
                percentage = 0.0

            msg = "A total of %d (%f%%) records were removed because they " \
                  "were dirty" % (removed_records_count, percentage)
            print(msg)