예제 #1
0
def test_formatting_yelp_nyc_gives_correct_genuine_label():
    """Formatting NYC metadata built with ``label=1`` leaves ``label`` falsy.

    NOTE(review): presumably ``label=1`` marks a genuine review in the NYC
    metadata and ``Review.label`` flags fake reviews -- confirm against the
    formatter.
    """
    review = review_pb2.Review()
    format_yelp_nyc_review(review, _get_entry_text(),
                           _get_nyc_metadata(label=1))
    # Truthiness check instead of ``== False`` (PEP 8: do not compare
    # booleans to True/False with ``==``).
    assert not review.label
예제 #2
0
def _dataset_path(data_directory, relative_path):
    """Return the normalized location of *relative_path* under *data_directory*.

    *relative_path* is expected to start with '/'; it is appended to
    *data_directory* by plain string concatenation and then normalized with
    ``os.path.normpath``.
    """
    return os.path.normpath(data_directory + relative_path)


def _load_paired_review_set(content_path, metadata_path, format_review,
                            *format_args):
    """Build a ReviewSet from a review-content file and a metadata file.

    The content file drives the iteration; one metadata line is read for
    every content line.  ``format_review`` is called as
    ``format_review(new_review, content_line, metadata_line, *format_args)``.
    """
    review_set = review_set_pb2.ReviewSet()
    with open(content_path, 'r') as content_file, \
            open(metadata_path, 'r') as metadata_file:
        for content_line in content_file:
            # readline() returns '' once the metadata file is exhausted, so a
            # shorter metadata file is passed through to the formatter rather
            # than silently truncating the content file.
            format_review(review_set.reviews.add(), content_line,
                          metadata_file.readline(), *format_args)
    return review_set


def _parse_amazon_review_line(line):
    """Decode one line of the Amazon review file with ``json.loads``.

    Only a trailing comma (and surrounding whitespace) at the end of the line
    is removed before decoding, so a '},' sequence occurring inside the
    record itself is left untouched.
    """
    return json.loads(line.strip().rstrip(','))


def _write_serialized_review_set(review_set, path):
    """Write *review_set* to *path* using ``SerializeToString`` (binary)."""
    with open(path, 'wb') as output_file:
        output_file.write(review_set.SerializeToString())


def _write_text_review_set(review_set, path):
    """Write ``str(review_set)`` to *path* (text mode)."""
    with open(path, 'w') as output_file:
        output_file.write(str(review_set))


def protify_data(data_directory):
    """Convert the raw review datasets under *data_directory* to ReviewSets.

    Reads the Yelp NYC, Yelp Zip, Yelp Chicago (hotels and restaurants) and
    Amazon books inputs and writes one output file per dataset into
    ``<data_directory>/normalizedData``.

    Args:
        data_directory: Path prefix that contains the ``YelpData`` and
            ``amazonBooks`` input directories and the ``normalizedData``
            output directory.

    Raises:
        OSError: If an input file cannot be opened or an output file cannot
            be written.
        json.JSONDecodeError: If a line of the Amazon review file is not
            valid JSON after its trailing comma is removed.

    NOTE(review): the NYC and Zip sets are written as serialized bytes while
    the Chicago and Amazon sets are written as ``str(review_set)`` text.
    That difference is preserved here -- confirm it is intentional.
    """
    # NYC
    nyc_reviews = _load_paired_review_set(
        _dataset_path(data_directory, '/YelpData/YelpNYC/reviewContent'),
        _dataset_path(data_directory, '/YelpData/YelpNYC/metadata'),
        format_yelp_nyc_review)
    _write_serialized_review_set(
        nyc_reviews, _dataset_path(data_directory, '/normalizedData/yelpNYC'))

    # Zip (formatted with the NYC formatter, exactly like the NYC set)
    zip_reviews = _load_paired_review_set(
        _dataset_path(data_directory, '/YelpData/YelpZip/reviewContent'),
        _dataset_path(data_directory, '/YelpData/YelpZip/metadata'),
        format_yelp_nyc_review)
    _write_serialized_review_set(
        zip_reviews, _dataset_path(data_directory, '/normalizedData/yelpZip'))

    # Chicago
    chicago_datasets = (
        ('/YelpData/YelpCHI/output_review_yelpHotelData_NRYRcleaned.txt',
         '/YelpData/YelpCHI/output_meta_yelpHotelData_NRYRcleaned.txt',
         '/normalizedData/yelpCHI-hotels'),
        ('/YelpData/YelpCHI/output_review_yelpResData_NRYRcleaned.txt',
         '/YelpData/YelpCHI/output_meta_yelpResData_NRYRcleaned.txt',
         '/normalizedData/yelpCHI-restaurants'),
    )
    for content_rel, metadata_rel, output_rel in chicago_datasets:
        # Each Chicago dataset gets its own pair of ID map services.
        userid_map_service = IDMapService(id_func)
        productid_map_service = IDMapService(id_func)
        chicago_reviews = _load_paired_review_set(
            _dataset_path(data_directory, content_rel),
            _dataset_path(data_directory, metadata_rel),
            format_yelp_chi_review,
            userid_map_service,
            productid_map_service)
        _write_text_review_set(
            chicago_reviews, _dataset_path(data_directory, output_rel))

    # Amazon
    amazon_reviews = review_set_pb2.ReviewSet()
    userid_map_service = IDMapService(id_func)
    productid_map_service = IDMapService(id_func)
    with open(_dataset_path(data_directory, '/amazonBooks/reviewContent'),
              'r') as amazon_file:
        for line in amazon_file:
            format_amazonBooks_review(amazon_reviews.reviews.add(),
                                      _parse_amazon_review_line(line),
                                      userid_map_service,
                                      productid_map_service)
    _write_text_review_set(
        amazon_reviews,
        _dataset_path(data_directory, '/normalizedData/amazonBooks'))
예제 #3
0
def test_formatting_yelp_nyc_gives_correct_review_content():
    """An entry built with content "Blarg" yields that ``review_content``."""
    formatted = review_pb2.Review()
    format_yelp_nyc_review(formatted, _get_entry_text(content="Blarg"),
                           _get_nyc_metadata())
    assert formatted.review_content == "Blarg"
예제 #4
0
def test_formatting_yelp_nyc_gives_correct_rating():
    """Metadata built with ``rating=1.0`` yields a review rating of 1."""
    formatted = review_pb2.Review()
    metadata = _get_nyc_metadata(rating=1.0)
    format_yelp_nyc_review(formatted, _get_entry_text(), metadata) \
        if False else None
    assert formatted.rating == 1
예제 #5
0
def test_formatting_yelp_nyc_gives_correct_date():
    """An entry built with date "2001-02-03" yields that same ``date``."""
    expected_date = "2001-02-03"
    formatted = review_pb2.Review()
    format_yelp_nyc_review(formatted, _get_entry_text(date="2001-02-03"),
                           _get_nyc_metadata())
    assert formatted.date == expected_date
예제 #6
0
def test_formatting_yelp_nyc_gives_correct_product_id():
    """An entry built with ``productid=3`` yields ``product_id == 3``."""
    formatted = review_pb2.Review()
    format_yelp_nyc_review(formatted, _get_entry_text(productid=3),
                           _get_nyc_metadata())
    assert formatted.product_id == 3
예제 #7
0
def test_formatting_yelp_nyc_gives_correct_user_id():
    """An entry built with ``userid=2`` yields ``user_id == 2``."""
    formatted = review_pb2.Review()
    format_yelp_nyc_review(formatted, _get_entry_text(userid=2),
                           _get_nyc_metadata())
    assert formatted.user_id == 2