예제 #1
0
    def test_integration(self):
        """Run the whole job and compare both output sets with fixtures.

        A ``MotelsHomeRecommendation`` is built over the three files under
        ``input/`` with ``self.tmpdir`` as its fourth argument. After
        ``process_data()`` the ``erroneous`` and ``aggregated`` directories
        under ``self.tmpdir`` are read back as text and compared, ignoring
        line order, with the files under ``expected_output``.
        """
        # Never truncate the diff unittest prints on a mismatch.
        self.maxDiff = None

        job = MotelsHomeRecommendation(
            'input/bids.gz.parquet',
            'input/exchange_rate.txt',
            'input/motels.gz.parquet',
            self.tmpdir,
        )
        job.process_data()

        # NOTE(review): the erroneous output is checked against the fixture
        # named 'expected_sql' rather than one named 'erroneous' -- confirm
        # this is the intended file.
        produced_errors = self.sc.textFile(
            os.path.join(self.tmpdir, 'erroneous')).collect()
        wanted_errors = self.sc.textFile(
            os.path.join('expected_output', 'expected_sql')).collect()
        self.assertCountEqual(produced_errors, wanted_errors)

        produced_aggregate = self.sc.textFile(
            os.path.join(self.tmpdir, 'aggregated')).collect()
        wanted_aggregate = self.sc.textFile(
            os.path.join('expected_output', 'aggregated')).collect()
        self.assertCountEqual(produced_aggregate, wanted_aggregate)
예제 #2
0
 def test_expand(self):
     """``expand`` turns one raw bid plus a rate into three rendered items.

     The raw bid is passed as the list of comma-separated fields together
     with the exchange rate ``'0.8'``; the string form of each returned
     item must match the US, MX and CA lines below, in that order.
     """
     raw_line = '0000002,11-05-08-2016,0.92,1.68,0.81,0.68,1.59,,1.63,1.77,2.06,0.66,1.53,,0.32,0.88,0.83,1.01'
     fields = raw_line.split(',')
     rate = '0.8'

     items = MotelsHomeRecommendation.expand((fields, rate))
     rendered = list(map(str, items))

     self.assertEqual(rendered, [
         '0000002,2016-08-05 11:00,US,0.544',
         '0000002,2016-08-05 11:00,MX,1.272',
         '0000002,2016-08-05 11:00,CA,1.304',
     ])
예제 #3
0
 def test_get_enriched(self):
     """``get_enriched`` should emit one line carrying the motel name.

     Two bids for motel ``0000002`` (US and CA) and a single motel record
     go in; only the US line, with the motel name inserted after the id,
     is expected back.
     """
     bid_items = [
         BidItem('0000002', '2016-05-06 11:00', 'US', '1.662'),
         BidItem('0000002', '2016-05-06 11:00', 'CA', '0.715'),
     ]
     motel_rows = [['0000002', 'Novelo Granja']]
     wanted_lines = ['0000002,Novelo Granja,2016-05-06 11:00,US,1.662']

     bids = self.sc.parallelize(bid_items)
     motels = self.sc.parallelize(motel_rows)
     expected = self.sc.parallelize(wanted_lines)

     # NOTE(review): the test case itself is handed over as the receiver
     # of the unbound method; kept as in the original.
     enriched = MotelsHomeRecommendation.get_enriched(self, bids, motels)
     self.assertEqual(enriched.collect(), expected.collect())
예제 #4
0
    def test_should_collect_errournes_records(self):
        """Error rows must be counted per (date, error) pair.

        Four of the five raw rows carry an ``ERROR_*`` marker; the expected
        lines list each date/error combination with its number of
        occurrences. The comparison ignores line order.
        """
        raw_rows = [
            ["1", "06-05-02-2016", "ERROR_1"],
            ["2", "15-04-08-2016", "0.89"],
            ["3", "07-05-02-2016", "ERROR_2"],
            ["4", "06-05-02-2016", "ERROR_1"],
            ["5", "06-05-02-2016", "ERROR_2"],
        ]
        wanted_lines = [
            "06-05-02-2016,ERROR_1,2",
            "06-05-02-2016,ERROR_2,1",
            "07-05-02-2016,ERROR_2,1",
        ]
        rawBids = self.sc.parallelize(raw_rows)
        expected = self.sc.parallelize(wanted_lines)

        # NOTE(review): the test case itself is handed over as the receiver
        # of the unbound method; kept as in the original.
        found = MotelsHomeRecommendation.get_erroneous_records(self, rawBids)
        self.assertCountEqual(expected.collect(), found.collect())
예제 #5
0
 def test_get_bids(self):
     """``get_bids`` should yield the US and CA lines for the valid bid.

     Input is one complete raw bid and one ``ERROR_NO_BIDS_FOR_HOTEL`` row,
     plus an exchange rate for each of the two dates. The string form of
     the collected result must equal the two expected lines, in order.
     """
     valid_bid = [
         "0000002", "11-06-05-2016", "0.89", "0.92", "1.32", "2.07", "",
         "1.35", "0.89", "0.92", "1.32", "2.07", "", "1.35", "0.89", "0.92",
         "1.32", "2.07",
     ]
     error_bid = ["0000001", "10-06-11-2015", "ERROR_NO_BIDS_FOR_HOTEL"]
     rate_rows = [
         ['11-06-05-2016', '0.803'],
         ['10-06-11-2015', '0.987'],
     ]
     wanted_lines = [
         '0000002,2016-05-06 11:00,US,1.662',
         '0000002,2016-05-06 11:00,CA,0.715',
     ]

     rawBids = self.sc.parallelize([valid_bid, error_bid])
     exchangedRates = self.sc.parallelize(rate_rows)
     expected = self.sc.parallelize(wanted_lines)

     bids = MotelsHomeRecommendation.get_bids(self.obj, rawBids,
                                              exchangedRates)
     rendered = list(map(str, bids.collect()))
     self.assertEqual(rendered, expected.collect())
예제 #6
0
class TestMR(unittest.TestCase):
    """Unit tests for the DataFrame-based ``MotelsHomeRecommendation`` steps.

    Each test either builds small in-memory DataFrames or reads one of the
    sample files named by the ``INPUT_*`` constants, runs a single step of
    the job and compares the collected rows with hand-written expectations.
    """

    INPUT_BIDS_SAMPLE = "bids_sample.txt"
    INPUT_EXCHANGED_RATES_SAMPLE = "ex_sample.txt"
    INPUT_MOTELS_SAMPLE = "motels_sample.txt"

    # Column lists reused by several input/expected frames below.
    _BID_COLUMNS = ['MotelID', 'loSa', 'BidDate', 'price']
    _MOTEL_COLUMNS = ['MotelID', 'MotelName']
    # An error row fills only its first three columns; the remaining
    # fifteen are None so that it has as many fields as a full bid row.
    _ERROR_ROW_PADDING = (None,) * 15

    def setUp(self):
        """Create the Spark handles and a job instance with blank paths."""
        self.sc = SparkContext.getOrCreate(SparkConf())
        self.spark = SparkSession.builder.getOrCreate()
        self.obj = MotelsHomeRecommendation('', '', '', '')
        # Never truncate the diff unittest prints on a mismatch.
        self.maxDiff = None

    @classmethod
    def _error_row(cls, motel_id, bid_date, error):
        """Return an 18-field raw-bid tuple that carries only an error."""
        return (motel_id, bid_date, error) + cls._ERROR_ROW_PADDING

    def test_get_exchanged_rates(self):
        """The exchange-rate sample file is read into the expected rows."""
        result = self.obj.get_exchange_rates(
            self.spark, self.INPUT_EXCHANGED_RATES_SAMPLE)
        expected = self.spark.createDataFrame(
            [('11-06-05-2016', 'Euro', 'EUR', '0.803'),
             ('11-05-08-2016', 'Euro', 'EUR', '0.873'),
             ('10-06-11-2015', 'Euro', 'EUR', '0.987')],
            self.obj.EXCHANGE_RATES_HEADER)
        self.assertEqual(expected.collect(), result.collect())

    def test_get_bids(self):
        """A valid bid yields the US and CA rows; the error row yields none."""
        raw_bids = self.spark.createDataFrame(
            [("0000002", "20-20-06-2016", "0.89", "0.92", "1.32", "2.07", "",
              "1.35", "0.89", "0.87", "1.22", "1.06", "0.93", "0.88", "1.36",
              "1.48", "1.14", "0.99"),
             self._error_row("0000001", "10-06-11-2015",
                             "ERROR_NO_BIDS_FOR_HOTEL")],
            self.obj.BIDS_HEADER)
        exchange_rates = self.spark.createDataFrame(
            [('20-20-06-2016', 'Euro', 'EUR', '0.803'),
             ('10-06-11-2015', 'Euro', 'EUR', '0.987')],
            self.obj.EXCHANGE_RATES_HEADER)
        expected = self.spark.createDataFrame(
            [('0000002', 'US', '2016-06-20 20:00', '1.662'),
             ('0000002', 'CA', '2016-06-20 20:00', '0.715')],
            self._BID_COLUMNS)
        result = self.obj.get_bids(raw_bids, exchange_rates)
        self.assertEqual(expected.collect(), result.collect())

    def test_get_bids_same(self):
        """A bid expected to produce equal US, MX and CA prices."""
        raw_bids = self.spark.createDataFrame(
            [("0000002", "20-20-06-2016", "0.89", "0.92", "1.32", "0.89",
              "0.89", "1.35", "0.89", "0.87", "1.22", "1.06", "0.93", "0.88",
              "1.36", "1.48", "1.14", "0.99")], self.obj.BIDS_HEADER)
        exchange_rates = self.spark.createDataFrame(
            [('20-20-06-2016', 'Euro', 'EUR', '0.803')],
            self.obj.EXCHANGE_RATES_HEADER)
        expected = self.spark.createDataFrame(
            [('0000002', 'US', '2016-06-20 20:00', '0.715'),
             ('0000002', 'MX', '2016-06-20 20:00', '0.715'),
             ('0000002', 'CA', '2016-06-20 20:00', '0.715')],
            self._BID_COLUMNS)
        result = self.obj.get_bids(raw_bids, exchange_rates)
        self.assertEqual(expected.collect(), result.collect())

    def test_get_motels(self):
        """The motels sample file is read into (MotelID, MotelName) rows."""
        result = self.obj.get_motels(self.spark, self.INPUT_MOTELS_SAMPLE)
        expected = self.spark.createDataFrame(
            [('0000001', 'Grand Mengo Casino'), ('0000002', 'Novelo Granja'),
             ('0000003', 'Tiny City Motor Inn')], self._MOTEL_COLUMNS)
        self.assertEqual(expected.collect(), result.collect())

    def test_get_enriched(self):
        """Of the two bids for one motel only the US row is expected back."""
        bids = self.spark.createDataFrame(
            [('0000002', 'US', '2016-06-20 20:00', '1.662'),
             ('0000002', 'CA', '2016-06-20 20:00', '0.715')],
            self._BID_COLUMNS)
        motels = self.spark.createDataFrame([('0000002', 'Novelo Granja')],
                                            self._MOTEL_COLUMNS)
        expected = self.spark.createDataFrame(
            [('0000002', 'Novelo Granja', '2016-06-20 20:00', 'US', '1.662')],
            ['MotelID', 'MotelName', 'BidDate', 'loSa', 'price'])
        result = self.obj.get_enriched(bids, motels)
        self.assertEqual(result.collect(), expected.collect())

    def test_should_read_raw_bids(self):
        """The bids sample file is read into one full row and one error row."""
        expected = self.spark.createDataFrame(
            [("0000002", "15-04-08-2016", "0.89", "0.92", "1.32", "2.07",
              "0.99", "1.35", "0.89", "0.92", "1.32", "2.07", "0.99", "1.35",
              "2.07", "0.99", "1.35", "0.99"),
             self._error_row("0000001", "06-05-02-2016",
                             "ERROR_NO_BIDS_FOR_HOTEL")],
            self.obj.BIDS_HEADER)
        raw_bids = self.obj.get_raw_bids(self.spark, self.INPUT_BIDS_SAMPLE)
        self.assertEqual(expected.collect(), raw_bids.collect())

    def test_should_collect_errournes_records(self):
        """Error rows are counted per (BidDate, error) pair."""
        raw_bids = self.spark.createDataFrame([
            self._error_row("0000001", "06-05-02-2016", "ERROR_1"),
            ("0000002", "15-04-08-2016", "0.89", "0.92", "1.32", "2.07", "",
             "1.35", "0.89", "0.87", "1.22", "1.06", "0.93", "0.88", "1.36",
             "1.48", "1.14", "0.99"),
            self._error_row("0000001", "07-05-02-2016", "ERROR_2"),
            self._error_row("0000001", "06-05-02-2016", "ERROR_1"),
            self._error_row("0000001", "06-05-02-2016", "ERROR_2"),
        ], self.obj.BIDS_HEADER)
        expected = self.spark.createDataFrame(
            [('06-05-02-2016', 'ERROR_2', 1), ('07-05-02-2016', 'ERROR_2', 1),
             ('06-05-02-2016', 'ERROR_1', 2)], ['BidDate', 'HU', 'count'])

        erroneous_records = self.obj.get_erroneous_records(raw_bids)
        # The expected rows are per-group counts; Spark does not promise any
        # row order for an aggregation, so compare the rows as a multiset
        # instead of relying on whatever order the shuffle happens to give.
        self.assertCountEqual(expected.collect(),
                              erroneous_records.collect())
예제 #7
0
 def setUp(self):
     """Prepare the Spark context, the session and a blank-path job."""
     # Never truncate the diff unittest prints on a mismatch.
     self.maxDiff = None
     spark_conf = SparkConf()
     self.sc = SparkContext.getOrCreate(spark_conf)
     self.spark = SparkSession.builder.getOrCreate()
     blank = ''
     self.obj = MotelsHomeRecommendation(blank, blank, blank, blank)
예제 #8
0
class TestMR(unittest.TestCase):
    """Unit tests for the RDD-based ``MotelsHomeRecommendation`` steps.

    Each test either parallelizes a few hand-written records or reads one
    of the sample files named by the ``INPUT_*`` constants, runs a single
    step of the job and compares the collected output with an expectation.
    """

    INPUT_BIDS_SAMPLE = "bids_sample.txt"
    INPUT_EXCHANGED_RATES_SAMPLE = "ex_sample.txt"
    INPUT_MOTELS_SAMPLE = "motels_sample.txt"

    def setUp(self):
        """Create the Spark handles and a job instance with blank paths."""
        self.sc = SparkContext.getOrCreate(SparkConf())
        self.spark = SparkSession(self.sc)
        self.obj = MotelsHomeRecommendation('', '', '', '')

    def test_get_exchanged_rates(self):
        """The exchange-rate sample is read into [date, rate] pairs."""
        result = self.obj.get_exchange_rates(
            self.sc, self.INPUT_EXCHANGED_RATES_SAMPLE)
        expected = self.sc.parallelize([['11-06-05-2016', '0.803'],
                                        ['11-05-08-2016', '0.873'],
                                        ['10-06-11-2015', '0.987']])
        self.assertCountEqual(expected.collect(), result.collect())

    def test_get_bids(self):
        """A valid bid yields the US and CA lines; the error row yields none."""
        raw_bids = self.sc.parallelize([[
            "0000002", "11-06-05-2016", "0.89", "0.92", "1.32", "2.07", "",
            "1.35", "0.89", "0.92", "1.32", "2.07", "", "1.35", "0.89", "0.92",
            "1.32", "2.07"
        ], ["0000001", "10-06-11-2015", "ERROR_NO_BIDS_FOR_HOTEL"]])
        exchange_rates = self.sc.parallelize([['11-06-05-2016', '0.803'],
                                              ['10-06-11-2015', '0.987']])
        expected = self.sc.parallelize([
            '0000002,2016-05-06 11:00,US,1.662',
            '0000002,2016-05-06 11:00,CA,0.715'
        ])
        result = self.obj.get_bids(raw_bids, exchange_rates)
        # Results are compared through their string form.
        self.assertEqual([str(x) for x in result.collect()],
                         expected.collect())

    def test_get_motels(self):
        """The motels sample is read into [id, name] pairs."""
        result = self.obj.get_motels(self.sc, self.INPUT_MOTELS_SAMPLE)
        expected = self.sc.parallelize([['0000001', 'Grand Mengo Casino'],
                                        ['0000002', 'Novelo Granja'],
                                        ['0000003', 'Tiny City Motor Inn']])
        self.assertCountEqual(expected.collect(), result.collect())

    def test_get_enriched(self):
        """Of the two bids for one motel only the US line is expected back."""
        bids = self.sc.parallelize([
            BidItem('0000002', '2016-05-06 11:00', 'US', '1.662'),
            BidItem('0000002', '2016-05-06 11:00', 'CA', '0.715')
        ])
        motels = self.sc.parallelize([
            ['0000002', 'Novelo Granja'],
        ])
        expected = self.sc.parallelize(
            ['0000002,Novelo Granja,2016-05-06 11:00,US,1.662'])
        # Call through the job instance, as test_get_bids does, instead of
        # handing the TestCase itself over as the method's receiver.
        result = self.obj.get_enriched(bids, motels)
        self.assertEqual(result.collect(), expected.collect())

    def test_transform_date(self):
        """'HH-dd-MM-yyyy' style input is rewritten as 'yyyy-MM-dd HH:00'."""
        raw_date = '11-05-06-2011'
        expected = '2011-06-05 11:00'
        result = MotelsHomeRecommendation.transform_date(raw_date)
        self.assertEqual(expected, result)

    def test_to_euro(self):
        """100 at a rate of 0.3333333333 is expected to come back as 33.333."""
        price_usd = '100'
        exchange_rate = '0.3333333333'
        expected = 33.333
        result = MotelsHomeRecommendation.to_euro(price_usd, exchange_rate)
        self.assertEqual(expected, result)

    def test_to_euro_empty(self):
        """An empty price is expected to come back as an empty string."""
        price_usd = ''
        exchange_rate = '0.8'
        expected = ''
        result = MotelsHomeRecommendation.to_euro(price_usd, exchange_rate)
        self.assertEqual(expected, result)

    def test_expand(self):
        """One raw bid plus a rate expands into US, MX and CA items."""
        raw_bid = '0000002,11-05-08-2016,0.92,1.68,0.81,0.68,1.59,,1.63,1.77,2.06,0.66,1.53,,0.32,0.88,0.83,1.01'.split(
            ',')
        exchange_rate = '0.8'
        result = MotelsHomeRecommendation.expand((raw_bid, exchange_rate))
        expected = [
            '0000002,2016-08-05 11:00,US,0.544',
            '0000002,2016-08-05 11:00,MX,1.272',
            '0000002,2016-08-05 11:00,CA,1.304'
        ]
        self.assertEqual([str(x) for x in result], expected)

    def test_should_read_raw_bids(self):
        """The bids sample is read into one bid row and one error row."""
        expected = self.sc.parallelize([[
            "0000002", "15-04-08-2016", "0.89", "0.92", "1.32", "2.07", "",
            "1.35"
        ], ["0000001", "06-05-02-2016", "ERROR_NO_BIDS_FOR_HOTEL"]])

        raw_bids = self.obj.get_raw_bids(self.sc, self.INPUT_BIDS_SAMPLE)

        self.assertEqual(expected.collect(), raw_bids.collect())

    def test_should_collect_errournes_records(self):
        """Error rows are counted per (date, error) pair, in any order."""
        raw_bids = self.sc.parallelize([["1", "06-05-02-2016", "ERROR_1"],
                                        ["2", "15-04-08-2016", "0.89"],
                                        ["3", "07-05-02-2016", "ERROR_2"],
                                        ["4", "06-05-02-2016", "ERROR_1"],
                                        ["5", "06-05-02-2016", "ERROR_2"]])
        expected = self.sc.parallelize([
            "06-05-02-2016,ERROR_1,2", "06-05-02-2016,ERROR_2,1",
            "07-05-02-2016,ERROR_2,1"
        ])

        # Call through the job instance, as test_get_bids does, instead of
        # handing the TestCase itself over as the method's receiver.
        erroneous_records = self.obj.get_erroneous_records(raw_bids)
        self.assertCountEqual(expected.collect(),
                              erroneous_records.collect())
예제 #9
0
 def test_to_euro_empty(self):
     """``to_euro`` is expected to hand an empty price back unchanged."""
     blank_price = ''
     rate = '0.8'
     converted = MotelsHomeRecommendation.to_euro(blank_price, rate)
     self.assertEqual('', converted)
예제 #10
0
 def test_to_euro(self):
     """``to_euro('100', '0.3333333333')`` is expected to give 33.333."""
     usd_amount, rate = '100', '0.3333333333'
     converted = MotelsHomeRecommendation.to_euro(usd_amount, rate)
     self.assertEqual(33.333, converted)
예제 #11
0
 def test_transform_date(self):
     """``transform_date`` should map '11-05-06-2011' to '2011-06-05 11:00'."""
     raw_date = '11-05-06-2011'
     converted = MotelsHomeRecommendation.transform_date(raw_date)
     self.assertEqual('2011-06-05 11:00', converted)
예제 #12
0
 def setUp(self):
     """Prepare the Spark context, a session on it and a blank-path job."""
     spark_conf = SparkConf()
     self.sc = SparkContext.getOrCreate(spark_conf)
     self.spark = SparkSession(self.sc)
     blank = ''
     self.obj = MotelsHomeRecommendation(blank, blank, blank, blank)