def test_integration(self):
    """Run the whole job and compare its two outputs with stored fixtures.

    The job is constructed from the files under ``input/`` and writes into
    ``self.tmpdir``.  The 'erroneous' and 'aggregated' outputs are then read
    back as text and compared, ignoring line order, with the files under
    ``expected_output``.
    """
    # Show complete diffs if one of the comparisons below fails.
    self.maxDiff = None

    job = MotelsHomeRecommendation(
        'input/bids.gz.parquet',
        'input/exchange_rate.txt',
        'input/motels.gz.parquet',
        self.tmpdir,
    )
    job.process_data()

    # NOTE(review): the 'erroneous' output is checked against the
    # 'expected_sql' fixture, while the aggregated output is checked against
    # 'aggregated' -- confirm that fixture name is intended.
    produced_errors = self.sc.textFile(os.path.join(self.tmpdir, 'erroneous'))
    wanted_errors = self.sc.textFile(
        os.path.join('expected_output', 'expected_sql'))
    self.assertCountEqual(produced_errors.collect(), wanted_errors.collect())

    produced_aggregated = self.sc.textFile(
        os.path.join(self.tmpdir, 'aggregated'))
    wanted_aggregated = self.sc.textFile(
        os.path.join('expected_output', 'aggregated'))
    self.assertCountEqual(produced_aggregated.collect(),
                          wanted_aggregated.collect())
def test_expand(self):
    """Check the items expand() produces for one raw bid and a rate of '0.8'.

    The string form of every returned item must match, in order, the three
    expected US, MX and CA lines.
    """
    bid_fields = (
        '0000002,11-05-08-2016,0.92,1.68,0.81,0.68,1.59,,1.63,1.77,2.06,0.66,1.53,,0.32,0.88,0.83,1.01'
    ).split(',')
    rate = '0.8'

    # expand() takes a single (raw bid, exchange rate) pair.
    items = MotelsHomeRecommendation.expand((bid_fields, rate))

    rendered = [str(item) for item in items]
    self.assertEqual(rendered, [
        '0000002,2016-08-05 11:00,US,0.544',
        '0000002,2016-08-05 11:00,MX,1.272',
        '0000002,2016-08-05 11:00,CA,1.304',
    ])
def test_get_enriched(self):
    """Check get_enriched() on two bids for one motel.

    Two bid items (US and CA) for motel '0000002' are combined with a motel
    list holding that motel's name.  The collected result must equal a
    single line carrying the motel name and the US bid.
    """
    bids = self.sc.parallelize([
        BidItem('0000002', '2016-05-06 11:00', 'US', '1.662'),
        BidItem('0000002', '2016-05-06 11:00', 'CA', '0.715'),
    ])
    motels = self.sc.parallelize([
        ['0000002', 'Novelo Granja'],
    ])
    expected = self.sc.parallelize(
        ['0000002,Novelo Granja,2016-05-06 11:00,US,1.662'])

    # Use the MotelsHomeRecommendation instance built in setUp as the
    # receiver; the TestCase itself is not an instance of that class.
    result = MotelsHomeRecommendation.get_enriched(self.obj, bids, motels)

    self.assertEqual(result.collect(), expected.collect())
def test_should_collect_errournes_records(self):
    """Check that get_erroneous_records() counts errors per date and code.

    Five raw bids are supplied, four of which carry an ``ERROR_*`` marker
    in the third field.  The result must contain, in any order, one
    'date,error,count' line per distinct (date, error) pair.
    """
    rawBids = self.sc.parallelize([
        ["1", "06-05-02-2016", "ERROR_1"],
        ["2", "15-04-08-2016", "0.89"],
        ["3", "07-05-02-2016", "ERROR_2"],
        ["4", "06-05-02-2016", "ERROR_1"],
        ["5", "06-05-02-2016", "ERROR_2"],
    ])
    expected = self.sc.parallelize([
        "06-05-02-2016,ERROR_1,2",
        "06-05-02-2016,ERROR_2,1",
        "07-05-02-2016,ERROR_2,1",
    ])

    # Use the MotelsHomeRecommendation instance built in setUp as the
    # receiver; the TestCase itself is not an instance of that class.
    erroneousRecords = MotelsHomeRecommendation.get_erroneous_records(
        self.obj, rawBids)

    self.assertCountEqual(expected.collect(), erroneousRecords.collect())
def test_get_bids(self):
    """Check get_bids() on one priced bid row and one error row.

    The string form of the collected result must equal, in order, the
    expected US and CA lines for motel '0000002'.
    """
    raw_rows = self.sc.parallelize([
        [
            "0000002", "11-06-05-2016",
            "0.89", "0.92", "1.32", "2.07", "", "1.35", "0.89", "0.92",
            "1.32", "2.07", "", "1.35", "0.89", "0.92", "1.32", "2.07",
        ],
        ["0000001", "10-06-11-2015", "ERROR_NO_BIDS_FOR_HOTEL"],
    ])
    rates = self.sc.parallelize([
        ['11-06-05-2016', '0.803'],
        ['10-06-11-2015', '0.987'],
    ])
    wanted = self.sc.parallelize([
        '0000002,2016-05-06 11:00,US,1.662',
        '0000002,2016-05-06 11:00,CA,0.715',
    ])

    bids = MotelsHomeRecommendation.get_bids(self.obj, raw_rows, rates)

    rendered = [str(bid) for bid in bids.collect()]
    self.assertEqual(rendered, wanted.collect())
class TestMR(unittest.TestCase):
    """DataFrame-based unit tests for MotelsHomeRecommendation.

    Each test builds small DataFrames through ``self.spark``, calls one
    method of the instance created in ``setUp`` and compares the collected
    rows with the expected ones.
    """

    INPUT_BIDS_SAMPLE = "bids_sample.txt"
    INPUT_EXCHANGED_RATES_SAMPLE = "ex_sample.txt"
    INPUT_MOTELS_SAMPLE = "motels_sample.txt"

    def setUp(self):
        """Create the Spark handles and the object under test."""
        self.sc = SparkContext.getOrCreate(SparkConf())
        self.spark = SparkSession.builder.getOrCreate()
        self.obj = MotelsHomeRecommendation('', '', '', '')
        # Never truncate assertion diffs.
        self.maxDiff = None

    # -- private helpers (not collected as tests) ---------------------------

    @staticmethod
    def _error_row(motel_id, bid_date, error_code):
        """Return a raw-bid tuple: three values followed by fifteen Nones."""
        return (motel_id, bid_date, error_code) + (None,) * 15

    def _raw_bids_frame(self, rows):
        """Build a DataFrame of raw bids using the object's BIDS_HEADER."""
        return self.spark.createDataFrame(rows, self.obj.BIDS_HEADER)

    def _rates_frame(self, rows):
        """Build a DataFrame using the object's EXCHANGE_RATES_HEADER."""
        return self.spark.createDataFrame(rows,
                                          self.obj.EXCHANGE_RATES_HEADER)

    def _bids_frame(self, rows):
        """Build a DataFrame with the MotelID/loSa/BidDate/price columns."""
        return self.spark.createDataFrame(
            rows, ['MotelID', 'loSa', 'BidDate', 'price'])

    def _assert_same_rows(self, first, second):
        """Assert both DataFrames collect to equal row lists.

        NOTE(review): the comparison is order-sensitive; confirm the methods
        under test guarantee a row order.
        """
        self.assertEqual(first.collect(), second.collect())

    # -- tests --------------------------------------------------------------

    def test_get_exchanged_rates(self):
        """get_exchange_rates() must load the three sample rate rows."""
        loaded = self.obj.get_exchange_rates(
            self.spark, self.INPUT_EXCHANGED_RATES_SAMPLE)
        wanted = self._rates_frame([
            ('11-06-05-2016', 'Euro', 'EUR', '0.803'),
            ('11-05-08-2016', 'Euro', 'EUR', '0.873'),
            ('10-06-11-2015', 'Euro', 'EUR', '0.987'),
        ])
        self._assert_same_rows(wanted, loaded)

    def test_get_bids(self):
        """get_bids() on one priced row and one error row gives US and CA."""
        raw_bids = self._raw_bids_frame([
            ("0000002", "20-20-06-2016",
             "0.89", "0.92", "1.32", "2.07", "", "1.35", "0.89", "0.87",
             "1.22", "1.06", "0.93", "0.88", "1.36", "1.48", "1.14", "0.99"),
            self._error_row("0000001", "10-06-11-2015",
                            "ERROR_NO_BIDS_FOR_HOTEL"),
        ])
        rates = self._rates_frame([
            ('20-20-06-2016', 'Euro', 'EUR', '0.803'),
            ('10-06-11-2015', 'Euro', 'EUR', '0.987'),
        ])
        wanted = self._bids_frame([
            ('0000002', 'US', '2016-06-20 20:00', '1.662'),
            ('0000002', 'CA', '2016-06-20 20:00', '0.715'),
        ])
        produced = self.obj.get_bids(raw_bids, rates)
        self._assert_same_rows(wanted, produced)

    def test_get_bids_same(self):
        """get_bids() must return US, MX and CA rows all priced '0.715'."""
        raw_bids = self._raw_bids_frame([
            ("0000002", "20-20-06-2016",
             "0.89", "0.92", "1.32", "0.89", "0.89", "1.35", "0.89", "0.87",
             "1.22", "1.06", "0.93", "0.88", "1.36", "1.48", "1.14", "0.99"),
        ])
        rates = self._rates_frame([
            ('20-20-06-2016', 'Euro', 'EUR', '0.803'),
        ])
        wanted = self._bids_frame([
            ('0000002', 'US', '2016-06-20 20:00', '0.715'),
            ('0000002', 'MX', '2016-06-20 20:00', '0.715'),
            ('0000002', 'CA', '2016-06-20 20:00', '0.715'),
        ])
        produced = self.obj.get_bids(raw_bids, rates)
        self._assert_same_rows(wanted, produced)

    def test_get_motels(self):
        """get_motels() must load the three sample motels."""
        loaded = self.obj.get_motels(self.spark, self.INPUT_MOTELS_SAMPLE)
        wanted = self.spark.createDataFrame(
            [
                ('0000001', 'Grand Mengo Casino'),
                ('0000002', 'Novelo Granja'),
                ('0000003', 'Tiny City Motor Inn'),
            ],
            ['MotelID', 'MotelName'])
        self._assert_same_rows(wanted, loaded)

    def test_get_enriched(self):
        """get_enriched() must return one named row holding the US bid."""
        bids = self._bids_frame([
            ('0000002', 'US', '2016-06-20 20:00', '1.662'),
            ('0000002', 'CA', '2016-06-20 20:00', '0.715'),
        ])
        motels = self.spark.createDataFrame(
            [('0000002', 'Novelo Granja')],
            ['MotelID', 'MotelName'])
        wanted = self.spark.createDataFrame(
            [('0000002', 'Novelo Granja', '2016-06-20 20:00', 'US', '1.662')],
            ['MotelID', 'MotelName', 'BidDate', 'loSa', 'price'])
        produced = self.obj.get_enriched(bids, motels)
        self._assert_same_rows(produced, wanted)

    def test_should_read_raw_bids(self):
        """get_raw_bids() must load the two sample bid rows."""
        wanted = self._raw_bids_frame([
            ("0000002", "15-04-08-2016",
             "0.89", "0.92", "1.32", "2.07", "0.99", "1.35", "0.89", "0.92",
             "1.32", "2.07", "0.99", "1.35", "2.07", "0.99", "1.35", "0.99"),
            self._error_row("0000001", "06-05-02-2016",
                            "ERROR_NO_BIDS_FOR_HOTEL"),
        ])
        loaded = self.obj.get_raw_bids(self.spark, self.INPUT_BIDS_SAMPLE)
        self._assert_same_rows(wanted, loaded)

    def test_should_collect_errournes_records(self):
        """get_erroneous_records() must count rows per date and error code."""
        raw_bids = self._raw_bids_frame([
            self._error_row("0000001", "06-05-02-2016", "ERROR_1"),
            ("0000002", "15-04-08-2016",
             "0.89", "0.92", "1.32", "2.07", "", "1.35", "0.89", "0.87",
             "1.22", "1.06", "0.93", "0.88", "1.36", "1.48", "1.14", "0.99"),
            self._error_row("0000001", "07-05-02-2016", "ERROR_2"),
            self._error_row("0000001", "06-05-02-2016", "ERROR_1"),
            self._error_row("0000001", "06-05-02-2016", "ERROR_2"),
        ])
        wanted = self.spark.createDataFrame(
            [
                ('06-05-02-2016', 'ERROR_2', 1),
                ('07-05-02-2016', 'ERROR_2', 1),
                ('06-05-02-2016', 'ERROR_1', 2),
            ],
            ['BidDate', 'HU', 'count'])
        counted = self.obj.get_erroneous_records(raw_bids)
        self._assert_same_rows(wanted, counted)
def setUp(self):
    """Create the Spark handles and the object under test for each test."""
    spark_conf = SparkConf()
    self.sc = SparkContext.getOrCreate(spark_conf)
    self.spark = SparkSession.builder.getOrCreate()

    # The object under test is built with four empty-string arguments.
    self.obj = MotelsHomeRecommendation('', '', '', '')

    # Never truncate assertion diffs.
    self.maxDiff = None
class TestMR(unittest.TestCase):
    """RDD-based unit tests for MotelsHomeRecommendation.

    Each test builds small RDDs through ``self.sc`` (or plain strings for
    the static helpers), calls one method of MotelsHomeRecommendation and
    compares the outcome with the expected values.
    """

    INPUT_BIDS_SAMPLE = "bids_sample.txt"
    INPUT_EXCHANGED_RATES_SAMPLE = "ex_sample.txt"
    INPUT_MOTELS_SAMPLE = "motels_sample.txt"

    def setUp(self):
        """Create the Spark handles and the object under test."""
        self.sc = SparkContext.getOrCreate(SparkConf())
        self.spark = SparkSession(self.sc)
        self.obj = MotelsHomeRecommendation('', '', '', '')

    def test_get_exchanged_rates(self):
        """get_exchange_rates() must load the three sample (date, rate) pairs."""
        result = self.obj.get_exchange_rates(
            self.sc, self.INPUT_EXCHANGED_RATES_SAMPLE)
        expected = self.sc.parallelize([
            ['11-06-05-2016', '0.803'],
            ['11-05-08-2016', '0.873'],
            ['10-06-11-2015', '0.987'],
        ])
        self.assertCountEqual(expected.collect(), result.collect())

    def test_get_bids(self):
        """get_bids() on one priced row and one error row gives US and CA."""
        rawBids = self.sc.parallelize([
            [
                "0000002", "11-06-05-2016",
                "0.89", "0.92", "1.32", "2.07", "", "1.35", "0.89", "0.92",
                "1.32", "2.07", "", "1.35", "0.89", "0.92", "1.32", "2.07",
            ],
            ["0000001", "10-06-11-2015", "ERROR_NO_BIDS_FOR_HOTEL"],
        ])
        exchangedRates = self.sc.parallelize([
            ['11-06-05-2016', '0.803'],
            ['10-06-11-2015', '0.987'],
        ])
        expected = self.sc.parallelize([
            '0000002,2016-05-06 11:00,US,1.662',
            '0000002,2016-05-06 11:00,CA,0.715',
        ])
        result = MotelsHomeRecommendation.get_bids(self.obj, rawBids,
                                                   exchangedRates)
        self.assertEqual([str(x) for x in result.collect()],
                         expected.collect())

    def test_get_motels(self):
        """get_motels() must load the three sample (id, name) pairs."""
        result = self.obj.get_motels(self.sc, self.INPUT_MOTELS_SAMPLE)
        expected = self.sc.parallelize([
            ['0000001', 'Grand Mengo Casino'],
            ['0000002', 'Novelo Granja'],
            ['0000003', 'Tiny City Motor Inn'],
        ])
        self.assertCountEqual(expected.collect(), result.collect())

    def test_get_enriched(self):
        """get_enriched() must return one named line holding the US bid."""
        bids = self.sc.parallelize([
            BidItem('0000002', '2016-05-06 11:00', 'US', '1.662'),
            BidItem('0000002', '2016-05-06 11:00', 'CA', '0.715'),
        ])
        motels = self.sc.parallelize([
            ['0000002', 'Novelo Granja'],
        ])
        expected = self.sc.parallelize(
            ['0000002,Novelo Granja,2016-05-06 11:00,US,1.662'])
        # Pass the MotelsHomeRecommendation instance as receiver, as
        # test_get_bids does, rather than the TestCase itself.
        result = MotelsHomeRecommendation.get_enriched(self.obj, bids, motels)
        self.assertEqual(result.collect(), expected.collect())

    def test_transform_date(self):
        """transform_date('11-05-06-2011') must give '2011-06-05 11:00'."""
        # Named raw_date so the builtin input() is not shadowed.
        raw_date = '11-05-06-2011'
        expected = '2011-06-05 11:00'
        result = MotelsHomeRecommendation.transform_date(raw_date)
        self.assertEqual(expected, result)

    def test_to_euro(self):
        """to_euro('100', '0.3333333333') must equal 33.333."""
        price_usd = '100'
        exchange_rate = '0.3333333333'
        expected = 33.333
        result = MotelsHomeRecommendation.to_euro(price_usd, exchange_rate)
        self.assertEqual(expected, result)

    def test_to_euro_empty(self):
        """to_euro() with an empty price must return an empty string."""
        price_usd = ''
        exchange_rate = '0.8'
        expected = ''
        result = MotelsHomeRecommendation.to_euro(price_usd, exchange_rate)
        self.assertEqual(expected, result)

    def test_expand(self):
        """expand() must yield the expected US, MX and CA items, in order."""
        rawBid = (
            '0000002,11-05-08-2016,0.92,1.68,0.81,0.68,1.59,,1.63,1.77,2.06,0.66,1.53,,0.32,0.88,0.83,1.01'
        ).split(',')
        exchange_rate = '0.8'
        # expand() takes a single (raw bid, exchange rate) pair.
        result = MotelsHomeRecommendation.expand((rawBid, exchange_rate))
        expected = [
            '0000002,2016-08-05 11:00,US,0.544',
            '0000002,2016-08-05 11:00,MX,1.272',
            '0000002,2016-08-05 11:00,CA,1.304',
        ]
        self.assertEqual([str(x) for x in result], expected)

    def test_should_read_raw_bids(self):
        """get_raw_bids() must load the two sample bid rows, in order."""
        expected = self.sc.parallelize([
            ["0000002", "15-04-08-2016",
             "0.89", "0.92", "1.32", "2.07", "", "1.35"],
            ["0000001", "06-05-02-2016", "ERROR_NO_BIDS_FOR_HOTEL"],
        ])
        rawBids = self.obj.get_raw_bids(self.sc, self.INPUT_BIDS_SAMPLE)
        self.assertEqual(expected.collect(), rawBids.collect())

    def test_should_collect_errournes_records(self):
        """get_erroneous_records() must count rows per date and error code."""
        rawBids = self.sc.parallelize([
            ["1", "06-05-02-2016", "ERROR_1"],
            ["2", "15-04-08-2016", "0.89"],
            ["3", "07-05-02-2016", "ERROR_2"],
            ["4", "06-05-02-2016", "ERROR_1"],
            ["5", "06-05-02-2016", "ERROR_2"],
        ])
        expected = self.sc.parallelize([
            "06-05-02-2016,ERROR_1,2",
            "06-05-02-2016,ERROR_2,1",
            "07-05-02-2016,ERROR_2,1",
        ])
        # Pass the MotelsHomeRecommendation instance as receiver, as
        # test_get_bids does, rather than the TestCase itself.
        erroneousRecords = MotelsHomeRecommendation.get_erroneous_records(
            self.obj, rawBids)
        self.assertCountEqual(expected.collect(), erroneousRecords.collect())
def test_to_euro_empty(self):
    """to_euro() with an empty price string must return an empty string."""
    converted = MotelsHomeRecommendation.to_euro('', '0.8')
    self.assertEqual('', converted)
def test_to_euro(self):
    """to_euro('100', '0.3333333333') must equal 33.333."""
    converted = MotelsHomeRecommendation.to_euro('100', '0.3333333333')
    self.assertEqual(33.333, converted)
def test_transform_date(self):
    """transform_date('11-05-06-2011') must give '2011-06-05 11:00'."""
    raw_date = '11-05-06-2011'
    reformatted = MotelsHomeRecommendation.transform_date(raw_date)
    self.assertEqual('2011-06-05 11:00', reformatted)
def setUp(self):
    """Create the Spark handles and the object under test for each test."""
    spark_conf = SparkConf()
    self.sc = SparkContext.getOrCreate(spark_conf)
    # The session is constructed directly on top of the context above.
    self.spark = SparkSession(self.sc)

    # The object under test is built with four empty-string arguments.
    self.obj = MotelsHomeRecommendation('', '', '', '')