# Example #1
 def test_read_col(self):
     """Check that the Spark JSON reader preserves the fixture's column count.

     Loads the gzipped fixture through the ETL wrapper and compares its
     column count against the same fixture read directly with pandas.
     """
     etl = ea.ETLAmazon()
     spark_df = etl.read_json(
         "thenextbestbook/etl/tests/data/test_file.json.gz")
     expected = pd.read_json("thenextbestbook/etl/tests/data/test_file.json")
     actual_col_count = spark_df.toPandas().shape[1]
     self.assertEqual(actual_col_count, expected.shape[1])
 def test_sql_cmd(self):
     """Verify a SQL query executes correctly against the test dataset.

     Reads the gzipped fixture, registers it as a global temp view, and
     asserts that COUNT(*) returns the known fixture size (263 rows).
     """
     etl = ea.ETLAmazon()
     runs = etl.read_json(
         "thenextbestbook/etl/tests/data/test_file.json.gz")
     # createOrReplaceGlobalTempView is idempotent: re-running the test in
     # the same Spark session no longer raises "view already exists".
     runs.createOrReplaceGlobalTempView("runs")
     # BUG FIX: the adjacent string literals previously concatenated to
     # "SELECT COUNT(*)FROM global_temp.runs" with no space before FROM.
     query_result = etl.sql_query("SELECT COUNT(*) FROM global_temp.runs")
     query_result_pd = query_result.toPandas()
     self.assertEqual(int(query_result_pd.iloc[0]), 263)
""" script to run ETL on Amazon data """
import etl_amazon as ea
import constants as ct

# Initiate Spark Session
etl = ea.ETLAmazon()

# Create variable 'book' to store book review JSON object
books = etl.read_json(ct.AMAZON_BOOKS_JSON)

# Create variable 'metadata' to store metadata JSON object
metadata = etl.read_json(ct.AMAZON_METADATA_JSON)

# Create global variables for spark SQL command
books.createGlobalTempView("books")
metadata.createGlobalTempView("metadata")

# Create variable 'books_with_title' to store result of SQL
books_with_title = etl.get_title_on_asin()

# Save result to JSON folder
books_with_title.write.format('json').save(ct.AMAZON_REVIEWS_DESTINATION)