示例#1
0
class Transformer:
    """Persists extracted CSV data as parquet and queries the combined records.

    Constructing an instance runs the whole pipeline: the extractor output is
    written out through the loader, the "weather" parquet files are read back
    and all of them are merged into one DataFrame used by the query methods.
    """

    def __init__(self):
        extracted = Extractor().extract_csv_data()
        self.__data = extracted
        self.__loader = Loader()
        # write every extracted DataFrame to its own parquet file
        for name, frame in extracted.items():
            self.__loader.save_to_parquet(name, frame)
        # load the saved parquet files back and stack them into a single
        # frame (index renumbered) so the maximum spans all records
        frames = self.__loader.read_parquets("weather")
        self.__df = pd.concat(frames, ignore_index=True)

    def find_hottest_day(self):
        """
        Looks up the record(s) with the highest ScreenTemperature.
        Only the ObservationDate, ScreenTemperature and Region columns are kept.
        :return: a DataFrame with every row whose ScreenTemperature equals the maximum.
        """
        wanted_columns = ['ObservationDate', 'ScreenTemperature', 'Region']
        trimmed = self.__df[wanted_columns]
        temperatures = trimmed['ScreenTemperature']
        # boolean mask marking the rows that hit the overall maximum
        is_hottest = temperatures == temperatures.max()
        return trimmed[is_hottest]
示例#2
0
class LoaderTest(TestCase):
    """Round-trip tests for ``Loader``'s parquet save and read methods.

    Each test runs against a clean ``resources`` directory: every
    ``*.parquet.gzip`` file found there is deleted before and after the test.
    """

    def setUp(self):
        self.loader = Loader()
        # Directory two levels above this test file.
        # NOTE(review): presumably the project root — confirm against layout.
        self.project_dir = os.path.abspath(__file__ + "/../../")
        self._remove_parquet_files()

    def tearDown(self):
        self._remove_parquet_files()

    def _remove_parquet_files(self):
        """Delete every ``*.parquet.gzip`` file in the resources directory."""
        pattern = os.path.join(self.project_dir, "resources", "*.parquet.gzip")
        for path in glob.glob(pattern):
            os.remove(path)

    def test_save_parquet(self):
        """Saving a frame under the name "test" creates test.parquet.gzip."""
        data = DataFrame([{"key1": "value1", "key2": "value2"}])
        self.loader.save_to_parquet("test", data)
        expected_file = os.path.join(self.project_dir, "resources",
                                     "test.parquet.gzip")
        self.assertTrue(os.path.isfile(expected_file))

    def test_read_parquets(self):
        """A saved frame is read back with the same columns and values."""
        data = DataFrame([{"key1": "value1", "key2": "value2"}])
        self.loader.save_to_parquet("test", data)
        result = self.loader.read_parquets("test")
        loaded = result[0]
        # Iterating a DataFrame yields its column labels, so this only
        # verifies that the same columns came back.
        self.assertCountEqual(data, loaded)
        # Compare row records as well so the cell values are verified too.
        self.assertEqual(data.to_dict("records"), loaded.to_dict("records"))