def test_mag_instantiation(self):
    start_time = datetime(2017, 2, 12, 15)
    end_time = datetime(2017, 2, 12, 15, 2)
    self.config.dims["report_number"].update({
        "index_by": "OB_time",
        "min": start_time,  # for convenience, will convert according to index_by units if this is datetime
        "max": end_time,
        "other_dim_indicies": {"samples_per_record": 0},
        "expected_cadence": {
            "report_number": 1,
            "number_samples_per_report": 10,
        },
    })
    agg_list = generate_aggregation_list(self.config, self.files)
    evaluate_aggregation_list(self.config, agg_list, self.file)
    with nc.Dataset(self.file) as nc_out:
        time = nc_out.variables["OB_time"][:, 0]
        out_start, out_end = nc.num2date(
            time[[0, -1]], nc_out.variables["OB_time"].units
        )
        self.assertGreaterEqual(out_start, start_time)
        self.assertLessEqual(out_end, end_time)
        self.assertAlmostEqual(np.mean(np.diff(time)), 1, delta=0.001)
        self.assertAlmostEqual(np.max(np.diff(time)), 1, delta=0.001)
        self.assertAlmostEqual(np.min(np.diff(time)), 1, delta=0.001)
        self.assertAlmostEqual(
            int((end_time - start_time).total_seconds()), time.size, delta=1
        )

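# Illustrative sketch (hypothetical, not part of the original suite): the
# "min"/"max" convenience conversion noted in the comment above turns datetime
# bounds into the numeric units of the index_by variable. The units string
# here is an assumption for illustration; real files carry their own units.
def _sketch_datetime_bounds_conversion():
    from datetime import datetime
    import netCDF4 as nc

    units = "seconds since 2017-02-12 15:00:00"  # assumed, for illustration
    start_num = nc.date2num(datetime(2017, 2, 12, 15), units)
    end_num = nc.date2num(datetime(2017, 2, 12, 15, 2), units)
    assert (start_num, end_num) == (0.0, 120.0)
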
def test_mpsh_with_config(self):
    # First minute of Jan 9, 2017; note that the first 5 seconds of the 9th
    # are in the last file from the 8th.
    start_time = datetime(2017, 1, 9, 0, 0)
    end_time = datetime(2017, 1, 9, 0, 1) - timedelta(microseconds=1)
    self.config.dims["report_number"].update(
        {
            "index_by": "L1a_SciData_TimeStamp",
            "min": start_time,  # for convenience, will convert according to index_by units if this is datetime
            "max": end_time,
            "expected_cadence": {"report_number": 1},
        }
    )
    self.config.inter_validate()
    aggregation_list = generate_aggregation_list(self.config, self.files)
    evaluate_aggregation_list(self.config, aggregation_list, self.nc_out_filename)
    with nc.Dataset(self.nc_out_filename) as nc_out:  # type: nc.Dataset
        start_time_num, end_time_num = nc.date2num(
            [start_time, end_time], nc_out["L1a_SciData_TimeStamp"].units
        )
        time = nc_out.variables["L1a_SciData_TimeStamp"][:]
        # test spacing
        self.assertAlmostEqual(np.min(np.diff(time)), 1.0, delta=0.001)
        self.assertAlmostEqual(np.max(np.diff(time)), 1.0, delta=0.001)
        self.assertAlmostEqual(np.mean(np.diff(time)), 1.0, delta=0.001)
        # test bounds are inside
        self.assertGreaterEqual(time[0], start_time_num)
        self.assertLess(time[-1], end_time_num)
        # test bounds are within one cadence of the boundaries
        self.assertLess(start_time_num - time[0], 1)
        self.assertLessEqual(end_time_num - time[-1], 1)

@classmethod
def setUpClass(cls):
    super(TestEvaluateAggregationList, cls).setUpClass()
    pwd = os.path.dirname(__file__)
    cls.start_time = datetime(2017, 3, 16, 15, 27)
    cls.end_time = datetime(2017, 3, 16, 15, 28)
    cls.files = glob.glob(os.path.join(pwd, "data", "*.nc"))
    cls.config = Config.from_nc(cls.files[0])
    cls.config.dims["report_number"].update({
        "index_by": "OB_time",
        "min": cls.start_time,  # for convenience, will convert according to index_by units if this is datetime
        "max": cls.end_time,
        "other_dim_indicies": {"samples_per_record": 0},
        "expected_cadence": {
            "report_number": 1,
            "number_samples_per_report": 10,
        },
    })
    _, cls.filename = tempfile.mkstemp()
    agg_list = generate_aggregation_list(cls.config, cls.files)
    logger.info(agg_list)
    evaluate_aggregation_list(cls.config, agg_list, cls.filename)
    cls.output = nc.Dataset(cls.filename, "r")

def test_collapse_second_dim(self):
    config = Config.from_nc(self.inputs[0])
    config.dims["b"].update({"flatten": True, "index_by": "b"})
    agg_list = generate_aggregation_list(config, self.inputs)
    evaluate_aggregation_list(config, agg_list, self.filename)
    with nc.Dataset(self.filename) as nc_out:  # type: nc.Dataset
        # This is the more practically useful method of aggregation, where,
        # for example, dimension "a" might represent time and dimension "b"
        # might be satellite, event, etc. (something of which, at any point
        # in time, there could be an arbitrary number).
        # Flattening the b dimension should turn out like:
        # [[0 -- --]
        #  [1 -- --]
        #  [2 -- --]
        #  [3 3 --]
        #  [4 4 --]
        #  [5 5 --]
        #  [6 6 6]
        #  [7 7 7]
        #  [8 8 8]]
        c = nc_out.variables["c"][:]
        self.assertEqual(c.shape, (9, 3))
        self.assertEqual(np.sum(c), 90)
        self.assertEqual(np.ma.count_masked(c), 9)
        for i, a in enumerate(["a", "b", "c"]):
            self.assertEqual(nc_out.variables["b"][i], a)

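# Illustrative sketch (hypothetical, not part of the original suite): the
# flattened layout checked above, rebuilt with plain numpy. Each of the three
# input files contributes three rows with 1, 2, and 3 valid columns
# respectively; valid cells hold the row index, so the totals match the
# assertions in the test.
def _sketch_flattened_layout():
    import numpy as np

    expected = np.ma.masked_all((9, 3), dtype=int)
    for row in range(9):
        width = row // 3 + 1  # file 0 -> 1 col, file 1 -> 2, file 2 -> 3
        expected[row, :width] = row
    assert expected.sum() == 90
    assert np.ma.count_masked(expected) == 9
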
def test_exis_with_config(self):
    """Test an EXIS-L1b-SFXR aggregation with dimensions specified."""
    start_time = datetime(2018, 6, 21, 0, 0)
    end_time = datetime(2018, 6, 21, 0, 5)
    self.config.dims["report_number"].update(
        {
            "index_by": "time",
            "min": start_time,  # for convenience, will convert according to index_by units if this is datetime
            "max": end_time,
            "expected_cadence": {"report_number": 1},
        }
    )
    self.config.inter_validate()
    aggregation_list = generate_aggregation_list(self.config, self.files)
    evaluate_aggregation_list(self.config, aggregation_list, self.nc_out_filename)
    with nc.Dataset(self.nc_out_filename) as nc_out:  # type: nc.Dataset
        start_time_num, end_time_num = nc.date2num(
            [start_time, end_time], nc_out["time"].units
        )
        time = nc_out.variables["time"][:]
        self.assertAlmostEqual(np.min(np.diff(time)), 1.0, delta=0.001)
        self.assertAlmostEqual(np.max(np.diff(time)), 1.0, delta=0.001)
        self.assertAlmostEqual(np.mean(np.diff(time)), 1.0, delta=0.001)
        self.assertGreaterEqual(time[0], start_time_num)
        self.assertLess(time[-1], end_time_num)

def test_using_product_bounds(self):
    """
    The files in data/type3/ don't have an unlimited report_number
    dimension. Also, euvsCQualityFlags is missing a report_number
    dimension; can we create an explicit dependence on it?
    """
    start_time = datetime(2017, 8, 25, 0, 3, 30)  # 2017-08-25T00:03:29.6Z
    end_time = datetime(2017, 8, 25, 0, 5, 0)  # 2017-08-25T00:04:29.6Z
    self.config.dims["report_number"].update({
        "index_by": "time",
        "min": start_time,  # for convenience, will convert according to index_by units if this is datetime
        "max": end_time,
        "expected_cadence": {"report_number": 1.0 / 30.0},
    })
    self.config.inter_validate()
    aggregation_list = generate_aggregation_list(self.config, self.files)
    self.assertGreater(len(aggregation_list), 2)
    evaluate_aggregation_list(self.config, aggregation_list, self.nc_out_filename)
    with nc.Dataset(self.nc_out_filename) as nc_out:  # type: nc.Dataset
        self.assertTrue(nc_out.dimensions["report_number"].isunlimited())
        time = nc_out.variables["time"][:]
        self.assertAlmostEqual(np.min(np.diff(time)), 30.0, delta=0.001)
        self.assertAlmostEqual(np.max(np.diff(time)), 30.0, delta=0.001)

def test_main(self):
    start_time = datetime(2017, 7, 14, 0, 0)
    end_time = start_time + timedelta(days=1) - timedelta(milliseconds=1)
    self.config.dims["time"].update({
        "index_by": "time",
        "min": start_time,  # for convenience, will convert according to index_by units if this is datetime
        "max": end_time,
        "expected_cadence": {"time": 1},
    })
    agg_list = generate_aggregation_list(self.config, self.files)
    evaluate_aggregation_list(self.config, agg_list, self.file)
    with nc.Dataset(self.file) as nc_out:
        start_time_num, end_time_num = nc.date2num(
            [start_time, end_time], nc_out["time"].units
        )
        time = nc_out.variables["time"][:]
        out_start, out_end = nc.num2date(
            time[[0, -1]], nc_out.variables["time"].units
        )
        self.assertGreaterEqual(out_start, start_time)
        self.assertLessEqual(out_end, end_time)
        self.assertAlmostEqual(np.mean(np.diff(time)), 1, delta=0.001)
        self.assertAlmostEqual(np.max(np.diff(time)), 1, delta=0.001)
        self.assertAlmostEqual(np.min(np.diff(time)), 1, delta=0.001)
        self.assertAlmostEqual(
            int((end_time - start_time).total_seconds()), time.size, delta=1
        )
        self.assertGreaterEqual(time[0], start_time_num)
        self.assertLess(time[-1], end_time_num)

def test_exis_instantiation(self):
    """Create just the most basic aggregation list for EXIS."""
    aggregation_list = generate_aggregation_list(self.config, self.files[:2])
    self.assertEqual(len(aggregation_list), 2)
    evaluate_aggregation_list(self.config, aggregation_list, self.nc_out_filename)
    with nc.Dataset(self.nc_out_filename) as nc_out:  # type: nc.Dataset
        self.assertGreater(list(nc_out.variables.values())[0].size, 0)

def test_5min(self):
    # The data files cover March 5, 2017, 02:10:00 through 02:15:00.
    start_time = datetime(2017, 3, 5, 2, 10)
    end_time = datetime(2017, 3, 5, 2, 15)
    self.config.dims["time"].update({
        "min": start_time,  # for convenience, will convert according to index_by units if this is datetime
        "max": end_time,
    })
    agg_list = generate_aggregation_list(self.config, self.files)
    self.assertEqual(len(agg_list), 6)

def test_main(self):
    start_time = datetime(2017, 4, 14, 19, 23)
    end_time = datetime(2017, 4, 14, 20, 30)
    self.config.dims["time"].update(
        {
            "min": start_time,  # for convenience, will convert according to index_by units if this is datetime
            "max": end_time,
        }
    )
    agg_list = generate_aggregation_list(self.config, self.files)
    self.assertEqual(len(agg_list), 8)

def test_5min(self):
    self.start_time = datetime(2017, 3, 16, 15, 25)
    self.end_time = datetime(2017, 3, 16, 15, 30)
    self.config.dims["report_number"].update({
        "min": self.start_time,  # for convenience, will convert according to index_by units if this is datetime
        "max": self.end_time,
    })
    agg_list = generate_aggregation_list(self.config, self.files)
    evaluate_aggregation_list(self.config, agg_list, self.file)
    self.common_checks()

def test_basic(self):
    """
    The files in data/type1/ don't have an unlimited dimension, but
    report_number should be unlimited, so report_number has been made
    unlimited in the config template type1_config.json. Let's see if we
    can aggregate to it.
    """
    aggregation_list = generate_aggregation_list(self.config, self.files)
    self.assertEqual(len(aggregation_list), 3)
    evaluate_aggregation_list(self.config, aggregation_list, self.nc_out_filename)
    with nc.Dataset(self.nc_out_filename) as nc_out:  # type: nc.Dataset
        time = nc_out.variables["time"][:]
        self.assertEqual(len(time), 3)
        self.assertTrue(nc_out.dimensions["report_number"].isunlimited())

def test_subset(self):
    """Test if it correctly chops out enough outside the time bounds."""
    self.start_time = datetime(2017, 3, 16, 15, 25)
    self.end_time = datetime(2017, 3, 16, 15, 27)
    self.config.dims["report_number"].update({
        "min": self.start_time,  # for convenience, will convert according to index_by units if this is datetime
        "max": self.end_time,
    })
    agg_list = generate_aggregation_list(self.config, self.files)
    evaluate_aggregation_list(self.config, agg_list, self.file)
    self.common_checks()

def test_superset_back(self):
    """Test if it correctly inserts a fill node to cover a gap at the end."""
    start_time = datetime(2017, 4, 14, 19, 23)
    end_time = datetime(2017, 4, 14, 20, 35)
    self.config.dims["time"].update(
        {
            "min": start_time,  # for convenience, will convert according to index_by units if this is datetime
            "max": end_time,
        }
    )
    agg_list = generate_aggregation_list(self.config, self.files)
    self.assertTrue(isinstance(agg_list[-1], FillNode))

def test_subset(self):
    """Test if it correctly chops out enough outside the time bounds."""
    start_time = datetime(2017, 4, 14, 19, 26)
    end_time = datetime(2017, 4, 14, 20, 28)
    self.config.dims["time"].update(
        {
            "min": start_time,  # for convenience, will convert according to index_by units if this is datetime
            "max": end_time,
        }
    )
    agg_list = generate_aggregation_list(self.config, self.files)
    self.assertEqual(len(agg_list), 3)

def test_superset_back(self):
    """Test if it correctly inserts a fill node to cover a gap at the end."""
    # The data files cover March 5, 2017, 02:10:00 through 02:15:00.
    start_time = datetime(2017, 3, 5, 2, 10)
    end_time = datetime(2017, 3, 5, 2, 20)
    self.config.dims["time"].update({
        "min": start_time,  # for convenience, will convert according to index_by units if this is datetime
        "max": end_time,
    })
    agg_list = generate_aggregation_list(self.config, self.files)
    self.assertTrue(isinstance(agg_list[-1], FillNode))

def test_subset(self):
    """Test if it correctly chops out enough outside the time bounds."""
    # The data files cover March 5, 2017, 02:10:00 through 02:15:00.
    start_time = datetime(2017, 3, 5, 2, 12, 30)
    end_time = datetime(2017, 3, 5, 2, 13, 22)
    self.config.dims["time"].update({
        "min": start_time,  # for convenience, will convert according to index_by units if this is datetime
        "max": end_time,
    })
    agg_list = generate_aggregation_list(self.config, self.files)
    self.assertEqual(len(agg_list), 2)

def test_superset_front(self):
    """Test if it correctly inserts a fill node to cover a gap at the start."""
    start_time = datetime(2017, 4, 14, 19, 20)
    end_time = datetime(2017, 4, 14, 20, 30)
    self.config.dims["time"].update(
        {
            "min": start_time,  # for convenience, will convert according to index_by units if this is datetime
            "max": end_time,
        }
    )
    agg_list = generate_aggregation_list(self.config, self.files)
    self.assertEqual(len(agg_list), 8)

def test_superset_front(self):
    """Test if it correctly inserts a fill node to cover a gap at the start."""
    # The data files cover March 5, 2017, 02:10:00 through 02:15:00.
    start_time = datetime(2017, 3, 5, 2, 5)
    end_time = datetime(2017, 3, 5, 2, 15)
    self.config.dims["time"].update({
        "min": start_time,  # for convenience, will convert according to index_by units if this is datetime
        "max": end_time,
    })
    agg_list = generate_aggregation_list(self.config, self.files)
    # With the FillNode in front, this becomes 8 elements.
    self.assertEqual(len(agg_list), 8)

def test_superset_front(self):
    self.start_time = datetime(2017, 3, 16, 15, 15)
    self.end_time = datetime(2017, 3, 16, 15, 30)
    self.config.dims["report_number"].update({
        "min": self.start_time,  # for convenience, will convert according to index_by units if this is datetime
        "max": self.end_time,
    })
    agg_list = generate_aggregation_list(self.config, self.files)
    # With the FillNode in front...
    self.assertTrue(isinstance(agg_list[0], FillNode))
    self.assertFalse(isinstance(agg_list[-1], FillNode))
    evaluate_aggregation_list(self.config, agg_list, self.file)
    self.common_checks()

def test_superset_back(self):
    """Test if it correctly inserts a fill node to cover a gap at the end."""
    self.start_time = datetime(2017, 3, 16, 15, 25)
    self.end_time = datetime(2017, 3, 16, 15, 35)
    self.config.dims["report_number"].update({
        "min": self.start_time,  # for convenience, will convert according to index_by units if this is datetime
        "max": self.end_time,
    })
    agg_list = generate_aggregation_list(self.config, self.files)
    self.assertFalse(isinstance(agg_list[0], FillNode))
    self.assertTrue(isinstance(agg_list[-1], FillNode))
    evaluate_aggregation_list(self.config, agg_list, self.file)
    self.common_checks()

def test_main(self):
    """
    Nothing too fancy here; just making sure that aggregating a variable
    of strings works properly. Prior to version 0.8.5 we had trouble with
    vlen datatypes.
    """
    agg_list = generate_aggregation_list(self.config, self.files)
    evaluate_aggregation_list(self.config, agg_list, self.file)
    with nc.Dataset(self.file) as nc_in:
        status = nc_in.variables["status"]
        # There should be no fill values... before ncagg v0.8.5, vlen types
        # like string incorrectly aggregated to all fill values.
        self.assertFalse(any(status[:] == status._FillValue))

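# Illustrative sketch (hypothetical, not part of the original suite): the
# variable-length string datatype exercised above, written and read back
# directly with the netCDF4 API. The file name and contents are made up.
def _sketch_vlen_string_roundtrip():
    import netCDF4 as nc

    with nc.Dataset("vlen_demo.nc", "w", diskless=True) as ds:
        ds.createDimension("record", None)
        status = ds.createVariable("status", str, ("record",))  # vlen string
        status[0], status[1] = "nominal", "degraded"
        assert list(status[:]) == ["nominal", "degraded"]
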
@classmethod
def setUpClass(cls):
    super(TestEvaluateAggregationList, cls).setUpClass()
    pwd = os.path.dirname(__file__)
    cls.start_time = datetime(2017, 6, 8, 16, 45)
    cls.end_time = datetime(2017, 6, 8, 16, 50)
    cls.files = glob.glob(os.path.join(pwd, "data", "*.nc"))
    cls.config = Config.from_nc(cls.files[0])
    cls.config.dims["report_number"].update(
        {
            "index_by": "L1a_SciData_TimeStamp",
            "min": cls.start_time,  # for convenience, will convert according to index_by units if this is datetime
            "max": cls.end_time,
            "expected_cadence": {"report_number": 1, "sensor_unit": 0},
        }
    )
    _, cls.filename = tempfile.mkstemp()
    agg_list = generate_aggregation_list(cls.config, cls.files)
    evaluate_aggregation_list(cls.config, agg_list, cls.filename)
    cls.output = nc.Dataset(cls.filename, "r")

def test_exis_with_config(self):
    """
    Test an EXIS-L1b-SFXR aggregation with dimensions specified.

    This test case covers the situation for which strict=False handling
    was added to get_size_along(dimension). The overlap calculation
    between files was producing a negative size: the gap between files
    was small, and the file was also sticking off the end of the output
    window, so it was chopped on both sides and ended up with a negative
    size. That should be fine, as long as that FileNode doesn't end up
    in the final aggregation list. A couple of files capturing this
    scenario are in the associated data/ directory.
    """
    # Feb 5, 2019, 00:00 through the end of the day.
    start_time = datetime(2019, 2, 5, 0)
    end_time = datetime(2019, 2, 6, 0) - timedelta(microseconds=1)
    self.config.dims["report_number"].update(
        {
            "index_by": "time",
            "min": start_time,  # for convenience, will convert according to index_by units if this is datetime
            "max": end_time,
            "expected_cadence": {"report_number": 1},
        }
    )
    self.config.inter_validate()
    aggregation_list = generate_aggregation_list(self.config, self.files)
    evaluate_aggregation_list(self.config, aggregation_list, self.nc_out_filename)
    with nc.Dataset(self.nc_out_filename) as nc_out:  # type: nc.Dataset
        start_time_num, end_time_num = nc.date2num(
            [start_time, end_time], nc_out["time"].units
        )
        time = nc_out.variables["time"][:]
        # Have not been able to satisfy this: self.assertEqual(time.size, 86400)
        self.assertAlmostEqual(np.min(np.diff(time)), 0.854, delta=0.001)
        self.assertAlmostEqual(np.max(np.diff(time)), 1.0, delta=0.001)
        self.assertAlmostEqual(np.mean(np.diff(time)), 1.0, delta=0.001)
        self.assertGreaterEqual(time[0], start_time_num)
        self.assertLess(time[-1], end_time_num)

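# Illustrative sketch (hypothetical numbers, not part of the original suite):
# the negative-size scenario the docstring above describes. A short file
# chopped both by the overlap with its neighbor and by the output end
# boundary can lose more records than it has; such a FileNode must be
# dropped rather than carried into the aggregation list with a negative size.
def _sketch_negative_size():
    n_records, chopped_front, chopped_back = 5, 3, 4
    size_along = n_records - chopped_front - chopped_back
    assert size_along < 0  # must not appear in the final aggregation list
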
@classmethod
def setUpClass(cls):
    super(TestEvaluateAggregationList, cls).setUpClass()
    pwd = os.path.dirname(__file__)
    cls.start_time = datetime(2017, 4, 14, 19, 23)
    cls.end_time = datetime(2017, 4, 14, 20, 30)
    cls.files = glob.glob(os.path.join(pwd, "data", "*.nc"))
    cls.config = Config.from_nc(cls.files[0])
    cls.config.dims["time"].update(
        {
            "index_by": "time",
            "min": cls.start_time,  # for convenience, will convert according to index_by units if this is datetime
            "max": cls.end_time,
            "expected_cadence": {"time": 10},
        }
    )
    _, cls.filename = tempfile.mkstemp()
    agg_list = generate_aggregation_list(cls.config, cls.files)
    evaluate_aggregation_list(cls.config, agg_list, cls.filename)
    cls.output = nc.Dataset(cls.filename, "r")

def test_main(self):
    """
    Test case covers a situation where the aggregator was chopping the
    first record because it was less than 0.5 * expected_time_step from
    the start boundary. Instead, that first data point should be taken,
    even if it is exactly on the boundary, since the boundary isn't a
    real piece of data. So the essential piece of this test is to make
    sure that there remain 86400 records before and after aggregation.
    """
    start_time = datetime(2018, 2, 20, 0, 0)
    end_time = start_time + timedelta(days=1) - timedelta(milliseconds=1)
    self.config.dims["time"].update({
        "index_by": "time",
        "min": start_time,  # for convenience, will convert according to index_by units if this is datetime
        "max": end_time,
        "expected_cadence": {"time": 1},
    })
    agg_list = generate_aggregation_list(self.config, self.files)
    evaluate_aggregation_list(self.config, agg_list, self.file)
    with nc.Dataset(self.file) as nc_out:
        start_time_num, end_time_num = nc.date2num(
            [start_time, end_time], nc_out["time"].units
        )
        time = nc_out.variables["time"][:]
        out_start, out_end = nc.num2date(
            time[[0, -1]], nc_out.variables["time"].units
        )
        self.assertEqual(len(time), 86400)
        self.assertGreaterEqual(out_start, start_time)
        self.assertLessEqual(out_end, end_time)
        self.assertAlmostEqual(np.mean(np.diff(time)), 1, delta=0.001)
        self.assertAlmostEqual(np.max(np.diff(time)), 1, delta=0.001)
        self.assertAlmostEqual(np.min(np.diff(time)), 1, delta=0.001)
        self.assertGreaterEqual(time[0], start_time_num)
        self.assertLess(time[-1], end_time_num)

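# Illustrative sketch (hypothetical, not part of the original suite): one
# reading of the boundary rule the docstring above describes. A record
# exactly on the start boundary must be kept; the old behavior dropped
# anything less than 0.5 * expected_time_step past the boundary.
def _sketch_boundary_rule():
    import numpy as np

    cadence = 1.0                        # one record per second
    start = 0.0
    time = np.arange(0.0, 10.0, cadence)  # first record exactly at start
    old_keep = time - start >= 0.5 * cadence  # buggy: drops the first record
    new_keep = time >= start                  # fixed: the boundary is not data
    assert old_keep.sum() == 9 and new_keep.sum() == 10
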
def test_default_multi_dim(self):
    config = Config.from_nc(self.inputs[0])
    agg_list = generate_aggregation_list(config, self.inputs)
    evaluate_aggregation_list(config, agg_list, self.filename)
    with nc.Dataset(self.filename) as nc_out:  # type: nc.Dataset
        # This is the default aggregation produced by aggregating along
        # both unlimited dimensions. It isn't really practically useful,
        # but by our "basic" definition of aggregation along unlimited
        # dimensions it is correct. Need to make sure we get what's
        # expected:
        # [[0 -- -- -- -- --]
        #  [1 -- -- -- -- --]
        #  [2 -- -- -- -- --]
        #  [-- 3 3 -- -- --]
        #  [-- 4 4 -- -- --]
        #  [-- 5 5 -- -- --]
        #  [-- -- -- 6 6 6]
        #  [-- -- -- 7 7 7]
        #  [-- -- -- 8 8 8]]
        c = nc_out.variables["c"][:]
        self.assertEqual(c.shape, (9, 6))
        self.assertEqual(np.sum(c), 90)
        self.assertEqual(np.ma.count_masked(c), 36)

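# Illustrative sketch (hypothetical, not part of the original suite): the
# block-diagonal layout checked above, rebuilt with plain numpy. Aggregating
# along both unlimited dimensions places each file's block of 3 rows and
# 1, 2, or 3 columns on its own set of columns.
def _sketch_block_diagonal_layout():
    import numpy as np

    expected = np.ma.masked_all((9, 6), dtype=int)
    col = 0
    for block in range(3):  # one block per input file
        width = block + 1   # files contribute 1, 2, then 3 columns
        rows = slice(3 * block, 3 * block + 3)
        values = np.arange(3 * block, 3 * block + 3)[:, None]
        expected[rows, col:col + width] = values  # broadcast across columns
        col += width
    assert expected.sum() == 90
    assert np.ma.count_masked(expected) == 36
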
def test_basic_create_new_dim(self):
    """
    Files in data/ have SUVI_CROTA dimensionless variables. The config
    has been modified from the default to give crota variables a
    dependence on the unlimited dimension crota_report_number. Here,
    make sure this transformation is done correctly.
    """
    aggregation_list = generate_aggregation_list(self.config, self.files)
    self.assertEqual(len(aggregation_list), 5)
    evaluate_aggregation_list(self.config, aggregation_list, self.nc_out_filename)
    with nc.Dataset(self.nc_out_filename) as nc_out:  # type: nc.Dataset
        crota_time = nc_out.variables["SUVI_CROTA_time"][:]
        self.assertEqual(len(crota_time), 5)
        self.assertTrue(nc_out.dimensions["crota_report_number"].isunlimited())
        # Make sure the crota_time values aren't all the same; they should
        # be increasing. There isn't necessarily a unique crota value in
        # each file (it's given about once a minute), so it's expected that
        # two consecutive files can have the same crota data. Hence, on
        # average, it's increasing.
        self.assertGreater(np.mean(np.diff(crota_time)), 0)

def test_giving_extra_files(self):
    start_time = datetime(2017, 2, 12, 15, 30)
    end_time = datetime(2017, 2, 12, 16)
    self.config.dims["report_number"].update(
        {
            "min": start_time,  # for convenience, will convert according to index_by units if this is datetime
            "max": end_time,
        }
    )
    agg_list = generate_aggregation_list(self.config, self.files)
    evaluate_aggregation_list(self.config, agg_list, self.file)
    with nc.Dataset(self.file) as nc_out:
        time = nc_out.variables["OB_time"][:, 0]
        out_start, out_end = nc.num2date(
            time[[0, -1]], nc_out.variables["OB_time"].units
        )
        self.assertGreaterEqual(out_start, start_time - timedelta(seconds=0.25))
        self.assertLessEqual(out_end, end_time + timedelta(seconds=0.25))
        self.assertAlmostEqual(np.mean(np.diff(time)), 1, delta=0.001)
        self.assertAlmostEqual(np.max(np.diff(time)), 1, delta=0.001)
        self.assertAlmostEqual(np.min(np.diff(time)), 1, delta=0.001)
        self.assertAlmostEqual(
            int((end_time - start_time).total_seconds()), time.size, delta=1
        )

@classmethod
def setUpClass(cls):
    super(TestEvaluateAggregationList, cls).setUpClass()
    pwd = os.path.dirname(__file__)
    cls.start_time = datetime(2018, 1, 17, 15, 5)
    cls.end_time = datetime(2018, 1, 17, 15, 56)
    cls.files = glob.glob(os.path.join(pwd, "data", "*.nc"))
    cls.config = Config.from_nc(cls.files[0])
    cls.config.dims["report_number"].update(
        {
            "index_by": "ELF_StartStopTime",
            "min": cls.start_time,  # for convenience, will convert according to index_by units if this is datetime
            "max": cls.end_time,
            "expected_cadence": {
                "report_number": 1.0 / (5.0 * 60.0),
                "number_of_time_bounds": 1.0 / ((5.0 * 60.0) - 1),
            },
            "size": None,
        }
    )
    _, cls.filename = tempfile.mkstemp()
    agg_list = generate_aggregation_list(cls.config, cls.files)
    evaluate_aggregation_list(cls.config, agg_list, cls.filename)
    cls.output = nc.Dataset(cls.filename, "r")