def extract_latest_file(self, list_blobs):
    """Select the most recently dated entry among ``list_blobs``.

    Every name is handed to ``extract_date_from_file``; names for which it
    returns a falsy value are skipped. The entry with the greatest date wins.
    A second entry carrying the same winning date counts as a collision,
    unless ``self.is_a_spark_directory`` accepts it and it lives in the same
    directory as the current winner. A strictly newer entry clears any
    collision recorded so far.

    Args:
        list_blobs: Iterable of blob/file names to inspect.

    Returns:
        A dict with three keys:
          * ``latest_filename``: the winning name, or ``None`` when no name
            carried a date.
          * ``suffix``: the winning date formatted as ``YYYY-MM-DD``
            (``1900-01-01`` when no name carried a date).
          * ``spark``: ``self.is_a_spark_directory`` applied to the winner.

    Raises:
        SystemExit: with code 1 when the winning date is shared by
            conflicting entries.
    """
    last_recent_file = None
    possible_recent_date_collision = False
    # Placeholder only: it is reported in "suffix" when nothing is dated.
    recent_date = datetime.strptime('01-01-1900', '%d-%m-%Y')

    for filename in list_blobs:
        date_file = extract_date_from_file(filename)
        if not date_file:
            continue

        # The first dated entry always becomes the candidate, so the
        # placeholder date is never compared against a real file (which
        # would otherwise reach os.path.dirname(None)).
        if last_recent_file is None or date_file > recent_date:
            possible_recent_date_collision = False
            recent_date = date_file
            last_recent_file = filename
        elif date_file == recent_date:
            if not self.is_a_spark_directory(filename):
                possible_recent_date_collision = True
            elif (os.path.dirname(filename)
                  != os.path.dirname(last_recent_file)):
                # Spark entries only clash when they sit in different dirs.
                logger.info("%s vs %s", filename, last_recent_file)
                possible_recent_date_collision = True

    if possible_recent_date_collision:
        # No filename is unique for the most recent date: abort.
        logger.error("Error TWO files with the same date: %s %s",
                     last_recent_file, recent_date.strftime('%d-%m-%Y'))
        # Raise directly: the ``exit`` builtin is injected by ``site`` for
        # interactive use and is not guaranteed to exist in every runtime.
        raise SystemExit(1)

    logger.info("Latest file: %s %s",
                last_recent_file, recent_date.strftime('%d-%m-%Y'))
    return {
        "latest_filename": last_recent_file,
        "suffix": recent_date.strftime('%Y-%m-%d'),
        "spark": self.is_a_spark_directory(last_recent_file),
    }
def test_all_date_yyyy_mm_dd(self):
    """Filenames embedding a yyyy-mm-dd date must parse to that day.

    Runs over every entry of ``self.dates`` (the original note describes
    them as all the 2021 dates). Assumes ``str()`` of an entry renders it
    as yyyy-mm-dd -- TODO confirm against the fixture.
    """
    for day in self.dates:
        # Expected value is the same day at midnight.
        expected = datetime.combine(day, datetime.min.time())
        candidate = 'temp_{}.json.gz'.format(str(day))
        self.assertEqual(extract_date_from_file(candidate), expected)
def test_all_date_dd_mm_yyyy(self):
    """Filenames embedding a dd-mm-yyyy date must parse to that day.

    Runs over every entry of ``self.dates`` (the original note describes
    them as all the 2021 dates).
    """
    for day in self.dates:
        # Expected value is the same day at midnight.
        expected = datetime.combine(day, datetime.min.time())
        day_first = day.strftime('%d-%m-%Y')
        candidate = 'temp_{}.json.gz'.format(day_first)
        self.assertEqual(extract_date_from_file(candidate), expected)
def test_all_date_yyyy_m_d(self):
    """Year-first dates without zero padding must still parse.

    Examples of the tested shape: 2021-2-10 / 2021-11-1 / 2021-1-1.
    """
    for day in self.dates:
        unpadded = '{dt.year}-{dt.month}-{dt.day}'.format(dt=day)
        if str(day) == unpadded:
            # Identical to the padded form (e.g. "2021-10-10"), which the
            # zero-padded test already covers.
            continue
        expected = datetime.combine(day, datetime.min.time())
        candidate = 'temp_{}.json.gz'.format(unpadded)
        self.assertEqual(extract_date_from_file(candidate), expected)
def test_all_date_d_m_yyyy(self):
    """Day-first dates without zero padding must still parse.

    Examples of the tested shape: 2-10-2021 / 11-5-2021 / 1-1-2021.
    """
    for day in self.dates:
        padded = day.strftime('%d-%m-%Y')
        unpadded = '{dt.day}-{dt.month}-{dt.year}'.format(dt=day)
        if padded == unpadded:
            # Identical to the padded form (e.g. "10-10-2021"), which the
            # zero-padded test already covers.
            continue
        expected = datetime.combine(day, datetime.min.time())
        candidate = 'temp_{}.json.gz'.format(unpadded)
        self.assertEqual(extract_date_from_file(candidate), expected)
def test_wrong_format(self):
    """A date fused to preceding digits is still expected to be found.

    The name 'temp_cosmic0074-02-2021.json.gz' must yield 4 February 2021.
    """
    wanted = datetime(2021, 2, 4)
    parsed = extract_date_from_file('temp_cosmic0074-02-2021.json.gz')
    self.assertEqual(parsed, wanted)
def test_no_date_found(self):
    """A name with no recognisable date must produce ``None``.

    'temp_cosmic007-0402-2021.json.gz' is the sample name used here; the
    extractor is expected to return ``None`` for it.
    """
    filename = 'temp_cosmic007-0402-2021.json.gz'
    result = extract_date_from_file(filename)
    # assertIsNone checks identity and reports a clearer failure message
    # than comparing against None with assertEqual.
    self.assertIsNone(result)