def test_holidays_for_year(self):
    """Check the 2018 holiday data and the error raised for a year without data.

    For 2018, 151 entries are expected, each one a mapping whose sorted keys
    equal ``self.EXPECTED_KEYS``. For 2024, ``UnsupportedYearException`` with
    the message "No data for year: 2024" is expected.
    """
    d = SchoolHolidayDates()

    res = d.holidays_for_year(2018)
    # assertEqual / assertRaisesRegex replace the deprecated aliases
    # assertEquals / assertRaisesRegexp, which were removed in Python 3.12.
    self.assertEqual(len(res), 151)
    for v in res.values():
        self.assertEqual(sorted(v), self.EXPECTED_KEYS)

    with self.assertRaisesRegex(UnsupportedYearException, "No data for year: 2024"):
        # The call itself must raise, so comparing its (never produced)
        # return value to {} was dead code and has been dropped.
        d.holidays_for_year(2024)
def test_holidays_for_year(self):
    """Check the 2018 holiday data and the error raised for a year without data.

    For 2018, 151 entries are expected, each one a mapping whose sorted keys
    equal ``self.EXPECTED_KEYS``. For 2021, ``UnsupportedYearException`` with
    the message "No data for year: 2021" is expected.
    """
    d = SchoolHolidayDates()

    res = d.holidays_for_year(2018)
    # assertEqual / assertRaisesRegex replace the deprecated aliases
    # assertEquals / assertRaisesRegexp, which were removed in Python 3.12.
    self.assertEqual(len(res), 151)
    for v in res.values():
        self.assertEqual(sorted(v), self.EXPECTED_KEYS)

    with self.assertRaisesRegex(
        UnsupportedYearException, "No data for year: 2021"
    ):
        # The call itself must raise, so comparing its (never produced)
        # return value to {} was dead code and has been dropped.
        d.holidays_for_year(2021)
def test_supported_holidays_are_complete(self):
    """Check that the 2019 data uses exactly the supported holiday names.

    The set of ``"nom_vacances"`` values found in the 2019 entries must equal
    the set built from ``SchoolHolidayDates.SUPPORTED_HOLIDAY_NAMES``.
    """
    d = SchoolHolidayDates()
    res = d.holidays_for_year(2019)

    names = {v["nom_vacances"] for v in res.values()}
    expected = set(SchoolHolidayDates.SUPPORTED_HOLIDAY_NAMES)

    # assertEqual replaces the deprecated alias assertEquals, which was
    # removed in Python 3.12.
    self.assertEqual(names, expected)
def get_encoded_calendar_df(data_final):
    """Enrich ``data_final`` with French holiday features and a month feature.

    For every distinct value in ``data_final['year']`` the French school
    holidays (``SchoolHolidayDates``) and bank holidays (``holidays.FRA``)
    are loaded, merged on their ``date`` column and encoded as:

    - ``vacances_zone_a`` / ``vacances_zone_b`` / ``vacances_zone_c``:
      1 if the zone is on school holiday that day, else 0.
    - ``jour_ferie``: 1 if the day is a bank holiday, else 0.
    - ``holiday``: sum of the four indicators above.

    The result is left-joined onto ``data_final`` by matching
    ``release_date`` against ``str(date)``; every remaining missing value in
    the joined frame is replaced by 0. A ``month`` column is then parsed from
    characters 5:7 of ``release_date`` and passed through ``apply_cos``.

    Args:
        data_final: DataFrame with at least a ``year`` column and a
            ``release_date`` column of strings (presumably ISO
            ``YYYY-MM-DD`` -- confirm against the caller).

    Returns:
        The DataFrame returned by
        ``apply_cos(merged, 'month', 'cos_month', 12)`` (presumably the
        merged frame with a ``cos_month`` column added -- confirm against
        ``apply_cos``).
    """
    # Distinct years, computed once and shared by both holiday sources.
    years = sorted(set(data_final['year']))

    # Load school holidays for France: one frame per year, concatenated once
    # (avoids re-copying the accumulated frame on every iteration).
    fr_holidays = SchoolHolidayDates()
    school_frames = [
        pd.DataFrame.from_dict(fr_holidays.holidays_for_year(year)).T
        for year in years
    ]
    df_vacances = pd.concat(school_frames) if school_frames else pd.DataFrame()

    # Load bank holidays for France.
    bank_frames = [
        pd.DataFrame([
            {'date': day, 'jour_ferie': name}
            for day, name in sorted(holidays.FRA(years=year).items())
        ])
        for year in years
    ]
    df_jf = pd.concat(bank_frames) if bank_frames else pd.DataFrame()

    # Merge school and bank holidays; the outer join keeps dates that appear
    # in only one of the two sources.
    df_holidays = pd.merge(df_vacances, df_jf, how='outer', on='date')

    # Encode the per-zone booleans as 1/0. Dates that come only from the
    # bank-holiday side have no zone information after the outer merge; they
    # are set to 0 here so the combined ``holiday`` sum below is not NaN
    # (which the final fillna(0) would otherwise turn into 0, hiding the
    # bank holiday).
    dict_map_vac = {True: 1, False: 0}
    zone_columns = ['vacances_zone_a', 'vacances_zone_b', 'vacances_zone_c']
    for column in zone_columns:
        df_holidays[column] = df_holidays[column].map(dict_map_vac).fillna(0)

    # ``jour_ferie`` holds the holiday name or a missing value: 1 when a name
    # is present, else 0.
    df_holidays['jour_ferie'] = df_holidays['jour_ferie'].notna().astype(int)

    # Combined feature: number of school zones on holiday plus the bank
    # holiday flag.
    df_holidays['holiday'] = (
        df_holidays['vacances_zone_a']
        + df_holidays['vacances_zone_b']
        + df_holidays['vacances_zone_c']
        + df_holidays['jour_ferie']
    )

    # ``release_date`` is compared as a string, so stringify the join key.
    df_holidays['date'] = df_holidays['date'].map(str)

    data_final_cal = pd.merge(
        data_final, df_holidays,
        how='left', left_on='release_date', right_on='date',
    ).fillna(0)

    data_final_cal['month'] = data_final_cal['release_date'].map(lambda x: int(x[5:7]))
    data_final_cal = apply_cos(data_final_cal, 'month', 'cos_month', 12)
    return data_final_cal