def normalize_record(r):
    """Convert the raw fields of one record into numeric features, in place.

    ``r`` is a mutable mapping from column name to raw value (the values are
    parsed with ``int()`` / ``strptime`` and compared against string codes,
    so they are presumably the untouched strings of a CSV row -- confirm
    against the caller).

    Keys that must be present: ``Date``, ``DayOfWeek``, ``Store``, ``Promo``,
    ``SchoolHoliday``, ``Open``, ``StateHoliday``, ``Promo2``,
    ``CompetitionDistance``, ``CompetitionOpenSinceMonth``,
    ``CompetitionOpenSinceYear``, ``Assortment``, ``StoreType``,
    ``Promo2SinceYear``, ``Promo2SinceWeek`` and ``PromoInterval``.
    ``Customers``, ``Sales`` and ``Id`` are converted only when present.

    The record is modified in place: date parts are added under ``Now*``
    keys, categorical columns are replaced by one-hot flag columns, and the
    ``Date``, ``DayOfWeek``, ``Open``, ``StateHoliday``, ``Assortment``,
    ``StoreType``, ``Promo2`` and ``PromoInterval`` keys are removed.

    Raises:
        KeyError: if a required key is missing (for a plain ``dict``).
        ValueError: if ``Date`` is not ``YYYY-MM-DD`` or a numeric field
            cannot be parsed by ``int()``.

    Note that a failure part-way through leaves ``r`` partially converted.
    """
    # --- Calendar features derived from the record's own date -------------
    date = datetime.strptime(r['Date'], "%Y-%m-%d")
    weekday = date.weekday()  # Monday == 0 ... Sunday == 6
    r['NowDayFromStart'] = (date - START_DATE).days
    r['NowDayOfWeek'] = int(weekday)
    r['NowDayOfMonth'] = int(date.day)
    r['NowMonth'] = int(date.month)
    r['NowYear'] = int(date.year)
    r['NowIsWeekend'] = int(weekday >= 5)
    r['NowWeek'] = int(date.date().isocalendar()[1])  # ISO week number
    # The supplied DayOfWeek column is dropped in favour of NowDayOfWeek.
    del r['Date'], r['DayOfWeek']

    # --- Plain integer columns --------------------------------------------
    r['Store'] = int(r['Store'])
    r['Promo'] = int(r['Promo'])
    r['SchoolHoliday'] = int(r['SchoolHoliday'])
    if 'Customers' in r:
        r['Customers'] = int(r['Customers'])
    if 'Sales' in r:
        r['Sales'] = int(r['Sales'])
    if 'Id' in r:
        r['Id'] = int(r['Id'])

    # --- Categorical columns expanded into flag columns -------------------
    # NOTE(review): one_hot_encode is defined outside this block; it is
    # assumed to return one flag per candidate, in candidate order -- confirm.
    (
        r['IsOpenYes'],
        r['IsOpenNo'],
        r['IsOpenUnknown'],
    ) = one_hot_encode(r['Open'], ('1', '0', ''))
    del r['Open']

    (
        r['StateHolidayNone'],
        r['StateHolidayPublic'],
        r['StateHolidayEaster'],
        r['StateHolidayChristmas'],
    ) = one_hot_encode(r['StateHoliday'], ('0', 'a', 'b', 'c'))
    del r['StateHoliday']

    # Promo2 is parsed here (so a malformed value still raises) but the key
    # is deleted again further down, after the Promo2Since* fields are built.
    r['Promo2'] = int(r['Promo2'])

    # --- Competition features ---------------------------------------------
    if r['CompetitionDistance']:
        r['CompetitionDistance'] = int(r['CompetitionDistance'])
    else:
        # An empty distance is replaced by a very large sentinel.
        # sys.maxint no longer exists on Python 3; fall back to sys.maxsize
        # there while keeping the exact Python 2 value where it is available.
        r['CompetitionDistance'] = getattr(sys, 'maxint', sys.maxsize)

    if r['CompetitionOpenSinceMonth'] and r['CompetitionOpenSinceYear']:
        competition_open_since = datetime(
            int(r['CompetitionOpenSinceYear']),
            int(r['CompetitionOpenSinceMonth']),
            1,
        )
        r['CompetitionOpenAvailable'] = 1
    else:
        # Fixed placeholder date when either part is empty; the
        # CompetitionOpenAvailable flag records that it is not real data.
        competition_open_since = datetime(2010, 1, 1)
        r['CompetitionOpenAvailable'] = 0
    r['CompetitionOpenDays'] = int((date - competition_open_since).days)
    r['CompetitionOpenSinceMonth'] = int(competition_open_since.month)
    r['CompetitionOpenSinceYear'] = int(competition_open_since.year)

    (
        r['AssortmentBasic'],
        r['AssortmentExtra'],
        r['AssortmentExtended'],
    ) = one_hot_encode(r['Assortment'], ('a', 'b', 'c'))
    del r['Assortment']

    (
        r['StoreTypeA'],
        r['StoreTypeB'],
        r['StoreTypeC'],
        r['StoreTypeD'],
    ) = one_hot_encode(r['StoreType'], ('a', 'b', 'c', 'd'))
    del r['StoreType']

    # --- Promo2 features ----------------------------------------------------
    # NOTE(review): Week comes from outside this block; it is used as
    # Week(year, week) with a .monday() method returning a date -- presumably
    # isoweek.Week, confirm against the file's imports.
    if r['Promo2SinceYear'] and r['Promo2SinceWeek']:
        promo2_since = Week(int(r['Promo2SinceYear']), int(r['Promo2SinceWeek']))
        r['Promo2SinceAvailable'] = 1
    else:
        # Fixed placeholder week when either part is empty; flagged through
        # Promo2SinceAvailable.
        promo2_since = Week(2000, 1)
        r['Promo2SinceAvailable'] = 0
    # Promote the week's Monday to a midnight datetime so it can be
    # subtracted from the record's datetime.
    promo2_since = datetime.combine(promo2_since.monday(), time(0, 0))
    r['Promo2RunDays'] = int((date - promo2_since).days)
    r['Promo2SinceWeek'] = int(promo2_since.date().isocalendar()[1])
    r['Promo2SinceMonth'] = int(promo2_since.month)
    r['Promo2SinceYear'] = int(promo2_since.year)
    del r['Promo2']

    # NOTE(review): the pairing of the three flags below with the interval
    # strings relies on the iteration order of PROMO2_INTERVAL; confirm that
    # it is an ordered mapping listed as Jan.., Feb.., Mar.. .
    (
        r['Promo2IntervalJanAprJulOct'],
        r['Promo2IntervalFebMayAugNov'],
        r['Promo2IntervalMarJunSepDec'],
    ) = one_hot_encode(r['PromoInterval'], tuple(PROMO2_INTERVAL))
    # 1 when the record's month is among the months registered for its
    # interval; an interval missing from PROMO2_INTERVAL never matches.
    r['Promo2Now'] = int(r['NowMonth'] in PROMO2_INTERVAL.get(r['PromoInterval'], set()))
    del r['PromoInterval']