示例#1
0
def normalize_record(r):
    """Convert a raw record dict into numeric features, mutating it in place.

    The record ``r`` is expected to carry string values (truthiness checks
    such as ``if r['CompetitionDistance']`` treat an empty value as
    "missing").  The function:

    * parses ``Date`` ("%Y-%m-%d") and derives the ``Now*`` calendar
      features, then drops ``Date`` and ``DayOfWeek``;
    * casts ``Store``, ``Promo``, ``SchoolHoliday``, ``Promo2`` and, when
      present, ``Customers``, ``Sales`` and ``Id`` to ``int``;
    * replaces ``Open``, ``StateHoliday``, ``Assortment``, ``StoreType`` and
      ``PromoInterval`` with the values returned by ``one_hot_encode``;
    * derives competition and Promo2 duration features, using fixed
      placeholder dates plus an ``...Available`` flag when the source
      fields are empty.

    Raises ``KeyError`` if a required key is absent and ``ValueError`` if a
    value cannot be parsed as a date or integer.
    """
    date = datetime.strptime(r['Date'], "%Y-%m-%d")
    weekday = date.weekday()  # Monday == 0 ... Sunday == 6
    r['NowDayFromStart'] = (date - START_DATE).days
    r['NowDayOfWeek'] = weekday
    r['NowDayOfMonth'] = date.day
    r['NowMonth'] = date.month
    r['NowYear'] = date.year
    r['NowIsWeekend'] = int(weekday >= 5)
    r['NowWeek'] = date.date().isocalendar()[1]  # ISO week number
    # The raw date fields are superseded by the Now* features above.
    del r['Date'], r['DayOfWeek']

    r['Store'] = int(r['Store'])
    r['Promo'] = int(r['Promo'])
    r['SchoolHoliday'] = int(r['SchoolHoliday'])
    # These columns are optional; convert them only when the record has them.
    for optional_key in ('Customers', 'Sales', 'Id'):
        if optional_key in r:
            r[optional_key] = int(r[optional_key])

    # NOTE(review): one_hot_encode is defined elsewhere; presumably it yields
    # one indicator per candidate value, in the order given -- confirm.
    (
        r['IsOpenYes'],
        r['IsOpenNo'],
        r['IsOpenUnknown'],
    ) = one_hot_encode(r['Open'], ('1', '0', ''))
    del r['Open']

    (
        r['StateHolidayNone'],
        r['StateHolidayPublic'],
        r['StateHolidayEaster'],
        r['StateHolidayChristmas'],
    ) = one_hot_encode(r['StateHoliday'], ('0', 'a', 'b', 'c'))
    del r['StateHoliday']

    r['Promo2'] = int(r['Promo2'])
    # A missing distance is encoded as the largest native integer.
    # sys.maxint was removed in Python 3, so fall back to sys.maxsize there
    # while keeping the exact Python 2 value when it is available.
    if r['CompetitionDistance']:
        r['CompetitionDistance'] = int(r['CompetitionDistance'])
    else:
        r['CompetitionDistance'] = getattr(sys, 'maxint', sys.maxsize)

    if r['CompetitionOpenSinceMonth'] and r['CompetitionOpenSinceYear']:
        competition_open_since = datetime(
            int(r['CompetitionOpenSinceYear']),
            int(r['CompetitionOpenSinceMonth']),
            1,
        )
        r['CompetitionOpenAvailable'] = 1
    else:
        # Placeholder opening date; the flag marks the value as not real.
        competition_open_since = datetime(2010, 1, 1)
        r['CompetitionOpenAvailable'] = 0
    # Negative when the competitor opens after the record's date.
    r['CompetitionOpenDays'] = (date - competition_open_since).days
    r['CompetitionOpenSinceMonth'] = competition_open_since.month
    r['CompetitionOpenSinceYear'] = competition_open_since.year

    (
        r['AssortmentBasic'],
        r['AssortmentExtra'],
        r['AssortmentExtended'],
    ) = one_hot_encode(r['Assortment'], ('a', 'b', 'c'))
    del r['Assortment']

    (
        r['StoreTypeA'],
        r['StoreTypeB'],
        r['StoreTypeC'],
        r['StoreTypeD'],
    ) = one_hot_encode(r['StoreType'], ('a', 'b', 'c', 'd'))
    del r['StoreType']

    if r['Promo2SinceYear'] and r['Promo2SinceWeek']:
        promo2_since = Week(int(r['Promo2SinceYear']), int(r['Promo2SinceWeek']))
        r['Promo2SinceAvailable'] = 1
    else:
        # Placeholder start week; the flag marks the value as not real.
        promo2_since = Week(2000, 1)
        r['Promo2SinceAvailable'] = 0
    # Anchor the week on its Monday at midnight so it can be subtracted
    # from the record's datetime.
    promo2_since = datetime.combine(promo2_since.monday(), time(0, 0))
    r['Promo2RunDays'] = (date - promo2_since).days
    # Week, month and year are re-derived from that Monday's date.
    r['Promo2SinceWeek'] = promo2_since.date().isocalendar()[1]
    r['Promo2SinceMonth'] = promo2_since.month
    r['Promo2SinceYear'] = promo2_since.year
    del r['Promo2']

    # NOTE(review): PROMO2_INTERVAL is used as a mapping from the raw
    # PromoInterval string to a collection of month numbers; its key order
    # must line up with the three indicator names below -- confirm.
    (
        r['Promo2IntervalJanAprJulOct'],
        r['Promo2IntervalFebMayAugNov'],
        r['Promo2IntervalMarJunSepDec'],
    ) = one_hot_encode(r['PromoInterval'], tuple(PROMO2_INTERVAL))
    # 1 when the record's month is one of the months listed for its interval;
    # unknown/empty intervals fall back to an empty set and yield 0.
    r['Promo2Now'] = int(r['NowMonth'] in PROMO2_INTERVAL.get(r['PromoInterval'], set()))
    del r['PromoInterval']