def pre_process_data(
    data,
    selected_columns,
    csv_path='/home/bolaka/Bike Sharing/train-arnov.csv',
    holidays=None,
):
    '''
    Add engineered features to [data] and return only [selected_columns].

    The frame passed in is modified in place: the 'datetime' column is
    converted to real timestamps and all feature columns listed below are
    added to it.

    Columns read:  'datetime', 'workingday', 'season', 'weather'.
    Columns added: 'hour_of_day', 'hour_of_day_sin', 'hour_of_day_cos',
                   'day_since_begin', 'holiday_external', 'almost_holiday',
                   'rush_hour', 'weekday' and one indicator column per
                   distinct value of 'season', 'weather' and 'weekday'
                   (named e.g. 'season1', 'weather3', 'weekday0').

    Parameters:
        data             -- pandas DataFrame holding the raw columns.
        selected_columns -- list of column names to keep in the result.
        csv_path         -- where the fully augmented frame is dumped as CSV
                            (without index); pass None to skip the dump.
        holidays         -- optional iterable of datetime.date objects that
                            count as holidays. When None, the Maryland
                            calendar for 2011 and 2012 is used.

    Returns a numpy array (data[selected_columns].values).
    '''
    # Some 'magic' string to datetime function
    data['datetime'] = pd.to_datetime(data['datetime'])
    timestamps = data['datetime']
    # Calendar dates are needed by several features below; extract them once.
    dates = timestamps.dt.date

    # Since the hour of day is cyclical, e.g. 01:00 is equally far from
    # midnight as 23:00, we need to represent this in a meaningful way. We use
    # both sin and cos, to make sure that 12:00 != 00:00 (which we cannot
    # prevent if we only use sin).
    data['hour_of_day'] = timestamps.dt.hour
    hours = data['hour_of_day']
    data['hour_of_day_sin'] = hours.map(
        lambda hour: math.sin(2 * math.pi * hour / 24))
    data['hour_of_day_cos'] = hours.map(
        lambda hour: math.cos(2 * math.pi * hour / 24))

    # Since it seems the service got more popular over time, we might need
    # some way of telling how far we are from the beginning.
    first_day = datetime(2011, 1, 1).date()
    data['day_since_begin'] = dates.map(lambda day: (day - first_day).days)

    # For some reason the dataset didn't indicate new year's day and christmas
    # day as holidays. Therefore we also use an external library to check if
    # a day is a holiday (unless the caller supplied the holidays).
    if holidays is None:
        cal = Maryland()
        # Build a fresh set instead of extending the list the calendar
        # returned.
        holidays = {
            day
            for year in (2011, 2012)
            for (day, _name) in cal.holidays(year)
        }
    else:
        holidays = set(holidays)
    data['holiday_external'] = dates.map(lambda day: int(day in holidays))

    # Is it a holiday tomorrow or yesterday?
    one_day = timedelta(days=1)
    data['almost_holiday'] = dates.map(
        lambda day: int(day - one_day in holidays or day + one_day in holidays)
    )

    # Some simple model of rush hour: on ordinary days the distance (in hours)
    # to the nearest of 08:00 / 18:00; on non-working days and on external
    # holidays the distance to 14:00 instead.
    commute_distance = hours.map(
        lambda hour: min(math.fabs(8 - hour), math.fabs(18 - hour)))
    leisure_distance = hours.map(lambda hour: math.fabs(14 - hour))
    is_day_off = (data['workingday'] == 0) | (data['holiday_external'] == 1)
    # A mask plus Series.where replaces the two DataFrame.ix assignments
    # (.ix no longer exists in pandas >= 1.0).
    data['rush_hour'] = commute_distance.where(~is_day_off, leisure_distance)

    # Add the day of the week (Monday == 0)
    data['weekday'] = timestamps.dt.weekday

    # Some variables have no numerical value, they are categorical. E.g. the
    # weather variable has numerical values, but they cannot be interpreted as
    # such. In other words value 2 is not two times as small as value 4.
    # A method to deal with this is one-hot encoding, which splits the
    # existing variable in n variables, where n equals the number of possible
    # values.
    for column in ['season', 'weather', 'weekday']:
        # Cast to int so the returned array stays numeric even when
        # get_dummies yields boolean columns.
        dummies = pd.get_dummies(data[column]).astype(int)
        # Concat actual column name with the category value
        for value in dummies.columns:
            data[column + str(value)] = dummies[value]

    if csv_path is not None:
        data.to_csv(csv_path, index=False)

    return data[selected_columns].values
## feature engineering #combined[ 'weekend' ] = 0 #combined.loc[ (combined['holiday'] == 0) & (combined['workingday'] == 0) ,'weekend'] = 1 #combined.loc[ (combined['weekend'] == 1), 'holiday'] = 1 #combined['weekday_holiday'] = combined.holiday * (combined.weekday+1) #combined['atemp_cat'] = pd.cut(combined.atemp.values, 6, labels=[1, 2, 3, 4, 5, 6]) #combined['temp_cat'] = pd.cut(combined.atemp.values, 6, labels=[1, 2, 3, 4, 5, 6]) #combined['hum_cat'] = pd.cut(combined.humidity.values, 4, labels=[1, 2, 3, 4 ]) #combined['windspeed_cat'] = pd.cut(combined.windspeed.values, 4, labels=[ 1, 2, 3, 4 ]) #dummies = pd.get_dummies(combined['windspeed_cat'], prefix='wind') #combined = pd.concat([combined, dummies], axis=1) # For some reason the dataset didn't indicate new year's day and christmas # day as holidays. Therefore we also use this external libraryto check if # a day is a holiday cal = Maryland() holidays = cal.holidays(2011) holidays += cal.holidays(2012) holidays = set([dt for (dt, name) in holidays]) combined['holiday'] = combined['Date'].apply(lambda i: int(i in holidays)) validation['holiday'] = [ int(date.date() in holidays) for (date, hour) in validation.index ] #print(validation['holiday'].sum()) # Was it a holiday yesterday? combined['holiday_lag'] = combined['Date'].apply( lambda i: int(i - timedelta(days=1) in holidays) ) # Is it a holiday tomorrow? combined['holiday_lead'] = combined['Date'].apply(