예제 #1
0
def clusterRegression(stations, startDate, endDate,
                      features, clusterFeatures=None,
                      nclusters=1, ranseed=666,
                      targetVar='TempMax',
                      lag=1, order=0, scale=False, verbose=False):
    """Train one linear regression per cluster to forecast a variable.

    The model predicts ``targetVar`` at the first station in ``stations``
    from feature data of all listed stations between ``startDate`` and
    ``endDate``.  Features are paired with the target value ``lag`` days
    later, and finite differences of the features up to ``order`` are added
    as extra columns (a "Taylor expansion" in time).  A summary line with the
    training R^2 is printed for every cluster.

    Args:
        stations: list of station codes; the first entry is the station for
            which the forecast is generated.
        startDate, endDate: date range, passed unchanged to
            ``wUUtils.loadDailyVariableRange``.
        features: list of variable names to use as predictors.  A name
            containing ":" is parsed as an interaction between the first two
            colon-separated variables; both factors are standardized before
            they are multiplied.
        clusterFeatures: subset of ``features`` handed to
            ``wUCluster.clusterFeatureData`` (k-means, per the original
            notes) before training.  ``None`` trains a single model on all
            rows.
        nclusters: number of clusters to compute.  Treated as 1 when
            ``clusterFeatures`` is ``None``.
        ranseed: seed forwarded to ``wUCluster.clusterFeatureData``.
        targetVar: name of the variable to predict.
        lag: number of days in the future to forecast.
        order: number of days in the past to include (also the maximum
            order of the time difference).
        scale: if true, standardize each cluster's feature matrix with a
            ``StandardScaler`` before fitting.
        verbose: if true, print the intercept and every coefficient.

    Returns:
        Tuple ``(featureData, target, modelParams)``: the list of feature
        columns (one per station/feature pair, followed by the difference
        columns) trimmed to a common length, the target trimmed the same
        way, and a dict with the fitted models and every setting that
        ``clusterRegressionPredict`` reads back.
    """
    import wUCluster as Clust
    # `reload` is a builtin only on Python 2; fall back to importlib on 3.
    try:
        reloadModule = reload
    except NameError:
        from importlib import reload as reloadModule
    reloadModule(Clust)
    import numpy as np
    import wUUtils as Util
    from sklearn import preprocessing
    from sklearn import linear_model

    def dropLastLag(series):
        # series[:-0] would be empty, so leave the series whole for lag == 0
        return series[:-lag] if lag else series

    def standardize(series):
        # StandardScaler expects a 2-D (n_samples, n_features) array, so fit
        # on a single column and flatten the transformed result again.
        column = np.asarray(series).reshape(-1, 1)
        prescaler = preprocessing.StandardScaler().fit(column)
        return prescaler, prescaler.transform(column).ravel()

    # load target variable data and shift it by lag
    target = Util.loadDailyVariableRange(stations[0], startDate, endDate,
                                         targetVar, castFloat=True)
    target = target[lag:]

    # load feature data, one column per (station, feature) pair
    featureData = []
    prescalers = []
    for station in stations:
        for feature in features:
            if ':' in feature:
                # interaction term: product of the two standardized factors
                parts = feature.split(':')
                fd1 = Util.loadDailyVariableRange(station, startDate, endDate,
                                                  parts[0], castFloat=True)
                fd2 = Util.loadDailyVariableRange(station, startDate, endDate,
                                                  parts[1], castFloat=True)
                prescaler1, fd1 = standardize(fd1)
                prescaler2, fd2 = standardize(fd2)
                # save prescaler objects (needed again for prediction)
                prescalers.append([prescaler1, prescaler2])
                fd = (np.array(fd1) * np.array(fd2)).tolist()
            else:
                fd = Util.loadDailyVariableRange(station, startDate, endDate,
                                                 feature, castFloat=True)
                prescalers.append(None)
            # shorten vector by lag so row i lines up with target day i + lag
            featureData.append(dropLastLag(fd))

    # add in "derivative" terms: n-th differences of the base columns only
    ncols = len(stations) * len(features)
    for ideriv in range(1, order + 1):
        for ii in range(ncols):
            featureData.append(np.diff(featureData[ii], n=ideriv))

    # shorten all vectors to the length of the highest order derivative
    nrows = len(featureData[-1])
    featureData = [column[-nrows:] for column in featureData]
    target = target[-nrows:]

    if clusterFeatures is not None:
        # apply clustering and split features and target by cluster
        classes, clusterParams = Clust.clusterFeatureData(
            featureData, stations, features,
            clusterFeatures, nclusters, ranseed)
        classes, featureClusters = Clust.assignClustersAllFeatures(
            featureData, clusterParams)
        targetClusters = []
        for cl in range(nclusters):
            targetClusters.append(
                [t for i, t in enumerate(target) if classes[i] == cl])
    else:
        # everything is one cluster; force nclusters to match so the
        # training loop below does not index past the single entry
        nclusters = 1
        # NOTE(review): this is a running row index, not a constant cluster
        # label -- confirm what consumers of 'classes' expect.
        classes = range(len(target))
        featureClusters = [featureData]
        targetClusters = [target]
        clusterParams = {'nclusters': 1}

    # train a separate regression model for each cluster
    regrs = []
    scalers = []
    for icl in range(nclusters):
        # rows = days, columns = features
        featureClusters[icl] = (np.array(featureClusters[icl])).T
        targetClusters[icl] = np.array(targetClusters[icl])

        scaler = None
        if scale:
            scaler = preprocessing.StandardScaler().fit(featureClusters[icl])
            featureClusters[icl] = scaler.transform(featureClusters[icl])
        scalers.append(scaler)

        regr = linear_model.LinearRegression()
        regr.fit(featureClusters[icl], targetClusters[icl])
        regrs.append(regr)
        print('Cluster %d, nrows %d, R^2 %f'
              % (icl,
                 len(targetClusters[icl]),
                 regr.score(featureClusters[icl], targetClusters[icl])))
        if verbose:
            print("Regression coefficients:")
            print("  intercept" + ":\t" + str(regr.intercept_))
            # coefficients follow the column order built above:
            # derivative order, then station, then feature
            column = 0
            for ideriv in range(order + 1):
                print("  " + str(ideriv) + "th derivative:")
                for station in stations:
                    print("    Station: " + station)
                    for feature in features:
                        print("       " + feature + ":\t"
                              + str(regr.coef_[column]))
                        column += 1

    # save model parameters
    modelParams = {
        'stations': stations,
        'startDate': startDate,
        'endDate': endDate,
        'targetVar': targetVar,
        'features': features,
        'regrs': regrs,
        'clusterParams': clusterParams,
        'classes': classes,
        'lag': lag,
        'order': order,
        'scale': scale,
        'scalers': scalers,
        'prescalers': prescalers,
    }
    return featureData, target, modelParams
예제 #2
0
def clusterRegressionPredict(modelParams, startDate, endDate, actual=True):
    """Predict the target variable with a previously trained cluster model.

    Feature columns are rebuilt for ``startDate``..``endDate`` exactly as in
    training (interaction terms, lag shift, difference columns), rows are
    assigned to clusters when the model has more than one, and each cluster's
    regression produces the predictions for its rows.

    Args:
        modelParams: dict returned by ``clusterRegression``; the keys
            'stations', 'targetVar', 'features', 'regrs', 'clusterParams',
            'lag', 'order', 'scale', 'prescalers' and 'scalers' are read.
        startDate, endDate: date range, passed unchanged to
            ``wUUtils.dateList`` and ``wUUtils.loadDailyVariableRange``.
        actual: if true, the observed target values are loaded as well and
            per-cluster R^2 / RMSE plus the overall RMSE are printed and
            returned.

    Returns:
        Tuple ``(date_list, pred, target, featureData, classes, modelPerf)``:
        the dates that have a prediction, the predictions ordered by date,
        the observed target as an array (``None`` when ``actual`` is false),
        the feature columns, the cluster assignment of each row, and a dict
        with 'RMSE' and 'R2' (one entry per cluster) and 'RMSE_total'
        (``None`` when ``actual`` is false).
    """
    import wUCluster as Clust
    # `reload` is a builtin only on Python 2; fall back to importlib on 3.
    try:
        reloadModule = reload
    except NameError:
        from importlib import reload as reloadModule
    reloadModule(Clust)
    import numpy as np
    import wUUtils as Util

    # extract city and feature data
    stations = modelParams['stations']
    targetVar = modelParams['targetVar']
    features = modelParams['features']
    regrs = modelParams['regrs']
    clusterParams = modelParams['clusterParams']
    nclusters = clusterParams['nclusters']
    lag = modelParams['lag']
    order = modelParams['order']
    scale = modelParams['scale']
    prescalers = modelParams['prescalers']
    scalers = modelParams['scalers']

    def dropLastLag(series):
        # series[:-0] would be empty, so leave the series whole for lag == 0
        return series[:-lag] if lag else series

    def applyPrescaler(prescaler, series):
        # StandardScaler expects a 2-D (n_samples, n_features) array, so
        # transform a single column and flatten the result again.
        column = np.asarray(series).reshape(-1, 1)
        return prescaler.transform(column).ravel()

    # build list of dates; the first lag + order days get no prediction
    date_list = Util.dateList(startDate, endDate)
    date_list = date_list[(lag + order):]

    if actual:
        # load target variable data and shift it by lag
        target = Util.loadDailyVariableRange(stations[0], startDate, endDate,
                                             targetVar, castFloat=True)
        target = np.array(target[lag:])
    else:
        target = None

    # load feature data, one column per (station, feature) pair
    featureData = []
    idata = 0  # index into prescalers, same ordering as in training
    for station in stations:
        for feature in features:
            if ':' in feature:
                # interaction term: rescale both factors with the scalers
                # fitted during training, then multiply
                parts = feature.split(':')
                fd1 = Util.loadDailyVariableRange(station, startDate, endDate,
                                                  parts[0], castFloat=True)
                fd2 = Util.loadDailyVariableRange(station, startDate, endDate,
                                                  parts[1], castFloat=True)
                prescaler1, prescaler2 = prescalers[idata]
                fd1 = applyPrescaler(prescaler1, fd1)
                fd2 = applyPrescaler(prescaler2, fd2)
                fd = (np.array(fd1) * np.array(fd2)).tolist()
            else:
                fd = Util.loadDailyVariableRange(station, startDate, endDate,
                                                 feature, castFloat=True)
            # shorten vector by lag
            featureData.append(dropLastLag(fd))
            idata += 1

    # add in "derivative" terms: n-th differences of the base columns only
    ncols = len(stations) * len(features)
    for ideriv in range(1, order + 1):
        for ii in range(ncols):
            featureData.append(np.diff(featureData[ii], n=ideriv))

    # shorten all vectors to the length of the highest order derivative
    nrows = len(featureData[-1])
    featureData = [column[-nrows:] for column in featureData]
    if actual:
        target = target[-nrows:]

    # allocate features to clusters
    if clusterParams['nclusters'] > 1:
        classes, featureClusters = Clust.assignClustersAllFeatures(
            featureData, clusterParams)
        dateClusters = []
        for icl in range(nclusters):
            dateClusters.append(
                [d for i, d in enumerate(date_list) if classes[i] == icl])
        if actual:
            targetClusters = []
            for icl in range(nclusters):
                targetClusters.append(
                    [t for i, t in enumerate(target) if classes[i] == icl])
    else:
        # everything is one cluster; sized from nrows because target is
        # None when actual is false
        # NOTE(review): this is a running row index, not a constant cluster
        # label -- confirm what consumers of 'classes' expect.
        classes = range(nrows)
        featureClusters = [featureData]
        dateClusters = [date_list]
        if actual:
            targetClusters = [target]

    preds = []
    RMSE = []
    R2 = []
    for icl in range(nclusters):
        # rows = days, columns = features
        featureClusters[icl] = (np.array(featureClusters[icl])).T

        if scale:
            featureClusters[icl] = scalers[icl].transform(featureClusters[icl])

        regr = regrs[icl]
        preds.append(regr.predict(featureClusters[icl]))
        if actual:
            targetClusters[icl] = np.array(targetClusters[icl])
            print('Cluster %d, %d rows:' % (icl, len(dateClusters[icl])))
            r2 = regr.score(featureClusters[icl], targetClusters[icl])
            print('  R^2_mean:' + '\t' + str(r2))
            rmse = np.sqrt(((preds[icl] - targetClusters[icl])**2).mean())
            print('  RMSE:\t' + '\t' + str(rmse))
            RMSE.append(rmse)
            R2.append(r2)

    # assemble the per-cluster predictions into one list ordered by date
    date_list_mixed = np.concatenate(dateClusters).tolist()
    pred_mixed = np.concatenate(preds).tolist()
    pred = [pr for (d, pr) in sorted(zip(date_list_mixed, pred_mixed))]

    if actual:
        rmse = np.sqrt(((np.array(pred) - np.array(target))**2).mean())
        print('\nOverall performance:')
        print('  RMSE:' + '\t' + str(rmse))

        modelPerf = {'RMSE': RMSE, 'R2': R2, 'RMSE_total': rmse}
    else:
        modelPerf = None

    return date_list, pred, target, featureData, classes, modelPerf