Example #1
def makeClimatologies(stations, startDate, endDate, \
                      features, smoothWindow):
     # build a climatology for each feature: the mean of the variable on
     # each calendar day, averaged over all stations and all years in the
     # date range, then smoothed with a running window of width smoothWindow
     import wUUtils as Util
     import numpy as np

     # use a leap year so the climatology has one entry per calendar day,
     # including Feb 29
     leap_year = Util.dateList('2008-01-01','2008-12-31')
     date_list = Util.dateList(startDate,endDate)

     climatologies=[]

     featureData = []
     for feature in features:
          fd = Util.loadDailyVariableSetRange(stations,startDate,endDate,[feature])
          featureData.append(fd)

     for fd in featureData:
          climatology = []
          for day_of_month in leap_year:
               # mean of the variable on this calendar day, over all
               # stations and all years in the date range
               this_day = np.mean([stationData[i] for i in range(len(date_list)) \
                                                  for stationData in fd \
                                                  if  date_list[i].month==day_of_month.month \
                                                  and date_list[i].day==day_of_month.day])
               climatology.append(this_day)
          # smooth the climatology with a running window of width smoothWindow
          climatology = Util.smooth(climatology,smoothWindow)
          climatologies.append(climatology)
     return climatologies
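
For context, a minimal usage sketch (the station codes, feature names and date range below are illustrative assumptions, not values taken from the original code):

# hypothetical call: station codes, feature names and dates are assumptions
stations = ['KCQT', 'KLAX', 'KBUR']
features = ['TempMax', 'TempMin']
clims = makeClimatologies(stations, '2010-01-01', '2014-12-31',
                          features, smoothWindow=15)
# clims[i] is the smoothed calendar-day climatology for features[i],
# indexed by the days of the leap year 2008
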
Example #2
def pcaTaylorModel(stations, startDate, endDate, \
                   features, ncomp=None, targetVar='TempMax', \
                   lag=1, order=0, smooth_window=0, verbose=False):
     # build a regression model to predict targetVar for a single
     # station using training data from multiple stations
     # between startDate and endDate.
     #
     # The set of values of each feature at all stations is converted
     # to a truncated list of principal components for the purposes of
     # feature reduction and reduction of multicollinearity.
     #
     # Uses a "Taylor expansion" by combining information from
     # several days (higher-order time derivatives).
     #
     #      stations: a list of station codes; the first entry is
     #                the station for which the forecast is generated
     #      features: a list of variables to use as predictors
     #         ncomp: a list of the same length as features giving the
     #                number of principal components to keep per feature
     #                (None keeps all components)
     #     targetVar: the variable to predict at stations[0]
     #           lag: the number of days in the future to forecast
     #         order: the number of days in the past to include
     #                (also the maximum order of time derivative)
     # smooth_window: width of the smoothing window applied to the target
     #                and feature time series (0 = no smoothing)
     #       verbose: if True, print the individual regression coefficients
     import numpy as np
     import wUUtils as Util
     import wUPCA
     from sklearn import linear_model
     # load target variable data
     target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                        targetVar, castFloat=True)
     if smooth_window > 0:
          target = Util.smooth(target, smooth_window)
     # shift vector by lag
     target = target[lag:]
     # load features data and compute PC
     pcaData, transform_params = wUPCA.pcaConvert(stations, features, \
                                                  startDate, endDate, ncomp)
     # flatten pcaData into a single list of PC time series, shortened by lag
     featureData = [data[:(-lag)] for dataList in pcaData for data in dataList]
     if smooth_window > 0:
          # rebuild the list: reassigning the loop variable would not modify it
          featureData = [Util.smooth(data,smooth_window) for data in featureData]
     # number of PC-transformed features
     if ncomp is None:
          nfeat = len(stations)*len(features)
     else:
          nfeat = sum(ncomp) 
     # add in "derivative" terms
     for ideriv in range(1,order+1):
          for ii in range(nfeat):
               # print("Adding " + str(ideriv) + " derivative of " + feature[jfeat])
               fd = np.diff(featureData[ii],n=ideriv)
               featureData.append(fd)
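     # e.g. with order=2 the design matrix grows from nfeat columns to
     # 3*nfeat: the PC series themselves, their first differences, and
     # their second differences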
     # shorten vectors to length of highest order derivative
     nrows = len(featureData[-1])
     for column in range(len(featureData)):
          featureData[column] = featureData[column][-nrows:]
     target = target[-nrows:]

     # convert target and features to np arrays
     target = np.array(target)
     featureData = (np.array(featureData)).T

     # fit regression model
     regr = linear_model.LinearRegression()
     regr.fit(featureData, target)
     model_params = {
            'stations': stations, \
            'startDate': startDate, \
            'endDate': endDate, \
            'targetVar': targetVar, \
            'features': features, \
            'regr': regr, \
            'lag': lag, \
            'order': order, \
            'smooth_window': smooth_window, \
            'transform_params': transform_params}
     # report regression results:
     print("R^2: " + str(regr.score(featureData,target)))
     if verbose:
          print("Regression coefficients:")
          print("  intercept" + ":\t" + str(regr.intercept_))
          column = 0
          for ideriv in range(order+1):
               print("  " + str(ideriv) + "th derivative:")
               for ii, feature in enumerate(features):
                    print("    " + feature)
                    if ncomp is None:
                         nc = len(stations)
                    else:
                         nc = ncomp[ii]
                    for jj in range(nc):
                         print("      PC " + str(jj) + " :\t" + str(regr.coef_[column]))
                         column += 1
     return featureData, target, model_params
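
A hedged usage sketch: the function returns the PC design matrix, the lag-shifted target and a dict of model parameters, so in-sample predictions can be made with scikit-learn's regr.predict. The station codes, feature names, dates and ncomp values below are illustrative assumptions.

# hypothetical call: station codes, features, dates and ncomp are assumptions
import numpy as np
stations = ['KCQT', 'KLAX', 'KBUR', 'KSAN']
features = ['TempMax', 'TempMin', 'TotalPrecip']
X, y, model = pcaTaylorModel(stations, '2012-01-01', '2014-12-31',
                             features, ncomp=[3, 3, 2],
                             targetVar='TempMax', lag=1, order=1,
                             verbose=True)
# in-sample predictions and RMSE from the fitted scikit-learn model
regr = model['regr']
y_hat = regr.predict(X)
print("training RMSE: " + str(np.sqrt(np.mean((y_hat - y)**2))))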