def makeClimatologies(stations, startDate, endDate,
                      features, smoothWindow):
    """Build a smoothed calendar-day climatology for each feature.

    For every feature, the values of all loaded station series on a given
    calendar day (month/day) are averaged over every occurrence of that day
    between startDate and endDate; the resulting sequence is then passed
    through Util.smooth.

    stations:     station codes handed to Util.loadDailyVariableSetRange
    startDate:    first date of the averaging period
    endDate:      last date of the averaging period
    features:     list of variable names; one climatology is built per entry
    smoothWindow: window argument forwarded unchanged to Util.smooth

    Returns a list holding one smoothed climatology per entry of features,
    in the same order. Before smoothing, each climatology has one value per
    date returned by Util.dateList('2008-01-01', '2008-12-31').
    """
    import wUUtils as Util
    import numpy as np

    # a leap year serves as the calendar template so that Feb 29 is included
    calendar_days = Util.dateList('2008-01-01', '2008-12-31')
    date_list = Util.dateList(startDate, endDate)

    # Bucket the positions of date_list by (month, day) once, instead of
    # rescanning the whole date range for every calendar day and feature.
    # Indices stay in ascending order, so the values reach np.mean in the
    # same order as a full scan would produce.
    indices_by_day = {}
    for idx, date in enumerate(date_list):
        indices_by_day.setdefault((date.month, date.day), []).append(idx)

    # load everything first; each entry is iterated as a collection of
    # per-station series indexed by position in date_list
    featureData = [
        Util.loadDailyVariableSetRange(stations, startDate, endDate, [feature])
        for feature in features
    ]

    climatologies = []
    for fd in featureData:
        climatology = []
        for day in calendar_days:
            matches = indices_by_day.get((day.month, day.day), ())
            # A calendar day that never occurs in the requested range
            # (e.g. Feb 29 when no leap year is covered) gives an empty
            # selection, for which np.mean returns NaN with a RuntimeWarning.
            values = [series[idx] for idx in matches for series in fd]
            climatology.append(np.mean(values))
        climatologies.append(Util.smooth(climatology, smoothWindow))
    return climatologies
def pcaTaylorModel(stations, startDate, endDate,
                   features, ncomp=None, targetVar='TempMax',
                   lag=1, order=0, smooth_window=0, verbose=False):
    """Fit a linear regression on PCA-reduced, time-differenced features.

    Builds a regression model to predict targetVar for a single station
    using training data from multiple stations between startDate and
    endDate.

    The set of values of each feature at all stations is converted to a
    truncated list of principal components for purposes of feature
    reduction and reduction of multicollinearity.

    Uses a "Taylor expansion" by combining information from several days
    (higher order time differences of each principal component).

    stations:      a list of station codes; the first entry is the station
                   for which the forecast is generated
    startDate:     first date of the training period
    endDate:       last date of the training period
    features:      a list of variables to use as predictors
    ncomp:         a list of the same length as features containing the
                   number of principal components to keep for each feature;
                   None is passed through to wUPCA.pcaConvert, and
                   len(stations) components per feature are then assumed
    targetVar:     name of the variable to predict
    lag:           the number of days in the future to forecast
    order:         the number of days in the past to include (also the
                   maximum order of time difference)
    smooth_window: if > 0, target and features are passed through
                   Util.smooth with this window
    verbose:       if True, print the intercept and every coefficient

    Returns (featureData, target, model_params): the design matrix as a
    2-D numpy array with one column per predictor, the aligned target as a
    numpy array, and a dict holding the fitted regressor together with the
    settings used to build it.
    """
    import numpy as np
    import wUUtils as Util
    import wUPCA
    from sklearn import linear_model

    # reload is a builtin only on Python 2; Python 3 provides it in importlib.
    # A separate local name is used so the builtin stays reachable on Python 2.
    try:
        from importlib import reload as _reload
    except ImportError:
        _reload = reload  # noqa: F821 -- Python 2 builtin
    _reload(wUPCA)

    # load target variable data for the forecast station
    target = Util.loadDailyVariableRange(stations[0], startDate, endDate,
                                         targetVar, castFloat=True)
    if smooth_window > 0:
        target = Util.smooth(target, smooth_window)
    # shift vector by lag so that target[k] is paired with features[k]
    target = target[lag:]

    # load features data and compute principal components
    pcaData, transform_params = wUPCA.pcaConvert(stations, features,
                                                 startDate, endDate, ncomp)

    # Flatten pcaData into a single list of series while dropping the last
    # `lag` entries. An explicit end index is used because data[:-lag] would
    # be empty for lag == 0; max(..., 0) keeps the result empty when lag
    # exceeds the series length, exactly as data[:-lag] does.
    featureData = [data[:max(len(data) - lag, 0)]
                   for dataList in pcaData
                   for data in dataList]

    if smooth_window > 0:
        # Store the smoothed series back; rebinding a loop variable alone
        # would discard the result of Util.smooth.
        # NOTE(review): confirm that the prediction code applies the same
        # smoothing to its features.
        featureData = [Util.smooth(data, smooth_window)
                       for data in featureData]

    # number of PC-transformed features
    if ncomp is None:
        nfeat = len(stations) * len(features)
    else:
        nfeat = sum(ncomp)

    # add in "derivative" terms: the ideriv-th discrete difference of each of
    # the first nfeat (undifferenced) series
    for ideriv in range(1, order + 1):
        for ii in range(nfeat):
            featureData.append(np.diff(featureData[ii], n=ideriv))

    # Each difference shortens a series by one element, so keep only the
    # trailing nrows entries of every series to align them on the most
    # recent days (the last series appended is the shortest).
    nrows = len(featureData[-1])
    featureData = [series[-nrows:] for series in featureData]
    target = target[-nrows:]

    # convert target and features to np arrays; transpose so that rows are
    # samples and columns are predictors
    target = np.array(target)
    featureData = (np.array(featureData)).T

    # fit regression model
    regr = linear_model.LinearRegression()
    regr.fit(featureData, target)

    model_params = {
        'stations': stations,
        'startDate': startDate,
        'endDate': endDate,
        'targetVar': targetVar,
        'features': features,
        'regr': regr,
        'lag': lag,
        'order': order,
        'smooth_window': smooth_window,
        'transform_params': transform_params}

    # report regression results:
    print("R^2: " + str(regr.score(featureData, target)))
    if verbose:
        print("Regression coefficients:")
        print(" intercept" + ":\t" + str(regr.intercept_))
        # coefficients are stored in the order the columns were built:
        # derivative order, then feature, then principal component
        column = 0
        for ideriv in range(order + 1):
            print(" " + str(ideriv) + "th derivative:")
            for ii, feature in enumerate(features):
                print(" " + feature)
                if ncomp is None:
                    nc = len(stations)
                else:
                    nc = ncomp[ii]
                for jj in range(nc):
                    print(" PC " + str(jj) + " :\t" + str(regr.coef_[column]))
                    column += 1

    return featureData, target, model_params