def pcaTaylorModel(stations, startDate, endDate, \
                   features, ncomp=None, targetVar='TempMax', \
                   lag=1, order=0, smooth_window=0, verbose=False):
    # Build a linear regression model to predict targetVar for a single
    # station using training data from multiple stations between
    # startDate and endDate.
    #
    # The set of values of each feature at all stations is converted
    # to a truncated list of principal components for purposes of
    # feature-reduction and reduction of multicolinearity.
    #
    # Uses a "Taylor expansion" by combining information from
    # several days (higher order time derivatives).
    #
    # stations:      a list of station codes; the first entry is
    #                the station for which the forecast is generated
    # features:      a list of variables to use as predictors
    # ncomp:         a list of same length as features containing the
    #                number of principal components to keep per feature;
    #                None keeps len(stations) components per feature
    # targetVar:     name of the variable to predict
    # lag:           the number of days in the future to forecast
    # order:         the number of days in the past to include
    #                (also maximum order of time derivative)
    # smooth_window: if > 0, window length for smoothing target/features
    # verbose:       if True, print per-coefficient regression details
    #
    # returns: (featureData, target, model_params) where featureData is
    #          an (nrows, nfeatures) array, target an (nrows,) array and
    #          model_params a dict describing the fitted model.
    import numpy as np
    import wUUtils as Util
    import wUPCA
    from sklearn import preprocessing
    from sklearn import linear_model
    # NOTE: removed dev-time "reload(wUPCA)" -- reload() is a Python-2-only
    # builtin and would raise NameError on Python 3.
    # load target variable data
    target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                                         targetVar, castFloat=True)
    if smooth_window > 0:
        target = Util.smooth(target, smooth_window)
    # shift vector by lag
    target = target[lag:]
    # load features data and compute principal components
    pcaData, transform_params = wUPCA.pcaConvert(stations, features, \
                                                 startDate, endDate, ncomp)
    # flatten featureData into single list of lists, while shortening by lag
    featureData = [data[:(-lag)] for dataList in pcaData for data in dataList]
    if smooth_window > 0:
        # BUG FIX: the original loop rebound the loop variable
        # ("data = Util.smooth(data, smooth_window)"), which discarded the
        # smoothed series; rebuild the list so smoothing actually applies
        featureData = [Util.smooth(data, smooth_window) for data in featureData]
    # number of PC-transformed features
    if ncomp is None:
        nfeat = len(stations)*len(features)
    else:
        nfeat = sum(ncomp)
    # add in "derivative" terms (finite differences of each base feature)
    for ideriv in range(1, order+1):
        for ii in range(nfeat):
            fd = np.diff(featureData[ii], n=ideriv)
            featureData.append(fd)
    # shorten vectors to length of highest order derivative
    # (np.diff(n=k) shortens a series by k, so the last column is shortest)
    nrows = len(featureData[-1])
    for column in range(len(featureData)):
        featureData[column] = featureData[column][-nrows:]
    target = target[-nrows:]
    # convert target and features to np arrays (rows = days, cols = features)
    target = np.array(target)
    featureData = (np.array(featureData)).T
    # fit regression model
    regr = linear_model.LinearRegression()
    regr.fit(featureData, target)
    model_params = { 'stations': stations, \
                     'startDate': startDate, \
                     'endDate': endDate, \
                     'targetVar': targetVar, \
                     'features': features, \
                     'regr': regr, \
                     'lag': lag, \
                     'order': order, \
                     'smooth_window': smooth_window, \
                     'transform_params': transform_params}
    # report regression results:
    print("R^2: " + str(regr.score(featureData,target)))
    if verbose:
        print("Regression coefficients:")
        print(" intercept" + ":\t" + str(regr.intercept_))
        column = 0
        for ideriv in range(order+1):
            print(" " + str(ideriv) + "th derivative:")
            for ii, feature in enumerate(features):
                print(" " + feature)
                # number of PCs kept for this feature
                if ncomp is None:
                    nc = len(stations)
                else:
                    nc = ncomp[ii]
                for jj in range(nc):
                    print(" PC " + str(jj) + " :\t" + str(regr.coef_[column]))
                    column += 1
    return featureData, target, model_params
def pcaClusterModel(stations, startDate, endDate, \
                    features, ncomp=None, \
                    clusterVars=[], nclusters=1, \
                    targetVar='TempMax', \
                    lag=1, order=0, ranseed=666, verbose=False):
    # Build regression models to predict targetVar for a single
    # station using training data from multiple stations between
    # startDate and endDate.
    #
    # The set of values of each feature at all stations is converted
    # to a truncated list of principal components for purposes of
    # feature-reduction and reduction of multicolinearity.
    #
    # Clustering is used to train multiple models for different
    # partitions of the data.
    #
    # Uses a "Taylor expansion" by combining information from
    # several days (higher order time derivatives).
    #
    # stations:    a list of station codes; the first entry is
    #              the station for which the forecast is generated
    # features:    a list of variables to use as predictors
    # ncomp:       a list of same length as features containing the
    #              number of principal components to keep per feature;
    #              None keeps len(stations) components per feature
    # clusterVars: a list of pairs of form ('feature', npc), where
    #              npc is the index of the PC to use for clustering
    # nclusters:   number of clusters (and regression models) to fit
    # targetVar:   name of the variable to predict
    # lag:         the number of days in the future to forecast
    # order:       the number of days in the past to include
    #              (also maximum order of time derivative)
    # ranseed:     random seed passed to the clustering routine
    # verbose:     if True, print per-coefficient regression details
    #
    # returns: (featureData, target, modelParams); returns None if a
    #          requested cluster variable index is out of range.
    import numpy as np
    import wUUtils as Util
    import wUPCA
    import wUCluster as Clust
    from sklearn import preprocessing
    from sklearn import linear_model
    # load target variable data
    target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                                         targetVar, castFloat=True)
    # shift vector by lag
    target = target[lag:]
    # load features data and compute principal components
    pcaData, transformParams = wUPCA.pcaConvert(stations, features, \
                                                startDate, endDate, ncomp)
    # flatten featureData into single list of lists, while shortening by lag
    featureData = [data[:(-lag)] for dataList in pcaData for data in dataList]
    # per-feature count of retained PCs; ROBUSTNESS FIX: the original
    # indexed ncomp directly, crashing with TypeError when ncomp is None
    # (the documented default) and clusterVars was non-empty
    if ncomp is None:
        ncList = [len(stations)]*len(features)
    else:
        ncList = ncomp
    # total number of PC-transformed features
    nfeat = sum(ncList)
    # add in "derivative" terms (finite differences of each base feature)
    for ideriv in range(1, order+1):
        for ii in range(nfeat):
            fd = np.diff(featureData[ii], n=ideriv)
            featureData.append(fd)
    # shorten vectors to length of highest order derivative
    nrows = len(featureData[-1])
    for column in range(len(featureData)):
        featureData[column] = featureData[column][-nrows:]
    target = target[-nrows:]
    # apply clustering
    # locate columns to be used for clustering
    cols = []
    for clusterPair in clusterVars:
        ifeat = features.index(clusterPair[0])  # index of feature
        col = sum(ncList[:ifeat]) + clusterPair[1]
        cols += [col]
        if clusterPair[1] >= ncList[ifeat]:
            print('Requested cluster variable out of range')
            print(clusterPair[0] + ' ' + str(clusterPair[1]) \
                  + ' >= ' + str(ncList[ifeat]))
            return
    print('columns for clustering: ' + str(cols))
    clusterData = np.array([featureData[ii] for ii in cols]).T
    scaler, clusterer = Clust.computeClusters(clusterData, nclusters, ranseed)
    classes = Clust.assignClusters(scaler, clusterer, clusterData)
    clusterParams = { \
        'scaler': scaler, \
        'clusterer': clusterer, \
        'nclusters': nclusters, \
        'ranseed': ranseed, \
        'cols': cols }
    # separate data into clusters
    featureClusters = []
    targetClusters = []
    for icl in range(nclusters):
        # features: keep only the rows assigned to cluster icl,
        # then transpose back to column-major lists
        clust = [f for i,f in enumerate(zip(*featureData)) if classes[i]==icl]
        # BUG FIX: on Python 3, map() returns an iterator; np.array() on it
        # later produced a useless 0-d object array -- materialize as a list
        featureClusters.append( list(map(list, zip(*clust))) )
        # targetVar
        clust = [t for i,t in enumerate(target) if classes[i]==icl]
        targetClusters.append(clust)
    # train separate regression model for each cluster
    regrs = []
    for icl in range(nclusters):
        # convert features and target to arrays (rows = days, cols = features)
        featureClusters[icl] = (np.array(featureClusters[icl])).T
        targetClusters[icl] = np.array(targetClusters[icl])
        regr = linear_model.LinearRegression()
        regr.fit(featureClusters[icl], targetClusters[icl])
        regrs.append(regr)
        print('Cluster %d, nrows %d, R^2 %f' \
              % (icl, \
                 len(targetClusters[icl]), \
                 regr.score(featureClusters[icl],targetClusters[icl])) )
        if verbose:
            print("\nCluster " + str(icl))
            print("Regression coefficients:")
            print(" intercept" + ":\t" + str(regr.intercept_))
            column = 0
            for ideriv in range(order+1):
                print(" " + str(ideriv) + "th derivative:")
                for ii, feature in enumerate(features):
                    print(" " + feature)
                    # ROBUSTNESS FIX: use ncList so verbose output also
                    # works when ncomp is None
                    for jj in range(ncList[ii]):
                        print(" PC " + str(jj) + " :\t" + str(regr.coef_[column]))
                        column += 1
    modelParams = { 'stations': stations, \
                    'startDate': startDate, \
                    'endDate': endDate, \
                    'targetVar': targetVar, \
                    'features': features, \
                    'clusterVars': clusterVars, \
                    'clusterParams': clusterParams, \
                    'classes': classes, \
                    'regrs': regrs, \
                    'lag': lag, \
                    'order': order, \
                    'transformParams': transformParams}
    return featureData, target, modelParams