import copy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# NOTE: the module paths below are assumptions based on the tslib package layout;
# adjust them to wherever tsUtils and the models actually live. The synthetic data
# generators (harmonicDataTest, trendDataTest, armaDataTest) are assumed to be
# defined elsewhere in this script or imported from the package's test helpers.
from tslib.src import tsUtils
from tslib.src.models.tsSVDModel import SVDModel
from tslib.src.models.tsALSModel import ALSModel
from tslib.src.synthcontrol.syntheticControl import RobustSyntheticControl


def testSingleTS():
    print("------------------- Test # 1 (Single TS). ------------------------")
    p = 0.7
    N = 50
    M = 400
    timeSteps = N * M

    # train/test split
    trainProp = 0.9
    M1 = int(trainProp * M)
    M2 = M - M1
    trainPoints = N * M1
    testPoints = N * M2

    print("Generating data...")
    harmonicsTS = harmonicDataTest(timeSteps)
    trendTS = trendDataTest(timeSteps)
    (armaTS, armaMeanTS) = armaDataTest(timeSteps)

    meanTS = harmonicsTS + trendTS + armaMeanTS
    combinedTS = harmonicsTS + trendTS + armaTS

    # normalize the values to all lie within [-1, 1] -- helps with RMSE comparisons
    # can use the tsUtils.unnormalize() function to convert everything back to the
    # original range at the end, if needed
    max1 = np.nanmax(combinedTS)
    min1 = np.nanmin(combinedTS)
    max2 = np.nanmax(meanTS)
    min2 = np.nanmin(meanTS)
    maxVal = np.max([max1, max2])  # renamed from 'max'/'min' to avoid shadowing builtins
    minVal = np.min([min1, min2])

    combinedTS = tsUtils.normalize(combinedTS, maxVal, minVal)
    meanTS = tsUtils.normalize(meanTS, maxVal, minVal)

    # produce timestamps
    timestamps = np.arange('2017-09-10 20:30:00', timeSteps, dtype='datetime64[1m]')  # arbitrary start date

    # split the data
    trainDataMaster = combinedTS[0:trainPoints]  # the true realized values, for comparisons later
    meanTrainData = meanTS[0:trainPoints]  # only needed for various statistical comparisons later

    # randomly hide training data: choose between randomly hiding entries or
    # randomly hiding consecutive entries
    (trainData, pObservation) = tsUtils.randomlyHideValues(copy.deepcopy(trainDataMaster), p)

    # now further hide consecutive entries for a very small fraction of entries
    # in the eventual training matrix
    (trainData, pObservation) = tsUtils.randomlyHideConsecutiveEntries(copy.deepcopy(trainData), 0.9, int(M1 * 0.25), M1)

    # interpolating NaNs with linear interpolation
    # trainData = tsUtils.nanInterpolateHelper(trainData)

    # test data and hidden truth
    testData = combinedTS[-1 * testPoints:]
    meanTestData = meanTS[-1 * testPoints:]  # only needed for various statistical comparisons

    # time stamps
    trainTimestamps = timestamps[0:trainPoints]
    testTimestamps = timestamps[-1 * testPoints:]

    # once we have interpolated, pObservation should be set back to 1.0
    pObservation = 1.0

    # create pandas DataFrames
    key1 = 't1'
    trainMasterDF = pd.DataFrame(index=trainTimestamps, data={key1: trainDataMaster})  # needed for reference later
    trainDF = pd.DataFrame(index=trainTimestamps, data={key1: trainData})
    meanTrainDF = pd.DataFrame(index=trainTimestamps, data={key1: meanTrainData})

    testDF = pd.DataFrame(index=testTimestamps, data={key1: testData})
    meanTestDF = pd.DataFrame(index=testTimestamps, data={key1: meanTestData})

    # train the model
    print("Training the model (imputing)...")
    nbrSingValuesToKeep = 5
    mod = SVDModel(key1, nbrSingValuesToKeep, N, M1, probObservation=pObservation,
                   svdMethod='numpy', otherSeriesKeysArray=[], includePastDataOnly=True)

    # uncomment below to run the ALS algorithm; comment out the SVDModel line above
    # (the original flattened code had an unreachable ALS section behind a stray
    # 'return', which skipped the forecasting step entirely; fixed here)
    # mod = ALSModel(key1, nbrSingValuesToKeep, N, M1, probObservation=pObservation,
    #                otherSeriesKeysArray=[], includePastDataOnly=True)

    mod.fit(trainDF)

    # imputed + denoised data
    imputedDf = mod.denoisedDF()

    print(" RMSE (training imputation vs mean) = %f" % tsUtils.rmse(meanTrainDF[key1].values, imputedDf[key1].values))
print(" RMSE (training imputation vs obs) = %f" % tsUtils.rmse(trainMasterDF[key1].values, imputedDf[key1].values)) print("Forecasting (#points = %d)..." % len(testDF)) # test data is used for point-predictions forecastArray = [] for i in range(0, len(testDF)): pastPoints = np.zeros(N - 1) # need an N-1 length vector of past point j = 0 if (i < N - 1): # the first prediction uses the end of the training data while (j < N - 1 - i): pastPoints[j] = trainMasterDF[key1].values[len(trainDF) - (N - 1 - i) + j] j += 1 if (j < N - 1): # use the new test data pastPoints[j:] = testDF[key1].values[i - (N - 1) + j:i] keyToSeriesDFNew = pd.DataFrame(data={key1: pastPoints}) prediction = mod.predict(pd.DataFrame(data={}), keyToSeriesDFNew, bypassChecks=False) forecastArray.append(prediction) print(" RMSE (prediction vs mean) = %f" % tsUtils.rmse(meanTestDF[key1].values, forecastArray)) print(" RMSE (prediction vs obs) = %f" % tsUtils.rmse(testDF[key1].values, forecastArray)) print("Plotting...") plt.plot(np.concatenate((trainMasterDF[key1].values, testDF[key1].values), axis=0), color='gray', label='Observed') plt.plot(np.concatenate( (meanTrainDF[key1].values, meanTestDF[key1].values), axis=0), color='red', label='True Means') plt.plot(np.concatenate((imputedDf[key1].values, forecastArray), axis=0), color='blue', label='Forecasts') plt.axvline(x=len(trainDF), linewidth=1, color='black', label='Training End') legend = plt.legend(loc='upper left', shadow=True) plt.title('Single Time Series (ARMA + Periodic + Trend) - $p = %.2f$' % p) plt.show()
def basque(filename):
    # BASQUE COUNTRY STUDY
    df = pd.read_csv(filename)
    pivot = df.pivot_table(values='gdpcap', index='regionname', columns='year')
    pivot = pivot.drop('Spain (Espana)')
    dfBasque = pd.DataFrame(pivot.to_records())

    allColumns = dfBasque.columns.values

    states = list(np.unique(dfBasque['regionname']))
    years = np.delete(allColumns, [0])
    basqueKey = 'Basque Country (Pais Vasco)'
    states.remove(basqueKey)
    otherStates = states

    yearStart = 1955
    yearTrainEnd = 1971
    yearTestEnd = 1998

    singvals = 1
    p = 0.8

    trainingYears = []
    for i in range(yearStart, yearTrainEnd, 1):
        trainingYears.append(str(i))

    testYears = []
    for i in range(yearTrainEnd, yearTestEnd, 1):
        testYears.append(str(i))

    trainDataMasterDict = {}
    trainDataDict = {}
    testDataDict = {}
    for key in otherStates:
        series = dfBasque[dfBasque['regionname'] == key]

        trainDataMasterDict.update({key: series[trainingYears].values[0]})

        # randomly hide training data
        (trainData, pObservation) = tsUtils.randomlyHideValues(copy.deepcopy(trainDataMasterDict[key]), p)
        trainDataDict.update({key: trainData})
        testDataDict.update({key: series[testYears].values[0]})

    series = dfBasque[dfBasque['regionname'] == basqueKey]
    trainDataMasterDict.update({basqueKey: series[trainingYears].values[0]})
    trainDataDict.update({basqueKey: series[trainingYears].values[0]})
    testDataDict.update({basqueKey: series[testYears].values[0]})

    trainMasterDF = pd.DataFrame(data=trainDataMasterDict)
    trainDF = pd.DataFrame(data=trainDataDict)
    testDF = pd.DataFrame(data=testDataDict)

    # model
    rscModel = RobustSyntheticControl(basqueKey, singvals, len(trainDF), probObservation=1.0,
                                      modelType='als', otherSeriesKeysArray=otherStates)

    # fit the model
    rscModel.fit(trainDF)

    # save the denoised training data
    denoisedDF = rscModel.model.denoisedDF()

    # predict - all at once
    predictions = rscModel.predict(testDF)

    # plot
    yearsToPlot = range(yearStart, yearTestEnd, 1)
    interventionYear = yearTrainEnd - 1
    plt.plot(yearsToPlot, np.append(trainMasterDF[basqueKey], testDF[basqueKey], axis=0), color='red', label='observations')
    plt.plot(yearsToPlot, np.append(denoisedDF[basqueKey], predictions, axis=0), color='blue', label='predictions')
    plt.axvline(x=interventionYear, linewidth=1, color='black', label='Intervention')
    # plt.ylim((-1, 0))
    legend = plt.legend(loc='upper right', shadow=True)
    plt.title('Abadie et al. Basque Country Case Study - $p = %.2f$' % p)
    plt.show()
def testMultipleTS():
    print("------------------- Test # 2 (Multiple TS). ------------------------")
    p = 1.0
    N = 50
    M = 400
    timeSteps = N * M

    # train/test split
    trainProp = 0.7
    M1 = int(trainProp * M)
    M2 = M - M1
    trainPoints = N * M1
    testPoints = N * M2

    key1 = 't1'
    key2 = 't2'
    key3 = 't3'
    otherkeys = [key2, key3]

    includePastDataOnly = True

    print("Generating data...")
    harmonicsTS = harmonicDataTest(timeSteps)
    trendTS = trendDataTest(timeSteps)
    (armaTS, armaMeanTS) = armaDataTest(timeSteps)

    meanTS = harmonicsTS + trendTS + armaMeanTS
    combinedTS = harmonicsTS + trendTS + armaTS
    combinedTS2 = (0.3 * combinedTS) + np.random.normal(0.0, 0.5, len(combinedTS))
    combinedTS3 = (-0.4 * combinedTS)

    # normalize the values to all lie within [-1, 1] -- helps with RMSE comparisons
    # can use the tsUtils.unnormalize() function to convert everything back to the
    # original range at the end, if needed
    max1 = np.nanmax([combinedTS, combinedTS2, combinedTS3])
    min1 = np.nanmin([combinedTS, combinedTS2, combinedTS3])
    max2 = np.nanmax(meanTS)
    min2 = np.nanmin(meanTS)
    maxVal = np.max([max1, max2])  # renamed from 'max'/'min' to avoid shadowing builtins
    minVal = np.min([min1, min2])

    combinedTS = tsUtils.normalize(combinedTS, maxVal, minVal)
    combinedTS2 = tsUtils.normalize(combinedTS2, maxVal, minVal)
    combinedTS3 = tsUtils.normalize(combinedTS3, maxVal, minVal)
    meanTS = tsUtils.normalize(meanTS, maxVal, minVal)

    # produce timestamps
    timestamps = np.arange('2017-09-10 20:30:00', timeSteps, dtype='datetime64[1m]')  # arbitrary start date

    # split the data
    trainDataMaster = combinedTS[0:trainPoints]  # the true realized values, for comparisons later
    trainDataMaster2 = combinedTS2[0:trainPoints]
    trainDataMaster3 = combinedTS3[0:trainPoints]
    meanTrainData = meanTS[0:trainPoints]  # only needed for various statistical comparisons later

    # randomly hide training data
    (trainData, pObservation) = tsUtils.randomlyHideValues(copy.deepcopy(trainDataMaster), p)
    (trainData2, pObservation) = tsUtils.randomlyHideValues(copy.deepcopy(trainDataMaster2), p)
    (trainData3, pObservation) = tsUtils.randomlyHideValues(copy.deepcopy(trainDataMaster3), p)

    # now further hide consecutive entries for a very small fraction of entries
    # in the eventual training matrix
    (trainData, pObservation) = tsUtils.randomlyHideConsecutiveEntries(copy.deepcopy(trainData), 0.95, int(M1 * 0.25), M1)
    (trainData2, pObservation) = tsUtils.randomlyHideConsecutiveEntries(copy.deepcopy(trainData2), 0.95, int(M1 * 0.25), M1)
    (trainData3, pObservation) = tsUtils.randomlyHideConsecutiveEntries(copy.deepcopy(trainData3), 0.95, int(M1 * 0.25), M1)

    # once we have interpolated, pObservation should be set back to 1.0
    pObservation = 1.0

    # interpolating NaNs with linear interpolation
    # trainData = tsUtils.nanInterpolateHelper(trainData)
    # trainData2 = tsUtils.nanInterpolateHelper(trainData2)
    # trainData3 = tsUtils.nanInterpolateHelper(trainData3)

    # test data and hidden truth
    testData = combinedTS[-1 * testPoints:]
    testData2 = combinedTS2[-1 * testPoints:]
    testData3 = combinedTS3[-1 * testPoints:]
    meanTestData = meanTS[-1 * testPoints:]  # only needed for various statistical comparisons

    # time stamps
    trainTimestamps = timestamps[0:trainPoints]
    testTimestamps = timestamps[-1 * testPoints:]

    # create pandas DataFrames
    trainMasterDF = pd.DataFrame(index=trainTimestamps,
                                 data={key1: trainDataMaster,
                                       key2: trainDataMaster2,
                                       key3: trainDataMaster3})  # needed for reference later
    trainDF = pd.DataFrame(index=trainTimestamps,
                           data={key1: trainData,
                                 key2: trainData2,
                                 key3: trainData3})
    meanTrainDF = pd.DataFrame(index=trainTimestamps, data={key1: meanTrainData})
    testDF = pd.DataFrame(index=testTimestamps,
                          data={key1: testData,
                                key2: testData2,
                                key3: testData3})
    meanTestDF = pd.DataFrame(index=testTimestamps, data={key1: meanTestData})

    # train the model
    print("Training the model (imputing)...")
    nbrSingValuesToKeep = 5
    mod = SVDModel(key1, nbrSingValuesToKeep, N, M1, probObservation=pObservation,
                   svdMethod='numpy', otherSeriesKeysArray=otherkeys,
                   includePastDataOnly=includePastDataOnly)

    # uncomment below to run the ALS algorithm; comment out the SVDModel line above
    # mod = ALSModel(key1, nbrSingValuesToKeep, N, M1, probObservation=pObservation,
    #                otherSeriesKeysArray=otherkeys, includePastDataOnly=True)

    mod.fit(trainDF)

    # imputed + denoised data
    imputedDf = mod.denoisedDF()

    print(" RMSE (training imputation vs mean) = %f" % tsUtils.rmse(meanTrainDF[key1].values, imputedDf[key1].values))
    print(" RMSE (training imputation vs obs) = %f" % tsUtils.rmse(trainMasterDF[key1].values, imputedDf[key1].values))

    print("Forecasting (#points = %d)..." % len(testDF))

    # test data is used for point-predictions
    otherTSPoints = N
    if (includePastDataOnly == True):
        otherTSPoints = N - 1

    forecastArray = []
    for i in range(0, len(testDF)):
        pastPointsPrediction = np.zeros(N - 1)  # for the series of interest, we only use the past N - 1 points

        # first fill in the time series of interest
        j = 0
        if (i < N - 1):  # the first predictions use the end of the training data
            while (j < N - 1 - i):
                pastPointsPrediction[j] = trainMasterDF[key1].values[len(trainDF) - (N - 1 - i) + j]
                j += 1

        if (j < N - 1):  # use the new test data
            pastPointsPrediction[j:] = testDF[key1].values[i - (N - 1) + j:i]

        # now fill in the other series
        otherSeriesDataDict = {}
        for key in otherkeys:
            pastPointsOthers = np.zeros(otherTSPoints)  # an appropriate-length vector of past points for each series

            j = 0
            if (i < N - 1):  # the first predictions use the end of the training data
                while (j < N - 1 - i):
                    pastPointsOthers[j] = trainMasterDF[key].values[len(trainDF) - (N - 1 - i) + j]
                    j += 1

            if (j < otherTSPoints):  # use the new test data
                if (includePastDataOnly == True):
                    pastPointsOthers[j:] = testDF[key].values[i - (N - 1) + j:i]
                else:
                    pastPointsOthers[j:] = testDF[key].values[i - (N - 1) + j:i + 1]

            otherSeriesDataDict.update({key: pastPointsOthers})

        otherKeysToSeriesDFNew = pd.DataFrame(data=otherSeriesDataDict)
        keyToSeriesDFNew = pd.DataFrame(data={key1: pastPointsPrediction})

        prediction = mod.predict(otherKeysToSeriesDFNew, keyToSeriesDFNew, bypassChecks=False)
        forecastArray.append(prediction)

    print(" RMSE (prediction vs mean) = %f" % tsUtils.rmse(meanTestDF[key1].values, forecastArray))
    print(" RMSE (prediction vs obs) = %f" % tsUtils.rmse(testDF[key1].values, forecastArray))

    print("Plotting...")
    plt.plot(np.concatenate((trainMasterDF[key1].values, testDF[key1].values), axis=0), color='gray', label='Observed')
    plt.plot(np.concatenate((meanTrainDF[key1].values, meanTestDF[key1].values), axis=0), color='red', label='True Means')
    plt.plot(np.concatenate((imputedDf[key1].values, forecastArray), axis=0), color='blue', label='Forecasts')
    plt.axvline(x=len(trainDF), linewidth=1, color='black', label='Training End')
    legend = plt.legend(loc='upper left', shadow=True)
    plt.title('Multiple Time Series (ARMA + Periodic + Trend) - $p = %.2f$' % p)
    plt.show()
def prop99(filename):
    # CALIFORNIA PROP 99 STUDY
    df = pd.read_csv(filename)
    df = df[df['SubMeasureDesc'] == 'Cigarette Consumption (Pack Sales Per Capita)']
    pivot = df.pivot_table(values='Data_Value', index='LocationDesc', columns=['Year'])
    dfProp99 = pd.DataFrame(pivot.to_records())

    allColumns = dfProp99.columns.values

    states = list(np.unique(dfProp99['LocationDesc']))
    years = np.delete(allColumns, [0])
    caStateKey = 'California'
    states.remove(caStateKey)
    otherStates = states

    yearStart = 1970
    yearTrainEnd = 1989
    yearTestEnd = 2015

    singvals = 2
    p = 1.0

    trainingYears = []
    for i in range(yearStart, yearTrainEnd, 1):
        trainingYears.append(str(i))

    testYears = []
    for i in range(yearTrainEnd, yearTestEnd, 1):
        testYears.append(str(i))

    trainDataMasterDict = {}
    trainDataDict = {}
    testDataDict = {}
    for key in otherStates:
        series = dfProp99[dfProp99['LocationDesc'] == key]

        trainDataMasterDict.update({key: series[trainingYears].values[0]})

        # randomly hide training data
        (trainData, pObservation) = tsUtils.randomlyHideValues(copy.deepcopy(trainDataMasterDict[key]), p)
        trainDataDict.update({key: trainData})
        testDataDict.update({key: series[testYears].values[0]})

    series = dfProp99[dfProp99['LocationDesc'] == caStateKey]
    trainDataMasterDict.update({caStateKey: series[trainingYears].values[0]})
    trainDataDict.update({caStateKey: series[trainingYears].values[0]})
    testDataDict.update({caStateKey: series[testYears].values[0]})

    trainMasterDF = pd.DataFrame(data=trainDataMasterDict)
    trainDF = pd.DataFrame(data=trainDataDict)
    testDF = pd.DataFrame(data=testDataDict)

    # model
    rscModel = RobustSyntheticControl(caStateKey, singvals, len(trainDF), probObservation=1.0,
                                      modelType='als', otherSeriesKeysArray=otherStates)

    # fit the model
    rscModel.fit(trainDF)

    # save the denoised training data
    denoisedDF = rscModel.model.denoisedDF()

    # predict - all at once
    predictions = rscModel.predict(testDF)

    # plot
    yearsToPlot = range(yearStart, yearTestEnd, 1)
    interventionYear = yearTrainEnd - 1
    plt.plot(yearsToPlot, np.append(trainMasterDF[caStateKey], testDF[caStateKey], axis=0), color='red', label='observations')
    plt.plot(yearsToPlot, np.append(denoisedDF[caStateKey], predictions, axis=0), color='blue', label='predictions')
    plt.axvline(x=interventionYear, linewidth=1, color='black', label='Intervention')
    legend = plt.legend(loc='lower left', shadow=True)
    plt.title('Abadie et al. Prop 99 Case Study (CA) - $p = %.2f$' % p)
    plt.show()
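
# A minimal driver sketch for the examples above. The CSV paths are hypothetical
# placeholders for wherever the Abadie et al. Basque Country and Prop 99 datasets
# live on disk; the two synthetic tests need no external data.
def main():
    testSingleTS()
    testMultipleTS()
    basque('basque.csv')  # hypothetical path to the Basque Country GDP data
    prop99('prop99.csv')  # hypothetical path to the CDC Tax Burden on Tobacco data


if __name__ == '__main__':
    main()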