singvals, len(trainDF), probObservation=1.0, modelType="svd", svdMethod="numpy", otherSeriesKeysArray=otherStates, ) # fit the model rscModel.fit(trainDF) # save the denoised training data denoisedDF = rscModel.model.denoisedDF() # predict - all at once predictions = rscModel.predict(testDF) # plot yearsToPlot = range(yearStart, yearTestEnd, 1) interventionYear = yearTrainEnd - 1 plt.plot( yearsToPlot, np.append(trainMasterDF[basqueKey], testDF[basqueKey], axis=0), color="red", label="observations", ) plt.plot( yearsToPlot, np.append(denoisedDF[basqueKey], predictions, axis=0), color="blue", label="predictions",
def basque(filename):
    """Run the Basque Country case study and plot observations vs. predictions.

    Reads the CSV at ``filename``, pivots ``gdpcap`` into a region-by-year
    table, drops the 'Spain (Espana)' row, fits a ``RobustSyntheticControl``
    model on the years 1955-1970 with 'Basque Country (Pais Vasco)' as the
    key series and every remaining region as an "other" series, predicts the
    years 1971-1997 and shows the result in a matplotlib figure.

    Args:
        filename: CSV source accepted by ``pd.read_csv``; it must provide the
            columns ``regionname``, ``year`` and ``gdpcap``.

    Returns:
        None. The only visible output is the figure shown by ``plt.show()``.

    Raises:
        ValueError: if the Basque Country region is not present in the data
            (raised by ``list.remove``).
    """
    # BASQUE COUNTRY STUDY
    df = pd.read_csv(filename)
    pivot = df.pivot_table(values='gdpcap', index='regionname', columns='year')
    pivot = pivot.drop('Spain (Espana)')
    dfBasque = pd.DataFrame(pivot.to_records())

    basqueKey = 'Basque Country (Pais Vasco)'
    otherStates = list(np.unique(dfBasque['regionname']))
    otherStates.remove(basqueKey)

    yearStart = 1955
    yearTrainEnd = 1971
    yearTestEnd = 1998

    singvals = 1
    p = 0.8

    # The year columns of dfBasque are selected by their string names below.
    trainingYears = [str(year) for year in range(yearStart, yearTrainEnd)]
    testYears = [str(year) for year in range(yearTrainEnd, yearTestEnd)]

    trainDataMasterDict = {}
    trainDataDict = {}
    testDataDict = {}
    for key in otherStates:
        series = dfBasque[dfBasque['regionname'] == key]
        trainDataMasterDict[key] = series[trainingYears].values[0]

        # randomly hide training data; a deep copy is handed over so the
        # master copy cannot be modified by the helper. The second element
        # of the returned pair is not used here.
        trainData, _ = tsUtils.randomlyHideValues(
            copy.deepcopy(trainDataMasterDict[key]), p)
        trainDataDict[key] = trainData
        testDataDict[key] = series[testYears].values[0]

    # The key series is added last and is never passed through
    # randomlyHideValues.
    series = dfBasque[dfBasque['regionname'] == basqueKey]
    trainDataMasterDict[basqueKey] = series[trainingYears].values[0]
    trainDataDict[basqueKey] = series[trainingYears].values[0]
    testDataDict[basqueKey] = series[testYears].values[0]

    trainMasterDF = pd.DataFrame(data=trainDataMasterDict)
    trainDF = pd.DataFrame(data=trainDataDict)
    testDF = pd.DataFrame(data=testDataDict)

    # model
    rscModel = RobustSyntheticControl(
        basqueKey,
        singvals,
        len(trainDF),
        probObservation=1.0,
        modelType='als',
        otherSeriesKeysArray=otherStates)

    # fit the model
    rscModel.fit(trainDF)

    # save the denoised training data
    denoisedDF = rscModel.model.denoisedDF()

    # predict - all at once
    predictions = rscModel.predict(testDF)

    # plot: training and test periods are concatenated along the year axis
    yearsToPlot = range(yearStart, yearTestEnd, 1)
    interventionYear = yearTrainEnd - 1
    plt.plot(
        yearsToPlot,
        np.append(trainMasterDF[basqueKey], testDF[basqueKey], axis=0),
        color='red',
        label='observations')
    plt.plot(
        yearsToPlot,
        np.append(denoisedDF[basqueKey], predictions, axis=0),
        color='blue',
        label='predictions')
    plt.axvline(x=interventionYear, linewidth=1, color='black', label='Intervention')
    plt.legend(loc='upper right', shadow=True)
    plt.title('Abadie et al. Basque Country Case Study - $p = %.2f$' % p)
    plt.show()
def runAnalysis(N, T, TrainingEnd, rowRank, colRank):
    """Compare per-metric RSC models with one combined multi-metric model.

    Two synthetic metrics are generated with ``generateOneMetricMatrix`` from
    the same weights and row/column parameters (using ``simpleFunctionOne``
    and ``simpleFunctionTwo`` respectively). A ``RobustSyntheticControl``
    model is fitted per metric, a ``MultiRobustSyntheticControl`` model is
    fitted on both metrics together, and each prediction for the key series
    '0' is scored by RMSE against the matching ``meanTestDict`` entry.

    Args:
        N: number of weights drawn; also the number of "other" series, which
            are labelled '1' .. str(N).
        T: number of column parameters drawn.
        TrainingEnd: forwarded unchanged to ``generateOneMetricMatrix``.
        rowRank: number of distinct row parameter values to sample from.
        colRank: number of distinct column parameter values to sample from.

    Returns:
        dict with the keys "rsc1", "rsc2", "mrsc1" and "mrsc2": the RMSE of
        the single-metric models and of the combined model on metric 1 and
        metric 2.
    """

    def rmse(predictions, reference):
        # root-mean-squared error between predictions and reference values
        return np.sqrt(np.mean((predictions - reference) ** 2))

    # generate metric matrices; the order of the random draws is kept fixed
    # so results under a seeded generator do not change
    trueWeights = np.random.uniform(0.0, 1.0, N)
    trueWeights = trueWeights / np.sum(trueWeights)

    thetaArrayParams = np.random.uniform(0.0, 1.0, rowRank)
    rhoArrayParams = np.random.uniform(0.0, 1.0, colRank)

    rowParams = np.random.choice(thetaArrayParams, N)
    colParams = np.random.choice(rhoArrayParams, T)

    # metric 1 (only the train/test frames and the test reference are used)
    (_, _, trainDF1, testDF1, _, meanTestDict1) = generateOneMetricMatrix(
        N, T, TrainingEnd, rowRank, colRank, simpleFunctionOne,
        trueWeights, rowParams, colParams)

    # metric 2
    (_, _, trainDF2, testDF2, _, meanTestDict2) = generateOneMetricMatrix(
        N, T, TrainingEnd, rowRank, colRank, simpleFunctionTwo,
        trueWeights, rowParams, colParams)

    keySeriesLabel = '0'
    otherSeriesLabels = [str(ind) for ind in range(1, N + 1)]

    # RSC analysis
    # NOTE(review): presumably the number of singular values kept by the
    # models - confirm against RobustSyntheticControl.
    singvals = 8

    def singleMetricRmse(trainDF, testDF, meanTestDict):
        # fit one RSC model on a single metric and score its prediction
        model = RobustSyntheticControl(
            keySeriesLabel,
            singvals,
            len(trainDF),
            probObservation=1.0,
            svdMethod='numpy',
            otherSeriesKeysArray=otherSeriesLabels)
        model.fit(trainDF)
        return rmse(model.predict(testDF), meanTestDict[keySeriesLabel])

    # RSC for metric 1, then metric 2
    rscRMSE1 = singleMetricRmse(trainDF1, testDF1, meanTestDict1)
    rscRMSE2 = singleMetricRmse(trainDF2, testDF2, meanTestDict2)

    # multi RSC model (combined)
    relative_weights = [1.0, 1.0]

    # instantiate the model
    mrscmodel = MultiRobustSyntheticControl(
        2,
        relative_weights,
        keySeriesLabel,
        singvals,
        len(trainDF1),
        probObservation=1.0,
        svdMethod='numpy',
        otherSeriesKeysArray=otherSeriesLabels)

    # fit
    mrscmodel.fit([trainDF1, trainDF2])

    # predict; unlike the single-metric models, only the "other" series
    # columns of the test frames are passed in
    combinedPredictionsArray = mrscmodel.predict(
        [testDF1[otherSeriesLabels], testDF2[otherSeriesLabels]])

    # split the predictions for the metrics
    predictionsmRSC_1 = combinedPredictionsArray[0]
    predictionsmRSC_2 = combinedPredictionsArray[1]

    # compute RMSE
    mrscRMSE1 = rmse(predictionsmRSC_1, meanTestDict1[keySeriesLabel])
    mrscRMSE2 = rmse(predictionsmRSC_2, meanTestDict2[keySeriesLabel])

    return {
        "rsc1": rscRMSE1,
        "rsc2": rscRMSE2,
        "mrsc1": mrscRMSE1,
        "mrsc2": mrscRMSE2,
    }
def prop99(filename):
    """Run the California Prop 99 case study and plot observations vs. predictions.

    Reads the CSV at ``filename``, keeps the rows whose ``SubMeasureDesc`` is
    'Cigarette Consumption (Pack Sales Per Capita)', pivots ``Data_Value``
    into a location-by-year table, fits a ``RobustSyntheticControl`` model on
    the years 1970-1988 with 'California' as the key series and every other
    location as an "other" series, predicts the years 1989-2014 and shows the
    result in a matplotlib figure.

    Args:
        filename: CSV source accepted by ``pd.read_csv``; it must provide the
            columns ``SubMeasureDesc``, ``LocationDesc``, ``Year`` and
            ``Data_Value``.

    Returns:
        None. The only visible output is the figure shown by ``plt.show()``.

    Raises:
        ValueError: if 'California' is not present in the data (raised by
            ``list.remove``).
    """
    # CALIFORNIA PROP 99 STUDY
    df = pd.read_csv(filename)
    df = df[df['SubMeasureDesc'] == 'Cigarette Consumption (Pack Sales Per Capita)']
    pivot = df.pivot_table(values='Data_Value', index='LocationDesc', columns=['Year'])
    dfProp99 = pd.DataFrame(pivot.to_records())

    caStateKey = 'California'
    otherStates = list(np.unique(dfProp99['LocationDesc']))
    otherStates.remove(caStateKey)

    yearStart = 1970
    yearTrainEnd = 1989
    yearTestEnd = 2015

    singvals = 2
    p = 1.0

    # The year columns of dfProp99 are selected by their string names below.
    trainingYears = [str(year) for year in range(yearStart, yearTrainEnd)]
    testYears = [str(year) for year in range(yearTrainEnd, yearTestEnd)]

    trainDataMasterDict = {}
    trainDataDict = {}
    testDataDict = {}
    for key in otherStates:
        series = dfProp99[dfProp99['LocationDesc'] == key]
        trainDataMasterDict[key] = series[trainingYears].values[0]

        # randomly hide training data; a deep copy is handed over so the
        # master copy cannot be modified by the helper. The second element
        # of the returned pair is not used here.
        trainData, _ = tsUtils.randomlyHideValues(
            copy.deepcopy(trainDataMasterDict[key]), p)
        trainDataDict[key] = trainData
        testDataDict[key] = series[testYears].values[0]

    # The key series is added last and is never passed through
    # randomlyHideValues.
    series = dfProp99[dfProp99['LocationDesc'] == caStateKey]
    trainDataMasterDict[caStateKey] = series[trainingYears].values[0]
    trainDataDict[caStateKey] = series[trainingYears].values[0]
    testDataDict[caStateKey] = series[testYears].values[0]

    trainMasterDF = pd.DataFrame(data=trainDataMasterDict)
    trainDF = pd.DataFrame(data=trainDataDict)
    testDF = pd.DataFrame(data=testDataDict)

    # model
    rscModel = RobustSyntheticControl(
        caStateKey,
        singvals,
        len(trainDF),
        probObservation=1.0,
        modelType='als',
        otherSeriesKeysArray=otherStates)

    # fit the model
    rscModel.fit(trainDF)

    # save the denoised training data
    denoisedDF = rscModel.model.denoisedDF()

    # predict - all at once
    predictions = rscModel.predict(testDF)

    # plot: training and test periods are concatenated along the year axis
    yearsToPlot = range(yearStart, yearTestEnd, 1)
    interventionYear = yearTrainEnd - 1
    plt.plot(
        yearsToPlot,
        np.append(trainMasterDF[caStateKey], testDF[caStateKey], axis=0),
        color='red',
        label='observations')
    plt.plot(
        yearsToPlot,
        np.append(denoisedDF[caStateKey], predictions, axis=0),
        color='blue',
        label='predictions')
    plt.axvline(x=interventionYear, linewidth=1, color='black', label='Intervention')
    plt.legend(loc='lower left', shadow=True)
    plt.title('Abadie et al. Prop 99 Case Study (CA) - $p = %.2f$' % p)
    plt.show()