예제 #1
0
    singvals,
    len(trainDF),
    probObservation=1.0,
    modelType="svd",
    svdMethod="numpy",
    otherSeriesKeysArray=otherStates,
)

# fit the model
rscModel.fit(trainDF)

# save the denoised training data
denoisedDF = rscModel.model.denoisedDF()

# predict - all at once
predictions = rscModel.predict(testDF)

# plot
yearsToPlot = range(yearStart, yearTestEnd, 1)
interventionYear = yearTrainEnd - 1
plt.plot(
    yearsToPlot,
    np.append(trainMasterDF[basqueKey], testDF[basqueKey], axis=0),
    color="red",
    label="observations",
)
plt.plot(
    yearsToPlot,
    np.append(denoisedDF[basqueKey], predictions, axis=0),
    color="blue",
    label="predictions",
def basque(filename):
    """Run the Basque Country synthetic-control case study and plot the result.

    Reads the CSV at ``filename``, pivots the ``gdpcap`` values into a
    region-by-year table, drops the 'Spain (Espana)' row, and treats
    'Basque Country (Pais Vasco)' as the target series with every remaining
    region as a donor series.

    Years 1955-1970 form the training window and 1971-1997 the test window.
    Donor training series are passed through ``tsUtils.randomlyHideValues``
    with ``p = 0.8``; the target series is used unmodified. A
    ``RobustSyntheticControl`` model (``modelType='als'``, one singular value)
    is fitted on the training frame and used to predict the test window.

    The function returns nothing; it shows a matplotlib figure comparing the
    observed target series with the denoised training values followed by the
    predictions, with a vertical line at the last training year.

    NOTE(review): ``probObservation=1.0`` is passed to the model even though
    the donors went through ``randomlyHideValues`` with ``p = 0.8`` -- confirm
    this is intended for the ALS model type.

    Args:
        filename: Path (or anything ``pd.read_csv`` accepts) of the CSV with
            ``regionname``, ``year`` and ``gdpcap`` columns.
    """
    # BASQUE COUNTRY STUDY
    df = pd.read_csv(filename)
    pivot = df.pivot_table(values='gdpcap', index='regionname', columns='year')
    pivot = pivot.drop('Spain (Espana)')
    dfBasque = pd.DataFrame(pivot.to_records())

    basqueKey = 'Basque Country (Pais Vasco)'
    otherStates = list(np.unique(dfBasque['regionname']))
    otherStates.remove(basqueKey)

    yearStart = 1955
    yearTrainEnd = 1971
    yearTestEnd = 1998

    singvals = 1
    p = 0.8

    # Year columns are selected by their string names.
    trainingYears = [str(year) for year in range(yearStart, yearTrainEnd)]
    testYears = [str(year) for year in range(yearTrainEnd, yearTestEnd)]

    # Donors are inserted first and the target last, so the resulting
    # DataFrames keep the same column order as before.
    trainDataMasterDict = {}
    trainDataDict = {}
    testDataDict = {}
    for key in otherStates:
        series = dfBasque[dfBasque['regionname'] == key]

        trainDataMasterDict[key] = series[trainingYears].values[0]

        # randomly hide training data (work on a copy so the master copy
        # stays intact); the second return value is not needed here
        trainData, _ = tsUtils.randomlyHideValues(
            copy.deepcopy(trainDataMasterDict[key]), p)
        trainDataDict[key] = trainData
        testDataDict[key] = series[testYears].values[0]

    series = dfBasque[dfBasque['regionname'] == basqueKey]
    trainDataMasterDict[basqueKey] = series[trainingYears].values[0]
    trainDataDict[basqueKey] = series[trainingYears].values[0]
    testDataDict[basqueKey] = series[testYears].values[0]

    trainMasterDF = pd.DataFrame(data=trainDataMasterDict)
    trainDF = pd.DataFrame(data=trainDataDict)
    testDF = pd.DataFrame(data=testDataDict)

    # model
    rscModel = RobustSyntheticControl(basqueKey, singvals, len(trainDF), probObservation=1.0, modelType='als',
                                      otherSeriesKeysArray=otherStates)

    # fit the model
    rscModel.fit(trainDF)

    # save the denoised training data
    denoisedDF = rscModel.model.denoisedDF()

    # predict - all at once
    predictions = rscModel.predict(testDF)

    # plot
    yearsToPlot = range(yearStart, yearTestEnd, 1)
    interventionYear = yearTrainEnd - 1
    plt.plot(yearsToPlot, np.append(trainMasterDF[basqueKey], testDF[basqueKey], axis=0), color='red',
             label='observations')
    plt.plot(yearsToPlot, np.append(denoisedDF[basqueKey], predictions, axis=0), color='blue', label='predictions')
    plt.axvline(x=interventionYear, linewidth=1, color='black', label='Intervention')
    plt.legend(loc='upper right', shadow=True)
    plt.title('Abadie et al. Basque Country Case Study - $p = %.2f$' % p)
    plt.show()
예제 #3
0
def runAnalysis(N, T, TrainingEnd, rowRank, colRank):
    """Compare single-metric RSC models with the combined multi-metric model.

    Two metric matrices are produced by ``generateOneMetricMatrix`` from the
    same randomly drawn weights (normalised to sum to one), row parameters and
    column parameters; they differ only in the generator function used
    (``simpleFunctionOne`` vs ``simpleFunctionTwo``). A
    ``RobustSyntheticControl`` model is fitted per metric, then a
    ``MultiRobustSyntheticControl`` model is fitted on both metrics together.
    Each prediction is scored by RMSE against the mean test values of the key
    series ``'0'``.

    Args:
        N: Number of donor series; donors are labelled '1' .. str(N).
        T: Forwarded to ``generateOneMetricMatrix``; also the number of
            column parameters drawn.
        TrainingEnd: Forwarded to ``generateOneMetricMatrix``.
        rowRank: Number of distinct row parameters to sample from.
        colRank: Number of distinct column parameters to sample from.

    Returns:
        dict with keys ``"rsc1"``, ``"rsc2"``, ``"mrsc1"``, ``"mrsc2"``
        holding the RMSE of the per-metric models and of the combined model
        on metric 1 and metric 2 respectively.
    """
    keySeriesLabel = '0'
    otherSeriesLabels = [str(idx) for idx in range(1, N + 1)]
    singvals = 8

    def _rmse(predicted, target):
        # root-mean-square error between two aligned arrays
        return np.sqrt(np.mean((predicted - target)**2))

    def _fitAndPredictSingle(trainFrame, testFrame):
        # plain RSC on one metric: build, fit, predict
        model = RobustSyntheticControl(
            keySeriesLabel,
            singvals,
            len(trainFrame),
            probObservation=1.0,
            svdMethod='numpy',
            otherSeriesKeysArray=otherSeriesLabels)
        model.fit(trainFrame)
        return model.predict(testFrame)

    # shared random ingredients -- the draw order matters for reproducibility
    trueWeights = np.random.uniform(0.0, 1.0, N)
    trueWeights = trueWeights / np.sum(trueWeights)

    thetaArrayParams = np.random.uniform(0.0, 1.0, rowRank)
    rhoArrayParams = np.random.uniform(0.0, 1.0, colRank)

    rowParams = np.random.choice(thetaArrayParams, N)
    colParams = np.random.choice(rhoArrayParams, T)

    # generate metric matrices (only the train/test frames and the mean test
    # values are used afterwards)
    dimensionArgs = (N, T, TrainingEnd, rowRank, colRank)
    sharedParams = (trueWeights, rowParams, colParams)

    (_, _, trainDF1, testDF1, _, meanTestDict1) = generateOneMetricMatrix(
        *dimensionArgs, simpleFunctionOne, *sharedParams)
    (_, _, trainDF2, testDF2, _, meanTestDict2) = generateOneMetricMatrix(
        *dimensionArgs, simpleFunctionTwo, *sharedParams)

    # RSC for metric 1
    predictionsRSC1 = _fitAndPredictSingle(trainDF1, testDF1)
    rscRMSE1 = _rmse(predictionsRSC1, meanTestDict1[keySeriesLabel])

    # RSC for metric 2
    predictionsRSC2 = _fitAndPredictSingle(trainDF2, testDF2)
    rscRMSE2 = _rmse(predictionsRSC2, meanTestDict2[keySeriesLabel])

    # multi RSC model (combined), both metrics weighted equally
    relative_weights = [1.0, 1.0]
    mrscmodel = MultiRobustSyntheticControl(
        2,
        relative_weights,
        keySeriesLabel,
        singvals,
        len(trainDF1),
        probObservation=1.0,
        svdMethod='numpy',
        otherSeriesKeysArray=otherSeriesLabels)
    mrscmodel.fit([trainDF1, trainDF2])

    # the combined model is given donor columns only
    donorTestFrames = [testDF1[otherSeriesLabels], testDF2[otherSeriesLabels]]
    combinedPredictionsArray = mrscmodel.predict(donorTestFrames)

    # one prediction array per metric
    mrscRMSE1 = _rmse(combinedPredictionsArray[0], meanTestDict1[keySeriesLabel])
    mrscRMSE2 = _rmse(combinedPredictionsArray[1], meanTestDict2[keySeriesLabel])

    return {
        "rsc1": rscRMSE1,
        "rsc2": rscRMSE2,
        "mrsc1": mrscRMSE1,
        "mrsc2": mrscRMSE2,
    }
def prop99(filename):
    """Run the California Proposition 99 synthetic-control study and plot it.

    Reads the CSV at ``filename``, keeps only rows whose ``SubMeasureDesc`` is
    'Cigarette Consumption (Pack Sales Per Capita)', and pivots ``Data_Value``
    into a location-by-year table. 'California' is the target series and every
    other location is a donor series.

    Years 1970-1988 form the training window and 1989-2014 the test window.
    Donor training series are passed through ``tsUtils.randomlyHideValues``
    with ``p = 1.0``; the target series is used unmodified. A
    ``RobustSyntheticControl`` model (``modelType='als'``, two singular
    values) is fitted on the training frame and used to predict the test
    window.

    The function returns nothing; it shows a matplotlib figure comparing the
    observed target series with the denoised training values followed by the
    predictions, with a vertical line at the last training year.

    Args:
        filename: Path (or anything ``pd.read_csv`` accepts) of the CSV with
            ``SubMeasureDesc``, ``LocationDesc``, ``Year`` and ``Data_Value``
            columns.
    """
    # CALIFORNIA PROP 99 STUDY
    df = pd.read_csv(filename)
    df = df[df['SubMeasureDesc'] == 'Cigarette Consumption (Pack Sales Per Capita)']
    pivot = df.pivot_table(values='Data_Value', index='LocationDesc', columns=['Year'])
    dfProp99 = pd.DataFrame(pivot.to_records())

    caStateKey = 'California'
    otherStates = list(np.unique(dfProp99['LocationDesc']))
    otherStates.remove(caStateKey)

    yearStart = 1970
    yearTrainEnd = 1989
    yearTestEnd = 2015

    singvals = 2
    p = 1.0

    # Year columns are selected by their string names.
    trainingYears = [str(year) for year in range(yearStart, yearTrainEnd)]
    testYears = [str(year) for year in range(yearTrainEnd, yearTestEnd)]

    # Donors are inserted first and the target last, so the resulting
    # DataFrames keep the same column order as before.
    trainDataMasterDict = {}
    trainDataDict = {}
    testDataDict = {}
    for key in otherStates:
        series = dfProp99[dfProp99['LocationDesc'] == key]

        trainDataMasterDict[key] = series[trainingYears].values[0]

        # randomly hide training data (work on a copy so the master copy
        # stays intact); the second return value is not needed here
        trainData, _ = tsUtils.randomlyHideValues(
            copy.deepcopy(trainDataMasterDict[key]), p)
        trainDataDict[key] = trainData
        testDataDict[key] = series[testYears].values[0]

    series = dfProp99[dfProp99['LocationDesc'] == caStateKey]
    trainDataMasterDict[caStateKey] = series[trainingYears].values[0]
    trainDataDict[caStateKey] = series[trainingYears].values[0]
    testDataDict[caStateKey] = series[testYears].values[0]

    trainMasterDF = pd.DataFrame(data=trainDataMasterDict)
    trainDF = pd.DataFrame(data=trainDataDict)
    testDF = pd.DataFrame(data=testDataDict)

    # model
    rscModel = RobustSyntheticControl(caStateKey, singvals, len(trainDF), probObservation=1.0, modelType='als',
                                      otherSeriesKeysArray=otherStates)

    # fit the model
    rscModel.fit(trainDF)

    # save the denoised training data
    denoisedDF = rscModel.model.denoisedDF()

    # predict - all at once
    predictions = rscModel.predict(testDF)

    # plot
    yearsToPlot = range(yearStart, yearTestEnd, 1)
    interventionYear = yearTrainEnd - 1
    plt.plot(yearsToPlot, np.append(trainMasterDF[caStateKey], testDF[caStateKey], axis=0), color='red',
             label='observations')
    plt.plot(yearsToPlot, np.append(denoisedDF[caStateKey], predictions, axis=0), color='blue', label='predictions')
    plt.axvline(x=interventionYear, linewidth=1, color='black', label='Intervention')
    plt.legend(loc='lower left', shadow=True)
    plt.title('Abadie et al. Prop 99 Case Study (CA) - $p = %.2f$' % p)
    plt.show()