# Example scripts exercising the tslib time series library. The imports below
# assume the tslib package layout (github.com/jehangiramjad/tslib);
# harmonicDataTest, trendDataTest and armaDataTest are the synthetic-data
# generators defined elsewhere in these test scripts.
import copy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tslib.src import tsUtils
from tslib.src.models.tsSVDModel import SVDModel
from tslib.src.models.tsALSModel import ALSModel
from tslib.src.synthcontrol.syntheticControl import RobustSyntheticControl


def testSingleTS():
    print("------------------- Test # 1 (Single TS). ------------------------")
    p = 0.7
    N = 50
    M = 400
    timeSteps = N * M

    # train/test split
    trainProp = 0.9
    M1 = int(trainProp * M)
    M2 = M - M1

    trainPoints = N * M1
    testPoints = N * M2
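
    # shape bookkeeping: the length-(N*M) series is arranged as an N x M
    # matrix (each column one length-N segment); the first M1 columns are
    # used for training and the last M2 columns for testing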

    print("Generating data...")
    harmonicsTS = harmonicDataTest(timeSteps)
    trendTS = trendDataTest(timeSteps)
    (armaTS, armaMeanTS) = armaDataTest(timeSteps)

    meanTS = harmonicsTS + trendTS + armaMeanTS
    combinedTS = harmonicsTS + trendTS + armaTS

    # normalize the values to all lie within [-1, 1] -- helps with RMSE comparisons
    # can use the tsUtils.unnormalize() function to convert everything back to the original range at the end, if needed
    max1 = np.nanmax(combinedTS)
    min1 = np.nanmin(combinedTS)
    max2 = np.nanmax(meanTS)
    min2 = np.nanmin(meanTS)
    maxVal = np.max([max1, max2])  # avoid shadowing the max/min builtins
    minVal = np.min([min1, min2])

    combinedTS = tsUtils.normalize(combinedTS, maxVal, minVal)
    meanTS = tsUtils.normalize(meanTS, maxVal, minVal)
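    # (the scaling above is presumably the affine map
    #  x_norm = 2 * (x - minVal) / (maxVal - minVal) - 1, with
    #  tsUtils.unnormalize() applying the inverse; the tsUtils internals are
    #  not shown in this file)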

    # produce timestamps
    timestamps = np.arange('2017-09-10 20:30:00',
                           timeSteps,
                           dtype='datetime64[1m]')  # arbitrary start date

    # split the data
    trainDataMaster = combinedTS[0:trainPoints]  # true realized values, kept for comparisons later
    meanTrainData = meanTS[0:trainPoints]  # only needed for statistical comparisons later

    # randomly hide training data: choose between randomly hiding entries or randomly hiding consecutive entries
    (trainData, pObservation) = tsUtils.randomlyHideValues(
        copy.deepcopy(trainDataMaster), p)

    # now further hide consecutive entries for a very small fraction of entries in the eventual training matrix
    (trainData, pObservation) = tsUtils.randomlyHideConsecutiveEntries(
        copy.deepcopy(trainData), 0.9, int(M1 * 0.25), M1)
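    # (both hiding helpers presumably mark hidden entries as NaN and return
    # the realized fraction of observed entries as pObservation)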

    # interpolating NaNs with linear interpolation
    # trainData = tsUtils.nanInterpolateHelper(trainData)

    # test data and hidden truth
    testData = combinedTS[-testPoints:]
    meanTestData = meanTS[-testPoints:]  # only needed for statistical comparisons

    # time stamps
    trainTimestamps = timestamps[0:trainPoints]
    testTimestamps = timestamps[-testPoints:]

    # once we have interpolated, pObservation should be set back to 1.0
    pObservation = 1.0

    # create pandas df
    key1 = 't1'
    trainMasterDF = pd.DataFrame(index=trainTimestamps,
                                 data={key1: trainDataMaster})  # needed for reference later
    trainDF = pd.DataFrame(index=trainTimestamps, data={key1: trainData})
    meanTrainDF = pd.DataFrame(index=trainTimestamps,
                               data={key1: meanTrainData})

    testDF = pd.DataFrame(index=testTimestamps, data={key1: testData})
    meanTestDF = pd.DataFrame(index=testTimestamps, data={key1: meanTestData})

    # train the model
    print("Training the model (imputing)...")
    print('SVD')
    nbrSingValuesToKeep = 5
    mod = SVDModel(key1,
                   nbrSingValuesToKeep,
                   N,
                   M1,
                   probObservation=pObservation,
                   svdMethod='numpy',
                   otherSeriesKeysArray=[],
                   includePastDataOnly=True)
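    # (sketch of what fit() does, per the matrix-estimation approach tslib
    # implements: arrange the series into an N x M1 matrix, keep the top
    # nbrSingValuesToKeep singular values to impute/denoise it, and regress
    # the last row on the rows above it to obtain a linear forecaster)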
    mod.fit(trainDF)
    imputedDf = mod.denoisedDF()

    print(" RMSE (training imputation vs mean) = %f" %
          tsUtils.rmse(meanTrainDF[key1].values, imputedDf[key1].values))
    print(" RMSE (training imputation vs obs)  = %f" %
          tsUtils.rmse(trainMasterDF[key1].values, imputedDf[key1].values))

    # To run the ALS algorithm instead, comment out the SVDModel block above
    # and uncomment the block below:
    # mod = ALSModel(key1,
    #                nbrSingValuesToKeep,
    #                N,
    #                M1,
    #                probObservation=pObservation,
    #                otherSeriesKeysArray=[],
    #                includePastDataOnly=True)
    # mod.fit(trainDF)
    # imputedDf = mod.denoisedDF()

    print("Forecasting (#points = %d)..." % len(testDF))
    # test data is used for point-predictions
    forecastArray = []
    for i in range(0, len(testDF)):
        pastPoints = np.zeros(N - 1)  # need an N-1 length vector of past points
        j = 0
        if (i < N - 1):  # the first prediction uses the end of the training data
            while (j < N - 1 - i):
                pastPoints[j] = trainMasterDF[key1].values[len(trainDF) -
                                                           (N - 1 - i) + j]
                j += 1

        if (j < N - 1):  # use the new test data
            pastPoints[j:] = testDF[key1].values[i - (N - 1) + j:i]
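        # window bookkeeping: each forecast is based on the N-1 most recent
        # values; for i < N-1 the window is stitched from the tail of the
        # training series plus the first i test values, while for i >= N-1
        # it is simply testDF[key1].values[i-(N-1):i]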

        keyToSeriesDFNew = pd.DataFrame(data={key1: pastPoints})
        prediction = mod.predict(pd.DataFrame(data={}),
                                 keyToSeriesDFNew,
                                 bypassChecks=False)
        forecastArray.append(prediction)

    print(" RMSE (prediction vs mean) = %f" %
          tsUtils.rmse(meanTestDF[key1].values, forecastArray))
    print(" RMSE (prediction vs obs)  = %f" %
          tsUtils.rmse(testDF[key1].values, forecastArray))

    print("Plotting...")
    plt.plot(np.concatenate((trainMasterDF[key1].values, testDF[key1].values),
                            axis=0),
             color='gray',
             label='Observed')
    plt.plot(np.concatenate(
        (meanTrainDF[key1].values, meanTestDF[key1].values), axis=0),
             color='red',
             label='True Means')
    plt.plot(np.concatenate((imputedDf[key1].values, forecastArray), axis=0),
             color='blue',
             label='Forecasts')
    plt.axvline(x=len(trainDF),
                linewidth=1,
                color='black',
                label='Training End')
    legend = plt.legend(loc='upper left', shadow=True)
    plt.title('Single Time Series (ARMA + Periodic + Trend) - $p = %.2f$' % p)
    plt.show()


def basque(filename):
    # BASQUE COUNTRY STUDY
    df = pd.read_csv(filename)
    pivot = df.pivot_table(values='gdpcap', index='regionname', columns='year')
    pivot = pivot.drop('Spain (Espana)')
    dfBasque = pd.DataFrame(pivot.to_records())

    allColumns = dfBasque.columns.values

    states = list(np.unique(dfBasque['regionname']))
    years = np.delete(allColumns, [0])

    basqueKey = 'Basque Country (Pais Vasco)'
    states.remove(basqueKey)
    otherStates = states

    yearStart = 1955
    yearTrainEnd = 1971
    yearTestEnd = 1998

    singvals = 1
    p = 0.8

    trainingYears = []
    for i in range(yearStart, yearTrainEnd, 1):
        trainingYears.append(str(i))

    testYears = []
    for i in range(yearTrainEnd, yearTestEnd, 1):
        testYears.append(str(i))

    trainDataMasterDict = {}
    trainDataDict = {}
    testDataDict = {}
    for key in otherStates:
        series = dfBasque[dfBasque['regionname'] == key]

        trainDataMasterDict.update({key: series[trainingYears].values[0]})

        # randomly hide training data
        (trainData, pObservation) = tsUtils.randomlyHideValues(copy.deepcopy(trainDataMasterDict[key]), p)
        trainDataDict.update({key: trainData})
        testDataDict.update({key: series[testYears].values[0]})

    series = dfBasque[dfBasque['regionname'] == basqueKey]
    trainDataMasterDict.update({basqueKey: series[trainingYears].values[0]})
    trainDataDict.update({basqueKey: series[trainingYears].values[0]})
    testDataDict.update({basqueKey: series[testYears].values[0]})

    trainMasterDF = pd.DataFrame(data=trainDataMasterDict)
    trainDF = pd.DataFrame(data=trainDataDict)
    testDF = pd.DataFrame(data=testDataDict)

    # model
    rscModel = RobustSyntheticControl(basqueKey, singvals, len(trainDF), probObservation=1.0, modelType='als',
                                      otherSeriesKeysArray=otherStates)

    # fit the model
    rscModel.fit(trainDF)
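    # (synthetic control in brief: fit() denoises the donor matrix with a
    # rank-`singvals` SVD and learns a linear combination of the other
    # regions' pre-treatment GDP series that best matches the Basque series;
    # predict() applies those weights to the donors' post-treatment data to
    # form the counterfactual trajectory)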

    # save the denoised training data
    denoisedDF = rscModel.model.denoisedDF()

    # predict - all at once
    predictions = rscModel.predict(testDF)

    # plot
    yearsToPlot = range(yearStart, yearTestEnd, 1)
    interventionYear = yearTrainEnd - 1
    plt.plot(yearsToPlot, np.append(trainMasterDF[basqueKey], testDF[basqueKey], axis=0), color='red',
             label='observations')
    plt.plot(yearsToPlot, np.append(denoisedDF[basqueKey], predictions, axis=0), color='blue', label='predictions')
    plt.axvline(x=interventionYear, linewidth=1, color='black', label='Intervention')
    # plt.ylim((-1, 0))
    legend = plt.legend(loc='upper right', shadow=True)
    plt.title('Abadie et al. Basque Country Case Study - $p = %.2f$' % p)
    plt.show()


def testMultipleTS():

    print(
        "------------------- Test # 2 (Multiple TS). ------------------------")
    p = 1.0
    N = 50
    M = 400
    timeSteps = N * M

    # train/test split
    trainProp = 0.7
    M1 = int(trainProp * M)
    M2 = M - M1

    trainPoints = N * M1
    testPoints = N * M2

    key1 = 't1'
    key2 = 't2'
    key3 = 't3'
    otherkeys = [key2, key3]

    includePastDataOnly = True

    print("Generating data...")
    harmonicsTS = harmonicDataTest(timeSteps)
    trendTS = trendDataTest(timeSteps)
    (armaTS, armaMeanTS) = armaDataTest(timeSteps)

    meanTS = harmonicsTS + trendTS + armaMeanTS
    combinedTS = harmonicsTS + trendTS + armaTS

    combinedTS2 = (0.3 * combinedTS) + np.random.normal(
        0.0, 0.5, len(combinedTS))
    combinedTS3 = (-0.4 * combinedTS)
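    # t2 and t3 are synthetic "donor" series correlated with t1 (a noisy
    # scaled copy and a negatively scaled copy), so they carry information
    # useful for imputing and forecasting t1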

    # normalize the values to all lie within [-1, 1] -- helps with RMSE comparisons
    # can use the tsUtils.unnormalize() function to convert everything back to the original range at the end, if needed
    max1 = np.nanmax([combinedTS, combinedTS2, combinedTS3])
    min1 = np.nanmin([combinedTS, combinedTS2, combinedTS3])
    max2 = np.nanmax(meanTS)
    min2 = np.nanmin(meanTS)
    maxVal = np.max([max1, max2])  # avoid shadowing the max/min builtins
    minVal = np.min([min1, min2])

    combinedTS = tsUtils.normalize(combinedTS, maxVal, minVal)
    combinedTS2 = tsUtils.normalize(combinedTS2, maxVal, minVal)
    combinedTS3 = tsUtils.normalize(combinedTS3, maxVal, minVal)
    meanTS = tsUtils.normalize(meanTS, maxVal, minVal)

    # produce timestamps
    timestamps = np.arange('2017-09-10 20:30:00',
                           timeSteps,
                           dtype='datetime64[1m]')  # arbitrary start date

    # split the data
    trainDataMaster = combinedTS[0:trainPoints]  # true realized values, kept for comparisons later
    trainDataMaster2 = combinedTS2[0:trainPoints]
    trainDataMaster3 = combinedTS3[0:trainPoints]

    meanTrainData = meanTS[0:trainPoints]  # only needed for statistical comparisons later

    # randomly hide training data
    (trainData, pObservation) = tsUtils.randomlyHideValues(
        copy.deepcopy(trainDataMaster), p)
    (trainData2, pObservation) = tsUtils.randomlyHideValues(
        copy.deepcopy(trainDataMaster2), p)
    (trainData3, pObservation) = tsUtils.randomlyHideValues(
        copy.deepcopy(trainDataMaster3), p)

    # now further hide consecutive entries for a very small fraction of entries in the eventual training matrix
    (trainData, pObservation) = tsUtils.randomlyHideConsecutiveEntries(
        copy.deepcopy(trainData), 0.95, int(M1 * 0.25), M1)
    (trainData2, pObservation) = tsUtils.randomlyHideConsecutiveEntries(
        copy.deepcopy(trainData2), 0.95, int(M1 * 0.25), M1)
    (trainData3, pObservation) = tsUtils.randomlyHideConsecutiveEntries(
        copy.deepcopy(trainData3), 0.95, int(M1 * 0.25), M1)

    # once we have interpolated, pObservation should be set back to 1.0
    pObservation = 1.0

    # interpolating NaNs with linear interpolation
    # trainData = tsUtils.nanInterpolateHelper(trainData)
    # trainData2 = tsUtils.nanInterpolateHelper(trainData2)
    # trainData3 = tsUtils.nanInterpolateHelper(trainData3)

    # test data and hidden truth
    testData = combinedTS[-testPoints:]
    testData2 = combinedTS2[-testPoints:]
    testData3 = combinedTS3[-testPoints:]

    meanTestData = meanTS[-testPoints:]  # only needed for statistical comparisons

    # time stamps
    trainTimestamps = timestamps[0:trainPoints]
    testTimestamps = timestamps[-testPoints:]

    # create pandas df
    trainMasterDF = pd.DataFrame(index=trainTimestamps,
                                 data={
                                     key1: trainDataMaster,
                                     key2: trainDataMaster2,
                                     key3: trainDataMaster3
                                 })  # needed for reference later
    trainDF = pd.DataFrame(index=trainTimestamps,
                           data={
                               key1: trainData,
                               key2: trainData2,
                               key3: trainData3
                           })
    meanTrainDF = pd.DataFrame(index=trainTimestamps,
                               data={key1: meanTrainData})

    testDF = pd.DataFrame(index=testTimestamps,
                          data={
                              key1: testData,
                              key2: testData2,
                              key3: testData3
                          })
    meanTestDF = pd.DataFrame(index=testTimestamps, data={key1: meanTestData})

    # train the model
    print("Training the model (imputing)...")
    nbrSingValuesToKeep = 5
    mod = SVDModel(key1,
                   nbrSingValuesToKeep,
                   N,
                   M1,
                   probObservation=pObservation,
                   svdMethod='numpy',
                   otherSeriesKeysArray=otherkeys,
                   includePastDataOnly=includePastDataOnly)

    # uncomment below to run the ALS algorithm; comment out the SVDModel above
    # mod = ALSModel(key1, nbrSingValuesToKeep, N, M1, probObservation=pObservation, otherSeriesKeysArray=otherkeys, includePastDataOnly=True)
    mod.fit(trainDF)

    # imputed + denoised data
    imputedDf = mod.denoisedDF()

    print(" RMSE (training imputation vs mean) = %f" %
          tsUtils.rmse(meanTrainDF[key1].values, imputedDf[key1].values))
    print(" RMSE (training imputation vs obs)  = %f" %
          tsUtils.rmse(trainMasterDF[key1].values, imputedDf[key1].values))

    print("Forecasting (#points = %d)..." % len(testDF))

    # test data is used for point-predictions
    otherTSPoints = N
    if includePastDataOnly:
        otherTSPoints = N - 1
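    # (with includePastDataOnly=True the donor series contribute the same
    # N-1 lagged points as the target; with False they also contribute the
    # value at the current time step, i.e. N points, which is what the
    # j:i vs j:i+1 slicing below implements)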
    forecastArray = []
    for i in range(0, len(testDF)):

        # for the time series of interest, we only use the past N - 1 points
        pastPointsPrediction = np.zeros(N - 1)

        # first fill in the time series of interest
        j = 0
        if (i < N - 1):  # the first prediction uses the end of the training data
            while (j < N - 1 - i):
                pastPointsPrediction[j] = trainMasterDF[key1].values[
                    len(trainDF) - (N - 1 - i) + j]
                j += 1

        if (j < N - 1):  # use the new test data
            pastPointsPrediction[j:] = testDF[key1].values[i - (N - 1) + j:i]

        # now fill in the other series
        otherSeriesDataDict = {}
        for key in otherkeys:
            # need an appropriate length vector of past points for each series
            pastPointsOthers = np.zeros(otherTSPoints)
            j = 0
            if (i < N - 1):  # the first prediction uses the end of the training data
                while (j < N - 1 - i):
                    pastPointsOthers[j] = trainMasterDF[key].values[
                        len(trainDF) - (N - 1 - i) + j]
                    j += 1

            if (j < otherTSPoints):  # use the new test data
                if includePastDataOnly:
                    pastPointsOthers[j:] = testDF[key].values[i - (N - 1) + j:i]
                else:
                    pastPointsOthers[j:] = testDF[key].values[i - (N - 1) + j:i + 1]

            otherSeriesDataDict.update({key: pastPointsOthers})

        otherKeysToSeriesDFNew = pd.DataFrame(data=otherSeriesDataDict)
        keyToSeriesDFNew = pd.DataFrame(data={key1: pastPointsPrediction})

        prediction = mod.predict(otherKeysToSeriesDFNew,
                                 keyToSeriesDFNew,
                                 bypassChecks=False)
        forecastArray.append(prediction)

    print(" RMSE (prediction vs mean) = %f" %
          tsUtils.rmse(meanTestDF[key1].values, forecastArray))
    print(" RMSE (prediction vs obs)  = %f" %
          tsUtils.rmse(testDF[key1].values, forecastArray))

    print("Plotting...")
    plt.plot(np.concatenate((trainMasterDF[key1].values, testDF[key1].values),
                            axis=0),
             color='gray',
             label='Observed')
    plt.plot(np.concatenate(
        (meanTrainDF[key1].values, meanTestDF[key1].values), axis=0),
             color='red',
             label='True Means')
    plt.plot(np.concatenate((imputedDf[key1].values, forecastArray), axis=0),
             color='blue',
             label='Forecasts')
    plt.axvline(x=len(trainDF),
                linewidth=1,
                color='black',
                label='Training End')
    legend = plt.legend(loc='upper left', shadow=True)
    plt.title('Multiple Time Series (ARMA + Periodic + Trend) - $p = %.2f$' % p)
    plt.show()


def prop99(filename):
    # CALIFORNIA PROP 99 STUDY
    df = pd.read_csv(filename)
    df = df[df['SubMeasureDesc'] == 'Cigarette Consumption (Pack Sales Per Capita)']
    pivot = df.pivot_table(values='Data_Value', index='LocationDesc', columns=['Year'])
    dfProp99 = pd.DataFrame(pivot.to_records())

    allColumns = dfProp99.columns.values

    states = list(np.unique(dfProp99['LocationDesc']))
    years = np.delete(allColumns, [0])

    caStateKey = 'California'
    states.remove(caStateKey)
    otherStates = states

    yearStart = 1970
    yearTrainEnd = 1989
    yearTestEnd = 2015

    singvals = 2
    p = 1.0

    trainingYears = []
    for i in range(yearStart, yearTrainEnd, 1):
        trainingYears.append(str(i))

    testYears = []
    for i in range(yearTrainEnd, yearTestEnd, 1):
        testYears.append(str(i))

    trainDataMasterDict = {}
    trainDataDict = {}
    testDataDict = {}
    for key in otherStates:
        series = dfProp99[dfProp99['LocationDesc'] == key]

        trainDataMasterDict.update({key: series[trainingYears].values[0]})

        # randomly hide training data
        (trainData, pObservation) = tsUtils.randomlyHideValues(copy.deepcopy(trainDataMasterDict[key]), p)
        trainDataDict.update({key: trainData})
        testDataDict.update({key: series[testYears].values[0]})

    series = dfProp99[dfProp99['LocationDesc'] == caStateKey]
    trainDataMasterDict.update({caStateKey: series[trainingYears].values[0]})
    trainDataDict.update({caStateKey: series[trainingYears].values[0]})
    testDataDict.update({caStateKey: series[testYears].values[0]})

    trainMasterDF = pd.DataFrame(data=trainDataMasterDict)
    trainDF = pd.DataFrame(data=trainDataDict)
    testDF = pd.DataFrame(data=testDataDict)

    # model
    rscModel = RobustSyntheticControl(caStateKey, singvals, len(trainDF), probObservation=1.0, modelType='als',
                                      otherSeriesKeysArray=otherStates)

    # fit the model
    rscModel.fit(trainDF)

    # save the denoised training data
    denoisedDF = rscModel.model.denoisedDF()

    # predict - all at once
    predictions = rscModel.predict(testDF)

    # plot
    yearsToPlot = range(yearStart, yearTestEnd, 1)
    interventionYear = yearTrainEnd - 1
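    # (1988 corresponds to the passage of California's Proposition 99,
    # effective January 1989, hence the intervention line one year before
    # the training cutoff)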
    plt.plot(yearsToPlot, np.append(trainMasterDF[caStateKey], testDF[caStateKey], axis=0), color='red',
             label='observations')
    plt.plot(yearsToPlot, np.append(denoisedDF[caStateKey], predictions, axis=0), color='blue', label='predictions')
    plt.axvline(x=interventionYear, linewidth=1, color='black', label='Intervention')
    legend = plt.legend(loc='lower left', shadow=True)
    plt.title('Abadie et al. Prop 99 Case Study (CA) - $p = %.2f$' % p)
    plt.show()
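

if __name__ == '__main__':
    # example driver (not part of the original scripts); the CSV paths below
    # are placeholders for the Abadie et al. Basque Country GDP data and the
    # CDC per-capita cigarette sales data, respectively
    testSingleTS()
    testMultipleTS()
    # basque('basque.csv')    # hypothetical path
    # prop99('prop99.csv')    # hypothetical path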