def setUpFiles(basePath):
    """
    Creates the per-region, per-month folder structure under basePath (e.g. the OriginalModelAssessment
    folder) and fills it with the all-years training data, the test sets and the prediction inputs.

    For every region/month pair this:
      * creates <basePath>/<region>/<month> plus the CurrentFoldData, Output and Prediction subfolders,
      * copies the train and test CSVs for folds 0-4 from 'AllMonthsDryHalf/<region>/<month>/'
        (a relative path, so it is resolved against the current working directory),
      * copies the full '<month>_<region>_all.csv' dataset into the Prediction subfolder, and
      * writes the result of thesisFunctions.prepSacramentoData(month, region) to
        Prediction/sacramentoData.csv.

    :param basePath: Root folder to populate. A trailing path separator is optional.
    :return: None
    :raises OSError: Propagated from os.makedirs / shutil.copyfile, e.g. when a source file is missing
    """

    allMonthsPath = 'AllMonthsDryHalf/'
    regions = ['IntMnt', 'Xeric']
    months = [
        'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct',
        'nov', 'dec'
    ]
    subFolders = ['CurrentFoldData', 'Output', 'Prediction']
    numFolds = 5

    for region in regions:
        for month in months:

            print('Processing:', region, month.capitalize())

            # Resolve the source and destination folders once per region/month. os.path.join is used so
            # that a basePath given without a trailing '/' does not get glued onto the region name.
            sourceFolderPath = os.path.join(allMonthsPath, region, month)
            newFolderPath = os.path.join(basePath, region, month)
            predictionFolderPath = os.path.join(newFolderPath, 'Prediction')

            # Create folder for each month & region, plus the subfolders used in the model pipeline.
            # exist_ok=True makes re-runs harmless without a separate (race-prone) existence check.
            os.makedirs(newFolderPath, exist_ok=True)
            for subFolder in subFolders:
                os.makedirs(os.path.join(newFolderPath, subFolder), exist_ok=True)

            # Copy all years training sets and test sets for every fold
            for i in range(numFolds):
                foldFileNames = (
                    '{}_{}_all_{}_train.csv'.format(month, region, i),
                    '{}_{}_{}_test.csv'.format(month, region, i),
                )
                for fileName in foldFileNames:
                    shutil.copyfile(os.path.join(sourceFolderPath, fileName),
                                    os.path.join(newFolderPath, fileName))

            # Add in full dataset and Sacramento data for prediction
            fullFileName = '{}_{}_all.csv'.format(month, region)
            shutil.copyfile(os.path.join(sourceFolderPath, fullFileName),
                            os.path.join(predictionFolderPath, fullFileName))

            # prepSacramentoData is expected to return an object with a pandas-style to_csv method
            sacData = thesisFunctions.prepSacramentoData(month, region)
            predictionFilePath = os.path.join(predictionFolderPath, 'sacramentoData.csv')
            sacData.to_csv(predictionFilePath, index=False)
def setUpFiles(basePath):
    """
    Creates the per-region, per-month folder structure under basePath (e.g. the OriginalModelAssessment
    folder) and fills it with the all-years training data, the test sets and the prediction inputs.

    For every region/month pair this:
      * creates <basePath>/<region>/<month> plus the CurrentFoldData, Output and Prediction subfolders,
      * copies the train and test CSVs for folds 0-4 from 'AllMonthsDryHalf/<region>/<month>/'
        (a relative path, so it is resolved against the current working directory),
      * copies the full '<month>_<region>_all.csv' dataset into the Prediction subfolder, and
      * writes the result of thesisFunctions.prepSacramentoData(month, region) to
        Prediction/sacramentoData.csv.

    :param basePath: Root folder to populate. A trailing path separator is optional.
    :return: None
    :raises OSError: Propagated from os.makedirs / shutil.copyfile, e.g. when a source file is missing
    """

    allMonthsPath = 'AllMonthsDryHalf/'
    regions = ['IntMnt', 'Xeric']
    months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    subFolders = ['CurrentFoldData', 'Output', 'Prediction']
    numFolds = 5

    for region in regions:
        for month in months:

            print('Processing:', region, month.capitalize())

            # Resolve the source and destination folders once per region/month. os.path.join is used so
            # that a basePath given without a trailing '/' does not get glued onto the region name.
            sourceFolderPath = os.path.join(allMonthsPath, region, month)
            newFolderPath = os.path.join(basePath, region, month)
            predictionFolderPath = os.path.join(newFolderPath, 'Prediction')

            # Create folder for each month & region, plus the subfolders used in the model pipeline.
            # exist_ok=True makes re-runs harmless without a separate (race-prone) existence check.
            os.makedirs(newFolderPath, exist_ok=True)
            for subFolder in subFolders:
                os.makedirs(os.path.join(newFolderPath, subFolder), exist_ok=True)

            # Copy all years training sets and test sets for every fold
            for i in range(numFolds):
                foldFileNames = (
                    '{}_{}_all_{}_train.csv'.format(month, region, i),
                    '{}_{}_{}_test.csv'.format(month, region, i),
                )
                for fileName in foldFileNames:
                    shutil.copyfile(os.path.join(sourceFolderPath, fileName),
                                    os.path.join(newFolderPath, fileName))

            # Add in full dataset and Sacramento data for prediction
            fullFileName = '{}_{}_all.csv'.format(month, region)
            shutil.copyfile(os.path.join(sourceFolderPath, fullFileName),
                            os.path.join(predictionFolderPath, fullFileName))

            # prepSacramentoData is expected to return an object with a pandas-style to_csv method
            sacData = thesisFunctions.prepSacramentoData(month, region)
            predictionFilePath = os.path.join(predictionFolderPath, 'sacramentoData.csv')
            sacData.to_csv(predictionFilePath, index=False)
        # header for first column and deleting the 'X.1' mystery variable when it shows up
        sourceFile = rfDataPath + region.lower() + '/' + month + '_' + region + '_ref.csv'
        regionTrainingDF = pandas.read_csv(sourceFile)
        regionTrainingDF.rename(columns={'Unnamed: 0': 'ObsID'}, inplace=True)
        badVariable = 'X.1'
        if badVariable in regionTrainingDF.columns.values:
            regionTrainingDF.drop(badVariable, axis=1, inplace=True)

        # Drop "T" from beginning of STAID column (someone added it as a string casting hack before I got the data)
        regionTrainingDF['STAID'] = regionTrainingDF['STAID'].map(lambda x: x[1:])

        # Subset all training data to just those observations in the Sacramento basin
        regionTrainingDF = regionTrainingDF[regionTrainingDF.STAID.isin(sacRefGages)]

        # Get prediction data
        regionPredictionDF = thesisFunctions.prepSacramentoData(month, region)

        # Hack: If we're in the CoastMnt region loop, reassign region as 'IntMnt' because the only Sacramento reference
        # gage in that region's source file, 11371000, is actually in the West Mnt region in the Gages II database. None
        # of the prediction data is in the CoastMnt region so it won't affect anything in that dataframe.
        if region == 'CoastMnt':
            regionForDF = 'IntMnt'
        else:
            regionForDF = region

        # Add columns that ID current region and month (1 if True, 0 if False) to each DataFrame, leaving off the last
        # one in each list to prevent the model being over-specified (actually the last two for the regions, because
        # of the 11371000 misclassification as CoastMnt explained above).
        for regionColumn in regions[:-2]:

            if regionColumn == regionForDF:
        regionTrainingDF = pandas.read_csv(sourceFile)
        regionTrainingDF.rename(columns={'Unnamed: 0': 'ObsID'}, inplace=True)
        badVariable = 'X.1'
        if badVariable in regionTrainingDF.columns.values:
            regionTrainingDF.drop(badVariable, axis=1, inplace=True)

        # Drop "T" from beginning of STAID column (someone added it as a string casting hack before I got the data)
        regionTrainingDF['STAID'] = regionTrainingDF['STAID'].map(
            lambda x: x[1:])

        # Subset all training data to just those observations in the Sacramento basin
        regionTrainingDF = regionTrainingDF[regionTrainingDF.STAID.isin(
            sacRefGages)]

        # Get prediction data
        regionPredictionDF = thesisFunctions.prepSacramentoData(month, region)

        # Hack: If we're in the CoastMnt region loop, reassign region as 'IntMnt' because the only Sacramento reference
        # gage in that region's source file, 11371000, is actually in the West Mnt region in the Gages II database. None
        # of the prediction data is in the CoastMnt region so it won't affect anything in that dataframe.
        if region == 'CoastMnt':
            regionForDF = 'IntMnt'
        else:
            regionForDF = region

        # Add columns that ID current region and month (1 if True, 0 if False) to each DataFrame, leaving off the last
        # one in each list to prevent the model being over-specified (actually the last two for the regions, because
        # of the 11371000 misclassification as CoastMnt explained above).
        for regionColumn in regions[:-2]:

            if regionColumn == regionForDF:
Пример #5
0
        destinationFilePath = destinationFolderPath + month + '_' + region + '_all.csv'

        sourceFile = pandas.read_csv(sourceFilePath)

        sourceFile.rename(columns={'Unnamed: 0': 'ObsID'}, inplace=True)
        badVariable = 'X.1'
        if badVariable in sourceFile.columns.values:
            sourceFile.drop(badVariable, axis=1, inplace=True)

        sourceFile.to_csv(destinationFilePath, index=False)

        # Create subfolders used in model pipeline
        subFolders = ['CurrentFoldData', 'Output', 'Prediction']
        for subFolder in subFolders:
            newSubFolderPath = destinationFolderPath + '/' + subFolder
            if not os.path.exists(newSubFolderPath):
                os.makedirs(newSubFolderPath)

        # Also add a copy of NOAA water years
        waterYearsSourceFilePath = allMonthsPath + 'NOAAWaterYearsDriestToWettest.csv'
        waterYearsDestinationFilePath = destinationFolderPath + 'NOAAWaterYearsDriestToWettest.csv'
        shutil.copyfile(waterYearsSourceFilePath,
                        waterYearsDestinationFilePath)

        # Add in Sacramento data for prediction
        sacData = thesisFunctions.prepSacramentoData(
            month, region, wetOrDry, waterYearsDestinationFilePath,
            proportionOfInterest)
        predictionFilePath = allMonthsPath + region + '/' + month + '/Prediction/sacramentoData.csv'
        sacData.to_csv(predictionFilePath, index=False)
        sourceFile = pandas.read_csv(sourceFilePath)

        sourceFile.rename(columns={'Unnamed: 0': 'ObsID'}, inplace=True)
        badVariable = 'X.1'
        if badVariable in sourceFile.columns.values:
            sourceFile.drop(badVariable, axis=1, inplace=True)

        sourceFile.to_csv(destinationFilePath, index=False)

        # Create subfolders used in model pipeline
        subFolders = ['CurrentFoldData', 'Output', 'Prediction']
        for subFolder in subFolders:
            newSubFolderPath = destinationFolderPath + '/' + subFolder
            if not os.path.exists(newSubFolderPath):
                os.makedirs(newSubFolderPath)

        # Also add a copy of NOAA water years
        waterYearsSourceFilePath = allMonthsPath + 'NOAAWaterYearsDriestToWettest.csv'
        waterYearsDestinationFilePath = destinationFolderPath + 'NOAAWaterYearsDriestToWettest.csv'
        shutil.copyfile(waterYearsSourceFilePath, waterYearsDestinationFilePath)

        # Add in Sacramento data for prediction
        sacData = thesisFunctions.prepSacramentoData(month,
                                                     region,
                                                     wetOrDry,
                                                     waterYearsDestinationFilePath,
                                                     proportionOfInterest)
        predictionFilePath = allMonthsPath + region + '/' + month + '/Prediction/sacramentoData.csv'
        sacData.to_csv(predictionFilePath, index=False)