def setUpFiles(basePath):
    """
    Creates the per-region, per-month folder structure under basePath and copies in the all-years
    training sets, the test sets and the prediction inputs.

    For every region ('IntMnt', 'Xeric') and every month ('jan' .. 'dec') this:
      * creates <basePath><region>/<month> plus the 'CurrentFoldData', 'Output' and 'Prediction'
        subfolders used in the model pipeline;
      * copies the fold files '<month>_<region>_all_<i>_train.csv' and '<month>_<region>_<i>_test.csv'
        (i = 0 .. 4) from 'AllMonthsDryHalf/<region>/<month>/' into the new month folder;
      * copies the full dataset '<month>_<region>_all.csv' into the 'Prediction' subfolder;
      * writes the object returned by thesisFunctions.prepSacramentoData(month, region) to
        'Prediction/sacramentoData.csv' through its to_csv method (without the index).

    The source folder 'AllMonthsDryHalf/' is a relative path, so it is resolved against the current
    working directory.

    :param basePath: destination root. It is concatenated directly in front of the region name with no
                     separator inserted, so it must already end with '/' (or be an empty string)
    :return: None
    """

    allMonthsPath = 'AllMonthsDryHalf/'

    regions = ['IntMnt', 'Xeric']
    months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    subFolders = ['CurrentFoldData', 'Output', 'Prediction']
    numberOfFolds = 5

    for region in regions:
        for month in months:

            print('Processing:', region, month.capitalize())

            # Source and destination folders are the same for every file of this region/month,
            # so build them once instead of once per copied file
            sourceFolderPath = allMonthsPath + region + '/' + month + '/'
            newFolderPath = basePath + region + '/' + month

            # Create folder for each month & region in original model folder. exist_ok=True makes
            # re-runs safe without a separate (race-prone) os.path.exists check
            os.makedirs(newFolderPath, exist_ok=True)

            # Create subfolders used in model pipeline
            for subFolder in subFolders:
                os.makedirs(newFolderPath + '/' + subFolder, exist_ok=True)

            # Copy all years training sets and test sets
            for i in range(numberOfFolds):
                trainFileName = '{}_{}_all_{}_train.csv'.format(month, region, i)
                testFileName = '{}_{}_{}_test.csv'.format(month, region, i)

                shutil.copyfile(sourceFolderPath + trainFileName, newFolderPath + '/' + trainFileName)
                shutil.copyfile(sourceFolderPath + testFileName, newFolderPath + '/' + testFileName)

            # Add in full dataset and Sacramento data for prediction
            predictionFolderPath = newFolderPath + '/Prediction/'
            fullFileName = '{}_{}_all.csv'.format(month, region)
            shutil.copyfile(sourceFolderPath + fullFileName, predictionFolderPath + fullFileName)

            sacData = thesisFunctions.prepSacramentoData(month, region)
            sacData.to_csv(predictionFolderPath + 'sacramentoData.csv', index=False)
def setUpFiles(basePath):
    """
    Creates the per-region, per-month folder structure under basePath and copies in the all-years
    training sets, the test sets and the prediction inputs.

    For every region ('IntMnt', 'Xeric') and every month ('jan' .. 'dec') this:
      * creates <basePath><region>/<month> plus the 'CurrentFoldData', 'Output' and 'Prediction'
        subfolders used in the model pipeline;
      * copies the fold files '<month>_<region>_all_<i>_train.csv' and '<month>_<region>_<i>_test.csv'
        (i = 0 .. 4) from 'AllMonthsDryHalf/<region>/<month>/' into the new month folder;
      * copies the full dataset '<month>_<region>_all.csv' into the 'Prediction' subfolder;
      * writes the object returned by thesisFunctions.prepSacramentoData(month, region) to
        'Prediction/sacramentoData.csv' through its to_csv method (without the index).

    The source folder 'AllMonthsDryHalf/' is a relative path, so it is resolved against the current
    working directory.

    :param basePath: destination root. It is concatenated directly in front of the region name with no
                     separator inserted, so it must already end with '/' (or be an empty string)
    :return: None
    """

    allMonthsPath = 'AllMonthsDryHalf/'

    regions = ['IntMnt', 'Xeric']
    months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    subFolders = ['CurrentFoldData', 'Output', 'Prediction']
    numberOfFolds = 5

    for region in regions:
        for month in months:

            print('Processing:', region, month.capitalize())

            # Source and destination folders are the same for every file of this region/month,
            # so build them once instead of once per copied file
            sourceFolderPath = allMonthsPath + region + '/' + month + '/'
            newFolderPath = basePath + region + '/' + month

            # Create folder for each month & region in original model folder. exist_ok=True makes
            # re-runs safe without a separate (race-prone) os.path.exists check
            os.makedirs(newFolderPath, exist_ok=True)

            # Create subfolders used in model pipeline
            for subFolder in subFolders:
                os.makedirs(newFolderPath + '/' + subFolder, exist_ok=True)

            # Copy all years training sets and test sets
            for i in range(numberOfFolds):
                trainFileName = '{}_{}_all_{}_train.csv'.format(month, region, i)
                testFileName = '{}_{}_{}_test.csv'.format(month, region, i)

                shutil.copyfile(sourceFolderPath + trainFileName, newFolderPath + '/' + trainFileName)
                shutil.copyfile(sourceFolderPath + testFileName, newFolderPath + '/' + testFileName)

            # Add in full dataset and Sacramento data for prediction
            predictionFolderPath = newFolderPath + '/Prediction/'
            fullFileName = '{}_{}_all.csv'.format(month, region)
            shutil.copyfile(sourceFolderPath + fullFileName, predictionFolderPath + fullFileName)

            sacData = thesisFunctions.prepSacramentoData(month, region)
            sacData.to_csv(predictionFolderPath + 'sacramentoData.csv', index=False)
# header for first column and deleting the 'X.1' mystery variable when it shows up sourceFile = rfDataPath + region.lower() + '/' + month + '_' + region + '_ref.csv' regionTrainingDF = pandas.read_csv(sourceFile) regionTrainingDF.rename(columns={'Unnamed: 0': 'ObsID'}, inplace=True) badVariable = 'X.1' if badVariable in regionTrainingDF.columns.values: regionTrainingDF.drop(badVariable, axis=1, inplace=True) # Drop "T" from beginning of STAID column (someone added it as a string casting hack before I got the data) regionTrainingDF['STAID'] = regionTrainingDF['STAID'].map(lambda x: x[1:]) # Subset all training data to just those observations in the Sacramento basin regionTrainingDF = regionTrainingDF[regionTrainingDF.STAID.isin(sacRefGages)] # Get prediction data regionPredictionDF = thesisFunctions.prepSacramentoData(month, region) # Hack: If we're in the CoastMnt region loop, reassign region as 'IntMnt' because the only Sacramento reference # gage in that region's source file, 11371000, is actually in the West Mnt region in the Gages II database. None # of the prediction data is in the CoastMnt region so it won't affect anything in that dataframe. if region == 'CoastMnt': regionForDF = 'IntMnt' else: regionForDF = region # Add columns that ID current region and month (1 if True, 0 if False) to each DataFrame, leaving off last one # in each list to prevent model being over-specified (actually the last two for the regions, because of the # 11371000 misclassification as CoastMnt explained above for regionColumn in regions[:-2]: if regionColumn == regionForDF:
regionTrainingDF = pandas.read_csv(sourceFile) regionTrainingDF.rename(columns={'Unnamed: 0': 'ObsID'}, inplace=True) badVariable = 'X.1' if badVariable in regionTrainingDF.columns.values: regionTrainingDF.drop(badVariable, axis=1, inplace=True) # Drop "T" from beginning of STAID column (someone added it as a string casting hack before I got the data) regionTrainingDF['STAID'] = regionTrainingDF['STAID'].map( lambda x: x[1:]) # Subset all training data to just those observations in the Sacramento basin regionTrainingDF = regionTrainingDF[regionTrainingDF.STAID.isin( sacRefGages)] # Get prediction data regionPredictionDF = thesisFunctions.prepSacramentoData(month, region) # Hack: If we're in the CoastMnt region loop, reassign region as 'IntMnt' because the only Sacramento reference # gage in that region's source file, 11371000, is actually in the West Mnt region in the Gages II database. None # of the prediction data is in the CoastMnt region so it won't affect anything in that dataframe. if region == 'CoastMnt': regionForDF = 'IntMnt' else: regionForDF = region # Add columns that ID current region and month (1 if True, 0 if False) to each DataFrame, leaving off last one # in each list to prevent model being over-specified (actually the last two for the regions, because of the # 11371000 misclassification as CoastMnt explained above for regionColumn in regions[:-2]: if regionColumn == regionForDF:
destinationFilePath = destinationFolderPath + month + '_' + region + '_all.csv' sourceFile = pandas.read_csv(sourceFilePath) sourceFile.rename(columns={'Unnamed: 0': 'ObsID'}, inplace=True) badVariable = 'X.1' if badVariable in sourceFile.columns.values: sourceFile.drop(badVariable, axis=1, inplace=True) sourceFile.to_csv(destinationFilePath, index=False) # Create subfolders used in model pipeline subFolders = ['CurrentFoldData', 'Output', 'Prediction'] for subFolder in subFolders: newSubFolderPath = destinationFolderPath + '/' + subFolder if not os.path.exists(newSubFolderPath): os.makedirs(newSubFolderPath) # Also add a copy of NOAA water years waterYearsSourceFilePath = allMonthsPath + 'NOAAWaterYearsDriestToWettest.csv' waterYearsDestinationFilePath = destinationFolderPath + 'NOAAWaterYearsDriestToWettest.csv' shutil.copyfile(waterYearsSourceFilePath, waterYearsDestinationFilePath) # Add in Sacramento data for prediction sacData = thesisFunctions.prepSacramentoData( month, region, wetOrDry, waterYearsDestinationFilePath, proportionOfInterest) predictionFilePath = allMonthsPath + region + '/' + month + '/Prediction/sacramentoData.csv' sacData.to_csv(predictionFilePath, index=False)
sourceFile = pandas.read_csv(sourceFilePath) sourceFile.rename(columns={'Unnamed: 0': 'ObsID'}, inplace=True) badVariable = 'X.1' if badVariable in sourceFile.columns.values: sourceFile.drop(badVariable, axis=1, inplace=True) sourceFile.to_csv(destinationFilePath, index=False) # Create subfolders used in model pipeline subFolders = ['CurrentFoldData', 'Output', 'Prediction'] for subFolder in subFolders: newSubFolderPath = destinationFolderPath + '/' + subFolder if not os.path.exists(newSubFolderPath): os.makedirs(newSubFolderPath) # Also add a copy of NOAA water years waterYearsSourceFilePath = allMonthsPath + 'NOAAWaterYearsDriestToWettest.csv' waterYearsDestinationFilePath = destinationFolderPath + 'NOAAWaterYearsDriestToWettest.csv' shutil.copyfile(waterYearsSourceFilePath, waterYearsDestinationFilePath) # Add in Sacramento data for prediction sacData = thesisFunctions.prepSacramentoData(month, region, wetOrDry, waterYearsDestinationFilePath, proportionOfInterest) predictionFilePath = allMonthsPath + region + '/' + month + '/Prediction/sacramentoData.csv' sacData.to_csv(predictionFilePath, index=False)