def create_output_dirs_files(settings, is_sub_gcm_delta=False):
    """
    Create the output directories and files needed to process GCM deltas.

    Parameters
    ----------
    settings : dictionary
        Dictionary of user settings

    Returns
    -------
    info_dir : string
        string path to info directory
    gcm_delta_dir : string
        string path to gcm delta directory
    info_file : string
        string path to info file

    Notes
    -----
    Uses settings set in user_settings.py
    """
    # create output directories
    info_dir = helpers.make_directory(
        path=settings["simulation_directory"],
        directory_name=settings["info_directory_name"])

    gcm_delta_dir = helpers.make_directory(
        path=settings["simulation_directory"],
        directory_name=settings["gcm_delta_directory_name"])

    # path to info file
    if is_sub_gcm_delta:
        info_file = os.path.join(info_dir, settings["sub_gcm_delta_info_file_name"])
    else:
        info_file = os.path.join(info_dir, settings["gcm_delta_info_file_name"])

    # print input and output information
    helpers.print_input_output_info(
        input_dict={
            "simulation_directory": settings["simulation_directory"],
            "gcm_delta_prepend_name": settings["gcm_delta_prepend_name"],
            "gcm_delta_directory_name": settings["gcm_delta_directory_name"],
            "gcm_delta_info_file_name": settings["gcm_delta_info_file_name"],
            "gcm_delta_non_intersecting_file_name": settings["gcm_delta_non_intersecting_file_name"],
            "sub_gcm_delta_info_file_name": settings["sub_gcm_delta_info_file_name"],
        },
        output_dict={
            "info_dir": info_dir,
            "info_file": info_file,
            "gcm_delta_dir": gcm_delta_dir
        })

    return info_dir, gcm_delta_dir, info_file
def process_water_files(file_list, settings, print_data=True):
    """
    Process a list of WATER output files (.txt or .xml) according to the user settings.

    Parameters
    ----------
    file_list : list
        List of files to parse, process, and plot.
    settings : dictionary
        Dictionary of user settings
    print_data : bool
        If True, print the parsed data to the console.
    """
    print("Processing WATER files ...\n")

    for f in file_list:

        ext = os.path.splitext(f)[1]
        assert ext == ".txt" or ext == ".xml", "Can not process file {}. File extension {} is not .txt or .xml".format(f, ext)

        filedir, filename = helpers.get_file_info(f)

        if ext == ".txt":
            output_dir = helpers.make_directory(path=filedir, directory_name=settings["watertxt_directory_name"])

            helpers.print_input_output_info(input_dict={"input_file": f}, output_dict={"output_directory": output_dir})

            waterapputils_logging.initialize_loggers(output_dir=output_dir)

            data = watertxt.read_file(f)
            watertxt_viewer.plot_watertxt_data(data, save_path=output_dir)

            if print_data:
                watertxt_viewer.print_watertxt_data(data)

        elif ext == ".xml":
            output_dir = helpers.make_directory(path=filedir, directory_name=settings["waterxml_directory_name"])

            waterapputils_logging.initialize_loggers(output_dir=output_dir)

            helpers.print_input_output_info(input_dict={"input_file": f}, output_dict={"output_directory": output_dir})

            data = waterxml.read_file(f)
            waterxml_viewer.plot_waterxml_timeseries_data(data, save_path=output_dir)
            waterxml_viewer.plot_waterxml_topographic_wetness_index_data(data, save_path=output_dir)

            if print_data:
                waterxml_viewer.print_waterxml_data(data)

        waterapputils_logging.remove_loggers()
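# Usage sketch (not part of the original module): how process_water_files() might be driven
# for one batch run. The paths and directory-name values below are hypothetical; they only
# assume that user_settings.py defines the same keys referenced above
# (watertxt_directory_name, waterxml_directory_name).
def _example_process_water_files():
    example_settings = {
        "watertxt_directory_name": "waterapputils-watertxt",    # placeholder value
        "waterxml_directory_name": "waterapputils-waterxml",    # placeholder value
    }
    example_files = [
        "simulations/basin01/WATER.txt",            # placeholder path
        "simulations/basin01/WATERSimulation.xml",  # placeholder path
    ]
    process_water_files(file_list=example_files, settings=example_settings, print_data=False)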
def test_get_rankings_year_after_sofifa():
    year = 2008
    temp_folder = os.path.join(os.getcwd(), 'temp')
    csv_file = '{}-{}.csv'.format(year, year + 1)
    from_file = os.path.join(RAW_CLEANED_DATA_FILE_PATH, csv_file)
    to_file = os.path.join(temp_folder, csv_file)

    make_directory(temp_folder)
    get_rankings(from_file, to_file, '{}-12-31'.format(str(year + 1)), include_prediction=False)

    cmp_file = os.path.join(STANDINGS_PATH, csv_file)
    assert compare_csv(cmp_file, to_file)

    remove_directory(temp_folder)
def test_get_rankings_all():
    temp_folder = os.path.join(os.getcwd(), 'temp/file.csv')
    make_directory(temp_folder)

    from_year, to_year = 1993, 2019
    get_rankings_all(from_year, to_year, RAW_CLEANED_DATA_FILE_PATH, temp_folder)

    for year in range(from_year, to_year + 1):
        csv_file = '{}-{}.csv'.format(year, year + 1)
        created_file = os.path.join(temp_folder, csv_file)
        cmp_file = os.path.join(STANDINGS_PATH, csv_file)
        assert compare_csv(cmp_file, created_file)

    remove_directory(temp_folder)
def create_output_dirs_files(settings, is_sub_wateruse=False):
    """
    Create the output directories and files needed to process water use.

    Parameters
    ----------
    settings : dictionary
        Dictionary of user settings

    Returns
    -------
    info_dir : string
        string path to info directory
    ecoflow_dir : string
        string path to ecoflow directory
    oasis_dir : string
        string path to oasis directory
    info_file : string
        string path to info file

    Notes
    -----
    Uses settings set in user_settings.py
    """
    # create output directories
    info_dir = helpers.make_directory(path=settings["simulation_directory"], directory_name=settings["info_directory_name"])
    ecoflow_dir = helpers.make_directory(path=settings["simulation_directory"], directory_name=settings["ecoflow_directory_name"])
    oasis_dir = helpers.make_directory(path=settings["simulation_directory"], directory_name=settings["oasis_directory_name"])

    # path to info file
    if is_sub_wateruse:
        info_file = os.path.join(info_dir, settings["sub_wateruse_info_file_name"])
    else:
        info_file = os.path.join(info_dir, settings["wateruse_info_file_name"])

    # print input and output information
    helpers.print_input_output_info(
        input_dict={
            "simulation_directory": settings["simulation_directory"],
            "wateruse_prepend_name": settings["wateruse_prepend_name"],
            "wateruse_directory_name": settings["wateruse_directory_name"],
            "wateruse_info_file_name": settings["wateruse_info_file_name"],
            "wateruse_non_intersecting_file_name": settings["wateruse_non_intersecting_file_name"],
            "sub_wateruse_info_file_name": settings["sub_wateruse_info_file_name"],
        },
        output_dict={
            "info_dir": info_dir,
            "info_file": info_file,
            "ecoflow_dir": ecoflow_dir,
            "oasis_dir": oasis_dir
        })

    return info_dir, ecoflow_dir, oasis_dir, info_file
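# Usage sketch (not part of the original module): a minimal settings dictionary for
# create_output_dirs_files(). The keys mirror the ones read above; every value is a
# placeholder rather than a real user_settings.py default.
def _example_create_output_dirs_files():
    example_settings = {
        "simulation_directory": "simulations/my-batch-run",
        "info_directory_name": "waterapputils-info",
        "ecoflow_directory_name": "waterapputils-ecoflow",
        "oasis_directory_name": "waterapputils-oasis",
        "wateruse_prepend_name": "WATERUSE-",
        "wateruse_directory_name": "waterapputils-wateruse",
        "wateruse_info_file_name": "wateruse_info.txt",
        "wateruse_non_intersecting_file_name": "wateruse_non_intersecting_centroids.txt",
        "sub_wateruse_info_file_name": "sub_wateruse_info.txt",
    }
    info_dir, ecoflow_dir, oasis_dir, info_file = create_output_dirs_files(example_settings)
    return info_dir, ecoflow_dir, oasis_dir, info_file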
def write_oasis_file(file_list, dir_name, file_name):
    """Write an OASIS-formatted timeseries file of discharge + water use for each WATER.txt file in file_list."""
    for f in file_list:

        filedir, filename = helpers.get_file_info(f)

        oasis_dir = helpers.make_directory(path=filedir, directory_name=dir_name)

        helpers.print_input_output_info(input_dict={"input_file": f}, output_dict={"output_directory": oasis_dir})

        waterapputils_logging.initialize_loggers(output_dir=oasis_dir)

        watertxt_data = watertxt.read_file(f)

        # write timeseries of discharge + water use for OASIS
        watertxt.write_timeseries_file(watertxt_data=watertxt_data, name="Discharge + Water Use", save_path=oasis_dir, filename="-".join([watertxt_data["stationid"], file_name]))

    waterapputils_logging.remove_loggers()
def create_output_dir(settings):
    """
    Create the output directory needed to generate maps.

    Parameters
    ----------
    settings : dictionary
        Dictionary of user settings

    Returns
    -------
    map_dir : string
        string path to map directory

    Notes
    -----
    Uses settings set in user_settings.py
    """
    # create output directories
    map_dir = helpers.make_directory(path=settings["simulation_directory"], directory_name=settings["map_directory_name"])

    # print input and output information
    helpers.print_input_output_info(
        input_dict={
            "simulation_directory": settings["simulation_directory"],
            "map_directory_name": settings["map_directory_name"],
        },
        output_dict={"map_dir": map_dir})

    return map_dir
def write_ecoflow_file_stationid(file_list, dir_name, file_name, parameter_name="Discharge + Water Use"):
    """
    Write a csv file containing a timeseries for a particular parameter contained in a WATER.txt file

    Parameters
    ----------
    file_list : list
        List of WATER.txt files to process
    dir_name : string
        String name for output directory
    file_name : string
        String name for output file
    parameter_name : string
        String name for a parameter contained in a WATER.txt file
    """
    for f in file_list:

        filedir, filename = helpers.get_file_info(f)

        ecoflow_dir = helpers.make_directory(path=filedir, directory_name=dir_name)

        helpers.print_input_output_info(input_dict={"input_file": f}, output_dict={"output_directory": ecoflow_dir})

        waterapputils_logging.initialize_loggers(output_dir=ecoflow_dir)

        watertxt_data = watertxt.read_file(f)

        # write timeseries of discharge + water use for ecoflow program
        watertxt.write_timeseries_file_stationid(watertxt_data, name=parameter_name, save_path=ecoflow_dir, filename=file_name, stationid=watertxt_data["stationid"])

    waterapputils_logging.remove_loggers()
def write_ecoflow_file_drainageareaxml(file_list, dir_name, file_name):
    """
    Write a csv file containing a label (basin id number) and its corresponding area.

    Parameters
    ----------
    file_list : list
        List of WATERSimulation.xml files to process
    dir_name : string
        String name for output directory
    file_name : string
        String name for output file
    """
    area_data = {}
    for f in file_list:

        filedir, filename = helpers.get_file_info(f)

        ecoflow_dir = helpers.make_directory(path=filedir, directory_name=dir_name)

        helpers.print_input_output_info(input_dict={"input_file": f}, output_dict={"output_directory": ecoflow_dir})

        waterapputils_logging.initialize_loggers(output_dir=ecoflow_dir)

        # read xml file
        waterxml_tree = waterxml.read_file(f)

        # get area from each region from the xml file and sum for a total area
        project, study, simulation = waterxml.get_xml_data(waterxml_tree=waterxml_tree)

        # get the project name which is the same as the stationid
        stationid = project["ProjName"]

        # get the area means for each region
        areas = waterxml.get_study_unit_areas(simulation_dict=simulation)

        # calculate total area
        total_area = waterxml.calc_total_study_unit_areas(areas)

        # fill area_data with total area
        area_data[stationid] = total_area

    # convert from km**2 to mi**2
    area_data = helpers.convert_area_values(area_data, in_units="km2", out_units="mi2")

    # write drainage area file for ecoflow program
    watertxt.write_drainagearea_file(area_data=area_data, save_path=ecoflow_dir, filename=file_name)

    waterapputils_logging.remove_loggers()
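# Stand-alone sketch (not part of the original module) of the unit conversion that
# helpers.convert_area_values() is assumed to perform above: 1 km**2 is about 0.386102 mi**2.
# The basin ids and areas are made up.
def _example_km2_to_mi2():
    sq_mi_per_sq_km = 0.386102
    area_data_km2 = {"01413500": 420.5, "01420500": 912.0}   # hypothetical basins
    area_data_mi2 = {basin_id: area * sq_mi_per_sq_km for basin_id, area in area_data_km2.items()}
    return area_data_mi2   # roughly {"01413500": 162.4, "01420500": 352.1}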
def test_get_current_fixtures():
    temp_folder = os.path.join(os.getcwd(), 'temp')
    temp_file = os.path.join(temp_folder, 'temp.csv')
    make_directory(temp_file)

    get_current_fixtures(temp_file)
    assert os.path.isfile(temp_file)

    df = pd.read_csv(temp_file)
    df_columns_list = list(df)
    assert 'Date' in df_columns_list
    assert 'HomeTeam' in df_columns_list
    assert 'AwayTeam' in df_columns_list
    assert 'FTHG' in df_columns_list
    assert 'FTAG' in df_columns_list
    assert 'FTR' in df_columns_list

    remove_directory(temp_folder)
def write_ecoflow_file_drainageareashp(file_list, dir_name, file_name, label_field, query_field):
    """
    Write a csv file containing a label (basin id number) and its corresponding area.
    Two methods to get the area from each respective shapefile:

    1. if shapefile(s) have an area field and the user specifies it in user_settings.py under the
    *basin_shapefile_area_field* variable, then get the area for each basin using the specified
    area field name (query_field)

    2. if shapefile(s) do not have an area field or the user does not specify it in user_settings.py,
    then calculate it using osgeo and label each basin according to *basin_shapefile_id_field*
    in user_settings.py

    Parameters
    ----------
    file_list : list
        List of files to process; files are shapefiles
    dir_name : string
        String name for output directory
    file_name : string
        String name for output file
    label_field : string
        String name of an id field (basin id number) to associate with a basin
    query_field : string
        String name of an area field

    Notes
    -----
    Uses settings set in user_settings.py
    """
    for f in file_list:

        filedir, filename = helpers.get_file_info(f)

        ecoflow_dir = helpers.make_directory(path=filedir, directory_name=dir_name)

        waterapputils_logging.initialize_loggers(output_dir=ecoflow_dir)

        helpers.print_input_output_info(input_dict={"input_file": f}, output_dict={"output_directory": ecoflow_dir})

        basin_shapefile = osgeo.ogr.Open(f)

        # get the areas for each region
        areas = spatialvectors.get_areas_dict(shapefile=basin_shapefile, id_field=label_field, query_field=query_field)

        # write drainage area file for ecoflow program
        watertxt.write_drainagearea_file(area_data=areas, save_path=ecoflow_dir, filename=file_name)

    waterapputils_logging.remove_loggers()
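# Rough sketch (not part of the original module) of what spatialvectors.get_areas_dict()
# is assumed to do: map each feature's id field to the value in its area field. The field
# names "STAID" and "da_sqmi" are hypothetical.
def _example_get_areas_dict(shapefile_path, id_field="STAID", query_field="da_sqmi"):
    datasource = osgeo.ogr.Open(shapefile_path)
    layer = datasource.GetLayer()
    areas = {}
    for feature in layer:
        areas[feature.GetField(id_field)] = float(feature.GetField(query_field))
    return areas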
def process_intersecting_centroids(intersecting_centroids, settings, ecoflow_dir, oasis_dir):
    """
    Apply water use data to a WATER \*.txt file. The new file created is saved to the same
    directory as the WATER \*.txt file.

    Parameters
    ----------
    intersecting_centroids : dictionary
        Dictionary containing lists of values for a particular field that were intersected by another shapefile.
    settings : dictionary
        Dictionary of user settings
    ecoflow_dir : string
        String path to directory that will contain output specific for ecoflow program
    oasis_dir : string
        String path to directory that will contain output specific for oasis

    Notes
    -----
    Uses settings set in user_settings.py
    """
    # create a file for the output
    for featureid, centroids in intersecting_centroids.iteritems():

        # get sum of the water use data
        if settings["wateruse_factor_file"]:
            total_wateruse_dict = wateruse.get_all_total_wateruse(wateruse_files=settings["wateruse_files"], id_list=centroids, wateruse_factor_file=settings["wateruse_factor_file"], in_cfs=True)
        else:
            total_wateruse_dict = wateruse.get_all_total_wateruse(wateruse_files=settings["wateruse_files"], id_list=centroids, wateruse_factor_file=None, in_cfs=True)

        # print monthly output in nice format to info file
        print("FeatureId: {}\n Centroids: {}\n Total Water Use:\n".format(featureid, centroids))
        helpers.print_monthly_dict(monthly_dict=total_wateruse_dict)

        # get the txt data file that has a parent directory matching the current featureid
        if settings["is_batch_simulation"]:
            path = os.path.join(settings["simulation_directory"], featureid)
        else:
            path = settings["simulation_directory"]

        # find the WATER.txt file
        watertxt_file = helpers.find_file(name=settings["water_text_file_name"], path=path)

        # get file info
        watertxt_dir, watertxt_filename = helpers.get_file_info(watertxt_file)

        # create an output directory
        output_dir = helpers.make_directory(path=watertxt_dir, directory_name=settings["wateruse_directory_name"])

        # initialize error logging
        waterapputils_logging.initialize_loggers(output_dir=output_dir)

        # read the txt
        watertxt_data = watertxt.read_file(watertxt_file)

        # apply water use
        watertxt_data = watertxt.apply_wateruse(watertxt_data, wateruse_totals=total_wateruse_dict)

        # write updated txt
        watertxt_with_wateruse_file = settings["wateruse_prepend_name"] + watertxt_filename
        watertxt.write_file(watertxt_data=watertxt_data, save_path=output_dir, filename=watertxt_with_wateruse_file)

        # plot
        updated_watertxt_file = os.path.join(output_dir, watertxt_with_wateruse_file)
        water_files_processing.process_water_files(file_list=[updated_watertxt_file], settings=settings, print_data=True)

        # write timeseries of discharge + water use for OASIS
        watertxt.write_timeseries_file(watertxt_data=watertxt_data, name=settings["ecoflow_parameter_name"], save_path=oasis_dir, filename="-".join([watertxt_data["stationid"], settings["oasis_file_name"]]))

        # write timeseries of discharge + water use for ecoflow program
        watertxt.write_timeseries_file_stationid(watertxt_data, name=settings["ecoflow_parameter_name"], save_path=ecoflow_dir, filename="", stationid=watertxt_data["stationid"])
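# Shape sketch (not part of the original module): intersecting_centroids is assumed to map a
# basin featureid to the list of water-use centroid ids that intersect it. The ids and
# directory arguments below are placeholders; settings is the user_settings.py dictionary.
def _example_process_intersecting_centroids(settings, ecoflow_dir, oasis_dir):
    example_intersecting_centroids = {
        "01413500": ["256", "257", "301"],   # hypothetical centroid ids
        "drb_basin_7": ["112"],
    }
    process_intersecting_centroids(example_intersecting_centroids, settings, ecoflow_dir, oasis_dir)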
def get_rankings(from_file, to_file, date=None, include_prediction=False, predicted_date_so_far=None, ranking_summary_file=None):
    if date:
        datet = datetime.strptime(date, '%Y-%m-%d')

    if not (from_file and to_file):
        raise ValueError("Error: get_rankings: Give a from_file/to_file pair")

    df = pd.read_csv(from_file)
    scores = dict()

    for _, row in df.iterrows():
        if type(row['Date']) is float:
            continue
        if date and datetime.strptime(row['Date'], '%Y-%m-%d') > datet:
            break
        # That means this row is a prediction value
        if not include_prediction and row['FTHG'] == 0 and row['FTAG'] == 0 and row['FTR'] != 'D':
            break
        # Meaning this game is not played and not predicted yet
        if row['FTR'] is np.nan:
            break

        home = row['HomeTeam']
        away = row['AwayTeam']

        if home not in scores:
            scores[home] = {'match_played': 0, 'points': 0, 'goal_diff': 0, 'win': 0}
        if away not in scores:
            scores[away] = {'match_played': 0, 'points': 0, 'goal_diff': 0, 'win': 0}

        scores[home]['match_played'] += 1
        scores[away]['match_played'] += 1

        match_goal_diff = row['FTHG'] - row['FTAG']
        scores[home]['goal_diff'] += match_goal_diff
        scores[away]['goal_diff'] -= match_goal_diff

        if row['FTR'] == 'H':
            scores[home]['points'] += 3
            scores[home]['win'] += 1
        elif row['FTR'] == 'A':
            scores[away]['points'] += 3
            scores[away]['win'] += 1
        else:
            scores[home]['points'] += 1
            scores[away]['points'] += 1

    teams = sorted(scores, key=lambda k: scores[k]['points'], reverse=True)

    points, goal_diff, win_rate = [], [], []
    for name in teams:
        val = scores[name]
        points.append(val['points'])
        goal_diff.append(val['goal_diff'])
        win_rate.append(val['win'] / val['match_played'])

    df = pd.DataFrame(list(zip(teams, points, goal_diff, win_rate)), columns=['Team', 'Points', 'Goal_Diff', 'Win_Rate'])
    make_directory(to_file)
    df.to_csv(to_file, index=False)

    if include_prediction and predicted_date_so_far and ranking_summary_file:
        round_df = pd.DataFrame(list(zip(teams, points)), columns=['Team', predicted_date_so_far])
        round_df.set_index('Team', inplace=True)
        round_df = round_df.transpose()
        round_df.index.name = 'Date'

        if os.path.isfile(ranking_summary_file):
            summary_df = pd.read_csv(ranking_summary_file)
            summary_df.set_index('Date', inplace=True)
            summary_df = pd.concat([summary_df, round_df], sort=False)
            summary_df.to_csv(ranking_summary_file)
        else:
            round_df.to_csv(ranking_summary_file)

    return teams[0]
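# Usage sketch (not part of the original module): build the table for one season up to a
# given date, ignoring rows that are only predictions. The csv paths are placeholders and
# assume the raw cleaned file has the Date/HomeTeam/AwayTeam/FTHG/FTAG/FTR columns read above.
def _example_get_rankings():
    leader = get_rankings(from_file='data/raw_cleaned/2018-2019.csv',
                          to_file='data/standings/2018-2019.csv',
                          date='2019-01-01',
                          include_prediction=False)
    return leader   # team name at the top of the table on that date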
def process_cmp(file_list, settings, print_data=True):
    """
    Compare two WATER output files (.txt or .xml) according to the user settings.

    Parameters
    ----------
    file_list : list
        List of files to parse, process, and plot.
    settings : dictionary
        Dictionary of user settings
    print_data : bool
        If True, print the parsed data to the console.
    """
    print("Comparing WATER files ...\n")

    water_file1 = file_list[0]
    water_file2 = file_list[1]

    filedir1, filename1 = helpers.get_file_info(water_file1)
    filedir2, filename2 = helpers.get_file_info(water_file2)

    ext1 = os.path.splitext(filename1)[1]
    ext2 = os.path.splitext(filename2)[1]

    assert ext1 == ".txt" or ext1 == ".xml", "Can not process file {}. File extension {} is not .txt or .xml".format(filename1, ext1)
    assert ext2 == ".txt" or ext2 == ".xml", "Can not process file {}. File extension {} is not .txt or .xml".format(filename2, ext2)

    if ext1 == ".txt" and ext2 == ".txt":
        output_dir = helpers.make_directory(path=filedir1, directory_name=settings["watertxt_directory_name"])

        helpers.print_input_output_info(input_dict={"input_file_1": water_file1, "input_file_2": water_file2}, output_dict={"output_directory": output_dir})

        waterapputils_logging.initialize_loggers(output_dir=output_dir)

        watertxt_data1 = watertxt.read_file(water_file1)
        watertxt_data2 = watertxt.read_file(water_file2)

        watertxt_viewer.plot_watertxt_comparison(watertxt_data1, watertxt_data2, save_path=output_dir)

        if print_data:
            watertxt_viewer.print_watertxt_data(watertxt_data1)
            watertxt_viewer.print_watertxt_data(watertxt_data2)

    elif ext1 == ".xml" and ext2 == ".xml":
        output_dir = helpers.make_directory(path=filedir1, directory_name=settings["waterxml_directory_name"])

        helpers.print_input_output_info(input_dict={"input_file_1": water_file1, "input_file_2": water_file2}, output_dict={"output_directory": output_dir})

        waterapputils_logging.initialize_loggers(output_dir=output_dir)

        waterxml_data1 = waterxml.read_file(water_file1)
        waterxml_data2 = waterxml.read_file(water_file2)

        waterxml_viewer.plot_waterxml_timeseries_comparison(waterxml_data1, waterxml_data2, save_path=output_dir)

        if print_data:
            waterxml_viewer.print_waterxml_data(waterxml_data1)
            waterxml_viewer.print_waterxml_data(waterxml_data2)

    else:
        print("Can not process files {} and {}. File extensions {} and {} both need to be .txt or .xml".format(filename1, filename2, ext1, ext2))

    waterapputils_logging.remove_loggers()
def process_intersecting_tiles(intersecting_tiles, settings, gcm_delta_dir):
    """
    Apply GCM delta factors to a WATER simulation. The new files created are saved to the
    same directory as the WATERSimulation.xml file.

    Parameters
    ----------
    intersecting_tiles : dictionary
        Dictionary containing lists of values for a particular field that were intersected by another shapefile.
    settings : dictionary
        Dictionary of user settings
    gcm_delta_dir : string
        string path to gcm delta directory

    Notes
    -----
    Uses settings set in user_settings.py
    """
    # create a file for the output
    for featureid, tiles in intersecting_tiles.iteritems():

        # get monthly average gcm delta values
        deltas_data_list, deltas_avg_dict = deltas.get_deltas(delta_files=settings["gcm_delta_files"], tiles=tiles)

        # print monthly output in nice format to info file
        print("FeatureId: {}\n Tiles: {}\n Average GCM Deltas:\n".format(featureid, tiles))
        for key in deltas_avg_dict.keys():
            print(" {}\n".format(key))
            helpers.print_monthly_dict(monthly_dict=deltas_avg_dict[key])

        # get the txt data file that has a parent directory matching the current featureid
        if settings["is_batch_simulation"]:
            path = os.path.join(settings["simulation_directory"], featureid)
        else:
            path = settings["simulation_directory"]

        # find the WATERSimulation.xml and WATER.txt files
        waterxml_file = helpers.find_file(name=settings["water_database_file_name"], path=path)
        watertxt_file = helpers.find_file(name=settings["water_text_file_name"], path=path)

        # get file info
        waterxml_dir, waterxml_filename = helpers.get_file_info(waterxml_file)
        watertxt_dir, watertxt_filename = helpers.get_file_info(watertxt_file)

        # create an output directory
        output_dir = helpers.make_directory(path=waterxml_dir, directory_name=settings["gcm_delta_directory_name"])

        # initialize error logging
        waterapputils_logging.initialize_loggers(output_dir=output_dir)

        # read the xml file
        waterxml_tree = waterxml.read_file(waterxml_file)
        watertxt_data = watertxt.read_file(watertxt_file)

        # apply gcm delta
        for key, value in deltas_avg_dict.iteritems():
            if key == "Ppt":
                waterxml.apply_factors(waterxml_tree=waterxml_tree, element="ClimaticPrecipitationSeries", factors=deltas_avg_dict[key])
            elif key == "Tmax":
                waterxml.apply_factors(waterxml_tree=waterxml_tree, element="ClimaticTemperatureSeries", factors=deltas_avg_dict[key])
            elif key == "PET":
                watertxt.apply_factors(watertxt_data, name="PET", factors=deltas_avg_dict[key], is_additive=False)

        # update the project name in the updated xml
        project = waterxml.create_project_dict()
        project = waterxml.fill_dict(waterxml_tree=waterxml_tree, data_dict=project, element="Project", keys=project.keys())
        waterxml.change_element_value(waterxml_tree=waterxml_tree, element="Project", child="ProjName", new_value=settings["gcm_delta_prepend_name"] + project["ProjName"])

        # write updated xml
        waterxml_with_gcm_delta_file = settings["gcm_delta_prepend_name"] + waterxml_filename
        waterxml.write_file(waterxml_tree=waterxml_tree, save_path=output_dir, filename=waterxml_with_gcm_delta_file)

        # write the pet timeseries file
        watertxt.write_timeseries_file(watertxt_data, name="PET", save_path=output_dir, filename=settings["pet_timeseries_file_name"])

        # plot
        updated_waterxml_file = os.path.join(output_dir, waterxml_with_gcm_delta_file)
        water_files_processing.process_water_files(file_list=[updated_waterxml_file], settings=settings, print_data=False)
        water_files_processing.process_cmp(file_list=[updated_waterxml_file, waterxml_file], settings=settings, print_data=False)

        # plot the gcm deltas
        for deltas_data in deltas_data_list:
            deltas_viewer.plot_deltas_data(deltas_data=deltas_data, save_path=helpers.make_directory(path=gcm_delta_dir, directory_name=settings["gcm_delta_directory_name"]))
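# Shape sketch (not part of the original module): intersecting_tiles is assumed to map a basin
# featureid to the list of GCM tile ids it intersects, mirroring the water-use case above.
# The tile ids and the gcm_delta_dir path are invented; settings is the user_settings.py dictionary.
def _example_process_intersecting_tiles(settings):
    example_intersecting_tiles = {
        "01413500": ["31", "32"],   # hypothetical GCM tile ids
        "drb_basin_7": ["44"],
    }
    process_intersecting_tiles(example_intersecting_tiles, settings, gcm_delta_dir="simulations/my-batch-run/gcm-delta")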
def predict_next_round(clf, final_path, current_raw_cleaned_path, statistics=False, stat_path=None, first=True):
    # First indicates whether the one being predicted is the upcoming round
    # Load final data csv
    df = pd.read_csv(final_path)

    # Get the row count of the dataframe
    len_df = df.shape[0]

    # Normalize each column and remove rows that should not be predicted yet
    df = prepare_data(df, drop_na=False)
    df = df.loc[(df['FTR'] != 'H') & (df['FTR'] != 'D') & (df['FTR'] != 'A')]
    df = df.drop(columns=['FTR'])

    if statistics:
        if stat_path is not None:
            make_directory(stat_path)
        else:
            raise ValueError("specify 'stat_path' to save prediction result. Exiting...")

    if len(df) > 0:
        df_indices = [x - len_df for x in df.index]
        prediction = clf.predict(df).tolist()
        prediction_probability = clf.predict_proba(df).tolist()
        clf_classes = clf.classes_

        df_to_predict = pd.read_csv(current_raw_cleaned_path)
        len_df = df_to_predict.shape[0]

        print("{:20} {:20} {:20} {}".format("Home", "Away", "Predict", "Probability"))
        for (index, result, pred_prob) in zip(df_indices, prediction, prediction_probability):
            HT = df_to_predict.at[index + len_df, 'HomeTeam']
            AT = df_to_predict.at[index + len_df, 'AwayTeam']
            date_so_far = df_to_predict.at[index + len_df, 'Date']

            df_to_predict.at[index + len_df, 'FTR'] = result
            df_to_predict.at[index + len_df, 'FTHG'] = 0
            df_to_predict.at[index + len_df, 'FTAG'] = 0
            for (outcome, prob) in zip(clf_classes, pred_prob):
                df_to_predict.at[index + len_df, 'prob_' + outcome] = prob

            print("{:20} {:20} {:20} {}".format(HT, AT, HT if result == "H" else AT, max(pred_prob)))

        if statistics:
            if first:
                if os.path.exists(stat_path):
                    os.remove(stat_path)
                df_to_predict.to_csv(stat_path, index=False)
            else:
                if os.path.isfile(stat_path):
                    stat_df = pd.read_csv(stat_path)
                    stat_df.update(df_to_predict)
                    stat_df.to_csv(stat_path, index=False)
                else:
                    raise ValueError('FATAL ERROR: either set first=True, or feed stat_path.')

        df_to_predict = df_to_predict.drop(columns=['prob_' + outcome for outcome in clf_classes])
        df_to_predict.to_csv(current_raw_cleaned_path, index=False)

        return True, date_so_far
    else:
        print("There are no more games to make prediction")
        return False, None
def get_clf(final_file_path, model_confidence_csv_path, clf_file, recalculate=True):
    if not recalculate and os.path.isfile(clf_file):
        return joblib.load(clf_file), None, None

    # First load the data from csv file
    data = pd.read_csv(final_file_path)

    # Drop columns that are not needed and normalize each column
    data = prepare_data(data, drop_na=True)
    data = data.loc[(data['FTR'] == 'H') | (data['FTR'] == 'D') | (data['FTR'] == 'A')]

    # Divide data into features and label
    X_all = data.drop(columns=['FTR'])
    y_all = data['FTR']

    # List of Classifiers that we are going to run
    classifiers = [
        # Logistic Regressions
        LogisticRegression(),
        # Best param in this grid search
        LogisticRegression(penalty='l2', solver='newton-cg', multi_class='ovr', C=0.1, warm_start=True),
        LogisticRegression(penalty='l2', solver='lbfgs', multi_class='multinomial', C=0.4, warm_start=False),
        # SVC
        SVC(probability=True),
        SVC(C=0.3, class_weight=None, decision_function_shape='ovo', degree=1, kernel='rbf',
            probability=True, shrinking=True, tol=0.0005),
        SVC(C=0.28, class_weight=None, decision_function_shape='ovo', degree=1, kernel='rbf',
            probability=True, shrinking=True, tol=0.0002),
        # XGBoost
        xgb.XGBClassifier(),
        xgb.XGBClassifier(learning_rate=0.01, n_estimators=1000, max_depth=2, min_child_weight=5,
                          gamma=0, subsample=0.8, colsample_bytree=0.7, scale_pos_weight=0.8,
                          reg_alpha=1e-5, booster='gbtree', objective='multi:softprob'),
        # KNeighborsClassifier(),
        # RandomForestClassifier(),
        # GaussianNB(),
        # DecisionTreeClassifier(),
        # GradientBoostingClassifier(),
        # LinearSVC(),
        # SGDClassifier()
    ]

    # # Example of how to grid search classifiers
    # # Logistic Regression
    # clf_L = LogisticRegression()
    # parameters_L = {'penalty': ['l2'],
    #                 'solver': ['lbfgs', 'newton-cg', 'sag'],
    #                 'multi_class': ['ovr', 'multinomial'],
    #                 'C': [x * 0.1 + 0.1 for x in range(10)],
    #                 'warm_start': [True, False],
    #                 'fit_intercept': [True, False],
    #                 'class_weight': ['balanced', None]}
    # f1_scorer_L = make_scorer(f1_score, labels=['H','D','A'], average='micro')
    # clf_L = get_grid_clf(clf_L, f1_scorer_L, parameters_L, X_all, y_all)
    # classifiers.append(clf_L)

    # # SVC
    # clf_L = SVC()
    # parameters_L = {
    #     'C': [x * 0.01 + 0.27 for x in range(5)],
    #     'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    #     'degree': [x + 1 for x in range(3)],
    #     'shrinking': [True, False],
    #     'tol': [x * 0.0005 + 0.0005 for x in range(3)],
    #     'class_weight': ['balanced', None],
    #     'decision_function_shape': ['ovo', 'ovr']
    # }
    # f1_scorer_L = make_scorer(f1_score, labels=['H','D','A'], average='micro')
    # clf_L = get_grid_clf(clf_L, f1_scorer_L, parameters_L, X_all, y_all)
    # classifiers.append(clf_L)

    # # XGBoost
    # clf_L = xgb.XGBClassifier()
    # parameters_L = {
    #     'learning_rate': [0.01],
    #     'n_estimators': [1000],
    #     'max_depth': [2],
    #     'min_child_weight': [5],
    #     'gamma': [0],
    #     'subsample': [0.8],
    #     'colsample_bytree': [0.7],
    #     'scale_pos_weight': [0.8],
    #     'reg_alpha': [1e-5],
    #     'booster': ['gbtree'],
    #     'objective': ['multi:softprob']
    # }
    # f1_scorer_L = make_scorer(f1_score, labels=['H','D','A'], average='micro')
    # clf_L = get_grid_clf(clf_L, f1_scorer_L, parameters_L, X_all, y_all)
    # classifiers.append(clf_L)

    # We are going to record accuracies of each classifier prediction iteration
    len_classifiers = len(classifiers)
    result = [[] for _ in range(len_classifiers)]
    y_results = [[] for _ in range(len_classifiers + 1)]

    # Using 10-fold cross validation (Dividing the data into sub groups (90% to fit, 10% to test), and run
    # prediction with each classifiers using the sub groups as a dataset)
    split = 10
    kf = KFold(n_splits=split, shuffle=True)

    for split_index, (train_index, test_index) in enumerate(kf.split(X_all)):
        print("Processing {}/{} of KFold Cross Validation...".format(split_index + 1, split))
        X_train, X_test = X_all.iloc[train_index], X_all.iloc[test_index]
        y_train, y_test = y_all.iloc[train_index], y_all.iloc[test_index]
        y_results[len_classifiers] += y_test.tolist()

        for index, clf in enumerate(classifiers):
            print("KFold: {}/{}. clf_index: {}/{}.".format(split_index + 1, split, index + 1, len(classifiers)))
            confidence, predicted_result = train_predict(clf, X_train, y_train, X_test, y_test)
            result[index].append(confidence)
            y_results[index] += predicted_result.tolist()

    # Make a dictionary of average accuracies for each classifier
    avg_dict, best_clf, best_clf_average = process_print_result(classifiers, result)

    # Put the result into csv file
    if os.path.isfile(model_confidence_csv_path):
        df = pd.read_csv(model_confidence_csv_path)
        newdf = pd.DataFrame(avg_dict, index=[df.shape[1]])
        df = pd.concat([df, newdf], ignore_index=True, sort=False)
    else:
        make_directory(model_confidence_csv_path)
        df = pd.DataFrame(avg_dict, index=[0])
    df.to_csv(model_confidence_csv_path, index=False)

    # Saves the classifier using joblib module
    joblib.dump(best_clf, clf_file)

    # Return the best classifier
    return best_clf, y_results, best_clf_average
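# Usage sketch (not part of the original module): train or reload the best classifier and then
# predict the upcoming round with predict_next_round() defined above. All file paths are placeholders.
def _example_get_clf_and_predict():
    clf, y_results, best_avg = get_clf(final_file_path='data/final.csv',
                                       model_confidence_csv_path='data/model_confidence.csv',
                                       clf_file='models/best_clf.joblib',
                                       recalculate=False)
    predicted, date_so_far = predict_next_round(clf,
                                                final_path='data/final.csv',
                                                current_raw_cleaned_path='data/raw_cleaned/2019-2020.csv',
                                                statistics=False)
    return predicted, date_so_far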