def print_running_avg(daily_cases, window_size):
    """Calls running_average() and prints the running averages and window size

    Parameters
    ----------
    daily_cases: list
        List of daily counts in a county
    window_size: int
        Size of window to use in calculation

    Prints
    ------
    running_avg: list of floats
        Running averages for daily counts
    window: int
        Size of the window used in calculation
    """
    try:
        running_avg, window = mu.running_average(daily_cases, window_size)
    except TypeError:
        running_avg, window = mu.running_average(daily_cases)
    print(*running_avg, sep='\n')
    print(window)
def test_running_avg(self):
    self.assertTrue(
        all(
            my_utils.running_average(np.array([0, 1, 2, 3, 4]), window=3)
            == np.array([0., 0.5, 1., 2., 3.])))
    # test with decreasing values
    self.assertTrue(
        all(
            my_utils.running_average(np.array([0, 9, 2, 6, 1]), window=3)
            == np.array([0., 4.5, 11. / 3., 17. / 3., 3.])))
    # test when window is larger than the length of the array
    self.assertTrue(
        all(
            my_utils.running_average(np.array([0, 1, 2, 3, 4]), window=10)
            == np.array([0., 0.5, 1., 1.5, 2.])))
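# A minimal sketch of a running_average() that is consistent with the
# expectations in the test above: a trailing window of means, truncated at the
# start of the array, returned as a NumPy array. This is only an assumed
# reference implementation; the real my_utils version may differ.
import numpy as np


def running_average_truncated_sketch(values, window=3):
    """Trailing running mean whose window is truncated at the start of the array."""
    values = np.asarray(values, dtype=float)
    out = np.empty_like(values)
    for i in range(len(values)):
        start = max(0, i - window + 1)  # cap the window at the array start
        out[i] = values[start:i + 1].mean()
    return out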
def test_running_avg(self):
    # simple test
    avgs, window = my_utils.running_average([4, 6, 3, 1, 8, 99],
                                            window_size=4)
    self.assertEqual(avgs, [3.5, 4.5, 27.75])

    # randomized test
    for i in range(1000):
        data_size = random.randint(100, 1000)
        data = array('i')
        for j in range(data_size):
            data.append(random.randint(1, 100))
        # random window within range
        test_window = data_size - random.randint(1, 100)
        avgs, window_size = my_utils.running_average(data, test_window)
        for j in range(len(avgs)):
            self.assertEqual(avgs[j], np.mean(data[j:j + test_window]))
        self.assertEqual(window_size, test_window)
def test_window_too_large(self):
    # simple test
    avgs, window = my_utils.running_average([4, 6, 3, 1, 8, 98],
                                            window_size=10)
    self.assertEqual(avgs, [20])
    self.assertEqual(window, 6)

    # randomized test
    for i in range(1000):
        data_size = random.randint(100, 1000)
        data = array('i')
        for j in range(data_size):
            data.append(random.randint(1, 100))
        # window bigger than data size
        test_window = data_size + random.randint(1, 100)
        avgs, window_size = my_utils.running_average(data, test_window)
        for j in range(len(avgs)):
            self.assertEqual(avgs[j], np.mean(data[j:j + data_size]))
        self.assertEqual(window_size, data_size)
def test_get_running_average_random_mode(self):
    for i in range(10):
        arr = []
        for j in range(100):
            x = random.randint(0, 10000)
            arr.append(x)
        for k in range(100):
            window = random.randint(1, 100)
            test_data, _ = mu.running_average(arr, window)
            for m in range(int(100 / window) - 1):
                expected_result = np.mean(arr[m:m + window])
                self.assertEqual(test_data[m], expected_result)
def test_window_negative(self):
    for i in range(1000):
        data_size = random.randint(100, 1000)
        data = array('i')
        for j in range(data_size):
            data.append(random.randint(1, 100))
        # window negative
        test_window = random.randint(-100, -1)
        avgs, window_size = my_utils.running_average(data, test_window)
        for j in range(len(avgs)):
            self.assertEqual(avgs[j], np.mean(data[j:j + 5]))
        self.assertEqual(window_size, 5)
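# A minimal sketch of a running_average() matching the contract exercised by
# the three tuple-returning tests above: a forward-looking window of means,
# the window clamped to the data length when it is too large, and a fall-back
# default of 5 when the requested window is not a positive integer. Assumed
# behaviour only; the real my_utils implementation may differ.
import numpy as np


def running_average_clamped_sketch(data, window_size=5):
    if not isinstance(window_size, int) or window_size < 1:
        window_size = 5                           # assumed fall-back default
    window_size = min(window_size, len(data))     # clamp an oversized window
    avgs = [float(np.mean(data[j:j + window_size]))
            for j in range(len(data) - window_size + 1)]
    return avgs, window_size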
def main():
    desc = 'Opens a file and extracts data from a specific column.'
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('--file',
                        dest='file_name',
                        type=str,
                        required=True,
                        help='Name of the file to be opened by the script.')
    parser.add_argument('--result_column',
                        dest='result_column',
                        default=4,
                        help='Column of file to be returned by the script. '
                             'Defaults to 4 and must correspond to an index '
                             'found in the file.')
    parser.add_argument('--county_column',
                        dest='county_column',
                        type=int,
                        required=True,
                        help='Column of file to be queried by the script.')
    parser.add_argument('--county',
                        dest='county',
                        type=str,
                        required=True,
                        help='Name of county to retrieve data from.')
    parser.add_argument('--return_daily_increment',
                        dest='return_daily_increment',
                        type=bool,
                        default=False,
                        help='Decides whether results are returned as '
                             'daily increments.')
    parser.add_argument('--return_running_average',
                        dest='return_running_average',
                        type=bool,
                        default=False,
                        help='Decides whether to return running averages '
                             'from results.')
    parser.add_argument('--running_avg_window_size',
                        dest='running_avg_window_size',
                        type=int,
                        default=5,
                        help='Determines the window size for the running '
                             'average.')
    parser.add_argument('--date_column',
                        dest='date_column',
                        type=int,
                        default=0,
                        help='Determines the date column.')
    args = parser.parse_args()

    print()
    print('Results:')
    results = []

    # result_column may be a single index or a comma-separated list of indices
    try:
        args.result_column = int(args.result_column)
    except ValueError:
        pass

    # only split on ',' if the value is still a string; checking an int would
    # raise a TypeError
    if isinstance(args.result_column, str) and ',' in args.result_column:
        result_array = []
        for result in args.result_column.split(','):
            result_array.append(str(result))
        args.result_column = result_array
        try:
            results = mu.get_columns(args.file_name,
                                     args.county_column,
                                     args.county,
                                     args.result_column,
                                     args.date_column)
        except ValueError:
            print('ValueError during get columns')
    else:
        try:
            results = mu.get_column(args.file_name,
                                    args.county_column,
                                    args.county,
                                    args.result_column,
                                    args.date_column)
        except ValueError:
            print('ValueError during get column')

    if args.return_daily_increment is True:
        try:
            results = mu.get_daily_count(
                get_cases(args.file_name,
                          args.county_column,
                          args.county,
                          args.result_column,
                          args.date_column))
        except ValueError:
            print('ValueError during get daily increment.')

    if args.return_running_average is True:
        try:
            results, _ = mu.running_average(
                results, window_size=args.running_avg_window_size)
        except ValueError:
            print('ValueError during running average')

    for result in results:
        print(result)
    print()
    print()
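# Example invocation (hypothetical script name, file path, and county name;
# assumes this script is saved as print_cases.py and my_utils is imported
# as mu):
#   python print_cases.py --file covid-19-data/us-counties.csv \
#       --county_column 1 --county Boulder --result_column 4 \
#       --return_daily_increment True --return_running_average True \
#       --running_avg_window_size 5
# Note: because the boolean flags use type=bool, argparse treats any
# non-empty string (including "False") as True; omit a flag to keep it False.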
def test_get_running_average_error_mode(self):
    with self.assertRaises(SystemExit) as cm:
        mu.running_average(None)
    self.assertEqual(cm.exception.code, 3)
def test_get_running_average(self):
    test_results = mu.running_average([1, 2, 1, 2])
    self.assertAlmostEqual(test_results[0], 1.5)
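# The two tests above assume a running_average() variant that exits with
# status code 3 on invalid input and otherwise returns a plain list of window
# means. A minimal sketch of that error-handling pattern (assumed, including
# the default window of 2; not the actual my_utils code):
import sys

import numpy as np


def running_average_with_exit(data, window=2):
    if data is None or len(data) == 0:
        print('running_average: input data is missing or empty')
        sys.exit(3)  # exit code checked by the unit test
    window = min(window, len(data))
    return [float(np.mean(data[j:j + window]))
            for j in range(len(data) - window + 1)]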
def main(): """ calculate the number of covid19 cases per capita\ for each county in a given State for a given date. Cases are per 100,000 people and rounded to 1 decimal Required Args: --------------- state: str Name of USA State (No abbreviations) query_date: str date in ISO format 'YYYY-MM-DD' Optional Args (have defaults): see argparser section ------------------------------------------- covid_file_name: str census_file_name: str daily_new: bool default=True running_avg: bool default=False window: int coviddata_county_column: int * cases_column: int * date_column: int * census_state_column: int * census_county_column: int * pop_column: int * Note: *= only needs to be changed if format of\ covid19 and census data files are changed Returns: --------- out_lists: list of [str, float] [county_name, county_caserate_at_date] """ # parse command line arguments parser = argparse.ArgumentParser(description='process args for \ reading covid data CSV file') parser.add_argument('--state', type=str, help='Name of the State', required=True) parser.add_argument('--query_date', type=str, help='date in ISO format "YY-MM-DD" ', required=True) parser.add_argument('--covid_file_name', type=str, help='Name of the input covid cases data file', default='covid-19-data/us-counties.csv') parser.add_argument('--census_file_name', type=str, help='Name of the input census data file', default='census-data/co-est2019-alldata.csv') parser.add_argument('--coviddata_county_column', type=int, help='column ind for county names in covid CSVfile', default=1) parser.add_argument('--cases_column', type=int, help='column ind for number of cases in covid CSVfile', default=4) parser.add_argument('--date_column', type=int, default=0, help='column ind for date in covid CSV file') parser.add_argument('--census_state_column', type=int, help='column ind for state names in census CSV file', default=5) parser.add_argument('--census_county_column', type=int, help='column ind for county names in census CSV file', default=6) parser.add_argument('--pop_column', type=int, help='column ind for populaiton in census CSV file', default=7) parser.add_argument('--daily_new', type=bool, default=True, help='daily newcases. 
False gives cumulativ cases') parser.add_argument('--running_avg', type=bool, default=False, help='running average of cases.\ default is False, window size is required') parser.add_argument('--window', type=int, default=5, help='Window size of running average') # parse arguments and store them in args args = parser.parse_args() # assign arguments coviddata_file_name = args.covid_file_name coviddata_county_column = args.coviddata_county_column cases_column = args.cases_column date_column = args.date_column daily_new = args.daily_new running_avg = args.running_avg window = args.window census_file_name = args.census_file_name census_state_column = args.census_state_column state = args.state census_county_column = args.census_county_column pop_column = args.pop_column query_date = date.fromisoformat(args.query_date) # make CSV file copy of only state covid-19-data if coviddata_file_name == 'covid-19-data/us-counties.csv': state_coviddata_file_name = 'covid-19-data/'+state+'-counties.csv' try: f1 = open(state_coviddata_file_name, 'r') f1.close() except FileNotFoundError: print('creating state_covidfile') state_coviddata_file_name = make_statefile(state) print(state_coviddata_file_name, 'state_coviddata_file_name') else: Warning('This script must be run on data within only \ one state or else has error if counties of \ the same name in different states across USA.\ if not using default args.covid_file_name, please\ check that county names are not duplicated.\ NOTE: Proceeding by assigning variable\ state_coviddata_file_name = args.covid_file_name ;\ Watch out for errors from this issue.') state_coviddata_file_name = args.covid_file_name # get state county names and population data from census file census_state_data = get_column(census_file_name, census_state_column, state, result_columns=[census_county_column, pop_column], date_column=None) county_pop_list = census_state_data[1][1:] # census file has names as "countyname + County", so rm " County" county_names_list_withcounty = census_state_data[0][1:] county_names_list = [] for c in range(len(county_names_list_withcounty)): county_names_list.append(county_names_list_withcounty[c][:-7]) # make hashtable of (key-county_name, value= county_pop) N = 260 # hashtable size. 
Max number counties in a State is Texas with 254 census_hashtable = [[] for i in range(N)] for c in range(len(county_names_list)): hash_table.put(census_hashtable, N, county_names_list[c], county_pop_list[c], method='rolling') # daily cases option and running avg cases option if daily_new is True: from my_utils import get_daily_count if running_avg is True: from my_utils import running_average # Loop through each county in state out_lists = [] for c in range(len(county_names_list)): county_cases_data_cumulative = get_column(state_coviddata_file_name, coviddata_county_column, county_names_list[c], result_columns=[cases_column], date_column=date_column, return_dates=True) # dates are stored in last index of list, in datetime format dates = county_cases_data_cumulative[-1] # convert cases from type str to int county_cases = list(map(int, county_cases_data_cumulative[0])) # daily cases option and running avg options if daily_new is True: county_cases = get_daily_count(county_cases) if running_avg is True: county_cases = running_average(county_cases, window) # binary search for county cases at date county_cases_at_date = binary_search(query_date, [dates, county_cases]) # case rate per 100,000 people if county_cases_at_date is not None: county_caserate_at_date = county_cases_at_date * 100000 \ / int(hash_table.get(census_hashtable, N, county_names_list[c], method='rolling')) out_lists.append([county_names_list[c], round(county_caserate_at_date, 1)]) print(out_lists) return out_lists
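# A minimal sketch of the hash_table.put()/get() interface used above,
# assuming open chaining in a list of N bucket lists and a simple rolling
# polynomial hash of the key string; the real hash_table module may differ
# in its hashing and collision-handling details.
def _rolling_hash(key, N):
    h = 0
    for ch in key:
        h = (h * 31 + ord(ch)) % N  # polynomial rolling hash mod table size
    return h


def put_sketch(table, N, key, value, method='rolling'):
    bucket = table[_rolling_hash(key, N)]
    bucket.append((key, value))       # chain collisions within the bucket


def get_sketch(table, N, key, method='rolling'):
    for k, v in table[_rolling_hash(key, N)]:
        if k == key:
            return v
    return None                       # key not found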
                      county_column, county,
                      result_columns=[cases_column],
                      date_column=date_column)

    # convert cases from type str to int
    cases = list(map(int, cases[0]))

    # print daily cases option
    if print_daily is True:
        from my_utils import get_daily_count
        day_cases = get_daily_count(cases)

    # print running average cases option
    if print_running_avg is True:
        from my_utils import running_average
        running_avg_cases = running_average(day_cases, window)

    # print outputs (print one value per line)
    print('cumulative cases by each date:')
    for c in range(0, len(cases)):
        print(cases[c])
    if print_daily is True:
        print('daily cases:')
        for c in range(0, len(day_cases)):
            print(day_cases[c])
    if print_running_avg is True:
        print('running average cases, window = ' + str(window) + " :")
        for c in range(0, len(running_avg_cases)):
            print(running_avg_cases[c])
def main(): """ get Covid19 case data and census data and convert to per-capita rates data are from two different files. Per Capita Rates are per 100,000 people Required Args: --------------- state: str Name of USA State (No abbreviations) coviddata_countys_list: list of str Optional Args (have defaults): see argparser section ------------------------------------------- data_out_file: str name of CSV file if want one to be made. or '[]' covid_file_name: str census_file_name: str daily_new: bool default=True running_avg: bool default=False running_sum: bool default=False window: int coviddata_county_column: int * cases_column: int * date_column: int * census_state_column: int * census_county_column: int * pop_column: int * Note: *= only needs to be changed if format of covid19 and census data files are changed Returns: --------- out_data : list of lists of lists: [census_countys_list, [[dates for c1],[dates for c2],..], [per_capita_rates c1],[per_capita_rates c2],...] Where: ------ per_capita_rates: list list of cases / population (these are per 100,000 people) dates: list list of dates in format datetime.date(YYYY, MM, D) """ # parse command line arguments parser = argparse.ArgumentParser(description='process args for \ reading covid data CSV file') parser.add_argument('--state', type=str, help='Name of the State', required=True) parser.add_argument('--coviddata_countys_list', type=str, nargs='+', help='list of strings for \ Name(s) of the county(s) in covid CSV file \ that we want to look at', required=True) parser.add_argument('--data_out_file', type=str, help='Name of the CSV file to write this data \ out to. If not wanted, is "[]", which\ is coded to not return any data_out_file', default='[]') parser.add_argument('--covid_file_name', type=str, help='Name of the input covid cases data file', default='covid-19-data/us-counties.csv') parser.add_argument('--census_file_name', type=str, help='Name of the input census data file', default='census-data/co-est2019-alldata.csv') parser.add_argument('--coviddata_county_column', type=int, help='column ind for county names in covid CSVfile', default=1) parser.add_argument('--cases_column', type=int, help='column ind for number of cases in covid CSVfile', default=4) parser.add_argument('--date_column', type=int, default=0, help='column ind for date in covid CSV file') parser.add_argument('--census_state_column', type=int, help='column ind for state names in census CSV file', default=5) parser.add_argument('--census_county_column', type=int, help='column ind for county names in census CSV file', default=6) parser.add_argument('--pop_column', type=int, help='column ind for populaiton in census CSV file', default=7) parser.add_argument('--daily_new', type=bool, default=True, help='daily newcases. 
default is cumulativ dailycases') parser.add_argument('--running_avg', type=bool, default=False, help='running average of cases.\ default is False, window size is required') parser.add_argument('--running_sum', type=bool, default=False, help='running sum of cases over a window.\ default is False, window size is required.\ cannot be switched on at same \ time as running_avg') parser.add_argument('--window', type=int, default=5, help='Window size of running average or running sum') # parse arguments and store them in args args = parser.parse_args() # assign arguments state = args.state coviddata_countys_list = [ i.replace('-', ' ') for i in args.coviddata_countys_list ] data_out_file = args.data_out_file coviddata_file_name = args.covid_file_name coviddata_county_column = args.coviddata_county_column cases_column = args.cases_column date_column = args.date_column daily_new = args.daily_new running_avg = args.running_avg running_summation = args.running_sum window = args.window census_file_name = args.census_file_name census_state_column = args.census_state_column census_county_column = args.census_county_column pop_column = args.pop_column # make CSV file copy of only state covid-19-data if coviddata_file_name == 'covid-19-data/us-counties.csv': state_coviddata_file_name = 'covid-19-data/' + state + '-counties.csv' try: f1 = open(state_coviddata_file_name, 'r') f1.close() except FileNotFoundError: print('creating state_covidfile') state_coviddata_file_name = make_statefile(state) print(state_coviddata_file_name, 'state_coviddata_file_name') elif coviddata_file_name == 'covid-19-data/' + state + '-counties.csv': state_coviddata_file_name = coviddata_file_name else: Warning('This script must be run on data within only \ one state or else has error if counties of \ the same name in different states across USA.\ if not using default args.covid_file_name, please\ check that county names are not duplicated.\ NOTE: Proceeding by assigning variable\ state_coviddata_file_name = args.covid_file_name ;\ Watch out for errors from this issue.') state_coviddata_file_name = args.covid_file_name # get census data for all counties in the state census_state_data = get_column( census_file_name, census_state_column, state, result_columns=[census_county_column, pop_column], date_column=None) # sort census_state_data by county name # census_state_data is of list [[county_names], [census2010pops]) sorted_pairs = sorted(zip(census_state_data[0], census_state_data[1])) tuples = zip(*sorted_pairs) list1, list2 = [list(tuple) for tuple in tuples] census_state_data_sorted = [list1, list2] # pre-allocate structure of out_data list of lists of lists # out_data[0] will be coviddata_countys_list # out_data[1] will be list of dates for each county # out_data[2] will be list of per_capita_rates for each county out_data = [[], [], []] # run for each county for county_index in range(0, len(coviddata_countys_list)): coviddata_county_name = coviddata_countys_list[county_index] out_data[0].append(coviddata_county_name) # run get_column() on covid data and census data cases_data_cumulative = get_column(state_coviddata_file_name, coviddata_county_column, coviddata_county_name, result_columns=[cases_column], date_column=date_column, return_dates=True) # convert cases from type str to int cases_data_cumulative[0] = list(map(int, cases_data_cumulative[0])) # dates are stored in last index of list, in datetime format dates = cases_data_cumulative[-1] # daily cases option if daily_new is True: from my_utils import get_daily_count cases 
= get_daily_count(cases_data_cumulative[0]) else: cases = cases_data_cumulative[0] # print running average OR running sum cases option OR neither if running_avg is True: from my_utils import running_average cases = running_average(cases, window) elif running_summation is True: from my_utils import running_sum cases = running_sum(cases, window) # use binary search to get county pop census data out of state data census_county_name = coviddata_county_name + ' County' county_pop = binary_search(census_county_name, census_state_data_sorted) # raise error if county census not found if county_pop is None: ValueError print('county census not found') sys.exit(1) county_pop = int(county_pop) # convert cases to per-capita rates by dividing county case by pop if type(cases) == list: cases = np.asarray(cases) per_capita_rates = np.round(cases / county_pop * 100000, 2) # convert per_capita_rates back from nparray to list per_capita_rates = per_capita_rates.tolist() # append to out_data lists out_data[1].append([dates]) out_data[2].append([per_capita_rates]) # write out_data to a CSV file in format 'County','date','per_capita_rate' if data_out_file != '[]': fout = open(data_out_file, 'w') fout.write("county,date,per_capita_rate \n") for county_index in range(0, len(out_data[0])): for date_ind in range(0, len(out_data[1][county_index][0])): fout.write(out_data[0][county_index] + ',' + str(out_data[1][county_index][0][date_ind]) + ',' + str(out_data[2][county_index][0][date_ind]) + '\n') fout.close() return out_data
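# Example invocation (hypothetical script name and county choices; assumes the
# script is saved as per_capita_rates.py and the default covid-19-data and
# census-data paths exist):
#   python per_capita_rates.py --state Colorado \
#       --coviddata_countys_list Boulder Denver El-Paso \
#       --data_out_file colorado_rates.csv
# A '-' in a multi-word county name is replaced with a space by the script
# before it is looked up in the covid data file.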
def main(): """ get Covid19 case data and census data and convert to per-capita rates data are from two different files Returns: --------- per_capita_rates: list list of cases / population dates: list list of dates in format datetime.date(YYYY, MM, D) """ # TODO: add main def docstring # parse command line arguments parser = argparse.ArgumentParser(description='process args for \ reading covid data CSV file') parser.add_argument('--covid_file_name', type=str, help='Name of the input covid cases data file', required=True) parser.add_argument('--census_file_name', type=str, help='Name of the input census data file', required=True) parser.add_argument('--plot_file_name', type=str, help='output plot file generated', required=True) parser.add_argument('--state', type=str, help='Name of the State', required=True) parser.add_argument('--coviddata_county', type=str, help='Name of the county in covid CSV file', required=True) parser.add_argument('--census_county', type=str, help='Name of the county in census CSV file', required=True) parser.add_argument('--coviddata_county_column', type=int, help='column ind for county names in covid CSVfile') parser.add_argument('--cases_column', type=int, help='column ind for number of cases in covid CSVfile') parser.add_argument('--date_column', type=int, default=0, help='column ind for date in covid CSV file') parser.add_argument('--census_state_column', type=int, help='column ind for state names in census CSV file') parser.add_argument('--census_county_column', type=int, help='column ind for county names in census CSV file') parser.add_argument('--pop_column', type=int, help='column ind for populaiton in census CSV file') parser.add_argument('--daily_new', type=bool, default=False, help='daily newcases. default is cumulativ dailycases') parser.add_argument('--running_avg', type=bool, default=False, help='running average of cases.\ default is False, window size is required') parser.add_argument('--window', type=int, default=5, help='Window size of running average') # parse arguments and store them in args args = parser.parse_args() # assign arguments coviddata_file_name = args.covid_file_name coviddata_county_column = args.coviddata_county_column plot_file_name = args.plot_file_name coviddata_county_name = args.coviddata_county cases_column = args.cases_column date_column = args.date_column daily_new = args.daily_new running_avg = args.running_avg window = args.window census_file_name = args.census_file_name census_state_column = args.census_state_column state = args.state census_county_name = args.census_county census_county_column = args.census_county_column pop_column = args.pop_column # run get_column() on covid data and census data cases_data_cumulative = get_column(coviddata_file_name, coviddata_county_column, coviddata_county_name, result_columns=[cases_column], date_column=date_column, return_dates=True) census_state_data = get_column( census_file_name, census_state_column, state, result_columns=[census_county_column, pop_column], date_column=None) # convert cases from type str to int cases_data_cumulative[0] = list(map(int, cases_data_cumulative[0])) # dates are stored in last index of list, in datetime format dates = cases_data_cumulative[-1] # daily cases option if daily_new is True: from my_utils import get_daily_count cases = get_daily_count(cases_data_cumulative[0]) # not dates column else: cases = cases_data_cumulative[0] # print runing average cases option if running_avg is True: from my_utils import running_average cases = 
running_average(cases, window) # census_state_data is of list [[county_names], [census2010pops]) # sort census_state_data by county name sorted_pairs = sorted(zip(census_state_data[0], census_state_data[1])) tuples = zip(*sorted_pairs) list1, list2 = [list(tuple) for tuple in tuples] census_state_data_sorted = [list1, list2] # use binary search to get county pop census data out of state data county_pop = binary_search(census_county_name, census_state_data_sorted) # raise error if county census not found if county_pop is None: ValueError print('county census not found') sys.exit(1) county_pop = int(county_pop) # convert cases to per-capita rates by dividing county case by population if type(cases) == list: cases = np.asarray(cases) per_capita_rates = cases / county_pop # convert per_capita_rates back from nparray to list per_capita_rates = per_capita_rates.tolist() # plot using plot_lines plot_points = [[]] for point in range(0, len(per_capita_rates)): plot_points[0].append([dates[point], per_capita_rates[point]]) plot_labels = ['dates', 'per_capita_rates'] plot = plot_lines(plot_points, plot_labels, plot_file_name) return plot # NOTE: idk if this line is needed?
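# A minimal sketch of the binary_search() call pattern used above: the second
# argument is a pair [sorted_keys, values] and the value matching the query
# key (or None) is returned. Assumed behaviour only; the real implementation
# may differ.
def binary_search_sketch(query, data):
    keys, values = data[0], data[1]
    lo, hi = 0, len(keys) - 1
    while lo <= hi:
        mid = (lo + hi) // 2
        if keys[mid] == query:
            return values[mid]      # value paired with the matching key
        if keys[mid] < query:
            lo = mid + 1
        else:
            hi = mid - 1
    return None                     # key not found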