def test_ok(self, mock_sheet):
    with tempfile.TemporaryDirectory() as tmpdirname:
        txt_config = f"""
[DEFAULT]
[FILES]
DATA = {tmpdirname}/covid-19.data.prod.TIMESTAMP.csv
ERRORS = {tmpdirname}
LATEST = {tmpdirname}
"""
        config = configparser.ConfigParser()
        config.read_string(txt_config)
        mapping = defaultdict(FakeResult)
        cur_dir = pathlib.Path(__file__).parent.absolute()
        geocoder = csv_geocoder.CSVGeocoder(
            os.path.join(cur_dir, "geocoding", "geo_admin.tsv"), mapping.get)
        mock_sheet.name = "sheet-name"
        mock_sheet.base_id = "000"
        mock_sheet.read_values = MagicMock(side_effect=[
            # Only one read is done in this test.
            # The first row has column names only, deliberately in a random
            # order so we check that the code doesn't depend on column ordering.
            # The second row has the data.
            [
                ["age", "sex", "city", "province", "country",
                 "aggregated_num_cases", "date_onset_symptoms",
                 "date_admission_hospital", "date_confirmation", "symptoms",
                 "lives_in_Wuhan", "travel_history_dates",
                 "travel_history_location", "reported_market_exposure",
                 "additional_information", "chronic_disease_binary",
                 "chronic_disease", "source", "sequence_available", "outcome",
                 "date_death_or_discharge", "notes_for_discussion",
                 "data_moderator_initials", "travel_history_binary",
                 "longitude", "latitude", "admin1", "location", "admin3",
                 "admin2", "geo_resolution", "country_new", "admin_id"],
                ["24", "male", "cit", "pro", "coun", "4", "20.04.2020",
                 "21.04.2020", "20.04.2020", "symp", "no", "", "", "",
                 "handsome guy", "0", "", "fake", "0", "discharged",
                 "25.04.2020", "", "TF", "0", "14.15", "16.17", "admin1",
                 "loc", "admin3", "admin2", "point", "France", "42"],
            ],
        ])
        processor = SheetProcessor([mock_sheet], geocoder, config)
        processor.process()
def setUp(self):
    mapping = {
        "some, where, far": FakeResult(ok=True, lat=42.42, lng=43.43),
        "some, where, toofar": FakeResult(ok=False, lat=0, lng=0),
    }
    cur_dir = pathlib.Path(__file__).parent.absolute()
    self.geocoder = csv_geocoder.CSVGeocoder(
        os.path.join(cur_dir, "geo_admin.tsv"), mapping.get)
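# Both tests above exercise CSVGeocoder with a FakeResult test double: one
# builds a defaultdict(FakeResult), the other a plain dict, and passes its
# .get as the fallback geocode function. A minimal sketch of such a double,
# assuming it only needs the fields the tests read (ok, lat, lng); this is a
# hypothetical definition, not necessarily the one used in the repo.
from collections import namedtuple

FakeResult = namedtuple("FakeResult", ["ok", "lat", "lng"],
                        defaults=(False, 0.0, 0.0))
# With defaults on every field, defaultdict(FakeResult) can produce an "empty"
# result for any location key missing from the mapping.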
def main():
    args = parser.parse_args()
    config = configparser.ConfigParser()
    config.optionxform = str  # to preserve case
    config.read(args.config_file)
    logging.basicConfig(
        format='%(asctime)s %(filename)s:%(lineno)d %(message)s',
        filename='cleanup.log',
        filemode="w",
        level=logging.INFO)
    geocoder = csv_geocoder.CSVGeocoder(config['GEOCODING'].get('TSV_PATH'))
    sheets = get_GoogleSheets(config)
    processor = SheetProcessor(sheets, geocoder, config)
    processor.process()
    if args.push_to_git:
        processor.push_to_github()
def main():
    args = parser.parse_args()
    config = configparser.ConfigParser()
    config.optionxform = str  # to preserve case
    config.read(args.config_file)
    logging.basicConfig(
        format='%(asctime)s %(filename)s:%(lineno)d %(message)s',
        filename='cleanup.log',
        filemode="w",
        level=logging.INFO)
    sheets = get_GoogleSheets(config)
    # Load the geocoder early so that invalid tsv path errors are caught early on.
    geocoder = csv_geocoder.CSVGeocoder(
        config['GEOCODING'].get('TSV_PATH'), arcgis)
    sp = sheet_processor.SheetProcessor(sheets, geocoder, config)
    sp.process()
    if args.push_to_git:
        sp.push_to_github()
def main():
    cur_dir = pathlib.Path(__file__).parent.absolute()
    geocode_path = os.path.join(cur_dir, "geocoding", "geo_admin.tsv")
    args = parser.parse_args()
    config = configparser.ConfigParser()
    config.optionxform = str  # to preserve case
    config.read(args.config_file)
    logging.basicConfig(
        format='%(asctime)s %(filename)s:%(lineno)d %(message)s',
        filename='cleanup.log',
        filemode="w",
        level=logging.INFO)
    # Increase the default timeout, 60s isn't enough for big sheets.
    socket.setdefaulttimeout(600)
    sheets = get_GoogleSheets(config, args.creds_file, args.is_service_account)
    # Load the geocoder early so that invalid tsv path errors are caught early on.
    geocoder = csv_geocoder.CSVGeocoder(geocode_path, arcgis)
    git_repo_path = os.path.join(cur_dir, "..", "..")
    sp = sheet_processor.SheetProcessor(sheets, geocoder, git_repo_path)
    sp.process()
    if args.push_to_git:
        sp.push_to_github()
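# Each main() variant above reads a module-level argparse parser defined
# elsewhere in the script. A minimal sketch of what that parser could look
# like, with flag names and defaults inferred from the attributes the code
# uses (config_file, push_to_git, sleep_time_sec, creds_file,
# is_service_account); treat all of these names and defaults as assumptions
# rather than the repo's actual definitions.
import argparse

parser = argparse.ArgumentParser(
    description="Clean the case spreadsheets and write the merged CSV files.")
parser.add_argument("-c", "--config_file", default="CONFIG",
                    help="Path to the config file with FILES/GEOCODING/GIT sections.")
parser.add_argument("-p", "--push_to_git", action="store_true",
                    help="Push the generated files to GitHub when done.")
parser.add_argument("--sleep_time_sec", type=int, default=5,
                    help="Seconds to sleep between Sheets API calls.")
parser.add_argument("--creds_file", default="credentials.json",
                    help="Path to the Google Sheets credentials file.")
parser.add_argument("--is_service_account", action="store_true",
                    help="Whether creds_file is a service account key.")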
def main():
    args = parser.parse_args()
    config = configparser.ConfigParser()
    config.read(args.config_file)
    logging.basicConfig(filename='cleanup.log', level=logging.INFO)
    sheets = get_GoogleSheets(config)
    for_github = []

    # Load the geocoder early so that invalid tsv path errors are caught early on.
    geocoder = csv_geocoder.CSVGeocoder(config['GEOCODING'].get('TSV_PATH'))

    for s in sheets:
        logging.info("Processing sheet %s", s.name)
        insert_ids(s, config)
        time.sleep(args.sleep_time_sec)

        ### Clean Private Sheet Entries. ###
        # Note: the private sheet gets updated on the fly and redownloaded to
        # ensure continuity between fixes (granted, it's slower).
        range_ = f'{s.name}!A:AG'
        values = read_values(s.spreadsheetid, range_, config)
        columns = s.columns
        # A1 notation per column, computed now to ensure proper order.
        column_dict = {c: index2A1(i) for i, c in enumerate(columns)}
        data = values2dataframe(values)

        # Trailing spaces.
        trailing = get_trailing_spaces(data)
        if len(trailing) > 0:
            logging.info('fixing %d trailing whitespace', len(trailing))
            fix_cells(s.spreadsheetid, s.name, trailing, column_dict, config)
            values = read_values(s.spreadsheetid, range_, config)
            data = values2dataframe(values)
        time.sleep(args.sleep_time_sec)

        # Fix N/A => NA.
        na_errors = get_NA_errors(data)
        if len(na_errors) > 0:
            logging.info('fixing %d N/A -> NA', len(na_errors))
            fix_cells(s.spreadsheetid, s.name, na_errors, column_dict, config)
            values = read_values(s.spreadsheetid, range_, config)
            data = values2dataframe(values)
        time.sleep(args.sleep_time_sec)

        # Regex fixes.
        fixable, non_fixable = generate_error_tables(data)
        if len(fixable) > 0:
            logging.info('fixing %d regexps', len(fixable))
            fix_cells(s.spreadsheetid, s.name, fixable, column_dict, config)
            values = read_values(s.spreadsheetid, range_, config)
            data = values2dataframe(values)
        time.sleep(args.sleep_time_sec)

        clean = data[~data.ID.isin(non_fixable.ID)]
        clean = clean.drop('row', axis=1)
        clean = clean.sort_values(by='ID')
        s.data = clean
        non_fixable = non_fixable.sort_values(by='ID')

        # Save error reports, one per sheet.
        logging.info('Saving error reports')
        directory = config['FILES']['ERRORS']
        file_name = f'{s.name}.error-report.csv'
        error_file = os.path.join(directory, file_name)
        non_fixable.to_csv(error_file, index=False, header=True)
        for_github.append(error_file)

    # Combine data from all sheets into a single datafile.
    all_data = []
    for s in sheets:
        data = s.data
        if s.name == 'outside_Hubei':
            data['ID'] = data['ID'].apply(lambda x: f'000-1-{x}')
        elif s.name == 'Hubei':
            data['ID'] = data['ID'].apply(lambda x: f'000-2-{x}')
        all_data.append(data)
    all_data = pd.concat(all_data, ignore_index=True)
    all_data = all_data.sort_values(by='ID')

    # Fill geo columns.
    geocode_matched = 0
    for i, row in all_data.iterrows():
        geocode = geocoder.Geocode(row.city, row.province, row.country)
        if not geocode:
            continue
        geocode_matched += 1
        all_data.at[i, 'latitude'] = geocode.lat
        all_data.at[i, 'longitude'] = geocode.lng
        all_data.at[i, 'geo_resolution'] = geocode.geo_resolution
        all_data.at[i, 'location'] = geocode.location
        all_data.at[i, 'admin3'] = geocode.admin3
        all_data.at[i, 'admin2'] = geocode.admin2
        all_data.at[i, 'admin1'] = geocode.admin1
        all_data.at[i, 'admin_id'] = geocode.admin_id
        all_data.at[i, 'country_new'] = geocode.country_new
    logging.info("Geocode matched %d/%d", geocode_matched, len(all_data))

    # Reorganize csv columns so that they are in the same order as when we
    # used to have those geolocation columns within the spreadsheet.
    # This is to avoid breaking latestdata.csv consumers.
    all_data = all_data[[
        "ID", "age", "sex", "city", "province", "country", "latitude",
        "longitude", "geo_resolution", "date_onset_symptoms",
        "date_admission_hospital", "date_confirmation", "symptoms",
        "lives_in_Wuhan", "travel_history_dates", "travel_history_location",
        "reported_market_exposure", "additional_information",
        "chronic_disease_binary", "chronic_disease", "source",
        "sequence_available", "outcome", "date_death_or_discharge",
        "notes_for_discussion", "location", "admin3", "admin2", "admin1",
        "country_new", "admin_id", "data_moderator_initials",
        "travel_history_binary"]]

    # drop_invalid_ids = []
    # for i, row in all_data.iterrows():
    #     if row['ID'].str.match('

    # Save to disk.
    logging.info("Saving files to disk")
    dt = datetime.now().strftime('%Y-%m-%dT%H%M%S')
    file_name = config['FILES']['DATA'].replace('TIMESTAMP', dt)
    latest_name = os.path.join(config['FILES']['LATEST'], 'latestdata.csv')
    all_data.to_csv(file_name, index=False)
    all_data.to_csv(latest_name, index=False)
    logging.info("Wrote %s, %s", file_name, latest_name)

    if args.push_to_git:
        logging.info("Pushing to github")
        # Create a shell script for uploading to github.
        for_github.extend([file_name, latest_name])
        script = 'set -e\n'
        script += 'cd {}\n'.format(config['GIT']['REPO'])
        script += 'git pull origin master\n'
        for g in for_github:
            script += f'git add {g}\n'
        script += 'git commit -m "data update"\n'
        script += 'git push origin master\n'
        script += f'cd {os.getcwd()}\n'
        print(script)
        os.system(script)
def setUp(self):
    cur_dir = pathlib.Path(__file__).parent.absolute()
    self.geocoder = csv_geocoder.CSVGeocoder(
        os.path.join(cur_dir, "geo_admin.tsv"))
def main():
    args = parser.parse_args()
    config = configparser.ConfigParser()
    config.optionxform = str  # to preserve case
    config.read(args.config_file)
    logging.basicConfig(
        format='%(asctime)s %(filename)s:%(lineno)d %(message)s',
        filename='cleanup.log',
        filemode="w",
        level=logging.INFO)
    sheets = get_GoogleSheets(config)
    for_github = []

    # Load the geocoder early so that invalid tsv path errors are caught early on.
    geocoder = csv_geocoder.CSVGeocoder(
        config['GEOCODING'].get('TSV_PATH'), arcgis)

    for s in sheets:
        logging.info("Processing sheet %s", s.name)

        ### Clean Private Sheet Entries. ###
        # Note: the private sheet gets updated on the fly and redownloaded to
        # ensure continuity between fixes (granted, it's slower).
        range_ = f'{s.name}!A:AG'
        data = values2dataframe(s.read_values(range_))

        # Expand aggregated cases into one row each.
        logging.info("Rows before expansion: %d", len(data))
        if len(data) > 150000:
            logging.warning(
                "Sheet %s has more than 150K rows, it should be split soon",
                s.name)
        data.aggregated_num_cases = pd.to_numeric(
            data.aggregated_num_cases, errors='coerce')
        data = duplicate_rows_per_column(data, "aggregated_num_cases")
        logging.info("Rows after expansion: %d", len(data))

        # Generate IDs for each row sequentially, following the
        # sheet_id-inc_int pattern.
        data['ID'] = s.base_id + "-" + pd.Series(range(1, len(data) + 1)).astype(str)

        # Remove whitespace.
        data = trim_df(data)

        # Fix columns that can be fixed easily.
        data.sex = fix_sex(data.sex)

        # Fix N/A => NA.
        for col in data.select_dtypes("string"):
            data[col] = fix_na(data[col])

        # Regex fixes.
        fixable, non_fixable = generate_error_tables(data)
        if len(fixable) > 0:
            logging.info('fixing %d regexps', len(fixable))
            s.fix_cells(fixable)
            data = values2dataframe(s.read_values(range_))

        # ~ negates: clean keeps the rows whose ID is not in the non_fixable IDs.
        clean = data[~data.ID.isin(non_fixable.ID)]
        clean = clean.drop('row', axis=1)
        clean = clean.sort_values(by='ID')
        s.data = clean
        non_fixable = non_fixable.sort_values(by='ID')

        # Save error reports, one per sheet.
        logging.info('Saving error reports')
        directory = config['FILES']['ERRORS']
        file_name = f'{s.name}.error-report.csv'
        error_file = os.path.join(directory, file_name)
        non_fixable.to_csv(error_file, index=False, header=True, encoding="utf-8")
        for_github.append(error_file)

    # Combine data from all sheets into a single datafile.
    all_data = []
    for s in sheets:
        logging.info("sheet %s had %d rows", s.name, len(s.data))
        all_data.append(s.data)
    all_data = pd.concat(all_data, ignore_index=True)
    all_data = all_data.sort_values(by='ID')
    logging.info("all_data has %d rows", len(all_data))

    # Fill geo columns.
    geocode_matched = 0
    for i, row in all_data.iterrows():
        geocode = geocoder.geocode(row.city, row.province, row.country)
        if not geocode:
            continue
        geocode_matched += 1
        all_data.at[i, 'latitude'] = geocode.lat
        all_data.at[i, 'longitude'] = geocode.lng
        all_data.at[i, 'geo_resolution'] = geocode.geo_resolution
        all_data.at[i, 'location'] = geocode.location
        all_data.at[i, 'admin3'] = geocode.admin3
        all_data.at[i, 'admin2'] = geocode.admin2
        all_data.at[i, 'admin1'] = geocode.admin1
        all_data.at[i, 'admin_id'] = geocode.admin_id
        all_data.at[i, 'country_new'] = geocode.country_new
    logging.info("Geocode matched %d/%d", geocode_matched, len(all_data))
    logging.info("Top 10 geocode misses: %s", geocoder.misses.most_common(10))
    with open("geocode_misses.csv", "w") as f:
        geocoder.write_misses_to_csv(f)
        logging.info("Wrote all geocode misses to geocode_misses.csv")
    if len(geocoder.new_geocodes) > 0:
        logging.info("Appending new geocodes to geo_admin.tsv")
        with open(config['GEOCODING'].get('TSV_PATH'), "a") as f:
            geocoder.append_new_geocodes_to_init_file(f)
        for_github.append(config['GEOCODING'].get('TSV_PATH'))

    # Reorganize csv columns so that they are in the same order as when we
    # used to have those geolocation columns within the spreadsheet.
    # This is to avoid breaking latestdata.csv consumers.
    all_data = all_data[[
        "ID", "age", "sex", "city", "province", "country", "latitude",
        "longitude", "geo_resolution", "date_onset_symptoms",
        "date_admission_hospital", "date_confirmation", "symptoms",
        "lives_in_Wuhan", "travel_history_dates", "travel_history_location",
        "reported_market_exposure", "additional_information",
        "chronic_disease_binary", "chronic_disease", "source",
        "sequence_available", "outcome", "date_death_or_discharge",
        "notes_for_discussion", "location", "admin3", "admin2", "admin1",
        "country_new", "admin_id", "data_moderator_initials",
        "travel_history_binary"]]

    # Ensure the new data has at least as many rows as the previous run.
    latest_name = os.path.join(config['FILES']['LATEST'], 'latestdata.csv')
    line_diff = len(all_data) - len(pd.read_csv(latest_name))
    if line_diff >= 0:
        logging.info("Line check passed, %d new lines", line_diff)
    else:
        logging.info("Line check failed")
        return

    # Save to disk.
    logging.info("Saving files to disk")
    dt = datetime.now().strftime('%Y-%m-%dT%H%M%S')
    file_name = config['FILES']['DATA'].replace('TIMESTAMP', dt)
    all_data.to_csv(file_name, index=False, encoding="utf-8")
    all_data.to_csv(latest_name, index=False, encoding="utf-8")
    logging.info("Wrote %s, %s", file_name, latest_name)

    if args.push_to_git:
        logging.info("Pushing to github")
        # Create a shell script for uploading to github.
        for_github.extend([file_name, latest_name])
        script = 'set -e\n'
        script += 'cd {}\n'.format(config['GIT']['REPO'])
        script += 'git pull origin master\n'
        for g in for_github:
            script += f'git add {g}\n'
        script += 'git commit -m "data update"\n'
        script += 'git push origin master\n'
        script += f'cd {os.getcwd()}\n'
        print(script)
        os.system(script)
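# The last main() above expands aggregated case rows via
# duplicate_rows_per_column(data, "aggregated_num_cases"), which is defined
# elsewhere in the repo. A possible implementation sketch, under the
# assumption that a row with count N should simply be repeated N times and
# that rows with a missing or non-positive count are kept as a single row;
# this is an illustration, not the repo's actual helper.
import pandas as pd


def duplicate_rows_per_column(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Repeat each row as many times as the integer count held in `column`."""
    counts = (pd.to_numeric(df[column], errors="coerce")
              .fillna(1)
              .clip(lower=1)
              .astype(int))
    # Index.repeat() yields each label counts[i] times, so .loc materializes
    # the duplicated rows; reset_index keeps the result positionally indexed.
    return df.loc[df.index.repeat(counts)].reset_index(drop=True)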