def test_ok(self, mock_sheet):
        with tempfile.TemporaryDirectory() as tmpdirname:
            txt_config = f"""
            [DEFAULT]

            [FILES]
            DATA = {tmpdirname}/covid-19.data.prod.TIMESTAMP.csv
            ERRORS = {tmpdirname}
            LATEST = {tmpdirname}
            """
            config = configparser.ConfigParser()
            config.read_string(txt_config)

            mapping = defaultdict(FakeResult)

            cur_dir = pathlib.Path(__file__).parent.absolute()
            geocoder = csv_geocoder.CSVGeocoder(os.path.join(cur_dir, "geocoding", "geo_admin.tsv"), mapping.get)
            
            mock_sheet.name = "sheet-name"
            mock_sheet.base_id = "000"
            mock_sheet.read_values = MagicMock(side_effect=[
                # Only one read is done in this test.
                # First row has the column names only, deliberately in a random order so that we check that the code doesn't depend on column ordering.
                # Second row has the data.
                [
                    ["age","sex","city","province","country","aggregated_num_cases","date_onset_symptoms","date_admission_hospital","date_confirmation","symptoms","lives_in_Wuhan","travel_history_dates","travel_history_location","reported_market_exposure","additional_information","chronic_disease_binary","chronic_disease","source","sequence_available","outcome","date_death_or_discharge","notes_for_discussion","data_moderator_initials","travel_history_binary", 'longitude', 'latitude', 'admin1', 'location', 'admin3', 'admin2', 'geo_resolution', 'country_new', 'admin_id'],
                    ["24", "male","cit","pro","coun","4","20.04.2020","21.04.2020","20.04.2020","symp","no","","","","handsome guy","0","","fake","0","discharged","25.04.2020","","TF","0", '14.15', '16.17', 'admin1', 'loc', 'admin3', 'admin2', 'point', 'France', '42'],
                ],
            ])
            processor = SheetProcessor([mock_sheet], geocoder, config)
            processor.process()
Example #2
def setUp(self):
    mapping = {
        "some, where, far": FakeResult(ok=True, lat=42.42, lng=43.43),
        "some, where, toofar": FakeResult(ok=False, lat=0, lng=0),
    }
    cur_dir = pathlib.Path(__file__).parent.absolute()
    self.geocoder = csv_geocoder.CSVGeocoder(
        os.path.join(cur_dir, "geo_admin.tsv"), mapping.get)
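Both test snippets above feed the geocoder a mapping whose values are FakeResult objects; a minimal sketch of that stub, assuming it only needs the ok, lat and lng fields used here plus defaults so that defaultdict(FakeResult) can build an empty result:

from typing import NamedTuple

class FakeResult(NamedTuple):
    # Hypothetical stand-in for a geocoding result used only by the tests;
    # defaults let defaultdict(FakeResult) return an "empty" hit for unknown keys.
    ok: bool = True
    lat: float = 0.0
    lng: float = 0.0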
Example #3
def main():
    args = parser.parse_args()
    config = configparser.ConfigParser()
    config.optionxform = str  # to preserve case
    config.read(args.config_file)
    logging.basicConfig(
        format='%(asctime)s %(filename)s:%(lineno)d %(message)s',
        filename='cleanup.log',
        filemode="w",
        level=logging.INFO)

    geocoder = csv_geocoder.CSVGeocoder(config['GEOCODING'].get('TSV_PATH'))
    sheets = get_GoogleSheets(config)

    processor = SheetProcessor(sheets, geocoder, config)
    processor.process()

    if args.push_to_git:
        processor.push_to_github()
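The parser used by these main() variants is defined outside the snippets shown here; a hedged sketch covering only the arguments they reference (config_file, push_to_git, and in later examples sleep_time_sec, creds_file and is_service_account), with defaults that are assumptions:

import argparse

# Hypothetical reconstruction of the module-level parser; flag names mirror
# the attributes read via args.*, defaults and help strings are guesses.
parser = argparse.ArgumentParser(description="Clean and geocode the line-list sheets.")
parser.add_argument("--config_file", default="cleanup.cfg", help="Path to the .cfg configuration file")
parser.add_argument("--push_to_git", action="store_true", help="Push generated files to GitHub")
parser.add_argument("--sleep_time_sec", type=int, default=5, help="Pause between Sheets API calls")
parser.add_argument("--creds_file", default="credentials.json", help="Google API credentials file")
parser.add_argument("--is_service_account", action="store_true", help="Treat creds_file as a service account key")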
Example #4
def main():
    args = parser.parse_args()
    config = configparser.ConfigParser()
    config.optionxform = str  # to preserve case
    config.read(args.config_file)
    logging.basicConfig(
        format='%(asctime)s %(filename)s:%(lineno)d %(message)s',
        filename='cleanup.log',
        filemode="w",
        level=logging.INFO)

    sheets = get_GoogleSheets(config)

    # Load the geocoder early so that invalid TSV path errors are caught early on.
    geocoder = csv_geocoder.CSVGeocoder(config['GEOCODING'].get('TSV_PATH'),
                                        arcgis)
    sp = sheet_processor.SheetProcessor(sheets, geocoder, config)
    sp.process()

    if args.push_to_git:
        sp.push_to_github()
Example #5
def main():
    cur_dir = pathlib.Path(__file__).parent.absolute()
    geocode_path = os.path.join(cur_dir, "geocoding", "geo_admin.tsv")
    args = parser.parse_args()
    config = configparser.ConfigParser()
    config.optionxform = str  # to preserve case
    config.read(args.config_file)
    logging.basicConfig(
        format='%(asctime)s %(filename)s:%(lineno)d %(message)s',
        filename='cleanup.log',
        filemode="w",
        level=logging.INFO)
    # Increase default timeout, 60s isn't enough for big sheets.
    socket.setdefaulttimeout(600)
    sheets = get_GoogleSheets(config, args.creds_file, args.is_service_account)

    # Load the geocoder early so that invalid TSV path errors are caught early on.
    geocoder = csv_geocoder.CSVGeocoder(geocode_path, arcgis)
    git_repo_path = os.path.join(cur_dir, "..", "..")
    sp = sheet_processor.SheetProcessor(sheets, geocoder, git_repo_path)
    sp.process()

    if args.push_to_git:
        sp.push_to_github()
Example #6
def main():
    args = parser.parse_args()
    config = configparser.ConfigParser()
    config.read(args.config_file)
    logging.basicConfig(filename='cleanup.log', level=logging.INFO)

    sheets = get_GoogleSheets(config)
    for_github = []
    # Load the geocoder early so that invalid TSV path errors are caught early on.
    geocoder = csv_geocoder.CSVGeocoder(config['GEOCODING'].get('TSV_PATH'))
    for s in sheets:
        logging.info("Processing sheet %s", s.name)
        insert_ids(s, config)
        time.sleep(args.sleep_time_sec)

        ### Clean Private Sheet Entries. ###
        # Note: the private sheet gets updated on the fly and re-downloaded to ensure continuity between fixes (granted, it's slower).
        range_      = f'{s.name}!A:AG'
        values      = read_values(s.spreadsheetid, range_, config)
        columns     = s.columns
        column_dict = {c:index2A1(i) for i,c in enumerate(columns)} # to get A1 notation, doing it now to ensure proper order
        data        = values2dataframe(values)

        # Trailing Spaces
        trailing = get_trailing_spaces(data)
        if len(trailing) > 0:
            logging.info('fixing %d trailing whitespace', len(trailing))
            fix_cells(s.spreadsheetid, s.name, trailing, column_dict, config)
            values = read_values(s.spreadsheetid, range_, config)
            data   = values2dataframe(values)
            time.sleep(args.sleep_time_sec)

        # fix N/A => NA
        na_errors = get_NA_errors(data)
        if len(na_errors) > 0:
            logging.info('fixing %d N/A -> NA', len(na_errors))
            fix_cells(s.spreadsheetid, s.name, na_errors, column_dict, config)
            values = read_values(s.spreadsheetid, range_, config)
            data   = values2dataframe(values)
            time.sleep(args.sleep_time_sec)

        # Regex fixes
        fixable, non_fixable = generate_error_tables(data)
        if len(fixable) > 0:
            logging.info('fixing %d regexps', len(fixable))
            fix_cells(s.spreadsheetid, s.name, fixable, column_dict, config)
            values = read_values(s.spreadsheetid, range_, config)
            data   = values2dataframe(values)
            time.sleep(args.sleep_time_sec)

        
        clean = data[~data.ID.isin(non_fixable.ID)]
        clean = clean.drop('row', axis=1)
        clean = clean.sort_values(by='ID')
        s.data = clean
        non_fixable = non_fixable.sort_values(by='ID')
        

        # Save error_reports
        # These are separated by Sheet.
        logging.info('Saving error reports')
        directory   = config['FILES']['ERRORS']
        file_name   = f'{s.name}.error-report.csv'
        error_file  = os.path.join(directory, file_name)
        non_fixable.to_csv(error_file, index=False, header=True)
        for_github.append(error_file)

        
    # Combine data from all sheets into a single datafile:
    all_data = []
    for s in sheets:
        data = s.data
        
        if s.name == 'outside_Hubei':
            data['ID'] = data['ID'].apply(lambda x : f'000-1-{x}')
        
        elif s.name  == 'Hubei':
            data['ID'] = data['ID'].apply(lambda x: f'000-2-{x}')

        all_data.append(data)
    
    all_data = pd.concat(all_data, ignore_index=True)
    all_data = all_data.sort_values(by='ID')

    # Fill geo columns.
    geocode_matched = 0
    for i, row in all_data.iterrows():
        geocode = geocoder.Geocode(row.city, row.province, row.country)
        if not geocode:
            continue
        geocode_matched += 1
        all_data.at[i, 'latitude'] = geocode.lat
        all_data.at[i, 'longitude'] = geocode.lng
        all_data.at[i, 'geo_resolution'] = geocode.geo_resolution
        all_data.at[i, 'location'] = geocode.location
        all_data.at[i, 'admin3'] = geocode.admin3
        all_data.at[i, 'admin2'] = geocode.admin2
        all_data.at[i, 'admin1'] = geocode.admin1
        all_data.at[i, 'admin_id'] = geocode.admin_id
        all_data.at[i, 'country_new'] = geocode.country_new
    logging.info("Geocode matched %d/%d", geocode_matched, len(all_data))
    # Reorganize csv columns so that they are in the same order as when we
    # used to have the geolocation columns within the spreadsheet.
    # This is to avoid breaking latestdata.csv consumers.
    all_data = all_data[["ID","age","sex","city","province","country","latitude","longitude","geo_resolution","date_onset_symptoms","date_admission_hospital","date_confirmation","symptoms","lives_in_Wuhan","travel_history_dates","travel_history_location","reported_market_exposure","additional_information","chronic_disease_binary","chronic_disease","source","sequence_available","outcome","date_death_or_discharge","notes_for_discussion","location","admin3","admin2","admin1","country_new","admin_id","data_moderator_initials","travel_history_binary"]]
    
    #drop_invalid_ids = []
    #for i, row in all_data.iterrows():
    #    if row['ID'].str.match('

    # save
    logging.info("Saving files to disk")
    dt = datetime.now().strftime('%Y-%m-%dT%H%M%S')
    file_name   = config['FILES']['DATA'].replace('TIMESTAMP', dt)
    latest_name = os.path.join(config['FILES']['LATEST'], 'latestdata.csv')
    all_data.to_csv(file_name, index=False)
    all_data.to_csv(latest_name, index=False)
    logging.info("Wrote %s, %s", file_name, latest_name)

    if args.push_to_git:
        logging.info("Pushing to github")
        # Create script for uploading to github
        for_github.extend([file_name, latest_name])
        script  = 'set -e\n'
        script += 'cd {}\n'.format(config['GIT']['REPO'])
        script += 'git pull origin master\n'
        
        for g in for_github:
            script += f'git add {g}\n'
        script += 'git commit -m "data update"\n'
        script += 'git push origin master\n'
        script += f'cd {os.getcwd()}\n'
        print(script)
        os.system(script)
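For reference, the configuration file read by these scripts ties the sections together; a minimal in-memory sketch whose section and key names come from the code above (GEOCODING/TSV_PATH, FILES/DATA, ERRORS and LATEST, GIT/REPO) and whose paths are placeholders:

import configparser

config = configparser.ConfigParser()
config.optionxform = str  # preserve key case, as in the main() functions above
config.read_string("""
[GEOCODING]
TSV_PATH = geocoding/geo_admin.tsv

[FILES]
DATA = /tmp/covid-19.data.prod.TIMESTAMP.csv
ERRORS = /tmp/errors
LATEST = /tmp/latest

[GIT]
REPO = /path/to/data-repo
""")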
Example #7
def setUp(self):
    cur_dir = pathlib.Path(__file__).parent.absolute()
    self.geocoder = csv_geocoder.CSVGeocoder(
        os.path.join(cur_dir, "geo_admin.tsv"))
Example #8
def main():
    args = parser.parse_args()
    config = configparser.ConfigParser()
    config.optionxform = str  # to preserve case
    config.read(args.config_file)
    logging.basicConfig(
        format='%(asctime)s %(filename)s:%(lineno)d %(message)s',
        filename='cleanup.log', filemode="w", level=logging.INFO)
    
    sheets = get_GoogleSheets(config)
    for_github = []

    # Load the geocoder early so that invalid TSV path errors are caught early on.
    geocoder = csv_geocoder.CSVGeocoder(
        config['GEOCODING'].get('TSV_PATH'),
        arcgis)
    for s in sheets:
        logging.info("Processing sheet %s", s.name)

        ### Clean Private Sheet Entries. ###
        # Note: the private sheet gets updated on the fly and re-downloaded to ensure continuity between fixes (granted, it's slower).
        
        range_ = f'{s.name}!A:AG'
        data = values2dataframe(s.read_values(range_))

        # Expand aggregated cases into one row each.
        logging.info("Rows before expansion: %d", len(data))
        if len(data) > 150000:
            logging.warning("Sheet %s has more than 150K rows, it should be split soon", s.name)
        data.aggregated_num_cases = pd.to_numeric(data.aggregated_num_cases, errors='coerce')
        data = duplicate_rows_per_column(data, "aggregated_num_cases")
        logging.info("Rows after expansion: %d", len(data))

        # Generate IDs for each row sequentially following the sheet_id-inc_int pattern.
        data['ID'] = s.base_id + "-" + pd.Series(range(1, len(data)+1)).astype(str)

        # Remove whitespace.
        data = trim_df(data)

        # Fix columns that can be fixed easily.
        data.sex = fix_sex(data.sex)

        # fix N/A => NA
        for col in data.select_dtypes("string"):
            data[col] = fix_na(data[col])

        # Regex fixes
        fixable, non_fixable = generate_error_tables(data)
        if len(fixable) > 0:
            logging.info('fixing %d regexps', len(fixable))
            s.fix_cells(fixable)
            data = values2dataframe(s.read_values(range_))
        
        # ~ negates the mask: clean keeps only the rows whose IDs are not in non_fixable.
        clean = data[~data.ID.isin(non_fixable.ID)]
        clean = clean.drop('row', axis=1)
        clean = clean.sort_values(by='ID')
        s.data = clean
        non_fixable = non_fixable.sort_values(by='ID')

        # Save error_reports
        # These are separated by Sheet.
        logging.info('Saving error reports')
        directory   = config['FILES']['ERRORS']
        file_name   = f'{s.name}.error-report.csv'
        error_file  = os.path.join(directory, file_name)
        non_fixable.to_csv(error_file, index=False, header=True, encoding="utf-8")
        for_github.append(error_file)
        
    # Combine data from all sheets into a single datafile
    all_data = []
    for s in sheets:
        logging.info("sheet %s had %d rows", s.name, len(s.data))
        all_data.append(s.data)
    
    all_data = pd.concat(all_data, ignore_index=True)
    all_data = all_data.sort_values(by='ID')
    logging.info("all_data has %d rows", len(all_data))

    # Fill geo columns.
    geocode_matched = 0
    for i, row in all_data.iterrows():
        geocode = geocoder.geocode(row.city, row.province, row.country)
        if not geocode:
            continue
        geocode_matched += 1
        all_data.at[i, 'latitude'] = geocode.lat
        all_data.at[i, 'longitude'] = geocode.lng
        all_data.at[i, 'geo_resolution'] = geocode.geo_resolution
        all_data.at[i, 'location'] = geocode.location
        all_data.at[i, 'admin3'] = geocode.admin3
        all_data.at[i, 'admin2'] = geocode.admin2
        all_data.at[i, 'admin1'] = geocode.admin1
        all_data.at[i, 'admin_id'] = geocode.admin_id
        all_data.at[i, 'country_new'] = geocode.country_new
    logging.info("Geocode matched %d/%d", geocode_matched, len(all_data))
    logging.info("Top 10 geocode misses: %s",geocoder.misses.most_common(10))
    with open("geocode_misses.csv", "w") as f:
        geocoder.write_misses_to_csv(f)
        logging.info("Wrote all geocode misses to geocode_misses.csv")
    if len(geocoder.new_geocodes) > 0:
        logging.info("Appending new geocodes to geo_admin.tsv")
        with open(config['GEOCODING'].get('TSV_PATH'), "a") as f:
            geocoder.append_new_geocodes_to_init_file(f)
        for_github.append(config['GEOCODING'].get('TSV_PATH'))
    # Reorganize csv columns so that they are in the same order as when we
    # used to have the geolocation columns within the spreadsheet.
    # This is to avoid breaking latestdata.csv consumers.
    all_data = all_data[["ID","age","sex","city","province","country","latitude","longitude","geo_resolution","date_onset_symptoms","date_admission_hospital","date_confirmation","symptoms","lives_in_Wuhan","travel_history_dates","travel_history_location","reported_market_exposure","additional_information","chronic_disease_binary","chronic_disease","source","sequence_available","outcome","date_death_or_discharge","notes_for_discussion","location","admin3","admin2","admin1","country_new","admin_id","data_moderator_initials","travel_history_binary"]]

    # Ensure the new data has at least as many rows as the previous latestdata.csv.
    latest_name = os.path.join(config['FILES']['LATEST'], 'latestdata.csv')

    line_diff = len(all_data) - len(pd.read_csv(latest_name))
    if line_diff >= 0:
        logging.info(f"Line check passed, {line_diff} new lines")
    else:
        logging.info("Line check failed")
        return 

    # save
    logging.info("Saving files to disk")
    dt = datetime.now().strftime('%Y-%m-%dT%H%M%S')
    file_name   = config['FILES']['DATA'].replace('TIMESTAMP', dt)
    all_data.to_csv(file_name, index=False, encoding="utf-8")
    all_data.to_csv(latest_name, index=False, encoding="utf-8")
    logging.info("Wrote %s, %s", file_name, latest_name)

    if args.push_to_git:
        logging.info("Pushing to github")
        # Create script for uploading to github
        for_github.extend([file_name, latest_name])
        script  = 'set -e\n'
        script += 'cd {}\n'.format(config['GIT']['REPO'])
        script += 'git pull origin master\n'
        
        for g in for_github:
            script += f'git add {g}\n'
        script += 'git commit -m "data update"\n'
        script += 'git push origin master\n'
        script += f'cd {os.getcwd()}\n'
        print(script)
        os.system(script)
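The cleanup code above also relies on a few helpers that are not part of these snippets; hedged sketches of what they plausibly do, inferred only from how they are called (values2dataframe adds a 'row' column that is dropped before saving, index2A1 yields A1-notation column letters, duplicate_rows_per_column expands aggregated case counts into one row per case):

import pandas as pd

def index2A1(i):
    # 0 -> 'A', 25 -> 'Z', 26 -> 'AA', ... (A1-notation column letters).
    letters = ""
    i += 1
    while i > 0:
        i, rem = divmod(i - 1, 26)
        letters = chr(ord('A') + rem) + letters
    return letters

def values2dataframe(values):
    # First row holds the column names, the remaining rows the data;
    # short rows are padded so every row has one value per column.
    columns = values[0]
    rows = [r + [''] * (len(columns) - len(r)) for r in values[1:]]
    df = pd.DataFrame(rows, columns=columns)
    # Keep the original 1-based sheet row so fixes can be written back to the
    # sheet; it is dropped again before saving (clean.drop('row', axis=1) above).
    df['row'] = range(2, len(df) + 2)
    return df

def duplicate_rows_per_column(data, col):
    # Repeat each row `col` times (treating blanks as 1, an assumption),
    # expanding aggregated case counts into one row per case.
    counts = data[col].fillna(1).astype(int).clip(lower=1)
    return data.loc[data.index.repeat(counts)].reset_index(drop=True)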