class JHU_COUNTRY_CODES(base.BaseETL):
    """
    ETL that adds ISO2/ISO3 country codes to the `summary_location` records
    already stored for the "open-JHU" project.
    """

    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "JHU"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def files_to_submissions(self):
        """
        Queue an updated record for every existing location whose `iso2` or
        `iso3` differs from the codes looked up for its `country_region`.
        """
        codes_dict = get_codes_dictionary()
        for location in self.get_existing_locations():
            codes = get_codes_for_country_name(
                codes_dict, location["country_region"]
            )

            # do not update the record if it already has the codes
            if (
                location["iso2"] == codes["iso2"]
                and location["iso3"] == codes["iso3"]
            ):
                continue

            # Drop properties that are unset; `is not None` is the identity
            # test recommended for comparisons against the None singleton.
            record = {k: v for k, v in location.items() if v is not None}
            record.update(
                {
                    "type": "summary_location",
                    "projects": [{"code": self.project_code}],
                    "iso2": codes["iso2"],
                    "iso3": codes["iso3"],
                }
            )
            self.metadata_helper.add_record_to_submit(record)

    def submit_metadata(self):
        """Submit every record queued by `files_to_submissions`."""
        self.metadata_helper.batch_submit_records()

    def get_existing_locations(self):
        """
        Query Peregrine for the project's `summary_location` records.

        Returns:
            list: one dict per record, as returned under
                `data.summary_location` in the query response
        """
        print("Getting summary_location data from Peregrine")
        project_id = self.program_name + "-" + self.project_code
        query_string = (
            '{ summary_location (first: 0, project_id: "'
            + project_id
            + '") { submitter_id, country_region, iso2, iso3 } }'
        )
        query_res = self.metadata_helper.query_peregrine(query_string)
        return list(query_res["data"]["summary_location"])
def main():
    """
    Copy the records of `old_node` into `new_node` for one project.

    Reads the module-level settings `access_token`, `base_url`, `program`,
    `project`, `old_node` and `new_node`. Properties whose value is falsy are
    not carried over to the new record.
    """
    auth_headers = {"Authorization": f"bearer {access_token}"}
    existing_records = get_existing_data(
        base_url, program, project, old_node, auth_headers
    )

    helper = MetadataHelper(
        base_url=base_url,
        program_name=program,
        project_code=project,
        access_token=access_token,
    )

    print(f"Submitting {new_node} data")
    for source_record in existing_records:
        migrated = {"type": new_node, "project_id": f"{program}-{project}"}
        # Only truthy properties are copied, exactly like the record they
        # come from; copied keys may override "type"/"project_id".
        migrated.update(
            {name: val for name, val in source_record.items() if val}
        )
        helper.add_record_to_submit(migrated)
    helper.batch_submit_records()
class STOPLIGHT(base.BaseETL):
    """
    ETL for the covidstoplight.org API: turns the per-zipcode "feelings"
    submissions into `summary_location` and `summary_clinical` records.
    """

    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_clinicals = []
        self.summary_locations = []

        self.program_name = "open"
        self.project_code = "covidstoplight"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def files_to_submissions(self):
        """
        Reads the JSON document and converts the data to Sheepdog records
        """
        url = "https://covidstoplight.org/api/v0/location/US"
        self.parse_file(url)

    def parse_file(self, url):
        """
        Converts a JSON document to data we can submit via Sheepdog. Stores
        the records to submit in `self.summary_locations` and
        `self.summary_clinicals`.

        A location is stored only once per call (de-duplicated on its
        `submitter_id`); every submission yields one clinical record.
        A `ValueError` raised while parsing stops the processing of the
        remaining submissions; records parsed so far are kept.

        Args:
            url (str): URL at which the file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()

        timestamp_created = data["data"]["generated"]
        country = data["country_code"]

        # A set gives O(1) membership checks; insertion order of the stored
        # records is still driven by the appends below.
        seen_location_ids = set()
        try:
            for zipcode, feelings in data["data"]["submissions"].items():
                node = {
                    "zipcode": zipcode,
                    "feelings": feelings,
                    "timestamp_created": timestamp_created,
                    "country": country,
                }
                summary_location, summary_clinical = self.parse_node(node)

                location_id = summary_location["submitter_id"]
                if location_id not in seen_location_ids:
                    seen_location_ids.add(location_id)
                    self.summary_locations.append(summary_location)
                self.summary_clinicals.append(summary_clinical)
        except ValueError as e:
            print(f"ERROR: value error. Detail {e}")

    def parse_node(self, node):
        """
        Converts one element of the JSON document to data we can submit via
        Sheepdog

        Args:
            node (dict): node data with the keys "zipcode", "feelings",
                "timestamp_created" and "country"

        Returns:
            (dict, dict) tuple:
                - `summary_location` record for the zipcode
                - `summary_clinical` record for that location and date,
                  holding one count per reported feeling

        Raises:
            ValueError: if "timestamp_created" does not match
                "%Y-%m-%dT%H:%M:%S"
            KeyError: if a feeling code is not 1, 2 or 3
        """
        zipcode = node["zipcode"]
        feelings = node["feelings"]
        timestamp_created = node["timestamp_created"]
        country = node["country"]

        summary_location_submitter_id = format_location_submitter_id(
            country, zipcode
        )
        summary_location = {
            "country_region": country,
            "submitter_id": summary_location_submitter_id,
            "projects": [{"code": self.project_code}],
            "zipcode": zipcode,
        }

        # Only the calendar date of the generation timestamp is kept.
        date = datetime.strptime(
            timestamp_created, "%Y-%m-%dT%H:%M:%S"
        ).strftime("%Y-%m-%d")

        summary_clinical_submitter_id = format_summary_clinical_submitter_id(
            summary_location_submitter_id, date
        )
        summary_clinical = {
            "date": date,
            "timestamp_created": timestamp_created,
            "submitter_id": summary_clinical_submitter_id,
            "summary_locations": [
                {"submitter_id": summary_location_submitter_id}
            ],
        }

        # Feeling code reported by the API -> property of summary_clinical.
        map_fields = {
            1: "feeling_healthy_count",
            2: "feeling_not_so_good_count",
            3: "feeling_sick_count",
        }
        for element in feelings:
            summary_clinical[map_fields[element["feeling"]]] = element["count"]

        return summary_location, summary_clinical

    def submit_metadata(self):
        """
        Batch submits all records in `self.summary_locations`, then all
        records in `self.summary_clinicals`.
        """
        print("Submitting summary_location data")
        for loc in self.summary_locations:
            loc_record = {"type": "summary_location"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for rep in self.summary_clinicals:
            rep_record = {"type": "summary_clinical"}
            rep_record.update(rep)
            self.metadata_helper.add_record_to_submit(rep_record)
        self.metadata_helper.batch_submit_records()
class IDPH_HOSPITAL(base.BaseETL):
    """
    ETL for the Illinois Department of Public Health hospital-utilization
    JSON feed (statewide values, historical utilization, hospital regions).
    """

    # Keys of a "HospitalUtilizationResults" entry -> summary_clinical fields.
    _UTILIZATION_FIELDS = {
        "reportDate": "date",
        "TotalBeds": "state_total_beds",
        "TotalOpenBeds": "total_open_beds",
        "TotalInUseBedsNonCOVID": "total_in_use_beds_non_covid",
        "TotalInUseBedsCOVID": "total_in_use_beds_covid",
        "ICUBeds": "icu_beds",
        "ICUOpenBeds": "icu_open_beds",
        "ICUInUseBedsNonCOVID": "icu_in_use_beds_non_covid",
        "ICUInUseBedsCOVID": "icu_in_use_beds_covid",
        "VentilatorCapacity": "ventilator_capacity",
        "VentilatorAvailable": "ventilator_available",
        "VentilatorInUseNonCOVID": "ventilator_in_use_non_covid",
        "VentilatorInUseCOVID": "ventilator_in_use_covid",
    }

    # Keys of "statewideValues" -> summary_clinical fields.
    # NOTE(review): "state_Ventilators_open_pct" has a capital "V", unlike
    # its siblings - confirm against the data dictionary before changing.
    _STATEWIDE_FIELDS = {
        "ICUCapacity": "state_icu_capacity",
        "ICUCovidPatients": "state_icu_covid_patients",
        "VentCapacity": "state_vent_capacity",
        "VentCovidPatients": "state_vent_covid_patients",
        "ICUAvailable": "state_icu_available",
        "VentsAvailable": "state_vents_available",
        "TotalBeds": "state_total_beds",
        "TotalBedsAvailable": "state_total_beds_available",
        "TotalBedsUsed": "state_total_beds_used",
        "PctHospitalBedsAvailable": "state_pct_hospital_beds_available",
        "AdultICUCapacity": "state_adult_icu_capacity",
        "ICUOpenBeds": "state_icu_open_beds",
        "ICUBedsUsed": "state_icu_beds_used",
        "ICUOpenBedsPct": "state_icu_open_beds_pct",
        "COVIDPUIPatients": "state_covid_pui_patients",
        "COVIDPUIPatientsPct": "state_covid_pui_patients_pct",
        "COVIDPUIPatientsBedsInUsePct": "state_covid_pui_patients_beds_in_use_pct",
        "VentilatorCapacity": "state_ventilator_capacity",
        "VentilatorsOpen": "state_ventilators_open",
        "VentilatorsOpenPct": "state_Ventilators_open_pct",
        "VentilatorsInUse": "state_ventilators_in_use",
        "VentilatorsInUseCOVID": "state_ventilators_in_use_covid",
        "VentilatorsCOVIDPatientsPct": "state_ventilators_covid_patients_pct",
        "VentilatorsCOVIDPatientsInUsePct": "state_ventilators_covid_patients_in_use_pct",
        "CovidPatientsNonICU": "state_covid_patients_non_icu",
        "TotalCOVIDPUIInICU": "state_total_covid_pui_in_icu",
        "TotalCOVIDPUIInHospital": "state_total_covid_pui_in_hospital",
        "PctBedsCOVIDPUI": "state_pct_beds_covid_pui",
        "MedSurgBeds": "state_med_surg_beds",
        "MedSurgBedsOpen": "state_med_surg_beds_open",
        "MedSurgBedsOpenPct": "state_med_surg_beds_open_pct",
        "MedSurgBedsInUse": "state_med_surg_beds_in_use",
    }

    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "IDPH-Hospital"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.country = "US"
        self.state = "IL"

        self.summary_locations = []
        self.summary_clinicals = []

    def _state_location_submitter_id(self):
        """Build the submitter_id of the statewide summary_location."""
        return format_submitter_id(
            "summary_location",
            {
                "project": "idph_hospital",
                "country": self.country,
                "state": self.state,
            },
        )

    @staticmethod
    def _clinical_submitter_id(location_submitter_id, date):
        """Build the submitter_id of the summary_clinical of a location/date."""
        return derived_submitter_id(
            location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"project": "idph_hospital", "date": date},
        )

    def files_to_submissions(self):
        """
        Fetch the IDPH hospital JSON and convert it to Sheepdog records,
        unless the latest submitted date is already today.
        """
        latest_submitted_date = (
            self.metadata_helper.get_latest_submitted_date_idph()
        )
        today = datetime.date.today()
        if latest_submitted_date == today:
            print(
                "Nothing to submit: today and latest submitted date are the same."
            )
            return

        print(f"Getting data for date: {today.strftime('%Y%m%d')}")
        self.parse_file(
            latest_submitted_date,
            "https://dph.illinois.gov/sitefiles/COVIDHospitalRegions.json",
        )

    def parse_file(self, latest_submitted_date, url):
        """
        Convert the JSON document into records appended to
        `self.summary_locations` and `self.summary_clinicals`.

        Nothing is stored when the document's date equals
        `latest_submitted_date`.

        Args:
            latest_submitted_date (date): the date of latest available
                "summary_clinical" for project
            url (str): URL at which the JSON file is available
        """
        print(f"Getting data from {url}")
        with closing(requests.get(url, stream=True)) as response:
            data = response.json()

        # presumably idph_get_date returns a "%Y-%m-%d" string, since it is
        # compared with one below - verify against the helper.
        date = idph_get_date(data["LastUpdateDate"])
        if (
            latest_submitted_date
            and date == latest_submitted_date.strftime("%Y-%m-%d")
        ):
            print(
                "Nothing to submit: latest submitted date and date from data are the same."
            )
            return

        state_location, state_clinical_current = self.parse_statewide_values(
            date, data["statewideValues"]
        )
        self.summary_locations.append(state_location)

        for utilization in data["HospitalUtilizationResults"]:
            self.summary_clinicals.append(
                self.parse_historical(utilization, state_clinical_current)
            )

        for region in data["regionValues"]:
            region_location, region_clinical = self.parse_region(date, region)
            self.summary_locations.append(region_location)
            self.summary_clinicals.append(region_clinical)

    def parse_historical(self, utilization, summary_clinical_statewide_current):
        """
        Build the statewide summary_clinical record of one historical
        utilization entry.

        When the entry has the same submitter_id as the current statewide
        record, the current statewide values are merged on top of it.

        Args:
            utilization (dict): one "HospitalUtilizationResults" entry
            summary_clinical_statewide_current (dict): record returned by
                `parse_statewide_values`

        Returns:
            dict: summary_clinical record
        """
        report_date = utilization["reportDate"]
        location_id = self._state_location_submitter_id()
        clinical_id = self._clinical_submitter_id(location_id, report_date)

        record = {
            "submitter_id": clinical_id,
            "date": report_date,
            "summary_locations": [{"submitter_id": location_id}],
        }
        record.update(
            {
                self._UTILIZATION_FIELDS[key]: value
                for key, value in utilization.items()
            }
        )

        if clinical_id == summary_clinical_statewide_current["submitter_id"]:
            record.update(summary_clinical_statewide_current)
        return record

    def parse_statewide_values(self, date, statewide_values):
        """
        Build the statewide summary_location and its summary_clinical
        record for `date`.

        Args:
            date: date of the statewide values
            statewide_values (dict): the "statewideValues" object

        Returns:
            (dict, dict) tuple: summary_location, summary_clinical
        """
        location_id = self._state_location_submitter_id()
        location = {
            "submitter_id": location_id,
            "projects": [{"code": self.project_code}],
            "country_region": self.country,
            "province_state": self.state,
        }

        clinical = {
            "submitter_id": self._clinical_submitter_id(location_id, date),
            "date": date,
            "summary_locations": [{"submitter_id": location_id}],
        }
        clinical.update(
            {
                self._STATEWIDE_FIELDS[key]: value
                for key, value in statewide_values.items()
            }
        )
        return location, clinical

    def parse_region(self, date, hospital_region):
        """
        From hospital-region-level data, generate the data we can submit
        via Sheepdog

        Args:
            date: date of the region values
            hospital_region (dict): one "regionValues" entry

        Returns:
            (dict, dict) tuple: summary_location, summary_clinical
        """
        region = hospital_region["region"]
        description = hospital_region["region_description"]

        location_id = format_submitter_id(
            "summary_location",
            {
                "project": "idph_hospital",
                "country": self.country,
                "state": self.state,
                "region": region,
            },
        )
        location = {
            "country_region": self.country,
            "submitter_id": location_id,
            "projects": [{"code": self.project_code}],
            "province_state": self.state,
            "state_hospital_region": region,
            "state_region_description": strip_prefix(description),
        }

        clinical = {
            "submitter_id": self._clinical_submitter_id(location_id, date),
            "date": date,
            "summary_locations": [{"submitter_id": location_id}],
            "region_icu_avail": hospital_region["ICUAvail"],
            "region_icu_capacity": hospital_region["ICUCapacity"],
            "region_vents_available": hospital_region["VentsAvailable"],
            "region_vents_capacity": hospital_region["VentsCapacity"],
        }
        return location, clinical

    def submit_metadata(self):
        """
        Submit all stored summary_location records, then all stored
        summary_clinical records, one batch per node type.
        """
        print("Submitting data...")
        batches = (
            ("summary_location", self.summary_locations),
            ("summary_clinical", self.summary_clinicals),
        )
        for node_type, records in batches:
            print(f"Submitting {node_type} data")
            for record in records:
                self.metadata_helper.add_record_to_submit(
                    {"type": node_type, **record}
                )
            self.metadata_helper.batch_submit_records()
class NPI_PRO(base.BaseETL):
    """
    ETL for the NPI-PRO geodatabase: converts provider rows into
    `summary_location` and `summary_clinical` records.
    """

    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "NPI-PRO"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.country = "US"

        self.summary_locations = []
        self.summary_clinicals = []

    def download_dataset(self, url):
        """
        Download `url` into a temporary ".gdb.zip" file.

        The file is created with `delete=False`, so it stays on disk after
        this method returns; the caller owns it.

        Args:
            url (str): URL of the zipped geodatabase

        Returns:
            str: path of the temporary file
        """
        r = requests.get(url, allow_redirects=True)
        # Write through the handle NamedTemporaryFile already opened and let
        # the context manager close it, instead of leaking that handle and
        # reopening the file by name.
        with tempfile.NamedTemporaryFile(
            suffix=".gdb.zip", delete=False
        ) as npi_pro_geodatabase:
            npi_pro_geodatabase.write(r.content)
        return npi_pro_geodatabase.name

    def files_to_submissions(self):
        """Download the NPI-PRO geodatabase and convert it to records."""
        print("Getting geodatabase for NPI-PRO dataset...")
        url = "https://www.arcgis.com/sharing/rest/content/items/7e80baf1773e4fd9b44fe9fb054677db/data"
        tf = self.download_dataset(url)
        self.parse_file(file_path=tf)

    def parse_file(self, file_path):
        """
        Read the geodatabase and store one summary_location and one
        summary_clinical record per Illinois row.

        If the file cannot be read, the error is printed and nothing is
        stored.

        Args:
            file_path (str): path of the downloaded geodatabase
        """
        try:
            gdf = gpd.read_file(file_path)
        except Exception as e:
            # Deliberate best-effort: report the problem and submit nothing.
            print(e)
            return

        print("Until better solution, submit only Illinois data")
        il_only = gdf.loc[gdf["Provider_Business_Practice_ST"] == "IL"]

        for _, row in il_only.iterrows():
            summary_location, summary_clinical = self.parse_row(row)
            self.summary_locations.append(summary_location)
            self.summary_clinicals.append(summary_clinical)

    def parse_row(self, row):
        """
        Convert one provider row into Sheepdog records.

        Args:
            row: mapping-like row indexed by the geodatabase column names

        Returns:
            (dict, dict) tuple: summary_location, summary_clinical
        """
        # Source column -> (destination node, destination property).
        fields_mapping = {
            "NPI": ("summary_location", "npi"),
            "Provider_First_Line_Business_Pra": (
                "summary_location",
                "first_line_address",
            ),
            "Provider_Second_Line_Business_Pr": (
                "summary_location",
                "second_line_address",
            ),
            "Provider_Business_Practice_City": ("summary_location", "city"),
            "Provider_Business_Practice_ST": ("summary_location", "province_state"),
            "TaxonomyCode": ("summary_clinical", "taxonomy_code"),
            "ProviderType": ("summary_clinical", "provider_type"),
            "ProviderSubtype": ("summary_clinical", "provider_subtype"),
            "DetailedSpecialty": ("summary_clinical", "detailed_specialty"),
        }

        npi = row["NPI"]
        state = row["Provider_Business_Practice_ST"]

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {"country": self.country, "state": state, "npi": npi},
        )
        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {},
        )

        result = {
            "summary_location": {
                "submitter_id": summary_location_submitter_id,
                "projects": [{"code": self.project_code}],
            },
            "summary_clinical": {
                "submitter_id": summary_clinical_submitter_id,
                "summary_locations": [
                    {"submitter_id": summary_location_submitter_id}
                ],
            },
        }

        for original_field, (node, node_field) in fields_mapping.items():
            value = row[original_field]
            # The NPI is submitted as a string; other values are kept as-is.
            result[node][node_field] = str(value) if node_field == "npi" else value

        return result["summary_location"], result["summary_clinical"]

    def submit_metadata(self):
        """
        Submit all stored summary_location records, then all stored
        summary_clinical records.
        """
        print("Submitting data...")

        print("Submitting summary_location data")
        for sl in self.summary_locations:
            sl_record = {"type": "summary_location"}
            sl_record.update(sl)
            self.metadata_helper.add_record_to_submit(sl_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals:
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
class IDPH_ZIPCODE(base.BaseETL):
    """
    ETL for the Illinois Department of Public Health per-zipcode COVID
    JSON feed.
    """

    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "IDPH-zipcode"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.country = "US"
        self.state = "IL"

        self.summary_locations = []
        self.summary_clinicals = []

    def files_to_submissions(self):
        """
        Fetch the IDPH zipcode JSON and convert it to Sheepdog records,
        unless the latest submitted date is already today.
        """
        latest_submitted_date = (
            self.metadata_helper.get_latest_submitted_date_idph()
        )
        today = datetime.date.today()
        if latest_submitted_date == today:
            print(
                "Nothing to submit: today and latest submitted date are the same."
            )
            return

        print(f"Getting data for date: {today.strftime('%Y%m%d')}")
        self.parse_file(
            latest_submitted_date,
            "http://dph.illinois.gov/sitefiles/COVIDZip.json?nocache=1",
        )

    def parse_file(self, latest_submitted_date, url):
        """
        Convert the JSON document into records appended to
        `self.summary_locations` and `self.summary_clinicals`.

        Nothing is stored when the document's date equals
        `latest_submitted_date`.

        Args:
            latest_submitted_date (date): date for latest submitted date
            url (str): URL at which the JSON file is available
        """
        print(f"Getting data from {url}")
        with closing(requests.get(url, stream=True)) as response:
            data = response.json()

        # presumably idph_get_date returns a "%Y-%m-%d" string, since it is
        # compared with one below - verify against the helper.
        date = idph_get_date(data["LastUpdateDate"])
        if (
            latest_submitted_date
            and date == latest_submitted_date.strftime("%Y-%m-%d")
        ):
            print(
                "Nothing to submit: latest submitted date and date from data are the same."
            )
            return

        for zipcode_values in data["zip_values"]:
            location, clinical = self.parse_zipcode(date, zipcode_values)
            self.summary_locations.append(location)
            self.summary_clinicals.append(clinical)

    def parse_zipcode(self, date, zipcode_values):
        """
        From zipcode-level data, generate the data we can submit via Sheepdog

        Args:
            date: date of the values
            zipcode_values (dict): one "zip_values" entry

        Returns:
            (dict, dict) tuple: summary_location, summary_clinical
        """
        zipcode = zipcode_values["zip"]

        location_id = format_submitter_id(
            "summary_location",
            {"country": self.country, "state": self.state, "zipcode": zipcode},
        )
        location = {
            "submitter_id": location_id,
            "country_region": self.country,
            "province_state": self.state,
            "zipcode": zipcode,
            "projects": [{"code": self.project_code}],
        }

        clinical_id = derived_submitter_id(
            location_id,
            "summary_location",
            "summary_clinical",
            {"date": date},
        )
        clinical = {
            "submitter_id": clinical_id,
            "date": date,
            "confirmed": zipcode_values["confirmed_cases"],
            "summary_locations": [{"submitter_id": location_id}],
        }

        if "demographics" not in zipcode_values:
            return location, clinical

        demographics = zipcode_values["demographics"]
        # `fields_mapping` is module-level: demographic group key ->
        # (key holding the group value, group value -> field prefix).
        for group_key, (value_key, prefix_by_value) in fields_mapping.items():
            for entry in demographics[group_key]:
                prefix = prefix_by_value[entry[value_key]]
                if not prefix:
                    continue
                for metric in ("count", "tested"):
                    if metric in entry:
                        clinical["{}_{}".format(prefix, metric)] = entry[metric]

        return location, clinical

    def submit_metadata(self):
        """
        Submits the data in `self.summary_locations` and
        `self.summary_clinicals` to Sheepdog, one batch per node type.
        """
        print("Submitting data...")
        batches = (
            ("summary_location", self.summary_locations),
            ("summary_clinical", self.summary_clinicals),
        )
        for node_type, records in batches:
            print(f"Submitting {node_type} data")
            for record in records:
                self.metadata_helper.add_record_to_submit(
                    {"type": node_type, **record}
                )
            self.metadata_helper.batch_submit_records()
class CHI_NBHD(base.BaseETL):
    """
    ETL for the Chicago neighborhoods COVID page data: one
    summary_location and one summary_clinical record per community area.
    """

    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_locations = []
        self.summary_clinicals = []

        self.program_name = "open"
        self.project_code = "CHI-NBHD"

        self.country = "US"
        self.state = "IL"

        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def files_to_submissions(self):
        """
        Fetch the page-data JSON and convert it to Sheepdog records
        """
        self.parse_file(
            "https://covid19neighborhoods.southsideweekly.com/page-data/index/page-data.json"
        )

    def parse_file(self, url):
        """
        Convert the JSON document into records appended to
        `self.summary_locations` and `self.summary_clinicals`.

        The records are dated with the day of the page's build time, and
        each pair of records is printed as it is stored.

        Args:
            url (str): URL at which the JSON file is available
        """
        print(f"Getting data from {url}")
        with closing(requests.get(url, stream=True)) as response:
            payload = response.json()

        page = payload["result"]["data"]

        build_time = datetime.datetime.strptime(
            page["build_time"]["nodes"][0]["buildTime"],
            "%Y-%m-%dT%H:%M:%S.%fZ",
        )
        current_date = build_time.strftime("%Y-%m-%d")

        features = page["community_areas_all"]["nodes"][0]["childGeoJson"][
            "features"
        ]
        for feature in features:
            location, clinical = self.parse_nbhd(feature, current_date)

            self.summary_locations.append(location)
            self.summary_clinicals.append(clinical)

            print(location)
            print(clinical)

    def parse_nbhd(self, nbhd_object, date):
        """
        Build the records of one community area.

        Args:
            nbhd_object (dict): GeoJSON feature; its "properties" provide
                "community", "value" (used as the death count) and
                "population"
            date (str): date assigned to the summary_clinical record

        Returns:
            (dict, dict) tuple: summary_location, summary_clinical
        """
        props = nbhd_object["properties"]
        community = props["community"]
        death_count = props["value"]
        residents = props["population"]

        location_id = format_submitter_id(
            "summary_location",
            {"country": self.country, "state": self.state, "nbhd": community},
        )
        location = {
            "submitter_id": location_id,
            "community_area": community,
            "projects": [{"code": self.project_code}],
        }

        clinical_id = derived_submitter_id(
            location_id,
            "summary_location",
            "summary_clinical",
            {"date": date},
        )
        clinical = {
            "submitter_id": clinical_id,
            "date": date,
            "deaths_per_10000": round(10000 * death_count / residents, 2),
            "deaths": death_count,
            "summary_locations": [{"submitter_id": location_id}],
        }

        return location, clinical

    def submit_metadata(self):
        """
        Submit all stored summary_location records, then all stored
        summary_clinical records, one batch per node type.
        """
        batches = (
            ("summary_location", self.summary_locations),
            ("summary_clinical", self.summary_clinicals),
        )
        for node_type, records in batches:
            print(f"Submitting {node_type} data")
            for record in records:
                self.metadata_helper.add_record_to_submit(
                    {"type": node_type, **record}
                )
            self.metadata_helper.batch_submit_records()
class NCBI(base.BaseETL): def __init__(self, base_url, access_token, s3_bucket): super().__init__(base_url, access_token, s3_bucket) self.program_name = "open" self.project_code = "ncbi-covid-19" self.manifest_bucket = "sra-pub-sars-cov2" self.sra_src_manifest = "sra-src/Manifest" self.accession_number_filename_map = {} self.metadata_helper = MetadataHelper( base_url=self.base_url, program_name=self.program_name, project_code=self.project_code, access_token=access_token, ) self.file_helper = AsyncFileHelper( base_url=self.base_url, program_name=self.program_name, project_code=self.project_code, access_token=access_token, ) self.data_file = NCBI_FILE( base_url=self.base_url, s3_bucket=self.project_code, access_token=access_token, ) self.submitting_data = { "sample": [], "virus_sequence": [], "core_metadata_collection": [], "virus_sequence_run_taxonomy": [], "virus_sequence_contig": [], "virus_sequence_blastn": [], "virus_sequence_contig_taxonomy": [], "virus_sequence_peptide": [], "virus_sequence_hmm_search": [], } self.submitting_data["core_metadata_collection"].append({ "submitter_id": format_submitter_id("cmc_ncbi_covid19", {}), "projects": [{ "code": self.project_code }], }) read_ncbi_manifest( self.manifest_bucket, self.sra_src_manifest, self.accession_number_filename_map, ) def submit_metadata(self): start = time.strftime("%X") loop = asyncio.get_event_loop() tasks = [] for node_name, _ in self.data_file.nodes.items(): if node_name == "virus_sequence_run_taxonomy": continue else: tasks.append( asyncio.ensure_future( self.files_to_node_submissions(node_name))) try: results = loop.run_until_complete(asyncio.gather(*tasks)) loop.run_until_complete( asyncio.gather( self.files_to_virus_sequence_run_taxonomy_submission( results[0]))) if AsyncFileHelper.session: loop.run_until_complete( asyncio.gather(AsyncFileHelper.close_session())) finally: loop.close() end = time.strftime("%X") for k, v in self.submitting_data.items(): print(f"Submitting {k} data...") for node in 
v: node_record = {"type": k} node_record.update(node) self.metadata_helper.add_record_to_submit(node_record) self.metadata_helper.batch_submit_records() print(f"Running time: From {start} to {end}") async def files_to_virus_sequence_run_taxonomy_submission( self, submitting_accession_numbers): """get submitting data for virus_sequence_run_taxonomy node""" if not submitting_accession_numbers: return records = self._get_response_from_big_query( submitting_accession_numbers) # Keep track accession_numbers having link to virus_sequence nodes accession_number_set = set() for record in records: if record["acc"] in self.accession_number_filename_map: accession_number = record["acc"] print(f"Get from bigquery response {accession_number}") success = await self._parse_big_query_response(record) if success: accession_number_set.add(accession_number) cmc_submitter_id = format_submitter_id("cmc_ncbi_covid19", {}) for accession_number in submitting_accession_numbers: virus_sequence_run_taxonomy_submitter_id = format_submitter_id( "virus_sequence_run_taxonomy", {"accession_number": accession_number}) submitted_json = { "submitter_id": virus_sequence_run_taxonomy_submitter_id, "core_metadata_collections": [{ "submitter_id": cmc_submitter_id }], "accession_number": accession_number, "data_type": "Virus Sequence Run Taxonomy Analysis", "data_format": "json", "data_category": "Kmer-based Taxonomy Analysis", } # Add link to virus sequence node if accession_number in accession_number_set: submitted_json["virus_sequences"] = [{ "submitter_id": f"virus_sequence_{accession_number}" }] filename = f"virus_sequence_run_taxonomy_{accession_number}.csv" print(f"Get indexd info of {filename}") trying = True while trying: try: ( did, rev, md5sum, filesize, file_name, authz, ) = await self.file_helper.async_find_by_name( filename=filename) trying = False except Exception as e: print( f"Can not get indexd record of {filename}. Detail {e}. Retrying..." 
) assert ( did ), f"file {filename} does not exist in the index, rerun NCBI_FILE ETL" if not authz: tries = 0 while tries < MAX_RETRIES: try: await self.file_helper.async_update_authz(did=did, rev=rev) break except Exception as e: tries += 1 print( f"Can not update indexd for {did}. Detail {e}. Retrying..." ) submitted_json["file_size"] = filesize submitted_json["md5sum"] = md5sum submitted_json["object_id"] = did submitted_json["file_name"] = file_name self.submitting_data["virus_sequence_run_taxonomy"].append( submitted_json) async def files_to_node_submissions(self, node_name): """Get submitting data for the node""" retrying = True while retrying: try: submitting_accession_numbers = ( await self.get_submitting_accession_number_list(node_name)) retrying = False except Exception as e: print( f"Can not query peregine with {node_name}. Detail {e}. Retrying ..." ) for accession_number in submitting_accession_numbers: submitter_id = format_submitter_id( node_name, {"accession_number": accession_number}) cmc_submitter_id = format_submitter_id("cmc_ncbi_covid19", {}) contig_submitter_id = format_submitter_id( "virus_sequence_contig", {"accession_number": accession_number}) peptide_submitter_id = format_submitter_id( "virus_sequence_peptide", {"accession_number": accession_number}) run_taxonomy_submitter_id = format_submitter_id( "virus_sequence_run_taxonomy", {"accession_number": accession_number}) contig_taxonomy_submitter_id = format_submitter_id( "virus_sequence_contig_taxonomy", {"accession_number": accession_number}) if node_name == "virus_sequence_contig": submitted_json = { "submitter_id": submitter_id, "core_metadata_collections": [{ "submitter_id": cmc_submitter_id }], "virus_sequences_run_taxonomies": [{ "submitter_id": run_taxonomy_submitter_id }], "accession_number": accession_number, "data_type": "Virus Sequence Contig", "data_format": "json", "data_category": "Nucleotide Contig", } elif node_name == "virus_sequence_blastn": submitted_json = { 
"submitter_id": submitter_id, "core_metadata_collections": [{ "submitter_id": cmc_submitter_id }], "virus_sequence_contigs": [{ "submitter_id": contig_submitter_id }], "accession_number": accession_number, "data_type": "Virus Sequence Blastn", "data_format": "tsv", "data_category": "Nucleotide Blast", } elif node_name == "virus_sequence_peptide": submitted_json = { "submitter_id": submitter_id, "core_metadata_collections": [{ "submitter_id": cmc_submitter_id }], "virus_sequence_contigs": [{ "submitter_id": contig_submitter_id }], "accession_number": accession_number, "data_type": "Peptides Annotation Using VIGOR3", "data_format": "json", "data_category": "Peptides Annotation", } elif node_name == "virus_sequence_hmm_search": submitted_json = { "submitter_id": submitter_id, "core_metadata_collections": [{ "submitter_id": cmc_submitter_id }], "virus_sequence_peptides": [{ "submitter_id": peptide_submitter_id }], "accession_number": accession_number, "data_type": "Virus Sequence HMM Search", "data_format": "json", "data_category": "HMMER Scab of Contigs", } elif node_name == "virus_sequence_contig_taxonomy": submitted_json = { "submitter_id": submitter_id, "core_metadata_collections": [{ "submitter_id": cmc_submitter_id }], "virus_sequence_contigs": [{ "submitter_id": contig_submitter_id }], "accession_number": accession_number, "data_type": "Contig Taxonomy", "data_format": "json", "data_category": "Kmer-based Taxonomy Analysis of Contigs", } else: raise Exception(f"ERROR: {node_name} does not exist") ext = re.search("\.(.*)$", self.data_file.nodes[node_name][0]).group(1) filename = f"{node_name}_{accession_number}.{ext}" print(f"Get indexd record of {filename}") retrying = True while retrying: try: ( did, rev, md5sum, filesize, file_name, authz, ) = await self.file_helper.async_find_by_name( filename=filename) retrying = False except Exception as e: print( f"ERROR: Fail to query indexd for {filename}. Detail {e}. Retrying ..." 
) await asyncio.sleep(5) assert ( did ), f"file {filename} does not exist in the index, rerun NCBI_FILE ETL" if not authz: tries = 0 while tries < MAX_RETRIES: try: await self.file_helper.async_update_authz(did=did, rev=rev) break except Exception as e: tries += 1 print( f"ERROR: Fail to update indexd for {filename}. Detail {e}. Retrying ..." ) await asyncio.sleep(5) submitted_json["file_size"] = filesize submitted_json["md5sum"] = md5sum submitted_json["object_id"] = did submitted_json["file_name"] = file_name self.submitting_data[node_name].append(submitted_json) return submitting_accession_numbers async def get_submitting_accession_number_list_for_run_taxonomy(self): """get submitting number list for run_taxonomy file""" node_name = "virus_sequence_run_taxonomy" submitting_accession_numbers = set() existed_accession_numbers = await self.data_file.get_existed_accession_numbers( node_name) s3 = boto3.resource("s3", config=Config(signature_version=UNSIGNED)) s3_object = s3.Object(self.data_file.bucket, self.data_file.nodes[node_name][0]) file_path = f"{DATA_PATH}/virus_sequence_run_taxonomy.gz" s3_object.download_file(file_path) n_lines = 0 with gzip.open(file_path, "rb") as f: while True: bline = f.readline() if not bline: break n_lines += 1 if n_lines % 10000 == 0: print(f"Finish process {n_lines} of file {node_name}") line = bline.decode("UTF-8") r1 = re.findall("[SDE]RR\d+", line) if len(r1) == 0: continue read_accession_number = r1[0] if (f"{node_name}_{read_accession_number}" not in existed_accession_numbers): submitting_accession_numbers.add(read_accession_number) return list(submitting_accession_numbers) async def get_submitting_accession_number_list(self, node_name): """get submitting acession number list""" submitting_accession_numbers = set() existed_accession_numbers = await self.data_file.get_existed_accession_numbers( node_name) s3 = boto3.resource("s3", config=Config(signature_version=UNSIGNED)) s3_object = s3.Object(self.data_file.bucket, 
self.data_file.nodes[node_name][0]) line_stream = codecs.getreader("utf-8") n_lines = 0 for line in line_stream(s3_object.get()["Body"]): r1 = re.findall("[SDE]RR\d+", line) n_lines += 1 if n_lines % 10000 == 0: print(f"Finish process {n_lines} of file {node_name}") if len(r1) == 0: continue read_accession_number = r1[0] if (f"{node_name}_{read_accession_number}".lower() not in existed_accession_numbers): submitting_accession_numbers.add(read_accession_number) return list(submitting_accession_numbers) def _get_response_from_big_query(self, accession_numbers): """ Get data from big query. The format of the response json is described as below: [{ "acc": "DRR220591", "assay_type": "RNA-Seq", "center_name": "KUMAMOTO", "consent": "public", "experiment": "DRX210904", "sample_name": "SAMD00217265", "instrument": "Illumina NovaSeq 6000", "librarylayout": "PAIRED", "libraryselection": "RANDOM", "librarysource": "TRANSCRIPTOMIC", "platform": "ILLUMINA", "sample_acc": "DRS139760", "biosample": "SAMD00217265", "organism": "Mus musculus", "sra_study": "DRP006149", #'releasedate': datetime.datetime(2020, 6, 4, 0, 0, tzinfo=<UTC>), "bioproject": "PRJDB9618", "mbytes": 2160, "loaddate": None, "avgspotlen": 300, "mbases": 6395, "insertsize": None, "library_name": None, "biosamplemodel_sam": [], "collection_date_sam": [], "geo_loc_name_country_calc": None, "geo_loc_name_country_continent_calc": None, "geo_loc_name_sam": [], "ena_first_public_run": [], "ena_last_update_run": [], "sample_name_sam": ["WT3_plus"], "datastore_filetype": ["sra"], "datastore_provider": ["gs", "ncbi", "s3"], "datastore_region": ["gs.US", "ncbi.public", "s3.us-east-1"], }] """ assert accession_numbers != [], "accession_numbers is not empty" start = 0 offset = 100 client = bigquery.Client() while start < len(accession_numbers): end = min(start + offset, len(accession_numbers)) stm = 'SELECT * FROM `nih-sra-datastore`.sra.metadata where consent = "public"' stm = stm + f' and (acc = 
"{accession_numbers[start]}"' for accession_number in accession_numbers[start + 1:end]: stm = stm + f' or acc = "{accession_number}"' stm = stm + ")" query_job = client.query(stm) results = query_job.result() # Waits for job to complete. for row in results: yield dict(row) start = end async def _parse_big_query_response(self, response): """ Parse the big query response and get indexd record Return True if success """ accession_number = response["acc"] sample = {} virus_sequence = {} sample["submitter_id"] = f"sample_{accession_number}" sample["projects"] = [{"code": self.project_code}] for field in [ "ncbi_bioproject", "ncbi_biosample", "sample_accession", "host_associated_environmental_package_sam", "organism", "collection_date", "country_region", "continent", ]: if field in SPECIAL_MAP_FIELDS: old_name, dtype, handler = SPECIAL_MAP_FIELDS[field] sample[field] = handler(response.get(old_name)) elif field in response: sample[field] = str(response.get(field)) virus_sequence["submitter_id"] = f"virus_sequence_{accession_number}" for field in [ "assay_type", "avgspotlen", "bytes", "center_name", "consent", "datastore_provider", "datastore_region", "description_sam", "ena_checklist_sam", "ena_first_public_run", "ena_last_update_run", "experiment", "insdc_center_name_sam", "insdc_first_public_sam", "insdc_center_alias_sam", "insdc_last_update_sam", "investigation_type_sam", "insdc_status_sam", "instrument", "library_name", "libraryselection", "librarysource", "mbases", "mbytes", "platform", "sra_accession_sam", "sra_study", "title_sam", "release_date", "data_format", "librarylayout", ]: if field in SPECIAL_MAP_FIELDS: old_name, dtype, handler = SPECIAL_MAP_FIELDS[field] virus_sequence[field] = handler(response.get(old_name)) elif field in response: virus_sequence[field] = str(response.get(field)) virus_sequence["samples"] = [{"submitter_id": sample["submitter_id"]}] virus_sequence["data_category"] = "Nucleotide" virus_sequence["data_type"] = "Sequence" 
virus_sequence["file_name"] = self.accession_number_filename_map[ accession_number] virus_sequence["data_format"] = get_file_extension( virus_sequence["file_name"]) filename = virus_sequence["file_name"] retrying = True while retrying: try: ( did, rev, md5sum, filesize, file_name, authz, ) = await self.file_helper.async_find_by_name(filename=filename ) retrying = False except Exception as e: print( f"ERROR: Fail to get indexd for {filename}. Detail {e}. Retrying ..." ) await asyncio.sleep(5) if not did: print( f"file {filename} does not exist in the index, rerun NCBI_MANIFEST ETL" ) return False if not authz: retries = 0 while retries < MAX_RETRIES: try: await self.file_helper.async_update_authz(did=did, rev=rev) break except Exception as e: print( f"ERROR: Fail to update indexd for {filename}. Detail {e}. Retrying ..." ) retries += 1 await asyncio.sleep(5) virus_sequence["file_size"] = filesize virus_sequence["md5sum"] = md5sum virus_sequence["object_id"] = did self.submitting_data["virus_sequence"].append(virus_sequence) self.submitting_data["sample"].append(sample) return True
class IDPH(base.BaseETL):
    """
    ETL for the "IDPH" project of the "open" program.

    Downloads a JSON document from dph.illinois.gov and converts it into
    "summary_location" and "summary_clinical" records, which are collected on
    the instance and sent through `self.metadata_helper` by `submit_metadata`.
    """

    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "IDPH"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.country = "US"
        self.state = "IL"

        # county name -> {"lat": ..., "lon": ...}; populated by il_counties()
        self.county_dict = {}
        self.il_counties()

        # records accumulated by parse_file() and consumed by submit_metadata()
        self.summary_locations = []
        self.summary_clinicals = []

    def get_location_and_clinical_submitter_id(self, county, date):
        """
        Build the submitter ids of a location and of its clinical record.

        Args:
            county (str or None): county name; when None the location id is
                built from country and state only
            date (str): date used to derive the clinical submitter id

        Returns:
            (str, str): "summary_location" and "summary_clinical" submitter ids
        """
        location_keys = {"country": self.country, "state": self.state}
        if county is not None:
            location_keys["county"] = county

        location_submitter_id = format_submitter_id(
            "summary_location", location_keys
        )
        clinical_submitter_id = derived_submitter_id(
            location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"date": date},
        )
        return location_submitter_id, clinical_submitter_id

    def il_counties(self):
        """
        Load county coordinates from the bundled TSV into `self.county_dict`.

        The first line of the file is skipped (presumably a header row); every
        other line must hold exactly three tab-separated values: county,
        latitude and longitude.
        """
        tsv_path = os.path.join(
            CURRENT_DIR, "data/IL_counties_central_coords_lat_long.tsv"
        )
        with open(tsv_path) as tsv_file:
            data_lines = tsv_file.readlines()[1:]

        for raw_line in data_lines:
            county, lat, lon = raw_line.strip().split("\t")
            self.county_dict[county] = {"lat": lat, "lon": lon}

    def files_to_submissions(self):
        """
        Reads JSON file and convert the data to Sheepdog records.

        Does nothing when the latest submitted date is already today.
        """
        last_submitted = self.metadata_helper.get_latest_submitted_date_idph()
        today = datetime.date.today()

        if last_submitted == today:
            print("Nothing to submit: today and latest submitted date are the same.")
            return

        today_str = today.strftime("%Y%m%d")
        print(f"Getting data for date: {today_str}")

        # they changed the URL on April 1, 2020
        url_changed = today > datetime.date(2020, 3, 31)
        if url_changed:
            url = "http://www.dph.illinois.gov/sitefiles/COVIDTestResults.json"
        else:
            url = f"https://www.dph.illinois.gov/sites/default/files/COVID19/COVID19CountyResults{today_str}.json"

        self.parse_file(last_submitted, url)

    def parse_file(self, latest_submitted_date, url):
        """
        Converts a JSON files to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations` and
        `self.summary_clinicals`.

        Args:
            latest_submitted_date (date): date for latest submitted date
            url (str): URL at which the JSON file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as response:
            payload = response.json()
            date = idph_get_date(payload["LastUpdateDate"])

            already_submitted = (
                latest_submitted_date
                and date == latest_submitted_date.strftime("%Y-%m-%d")
            )
            if already_submitted:
                print(
                    "Nothing to submit: latest submitted date and date from data are the same."
                )
                return

            # county-level rows: one location and one clinical record each
            for county_json in payload["characteristics_by_county"]["values"]:
                demographic = payload.get("demographics", None)
                location, clinical = self.parse_county(
                    date, county_json, demographic
                )
                self.summary_locations.append(location)
                self.summary_clinicals.append(clinical)

            # state-level history: clinical records only
            for state_row in payload["state_testing_results"]["values"]:
                self.summary_clinicals.append(
                    self.parse_historical_data(state_row)
                )

    def parse_historical_data(self, illinois_data):
        """
        Parses historical state-level data. "summary_location" node is created
        from "characteristics_by_county" data.

        Args:
            illinois_data (dict): data JSON with "testDate", "total_tested",
                "confirmed_cases" and "deaths"

        Returns:
            dict: "summary_clinical" node for Sheepdog
        """
        county = "Illinois"

        # "testDate" arrives as MM/DD/YYYY and is stored as YYYY-MM-DD
        parsed = datetime.datetime.strptime(illinois_data["testDate"], "%m/%d/%Y")
        date = parsed.strftime("%Y-%m-%d")

        location_id, clinical_id = self.get_location_and_clinical_submitter_id(
            county, date
        )

        return {
            "submitter_id": clinical_id,
            "date": date,
            "confirmed": illinois_data["confirmed_cases"],
            "testing": illinois_data["total_tested"],
            "deaths": illinois_data["deaths"],
            "summary_locations": [{"submitter_id": location_id}],
        }

    def parse_county(self, date, county_json, demographic):
        """
        From county-level data, generate the data we can submit via Sheepdog

        Args:
            date (str): date of the data
            county_json (dict): JSON for county statistics
            demographic (dict or None): the "demographics" section of the
                document; only used for the "Illinois" aggregate row

        Returns:
            (dict, dict): "summary_location" and "summary_clinical" records
        """
        county = county_json["County"]
        location_id, clinical_id = self.get_location_and_clinical_submitter_id(
            county, date
        )

        summary_location = {
            "submitter_id": location_id,
            "country_region": self.country,
            "province_state": self.state,
            "projects": [{"code": self.project_code}],
        }

        # the IDPH data use Illinois in "County" field for aggregated data
        # in Gen3 it would equal to location with "province_state" equal to "IL" and no "County" field
        if county != "Illinois":
            summary_location["county"] = county

        known_coords = self.county_dict.get(county) if county in self.county_dict else None
        if county in self.county_dict:
            summary_location["latitude"] = known_coords["lat"]
            summary_location["longitude"] = known_coords["lon"]
        else:
            # fall back to the coordinates in the document; 0 is treated as "absent"
            if county_json["lat"] != 0:
                summary_location["latitude"] = str(county_json["lat"])
            if county_json["lon"] != 0:
                summary_location["longitude"] = str(county_json["lon"])

        summary_clinical = {
            "submitter_id": clinical_id,
            "date": date,
            "confirmed": county_json["confirmed_cases"],
            "testing": county_json["total_tested"],
            "deaths": county_json["deaths"],
            "summary_locations": [{"submitter_id": location_id}],
        }

        if "negative" in county_json:
            summary_clinical["negative"] = county_json["negative"]

        # demographic breakdowns are attached to the state aggregate only
        if county == "Illinois" and demographic:
            for group_key, (field, mapping) in fields_mapping.items():
                for item in demographic[group_key]:
                    dst_field = mapping[item[field]]
                    if not dst_field:
                        continue
                    for metric in ("count", "tested"):
                        if metric in item:
                            target = "{}_{}".format(dst_field, metric)
                            summary_clinical[target] = item[metric]

        return summary_location, summary_clinical

    def submit_metadata(self):
        """
        Submits the data in `self.summary_locations` and
        `self.summary_clinicals` to Sheepdog.
        """
        print("Submitting data...")

        print("Submitting summary_location data")
        for location in self.summary_locations:
            self.metadata_helper.add_record_to_submit(
                {"type": "summary_location", **location}
            )
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for clinical in self.summary_clinicals:
            self.metadata_helper.add_record_to_submit(
                {"type": "summary_clinical", **clinical}
            )
        self.metadata_helper.batch_submit_records()
class COXRAY(base.BaseETL):
    """
    ETL for the "COXRAY" project of the "open" program.

    Reads `metadata.csv` from `COXRAY_DATA_PATH`, turns every row into one
    record per node type, and submits the de-duplicated records through
    `self.metadata_helper`.
    """

    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "COXRAY"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.file_helper = FileHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        # node type -> list of records parsed so far; insertion order is the
        # order in which submit_metadata() submits the node types
        self.nodes = {
            "core_metadata_collection": [],
            "study": [],
            "subject": [],
            "observation": [],
            "follow_up": [],
            "demographic": [],
            "imaging_file": [],
        }

    def files_to_submissions(self):
        """
        Parse every data row of `metadata.csv` and append the resulting
        records to `self.nodes`.
        """
        metadata_path = Path(COXRAY_DATA_PATH).joinpath("metadata.csv")
        # newline="" is what the csv module asks for, so that newlines embedded
        # in quoted fields are interpreted by the reader and not by open()
        with open(metadata_path, newline="") as f:
            reader = csv.reader(f, delimiter=",", quotechar='"')
            headers = next(reader)
            for row in reader:
                for node_name, record in self.parse_row(headers, row).items():
                    self.nodes[node_name].append(record)

    def parse_row(self, headers, row):
        """
        Convert one CSV row into records keyed by node type.

        An "imaging_file" record is produced only when the image named in the
        "filename" column exists under `COXRAY_DATA_PATH/images`.

        Args:
            headers (list(str)): CSV header row
            row (list(str)): CSV data row, aligned with `headers`

        Returns:
            dict: node type -> record

        Raises:
            ValueError: if a required column is missing from `headers`
            AssertionError: if the image exists on disk but has no indexd record
        """

        def cell(column_name):
            # first column with this name, as headers.index() defines it
            return row[headers.index(column_name)]

        cmc_submitter_id = format_submitter_id("cmc_coxray", {})
        subject_submitter_id = format_submitter_id(
            "subject_coxray", {"patientid": cell("patientid")}
        )
        observation_submitter_id = derived_submitter_id(
            subject_submitter_id, "subject_coxray", "observation_coxray", {}
        )
        follow_up_submitter_id = derived_submitter_id(
            subject_submitter_id,
            "subject_coxray",
            "follow_up_coxray",
            {"offset": cell("offset")},
        )
        demographic_submitter_id = derived_submitter_id(
            subject_submitter_id, "subject_coxray", "demographic_coxray", {}
        )
        imaging_file_submitter_id = format_submitter_id(
            "imaging_file_coxray", {"filename": cell("filename")}
        )
        study_submitter_id = format_submitter_id(
            "study_coxray", {"doi": cell("doi")}
        )

        filename = Path(cell("filename"))
        filepath = Path(COXRAY_DATA_PATH).joinpath("images", filename)
        filepath_exist = filepath.exists()

        project_link = [{"code": self.project_code}]
        subject_link = [{"submitter_id": subject_submitter_id}]

        nodes = {
            "core_metadata_collection": {
                "submitter_id": cmc_submitter_id,
                "projects": list(project_link),
            },
            "study": {
                "submitter_id": study_submitter_id,
                "projects": list(project_link),
            },
            "subject": {
                "submitter_id": subject_submitter_id,
                "projects": list(project_link),
                "studies": [{"submitter_id": study_submitter_id}],
            },
            "observation": {
                "submitter_id": observation_submitter_id,
                "subjects": list(subject_link),
            },
            "follow_up": {
                "submitter_id": follow_up_submitter_id,
                "subjects": list(subject_link),
            },
            "demographic": {
                "submitter_id": demographic_submitter_id,
                "subjects": list(subject_link),
            },
        }

        if filepath_exist:
            # all suffixes joined, e.g. ".nii.gz" for a double extension
            data_type = "".join(filename.suffixes)

            did, rev, md5sum, filesize = self.file_helper.find_by_name(
                filename=filename
            )
            assert (
                did
            ), f"file {filename} does not exist in the index, rerun COXRAY_FILE ETL"
            self.file_helper.update_authz(did=did, rev=rev)

            nodes["imaging_file"] = {
                "submitter_id": imaging_file_submitter_id,
                "subjects": list(subject_link),
                "follow_ups": [{"submitter_id": follow_up_submitter_id}],
                "core_metadata_collections": [{"submitter_id": cmc_submitter_id}],
                "data_type": data_type,
                "data_format": "Image File",
                "data_category": "X-Ray Image",
                "file_size": filesize,
                "md5sum": md5sum,
                "object_id": did,
            }
        else:
            print(
                f"subject references the file that doesn't exist as a file: {filepath}"
            )

        # copy the remaining CSV columns onto the node each one belongs to;
        # columns targeting a node that was not created (e.g. imaging_file
        # without an image) and empty cells are ignored
        for column, (node, field, converter) in fields_mapping.items():
            value = cell(column)
            if node in nodes and value:
                nodes[node][field] = converter(value) if converter else value

        return nodes

    def submit_metadata(self):
        """
        Submit the records in `self.nodes`, one node type at a time.

        Within a node type only the first record seen for each submitter_id is
        submitted.
        """
        print("Submitting data...")

        for node_type, records in self.nodes.items():
            # a set keeps the duplicate check O(1) per record
            seen_submitter_ids = set()
            print(f"Submitting {node_type} data...")
            for node in records:
                node_record = {"type": node_type}
                node_record.update(node)
                submitter_id = node_record["submitter_id"]
                if submitter_id not in seen_submitter_ids:
                    seen_submitter_ids.add(submitter_id)
                    self.metadata_helper.add_record_to_submit(node_record)
            self.metadata_helper.batch_submit_records()
class OWID(base.BaseETL):
    """
    ETL for the "OWID" project of the "open" program.

    Downloads the testing CSV from the owid/covid-19-data GitHub repository
    and converts each row into a "summary_location" and a "summary_clinical"
    record.
    """

    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_locations = []
        self.summary_clinicals = []

        self.program_name = "open"
        self.project_code = "OWID"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        # structure is
        # (csv field name, (node type, node field name, type of field))
        # The order must match the column order of the CSV file.
        testing_fields = [
            ("ISO code", ("summary_location", "iso3", str)),
            ("Entity", (None, None, split_entity)),
            ("Date", ("summary_clinical", "date", str)),
            ("Source URL", ("summary_clinical", "source_url", str)),
            ("Source label", ("summary_clinical", "source_label", str)),
            ("Notes", ("summary_clinical", "notes", str)),
            ("Number of observations", ("summary_clinical", "num_observations", int)),
            ("Cumulative total", ("summary_clinical", "testing", int)),
            (
                "Cumulative total per thousand",
                ("summary_clinical", "cumulative_total_per_thousand", int),
            ),
            (
                "Daily change in cumulative total",
                ("summary_clinical", "daily_change_in_cumulative_total", int),
            ),
            (
                "Daily change in cumulative total per thousand",
                (
                    "summary_clinical",
                    "daily_change_in_cumulative_total_per_thousand",
                    int,
                ),
            ),
            (
                "7-day smoothed daily change",
                ("summary_clinical", "seven_day_smoothed_daily_change", int),
            ),
            (
                "7-day smoothed daily change per thousand",
                (
                    "summary_clinical",
                    "seven_day_smoothed_daily_change_per_thousand",
                    float,
                ),
            ),
            ("Short-term positive rate", (None, None, None)),
            ("Short-term tests per case", (None, None, None)),
            ("General source label", ("summary_clinical", "general_source_label", str)),
            ("General source URL", ("summary_clinical", "general_source_url", str)),
            ("Short description", ("summary_clinical", "short_description", str)),
            ("Detailed description", ("summary_clinical", "detailed_description", str)),
        ]

        # csv field name -> (column index, (node type, node field name, type))
        self.headers_mapping = {}
        for position, (csv_header, target) in enumerate(testing_fields):
            self.headers_mapping[csv_header] = (position, target)

    def files_to_submissions(self):
        """
        Reads CSV files and converts the data to Sheepdog records
        """
        url = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/testing/covid-testing-latest-data-source-details.csv"
        self.parse_file(url)

    def parse_file(self, url):
        """
        Download the CSV at `url` and store the parsed records in
        `self.summary_locations` (without duplicates) and
        `self.summary_clinicals`.

        Args:
            url (str): URL at which the CSV file is available

        Raises:
            AssertionError: if the download returned "404: Not Found" or the
                leading CSV headers differ from the expected ones
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as response:
            decoded_lines = (raw.decode("utf-8") for raw in response.iter_lines())
            reader = csv.reader(decoded_lines, delimiter=",", quotechar='"')

            headers = next(reader)
            assert (
                headers[0] != "404: Not Found"
            ), " Unable to get file contents, received {}.".format(headers)

            # only the leading columns are checked; extra trailing columns are tolerated
            expected_h = list(self.headers_mapping)
            obtained_h = headers[: len(expected_h)]
            assert (
                obtained_h == expected_h
            ), "CSV headers have changed (expected {}, got {}). We may need to update the ETL code".format(
                expected_h, obtained_h
            )

            for row in reader:
                location, clinical = self.parse_row(row, self.headers_mapping)
                if location not in self.summary_locations:
                    self.summary_locations.append(location)
                self.summary_clinicals.append(clinical)

    def parse_row(self, row, mapping):
        """
        Convert one CSV row into a location record and a clinical record.

        Args:
            row (list(str)): row of data
            mapping (dict): csv field name ->
                (column index, (node type, node field name, converter))

        Returns:
            (dict, dict): "summary_location" and "summary_clinical" records
        """
        summary_location = {}
        summary_clinical = {}

        for header, (column, (node_type, node_field, type_conv)) in mapping.items():
            if header == "Entity":
                # the "Entity" cell carries both the country and the test type
                country, test_type = split_entity(row[column])
                summary_location["country_region"] = country
                summary_clinical["test_type"] = test_type

            if not node_field:
                continue

            value = row[column]
            if not value:
                continue

            if node_type == "summary_location":
                summary_location[node_field] = type_conv(value)
            elif node_type == "summary_clinical":
                if type_conv == int:
                    # go through float first so values such as "12.0" convert
                    summary_clinical[node_field] = type_conv(float(value))
                else:
                    summary_clinical[node_field] = type_conv(value)

        location_submitter_id = format_location_submitter_id(summary_location)
        summary_location["submitter_id"] = location_submitter_id
        summary_location["projects"] = [{"code": self.project_code}]

        # the clinical id is derived from today's date, not from the "Date" column
        today_str = datetime.date.today().strftime("%Y-%m-%d")
        summary_clinical["submitter_id"] = format_summary_clinical_submitter_id(
            location_submitter_id,
            test_type=summary_clinical["test_type"],
            date=today_str,
        )
        summary_clinical["summary_locations"] = [
            {"submitter_id": location_submitter_id}
        ]

        return summary_location, summary_clinical

    def submit_metadata(self):
        """
        Submit all collected locations first, then all clinical records.
        """
        print("Submitting summary_location data")
        for location in self.summary_locations:
            self.metadata_helper.add_record_to_submit(
                {"type": "summary_location", **location}
            )
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for clinical in self.summary_clinicals:
            self.metadata_helper.add_record_to_submit(
                {"type": "summary_clinical", **clinical}
            )
        self.metadata_helper.batch_submit_records()
class CTP(base.BaseETL):
    """
    ETL for the "CTP" project of the "open" program.

    Downloads the states daily CSV from api.covidtracking.com, joins it with a
    race/ethnicity CSV, and converts each row into a "summary_location" and a
    "summary_clinical" record.
    """

    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_locations = []
        self.summary_clinicals = []
        # column name -> column index in the merged (daily + race) row
        self.header_to_column = {}

        self.program_name = "open"
        self.project_code = "CTP"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        # columns that must be present in the daily CSV
        self.expected_file_headers = {
            "date",
            "state",
            "positive",
            "negative",
            "pending",
            "totalTestResults",
            "hospitalizedCurrently",
            "hospitalizedCumulative",
            "inIcuCurrently",
            "inIcuCumulative",
            "onVentilatorCurrently",
            "onVentilatorCumulative",
            "recovered",
            "dataQualityGrade",
            "lastUpdateEt",
            "dateModified",
            "checkTimeEt",
            "death",
            "hospitalized",
            "dateChecked",
            "totalTestsViral",
            "positiveTestsViral",
            "negativeTestsViral",
            "positiveCasesViral",
            "deathConfirmed",
            "deathProbable",
            "totalTestEncountersViral",
            "totalTestsPeopleViral",
            "totalTestsAntibody",
            "positiveTestsAntibody",
            "negativeTestsAntibody",
            "totalTestsPeopleAntibody",
            "positiveTestsPeopleAntibody",
            "negativeTestsPeopleAntibody",
            "totalTestsPeopleAntigen",
            "positiveTestsPeopleAntigen",
            "totalTestsAntigen",
            "positiveTestsAntigen",
            "fips",
            "positiveIncrease",
            "negativeIncrease",
            "total",
            "totalTestResultsSource",
            "totalTestResultsIncrease",
            "posNeg",
            "deathIncrease",
            "hospitalizedIncrease",
            "hash",
            "commercialScore",
            "negativeRegularScore",
            "negativeScore",
            "positiveScore",
            "score",
            "grade",
        }

        # columns that must be present in the race CSV
        self.expected_race_headers = {
            "Date",
            "State",
            "Cases_Total",
            "Cases_White",
            "Cases_Black",
            "Cases_Latinx",
            "Cases_Asian",
            "Cases_AIAN",
            "Cases_NHPI",
            "Cases_Multiracial",
            "Cases_Other",
            "Cases_Unknown",
            "Cases_Ethnicity_Hispanic",
            "Cases_Ethnicity_NonHispanic",
            "Cases_Ethnicity_Unknown",
            "Deaths_Total",
            "Deaths_White",
            "Deaths_Black",
            "Deaths_Latinx",
            "Deaths_Asian",
            "Deaths_AIAN",
            "Deaths_NHPI",
            "Deaths_Multiracial",
            "Deaths_Other",
            "Deaths_Unknown",
            "Deaths_Ethnicity_Hispanic",
            "Deaths_Ethnicity_NonHispanic",
            "Deaths_Ethnicity_Unknown",
        }

    def files_to_submissions(self):
        """
        Reads CSV files and converts the data to Sheepdog records
        """
        url = "https://api.covidtracking.com/v1/states/daily.csv"
        self.parse_file(url)

    def extract_races(self):
        """
        Extract race information. Store the data to a dictionary for fast
        lookup during merging process.

        Returns:
            (dict, list(str)):
                - (Date, State, Cases_Total) -> remaining values of that row
                - header row of the race CSV

        Raises:
            AssertionError: if the download failed or the headers are not the
                expected ones
        """
        url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vS8SzaERcKJOD_EzrtCDK1dX1zkoMochlA9iHoHg_RSw3V8bkpfk1mpw4pfL5RdtSOyx_oScsUtyXyk/pub?gid=43720681&single=true&output=csv"
        print("Getting data from {}".format(url))
        races = {}
        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')
            headers = next(reader)

            assert (
                headers[0] != "404: Not Found"
            ), "Unable to get file contents, received {}.".format(headers)

            assert len(headers) >= 3, "Unexpected headers: {}".format(headers)
            assert (headers[0], headers[1], headers[2]) == (
                "Date",
                "State",
                "Cases_Total",
            ), "The first 3 column names of the race data must be Dat, State, Cases_Total. Got: {}".format(
                headers
            )
            assert self.expected_race_headers.issubset(
                set(headers)
            ), "CSV headers have changed (expected {} is a subset of {}). We may need to update the ETL code".format(
                self.expected_race_headers, headers
            )

            for row in reader:
                if not row:
                    continue
                try:
                    races[(row[0], row[1], row[2])] = row[3:]
                except IndexError as e:
                    # a row with fewer than 3 cells cannot be keyed; skip it
                    print(
                        f"Error processing race row: {row}.\nSkipping row. Detail: {e}"
                    )

        return races, headers

    def parse_file(self, url):
        """
        Converts a CSV file to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations` and
        `self.summary_clinicals`.

        Every daily row is extended with the race columns of the race row that
        has the same first three cells, or with empty cells when there is no
        such row. A location is stored only the first time its `submitter_id`
        is seen.

        Args:
            url (str): URL at which the CSV file is available
        """
        races, race_headers = self.extract_races()
        # the first three race columns are the join key and are not appended
        race_columns = race_headers[3:]

        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')
            headers = next(reader)

            assert (
                headers[0] != "404: Not Found"
            ), "Unable to get file contents, received {}.".format(headers)

            assert self.expected_file_headers.issubset(
                set(headers)
            ), "CSV headers have changed (expected {} is a subset of {}). We may need to update the ETL code".format(
                self.expected_file_headers, headers
            )

            headers = headers + race_columns
            for index, name in enumerate(headers):
                self.header_to_column[name] = index

            seen_location_ids = set()
            for row in reader:
                if not row:
                    # blank line, same treatment as in extract_races()
                    continue

                race_values = races.get((row[0], row[1], row[2]), [])
                row.extend(race_values)
                # Pad to the real number of race columns so that every index in
                # header_to_column exists in the row. The header check only
                # requires the expected names to be a subset, so the sheet may
                # hold more columns than `expected_race_headers`.
                row.extend([""] * (len(race_columns) - len(race_values)))

                summary_location, summary_clinical = self.parse_row(row)

                location_id = summary_location["submitter_id"]
                if location_id not in seen_location_ids:
                    self.summary_locations.append(summary_location)
                    seen_location_ids.add(location_id)

                self.summary_clinicals.append(summary_clinical)

    def parse_row(self, row):
        """
        Converts a row of a CSV file to data we can submit via Sheepdog

        Args:
            row (list(str)): merged row of data (daily columns followed by
                race columns), indexed through `self.header_to_column`

        Returns:
            (dict, dict) tuple:
                - "summary_location" record
                - "summary_clinical" record
        """
        date = row[self.header_to_column["date"]]
        # YYYYMMDD in the CSV, YYYY-MM-DD in the records
        date = datetime.strptime(date, "%Y%m%d").date()
        date = date.strftime("%Y-%m-%d")

        country = "US"
        state = row[self.header_to_column["state"]]
        summary_location_submitter_id = format_location_submitter_id(country, state)

        summary_location = {
            "country_region": country,
            "submitter_id": summary_location_submitter_id,
            "projects": [{"code": self.project_code}],
            "province_state": state,
        }

        fips = row[self.header_to_column["fips"]]
        if fips:
            summary_location["FIPS"] = int(fips)

        summary_clinical_submitter_id = format_summary_clinical_submitter_id(
            summary_location_submitter_id, date
        )
        summary_clinical = {
            "date": date,
            "submitter_id": summary_clinical_submitter_id,
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
        }

        # record field -> CSV column, all converted to int
        # NOTE(review): "deaths" is read from the race sheet ("Deaths_Total"),
        # not from the daily CSV "death" column - confirm this is intended.
        map_csv_fields = {
            "confirmed": "positive",
            "negative": "negative",
            "pending": "pending",
            "hospitalizedCurrently": "hospitalizedCurrently",
            "hospitalizedCumulative": "hospitalizedCumulative",
            "inIcuCurrently": "inIcuCurrently",
            "inIcuCumulative": "inIcuCumulative",
            "onVentilatorCurrently": "onVentilatorCurrently",
            "recovered": "recovered",
            "totalTestsViral": "totalTestsViral",
            "positiveTestsViral": "positiveTestsViral",
            "negativeTestsViral": "negativeTestsViral",
            "positiveCasesViral": "positiveCasesViral",
            "positiveIncrease": "positiveIncrease",
            "negativeIncrease": "negativeIncrease",
            "totalTestResultsIncrease": "totalTestResultsIncrease",
            "deathIncrease": "deathIncrease",
            "hospitalizedIncrease": "hospitalizedIncrease",
            "race_white_count": "Cases_White",
            "race_black_count": "Cases_Black",
            "race_hispanic_count": "Cases_Latinx",
            "race_asian_count": "Cases_Asian",
            "race_ai_an_count": "Cases_AIAN",
            "race_nh_pi_count": "Cases_NHPI",
            "race_multiracial_count": "Cases_Multiracial",
            "race_other_count": "Cases_Other",
            "race_left_blank_count": "Cases_Unknown",
            "ethnicity_hispanic_count": "Cases_Ethnicity_Hispanic",
            "ethnicity_nonhispanic_count": "Cases_Ethnicity_NonHispanic",
            "ethnicity_unknown_count": "Cases_Ethnicity_Unknown",
            "deaths": "Deaths_Total",
            "race_white_deaths": "Deaths_White",
            "race_black_deaths": "Deaths_Black",
            "race_hispanic_deaths": "Deaths_Latinx",
            "race_asian_deaths": "Deaths_Asian",
            "race_ai_an_deaths": "Deaths_AIAN",
            "race_nh_pi_deaths": "Deaths_NHPI",
            "race_multiracial_deaths": "Deaths_Multiracial",
            "race_other_deaths": "Deaths_Other",
            "race_left_blank_deaths": "Deaths_Unknown",
            "ethnicity_hispanic_deaths": "Deaths_Ethnicity_Hispanic",
            "ethnicity_nonhispanic_deaths": "Deaths_Ethnicity_NonHispanic",
            "ethnicity_unknown_deaths": "Deaths_Ethnicity_Unknown",
        }

        for field, column in map_csv_fields.items():
            value = row[self.header_to_column[column]]
            # empty cells and "nan" / "n/a" markers are left out of the record
            if value and value.lower() not in ("nan", "n/a"):
                # numbers may carry thousands separators, e.g. "1,234"
                summary_clinical[field] = int(value.replace(",", ""))

        dataQualityGrade = row[self.header_to_column["dataQualityGrade"]]
        if dataQualityGrade:
            summary_clinical["dataQualityGrade"] = dataQualityGrade

        lastUpdateEt = row[self.header_to_column["lastUpdateEt"]]
        if lastUpdateEt:
            summary_clinical["lastUpdateEt"] = lastUpdateEt

        return summary_location, summary_clinical

    def submit_metadata(self):
        """
        Batch submits all records in `self.summary_locations`, then all
        records in `self.summary_clinicals`, adding the node "type" to each.
        """
        # NOTE(review): an earlier comment marked the summary_location
        # submission as needed only for a one-time load; it is active here.
        print("Submitting summary_location data")
        for loc in self.summary_locations:
            loc_record = {"type": "summary_location"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals:
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
class LOAD_VIRUS_METADATA(base.BaseETL):
    """
    Submits metadata records for virus genome, sequence, alignment and HMM
    files found in the current working directory.

    Node types, data types and data formats are read from a YAML file located
    in `CURRENT_DIR` and named after this script.
    """

    # Files are hashed block by block so they are never held in memory whole.
    _CHECKSUM_CHUNK_SIZE = 1024 * 1024

    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        # Get all input strings from YAML
        script = path.splitext(path.basename(__file__))[0].strip("/")
        script = path.join(CURRENT_DIR, script + ".yaml")
        with open(script) as f:
            # NOTE(review): the config file ships next to the script; if it can
            # ever come from an untrusted source, switch to `yaml.safe_load`.
            config = yaml.load(f, Loader=yaml.FullLoader)

        self.verbose = config["verbose"]
        self.program_name = config["program_name"]
        self.project_code = config["project_code"]
        self.virus_genome_data_category = config["virus_genome_data_category"]
        self.virus_genome_data_type = config["virus_genome_data_type"]
        self.virus_genome_data_format = config["virus_genome_data_format"]
        self.virus_genome_source = config["virus_genome_source"]
        self.virus_genome_type = config["virus_genome_type"]
        self.virus_sequence_type = config["virus_sequence_type"]
        self.virus_sequence_data_type = config["virus_sequence_data_type"]
        self.virus_sequence_data_format = config["virus_sequence_data_format"]
        self.virus_sequence_alignment_type = config["virus_sequence_alignment_type"]
        self.virus_sequence_alignment_data_type = config[
            "virus_sequence_alignment_data_type"
        ]
        self.virus_sequence_alignment_data_format = config[
            "virus_sequence_alignment_data_format"
        ]
        self.virus_sequence_alignment_tool = config["virus_sequence_alignment_tool"]
        self.virus_sequence_hmm_type = config["virus_sequence_hmm_type"]
        self.virus_sequence_hmm_data_type = config["virus_sequence_hmm_data_type"]
        self.virus_sequence_hmm_data_format = config["virus_sequence_hmm_data_format"]

        # Records built by `write()`, one list per node type
        self.virus_genomes = []
        self.virus_sequences = []
        self.virus_sequence_alignments = []
        self.virus_sequence_hmms = []

        self.metadata_helper = MetadataHelper(
            base_url=base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def checksum(self, filename):
        """
        Return the hexadecimal MD5 digest of a file.

        Args:
            filename (str): path of the file to hash
        """
        digest = hashlib.md5()
        with open(filename, "rb") as f:
            for chunk in iter(lambda: f.read(self._CHECKSUM_CHUNK_SIZE), b""):
                digest.update(chunk)
        return digest.hexdigest()

    def _already_submitted_today(self):
        """
        Return True (and say so) when the latest submitted date reported by
        the metadata helper equals today's date.
        """
        latest_submitted_date = (
            self.metadata_helper.get_latest_submitted_data_virus_genome()
        )
        today = datetime.date.today()
        if latest_submitted_date == today:
            print("Nothing to submit: today and latest submitted date are the same.")
            return True
        return False

    def files_to_submissions(self):
        """
        Only reports whether there is anything new to submit; the records
        themselves are built and submitted by `submit_metadata`.
        """
        self._already_submitted_today()

    def submit_metadata(self):
        """
        Discover the data files and submit their metadata, unless data was
        already submitted today.
        """
        if self._already_submitted_today():
            return

        self.read()
        self.write()

    def read(self):
        """
        List the files of the current working directory to submit, by
        extension.
        """
        self.genomes = glob.glob("*.gb", recursive=False)
        self.seqs = glob.glob("*.fasta", recursive=False)
        self.alns = glob.glob("*.aln", recursive=False)
        self.hmms = glob.glob("*.hmm", recursive=False)

    def _file_properties(self, file_name):
        """
        Return the properties derived from the file itself, shared by all
        node types.
        """
        return {
            "submitter_id": file_name.replace(".", "_"),
            "file_name": file_name,
            "md5sum": self.checksum(file_name),
            "file_size": path.getsize(file_name),
            "projects": [{"code": self.project_code}],
        }

    def _submit_records(self, label, node_type, records):
        """
        Queue `records` as nodes of type `node_type` and submit them as one
        batch. `label` is only used in the progress message.
        """
        if self.verbose:
            print(f"Submitting {label} data")
        for record in records:
            self.metadata_helper.add_record_to_submit({"type": node_type, **record})
        self.metadata_helper.batch_submit_records()

    def write(self):
        """
        Build one record per file listed by `read()` and submit each node
        type as its own batch: genomes, sequences, alignments, then HMMs.
        """
        # Genomes
        for genome in self.genomes:
            self.virus_genomes.append(
                {
                    "data_category": self.virus_genome_data_category,
                    "data_type": self.virus_genome_data_type,
                    "data_format": self.virus_genome_data_format,
                    "source": self.virus_genome_source,
                    **self._file_properties(genome),
                }
            )
        self._submit_records(
            "virus_genome", self.virus_genome_type, self.virus_genomes
        )

        # Sequences
        for seq in self.seqs:
            self.virus_sequences.append(
                {
                    # Data Category: Protein or Nucleotide
                    "data_category": (
                        "Protein" if "-aa.fasta" in seq else "Nucleotide"
                    ),
                    "data_type": self.virus_sequence_data_type,
                    "data_format": self.virus_sequence_data_format,
                    **self._file_properties(seq),
                }
            )
        self._submit_records(
            "virus_sequence", self.virus_sequence_type, self.virus_sequences
        )

        # Alignments
        for aln in self.alns:
            self.virus_sequence_alignments.append(
                {
                    # Data Category: Protein or Nucleotide
                    "data_category": "Protein" if "-aa.aln" in aln else "Nucleotide",
                    "data_type": self.virus_sequence_alignment_data_type,
                    "data_format": self.virus_sequence_alignment_data_format,
                    **self._file_properties(aln),
                    "alignment_tool": self.virus_sequence_alignment_tool,
                }
            )
        self._submit_records(
            "virus_sequence_alignment",
            self.virus_sequence_alignment_type,
            self.virus_sequence_alignments,
        )

        # HMMs
        for hmm in self.hmms:
            self.virus_sequence_hmms.append(
                {
                    # Data Category: Protein or Nucleotide
                    "data_category": "Protein" if "-aa.hmm" in hmm else "Nucleotide",
                    "data_type": self.virus_sequence_hmm_data_type,
                    "data_format": self.virus_sequence_hmm_data_format,
                    **self._file_properties(hmm),
                }
            )
        self._submit_records(
            "virus_sequence_hmm",
            self.virus_sequence_hmm_type,
            self.virus_sequence_hmms,
        )
class IDPH_VACCINE(IDPH):
    """
    ETL for the IDPH vaccine data of Illinois and its counties.

    Builds `summary_location`, `summary_clinical` and demographic group
    records, keyed by `submitter_id`, and submits them through the metadata
    helper.
    """

    # Source value -> submitted value, per demographic property. Values that
    # are not listed are submitted unchanged.
    _VALUE_MAPPINGS = {
        "Race": {
            "Black or African-American": "Black",
            "Other race": "Other",
            "Native Hawaiian or Other Pacif": "Native Hawaiian or Other Pacific Islander",
            "American Indian or Alaska Nati": "American Indian or Alaska Native",
            "Hispanic or Latino": "Hispanic",
        },
        "Gender": {
            "Unknown": "Unknown or Left Blank",
        },
        "AgeGroup": {"65+": "greater than 65"},
    }

    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "IDPH-Vaccine"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.country = "US"
        self.state = "IL"
        self.date = ""
        self.counties_inventory = {}

        # Records to submit, keyed by submitter_id
        self.summary_locations = {}
        self.summary_clinicals = {}
        self.summary_group_demographic = {}

    def parse_list_of_counties(self):
        """
        Store into `self.date` the date the data was last updated, and
        into `self.counties_inventory` the data in format:
            { <county name>: { <county properties> } }
        """
        response = requests.get(ROOT_URL, headers={"content-type": "json"})
        json_response = json.loads(response.text)
        self.date = idph_get_date(json_response.get("lastUpdatedDate"))
        print(f"Dataset's last updated date: {self.date}")

        root_json = json_response.get("VaccineAdministration")
        if root_json is None:
            return
        for item in root_json:
            county = item.get("CountyName")
            self.counties_inventory[county] = item

    def files_to_submissions(self):
        """
        Reads JSON file and convert the data to Sheepdog records
        """
        # Disabled check on the latest submitted date:
        # latest_submitted_date = (
        #     self.metadata_helper.get_latest_submitted_date_idph()
        # )
        # if latest_submitted_date != None and latest_submitted_date == self.date:
        #     print(
        #         "Nothing to submit: data of latest submitted date and IDPH are the same."
        #     )
        #     return

        self.parse_link()

    def get_group_clinical_demographic_submitter_id(
        self, summary_clinical_submitter_id, key_dict
    ):
        """
        Derive the `submitter_id` of a demographic group record from the
        `submitter_id` of its `summary_clinical` record and its key
        properties.
        """
        summary_group_demographic_submitter_id = derived_submitter_id(
            summary_clinical_submitter_id,
            "summary_clinical",
            "summary_group_demographic",
            key_dict,
        )
        return summary_group_demographic_submitter_id

    def map_race(self, value, prop_name):
        """
        Translate a source value of the `prop_name` property ("Race",
        "Gender" or "AgeGroup") to the value to submit. Despite its name this
        handles all three properties. Unknown values and other properties are
        returned unchanged.
        """
        mapping = self._VALUE_MAPPINGS.get(prop_name)
        if mapping is not None and value in mapping:
            return mapping[value]
        return value

    def parse_group_clinical_demographic(self, props_mapping, props_value):
        """
        Convert one demographic item of the source data.

        Args:
            props_mapping (dict): source property name -> record property name
            props_value (dict): the source item

        Returns:
            (dict, dict): the key properties (raw source values, used to
            derive the submitter_id) and the record properties (mapped values)
        """
        key_props_name = ["AgeGroup", "Race", "Gender"]
        key_props = {}
        for k in key_props_name:
            key = props_mapping.get(k)
            if key is not None:
                key_props[key] = props_value.get(k)

        props_data = {}
        for (k, v) in props_mapping.items():
            if k not in props_value:
                continue
            if k in key_props_name:
                value = props_value.get(k)
                if k == "AgeGroup":
                    # e.g. "16-64" -> "16 to 64"
                    value = value.replace("-", " to ")
                props_data[v] = self.map_race(value, k)
            else:
                props_data[v] = props_value.get(k)
        return key_props, props_data

    def parse_link(self):
        """
        Converts the source data to data we can submit via Sheepdog. Stores
        the records to submit in `self.summary_locations`,
        `self.summary_clinicals` and `self.summary_group_demographic`.
        """
        illinois_summary_clinical_submitter_id = self.parse_county_data()
        self.parse_total_state_wide(illinois_summary_clinical_submitter_id)

    def parse_county_data(self):
        """
        For each county, converts the raw data into Sheepdog submissions by
        mapping properties to match the PRC data dictionary.

        Return the `submitter_id` for the state-wide `summary_clinical`
        record, or an empty string if there is no "Illinois" entry in the
        list of counties.
        """
        county_vaccine_mapping = {
            "AdministeredCount": "vaccine_administered_count",
            "AdministeredCountChange": "vaccine_administered_count_change",
            "AdministeredCountRollAvg": "vaccine_administered_count_roll_avg",
            "PersonsFullyVaccinated": "vaccine_persons_fully_vaccinated",
            "Report_Date": "date",
            "PctVaccinatedPopulation": "vaccine_persons_fully_vaccinated_pct",
        }
        county_demo_mapping = {
            "AgeGroup": "age_group",
            "Race": "race",
            "Gender": "gender",
            "AdministeredCount": "vaccine_administered_count",
            "PersonsFullyVaccinated": "vaccine_persons_fully_vaccinated",
        }
        inventory_reported = {
            "LHDReportedInventory": "vaccine_LHDR_reported_inventory",
            "CommunityReportedInventory": "vaccine_community_reported_inventory",
            "TotalReportedInventory": "vaccine_reported_inventory",
            "InventoryReportDate": "date",
        }

        self.parse_list_of_counties()
        illinois_summary_clinical_submitter_id = ""

        for county in self.counties_inventory:
            county_covid_response = requests.get(
                COUNTY_COVID_LINK_FORMAT.format(county),
                headers={"content-type": "json"},
            )
            county_covid_data = json.loads(county_covid_response.text).get(
                "CurrentVaccineAdministration"
            )
            county_demo_response = requests.get(
                COUNTY_DEMO_LINK_FORMAT.format(county),
                headers={"content-type": "json"},
            )
            county_demo_data = json.loads(county_demo_response.text)

            (
                summary_location_submitter_id,
                summary_clinical_submitter_id,
            ) = self.get_location_and_clinical_submitter_id(county, self.date)

            # The state-wide numbers are listed as a county named "Illinois"
            if county.lower() == "illinois":
                illinois_summary_clinical_submitter_id = summary_clinical_submitter_id

            for category in ["Age", "Race", "Gender"]:
                # A category missing from the response yields no record
                # instead of failing on iterating `None`
                for item in county_demo_data.get(category) or []:
                    keys, props = self.parse_group_clinical_demographic(
                        county_demo_mapping, item
                    )
                    group_demographics_submitter_id = (
                        self.get_group_clinical_demographic_submitter_id(
                            summary_clinical_submitter_id, keys
                        )
                    )
                    props["submitter_id"] = group_demographics_submitter_id
                    props["summary_clinicals"] = [
                        {"submitter_id": summary_clinical_submitter_id}
                    ]
                    self.summary_group_demographic[
                        group_demographics_submitter_id
                    ] = props

            summary_location = {
                "country_region": self.country,
                "submitter_id": summary_location_submitter_id,
                "projects": [{"code": self.project_code}],
                "province_state": self.state,
                "county": county,
            }

            summary_clinical = {
                "submitter_id": summary_clinical_submitter_id,
                "date": self.date,
                "summary_locations": [{"submitter_id": summary_location_submitter_id}],
            }

            for (source_key, prop) in county_vaccine_mapping.items():
                source_value = county_covid_data.get(source_key)
                if prop == "vaccine_persons_fully_vaccinated_pct":
                    # NOTE(review): int() truncates, and float rounding can
                    # lose one unit (int(0.29 * 100) == 28) - confirm that
                    # truncation rather than rounding is intended.
                    summary_clinical[prop] = int(source_value * 100)
                elif prop == "vaccine_administered_count_roll_avg":
                    summary_clinical[prop] = int(source_value)
                elif prop == "date":
                    summary_clinical[prop] = remove_time_from_date_time(source_value)
                else:
                    summary_clinical[prop] = source_value

            # The inventory date is written last, so it is the one left in
            # the record's `date`
            for (source_key, prop) in inventory_reported.items():
                source_value = self.counties_inventory[county].get(source_key)
                summary_clinical[prop] = (
                    source_value
                    if prop != "date"
                    else remove_time_from_date_time(source_value)
                )

            self.summary_locations[summary_location_submitter_id] = summary_location
            self.summary_clinicals[summary_clinical_submitter_id] = summary_clinical

        return illinois_summary_clinical_submitter_id

    def parse_total_state_wide(self, state_summary_clinical_submitter_id):
        """
        Parse the Illinois total stats and add them to the state-wide
        `summary_clinical` record.

        Args:
            state_summary_clinical_submitter_id (str): submitter_id of the
                state-wide record; it must already be in
                `self.summary_clinicals` (KeyError otherwise)
        """
        county_covid_response = requests.get(
            TOTAL_VACCINE_LINK, headers={"content-type": "json"}
        )
        state_total_data = json.loads(county_covid_response.text)
        total_vaccine_mapping = {
            "Total_Delivered": "vaccine_total_delivered_vaccine_doses",
            "Total_Administered": "vaccine_IL_total_administered_vaccine_doses",
            "Persons_Fully_Vaccinated": "vaccine_IL_total_persons_fully_vaccinated",
            "LTC_Allocated": "vaccine_long_term_care_allocated",
            "LTC_Administered": "vaccine_long_term_care_administered",
            "Report_Date": "date",
        }
        for (source_key, prop) in total_vaccine_mapping.items():
            if prop != "date":
                self.summary_clinicals[state_summary_clinical_submitter_id][
                    prop
                ] = state_total_data.get(source_key)
            else:
                self.summary_clinicals[state_summary_clinical_submitter_id][
                    prop
                ] = remove_time_from_date_time(state_total_data.get(source_key))

    def submit_metadata(self):
        """
        Batch submits the stored records, one batch per node type: locations
        first, then clinicals, then demographic groups.
        """
        print("Submitting data...")

        # (name used in the progress message, submitted node type, records)
        batches = (
            ("summary_location", "summary_location", self.summary_locations),
            ("summary_clinical", "summary_clinical", self.summary_clinicals),
            (
                "summary_group_demographic",
                "summary_group_demographics",
                self.summary_group_demographic,
            ),
        )
        for label, node_type, records in batches:
            print(f"Submitting {label} data")
            for record in records.values():
                self.metadata_helper.add_record_to_submit(
                    {"type": node_type, **record}
                )
            self.metadata_helper.batch_submit_records()
class CHESTXRAY8(base.BaseETL):
    """
    ETL for the ChestX-ray8 images: uploads the image files that are not
    known to the file helper yet and submits one `imaging_file` record per
    image, all attached to a single `core_metadata_collection` record.
    """

    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "ChestX-ray8"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.file_helper = FileHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.cmc_submitter_id = format_submitter_id("cmc_chestxray8", {})
        self.core_metadata_collection = [
            {
                "submitter_id": self.cmc_submitter_id,
                "projects": [{"code": self.project_code}],
            }
        ]
        self.imaging_file = []

    @staticmethod
    def _local_md5_and_size(filepath):
        """Return the hexadecimal MD5 digest and the size in bytes of a local file."""
        # Function-scope import so the block does not depend on the
        # file-level imports
        import hashlib

        digest = hashlib.md5()
        with open(filepath, "rb") as f:
            for chunk in iter(lambda: f.read(1024 * 1024), b""):
                digest.update(chunk)
        return digest.hexdigest(), filepath.stat().st_size

    def files_to_submissions(self):
        """
        Uploads the images that `find_by_name` does not find, and stores one
        `imaging_file` record per image in `self.imaging_file`.
        """
        dataset_dir = Path(CHESTXRAY8_DATA_PATH).joinpath(
            "COVID-19", "X-Ray Image DataSet"
        )
        for image_type in ("No_findings", "Pneumonia"):
            for image_filepath in dataset_dir.joinpath(image_type).iterdir():
                did, rev, md5, size = self.file_helper.find_by_name(
                    image_filepath.name
                )
                if not did:
                    guid = self.file_helper.upload_file(image_filepath)
                    print(f"file {image_filepath.name} uploaded with guid: {guid}")
                    # The lookup found nothing, so describe the file that was
                    # just uploaded instead of submitting empty values.
                    # NOTE(review): confirm the metadata service accepts a
                    # guid whose upload has just completed.
                    did = guid
                    md5, size = self._local_md5_and_size(image_filepath)
                else:
                    print(f"file {image_filepath.name} exists in indexd... skipping...")

                imaging_file_submitter_id = format_submitter_id(
                    "imaging_file_chestxray8", {"filename": image_filepath.name}
                )
                uploaded_imaging_file = {
                    "submitter_id": imaging_file_submitter_id,
                    "core_metadata_collections": [
                        {"submitter_id": self.cmc_submitter_id}
                    ],
                    "data_type": "PNG",
                    "data_format": "Image File",
                    "data_category": "X-Ray Image",
                    "file_name": image_filepath.name,
                    "file_size": size,
                    "md5sum": md5,
                    "object_id": did,
                    # The name of the folder the image was found in
                    "clinical_notes": image_type,
                }

                self.imaging_file.append(uploaded_imaging_file)

    def submit_metadata(self):
        """
        Batch submits the `core_metadata_collection` record, then the
        `imaging_file` records that link to it.
        """
        print("Submitting data...")

        print("Submitting core_metadata_collection data")
        for cmc in self.core_metadata_collection:
            self.metadata_helper.add_record_to_submit(
                {"type": "core_metadata_collection", **cmc}
            )
        self.metadata_helper.batch_submit_records()

        print("Submitting imaging_file data")
        for ifile in self.imaging_file:
            self.metadata_helper.add_record_to_submit(
                {"type": "imaging_file", **ifile}
            )
        self.metadata_helper.batch_submit_records()
class DSFSI(base.BaseETL):
    """
    ETL for the DSFSI per-country line lists (CSV files): each row becomes
    one `subject`, one `demographic` and one `observation` record.
    """

    # URL of a country's line list, to be formatted with the country's slug
    _LINE_LIST_URL = "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-{}.csv"

    # Country name -> slug used in the file name, in processing order
    _LINE_LIST_SLUGS = {
        "Algeria": "algeria",
        "Angola": "angola",
        "Benin": "benin",
        "Burkina Faso": "burkina-faso",
        "Cabo Verde": "cabo-verde",
        "Cameroon": "cameroon",
        "Central African Republic": "central-african-republic",
        "Chad": "chad",
        "Côte d'Ivoire": "cote-divoire",
        "Democratic Republic of the Congo": "democratic-republic-of-the-congo",
        "Djibouti": "djibouti",
        # here should be an Egypt dataset, but it's not useful and omitted on purpose
        "Equatorial Guinea": "equatorial-guinea",
        "Eritrea": "eritrea",
        "Eswatini": "eswatini",
        "Ethiopia": "ethiopia",
        "Gabon": "gabon",
        "Gambia": "gambia",
        "Ghana": "ghana",
        "Guinea Bissau": "guinea-bissau",
        "Guinea": "guinea",
        "Kenya": "kenya",
        "Liberia": "liberia",
        "Madagascar": "madagascar",
        "Mali": "mali",
        "Mauritania": "mauritania",
        "Mauritius": "mauritius",
        "Mozambique": "mozambique",
        "Namibia": "namibia",
        "Niger": "niger",
        "Nigeria": "nigeria",
        "Republic of Congo": "republic-of-congo",
        "Rwanda": "rwanda",
        "Senegal": "senegal",
        "Seychelles": "seychelles",
        "Somalia": "somalia",
        "South Africa": "south-africa",
        "Sudan": "sudan",
        "Tanzania": "tanzania",
        "Togo": "togo",
        "Uganda": "uganda",
        "Zambia": "zambia",
        "Zimbabwe": "zimbabwe",
    }

    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.subjects = []
        self.demographics = []
        self.observations = []

        self.program_name = "open"
        self.project_code = "DSFSI"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        # structure is
        # (csv field name, (node type, node field name, type of field))
        # Entries mapped to (None, None, None) are columns that are expected
        # in the file but not submitted.
        self.countries_fields = [
            ("case_id", ("subject", "submitter_id", str)),
            ("origin_case_id", (None, None, None)),
            ("date", ("observation", "reporting_date", normalize_date)),
            ("age", ("demographic", "age", normalize_age)),
            ("gender", ("demographic", "gender", normalize_gender)),
            ("city", ("demographic", "city", str)),
            ("province/state", ("demographic", "province_state", str)),
            ("country", ("demographic", "country_region", str)),
            (
                "current_status",
                ("subject", "tmp_current_status", normalize_current_status),
            ),
            (
                "source",
                ("observation", "reporting_source_url", str),
            ),  # type of fields "None" is used to remove the value
            ("symptoms", ("observation", "symptoms", normalize_symptoms)),
            (
                "date_onset_symptoms",
                ("observation", "date_onset_symptoms", normalize_date),
            ),
            (
                "date_admission_hospital",
                ("observation", "date_admission_hospital", normalize_date),
            ),
            ("date_confirmation", ("subject", "date_confirmation", normalize_date)),
            ("underlying_conditions", (None, None, None)),
            ("travel_history_dates", ("subject", "travel_history_dates", str)),
            ("travel_history_location", ("subject", "travel_history_location", str)),
            ("death_date", ("subject", "deceased_date", normalize_date)),
            ("notes_for_discussion", (None, None, None)),
        ]

    def files_to_submissions(self):
        """
        Reads CSV files and converts the data to Sheepdog records
        """
        for country, slug in self._LINE_LIST_SLUGS.items():
            self.parse_file(country, self._LINE_LIST_URL.format(slug))

    def parse_file(self, country, url):
        """
        Downloads the line list of a country and stores one subject, one
        demographic and one observation record per row in `self.subjects`,
        `self.demographics` and `self.observations`.

        Args:
            country (str): name of the country, selects the expected columns
            url (str): URL at which the CSV file is available

        Raises:
            AssertionError: the file could not be fetched, or its headers are
                not the ones expected for this country
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')

            headers = next(reader)

            assert (
                headers[0] != "404: Not Found"
            ), "  Unable to get file contents, received {}.".format(headers)

            countries_with_empty_columns = [
                "Angola",
                "Burkina Faso",
                "Cabo Verde",
                "Cameroon",
                "Central African Republic",
                "Chad",
                "Côte d'Ivoire",
                "Democratic Republic of the Congo",
                "Djibouti",
                "Equatorial Guinea",
                "Eritrea",
                "Eswatini",
                "Gabon",
                "Guinea Bissau",
                "Guinea",
                "Liberia",
                "Madagascar",
                "Mali",
                "Mauritania",
                "Mauritius",
                "Mozambique",
                "Republic of Congo",
                "Senegal",
                "Seychelles",
                "Somalia",
                "Sudan",
                "Tanzania",
                "Togo",
                "Uganda",
                "Zambia",
            ]

            countries_with_mistyped_column = ["South Africa"]

            countries_without_notes = [
                "Eritrea",
                "Eswatini",
                "Gabon",
                "Madagascar",
                "Mali",
                "Mauritania",
                "Mauritius",
                "Mozambique",
                "Republic of Congo",
                "Senegal",
                "Seychelles",
                "Somalia",
                "Sudan",
                "Tanzania",
                "Togo",
                "Uganda",
                "Zambia",
            ]

            # Ok, this is ugly... But, almost all the countries have some ugliness in the CSV format...
            # And this code deals with it
            # The adjustments below are position-based: their order matters.
            tmp = copy.deepcopy(self.countries_fields)
            if country in countries_with_empty_columns:
                tmp.insert(0, ("", (None, None, None)))

            if country in countries_with_mistyped_column:
                tmp[14] = ("underlyng_conditions", (None, None, None))

            if country in countries_without_notes:
                del tmp[-1]

            if country == "Ethiopia":
                tmp.insert(8, ("original_status", (None, None, None)))
                del tmp[10]
                tmp.insert(14, ("closed_date", (None, None, None)))
                tmp.insert(16, ("quarantine_status", (None, None, None)))
                del tmp[19]
                tmp.insert(19, ("contact", (None, None, None)))
                tmp.append(("source", (None, None, None)))

            if country == "Niger":
                del tmp[9]
                tmp.insert(9, ("source 1", (None, None, None)))
                tmp.insert(10, ("source 2", (None, None, None)))

            # csv field name -> (column index, (node type, node field, type))
            updated_headers_mapping = {
                field: (k, mapping) for k, (field, mapping) in enumerate(tmp)
            }
            expected_h = list(updated_headers_mapping.keys())
            obtained_h = headers[: len(expected_h)]
            obtained_h = [header.strip() for header in obtained_h]

            assert (
                obtained_h == expected_h
            ), "CSV headers have changed\nexpected: {}\n     got: {})".format(
                expected_h, obtained_h
            )

            # South Africa dataset has only 274 nice cases
            # Everything after has the same data and don't have any meaningful information
            last = 275 if country == "South Africa" else None

            for idx, row in enumerate(reader, start=1):
                if last and idx == last:
                    break

                subject, demographic, observation = self.parse_row(
                    country, row, updated_headers_mapping
                )

                self.subjects.append(subject)
                self.demographics.append(demographic)
                self.observations.append(observation)

    def parse_row(self, country, row, mapping):
        """
        Converts one CSV row to the records to submit.

        Args:
            country (str): name of the country the row belongs to
            row (list(str)): the values of the row
            mapping (dict): csv field name ->
                (column index, (node type, node field name, type of field))

        Returns:
            (dict, dict, dict): the subject, demographic and observation
            records

        Raises:
            Exception: the row has a current status that is not handled
        """
        subject = {}
        demographic = {}
        observation = {}
        nodes = {
            "subject": subject,
            "demographic": demographic,
            "observation": observation,
        }

        for (i, (node_type, node_field, type_conv)) in mapping.values():
            if not node_field:
                continue
            value = row[i]
            if not value:
                continue
            # Fields mapped to the observation node used to be dropped here
            # because only subject and demographic were handled.
            # NOTE(review): confirm the data dictionary accepts them.
            node = nodes.get(node_type)
            if node is None:
                continue
            node[node_field] = None if type_conv is None else type_conv(value)

        # init subject node
        case_id = subject["submitter_id"]
        subject["submitter_id"] = format_subject_submitter_id(
            country, subject["submitter_id"]
        )

        subject["projects"] = [{"code": self.project_code}]

        # Only South Africa dataset has a record with the same case_id...
        # Because this code deals only with individual rows, it's hard coded right now
        if country == "South Africa" and case_id == "110":
            if demographic["age"] == 34:
                subject["submitter_id"] += "_1"
            elif demographic["age"] == 27:
                subject["submitter_id"] += "_2"

        # init demographic node
        demographic["submitter_id"] = format_node_submitter_id(
            subject["submitter_id"], "demographic"
        )
        demographic["subjects"] = [{"submitter_id": subject["submitter_id"]}]

        # init observation node
        observation["submitter_id"] = format_node_submitter_id(
            subject["submitter_id"], "observation"
        )
        observation["subjects"] = [{"submitter_id": subject["submitter_id"]}]

        if subject.get("date_confirmation"):
            subject["covid_19_status"] = "Positive"

        # The current status is a temporary property: it is removed from the
        # subject and stored in the property that matches its value
        state = subject.pop("tmp_current_status", None)
        if state == "deceased":
            subject["vital_status"] = "Dead"
        elif state in ["alive"]:
            subject["vital_status"] = state.capitalize()
        elif state in ["positive"]:
            subject["covid_19_status"] = state.capitalize()
        elif state == "isolated":
            observation["isolation_status"] = state.capitalize()
        elif state in ["released", "recovered", "in recovery", "in treatment"]:
            observation["treatment_status"] = state.capitalize()
        elif state in ["stable", "unstable", "critical"]:
            observation["condition"] = state.capitalize()
        elif state:
            raise Exception('State "{}" is unknown'.format(state))

        # Keep the travel history only if it normalizes to a non-empty list
        if "travel_history_dates" in subject:
            date_list = normalize_date_list(subject["travel_history_dates"])
            if date_list:
                subject["travel_history_dates"] = date_list
            else:
                del subject["travel_history_dates"]

        if "travel_history_location" in subject:
            loc_list = normalize_location_list(subject["travel_history_location"])
            if loc_list:
                subject["travel_history_location"] = loc_list
            else:
                del subject["travel_history_location"]

        return subject, demographic, observation

    def submit_metadata(self):
        """
        Batch submits the stored records, one batch per node type: subjects
        first, then demographics, then observations.
        """
        print("Submitting subject data")
        for loc in self.subjects:
            self.metadata_helper.add_record_to_submit({"type": "subject", **loc})
        self.metadata_helper.batch_submit_records()

        print("Submitting demographic data")
        for dem in self.demographics:
            self.metadata_helper.add_record_to_submit({"type": "demographic", **dem})
        self.metadata_helper.batch_submit_records()

        print("Submitting observation data")
        for obs in self.observations:
            self.metadata_helper.add_record_to_submit({"type": "observation", **obs})
        self.metadata_helper.batch_submit_records()
class COM_MOBILITY(base.BaseETL):
    """
    ETL for the Global Mobility Report CSV: the US rows that are newer than
    the last submission become `summary_location` and
    `summary_socio_demographic` records.
    """

    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "Com-Mobility"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.expected_file_headers = [
            "country_region_code",
            "country_region",
            "sub_region_1",
            "sub_region_2",
            "metro_area",
            "iso_3166_2_code",
            "census_fips_code",
            "date",
            "retail_and_recreation_percent_change_from_baseline",
            "grocery_and_pharmacy_percent_change_from_baseline",
            "parks_percent_change_from_baseline",
            "transit_stations_percent_change_from_baseline",
            "workplaces_percent_change_from_baseline",
            "residential_percent_change_from_baseline",
        ]

        self.summary_locations = []
        self.summary_socio_demographics = []

        # Set by `parse_file`; defined here so that `submit_metadata` can
        # always read it
        self.last_submission_date_time = None

    def files_to_submissions(self):
        """
        Reads CSV files and converts the data to Sheepdog records
        """
        url = "https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv"
        self.parse_file(url)

    def parse_file(self, url):
        """
        Converts a CSV file to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations` and
        `self.summary_socio_demographics`.

        Only the rows of the US are converted, and only if their date is
        later than the last submission (all of them if there is none).
        `self.last_submission_date_time` is then moved to the latest date
        that was converted.

        Args:
            url (str): URL at which the CSV file is available

        Raises:
            AssertionError: the file could not be fetched, or some of the
                expected headers are missing
        """
        location_fields = (
            "country_region_code",
            "country_region",
            "sub_region_1",
            "sub_region_2",
            "metro_area",
            "iso_3166_2_code",
            "census_fips_code",
        )
        socio_demographic_fields = (
            "retail_and_recreation_percent_change_from_baseline",
            "grocery_and_pharmacy_percent_change_from_baseline",
            "parks_percent_change_from_baseline",
            "transit_stations_percent_change_from_baseline",
            "workplaces_percent_change_from_baseline",
            "residential_percent_change_from_baseline",
            "date",
        )

        self.last_submission_date_time = self.metadata_helper.get_last_submission()
        the_lattest_data_datetime = None

        print("Getting data from {}".format(url))

        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')

            headers = next(reader)

            assert (
                headers[0] != "404: Not Found"
            ), "Unable to get file contents, received {}.".format(headers)

            assert set(self.expected_file_headers).issubset(
                set(headers)
            ), "CSV headers have changed (expected {} is a subset of {}). We may need to update the ETL code".format(
                self.expected_file_headers, headers
            )

            for row in reader:
                # ignore any empty row
                if not row:
                    continue

                row_dict = dict(zip(headers, row))
                if row_dict["country_region_code"] != "US":
                    continue

                # Parsed once per row instead of for every comparison
                row_date = parse(row_dict["date"])
                if (
                    self.last_submission_date_time
                    and row_date <= self.last_submission_date_time
                ):
                    # Not newer than the last submission
                    continue

                if (
                    the_lattest_data_datetime is None
                    or the_lattest_data_datetime < row_date
                ):
                    the_lattest_data_datetime = row_date

                # Both submitter_ids are derived from the same row values
                id_parts = (
                    row_dict["country_region_code"],
                    row_dict["sub_region_1"],
                    row_dict["sub_region_2"],
                    row_dict["metro_area"],
                    row_dict["date"],
                )
                summary_location_submitter_id = format_submitter_id(
                    "summary_location", *id_parts
                )
                summary_socio_demographic_submitter_id = format_submitter_id(
                    "summary_socio_demographic", *id_parts
                )

                summary_location = {
                    "submitter_id": summary_location_submitter_id,
                    "projects": [{"code": self.project_code}],
                }
                summary_socio_demographic = {
                    "submitter_id": summary_socio_demographic_submitter_id,
                    "summary_locations": [
                        {"submitter_id": summary_location_submitter_id}
                    ],
                }

                # SPECIAL_MAP_FIELDS gives, for a CSV field, the name of the
                # record property and the function that converts the value
                for field in location_fields:
                    gen3_field, func = SPECIAL_MAP_FIELDS[field]
                    summary_location[gen3_field] = func(row_dict[field])

                for field in socio_demographic_fields:
                    gen3_field, func = SPECIAL_MAP_FIELDS[field]
                    summary_socio_demographic[gen3_field] = func(row_dict[field])

                self.summary_locations.append(summary_location)
                self.summary_socio_demographics.append(summary_socio_demographic)

        if the_lattest_data_datetime:
            self.last_submission_date_time = the_lattest_data_datetime

    def submit_metadata(self):
        """
        Batch submits all records in `self.summary_locations` and
        `self.summary_socio_demographics`, then records the date of the
        latest submitted data as the last submission.
        """
        print("Submitting summary_location data")
        for loc in self.summary_locations:
            self.metadata_helper.add_record_to_submit(
                {"type": "summary_location", **loc}
            )
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_socio_demographic data")
        for sc in self.summary_socio_demographics:
            self.metadata_helper.add_record_to_submit(
                {"type": "summary_socio_demographic", **sc}
            )
        self.metadata_helper.batch_submit_records()

        # There is no date to record when nothing was ever submitted and the
        # file had no matching row
        if self.last_submission_date_time is not None:
            self.metadata_helper.update_last_submission(
                self.last_submission_date_time.strftime("%Y-%m-%d")
            )
class CCMAP(base.BaseETL):
    """ETL for the CovidCareMap US healthcare-capacity CSV files.

    Downloads the county-level and state-level CSV files from the
    `covidcaremap/covid19-healthsystemcapacity` GitHub repository and converts
    each row into `summary_location`, `summary_clinical` and
    `summary_socio_demographic` records, which are then batch-submitted
    through `MetadataHelper`.
    """

    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_locations = []
        self.summary_clinicals = []
        self.summary_socio_demographics = []

        self.program_name = "open"
        self.project_code = "CCMap"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        # structure is
        # (csv field name, (node type, node field name, type of field))
        county_fields = [
            ("fips_code", ("summary_location", "FIPS", int)),
            ("State", ("summary_location", "province_state", str)),
            ("County Name", ("summary_location", "county", str)),
            ("Staffed All Beds", ("summary_clinical", "staffed_all_beds", int)),
            ("Staffed ICU Beds", ("summary_clinical", "staffed_icu_beds", int)),
            ("Licensed All Beds", ("summary_clinical", "licensed_all_beds", int)),
            (
                "All Bed Occupancy Rate",
                ("summary_clinical", "all_bed_occupancy_rate", float),
            ),
            (
                "ICU Bed Occupancy Rate",
                ("summary_clinical", "icu_bed_occupancy_rate", float),
            ),
            ("Population", ("summary_clinical", "population", int)),
            ("Population (20+)", ("summary_clinical", "population_gtr_20", int)),
            ("Population (65+)", ("summary_clinical", "population_gtr_65", int)),
            (
                "Staffed All Beds [Per 1000 People]",
                ("summary_clinical", "staffed_all_beds_per_1000", float),
            ),
            (
                "Staffed All Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "staffed_all_beds_per_1000_gtr_20", float),
            ),
            (
                "Staffed All Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "staffed_all_beds_per_1000_gtr_65", float),
            ),
            (
                "Staffed ICU Beds [Per 1000 People]",
                ("summary_clinical", "staffed_icu_beds_per_1000", float),
            ),
            (
                "Staffed ICU Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "staffed_icu_beds_per_1000_gtr_20", float),
            ),
            (
                "Staffed ICU Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "staffed_icu_beds_per_1000_gtr_65", float),
            ),
            (
                "Licensed All Beds [Per 1000 People]",
                ("summary_clinical", "licensed_all_beds_per_1000", float),
            ),
            (
                "Licensed All Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "licensed_all_beds_per_1000_gtr_20", float),
            ),
            (
                "Licensed All Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "licensed_all_beds_per_1000_gtr_65", float),
            ),
        ]

        state_fields = [
            # node field name is None: the column must exist but is not stored
            ("State", ("summary_location", None, int)),
            ("State Name", ("summary_location", "province_state", str)),
            ("Staffed All Beds", ("summary_clinical", "staffed_all_beds", int)),
            ("Staffed ICU Beds", ("summary_clinical", "staffed_icu_beds", int)),
            ("Licensed All Beds", ("summary_clinical", "licensed_all_beds", int)),
            (
                "All Bed Occupancy Rate",
                ("summary_clinical", "all_bed_occupancy_rate", float),
            ),
            (
                "ICU Bed Occupancy Rate",
                ("summary_clinical", "icu_bed_occupancy_rate", float),
            ),
            ("Population", ("summary_clinical", "population", int)),
            (
                "Population (20+)",
                ("summary_socio_demographic", "population_gtr_20", int),
            ),
            (
                "Population (65+)",
                ("summary_socio_demographic", "population_gtr_65", int),
            ),
            (
                "Staffed All Beds [Per 1000 People]",
                ("summary_clinical", "staffed_all_beds_per_1000", float),
            ),
            (
                "Staffed All Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "staffed_all_beds_per_1000_gtr_20", float),
            ),
            (
                "Staffed All Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "staffed_all_beds_per_1000_gtr_65", float),
            ),
            (
                "Staffed ICU Beds [Per 1000 People]",
                ("summary_clinical", "staffed_icu_beds_per_1000", float),
            ),
            (
                "Staffed ICU Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "staffed_icu_beds_per_1000_gtr_20", float),
            ),
            (
                "Staffed ICU Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "staffed_icu_beds_per_1000_gtr_65", float),
            ),
            (
                "Licensed All Beds [Per 1000 People]",
                ("summary_clinical", "licensed_all_beds_per_1000", float),
            ),
            (
                "Licensed All Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "licensed_all_beds_per_1000_gtr_20", float),
            ),
            (
                "Licensed All Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "licensed_all_beds_per_1000_gtr_65", float),
            ),
            (
                "Estimated No. Full-Featured Mechanical Ventilators (2010 study estimate)",
                ("summary_clinical", "estimated_full_mech_ventilators", int),
            ),
            (
                "Estimated No. Full-Featured Mechanical Ventilators per 100,000 Population (2010 study estimate)",
                (
                    "summary_clinical",
                    "estimated_full_mech_ventilators_per_100000",
                    float,
                ),
            ),
            (
                "Estimated No. Pediatrics-Capable Full-Feature Mechanical Ventilators (2010 study estimate)",
                ("summary_clinical", "estimated_full_mech_pediatric_ventilators", int),
            ),
            (
                "Estimated No. Full-Feature Mechanical Ventilators, Pediatrics Capable per 100,000 Population <14 y (2010 study estimate)",
                (
                    "summary_clinical",
                    "estimated_full_mech_pediatric_ventilators_per_100000",
                    float,
                ),
            ),
        ]

        # csv type -> {csv field name: (node type, node field name, type)}
        self.headers_mapping = {
            "county": dict(county_fields),
            "state": dict(state_fields),
        }

    def files_to_submissions(self):
        """
        Reads CSV files and converts the data to Sheepdog records
        """
        repo = "covidcaremap/covid19-healthsystemcapacity"
        branch = "master"
        files = {
            "county": "data/published/us_healthcare_capacity-county-CovidCareMap.csv",
            "state": "data/published/us_healthcare_capacity-state-CovidCareMap.csv",
        }

        for k, url in files.items():
            self.parse_file(repo, branch, url, csv_type=k)

    def get_last_update_date_file(self, repo, url):
        """
        Gets latest update time for specific file in the repository

        :param repo: "user/repository" for Github repository
        :param url: path to file
        :return: last update (commit) datetime for the file
        """
        api_url = "https://api.github.com/repos"
        commit_info_url = "{}/{}/commits?path={}&page=1&per_page=1".format(
            api_url, repo, url
        )

        with closing(requests.get(commit_info_url, stream=True)) as r:
            commit_info = r.json()
            # the API is asked for a single commit, so the first entry is
            # the most recent commit touching the file
            last_update_date = commit_info[0]["commit"]["committer"]["date"]

        return datetime.datetime.strptime(last_update_date, "%Y-%m-%dT%H:%M:%SZ")

    def parse_file(self, repo, branch, file_url, csv_type):
        """
        Downloads one CSV file and appends one record per row to
        `self.summary_locations`, `self.summary_clinicals` and
        `self.summary_socio_demographics`.

        :param repo: "user/repository" for Github repository
        :param branch: branch the file is read from
        :param file_url: path to the file inside the repository
        :param csv_type: key of `self.headers_mapping` ("county" or "state")
        :raises AssertionError: if the file is not found or an expected
            CSV header is missing
        """
        last_update_date = self.get_last_update_date_file(repo, file_url)

        raw_url = "https://raw.githubusercontent.com"
        url = "{}/{}/{}/{}".format(raw_url, repo, branch, file_url)
        print("Getting data from {}".format(url))

        with closing(requests.get(url, stream=True)) as r:
            lines = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(lines, delimiter=",", quotechar='"')

            headers = next(reader)

            assert (
                headers[0] != "404: Not Found"
            ), " Unable to get file contents, received {}.".format(headers)

            field_specs = self.headers_mapping[csv_type]
            expected_h = list(field_specs.keys())
            assert set(expected_h).issubset(
                set(headers)
            ), "CSV headers have changed (expected {}, got {}). We may need to update the ETL code".format(
                expected_h, headers
            )

            # Attach the column index to every expected field in a local
            # mapping. `self.headers_mapping` is left untouched so that
            # parsing the same csv type again (or a CSV with a repeated
            # header) does not nest the (index, spec) tuples.
            column_of = {name: index for index, name in enumerate(headers)}
            indexed_mapping = {
                field: (column_of[field], spec)
                for field, spec in field_specs.items()
            }

            for row in reader:
                (
                    summary_location,
                    summary_clinical,
                    summary_socio_demographic,
                ) = self.parse_row(row, indexed_mapping, last_update_date)

                self.summary_locations.append(summary_location)
                self.summary_clinicals.append(summary_clinical)
                self.summary_socio_demographics.append(summary_socio_demographic)

    def parse_row(self, row, mapping, last_update_date):
        """
        Converts one CSV row into the three records submitted for it.

        :param row: list of cell values for the row
        :param mapping: {csv field name: (column index,
            (node type, node field name, type of field))}
        :param last_update_date: datetime used to build the dated
            submitter ids
        :return: (summary_location, summary_clinical,
            summary_socio_demographic) tuple of dicts
        """
        summary_location = {"country_region": "US"}
        summary_clinical = {}
        summary_socio_demographic = {}

        for i, (node_type, node_field, type_conv) in mapping.values():
            try:
                if not node_field:
                    continue
                value = row[i]
                if not value:
                    continue

                if node_type == "summary_location":
                    summary_location[node_field] = type_conv(value)
                if node_type == "summary_clinical":
                    if type_conv == int:
                        # go through float first: int("12.0") would raise
                        summary_clinical[node_field] = type_conv(float(value))
                    else:
                        summary_clinical[node_field] = type_conv(value)
                if node_type == "summary_socio_demographic":
                    if type_conv == int:
                        summary_socio_demographic[node_field] = type_conv(
                            float(value)
                        )
                    else:
                        summary_socio_demographic[node_field] = type_conv(value)
                    summary_clinical[
                        node_field
                    ] = None  # TODO: remove when the properties are removed from dictionary
            except Exception:
                # Best-effort: report the bad cell and keep going. The row may
                # be shorter than the header, so do not index it blindly here.
                problematic_value = row[i] if i < len(row) else None
                print(
                    "Error with field: {}, problematic value: {}".format(
                        node_field, problematic_value
                    )
                )

        # the submitter id is derived before the state code is expanded below
        summary_location_submitter_id = format_location_submitter_id(summary_location)

        summary_location["submitter_id"] = summary_location_submitter_id
        summary_location["projects"] = [{"code": self.project_code}]

        state = summary_location["province_state"]
        if len(state) == 2:
            summary_location["province_state"] = state_to_long(state)

        date_str = last_update_date.strftime("%Y-%m-%d")

        summary_clinical["submitter_id"] = format_summary_clinical_submitter_id(
            summary_location_submitter_id, date=date_str
        )
        summary_clinical["summary_locations"] = [
            {"submitter_id": summary_location_submitter_id}
        ]

        summary_socio_demographic[
            "submitter_id"
        ] = format_summary_socio_demographic_id(
            summary_location_submitter_id, date=date_str
        )
        summary_socio_demographic["summary_locations"] = [
            {"submitter_id": summary_location_submitter_id}
        ]

        return summary_location, summary_clinical, summary_socio_demographic

    def _submit_records_of_type(self, message, node_type, records):
        """Prints `message`, then queues and batch-submits `records` as `node_type`."""
        print(message)
        for rec in records:
            typed_record = {"type": node_type}
            typed_record.update(rec)
            self.metadata_helper.add_record_to_submit(typed_record)
        self.metadata_helper.batch_submit_records()

    def submit_metadata(self):
        """
        Batch submits the parsed records, one node type at a time:
        locations first, then the records that link to them.
        """
        self._submit_records_of_type(
            "Submitting summary_location data",
            "summary_location",
            self.summary_locations,
        )
        self._submit_records_of_type(
            "Submitting summary_clinical data",
            "summary_clinical",
            self.summary_clinicals,
        )
        self._submit_records_of_type(
            "Submitting summary_socio_demographic data",
            "summary_socio_demographic",
            self.summary_socio_demographics,
        )
class DS4C(base.BaseETL):
    """ETL for the DS4C patient CSV bundled under `data/ds4c_PatientInfo.csv`.

    Each CSV row produces one `subject`, one `demographic` and one
    `observation` record, which are batch-submitted through `MetadataHelper`.
    """

    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "DS4C"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.subjects = []
        self.demographics = []
        self.observations = []

    def files_to_submissions(self):
        """
        Reads the patient CSV file and fills `self.subjects`,
        `self.demographics` and `self.observations`.

        Raises:
            Exception: if a row has a non-empty `state` other than
                "deceased", "isolated" or "released"
        """
        with open(
            os.path.join(CURRENT_DIR, "data/ds4c_PatientInfo.csv"), newline=""
        ) as csvfile:
            reader = csv.reader(csvfile, delimiter=",")
            header = next(reader)
            print("Headers:", header)
            # column name -> column index
            header = {k: v for v, k in enumerate(header)}
            n_1200012238 = 1

            for row in reader:
                patient_id = row[header["patient_id"]].strip()
                if patient_id == "1200012238":
                    # there are 2 rows for the same ID
                    patient_id = f"{patient_id}_{n_1200012238}"
                    n_1200012238 += 1

                # generate subject record
                subject = {
                    "submitter_id": patient_id,
                    "projects": [{"code": self.project_code}],
                }

                confirmed_date = row[header["confirmed_date"]].strip()
                if confirmed_date:
                    check_date_format(confirmed_date)
                    subject["date_confirmation"] = confirmed_date
                    subject["covid_19_status"] = "Positive"

                infected_by = row[header["infected_by"]].strip()
                if infected_by:
                    subject["infected_by"] = [
                        v.strip() for v in infected_by.split(",")
                    ]

                deceased_date = row[header["deceased_date"]].strip()
                if deceased_date:
                    check_date_format(deceased_date)
                    subject["deceased_date"] = deceased_date

                # generate demographic record
                demographic = {
                    "submitter_id": f"demographic_{patient_id}",
                    "subjects": {"submitter_id": patient_id},
                    "age_decade": row[header["age"]].strip(),
                    "province_state": row[header["province"]].strip(),
                    "city": row[header["city"]].strip(),
                }

                country = row[header["country"]].strip()
                if country == "Korea":
                    demographic["country_region"] = "South Korea"
                elif country == "United States":
                    demographic["country_region"] = "USA"
                else:
                    demographic["country_region"] = country

                gender = row[header["sex"]].strip()
                demographic["gender"] = harmonize_gender(gender)

                # NOTE(review): this None is removed again by the falsy-value
                # filter applied to `demographic` below, so it never reaches
                # the submitted record - confirm whether that is intended.
                demographic["year_of_birth"] = None

                # generate observation record
                observation = {
                    "submitter_id": f"observation_{patient_id}",
                    "subjects": {"submitter_id": patient_id},
                    "exposure": row[header["infection_case"]].strip(),
                }

                date_onset_symptoms = row[header["symptom_onset_date"]].strip()
                if date_onset_symptoms:
                    # validate the stripped value, i.e. the one that is stored
                    # (same as the other date fields)
                    check_date_format(date_onset_symptoms)
                    observation["date_onset_symptoms"] = date_onset_symptoms

                state = row[header["state"]].strip()
                if state == "deceased":
                    subject["vital_status"] = "Dead"
                elif state == "isolated":
                    observation["isolation_status"] = "Isolated"
                elif state == "released":
                    observation["treatment_status"] = "Released"
                elif state:
                    raise Exception('State "{}" is unknown'.format(state))

                released_date = row[header["released_date"]].strip()
                if released_date:
                    check_date_format(released_date)
                    observation["released_date"] = released_date

                # subjects keep every key (falsy values become None);
                # demographics and observations drop falsy values entirely
                subject = {k: v if v else None for k, v in subject.items()}
                self.subjects.append(subject)

                demographic = {k: v for k, v in demographic.items() if v}
                self.demographics.append(demographic)

                observation = {k: v for k, v in observation.items() if v}
                self.observations.append(observation)

    def _submit_records_of_type(self, message, node_type, records):
        """Prints `message`, then queues and batch-submits `records` as `node_type`."""
        print(message)
        for rec in records:
            typed_record = {"type": node_type}
            typed_record.update(rec)
            self.metadata_helper.add_record_to_submit(typed_record)
        self.metadata_helper.batch_submit_records()

    def submit_metadata(self):
        """
        Batch submits the parsed records, one node type at a time:
        subjects first, then the records that link to them.
        """
        print("Submitting data")
        self._submit_records_of_type(
            "Submitting subject data", "subject", self.subjects
        )
        self._submit_records_of_type(
            "Submitting demographic data", "demographic", self.demographics
        )
        self._submit_records_of_type(
            "Submitting observation data", "observation", self.observations
        )
class DSCI(base.BaseETL):
    """ETL for the DSCI patient CSV bundled under `data/dsci_patient.csv`.

    Every CSV row yields a `subject`, a `demographic` and an `observation`
    record; the three lists are later batch-submitted via `MetadataHelper`.
    """

    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "DSCI"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.subjects = []
        self.demographics = []
        self.observations = []

    def files_to_submissions(self):
        """
        Parses the patient CSV and accumulates the records to submit.

        Raises:
            Exception: on a non-empty nationality other than "indonesia" /
                "foreigner", or a non-empty state other than "deceased" /
                "isolated" / "released"
        """
        csv_path = os.path.join(CURRENT_DIR, "data/dsci_patient.csv")
        with open(csv_path, newline="") as csvfile:
            reader = csv.reader(csvfile, delimiter=",", quotechar="|")
            header = next(reader)
            print("Headers:", header)
            # column name -> position in the row
            header = {name: position for position, name in enumerate(header)}

            for row in reader:

                def cell(column):
                    """Stripped value of `column` in the current row."""
                    return row[header[column]].strip()

                patient_id = cell("patient_id")

                # --- subject ---
                subject = {
                    "submitter_id": patient_id,
                    "projects": [{"code": self.project_code}],
                }

                contacts = cell("contacted_with")
                if contacts:
                    subject["infected_by"] = [
                        contact.strip() for contact in contacts.split(",")
                    ]

                confirmed = cell("confirmed_date")
                if confirmed:
                    confirmed = format_date(confirmed)
                    check_date_format(confirmed)
                    subject["date_confirmation"] = confirmed
                    subject["covid_19_status"] = "Positive"

                deceased = cell("deceased_date")
                if deceased:
                    deceased = format_date(deceased)
                    check_date_format(deceased)
                    subject["deceased_date"] = deceased

                # --- demographic ---
                demographic = {
                    "submitter_id": f"demographic_{patient_id}",
                    "subjects": {"submitter_id": patient_id},
                }

                # (csv column, record property); empty cells are left out
                for column, prop in (("age", "age"), ("province", "province_state")):
                    text = cell(column)
                    if text:
                        demographic[prop] = text

                if "age" in demographic:
                    demographic["age"] = int(demographic["age"])

                demographic["gender"] = harmonize_gender(cell("gender"))

                nationality = cell("nationality")
                if nationality == "indonesia":
                    demographic["country_region"] = "Indonesia"
                elif nationality and nationality != "foreigner":
                    # "foreigner" carries no country, anything else is unexpected
                    raise Exception('Nationality "{}" is unknown'.format(nationality))

                # --- observation ---
                observation = {
                    "submitter_id": f"observation_{patient_id}",
                    "subjects": {"submitter_id": patient_id},
                }

                hospital = cell("hospital")
                if hospital:
                    observation["hospital"] = hospital

                state = cell("current_state")
                if state == "deceased":
                    subject["vital_status"] = "Dead"
                elif state == "isolated":
                    observation["isolation_status"] = "Isolated"
                elif state == "released":
                    observation["treatment_status"] = "Released"
                elif state:
                    raise Exception('State "{}" is unknown'.format(state))

                released = cell("released_date")
                if released:
                    released = format_date(released)
                    check_date_format(released)
                    observation["released_date"] = released

                # only store the row once all three records were built
                self.subjects.append(subject)
                self.demographics.append(demographic)
                self.observations.append(observation)

    def _submit_records_of_type(self, message, node_type, records):
        """Prints `message`, then queues and batch-submits `records` as `node_type`."""
        print(message)
        for rec in records:
            typed_record = {"type": node_type}
            typed_record.update(rec)
            self.metadata_helper.add_record_to_submit(typed_record)
        self.metadata_helper.batch_submit_records()

    def submit_metadata(self):
        """
        Submits subjects, then demographics, then observations, each as
        its own batch.
        """
        print("Submitting data")
        self._submit_records_of_type(
            "Submitting subject data", "subject", self.subjects
        )
        self._submit_records_of_type(
            "Submitting demographic data", "demographic", self.demographics
        )
        self._submit_records_of_type(
            "Submitting observation data", "observation", self.observations
        )
class IDPH_FACILITY(base.BaseETL):
    """ETL for the Illinois (IDPH) long-term-care facility JSON feed.

    Builds one state-wide `summary_location` / `summary_clinical` pair (when
    the feed has an "LTC_Reported_Cases" section) plus one pair per entry of
    "FacilityValues". Records are keyed by `submitter_id`.
    """

    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "IDPH-Facility"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.country = "US"
        self.state = "IL"

        # submitter_id -> record
        self.summary_locations = {}
        self.summary_clinicals = {}

    def files_to_submissions(self):
        """
        Fetches the JSON feed and turns it into Sheepdog records, unless
        the latest submitted date is already today's date.
        """
        latest_submitted_date = self.metadata_helper.get_latest_submitted_date_idph()
        today = datetime.date.today()
        if latest_submitted_date == today:
            print("Nothing to submit: today and latest submitted date are the same.")
            return

        today_str = today.strftime("%Y%m%d")
        print(f"Getting data for date: {today_str}")
        self.parse_file(
            latest_submitted_date, "https://dph.illinois.gov/sitefiles/COVIDLTC.json"
        )

    def parse_file(self, latest_submitted_date, url):
        """
        Downloads the JSON document and stores the records to submit in
        `self.summary_locations` and `self.summary_clinicals`.

        Nothing is stored when the feed's update date equals
        `latest_submitted_date`.

        Args:
            latest_submitted_date (date): the date of latest available "summary_clinical" for project
            url (str): URL at which the JSON file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            date = idph_get_date(data["LastUpdateDate"])

            if latest_submitted_date and date == latest_submitted_date.strftime(
                "%Y-%m-%d"
            ):
                print(
                    "Nothing to submit: latest submitted date and date from data are the same."
                )
                return

            # state-wide totals
            if "LTC_Reported_Cases" in data:
                state_location_id = format_submitter_id(
                    "summary_location",
                    {"country": self.country, "state": self.state},
                )
                state_location = {
                    "country_region": self.country,
                    "submitter_id": state_location_id,
                    "projects": [{"code": self.project_code}],
                    "province_state": self.state,
                }

                state_clinical_id = derived_submitter_id(
                    state_location_id,
                    "summary_location",
                    "summary_clinical",
                    {"date": date},
                )
                reported = data["LTC_Reported_Cases"]
                state_clinical = {
                    "confirmed": reported["confirmed_cases"],
                    "deaths": reported["deaths"],
                    "submitter_id": state_clinical_id,
                    "lastUpdateEt": date,
                    "date": date,
                    "summary_locations": [{"submitter_id": state_location_id}],
                }

                self.summary_locations[state_location_id] = state_location
                self.summary_clinicals[state_clinical_id] = state_clinical

            # per-facility values
            for facility in data["FacilityValues"]:
                location, clinical = self.parse_facility(date, facility)
                location_id = location["submitter_id"]
                clinical_id = clinical["submitter_id"]

                self.summary_locations[location_id] = location

                if clinical_id in self.summary_clinicals:
                    # same submitter_id seen before: keep the larger counts
                    previous = self.summary_clinicals[clinical_id]
                    clinical["confirmed"] = max(
                        clinical["confirmed"], previous["confirmed"]
                    )
                    clinical["deaths"] = max(clinical["deaths"], previous["deaths"])

                self.summary_clinicals[clinical_id] = clinical

    def parse_facility(self, date, facility):
        """
        From one facility entry of the feed, generate the data we can
        submit via Sheepdog.

        Args:
            date: date value stored on the clinical record and used to
                derive its submitter id
            facility (dict): one element of the feed's "FacilityValues"

        Returns:
            (dict, dict) tuple: summary_location and summary_clinical records
        """
        county = facility["County"]
        facility_name = facility["FacilityName"]
        confirmed_cases = facility["confirmed_cases"]
        deaths = facility["deaths"]
        status = facility.get("status", None)

        location_id = format_submitter_id(
            "summary_location",
            {
                "country": self.country,
                "state": self.state,
                "facility_name": facility_name,
                "reporting_org_status": status,
            },
        )
        clinical_id = derived_submitter_id(
            location_id,
            "summary_location",
            "summary_clinical",
            {"date": date},
        )

        location_record = {
            "country_region": self.country,
            "submitter_id": location_id,
            "projects": [{"code": self.project_code}],
            "province_state": self.state,
            "county": county,
            "reporting_org": facility_name,
            "reporting_org_status": status,
        }
        clinical_record = {
            "confirmed": confirmed_cases,
            "deaths": deaths,
            "submitter_id": clinical_id,
            "lastUpdateEt": date,
            "date": date,
            "summary_locations": [{"submitter_id": location_id}],
        }

        return location_record, clinical_record

    def submit_metadata(self):
        """
        Batch submits all stored locations, then all stored clinical records.
        """
        print("Submitting data...")

        print("Submitting summary_location data")
        for location in self.summary_locations.values():
            to_submit = {"type": "summary_location"}
            to_submit.update(location)
            self.metadata_helper.add_record_to_submit(to_submit)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for clinical in self.summary_clinicals.values():
            to_submit = {"type": "summary_clinical"}
            to_submit.update(clinical)
            self.metadata_helper.add_record_to_submit(to_submit)
        self.metadata_helper.batch_submit_records()
class JHU(base.BaseETL):
    """ETL for the JHU CSSE COVID-19 time-series CSV files.

    Reads the global and US-county time series (confirmed / deaths /
    recovered), collects new `summary_location` records in
    `self.location_data` and per-location, per-date values in
    `self.time_series_data`, then submits both through `MetadataHelper`.
    """

    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        # location submitter_id -> location record
        self.location_data = {}
        # location submitter_id -> date -> data type -> value
        self.time_series_data = defaultdict(lambda: defaultdict(dict))

        self.program_name = "open"
        self.project_code = "JHU"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        # leading CSV headers each file is expected to start with
        self.expected_csv_headers = {
            "global": ["Province/State", "Country/Region", "Lat", "Long", "1/22/20"],
            "US_counties": {
                "confirmed": [
                    "UID",
                    "iso2",
                    "iso3",
                    "code3",
                    "FIPS",
                    "Admin2",
                    "Province_State",
                    "Country_Region",
                    "Lat",
                    "Long_",
                    "Combined_Key",
                    "1/22/20",
                ],
                "deaths": [
                    "UID",
                    "iso2",
                    "iso3",
                    "code3",
                    "FIPS",
                    "Admin2",
                    "Province_State",
                    "Country_Region",
                    "Lat",
                    "Long_",
                    "Combined_Key",
                    "Population",  # TODO use this
                    "1/22/20",
                ],
            },
        }

        # property name -> column index, per file type (and data type)
        self.header_to_column = {
            "global": {
                "province": 0,
                "country": 1,
                "latitude": 2,
                "longitude": 3,
                "dates_start": 4,
            },
            "US_counties": {
                "confirmed": {
                    "iso2": 1,
                    "iso3": 2,
                    "code3": 3,
                    "FIPS": 4,
                    "county": 5,
                    "province": 6,
                    "country": 7,
                    "latitude": 8,
                    "longitude": 9,
                    "dates_start": 11,
                },
                "deaths": {
                    "iso2": 1,
                    "iso3": 2,
                    "code3": 3,
                    "FIPS": 4,
                    "county": 5,
                    "province": 6,
                    "country": 7,
                    "latitude": 8,
                    "longitude": 9,
                    "dates_start": 12,
                },
            },
        }

        self.existing_summary_locations = []
        self.last_date = ""

    def files_to_submissions(self):
        """
        Fetches what already exists in the data commons, then parses every
        CSV file into records to submit
        """
        urls = {
            "global": {
                "confirmed": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
                "deaths": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv",
                "recovered": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv",
                # "testing": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_testing_global.csv",
            },
            "US_counties": {
                "confirmed": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv",
                "deaths": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv",
            },
        }

        existing = self.metadata_helper.get_existing_data_jhu()
        self.existing_summary_locations, self.last_date = existing

        for file_type in ["global", "US_counties"]:
            for data_type, url in urls[file_type].items():
                self.parse_file(file_type, data_type, url)

    def parse_file(self, file_type, data_type, url):
        """
        Turns one CSV file into records for Sheepdog.

        New locations go to `self.location_data`; values go to
        `self.time_series_data`. Locations that are already known
        (by `submitter_id`) and dates that are not newer than
        `self.last_date` are left out, unless LAST_DATE_ONLY is set.

        Args:
            file_type (str): type of this file - one
                of ["global", "US_counties"]
            data_type (str): type of the data in this file - one
                of ["confirmed", "deaths", "recovered"]
            url (str): URL at which the CSV file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            decoded_lines = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(decoded_lines, delimiter=",", quotechar='"')

            headers = next(reader)

            if headers[0] == "404: Not Found":
                print(" Unable to get file contents, received {}.".format(headers))
                return

            # global files share one header list; US files have one per data type
            expected_h = self.expected_csv_headers[file_type]
            if isinstance(expected_h, dict):
                expected_h = expected_h[data_type]
            obtained_h = headers[: len(expected_h)]
            assert (
                obtained_h == expected_h
            ), "CSV headers have changed (expected {}, got {}). We may need to update the ETL code".format(
                expected_h, obtained_h
            )

            # only used for the log line below
            date_columns = [i for i, h in enumerate(headers) if h.endswith("/20")]
            first_date_i = date_columns[0]
            last_date = headers[-1]
            print(
                " First date: {}; last date: {}".format(
                    headers[first_date_i], last_date
                )
            )

            for row in reader:
                if not row:  # ignore empty rows
                    continue

                location, date_to_value = self.parse_row(
                    file_type, data_type, headers, row
                )
                if not location:
                    # We are using US data by state instead of global
                    continue

                location_submitter_id = location["submitter_id"]
                # do not re-submit location data that already exist
                already_known = (
                    location_submitter_id in self.location_data
                    or location_submitter_id in self.existing_summary_locations
                )
                if not already_known:
                    self.location_data[location_submitter_id] = location

                for date, value in date_to_value.items():
                    # do not re-submit summary_clinical data that
                    # already exist. Assume anything older than the last
                    # submitted date has already been submitted
                    is_new = time_series_date_to_string(
                        date
                    ) > time_series_date_to_string(self.last_date)
                    if is_new or LAST_DATE_ONLY:
                        self.time_series_data[location_submitter_id][date][
                            data_type
                        ] = value

    def parse_row(self, file_type, data_type, headers, row):
        """
        Turns one CSV row into a location record and its dated values.

        Args:
            file_type (str): type of this file - one
                of ["global", "US_counties"]
            data_type (str): type of the data in this file - one
                of ["confirmed", "deaths", "recovered"]
            headers (list(str)): CSV file headers (first row of the file)
            row (list(str)): row of data

        Returns:
            (dict, dict) tuple:
                - location data, in a format ready to be submitted to Sheepdog
                - { "date1": <value>, "date2": <value> } from the row data
            (None, None) when the row is skipped.

        Raises:
            ValueError: if a non-empty date cell is not numeric
        """
        columns = self.header_to_column[file_type]
        if "country" not in columns:
            # US_counties: one column layout per data type
            columns = columns[data_type]

        country = row[columns["country"]]
        province = row[columns["province"]]
        latitude = row[columns["latitude"]] or "0"
        longitude = row[columns["longitude"]] or "0"

        if country == "US" and province == "":
            # We are using US data by state instead of global
            return None, None

        if int(float(latitude)) == 0 and int(float(longitude)) == 0:
            # Data with "Out of <state>" or "Unassigned" county value have
            # unknown coordinates of (0,0). We don't submit them for now
            return None, None

        submitter_id = format_location_submitter_id(country, province)
        location = {
            "country_region": country,
            "latitude": latitude,
            "longitude": longitude,
            "projects": [{"code": self.project_code}],
        }
        if province:
            location["province_state"] = province

        if file_type == "US_counties":
            county = row[columns["county"]]
            iso2 = row[columns["iso2"]]
            iso3 = row[columns["iso3"]]
            code3 = row[columns["code3"]]
            fips = row[columns["FIPS"]]

            if county:
                location["county"] = county
                # county rows get a more specific submitter id
                submitter_id = format_location_submitter_id(
                    country, province, county
                )
            if iso2:
                location["iso2"] = iso2
            if iso3:
                location["iso3"] = iso3
            if code3:
                location["code3"] = int(code3)
            if fips:
                location["FIPS"] = int(float(fips))

        location["submitter_id"] = submitter_id

        dates_start = columns["dates_start"]
        if LAST_DATE_ONLY:
            dates_indices = [len(headers) - 1]
        else:
            dates_indices = range(dates_start, len(headers))

        date_to_value = {}
        for column_index in dates_indices:
            date = get_unified_date_format(headers[column_index])
            raw_value = row[column_index]

            if raw_value == "":  # ignore empty values
                continue
            try:
                date_to_value[date] = int(float(raw_value))
            except ValueError:
                print(
                    'Unable to convert {} to int for "{}", "{}" at {}'.format(
                        raw_value, province, country, date
                    )
                )
                raise

        return location, date_to_value

    def submit_metadata(self):
        """
        Submits the collected locations as `summary_location` records, then
        builds one `summary_clinical` record per location and date from
        `self.time_series_data` and submits those.

        When LAST_DATE_ONLY is set, existing `summary_clinical` nodes are
        deleted first.
        """
        if LAST_DATE_ONLY:
            # delete the old data from the Sheepdog DB
            print("Deleting old summary_clinical data")
            self.metadata_helper.delete_nodes(["summary_clinical"])

        print("Submitting summary_location data")
        for location in self.location_data.values():
            location_record = {"type": "summary_location"}
            location_record.update(location)
            self.metadata_helper.add_record_to_submit(location_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for location_submitter_id, time_series in self.time_series_data.items():
            for date, data in time_series.items():
                clinical_record = {
                    "type": "summary_clinical",
                    "submitter_id": format_summary_clinical_submitter_id(
                        location_submitter_id, date
                    ),
                    "summary_locations": [{"submitter_id": location_submitter_id}],
                    "date": date,
                }
                # one property per data type (confirmed, deaths, ...)
                clinical_record.update(data)
                self.metadata_helper.add_record_to_submit(clinical_record)
        self.metadata_helper.batch_submit_records()
class VAC_TRACKER(base.BaseETL):
    """
    ETL for the COVID-19 vaccine tracker page data.

    Downloads a JSON document, turns every treatment node it lists into a
    `clinical_trials` record and batch-submits those records through the
    metadata helper.
    """

    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.program_name = "open"
        self.project_code = "VacTracker"
        # records built by `parse_file` and consumed by `submit_metadata`
        self.clinical_trials = []
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def files_to_submissions(self):
        """
        Fetches the tracker JSON file and converts its content to records
        """
        self.parse_file(
            "https://biorender.com/page-data/covid-vaccine-tracker/page-data.json"
        )

    def parse_file(self, url):
        """
        Downloads a JSON file and converts each treatment it contains to a
        record we can submit via Sheepdog. The converted records are
        appended to `self.clinical_trials`.

        Args:
            url (str): URL at which the file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as response:
            payload = response.json()
            try:
                treatments = payload["result"]["pageContext"]["treatments"]
                for entry in treatments:
                    self.clinical_trials.append(self.parse_node(entry["node"]))
            except ValueError as err:
                print(f"ERROR: value error. Detail {err}")

    @staticmethod
    def _normalize_fda_approved(value):
        """Maps the raw `fdaApproved` value to Yes / No / Unknown / NA."""
        if "FDA-approved" in value:
            return "Yes"
        if value == "":
            return "Unknown"
        if value in ("N/A", "N//A", "N/A*"):
            return "NA"
        if value not in ("Yes", "No", "Unknown", "NA", None):
            return "Unknown"
        return value

    @staticmethod
    def _normalize_clinical_phase(value):
        """Maps the raw `customClinicalPhase` value to a known phase or None."""
        known_phases = (
            "Preclinical Phase",
            "Phase I",
            "Phase I/II",
            "Phase II",
            "Phase I/II/III",
            "Phase III",
            "Phase III/IV",
            "Phase IV",
            "Phase I/III/IV",
            "Phase I/IV",
            "Phase II/IV",
            "Phase II/III/IV",
            "Phase I/II/III/IV",
            "Phase II/III",
            "Phase N/A",
            None,
        )
        lowered = value.lower()
        if lowered == "phase na":
            return "Phase N/A"
        if lowered in ("preclinical", "pre-clinical"):
            return "Preclinical Phase"
        if value not in known_phases:
            return None
        return value

    @staticmethod
    def _normalize_technology(value):
        """Maps the raw `technology` value to a known technology or "Other"."""
        known_technologies = (
            "Antibodies",
            "Antivirals",
            "Cell-based therapies",
            "Device",
            "DNA-based",
            "Inactivated virus",
            "Modified APC",
            "Non-replicating viral vector",
            "Protein subunit",
            "RNA-based treatments",
            "RNA-based vaccine",
            "Repurposed",
            "Virus Like Particle",
            "Other",
            None,
        )
        # asterisks are stripped before any comparison
        value = value.replace("*", "")
        if "to repurpose" in value.lower():
            value = "Repurposed"
        return value if value in known_technologies else "Other"

    @staticmethod
    def _normalize_development_stage(value):
        """Maps the raw `developmentStage` value to a known stage or "Other"."""
        if value.lower() in ("preclinical", "pre-clinical"):
            return "Preclinical Phase"
        if value not in ("Preclinical Phase", "Clinical", "Withdrawn", None):
            return "Other"
        return value

    def parse_node(self, node):
        """
        Converts an element of the JSON file to data we can submit via
        Sheepdog. Only keys listed in `MAP_FIELDS` are kept; a key whose
        value type differs from the type declared in `MAP_FIELDS` is
        reported and skipped.

        Args:
            node (dict): node data

        Returns:
            dict: clinical trial data, in a format ready to be submitted
                to Sheepdog
        """
        normalizers = {
            "fdaApproved": self._normalize_fda_approved,
            "customClinicalPhase": self._normalize_clinical_phase,
            "technology": self._normalize_technology,
            "developmentStage": self._normalize_development_stage,
        }
        record = {
            "projects": [{"code": self.project_code}],
            "type": "clinical_trials",
        }
        for source_key, raw in node.items():
            if source_key not in MAP_FIELDS:
                continue

            mapping = MAP_FIELDS[source_key]
            target_field, expected_type = mapping[0], mapping[1]
            if type(raw) is not expected_type:
                print(
                    f"ERROR: The type of {source_key} does not match with the one in Gen3. Skip it"
                )
                continue

            normalize = normalizers.get(source_key)
            cleaned = raw if normalize is None else normalize(raw)

            # list fields are submitted as lists of strings
            if expected_type is list:
                cleaned = [str(item) for item in cleaned]
            record[target_field] = cleaned
        return record

    def submit_metadata(self):
        """
        Batch submits all records stored in `self.clinical_trials`
        """
        print("Submitting clinical_trial data")
        for record in self.clinical_trials:
            self.metadata_helper.add_record_to_submit(record)
        self.metadata_helper.batch_submit_records()
class OWID2(base.BaseETL):
    """
    ETL for the OWID COVID-19 CSV file.

    For every run of consecutive rows sharing the same `iso_code`, only the
    last row is converted. Each converted row yields one `summary_location`,
    one `summary_clinical` and one `summary_socio_demographic` record.
    """

    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_locations = []
        self.summary_clinicals = []
        self.summary_socio_demographics = []
        # submitter_ids of the locations already stored in
        # `self.summary_locations`, used to avoid queuing one twice
        self._seen_location_ids = set()

        self.program_name = "open"
        self.project_code = "OWID"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        # columns the ETL reads; the file may contain more
        self.expected_csv_headers = [
            "iso_code",
            "continent",
            "location",
            "date",
            "total_cases",
            "new_cases",
            "new_cases_smoothed",
            "new_deaths",
            "new_deaths_smoothed",
            "total_cases_per_million",
            "new_cases_per_million",
            "new_cases_smoothed_per_million",
            "total_deaths_per_million",
            "new_deaths_per_million",
            "new_deaths_smoothed_per_million",
            "new_tests",
            "total_tests",
            "total_tests_per_thousand",
            "new_tests_per_thousand",
            "new_tests_smoothed",
            "new_tests_smoothed_per_thousand",
            "tests_per_case",
            "positive_rate",
            "tests_units",
            "stringency_index",
            "population",
            "population_density",
            "median_age",
            "aged_65_older",
            "aged_70_older",
            "gdp_per_capita",
            "extreme_poverty",
            "cardiovasc_death_rate",
            "diabetes_prevalence",
            "female_smokers",
            "male_smokers",
            "handwashing_facilities",
            "hospital_beds_per_thousand",
            "life_expectancy",
        ]

        # Default mapping: valid only if the file has exactly the expected
        # columns in this order. `parse_file` replaces it with the indices
        # found in the real header row.
        self.header_to_column = {
            name: index
            for index, name in enumerate(self.expected_csv_headers)
        }

    def files_to_submissions(self):
        """
        Reads the CSV file and converts the data to Sheepdog records
        """
        url = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"
        self.parse_file(url)

    def insert_row_value(self, row_value):
        """
        Stores the records produced by `parse_row`.

        The location is stored only the first time its `submitter_id` is
        seen; the clinical and socio-demographic records are always stored.

        Args:
            row_value (tuple): (summary_location, summary_clinical,
                summary_socio_demographic) dicts
        """
        summary_location, summary_clinical, summary_socio_demographic = row_value

        location_id = summary_location["submitter_id"]
        # The set lives on the instance: a per-call container would be empty
        # on every call and the duplicate check could never succeed.
        if location_id not in self._seen_location_ids:
            self._seen_location_ids.add(location_id)
            self.summary_locations.append(summary_location)

        self.summary_clinicals.append(summary_clinical)
        self.summary_socio_demographics.append(summary_socio_demographic)

    def parse_file(self, url):
        """
        Converts a CSV file to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations`,
        `self.summary_clinicals` and `self.summary_socio_demographics`.

        Args:
            url (str): URL at which the CSV file is available

        Raises:
            AssertionError: if an expected column is missing from the file
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')

            headers = next(reader, None)
            if not headers or headers[0] == "404: Not Found":
                print(" Unable to get file contents, received {}.".format(
                    headers))
                return

            expected_h = self.expected_csv_headers
            # raised explicitly (instead of `assert`) so that the check is
            # still performed when Python runs with -O
            if not set(expected_h).issubset(headers):
                raise AssertionError(
                    "CSV headers have changed (expected {}, got {}). We may need to update the ETL code".format(
                        expected_h, headers))

            # Look the columns up in the real header row: the file is allowed
            # to contain extra columns, so positions in `expected_h` do not
            # have to match positions in the file.
            self.header_to_column = {
                name: headers.index(name) for name in expected_h
            }

            pre_row = None
            for row in reader:
                if not row:
                    # blank line, nothing to parse
                    continue
                res = self.parse_row(pre_row, row)
                if res is not None:
                    self.insert_row_value(res)
                pre_row = row

            # the last row of the file closes the last country
            if pre_row is not None:
                res = self.parse_row(pre_row, None)
                if res is not None:
                    self.insert_row_value(res)

    def _copy_numeric_fields(self, row, field_map, record):
        """
        Copies the int and float columns listed in `field_map` from `row`
        into `record`. Empty cells, "nan" cells and cells that cannot be
        converted are left out of the record.

        Args:
            row (list(str)): row of data
            field_map (dict): { record field: (CSV column name, type) }
            record (dict): record to update in place
        """
        for field, (column, dtype) in field_map.items():
            raw = row[self.header_to_column[column]]
            if not raw or raw.lower() == "nan":
                continue
            if dtype is not int and dtype is not float:
                # only numeric conversions are implemented
                continue
            try:
                number = float(raw.replace(",", ""))
                record[field] = int(number) if dtype is int else number
            except (ValueError, OverflowError):
                # best effort: a cell that is not a number (ValueError) or is
                # infinite when an int is wanted (OverflowError) is skipped
                continue

    def create_clinical(self, row, date, summary_location_submitter_id):
        """
        Builds the `summary_clinical` record of a row.

        Args:
            row (list(str)): row of data
            date (str): value of the row's `date` column
            summary_location_submitter_id (str): submitter_id of the
                location the record is linked to

        Returns:
            dict: summary_clinical data
        """
        summary_clinical_submitter_id = format_summary_clinical_submitter_id(
            summary_location_submitter_id, date)
        summary_clinical = {
            "date": date,
            "submitter_id": summary_clinical_submitter_id,
            "summary_locations": [{
                "submitter_id": summary_location_submitter_id
            }],
        }

        # { record field: (CSV column, type) }
        # total_deaths, hospital_beds_per_thousand and
        # human_development_index are not mapped.
        # NOTE(review): `tests_units` is declared as str but only int and
        # float values are copied, so it never reaches the record - confirm
        # whether that is intended.
        map_csv_fields = {
            "confirmed": ("total_cases", int),
            "new_cases": ("new_cases", int),
            "new_cases_smoothed": ("new_cases_smoothed", float),
            "new_deaths": ("new_deaths", int),
            "new_deaths_smoothed": ("new_deaths_smoothed", float),
            "total_cases_per_million": ("total_cases_per_million", float),
            "new_cases_per_million": ("new_cases_per_million", float),
            "new_cases_smoothed_per_million":
            ("new_cases_smoothed_per_million", float),
            "total_deaths_per_million": ("total_deaths_per_million", float),
            "new_deaths_per_million": ("new_deaths_per_million", float),
            "new_deaths_smoothed_per_million": (
                "new_deaths_smoothed_per_million",
                float,
            ),
            "new_tests": ("new_tests", int),
            "testing": ("total_tests", int),
            "total_tests_per_thousand": ("total_tests_per_thousand", float),
            "new_tests_per_thousand": ("new_tests_per_thousand", float),
            "new_tests_smoothed": ("new_tests_smoothed", float),
            "new_tests_smoothed_per_thousand": (
                "new_tests_smoothed_per_thousand",
                float,
            ),
            "tests_per_case": ("tests_per_case", float),
            "positive_rate": ("positive_rate", float),
            "tests_units": ("tests_units", str),
            "cardiovasc_death_rate": ("cardiovasc_death_rate", float),
            "diabetes_prevalence": ("diabetes_prevalence", float),
        }
        self._copy_numeric_fields(row, map_csv_fields, summary_clinical)
        return summary_clinical

    def create_summary_socio_demographic(self, row, date,
                                         summary_location_submitter_id):
        """
        Builds the `summary_socio_demographic` record of a row.

        Args:
            row (list(str)): row of data
            date (str): value of the row's `date` column
            summary_location_submitter_id (str): submitter_id of the
                location the record is linked to

        Returns:
            dict: summary_socio_demographic data
        """
        summary_socio_demographic_submitter_id = (
            format_summary_summary_socio_demographic(
                summary_location_submitter_id, date))
        summary_socio_demographic = {
            "submitter_id": summary_socio_demographic_submitter_id,
            "summary_locations": [{
                "submitter_id": summary_location_submitter_id
            }],
        }

        # { record field: (CSV column, type) }
        map_csv_socio_fields = {
            "stringency_index": ("stringency_index", float),
            "population": ("population", int),
            "population_density": ("population_density", float),
            "median_age": ("median_age", float),
            "aged_65_older": ("aged_65_older", float),
            "aged_70_older": ("aged_70_older", float),
            "gdp_per_capita": ("gdp_per_capita", float),
            "extreme_poverty": ("extreme_poverty", float),
            "female_smokers": ("female_smokers", float),
            "male_smokers": ("male_smokers", float),
            "handwashing_facilities": ("handwashing_facilities", float),
            "life_expectancy": ("life_expectancy", float),
        }
        self._copy_numeric_fields(row, map_csv_socio_fields,
                                  summary_socio_demographic)
        return summary_socio_demographic

    def parse_row(self, pre_row, row):
        """
        Converts `pre_row` to data we can submit via Sheepdog, but only when
        it is the last row of its country, i.e. when `row` is None (end of
        file) or has a different `iso_code`.

        Args:
            pre_row (list(str)): previously read row, or None if no row has
                been read yet
            row (list(str)): row that follows `pre_row`, or None at the end
                of the file

        Returns:
            None if `pre_row` is None or is not the last row of its country,
            otherwise a (dict, dict, dict) tuple:
                - summary_location data
                - summary_clinical data
                - summary_socio_demographic data
        """
        if pre_row is None:
            return None

        columns = self.header_to_column
        iso_index = columns["iso_code"]
        if row is not None and pre_row[iso_index] == row[iso_index]:
            # same country continues on the next row
            return None

        pre_date = pre_row[columns["date"]]
        pre_country = pre_row[columns["location"]]

        summary_location_submitter_id = format_location_submitter_id(
            pre_country)
        summary_location = {
            "country_region": pre_country,
            "submitter_id": summary_location_submitter_id,
            "projects": [{
                "code": self.project_code
            }],
        }

        return (
            summary_location,
            self.create_clinical(pre_row, pre_date,
                                 summary_location_submitter_id),
            self.create_summary_socio_demographic(
                pre_row, pre_date, summary_location_submitter_id),
        )

    def _submit_records(self, node_type, records):
        """Queues `records` as `node_type` records and batch submits them."""
        print("Submitting {} data".format(node_type))
        for rec in records:
            # a "type" key already present in `rec` takes precedence
            self.metadata_helper.add_record_to_submit({
                "type": node_type,
                **rec
            })
        self.metadata_helper.batch_submit_records()

    def submit_metadata(self):
        """
        Batch submits all records in `self.summary_locations`,
        `self.summary_clinicals` and `self.summary_socio_demographics`,
        in that order.
        """
        self._submit_records("summary_location", self.summary_locations)
        self._submit_records("summary_clinical", self.summary_clinicals)
        self._submit_records("summary_socio_demographic",
                             self.summary_socio_demographics)