def __init__(self, base_url, access_token, s3_bucket):
    super().__init__(base_url, access_token, s3_bucket)
    self.program_name = "open"
    self.project_code = "NPI-PRO"
    self.metadata_helper = MetadataHelper(
        base_url=self.base_url,
        program_name=self.program_name,
        project_code=self.project_code,
        access_token=access_token,
    )
    self.country = "US"
    self.summary_locations = []
    self.summary_clinicals = []
class JHU_COUNTRY_CODES(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.program_name = "open"
        self.project_code = "JHU"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def files_to_submissions(self):
        codes_dict = get_codes_dictionary()
        locations = self.get_existing_locations()
        for location in locations:
            codes = get_codes_for_country_name(codes_dict, location["country_region"])

            # do not update the record if it already has the codes
            if location["iso2"] == codes["iso2"] and location["iso3"] == codes["iso3"]:
                continue

            record = {k: v for k, v in location.items() if v is not None}
            record.update(
                {
                    "type": "summary_location",
                    "projects": [{"code": self.project_code}],
                    "iso2": codes["iso2"],
                    "iso3": codes["iso3"],
                }
            )
            self.metadata_helper.add_record_to_submit(record)

    def submit_metadata(self):
        self.metadata_helper.batch_submit_records()

    def get_existing_locations(self):
        print("Getting summary_location data from Peregrine")
        query_string = (
            '{ summary_location (first: 0, project_id: "'
            + self.program_name
            + "-"
            + self.project_code
            + '") { submitter_id, country_region, iso2, iso3 } }'
        )
        query_res = self.metadata_helper.query_peregrine(query_string)
        return [location for location in query_res["data"]["summary_location"]]
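# Hedged usage sketch (not part of the original source): the ETL classes in this
# file share a two-step lifecycle -- stage records, then batch-submit. The
# base_url and access_token values below are illustrative placeholders.
if __name__ == "__main__":
    etl = JHU_COUNTRY_CODES(
        base_url="https://example-commons.org",  # placeholder
        access_token="my-access-token",          # placeholder
        s3_bucket=None,                          # not used by this ETL
    )
    etl.files_to_submissions()  # queries Peregrine, stages records missing ISO codes
    etl.submit_metadata()       # batch-submits the staged records via Sheepdog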
def __init__(self, base_url, access_token, s3_bucket):
    super().__init__(base_url, access_token, s3_bucket)
    self.program_name = "open"
    self.project_code = "IDPH-Vaccine"
    self.metadata_helper = MetadataHelper(
        base_url=self.base_url,
        program_name=self.program_name,
        project_code=self.project_code,
        access_token=access_token,
    )
    self.country = "US"
    self.state = "IL"
    self.date = ""
    self.counties_inventory = {}
    self.summary_locations = {}
    self.summary_clinicals = {}
    self.summary_group_demographic = {}
def __init__(self, base_url, access_token, s3_bucket):
    super().__init__(base_url, access_token, s3_bucket)

    # Get all input strings from the YAML config next to this script
    script = path.splitext(path.basename(__file__))[0].strip("/")
    script = path.join(CURRENT_DIR, script + ".yaml")
    with open(script) as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    self.verbose = config["verbose"]
    self.program_name = config["program_name"]
    self.project_code = config["project_code"]
    self.virus_genome_data_category = config["virus_genome_data_category"]
    self.virus_genome_data_type = config["virus_genome_data_type"]
    self.virus_genome_data_format = config["virus_genome_data_format"]
    self.virus_genome_source = config["virus_genome_source"]
    self.virus_genome_type = config["virus_genome_type"]
    self.virus_sequence_type = config["virus_sequence_type"]
    self.virus_sequence_data_type = config["virus_sequence_data_type"]
    self.virus_sequence_data_format = config["virus_sequence_data_format"]
    self.virus_sequence_alignment_type = config["virus_sequence_alignment_type"]
    self.virus_sequence_alignment_data_type = config[
        "virus_sequence_alignment_data_type"
    ]
    self.virus_sequence_alignment_data_format = config[
        "virus_sequence_alignment_data_format"
    ]
    self.virus_sequence_alignment_tool = config["virus_sequence_alignment_tool"]
    self.virus_sequence_hmm_type = config["virus_sequence_hmm_type"]
    self.virus_sequence_hmm_data_type = config["virus_sequence_hmm_data_type"]
    self.virus_sequence_hmm_data_format = config["virus_sequence_hmm_data_format"]

    self.virus_genomes = []
    self.virus_sequences = []
    self.virus_sequence_alignments = []
    self.virus_sequence_hmms = []

    self.metadata_helper = MetadataHelper(
        base_url=base_url,
        program_name=self.program_name,
        project_code=self.project_code,
        access_token=access_token,
    )
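# Illustrative sketch (assumption): the sibling YAML file loaded above is expected
# to carry one key per config[...] read; values below are placeholders, not the
# repo's real configuration:
#
#   verbose: true
#   program_name: open
#   project_code: my-project
#   virus_genome_data_category: my-category
#   ... one entry per remaining virus_* key ...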
def __init__(self, base_url, access_token, s3_bucket):
    super().__init__(base_url, access_token, s3_bucket)
    self.program_name = "open"
    self.project_code = "ATLAS"
    self.metadata_helper = MetadataHelper(
        base_url=self.base_url,
        program_name=self.program_name,
        project_code=self.project_code,
        access_token=access_token,
    )
    self.nodes = {
        "summary_location": [],
        "summary_socio_demographic": [],
    }
def main():
    headers = {"Authorization": f"bearer {access_token}"}
    records = get_existing_data(base_url, program, project, old_node, headers)
    metadata_helper = MetadataHelper(
        base_url=base_url,
        program_name=program,
        project_code=project,
        access_token=access_token,
    )
    print(f"Submitting {new_node} data")
    for old_rec in records:
        new_rec = {"type": new_node, "project_id": f"{program}-{project}"}
        for key, value in old_rec.items():
            if value:
                new_rec[key] = value
        metadata_helper.add_record_to_submit(new_rec)
    metadata_helper.batch_submit_records()
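# Hedged sketch (assumption, not the original helper): `get_existing_data` is not
# shown in this snippet. Gen3's Peregrine service exposes a GraphQL endpoint at
# /api/v0/submission/graphql; a minimal version of the helper might look like the
# stand-in below. It fetches submitter_id only -- the real helper presumably
# selects every property of `node` so the migration can copy them all.
import requests

def get_existing_data_sketch(base_url, program, project, node, headers):
    """Hypothetical stand-in for get_existing_data."""
    query = '{ %s (first: 0, project_id: "%s-%s") { submitter_id } }' % (
        node,
        program,
        project,
    )
    resp = requests.post(
        f"{base_url}/api/v0/submission/graphql",
        json={"query": query},
        headers=headers,
    )
    resp.raise_for_status()
    return resp.json()["data"][node]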
class CHESTXRAY8(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.program_name = "open"
        self.project_code = "ChestX-ray8"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.file_helper = FileHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.cmc_submitter_id = format_submitter_id("cmc_chestxray8", {})
        self.core_metadata_collection = [
            {
                "submitter_id": self.cmc_submitter_id,
                "projects": [{"code": self.project_code}],
            }
        ]
        self.imaging_file = []

    def files_to_submissions(self):
        for image_type in ("No_findings", "Pneumonia"):
            for image_filepath in (
                Path(CHESTXRAY8_DATA_PATH)
                .joinpath("COVID-19")
                .joinpath("X-Ray Image DataSet")
                .joinpath(image_type)
                .iterdir()
            ):
                did, rev, md5, size = self.file_helper.find_by_name(
                    image_filepath.name
                )
                if not did:
                    guid = self.file_helper.upload_file(image_filepath)
                    print(f"file {image_filepath.name} uploaded with guid: {guid}")
                else:
                    print(f"file {image_filepath.name} exists in indexd... skipping...")

                imaging_file_submitter_id = format_submitter_id(
                    "imaging_file_chestxray8", {"filename": image_filepath.name}
                )
                uploaded_imaging_file = {
                    "submitter_id": imaging_file_submitter_id,
                    "core_metadata_collections": [
                        {"submitter_id": self.cmc_submitter_id}
                    ],
                    "data_type": "PNG",
                    "data_format": "Image File",
                    "data_category": "X-Ray Image",
                    "file_name": image_filepath.name,
                    "file_size": size,
                    "md5sum": md5,
                    "object_id": did,
                    "clinical_notes": image_type,
                }
                self.imaging_file.append(uploaded_imaging_file)

    def submit_metadata(self):
        print("Submitting data...")

        print("Submitting core_metadata_collection data")
        for cmc in self.core_metadata_collection:
            cmc_record = {"type": "core_metadata_collection"}
            cmc_record.update(cmc)
            self.metadata_helper.add_record_to_submit(cmc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting imaging_file data")
        for ifile in self.imaging_file:
            if_record = {"type": "imaging_file"}
            if_record.update(ifile)
            self.metadata_helper.add_record_to_submit(if_record)
        self.metadata_helper.batch_submit_records()
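# Hedged sketch (assumption): `format_submitter_id` is imported from elsewhere in
# the repo and is not shown here. A plausible minimal implementation builds a
# deterministic submitter_id from a prefix plus sorted key/value pairs; the
# *_sketch name marks it as hypothetical.
def format_submitter_id_sketch(prefix, args):
    """Hypothetical stand-in for the repo's format_submitter_id helper."""
    parts = [prefix] + [f"{k}_{v}" for k, v in sorted(args.items())]
    sid = "_".join(parts)
    return sid.lower().replace(" ", "_").replace("-", "_")

# e.g. format_submitter_id_sketch("imaging_file_chestxray8", {"filename": "a b.png"})
# -> "imaging_file_chestxray8_filename_a_b.png"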
def __init__(self, base_url, access_token, s3_bucket):
    super().__init__(base_url, access_token, s3_bucket)
    self.summary_locations = []
    self.summary_clinicals = []
    self.header_to_column = {}
    self.program_name = "open"
    self.project_code = "CTP"
    self.metadata_helper = MetadataHelper(
        base_url=self.base_url,
        program_name=self.program_name,
        project_code=self.project_code,
        access_token=access_token,
    )
    self.expected_file_headers = set(
        [
            "date", "state", "positive", "negative", "pending",
            "totalTestResults", "hospitalizedCurrently", "hospitalizedCumulative",
            "inIcuCurrently", "inIcuCumulative", "onVentilatorCurrently",
            "onVentilatorCumulative", "recovered", "dataQualityGrade",
            "lastUpdateEt", "dateModified", "checkTimeEt", "death",
            "hospitalized", "dateChecked", "totalTestsViral",
            "positiveTestsViral", "negativeTestsViral", "positiveCasesViral",
            "deathConfirmed", "deathProbable", "totalTestEncountersViral",
            "totalTestsPeopleViral", "totalTestsAntibody",
            "positiveTestsAntibody", "negativeTestsAntibody",
            "totalTestsPeopleAntibody", "positiveTestsPeopleAntibody",
            "negativeTestsPeopleAntibody", "totalTestsPeopleAntigen",
            "positiveTestsPeopleAntigen", "totalTestsAntigen",
            "positiveTestsAntigen", "fips", "positiveIncrease",
            "negativeIncrease", "total", "totalTestResultsSource",
            "totalTestResultsIncrease", "posNeg", "deathIncrease",
            "hospitalizedIncrease", "hash", "commercialScore",
            "negativeRegularScore", "negativeScore", "positiveScore",
            "score", "grade",
        ]
    )
    self.expected_race_headers = set(
        [
            "Date", "State", "Cases_Total", "Cases_White", "Cases_Black",
            "Cases_Latinx", "Cases_Asian", "Cases_AIAN", "Cases_NHPI",
            "Cases_Multiracial", "Cases_Other", "Cases_Unknown",
            "Cases_Ethnicity_Hispanic", "Cases_Ethnicity_NonHispanic",
            "Cases_Ethnicity_Unknown", "Deaths_Total", "Deaths_White",
            "Deaths_Black", "Deaths_Latinx", "Deaths_Asian", "Deaths_AIAN",
            "Deaths_NHPI", "Deaths_Multiracial", "Deaths_Other",
            "Deaths_Unknown", "Deaths_Ethnicity_Hispanic",
            "Deaths_Ethnicity_NonHispanic", "Deaths_Ethnicity_Unknown",
        ]
    )
class NCBI_MANIFEST(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.manifest_bucket = "sra-pub-sars-cov2"
        self.sra_src_manifest = "sra-src/Manifest"
        self.program_name = "open"
        self.project_code = "ncbi-covid-19"
        self.token = access_token
        self.last_submission_identifier = None
        self.file_helper = AsyncFileHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def read_ncbi_manifest(self, key):
        """read the manifest"""
        tries = 0
        last_row_num = 0
        row_num = 0  # initialized here so the except clause can always reference it
        while tries < MAX_RETRIES:
            try:
                s3 = boto3.resource("s3", config=Config(signature_version=UNSIGNED))
                s3_object = s3.Object(self.manifest_bucket, key)
                line_stream = codecs.getreader("utf-8")
                row_num = 0
                for line in line_stream(s3_object.get()["Body"]):
                    row_num = row_num + 1
                    # on retry, skip the rows that were already processed
                    if row_num < last_row_num:
                        continue
                    if row_num % 1000 == 0:
                        print(f"Processed {row_num} rows of {key}")
                    words = line.split("\t")
                    guid = conform_data_format(words[0].strip(), "guid")
                    size = int(conform_data_format(words[2].strip(), "size"))
                    md5 = conform_data_format(words[3].strip(), "md5")
                    authz = f"/programs/{self.program_name}/project/{self.project_code}"
                    url = conform_data_format(words[5].strip(), "url")
                    release_date = parse(re.sub(r":[0-9]{3}", "", words[6].strip()))
                    yield guid, size, md5, authz, url, release_date
                break
            except Exception:
                print(f"Cannot stream {key}. Retrying...")
                time.sleep(30)
                tries += 1
                last_row_num = row_num

    def submit_metadata(self):
        start = time.strftime("%X")
        loop = asyncio.get_event_loop()
        try:
            loop.run_until_complete(
                asyncio.gather(self.index_manifest(self.sra_src_manifest))
            )
            future = AsyncFileHelper.close_session()
            if future:
                loop.run_until_complete(asyncio.gather(future))
        finally:
            loop.close()
        end = time.strftime("%X")
        print(f"Running time: From {start} to {end}")

    async def index_manifest(self, manifest):
        query_string = (
            '{ project (first: 0, dbgap_accession_number: "'
            + self.project_code
            + '") { last_submission_identifier } }'
        )
        try:
            response = self.metadata_helper.query_peregrine(query_string)
            self.last_submission_identifier = parse(
                response["data"]["project"][0]["last_submission_identifier"]
            )
        except Exception:
            self.last_submission_identifier = None

        now = datetime.datetime.now()
        last_submission_date_time = now.strftime("%m/%d/%Y, %H:%M:%S")

        for guid, size, md5, authz, url, release_date in self.read_ncbi_manifest(
            manifest
        ):
            if (
                not self.last_submission_identifier
                or release_date > self.last_submission_identifier
            ):
                filename = url.split("/")[-1]

                retrying = True
                while retrying:
                    try:
                        did, _, _, _, _, _ = await self.file_helper.async_find_by_name(
                            filename
                        )
                        retrying = False
                    except Exception as e:
                        print(
                            f"ERROR: Failed to query indexd for {filename}. Detail {e}. Retrying..."
                        )
                        await asyncio.sleep(5)

                if did:
                    print(f"{filename} was already indexed")
                    continue

                print(f"start to index {filename}")
                retries = 0
                while retries < MAX_RETRIES:
                    try:
                        await self.file_helper.async_index_record(
                            guid, size, filename, url, authz, md5
                        )
                        break
                    except Exception as e:
                        retries += 1
                        print(
                            f"ERROR: Failed to create new indexd record for {guid}. Detail {e}. Retrying..."
                        )
                        await asyncio.sleep(5)

        # record the time of this submission on the project node
        headers = {
            "content-type": "application/json",
            "Authorization": f"Bearer {self.access_token}",
        }
        record = {
            "code": self.project_code,
            "dbgap_accession_number": self.project_code,
            "last_submission_identifier": last_submission_date_time,
        }
        res = requests.put(
            "{}/api/v0/submission/{}".format(self.base_url, self.program_name),
            headers=headers,
            data=json.dumps(record),
        )
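# Worked example of the release-date cleanup in read_ncbi_manifest above (`parse`
# is presumably dateutil.parser.parse, matching the module's usage; the sample
# manifest value is an assumption inferred from the regex):
import re
from dateutil.parser import parse

raw = "2020-05-01T00:00:00:000Z"       # hypothetical 7th manifest column
clean = re.sub(r":[0-9]{3}", "", raw)  # strips the ":000" millisecond block
assert clean == "2020-05-01T00:00:00Z"
release_date = parse(clean)            # -> timezone-aware datetime for 2020-05-01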
class STOPLIGHT(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_clinicals = []
        self.summary_locations = []
        self.program_name = "open"
        self.project_code = "covidstoplight"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def files_to_submissions(self):
        """
        Reads JSON files and converts the data to Sheepdog records
        """
        url = "https://covidstoplight.org/api/v0/location/US"
        self.parse_file(url)

    def parse_file(self, url):
        """
        Converts a JSON file to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations` and
        `self.summary_clinicals`. Ignores any records that are already in
        Sheepdog (relies on unique `submitter_id` to check)

        Args:
            url (str): URL at which the file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            timestamp_created = data["data"]["generated"]
            country = data["country_code"]
            summary_location_list = []
            try:
                for zipcode, feelings in data["data"]["submissions"].items():
                    node = {
                        "zipcode": zipcode,
                        "feelings": feelings,
                        "timestamp_created": timestamp_created,
                        "country": country,
                    }
                    summary_location, summary_clinical = self.parse_node(node)
                    summary_location_submitter_id = summary_location["submitter_id"]
                    if summary_location_submitter_id not in summary_location_list:
                        self.summary_locations.append(summary_location)
                        summary_location_list.append(summary_location_submitter_id)
                    self.summary_clinicals.append(summary_clinical)
            except ValueError as e:
                print(f"ERROR: value error. Detail {e}")

    def parse_node(self, node):
        """
        Converts an element of a JSON file to data we can submit via Sheepdog

        Args:
            node (dict): node data

        Returns:
            (dict, dict) tuple:
                - location data, in a format ready to be submitted to Sheepdog
                - { "date1": <value>, "date2": <value> } from the row data
        """
        zipcode = node["zipcode"]
        feelings = node["feelings"]
        timestamp_created = node["timestamp_created"]
        country = node["country"]

        summary_location_submitter_id = format_location_submitter_id(country, zipcode)
        summary_location = {
            "country_region": country,
            "submitter_id": summary_location_submitter_id,
            "projects": [{"code": self.project_code}],
            "zipcode": zipcode,
        }

        date = datetime.strptime(timestamp_created, "%Y-%m-%dT%H:%M:%S").date()
        date = date.strftime("%Y-%m-%d")
        summary_clinical_submitter_id = format_summary_clinical_submitter_id(
            summary_location_submitter_id, date
        )
        summary_clinical = {
            "date": date,
            "timestamp_created": timestamp_created,
            "submitter_id": summary_clinical_submitter_id,
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
        }

        map_fields = {
            1: "feeling_healthy_count",
            2: "feeling_not_so_good_count",
            3: "feeling_sick_count",
        }
        for element in feelings:
            summary_clinical[map_fields[element["feeling"]]] = element["count"]

        return summary_location, summary_clinical

    def submit_metadata(self):
        """
        Batch-submits all records in `self.summary_locations` and
        `self.summary_clinicals` (already in Sheepdog record format) to Sheepdog.
        """
        print("Submitting summary_location data")
        for loc in self.summary_locations:
            loc_record = {"type": "summary_location"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for rep in self.summary_clinicals:
            rep_record = {"type": "summary_clinical"}
            rep_record.update(rep)
            self.metadata_helper.add_record_to_submit(rep_record)
        self.metadata_helper.batch_submit_records()
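# Illustrative input/output for parse_node above (sample values are made up; the
# submissions shape is inferred from files_to_submissions):
#
#   node = {"zipcode": "60601",
#           "feelings": [{"feeling": 1, "count": 12}, {"feeling": 3, "count": 2}],
#           "timestamp_created": "2020-04-01T12:00:00",
#           "country": "US"}
#
# parse_node(node) returns a summary_clinical containing
#   {"date": "2020-04-01", "feeling_healthy_count": 12, "feeling_sick_count": 2, ...}
# linked to the US/60601 summary_location.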
def __init__(self, base_url, access_token, s3_bucket):
    super().__init__(base_url, access_token, s3_bucket)
    self.summary_locations = []
    self.summary_clinicals = []
    self.program_name = "open"
    self.project_code = "OWID"
    self.metadata_helper = MetadataHelper(
        base_url=self.base_url,
        program_name=self.program_name,
        project_code=self.project_code,
        access_token=access_token,
    )

    # structure is
    # (csv field name, (node type, node field name, type of field))
    testing_fields = [
        ("ISO code", ("summary_location", "iso3", str)),
        ("Entity", (None, None, split_entity)),
        ("Date", ("summary_clinical", "date", str)),
        ("Source URL", ("summary_clinical", "source_url", str)),
        ("Source label", ("summary_clinical", "source_label", str)),
        ("Notes", ("summary_clinical", "notes", str)),
        ("Number of observations", ("summary_clinical", "num_observations", int)),
        ("Cumulative total", ("summary_clinical", "testing", int)),
        (
            "Cumulative total per thousand",
            ("summary_clinical", "cumulative_total_per_thousand", int),
        ),
        (
            "Daily change in cumulative total",
            ("summary_clinical", "daily_change_in_cumulative_total", int),
        ),
        (
            "Daily change in cumulative total per thousand",
            (
                "summary_clinical",
                "daily_change_in_cumulative_total_per_thousand",
                int,
            ),
        ),
        (
            "7-day smoothed daily change",
            ("summary_clinical", "seven_day_smoothed_daily_change", int),
        ),
        (
            "7-day smoothed daily change per thousand",
            (
                "summary_clinical",
                "seven_day_smoothed_daily_change_per_thousand",
                float,
            ),
        ),
        ("Short-term positive rate", (None, None, None)),
        ("Short-term tests per case", (None, None, None)),
        ("General source label", ("summary_clinical", "general_source_label", str)),
        ("General source URL", ("summary_clinical", "general_source_url", str)),
        ("Short description", ("summary_clinical", "short_description", str)),
        ("Detailed description", ("summary_clinical", "detailed_description", str)),
    ]
    self.headers_mapping = {
        field: (k, mapping) for k, (field, mapping) in enumerate(testing_fields)
    }
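# Worked example of the comprehension above: headers_mapping maps each CSV header
# to (column index, (node type, node field, converter)) -- "Date" is the third
# entry of testing_fields, so:
#   self.headers_mapping["Date"] == (2, ("summary_clinical", "date", str))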
class IDPH_FACILITY(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.program_name = "open"
        self.project_code = "IDPH-Facility"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.country = "US"
        self.state = "IL"
        self.summary_locations = {}
        self.summary_clinicals = {}

    def files_to_submissions(self):
        """
        Reads the JSON file and converts the data to Sheepdog records
        """
        latest_submitted_date = self.metadata_helper.get_latest_submitted_date_idph()
        today = datetime.date.today()
        if latest_submitted_date == today:
            print("Nothing to submit: today and latest submitted date are the same.")
            return

        today_str = today.strftime("%Y%m%d")
        print(f"Getting data for date: {today_str}")
        url = "https://dph.illinois.gov/sitefiles/COVIDLTC.json"
        self.parse_file(latest_submitted_date, url)

    def parse_file(self, latest_submitted_date, url):
        """
        Converts a JSON file to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations` and `self.summary_clinicals`.

        Args:
            latest_submitted_date (date): the date of the latest "summary_clinical"
                already submitted for this project
            url (str): URL at which the JSON file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            date = idph_get_date(data["LastUpdateDate"])

            if latest_submitted_date and date == latest_submitted_date.strftime(
                "%Y-%m-%d"
            ):
                print(
                    "Nothing to submit: latest submitted date and date from data are the same."
                )
                return

            if "LTC_Reported_Cases" in data:
                summary_location_submitter_id = format_submitter_id(
                    "summary_location", {"country": self.country, "state": self.state}
                )
                summary_location = {
                    "country_region": self.country,
                    "submitter_id": summary_location_submitter_id,
                    "projects": [{"code": self.project_code}],
                    "province_state": self.state,
                }
                summary_clinical_submitter_id = derived_submitter_id(
                    summary_location_submitter_id,
                    "summary_location",
                    "summary_clinical",
                    {"date": date},
                )
                summary_clinical = {
                    "confirmed": data["LTC_Reported_Cases"]["confirmed_cases"],
                    "deaths": data["LTC_Reported_Cases"]["deaths"],
                    "submitter_id": summary_clinical_submitter_id,
                    "lastUpdateEt": date,
                    "date": date,
                    "summary_locations": [
                        {"submitter_id": summary_location_submitter_id}
                    ],
                }
                self.summary_locations[
                    summary_location_submitter_id
                ] = summary_location
                self.summary_clinicals[
                    summary_clinical_submitter_id
                ] = summary_clinical

            for facility in data["FacilityValues"]:
                (summary_location, summary_clinical) = self.parse_facility(
                    date, facility
                )
                summary_location_submitter_id = summary_location["submitter_id"]
                summary_clinical_submitter_id = summary_clinical["submitter_id"]

                self.summary_locations[
                    summary_location_submitter_id
                ] = summary_location

                # the same facility can appear more than once: keep the highest counts
                if summary_clinical_submitter_id in self.summary_clinicals:
                    existed = self.summary_clinicals[summary_clinical_submitter_id]
                    summary_clinical["confirmed"] = max(
                        summary_clinical["confirmed"], existed["confirmed"]
                    )
                    summary_clinical["deaths"] = max(
                        summary_clinical["deaths"], existed["deaths"]
                    )
                self.summary_clinicals[
                    summary_clinical_submitter_id
                ] = summary_clinical

    def parse_facility(self, date, facility):
        """
        From facility-level data, generate the data we can submit via Sheepdog
        """
        county = facility["County"]
        facility_name = facility["FacilityName"]
        confirmed_cases = facility["confirmed_cases"]
        deaths = facility["deaths"]
        status = facility.get("status", None)

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {
                "country": self.country,
                "state": self.state,
                "facility_name": facility_name,
                "reporting_org_status": status,
            },
        )
        summary_location = {
            "country_region": self.country,
            "submitter_id": summary_location_submitter_id,
            "projects": [{"code": self.project_code}],
            "province_state": self.state,
            "county": county,
            "reporting_org": facility_name,
            "reporting_org_status": status,
        }

        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"date": date},
        )
        summary_clinical = {
            "confirmed": confirmed_cases,
            "deaths": deaths,
            "submitter_id": summary_clinical_submitter_id,
            "lastUpdateEt": date,
            "date": date,
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
        }

        return summary_location, summary_clinical

    def submit_metadata(self):
        print("Submitting data...")

        print("Submitting summary_location data")
        for sl in self.summary_locations.values():
            sl_record = {"type": "summary_location"}
            sl_record.update(sl)
            self.metadata_helper.add_record_to_submit(sl_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals.values():
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
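# Hedged sketch (assumption): `derived_submitter_id` is defined elsewhere in the
# repo. A plausible minimal version derives a child node's submitter_id from its
# parent's by swapping the node prefix and appending the extra key/value pairs;
# the exact format of real submitter_ids is an assumption.
def derived_submitter_id_sketch(parent_sid, parent_node, child_node, args):
    """Hypothetical stand-in for the repo's derived_submitter_id helper."""
    sid = parent_sid.replace(parent_node, child_node, 1)
    for k, v in sorted(args.items()):
        sid += f"_{k}_{v}"
    return sid.lower().replace(" ", "_")

# e.g. derived_submitter_id_sketch("summary_location_country_us_state_il",
#                                  "summary_location", "summary_clinical",
#                                  {"date": "2020-06-01"})
# -> "summary_clinical_country_us_state_il_date_2020-06-01"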
class DS4C(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.program_name = "open"
        self.project_code = "DS4C"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.subjects = []
        self.demographics = []
        self.observations = []

    def files_to_submissions(self):
        with open(
            os.path.join(CURRENT_DIR, "data/ds4c_PatientInfo.csv"), newline=""
        ) as csvfile:
            reader = csv.reader(csvfile, delimiter=",")
            header = next(reader)
            print("Headers:", header)
            header = {k: v for v, k in enumerate(header)}

            n_1200012238 = 1
            for row in reader:
                patient_id = row[header["patient_id"]].strip()
                if patient_id == "1200012238":  # there are 2 rows for the same ID
                    patient_id = f"{patient_id}_{n_1200012238}"
                    n_1200012238 += 1

                # generate subject record
                subject = {
                    "submitter_id": patient_id,
                    "projects": [{"code": self.project_code}],
                }

                confirmed_date = row[header["confirmed_date"]].strip()
                if confirmed_date:
                    check_date_format(confirmed_date)
                    subject["date_confirmation"] = confirmed_date
                    subject["covid_19_status"] = "Positive"

                infected_by = row[header["infected_by"]].strip()
                if infected_by:
                    subject["infected_by"] = list(
                        map(lambda v: v.strip(), infected_by.split(","))
                    )

                deceased_date = row[header["deceased_date"]].strip()
                if deceased_date:
                    check_date_format(deceased_date)
                    subject["deceased_date"] = deceased_date

                # generate demographic record
                demographic = {
                    "submitter_id": f"demographic_{patient_id}",
                    "subjects": {"submitter_id": patient_id},
                    "age_decade": row[header["age"]].strip(),
                    "province_state": row[header["province"]].strip(),
                    "city": row[header["city"]].strip(),
                }

                country = row[header["country"]].strip()
                if country == "Korea":
                    demographic["country_region"] = "South Korea"
                elif country == "United States":
                    demographic["country_region"] = "USA"
                else:
                    demographic["country_region"] = country

                gender = row[header["sex"]].strip()
                demographic["gender"] = harmonize_gender(gender)
                demographic["year_of_birth"] = None

                # generate observation record
                observation = {
                    "submitter_id": f"observation_{patient_id}",
                    "subjects": {"submitter_id": patient_id},
                    "exposure": row[header["infection_case"]].strip(),
                }

                date_onset_symptoms = row[header["symptom_onset_date"]].strip()
                if date_onset_symptoms:
                    check_date_format(date_onset_symptoms)
                    observation["date_onset_symptoms"] = date_onset_symptoms

                state = row[header["state"]].strip()
                if state == "deceased":
                    subject["vital_status"] = "Dead"
                elif state == "isolated":
                    observation["isolation_status"] = "Isolated"
                elif state == "released":
                    observation["treatment_status"] = "Released"
                elif state:
                    raise Exception('State "{}" is unknown'.format(state))

                released_date = row[header["released_date"]].strip()
                if released_date:
                    check_date_format(released_date)
                    observation["released_date"] = released_date

                subject = {k: v if v else None for k, v in subject.items()}
                self.subjects.append(subject)
                demographic = {k: v for k, v in demographic.items() if v}
                self.demographics.append(demographic)
                observation = {k: v for k, v in observation.items() if v}
                self.observations.append(observation)

    def submit_metadata(self):
        print("Submitting data")

        print("Submitting subject data")
        for loc in self.subjects:
            loc_record = {"type": "subject"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting demographic data")
        for dem in self.demographics:
            dem_record = {"type": "demographic"}
            dem_record.update(dem)
            self.metadata_helper.add_record_to_submit(dem_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting observation data")
        for obs in self.observations:
            obs_record = {"type": "observation"}
            obs_record.update(obs)
            self.metadata_helper.add_record_to_submit(obs_record)
        self.metadata_helper.batch_submit_records()
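# Hedged sketch (assumption): `check_date_format` is imported from elsewhere in
# the repo and is not shown here; a minimal validator consistent with how it is
# used above might be:
import re

def check_date_format_sketch(date):
    """Hypothetical stand-in: raise if `date` is not in YYYY-MM-DD format."""
    if not re.match(r"^\d{4}-\d{2}-\d{2}$", date):
        raise ValueError(f"Date '{date}' is not in YYYY-MM-DD format")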
class IDPH_ZIPCODE(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.program_name = "open"
        self.project_code = "IDPH-zipcode"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.country = "US"
        self.state = "IL"
        self.summary_locations = []
        self.summary_clinicals = []

    def files_to_submissions(self):
        """
        Reads the JSON file and converts the data to Sheepdog records
        """
        latest_submitted_date = self.metadata_helper.get_latest_submitted_date_idph()
        today = datetime.date.today()
        if latest_submitted_date == today:
            print("Nothing to submit: today and latest submitted date are the same.")
            return

        today_str = today.strftime("%Y%m%d")
        print(f"Getting data for date: {today_str}")
        url = "http://dph.illinois.gov/sitefiles/COVIDZip.json?nocache=1"
        self.parse_file(latest_submitted_date, url)

    def parse_file(self, latest_submitted_date, url):
        """
        Converts a JSON file to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations` and `self.summary_clinicals`.

        Args:
            latest_submitted_date (date): the latest date already submitted
            url (str): URL at which the JSON file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            date = idph_get_date(data["LastUpdateDate"])

            if latest_submitted_date and date == latest_submitted_date.strftime(
                "%Y-%m-%d"
            ):
                print(
                    "Nothing to submit: latest submitted date and date from data are the same."
                )
                return

            for zipcode_values in data["zip_values"]:
                (summary_location, summary_clinical) = self.parse_zipcode(
                    date, zipcode_values
                )
                self.summary_locations.append(summary_location)
                self.summary_clinicals.append(summary_clinical)

    def parse_zipcode(self, date, zipcode_values):
        """
        From zipcode-level data, generate the data we can submit via Sheepdog
        """
        zipcode = zipcode_values["zip"]

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {"country": self.country, "state": self.state, "zipcode": zipcode},
        )
        summary_location = {
            "submitter_id": summary_location_submitter_id,
            "country_region": self.country,
            "province_state": self.state,
            "zipcode": zipcode,
            "projects": [{"code": self.project_code}],
        }

        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"date": date},
        )
        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "confirmed": zipcode_values["confirmed_cases"],
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
        }

        if "demographics" in zipcode_values:
            demographic = zipcode_values["demographics"]
            for k, v in fields_mapping.items():
                field, mapping = v
                demographic_group = demographic[k]

                for item in demographic_group:
                    dst_field = mapping[item[field]]
                    if dst_field:
                        if "count" in item:
                            age_group_count_field = "{}_{}".format(
                                mapping[item[field]], "count"
                            )
                            summary_clinical[age_group_count_field] = item["count"]
                        if "tested" in item:
                            age_group_tested_field = "{}_{}".format(
                                mapping[item[field]], "tested"
                            )
                            summary_clinical[age_group_tested_field] = item["tested"]

        return summary_location, summary_clinical

    def submit_metadata(self):
        """
        Submits the data in `self.summary_locations` and `self.summary_clinicals`
        to Sheepdog.
        """
        print("Submitting data...")

        print("Submitting summary_location data")
        for sl in self.summary_locations:
            sl_record = {"type": "summary_location"}
            sl_record.update(sl)
            self.metadata_helper.add_record_to_submit(sl_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals:
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
class DSCI(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.program_name = "open"
        self.project_code = "DSCI"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.subjects = []
        self.demographics = []
        self.observations = []

    def files_to_submissions(self):
        with open(
            os.path.join(CURRENT_DIR, "data/dsci_patient.csv"), newline=""
        ) as csvfile:
            reader = csv.reader(csvfile, delimiter=",", quotechar="|")
            header = next(reader)
            print("Headers:", header)
            header = {k: v for v, k in enumerate(header)}

            for row in reader:
                patient_id = row[header["patient_id"]].strip()

                # generate subject record
                subject = {
                    "submitter_id": patient_id,
                    "projects": [{"code": self.project_code}],
                }

                infected_by = row[header["contacted_with"]].strip()
                if infected_by:
                    subject["infected_by"] = list(
                        map(lambda v: v.strip(), infected_by.split(","))
                    )

                confirmed_date = row[header["confirmed_date"]].strip()
                if confirmed_date:
                    confirmed_date = format_date(confirmed_date)
                    check_date_format(confirmed_date)
                    subject["date_confirmation"] = confirmed_date
                    subject["covid_19_status"] = "Positive"

                deceased_date = row[header["deceased_date"]].strip()
                if deceased_date:
                    deceased_date = format_date(deceased_date)
                    check_date_format(deceased_date)
                    subject["deceased_date"] = deceased_date

                # generate demographic record
                demographic = {
                    "submitter_id": f"demographic_{patient_id}",
                    "subjects": {"submitter_id": f"{patient_id}"},
                }

                cols = {"age": "age", "province": "province_state"}
                for k, v in cols.items():
                    value = row[header[k]].strip()
                    if value:
                        demographic[v] = value
                if "age" in demographic:
                    demographic["age"] = int(demographic["age"])

                gender = row[header["gender"]].strip()
                demographic["gender"] = harmonize_gender(gender)

                nationality = row[header["nationality"]].strip()
                if nationality == "indonesia":
                    demographic["country_region"] = "Indonesia"
                elif nationality == "foreigner":
                    pass
                elif nationality:
                    raise Exception('Nationality "{}" is unknown'.format(nationality))

                # generate observation record
                observation = {
                    "submitter_id": f"observation_{patient_id}",
                    "subjects": {"submitter_id": f"{patient_id}"},
                }

                hospital = row[header["hospital"]].strip()
                if hospital:
                    observation["hospital"] = hospital

                state = row[header["current_state"]].strip()
                if state == "deceased":
                    subject["vital_status"] = "Dead"
                elif state == "isolated":
                    observation["isolation_status"] = "Isolated"
                elif state == "released":
                    observation["treatment_status"] = "Released"
                elif state:
                    raise Exception('State "{}" is unknown'.format(state))

                released_date = row[header["released_date"]].strip()
                if released_date:
                    released_date = format_date(released_date)
                    check_date_format(released_date)
                    observation["released_date"] = released_date

                self.subjects.append(subject)
                self.demographics.append(demographic)
                self.observations.append(observation)

    def submit_metadata(self):
        print("Submitting data")

        print("Submitting subject data")
        for loc in self.subjects:
            loc_record = {"type": "subject"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting demographic data")
        for dem in self.demographics:
            dem_record = {"type": "demographic"}
            dem_record.update(dem)
            self.metadata_helper.add_record_to_submit(dem_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting observation data")
        for obs in self.observations:
            obs_record = {"type": "observation"}
            obs_record.update(obs)
            self.metadata_helper.add_record_to_submit(obs_record)
        self.metadata_helper.batch_submit_records()
class CCMAP(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_locations = []
        self.summary_clinicals = []
        self.summary_socio_demographics = []
        self.program_name = "open"
        self.project_code = "CCMap"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        # structure is
        # (csv field name, (node type, node field name, type of field))
        county_fields = [
            ("fips_code", ("summary_location", "FIPS", int)),
            ("State", ("summary_location", "province_state", str)),
            ("County Name", ("summary_location", "county", str)),
            ("Staffed All Beds", ("summary_clinical", "staffed_all_beds", int)),
            ("Staffed ICU Beds", ("summary_clinical", "staffed_icu_beds", int)),
            ("Licensed All Beds", ("summary_clinical", "licensed_all_beds", int)),
            (
                "All Bed Occupancy Rate",
                ("summary_clinical", "all_bed_occupancy_rate", float),
            ),
            (
                "ICU Bed Occupancy Rate",
                ("summary_clinical", "icu_bed_occupancy_rate", float),
            ),
            ("Population", ("summary_clinical", "population", int)),
            ("Population (20+)", ("summary_clinical", "population_gtr_20", int)),
            ("Population (65+)", ("summary_clinical", "population_gtr_65", int)),
            (
                "Staffed All Beds [Per 1000 People]",
                ("summary_clinical", "staffed_all_beds_per_1000", float),
            ),
            (
                "Staffed All Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "staffed_all_beds_per_1000_gtr_20", float),
            ),
            (
                "Staffed All Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "staffed_all_beds_per_1000_gtr_65", float),
            ),
            (
                "Staffed ICU Beds [Per 1000 People]",
                ("summary_clinical", "staffed_icu_beds_per_1000", float),
            ),
            (
                "Staffed ICU Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "staffed_icu_beds_per_1000_gtr_20", float),
            ),
            (
                "Staffed ICU Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "staffed_icu_beds_per_1000_gtr_65", float),
            ),
            (
                "Licensed All Beds [Per 1000 People]",
                ("summary_clinical", "licensed_all_beds_per_1000", float),
            ),
            (
                "Licensed All Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "licensed_all_beds_per_1000_gtr_20", float),
            ),
            (
                "Licensed All Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "licensed_all_beds_per_1000_gtr_65", float),
            ),
        ]

        state_fields = [
            ("State", ("summary_location", None, int)),
            ("State Name", ("summary_location", "province_state", str)),
            ("Staffed All Beds", ("summary_clinical", "staffed_all_beds", int)),
            ("Staffed ICU Beds", ("summary_clinical", "staffed_icu_beds", int)),
            ("Licensed All Beds", ("summary_clinical", "licensed_all_beds", int)),
            (
                "All Bed Occupancy Rate",
                ("summary_clinical", "all_bed_occupancy_rate", float),
            ),
            (
                "ICU Bed Occupancy Rate",
                ("summary_clinical", "icu_bed_occupancy_rate", float),
            ),
            ("Population", ("summary_clinical", "population", int)),
            (
                "Population (20+)",
                ("summary_socio_demographic", "population_gtr_20", int),
            ),
            (
                "Population (65+)",
                ("summary_socio_demographic", "population_gtr_65", int),
            ),
            (
                "Staffed All Beds [Per 1000 People]",
                ("summary_clinical", "staffed_all_beds_per_1000", float),
            ),
            (
                "Staffed All Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "staffed_all_beds_per_1000_gtr_20", float),
            ),
            (
                "Staffed All Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "staffed_all_beds_per_1000_gtr_65", float),
            ),
            (
                "Staffed ICU Beds [Per 1000 People]",
                ("summary_clinical", "staffed_icu_beds_per_1000", float),
            ),
            (
                "Staffed ICU Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "staffed_icu_beds_per_1000_gtr_20", float),
            ),
            (
                "Staffed ICU Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "staffed_icu_beds_per_1000_gtr_65", float),
            ),
            (
                "Licensed All Beds [Per 1000 People]",
                ("summary_clinical", "licensed_all_beds_per_1000", float),
            ),
            (
                "Licensed All Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "licensed_all_beds_per_1000_gtr_20", float),
            ),
            (
                "Licensed All Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "licensed_all_beds_per_1000_gtr_65", float),
            ),
            (
                "Estimated No. Full-Featured Mechanical Ventilators (2010 study estimate)",
                ("summary_clinical", "estimated_full_mech_ventilators", int),
            ),
            (
                "Estimated No. Full-Featured Mechanical Ventilators per 100,000 Population (2010 study estimate)",
                (
                    "summary_clinical",
                    "estimated_full_mech_ventilators_per_100000",
                    float,
                ),
            ),
            (
                "Estimated No. Pediatrics-Capable Full-Feature Mechanical Ventilators (2010 study estimate)",
                ("summary_clinical", "estimated_full_mech_pediatric_ventilators", int),
            ),
            (
                "Estimated No. Full-Feature Mechanical Ventilators, Pediatrics Capable per 100,000 Population <14 y (2010 study estimate)",
                (
                    "summary_clinical",
                    "estimated_full_mech_pediatric_ventilators_per_100000",
                    float,
                ),
            ),
        ]

        self.headers_mapping = {
            "county": {field: mapping for field, mapping in county_fields},
            "state": {field: mapping for field, mapping in state_fields},
        }

    def files_to_submissions(self):
        """
        Reads CSV files and converts the data to Sheepdog records
        """
        repo = "covidcaremap/covid19-healthsystemcapacity"
        branch = "master"
        files = {
            "county": "data/published/us_healthcare_capacity-county-CovidCareMap.csv",
            "state": "data/published/us_healthcare_capacity-state-CovidCareMap.csv",
        }
        for k, url in files.items():
            self.parse_file(repo, branch, url, csv_type=k)

    def get_last_update_date_file(self, repo, url):
        """
        Gets the latest update time for a specific file in the repository

        :param repo: "user/repository" for the GitHub repository
        :param url: path to the file
        :return: last update (commit) datetime for the file
        """
        api_url = "https://api.github.com/repos"
        commit_info_url = "{}/{}/{}{}{}".format(
            api_url, repo, "commits?path=", url, "&page=1&per_page=1"
        )
        with closing(requests.get(commit_info_url, stream=True)) as r:
            commit_info = r.json()
            last_update_date = commit_info[0]["commit"]["committer"]["date"]
            return datetime.datetime.strptime(last_update_date, "%Y-%m-%dT%H:%M:%SZ")

    def parse_file(self, repo, branch, file_url, csv_type):
        last_update_date = self.get_last_update_date_file(repo, file_url)
        raw_url = "https://raw.githubusercontent.com"
        url = "{}/{}/{}/{}".format(raw_url, repo, branch, file_url)
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')

            headers = next(reader)
            assert (
                headers[0] != "404: Not Found"
            ), "Unable to get file contents, received {}.".format(headers)

            expected_h = list(self.headers_mapping[csv_type].keys())
            assert set(expected_h).issubset(
                set(headers)
            ), "CSV headers have changed (expected {}, got {}). We may need to update the ETL code".format(
                expected_h, headers
            )

            # wrap each mapping value with the CSV column index
            for i, f in enumerate(headers):
                if f in self.headers_mapping[csv_type]:
                    old_value = self.headers_mapping[csv_type][f]
                    self.headers_mapping[csv_type][f] = (i, old_value)

            for row in reader:
                (
                    summary_location,
                    summary_clinical,
                    summary_socio_demographic,
                ) = self.parse_row(
                    row, self.headers_mapping[csv_type], last_update_date
                )

                self.summary_locations.append(summary_location)
                self.summary_clinicals.append(summary_clinical)
                self.summary_socio_demographics.append(summary_socio_demographic)

    def parse_row(self, row, mapping, last_update_date):
        summary_location = {"country_region": "US"}
        summary_clinical = {}
        summary_socio_demographic = {}

        for k, (i, (node_type, node_field, type_conv)) in mapping.items():
            try:
                if node_field:
                    value = row[i]
                    if value:
                        if node_type == "summary_location":
                            summary_location[node_field] = type_conv(value)
                        if node_type == "summary_clinical":
                            if type_conv == int:
                                summary_clinical[node_field] = type_conv(float(value))
                            else:
                                summary_clinical[node_field] = type_conv(value)
                        if node_type == "summary_socio_demographic":
                            if type_conv == int:
                                summary_socio_demographic[node_field] = type_conv(
                                    float(value)
                                )
                            else:
                                summary_socio_demographic[node_field] = type_conv(
                                    value
                                )
                            summary_clinical[
                                node_field
                            ] = None  # TODO: remove when the properties are removed from dictionary
            except Exception:
                print(
                    "Error with field: {}, problematic value: {}".format(
                        node_field, row[i]
                    )
                )

        summary_location_submitter_id = format_location_submitter_id(summary_location)
        summary_location["submitter_id"] = summary_location_submitter_id
        summary_location["projects"] = [{"code": self.project_code}]

        state = summary_location["province_state"]
        if len(state) == 2:
            summary_location["province_state"] = state_to_long(state)

        summary_clinical["submitter_id"] = format_summary_clinical_submitter_id(
            summary_location_submitter_id,
            date=last_update_date.strftime("%Y-%m-%d"),
        )
        summary_clinical["summary_locations"] = [
            {"submitter_id": summary_location_submitter_id}
        ]

        summary_socio_demographic[
            "submitter_id"
        ] = format_summary_socio_demographic_id(
            summary_location_submitter_id,
            date=last_update_date.strftime("%Y-%m-%d"),
        )
        summary_socio_demographic["summary_locations"] = [
            {"submitter_id": summary_location_submitter_id}
        ]

        return summary_location, summary_clinical, summary_socio_demographic

    def submit_metadata(self):
        print("Submitting summary_location data")
        for loc in self.summary_locations:
            loc_record = {"type": "summary_location"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals:
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_socio_demographic data")
        for sc in self.summary_socio_demographics:
            sc_record = {"type": "summary_socio_demographic"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
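# Worked example of the header-index rewrite done in parse_file above: each
# mapping value gets wrapped with the CSV column position so parse_row can index
# rows directly (a standalone re-run of the same logic with toy values):
mapping = {"State": ("summary_location", "province_state", str)}
headers = ["fips_code", "State", "County Name"]
for i, h in enumerate(headers):
    if h in mapping:
        mapping[h] = (i, mapping[h])
assert mapping["State"] == (1, ("summary_location", "province_state", str))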
class DSFSI(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.subjects = []
        self.demographics = []
        self.observations = []
        self.program_name = "open"
        self.project_code = "DSFSI"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        # structure is
        # (csv field name, (node type, node field name, type of field))
        # a (None, None, None) mapping drops the CSV column entirely
        self.countries_fields = [
            ("case_id", ("subject", "submitter_id", str)),
            ("origin_case_id", (None, None, None)),
            ("date", ("observation", "reporting_date", normalize_date)),
            ("age", ("demographic", "age", normalize_age)),
            ("gender", ("demographic", "gender", normalize_gender)),
            ("city", ("demographic", "city", str)),
            ("province/state", ("demographic", "province_state", str)),
            ("country", ("demographic", "country_region", str)),
            (
                "current_status",
                ("subject", "tmp_current_status", normalize_current_status),
            ),
            ("source", ("observation", "reporting_source_url", str)),
            ("symptoms", ("observation", "symptoms", normalize_symptoms)),
            (
                "date_onset_symptoms",
                ("observation", "date_onset_symptoms", normalize_date),
            ),
            (
                "date_admission_hospital",
                ("observation", "date_admission_hospital", normalize_date),
            ),
            ("date_confirmation", ("subject", "date_confirmation", normalize_date)),
            ("underlying_conditions", (None, None, None)),
            ("travel_history_dates", ("subject", "travel_history_dates", str)),
            ("travel_history_location", ("subject", "travel_history_location", str)),
            ("death_date", ("subject", "deceased_date", normalize_date)),
            ("notes_for_discussion", (None, None, None)),
        ]

    def files_to_submissions(self):
        """
        Reads CSV files and converts the data to Sheepdog records
        """
        urls = {
            "Algeria": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-algeria.csv",
            "Angola": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-angola.csv",
            "Benin": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-benin.csv",
            "Burkina Faso": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-burkina-faso.csv",
            "Cabo Verde": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-cabo-verde.csv",
            "Cameroon": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-cameroon.csv",
            "Central African Republic": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-central-african-republic.csv",
            "Chad": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-chad.csv",
            "Côte d'Ivoire": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-cote-divoire.csv",
            "Democratic Republic of the Congo": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-democratic-republic-of-the-congo.csv",
            "Djibouti": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-djibouti.csv",
            # an Egypt dataset exists upstream, but it is not useful and is omitted on purpose
            "Equatorial Guinea": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-equatorial-guinea.csv",
            "Eritrea": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-eritrea.csv",
            "Eswatini": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-eswatini.csv",
            "Ethiopia": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-ethiopia.csv",
            "Gabon": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-gabon.csv",
            "Gambia": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-gambia.csv",
            "Ghana": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-ghana.csv",
            "Guinea Bissau": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-guinea-bissau.csv",
            "Guinea": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-guinea.csv",
            "Kenya": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-kenya.csv",
            "Liberia": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-liberia.csv",
            "Madagascar": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-madagascar.csv",
            "Mali": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-mali.csv",
            "Mauritania": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-mauritania.csv",
            "Mauritius": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-mauritius.csv",
            "Mozambique": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-mozambique.csv",
            "Namibia": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-namibia.csv",
            "Niger": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-niger.csv",
            "Nigeria": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-nigeria.csv",
            "Republic of Congo": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-republic-of-congo.csv",
            "Rwanda": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-rwanda.csv",
            "Senegal": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-senegal.csv",
            "Seychelles": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-seychelles.csv",
            "Somalia": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-somalia.csv",
            "South Africa": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-south-africa.csv",
            "Sudan": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-sudan.csv",
            "Tanzania": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-tanzania.csv",
            "Togo": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-togo.csv",
            "Uganda": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-uganda.csv",
            "Zambia": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-zambia.csv",
            "Zimbabwe": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-zimbabwe.csv",
        }

        for k, url in urls.items():
            self.parse_file(k, url)

    def parse_file(self, country, url):
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')

            headers = next(reader)
            assert (
                headers[0] != "404: Not Found"
            ), "Unable to get file contents, received {}.".format(headers)

            countries_with_empty_columns = [
                "Angola",
                "Burkina Faso",
                "Cabo Verde",
                "Cameroon",
                "Central African Republic",
                "Chad",
                "Côte d'Ivoire",
                "Democratic Republic of the Congo",
                "Djibouti",
                "Equatorial Guinea",
                "Eritrea",
                "Eswatini",
                "Gabon",
                "Guinea Bissau",
                "Guinea",
                "Liberia",
                "Madagascar",
                "Mali",
                "Mauritania",
                "Mauritius",
                "Mozambique",
                "Republic of Congo",
                "Senegal",
                "Seychelles",
                "Somalia",
                "Sudan",
                "Tanzania",
                "Togo",
                "Uganda",
                "Zambia",
            ]
            countries_with_mistyped_column = ["South Africa"]
            countries_without_notes = [
                "Eritrea",
                "Eswatini",
                "Gabon",
                "Madagascar",
                "Mali",
                "Mauritania",
                "Mauritius",
                "Mozambique",
                "Republic of Congo",
                "Senegal",
                "Seychelles",
                "Somalia",
                "Sudan",
                "Tanzania",
                "Togo",
                "Uganda",
                "Zambia",
            ]

            # Almost every country has some quirk in its CSV format;
            # adjust the expected column list accordingly
            tmp = copy.deepcopy(self.countries_fields)
            if country in countries_with_empty_columns:
                tmp.insert(0, ("", (None, None, None)))
            if country in countries_with_mistyped_column:
                tmp[14] = ("underlyng_conditions", (None, None, None))
            if country in countries_without_notes:
                del tmp[-1]
            if country == "Ethiopia":
                tmp.insert(8, ("original_status", (None, None, None)))
                del tmp[10]
                tmp.insert(14, ("closed_date", (None, None, None)))
                tmp.insert(16, ("quarantine_status", (None, None, None)))
                del tmp[19]
                tmp.insert(19, ("contact", (None, None, None)))
                tmp.append(("source", (None, None, None)))
            if country == "Niger":
                del tmp[9]
                tmp.insert(9, ("source 1", (None, None, None)))
                tmp.insert(10, ("source 2", (None, None, None)))

            updated_headers_mapping = {
                field: (k, mapping) for k, (field, mapping) in enumerate(tmp)
            }

            expected_h = list(updated_headers_mapping.keys())
            obtained_h = headers[: len(expected_h)]
            obtained_h = [header.strip() for header in obtained_h]
            assert (
                obtained_h == expected_h
            ), "CSV headers have changed\nexpected: {}\n     got: {}".format(
                expected_h, obtained_h
            )

            # The South Africa dataset only has 274 usable cases; everything
            # after that repeats the same data and carries no meaningful
            # information, so we stop there
            idx = 0
            last = None
            if country == "South Africa":
                last = 275

            for row in reader:
                idx += 1
                if last and idx == last:
                    break
                subject, demographic, observation = self.parse_row(
                    country, row, updated_headers_mapping
                )
                self.subjects.append(subject)
                self.demographics.append(demographic)
                self.observations.append(observation)

    def parse_row(self, country, row, mapping):
        subject = {}
        demographic = {}
        observation = {}

        for (i, (node_type, node_field, type_conv)) in mapping.values():
            if node_field:
                value = row[i]
                if value:
                    # a "type of field" of None is used to remove the value
                    if node_type == "subject":
                        if type_conv is None:
                            subject[node_field] = None
                            continue
                        subject[node_field] = type_conv(value)
                    if node_type == "demographic":
                        if type_conv is None:
                            demographic[node_field] = None
                            continue
                        demographic[node_field] = type_conv(value)
                    if node_type == "observation":
                        if type_conv is None:
                            observation[node_field] = None
                            continue
                        observation[node_field] = type_conv(value)

        # init subject node
        case_id = subject["submitter_id"]
        subject["submitter_id"] = format_subject_submitter_id(
            country, subject["submitter_id"]
        )
        subject["projects"] = [{"code": self.project_code}]

        # Only the South Africa dataset has records sharing the same case_id.
        # Because this code only sees individual rows, the disambiguation is
        # hardcoded for now
        if country == "South Africa" and case_id == "110":
            if demographic["age"] == 34:
                subject["submitter_id"] += "_1"
            elif demographic["age"] == 27:
                subject["submitter_id"] += "_2"

        # init demographic node
        demographic["submitter_id"] = format_node_submitter_id(
            subject["submitter_id"], "demographic"
        )
        demographic["subjects"] = [{"submitter_id": subject["submitter_id"]}]

        # init observation node
        observation["submitter_id"] = format_node_submitter_id(
            subject["submitter_id"], "observation"
        )
        observation["subjects"] = [{"submitter_id": subject["submitter_id"]}]

        if subject.get("date_confirmation"):
            subject["covid_19_status"] = "Positive"

        state = subject.get("tmp_current_status")
        if "tmp_current_status" in subject:
            del subject["tmp_current_status"]
        if state == "deceased":
            subject["vital_status"] = "Dead"
        elif state in ["alive"]:
            subject["vital_status"] = state.capitalize()
        elif state in ["positive"]:
            subject["covid_19_status"] = state.capitalize()
        elif state == "isolated":
            observation["isolation_status"] = state.capitalize()
        elif state in ["released", "recovered", "in recovery", "in treatment"]:
            observation["treatment_status"] = state.capitalize()
        elif state in ["stable", "unstable", "critical"]:
            observation["condition"] = state.capitalize()
        elif state:
            raise Exception('State "{}" is unknown'.format(state))

        if "travel_history_dates" in subject:
            date_list = normalize_date_list(subject["travel_history_dates"])
            if date_list:
                subject["travel_history_dates"] = date_list
            else:
                del subject["travel_history_dates"]

        if "travel_history_location" in subject:
            loc_list = normalize_location_list(subject["travel_history_location"])
            if loc_list:
                subject["travel_history_location"] = loc_list
            else:
                del subject["travel_history_location"]

        return subject, demographic, observation

    def submit_metadata(self):
        print("Submitting subject data")
        for loc in self.subjects:
            loc_record = {"type": "subject"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting demographic data")
        for dem in self.demographics:
            dem_record = {"type": "demographic"}
            dem_record.update(dem)
            self.metadata_helper.add_record_to_submit(dem_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting observation data")
        for obs in self.observations:
            obs_record = {"type": "observation"}
            obs_record.update(obs)
            self.metadata_helper.add_record_to_submit(obs_record)
        self.metadata_helper.batch_submit_records()
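# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: the DSFSI field
# mapping above assumes converter helpers (normalize_date, normalize_age,
# normalize_gender, ...) imported from elsewhere in this repo. Assuming the
# line lists use day-first dates such as "25.02.2020", a minimal stand-in
# could look like this; the real helper may differ.
def _normalize_date_sketch(value):
    """Parse a day-first line-list date, e.g. "25.02.2020" -> "2020-02-25"."""
    for fmt in ("%d.%m.%Y", "%d/%m/%Y", "%Y-%m-%d"):
        try:
            return datetime.datetime.strptime(value.strip(), fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue
    return None  # callers drop values that cannot be parsed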
class IDPH(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.program_name = "open"
        self.project_code = "IDPH"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.country = "US"
        self.state = "IL"
        self.county_dict = {}
        self.il_counties()
        self.summary_locations = []
        self.summary_clinicals = []

    def get_location_and_clinical_submitter_id(self, county, date):
        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {"country": self.country, "state": self.state, "county": county}
            if county is not None
            else {"country": self.country, "state": self.state},
        )
        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"date": date},
        )
        return summary_location_submitter_id, summary_clinical_submitter_id

    def il_counties(self):
        with open(
            os.path.join(CURRENT_DIR, "data/IL_counties_central_coords_lat_long.tsv")
        ) as f:
            counties = f.readlines()
            counties = counties[1:]
            counties = map(lambda l: l.strip().split("\t"), counties)
            for county, lat, lon in counties:
                self.county_dict[county] = {"lat": lat, "lon": lon}

    def files_to_submissions(self):
        """
        Reads the JSON file and converts the data to Sheepdog records.
        """
        latest_submitted_date = self.metadata_helper.get_latest_submitted_date_idph()
        today = datetime.date.today()
        if latest_submitted_date == today:
            print("Nothing to submit: today and latest submitted date are the same.")
            return

        today_str = today.strftime("%Y%m%d")
        print(f"Getting data for date: {today_str}")

        # the URL changed on April 1, 2020
        if today > datetime.date(2020, 3, 31):
            url = "http://www.dph.illinois.gov/sitefiles/COVIDTestResults.json"
        else:
            url = f"https://www.dph.illinois.gov/sites/default/files/COVID19/COVID19CountyResults{today_str}.json"
        self.parse_file(latest_submitted_date, url)

    def parse_file(self, latest_submitted_date, url):
        """
        Converts a JSON file to data we can submit via Sheepdog. Stores
        the records to submit in `self.summary_locations` and
        `self.summary_clinicals`.

        Args:
            latest_submitted_date (date): the latest submitted date
            url (str): URL at which the JSON file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            date = idph_get_date(data["LastUpdateDate"])

            if latest_submitted_date and date == latest_submitted_date.strftime(
                "%Y-%m-%d"
            ):
                print(
                    "Nothing to submit: latest submitted date and date from data are the same."
                )
                return

            for county in data["characteristics_by_county"]["values"]:
                demographic = data.get("demographics", None)
                summary_location, summary_clinical = self.parse_county(
                    date, county, demographic
                )
                self.summary_locations.append(summary_location)
                self.summary_clinicals.append(summary_clinical)

            for illinois_data in data["state_testing_results"]["values"]:
                illinois_historic_data = self.parse_historical_data(illinois_data)
                self.summary_clinicals.append(illinois_historic_data)

    def parse_historical_data(self, illinois_data):
        """
        Parses historical state-level data. The "summary_location" node is
        created from the "characteristics_by_county" data.

        Args:
            illinois_data (dict): JSON with "testDate", "total_tested",
                "confirmed_cases" and "deaths"

        Returns:
            dict: "summary_clinical" node for Sheepdog
        """
        county = "Illinois"
        date = datetime.datetime.strptime(
            illinois_data["testDate"], "%m/%d/%Y"
        ).strftime("%Y-%m-%d")
        (
            summary_location_submitter_id,
            summary_clinical_submitter_id,
        ) = self.get_location_and_clinical_submitter_id(county, date)

        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "confirmed": illinois_data["confirmed_cases"],
            "testing": illinois_data["total_tested"],
            "deaths": illinois_data["deaths"],
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
        }
        return summary_clinical

    def parse_county(self, date, county_json, demographic):
        """
        From county-level data, generates the data we can submit via Sheepdog.

        Args:
            date (date): date
            county_json (dict): JSON for county statistics

        Returns:
            (dict, dict): "summary_location" and "summary_clinical" records
        """
        county = county_json["County"]
        (
            summary_location_submitter_id,
            summary_clinical_submitter_id,
        ) = self.get_location_and_clinical_submitter_id(county, date)

        summary_location = {
            "submitter_id": summary_location_submitter_id,
            "country_region": self.country,
            "province_state": self.state,
            "projects": [{"code": self.project_code}],
        }

        # the IDPH data uses "Illinois" in the "County" field for aggregated
        # data; in Gen3 this corresponds to a location with "province_state"
        # equal to "IL" and no "county" field
        if county != "Illinois":
            summary_location["county"] = county

        if county in self.county_dict:
            summary_location["latitude"] = self.county_dict[county]["lat"]
            summary_location["longitude"] = self.county_dict[county]["lon"]
        else:
            if county_json["lat"] != 0:
                summary_location["latitude"] = str(county_json["lat"])
            if county_json["lon"] != 0:
                summary_location["longitude"] = str(county_json["lon"])

        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "confirmed": county_json["confirmed_cases"],
            "testing": county_json["total_tested"],
            "deaths": county_json["deaths"],
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
        }
        if "negative" in county_json:
            summary_clinical["negative"] = county_json["negative"]

        if county == "Illinois" and demographic:
            for k, v in fields_mapping.items():
                field, mapping = v
                demographic_group = demographic[k]
                for item in demographic_group:
                    dst_field = mapping[item[field]]
                    if dst_field:
                        if "count" in item:
                            age_group_count_field = "{}_{}".format(
                                mapping[item[field]], "count"
                            )
                            summary_clinical[age_group_count_field] = item["count"]
                        if "tested" in item:
                            age_group_tested_field = "{}_{}".format(
                                mapping[item[field]], "tested"
                            )
                            summary_clinical[age_group_tested_field] = item["tested"]

        return summary_location, summary_clinical

    def submit_metadata(self):
        """
        Submits the data in `self.summary_locations` and
        `self.summary_clinicals` to Sheepdog.
        """
        print("Submitting data...")

        print("Submitting summary_location data")
        for sl in self.summary_locations:
            sl_record = {"type": "summary_location"}
            sl_record.update(sl)
            self.metadata_helper.add_record_to_submit(sl_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals:
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
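# ---------------------------------------------------------------------------
# Illustrative usage sketch, not part of the original module: every ETL class
# in this file follows the same two-phase protocol, so a hypothetical driver
# (defined but never called here; the arguments are placeholders) would look
# like this.
def _run_etl_sketch(etl_class, base_url, access_token, s3_bucket=None):
    etl = etl_class(base_url, access_token, s3_bucket)
    etl.files_to_submissions()  # phase 1: fetch inputs and build records in memory
    etl.submit_metadata()  # phase 2: batch-submit the records to Sheepdog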
class COXRAY(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.program_name = "open"
        self.project_code = "COXRAY"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.file_helper = FileHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.nodes = {
            "core_metadata_collection": [],
            "study": [],
            "subject": [],
            "observation": [],
            "follow_up": [],
            "demographic": [],
            "imaging_file": [],
        }

    def files_to_submissions(self):
        with open(Path(COXRAY_DATA_PATH).joinpath("metadata.csv")) as f:
            reader = csv.reader(f, delimiter=",", quotechar='"')
            headers = next(reader)
            for row in reader:
                row_nodes = self.parse_row(headers, row)
                for k, v in row_nodes.items():
                    self.nodes[k].append(v)

    def parse_row(self, headers, row):
        cmc_submitter_id = format_submitter_id("cmc_coxray", {})
        subject_submitter_id = format_submitter_id(
            "subject_coxray", {"patientid": row[headers.index("patientid")]}
        )
        observation_submitter_id = derived_submitter_id(
            subject_submitter_id, "subject_coxray", "observation_coxray", {}
        )
        follow_up_submitter_id = derived_submitter_id(
            subject_submitter_id,
            "subject_coxray",
            "follow_up_coxray",
            {"offset": row[headers.index("offset")]},
        )
        demographic_submitter_id = derived_submitter_id(
            subject_submitter_id, "subject_coxray", "demographic_coxray", {}
        )
        imaging_file_submitter_id = format_submitter_id(
            "imaging_file_coxray", {"filename": row[headers.index("filename")]}
        )
        study_submitter_id = format_submitter_id(
            "study_coxray", {"doi": row[headers.index("doi")]}
        )

        filename = Path(row[headers.index("filename")])
        filepath = Path(COXRAY_DATA_PATH).joinpath("images", filename)
        filepath_exist = filepath.exists()

        nodes = {
            "core_metadata_collection": {
                "submitter_id": cmc_submitter_id,
                "projects": [{"code": self.project_code}],
            },
            "study": {
                "submitter_id": study_submitter_id,
                "projects": [{"code": self.project_code}],
            },
            "subject": {
                "submitter_id": subject_submitter_id,
                "projects": [{"code": self.project_code}],
                "studies": [{"submitter_id": study_submitter_id}],
            },
            "observation": {
                "submitter_id": observation_submitter_id,
                "subjects": [{"submitter_id": subject_submitter_id}],
            },
            "follow_up": {
                "submitter_id": follow_up_submitter_id,
                "subjects": [{"submitter_id": subject_submitter_id}],
            },
            "demographic": {
                "submitter_id": demographic_submitter_id,
                "subjects": [{"submitter_id": subject_submitter_id}],
            },
        }

        if filepath_exist:
            data_type = "".join(filename.suffixes)
            did, rev, md5sum, filesize = self.file_helper.find_by_name(
                filename=filename
            )
            assert (
                did
            ), f"file {filename} does not exist in the index, rerun COXRAY_FILE ETL"
            self.file_helper.update_authz(did=did, rev=rev)

            nodes["imaging_file"] = {
                "submitter_id": imaging_file_submitter_id,
                "subjects": [{"submitter_id": subject_submitter_id}],
                "follow_ups": [{"submitter_id": follow_up_submitter_id}],
                "core_metadata_collections": [{"submitter_id": cmc_submitter_id}],
                "data_type": data_type,
                "data_format": "Image File",
                "data_category": "X-Ray Image",
                "file_size": filesize,
                "md5sum": md5sum,
                "object_id": did,
            }
        else:
            print(f"subject references a file that doesn't exist on disk: {filepath}")

        for k, (node, field, converter) in fields_mapping.items():
            value = row[headers.index(k)]
            if node in nodes and value:
                if converter:
                    nodes[node][field] = converter(value)
                else:
                    nodes[node][field] = value

        return nodes

    def submit_metadata(self):
        print("Submitting data...")
        for k, v in self.nodes.items():
            submitter_id_exist = []
            print(f"Submitting {k} data...")
            for node in v:
                node_record = {"type": k}
                node_record.update(node)
                submitter_id = node_record["submitter_id"]
                # skip duplicate submitter_ids within this batch
                if submitter_id not in submitter_id_exist:
                    submitter_id_exist.append(submitter_id)
                    self.metadata_helper.add_record_to_submit(node_record)
            self.metadata_helper.batch_submit_records()
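# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: COXRAY and the other
# ETLs build IDs with the format_submitter_id/derived_submitter_id helpers
# imported from elsewhere in this repo. Judging from the call sites, they
# derive stable, human-readable IDs from a node name plus key/value parts;
# a minimal stand-in consistent with that usage (the real helpers may differ):
def _format_submitter_id_sketch(node_name, args):
    """e.g. ("subject_coxray", {"patientid": "42"}) -> "subject_coxray_42"."""
    parts = [node_name] + [str(v) for v in args.values()]
    return "_".join(parts).lower().replace(" ", "_")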
class COM_MOBILITY(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.program_name = "open"
        self.project_code = "Com-Mobility"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.expected_file_headers = [
            "country_region_code",
            "country_region",
            "sub_region_1",
            "sub_region_2",
            "metro_area",
            "iso_3166_2_code",
            "census_fips_code",
            "date",
            "retail_and_recreation_percent_change_from_baseline",
            "grocery_and_pharmacy_percent_change_from_baseline",
            "parks_percent_change_from_baseline",
            "transit_stations_percent_change_from_baseline",
            "workplaces_percent_change_from_baseline",
            "residential_percent_change_from_baseline",
        ]
        self.summary_locations = []
        self.summary_socio_demographics = []

    def files_to_submissions(self):
        """
        Reads the CSV file and converts the data to Sheepdog records
        """
        url = "https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv"
        self.parse_file(url)

    def parse_file(self, url):
        """
        Converts a CSV file to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations` and
        `self.summary_socio_demographics`. Ignores any records older than
        the last submission date.

        Args:
            url (str): URL at which the CSV file is available
        """
        self.last_submission_date_time = self.metadata_helper.get_last_submission()
        latest_data_datetime = None

        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')
            headers = next(reader)

            assert (
                headers[0] != "404: Not Found"
            ), "Unable to get file contents, received {}.".format(headers)

            assert set(self.expected_file_headers).issubset(
                set(headers)
            ), "CSV headers have changed (expected {} to be a subset of {}). We may need to update the ETL code".format(
                self.expected_file_headers, headers
            )

            for row in reader:
                # ignore empty rows
                if not row:
                    continue
                row_dict = dict(zip(headers, row))
                if row_dict["country_region_code"] != "US":
                    continue
                if (
                    not self.last_submission_date_time
                    or parse(row_dict["date"]) > self.last_submission_date_time
                ):
                    if latest_data_datetime is None or latest_data_datetime < parse(
                        row_dict["date"]
                    ):
                        latest_data_datetime = parse(row_dict["date"])

                    summary_location_submitter_id = format_submitter_id(
                        "summary_location",
                        row_dict["country_region_code"],
                        row_dict["sub_region_1"],
                        row_dict["sub_region_2"],
                        row_dict["metro_area"],
                        row_dict["date"],
                    )
                    summary_socio_demographic_submitter_id = format_submitter_id(
                        "summary_socio_demographic",
                        row_dict["country_region_code"],
                        row_dict["sub_region_1"],
                        row_dict["sub_region_2"],
                        row_dict["metro_area"],
                        row_dict["date"],
                    )

                    summary_location = {
                        "submitter_id": summary_location_submitter_id,
                        "projects": [{"code": self.project_code}],
                    }
                    summary_socio_demographic = {
                        "submitter_id": summary_socio_demographic_submitter_id,
                        "summary_locations": [
                            {"submitter_id": summary_location_submitter_id}
                        ],
                    }

                    for field in [
                        "country_region_code",
                        "country_region",
                        "sub_region_1",
                        "sub_region_2",
                        "metro_area",
                        "iso_3166_2_code",
                        "census_fips_code",
                    ]:
                        gen3_field, func = SPECIAL_MAP_FIELDS[field]
                        summary_location[gen3_field] = func(row_dict[field])

                    for field in [
                        "retail_and_recreation_percent_change_from_baseline",
                        "grocery_and_pharmacy_percent_change_from_baseline",
                        "parks_percent_change_from_baseline",
                        "transit_stations_percent_change_from_baseline",
                        "workplaces_percent_change_from_baseline",
                        "residential_percent_change_from_baseline",
                        "date",
                    ]:
                        gen3_field, func = SPECIAL_MAP_FIELDS[field]
                        summary_socio_demographic[gen3_field] = func(row_dict[field])

                    self.summary_locations.append(summary_location)
                    self.summary_socio_demographics.append(summary_socio_demographic)

        if latest_data_datetime:
            self.last_submission_date_time = latest_data_datetime

    def submit_metadata(self):
        """
        Batch submits all records in `self.summary_locations` and
        `self.summary_socio_demographics`, then updates the stored last
        submission date.
        """
        # summary_location records only need to be submitted once
        print("Submitting summary_location data")
        for loc in self.summary_locations:
            loc_record = {"type": "summary_location"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_socio_demographic data")
        for sc in self.summary_socio_demographics:
            sc_record = {"type": "summary_socio_demographic"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()

        self.metadata_helper.update_last_submission(
            self.last_submission_date_time.strftime("%Y-%m-%d")
        )
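# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: COM_MOBILITY looks up
# every CSV column in a SPECIAL_MAP_FIELDS dict of (gen3_field, converter)
# pairs defined elsewhere in this repo. A hypothetical fragment consistent
# with how parse_file() uses it (the real mapping may differ):
_SPECIAL_MAP_FIELDS_SKETCH = {
    # converters must tolerate empty strings, since the CSV leaves many cells blank
    "census_fips_code": ("FIPS", lambda v: int(float(v)) if v else None),
    "date": ("date", str),
    "parks_percent_change_from_baseline": (
        "parks_percent_change_from_baseline",
        lambda v: int(v) if v else None,
    ),
}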
class NCBI(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.program_name = "open"
        self.project_code = "ncbi-covid-19"
        self.manifest_bucket = "sra-pub-sars-cov2"
        self.sra_src_manifest = "sra-src/Manifest"
        self.accession_number_filename_map = {}
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.file_helper = AsyncFileHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.data_file = NCBI_FILE(
            base_url=self.base_url,
            s3_bucket=self.project_code,
            access_token=access_token,
        )
        self.submitting_data = {
            "sample": [],
            "virus_sequence": [],
            "core_metadata_collection": [],
            "virus_sequence_run_taxonomy": [],
            "virus_sequence_contig": [],
            "virus_sequence_blastn": [],
            "virus_sequence_contig_taxonomy": [],
            "virus_sequence_peptide": [],
            "virus_sequence_hmm_search": [],
        }
        self.submitting_data["core_metadata_collection"].append(
            {
                "submitter_id": format_submitter_id("cmc_ncbi_covid19", {}),
                "projects": [{"code": self.project_code}],
            }
        )
        read_ncbi_manifest(
            self.manifest_bucket,
            self.sra_src_manifest,
            self.accession_number_filename_map,
        )

    def submit_metadata(self):
        start = time.strftime("%X")
        loop = asyncio.get_event_loop()
        tasks = []
        for node_name, _ in self.data_file.nodes.items():
            if node_name == "virus_sequence_run_taxonomy":
                continue
            tasks.append(
                asyncio.ensure_future(self.files_to_node_submissions(node_name))
            )
        try:
            results = loop.run_until_complete(asyncio.gather(*tasks))
            loop.run_until_complete(
                asyncio.gather(
                    self.files_to_virus_sequence_run_taxonomy_submission(results[0])
                )
            )
            if AsyncFileHelper.session:
                loop.run_until_complete(
                    asyncio.gather(AsyncFileHelper.close_session())
                )
        finally:
            loop.close()
        end = time.strftime("%X")

        for k, v in self.submitting_data.items():
            print(f"Submitting {k} data...")
            for node in v:
                node_record = {"type": k}
                node_record.update(node)
                self.metadata_helper.add_record_to_submit(node_record)
            self.metadata_helper.batch_submit_records()
        print(f"Running time: From {start} to {end}")

    async def files_to_virus_sequence_run_taxonomy_submission(
        self, submitting_accession_numbers
    ):
        """Get submitting data for the virus_sequence_run_taxonomy node"""
        if not submitting_accession_numbers:
            return
        records = self._get_response_from_big_query(submitting_accession_numbers)

        # keep track of accession numbers that link to virus_sequence nodes
        accession_number_set = set()
        for record in records:
            if record["acc"] in self.accession_number_filename_map:
                accession_number = record["acc"]
                print(f"Get from bigquery response {accession_number}")
                success = await self._parse_big_query_response(record)
                if success:
                    accession_number_set.add(accession_number)

        cmc_submitter_id = format_submitter_id("cmc_ncbi_covid19", {})
        for accession_number in submitting_accession_numbers:
            virus_sequence_run_taxonomy_submitter_id = format_submitter_id(
                "virus_sequence_run_taxonomy", {"accession_number": accession_number}
            )
            submitted_json = {
                "submitter_id": virus_sequence_run_taxonomy_submitter_id,
                "core_metadata_collections": [{"submitter_id": cmc_submitter_id}],
                "accession_number": accession_number,
                "data_type": "Virus Sequence Run Taxonomy Analysis",
                "data_format": "json",
                "data_category": "Kmer-based Taxonomy Analysis",
            }
            # add a link to the virus_sequence node
            if accession_number in accession_number_set:
                submitted_json["virus_sequences"] = [
                    {"submitter_id": f"virus_sequence_{accession_number}"}
                ]

            filename = f"virus_sequence_run_taxonomy_{accession_number}.csv"
            print(f"Get indexd info of {filename}")
            trying = True
            while trying:
                try:
                    (
                        did,
                        rev,
                        md5sum,
                        filesize,
                        file_name,
                        authz,
                    ) = await self.file_helper.async_find_by_name(filename=filename)
                    trying = False
                except Exception as e:
                    print(
                        f"Can not get indexd record of {filename}. Detail {e}. Retrying..."
                    )

            assert (
                did
            ), f"file {filename} does not exist in the index, rerun NCBI_FILE ETL"

            if not authz:
                tries = 0
                while tries < MAX_RETRIES:
                    try:
                        await self.file_helper.async_update_authz(did=did, rev=rev)
                        break
                    except Exception as e:
                        tries += 1
                        print(
                            f"Can not update indexd for {did}. Detail {e}. Retrying..."
                        )

            submitted_json["file_size"] = filesize
            submitted_json["md5sum"] = md5sum
            submitted_json["object_id"] = did
            submitted_json["file_name"] = file_name

            self.submitting_data["virus_sequence_run_taxonomy"].append(submitted_json)

    async def files_to_node_submissions(self, node_name):
        """Get submitting data for the node"""
        retrying = True
        while retrying:
            try:
                submitting_accession_numbers = (
                    await self.get_submitting_accession_number_list(node_name)
                )
                retrying = False
            except Exception as e:
                print(
                    f"Can not query peregrine with {node_name}. Detail {e}. Retrying ..."
                )

        for accession_number in submitting_accession_numbers:
            submitter_id = format_submitter_id(
                node_name, {"accession_number": accession_number}
            )
            cmc_submitter_id = format_submitter_id("cmc_ncbi_covid19", {})
            contig_submitter_id = format_submitter_id(
                "virus_sequence_contig", {"accession_number": accession_number}
            )
            peptide_submitter_id = format_submitter_id(
                "virus_sequence_peptide", {"accession_number": accession_number}
            )
            run_taxonomy_submitter_id = format_submitter_id(
                "virus_sequence_run_taxonomy", {"accession_number": accession_number}
            )
            contig_taxonomy_submitter_id = format_submitter_id(
                "virus_sequence_contig_taxonomy",
                {"accession_number": accession_number},
            )

            if node_name == "virus_sequence_contig":
                submitted_json = {
                    "submitter_id": submitter_id,
                    "core_metadata_collections": [{"submitter_id": cmc_submitter_id}],
                    "virus_sequences_run_taxonomies": [
                        {"submitter_id": run_taxonomy_submitter_id}
                    ],
                    "accession_number": accession_number,
                    "data_type": "Virus Sequence Contig",
                    "data_format": "json",
                    "data_category": "Nucleotide Contig",
                }
            elif node_name == "virus_sequence_blastn":
                submitted_json = {
                    "submitter_id": submitter_id,
                    "core_metadata_collections": [{"submitter_id": cmc_submitter_id}],
                    "virus_sequence_contigs": [{"submitter_id": contig_submitter_id}],
                    "accession_number": accession_number,
                    "data_type": "Virus Sequence Blastn",
                    "data_format": "tsv",
                    "data_category": "Nucleotide Blast",
                }
            elif node_name == "virus_sequence_peptide":
                submitted_json = {
                    "submitter_id": submitter_id,
                    "core_metadata_collections": [{"submitter_id": cmc_submitter_id}],
                    "virus_sequence_contigs": [{"submitter_id": contig_submitter_id}],
                    "accession_number": accession_number,
                    "data_type": "Peptides Annotation Using VIGOR3",
                    "data_format": "json",
                    "data_category": "Peptides Annotation",
                }
            elif node_name == "virus_sequence_hmm_search":
                submitted_json = {
                    "submitter_id": submitter_id,
                    "core_metadata_collections": [{"submitter_id": cmc_submitter_id}],
                    "virus_sequence_peptides": [
                        {"submitter_id": peptide_submitter_id}
                    ],
                    "accession_number": accession_number,
                    "data_type": "Virus Sequence HMM Search",
                    "data_format": "json",
                    "data_category": "HMMER Scan of Contigs",
                }
            elif node_name == "virus_sequence_contig_taxonomy":
                submitted_json = {
                    "submitter_id": submitter_id,
                    "core_metadata_collections": [{"submitter_id": cmc_submitter_id}],
                    "virus_sequence_contigs": [{"submitter_id": contig_submitter_id}],
                    "accession_number": accession_number,
                    "data_type": "Contig Taxonomy",
                    "data_format": "json",
                    "data_category": "Kmer-based Taxonomy Analysis of Contigs",
                }
            else:
                raise Exception(f"ERROR: {node_name} does not exist")

            ext = re.search(r"\.(.*)$", self.data_file.nodes[node_name][0]).group(1)
            filename = f"{node_name}_{accession_number}.{ext}"
            print(f"Get indexd record of {filename}")

            retrying = True
            while retrying:
                try:
                    (
                        did,
                        rev,
                        md5sum,
                        filesize,
                        file_name,
                        authz,
                    ) = await self.file_helper.async_find_by_name(filename=filename)
                    retrying = False
                except Exception as e:
                    print(
                        f"ERROR: Failed to query indexd for {filename}. Detail {e}. Retrying ..."
                    )
                    await asyncio.sleep(5)

            assert (
                did
            ), f"file {filename} does not exist in the index, rerun NCBI_FILE ETL"

            if not authz:
                tries = 0
                while tries < MAX_RETRIES:
                    try:
                        await self.file_helper.async_update_authz(did=did, rev=rev)
                        break
                    except Exception as e:
                        tries += 1
                        print(
                            f"ERROR: Failed to update indexd for {filename}. Detail {e}. Retrying ..."
                        )
                        await asyncio.sleep(5)

            submitted_json["file_size"] = filesize
            submitted_json["md5sum"] = md5sum
            submitted_json["object_id"] = did
            submitted_json["file_name"] = file_name

            self.submitting_data[node_name].append(submitted_json)
        return submitting_accession_numbers

    async def get_submitting_accession_number_list_for_run_taxonomy(self):
        """Get the submitting accession number list for the run_taxonomy file"""
        node_name = "virus_sequence_run_taxonomy"
        submitting_accession_numbers = set()
        existed_accession_numbers = await self.data_file.get_existed_accession_numbers(
            node_name
        )

        s3 = boto3.resource("s3", config=Config(signature_version=UNSIGNED))
        s3_object = s3.Object(
            self.data_file.bucket, self.data_file.nodes[node_name][0]
        )
        file_path = f"{DATA_PATH}/virus_sequence_run_taxonomy.gz"
        s3_object.download_file(file_path)

        n_lines = 0
        with gzip.open(file_path, "rb") as f:
            while True:
                bline = f.readline()
                if not bline:
                    break
                n_lines += 1
                if n_lines % 10000 == 0:
                    print(f"Processed {n_lines} lines of file {node_name}")
                line = bline.decode("UTF-8")
                r1 = re.findall(r"[SDE]RR\d+", line)
                if len(r1) == 0:
                    continue
                read_accession_number = r1[0]
                if (
                    f"{node_name}_{read_accession_number}"
                    not in existed_accession_numbers
                ):
                    submitting_accession_numbers.add(read_accession_number)

        return list(submitting_accession_numbers)

    async def get_submitting_accession_number_list(self, node_name):
        """Get the submitting accession number list"""
        submitting_accession_numbers = set()
        existed_accession_numbers = await self.data_file.get_existed_accession_numbers(
            node_name
        )

        s3 = boto3.resource("s3", config=Config(signature_version=UNSIGNED))
        s3_object = s3.Object(
            self.data_file.bucket, self.data_file.nodes[node_name][0]
        )
        line_stream = codecs.getreader("utf-8")
        n_lines = 0
        for line in line_stream(s3_object.get()["Body"]):
            r1 = re.findall(r"[SDE]RR\d+", line)
            n_lines += 1
            if n_lines % 10000 == 0:
                print(f"Processed {n_lines} lines of file {node_name}")
            if len(r1) == 0:
                continue
            read_accession_number = r1[0]
            if (
                f"{node_name}_{read_accession_number}".lower()
                not in existed_accession_numbers
            ):
                submitting_accession_numbers.add(read_accession_number)

        return list(submitting_accession_numbers)

    def _get_response_from_big_query(self, accession_numbers):
        """
        Get data from BigQuery. The format of the response JSON is
        described below:

        [{
            "acc": "DRR220591",
            "assay_type": "RNA-Seq",
            "center_name": "KUMAMOTO",
            "consent": "public",
            "experiment": "DRX210904",
            "sample_name": "SAMD00217265",
            "instrument": "Illumina NovaSeq 6000",
            "librarylayout": "PAIRED",
            "libraryselection": "RANDOM",
            "librarysource": "TRANSCRIPTOMIC",
            "platform": "ILLUMINA",
            "sample_acc": "DRS139760",
            "biosample": "SAMD00217265",
            "organism": "Mus musculus",
            "sra_study": "DRP006149",
            # "releasedate": datetime.datetime(2020, 6, 4, 0, 0, tzinfo=<UTC>),
            "bioproject": "PRJDB9618",
            "mbytes": 2160,
            "loaddate": None,
            "avgspotlen": 300,
            "mbases": 6395,
            "insertsize": None,
            "library_name": None,
            "biosamplemodel_sam": [],
            "collection_date_sam": [],
            "geo_loc_name_country_calc": None,
            "geo_loc_name_country_continent_calc": None,
            "geo_loc_name_sam": [],
            "ena_first_public_run": [],
            "ena_last_update_run": [],
            "sample_name_sam": ["WT3_plus"],
            "datastore_filetype": ["sra"],
            "datastore_provider": ["gs", "ncbi", "s3"],
            "datastore_region": ["gs.US", "ncbi.public", "s3.us-east-1"],
        }]
        """
        assert accession_numbers != [], "accession_numbers must not be empty"

        start = 0
        offset = 100
        client = bigquery.Client()
        while start < len(accession_numbers):
            end = min(start + offset, len(accession_numbers))
            stm = 'SELECT * FROM `nih-sra-datastore`.sra.metadata where consent = "public"'
            stm = stm + f' and (acc = "{accession_numbers[start]}"'
            for accession_number in accession_numbers[start + 1 : end]:
                stm = stm + f' or acc = "{accession_number}"'
            stm = stm + ")"
            query_job = client.query(stm)
            results = query_job.result()  # waits for the job to complete
            for row in results:
                yield dict(row)
            start = end

    async def _parse_big_query_response(self, response):
        """
        Parse the BigQuery response and get the indexd record.
        Returns True on success
        """
        accession_number = response["acc"]
        sample = {}
        virus_sequence = {}
        sample["submitter_id"] = f"sample_{accession_number}"
        sample["projects"] = [{"code": self.project_code}]

        for field in [
            "ncbi_bioproject",
            "ncbi_biosample",
            "sample_accession",
            "host_associated_environmental_package_sam",
            "organism",
            "collection_date",
            "country_region",
            "continent",
        ]:
            if field in SPECIAL_MAP_FIELDS:
                old_name, dtype, handler = SPECIAL_MAP_FIELDS[field]
                sample[field] = handler(response.get(old_name))
            elif field in response:
                sample[field] = str(response.get(field))

        virus_sequence["submitter_id"] = f"virus_sequence_{accession_number}"
        for field in [
            "assay_type",
            "avgspotlen",
            "bytes",
            "center_name",
            "consent",
            "datastore_provider",
            "datastore_region",
            "description_sam",
            "ena_checklist_sam",
            "ena_first_public_run",
            "ena_last_update_run",
            "experiment",
            "insdc_center_name_sam",
            "insdc_first_public_sam",
            "insdc_center_alias_sam",
            "insdc_last_update_sam",
            "investigation_type_sam",
            "insdc_status_sam",
            "instrument",
            "library_name",
            "libraryselection",
            "librarysource",
            "mbases",
            "mbytes",
            "platform",
            "sra_accession_sam",
            "sra_study",
            "title_sam",
            "release_date",
            "data_format",
            "librarylayout",
        ]:
            if field in SPECIAL_MAP_FIELDS:
                old_name, dtype, handler = SPECIAL_MAP_FIELDS[field]
                virus_sequence[field] = handler(response.get(old_name))
            elif field in response:
                virus_sequence[field] = str(response.get(field))

        virus_sequence["samples"] = [{"submitter_id": sample["submitter_id"]}]
        virus_sequence["data_category"] = "Nucleotide"
        virus_sequence["data_type"] = "Sequence"
        virus_sequence["file_name"] = self.accession_number_filename_map[
            accession_number
        ]
        virus_sequence["data_format"] = get_file_extension(
            virus_sequence["file_name"]
        )

        filename = virus_sequence["file_name"]
        retrying = True
        while retrying:
            try:
                (
                    did,
                    rev,
                    md5sum,
                    filesize,
                    file_name,
                    authz,
                ) = await self.file_helper.async_find_by_name(filename=filename)
                retrying = False
            except Exception as e:
                print(
                    f"ERROR: Failed to get indexd for {filename}. Detail {e}. Retrying ..."
                )
                await asyncio.sleep(5)

        if not did:
            print(
                f"file {filename} does not exist in the index, rerun NCBI_MANIFEST ETL"
            )
            return False

        if not authz:
            retries = 0
            while retries < MAX_RETRIES:
                try:
                    await self.file_helper.async_update_authz(did=did, rev=rev)
                    break
                except Exception as e:
                    print(
                        f"ERROR: Failed to update indexd for {filename}. Detail {e}. Retrying ..."
                    )
                    retries += 1
                    await asyncio.sleep(5)

        virus_sequence["file_size"] = filesize
        virus_sequence["md5sum"] = md5sum
        virus_sequence["object_id"] = did

        self.submitting_data["virus_sequence"].append(virus_sequence)
        self.submitting_data["sample"].append(sample)
        return True
class OWID(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_locations = []
        self.summary_clinicals = []
        self.program_name = "open"
        self.project_code = "OWID"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        # structure is
        # (csv field name, (node type, node field name, type of field))
        testing_fields = [
            ("ISO code", ("summary_location", "iso3", str)),
            ("Entity", (None, None, split_entity)),
            ("Date", ("summary_clinical", "date", str)),
            ("Source URL", ("summary_clinical", "source_url", str)),
            ("Source label", ("summary_clinical", "source_label", str)),
            ("Notes", ("summary_clinical", "notes", str)),
            ("Number of observations", ("summary_clinical", "num_observations", int)),
            ("Cumulative total", ("summary_clinical", "testing", int)),
            (
                "Cumulative total per thousand",
                ("summary_clinical", "cumulative_total_per_thousand", int),
            ),
            (
                "Daily change in cumulative total",
                ("summary_clinical", "daily_change_in_cumulative_total", int),
            ),
            (
                "Daily change in cumulative total per thousand",
                (
                    "summary_clinical",
                    "daily_change_in_cumulative_total_per_thousand",
                    int,
                ),
            ),
            (
                "7-day smoothed daily change",
                ("summary_clinical", "seven_day_smoothed_daily_change", int),
            ),
            (
                "7-day smoothed daily change per thousand",
                (
                    "summary_clinical",
                    "seven_day_smoothed_daily_change_per_thousand",
                    float,
                ),
            ),
            ("Short-term positive rate", (None, None, None)),
            ("Short-term tests per case", (None, None, None)),
            ("General source label", ("summary_clinical", "general_source_label", str)),
            ("General source URL", ("summary_clinical", "general_source_url", str)),
            ("Short description", ("summary_clinical", "short_description", str)),
            ("Detailed description", ("summary_clinical", "detailed_description", str)),
        ]
        self.headers_mapping = {
            field: (k, mapping) for k, (field, mapping) in enumerate(testing_fields)
        }

    def files_to_submissions(self):
        """
        Reads the CSV file and converts the data to Sheepdog records
        """
        url = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/testing/covid-testing-latest-data-source-details.csv"
        self.parse_file(url)

    def parse_file(self, url):
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')

            headers = next(reader)
            assert (
                headers[0] != "404: Not Found"
            ), "Unable to get file contents, received {}.".format(headers)

            expected_h = list(self.headers_mapping.keys())
            obtained_h = headers[: len(expected_h)]
            assert (
                obtained_h == expected_h
            ), "CSV headers have changed (expected {}, got {}). We may need to update the ETL code".format(
                expected_h, obtained_h
            )

            for row in reader:
                summary_location, summary_clinical = self.parse_row(
                    row, self.headers_mapping
                )
                if summary_location not in self.summary_locations:
                    self.summary_locations.append(summary_location)
                self.summary_clinicals.append(summary_clinical)

    def parse_row(self, row, mapping):
        summary_location = {}
        summary_clinical = {}

        for k, (i, (node_type, node_field, type_conv)) in mapping.items():
            if k == "Entity":
                country, test_type = split_entity(row[i])
                summary_location["country_region"] = country
                summary_clinical["test_type"] = test_type
            if node_field:
                value = row[i]
                if value:
                    if node_type == "summary_location":
                        summary_location[node_field] = type_conv(value)
                    if node_type == "summary_clinical":
                        if type_conv == int:
                            summary_clinical[node_field] = type_conv(float(value))
                        else:
                            summary_clinical[node_field] = type_conv(value)

        summary_location_submitter_id = format_location_submitter_id(summary_location)
        summary_location["submitter_id"] = summary_location_submitter_id
        summary_location["projects"] = [{"code": self.project_code}]

        summary_clinical["submitter_id"] = format_summary_clinical_submitter_id(
            summary_location_submitter_id,
            test_type=summary_clinical["test_type"],
            date=datetime.date.today().strftime("%Y-%m-%d"),
        )
        summary_clinical["summary_locations"] = [
            {"submitter_id": summary_location_submitter_id}
        ]

        return summary_location, summary_clinical

    def submit_metadata(self):
        print("Submitting summary_location data")
        for loc in self.summary_locations:
            loc_record = {"type": "summary_location"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for rep in self.summary_clinicals:
            rep_record = {"type": "summary_clinical"}
            rep_record.update(rep)
            self.metadata_helper.add_record_to_submit(rep_record)
        self.metadata_helper.batch_submit_records()
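# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: the OWID mapping uses
# a split_entity helper imported from elsewhere in this repo. The "Entity"
# column holds values like "United States - tests performed"; a minimal
# stand-in consistent with how parse_row() uses it (the real helper may differ):
def _split_entity_sketch(value):
    """'United States - tests performed' -> ("United States", "tests performed")."""
    country, _, test_type = value.partition(" - ")
    return country.strip(), test_type.strip()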
class CHI_NBHD(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_locations = []
        self.summary_clinicals = []
        self.program_name = "open"
        self.project_code = "CHI-NBHD"
        self.country = "US"
        self.state = "IL"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def files_to_submissions(self):
        """
        Reads the JSON file and converts the data to Sheepdog records
        """
        url = "https://covid19neighborhoods.southsideweekly.com/page-data/index/page-data.json"
        self.parse_file(url)

    def parse_file(self, url):
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            data = data["result"]["data"]
            build_time_str = data["build_time"]["nodes"][0]["buildTime"]
            build_time = datetime.datetime.strptime(
                build_time_str, "%Y-%m-%dT%H:%M:%S.%fZ"
            )
            current_date = build_time.strftime("%Y-%m-%d")
            nbhd_stats = data["community_areas_all"]["nodes"][0]["childGeoJson"][
                "features"
            ]
            for nbhd_object in nbhd_stats:
                summary_location, summary_clinical = self.parse_nbhd(
                    nbhd_object, current_date
                )
                self.summary_locations.append(summary_location)
                self.summary_clinicals.append(summary_clinical)
                print(summary_location)
                print(summary_clinical)

    def parse_nbhd(self, nbhd_object, date):
        properties = nbhd_object["properties"]
        nbhd = properties["community"]
        deaths = properties["value"]
        population = properties["population"]

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {"country": self.country, "state": self.state, "nbhd": nbhd},
        )
        summary_location = {
            "submitter_id": summary_location_submitter_id,
            "community_area": nbhd,
            "projects": [{"code": self.project_code}],
        }

        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"date": date},
        )
        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "deaths_per_10000": round(10000 * deaths / population, 2),
            "deaths": deaths,
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
        }

        return summary_location, summary_clinical

    def submit_metadata(self):
        print("Submitting summary_location data")
        for loc in self.summary_locations:
            loc_record = {"type": "summary_location"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals:
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
class IDPH_HOSPITAL(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.program_name = "open"
        self.project_code = "IDPH-Hospital"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.country = "US"
        self.state = "IL"
        self.summary_locations = []
        self.summary_clinicals = []

    def files_to_submissions(self):
        """
        Reads the JSON file and converts the data to Sheepdog records
        """
        latest_submitted_date = self.metadata_helper.get_latest_submitted_date_idph()
        today = datetime.date.today()
        if latest_submitted_date == today:
            print("Nothing to submit: today and latest submitted date are the same.")
            return

        today_str = today.strftime("%Y%m%d")
        print(f"Getting data for date: {today_str}")
        url = "https://dph.illinois.gov/sitefiles/COVIDHospitalRegions.json"
        self.parse_file(latest_submitted_date, url)

    def parse_file(self, latest_submitted_date, url):
        """
        Converts a JSON file to data we can submit via Sheepdog. Stores
        the records to submit in `self.summary_locations` and
        `self.summary_clinicals`.

        Args:
            latest_submitted_date (date): the date of the latest available
                "summary_clinical" for this project
            url (str): URL at which the JSON file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            date = idph_get_date(data["LastUpdateDate"])

            if latest_submitted_date and date == latest_submitted_date.strftime(
                "%Y-%m-%d"
            ):
                print(
                    "Nothing to submit: latest submitted date and date from data are the same."
                )
                return

            (
                summary_location,
                summary_clinical_statewide_current,
            ) = self.parse_statewide_values(date, data["statewideValues"])
            self.summary_locations.append(summary_location)

            for utilization in data["HospitalUtilizationResults"]:
                summary_clinical = self.parse_historical(
                    utilization, summary_clinical_statewide_current
                )
                self.summary_clinicals.append(summary_clinical)

            for region in data["regionValues"]:
                (summary_location, summary_clinical) = self.parse_region(date, region)
                self.summary_locations.append(summary_location)
                self.summary_clinicals.append(summary_clinical)

    def parse_historical(self, utilization, summary_clinical_statewide_current):
        utilization_mapping = {
            "reportDate": "date",
            "TotalBeds": "state_total_beds",
            "TotalOpenBeds": "total_open_beds",
            "TotalInUseBedsNonCOVID": "total_in_use_beds_non_covid",
            "TotalInUseBedsCOVID": "total_in_use_beds_covid",
            "ICUBeds": "icu_beds",
            "ICUOpenBeds": "icu_open_beds",
            "ICUInUseBedsNonCOVID": "icu_in_use_beds_non_covid",
            "ICUInUseBedsCOVID": "icu_in_use_beds_covid",
            "VentilatorCapacity": "ventilator_capacity",
            "VentilatorAvailable": "ventilator_available",
            "VentilatorInUseNonCOVID": "ventilator_in_use_non_covid",
            "VentilatorInUseCOVID": "ventilator_in_use_covid",
        }
        date = utilization["reportDate"]
        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {"project": "idph_hospital", "country": self.country, "state": self.state},
        )
        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"project": "idph_hospital", "date": date},
        )
        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
        }
        for k, v in utilization.items():
            summary_clinical[utilization_mapping[k]] = v
        if (
            summary_clinical_submitter_id
            == summary_clinical_statewide_current["submitter_id"]
        ):
            summary_clinical.update(summary_clinical_statewide_current)
        return summary_clinical

    def parse_statewide_values(self, date, statewide_values):
        statewide_mapping = {
            "ICUCapacity": "state_icu_capacity",
            "ICUCovidPatients": "state_icu_covid_patients",
            "VentCapacity": "state_vent_capacity",
            "VentCovidPatients": "state_vent_covid_patients",
            "ICUAvailable": "state_icu_available",
            "VentsAvailable": "state_vents_available",
            "TotalBeds": "state_total_beds",
            "TotalBedsAvailable": "state_total_beds_available",
            "TotalBedsUsed": "state_total_beds_used",
            "PctHospitalBedsAvailable": "state_pct_hospital_beds_available",
            "AdultICUCapacity": "state_adult_icu_capacity",
            "ICUOpenBeds": "state_icu_open_beds",
            "ICUBedsUsed": "state_icu_beds_used",
            "ICUOpenBedsPct": "state_icu_open_beds_pct",
            "COVIDPUIPatients": "state_covid_pui_patients",
            "COVIDPUIPatientsPct": "state_covid_pui_patients_pct",
            "COVIDPUIPatientsBedsInUsePct": "state_covid_pui_patients_beds_in_use_pct",
            "VentilatorCapacity": "state_ventilator_capacity",
            "VentilatorsOpen": "state_ventilators_open",
            "VentilatorsOpenPct": "state_Ventilators_open_pct",
            "VentilatorsInUse": "state_ventilators_in_use",
            "VentilatorsInUseCOVID": "state_ventilators_in_use_covid",
            "VentilatorsCOVIDPatientsPct": "state_ventilators_covid_patients_pct",
            "VentilatorsCOVIDPatientsInUsePct": "state_ventilators_covid_patients_in_use_pct",
            "CovidPatientsNonICU": "state_covid_patients_non_icu",
            "TotalCOVIDPUIInICU": "state_total_covid_pui_in_icu",
            "TotalCOVIDPUIInHospital": "state_total_covid_pui_in_hospital",
            "PctBedsCOVIDPUI": "state_pct_beds_covid_pui",
            "MedSurgBeds": "state_med_surg_beds",
            "MedSurgBedsOpen": "state_med_surg_beds_open",
            "MedSurgBedsOpenPct": "state_med_surg_beds_open_pct",
            "MedSurgBedsInUse": "state_med_surg_beds_in_use",
        }
        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {"project": "idph_hospital", "country": self.country, "state": self.state},
        )
        summary_location = {
            "submitter_id": summary_location_submitter_id,
            "projects": [{"code": self.project_code}],
            "country_region": self.country,
            "province_state": self.state,
        }
        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"project": "idph_hospital", "date": date},
        )
        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
        }
        for k, v in statewide_values.items():
            summary_clinical[statewide_mapping[k]] = v
        return summary_location, summary_clinical

    def parse_region(self, date, hospital_region):
        """
        From region-level data, generates the data we can submit via Sheepdog
        """
        region = hospital_region["region"]
        region_description = hospital_region["region_description"]
        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {
                "project": "idph_hospital",
                "country": self.country,
                "state": self.state,
                "region": region,
            },
        )
        summary_location = {
            "country_region": self.country,
            "submitter_id": summary_location_submitter_id,
            "projects": [{"code": self.project_code}],
            "province_state": self.state,
            "state_hospital_region": region,
            "state_region_description": strip_prefix(region_description),
        }
        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"project": "idph_hospital", "date": date},
        )
        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
            "region_icu_avail": hospital_region["ICUAvail"],
            "region_icu_capacity": hospital_region["ICUCapacity"],
            "region_vents_available": hospital_region["VentsAvailable"],
            "region_vents_capacity": hospital_region["VentsCapacity"],
        }
        return summary_location, summary_clinical

    def submit_metadata(self):
        print("Submitting data...")

        print("Submitting summary_location data")
        for sl in self.summary_locations:
            sl_record = {"type": "summary_location"}
            sl_record.update(sl)
            self.metadata_helper.add_record_to_submit(sl_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals:
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
class JHU(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.location_data = {}
        self.time_series_data = defaultdict(lambda: defaultdict(dict))
        self.program_name = "open"
        self.project_code = "JHU"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.expected_csv_headers = {
            "global": ["Province/State", "Country/Region", "Lat", "Long", "1/22/20"],
            "US_counties": {
                "confirmed": [
                    "UID",
                    "iso2",
                    "iso3",
                    "code3",
                    "FIPS",
                    "Admin2",
                    "Province_State",
                    "Country_Region",
                    "Lat",
                    "Long_",
                    "Combined_Key",
                    "1/22/20",
                ],
                "deaths": [
                    "UID",
                    "iso2",
                    "iso3",
                    "code3",
                    "FIPS",
                    "Admin2",
                    "Province_State",
                    "Country_Region",
                    "Lat",
                    "Long_",
                    "Combined_Key",
                    "Population",  # TODO use this
                    "1/22/20",
                ],
            },
        }
        self.header_to_column = {
            "global": {
                "province": 0,
                "country": 1,
                "latitude": 2,
                "longitude": 3,
                "dates_start": 4,
            },
            "US_counties": {
                "confirmed": {
                    "iso2": 1,
                    "iso3": 2,
                    "code3": 3,
                    "FIPS": 4,
                    "county": 5,
                    "province": 6,
                    "country": 7,
                    "latitude": 8,
                    "longitude": 9,
                    "dates_start": 11,
                },
                "deaths": {
                    "iso2": 1,
                    "iso3": 2,
                    "code3": 3,
                    "FIPS": 4,
                    "county": 5,
                    "province": 6,
                    "country": 7,
                    "latitude": 8,
                    "longitude": 9,
                    "dates_start": 12,
                },
            },
        }
        self.existing_summary_locations = []
        self.last_date = ""

    def files_to_submissions(self):
        """
        Reads CSV files and converts the data to Sheepdog records
        """
        urls = {
            "global": {
                "confirmed": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
                "deaths": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv",
                "recovered": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv",
                # "testing": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_testing_global.csv",
            },
            "US_counties": {
                "confirmed": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv",
                "deaths": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv",
            },
        }
        (
            self.existing_summary_locations,
            self.last_date,
        ) = self.metadata_helper.get_existing_data_jhu()
        for file_type in ["global", "US_counties"]:
            for data_type, url in urls[file_type].items():
                self.parse_file(file_type, data_type, url)

    def parse_file(self, file_type, data_type, url):
        """
        Converts a CSV file to data we can submit via Sheepdog. Stores the
        records to submit in `self.location_data` and `self.time_series_data`.
        Ignores any records that are already in Sheepdog (relies on unique
        `submitter_id` to check).

        Args:
            file_type (str): type of this file - one of ["global", "US_counties"]
            data_type (str): type of the data in this file - one of ["confirmed", "deaths", "recovered"]
            url (str): URL at which the CSV file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')
            headers = next(reader)

            if headers[0] == "404: Not Found":
                print(" Unable to get file contents, received {}.".format(headers))
                return

            expected_h = self.expected_csv_headers[file_type]
            if isinstance(expected_h, dict):
                expected_h = expected_h[data_type]
            obtained_h = headers[: len(expected_h)]
            assert (
                obtained_h == expected_h
            ), "CSV headers have changed (expected {}, got {}). We may need to update the ETL code".format(
                expected_h, obtained_h
            )

            # the first header that looks like a 2020 date marks the start of
            # the time series columns
            first_date_i = [i for i, h in enumerate(headers) if h.endswith("/20")][0]
            last_date = headers[-1]
            print(
                " First date: {}; last date: {}".format(headers[first_date_i], last_date)
            )

            for row in reader:
                if not row:  # ignore empty rows
                    continue
                location, date_to_value = self.parse_row(
                    file_type, data_type, headers, row
                )
                if not location:
                    # We are using US data by state instead of global
                    continue

                location_submitter_id = location["submitter_id"]
                if (
                    location_submitter_id not in self.location_data
                    # do not re-submit location data that already exist
                    and location_submitter_id not in self.existing_summary_locations
                ):
                    self.location_data[location_submitter_id] = location

                for date, value in date_to_value.items():
                    # do not re-submit summary_clinical data that already
                    # exist. Assume anything older than the last submitted
                    # date has already been submitted
                    if (
                        time_series_date_to_string(date)
                        > time_series_date_to_string(self.last_date)
                        or LAST_DATE_ONLY
                    ):
                        self.time_series_data[location_submitter_id][date][
                            data_type
                        ] = value

    def parse_row(self, file_type, data_type, headers, row):
        """
        Converts a row of a CSV file to data we can submit via Sheepdog

        Args:
            file_type (str): type of this file - one of ["global", "US_counties"]
            data_type (str): type of the data in this file - one of ["confirmed", "deaths", "recovered"]
            headers (list(str)): CSV file headers (first row of the file)
            row (list(str)): row of data

        Returns:
            (dict, dict) tuple:
                - location data, in a format ready to be submitted to Sheepdog
                - { "date1": <value>, "date2": <value> } from the row data
        """
        header_to_column = self.header_to_column[file_type]
        if "country" not in header_to_column:
            header_to_column = header_to_column[data_type]

        country = row[header_to_column["country"]]
        province = row[header_to_column["province"]]
        latitude = row[header_to_column["latitude"]] or "0"
        longitude = row[header_to_column["longitude"]] or "0"

        if country == "US" and province == "":
            # We are using US data by state instead of global
            return None, None

        if int(float(latitude)) == 0 and int(float(longitude)) == 0:
            # Data with "Out of <state>" or "Unassigned" county value have
            # unknown coordinates of (0,0). We don't submit them for now
            return None, None

        submitter_id = format_location_submitter_id(country, province)
        location = {
            "country_region": country,
            "latitude": latitude,
            "longitude": longitude,
            "projects": [{"code": self.project_code}],
        }
        if province:
            location["province_state"] = province
        if file_type == "US_counties":
            county = row[header_to_column["county"]]
            iso2 = row[header_to_column["iso2"]]
            iso3 = row[header_to_column["iso3"]]
            code3 = row[header_to_column["code3"]]
            fips = row[header_to_column["FIPS"]]
            if county:
                location["county"] = county
                submitter_id = format_location_submitter_id(country, province, county)
            if iso2:
                location["iso2"] = iso2
            if iso3:
                location["iso3"] = iso3
            if code3:
                location["code3"] = int(code3)
            if fips:
                location["FIPS"] = int(float(fips))
        location["submitter_id"] = submitter_id

        date_to_value = {}
        dates_start = header_to_column["dates_start"]
        dates_indices = range(dates_start, len(headers))
        if LAST_DATE_ONLY:
            dates_indices = [len(headers) - 1]
        for i in dates_indices:
            date = headers[i]
            date = get_unified_date_format(date)
            if row[i] == "":  # ignore empty values
                continue
            try:
                val = int(float(row[i]))
            except ValueError:
                print(
                    'Unable to convert {} to int for "{}", "{}" at {}'.format(
                        row[i], province, country, date
                    )
                )
                raise
            date_to_value[date] = val

        return location, date_to_value

    def submit_metadata(self):
        """
        Converts the data in `self.time_series_data` to Sheepdog records.
        `self.location_data` already contains Sheepdog records. Batch submits
        all records in `self.location_data` and `self.time_series_data`
        """
        if LAST_DATE_ONLY:
            # delete the old data from the Sheepdog DB
            print("Deleting old summary_clinical data")
            self.metadata_helper.delete_nodes(["summary_clinical"])

        print("Submitting summary_location data")
        for location in self.location_data.values():
            record = {"type": "summary_location"}
            record.update(location)
            self.metadata_helper.add_record_to_submit(record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for location_submitter_id, time_series in self.time_series_data.items():
            for date, data in time_series.items():
                submitter_id = format_summary_clinical_submitter_id(
                    location_submitter_id, date
                )
                record = {
                    "type": "summary_clinical",
                    "submitter_id": submitter_id,
                    "summary_locations": [{"submitter_id": location_submitter_id}],
                    "date": date,
                }
                for data_type, value in data.items():
                    record[data_type] = value
                self.metadata_helper.add_record_to_submit(record)
        self.metadata_helper.batch_submit_records()
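
# The helpers `format_location_submitter_id` and `format_summary_clinical_submitter_id`
# used by the JHU ETL above are defined elsewhere in this codebase. The
# commented-out sketch below is a minimal illustration of the contract the ETL
# relies on -- deterministic, unique submitter IDs derived from location fields
# and dates -- and is an assumption, not the actual implementation:
#
# def format_location_submitter_id(country, province, county=None):
#     """Build a stable summary_location ID from its identifying fields."""
#     submitter_id = "summary_location_{}".format(country)
#     if province:
#         submitter_id += "_{}".format(province)
#     if county:
#         submitter_id += "_{}".format(county)
#     return submitter_id.lower().replace(" ", "_")
#
# def format_summary_clinical_submitter_id(location_submitter_id, date):
#     """Build a summary_clinical ID from the parent location ID and the date."""
#     return "summary_clinical_{}_{}".format(
#         location_submitter_id[len("summary_location_"):], date
#     )
#
# A typical (hypothetical) run of this ETL follows the BaseETL lifecycle:
#
# etl = JHU(base_url, access_token, s3_bucket)
# etl.files_to_submissions()  # download the CSVs, build records in memory
# etl.submit_metadata()       # batch-submit the records via Sheepdog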
class SSR(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.subjects = []
        self.demographics = []
        self.program_name = "controlled"
        self.project_code = "SSR"
        self.country = "US"
        self.state = "IL"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        # self.records = { <node ID>: { <submitter_id>: { <data> } } }
        self.records = defaultdict(dict)

        # TODO temporary - for now this ETL can only be run manually
        self.file_path = os.environ.get("FILE_PATH")
        if not self.file_path:
            # log instead of raising an exception so that unit tests don't complain
            print("Need FILE_PATH environment variable (SSR file to parse)")

    def files_to_submissions(self):
        """
        Reads input files and converts the data to Sheepdog records
        """
        print("Parsing file: {}".format(self.file_path))
        extension = self.file_path.lower().split(".")[-1]
        if extension == "txt":
            self.parse_txt_input_file()
        elif extension == "xlsx":
            self.parse_xlsx_input_file()
        else:
            raise Exception(
                f"I don't know how to parse extension {extension} for file {self.file_path}"
            )

    def parse_txt_input_file(self):
        with open(self.file_path, newline="") as csvfile:
            reader = csv.reader(csvfile, delimiter="|")
            header = next(reader)
            header = {k: v for v, k in enumerate(header)}
            for row in reader:
                row_data = dict(zip(header, row))
                self.parse_input(row_data=row_data)

    def parse_xlsx_input_file(self):
        # Set up the workbook and sheet.
        wb = xlrd.open_workbook(self.file_path)
        sheet = wb.sheet_by_index(0)

        # Create lists of SSR properties and values from the Excel sheet.
        prop_list = sheet.col_values(0)[1:]
        value_list = sheet.col_values(1)[1:]
        col_data = dict(zip(prop_list, value_list))
        self.parse_input(row_data=col_data, date_mode=wb.datemode)

    def parse_input(self, row_data, date_mode=None):
        # (original property, (gen3 node, gen3 property, property type))
        mapping = [
            ("reportingOrg", ("summary_location", "reporting_org", str)),
            ("reportDate", ("statistical_summary_report", "report_date", str)),
            ("num_COVID", ("statistical_summary_report", "num_COVID", int)),
            (
                "num_COVID_deaths",
                ("statistical_summary_report", "num_COVID_deaths", int),
            ),
            ("num_outpatient", ("statistical_summary_report", "num_outpatient", int)),
            ("num_admitted", ("statistical_summary_report", "num_admitted", int)),
            ("num_icu", ("statistical_summary_report", "num_icu", int)),
            ("num_vent", ("statistical_summary_report", "num_vent", int)),
            ("num_resp", ("statistical_summary_report", "num_resp", int)),
            ("num_pneu", ("statistical_summary_report", "num_pneu", int)),
            ("num_diab", ("statistical_summary_report", "num_diab", int)),
            ("num_asth", ("statistical_summary_report", "num_asth", int)),
            ("num_obes", ("statistical_summary_report", "num_obes", int)),
            ("num_card", ("statistical_summary_report", "num_card", int)),
            ("num_chf", ("statistical_summary_report", "num_chf", int)),
        ]

        # row_records = { <node ID>: { <record data> } }
        # (there is only 1 record of each node type per row)
        row_records = defaultdict(dict)
        for orig_prop_name, (node_type, prop_name, _type) in mapping:
            if row_data[orig_prop_name]:
                row_records[node_type][prop_name] = format_value(
                    prop_name, row_data[orig_prop_name], _type, date_mode
                )

        # add missing summary_location props
        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {"reporting_org": row_records["summary_location"]["reporting_org"]},
        )
        row_records["summary_location"].update(
            {
                "type": "summary_location",
                "submitter_id": summary_location_submitter_id,
                "projects": {"code": self.project_code},
                "country_region": self.country,
                "province_state": self.state,
            }
        )

        # add missing statistical_summary_report props
        ssr_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "statistical_summary_report",
            "ssr",
            {
                "report_date": row_records["statistical_summary_report"]["report_date"]
            },
        )
        row_records["statistical_summary_report"].update(
            {
                "type": "statistical_summary_report",
                "submitter_id": ssr_submitter_id,
                "summary_locations": {"submitter_id": summary_location_submitter_id},
            }
        )

        for node_type in row_records:
            rec = row_records[node_type]
            self.records[node_type][rec["submitter_id"]] = rec

    def submit_metadata(self):
        # TODO check which summary_locations already exist
        for node_type in SUBMISSION_ORDER:
            recs = self.records[node_type].values()
            self.metadata_helper.add_records_to_submit(recs)
        self.metadata_helper.batch_submit_records()
class VAC_TRACKER(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.clinical_trials = []
        self.program_name = "open"
        self.project_code = "VacTracker"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def files_to_submissions(self):
        """
        Reads JSON files and converts the data to Sheepdog records
        """
        url = "https://biorender.com/page-data/covid-vaccine-tracker/page-data.json"
        self.parse_file(url)

    def parse_file(self, url):
        """
        Converts a JSON file to data we can submit via Sheepdog. Stores the
        records to submit in `self.clinical_trials`.

        Args:
            url (str): URL at which the file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            try:
                for treatment in data["result"]["pageContext"]["treatments"]:
                    node = treatment["node"]
                    clinical_trial = self.parse_node(node)
                    self.clinical_trials.append(clinical_trial)
            except ValueError as e:
                print(f"ERROR: value error. Detail: {e}")

    def parse_node(self, node):
        """
        Converts an element of a JSON file to data we can submit via Sheepdog

        Args:
            node (dict): node data

        Returns:
            dict: clinical trial data, in a format ready to be submitted to Sheepdog
        """
        clinical_trial = {
            "projects": [{"code": self.project_code}],
            "type": "clinical_trials",
        }
        for key, value in node.items():
            if key not in MAP_FIELDS:
                continue
            gen3_field = MAP_FIELDS.get(key)[0]
            gen3_field_type = MAP_FIELDS.get(key)[1]
            if type(value) != gen3_field_type:
                print(
                    f"ERROR: The type of {key} does not match the one in Gen3. Skipping it"
                )
                continue
            if key == "fdaApproved":
                if "FDA-approved" in value:
                    value = "Yes"
                elif value == "":
                    value = "Unknown"
                elif value in ["N/A", "N//A", "N/A*"]:
                    value = "NA"
                elif value not in ["Yes", "No", "Unknown", "NA", None]:
                    value = "Unknown"
            if key == "customClinicalPhase":
                if value.lower() == "phase na":
                    value = "Phase N/A"
                elif value.lower() in ["preclinical", "pre-clinical"]:
                    value = "Preclinical Phase"
                elif value not in [
                    "Preclinical Phase",
                    "Phase I",
                    "Phase I/II",
                    "Phase II",
                    "Phase I/II/III",
                    "Phase III",
                    "Phase III/IV",
                    "Phase IV",
                    "Phase I/III/IV",
                    "Phase I/IV",
                    "Phase II/IV",
                    "Phase II/III/IV",
                    "Phase I/II/III/IV",
                    "Phase II/III",
                    "Phase N/A",
                    None,
                ]:
                    value = None
            if key == "technology":
                value = value.replace("*", "")
                if "to repurpose" in value.lower():
                    value = "Repurposed"
                if value not in [
                    "Antibodies",
                    "Antivirals",
                    "Cell-based therapies",
                    "Device",
                    "DNA-based",
                    "Inactivated virus",
                    "Modified APC",
                    "Non-replicating viral vector",
                    "Protein subunit",
                    "RNA-based treatments",
                    "RNA-based vaccine",
                    "Repurposed",
                    "Virus Like Particle",
                    "Other",
                    None,
                ]:
                    value = "Other"
            if key == "developmentStage":
                if value.lower() in ["preclinical", "pre-clinical"]:
                    value = "Preclinical Phase"
                elif value not in ["Preclinical Phase", "Clinical", "Withdrawn", None]:
                    value = "Other"
            if gen3_field_type == list:
                value = [str(v) for v in value]
            clinical_trial[gen3_field] = value
        return clinical_trial

    def submit_metadata(self):
        """
        Batch submits all records in `self.clinical_trials`
        """
        print("Submitting clinical_trial data")
        for clinical_trial in self.clinical_trials:
            self.metadata_helper.add_record_to_submit(clinical_trial)
        self.metadata_helper.batch_submit_records()
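
# `MAP_FIELDS` is defined elsewhere in this module. Its usage in `parse_node`
# implies a { <source JSON key>: (<Gen3 field name>, <expected Python type>) }
# shape; the entries below are hypothetical examples for illustration only:
#
# MAP_FIELDS = {
#     "fdaApproved": ("fda_regulated_drug_product", str),
#     "customClinicalPhase": ("phase", str),
#     "technology": ("technology", str),
#     "developmentStage": ("development_stage", str),
# }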