# NOTE: the imports below are an assumption about this module's dependencies.
# `base`, `MetadataHelper`, `AsyncFileHelper`, the `conform_data_format` /
# `get_codes_dictionary` / `get_codes_for_country_name` /
# `get_county_to_fips_dictionary` helpers, and the constants (MAX_RETRIES,
# MAP_DATA_FOLDER, CURRENT_DIR, VACCINES_BY_COUNTY_BY_DATE_FILENAME) are
# assumed to come from the surrounding repository.
import asyncio
import codecs
import json
import os
import re
import time
from datetime import datetime

import boto3
import requests
from botocore import UNSIGNED
from botocore.config import Config
from dateutil.parser import parse


class JHU_COUNTRY_CODES(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.program_name = "open"
        self.project_code = "JHU"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def files_to_submissions(self):
        codes_dict = get_codes_dictionary()
        locations = self.get_existing_locations()
        for location in locations:
            codes = get_codes_for_country_name(
                codes_dict, location["country_region"]
            )

            # do not update the record if it already has the codes
            if (
                location["iso2"] == codes["iso2"]
                and location["iso3"] == codes["iso3"]
            ):
                continue

            record = {k: v for k, v in location.items() if v is not None}
            record.update(
                {
                    "type": "summary_location",
                    "projects": [{"code": self.project_code}],
                    "iso2": codes["iso2"],
                    "iso3": codes["iso3"],
                }
            )
            self.metadata_helper.add_record_to_submit(record)

    def submit_metadata(self):
        self.metadata_helper.batch_submit_records()

    def get_existing_locations(self):
        print("Getting summary_location data from Peregrine")
        query_string = (
            '{ summary_location (first: 0, project_id: "'
            + self.program_name
            + "-"
            + self.project_code
            + '") { submitter_id, country_region, iso2, iso3 } }'
        )
        query_res = self.metadata_helper.query_peregrine(query_string)
        return query_res["data"]["summary_location"]
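
# --- Usage sketch (illustrative, not part of the ETL) ---
# A minimal sketch of how this ETL could be driven, assuming only the
# BaseETL entry points used above (files_to_submissions, then
# submit_metadata). The base_url, token, and bucket values are
# hypothetical placeholders, not real endpoints.
#
#   etl = JHU_COUNTRY_CODES(
#       base_url="https://example-commons.org",
#       access_token="<access token>",
#       s3_bucket="s3://example-bucket",
#   )
#   etl.files_to_submissions()  # queue summary_location records missing ISO codes
#   etl.submit_metadata()       # batch-submit the queued records
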
class NCBI_MANIFEST(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.manifest_bucket = "sra-pub-sars-cov2"
        self.sra_src_manifest = "sra-src/Manifest"
        self.program_name = "open"
        self.project_code = "ncbi-covid-19"
        self.token = access_token
        self.last_submission_identifier = None
        self.file_helper = AsyncFileHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def read_ncbi_manifest(self, key):
        """
        Stream the tab-separated manifest from the public S3 bucket and
        yield (guid, size, md5, authz, url, release_date) tuples. On a
        streaming error, retry and resume from the last row reached.
        """
        tries = 0
        last_row_num = 0
        row_num = 0
        while tries < MAX_RETRIES:
            try:
                s3 = boto3.resource(
                    "s3", config=Config(signature_version=UNSIGNED)
                )
                s3_object = s3.Object(self.manifest_bucket, key)
                line_stream = codecs.getreader("utf-8")
                row_num = 0
                for line in line_stream(s3_object.get()["Body"]):
                    row_num += 1
                    # skip rows already yielded before a retry
                    if row_num < last_row_num:
                        continue
                    if row_num % 1000 == 0:
                        print(f"Processed {row_num} rows of {key}")
                    words = line.split("\t")
                    guid = conform_data_format(words[0].strip(), "guid")
                    size = int(conform_data_format(words[2].strip(), "size"))
                    md5 = conform_data_format(words[3].strip(), "md5")
                    authz = f"/programs/{self.program_name}/project/{self.project_code}"
                    url = conform_data_format(words[5].strip(), "url")
                    # strip the ":NNN" milliseconds so dateutil can parse
                    release_date = parse(
                        re.sub(r":[0-9]{3}", "", words[6].strip())
                    )
                    yield guid, size, md5, authz, url, release_date
                break
            except Exception as e:
                print(f"Cannot stream {key}. Detail: {e}. Retrying...")
                time.sleep(30)
                tries += 1
                last_row_num = row_num

    def submit_metadata(self):
        start = time.strftime("%X")
        loop = asyncio.get_event_loop()
        try:
            loop.run_until_complete(
                asyncio.gather(self.index_manifest(self.sra_src_manifest))
            )
            future = AsyncFileHelper.close_session()
            if future:
                loop.run_until_complete(asyncio.gather(future))
        finally:
            loop.close()

        end = time.strftime("%X")
        print(f"Running time: From {start} to {end}")

    async def index_manifest(self, manifest):
        query_string = (
            '{ project (first: 0, dbgap_accession_number: "'
            + self.project_code
            + '") { last_submission_identifier } }'
        )
        try:
            response = self.metadata_helper.query_peregrine(query_string)
            self.last_submission_identifier = parse(
                response["data"]["project"][0]["last_submission_identifier"]
            )
        except Exception:
            self.last_submission_identifier = None

        now = datetime.now()
        last_submission_date_time = now.strftime("%m/%d/%Y, %H:%M:%S")

        for guid, size, md5, authz, url, release_date in self.read_ncbi_manifest(
            manifest
        ):
            # only index files released since the last submission
            if (
                not self.last_submission_identifier
                or release_date > self.last_submission_identifier
            ):
                filename = url.split("/")[-1]
                retrying = True
                while retrying:
                    try:
                        did, _, _, _, _, _ = await self.file_helper.async_find_by_name(
                            filename
                        )
                        retrying = False
                    except Exception as e:
                        print(
                            f"ERROR: Fail to query indexd for {filename}. Detail {e}. Retrying..."
                        )
                        await asyncio.sleep(5)

                if did:
                    print(f"{filename} was already indexed")
                    continue

                print(f"start to index {filename}")
                retries = 0
                while retries < MAX_RETRIES:
                    try:
                        await self.file_helper.async_index_record(
                            guid, size, filename, url, authz, md5
                        )
                        break
                    except Exception as e:
                        retries += 1
                        print(
                            f"ERROR: Fail to create new indexd record for {guid}. Detail {e}. Retrying..."
                        )
                        await asyncio.sleep(5)

        # record when this run happened so the next run can skip
        # files released before this submission
        headers = {
            "content-type": "application/json",
            "Authorization": f"Bearer {self.access_token}",
        }
        record = {
            "code": self.project_code,
            "dbgap_accession_number": self.project_code,
            "last_submission_identifier": last_submission_date_time,
        }
        requests.put(
            "{}/api/v0/submission/{}".format(self.base_url, self.program_name),
            headers=headers,
            data=json.dumps(record),
        )
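
# --- Manifest row layout (illustrative) ---
# A sketch of the tab-separated row format read_ncbi_manifest() above
# assumes, reconstructed only from the column indices the parser reads
# (0=guid, 2=size, 3=md5, 5=url, 6=release date, whose ":NNN"
# millisecond chunk is stripped before dateutil parsing). The sample
# values and the meaning of the unused columns are hypothetical.
#
#   sample_row = "\t".join([
#       "<guid>",           # words[0]: GUID for the indexd record
#       "<unused>",         # words[1]: not read by the parser
#       "1024",             # words[2]: file size in bytes
#       "<md5>",            # words[3]: md5 checksum
#       "<unused>",         # words[4]: not read by the parser
#       "<file url>",       # words[5]: URL; basename is used as filename
#       "2021-01-01T00:00:00:000Z",  # words[6]: release date
#   ])
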
class IDPH_VACCINE_TO_S3(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.program_name = "open"
        self.project_code = "IDPH-Vaccine"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.s3_client = boto3.client("s3")

    def get_existing_data_from_s3(self):
        s3_path = os.path.join(MAP_DATA_FOLDER, VACCINES_BY_COUNTY_BY_DATE_FILENAME)
        bucket = self.s3_bucket.split("s3://")[-1]
        try:
            res = self.s3_client.get_object(Bucket=bucket, Key=s3_path)
            return json.loads(res["Body"].read().decode("utf-8"))
        except Exception as e:
            print(
                f"WARNING: Unable to get existing data from S3. Will get all data from Peregrine instead. Details: {e}"
            )
            # return default empty value
            # IL only - this dataset is only for IL anyway
            return {"il_county_list": {}, "last_updated": None}

    def get_new_data_from_peregrine(self, days_since_last_update):
        """
        Query Peregrine for the vaccine data that is not yet in S3.
        Filter on country=US and state=IL to be safe, even though the
        IDPH-Vaccine project only contains IL data anyway.
        """
        first = str(days_since_last_update or 0)  # first=0 means all data
        query_string = (
            '{ summary_location (first: 0, project_id: "'
            + self.program_name
            + "-"
            + self.project_code
            + '", country_region: "US", province_state: "IL") {'
            + "county, summary_clinicals (first: "
            + first
            + ', order_by_desc: "date") {'
            + "date, vaccine_persons_fully_vaccinated } } }"
        )
        try:
            response = self.metadata_helper.query_peregrine(query_string)
            return response["data"]
        except Exception as ex:
            print(f"Unable to query peregrine. Detail {ex}")
            raise

    def get_last_updated_date(self, summary_clinicals):
        """
        Return the most recent date found in a list of `summary_clinical` records.
        """
        last_updated_date = None
        for record in summary_clinicals:
            # remove time from some early dates in the dataset
            date = record["date"].split("T")[0]
            if not last_updated_date or datetime.strptime(
                date, "%Y-%m-%d"
            ) > datetime.strptime(last_updated_date, "%Y-%m-%d"):
                last_updated_date = date
        print(f"Dataset last updated date: {last_updated_date}")
        return last_updated_date

    def format_result(self, county_to_fips_dict, existing_data, new_data):
        """
        Parse new data from Peregrine and add it to the existing data.

        Args:
        - `county_to_fips_dict`
        - `existing_data` format: see data file format at the top of this file
        - `new_data`: data from Peregrine, in format:
            {
                summary_location: [
                    {
                        county: <county name>,
                        summary_clinicals: [
                            {
                                date: <str>,
                                vaccine_persons_fully_vaccinated: <int>
                            },
                            ...
                        ]
                    },
                    ...
                ]
            }
        """
        # the date at which this data was last updated
        existing_data["last_updated"] = self.get_last_updated_date(
            new_data["summary_location"][0]["summary_clinicals"]
        )

        chicago_data_by_date = None
        for location in new_data["summary_location"]:
            county = location["county"]

            # get the total count
            if county == "Illinois":
                for record in location["summary_clinicals"]:
                    date = record["date"].split("T")[0]
                    if date == existing_data["last_updated"]:
                        existing_data["total"] = record[
                            "vaccine_persons_fully_vaccinated"
                        ]
                continue

            fips = county_to_fips_dict.get(county)
            if not fips:
                if county in ["Unknown", "Out Of State", "Chicago"]:
                    # we expect no FIPS for these, use the name as identifier
                    fips = county
                else:
                    raise Exception(
                        f"Uh-oh, did not find FIPS code for county '{county}'"
                    )

            data_by_date = {}
            for record in location["summary_clinicals"]:
                # remove time from some early dates in the dataset
                date = record["date"].split("T")[0]
                data_by_date[date] = record["vaccine_persons_fully_vaccinated"]

            if county == "Chicago":
                # the Chicago data are processed later
                chicago_data_by_date = data_by_date
            else:
                if fips in existing_data["il_county_list"]:
                    # merge existing data and new data
                    data_by_date = dict(
                        data_by_date,
                        **existing_data["il_county_list"][fips]["by_date"],
                    )
                existing_data["il_county_list"][fips] = {
                    "county": county,
                    "by_date": data_by_date,
                }

        # we don't separate Chicago from Cook county on the frontend,
        # so add the Chicago counts to the Cook county counts.
        for data in existing_data["il_county_list"].values():
            if data["county"] == "Cook":
                for date in data["by_date"]:
                    # ignore dates that are in Chicago data but not Cook data;
                    # counts without the rest of Cook county would look weird
                    if date in chicago_data_by_date:
                        data["by_date"][date] += chicago_data_by_date[date]
                break

        return existing_data

    def files_to_submissions(self):
        """
        Get the existing vaccine data from S3, query Peregrine for any new
        data, and create an updated JSON file with existing + new data.
        """
        existing_data = self.get_existing_data_from_s3()

        last_updated_date = existing_data["last_updated"]
        if last_updated_date:
            last_updated_date = datetime.strptime(last_updated_date, "%Y-%m-%d")
            days_since_last_update = (datetime.now() - last_updated_date).days
            print(
                f"Data in S3 up to {last_updated_date}; querying Peregrine for the last {days_since_last_update} days of data"
            )
        else:
            days_since_last_update = None

        if days_since_last_update == 0:
            print("Zero days since last update: nothing to do")
            return

        new_data = self.get_new_data_from_peregrine(days_since_last_update)
        county_to_fips_dict = get_county_to_fips_dictionary()
        result = self.format_result(county_to_fips_dict, existing_data, new_data)

        # save to local
        with open(
            os.path.join(CURRENT_DIR, VACCINES_BY_COUNTY_BY_DATE_FILENAME), "w"
        ) as f:
            f.write(json.dumps(result, separators=(",", ":")))

    def submit_metadata(self):
        abs_path = os.path.join(CURRENT_DIR, VACCINES_BY_COUNTY_BY_DATE_FILENAME)
        s3_path = os.path.join(MAP_DATA_FOLDER, VACCINES_BY_COUNTY_BY_DATE_FILENAME)
        print(f"Uploading file to S3 at '{s3_path}'")
        self.s3_client.upload_file(
            Filename=abs_path, Bucket=self.s3_bucket, Key=s3_path
        )
        os.remove(abs_path)
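
# --- S3 data file shape (illustrative) ---
# A sketch of the JSON document read by get_existing_data_from_s3() and
# written by files_to_submissions(), reconstructed from how
# format_result() populates it. The values are hypothetical; keys of
# "il_county_list" are FIPS codes, or the county name for "Unknown" and
# "Out Of State" (Chicago counts are folded into Cook, not stored).
#
#   {
#       "last_updated": "2021-03-01",
#       "total": 123456,            # statewide ("Illinois") count
#       "il_county_list": {
#           "17031": {
#               "county": "Cook",
#               "by_date": {"2021-03-01": 1234}
#           }
#       }
#   }
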