Example #1
class JHU_COUNTRY_CODES(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.program_name = "open"
        self.project_code = "JHU"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def files_to_submissions(self):
        codes_dict = get_codes_dictionary()
        locations = self.get_existing_locations()
        for location in locations:
            codes = get_codes_for_country_name(codes_dict,
                                               location["country_region"])

            # do not update the record if it already has the codes
            if location["iso2"] == codes["iso2"] and location["iso3"] == codes[
                    "iso3"]:
                continue

            record = {k: v for k, v in location.items() if v is not None}
            record.update({
                "type": "summary_location",
                "projects": [{
                    "code": self.project_code
                }],
                "iso2": codes["iso2"],
                "iso3": codes["iso3"],
            })
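            # Illustrative record after the update (the submitter_id and
            # country values are hypothetical):
            # {
            #     "submitter_id": "summary_location_united_kingdom",
            #     "country_region": "United Kingdom",
            #     "type": "summary_location",
            #     "projects": [{"code": "JHU"}],
            #     "iso2": "GB",
            #     "iso3": "GBR",
            # }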
            self.metadata_helper.add_record_to_submit(record)

    def submit_metadata(self):
        self.metadata_helper.batch_submit_records()

    def get_existing_locations(self):
        print("Getting summary_location data from Peregrine")
        query_string = ('{ summary_location (first: 0, project_id: "' +
                        self.program_name + "-" + self.project_code +
                        '") { submitter_id, country_region, iso2, iso3 } }')
        query_res = self.metadata_helper.query_peregrine(query_string)
        return [location for location in query_res["data"]["summary_location"]]
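

# A minimal driver sketch, assuming this ETL is run by instantiating it and
# calling files_to_submissions() followed by submit_metadata(); the URL,
# token and bucket values below are placeholders.
if __name__ == "__main__":
    etl = JHU_COUNTRY_CODES(
        base_url="https://example-commons.org",  # placeholder
        access_token="<access token>",           # placeholder
        s3_bucket="s3://example-bucket",         # placeholder
    )
    etl.files_to_submissions()  # queue records whose ISO codes are missing
    etl.submit_metadata()       # batch-submit the queued records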
Example #2
class NCBI_MANIFEST(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.manifest_bucket = "sra-pub-sars-cov2"
        self.sra_src_manifest = "sra-src/Manifest"
        self.program_name = "open"
        self.project_code = "ncbi-covid-19"
        self.token = access_token
        self.last_submission_identifier = None

        self.file_helper = AsyncFileHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def read_ncbi_manifest(self, key):
        """read the manifest"""
        tries = 0
        last_row_num = 0
        # defined before the retry loop so the except block below can read
        # row_num even if a retry fails before any rows are streamed
        row_num = 0
        while tries < MAX_RETRIES:
            try:
                s3 = boto3.resource("s3",
                                    config=Config(signature_version=UNSIGNED))
                s3_object = s3.Object(self.manifest_bucket, key)
                line_stream = codecs.getreader("utf-8")
                row_num = 0
                for line in line_stream(s3_object.get()["Body"]):
                    row_num = row_num + 1
                    if row_num < last_row_num:
                        continue
                    if row_num % 1000 == 0:
                        print(f"Processed {row_num} rows of {key}")
                    words = line.split("\t")
                    guid = conform_data_format(words[0].strip(), "guid")
                    size = int(conform_data_format(words[2].strip(), "size"))
                    md5 = conform_data_format(words[3].strip(), "md5")
                    authz = f"/programs/{self.program_name}/project/{self.project_code}"
                    url = conform_data_format(words[5].strip(), "url")
                    release_date = parse(
                        re.sub(r":[0-9]{3}", "", words[6].strip()))
                    yield guid, size, md5, authz, url, release_date
                break
            except Exception as e:
                print(f"Can not stream {key}. Retrying...")
                time.sleep(30)
                tries += 1
                last_row_num = row_num
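
    # Columns of the tab-separated manifest rows consumed above (other
    # columns are ignored by this parser):
    #   words[0] -> GUID
    #   words[2] -> file size
    #   words[3] -> md5 checksum
    #   words[5] -> file URL
    #   words[6] -> release timestamp (a trailing ":NNN" fragment is stripped
    #               before parsing)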

    def submit_metadata(self):
        start = time.strftime("%X")

        loop = asyncio.get_event_loop()
        try:
            loop.run_until_complete(
                asyncio.gather(self.index_manifest(self.sra_src_manifest)))
            future = AsyncFileHelper.close_session()
            if future:
                loop.run_until_complete(asyncio.gather(future))

        finally:
            loop.close()
        end = time.strftime("%X")
        print(f"Running time: From {start} to {end}")

    async def index_manifest(self, manifest):
        query_string = ('{ project (first: 0, dbgap_accession_number: "' +
                        self.project_code +
                        '") { last_submission_identifier } }')
        try:
            response = self.metadata_helper.query_peregrine(query_string)
            self.last_submission_identifier = parse(
                response["data"]["project"][0]["last_submission_identifier"])
        except Exception as ex:
            self.last_submission_identifier = None

        now = datetime.datetime.now()
        last_submission_date_time = now.strftime("%m/%d/%Y, %H:%M:%S")

        for (guid, size, md5, authz, url,
             release_date) in self.read_ncbi_manifest(manifest):
            if (not self.last_submission_identifier
                    or release_date > self.last_submission_identifier):
                filename = url.split("/")[-1]
                retrying = True

                while retrying:
                    try:
                        did, _, _, _, _, _ = await self.file_helper.async_find_by_name(
                            filename)
                        retrying = False
                    except Exception as e:
                        print(
                            f"ERROR: Fail to query indexd for {filename}. Detail {e}. Retrying..."
                        )
                        await asyncio.sleep(5)

                if did:
                    print(f"{filename} was already indexed")
                    continue

                print(f"start to index {filename}")
                retries = 0
                while retries < MAX_RETRIES:
                    try:
                        await self.file_helper.async_index_record(
                            guid, size, filename, url, authz, md5)
                        break
                    except Exception as e:
                        retries += 1
                        print(
                            f"ERROR: Fail to create new indexd record for {guid}. Detail {e}. Retrying..."
                        )
                        await asyncio.sleep(5)

        headers = {
            "content-type": "application/json",
            "Authorization": f"Bearer {self.access_token}",
        }
        record = {
            "code": self.project_code,
            "dbgap_accession_number": self.project_code,
            "last_submission_identifier": last_submission_date_time,
        }
        res = requests.put(
            "{}/api/v0/submission/{}".format(self.base_url, self.program_name),
            headers=headers,
            data=json.dumps(record),
        )
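        # For illustration, with the defaults above this renders to a PUT on
        # {base_url}/api/v0/submission/open with a body like the following
        # (the timestamp is a hypothetical value):
        #   {
        #     "code": "ncbi-covid-19",
        #     "dbgap_accession_number": "ncbi-covid-19",
        #     "last_submission_identifier": "06/01/2021, 12:00:00"
        #   }

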
class IDPH_VACCINE_TO_S3(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "IDPH-Vaccine"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.s3_client = boto3.client("s3")

    def get_existing_data_from_s3(self):
        s3_path = os.path.join(MAP_DATA_FOLDER,
                               VACCINES_BY_COUNTY_BY_DATE_FILENAME)
        bucket = self.s3_bucket.split("s3://")[-1]
        try:
            res = self.s3_client.get_object(Bucket=bucket, Key=s3_path)
            return json.loads(res["Body"].read().decode("utf-8"))
        except Exception as e:
            print(
                f"WARNING: Unable to get existing data from S3. Will get all data from Peregrine instead. Details: {e}"
            )

        # return default empty value
        # IL only - this dataset is only for IL anyway
        return {"il_county_list": {}, "last_updated": None}

    def get_new_data_from_peregrine(self, days_since_last_update):
        """
        Query Peregrine for the vaccine data that is not yet in S3.
        Filter on country=US and state=IL to be safe, even though the
        IDPH-Vaccine project only contains IL data anyway.
        """
        first = str(days_since_last_update or 0)  # first=0 means all data
        query_string = ('{ summary_location (first: 0, project_id: "' +
                        self.program_name + "-" + self.project_code +
                        '", country_region: "US", province_state: "IL") {' +
                        "county, summary_clinicals (first: " + first +
                        ', order_by_desc: "date") {' +
                        "date, vaccine_persons_fully_vaccinated } } }")
        try:
            response = self.metadata_helper.query_peregrine(query_string)
            return response["data"]
        except Exception as ex:
            print(f"Unable to query peregrine. Detail {ex}")
            raise
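
    # For reference, with the defaults above and days_since_last_update=7 the
    # rendered query is:
    # { summary_location (first: 0, project_id: "open-IDPH-Vaccine",
    #     country_region: "US", province_state: "IL") {
    #   county, summary_clinicals (first: 7, order_by_desc: "date") {
    #     date, vaccine_persons_fully_vaccinated } } }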

    def get_last_updated_date(self, summary_clinicals):
        """
        Return the most recent date found in a list of `summary_clinical` records.
        """
        last_updated_date = None
        for record in summary_clinicals:
            # remove time from some early dates in the dataset
            date = record["date"].split("T")[0]
            if not last_updated_date or datetime.strptime(
                    date, "%Y-%m-%d") > datetime.strptime(
                        last_updated_date, "%Y-%m-%d"):
                last_updated_date = date
        print(f"Dataset last updated date: {last_updated_date}")
        return last_updated_date
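
    # Example: for records dated "2021-02-27T00:00:00", "2021-03-01" and
    # "2021-02-28", this returns "2021-03-01".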

    def format_result(self, county_to_fips_dict, existing_data, new_data):
        """
        Parse new data from Peregrine and add it to the existing data.

        Args:
        - `county_to_fips_dict`
        - `existing_data` format: see data file format at the top of this file
        - `new_data`: data from Peregrine, in format:
            {
                summary_location: [
                    {
                        county: <county name>,
                        summary_clinicals: [
                            {
                                date: <str>,
                                vaccine_persons_fully_vaccinated: <int>
                            },
                            ...
                        ]
                    },
                    ...
                ]
            }
        """
        # the date at which this data was last updated
        existing_data["last_updated"] = self.get_last_updated_date(
            new_data["summary_location"][0]["summary_clinicals"])

        chicago_data_by_date = None
        for location in new_data["summary_location"]:
            county = location["county"]

            # get the total count
            if county == "Illinois":
                for record in location["summary_clinicals"]:
                    date = record["date"].split("T")[0]
                    if date == existing_data["last_updated"]:
                        existing_data["total"] = record[
                            "vaccine_persons_fully_vaccinated"]
                continue

            fips = county_to_fips_dict.get(county)
            if not fips:
                if county in ["Unknown", "Out Of State", "Chicago"]:
                    # we expect no FIPS for these, use the name as identifier
                    fips = county
                else:
                    raise Exception(
                        f"Uh-oh, did not find FIPS code for county '{county}'")

            data_by_date = {}
            for record in location["summary_clinicals"]:
                # remove time from some early dates in the dataset
                date = record["date"].split("T")[0]
                data_by_date[date] = record["vaccine_persons_fully_vaccinated"]

            if county == "Chicago":
                # the Chicago data are processed later
                chicago_data_by_date = data_by_date
            else:
                if fips in existing_data["il_county_list"]:
                    # merge existing data and new data
                    data_by_date = dict(
                        data_by_date,
                        **existing_data["il_county_list"][fips]["by_date"])
                existing_data["il_county_list"][fips] = {
                    "county": county,
                    "by_date": data_by_date,
                }

        # we don't separate Chicago from Cook county on the frontend,
        # so add the Chicago counts to the Cook county counts.
        for data in existing_data["il_county_list"].values():
            if data["county"] == "Cook":
                for date in data["by_date"]:
                    # ignore dates that are in Chicago data but not Cook data;
                    # counts without the rest of Cook county would look weird
                    if chicago_data_by_date and date in chicago_data_by_date:
                        data["by_date"][date] += chicago_data_by_date[date]
                break

        return existing_data

    def files_to_submissions(self):
        """
        Get the existing vaccine data from S3, query Peregrine for any new
        data, and create an updated JSON file with existing + new data.
        """
        existing_data = self.get_existing_data_from_s3()
        last_updated_date = existing_data["last_updated"]
        if last_updated_date:
            last_updated_date = datetime.strptime(last_updated_date,
                                                  "%Y-%m-%d")
            days_since_last_update = (datetime.now() - last_updated_date).days
            print(
                f"Data in S3 up to {last_updated_date}; querying Peregrine for the last {days_since_last_update} days of data"
            )
        else:
            days_since_last_update = None

        if days_since_last_update == 0:
            print("Zero days since last update: nothing to do")
            return

        new_data = self.get_new_data_from_peregrine(days_since_last_update)
        county_to_fips_dict = get_county_to_fips_dictionary()
        result = self.format_result(county_to_fips_dict, existing_data,
                                    new_data)

        # save to local
        with open(
                os.path.join(CURRENT_DIR, VACCINES_BY_COUNTY_BY_DATE_FILENAME),
                "w") as f:
            f.write(json.dumps(
                result,
                separators=(",", ":"),
            ))

    def submit_metadata(self):
        abs_path = os.path.join(CURRENT_DIR,
                                VACCINES_BY_COUNTY_BY_DATE_FILENAME)
        s3_path = os.path.join(MAP_DATA_FOLDER,
                               VACCINES_BY_COUNTY_BY_DATE_FILENAME)
        print(f"Uploading file to S3 at '{s3_path}'")
        self.s3_client.upload_file(Filename=abs_path,
                                   Bucket=self.s3_bucket,
                                   Key=s3_path)
        os.remove(abs_path)