Example #1
    def __init__(self, base_url, access_token):
        super().__init__(base_url, access_token)
        self.program_name = "open"
        self.project_code = "JHU"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
Example #2
    def __init__(self, base_url, access_token):
        super().__init__(base_url, access_token)

        self.program_name = "open"
        self.project_code = "IDPH"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.county_dict = {}

        self.summary_locations = []
        self.summary_reports = []
Example #3
    def __init__(self, base_url, access_token):
        super().__init__(base_url, access_token)
        self.summary_locations = []
        self.summary_reports = []

        self.program_name = "open"
        self.project_code = "CTP"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.expected_csv_headers = [
            "date",
            "state",
            "positive",
            "negative",
            "pending",
            "hospitalizedCurrently",
            "hospitalizedCumulative",
            "inIcuCurrently",
            "inIcuCumulative",
            "onVentilatorCurrently",
            "onVentilatorCumulative",
            "recovered",
            "hash",
            "dateChecked",
            "death",
            "hospitalized",
            "total",
            "totalTestResults",
            "posNeg",
            "fips",
            "deathIncrease",
            "hospitalizedIncrease",
            "negativeIncrease",
            "positiveIncrease",
            "totalTestResultsIncrease",
        ]

        self.header_to_column = {
            k: self.expected_csv_headers.index(k)
            for k in self.expected_csv_headers
        }
Example #4
    def __init__(self, base_url, access_token):
        super().__init__(base_url, access_token)
        script = os.path.splitext(os.path.basename(__file__))[0]
        # Get all constants from YAML, including program_name, project_code
        with open('{}.yaml'.format(script)) as f:
            config = yaml.load(f, Loader=yaml.FullLoader)
        self.email = config['email']
        self.min_len = config['min_len']
        self.max_len = config['max_len']
        self.split = config['split']
        self.recurse = config['recurse']
        self.verbose = config['verbose']
        self.taxid = config['taxid']
        self.seq_format = config['seq_format']
        self.retmax = config['retmax']
        self.program_name = config['program_name']
        self.project_code = config['project_code']

        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
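
The constructor above expects a YAML file named after the script and containing the keys it reads. A minimal sketch of such a config, loaded the same way (every name and value below is a made-up placeholder, not from the source):

import yaml

# Hypothetical contents of the "<script>.yaml" config this constructor reads.
config_text = """
email: user@example.org
min_len: 25000
max_len: 35000
split: true
recurse: true
verbose: true
taxid: 2697049
seq_format: fasta
retmax: 500
program_name: open
project_code: ncbi
"""

config = yaml.load(config_text, Loader=yaml.FullLoader)
print(config["program_name"], config["project_code"])  # open ncbi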
Example #5
    def __init__(self, base_url, access_token):
        super().__init__(base_url, access_token)
        self.location_data = {}
        self.time_series_data = defaultdict(lambda: defaultdict(dict))
        self.program_name = "open"
        self.project_code = "JHU"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.expected_csv_headers = {
            "global":
            ["Province/State", "Country/Region", "Lat", "Long", "1/22/20"],
            "US_counties": {
                "confirmed": [
                    "UID",
                    "iso2",
                    "iso3",
                    "code3",
                    "FIPS",
                    "Admin2",
                    "Province_State",
                    "Country_Region",
                    "Lat",
                    "Long_",
                    "Combined_Key",
                    "1/22/20",
                ],
                "deaths": [
                    "UID",
                    "iso2",
                    "iso3",
                    "code3",
                    "FIPS",
                    "Admin2",
                    "Province_State",
                    "Country_Region",
                    "Lat",
                    "Long_",
                    "Combined_Key",
                    "Population",  # TODO use this
                    "1/22/20",
                ],
            },
        }
        self.header_to_column = {
            "global": {
                "province": 0,
                "country": 1,
                "latitude": 2,
                "longitude": 3,
                "dates_start": 4,
            },
            "US_counties": {
                "confirmed": {
                    "iso2": 1,
                    "iso3": 2,
                    "code3": 3,
                    "FIPS": 4,
                    "county": 5,
                    "province": 6,
                    "country": 7,
                    "latitude": 8,
                    "longitude": 9,
                    "dates_start": 11,
                },
                "deaths": {
                    "iso2": 1,
                    "iso3": 2,
                    "code3": 3,
                    "FIPS": 4,
                    "county": 5,
                    "province": 6,
                    "country": 7,
                    "latitude": 8,
                    "longitude": 9,
                    "dates_start": 12,
                },
            },
        }
        self.existing_data = self.metadata_helper.get_existing_data_jhu()
Example #6
class JHU(base.BaseETL):
    def __init__(self, base_url, access_token):
        super().__init__(base_url, access_token)
        self.location_data = {}
        self.time_series_data = defaultdict(lambda: defaultdict(dict))
        self.program_name = "open"
        self.project_code = "JHU"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.expected_csv_headers = {
            "global":
            ["Province/State", "Country/Region", "Lat", "Long", "1/22/20"],
            "US_counties": {
                "confirmed": [
                    "UID",
                    "iso2",
                    "iso3",
                    "code3",
                    "FIPS",
                    "Admin2",
                    "Province_State",
                    "Country_Region",
                    "Lat",
                    "Long_",
                    "Combined_Key",
                    "1/22/20",
                ],
                "deaths": [
                    "UID",
                    "iso2",
                    "iso3",
                    "code3",
                    "FIPS",
                    "Admin2",
                    "Province_State",
                    "Country_Region",
                    "Lat",
                    "Long_",
                    "Combined_Key",
                    "Population",  # TODO use this
                    "1/22/20",
                ],
            },
        }
        self.header_to_column = {
            "global": {
                "province": 0,
                "country": 1,
                "latitude": 2,
                "longitude": 3,
                "dates_start": 4,
            },
            "US_counties": {
                "confirmed": {
                    "iso2": 1,
                    "iso3": 2,
                    "code3": 3,
                    "FIPS": 4,
                    "county": 5,
                    "province": 6,
                    "country": 7,
                    "latitude": 8,
                    "longitude": 9,
                    "dates_start": 11,
                },
                "deaths": {
                    "iso2": 1,
                    "iso3": 2,
                    "code3": 3,
                    "FIPS": 4,
                    "county": 5,
                    "province": 6,
                    "country": 7,
                    "latitude": 8,
                    "longitude": 9,
                    "dates_start": 12,
                },
            },
        }
        self.existing_data = self.metadata_helper.get_existing_data_jhu()

    def files_to_submissions(self):
        """
        Reads CSV files and converts the data to Sheepdog records
        """
        # self.metadata_helper.delete_tmp()  # TODO remove
        # return
        urls = {
            "global": {
                "confirmed":
                "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
                "deaths":
                "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv",
                "recovered":
                "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv",
                "testing":
                "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_testing_global.csv",
            },
            "US_counties": {
                "confirmed":
                "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv",
                "deaths":
                "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv",
            },
        }

        for file_type in ["global", "US_counties"]:
            for data_type, url in urls[file_type].items():
                self.parse_file(file_type, data_type, url)

    def parse_file(self, file_type, data_type, url):
        """
        Converts a CSV file to data we can submit via Sheepdog. Stores the
        records to submit in `self.location_data` and `self.time_series_data`.
        Ignores any records that are already in Sheepdog (relies on unique
        `submitter_id` to check)

        Args:
            file_type (str): type of the file - one of ["global", "US_counties"]
            data_type (str): type of the data in this file - one
                of ["confirmed", "deaths", "recovered"]
            url (str): URL at which the CSV file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')

            headers = next(reader)

            if headers[0] == "404: Not Found":
                print("  Unable to get file contents, received {}.".format(
                    headers))
                return

            expected_h = self.expected_csv_headers[file_type]
            if isinstance(expected_h, dict):
                expected_h = expected_h[data_type]
            obtained_h = headers[:len(expected_h)]
            assert (
                obtained_h == expected_h
            ), "CSV headers have changed (expected {}, got {}). We may need to update the ETL code".format(
                expected_h, obtained_h)

            for row in reader:
                location, date_to_value = self.parse_row(
                    file_type, data_type, headers, row)
                if not location:
                    # We are using US data by state instead of global
                    continue

                location_submitter_id = location["submitter_id"]
                if (location_submitter_id not in self.location_data
                        # do not re-submit location data that already exist
                        and location_submitter_id not in self.existing_data):
                    self.location_data[location_submitter_id] = location

                for date, value in date_to_value.items():
                    date_submitter_id = format_report_submitter_id(
                        location_submitter_id, date)
                    # do not re-submit time_series data that already exist
                    if date_submitter_id not in self.existing_data.get(
                            location_submitter_id, []):
                        self.time_series_data[location_submitter_id][date][
                            data_type] = value

    def parse_row(self, file_type, data_type, headers, row):
        """
        Converts a row of a CSV file to data we can submit via Sheepdog

        Args:
            file_type (str): type of the file - one of ["global", "US_counties"]
            data_type (str): type of the data in this file - one
                of ["confirmed", "deaths", "recovered"]
            headers (list(str)): CSV file headers (first row of the file)
            row (list(str)): row of data

        Returns:
            (dict, dict) tuple:
                - location data, in a format ready to be submitted to Sheepdog
                - { "date1": <value>, "date2": <value> } from the row data
        """
        header_to_column = self.header_to_column[file_type]
        if "country" not in header_to_column:
            header_to_column = header_to_column[data_type]

        country = row[header_to_column["country"]]
        province = row[header_to_column["province"]]
        latitude = row[header_to_column["latitude"]]
        longitude = row[header_to_column["longitude"]]

        if country == "US" and province == "":
            # We are using US data by state instead of global
            return None, None

        if int(float(latitude)) == 0 and int(float(longitude)) == 0:
            # Data with "Out of <state>" or "Unassigned" county value have
            # unknown coordinates of (0,0). We don't submit them for now
            return None, None

        submitter_id = format_location_submitter_id(country, province)
        location = {
            "country_region": country,
            "latitude": latitude,
            "longitude": longitude,
            "projects": [{
                "code": self.project_code
            }],
        }
        if province:
            location["province_state"] = province
        if file_type == "US_counties":
            county = row[header_to_column["county"]]
            iso2 = row[header_to_column["iso2"]]
            iso3 = row[header_to_column["iso3"]]
            code3 = row[header_to_column["code3"]]
            fips = row[header_to_column["FIPS"]]
            if county:
                location["county"] = county
                submitter_id = format_location_submitter_id(
                    country, province, county)
            if iso2:
                location["iso2"] = iso2
            if iso3:
                location["iso3"] = iso3
            if code3:
                location["code3"] = int(code3)
            if fips:
                location["FIPS"] = int(float(fips))
        location["submitter_id"] = submitter_id

        date_to_value = {}
        dates_start = header_to_column["dates_start"]
        for i in range(dates_start, len(headers)):
            date = headers[i]
            date = get_unified_date_format(date)

            if row[i] == "":  # ignore empty values
                continue
            try:
                val = int(row[i])
            except ValueError:
                print(
                    'Unable to convert {} to int for "{}", "{}" at {}'.format(
                        row[i], province, country, date))
                raise
            date_to_value[date] = val

        return location, date_to_value

    def submit_metadata(self):
        """
        Converts the data in `self.time_series_data` to Sheepdog records.
        `self.location_data` already contains Sheepdog records. Batch submits
        all records in `self.location_data` and `self.time_series_data`.
        """

        print("Submitting summary_location data")
        for location in self.location_data.values():
            record = {"type": "summary_location"}
            record.update(location)
            self.metadata_helper.add_record_to_submit(record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_report data")
        for location_submitter_id, time_series in self.time_series_data.items(
        ):
            for date, data in time_series.items():
                submitter_id = format_report_submitter_id(
                    location_submitter_id, date)
                record = {
                    "type": "summary_report",
                    "submitter_id": submitter_id,
                    "summary_locations": [{
                        "submitter_id": location_submitter_id
                    }],
                    "date": format_time_series_date(date),
                }
                for data_type, value in data.items():
                    record[data_type] = value
                self.metadata_helper.add_record_to_submit(record)
        self.metadata_helper.batch_submit_records()
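
Like the other ETL classes shown here, JHU is driven in two steps: files_to_submissions() builds the records, then submit_metadata() batch-submits them. A minimal driver sketch, with placeholder credentials (the variable names are assumptions, not from the source):

# Minimal driver sketch; BASE_URL and ACCESS_TOKEN are placeholders.
BASE_URL = "https://example-data-commons.org"
ACCESS_TOKEN = "<access token>"

etl = JHU(base_url=BASE_URL, access_token=ACCESS_TOKEN)
etl.files_to_submissions()  # download the CSVs and build location/time-series records
etl.submit_metadata()       # batch-submit the records through MetadataHelper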
Example #7
class DOWNLOAD_GB_BY_TAXID(base.BaseETL):
    def __init__(self, base_url, access_token):
        super().__init__(base_url, access_token)
        script = os.path.splitext(os.path.basename(__file__))[0]
        # Get all constants from YAML, including program_name, project_code
        with open('{}.yaml'.format(script)) as f:
            config = yaml.load(f, Loader=yaml.FullLoader)
        self.email = config['email']
        self.min_len = config['min_len']
        self.max_len = config['max_len']
        self.split = config['split']
        self.recurse = config['recurse']
        self.verbose = config['verbose']
        self.taxid = config['taxid']
        self.seq_format = config['seq_format']
        self.retmax = config['retmax']
        self.program_name = config['program_name']
        self.project_code = config['project_code']

        # accumulators appended to by search() and efetch() below
        # (not initialized elsewhere in this excerpt)
        self.nt_ids = []
        self.records = []

        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def files_to_submissions(self):
        self.search()
        self.filter()

    def submit_metadata(self):
        self.write()

    def search(self):
        nummatch = re.match(r'^\d+$', str(self.taxid))

        if not nummatch and self.verbose:
            print("String '" + self.taxid + "' is not an NCBI taxon id")
            return

        Entrez.email = self.email

        if self.recurse == True:
            try:
                handle = Entrez.esearch(db="nuccore",
                                        idtype="acc",
                                        retmax=5000,
                                        term="txid{}[Organism:exp]".format(
                                            self.taxid))
                records = Entrez.read(handle)
                handle.close()
            except (RuntimeError) as exception:
                print("Error retrieving sequence ids using Taxonomy id '" +
                      str(self.taxid) + "'" + str(exception))

            for link in records['IdList']:
                self.nt_ids.append(link)
        else:
            try:
                links = Entrez.read(
                    Entrez.elink(dbfrom="taxonomy",
                                 db="nucleotide",
                                 idtype="acc",
                                 id=self.taxid))
            except (RuntimeError) as exception:
                print("Error retrieving sequence ids using Taxonomy id '" +
                      str(self.taxid) + "'" + str(exception))

            if len(links[0]["LinkSetDb"]) == 0:
                print("No sequences found with id " + self.taxid)
                return

            for link in links[0]["LinkSetDb"][0]["Link"]:
                self.nt_ids.append(link["Id"])

        if self.verbose:
            print("Esearch id count for Taxonomy id {0}: {1}".format(
                self.taxid, len(self.nt_ids)))

        self.efetch()

    def efetch(self):
        Entrez.email = self.email
        # Split the list of ids into batches of 'retmax' size for Entrez
        num_chunks = int(len(self.nt_ids) / self.retmax) + 1

        try:
            for id_chunk in numpy.array_split(numpy.array(self.nt_ids),
                                              num_chunks):
                if self.verbose:
                    print("Going to download records: {}".format(id_chunk))
                handle = Entrez.efetch(db="nucleotide",
                                       rettype=self.seq_format,
                                       retmode="text",
                                       id=','.join(id_chunk))
                # Creating the SeqRecord objects here makes filter() easier
                self.records = itertools.chain(
                    self.records, SeqIO.parse(handle, self.seq_format))
        except (RuntimeError) as exception:
            print("Error retrieving sequences using id '" + str(self.taxid) +
                  "':" + str(exception))

    def filter(self):
        # "genomes" are arbitrarily defined as sequences > min_len
        if self.min_len:
            filtered = []
            for record in self.records:
                if len(record) >= self.min_len:
                    filtered.append(record)
            self.records = filtered

    def write(self):
        if self.split:
            for record in self.records:
                seqfile = record.name + '.' + self.seq_format
                #SeqIO.write(record, seqfile, self.seq_format)
                self.metadata_helper.add_record_to_submit(seqfile)
            self.metadata_helper.batch_submit_records()
        else:
            seqfile = 'taxid-' + str(self.taxid) + '.' + self.seq_format
            #SeqIO.write(self.records, seqfile, self.seq_format)
            self.metadata_helper.add_record_to_submit(seqfile)
            self.metadata_helper.batch_submit_records()
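
efetch() above batches the accession ids into roughly retmax-sized chunks with numpy.array_split before each Entrez call. A standalone sketch of just that batching step (the ids and the retmax value are made up):

import numpy

nt_ids = ["ACC%04d" % i for i in range(10)]  # made-up accession ids
retmax = 4

num_chunks = int(len(nt_ids) / retmax) + 1   # same formula as efetch() above
for id_chunk in numpy.array_split(numpy.array(nt_ids), num_chunks):
    print(",".join(id_chunk))  # the comma-joined id string passed to Entrez.efetch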
Example #8
class JHU_COUNTRY_CODES(base.BaseETL):
    def __init__(self, base_url, access_token):
        super().__init__(base_url, access_token)
        self.program_name = "open"
        self.project_code = "JHU"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def files_to_submissions(self):
        codes_dict = self.get_codes_dictionary()
        locations = self.get_existing_locations()
        for location in locations:
            codes = self.get_codes_for_country_name(codes_dict,
                                                    location["country_region"])
            record = {k: v for k, v in location.items() if v is not None}
            record.update({
                "type": "summary_location",
                "projects": [{
                    "code": self.project_code
                }],
                "iso2": codes["iso2"],
                "iso3": codes["iso3"],
            })
            self.metadata_helper.add_record_to_submit(record)

    def submit_metadata(self):
        self.metadata_helper.batch_submit_records()

    def get_codes_dictionary(self):
        with open(os.path.join(CURRENT_DIR, "country_codes.csv")) as f:
            reader = csv.reader(f, delimiter=",", quotechar='"')
            headers = next(reader)
            i_name = headers.index("CLDR display name")
            i_iso2 = headers.index("ISO3166-1-Alpha-2")
            i_iso3 = headers.index("ISO3166-1-Alpha-3")

            res = {
                row[i_name]: {
                    "iso2": row[i_iso2],
                    "iso3": row[i_iso3]
                }
                for row in reader
            }
        return res

    def get_existing_locations(self):
        print("Getting summary_location data from Peregrine")
        headers = {"Authorization": "bearer " + self.access_token}
        query_string = (
            '{ summary_location (first: 0, project_id: "' + PROGRAM_NAME +
            "-" + PROJECT_CODE +
            '") { submitter_id, country_region, province_state } }')
        response = requests.post(
            "{}/api/v0/submission/graphql".format(BASE_URL),
            json={
                "query": query_string,
                "variables": None
            },
            headers=headers,
        )
        assert (
            response.status_code == 200
        ), "Unable to query Peregrine for existing 'summary_location' data: {}\n{}".format(
            response.status_code, response.text)
        try:
            query_res = json.loads(response.text)
        except:
            print("Peregrine did not return JSON")
            raise
        return [location for location in query_res["data"]["summary_location"]]

    def get_codes_for_country_name(self, codes_dict, country_name):
        stripped_name = (country_name.strip("*").replace("Saint",
                                                         "St.").replace(
                                                             " and ", " & "))
        data = codes_dict.get(stripped_name)
        if data:
            return data

        mapped_name = COUNTRY_NAME_MAPPING.get(country_name)
        data = codes_dict.get(mapped_name)
        if data:
            return data

        data = ISO_CODES_MAPPING.get(country_name)
        if data:
            return data

        raise Exception(
            'Cannot find ISO codes data for "{}"'.format(country_name))
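
get_codes_for_country_name() first tries the codes dictionary with a lightly normalized display name, then falls back to the explicit mappings. A small sketch of just the normalization step, on a few sample names:

# Same normalization applied by get_codes_for_country_name() above.
for country_name in ["Taiwan*", "Saint Lucia", "Antigua and Barbuda"]:
    stripped_name = (
        country_name.strip("*").replace("Saint", "St.").replace(" and ", " & ")
    )
    print(country_name, "->", stripped_name)
# Taiwan* -> Taiwan
# Saint Lucia -> St. Lucia
# Antigua and Barbuda -> Antigua & Barbuda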
Example #9
class IDPH(base.BaseETL):
    def __init__(self, base_url, access_token):
        super().__init__(base_url, access_token)

        self.program_name = "open"
        self.project_code = "IDPH"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.county_dict = {}

        self.summary_locations = []
        self.summary_reports = []

    def il_counties(self):
        with open("IL_counties_central_coords_lat_long.tsv") as f:
            counties = f.readlines()
            counties = counties[1:]
            counties = map(lambda l: l.strip().split("\t"), counties)

        for county, lat, lon in counties:
            self.county_dict[county] = {"lat": lat, "lon": lon}

    def files_to_submissions(self):
        """
        Reads a JSON file and converts the data to Sheepdog records
        """

        latest_submitted_date = self.metadata_helper.get_latest_submitted_data_idph(
        )
        today = datetime.date.today()
        if latest_submitted_date == today:
            print(
                "Nothing to submit: today and latest submitted date are the same."
            )
            return

        today_str = today.strftime("%Y%m%d")
        print(f"Getting data for date: {today_str}")
        state = "IL"

        # they changed the URL on April 1, 2020
        if today > datetime.date(2020, 3, 31):
            url = "http://www.dph.illinois.gov/sitefiles/COVIDTestResults.json"
        else:
            url = f"https://www.dph.illinois.gov/sites/default/files/COVID19/COVID19CountyResults{today_str}.json"
        self.parse_file(latest_submitted_date, state, url)

    def parse_file(self, latest_submitted_date, state, url):
        """
        Converts a JSON file to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations` and `self.summary_reports`.

        `self.summary_locations` is only needed once.

        Args:
            latest_submitted_date (date): date of the latest data already submitted
            state (str): the state
            url (str): URL at which the JSON file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            date = self.get_date(data)

            if date == latest_submitted_date.strftime("%Y-%m-%d"):
                print(
                    "Nothing to submit: today and latest submitted date are the same."
                )
                return

            for county in data["characteristics_by_county"]["values"]:
                summary_location, summary_report = self.parse_county(
                    date, state, county)

                # drop the Illinois summary data
                if summary_location["county"] == "Illinois":
                    continue

                self.summary_locations.append(summary_location)
                self.summary_reports.append(summary_report)

    def parse_county(self, date, state, county_json):
        """
        From county-level data, generate the data we can submit via Sheepdog
        """
        country = "US"
        county = county_json["County"]

        summary_location_submitter_id = format_summary_location_submitter_id(
            country, state, county)

        summary_location = {
            "country_region": country,
            "county": county,
            "submitter_id": summary_location_submitter_id,
            "projects": [{
                "code": self.project_code
            }],
            "province_state": state,
        }

        if county in self.county_dict:
            summary_location["latitude"] = self.county_dict[county]["lat"]
            summary_location["longitude"] = self.county_dict[county]["lon"]
        else:
            if county_json["lat"] != 0:
                summary_location["latitude"] = str(county_json["lat"])
            if county_json["lon"] != 0:
                summary_location["longitude"] = str(county_json["lon"])

        summary_report_submitter_id = format_summary_report_submitter_id(
            summary_location_submitter_id, date)
        summary_report = {
            "confirmed": county_json["confirmed_cases"],
            "submitter_id": summary_report_submitter_id,
            "testing": county_json["total_tested"],
            "negative": county_json["negative"],
            "date": date,
            "deaths": county_json["deaths"],
            "summary_locations": [{
                "submitter_id": summary_location_submitter_id
            }],
        }

        return summary_location, summary_report

    def get_date(self, county_json):
        """
        Converts JSON with "year", "month" and "day" to formatted date string.
        """
        date_json = county_json["LastUpdateDate"]
        date = datetime.date(**date_json)
        return date.strftime("%Y-%m-%d")

    def submit_metadata(self):
        """
        Submits the data in `self.summary_locations` and `self.summary_reports` to Sheepdog.
        """

        print("Submitting data")

        # Commented
        # Only required for one time submission of summary_location
        # print("Submitting summary_location data")
        # for loc in self.summary_locations:
        #     loc_record = {"type": "summary_location"}
        #     loc_record.update(loc)
        #     self.metadata_helper.add_record_to_submit(loc_record)
        # self.metadata_helper.batch_submit_records()

        print("Submitting summary_report data")
        for rep in self.summary_reports:
            rep_record = {"type": "summary_report"}
            rep_record.update(rep)
            self.metadata_helper.add_record_to_submit(rep_record)
        self.metadata_helper.batch_submit_records()
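
get_date() above relies on the IDPH payload carrying a LastUpdateDate object whose keys match the arguments of datetime.date, so it can be unpacked directly. A quick sketch with an assumed payload:

import datetime

date_json = {"year": 2020, "month": 5, "day": 1}  # assumed shape of "LastUpdateDate"
date = datetime.date(**date_json)
print(date.strftime("%Y-%m-%d"))  # 2020-05-01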
Example #10
class CTP(base.BaseETL):
    def __init__(self, base_url, access_token):
        super().__init__(base_url, access_token)
        self.summary_locations = []
        self.summary_reports = []

        self.program_name = "open"
        self.project_code = "CTP"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.expected_csv_headers = [
            "date",
            "state",
            "positive",
            "negative",
            "pending",
            "hospitalizedCurrently",
            "hospitalizedCumulative",
            "inIcuCurrently",
            "inIcuCumulative",
            "onVentilatorCurrently",
            "onVentilatorCumulative",
            "recovered",
            "hash",
            "dateChecked",
            "death",
            "hospitalized",
            "total",
            "totalTestResults",
            "posNeg",
            "fips",
            "deathIncrease",
            "hospitalizedIncrease",
            "negativeIncrease",
            "positiveIncrease",
            "totalTestResultsIncrease",
        ]

        self.header_to_column = {
            k: self.expected_csv_headers.index(k)
            for k in self.expected_csv_headers
        }

    def files_to_submissions(self):
        """
        Reads CSV files and converts the data to Sheepdog records
        """
        url = "https://raw.githubusercontent.com/COVID19Tracking/covid-tracking-data/master/data/states_daily_4pm_et.csv"
        self.parse_file(url)

    def parse_file(self, url):
        """
        Converts a CSV file to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations` and `self.summary_reports`.

        Args:
            url (str): URL at which the CSV file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')

            headers = next(reader)

            if headers[0] == "404: Not Found":
                print("  Unable to get file contents, received {}.".format(
                    headers))
                return

            expected_h = self.expected_csv_headers
            obtained_h = headers[:len(expected_h)]
            assert (
                obtained_h == expected_h
            ), "CSV headers have changed (expected {}, got {}). We may need to update the ETL code".format(
                expected_h, obtained_h)

            for row in reader:
                summary_location, summary_report = self.parse_row(headers, row)

                self.summary_locations.append(summary_location)
                self.summary_reports.append(summary_report)

    def parse_row(self, headers, row):
        """
        Converts a row of a CSV file to data we can submit via Sheepdog

        Args:
            headers (list(str)): CSV file headers (first row of the file)
            row (list(str)): row of data

        Returns:
            (dict, dict) tuple:
                - location data, in a format ready to be submitted to Sheepdog
                - { "date1": <value>, "date2": <value> } from the row data
        """

        date = row[self.header_to_column["date"]]
        date = datetime.strptime(date, "%Y%m%d").date()
        date = date.strftime("%Y-%m-%d")

        country = "US"
        state = row[self.header_to_column["state"]]
        summary_location_submitter_id = format_location_submitter_id(
            country, state)

        summary_location = {
            "country_region": country,
            "submitter_id": summary_location_submitter_id,
            "projects": [{
                "code": self.project_code
            }],
            "province_state": state,
        }

        summary_report_submitter_id = format_summary_report_submitter_id(
            summary_location_submitter_id, date)
        summary_report = {
            "date": date,
            "submitter_id": summary_report_submitter_id,
            "summary_locations": [{
                "submitter_id": summary_location_submitter_id
            }],
        }

        map_csv_fields = {
            "confirmed": "positive",
            "testing": "totalTestResultsIncrease",
            "deaths": "death",
        }

        for k, v in map_csv_fields.items():
            if row[self.header_to_column[v]]:
                summary_report[k] = int(row[self.header_to_column[v]])

        return summary_location, summary_report

    def submit_metadata(self):
        """
        Converts the data in `self.time_series_data` to Sheepdog records.
        `self.location_data already contains Sheepdog records. Batch submits
        all records in `self.location_data` and `self.time_series_data`
        """

        # Commented
        # Only required for one time submission of summary_location
        print("Submitting summary_location data")
        # for loc in self.summary_locations:
        #     loc_record = {"type": "summary_location"}
        #     loc_record.update(loc)
        #     self.metadata_helper.add_record_to_submit(loc_record)
        # self.metadata_helper.batch_submit_records()

        # print("Submitting summary_report data")
        for rep in self.summary_reports:
            rep_record = {"type": "summary_report"}
            rep_record.update(rep)
            self.metadata_helper.add_record_to_submit(rep_record)
        self.metadata_helper.batch_submit_records()
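
The header_to_column mapping built in __init__ is simply header name to column index, and parse_row() reformats the CTP date from %Y%m%d to %Y-%m-%d. A small sketch of both steps on a made-up row (the header subset and values are placeholders):

from datetime import datetime

# Toy subset of the CTP headers and a made-up row.
expected_csv_headers = ["date", "state", "positive", "death"]
header_to_column = {k: expected_csv_headers.index(k) for k in expected_csv_headers}

row = ["20200501", "IL", "1234", "56"]
date = datetime.strptime(row[header_to_column["date"]], "%Y%m%d").date()
print(date.strftime("%Y-%m-%d"))               # 2020-05-01
print(int(row[header_to_column["positive"]]))  # 1234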