Example #1
def get_urls_to_download() -> List[Tuple[str, Dict]]:
    """Get all of the urls that should be downloaded."""
    page = requests.post(LANDING_PAGE, data=_get_landing_data()).text

    # Formatting on the page is extremely weird so it's easiest to just take a
    # slice of the data.
    start = page.index(DATE_RANGE_ANCHOR) + len(DATE_RANGE_ANCHOR) + 10
    end = start + 50
    match = re.match(DATE_RANGE_RE, page[start:end])
    if match:
        date_from = str_field_utils.parse_date(match.group(1))
        date_to = str_field_utils.parse_date(match.group(2))

    if not (match and date_from and date_to):
        date_from = datetime.date(year=1995, month=9, day=5)
        date_to = aggregate_ingest_utils.subtract_month(
            datetime.date.today().replace(day=1))

    aggregate_urls = []
    for i in range(date_from.year, date_to.year + 1):
        month_from = 1
        month_to = 12
        if i == date_from.year:
            month_from = date_from.month
        if i == date_to.year:
            month_to = date_to.month
        reporting_range = 1995 if i < 2002 else 2002
        pdf_post_data = _get_pdf_data(i, month_from, month_to, reporting_range)
        aggregate_urls.append((PDF_URL, pdf_post_data))
    return aggregate_urls
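
The loop above emits one POST payload per calendar year, clamping the month bounds at the two ends of the range. Below is a minimal sketch of just that windowing, separated from the request building; month_windows is not part of the original code and the dates are invented for illustration.

import datetime
from typing import Iterator, Tuple

def month_windows(date_from: datetime.date,
                  date_to: datetime.date) -> Iterator[Tuple[int, int, int]]:
    """Yield (year, month_from, month_to), one window per calendar year."""
    for year in range(date_from.year, date_to.year + 1):
        month_from = date_from.month if year == date_from.year else 1
        month_to = date_to.month if year == date_to.year else 12
        yield year, month_from, month_to

# list(month_windows(datetime.date(2001, 5, 1), datetime.date(2003, 2, 1)))
# -> [(2001, 5, 12), (2002, 1, 12), (2003, 1, 2)]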
Example #2
    def testWrite_SingleCountWithDateAndAllDemographics(self) -> None:
        params = {
            "jid": "01001001",
            "ethnicity": Ethnicity.HISPANIC.value,
            "gender": Gender.FEMALE.value,
            "race": Race.BLACK.value,
            "count": 311,
            "date": "2019-01-01",
        }

        headers = {"X-Appengine-Cron": "test-cron"}
        response = self.client.get(f"/single_count?{urlencode(params)}",
                                   headers=headers)
        self.assertEqual(response.status_code, 200)

        # Assert
        with SessionFactory.using_database(self.database_key,
                                           autocommit=False) as session:
            query = session.query(SingleCountAggregate)
            result = one(query.all())

        self.assertEqual(result.count, params["count"])
        date_str = params["date"]
        if not isinstance(date_str, str):
            raise ValueError(
                f"Unexpected type for date_str: [{type(date_str)}]")
        self.assertEqual(result.date, str_field_utils.parse_date(date_str))
        self.assertEqual(result.ethnicity, params["ethnicity"])
        self.assertEqual(result.gender, params["gender"])
        self.assertEqual(result.race, params["race"])
Example #3
    def testWrite_SingleCountWithDateAndAllDemographics(self):
        params = {
            'jid': '01001001',
            'ethnicity': Ethnicity.HISPANIC.value,
            'gender': Gender.FEMALE.value,
            'race': Race.BLACK.value,
            'count': 311,
            'date': '2019-01-01',
        }

        headers = {'X-Appengine-Cron': 'test-cron'}
        response = self.client.get(f'/single_count?{urlencode(params)}',
                                   headers=headers)
        self.assertEqual(response.status_code, 200)

        # Assert
        query = SessionFactory.for_schema_base(JailsBase).query(
            SingleCountAggregate)
        result = one(query.all())

        self.assertEqual(result.count, params['count'])
        self.assertEqual(result.date,
                         str_field_utils.parse_date(params['date']))
        self.assertEqual(result.ethnicity, params['ethnicity'])
        self.assertEqual(result.gender, params['gender'])
        self.assertEqual(result.race, params['race'])
Example #4
def _parse_date(filename: str) -> datetime.date:
    end = filename.index('.pdf')
    start = end - 7
    d = str_field_utils.parse_date(filename[start:end])
    if d:
        return aggregate_ingest_utils.on_last_day_of_month(d)
    raise AggregateDateParsingError("Could not extract date")
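
The slicing above can be checked with a made-up filename (the real naming convention is not shown in this snippet); it simply takes the seven characters immediately before '.pdf'.

filename = "weekly_2019-06.pdf"  # hypothetical name, not from the source
end = filename.index('.pdf')     # 14
start = end - 7                  # 7
assert filename[start:end] == "2019-06"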
Example #5
    def testWrite_SingleCountWithDateAndAllDemographics(self):
        params = {
            "jid": "01001001",
            "ethnicity": Ethnicity.HISPANIC.value,
            "gender": Gender.FEMALE.value,
            "race": Race.BLACK.value,
            "count": 311,
            "date": "2019-01-01",
        }

        headers = {"X-Appengine-Cron": "test-cron"}
        response = self.client.get(f"/single_count?{urlencode(params)}",
                                   headers=headers)
        self.assertEqual(response.status_code, 200)

        # Assert
        query = SessionFactory.for_schema_base(JailsBase).query(
            SingleCountAggregate)
        result = one(query.all())

        self.assertEqual(result.count, params["count"])
        self.assertEqual(result.date,
                         str_field_utils.parse_date(params["date"]))
        self.assertEqual(result.ethnicity, params["ethnicity"])
        self.assertEqual(result.gender, params["gender"])
        self.assertEqual(result.race, params["race"])
Example #6
def _parse_date(filename: str) -> datetime.date:
    # Slashes are converted to underscores in the GCS bucket. This
    # assumes there are no underscores in the URL basename.
    base_filename = filename.split('_')[-1].replace('female', '')
    end = base_filename.index('.pdf')
    start = 4
    d = str_field_utils.parse_date(base_filename[start:end])
    return aggregate_ingest_utils.on_last_day_of_month(d)
Example #7
def set_date_specific_lsir_fields(
        assessment: StateAssessment) -> StateAssessment:
    """Over time, US_PA has updated the mapping between an LSIR score and the associated assessment level. This function
    sets the appropriate assessment_level and assessment_score according to the score and the date of the |assessment|,
    as defined by _DATE_SPECIFIC_ORDERED_LSIR_LEVELS.

    Returns the updated StateAssessment object.
    """
    if not assessment.assessment_score:
        return assessment

    assessment_score = parse_int(assessment.assessment_score)

    if assessment_score == 60:
        # This value indicates the scoring was not completed
        assessment.assessment_score = None
        assessment.assessment_level = "UNKNOWN (60-ATTEMPTED_INCOMPLETE)"
    elif assessment_score == 70:
        # This person either refused to be assessed or did not need to be assessed because they chose not to be released
        # onto parole
        assessment.assessment_score = None
        assessment.assessment_level = "UNKNOWN (70-REFUSED)"
    elif assessment_score > 55:
        # Assessment score number is over the max value of 54, and isn't one of the expected special-case
        # codes (60, 70, 55)
        assessment.assessment_level = f"UNKNOWN ({assessment_score}-SCORE_OUT_OF_RANGE)"
        assessment.assessment_score = None
    else:
        if assessment_score == 55:
            # This should be treated as a 54
            assessment_score = 54
            assessment.assessment_score = "54"

        assessment_date_raw = assessment.assessment_date
        assessment_date = (str_field_utils.parse_date(assessment_date_raw)
                           if assessment_date_raw else None)

        if not assessment_date:
            # At this point we need a valid assessment_date to determine the date-specific LSIR level
            assessment.assessment_level = "UNKNOWN (NO_DATE)"
            return assessment
        for cutoff_date, score_level_map in _DATE_SPECIFIC_ORDERED_LSIR_LEVELS.items():
            if assessment_date <= cutoff_date:
                for cutoff_score, level in score_level_map.items():
                    if assessment_score <= cutoff_score:
                        assessment.assessment_level = level.value
                        return assessment

        raise ValueError(
            f"Unhandled assessment_score {assessment_score} with assessment_date {assessment_date}"
        )

    return assessment
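
The nested loops above assume _DATE_SPECIFIC_ORDERED_LSIR_LEVELS is an ordered mapping of cutoff date to an ordered mapping of maximum score to level. The snippet does not show that constant; the sketch below only illustrates the assumed shape, with invented cutoff dates, score bands, and a stand-in for the level enum.

import datetime
import enum
from collections import OrderedDict

class StateAssessmentLevel(enum.Enum):
    # Stand-in for the real enum; the actual members live in the codebase.
    LOW = "LOW"
    MEDIUM = "MEDIUM"
    HIGH = "HIGH"

# Hypothetical structure only: cutoff date -> (maximum score -> level).
_DATE_SPECIFIC_ORDERED_LSIR_LEVELS = OrderedDict([
    (datetime.date(2008, 12, 31), OrderedDict([
        (20, StateAssessmentLevel.LOW),
        (28, StateAssessmentLevel.MEDIUM),
        (54, StateAssessmentLevel.HIGH),
    ])),
    (datetime.date.max, OrderedDict([
        (19, StateAssessmentLevel.LOW),
        (26, StateAssessmentLevel.MEDIUM),
        (54, StateAssessmentLevel.HIGH),
    ])),
])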
Example #8
def parse_date(filename: str) -> datetime.date:
    """
    Parse the report_date from the filename since the PDF contents can't
    easily be parsed for the date.
    """
    date_str = (filename.replace(' revised', '')
                .replace(' new', '')
                .replace('.pdf', ''))[-8:]
    parsed_date = str_field_utils.parse_date(date_str)
    if parsed_date:
        return parsed_date
    raise AggregateDateParsingError("Could not extract date")
Example #9
def _parse_date(filename: str) -> datetime.date:
    end = filename.index(".pdf")
    start = end - 7

    try:
        d = str_field_utils.parse_date(filename[start:end])
        if d:
            return aggregate_ingest_utils.on_last_day_of_month(d)
    except Exception:
        pass

    # alternate filename format.
    try:
        d = str_field_utils.parse_date(filename.split()[0][-7:])
        if d:
            return aggregate_ingest_utils.on_last_day_of_month(d)
    except Exception:
        pass

    raise AggregateDateParsingError(f"Could not extract date from filename: {filename}")
Example #10
def _parse_date(filename: str) -> datetime.date:
    # Slashes are converted to underscores in the GCS bucket. This
    # assumes there are no underscores in the URL basename.
    base_filename = filename.split("_")[-1].replace("female", "")
    end = base_filename.index(".pdf")
    start = 4
    d = str_field_utils.parse_date(base_filename[start:end])

    if d is None:
        raise ValueError(f"Unexpected null date parsed from filename [{filename}]")

    return aggregate_ingest_utils.on_last_day_of_month(d)
Example #11
def _date_converter(value: Any) -> datetime.date:
    if not value:
        return datetime.date.today()

    if isinstance(value, datetime.date):
        return value

    parsed_date = str_field_utils.parse_date(value)
    if not parsed_date:
        raise ValueError(f"Failed to parse {value} as a date")

    return parsed_date
Example #12
def parse_date(filename: str) -> datetime.date:
    # Hawaii report pdfs have names that start with `Pop-Reports-EOM-`, followed
    # by a 10-character date and possibly another number (version?). For example
    # `Pop-Reports-EOM-2019-03-21.pdf` and `Pop-Reports-EOM-2018-03-31-1.pdf`.
    regex = r".*?Pop-Reports-EOM-([\d-]{10})"
    match = re.search(regex, filename, re.IGNORECASE)
    if match:
        date_str = match.group(1)
        parsed_date = str_field_utils.parse_date(date_str)
        if parsed_date:
            return parsed_date
    raise AggregateDateParsingError("Could not extract date")
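
The regex can be checked in isolation against the two filename shapes mentioned in the comment; in both cases the capture group is the ten-character date string.

import re

regex = r".*?Pop-Reports-EOM-([\d-]{10})"
m1 = re.search(regex, "Pop-Reports-EOM-2019-03-21.pdf", re.IGNORECASE)
m2 = re.search(regex, "Pop-Reports-EOM-2018-03-31-1.pdf", re.IGNORECASE)
assert m1 and m1.group(1) == "2019-03-21"
assert m2 and m2.group(1) == "2018-03-31"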
Example #13
def _parse_date(filename: str) -> datetime.date:
    # If this doesn't work, try scraping it from the url name
    filename_date = filename.lower()
    if DATE_PARSE_ANCHOR_FILENAME in filename_date:
        # The names can be a few formats, the most robust way is to take
        # all of the text after the anchor.
        # (eg. report Jan 2017.pdf)
        start = filename_date.index(DATE_PARSE_ANCHOR_FILENAME) \
                + len(DATE_PARSE_ANCHOR_FILENAME)
        date_str = filename_date[start:].strip('.pdf')
        parsed_date = str_field_utils.parse_date(date_str)
        if parsed_date:
            return parsed_date.replace(day=1)
    raise AggregateDateParsingError("Could not extract date")
Example #14
def _parse_date(filename: str) -> datetime.date:
    with open(filename, 'rb') as f:
        try:
            pdf = PdfFileReader(f)
            page = pdf.getPage(0)
            text = page.extractText()
            lines = text.split('\n')
        except Exception as e:
            raise AggregateDateParsingError(str(e)) from e
        for index, line in enumerate(lines):
            if DATE_PARSE_ANCHOR in line:
                # The date is on the next line if anchor is present on the line
                parsed_date = str_field_utils.parse_date(lines[index + 1])
                if parsed_date:
                    return parsed_date
        raise AggregateDateParsingError("Could not extract date")
Example #15
def date_converter_or_today(value: Any) -> datetime.date:
    """Converts a value to a datetime.date, if possible. If the value is
    falsy, datetime.date.today() is returned. If the value is already
    a datetime.date, it is returned directly, otherwise the value is
    sent to the string utils dateparser.
    """
    if not value:
        return datetime.date.today()

    if isinstance(value, datetime.date):
        return value

    parsed_date = str_field_utils.parse_date(value)
    if not parsed_date:
        raise ValueError(f"Failed to parse {value} as a date")

    return parsed_date
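
Illustrative behaviour of the converter, assuming str_field_utils.parse_date accepts the ISO-style "2019-01-01" string used by the tests above:

import datetime

assert date_converter_or_today(None) == datetime.date.today()
assert date_converter_or_today(datetime.date(2019, 1, 1)) == datetime.date(2019, 1, 1)
assert date_converter_or_today("2019-01-01") == datetime.date(2019, 1, 1)
# A string parse_date cannot handle falls through to the ValueError branch above.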
Example #16
    def testWrite_SingleCountWithDate(self):
        params = {
            'jid': '01001001',
            'count': 311,
            'date': '2019-01-01',
        }

        headers = {'X-Appengine-Cron': 'test-cron'}
        response = self.client.get(f'/single_count?{urlencode(params)}',
                                   headers=headers)
        self.assertEqual(response.status_code, 200)

        # Assert
        query = SessionFactory.for_schema_base(JailsBase).query(
            SingleCountAggregate)
        result = one(query.all())

        self.assertEqual(result.count, params['count'])
        self.assertEqual(result.date,
                         str_field_utils.parse_date(params['date']))
Example #17
    def testWrite_SingleCountWithDate(self):
        params = {
            "jid": "01001001",
            "count": 311,
            "date": "2019-01-01",
        }

        headers = {"X-Appengine-Cron": "test-cron"}
        response = self.client.get(f"/single_count?{urlencode(params)}",
                                   headers=headers)
        self.assertEqual(response.status_code, 200)

        # Assert
        query = SessionFactory.for_schema_base(JailsBase).query(
            SingleCountAggregate)
        result = one(query.all())

        self.assertEqual(result.count, params["count"])
        self.assertEqual(result.date,
                         str_field_utils.parse_date(params["date"]))
Example #18
def _parse_date(filename: str) -> datetime.date:
    # If this doesn't work, try scraping it from the url name
    filename_date = filename.lower()
    if DATE_PARSE_ANCHOR_FILENAME in filename_date:
        # The names can be a few formats, the most robust way is to take
        # all of the text after the anchor.
        # (eg. report Jan 2017.pdf)
        start = filename_date.index(DATE_PARSE_ANCHOR_FILENAME) \
                + len(DATE_PARSE_ANCHOR_FILENAME)
        date_str = filename_date[start:].strip('.pdf')
        parsed_date = str_field_utils.parse_date(date_str)
        if parsed_date:
            return parsed_date.replace(day=1)

    try:
        return datetime.datetime.strptime(
            filename.split('/')[-1],
            "_wp-content_uploads_%Y_%m_abbrerptcurrent.pdf")
    except ValueError as e:
        raise AggregateDateParsingError("Could not extract date") from e
Example #19
    def convert_field_value(field: attr.Attribute,
                            field_value: Union[str, EnumParser]) -> Any:
        if field_value is None:
            return None

        if is_forward_ref(field) or is_list(field):
            return field_value

        if isinstance(field_value, str):
            if not field_value or not field_value.strip():
                return None

        if field.name in converter_overrides:
            converter = converter_overrides[field.name]
            if not isinstance(field_value, converter.field_type):
                raise ValueError(
                    f"Found converter for field [{field.name}] in the converter_overrides, but expected "
                    f"field type [{converter.field_type}] does not match actual field type "
                    f"[{type(field_value)}]")
            return converter.convert(field_value)

        if isinstance(field_value, EnumParser):
            if is_enum(field):
                return field_value.parse()
            raise ValueError(
                f"Found field value [{field_value}] for field that is not an enum [{field}]."
            )

        if isinstance(field_value, str):
            if is_str(field):
                return normalize(field_value)
            if is_date(field):
                return parse_date(field_value)
            if is_int(field):
                return parse_int(field_value)
            if field.type in {bool, Union[bool, None]}:
                return parse_bool(field_value)

        raise ValueError(f"Unsupported field {field.name}")
Example #20
    def test_parseNoDate(self):
        assert parse_date('None set') is None
Example #21
    def test_parseDate_zeroes_weird(self):
        assert parse_date('0 0 0') is None
        assert parse_date('0000-00-00') is None
Example #22
    def test_parseDate_zeroes(self):
        assert parse_date('00000000') is None
Example #23
    def test_parseDate(self):
        assert parse_date('Jan 1, 2018') == datetime.date(year=2018, month=1, day=1)
Example #24
    def test_parseDate_zeroes(self) -> None:
        assert parse_date("00000000") is None
Example #25
    def test_parseNoDate(self) -> None:
        assert parse_date("None set") is None
Example #26
    def test_parseDate(self) -> None:
        assert parse_date("Jan 1, 2018") == datetime.date(year=2018, month=1, day=1)
Example #27
    def test_parseDate_no_separators_part_string_part_number(self) -> None:
        assert parse_date("June2016") == datetime.date(year=2016, month=6, day=1)
Example #28
    def test_parseDate_space_separators_part_string_part_number(self) -> None:
        assert parse_date("MAY 2003") == datetime.date(year=2003, month=5, day=1)
Example #29
    def test_parseDate_no_separators(self) -> None:
        assert parse_date("03122008") == datetime.date(year=2008, month=3, day=12)
Example #30
    def test_parseDate_zeroes_weird(self) -> None:
        assert parse_date("0 0 0") is None
        assert parse_date("0000-00-00") is None