Пример #1
0
def _parse_date(filename: str) -> datetime.date:
    """Extract the report date from the first page of the PDF at `filename`.

    The text of page 0 is split into lines; for every line containing
    DATE_PARSE_ANCHOR, the line that follows it is handed to
    `str_field_utils.parse_date`. The first truthy result is returned.

    Raises:
        AggregateDateParsingError: if the PDF cannot be read or its text
            cannot be extracted, or if no line following an anchor yields a
            date (including the case where the anchor is on the last line).
    """
    with open(filename, 'rb') as f:
        try:
            pdf = PdfFileReader(f)
            page = pdf.getPage(0)
            text = page.extractText()
            lines = text.split('\n')
        except Exception as e:
            raise AggregateDateParsingError(str(e)) from e

    # Pair every line with its successor. An anchor on the very last line has
    # no following line and is simply skipped, instead of raising IndexError.
    for line, next_line in zip(lines, lines[1:]):
        if DATE_PARSE_ANCHOR in line:
            # The date is on the line after the one holding the anchor.
            parsed_date = str_field_utils.parse_date(next_line)
            if parsed_date:
                return parsed_date
    raise AggregateDateParsingError("Could not extract date")
Пример #2
0
def _parse_date(filename: str) -> datetime.date:
    """Parse the report date from the characters preceding '.pdf'.

    Up to seven characters immediately before the first '.pdf' are passed to
    `str_field_utils.parse_date` (presumably a month/year token -- confirm
    against real filenames). A truthy result is normalized with
    `aggregate_ingest_utils.on_last_day_of_month` and returned.

    Raises:
        AggregateDateParsingError: if the filename has no '.pdf' or the
            extracted text does not parse to a date.
    """
    # find() instead of index(): a missing suffix must surface as the
    # module's own parsing error, not as a bare ValueError.
    end = filename.find('.pdf')
    if end != -1:
        # Clamp at 0 so short names do not produce a negative start index,
        # which would wrap around and slice from the end of the string.
        start = max(end - 7, 0)
        d = str_field_utils.parse_date(filename[start:end])
        if d:
            return aggregate_ingest_utils.on_last_day_of_month(d)
    raise AggregateDateParsingError("Could not extract date")
Пример #3
0
def parse_date(filename: str) -> datetime.date:
    """
    Derive the report date from the filename, because the PDF contents
    can't easily be parsed for the date.

    The markers ' revised', ' new' and '.pdf' are removed from the name, and
    the last eight characters of what remains are handed to
    `str_field_utils.parse_date`.

    Raises:
        AggregateDateParsingError: if those characters do not parse to a date.
    """
    cleaned = filename
    # Same removal order as always: revision markers first, extension last.
    for noise in (' revised', ' new', '.pdf'):
        cleaned = cleaned.replace(noise, '')

    result = str_field_utils.parse_date(cleaned[-8:])
    if not result:
        raise AggregateDateParsingError("Could not extract date")
    return result
Пример #4
0
def parse_date(filename: str) -> datetime.date:
    """Parse the report date embedded in a Hawaii report filename.

    Hawaii report PDFs have names that contain `Pop-Reports-EOM-` followed by
    a 10-character date and possibly another number (a version?), e.g.
    `Pop-Reports-EOM-2019-03-21.pdf` and `Pop-Reports-EOM-2018-03-31-1.pdf`.
    The prefix is matched case-insensitively.

    Raises:
        AggregateDateParsingError: if the pattern is absent or the captured
            text does not parse to a date.
    """
    found = re.search(
        r".*?Pop-Reports-EOM-([\d-]{10})", filename, re.IGNORECASE)
    if found is None:
        raise AggregateDateParsingError("Could not extract date")

    result = str_field_utils.parse_date(found.group(1))
    if not result:
        raise AggregateDateParsingError("Could not extract date")
    return result
Пример #5
0
def _parse_date(filename: str) -> datetime.date:
    """Parse the report date from the text after the anchor in `filename`.

    The filename is lower-cased; everything after the first occurrence of
    DATE_PARSE_ANCHOR_FILENAME, minus a trailing '.pdf', is passed to
    `str_field_utils.parse_date`. A truthy result is returned with its day
    set to 1.

    Raises:
        AggregateDateParsingError: if the anchor is absent or the remaining
            text does not parse to a date.
    """
    filename_date = filename.lower()
    if DATE_PARSE_ANCHOR_FILENAME in filename_date:
        # The names can be a few formats, the most robust way is to take
        # all of the text after the anchor.
        # (eg. report Jan 2017.pdf)
        start = filename_date.index(DATE_PARSE_ANCHOR_FILENAME) \
                + len(DATE_PARSE_ANCHOR_FILENAME)
        date_str = filename_date[start:]
        # Drop the extension as a suffix. str.strip('.pdf') would instead
        # remove any of the characters '.', 'p', 'd', 'f' from both ends and
        # mangle values such as "feb 2017" or "2017 sep".
        if date_str.endswith('.pdf'):
            date_str = date_str[:-len('.pdf')]
        parsed_date = str_field_utils.parse_date(date_str)
        if parsed_date:
            return parsed_date.replace(day=1)
    raise AggregateDateParsingError("Could not extract date")
Пример #6
0
def _parse_date(filename: str) -> datetime.date:
    """Parse the report date from `filename`, trying two naming schemes.

    1. If the lower-cased name contains DATE_PARSE_ANCHOR_FILENAME, the text
       after the anchor (minus a trailing '.pdf') is passed to
       `str_field_utils.parse_date`; a truthy result is returned with its
       day set to 1.
    2. Otherwise, or if step 1 yields nothing, the last '/'-separated
       component must match
       `_wp-content_uploads_%Y_%m_abbrerptcurrent.pdf`; the year and month
       are taken from it and the day defaults to 1.

    Raises:
        AggregateDateParsingError: if neither scheme yields a date.
    """
    filename_date = filename.lower()
    if DATE_PARSE_ANCHOR_FILENAME in filename_date:
        # The names can be a few formats, the most robust way is to take
        # all of the text after the anchor.
        # (eg. report Jan 2017.pdf)
        start = filename_date.index(DATE_PARSE_ANCHOR_FILENAME) \
                + len(DATE_PARSE_ANCHOR_FILENAME)
        date_str = filename_date[start:]
        # Drop the extension as a suffix. str.strip('.pdf') would instead
        # remove any of the characters '.', 'p', 'd', 'f' from both ends and
        # mangle values such as "feb 2017" or "2017 sep".
        if date_str.endswith('.pdf'):
            date_str = date_str[:-len('.pdf')]
        parsed_date = str_field_utils.parse_date(date_str)
        if parsed_date:
            return parsed_date.replace(day=1)

    # Fall back to the date encoded in the flattened upload path.
    basename = filename.split('/')[-1]
    try:
        # strptime yields a datetime; convert so the return value matches the
        # annotated datetime.date, as the first branch is presumed to return.
        return datetime.datetime.strptime(
            basename,
            "_wp-content_uploads_%Y_%m_abbrerptcurrent.pdf").date()
    except ValueError as e:
        raise AggregateDateParsingError("Could not extract date") from e
Пример #7
0
def _parse_date(filename: str) -> datetime.date:
    """Parse the report date from `filename`, trying two filename formats.

    Candidates, in order:
      1. up to seven characters immediately before the first '.pdf';
      2. the last seven characters of the first whitespace-separated token
         (alternate filename format).

    Each candidate is passed to `str_field_utils.parse_date`; the first
    truthy result is normalized with
    `aggregate_ingest_utils.on_last_day_of_month` and returned.

    Raises:
        AggregateDateParsingError: if no candidate yields a date.
    """
    candidates = []

    # find() instead of index(): a name without '.pdf' must still get a
    # chance at the alternate format rather than raising ValueError here.
    end = filename.find(".pdf")
    if end != -1:
        # Clamp at 0 so short names do not wrap around via a negative index.
        candidates.append(filename[max(end - 7, 0):end])

    # alternate filename format.
    tokens = filename.split()
    if tokens:
        candidates.append(tokens[0][-7:])

    for candidate in candidates:
        try:
            d = str_field_utils.parse_date(candidate)
            if d:
                return aggregate_ingest_utils.on_last_day_of_month(d)
        except Exception:
            # Deliberate best-effort: the exception types the parsing helpers
            # raise are not visible here, so any failure just means "try the
            # next candidate". The final raise reports the overall failure.
            continue

    raise AggregateDateParsingError(f"Could not extract date from filename: {filename}")