Example #1
def load_scrapd2(entry):
    """
    Load a ScrAPD 2 entry and return a ScrAPD 3 one.
    """
    r = model.Report(case=entry['Case'],
                     date=date_utils.parse_date(entry.get('Date')))
    if entry.get('Fatal crashes this year'):
        r.crash = int(entry.get('Fatal crashes this year'))
    if entry.get('Link'):
        r.link = entry.get('Link')
    if entry.get('Latitude'):
        r.latitude = entry.get('Latitude')
    if entry.get('Location'):
        r.location = entry.get('Location')
    if entry.get('Longitude'):
        r.longitude = entry.get('Longitude')
    if entry.get('Notes'):
        r.notes = entry.get('Notes')
    if entry.get('Time'):
        r.time = date_utils.parse_time(entry.get('Time'))

    f = model.Fatality()
    if entry.get('Age'):
        f.age = int(entry.get('Age'))
    if entry.get('DOB'):
        f.dob = date_utils.parse_date(entry.get('DOB'))
    if entry.get('Ethnicity'):
        try:
            f.ethnicity = model.Ethnicity(entry.get('Ethnicity').capitalize())
        except ValueError:
            f.ethnicity = model.Ethnicity.undefined
    if entry.get('First Name'):
        f.first = entry.get('First Name')
    if entry.get('Gender'):
        try:
            f.gender = model.Gender(entry.get('Gender').capitalize())
        except ValueError:
            f.gender = model.Gender.undefined
    if entry.get('Last Name'):
        f.last = entry.get('Last Name')

    r.fatalities = [f]
    r.compute_fatalities_age()

    return r
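
A minimal usage sketch, assuming an entry dict in the ScrAPD 2 column format (the keys mirror the ones load_scrapd2() reads; the sample values are hypothetical, not real data):

# Hypothetical ScrAPD 2 row.
entry = {
    'Case': '19-0400694',
    'Date': '02/09/2019',
    'Fatal crashes this year': '7',
    'First Name': 'John',
    'Last Name': 'Doe',
    'Gender': 'male',
    'DOB': '02/15/1960',
}
report = load_scrapd2(entry)
print(report.case, report.fatalities[0].last)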
Example #2
File: twitter.py Project: lalver1/scrapd
def normalize_tokens(d):
    """
    Normalize the description tokens.

    The normalization happens in place.

    :param dict d: a dict representing the tokenized version of the description.
    :return: the list of parsing errors that occurred during the normalization process.
    :rtype: list
    """
    err = []

    # Handle the DOB variations.
    if d.get("D.O.B."):
        d["DOB"] = d.pop("D.O.B.")

    # Group all the fatalities in a list.
    tmp_fatalities = [d[k] for k in d if k.startswith("Deceased")]

    # If there is only one fatality, we must ensure there is a DOB marker in the deceased field.
    if len(tmp_fatalities) == 1 and d.get('DOB'):
        tmp_fatalities = [f"{tmp_fatalities[0]} DOB {d.get('DOB')}"]

    # Process each fatality.
    for fatality in tmp_fatalities:
        try:
            f, errors = deceased.process_deceased_field(fatality)
        except ValueError as e:
            err.append(str(e))
            continue
        else:
            d.setdefault('fatalities', []).append(f)
            err.extend(errors)

    # Parse the `Date` field.
    fatality_date = d.get('Date')
    if fatality_date:
        d[Fields.DATE] = date_utils.parse_date(fatality_date)

    # Convert the time to a time object.
    fatality_time = d.get('Time')
    if fatality_time:
        d[Fields.TIME] = date_utils.parse_time(fatality_time)

    return err
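
A short illustrative call; the token dict below is fabricated, and the 'Deceased 1' key simply matches the "Deceased" prefix the function scans for:

tokens = {
    'Date': 'February 9, 2019',
    'Time': '8:50 p.m.',
    'D.O.B.': '02/15/1960',
    'Deceased 1': 'John Doe, White male',
}
errors = normalize_tokens(tokens)  # normalizes `tokens` in place
print(tokens.get('fatalities'), errors)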
Example #3
File: parsing.py Project: mscarey/scrapd
def parse_twitter_description(twitter_description):
    """
    Convert text of twitter_description field to a dict with list and datetime values.

    The Twitter description sometimes contains all the information that we need,
    but sometimes doesn't have the deceased person's name.
    Even though it is still unstructured data, it is sometimes easier
    to parse than the data from the detail page.

    :param str twitter_description: Twitter description embedded in the fatality details page
    :return: A dictionary containing the details information about the fatality.
    :rtype: dict
    """
    d = twitter_description_to_dict(twitter_description)

    # Parse the `Date` field.
    fatality_date = d.get(Fields.DATE)
    if fatality_date:
        # Turn it into a date object.
        d[Fields.DATE] = date_utils.parse_date(fatality_date)

    # Convert the time to a time object.
    fatality_time = d.get(Fields.TIME)
    if fatality_time:
        d[Fields.TIME] = date_utils.parse_time(fatality_time)

    # Handle special case where Date of birth is a token `DOB:`.
    tmp_dob = d.get(Fields.DOB)
    if tmp_dob:
        try:
            d[Fields.DOB] = date_utils.parse_date(tmp_dob)
        except ValueError:
            d[Fields.DOB] = date_utils.parse_date(tmp_dob.split()[0])

    return d
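
A hedged example; the description string is a made-up sample in the general shape of the APD Twitter summaries this function targets:

# Fabricated description, not a real tweet.
desc = ('Case: 19-0400694 Date: February 9, 2019 '
        'Time: 8:50 p.m. Location: 1000 block of W. Example Rd. '
        'DOB: 02/15/1960')
details = parse_twitter_description(desc)
print(details.get(Fields.DATE), details.get(Fields.DOB))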
Example #4
File: article.py Project: lalver1/scrapd
def parse_content(page):
    """
    Parse the detail page to extract fatality information.

    :param str page: the content of the fatality page
    :return: a report representing a fatality and a list of errors.
    :rtype: model.Report, list
    """
    d = {}
    parsing_errors = []

    # Normalize the page.
    normalized_detail_page = unicodedata.normalize("NFKD", page)

    # Parse the `Case` field.
    d[Fields.CASE] = regex.match_case_field(normalized_detail_page)
    if not d.get(Fields.CASE):
        raise ValueError('a case number is mandatory')

    # Parse the `Date` field.
    d[Fields.DATE] = regex.match_date_field(normalized_detail_page)
    if not d.get(Fields.DATE):
        raise ValueError('a date is mandatory')

    # Parse the `Crashes` field.
    crash_str = regex.match_crash_field(normalized_detail_page)
    if crash_str:
        d[Fields.CRASH] = crash_str
    else:
        parsing_errors.append("could not retrieve the crash number")

    # Parse the `Time` field.
    time_str = regex.match_time_field(normalized_detail_page)
    time = date_utils.parse_time(time_str)
    if time:
        d[Fields.TIME] = time
    else:
        parsing_errors.append("could not retrieve the crash time")

    # Parse the location field.
    location_str = regex.match_location_field(normalized_detail_page)
    if location_str:
        d[Fields.LOCATION] = location_str.strip()
    else:
        parsing_errors.append("could not retrieve the location")

    # Convert to a report object.
    report, err = twitter.to_report(d)
    parsing_errors.extend(err)

    # Convert the page to a BeautifulSoup object.
    soup = to_soup(normalized_detail_page.replace("<br>", "</br>"))

    # Parse the `Deceased` field.
    deceased_fields, err = parse_deceased_field(soup)
    if deceased_fields:
        report.fatalities = deceased_fields
        parsing_errors.extend(err)
    else:
        parsing_errors.append("could not retrieve the deceased information")
    report.compute_fatalities_age()

    # Fill in Notes from Details page
    if deceased_fields:
        notes = parse_notes_field(soup)
        if notes:
            report.notes = notes
        else:
            parsing_errors.append("could not retrieve the notes information")

    return report, parsing_errors
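
A driver sketch, assuming the detail page HTML was fetched elsewhere (fetch_detail_page is a hypothetical helper, not part of the snippet above):

page = fetch_detail_page(url)  # hypothetical fetch helper
report, errors = parse_content(page)
for e in errors:
    print('parsing issue:', e)
print(report.case, len(report.fatalities))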
Example #5
def test_parse_time_field_00(input_, expected):
    """Ensure a time field gets parsed correctly."""
    time_str = regex.match_time_field(input_)
    actual = date_utils.parse_time(time_str)
    assert actual == expected
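
As excerpted, the test is missing the pytest parametrization that supplies input_ and expected; a sketch of what that decorator could look like (the sample pair is illustrative, not taken from the project):

import datetime

import pytest

@pytest.mark.parametrize('input_, expected', [
    ('Time: 8:50 p.m.', datetime.time(20, 50)),  # illustrative pair
])
def test_parse_time_field_00(input_, expected):
    ...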
Example #6
File: parsing.py Project: mscarey/scrapd
def parse_page_content(detail_page, notes_parsed=False):
    """
    Parse the detail page to extract fatality information.

    :param str detail_page: the content of the fatality page
    :param bool notes_parsed: whether the notes were already parsed from the Twitter description
    :return: a dictionary representing a fatality and a list of errors.
    :rtype: dict, list
    """
    d = {}
    parsing_errors = []
    normalized_detail_page = unicodedata.normalize("NFKD", detail_page)
    soup = to_soup(normalized_detail_page.replace("<br>", "</br>"))

    # Parse the `Case` field.
    d[Fields.CASE] = regex.match_case_field(normalized_detail_page)
    if not d.get(Fields.CASE):
        raise ValueError('A case number is mandatory.')

    # Parse the `Crashes` field.
    crash_str = regex.match_crashes_field(normalized_detail_page)
    if crash_str:
        d[Fields.CRASHES] = crash_str
    else:
        parsing_errors.append("could not retrieve the crash number")

    # Parse the `Date` field.
    date_field = regex.match_date_field(normalized_detail_page)
    if date_field:
        d[Fields.DATE] = date_field
    else:
        parsing_errors.append("could not retrieve the crash date")

    # Parse the `Time` field.
    time_str = regex.match_time_field(normalized_detail_page)
    time = date_utils.parse_time(time_str)
    if time:
        d[Fields.TIME] = time
    else:
        parsing_errors.append("could not retrieve the crash time")

    # Parse the location field.
    location_str = regex.match_location_field(normalized_detail_page)
    if location_str:
        d[Fields.LOCATION] = location_str
    else:
        parsing_errors.append("could not retrieve the location")

    # Parse the `Deceased` field.
    deceased_field_list = parse_deceased_field(soup)
    if deceased_field_list:
        d[Fields.DECEASED] = deceased_field_list
    else:
        parsing_errors.append("could not retrieve the deceased information")

    # Fill in Notes from Details page if not in twitter description.
    if deceased_field_list and not notes_parsed:
        notes = parse_notes_field(soup, d[Fields.DECEASED][-1])
        if notes:
            d[Fields.NOTES] = notes
    if not d.get(Fields.NOTES):
        parsing_errors.append("could not retrieve the notes information")

    return d, parsing_errors
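
Unlike parse_content in Example #4, this variant returns the raw dict rather than a report object. A minimal driver sketch, with the page content assumed to be fetched upstream:

d, errors = parse_page_content(detail_page_html)  # detail_page_html: HTML string fetched elsewhere
print(d.get(Fields.CASE), d.get(Fields.TIME), errors)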