예제 #1
0
def common_fatality_parsing(d):
    """
    Finish parsing a fatality, whether it came from a Twitter description or from page content.

    The 'Deceased' entry, when present, is expanded with `parse_deceased_field` and the resulting
    values are merged into `d`. The age is then computed when both the collision date and the date
    of birth are available. Note that `d` is modified in place before being sanitized.

    :param dict d: the fatality to finish parsing
    :return: the value returned by `sanitize_fatality_entity` for the updated fatality.
    :rtype: dict
    """
    raw_deceased = d.get(Fields.DECEASED)
    if not raw_deceased:
        logger.trace('No deceased information to parse in fatality page.')
    else:
        # The field may arrive as a list of tokens; flatten it into one string.
        if isinstance(raw_deceased, list):
            deceased_text = ' '.join(raw_deceased)
        else:
            deceased_text = raw_deceased

        # A parsing failure is only traced: the remaining fields are still usable.
        try:
            d.update(parse_deceased_field(deceased_text))
        except ValueError as err:
            logger.trace(err)

    # The age can only be derived when both dates are known.
    collision_date = d.get(Fields.DATE)
    birth_date = d.get(Fields.DOB)
    if collision_date and birth_date:
        d[Fields.AGE] = date_utils.compute_age(collision_date, birth_date)

    return sanitize_fatality_entity(d)
예제 #2
0
def parse_person(deceased, birth_date=None, collision_date=None):
    """
    Parse the description of a person who died in a collision.

    :param str deceased: the text describing the deceased person
    :param datetime.date birth_date:
        the date of the person's birth; when omitted, the date of birth extracted from `deceased`
        (if any) is used instead
    :param datetime.date collision_date:
        the date of the fatal collision (even if the person died later)

    :return:
        A tuple made of the dictionary holding the parsed details and the list of parsing error
        messages collected along the way.
    :rtype: dict, list
    """
    person = {}
    errors = []

    # Expand the free-form 'Deceased' text into individual fields.
    if deceased:
        # Drop any leading spaces, colons and digits before processing the text.
        cleaned = deceased.lstrip(" :1234567890")
        try:
            person.update(process_deceased_field(cleaned))
        except ValueError as err:
            errors.append(str(err))

    # An explicitly provided birth date takes precedence over the parsed one.
    dob = birth_date or person.get(Fields.DOB)
    if collision_date and dob:
        person[Fields.AGE] = date_utils.compute_age(collision_date, dob)

    # A missing age is treated like a negative one and reported as invalid.
    age = person.get(Fields.AGE, -1)
    if age < 0:
        errors.append(f'age is invalid: {person.get(Fields.AGE)}')

    return person, errors
예제 #3
0
파일: model.py 프로젝트: lalver1/scrapd
    def compute_fatalities_age(self):
        """
        Fill in the age of every fatality of the report that lacks one.

        A fatality is only updated when its age is not already set and its date of birth is
        known; the age is computed from the report date and that date of birth.
        """
        for fatality in self.fatalities:
            # Only fatalities without an age but with a date of birth can be completed.
            needs_age = not fatality.age and fatality.dob
            if needs_age:
                fatality.age = date_utils.compute_age(self.date, fatality.dob)
예제 #4
0
def parse_page_content(detail_page, notes_parsed=False):
    """
    Extract the fatality information from a detail page.

    The page is Unicode-normalized (NFKD) before any field is searched for.

    :param str detail_page: the content of the fatality page
    :param bool notes_parsed:
        when true, the notes are considered already parsed and are not extracted from the page
    :return: the value returned by `sanitize_fatality_entity` for the parsed fatality.
    :rtype: dict
    :raises ValueError: if no case number can be parsed from the page.
    """
    d = {}

    # Fields which are captured directly by the first group of a regular expression.
    field_patterns = [
        (Fields.DATE, re.compile(r'>Date:.*\s{2,}(?:</strong>)?([^<]*)</')),
        (Fields.DECEASED,
         re.compile(
             r'>Deceased:\s*(?:</span>)?(?:</strong>)?\s*>?([^<]*\d)\s*.*\)?<')
         ),
        (Fields.LOCATION,
         re.compile(r'>Location:.*>\s{2,}(?:</strong>)?([^<]+)')),
    ]
    page = unicodedata.normalize("NFKD", detail_page)
    for field, pattern in field_patterns:
        found = pattern.search(page)
        if found:
            d[field] = found.group(1)

    # The `Case` field is the only mandatory one.
    case_number = parse_case_field(page)
    d[Fields.CASE] = case_number
    if not case_number:
        raise ValueError('A case number is mandatory.')

    # The `Crashes` and `Time` fields have dedicated parsers.
    d[Fields.CRASHES] = parse_crashes_field(page)
    d[Fields.TIME] = parse_time_field(page)

    # Expand the `Deceased` field; a parsing failure is only traced.
    deceased = d.get(Fields.DECEASED)
    if deceased:
        try:
            d.update(parse_deceased_field(deceased))
        except ValueError as err:
            logger.trace(err)
    else:
        logger.trace('No deceased information to parse in fatality page.')

    # Take the notes from the detail page unless they were already parsed.
    notes_pattern = re.compile(r'>Deceased:.*\s{2,}(.|\n)*?<\/p>(.|\n)*?<\/p>')
    notes_match = notes_pattern.search(page)
    if notes_match and not notes_parsed:
        # The notes parser receives the whole matched chunk of the page.
        d[Fields.NOTES] = parse_details_page_notes(notes_match.group(0))

    # The age can only be derived when both dates are known.
    collision_date = d.get(Fields.DATE)
    birth_date = d.get(Fields.DOB)
    if collision_date and birth_date:
        d[Fields.AGE] = date_utils.compute_age(collision_date, birth_date)

    return sanitize_fatality_entity(d)
예제 #5
0
def parse_twitter_description(twitter_description):
    """
    Parse the Twitter description metadata of a fatality.

    The description is split on whitespace. A word ending with a colon starts a new field (named
    after the word with its colons removed) and every following word is collected under that
    field. Words appearing before the first field are ignored.

    :param str twitter_description: Twitter description embedded in the fatality details page
    :return:
        An empty dictionary when the description is empty, otherwise the value returned by
        `sanitize_fatality_entity` for the parsed fatality.
    :rtype: dict
    """
    if not twitter_description:
        return {}

    # Group the words of the description under the field label preceding them.
    fields = {}
    label = None
    for token in twitter_description.split():
        if token.endswith(':'):
            label = token.replace(':', '')
        elif label:
            fields.setdefault(label, []).append(token)

    # The date of birth is a single token: keep only the first collected word.
    dob_tokens = fields.get(Fields.DOB)
    if dob_tokens and isinstance(dob_tokens, list):
        fields[Fields.DOB] = dob_tokens[0]

    # Expand the `Deceased` field; a parsing failure is only traced.
    deceased_tokens = fields.get(Fields.DECEASED)
    if deceased_tokens:
        try:
            fields.update(parse_deceased_field(' '.join(deceased_tokens)))
        except ValueError as err:
            logger.trace(err)
    else:
        logger.trace('No decease information to parse in Twitter description.')

    # The age can only be derived when both dates are known.
    date_tokens = fields.get(Fields.DATE)
    birth_date = fields.get(Fields.DOB)
    if date_tokens and birth_date:
        fields[Fields.AGE] = date_utils.compute_age(' '.join(date_tokens),
                                                    birth_date)

    return sanitize_fatality_entity(fields)