Пример #1
0
def wrangle_inmate_data_from_tag(rowtag):
    """Turn one parsed table row into a dictionary of inmate attributes.

    Args:
        rowtag: a parsed HTML row object exposing ``select``; its ``td``
            cells are read by position (0 through 9).

    Returns:
        dict: the whitespace-stripped cell texts keyed by field name. The
            link in cell 1 is passed through ``make_absolute_url`` and the
            three date cells through ``txdate_to_iso``. Two derived values
            come from ``calc_years_diff``: 'age_at_offense' (passed
            through ``round``) and 'years_before_death_row'.
    """
    cells = rowtag.select('td')

    def cell_text(position):
        # Whitespace-trimmed text of the cell in the given column.
        return cells[position].text.strip()

    # Note the source column order: county (8) precedes the offense date (9).
    record = {
        'tdcj_id': cell_text(0),
        'url': make_absolute_url(cells[1].select('a')[0].attrs['href']),
        'last_name': cell_text(2),
        'first_name': cell_text(3),
        'birthdate': txdate_to_iso(cell_text(4)),
        'gender': cell_text(5),
        'race': cell_text(6),
        'date_received': txdate_to_iso(cell_text(7)),
        'date_offense': txdate_to_iso(cell_text(9)),
        'county': cell_text(8),
    }

    # Derived attributes are computed from the already-normalized dates.
    years_old = calc_years_diff(record['birthdate'], record['date_offense'])
    record['age_at_offense'] = round(years_old)
    record['years_before_death_row'] = calc_years_diff(
        record['date_offense'], record['date_received'])
    return record
Пример #2
0
def wrangle_inmate_data_from_tag(rowtag):
    """
    Convert one row of the Texas death row HTML table into a dictionary.

    Args:
        rowtag: a BeautifulSoup <Tag> object, ostensibly representing a table row
            from a parsed Texas death row HTML table, e.g.

            <tr>
            <td>999608</td>
            <td align="center"><a href="dr_info/hudsonwilliam.html" title="Offender Information for William Hudson">Offender Information</a></td>
            <td>Hudson</td>
            <td>William</td>
            <td>07/03/1982</td>
            <td align="center">M</td>
            <td>White</td>
            <td>11/16/2017</td>
            <td>Anderson</td>
            <td>11/14/2015</td>
            </tr>

    Returns:
        <dict>: A dictionary object that contains some of the values in
            the HTML, with formatting where standardization is needed --
            e.g. for dates and for the inmate's URL (absolute vs relative)
            and some derived attributes, e.g. 'age_at_offense'

            The value for 'url' should be an absolute URL, i.e. a valid
               URL on the Web, not a relative one.

            'birthdate', 'date_received', 'date_offense' should be in
                'YYYY-MM-DD' format

            All values are strings (stripped of surrounding whitespace)
            except for:
               'age_at_offense', which is an integer derived from birthdate and
                                 date of offense (the year difference passed
                                 through round())

               'years_before_death_row', which is the float returned by
                              calc_years_diff (expected to be rounded to the
                              nearest tenth) for date of offense and date
                              received, i.e. number of years between
                              commission of crime and entering death row.

        e.g.
            {
                'tdcj_id': '999608',
                'url': 'https://wgetsnaps.github.io/tdcj-state-tx-us-2018/death_row/dr_info/hudsonwilliam.html',
                'last_name': 'Hudson',
                'first_name': 'William',
                'birthdate': '1982-07-03',
                'gender': 'M',
                'race': 'White',
                'date_received': '2017-11-16',
                'county': 'Anderson',
                'date_offense': '2015-11-14',
                'age_at_offense': 33,
                'years_before_death_row': 2.0
            }

    Raises:
        IndexError: if the row has fewer than ten <td> cells or the second
            cell contains no <a> element.
    """
    cells = rowtag.select('td')

    # Strip every cell up front: text cells must come out clean and the date
    # cells should not hand stray whitespace to txdate_to_iso.
    texts = [cell.text.strip() for cell in cells]

    birthdate = txdate_to_iso(texts[4])
    date_received = txdate_to_iso(texts[7])
    date_offense = txdate_to_iso(texts[9])

    inmate = {}
    inmate['tdcj_id'] = texts[0]
    inmate['url'] = make_absolute_url(cells[1].select('a')[0].attrs['href'])
    inmate['last_name'] = texts[2]
    inmate['first_name'] = texts[3]
    inmate['birthdate'] = birthdate
    inmate['gender'] = texts[5]
    inmate['race'] = texts[6]
    inmate['date_received'] = date_received
    inmate['county'] = texts[8]
    inmate['date_offense'] = date_offense

    # round(), not int(...) + 1: for the documented example (born 1982-07-03,
    # offense 2015-11-14) the expected age is 33, whereas truncating and
    # adding one reports 34.
    inmate['age_at_offense'] = round(calc_years_diff(birthdate, date_offense))
    inmate['years_before_death_row'] = calc_years_diff(date_offense,
                                                       date_received)

    return inmate
Пример #3
0
def wrangle_inmate_data_from_tag(rowtag):
    """
    Convert one row of the Texas death row HTML table into a dictionary.

    Args:
        rowtag: a BeautifulSoup <Tag> object, ostensibly representing a table row
            from a parsed Texas death row HTML table, e.g.

            <tr>
            <td>999608</td>
            <td align="center"><a href="dr_info/hudsonwilliam.html" title="Offender Information for William Hudson">Offender Information</a></td>
            <td>Hudson</td>
            <td>William</td>
            <td>07/03/1982</td>
            <td align="center">M</td>
            <td>White</td>
            <td>11/16/2017</td>
            <td>Anderson</td>
            <td>11/14/2015</td>
            </tr>

    Returns:
        <dict>: A dictionary object that contains some of the values in
            the HTML, with formatting where standardization is needed --
            e.g. for dates and for the inmate's URL (absolute vs relative)
            and some derived attributes, e.g. 'age_at_offense'

            The value for 'url' should be an absolute URL, i.e. a valid
               URL on the Web, not a relative one.

            'birthdate', 'date_received', 'date_offense' should be in
                'YYYY-MM-DD' format

            All values are strings (stripped of surrounding whitespace)
            except for:
               'age_at_offense', which is an integer derived from birthdate and
                                 date of offense (the year difference passed
                                 through round())

               'years_before_death_row', which is the float returned by
                              calc_years_diff (expected to be rounded to the
                              nearest tenth) for date of offense and date
                              received, i.e. number of years between
                              commission of crime and entering death row.

        e.g.
            {
                'tdcj_id': '999608',
                'url': 'https://wgetsnaps.github.io/tdcj-state-tx-us-2018/death_row/dr_info/hudsonwilliam.html',
                'last_name': 'Hudson',
                'first_name': 'William',
                'birthdate': '1982-07-03',
                'gender': 'M',
                'race': 'White',
                'date_received': '2017-11-16',
                'date_offense': '2015-11-14',
                'age_at_offense': 33,
                'years_before_death_row': 2.0,
                'county': 'Anderson'
            }

    Raises:
        IndexError: if the row has too few <td> cells or contains no
            <a> element inside a cell.
    """
    offender = {}
    cells = rowtag.select('td')

    # ID number
    offender['tdcj_id'] = cells[0].text.strip()

    # The first link found inside any cell of the row supplies the href,
    # which make_absolute_url turns into the absolute 'url'.
    relative_url = rowtag.select('td a')[0].attrs['href']
    offender['url'] = make_absolute_url(relative_url)

    # Last name, first name
    offender['last_name'] = cells[2].text.strip()
    offender['first_name'] = cells[3].text.strip()

    # Birthdate, converted by txdate_to_iso ('YYYY-MM-DD')
    birthdate = txdate_to_iso(cells[4].text.strip())
    offender['birthdate'] = birthdate

    offender['gender'] = cells[5].text.strip()
    offender['race'] = cells[6].text.strip()

    # Date received; stripped like every other cell so the date parser never
    # sees surrounding whitespace.
    date_received = txdate_to_iso(cells[7].text.strip())
    offender['date_received'] = date_received

    # Date of the offense is read from the last cell of the row.
    date_offense = txdate_to_iso(cells[-1].text.strip())
    offender['date_offense'] = date_offense

    # round(), not int(... + 1): for the documented example (born 1982-07-03,
    # offense 2015-11-14) the expected age is 33, whereas adding one before
    # truncating reports 34.
    offender['age_at_offense'] = round(calc_years_diff(birthdate, date_offense))

    # Years between the offense and arrival on death row.
    offender['years_before_death_row'] = calc_years_diff(date_offense,
                                                         date_received)

    # County is read from the second-to-last cell.
    offender['county'] = cells[-2].text.strip()

    return offender
Пример #4
0
def wrangle_inmate_data_from_tag(rowtag):
    """
    Build a dictionary of inmate attributes from one parsed table row.

    Args:
        rowtag: a BeautifulSoup <Tag> object, ostensibly a row of the Texas
            death row HTML table, e.g.

            <tr>
            <td>999608</td>
            <td align="center"><a href="dr_info/hudsonwilliam.html" title="Offender Information for William Hudson">Offender Information</a></td>
            <td>Hudson</td>
            <td>William</td>
            <td>07/03/1982</td>
            <td align="center">M</td>
            <td>White</td>
            <td>11/16/2017</td>
            <td>Anderson</td>
            <td>11/14/2015</td>
            </tr>

    Returns:
        <dict>: the stripped text of the cells, keyed by field name, where

            'url' is the link's href passed through make_absolute_url;

            'birthdate', 'date_received' and 'date_offense' are the cell
                texts passed through txdate_to_iso ('YYYY-MM-DD' expected);

            'age_at_offense' is round() of the year difference between
                birthdate and date of offense;

            'years_before_death_row' is the value calc_years_diff returns
                for date of offense and date received.

        e.g.
            {
                'tdcj_id': '999608',
                'url': 'https://wgetsnaps.github.io/tdcj-state-tx-us-2018/death_row/dr_info/hudsonwilliam.html',
                'last_name': 'Hudson',
                'first_name': 'William',
                'birthdate': '1982-07-03',
                'gender': 'M',
                'race': 'White',
                'date_received': '2017-11-16',
                'county': 'Anderson',
                'date_offense': '2015-11-14',
                'age_at_offense': 33,
                'years_before_death_row': 2.0
            }
    """
    cells = rowtag.select('td')

    # Normalize the dates first: they are both output fields and the inputs
    # of the two derived values.
    offense_iso = txdate_to_iso(cells[9].text.strip())
    birth_iso = txdate_to_iso(cells[4].text.strip())
    received_iso = txdate_to_iso(cells[7].text.strip())

    inmate = {}
    inmate['tdcj_id'] = cells[0].text.strip()
    inmate['url'] = make_absolute_url(cells[1].select('a')[0].attrs['href'])
    inmate['last_name'] = cells[2].text.strip()
    inmate['first_name'] = cells[3].text.strip()
    inmate['birthdate'] = birth_iso
    inmate['gender'] = cells[5].text.strip()
    inmate['race'] = cells[6].text.strip()
    inmate['date_received'] = received_iso
    inmate['county'] = cells[8].text.strip()
    inmate['date_offense'] = offense_iso

    years_old = calc_years_diff(birth_iso, offense_iso)
    inmate['age_at_offense'] = round(years_old)
    inmate['years_before_death_row'] = calc_years_diff(offense_iso,
                                                       received_iso)
    return inmate
def test_half_year():
    """2017-07-01 to 2018-01-01 must be reported as 0.5 years."""
    elapsed = calc_years_diff('2017-07-01', '2018-01-01')
    assert elapsed == 0.5
def test_decade_diff():
    """2002-05-01 to 2012-05-01 must compare equal to 10 years."""
    elapsed = calc_years_diff('2002-05-01', '2012-05-01')
    assert elapsed == 10
def test_zero_diff():
    """Identical start and end dates must give a difference of 0."""
    same_day = '2012-01-01'
    assert calc_years_diff(same_day, same_day) == 0
def test_return_type():
    """The computed difference must be a float instance."""
    elapsed = calc_years_diff('2012-01-01', '2015-05-05')
    assert isinstance(elapsed, float)