def wrangle_inmate_data_from_tag(rowtag):
    """Build a dictionary of inmate fields from one table-row tag.

    Args:
        rowtag: an object whose ``select('td')`` returns the row's cells.
            Cells 0 through 9 are read, and cell 1 must contain an ``<a>``
            element carrying an ``href`` attribute.

    Returns:
        dict: with these keys, in insertion order:
            'tdcj_id', 'last_name', 'first_name', 'gender', 'race',
            'county' -- the stripped text of the matching cell;
            'url' -- the link's href passed through make_absolute_url();
            'birthdate', 'date_received', 'date_offense' -- the stripped
            cell text passed through txdate_to_iso();
            'age_at_offense' -- round() of calc_years_diff(birthdate,
            date_offense);
            'years_before_death_row' -- calc_years_diff(date_offense,
            date_received).
    """
    cells = rowtag.select('td')

    def cell_text(position):
        # Every plain-text field is read the same way: cell text, trimmed.
        return cells[position].text.strip()

    absolute_url = make_absolute_url(cells[1].select('a')[0].attrs['href'])
    born = txdate_to_iso(cell_text(4))
    received = txdate_to_iso(cell_text(7))
    offense = txdate_to_iso(cell_text(9))

    return {
        'tdcj_id': cell_text(0),
        'url': absolute_url,
        'last_name': cell_text(2),
        'first_name': cell_text(3),
        'birthdate': born,
        'gender': cell_text(5),
        'race': cell_text(6),
        'date_received': received,
        'date_offense': offense,
        'county': cell_text(8),
        'age_at_offense': round(calc_years_diff(born, offense)),
        'years_before_death_row': calc_years_diff(offense, received),
    }
def wrangle_inmate_data_from_tag(rowtag):
    """Convert one Texas death row table row into a dictionary.

    Args:
        rowtag: a BeautifulSoup <Tag> object, ostensibly representing a
            table row from a parsed Texas death row HTML table, e.g.

            <tr>
              <td>999608</td>
              <td align="center"><a href="dr_info/hudsonwilliam.html"
                  title="Offender Information for William Hudson">Offender Information</a></td>
              <td>Hudson</td>
              <td>William</td>
              <td>07/03/1982</td>
              <td align="center">M</td>
              <td>White</td>
              <td>11/16/2017</td>
              <td>Anderson</td>
              <td>11/14/2015</td>
            </tr>

    Returns:
        dict: some of the values in the HTML, standardized where needed
        (dates, and the inmate's URL), plus derived attributes.

        'url' is the link's href passed through make_absolute_url(), so
        that it is an absolute URL rather than a relative one.
        'birthdate', 'date_received' and 'date_offense' are passed
        through txdate_to_iso() to get 'YYYY-MM-DD' format.

        All values are strings except for:
            'age_at_offense', an integer derived from birthdate and date
                of offense (the calc_years_diff() result truncated to
                whole years)
            'years_before_death_row', the float returned by
                calc_years_diff() for date of offense and date received,
                i.e. number of years between commission of the crime and
                entering death row.

        e.g.
        {
            'tdcj_id': '999608',
            'url': 'https://wgetsnaps.github.io/tdcj-state-tx-us-2018/death_row/dr_info/hudsonwilliam.html',
            'last_name': 'Hudson',
            'first_name': 'William',
            'birthdate': '1982-07-03',
            'gender': 'M',
            'race': 'White',
            'date_received': '2017-11-16',
            'county': 'Anderson',
            'date_offense': '2015-11-14',
            'age_at_offense': 33,
            'years_before_death_row': 2.0
        }

    Raises:
        IndexError: if the row has fewer than ten cells or the second
            cell contains no <a> element.
        KeyError: if the link has no 'href' attribute.
    """
    cat_array = rowtag.select('td')

    def cell_text(index):
        # Strip every cell uniformly so stray whitespace or newlines in the
        # markup never leak into the values or into the date parser.
        return cat_array[index].text.strip()

    return_dict = {}
    return_dict['tdcj_id'] = cell_text(0)
    return_dict['url'] = make_absolute_url(
        cat_array[1].select('a')[0].attrs['href'])
    return_dict['last_name'] = cell_text(2)
    return_dict['first_name'] = cell_text(3)
    return_dict['birthdate'] = txdate_to_iso(cell_text(4))
    return_dict['gender'] = cell_text(5)
    return_dict['race'] = cell_text(6)
    return_dict['date_received'] = txdate_to_iso(cell_text(7))
    return_dict['county'] = cell_text(8)
    return_dict['date_offense'] = txdate_to_iso(cell_text(9))

    # Truncate to whole years. The previous "+ 1" overstated every age by a
    # year (34 instead of the documented 33 for the example row above).
    return_dict['age_at_offense'] = int(
        calc_years_diff(return_dict['birthdate'], return_dict['date_offense']))
    return_dict['years_before_death_row'] = calc_years_diff(
        return_dict['date_offense'], return_dict['date_received'])
    return return_dict
def wrangle_inmate_data_from_tag(rowtag):
    """Convert one Texas death row table row into a dictionary.

    Args:
        rowtag: a BeautifulSoup <Tag> object, ostensibly representing a
            table row from a parsed Texas death row HTML table, e.g.

            <tr>
              <td>999608</td>
              <td align="center"><a href="dr_info/hudsonwilliam.html"
                  title="Offender Information for William Hudson">Offender Information</a></td>
              <td>Hudson</td>
              <td>William</td>
              <td>07/03/1982</td>
              <td align="center">M</td>
              <td>White</td>
              <td>11/16/2017</td>
              <td>Anderson</td>
              <td>11/14/2015</td>
            </tr>

    Returns:
        dict: some of the values in the HTML, standardized where needed
        (dates, and the inmate's URL), plus derived attributes.

        'url' is the href of the first link found inside the row's cells,
        passed through make_absolute_url() so that it is an absolute URL
        rather than a relative one.
        'birthdate', 'date_received' and 'date_offense' are passed
        through txdate_to_iso() to get 'YYYY-MM-DD' format.

        All values are strings except for:
            'age_at_offense', an integer derived from birthdate and date
                of offense (the calc_years_diff() result truncated to
                whole years)
            'years_before_death_row', the float returned by
                calc_years_diff() for date of offense and date received,
                i.e. number of years between commission of the crime and
                entering death row.

        e.g.
        {
            'tdcj_id': '999608',
            'url': 'https://wgetsnaps.github.io/tdcj-state-tx-us-2018/death_row/dr_info/hudsonwilliam.html',
            'last_name': 'Hudson',
            'first_name': 'William',
            'birthdate': '1982-07-03',
            'gender': 'M',
            'race': 'White',
            'date_received': '2017-11-16',
            'date_offense': '2015-11-14',
            'age_at_offense': 33,
            'years_before_death_row': 2.0,
            'county': 'Anderson'
        }

    Raises:
        IndexError: if the row has too few cells or contains no link.
        KeyError: if the link has no 'href' attribute.
    """
    offender_dict = {}
    td_elements = rowtag.select('td')

    # id number
    offender_dict['tdcj_id'] = td_elements[0].text.strip()

    # derive the absolute url from the relative one, file under 'url'
    url_line = rowtag.select('td a')
    relative_url = url_line[0].attrs['href']
    offender_dict['url'] = make_absolute_url(relative_url)

    # last name, first name
    offender_dict['last_name'] = td_elements[2].text.strip()
    offender_dict['first_name'] = td_elements[3].text.strip()

    # birthdate ('YYYY-MM-DD' format)
    iso_birthdate = txdate_to_iso(td_elements[4].text.strip())
    offender_dict['birthdate'] = iso_birthdate

    # gender and race
    offender_dict['gender'] = td_elements[5].text.strip()
    offender_dict['race'] = td_elements[6].text.strip()

    # date received ('YYYY-MM-DD' format); stripped like every other cell so
    # surrounding whitespace in the markup cannot reach the date parser
    date_received = txdate_to_iso(td_elements[7].text.strip())
    offender_dict['date_received'] = date_received

    # date of the offense is the last cell of the row ('YYYY-MM-DD' format)
    date_of_offense = txdate_to_iso(td_elements[-1].text.strip())
    offender_dict['date_offense'] = date_of_offense

    # Age at offense, truncated to whole years. The previous "+ 1" overstated
    # every age by a year (34 instead of the documented 33 for the example).
    offender_dict['age_at_offense'] = int(
        calc_years_diff(iso_birthdate, date_of_offense))

    # years between the offense and arrival on death row
    offender_dict['years_before_death_row'] = calc_years_diff(
        date_of_offense, date_received)

    # county is the second-to-last cell of the row
    offender_dict['county'] = td_elements[-2].text.strip()

    return offender_dict
def wrangle_inmate_data_from_tag(rowtag):
    """Convert one Texas death row table row into a dictionary.

    Args:
        rowtag: a BeautifulSoup <Tag> object, ostensibly representing a
            table row from a parsed Texas death row HTML table, e.g.

            <tr>
              <td>999608</td>
              <td align="center"><a href="dr_info/hudsonwilliam.html"
                  title="Offender Information for William Hudson">Offender Information</a></td>
              <td>Hudson</td>
              <td>William</td>
              <td>07/03/1982</td>
              <td align="center">M</td>
              <td>White</td>
              <td>11/16/2017</td>
              <td>Anderson</td>
              <td>11/14/2015</td>
            </tr>

    Returns:
        dict: some of the values in the HTML, standardized where needed
        (dates, and the inmate's URL), plus derived attributes.

        'url' is the href of the link in the second cell, passed through
        make_absolute_url() so that it is an absolute URL rather than a
        relative one.
        'birthdate', 'date_received' and 'date_offense' are passed
        through txdate_to_iso() to get 'YYYY-MM-DD' format.

        All values are strings except for:
            'age_at_offense', an integer: round() of calc_years_diff()
                for birthdate and date of offense
            'years_before_death_row', the float returned by
                calc_years_diff() for date of offense and date received,
                i.e. number of years between commission of the crime and
                entering death row.

        e.g.
        {
            'tdcj_id': '999608',
            'url': 'https://wgetsnaps.github.io/tdcj-state-tx-us-2018/death_row/dr_info/hudsonwilliam.html',
            'last_name': 'Hudson',
            'first_name': 'William',
            'birthdate': '1982-07-03',
            'gender': 'M',
            'race': 'White',
            'date_received': '2017-11-16',
            'county': 'Anderson',
            'date_offense': '2015-11-14',
            'age_at_offense': 33,
            'years_before_death_row': 2.0
        }
    """
    cells = rowtag.select('td')
    # Trim every cell once up front; fields are then picked out by position.
    texts = [cell.text.strip() for cell in cells]

    offense_iso = txdate_to_iso(texts[9])
    birth_iso = txdate_to_iso(texts[4])
    received_iso = txdate_to_iso(texts[7])

    inmate = {}
    inmate['tdcj_id'] = texts[0]
    inmate['url'] = make_absolute_url(cells[1].select('a')[0].attrs['href'])
    inmate['last_name'] = texts[2]
    inmate['first_name'] = texts[3]
    inmate['birthdate'] = birth_iso
    inmate['gender'] = texts[5]
    inmate['race'] = texts[6]
    inmate['date_received'] = received_iso
    inmate['county'] = texts[8]
    inmate['date_offense'] = offense_iso
    inmate['age_at_offense'] = round(calc_years_diff(birth_iso, offense_iso))
    inmate['years_before_death_row'] = calc_years_diff(
        offense_iso, received_iso)
    return inmate
def test_half_year():
    """calc_years_diff reports 0.5 for 2017-07-01 through 2018-01-01."""
    elapsed = calc_years_diff('2017-07-01', '2018-01-01')
    assert elapsed == 0.5
def test_decade_diff():
    """calc_years_diff reports 10 for 2002-05-01 through 2012-05-01."""
    elapsed = calc_years_diff('2002-05-01', '2012-05-01')
    assert elapsed == 10
def test_zero_diff():
    """calc_years_diff reports 0 when both dates are the same day."""
    same_day = '2012-01-01'
    assert calc_years_diff(same_day, same_day) == 0
def test_return_type():
    """calc_years_diff returns a float."""
    result = calc_years_diff('2012-01-01', '2015-05-05')
    assert isinstance(result, float)