def get_pixelated_mug():
    """Upload a blurred copy of each new mugshot and attach it to its record.

    Selects records that have a PHOTO but no PIXELATED_IMG, were verified
    less than 24 hours ago and are not from the 'jcdc' jail. For every photo
    whose URL is served as 'image/jpeg', the URL is passed to
    ``uploader.upload`` with ``opacity=40`` and ``effect="blur:400"`` and the
    returned ``secure_url`` is stored in the record's PIXELATED_IMG field.
    Any other content type is printed and the record is skipped.

    The number of records actually updated is passed to ``wrap_it_up``.
    """
    t0, i = time.time(), 0
    needs_pix_img_formula = "AND(PHOTO != '', PIXELATED_IMG = '', hours_since_verification < 24, jail != 'jcdc')"
    records = airtab.get_all(formula=needs_pix_img_formula)
    for record in records:
        url = record["fields"]["PHOTO"][0]["url"]
        # .get() so a response without a Content-Type header is reported as a
        # weird content type instead of raising KeyError.
        content_type = requests.get(url).headers.get('Content-Type')
        if content_type != 'image/jpeg':
            print('this shit was some really weird content type:', content_type)
            continue
        try:
            upload_response = uploader.upload(url, opacity=40, effect="blur:400")
            # NOTE(review): the pause is presumably there to throttle API calls.
            time.sleep(1)
            this_dict = {"PIXELATED_IMG": [{"url": upload_response['secure_url']}]}
            airtab.update(record['id'], this_dict)
            # Count only the records that really received a pixelated image.
            i += 1
        except cloudinary.exceptions.Error as err1:
            print("cloudinary can't accept that shit: ", err1)
        except AttributeError as err2:
            print('Attribute Error for cloudinary upload: ', err2)
    wrap_it_up(t0, new=i, total=len(records), function='get_pixelated_mug')
def retry_getting_mugshot():
    """Re-scrape intake pages of recent records that still lack a mugshot.

    Selects records with an empty PHOTO field that were verified less than
    12 hours ago and are not from the 'lcdc' jail, downloads each record's
    'link' page and looks for the image using the markup rules of the
    record's jail ('jcdc', 'hcdc', 'kcdc' or 'acdc'). When an image is found,
    both 'img_src' and the 'PHOTO' attachment are written to the record.

    Records for which nothing was found are left untouched, and only the
    records that were actually updated are counted for ``wrap_it_up``.
    """
    t0, i = time.time(), 0
    needs_pic_formula = "AND(PHOTO = '', hours_since_verification < 12, jail != 'lcdc')"
    records = airtab.get_all(formula=needs_pic_formula)
    print("we're gonna retry getting mugs for", len(records), "records...")
    for record in records:
        this_dict = {}
        jail = record['fields']['jail']
        r = requests.get(record['fields']['link'])
        if jail == 'jcdc':
            soup = BeautifulSoup(r.text, 'html.parser').find(id='cms-content')
            # Each lookup may come back as None when the page layout differs,
            # so walk down step by step instead of chaining attribute access.
            holder = None
            if soup is not None:
                holder = soup.find('div', class_='inmate_profile_image')
            img_tag = holder.img if holder is not None else None
            # 'Image Not Availble' (sic) is the alt text the page is compared
            # against; the spelling must stay as-is.
            if img_tag is not None and img_tag.get('alt') != 'Image Not Availble':
                this_dict['img_src'] = f"https://www.jonesso.com/{img_tag['src']}"
                this_dict['PHOTO'] = [{'url': this_dict['img_src']}]
        elif jail == 'hcdc':
            soup = BeautifulSoup(r.text, 'html.parser')
            try:
                img_src = 'http://www.co.hinds.ms.us' + soup.find(
                    'img', {'align': 'middle'})['src']
            except TypeError:
                # soup.find() returned None, which is not subscriptable.
                print('no img tag in intake html')
            else:
                # Only accept the URL if it is really served as a JPEG.
                if requests.get(img_src).headers.get('Content-Type') == 'image/jpeg':
                    this_dict['img_src'] = img_src
                    this_dict['PHOTO'] = [{'url': img_src}]
                else:
                    print("image source isn't actually an image")
        elif jail == 'kcdc':
            soup = BeautifulSoup(r.text, 'html.parser').find(id='cms-content')
            img_tag = soup.find('img') if soup is not None else None
            if img_tag is not None:
                img_src_raw = img_tag['src']
                # Only images under the inmates folder are accepted.
                if img_src_raw.startswith(
                        'templates/kempercountysheriff.com/images/inmates'):
                    this_dict['img_src'] = f"https://www.kempercountysheriff.com/{img_src_raw}"
                    this_dict['PHOTO'] = [{'url': this_dict['img_src']}]
        elif jail == 'acdc':
            soup = BeautifulSoup(r.text, 'html.parser').find(
                'div', class_='blog-content-container')
            img_tag = soup.find('img') if soup is not None else None
            img_src = img_tag.get('src') if img_tag is not None else None
            if img_src:
                this_dict['img_src'] = img_src
                this_dict['PHOTO'] = [{'url': img_src}]
        else:
            print(f"awww hell... this one is from the {jail} docket/scraper...")
        # Skip the API call when nothing was found, and count real updates only.
        if this_dict:
            airtab.update(record['id'], this_dict)
            i += 1
    wrap_it_up(t0, new=i, total=len(records), function='retry_getting_mugshot')
def admits_otw(week, county, jail):
    """Store one jail's total admissions for a week on the weekly record.

    Args:
        week: value matched against the 'WOI' field of the weekly table and
            interpolated into the intake filter formula.
        county: prefix of the '<county> total admits' field that is written.
        jail: value the intake records' 'jail' field has to equal.
    """
    weekly_record = airtab_weekly.match('WOI', week)
    admits_formula = f"AND(WOI='{week}', jail='{jail}')"
    # The same filter is run against the active and the archive intake tables.
    active = airtab_intakes.get_all(fields='jail', formula=admits_formula)
    archived = airtab_archive_intakes.get_all(fields='jail', formula=admits_formula)
    n_active, n_archived = len(active), len(archived)
    payload = {f"{county} total admits": n_active + n_archived}
    airtab_weekly.update(weekly_record['id'], payload)
    print(week, ' --> ', payload, f"active, {n_active}; archive, {n_archived}")
def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, else None."""
    found = re.search(pattern, text)
    return found.group(1) if found else None


def get_all_intake_deets():
    """Parse demographics and charges out of 'jcadc' intake text.

    Selects 'jcadc' records that have 'recent_text' but no 'charges'. The
    text is split on the '\\nRequest Victim Notification\\n' line: the part
    before it is searched for race/sex, weight, height, eye color and age,
    and the part after it is read line by line as charges and
    '<classification> - Bond: $...' lines. Every field that cannot be found
    is reported with a print and left out of the update, so a single odd
    record no longer aborts the whole run.
    """
    t0, i = time.time(), 0
    jcadc_deets_formula = "AND(jail = 'jcadc', charges = '', recent_text != '')"
    records = airtab.get_all(formula=jcadc_deets_formula, fields='recent_text')
    # (field name, pattern, message printed when the pattern is absent)
    simple_fields = (
        ('intake_weight', r"(\d+) Pounds", 'there isnt weight info'),
        ('intake_height', r"(\d Ft\. \d+ In\.)", 'idk how tall this person is'),
        ('intake_eye', r"(\w+)\s+Eyes", 'eye color is a mystery'),
        # \d+ instead of \d\d so that an age of 100+ is not truncated.
        ('intake_age', r"(\d+) Years Old", 'age is unknown'),
    )
    for record in records:
        this_dict = {}
        chunks = record['fields']['recent_text'].split(
            '\nRequest Victim Notification\n')
        bio = chunks[0]

        race_sex = re.search(r"(\w+)\s+(Male|Female)", bio)
        if race_sex:
            raw_race = race_sex.group(1)
            # NOTE(review): 'AVAILABLE' is presumably the tail of a
            # "not available" placeholder; it is stored as 'U'.
            this_dict['race'] = 'U' if raw_race == 'AVAILABLE' else raw_race[0]
            this_dict['sex'] = race_sex.group(2)[0]
        else:
            print('there isnt race/sex info')

        for field, pattern, missing_msg in simple_fields:
            value = _first_group(pattern, bio)
            if value is None:
                print(missing_msg)
            else:
                this_dict[field] = value

        charges = []
        bond_ammts = []
        classifications = []
        # Without the separator line there is no charge section to read.
        crim_details = chunks[1].splitlines() if len(chunks) > 1 else []
        for ln in crim_details:
            results = re.search(r"([MF]\w+) - Bond: (\$.*)", ln)
            if results:
                bond_ammts.append(results.group(2))
                classifications.append(results.group(1))
            elif ', ' in ln:
                # Quote charges containing ', ' so they survive the
                # comma-separated join below.
                charges.append(f"\"{ln}\"")
            else:
                charges.append(ln)
        this_dict['charges'] = ', '.join(charges)
        this_dict['bond_ammounts'] = '\n'.join(bond_ammts)
        this_dict['charge_classifications'] = ', '.join(classifications)
        airtab.update(record['id'], this_dict, typecast=True)
        i += 1
    wrap_it_up(t0, new=i, total=len(records), function='get_all_intake_deets')
def pop_otd(day, county, jail, quiet=True):
    """Store one jail's population for a given day on the daily record.

    Counts the active intake records of ``jail`` whose DOI is before the day
    after ``day`` and whose last_verified is after the day before ``day``,
    and writes that count to the '<county> pop' field of the daily record
    whose 'date_str' equals ``day``. Only the active intakes table is
    queried.

    Args:
        day: date string in 'YYYY-MM-DD' format.
        county: prefix of the '<county> pop' field that is written.
        jail: value the intake records' 'jail' field has to equal.
        quiet: when False, the written values are printed.

    Raises:
        ValueError: if ``day`` is not in 'YYYY-MM-DD' format.
    """
    # Imported locally so this function works no matter whether the module
    # binds the name ``datetime`` to the module or to the class.
    from datetime import datetime, timedelta

    record = airtab_daily.match('date_str', day)
    try:
        record_id = record['id']
    except (KeyError, TypeError):
        # No daily record for this date: report it the same way admits_otd
        # does and skip the intake query altogether.
        print('no record')
        return
    this_day = datetime.strptime(day, '%Y-%m-%d')
    day_before = this_day - timedelta(1)
    day_after = this_day + timedelta(1)
    pop_formula = f"AND(IS_BEFORE(DOI, '{day_after.date()}'), IS_AFTER(last_verified, '{day_before.date()}'), jail='{jail}')"
    records = airtab_intakes.get_all(fields='jail', formula=pop_formula)
    this_dict = {f"{county} pop": len(records)}
    airtab_daily.update(record_id, this_dict)
    if not quiet:
        print(this_dict)
def admits_otd(day, county, jail, quiet=True):
    """Store one jail's admissions for a given day on the daily record.

    Counts the active intake records of ``jail`` whose DOI, formatted as
    'YYYY-MM-DD', equals ``day`` and writes the count to the
    '<county> admits' field of the daily record whose 'date_str' is ``day``.
    A KeyError during the update is reported as 'no record'.

    Args:
        day: date string compared against the formatted DOI.
        county: prefix of the '<county> admits' field that is written.
        jail: value the intake records' 'jail' field has to equal.
        quiet: when False, the computed values are printed.
    """
    daily_record = airtab_daily.match('date_str', day)
    admits_formula = f"AND(DATETIME_FORMAT(DOI, 'YYYY-MM-DD')='{day}', jail='{jail}')"
    matching_intakes = airtab_intakes.get_all(fields='jail', formula=admits_formula)
    payload = {f"{county} admits": len(matching_intakes)}
    try:
        airtab_daily.update(daily_record['id'], payload)
    except KeyError:
        print('no record')
    if quiet:
        return
    print(payload)
def update_dc_fields():
    """Refresh the document URL fields of records in 'need dc urls updated'.

    Reads up to 100 records from that view, fetches each record's document
    through ``dc.documents.get`` using its 'dc_id', and copies the document's
    pdf, canonical and normal-image URLs into the record, pausing 0.3 seconds
    after every update.
    """
    stale = airtab.get_all(view='need dc urls updated', fields='dc_id', max_records=100)
    print(len(stale), ' records need updated documentcloud URLs.')
    for rec in stale:
        doc = dc.documents.get(rec['fields'].get('dc_id'))
        refreshed = {
            "PDF": doc.pdf_url,
            "dc_canonical_url": doc.canonical_url,
            "dc_resources_page_image": doc.normal_image_url,
        }
        airtab.update(rec['id'], refreshed)
        time.sleep(.3)
def get_full_text():
    """Copy document metadata and full text into records that lack the text.

    For every record with a 'dc_id' but an empty 'dc_full_text', the document
    is fetched through ``dc.documents.get`` and its title, access level, page
    count and UTF-8 decoded full text are written to the record.
    """
    started, done = time.time(), 0
    pending = airtab.get_all(formula="AND(dc_id != '', dc_full_text = '')", fields=['dc_id'])
    # enumerate keeps the running count of processed records in `done`.
    for done, rec in enumerate(pending, start=1):
        doc = dc.documents.get(rec['fields']['dc_id'])
        doc_fields = {
            "dc_title": doc.title,
            "dc_access": doc.access,
            "dc_pages": doc.pages,
            "dc_full_text": doc.full_text.decode("utf-8"),
        }
        airtab.update(rec["id"], doc_fields)
    wrap_it_up(started, new=done, total=len(pending), function='get_full_text')
def update_summary(this_many=150):
    """Copy each record's 'blurb' into its 'summary' text field.

    The reason this field exists, rather than just using the 'blurb' field,
    is that the gallery view works better with a text field than it does
    with a formula field. Because the 'needs updated summary' view will
    regularly be packed full of records, at most ``this_many`` records
    (default 150) are processed per call.

    Args:
        this_many: maximum number of records fetched from the view.
    """
    t0, i = time.time(), 0
    records = airtab.get_all(view='needs updated summary', fields="blurb", max_records=this_many)
    for record in records:
        airtab.update(record["id"], {"summary": record["fields"]["blurb"]})
        # Count every refreshed summary so wrap_it_up gets a real number.
        i += 1
    wrap_it_up(t0, new=i, total=len(records), function='update_summary')
def remove_weird_character():
    """Strip the junk that starts at 'ã' from records' 'recent_text'.

    Selects records verified more than 12 hours ago whose 'recent_text'
    contains 'ã' after the first character. The span from the first 'ã' up
    to (not including) the next newline, or to the end of the text when no
    newline follows, is removed; every occurrence of that exact span in the
    text is dropped.
    """
    t0, i = time.time(), 0
    remove_wierd_character_formula = "AND(hours_since_verification > 12, FIND('ã', recent_text) > 1)"
    records = airtab.get_all(formula=remove_wierd_character_formula, fields='recent_text')
    for record in records:
        text = record['fields']['recent_text']
        x = text.find('ã')
        if x == -1:
            # Nothing to remove; the formula should prevent this.
            continue
        y = text.find('\n', x)
        if y == -1:
            # No newline after the marker: the junk runs to the end of the
            # text. Slicing with -1 would leave the last character behind.
            y = len(text)
        junk = text[x:y]
        airtab.update(record['id'], {'recent_text': text.replace(junk, '')})
        i += 1
    wrap_it_up(t0, new=i, total=len(records), function='remove_weird_character')
def web_to_pdf():
    """Render the intake page of every record in the 'needs pdf' view to PDF.

    The PDF is stored locally as ./output/<jail>/<intake_number or bk>.pdf
    with the page URL and the current time in the footer. For the 'mcdc' and
    'prcdf' jails the page is downloaded first and the PDF is only created
    when the intake number shown after 'INTAKE #:' equals the record's
    'intake_number'. Records whose page cannot be fetched or verified are
    skipped, and only the PDFs actually created are counted for
    ``wrap_it_up``.
    """
    t0, i = time.time(), 0
    records = airtab.get_all(view='needs pdf')
    for record in records:
        fields = record['fields']
        url = fields['link']
        jail = fields['jail']
        # These jails name their files by intake number; the rest use 'bk'.
        if jail in {'mcdc', 'prcdf', 'lcdc', 'jcadc'}:
            fn = f"./output/{jail}/{fields['intake_number']}.pdf"
        else:
            fn = f"./output/{jail}/{fields['bk']}.pdf"
        options = {
            'quiet': '',
            'footer-right': time.strftime('%c'),
            'footer-left': url,
            'javascript-delay': 5000}
        if jail == 'lcdc':
            options['zoom'] = '.75'
            options['viewport-size'] = '1000x1400'
            options['footer-font-size'] = 9
        else:
            options['footer-font-size'] = 10
        if jail in {'mcdc', 'prcdf'}:
            try:
                r = requests.get(url, headers=muh_headers)
            except requests.ConnectionError as err:
                print(f"Skipping {url}: {err}")
                time.sleep(5)
                continue
            soup = BeautifulSoup(r.text, 'html.parser')
            data = [str(string) for string in soup.stripped_strings]
            try:
                page_intake_number = data[1 + data.index('INTAKE #:')]
            except (ValueError, IndexError):
                # The label (or the value after it) is missing; treat that as
                # a mismatch instead of aborting the whole batch.
                page_intake_number = None
            if fields['intake_number'] != page_intake_number:
                print('the intake number does not match!')
                continue
        pdfkit.from_url(url, fn, options)
        i += 1
    wrap_it_up(t0, new=i, total=len(records), function='web_to_pdf')
def get_dor_if_possible(this_many=50):
    """Look for a release date on recent intake pages and record it as DOR.

    Selects up to ``this_many`` records from the 'kcdc', 'tcdc', 'ccdc' and
    'jcdc' jails that have no DOR and were verified between 6 and 48 hours
    ago. When a record's page contains the text 'Release Date:', the page is
    saved as './output/<jail>/updated/<bk> (final).pdf' and the value that
    follows the label is reformatted from '%m-%d-%Y - %I:%M %p' to
    '%m/%d/%Y %H:%M' and written to the record's DOR field.

    Args:
        this_many: maximum number of records fetched per call.
    """
    # Imported locally so this function works no matter whether the module
    # binds the name ``datetime`` to the module or to the class.
    from datetime import datetime as datetime_cls

    t0, i = time.time(), 0
    dor_formula = "AND(OR(jail = 'kcdc', jail = 'tcdc', jail = 'ccdc', jail = 'jcdc'), DOR = '', hours_since_verification > 6, hours_since_verification < 48)"
    records = airtab.get_all(formula=dor_formula, max_records=this_many)
    total = len(records)
    for record in records:
        link = record["fields"]["link"]
        try:
            r = requests.get(link)
        except requests.ConnectionError as err:
            print(f"Skipping {link}: {err}")
            time.sleep(5)
            continue
        soup = BeautifulSoup(r.text, "html.parser")
        data = [str(string) for string in soup.stripped_strings]
        if "Release Date:" not in data:
            continue
        options = {
            "quiet": "",
            "footer-font-size": 10,
            "footer-left": link,
            "footer-right": time.strftime('%c'),
        }
        directory = f"./output/{record['fields']['jail']}/updated"
        try:
            ensure_dir(directory)
            file_name = f"{record['fields']['bk']} (final).pdf"
            fn = os.path.join(directory, file_name)
            pdfkit.from_url(link, fn, options=options)
        except NotADirectoryError as err:
            # A failed snapshot does not prevent recording the release date.
            print(f"Can't write PDF: {err}")
        try:
            released = datetime_cls.strptime(
                data[1 + data.index("Release Date:")], "%m-%d-%Y - %I:%M %p")
        except (ValueError, IndexError) as err:
            # Unexpected date format, or nothing after the label: skip this
            # record rather than abort the rest of the batch.
            print(f"Can't parse release date for {link}: {err}")
            continue
        airtab.update(record["id"], {"DOR": released.strftime('%m/%d/%Y %H:%M')})
        i += 1
    wrap_it_up(t0, i, total, function='get_dor_if_possible')
def fix_charges_to_by_lines():
    """Rewrite the comma-separated 'charges' field as one charge per line.

    Works on records that have TEST_FORMULA set and TEST_RESULT empty.
    Double-quoted charges are kept whole with the quotes removed; unquoted
    stretches are split on ', '. The result is written to TEST_RESULT.
    """
    started, done = time.time(), 0
    pending = airtab.get_all(
        formula="AND(TEST_FORMULA != '', TEST_RESULT = '')", fields='charges')
    for done, rec in enumerate(pending, start=1):
        raw = rec['fields']['charges']
        # Put every quoted charge on a line of its own before splitting.
        isolated = raw.replace('", ', '"\n').replace(', "', '\n"')
        by_line = []
        for piece in isolated.splitlines():
            if piece.startswith('"'):
                by_line.append(piece.replace('"', ''))
            else:
                by_line.extend(piece.split(', '))
        airtab.update(rec['id'], {'TEST_RESULT': '\n'.join(by_line)})
    wrap_it_up(started, new=done, total=len(pending), function='fix_charges_to_by_lines')
def parse_charge_1():
    """Split 'charge_1' into 'charge_1_statute' and 'charge_1_title'.

    Handles recently scraped 'mcdc' and 'prcdf' records whose charge_1 has
    not been parsed and is not one of the excluded hold / drug court values.
    The split point is the first ')' directly followed by a capital letter,
    or failing that the first digit directly followed by a capital letter:
    the statute ends with the ')' or digit and the title starts at the
    capital letter. Records without such a split point are left untouched.
    """
    started, parsed = time.time(), 0
    needs_charge_1_parsed_formula = "AND(OR(jail = 'mcdc', jail = 'prcdf'), charge_1_statute = '', hours_since_initial_scrape < 48, charge_1 != '', charge_1 != 'HOLDHOLD', charge_1 != 'DRUGDRUG COURT', charge_1 != 'HLD Other AgencyHold for other Agency')"
    candidates = airtab.get_all(formula=needs_charge_1_parsed_formula)
    for rec in candidates:
        charge = rec["fields"]["charge_1"]
        # The ')' boundary takes priority over the digit boundary.
        boundary = re.search("[)][A-Z]", charge) or re.search("[0-9][A-Z]", charge)
        if not boundary:
            continue
        split_fields = {
            "charge_1_statute": charge[:boundary.start() + 1],
            "charge_1_title": charge[boundary.end() - 1:],
        }
        try:
            airtab.update(rec["id"], split_fields)
        except requests.exceptions.HTTPError as err:
            print(err)
        else:
            parsed += 1
    wrap_it_up(started, new=parsed, total=len(candidates), function='parse_charge_1')
def _quote_charge(charge):
    """Wrap *charge* in double quotes when it contains a comma."""
    return f'"{charge}"' if "," in charge else charge


def get_charges_from_recent_text():
    """Parse the scraped text/html of recent records and extract the charges.

    Selects records verified less than 72 hours ago that have 'html' and
    'recent_text', no 'charges_updated' value and are not flagged
    'no charges'. How the charges are located depends on the jail:

    * 'lcdc': rows of the html table body (charge, bond and fine columns,
      plus totals from the table footer when there is one);
    * 'kcdc': lines between 'Charges:' and 'Note:' in the text;
    * 'ccdc', 'tcdc', 'jcdc': lines between '\\nCharges:' and '\\nBond:';
    * 'hcdc': the line following each of 'Charge 1' .. 'Charge 4'.

    Charges are stored comma-separated; a charge that itself contains a
    comma is wrapped in double quotes so the list stays unambiguous.
    """
    t0, i = time.time(), 0
    needs_charges_formula = "AND(charges_updated = '', html != '', recent_text != '', hours_since_verification < 72, DONT_DELETE != 'no charges')"
    records = airtab.get_all(formula=needs_charges_formula)
    for record in records:
        this_dict = {}
        fields = record["fields"]
        jail = fields["jail"]
        if jail == "lcdc":
            charges = []
            bond_ammounts = []
            fine_ammounts = []
            soup = BeautifulSoup(fields["html"], "html.parser").tbody
            rows = soup.find_all("tr")
            if soup.tfoot:
                # The last row holds the totals, not a charge.
                goods = rows[:-1]
                totals = soup.tfoot.find_all("td")
                this_dict["intake_bond_cash"] = totals[2].b.string.strip()
                this_dict["intake_fine_ammount"] = totals[3].b.string.strip()
            else:
                goods = rows
            for row in goods:
                cells = row.find_all("td")
                charge = cells[0].string.strip()
                if charge:
                    charges.append(_quote_charge(charge))
                bond = cells[2].string.strip()
                if bond:
                    bond_ammounts.append(bond.replace(",", ""))
                fine = cells[3].string.strip()
                if fine:
                    fine_ammounts.append(fine.replace(",", ""))
            if charges:
                this_dict["charges"] = ", ".join(charges)
            if bond_ammounts:
                this_dict["bond_ammounts"] = "\n".join(bond_ammounts)
            if fine_ammounts:
                this_dict["fine_ammounts"] = "\n".join(fine_ammounts)
            airtab.update(record["id"], this_dict, typecast=True)
            i += 1
        elif jail == "kcdc":
            text = fields["recent_text"]
            goods = text[text.find("Charges:"):text.find("Note:")].splitlines()
            if len(goods) > 1:
                # goods[0] is the 'Charges:' header line itself. Join the
                # quoted charges (not the raw lines) so that charges with
                # commas stay distinguishable in the stored list.
                charges = [
                    _quote_charge(good.strip())
                    for good in goods[1:]
                    if good.strip()
                ]
                this_dict["charges"] = ", ".join(charges)
                airtab.update(record["id"], this_dict)
                i += 1
        elif jail in {"ccdc", "tcdc", "jcdc"}:
            text = fields["recent_text"]
            # 9 == len("\nCharges:"): start right after the label.
            x = text.find("\nCharges:") + 9
            y = text.find("\nBond:")
            goods = text[x:y].strip().splitlines()
            charges = [_quote_charge(line) for line in goods]
            this_dict["charges"] = ", ".join(charges)
            airtab.update(record["id"], this_dict)
            i += 1
        elif jail == "hcdc":
            data = fields["recent_text"].splitlines()
            try:
                messy = [
                    data[data.index(f"Charge {n}") + 1].strip()
                    for n in range(1, 5)
                ]
            except (ValueError, IndexError) as err:
                # A missing 'Charge N' label should not abort the whole run.
                print(f"can't find the hcdc charge lines: {err}")
                continue
            # Lines starting with 'Felony / Misd' are not kept as charges.
            goods = [
                _quote_charge(x)
                for x in messy
                if not x.startswith("Felony / Misd")
            ]
            this_dict["charges"] = ", ".join(goods)
            airtab.update(record["id"], this_dict)
            i += 1
    wrap_it_up(t0, new=i, total=len(records), function='get_charges_from_recent_text')