Python get_all示例，common.airtab_intakes.get_all Python示例

示例#1

0

显示文件

文件： polish_data.py 项目： bfeldman89/jail_scrapers

def get_pixelated_mug():
    """Upload each new mugshot to cloudinary and store the blurred copy.

    Fetches the records matching the formula below (PHOTO set, PIXELATED_IMG
    empty, verified less than 24 hours ago, jail other than 'jcdc'). For each
    photo served as 'image/jpeg', the photo URL is handed to cloudinary with
    the blur/opacity settings and the resulting secure URL is written to the
    record's PIXELATED_IMG field. The number of records actually updated is
    passed to wrap_it_up as ``new``.
    """
    t0, i = time.time(), 0
    needs_pix_img_formula = "AND(PHOTO != '', PIXELATED_IMG = '', hours_since_verification < 24, jail != 'jcdc')"
    records = airtab.get_all(formula=needs_pix_img_formula)
    for record in records:
        url = record["fields"]["PHOTO"][0]["url"]
        r = requests.get(url)
        # .get() so a response without the header is reported below as a
        # weird content type instead of raising KeyError.
        content_type = r.headers.get('Content-Type')
        if content_type != 'image/jpeg':
            print('this shit was some really weird content type:',
                  content_type)
            continue
        try:
            upload_response = uploader.upload(url,
                                              opacity=40,
                                              effect="blur:400")
            time.sleep(1)
            this_dict = {
                "PIXELATED_IMG": [{"url": upload_response['secure_url']}]
            }
            airtab.update(record['id'], this_dict)
            # Count only the records that really received a pixelated image.
            i += 1
        except cloudinary.exceptions.Error as err1:
            print("cloudinary can't accept that shit: ", err1)
        except AttributeError as err2:
            print('Attribute Error for cloudinary upload: ', err2)
    wrap_it_up(t0, new=i, total=len(records), function='get_pixelated_mug')

示例#2

0

显示文件

文件： polish_data.py 项目： bfeldman89/jail_scrapers

def retry_getting_mugshot():
    """Re-scrape the intake page of recent records that still lack a PHOTO.

    Fetches the records matching the formula below (PHOTO empty, verified
    less than 12 hours ago, jail other than 'lcdc'), requests each record's
    ``link`` and looks for the mugshot with jail-specific parsing. When an
    image is found, ``img_src`` and ``PHOTO`` are written to the record.
    Records for which nothing was found are left untouched, and only the
    updated ones are counted as ``new`` in the wrap_it_up call.
    """
    t0, i = time.time(), 0
    needs_pic_formula = "AND(PHOTO = '', hours_since_verification < 12, jail != 'lcdc')"
    records = airtab.get_all(formula=needs_pic_formula)
    print("we're gonna retry getting mugs for", len(records), "records...")
    for record in records:
        this_dict = {}
        jail = record['fields']['jail']
        r = requests.get(record['fields']['link'])
        if jail == 'jcdc':
            soup = BeautifulSoup(r.text, 'html.parser').find(id='cms-content')
            try:
                img_tag = soup.find('div', class_='inmate_profile_image').img
                alt_text = img_tag['alt']
            except (AttributeError, TypeError, KeyError):
                # Missing container, missing <img>, or <img> without alt:
                # skip this record like the kcdc/acdc branches do.
                continue
            if alt_text != 'Image Not Availble':
                this_dict[
                    'img_src'] = f"https://www.jonesso.com/{img_tag['src']}"
                this_dict['PHOTO'] = [{'url': this_dict['img_src']}]
        elif jail == 'hcdc':
            soup = BeautifulSoup(r.text, 'html.parser')
            try:
                img_src = 'http://www.co.hinds.ms.us' + soup.find(
                    'img', {'align': 'middle'})['src']
                if requests.get(
                        img_src).headers['Content-Type'] == 'image/jpeg':
                    this_dict['img_src'] = img_src
                    this_dict['PHOTO'] = [{'url': img_src}]
                else:
                    print('image source isn\'t actually an image')
            except TypeError:
                # soup.find() returned None, so subscripting it failed.
                print('no img tag in intake html')
        elif jail == 'kcdc':
            soup = BeautifulSoup(r.text, 'html.parser').find(id='cms-content')
            if soup is None:
                continue
            img_tag = soup.find('img')
            if img_tag is not None:
                img_src_raw = img_tag['src']
                if img_src_raw.startswith(
                        'templates/kempercountysheriff.com/images/inmates'):
                    this_dict[
                        'img_src'] = f"https://www.kempercountysheriff.com/{img_src_raw}"
                    this_dict['PHOTO'] = [{'url': this_dict['img_src']}]
        elif jail == 'acdc':
            soup = BeautifulSoup(r.text, 'html.parser').find(
                'div', class_='blog-content-container')
            try:
                img_tag = soup.find('img')
                this_dict['img_src'] = img_tag.get('src')
                this_dict['PHOTO'] = [{'url': this_dict['img_src']}]
            except AttributeError:
                # Either the container or the <img> inside it is missing.
                continue
        else:
            print(
                f"awww hell... this one is from the {jail} docket/scraper..."
            )
        if not this_dict:
            # Nothing was scraped; don't send an empty update.
            continue
        airtab.update(record['id'], this_dict)
        i += 1
    wrap_it_up(t0, new=i, total=len(records), function='retry_getting_mugshot')

示例#3

0

显示文件

文件： weekly_snapshot.py 项目： bfeldman89/jail_scrapers

def admits_otw(week, county, jail):
    """Store the number of admits for one jail during one week.

    Looks up the weekly record whose WOI equals ``week``, counts the intake
    records with that WOI and ``jail`` in both the active and the archive
    intake tables, and writes the sum to the "<county> total admits" field
    of the weekly record. The result is also printed.
    """
    weekly_row = airtab_weekly.match('WOI', week)
    admits_formula = f"AND(WOI='{week}', jail='{jail}')"
    n_active = len(
        airtab_intakes.get_all(fields='jail', formula=admits_formula))
    n_archive = len(
        airtab_archive_intakes.get_all(fields='jail', formula=admits_formula))
    payload = {f"{county} total admits": n_active + n_archive}
    airtab_weekly.update(weekly_row['id'], payload)
    print(week, ' --> ', payload,
          f"active, {n_active}; archive, {n_archive}")

示例#4

0

显示文件

文件： polish_data.py 项目： bfeldman89/jail_scrapers

def get_all_intake_deets():
    """Parse demographics and charges out of ``recent_text`` for jcadc intakes.

    For every record matching the formula below (jail 'jcadc', no charges yet,
    non-empty recent_text) the text is split on the "Request Victim
    Notification" line. Race, sex, weight, height, eye colour and age are
    pulled from the first part with regexes; each of them is optional and a
    message is printed when one is missing. The second part, when present, is
    read line by line: lines matching "<classification> - Bond: $..." feed the
    bond amounts and classifications, every other line is taken as a charge
    (quoted when it contains ", "). The collected fields are written back with
    ``typecast=True``.
    """
    t0, i = time.time(), 0
    jcadc_deets_formula = "AND(jail = 'jcadc', charges = '', recent_text != '')"
    records = airtab.get_all(formula=jcadc_deets_formula, fields='recent_text')
    for record in records:
        charges = []
        bond_ammts = []
        classifications = []
        this_dict = {}
        txt_str = record['fields']['recent_text']
        chunks = txt_str.split('\nRequest Victim Notification\n')
        match_1 = re.search(r"(\w+)\s+(Male|Female)", chunks[0])
        try:
            raw_race = match_1.group(1)
            if raw_race == 'AVAILABLE':
                this_dict['race'] = 'U'
            else:
                this_dict['race'] = raw_race[0]
            this_dict['sex'] = match_1.group(2)[0]
        except AttributeError:
            print('there isnt race/sex info')
        try:
            this_dict['intake_weight'] = re.search(r"(\d+) Pounds",
                                                   chunks[0]).group(1)
        except AttributeError:
            print('there isnt weight info')
        try:
            this_dict['intake_height'] = re.search(r"(\d Ft. \d+ In.)",
                                                   chunks[0]).group(1)
        except AttributeError:
            print('idk how tall this person is')
        try:
            this_dict['intake_eye'] = re.search(r"(\w+)\s+Eyes",
                                                chunks[0]).group(1)
        except AttributeError:
            print('eye color is a mystery')
        # Age is guarded like the other optional details so one record
        # without it no longer aborts the whole run.
        try:
            this_dict['intake_age'] = re.search(r"(\d\d) Years Old",
                                                chunks[0]).group(1)
        except AttributeError:
            print('there isnt age info')
        # Without the delimiter line there is no second chunk to read.
        crim_details = chunks[1].splitlines() if len(chunks) > 1 else []
        for ln in crim_details:
            results = re.search(r"([MF]\w+) - Bond: (\$.*)", ln)
            if results:
                bond_ammts.append(results.group(2))
                classifications.append(results.group(1))
            elif ', ' in ln:
                # Quote charges containing ", " because the charges are
                # joined with ", " below.
                charges.append(f"\"{ln}\"")
            else:
                charges.append(ln)
        this_dict['charges'] = ', '.join(charges)
        this_dict['bond_ammounts'] = '\n'.join(bond_ammts)
        this_dict['charge_classifications'] = ', '.join(classifications)
        airtab.update(record['id'], this_dict, typecast=True)
        i += 1
    wrap_it_up(t0, new=i, total=len(records), function='get_all_intake_deets')

示例#5

0

显示文件

文件： snapshot.py 项目： bfeldman89/jail_scrapers

def pop_otd(day, county, jail, quiet=True):
    """Store the jail population for one day on the matching daily record.

    Args:
        day: Date string in ``%Y-%m-%d`` format; also the ``date_str`` value
            used to look up the daily record.
        county: Prefix of the "<county> pop" field that is written.
        jail: Value the intake records' ``jail`` field must equal.
        quiet: When false, the written values are printed.

    The population is the number of active intake records whose DOI is before
    the day after ``day`` and whose last_verified is after the day before
    ``day``. If the daily lookup yields a record without an ``id``
    ('no record' is printed), nothing is written, matching admits_otd.
    """
    record = airtab_daily.match('date_str', day)
    the_day = datetime.strptime(day, '%Y-%m-%d')
    day_before = the_day - timedelta(1)
    day_after = the_day + timedelta(1)
    pop_formula = f"AND(IS_BEFORE(DOI, '{day_after.date()}'), IS_AFTER(last_verified, '{day_before.date()}'), jail='{jail}')"
    records = airtab_intakes.get_all(fields='jail', formula=pop_formula)
    this_dict = {f"{county} pop": len(records)}
    try:
        airtab_daily.update(record['id'], this_dict)
    except KeyError:
        # Same handling as admits_otd: a day without a daily record is
        # reported instead of aborting the caller's loop.
        print('no record')
    if not quiet:
        print(this_dict)

示例#6

0

显示文件

文件： snapshot.py 项目： bfeldman89/jail_scrapers

def admits_otd(day, county, jail, quiet=True):
    """Store the number of admits for one jail on one day.

    Looks up the daily record whose ``date_str`` equals ``day``, counts the
    active intake records whose DOI falls on that day for ``jail`` and writes
    the count to the "<county> admits" field. A KeyError during the update is
    reported as 'no record'. With ``quiet`` false the values are printed.
    """
    daily_row = airtab_daily.match('date_str', day)
    admits_formula = f"AND(DATETIME_FORMAT(DOI, 'YYYY-MM-DD')='{day}', jail='{jail}')"
    admitted = airtab_intakes.get_all(fields='jail', formula=admits_formula)
    payload = {f"{county} admits": len(admitted)}
    try:
        airtab_daily.update(daily_row['id'], payload)
    except KeyError:
        print('no record')
    if quiet:
        return
    print(payload)

示例#7

0

显示文件

文件： polish_data.py 项目： bfeldman89/jail_scrapers

def update_dc_fields():
    """Refresh the documentcloud URL fields of up to 100 records.

    Reads the 'need dc urls updated' view, fetches each record's document by
    its ``dc_id`` and copies the PDF, canonical and normal-image URLs into the
    record, pausing 0.3 seconds between records.
    """
    stale = airtab.get_all(view='need dc urls updated',
                           fields='dc_id',
                           max_records=100)
    print(len(stale), ' records need updated documentcloud URLs.')
    for row in stale:
        doc = dc.documents.get(row['fields'].get('dc_id'))
        payload = {
            "PDF": doc.pdf_url,
            "dc_canonical_url": doc.canonical_url,
            "dc_resources_page_image": doc.normal_image_url,
        }
        airtab.update(row['id'], payload)
        time.sleep(.3)

示例#8

0

显示文件

文件： polish_data.py 项目： bfeldman89/jail_scrapers

def get_full_text():
    """Copy documentcloud metadata and full text into records that lack it.

    Selects the records with a ``dc_id`` but an empty ``dc_full_text``, fetches
    each document and stores its title, access level, page count and UTF-8
    decoded full text on the record.
    """
    t0, i = time.time(), 0
    pending = airtab.get_all(formula="AND(dc_id != '', dc_full_text = '')",
                             fields=['dc_id'])
    for row in pending:
        doc = dc.documents.get(row['fields']['dc_id'])
        payload = {
            "dc_title": doc.title,
            "dc_access": doc.access,
            "dc_pages": doc.pages,
            "dc_full_text": doc.full_text.decode("utf-8"),
        }
        airtab.update(row["id"], payload)
        i += 1
    wrap_it_up(t0, new=i, total=len(pending), function='get_full_text')

示例#9

0

显示文件

文件： polish_data.py 项目： bfeldman89/jail_scrapers

def update_summary(this_many=150):
    """Copy each record's 'blurb' value into its 'summary' field.

    The reason we have this field, rather than just use the 'blurb' field, is
    bc the gallery view works better with a text field than it does with a
    formula field. Because the 'needs updated summary' view will regularly be
    packed full of records, at most ``this_many`` records (default 150) are
    processed per call.

    Args:
        this_many: Maximum number of records to fetch and update.
    """
    t0, i = time.time(), 0
    records = airtab.get_all(view='needs updated summary',
                             fields="blurb",
                             max_records=this_many)
    for record in records:
        this_dict = {"summary": record["fields"]["blurb"]}
        airtab.update(record["id"], this_dict)
        # Count the update so wrap_it_up reports the real number.
        i += 1
    wrap_it_up(t0, new=i, total=len(records), function='update_summary')

示例#10

0

显示文件

文件： polish_data.py 项目： bfeldman89/jail_scrapers

def remove_weird_character():
    """Strip the 'ã' fragment out of ``recent_text``.

    For each record matching the formula below, the span from the first 'ã'
    up to (not including) the next newline is removed from ``recent_text``;
    every occurrence of that exact span is removed. When no newline follows,
    the span runs to the end of the text.
    """
    t0, i = time.time(), 0
    remove_wierd_character_formula = "AND(hours_since_verification > 12, FIND('ã', recent_text) > 1)"
    records = airtab.get_all(formula=remove_wierd_character_formula,
                             fields='recent_text')
    for record in records:
        text = record['fields']['recent_text']
        x = text.find('ã')
        if x == -1:
            # Nothing to remove; slicing from -1 would cut unrelated text.
            continue
        y = text.find('\n', x)
        if y == -1:
            # No newline after the character: the fragment is the tail of the
            # text. Without this, text[x:-1] would stop one character short.
            y = len(text)
        this_dict = {'recent_text': text.replace(text[x:y], '')}
        airtab.update(record['id'], this_dict)
        i += 1
    wrap_it_up(t0,
               new=i,
               total=len(records),
               function='remove_weird_character')

示例#11

0

显示文件

def web_to_pdf():
    """Render the intake page of every record in the 'needs pdf' view to PDF.

    The PDF is written to ``./output/<jail>/<intake_number or bk>.pdf`` with
    the page URL and a timestamp in the footer. For 'mcdc' and 'prcdf' the
    page is fetched first and the PDF is only created when the intake number
    shown after the 'INTAKE #:' label equals the record's intake number; pages
    that cannot be fetched or that lack the label are skipped with a message.
    """
    t0 = time.time()
    records = airtab.get_all(view='needs pdf')
    # Every fetched record is reported as new, whether or not a PDF resulted.
    total = len(records)
    for record in records:
        url = record['fields']['link']
        jail = record['fields']['jail']
        if jail in {'mcdc', 'prcdf', 'lcdc', 'jcadc'}:
            fn = f"./output/{jail}/{record['fields']['intake_number']}.pdf"
        else:
            fn = f"./output/{jail}/{record['fields']['bk']}.pdf"
        options = {
            'quiet': '',
            'footer-right': time.strftime('%c'),
            'footer-left': url,
            'javascript-delay': 5000}
        if jail == 'lcdc':
            options['zoom'] = '.75'
            options['viewport-size'] = '1000x1400'
            options['footer-font-size'] = 9
        else:
            options['footer-font-size'] = 10
        if jail not in {'mcdc', 'prcdf'}:
            pdfkit.from_url(url, fn, options)
            continue
        try:
            r = requests.get(url, headers=muh_headers)
        except requests.ConnectionError as err:
            print(f"Skipping {url}: {err}")
            time.sleep(5)
            continue
        soup = BeautifulSoup(r.text, 'html.parser')
        data = [str(string) for string in soup.stripped_strings]
        try:
            page_intake_number = data[1 + data.index('INTAKE #:')]
        except (ValueError, IndexError):
            # Label missing (or last string on the page): skip this record
            # instead of aborting the whole batch.
            print(f"Skipping {url}: no intake number on the page")
            continue
        if record['fields']['intake_number'] == page_intake_number:
            pdfkit.from_url(url, fn, options)
        else:
            print('the intake number does not match!')
    wrap_it_up(t0, new=total, total=total, function='web_to_pdf')

示例#12

0

显示文件

def get_dor_if_possible(this_many=50):
    """Look for a release date on recently verified intakes and store it.

    Fetches up to ``this_many`` records matching the formula below (jail
    'kcdc', 'tcdc', 'ccdc' or 'jcdc', empty DOR, verified between 6 and 48
    hours ago) and requests each record's ``link``. When the page contains a
    'Release Date:' label followed by a parseable date, a "(final)" PDF of the
    page is written under ``./output/<jail>/updated`` and the date is saved to
    the record's DOR field as ``%m/%d/%Y %H:%M``.

    Args:
        this_many: Maximum number of records to fetch.
    """
    t0, i = time.time(), 0
    dor_formula = "AND(OR(jail = 'kcdc', jail = 'tcdc', jail = 'ccdc', jail = 'jcdc'), DOR = '', hours_since_verification > 6, hours_since_verification < 48)"
    records = airtab.get_all(formula=dor_formula, max_records=this_many)
    total = len(records)
    for record in records:
        link = record["fields"]["link"]
        try:
            r = requests.get(link)
        except requests.ConnectionError as err:
            print(f"Skipping {link}: {err}")
            time.sleep(5)
            continue
        soup = BeautifulSoup(r.text, "html.parser")
        data = [str(string) for string in soup.stripped_strings]
        if "Release Date:" not in data:
            continue
        # Parse the date before writing anything, so a label without a usable
        # value skips the record instead of aborting the batch after a
        # "(final)" PDF has already been produced.
        try:
            released = datetime.datetime.strptime(
                data[1 + data.index("Release Date:")], "%m-%d-%Y - %I:%M %p"
            )
        except (IndexError, ValueError) as err:
            print(f"Skipping {link}: can't parse release date: {err}")
            continue
        options = {
            "quiet": "",
            "footer-font-size": 10,
            "footer-left": link,
            "footer-right": time.strftime('%c'),
        }
        directory = f"./output/{record['fields']['jail']}/updated"
        try:
            ensure_dir(directory)
            file_name = f"{record['fields']['bk']} (final).pdf"
            fn = os.path.join(directory, file_name)
            pdfkit.from_url(link, fn, options=options)
        except NotADirectoryError as err:
            # The DOR is still recorded even when the PDF can't be written.
            print(f"Can't write PDF: {err}")
        this_dict = {"DOR": released.strftime('%m/%d/%Y %H:%M')}
        airtab.update(record["id"], this_dict)
        i += 1
    wrap_it_up(t0, i, total, function='get_dor_if_possible')

示例#13

0

显示文件

文件： polish_data.py 项目： bfeldman89/jail_scrapers

def fix_charges_to_by_lines():
    """Rewrite the comma-separated ``charges`` value as one charge per line.

    Quoted charges (which may contain ", ") are kept whole with their quotes
    removed; unquoted stretches are split on ", ". The result is stored in
    TEST_RESULT for records with a TEST_FORMULA value and an empty TEST_RESULT.
    """
    t0, i = time.time(), 0
    records = airtab.get_all(
        formula="AND(TEST_FORMULA != '', TEST_RESULT = '')", fields='charges')
    for row in records:
        raw = row['fields']['charges']
        # Put a line break at every boundary next to a quoted charge.
        broken_up = raw.replace('", ', '"\n').replace(', "', '\n"')
        by_line = []
        for piece in broken_up.splitlines():
            if piece.startswith('"'):
                by_line.append(piece.replace('"', ''))
            else:
                by_line.extend(piece.split(', '))
        airtab.update(row['id'], {'TEST_RESULT': '\n'.join(by_line)})
        i += 1
    wrap_it_up(t0,
               new=i,
               total=len(records),
               function='fix_charges_to_by_lines')

示例#14

0

显示文件

文件： polish_data.py 项目： bfeldman89/jail_scrapers

def parse_charge_1():
    """Split ``charge_1`` into a statute part and a title part.

    The split point is the first place where ')' is directly followed by a
    capital letter or, failing that, where a digit is directly followed by a
    capital letter. Everything up to and including the ')' or digit becomes
    ``charge_1_statute``; the rest becomes ``charge_1_title``. Records without
    such a boundary are left alone; HTTP errors on update are printed.
    """
    t0, i = time.time(), 0
    needs_charge_1_parsed_formula = "AND(OR(jail = 'mcdc', jail = 'prcdf'), charge_1_statute = '', hours_since_initial_scrape < 48, charge_1 != '', charge_1 != 'HOLDHOLD', charge_1 != 'DRUGDRUG COURT', charge_1 != 'HLD Other AgencyHold for other Agency')"
    records = airtab.get_all(formula=needs_charge_1_parsed_formula)
    for row in records:
        charge = row["fields"]["charge_1"]
        boundary = (re.search("[)][A-Z]", charge)
                    or re.search("[0-9][A-Z]", charge))
        if not boundary:
            continue
        payload = {
            "charge_1_statute": charge[:boundary.start() + 1],
            "charge_1_title": charge[boundary.end() - 1:],
        }
        try:
            airtab.update(row["id"], payload)
            i += 1
        except requests.exceptions.HTTPError as err:
            print(err)
    wrap_it_up(t0, new=i, total=len(records), function='parse_charge_1')

示例#15

0

显示文件

文件： polish_data.py 项目： bfeldman89/jail_scrapers

def get_charges_from_recent_text():
    """Parse the scraped html / recent text and extract the listed charges.

    Works on the records matching the formula below. The parsing depends on
    the record's jail:

    * 'lcdc': reads the rows of the html table body; charges, bond amounts
      and fine amounts come from cells 0, 2 and 3, and the totals from the
      table footer when there is one.
    * 'kcdc': takes the lines between "Charges:" and "Note:" in recent_text,
      leaving out the "Charges:" line itself.
    * 'ccdc', 'tcdc', 'jcdc': takes the lines between "\\nCharges:" and
      "\\nBond:" in recent_text.
    * 'hcdc': takes the line following each of "Charge 1" .. "Charge 4",
      dropping those that start with "Felony / Misd".

    Charges are joined with ", ", so a charge that itself contains a comma is
    wrapped in double quotes.
    """
    t0, i = time.time(), 0
    needs_charges_formula = "AND(charges_updated = '', html != '', recent_text != '', hours_since_verification < 72, DONT_DELETE != 'no charges')"
    records = airtab.get_all(formula=needs_charges_formula)
    for record in records:
        this_dict = {}
        if record["fields"]["jail"] == "lcdc":
            charges = []
            bond_ammounts = []
            fine_ammounts = []
            soup = BeautifulSoup(record["fields"]["html"], "html.parser").tbody
            rows = soup.find_all("tr")
            if soup.tfoot:
                # The last row is left out of the charge rows when a footer
                # with the totals is present.
                goods = rows[:len(rows) - 1]
                this_dict["intake_bond_cash"] = soup.tfoot.find_all(
                    "td")[2].b.string.strip()
                this_dict["intake_fine_ammount"] = soup.tfoot.find_all(
                    "td")[3].b.string.strip()
            else:
                goods = rows
            for row in goods:
                cells = row.find_all("td")
                if cells[0].string.strip():
                    if "," in cells[0].string.strip():
                        charges.append('"' + cells[0].string.strip() + '"')
                    else:
                        charges.append(cells[0].string.strip())
                if cells[2].string.strip():
                    bond_ammounts.append(cells[2].string.strip().replace(
                        ",", ""))
                if cells[3].string.strip():
                    fine_ammounts.append(cells[3].string.strip().replace(
                        ",", ""))
            if charges:
                this_dict["charges"] = ", ".join(charges)
            if bond_ammounts:
                this_dict["bond_ammounts"] = "\n".join(bond_ammounts)
            if fine_ammounts:
                this_dict["fine_ammounts"] = "\n".join(fine_ammounts)
            airtab.update(record["id"], this_dict, typecast=True)
            i += 1
        elif record["fields"]["jail"] == "kcdc":
            charges = []
            text = record["fields"]["recent_text"]
            goods = text[text.find("Charges:"):text.find("Note:")].splitlines()
            if len(goods) > 1:
                # goods[0] is the line the slice starts on (the "Charges:"
                # line), so only the lines after it are charges. The quoted
                # list is what gets stored; previously it was built and then
                # discarded in favour of the raw lines.
                for good in goods[1:]:
                    if "," in good:
                        charges.append('"' + good.strip() + '"')
                    else:
                        charges.append(good)
                this_dict["charges"] = ", ".join(charges)
                airtab.update(record["id"], this_dict)
                i += 1
        elif record["fields"]["jail"] in {"ccdc", "tcdc", "jcdc"}:
            charges = []
            text = record["fields"]["recent_text"]
            # 9 == len("\nCharges:"), i.e. start right after the label.
            x = text.find("\nCharges:") + 9
            y = text.find("\nBond:")
            goods = text[x:y].strip().splitlines()
            for line in goods:
                if "," in line:
                    charges.append('"' + line + '"')
                else:
                    charges.append(line)
            this_dict["charges"] = ", ".join(charges)
            airtab.update(record["id"], this_dict)
            i += 1
        elif record["fields"]["jail"] == "hcdc":
            messy = []
            goods = []
            data = record["fields"]["recent_text"].splitlines()
            messy.append(data[data.index("Charge 1") + 1].strip())
            messy.append(data[data.index("Charge 2") + 1].strip())
            messy.append(data[data.index("Charge 3") + 1].strip())
            messy.append(data[data.index("Charge 4") + 1].strip())
            for x in messy:
                if not x.startswith("Felony / Misd"):
                    if "," in x:
                        goods.append('"' + x + '"')
                    else:
                        goods.append(x)
            this_dict["charges"] = ", ".join(goods)
            airtab.update(record["id"], this_dict)
            i += 1
    wrap_it_up(t0,
               new=i,
               total=len(records),
               function='get_charges_from_recent_text')