def check_case(case_number, charge_cls, parser_cls, receipt_handle):
    """Re-parse the charges for a single case, then remove it from the parser queue.

    Fetches the stored case HTML from S3, deletes the existing charge rows for
    the case, and re-runs charge/disposition parsing. If that hits an
    IntegrityError, the whole case is re-parsed from scratch (best effort —
    a failure there is logged, not raised). The SQS message is deleted
    regardless of the parse outcome.

    Args:
        case_number: Case identifier; also the S3 object key for its HTML.
        charge_cls: ORM charge model whose rows for this case are deleted
            and rebuilt.
        parser_cls: Parser class instantiated as parser_cls(case_number, html).
        receipt_handle: SQS receipt handle used to delete the queue message.
    """
    logger.info(f'Processing {case_number}')
    # Fetch HTML from S3 and set up parser
    case_details = config.case_details_bucket.Object(case_number).get()
    case_html = case_details['Body'].read().decode('utf-8')
    parser = parser_cls(case_number, case_html)
    # Delete and reparse charges
    try:
        with db_session() as db:
            db.execute(charge_cls.__table__.delete()
                .where(charge_cls.case_number == case_number))
            parser.charge_and_disposition(db, parser.soup)
            parser.update_last_parse(db)
    except IntegrityError:
        logger.warning(f'Reparsing {case_number} due to integrity error')
        try:
            parser.parse()
        except Exception:
            # Best effort: was a bare `except: pass`. Narrowed so we no
            # longer swallow SystemExit/KeyboardInterrupt, and we keep the
            # traceback in the log while still deleting the queue message.
            logger.warning(f'Failed to parse {case_number}', exc_info=True)
    logger.debug(f'Deleting {case_number} from parser queue')
    config.parser_queue.delete_messages(
        Entries=[{
            'Id': 'unused',
            'ReceiptHandle': receipt_handle
        }])
def export_tables(args, bucket='caseharvester-exports', region='us-east-1'):
    """Export every case table to CSV in S3 via aws_s3.query_export_to_s3.

    Args:
        args: Parsed CLI arguments. If args.redacted is truthy, defendant
            tables are exported from the "redacted" schema under a
            "<table>_redacted.csv" object name instead.
        bucket: Destination S3 bucket name (previously hard-coded).
        region: AWS region of the destination bucket (previously hard-coded).
    """
    case_models = get_case_model_list(models)
    with db_session() as db:
        for model in case_models:
            # Defendant tables have a redacted counterpart in the
            # "redacted" schema; use it when requested.
            if args.redacted and 'defendants' in model.__tablename__:
                table_name = f'redacted.{model.__tablename__}'
                export_name = f'{model.__tablename__}_redacted.csv'
            else:
                table_name = model.__tablename__
                export_name = f'{model.__tablename__}.csv'
            logger.info(f'Exporting {table_name} to S3')
            # NOTE(review): identifiers are interpolated directly into SQL;
            # safe only while they come from trusted model metadata and the
            # bucket/region arguments are operator-supplied, not user input.
            db.execute(f"""
                SELECT * FROM aws_s3.query_export_to_s3('
                    SELECT * FROM {table_name}',
                    aws_commons.create_s3_uri(
                        '{bucket}',
                        '{export_name}',
                        '{region}'
                    ),
                    OPTIONS :='FORMAT CSV, HEADER'
                )
            """)
def load_queue(detail_loc=None):
    """Queue cases that have multiple scrape versions for re-parsing.

    Selects cases with at least two scrape versions that are not
    parse-exempt and have been parsed before, then sends one message per
    (case_number, detail_loc) pair to the parser queue.

    Args:
        detail_loc: If given, restrict to this detail_loc; otherwise every
            detail_loc with a registered parser is included.
    """
    from sqlalchemy import bindparam  # local import; file already uses sqlalchemy

    # Either the single requested detail_loc, or all locs we can parse.
    locs = [detail_loc] if detail_loc else list(parsers)
    # One query for both cases. The expanding bound parameter replaces the
    # previous f-string interpolation of values into the SQL text, which
    # duplicated the whole query and was injection-prone.
    query = text("""
        SELECT DISTINCT case_number, detail_loc
        FROM scrape_versions
        JOIN cases USING (case_number)
        WHERE detail_loc IN :locs
            AND parse_exempt = FALSE
            AND last_parse IS NOT NULL
        GROUP BY case_number, detail_loc
        HAVING COUNT(*) >= 2
    """).bindparams(bindparam('locs', expanding=True))
    logger.debug('Querying for cases')
    with db_session() as db:
        results = db.execute(query, {'locs': locs})
        logger.debug('Query complete')
        messages = [
            json.dumps({
                'Records': [{
                    'manual': {
                        'case_number': case_number,
                        'detail_loc': detail_loc
                    }
                }]
            })
            for case_number, detail_loc in results
        ]
    send_to_queue(config.parser_queue, messages)
def scraper_prompt(exception, case_number):
    """Interactively ask the operator how to handle a scraping error.

    Loops until a valid answer is entered. Returns 'continue', 'delete',
    or 'store' to direct the caller, and re-raises the given exception
    when the operator chooses to stop.
    """
    print(exception)
    while True:
        print('Continue scraping?')
        print('\t\t(y)es - ignore error and continue scraping (default)')
        print('\t\t(n)o - stop scraping and raise exception')
        print('\t\t(d)elete - ignore error and delete item from queue')
        print('\t\t(m)ark - mark case as unscrapable and continue')
        print('\t\t(e)xempt - save scrape but exempt from parsing')
        print('\t\t(s)ave - save scrape and continue')
        answer = input('Answer: (Y/n/d/m/e/s) ')
        if not answer or answer in ('y', 'Y'):
            return 'continue'
        if answer == 'n':
            raise exception
        if answer == 'd':
            return 'delete'
        if answer == 'm':
            # Flag the case so the scraper skips it from now on.
            with db_session() as db:
                db.execute(
                    Case.__table__.update()
                        .where(Case.case_number == case_number)
                        .values(scrape_exempt=True)
                )
            return 'delete'
        if answer == 'e':
            # Keep this scrape but exclude the case from parsing.
            with db_session() as db:
                db.execute(
                    Case.__table__.update()
                        .where(Case.case_number == case_number)
                        .values(parse_exempt=True)
                )
            return 'store'
        if answer == 's':
            return 'store'
        print('Invalid answer')
def parser_prompt(exception, case_number):
    """Interactively ask the operator how to handle a parsing error.

    NotImplementedError (no parser available for this case type) is
    auto-deleted without prompting. Otherwise the operator chooses to
    continue, stop (re-raise the exception), or delete the latest scrape.

    Returns:
        'continue' or 'delete'. Raises the given exception on answer 'n'.
    """
    # isinstance instead of `type(x) ==` so subclasses of
    # NotImplementedError get the same automatic handling.
    if isinstance(exception, NotImplementedError):
        return 'delete'
    print(exception)
    while True:
        print('Continue parsing?')
        print('\t\t(y)es - ignore error and continue parsing (default)')
        print('\t\t(n)o - stop parsing and raise exception')
        print('\t\t(d)elete - delete scrape and remove from queue')
        answer = input('Answer: (Y/n/d) ')
        if answer == 'y' or answer == 'Y' or not answer:
            return 'continue'
        elif answer == 'n':
            raise exception
        elif answer == 'd':
            with db_session() as db:
                delete_latest_scrape(db, case_number)
            return 'delete'
        else:
            print('Invalid answer')