예제 #1
0
def check_case(case_number, charge_cls, parser_cls, receipt_handle):
    """Rebuild the charges of one case and remove it from the parser queue.

    The stored HTML for ``case_number`` is loaded and handed to
    ``parser_cls``. Existing ``charge_cls`` rows for the case are deleted and
    the charges are re-parsed in a single database session. If that raises
    ``IntegrityError``, a full ``parser.parse()`` is attempted instead as a
    best effort. The queue message is deleted afterwards in either case.

    Args:
        case_number: Case identifier; also the object key of the stored HTML.
        charge_cls: Mapped charge class whose rows for this case are replaced.
        parser_cls: Parser class, called as ``parser_cls(case_number, html)``.
        receipt_handle: Receipt handle of the queue message to delete.

    Raises:
        Any exception other than ``IntegrityError`` raised while fetching the
        HTML or re-parsing charges propagates, and the queue message is then
        left in place.
    """
    logger.info(f'Processing {case_number}')

    # Fetch HTML from S3 and set up parser
    case_details = config.case_details_bucket.Object(case_number).get()
    case_html = case_details['Body'].read().decode('utf-8')
    parser = parser_cls(case_number, case_html)

    # Delete and reparse charges
    try:
        with db_session() as db:
            db.execute(
                charge_cls.__table__.delete()
                .where(charge_cls.case_number == case_number)
            )
            parser.charge_and_disposition(db, parser.soup)
            parser.update_last_parse(db)
    except IntegrityError:
        logger.warning(f'Reparsing {case_number} due to integrity error')
        try:
            parser.parse()
        except Exception:
            # Deliberately best effort: a failed reparse must not stop the
            # message from being removed below. Catch Exception rather than
            # everything so Ctrl-C / SystemExit still interrupt the run, and
            # keep the traceback so the failure can be diagnosed.
            logger.warning(f'Failed to parse {case_number}', exc_info=True)

    logger.debug(f'Deleting {case_number} from parser queue')
    config.parser_queue.delete_messages(
        Entries=[{
            'Id': 'unused',
            'ReceiptHandle': receipt_handle
        }])
예제 #2
0
def export_tables(args):
    """Export every case model table to S3 as a CSV file with a header row.

    For each model returned by ``get_case_model_list(models)`` a
    ``aws_s3.query_export_to_s3`` statement is executed that writes
    ``SELECT *`` of the table to the ``caseharvester-exports`` bucket in
    ``us-east-1``.

    Args:
        args: Object with a ``redacted`` attribute. When it is truthy, tables
            whose name contains ``defendants`` are read from the
            ``redacted.``-prefixed table and written to
            ``<table>_redacted.csv`` instead of ``<table>.csv``.
    """
    exportable_models = get_case_model_list(models)
    with db_session() as session:
        for model in exportable_models:
            base_name = model.__tablename__
            # Only defendant tables have a redacted variant.
            if args.redacted and 'defendants' in base_name:
                source_table = f'redacted.{base_name}'
                csv_key = f'{base_name}_redacted.csv'
            else:
                source_table = base_name
                csv_key = f'{base_name}.csv'
            logger.info(f'Exporting {source_table} to S3')
            export_sql = f"""
                SELECT
                    *
                FROM
                    aws_s3.query_export_to_s3('
                        SELECT
                            *
                        FROM
                            {source_table}',
                        aws_commons.create_s3_uri(
                            'caseharvester-exports',
                            '{csv_key}',
                            'us-east-1'
                        ),
                        OPTIONS :='FORMAT CSV, HEADER'
                    )
            """
            session.execute(export_sql)
예제 #3
0
def load_queue(detail_loc=None):
    """Queue cases that have several scrape versions for parsing.

    Selects the ``(case_number, detail_loc)`` pairs from ``scrape_versions``
    joined to ``cases`` that are not parse-exempt, have a non-NULL
    ``last_parse`` and occur at least twice, then sends one JSON message per
    pair to ``config.parser_queue``.

    Args:
        detail_loc: Restrict the query to this location. When falsy, every
            key of ``parsers`` is used instead.
    """
    # One location when requested, otherwise every location with a parser.
    locs = [detail_loc] if detail_loc else list(parsers)

    # Pass the locations as bound parameters instead of splicing them into
    # the SQL text, so quoting is handled by the driver.
    params = {f'loc_{i}': loc for i, loc in enumerate(locs)}
    # "IN ()" is not valid SQL; "IN (NULL)" matches no rows, which keeps the
    # empty case a harmless no-op query.
    placeholders = ', '.join(f':{name}' for name in params) or 'NULL'
    query_text = f"""
        SELECT
            distinct case_number,
            detail_loc
        FROM
            scrape_versions
            JOIN
                cases USING (case_number)
        WHERE
            detail_loc IN ({placeholders})
            AND parse_exempt = FALSE
            AND last_parse IS NOT NULL
        GROUP BY
            case_number,
            detail_loc
        HAVING
            COUNT(*) >= 2
    """

    logger.debug('Querying for cases')
    with db_session() as db:
        # Materialize the rows while the session is still open; the result
        # must not be read after the session has been released.
        rows = db.execute(text(query_text), params).fetchall()
    logger.debug('Query complete')

    messages = [
        json.dumps({
            'Records': [{
                'manual': {
                    'case_number': case_number,
                    'detail_loc': case_loc
                }
            }]
        }) for case_number, case_loc in rows
    ]
    send_to_queue(config.parser_queue, messages)
예제 #4
0
def scraper_prompt(exception, case_number):
    """Ask the operator how to proceed after a scraping error.

    Prints ``exception`` once, then repeats the menu until a valid answer is
    entered.

    Args:
        exception: The error that interrupted scraping; re-raised on ``n``.
        case_number: Case whose ``Case`` row is updated for ``m`` and ``e``.

    Returns:
        ``'continue'`` for ``y``/``Y``/empty input, ``'delete'`` for ``d`` and
        ``m``, ``'store'`` for ``e`` and ``s``.

    Raises:
        The given ``exception`` when the answer is ``n``.
    """
    menu = (
        'Continue scraping?',
        '\t\t(y)es    - ignore error and continue scraping (default)',
        '\t\t(n)o     - stop scraping and raise exception',
        '\t\t(d)elete - ignore error and delete item from queue',
        '\t\t(m)ark   - mark case as unscrapable and continue',
        '\t\t(e)xempt - save scrape but exempt from parsing',
        '\t\t(s)ave   - save scrape and continue',
    )
    # Answers that set a flag on the case before returning:
    # answer -> (column set to True, action returned to the caller)
    flagging_answers = {
        'm': ('scrape_exempt', 'delete'),
        'e': ('parse_exempt', 'store'),
    }

    print(exception)
    while True:
        for line in menu:
            print(line)
        answer = input('Answer: (Y/n/d/m/e/s) ')

        if answer in ('', 'y', 'Y'):
            return 'continue'
        if answer == 'n':
            raise exception
        if answer == 'd':
            return 'delete'
        if answer == 's':
            return 'store'
        if answer in flagging_answers:
            column, action = flagging_answers[answer]
            with db_session() as db:
                statement = Case.__table__.update()\
                    .where(Case.case_number == case_number)\
                    .values(**{column: True})
                db.execute(statement)
            return action

        print('Invalid answer')
예제 #5
0
def parser_prompt(exception, case_number):
    """Ask the operator how to proceed after a parsing error.

    A ``NotImplementedError`` (or any subclass of it) is never prompted for
    and immediately yields ``'delete'``. Otherwise ``exception`` is printed
    once and the menu repeats until a valid answer is entered.

    Args:
        exception: The error that interrupted parsing; re-raised on ``n``.
        case_number: Case passed to ``delete_latest_scrape`` on ``d``.

    Returns:
        ``'continue'`` for ``y``/``Y``/empty input, ``'delete'`` for ``d`` or
        for a ``NotImplementedError``.

    Raises:
        The given ``exception`` when the answer is ``n``.
    """
    # isinstance rather than an exact type comparison, so subclasses of
    # NotImplementedError take the same no-prompt path.
    if isinstance(exception, NotImplementedError):
        return 'delete'
    print(exception)
    while True:
        print('Continue parsing?')
        print('\t\t(y)es    - ignore error and continue parsing (default)')
        print('\t\t(n)o     - stop parsing and raise exception')
        print('\t\t(d)elete - delete scrape and remove from queue')
        answer = input('Answer: (Y/n/d) ')
        if answer == 'y' or answer == 'Y' or not answer:
            return 'continue'
        elif answer == 'n':
            raise exception
        elif answer == 'd':
            with db_session() as db:
                delete_latest_scrape(db, case_number)
            return 'delete'
        else:
            print('Invalid answer')