def parse_file(filename, file_id):
    """Stream the ``case-file`` elements of an XML file into the database.

    ``filename`` is joined onto ``WORK_DIR`` unless it already contains it.
    Each ``case-file`` element is looked up by serial number through
    ``dbc.serial_get``:

    * unknown serials are handed straight to ``parse_case``;
    * known serials are removed from every ``trademark_app_*`` table and
      parsed again when this file's date is newer than the stored file's
      date, when the stored row's ``status`` is ``False``, or when this
      file's date is equal or newer and both ``args.parseall`` and
      ``args.force`` are set;
    * otherwise the element is skipped.

    A file's "date" is the digits of its base name read as one integer
    (presumably a date stamp embedded in the name -- confirm against the
    file naming scheme).

    After the last element the file is marked ``'finished'`` through
    ``dbc.file_update_status`` and deleted from disk.

    Args:
        filename: Name or path of the XML file to parse.
        file_id: Identifier forwarded to the ``Db`` calls and ``parse_case``.

    Raises:
        ValueError: If a serial number is not numeric, or a file name that
            has to be compared contains no digits.
    """
    dbc = Db()
    if WORK_DIR not in filename:
        filename = os.path.join(WORK_DIR, filename)

    tables = (
        'trademark_app_case_files',
        'trademark_app_case_file_event_statements',
        'trademark_app_case_file_headers',
        'trademark_app_case_file_owners',
        'trademark_app_case_file_statements',
        'trademark_app_classifications',
        'trademark_app_correspondents',
        'trademark_app_design_searches',
        'trademark_app_foreign_applications',
        'trademark_app_international_registration',
        'trademark_app_madrid_history_events',
        'trademark_app_madrid_international_filing_record',
        'trademark_app_prior_registration_applications',
        'trademark_app_us_codes',
    )

    def file_date(path):
        # Only the base name is inspected: ``filename`` has been joined with
        # WORK_DIR, and digits in the directory part would otherwise be
        # glued in front of the date and corrupt the comparison.
        return int(re.sub(r"\D", "", os.path.basename(path)))

    def release(element):
        # Called after the element has been fully handled. clear() drops its
        # content; deleting the preceding siblings detaches the emptied,
        # already-processed elements from the parent so memory use does not
        # grow with the size of the file.
        element.clear()
        while element.getprevious() is not None:
            del element.getparent()[0]

    file_start_time = time.time()
    with open(filename, 'rb') as inputfile:
        logger.info('Parsing file %s', filename)
        context = etree.iterparse(inputfile, events=('end',), tag='case-file')
        for _event, case in context:
            serial_text = get_text_or_none(case, 'serial-number/text()')
            if not serial_text:
                # A record without a serial number cannot be keyed in the
                # database; skip it rather than abort the whole file.
                logger.warning('Skipping case-file without a serial number in %s', filename)
                release(case)
                continue
            doc_id = int(serial_text)
            serial_db = dbc.serial_get(doc_id, file_id)
            if serial_db is None:
                logger.info('Processing new serial number %s', doc_id)
                parse_case(case, doc_id, file_id)
            else:
                new_file_date = file_date(filename)
                db_file_date = file_date(serial_db['filename'])
                if (new_file_date > db_file_date
                        or serial_db['status'] is False
                        or (new_file_date >= db_file_date and args.parseall and args.force)):
                    # Remove the stale rows before re-inserting the serial.
                    for table in tables:
                        dbc.delete_serial(doc_id, table)
                    logger.info('Processing existing serial number %s', doc_id)
                    parse_case(case, doc_id, file_id)
            release(case)
    dbc.file_update_status(file_id, 'finished')
    os.remove(filename)
    logger.info('Finished parsing file %s in [%s sec]', filename, time.time() - file_start_time)
示例#2
0
def _parse_yyyymmdd(value):
    """Return *value* parsed as a ``%Y%m%d`` date, or ``None`` if it is None or empty."""
    if value is None or value == '':
        return None
    return datetime.strptime(value, '%Y%m%d').date()


def parse_file(filename, file_id):
    """Stream the ``case-file`` elements of an XML file into the database.

    ``filename`` is joined onto ``WORK_DIR`` unless it already contains it.
    Each ``case-file`` element is looked up by serial number through
    ``dbc.serial_get``:

    * unknown serials are handed straight to ``parse_case``;
    * known serials are removed from every ``trademark_app_*`` table and
      parsed again when the element's ``transaction-date`` is newer than the
      stored ``transaction_date``, or when the stored row's ``status`` is
      ``False`` and ``args.force`` is set;
    * otherwise the element is skipped.

    Missing dates are logged and handled explicitly: a missing stored date
    counts as older than any date found in the XML, and a missing XML date
    never counts as newer.

    After the last element the file is marked ``'finished'`` through
    ``dbc.file_update_status`` and deleted from disk.

    Args:
        filename: Name or path of the XML file to parse.
        file_id: Identifier forwarded to the ``Db`` calls and ``parse_case``.

    Raises:
        ValueError: If a serial number is not numeric or a transaction date
            does not match ``%Y%m%d``.
    """
    dbc = Db()
    if WORK_DIR not in filename:
        filename = os.path.join(WORK_DIR, filename)
    short_name = os.path.basename(filename)

    tables = (
        'trademark_app_case_files',
        'trademark_app_case_file_event_statements',
        'trademark_app_case_file_headers',
        'trademark_app_case_file_owners',
        'trademark_app_case_file_statements',
        'trademark_app_classifications',
        'trademark_app_correspondents',
        'trademark_app_design_searches',
        'trademark_app_foreign_applications',
        'trademark_app_international_registration',
        'trademark_app_madrid_history_events',
        'trademark_app_madrid_international_filing_record',
        'trademark_app_prior_registration_applications',
        'trademark_app_us_codes',
    )

    def release(element):
        # Called after the element has been fully handled. clear() drops its
        # content; deleting the preceding siblings detaches the emptied,
        # already-processed elements from the parent so memory use does not
        # grow with the size of the file.
        element.clear()
        while element.getprevious() is not None:
            del element.getparent()[0]

    file_start_time = time.time()
    with open(filename, 'rb') as inputfile:
        logger.info('Parsing file %s', filename)
        context = etree.iterparse(inputfile, events=('end', ), tag='case-file')
        for _event, case in context:
            serial_text = get_text_or_none(case, 'serial-number/text()')
            if not serial_text:
                # A record without a serial number cannot be keyed in the
                # database; skip it rather than abort the whole file.
                logger.warning('[%s] Skipping case-file without a serial number', short_name)
                release(case)
                continue
            doc_id = int(serial_text)
            serial_db = dbc.serial_get(doc_id, file_id)
            if serial_db is None:
                logger.info('[%s] Processing new serial number %s', short_name, doc_id)
                parse_case(case, doc_id, file_id, dbc)
            else:
                transaction_date = _parse_yyyymmdd(
                    get_text_or_none(case, 'transaction-date/text()'))
                if transaction_date is None:
                    logger.warning('Missing transaction date in XML')
                db_transaction_date = _parse_yyyymmdd(serial_db['transaction_date'])
                if db_transaction_date is None:
                    logger.warning('Missing transaction date in database')

                # Ordering a date against None raises TypeError on Python 3,
                # so the missing-date cases are spelled out explicitly.
                is_newer = transaction_date is not None and (
                    db_transaction_date is None
                    or transaction_date > db_transaction_date)

                # NOTE(review): an earlier form of this condition also had a
                # clause "newer and args.parseall and args.force"; it repeated
                # the newer-than test and so could never change the outcome.
                # If "same date + parseall + force" was meant to trigger a
                # re-parse, that needs a >= comparison -- confirm intent.
                if is_newer or (serial_db['status'] is False and args.force):
                    for table in tables:
                        dbc.delete_serial(doc_id, table)
                    # One commit for the whole set of deletes, so the serial
                    # is never left removed from only some of the tables.
                    dbc.cnx.commit()
                    logger.info('[%s] Deleted serial %s from all tables', short_name, doc_id)
                    logger.info('[%s] Processing existing serial number %s', short_name, doc_id)
                    parse_case(case, doc_id, file_id, dbc)
            release(case)
    dbc.file_update_status(file_id, 'finished')
    os.remove(filename)
    logger.info('[%s] Finished parsing file in [%s sec]',
                short_name,
                time.time() - file_start_time)