def parse_file(filename, file_id):
    """Parse one XML file and load every ``case-file`` record it contains.

    The file is streamed with ``etree.iterparse`` so only one ``case-file``
    element needs to be held in memory at a time.  For each record:

    * if ``dbc.serial_get`` returns nothing for the serial number, the
      record is parsed as a new one;
    * otherwise the record is re-parsed (after deleting its rows from every
      per-serial table) only when the current file is newer than the file
      recorded in the database row, when the stored ``status`` is ``False``,
      or when both ``args.parseall`` and ``args.force`` are set and the
      current file is not older than the recorded one.

    When the whole file has been processed its status is set to
    ``'finished'`` and the file is deleted from disk.

    Args:
        filename: Name or path of the XML file.  It is joined onto
            ``WORK_DIR`` when ``WORK_DIR`` does not occur in it.
        file_id: Identifier of the file, passed through to the database
            layer and to ``parse_case``.
    """
    # Tables that hold rows keyed by serial number; all of them are purged
    # before an existing serial is parsed again.
    serial_tables = (
        'trademark_app_case_files',
        'trademark_app_case_file_event_statements',
        'trademark_app_case_file_headers',
        'trademark_app_case_file_owners',
        'trademark_app_case_file_statements',
        'trademark_app_classifications',
        'trademark_app_correspondents',
        'trademark_app_design_searches',
        'trademark_app_foreign_applications',
        'trademark_app_international_registration',
        'trademark_app_madrid_history_events',
        'trademark_app_madrid_international_filing_record',
        'trademark_app_prior_registration_applications',
        'trademark_app_us_codes',
    )

    def file_date(name):
        # Use only the file's own name: digits that happen to appear in a
        # directory component (e.g. inside WORK_DIR) must not end up in the
        # number that is compared.
        return int(re.sub(r"\D", "", os.path.basename(name)))

    def release(element):
        # clear() empties the element itself, but the parent still keeps a
        # reference to every already-processed sibling; drop those as well so
        # memory use does not grow with the size of the file.
        element.clear()
        while element.getprevious() is not None:
            del element.getparent()[0]

    dbc = Db()
    if WORK_DIR not in filename:
        filename = os.path.join(WORK_DIR, filename)

    with open(filename, 'rb') as inputfile:
        file_start_time = time.time()
        logger.info('Parsing file %s', filename)
        context = etree.iterparse(inputfile, events=('end',), tag='case-file')
        for _event, case in context:
            serial_text = get_text_or_none(case, 'serial-number/text()')
            if serial_text is None:
                # NOTE(review): assumes get_text_or_none returns None when the
                # node is absent; such a record cannot be keyed, so skip it
                # instead of failing the whole file on int(None).
                logger.warning(
                    'Skipping case-file without serial number in %s',
                    filename)
                release(case)
                continue

            doc_id = int(serial_text)
            serial_db = dbc.serial_get(doc_id, file_id)
            if serial_db is None:
                logger.info('Processing new serial number %s', doc_id)
                parse_case(case, doc_id, file_id)
            else:
                new_file_date = file_date(filename)
                db_file_date = file_date(serial_db['filename'])
                if new_file_date > db_file_date \
                        or serial_db['status'] is False \
                        or (new_file_date >= db_file_date
                            and args.parseall and args.force):
                    for table in serial_tables:
                        dbc.delete_serial(doc_id, table)
                    logger.info('Processing existing serial number %s',
                                doc_id)
                    parse_case(case, doc_id, file_id)
            release(case)

    # The file handle is closed at this point, so removing the file also
    # works on platforms that refuse to delete open files.
    dbc.file_update_status(file_id, 'finished')
    os.remove(filename)
    logger.info('Finished parsing file %s in [%s sec]',
                filename, time.time() - file_start_time)
def parse_file(filename, file_id):
    """Parse one XML file and load every ``case-file`` record it contains.

    The file is streamed with ``etree.iterparse`` so only one ``case-file``
    element needs to be held in memory at a time.  For each record:

    * if ``dbc.serial_get`` returns nothing for the serial number, the
      record is parsed as a new one;
    * otherwise the record's ``transaction-date`` is compared with the
      ``transaction_date`` stored in the database row.  The record is
      re-parsed (after its rows are deleted from every per-serial table and
      the deletion is committed) when the incoming date is newer, or when
      the stored ``status`` is ``False`` and ``args.force`` is set.

    A missing date is logged and treated as older than any real date: a
    record without a date never counts as newer, and a database row without
    a date is always superseded by a record that has one.

    When the whole file has been processed its status is set to
    ``'finished'`` and the file is deleted from disk.

    Args:
        filename: Name or path of the XML file.  It is joined onto
            ``WORK_DIR`` when ``WORK_DIR`` does not occur in it.
        file_id: Identifier of the file, passed through to the database
            layer and to ``parse_case``.
    """
    # Tables that hold rows keyed by serial number; all of them are purged
    # before an existing serial is parsed again.
    serial_tables = (
        'trademark_app_case_files',
        'trademark_app_case_file_event_statements',
        'trademark_app_case_file_headers',
        'trademark_app_case_file_owners',
        'trademark_app_case_file_statements',
        'trademark_app_classifications',
        'trademark_app_correspondents',
        'trademark_app_design_searches',
        'trademark_app_foreign_applications',
        'trademark_app_international_registration',
        'trademark_app_madrid_history_events',
        'trademark_app_madrid_international_filing_record',
        'trademark_app_prior_registration_applications',
        'trademark_app_us_codes',
    )

    def release(element):
        # clear() empties the element itself, but the parent still keeps a
        # reference to every already-processed sibling; drop those as well so
        # memory use does not grow with the size of the file.
        element.clear()
        while element.getprevious() is not None:
            del element.getparent()[0]

    dbc = Db()
    if WORK_DIR not in filename:
        filename = os.path.join(WORK_DIR, filename)
    # Short name used as the prefix of the per-record log messages.
    base_name = os.path.basename(filename)

    with open(filename, 'rb') as inputfile:
        file_start_time = time.time()
        logger.info('Parsing file %s', filename)
        context = etree.iterparse(inputfile, events=('end',), tag='case-file')
        for _event, case in context:
            serial_text = get_text_or_none(case, 'serial-number/text()')
            if serial_text is None:
                # NOTE(review): assumes get_text_or_none returns None when the
                # node is absent; such a record cannot be keyed, so skip it
                # instead of failing the whole file on int(None).
                logger.warning(
                    '[%s] Skipping case-file without serial number',
                    base_name)
                release(case)
                continue

            doc_id = int(serial_text)
            serial_db = dbc.serial_get(doc_id, file_id)
            if serial_db is None:
                logger.info('[%s] Processing new serial number %s',
                            base_name, doc_id)
                parse_case(case, doc_id, file_id, dbc)
                release(case)
                continue

            transaction_date_string = get_text_or_none(
                case, 'transaction-date/text()')
            if transaction_date_string:
                transaction_date = datetime.strptime(
                    transaction_date_string, '%Y%m%d').date()
            else:
                logger.warning('Missing transaction date in XML')
                transaction_date = None

            db_date_string = serial_db['transaction_date']
            if db_date_string is not None and db_date_string != '':
                db_transaction_date = datetime.strptime(
                    db_date_string, '%Y%m%d').date()
            else:
                logger.warning('Missing transaction date in database')
                db_transaction_date = None

            # Comparing a date with None raises TypeError on Python 3, so the
            # missing-date cases are decided explicitly: None counts as older
            # than any real date.
            is_newer = transaction_date is not None and (
                db_transaction_date is None
                or transaction_date > db_transaction_date)

            # NOTE(review): the original condition had a third clause,
            # "newer and args.parseall and args.force", which can never be
            # true unless the first clause already is.  If a forced re-parse
            # of records with an *equal* date was intended, that clause needs
            # ">=" - confirm with the author.
            if is_newer or (serial_db['status'] is False and args.force):
                for table in serial_tables:
                    dbc.delete_serial(doc_id, table)
                dbc.cnx.commit()
                logger.info('[%s] Deleted serial %s from all tables',
                            base_name, doc_id)
                logger.info('[%s] Processing existing serial number %s',
                            base_name, doc_id)
                parse_case(case, doc_id, file_id, dbc)
            release(case)

    # The file handle is closed at this point, so removing the file also
    # works on platforms that refuse to delete open files.
    dbc.file_update_status(file_id, 'finished')
    os.remove(filename)
    logger.info('[%s] Finished parsing file in [%s sec]',
                base_name, time.time() - file_start_time)