def annotate_page(scan, size, articles):
    scan_px = scan.load()
    log.debug("Image size: {} x {}".format(scan.size[0], scan.size[1]))
    log.debug("Page size: {} x {}".format(int(size[0]), int(size[1])))
    # the image size is not the same as the page size,
    # so we must calculate how different it is.
    fx = size[0] / scan.size[0]
    fy = size[1] / scan.size[1]
    log.debug("Scale factors: " + str((fx, fy)))
    log.info("Annotating scan...")
    # indicate elements
    log.debug("Drawing the following elements:")
    log.indent()
    # color all articles
    RED = (255, 0, 0)
    for article in articles:
        # convert page coordinates to pixel coordinates
        left = math.floor(article['start'][0] / fx)
        right = math.ceil(article['end'][0] / fx)
        top = math.floor(article['start'][1] / fy)
        bottom = math.ceil(article['end'][1] / fy)
        log.debug("Rectangle[{:>4} + {:>4}, {:>4} + {:>4}]".format(
            left, right - left, top, bottom - top,
        ))
        # margins are marked in solid red
        # top and bottom
        for i in range(left, right):
            scan_px[i, top] = RED
            scan_px[i, bottom - 1] = RED
        # left and right
        for i in range(top, bottom):
            scan_px[left, i] = RED
            scan_px[right - 1, i] = RED
        # tint the inside of the block (excluding the red margins)
        # yellow by raising the red and green channels
        for i in range(left + 1, right - 1):
            for j in range(top + 1, bottom - 1):
                scan_px[i, j] = (min(255, scan_px[i, j][0] + 40),
                                 min(255, scan_px[i, j][1] + 40),
                                 scan_px[i, j][2])
    log.dedent()
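# For reference, annotate_page() only reads each article's 'start' and 'end'
# page-coordinate pairs, so the articles.json it consumes can be as minimal
# as this (hypothetical values; the real files may carry more fields):
#
#   [
#       {"start": [120.0, 340.0], "end": [980.0, 2150.0]},
#       {"start": [990.0, 340.0], "end": [1850.0, 1200.0]}
#   ]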
def handle_page(root):
    # log.info("Parsing ALTO file...")
    # in case the server gives an HTTP 200 response with invalid data,
    # try to detect it at parse time.
    try:
        alto = parsefile('ocr.xml')
    except SAXParseException:
        log.error("Invalid XML found. Maybe site is down?")
        return
    if alto is None:
        log.warn("Non-conforming ALTO file found. Skipping.")
        return
    # show font info for page
    log.debug("Fonts:")
    log.indent()
    for name, font in alto.fonts.items():
        log.debug("{:<6}: {}".format(name, font))
    log.dedent()
    # log.info("Finding articles...")
    find_articles(alto)
    if '--annotate' in sys.argv and 'scan.jp2' in os.listdir():
        log.info("Opening image file...")
        scan = Image.open('scan.jp2').convert('RGB')
        # annotate image with articles
        with open('articles.json') as articles:
            annotate_page(scan, alto.page.size, json.load(articles))
        # save and finish
        scan.save('annotated_scan.jpg')
        scan.close()
    # insert into database
    try:
        articleToDB.putArticlesInDatabase(commitIn=1, root=root)
    except Exception as e:
        # if database insertion fails, quit completely.
        # have the user restart the process where it failed,
        # using the `--start` option.
        log.error("Database error: {}".format(e))
        for file in os.listdir():
            os.remove(file)
        raise
def wrap(*args, **kwargs):
    log.message(start)
    log.indent()
    return_value = None
    try:
        return_value = func(*args, **kwargs)
    except Exception:
        log.exception()
    if check and not return_value:
        log.error("Failure")
    log.dedent()
    return return_value
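# wrap() above is a closure: `start`, `check`, and `func` are free variables
# supplied by an enclosing decorator factory. A minimal sketch of what that
# factory could look like (the name `step` and the default for `check` are
# assumptions, not the project's actual API):

import functools

def step(start, check=False):
    """Log `start`, run the wrapped function indented, and flag falsy results."""
    def decorator(func):
        @functools.wraps(func)
        def wrap(*args, **kwargs):
            log.message(start)
            log.indent()
            return_value = None
            try:
                return_value = func(*args, **kwargs)
            except Exception:
                log.exception()
            if check and not return_value:
                log.error("Failure")
            log.dedent()
            return return_value
        return wrap
    return decorator

# Usage would then look like:
#
#   @step("Finding articles...", check=True)
#   def find_articles(alto):
#       ...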
def function_wrapper(*args, **kwargs):
    # Get function arguments and their values
    farglist = __get_function_parameters(function, *args, **kwargs)
    # Print the header: what function was called
    # and what argument values were passed
    fargdict = dict(farglist)
    if 'self' in fargdict:
        call_str = 'CALL %s.%s.%s (' % (function.__module__,
                                        fargdict['self'].__class__.__name__,
                                        function.__name__)
    elif 'cls' in fargdict:
        call_str = 'CALL %s.%s.%s (' % (function.__module__,
                                        fargdict['cls'].__name__,
                                        function.__name__)
    else:
        call_str = 'CALL %s.%s (' % (function.__module__, function.__name__)
    if not farglist:
        log.info('%s)' % call_str)
    elif len(farglist) == 1:
        log.info('%s%s = %s)' % (call_str, farglist[0][0],
                                 __format_value(farglist[0][1])))
    else:
        indent = ' ' * len(call_str)
        # First parameter
        log.info('%s%s = %s' % (call_str, farglist[0][0],
                                __format_value(farglist[0][1])))
        # Middle parameters
        for name, value in farglist[1:-1]:
            log.info('%s%s = %s' % (indent, name, __format_value(value)))
        # Last parameter
        log.info('%s%s = %s)' % (indent, farglist[-1][0],
                                 __format_value(farglist[-1][1])))
    log.indent()
    # Invoke the function
    result = function(*args, **kwargs)
    # Print the footer and the return value (if present)
    if __function_has_return_statement(function):
        log.info('END %s -> %s' % (function.__name__, __format_value(result)))
    else:
        log.info('END %s' % function.__name__)
    log.unindent()
    # Return the function's result
    return result
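# To illustrate the call format: a traced call like geometry.area(width=3,
# height=4) (hypothetical module, function, and values) would produce log
# lines roughly like:
#
#   CALL geometry.area (width = 3
#                       height = 4)
#   END area -> 12
#
# with anything the wrapped function logs itself appearing one indent level
# deeper, between the CALL and END lines.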
def fetch_batch(batch, start_issue):
    metadata = {}
    batch_url = CA_DOMAIN + "/batches/" + batch + ".json"
    # fetch individual batch json
    batch = poll_fetch(batch_url)
    if batch is None:
        log.warn("Could not fetch batch. Skipping.")
        return
    batch_json = batch.json()
    log.info('Issues: ' + str(len(batch_json['issues'])))
    log.info('Pages: ' + str(batch_json['page_count']))
    log.indent()
    for issue in batch_json['issues']:
        metadata['newspaper'] = issue['title']['name']
        metadata['publication_date'] = issue['date_issued']
        # skip issues before the requested starting point
        if (batch_json['issues'].index(issue) + 1) < start_issue:
            continue
        # unit testing
        #if (batch_json['issues'].index(issue)+1) > start_issue:
        #    return
        log.info('Issue {}/{}: "{}", {}'.format(
            batch_json['issues'].index(issue) + 1,
            len(batch_json['issues']),
            metadata['newspaper'],
            metadata['publication_date']))
        # fetch title json for publisher and location data
        title = poll_fetch(issue['title']['url'])
        if title is None:
            log.warn("Could not fetch title information. Skipping.")
            continue
        title_json = title.json()
        metadata['publisher'] = title_json['publisher']
        place = title_json['place'][0].split('--')
        metadata['state'] = place[0]
        metadata['city'] = place[-1]
        # fetch json for newspaper issue
        issue = poll_fetch(issue['url'])
        if issue is None:
            log.warn("Could not fetch issue JSON. Skipping.")
            continue
        issue_json = issue.json()
        # sort pages by sequence number, just in case
        issue_json['pages'].sort(key=lambda page: page['sequence'])
        log.info('Saving {} pages at ./{}'.format(
            len(issue_json['pages']),
            os.path.splitext(urlparse(issue_json['url']).path[1:])[0]))
        log.indent()
        for page in issue_json['pages']:
            log.info("Processing page {} ...".format(page['sequence']))
            # fetch json for single page
            page = poll_fetch(page['url'])
            if page is None:
                log.warn("Could not fetch page JSON. Skipping.")
                continue
            page_json = page.json()
            metadata['source_link'] = page_json['pdf']
            log.debug("Metadata: {")
            log.indent()
            for key in metadata:
                log.debug("'{}': '{}'".format(key, metadata[key]))
            log.dedent()
            log.debug("}")
            # parse the ocr url and keep only its path component
            path = os.path.dirname(urlparse(page_json['ocr']).path[1:])
            root = os.getcwd()
            os.makedirs(path, exist_ok=True)
            os.chdir(path)
            # save metadata.json
            with open('metadata.json', 'w') as file:
                file.write(json.dumps(metadata, sort_keys=True, indent=4))
            # save ocr.xml; fetch before opening the file so a failed
            # fetch never leaves a half-written ocr.xml behind
            try:
                ocr = poll_fetch(page_json['ocr'])
                if ocr is None:
                    log.warn("Could not fetch ALTO file. Skipping.")
                    for leftover in os.listdir():
                        os.remove(leftover)
                    os.chdir(root)
                    os.removedirs(path)
                    continue
                with open('ocr.xml', 'w', encoding='utf-8') as file:
                    file.write(ocr.text)
                log.debug("OCR file: ./" + path + "/ocr.xml")
            except Exception as e:
                # skip to the next page
                log.warn("OCR file not found: {}".format(e))
                if os.path.exists('ocr.xml'):
                    os.remove('ocr.xml')
                os.remove('metadata.json')
                os.chdir(root)
                os.removedirs(path)
                continue
            if '--annotate' in sys.argv:
                try:
                    # save scan.jp2
                    data = requests.get(page_json['jp2'], stream=True)
                    if data.status_code == 200:
                        with open('scan.jp2', 'wb') as file:
                            for chunk in data.iter_content(chunk_size=8192):
                                file.write(chunk)
                        log.debug("Image scan: ./" + path + "/scan.jp2")
                    else:
                        log.warn("Failed to get image scan. Skipping.")
                except Exception as e:
                    log.error("Could not get image: {}".format(e))
                    log.warn("Skipping annotation stage for this page.")
            handle_page(root)
            # log.info("Cleaning up...")
            for file in os.listdir():
                os.remove(file)
            os.chdir(root)
            os.removedirs(path)
            log.info()
        log.dedent()
    log.dedent()
    log.info("Done fetching this batch, onto next")
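# Every network call above goes through poll_fetch(), which is expected to
# return a requests.Response on success and None when the resource cannot be
# retrieved. A minimal sketch, assuming a simple retry-with-delay policy
# (the real helper's retry counts and timing may differ):

import time
import requests

def poll_fetch(url, retries=3, delay=5):
    """Fetch `url`, retrying on failure; return the response or None."""
    for attempt in range(retries):
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response
        except requests.RequestException:
            pass
        time.sleep(delay)
    return None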
batch_page_url = CA_DOMAIN + "/batches/" + str(start_page) + ".json"
log.info("Fetching batch list")
# the only code in the whole script that will stop on a network
# error. ensures that people double-check their connection.
try:
    batches_page_json = requests.get(batch_page_url).json()
except Exception:
    log.warn("Can't fetch batch list right now. Perhaps site is down?")
    log.warn("Please try again later.")
    sys.exit(1)
try:
    while True:
        log.info("Batch page: {}".format(start_page))
        log.indent()
        for batch_info in batches_page_json['batches']:
            log.info('Batch: ' + batch_info['name'])
            if curr_batch < start_batch:
                # skip batches before the requested starting point
                pass
            # unit testing
            #elif curr_batch > start_batch:
            #    sys.exit(0)
            else:
                fetch_batch(batch_info['name'], start_issue)
            curr_batch = curr_batch + 1
        log.dedent()
        try:
            batches_page_json = poll_fetch(
                batches_page_json['next']).json()