def enter_singlepage_document(file_path, document_slug, collection_slug, page_number, only_enter_new_pages=False):
    """Parse a single-page hOCR file and store it as one page of a document.

    Gets or creates the Document_Collection and Document rows, then enters
    the single page found in the file under the given page_number.

    Raises ValueError if the file turns out to contain more than one page.
    """
    this_collection, created = Document_Collection.objects.get_or_create(
        collection_slug=collection_slug)
    parser = document_parser(file_path, encoding='latin-1')
    this_doc, created = Document.objects.get_or_create(
        document_slug=document_slug, document_collection=this_collection)
    # todo: populate ein, form, year, month from id
    page_count = 0
    for this_page in parser:
        page_count += 1
        # Explicit raise instead of `assert` so the single-page guarantee
        # still holds when Python runs with -O (assertions stripped).
        if page_count >= 2:
            raise ValueError(
                "Expected a single-page document but found multiple pages")
        page = get_words_with_lines_from_page(this_page.getvalue())
        enter_page(this_doc, page, page_number, only_enter_new_pages)
def enter_document(file_path, document_id):
    """Parse every page of an hOCR file and enter each one into the database
    under the document identified by document_id."""
    hocr = document_parser(file_path, encoding='latin-1')
    this_doc, created = Document.objects.get_or_create(document_id=document_id)
    # todo: populate ein, form, year, month from id
    for page_number, raw_page in enumerate(hocr, 1):
        # Read the page as a hierarchy of lines and words. Only words are
        # given bounding boxes here, though the hOCR spec gives lines bounding
        # boxes too, so those could be added -- it's just not clear that would
        # help. The line is presumably the convex hull of its words, but that
        # isn't known for sure.
        page = get_words_with_lines_from_page(raw_page.getvalue())
        enter_page(this_doc, page, page_number)
def handle(self, *args, **options): for d, _, files in os.walk(SAMPLE_FILE_DIR): for i, this_file in enumerate(files): file_path = SAMPLE_FILE_DIR + this_file # ignore files that aren't .html in case any got mixed in there # may want to filter on other criteria here too if file_path.find(".html") > 0: print "Handling %s" % (file_path) parser = document_parser(file_path, encoding='latin-1') first_page = parser.next_document() page = get_words_with_lines_from_page(first_page.getvalue()) fc = get_feature_collection(page) print fc
def enter_document(file_path, document_id):
    """Read an hOCR file page by page and persist each page for document_id."""
    parser = document_parser(file_path, encoding='latin-1')
    this_doc, created = Document.objects.get_or_create(document_id=document_id)
    # todo: populate ein, form, year, month from id
    page_count = 0
    for this_page in parser:
        page_count += 1
        # Pages come back as a hierarchy of lines and words; bounding boxes
        # are attached to words only. The hOCR spec also gives lines bounding
        # boxes (presumably the convex hull of their words, though that isn't
        # certain), and those could be added if they turned out to help.
        words_and_lines = get_words_with_lines_from_page(this_page.getvalue())
        enter_page(this_doc, words_and_lines, page_count)
def handle(self, *args, **options): """ test cmd to just get a page with geosgeometries attached """ for d, _, files in os.walk(SAMPLE_FILE_DIR): for i, this_file in enumerate(files): file_path = SAMPLE_FILE_DIR + this_file # ignore files that aren't .html in case any got mixed in there # may want to filter on other criteria here too if file_path.find(".html") > 0: print "Handling %s" % (file_path) parser = document_parser(file_path, encoding='latin-1') first_page = parser.next_document() page = get_words_with_lines_from_page(first_page.getvalue()) page['words'] = get_word_shapes(page['words']) print page
""" A django-independent test that reads a document and returns geojson files for each page. """ from hocr_parser.document_parser import document_parser from hocr_parser.parse_utils import get_words_with_lines_from_page from geo_utils.geojson_utils import get_feature_collection # A test file hocr_file = "./hocr_parser/test_hocr/58-1723645_990_201204.html" # create a parser for this doc hocr_parser = document_parser(hocr_file, encoding='latin-1') for this_page in hocr_parser: # retrieve a representation of the pages that include line numbers and word numbers page = get_words_with_lines_from_page(this_page.getvalue()) print "Processing page %s -- now dumping geojson to stdout\n\n" % (page['attrib']) # Get geojson that assigns id by word order and preserves line numbers as an attribute print get_feature_collection(page) print "\n\n\n"
from lxml import etree from lxml.etree import tostring from StringIO import StringIO from hocr_parser.document_parser import document_parser from hocr_parser.parse_utils import get_words_from_page, get_words_with_lines_from_page, get_annotated_bbox flexible_parser = etree.XMLParser(encoding='utf-8', recover=True) file_name = "58-1723645_990_201204" file_path = "parser/test_hocr/" + file_name + ".html" parser = document_parser(file_path, encoding='latin-1') page_num = 0 while True: this_page = parser.read_page() if not this_page: break page_num += 1 print "Processing page %s" % page_num outfile = "../display/hocr_pages/" + file_name + "p" + str(page_num) + ".html" #outh = open(outfile, 'w') page_xml = this_page.getvalue() #page_xml = page_xml.decode('latin-1', 'ignore').encode('utf-8') tree = etree.parse(StringIO(page_xml), flexible_parser) tree.write(outfile)
import unicodedata from lxml import etree from lxml.etree import tostring from StringIO import StringIO from hocr_parser.document_parser import document_parser from hocr_parser.parse_utils import get_words_from_page, get_words_with_lines_from_page, get_annotated_bbox flexible_parser = etree.XMLParser(encoding='utf-8', recover=True) file_name = "58-1723645_990_201204" file_path = "hocr_parser/test_hocr/" + file_name + ".html" parser = document_parser(file_path, encoding='latin-1') page_num = 0 while True: this_page = parser.read_page() if not this_page: break page_num += 1 print "Processing page %s" % page_num outfile = "../display/hocr_pages/" + file_name + "p" + str( page_num) + ".html" #outh = open(outfile, 'w') page_xml = this_page.getvalue() #page_xml = page_xml.decode('latin-1', 'ignore').encode('utf-8') tree = etree.parse(StringIO(page_xml), flexible_parser) tree.write(outfile)
""" A django-independent test that reads a document and returns geojson files for each page. """ from hocr_parser.document_parser import document_parser from hocr_parser.parse_utils import get_words_with_lines_from_page from geo_utils.geojson_utils import get_feature_collection # A test file hocr_file = "hocr_parser/test_hocr/58-1723645_990_201204.html" # create a parser for this doc hocr_parser = document_parser(hocr_file, encoding='latin-1') for this_page in hocr_parser: # retrieve a representation of the pages that include line numbers and word numbers page = get_words_with_lines_from_page(this_page.getvalue()) print "Processing page %s -- now dumping geojson to stdout\n\n" % (page['attrib']) # Get geojson that assigns id by word order and preserves line numbers as an attribute print get_feature_collection(page) print "\n\n\n"