def document_page_geojson(request, doc_id, page_number):
    """Respond with the words of a single document page.

    The page is resolved with ``get_object_or_404`` from ``doc_id`` and
    ``page_number``; its ``PageWord`` rows (``text``, ``bbox`` and
    ``line_num`` values only) are wrapped in a ``{'words': ...}`` dict and
    handed to ``get_feature_collection``.  Whatever that helper returns is
    sent back as the body of an ``application/json`` ``HttpResponse``.

    ``request`` is accepted for the view signature but is not read here.
    """
    page_record = get_object_or_404(
        Page, doc__document_id=doc_id, page_number=page_number)

    word_rows = PageWord.objects.filter(page_pk=page_record.pk)
    word_rows = word_rows.values('text', 'bbox', 'line_num')

    # todo: add the page bounding box.
    geojson_body = get_feature_collection({'words': word_rows})
    return HttpResponse(geojson_body, content_type="application/json")
def document_page_geojson(request, doc_id, page_number):
    """Build the JSON response describing the words on one page.

    Steps, in order:
      1. fetch the ``Page`` matching ``doc_id`` / ``page_number`` through
         ``get_object_or_404``;
      2. select the ``text``, ``bbox`` and ``line_num`` values of the
         ``PageWord`` rows filtered on that page's primary key;
      3. pass them, under the ``'words'`` key, to
         ``get_feature_collection``;
      4. return the helper's output in an ``HttpResponse`` whose content
         type is ``application/json``.

    ``request`` is unused by the body.
    """
    page_lookup = {'doc__document_id': doc_id, 'page_number': page_number}
    matched_page = get_object_or_404(Page, **page_lookup)

    words_for_page = PageWord.objects.filter(page_pk=matched_page.pk)
    page_payload = {
        'words': words_for_page.values('text', 'bbox', 'line_num'),
    }

    # todo: add the page bounding box.
    return HttpResponse(
        get_feature_collection(page_payload),
        content_type="application/json")
def handle(self, *args, **options):
    """Print a feature collection for the first document of each sample file.

    Walks ``SAMPLE_FILE_DIR`` with ``os.walk``.  Every file whose name ends
    in ".html" is opened with ``document_parser`` (latin-1 encoding), the
    first item returned by ``next_document()`` is converted with
    ``get_words_with_lines_from_page`` and the result of
    ``get_feature_collection`` is printed to stdout.

    ``args`` and ``options`` are accepted but not used.
    """
    for dir_path, _, files in os.walk(SAMPLE_FILE_DIR):
        for this_file in files:
            # ignore files that aren't .html in case any got mixed in there
            # may want to filter on other criteria here too
            # (suffix test: a bare substring search would also accept names
            # such as "report.html.bak")
            if not this_file.endswith(".html"):
                continue

            # Join against the directory os.walk is currently visiting so
            # that files found in sub-directories resolve correctly, and so
            # the path is valid whether or not SAMPLE_FILE_DIR ends with a
            # separator.
            file_path = os.path.join(dir_path, this_file)
            print("Handling %s" % (file_path,))

            parser = document_parser(file_path, encoding='latin-1')
            first_page = parser.next_document()
            page = get_words_with_lines_from_page(first_page.getvalue())
            fc = get_feature_collection(page)
            print(fc)
def document_page_geojson(request, slug, doc_slug, page_number):
    """Return the words of one page as a serialized feature collection.

    The document is resolved from ``doc_slug`` inside the collection named
    by ``slug``; the page is then resolved from that document and
    ``page_number``.  Both lookups go through ``get_object_or_404``.  The
    page's ``PageWord`` values (``text``, ``bbox``, ``line_num``) are passed
    to ``get_feature_collection``, two extra keys are set on the result,
    and it is serialized with ``geojson.dumps`` into an
    ``application/json`` ``HttpResponse``.

    ``request`` is not read by this function.
    """
    this_document = get_object_or_404(
        Document,
        document_slug=doc_slug,
        document_collection__collection_slug=slug)

    # Resolve the page through the document found above rather than through
    # doc_slug alone, so the collection constraint also applies to the page.
    # NOTE(review): assumes Page.doc is the relation to Document (the
    # previous lookup filtered on doc__document_slug) -- confirm.
    this_page = get_object_or_404(
        Page, doc=this_document, page_number=page_number)

    this_page_words = PageWord.objects.filter(page_pk=this_page.pk).values(
        'text', 'bbox', 'line_num')
    featurecollection = get_feature_collection(this_page_words)

    ## Add additional attributes needed. This may or may not break the geojsonspec.
    # todo: add the page bounding box (both values below are placeholders).
    featurecollection['bbox'] = 'blah'
    featurecollection['background_image'] = 'blahblah'

    output = geojson.dumps(featurecollection)
    return HttpResponse(output, content_type="application/json")
def document_page_geojson(request, slug, doc_slug, page_number):
    """Serve a page's words as a ``geojson``-serialized feature collection.

    Lookups (each via ``get_object_or_404``):
      * the ``Document`` with slug ``doc_slug`` in the collection whose
        slug is ``slug``;
      * the ``Page`` of that document with number ``page_number``.

    The ``text``, ``bbox`` and ``line_num`` values of the page's
    ``PageWord`` rows are given to ``get_feature_collection``; ``bbox`` and
    ``background_image`` keys are then set on the result before it is
    dumped with ``geojson.dumps`` and returned as an ``application/json``
    ``HttpResponse``.

    ``request`` is unused by the body.
    """
    this_document = get_object_or_404(
        Document,
        document_slug=doc_slug,
        document_collection__collection_slug=slug,
    )

    # Key the page on the document resolved above; filtering on doc_slug
    # alone would ignore the collection the URL asked for.
    # NOTE(review): presumes Page.doc points at Document (the earlier
    # filter used doc__document_slug) -- verify against the models.
    this_page = get_object_or_404(
        Page,
        doc=this_document,
        page_number=page_number,
    )

    word_values = PageWord.objects.filter(page_pk=this_page.pk).values(
        'text', 'bbox', 'line_num')
    featurecollection = get_feature_collection(word_values)

    ## Add additional attributes needed. This may or may not break the geojsonspec.
    # todo: add the page bounding box (placeholder values for now).
    featurecollection['bbox'] = 'blah'
    featurecollection['background_image'] = 'blahblah'

    return HttpResponse(
        geojson.dumps(featurecollection),
        content_type="application/json")
""" A django-independent test that reads a document and returns geojson files for each page. """ from hocr_parser.document_parser import document_parser from hocr_parser.parse_utils import get_words_with_lines_from_page from geo_utils.geojson_utils import get_feature_collection # A test file hocr_file = "./hocr_parser/test_hocr/58-1723645_990_201204.html" # create a parser for this doc hocr_parser = document_parser(hocr_file, encoding='latin-1') for this_page in hocr_parser: # retrieve a representation of the pages that include line numbers and word numbers page = get_words_with_lines_from_page(this_page.getvalue()) print "Processing page %s -- now dumping geojson to stdout\n\n" % (page['attrib']) # Get geojson that assigns id by word order and preserves line numbers as an attribute print get_feature_collection(page) print "\n\n\n"
""" A django-independent test that reads a document and returns geojson files for each page. """ from hocr_parser.document_parser import document_parser from hocr_parser.parse_utils import get_words_with_lines_from_page from geo_utils.geojson_utils import get_feature_collection # A test file hocr_file = "hocr_parser/test_hocr/58-1723645_990_201204.html" # create a parser for this doc hocr_parser = document_parser(hocr_file, encoding='latin-1') for this_page in hocr_parser: # retrieve a representation of the pages that include line numbers and word numbers page = get_words_with_lines_from_page(this_page.getvalue()) print "Processing page %s -- now dumping geojson to stdout\n\n" % (page['attrib']) # Get geojson that assigns id by word order and preserves line numbers as an attribute print get_feature_collection(page) print "\n\n\n"