Exemplo n.º 1
0
def enter_page(doc, page, page_number):
    """Get or create the Page row for one parsed page, then enter its words.

    Args:
        doc: Value used for the ``doc`` field of the ``Page`` lookup.
        page: Mapping with an ``'attrib'`` entry (itself a mapping holding a
            ``'title'`` string) and a ``'words'`` entry.
        page_number: Value used for the ``page_number`` field of the lookup.

    Raises:
        IndexError: If the title contains no ``';'`` separator.
    """
    title = page['attrib']['title']

    # The bounding box is the second ';'-separated field of the title with
    # the "bbox " label removed; no whitespace stripping is done here.
    # NOTE(review): looks like an hOCR-style title
    # ("...; bbox x0 y0 x1 y1; ...") -- confirm against the parser.
    bbox_raw = title.split(';')[1].replace("bbox ", "")
    page_outline = GEOSGeometry(get_poly_string_from_bbox(bbox_raw))

    # `defaults` only applies when the row is created; an existing
    # (doc, page_number) row is returned as-is.
    this_page, _created = Page.objects.get_or_create(
        doc=doc,
        page_number=page_number,
        defaults={'page_dimensions': page_outline},
    )

    enter_words(this_page.pk, page['words'])
Exemplo n.º 2
0
def enter_page(doc, page, page_number, only_enter_new_pages=False):
    """Get or create the Page row for one parsed page and enter its words.

    Args:
        doc: Value used for the ``doc`` field of the ``Page`` lookup.
        page: Mapping with an ``'attrib'`` entry (itself a mapping holding a
            ``'title'`` string) and a ``'words'`` entry.
        page_number: Value used for the ``page_number`` field of the lookup.
        only_enter_new_pages: When true, words are entered only if this call
            created the Page row. When false (the default), words are
            entered whether or not the row already existed.

    Raises:
        IndexError: If the title contains no ``';'`` separator.
    """
    title = page['attrib']['title']

    # The bounding box is the second ';'-separated field of the title with
    # the "bbox " label removed; no whitespace stripping is done here.
    bbox_raw = title.split(';')[1].replace("bbox ", "")
    page_outline = GEOSGeometry(get_poly_string_from_bbox(bbox_raw))

    # `defaults` only applies when the row is created; an existing
    # (doc, page_number) row is returned as-is.
    this_page, created = Page.objects.get_or_create(
        doc=doc,
        page_number=page_number,
        defaults={'page_dimensions': page_outline},
    )

    # Skip the words only when the caller asked for new pages only AND this
    # page already existed. Testing `not created` here would do the
    # opposite: skip fresh pages and re-enter words for existing ones.
    if created or not only_enter_new_pages:
        enter_words(this_page.pk, page['words'])
Exemplo n.º 3
0
def enter_page_words_only(doc, page, page_number):
    """Get or create the Page row for one parsed page, entering words only.

    Args:
        doc: Value used for the ``doc`` field of the ``Page`` lookup.
        page: Mapping with an ``'attrib'`` entry (itself a mapping holding a
            ``'title'`` string) and a ``'words'`` entry.
        page_number: Value used for the ``page_number`` field of the lookup;
            also echoed in the progress message.

    Raises:
        ValueError: If ``bbox_re`` finds no bounding box in the page title.
    """
    # Parenthesised single-argument form prints the same text on Python 2
    # and is also valid syntax on Python 3.
    print("processing page %s" % page_number)
    title = page['attrib']['title']

    bbox_match = bbox_re.search(title)
    if bbox_match is None:
        # Fail with the offending title instead of an AttributeError on None.
        raise ValueError("no bbox found in page title %r" % (title,))

    poly_string = get_poly_string_from_bbox(bbox_match.group(1))
    page_outline = GEOSGeometry(poly_string)

    # `defaults` only applies when the row is created; an existing
    # (doc, page_number) row is returned as-is.
    this_page, _created = Page.objects.get_or_create(
        doc=doc,
        page_number=page_number,
        defaults={'page_dimensions': page_outline},
    )

    # Read the page as a bunch of words only.
    enter_words_only(this_page.pk, page['words'])
Exemplo n.º 4
0
def enter_page_words_only(doc, page, page_number):
    """Store one parsed page as a Page row and enter its words only.

    Args:
        doc: Value used for the ``doc`` field of the ``Page`` lookup.
        page: Mapping with an ``'attrib'`` entry (itself a mapping holding a
            ``'title'`` string) and a ``'words'`` entry.
        page_number: Value used for the ``page_number`` field of the lookup;
            also echoed in the progress message.

    Raises:
        ValueError: If ``bbox_re`` finds no bounding box in the page title.
    """
    # Single-argument print with parentheses: same output on Python 2,
    # valid syntax on Python 3.
    print("processing page %s" % page_number)

    attributes = page['attrib']
    title = attributes['title']

    found = bbox_re.search(title)
    if found is None:
        # A title without a bbox would otherwise surface as an opaque
        # AttributeError on None; report the offending title instead.
        raise ValueError("no bbox found in page title %r" % (title,))

    bbox_raw = found.group(1)
    geometry = GEOSGeometry(get_poly_string_from_bbox(bbox_raw))

    # get_or_create uses `defaults` only for a newly created row.
    this_page, _created = Page.objects.get_or_create(
        doc=doc,
        page_number=page_number,
        defaults={'page_dimensions': geometry},
    )

    # Read the page as a bunch of words only.
    enter_words_only(this_page.pk, page['words'])