Пример #1
0
def _split_word_at_space(word):
    prev_ = WordType(id=word.id + '_l')
    next_ = WordType(id=word.id + '_r')
    xywh = xywh_from_points(word.get_Coords().points)
    textequiv = word.get_TextEquiv()[0]
    pos = textequiv.Unicode.index(" ")
    fract = pos / len(textequiv.Unicode)
    xywh_prev = xywh.copy()
    xywh_prev.update({'w': xywh['w'] * fract})
    prev_.set_Coords(CoordsType(points=points_from_xywh(xywh_prev)))
    xywh_next = xywh.copy()
    xywh_next.update({
        'x': xywh['x'] + xywh['w'] * fract,
        'w': xywh['w'] * (1 - fract)
    })
    next_.set_Coords(CoordsType(points=points_from_xywh(xywh_next)))
    if word.get_language():
        prev_.set_language(word.get_language())
        next_.set_language(word.get_language())
    if word.get_TextStyle():
        prev_.set_TextStyle(word.get_TextStyle())
        next_.set_TextStyle(word.get_TextStyle())
    # Glyphs: irrelevant at this processing level
    textequiv_prev = TextEquivType(Unicode=textequiv.Unicode[0:pos],
                                   conf=textequiv.conf)
    textequiv_next = TextEquivType(Unicode=textequiv.Unicode[pos + 1:],
                                   conf=textequiv.conf)
    prev_.set_TextEquiv([textequiv_prev])
    next_.set_TextEquiv([textequiv_next])
    return prev_, next_
Пример #2
0
def _merge_words(prev_, next_):
    merged = WordType(id=prev_.id + '.' + next_.id)
    merged.set_Coords(
        CoordsType(points=points_from_xywh(
            xywh_from_points(prev_.get_Coords().points + ' ' +
                             next_.get_Coords().points))))
    if prev_.get_language():
        merged.set_language(prev_.get_language())
    if prev_.get_TextStyle():
        merged.set_TextStyle(prev_.get_TextStyle())
    if prev_.get_Glyph() or next_.get_Glyph():
        merged.set_Glyph(prev_.get_Glyph() + next_.get_Glyph())
    if prev_.get_TextEquiv():
        merged.set_TextEquiv(prev_.get_TextEquiv())
    else:
        merged.set_TextEquiv([TextEquivType(Unicode='', conf=1.0)])
    if next_.get_TextEquiv():
        textequiv = merged.get_TextEquiv()[0]
        textequiv2 = next_.get_TextEquiv()[0]
        textequiv.Unicode += textequiv2.Unicode
        if textequiv.conf and textequiv2.conf:
            textequiv.conf *= textequiv2.conf
    return merged