예제 #1
0
                yield TokenSplit(left, delimiter, right)
            yield atom.text


########
#
#   SEGMENT
#
########

# Rule instances passed to the Segmenter base class by
# TokenSegmenter.__init__. The first four entries are dedicated rule
# classes instantiated without arguments; `punct` and `other` are
# functions wrapped in FunctionRule.
# NOTE(review): the list order presumably determines the order in which
# rules are consulted -- confirm against Segmenter before reordering.
RULES = [
    DashRule(),
    UnderscoreRule(),
    FloatRule(),
    FractionRule(),
    FunctionRule(punct),
    FunctionRule(other),
]


class TokenSegmenter(Segmenter):
    def __init__(self):
        """Pass a new TokenSplitter and the module RULES to Segmenter."""
        super(TokenSegmenter, self).__init__(TokenSplitter(), RULES)

    def segment(self, parts):
        buffer = next(parts)
        for split in parts:
            right = next(parts)
            split.buffer = buffer
            if not split.delimiter and self.join(split):
                buffer += right
예제 #2
0
                yield TokenSplit(left, delimiter, right)
            yield atom.text


########
#
#   SEGMENT
#
########

# Rule instances passed to the Segmenter base class by
# TokenSegmenter.__init__. The first four entries are dedicated rule
# classes instantiated without arguments; `punct`, `other` and `yahoo`
# are functions wrapped in FunctionRule.
# NOTE(review): the list order presumably determines the order in which
# rules are consulted -- confirm against Segmenter before reordering.
RULES = [
    DashRule(),
    UnderscoreRule(),
    FloatRule(),
    FractionRule(),
    FunctionRule(punct),
    FunctionRule(other),
    FunctionRule(yahoo),
]


class TokenSegmenter(Segmenter):
    def __init__(self):
        """Pass a new TokenSplitter and the module RULES to Segmenter."""
        super(TokenSegmenter, self).__init__(TokenSplitter(), RULES)

    def segment(self, parts):
        buffer = next(parts)
        for split in parts:
            right = next(parts)
            split.buffer = buffer
            if not split.delimiter and self.join(split):
예제 #3
0
            yield text[previous:start]
            left = text[max(0, start - self.window):start]
            right = text[stop:stop + self.window]
            yield SentSplit(left, delimiter, right)
            previous = stop
        yield text[previous:]


########
#
#   SEGMENT
#
########

# Every function listed below is wrapped in a FunctionRule; the resulting
# list keeps the order in which the functions are written here.
RULES = list(map(FunctionRule, [
    empty_side,
    no_space_prefix,
    lower_right,
    delimiter_right,
    sokr_left,
    inside_pair_sokr,
    initials_left,
    npa_splitter,
    list_item,
    close_quote,
    close_bracket,
    dash_right,
]))
예제 #4
0
            yield text[previous:start]
            left = text[max(0, start - self.window):start]
            right = text[stop:stop + self.window]
            yield SentSplit(left, delimiter, right)
            previous = stop
        yield text[previous:]


########
#
#   SEGMENT
#
########


RULES = [FunctionRule(_) for _ in [
    empty_side,
    no_space_prefix,
    lower_right,
    delimiter_right,

    sokr_left,
    inside_pair_sokr,
    initials_left,

    list_item,

    close_quote,
    close_bracket,

    dash_right,