# NOTE(review): this chunk starts mid-stream — the two `yield` statements
# below are the tail of a splitter method whose `def` lies outside this
# view; confirm against the full file.
yield TokenSplit(left, delimiter, right)
yield atom.text


########
#
#   SEGMENT
#
########


# Ordered join rules consulted by the segmenter: class-based rules first,
# then plain predicate functions wrapped in FunctionRule so they share a
# uniform interface.
RULES = [
    DashRule(),
    UnderscoreRule(),
    FloatRule(),
    FractionRule(),
    FunctionRule(punct),
    FunctionRule(other),
]


class TokenSegmenter(Segmenter):
    """Segmenter that merges adjacent atoms into tokens using RULES."""

    def __init__(self):
        # Py2-style explicit super call kept as-is; wires the token
        # splitter and the rule list into the Segmenter base class.
        super(TokenSegmenter, self).__init__(TokenSplitter(), RULES)

    def segment(self, parts):
        # `parts` is consumed in alternation: a text piece, then a split
        # object, then the next text piece, ... (presumed from the pairing
        # of `for split in parts` with `next(parts)` — confirm against
        # TokenSplitter's output).
        buffer = next(parts)
        for split in parts:
            right = next(parts)
            # Expose the text accumulated so far to the join rules.
            split.buffer = buffer
            if not split.delimiter and self.join(split):
                # No delimiter between the pieces and a rule votes to
                # join: grow the current token instead of emitting it.
                buffer += right
                # NOTE(review): the chunk is cut off here — the original
                # presumably also has an else-branch that yields `buffer`
                # and a final yield after the loop; confirm upstream.
# NOTE(review): this chunk starts mid-stream — the two `yield` statements
# below are the tail of a splitter method whose `def` lies outside this
# view; confirm against the full file.
yield TokenSplit(left, delimiter, right)
yield atom.text


########
#
#   SEGMENT
#
########


# Ordered join rules consulted by the segmenter: class-based rules first,
# then plain predicate functions wrapped in FunctionRule. This variant
# additionally registers the `yahoo` rule (absent from the sibling
# version of this chunk) — confirm which revision is current.
RULES = [
    DashRule(),
    UnderscoreRule(),
    FloatRule(),
    FractionRule(),
    FunctionRule(punct),
    FunctionRule(other),
    FunctionRule(yahoo),
]


class TokenSegmenter(Segmenter):
    """Segmenter that merges adjacent atoms into tokens using RULES."""

    def __init__(self):
        # Py2-style explicit super call kept as-is; wires the token
        # splitter and the rule list into the Segmenter base class.
        super(TokenSegmenter, self).__init__(TokenSplitter(), RULES)

    def segment(self, parts):
        # `parts` is consumed in alternation: a text piece, then a split
        # object, then the next text piece, ... (presumed from the pairing
        # of `for split in parts` with `next(parts)` — confirm against
        # TokenSplitter's output).
        buffer = next(parts)
        for split in parts:
            right = next(parts)
            # Expose the text accumulated so far to the join rules.
            split.buffer = buffer
            if not split.delimiter and self.join(split):
                # NOTE(review): chunk truncated here — the suite of this
                # `if` (and the rest of the method) lies outside this view.
# NOTE(review): this chunk starts mid-method — the statements below are
# the tail of a sentence-splitter generator whose `def` (and enclosing
# loop) lie outside this view. Loop nesting was lost when the file was
# collapsed onto one line; `yield text[previous:]` most likely sits
# AFTER the loop as the final flush — confirm against the full file.
yield text[previous:start]
# Window of context on each side of the delimiter, clamped at the start
# of the text.
left = text[max(0, start - self.window):start]
right = text[stop:stop + self.window]
yield SentSplit(left, delimiter, right)
previous = stop
yield text[previous:]


########
#
#   SEGMENT
#
########


# Heuristic join predicates, applied in order; each plain function is
# wrapped in FunctionRule so the segmenter can invoke them uniformly.
# (`sokr` presumably abbreviates Russian "сокращение", i.e. abbreviation
# handling — confirm against the predicate definitions.)
RULES = [
    FunctionRule(_)
    for _ in [
        empty_side,
        no_space_prefix,
        lower_right,
        delimiter_right,
        sokr_left,
        inside_pair_sokr,
        initials_left,
        npa_splitter,
        list_item,
        close_quote,
        close_bracket,
        dash_right,
    ]
]
# NOTE(review): this chunk starts mid-method — the statements below are
# the tail of a sentence-splitter generator whose `def` (and enclosing
# loop) lie outside this view. Loop nesting was lost when the file was
# collapsed onto one line; `yield text[previous:]` most likely sits
# AFTER the loop as the final flush — confirm against the full file.
yield text[previous:start]
# Window of context on each side of the delimiter, clamped at the start
# of the text.
left = text[max(0, start - self.window):start]
right = text[stop:stop + self.window]
yield SentSplit(left, delimiter, right)
previous = stop
yield text[previous:]


########
#
#   SEGMENT
#
########


# Heuristic join predicates, applied in order; each plain function is
# wrapped in FunctionRule so the segmenter can invoke them uniformly.
# This variant omits `npa_splitter` relative to the sibling version of
# this chunk — confirm which revision is current.
# NOTE(review): chunk truncated here — the closing brackets of this list
# comprehension lie outside this view.
RULES = [FunctionRule(_) for _ in [
    empty_side,
    no_space_prefix,
    lower_right,
    delimiter_right,
    sokr_left,
    inside_pair_sokr,
    initials_left,
    list_item,
    close_quote,
    close_bracket,
    dash_right,