def reassemble(sentences): at_start = True in_dquote = False buf = [] for sentence in sentences: for fragment in sentence: if fragment == '"': if in_dquote: in_dquote = False else: if not at_start: buf.append(' ') at_start = True in_dquote = True elif not PUNCTUATION_REGEX.match(unicode(fragment)) and not at_start: buf.append(' ') else: at_start = False if isinstance(fragment, Substitution): buf.append('<del>{}</del><ins>{}</ins>'.format(*fragment)) else: buf.append(fragment) return ''.join(buf)
def tag(self, sentence, tokenize=True): """Tag a string `sentence`. :param str or list sentence: A string or a list of sentence strings. :param tokenize: (optional) If ``False`` string has to be tokenized before (space separated string). """ # : Do not process empty strings (Issue #3) if sentence.strip() == "": return [] # : Do not process strings consisting of a single punctuation mark (Issue #4) elif sentence.strip() in PUNCTUATION: if self.include_punc: _sym = sentence.strip() if _sym in tuple('.?!'): _tag = "." else: _tag = _sym return [(_sym, _tag)] else: return [] if tokenize: _tokenized = " ".join(self.tokenizer.tokenize(sentence)) sentence = _tokenized # Sentence is tokenized before it is passed on to pattern.de.tag # (i.e. it is either submitted tokenized or if ) _tagged = pattern_tag(sentence, tokenize=False) if self.include_punc: return _tagged else: _tagged = [(word, t) for word, t in _tagged if not PUNCTUATION_REGEX.match(unicode(t))] return _tagged
def pos_tags(self): '''Returns an list of tuples of the form (word, POS tag). Example: :: [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'), ('Thursday', 'NNP'), ('morning', 'NN')] :rtype: list of tuples ''' return [(Word(word, pos_tag=t), unicode(t)) for word, t in self.pos_tagger.tag(self.raw) if not PUNCTUATION_REGEX.match(unicode(t))]
def pos_tags(self): """Returns an list of tuples of the form (word, POS tag). Example: :: [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'), ('Thursday', 'NNP'), ('morning', 'NN')] :rtype: list of tuples """ return [(Word(word, pos_tag=t), unicode(t)) for word, t in self.pos_tagger.tag(self.raw) if not PUNCTUATION_REGEX.match(unicode(t))]
def pos_tags(self): """Returns an list of tuples of the form (word, POS tag). Example: :: [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'), ('Thursday', 'NNP'), ('morning', 'NN')] :rtype: list of tuples """ if isinstance(self, TextBlob): return [val for sublist in [s.pos_tags for s in self.sentences] for val in sublist] else: return [(Word(unicode(word), pos_tag=t), unicode(t)) for word, t in self.pos_tagger.tag(self) if not PUNCTUATION_REGEX.match(unicode(t))]
def pos_tags(self): """Returns an list of tuples of the form (word, POS tag). Example: :: [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'), ('Thursday', 'NNP'), ('morning', 'NN')] :rtype: list of tuples """ if isinstance(self, TextBlob): return [ val for sublist in [s.pos_tags for s in self.sentences] for val in sublist ] else: return [(Word(word, pos_tag=t), unicode(t)) for word, t in self.pos_tagger.tag(self) if not PUNCTUATION_REGEX.match(unicode(t))]
def tag(self, sentence, tokenize=True): """Tag a string `sentence`. :param str or list sentence: A string or a list of sentence strings. :param tokenize: (optional) If ``False`` string has to be tokenized before (space separated string). """ #: Do not process empty strings (Issue #3) if sentence.strip() == "": return [] #: Do not process strings consisting of a single punctuation mark (Issue #4) elif sentence.strip() in PUNCTUATION: if self.include_punc: _sym = sentence.strip() if _sym in tuple('.?!'): _tag = "." else: _tag = _sym return [(_sym, _tag)] else: return [] if tokenize: _tokenized = " ".join(self.tokenizer.tokenize(sentence)) sentence = _tokenized # Sentence is tokenized before it is passed on to pattern.de.tag # (i.e. it is either submitted tokenized or if ) _tagged = pattern_tag(sentence, tokenize=False, encoding=self.encoding, tagset=self.tagset) if self.include_punc: return _tagged else: _tagged = [ (word, t) for word, t in _tagged if not PUNCTUATION_REGEX.match( unicode(t))] return _tagged