def stopwords(words, *stopLists, **params):
    """Filter stopwords, numbers and single characters out of `words`.

    Arguments:
    - words: a list of words, or a string. A string is split on runs of non-word
      characters (with re.UNICODE, so non-ASCII letters in unicode input count
      as word characters).
    - stopLists: any number of stop lists; each may be a whitespace-separated string,
      a list, or a set. They are all merged. If none is given, the module-level STOP
      is used. Words are lowercased before the lookup, so entries should be lowercase.

    Keyword parameters:
    - numbers: if True (default), drop tokens that are numbers: integers and floats.
    - singles: if True (default), drop tokens shorter than 2 characters.
    - asstring: if True, return the remaining words joined with single spaces;
      if False, return a list. By default a string is returned for string input
      and a list otherwise.

    Original case of the words is preserved in the result.

    >>> stopwords("This is an example string.", "this is an")
    'example string'
    >>> stopwords(["pi", "is", "3.14"], ["is"])
    ['pi']
    """
    numbers = params.get('numbers', True)
    singles = params.get('singles', True)
    asstring = params.get('asstring', None)

    if not stopLists:
        stop = STOP
    else:
        stop = set()
        for s in stopLists:
            if isstring(s):
                stop |= set(s.split())
            elif islist(s):
                stop |= set(s)
            else:
                stop |= s                       # anything else must already be a set

    if isstring(words):
        if asstring is None: asstring = True
        # re.split() produces empty strings when the text starts/ends with a separator;
        # drop them, otherwise they survive when singles=False and pollute the result
        words = [w for w in re.split(r'\W+', words, flags = re.UNICODE) if w]

    if numbers:
        # optional sign, digits with an optional fractional part (or a bare fraction), optional exponent
        isfloat = re.compile(r'[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?\Z', re.UNICODE).match

    res = []
    for w in words:
        if singles and len(w) < 2: continue
        # isdigit() is kept in addition to the regex, so every token that was filtered before still is
        if numbers and (w.isdigit() or isfloat(w)): continue
        if w.lower() in stop: continue
        res.append(w)

    if asstring: return ' '.join(res)           # get back to a concatenated text
    return res
def join(self, iterable):
    """Join the items of `iterable` with this text as the separator and return a new Text.

    The language of the result is obtained by folding Text.combine over the items,
    starting from self.language.
    """
    # The items are traversed twice (language fold + actual join),
    # so anything that is not already a list has to be materialized first.
    items = iterable if islist(iterable) else list(iterable)

    # Text.combine is expected to check that the languages are compatible and to yield
    # the resulting one; self.language may be None, which lets the items decide the language.
    language = reduce(Text.combine, items, self.language)

    joined = unicode.join(self, items)
    return Text(joined, language)
def tags(names):
    """Build a regex pattern that matches only tags with the given names, opening or closing.

    `names` is a single tag name, a space-separated string of names, or a list of names.
    Groups of the returned pattern:
      1 - tag name, when an opening tag matched
      2 - attributes of the opening tag
      3 - optional trailing slash or question mark of the opening tag
      4 - tag name, when a closing tag matched
    """
    # Normalize `names` into a regex alternation: "a b" or ["a", "b"] becomes "a|b"
    alternatives = names
    if isstring(alternatives) and ' ' in alternatives:
        alternatives = alternatives.split()
    if islist(alternatives):
        alternatives = "|".join(alternatives)

    # The same alternation is substituted twice: for the opening and for the closing tag
    template = r"""<(?:(%s)(\s(?:\s*[a-zA-Z][\w:\-]*(?:\s*=(?:\s*"(?:\\"|[^"])*"|\s*'(?:\\'|[^'])*'|[^\s>]+))?)*)?(\s*[\/\?]?)|\/(%s)\s*)>"""
    return template % (alternatives, alternatives)
def tags_except(names, special = True):
    """Build a regex pattern that matches all tags _except_ the ones with the given names.

    `names` is a single tag name, a space-separated string of names, or a list of names.
    If it is empty, no tag is excluded.
    If special=True (default), the pattern also matches comments <!-- --> and <![CDATA[ ]]> sections.
    Processing instructions <? ?> are matched by the regular tag branch regardless of `special`.

    Groups of the returned pattern:
      1 - tag name, when an opening tag matched
      2 - attributes of the opening tag
      3 - optional trailing slash or question mark of the opening tag
      4 - tag name, when a closing tag matched
      5 - body of a comment (only with special=True)
      6 - body of a CDATA section (only with special=True)
    """
    if isstring(names) and ' ' in names: names = names.split()
    if islist(names): names = "|".join(names)

    if names:
        # An excluded name must be rejected only when it is the WHOLE tag name, not a prefix
        # of a longer one. A plain \b is not enough for that: '-' and ':' are legal inside
        # tag names here, and \b would treat <a-b> or <a:b> as the excluded tag <a>.
        excluded = r"(?:%s)(?![\w:\-])" % names
    else:
        # Nothing to exclude. An empty alternative would make the negative lookahead reject
        # (nearly) every tag, so use a sub-pattern that never matches: (?!(?!)) always succeeds.
        excluded = r"(?!)"

    pat = r"""<(?:(?!%s)([a-zA-Z\?][\w:\-]*)(\s(?:\s*[a-zA-Z][\w:\-]*(?:\s*=(?:\s*"(?:\\"|[^"])*"|\s*'(?:\\'|[^'])*'|[^\s>]+))?)*)?(\s*[\/\?]?)|\/(?!%s)([a-zA-Z][\w:\-]*)\s*"""
    if special:
        pat += r"|!--((?:[^\-]|-(?!->))*)--|!\[CDATA\[((?:[^\]]|\](?!\]>))*)\]\]"
    pat += r")>"

    # the exclusion is applied twice: to opening tags and to closing tags
    return pat % (excluded, excluded)
def tags_pair(names = None):
    """Build a regex pattern that matches an opening tag, a body, and the corresponding closing tag.

    The pattern consists of:
    (1) an opening tag whose name is one of `names` (a space-separated string or a list),
        or any name if `names` is empty/None;
    (2) a body: any characters, matched lazily (as few as possible);
    (3) a closing tag with the same name as the opening one.

    Group 1 holds the tag name, group 2 the attributes of the opening tag,
    group 3 the name repeated in the closing tag. The body is not captured.
    Self-closing tags <.../> are NOT matched.
    """
    if names:
        # turn "a b" or ["a", "b"] into the alternation "a|b"
        if isstring(names):
            names = names.split()
        if islist(names):
            names = "|".join(names)
    else:
        # "any tag name"; too strict for XML, which allows other names, too
        names = r"[a-zA-Z\?][\w:\-]*"

    open_tag = r"""<(%s)(\s(?:\s*[a-zA-Z][\w:\-]*(?:\s*=(?:\s*"(?:\\"|[^"])*"|\s*'(?:\\'|[^'])*'|[^\s>]+))?)*)?\s*>"""
    lazy_body = r".*?"                      # lazy "match all"
    close_tag = r"<\/(\1)\s*>"              # back-reference: same name as in the opening tag

    return "".join([open_tag, lazy_body, close_tag]) % names
def conv2D_BN(y, *args, **kwargs):
    """Apply Conv2D followed by BatchNormalization to `y`, then an optional addition and activation.

    All positional and keyword arguments are passed to Conv2D, except for the extra arguments:
    - activation: (optional) name of an activation (wrapped in an Activation layer) or a callable;
                  applied as the very last step
    - add: (optional) tensor or a list of tensors (typically a shortcut connection) to be added
           to the output right after BatchNormalization, but before activation

    Returns the output of the last applied layer.
    """
    activation = kwargs.pop('activation', None)
    if isstring(activation):
        activation = Activation(activation)

    # Normalize `add` to a plain list. It must be compared with None explicitly:
    # a single tensor can be passed here, and symbolic tensors do not allow
    # being evaluated as a boolean.
    add = kwargs.pop('add', None)
    if add is None:
        add = []
    elif islist(add):
        add = list(add)                     # a copy; also lets a non-list sequence be concatenated below
    else:
        add = [add]

    y = Conv2D(*args, **kwargs)(y)
    y = BatchNormalization()(y)
    if add:                                 # `add` is a list now, so this only tests for emptiness
        y = layers_add([y] + add)
    if activation:
        y = activation(y)
    return y