from pygtrie import StringTrie


def find_entites(text: str, trie: StringTrie):
    # NOTE: get_entity() and reduce_entities() are helpers assumed to be
    # defined elsewhere in this module.
    tokens = text.split()
    start = 0
    count = 1  # start at 1, 0 is reserved for the "NO_MATCH" label
    entities = dict()
    for i in range(len(tokens)):
        key = "/".join(tokens[start : i + 1]).lower()
        if trie.has_subtrie(key):  # Not done matching yet
            if i == len(tokens) - 1:  # Reached the end of the string
                entities[count] = (get_entity(trie, key), start, i + 1)
        elif trie.has_key(key):  # noqa: W601  # Found a perfect match
            entities[count] = (trie[key], start, i + 1)
            count += 1
            start = i + 1
        elif start < i:  # Found a partial prefix match before this token
            old_key = "/".join(tokens[start:i]).lower()
            entities[count] = (get_entity(trie, old_key), start, i)
            count += 1
            if trie.has_node(tokens[i].lower()):
                # The current token may itself start a new match
                start = i
            else:
                start = i + 1
        else:  # No match
            start = i + 1
    return reduce_entities(entities)
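# A minimal usage sketch for find_entites, assuming StringTrie comes from
# pygtrie. get_entity() and reduce_entities() are hypothetical stand-ins for
# the module's real helpers so the example runs on its own.
def get_entity(trie, key):
    # Stand-in: return the first entity stored at or below this prefix.
    return next(trie.itervalues(prefix=key))


def reduce_entities(entities):
    # Stand-in: pass all collected matches through unchanged.
    return entities


trie = StringTrie(separator="/")
trie["new/york"] = "LOC:New_York"
trie["new/york/city"] = "LOC:New_York_City"

# The longest match wins: (entity, start token index, end token index).
print(find_entites("I flew to New York City yesterday", trie))
# -> {1: ('LOC:New_York_City', 3, 6)}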
def detect_path_overlap(paths):
    """
    Check for valid POSIX paths (i.e. ones that aren't duplicated and don't overlap).

    Overlapping paths are ones where one path terminates inside another (e.g. a/b
    and a/b/c).

    NOTE: The logic is copied from pulpcore.app.files.validate_file_paths().

    This function returns the first dupe or overlap it detects. We use a trie (or
    prefix tree) to keep track of which paths we've already seen.

    Args:
        paths (iterable of str): An iterable of strings each representing a relative path

    Returns:
        str: a path which overlaps or duplicates another
    """
    path_trie = StringTrie(separator="/")
    for path in paths:
        if path in path_trie:
            # path duplicates a path already in the trie
            return path

        if path_trie.has_subtrie(path):
            # overlap where path is 'a/b' and the trie has 'a/b/c'
            return path

        prefixes = list(path_trie.prefixes(path))
        if prefixes:
            # overlap where path is 'a/b/c' and the trie has 'a/b'
            return path

        # no conflict, so add this path to the trie and continue
        path_trie[path] = True
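# A minimal usage sketch: 'a/b' conflicts with the 'a/b/c' already in the
# trie, a duplicate is reported as-is, and a clean set returns None.
print(detect_path_overlap(["a/b/c", "d/e", "a/b"]))  # -> a/b
print(detect_path_overlap(["a/b", "a/b"]))           # -> a/b (duplicate)
print(detect_path_overlap(["a/b", "c/d"]))           # -> None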
from gettext import gettext as _


def validate_file_paths(paths):
    """
    Check for valid POSIX paths (i.e. ones that aren't duplicated and don't overlap).

    Overlapping paths are ones where one path terminates inside another (e.g. a/b
    and a/b/c).

    This function will raise an exception at the first dupe or overlap it detects.
    We use a trie (or prefix tree) to keep track of which paths we've already seen.

    Args:
        paths (iterable of str): An iterable of strings each representing a relative path

    Raises:
        ValueError: If any path overlaps another
    """
    overlap_error = _("The path for file '{path}' overlaps: {conflicts}")

    path_trie = StringTrie(separator="/")
    dups = []
    overlaps = []
    for path in paths:
        if path in path_trie:
            # path duplicates a path already in the trie
            dups.append(path)
        elif path_trie.has_subtrie(path):
            # overlap where path is 'a/b' and the trie has 'a/b/c'
            conflicts = [item[0] for item in path_trie.items(prefix=path)]
            overlaps.append(overlap_error.format(path=path, conflicts=", ".join(conflicts)))
        else:
            prefixes = list(path_trie.prefixes(path))
            if prefixes:
                # overlap where path is 'a/b/c' and the trie has 'a/b'
                conflicts = [prefix.key for prefix in prefixes]
                overlaps.append(overlap_error.format(path=path, conflicts=", ".join(conflicts)))

        # add the path to our trie and continue
        path_trie[path] = True

    if dups or overlaps:
        dups_msg = ""
        overlaps_msg = ""
        if dups:
            dups_msg = _("Paths are duplicated: {paths}").format(paths=",".join(dups))
        if overlaps:
            overlaps_msg = "\n".join(overlaps)

        raise ValueError(
            _("Path errors found. {dups}\n{overlaps}").format(dups=dups_msg, overlaps=overlaps_msg)
        )
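# A minimal usage sketch: the second path overlaps the first and the third
# duplicates it, so both problems are reported in one ValueError.
try:
    validate_file_paths(["pkg/a.rpm", "pkg", "pkg/a.rpm"])
except ValueError as exc:
    print(exc)
    # Path errors found. Paths are duplicated: pkg/a.rpm
    # The path for file 'pkg' overlaps: pkg/a.rpm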
MASK_TOKEN = "[MASK]"  # assumed placeholder; the real value lives elsewhere in the module


def find_entites(text: str, trie: StringTrie, mask: str = MASK_TOKEN):
    # NOTE: fix_punct_tokens(), get_partial_match(), get_entity(), add_bold(),
    # reduce_entities() and PUNCT are helpers assumed to be defined elsewhere
    # in this module. The `mask` argument is currently unused in the body.
    tokens = text.split()
    tokens = fix_punct_tokens(tokens)
    start = 0
    count = 1  # start at 1, 0 is reserved for the "NO_MATCH" label
    entities = dict()
    out = []
    for i in range(len(tokens)):
        key = "/".join(tokens[start : i + 1]).lower()
        # name = " ".join(tokens[start : i + 1])
        if trie.has_subtrie(key):  # Not done matching yet
            if i == len(tokens) - 1:  # Reached the end of the string
                entities[count] = get_partial_match(trie, key)
                out.append(add_bold(get_entity(entities[count])))
        elif trie.has_key(key):  # noqa: W601  # Found a perfect match
            entities[count] = trie[key]
            out.append(add_bold(get_entity(entities[count])))
            count += 1
            start = i + 1
        elif start < i:  # Found a partial prefix match before this token
            old_key = "/".join(tokens[start:i]).lower()
            # name = " ".join(tokens[start:i])
            entities[count] = get_partial_match(trie, old_key)
            out.append(add_bold(get_entity(entities[count])))
            count += 1
            if trie.has_node(tokens[i].lower()):
                # The current token may itself start a new match
                start = i
            else:
                out.append(tokens[i])
                start = i + 1
        else:  # No match
            out.append(tokens[i])
            start = i + 1
    # Re-join tokens, re-attaching punctuation and contractions to the
    # preceding token.
    retokenized = "".join(
        [" " + i if not i.startswith("'") and i not in PUNCT else i for i in out]
    ).strip()
    return retokenized, reduce_entities(entities)
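# A hypothetical end-to-end sketch. The helpers below are simple stand-ins
# for the module's real ones; entity values are assumed to look like
# "LOC:New_York_City", which is an invention for this example only.
PUNCT = {".", ",", "!", "?", ";", ":"}


def fix_punct_tokens(tokens):
    # Stand-in: split trailing punctuation into its own token.
    out = []
    for tok in tokens:
        if len(tok) > 1 and tok[-1] in PUNCT:
            out.extend([tok[:-1], tok[-1]])
        else:
            out.append(tok)
    return out


def get_partial_match(trie, key):
    # Stand-in: return the first entity stored at or below this prefix.
    return next(trie.itervalues(prefix=key))


def get_entity(entity):
    # Stand-in: recover the surface form from a "TYPE:Surface_Form" value.
    return entity.split(":", 1)[1].replace("_", " ")


def add_bold(name):
    # Stand-in: markdown-style highlighting of the matched span.
    return "**" + name + "**"


def reduce_entities(entities):
    # Stand-in: keep all collected matches.
    return entities


trie = StringTrie(separator="/")
trie["new/york"] = "LOC:New_York"
trie["new/york/city"] = "LOC:New_York_City"

text, entities = find_entites("I flew to New York City.", trie)
print(text)      # I flew to **New York City**.
print(entities)  # {1: 'LOC:New_York_City'}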