def make_tries(wordlist):
    """Build one ``CompletionDAWG`` per language.

    ``wordlist`` is an iterable of two-field records that expose a ``lang``
    attribute and unpack as ``(lang, word)``.  Records are sorted by language
    so that ``itertools.groupby`` sees each language as one contiguous run.

    Returns a dict mapping each language to a ``CompletionDAWG`` built from
    that language's words.  An empty ``wordlist`` yields an empty dict.
    """

    def language_of(entry):
        return entry.lang

    ordered = sorted(wordlist, key=language_of)
    return {
        language: CompletionDAWG([word for _, word in entries])
        for language, entries in itertools.groupby(ordered, key=language_of)
    }
def _create_dawg(self) -> CompletionDAWG:
    """Build a ``CompletionDAWG`` from the keys held in ``self.cache``.

    For every cache key the DAWG receives, in order: the key itself, then the
    ``vse`` and ``evs`` attributes of the edge parsed from that key.  The
    distinct ``vbs`` values are collected along the way and emitted once,
    after all cache keys have been processed.
    """

    def iter_keys():
        distinct_vbs = set()
        for line in self.cache.keys():
            yield line
            # Parsing a shallow edge straight from the key is faster than
            # retrieving the full edge from the cache index.
            shallow = Edge.from_line(line, ts=TS.sve)
            yield shallow.vse
            yield shallow.evs
            distinct_vbs.add(shallow.vbs)
        yield from distinct_vbs

    return CompletionDAWG(iter_keys())
def _create_dawg(self) -> CompletionDAWG:
    """Build a ``CompletionDAWG`` indexing every node in ``self.cache``.

    Keys are produced lazily.  ``sep.join(["", a, b])`` evaluates to
    ``sep + a + sep + b``, so each key starts with its own separator:

    * per term:  ``_tky`` + normalized term + ``_tky`` + node key
    * per term:  ``_ltk`` + label + ``_ltk`` + normalized term + ``_ltk`` + node key
    * per node:  ``_lky`` + label + ``_lky`` + node key
    * per distinct label (first occurrence only):  ``_lbl`` + label
    """

    def iter_keys():
        seen_labels = set()
        for entry in self.cache.values():
            for raw_term in entry.terms:
                normalized = self.normalizer.normalize(raw_term)
                yield self._tky.join(("", normalized, entry.key))
                yield self._ltk.join(("", entry.label, normalized, entry.key))
            yield self._lky.join(("", entry.label, entry.key))
            if entry.label in seen_labels:
                continue
            # First time this label is seen: emit its stand-alone key.
            seen_labels.add(entry.label)
            yield f"{self._lbl}{entry.label}"

    return CompletionDAWG(iter_keys())
def __init__(self):
    """
    Create a new lexicon

    Loads the word DAWG from ``csw.dawg`` in the directory ``sys.path[0]``,
    then derives two helper DAWGs from it:

    * ``suffix_list``  - every proper suffix of every word (each word with
      one or more leading characters removed; the whole word itself is not
      added unless it is also a proper suffix of another word).
    * ``reverse_list`` - every word spelled backwards.
    """
    self.word_list = CompletionDAWG()
    self.word_list.load(os.path.join(sys.path[0], 'csw.dawg'))

    # Enumerate the lexicon once and reuse it for both derived structures.
    all_words = self.list_words()

    # Peel one leading character per round; sets keep each round (and the
    # running total) free of duplicates, so no final de-duplication pass
    # over a huge list is needed.
    suffixes = set()
    layer = {word[1:] for word in all_words if len(word) > 1}
    while layer:
        suffixes |= layer
        layer = {suffix[1:] for suffix in layer if len(suffix) > 1}

    self.suffix_list = CompletionDAWG(sorted(suffixes))
    self.reverse_list = CompletionDAWG(sorted(word[::-1] for word in all_words))
def save_lex_dawg(dictionary_files=('dictionaries/sowpods.txt', ),
                  outfile=DAWGS_PATH + 'sowpods.dawg'):
    """Build a ``CompletionDAWG`` from dictionary files and save it to disk.

    Args:
        dictionary_files: passed unchanged to ``get_words`` to obtain the
            words that populate the DAWG.
        outfile: destination path handed to ``CompletionDAWG.save``.
    """
    words = get_words(dictionary_files)
    CompletionDAWG(words).save(outfile)
class Lexicon:
    """
    Represents the lexicon of allowed words, as a DAWG.
    See <a href = http://en.wikipedia.org/wiki/Directed_acyclic_word_graph>
    Named 'Lexicon' to avoid confusion with Python's inbuilt dictionary
    data structure.

    Every query is upper-cased before lookup, so the stored words are
    presumably upper-case (verify against ``csw.dawg``).
    """

    def __init__(self):
        """
        Create a new lexicon

        Loads the word DAWG from ``csw.dawg`` in the directory
        ``sys.path[0]``, then derives two helper DAWGs from it:

        * ``suffix_list``  - every proper suffix of every word (the whole
          word itself is not added unless it is also a proper suffix of
          another word).
        * ``reverse_list`` - every word spelled backwards.
        """
        self.word_list = CompletionDAWG()
        self.word_list.load(os.path.join(sys.path[0], 'csw.dawg'))

        # Enumerate the lexicon once and reuse it for both derived DAWGs.
        all_words = self.list_words()

        # Peel one leading character per round; sets keep every round free
        # of duplicates.
        suffixes = set()
        layer = {word[1:] for word in all_words if len(word) > 1}
        while layer:
            suffixes |= layer
            layer = {suffix[1:] for suffix in layer if len(suffix) > 1}

        self.suffix_list = CompletionDAWG(sorted(suffixes))
        self.reverse_list = CompletionDAWG(
            sorted(word[::-1] for word in all_words))

    def contains(self, word: str) -> bool:
        """ Returns True if the supplied word is in the lexicon """
        return word.upper() in self.word_list

    def starts_with(self, prefix: str):
        """
        Returns a list of all valid words in the lexicon starting with the
        supplied prefix
        """
        return self.word_list.keys(prefix.upper())

    def ends_with(self, suffix: str):
        """
        Returns the entries of ``suffix_list`` that start with ``suffix``.

        Note that these are word *suffixes* beginning with the given text,
        not whole words ending in it.
        """
        return self.suffix_list.keys(suffix.upper())

    def contains_prefix(self, prefix: str) -> bool:
        """ Returns True if more than one word starts with ``prefix`` """
        # NOTE(review): the threshold is "> 1", so a prefix with exactly one
        # completion (or one that only matches itself) reports False.
        # Confirm this is the intended meaning.
        return len(self.starts_with(prefix.upper())) > 1

    def contains_suffix(self, suffix: str) -> bool:
        """
        Returns True if more than one ``suffix_list`` entry starts with
        ``suffix``
        """
        # NOTE(review): same "> 1" threshold as contains_prefix - confirm.
        return len(self.ends_with(suffix.upper())) > 1

    def contains_infix(self, infix) -> bool:
        """
        Returns True if more than one ``suffix_list`` entry starts with
        ``infix``
        """
        # NOTE(review): only proper suffixes are consulted, so text that
        # occurs solely at the very start of words is not seen here.
        return len(self.suffix_list.keys(infix.upper())) > 1

    def contains_word_or_prefix(self, prefix) -> bool:
        """ Returns True if at least one word starts with ``prefix`` """
        return len(self.starts_with(prefix)) > 0

    def words_containing(self, infix: str):
        """
        Returns a sorted list, without duplicates, of every word that
        contains ``infix`` anywhere (start, middle or end).
        """
        needle = infix.upper()
        # Words that *begin* with the infix.  ``suffix_list`` stores only
        # proper suffixes, so the suffix lookup below cannot find these
        # unless the word also happens to be a suffix of a longer word.
        matches = set(self.word_list.keys(needle))
        # Words where the infix starts later: each suffix that begins with
        # the infix identifies, via the reversed-word DAWG, all words ending
        # in that suffix.  The set removes the repeats produced when a word
        # matches through several suffixes.
        for suffix in self.suffix_list.keys(needle):
            matches.update(
                word[::-1] for word in self.reverse_list.keys(suffix[::-1]))
        return sorted(matches)

    def list_words(self):
        """ Returns a list of all valid words in the lexicon """
        return self.starts_with('')

    def __contains__(self, item):
        """ Supports ``word in lexicon``; same as ``contains(word)`` """
        return self.contains(item)
def _load_dawg(self):
    """Return the completion DAWG stored at ``self.dawg_path``, if any.

    When ``self.dawg_path`` is not an existing file, a ``CompletionDAWG``
    built from an empty list is returned instead.  Otherwise the result of
    ``CompletionDAWG().load(...)`` on the path (converted to ``str``) is
    returned.
    """
    if not self.dawg_path.is_file():
        return CompletionDAWG([])
    location = str(self.dawg_path)
    return CompletionDAWG().load(location)