Exemplo n.º 1
0
def make_tries(wordlist):
    """Build one CompletionDAWG per language.

    Each item of ``wordlist`` must expose a ``lang`` attribute and unpack
    into a ``(lang, word)`` pair.  Returns a dict mapping every language to
    a CompletionDAWG holding that language's words.
    """
    # groupby only merges adjacent items, so order the entries by language first.
    by_language = sorted(wordlist, key=lambda entry: entry.lang)
    grouped = itertools.groupby(by_language, key=lambda entry: entry.lang)
    return {
        language: CompletionDAWG([word for _, word in entries])
        for language, entries in grouped
    }
Exemplo n.º 2
0
    def _create_dawg(self) -> CompletionDAWG:
        """Build a CompletionDAWG from the cache keys and keys derived from them.

        For every key of ``self.cache`` the DAWG receives the key itself
        followed by the ``vse`` and ``evs`` values of the edge parsed from
        that key.  Each distinct ``vbs`` value is emitted once, after all
        cache keys have been processed.
        """
        def iter_keys():
            unique_vbs = set()
            for cache_key in self.cache.keys():
                yield cache_key

                # Parsing a shallow edge straight from the key is cheaper
                # than fetching the full edge through the cache index.
                shallow = Edge.from_line(cache_key, ts=TS.sve)
                yield shallow.vse
                yield shallow.evs
                unique_vbs.add(shallow.vbs)

            yield from unique_vbs

        return CompletionDAWG(iter_keys())
Exemplo n.º 3
0
    def _create_dawg(self) -> CompletionDAWG:
        """Build a CompletionDAWG of lookup keys for every cached node.

        Per node the generator emits, in this order:

        * for each term: a ``_tky``-joined key (normalized term, node key)
          and a ``_ltk``-joined key (label, normalized term, node key);
        * one ``_lky``-joined key (label, node key);
        * one ``_lbl``-prefixed label key, only the first time the label
          is encountered.

        Every joined key starts with an empty element, so it begins with the
        joining value itself (presumably a separator string -- confirm).
        """
        def iter_keys():
            seen_labels = set()
            for entry in self.cache.values():
                for raw_term in entry.terms:
                    normalized = self.normalizer.normalize(raw_term)
                    yield self._tky.join(("", normalized, entry.key))
                    yield self._ltk.join(
                        ("", entry.label, normalized, entry.key))

                yield self._lky.join(("", entry.label, entry.key))

                # Label keys are emitted once per distinct label.
                if entry.label in seen_labels:
                    continue
                seen_labels.add(entry.label)
                yield f"{self._lbl}{entry.label}"

        return CompletionDAWG(iter_keys())
Exemplo n.º 4
0
    def __init__(self):
        """Create a new lexicon.

        Loads the word DAWG from ``csw.dawg`` in the directory given by
        ``sys.path[0]`` and derives two helper DAWGs from it:

        * ``suffix_list`` -- every non-empty proper suffix of every word
          (the whole words themselves are not added);
        * ``reverse_list`` -- every word spelled backwards.
        """
        self.word_list = CompletionDAWG()
        self.word_list.load(os.path.join(sys.path[0], 'csw.dawg'))

        words = self.list_words()

        # Strip one leading letter per round until nothing is left; a set
        # keeps each suffix once instead of piling up duplicates in a list.
        suffixes = set()
        frontier = words
        while frontier:
            frontier = {item[1:] for item in frontier if len(item) > 1}
            suffixes |= frontier
        self.suffix_list = CompletionDAWG(sorted(suffixes))

        self.reverse_list = CompletionDAWG(
            sorted(word[::-1] for word in words))
Exemplo n.º 5
0
def save_lex_dawg(dictionary_files=('dictionaries/sowpods.txt', ),
                  outfile=DAWGS_PATH + 'sowpods.dawg'):
    """Build a CompletionDAWG from dictionary files and save it to disk.

    ``dictionary_files`` is passed unchanged to ``get_words``; the DAWG
    built from the result is written to ``outfile``.
    """
    lexicon_words = get_words(dictionary_files)
    CompletionDAWG(lexicon_words).save(outfile)
Exemplo n.º 6
0
class Lexicon:
    """Represents the lexicon of allowed words, as a DAWG.

    See http://en.wikipedia.org/wiki/Directed_acyclic_word_graph

    Named 'Lexicon' to avoid confusion with Python's inbuilt
    dictionary data structure.

    All lookups upper-case their argument before querying the DAWGs.
    """

    def __init__(self):
        """Create a new lexicon.

        Loads the word DAWG from ``csw.dawg`` in the directory given by
        ``sys.path[0]`` and derives two helper DAWGs from it:

        * ``suffix_list`` -- every non-empty proper suffix of every word
          (the whole words themselves are not added);
        * ``reverse_list`` -- every word spelled backwards.
        """
        self.word_list = CompletionDAWG()
        self.word_list.load(os.path.join(sys.path[0], 'csw.dawg'))

        words = self.list_words()

        # Strip one leading letter per round until nothing is left; a set
        # keeps each suffix once instead of piling up duplicates in a list.
        suffixes = set()
        frontier = words
        while frontier:
            frontier = {item[1:] for item in frontier if len(item) > 1}
            suffixes |= frontier
        self.suffix_list = CompletionDAWG(sorted(suffixes))

        self.reverse_list = CompletionDAWG(
            sorted(word[::-1] for word in words))

    def contains(self, word: str) -> bool:
        """Return True if the supplied word is in the lexicon."""
        return word.upper() in self.word_list

    def starts_with(self, prefix: str):
        """Return a list of all lexicon words starting with ``prefix``."""
        return self.word_list.keys(prefix.upper())

    def ends_with(self, suffix: str):
        """Return the stored proper word suffixes that begin with ``suffix``.

        The result holds entries of ``suffix_list`` (word tails), not whole
        words.
        """
        return self.suffix_list.keys(suffix.upper())

    def contains_prefix(self, prefix: str) -> bool:
        """Return True if more than one lexicon word starts with ``prefix``.

        NOTE(review): a single matching word yields False because of the
        ``> 1`` threshold -- confirm this is intended.
        """
        return len(self.starts_with(prefix)) > 1

    def contains_suffix(self, suffix: str) -> bool:
        """Return True if more than one stored suffix begins with ``suffix``.

        NOTE(review): same ``> 1`` threshold as ``contains_prefix``.
        """
        return len(self.ends_with(suffix)) > 1

    def contains_infix(self, infix) -> bool:
        """Return True if more than one stored suffix begins with ``infix``.

        NOTE(review): same ``> 1`` threshold as ``contains_prefix``.
        """
        return len(self.suffix_list.keys(infix.upper())) > 1

    def contains_word_or_prefix(self, prefix) -> bool:
        """Return True if at least one lexicon word starts with ``prefix``."""
        return len(self.starts_with(prefix)) > 0

    def words_containing(self, infix: str):
        """Return a sorted list of the lexicon words that contain ``infix``.

        Each word appears once, even when ``infix`` occurs in it several
        times.
        """
        needle = infix.upper()

        # suffix_list only holds proper suffixes, so words that *start* with
        # the infix have to be collected from the word list directly.
        matches = set(self.word_list.keys(needle))

        # A word contains the infix elsewhere iff it ends with a suffix that
        # starts with the infix; reverse_list turns "ends with" into a
        # prefix query on the reversed spelling.
        for suffix in self.suffix_list.keys(needle):
            for reversed_word in self.reverse_list.keys(suffix[::-1]):
                matches.add(reversed_word[::-1])

        return sorted(matches)

    def list_words(self):
        """Return a list of all valid words in the lexicon."""
        return self.starts_with('')

    def __contains__(self, item):
        """Support ``word in lexicon`` by delegating to ``contains``."""
        return self.contains(item)
Exemplo n.º 7
0
 def _load_dawg(self):
     """Return the DAWG stored at ``self.dawg_path``, or an empty one.

     When ``self.dawg_path`` is not an existing file, a CompletionDAWG built
     from an empty key list is returned instead of loading from disk.
     """
     if not self.dawg_path.is_file():
         return CompletionDAWG([])
     stored_path = str(self.dawg_path)
     return CompletionDAWG().load(stored_path)