Пример #1
0
    def load_dicts(self):
        """
        Load the precomputed lookup structures from the project's JSON assets.

        For better runtime speed multiple trees and dictionaries are
        precomputed, mapping between:
         - imagenet_labels: Char to labels and labels to idx (pygtrie).
         - imagenet_classes: Idx to label (dict).
         - wordnet_labels: Char to hyponym and hyponym to idx (pygtrie).

        Every file is read inside a ``with`` block so its handle is closed as
        soon as the JSON has been parsed, even if parsing fails.
        """
        import json

        path = get_project_root()

        with open(
                path /
                "assets/imagenet_classes/trie_char_to_name_to_idx.json",
                "rb") as handle:
            self.imagenet_labels: pygtrie.CharTrie = pygtrie.CharTrie(
                json.load(handle))

        with open(path / "assets/imagenet_classes/dic_idx_to_label.json",
                  "rb") as handle:
            # key_to_int is applied to every decoded JSON object.
            self.imagenet_classes: dict = json.load(
                handle, object_hook=key_to_int)

        with open(path / "assets/imagenet_classes/hyponym_imagenet.json",
                  "rb") as handle:
            self.wordnet_labels: pygtrie.CharTrie = pygtrie.CharTrie(
                json.load(handle))
Пример #2
0
    def __init__(self):
        """Create the config and Slack client and, once, the lookup tries.

        ``Bot.company_trie`` (company name -> symbol) and ``Bot.symbol_trie``
        (symbol -> company name) are used for auto-complete. They are built
        only while ``Bot.company_trie`` is still ``None``, from the CSV
        listings in the ``resources`` directory next to this module.
        """
        self.config = Config()
        self.slack_client = SlackClient(self.config.bot_token)

        # Tries already exist: nothing more to do.
        if Bot.company_trie is not None:
            return

        Bot.company_trie = pygtrie.CharTrie()
        Bot.symbol_trie = pygtrie.CharTrie()
        module_dir = os.path.dirname(os.path.realpath(__file__))

        # Listings are read in this order; a later row overwrites an earlier
        # one that has the same name or symbol.
        for listing in ('/resources/nasdaq.csv', '/resources/nyse.csv'):
            with open(module_dir + listing) as csvfile:
                for record in csv.DictReader(csvfile):
                    company = record['Name'].strip().upper()
                    ticker = record['Symbol'].strip().upper()
                    Bot.company_trie[company] = ticker
                    Bot.symbol_trie[ticker] = company
Пример #3
0
def create_blacklist_ip_trie():
    """Rebuild the blacklisted-IP trie from the downloaded archive and save it.

    Reads the first member of the zip archive returned by
    ``download_zipfile()``, searches every line for ``IP_PATTERN`` and stores
    each matched address (the text before the first tab of the match) as a
    key in a ``CharTrie``. The trie is then written to
    ``BLACKLISTED_IP_TRIE_JOBLIB`` with ``dump``.

    Exits the process with status 1 when no archive is available or when
    building/saving the trie fails.
    """
    print('* Creating Blacklist IP Trie.')
    try:
        last_update = time.localtime(
            os.path.getmtime(BLACKLISTED_IP_TRIE_JOBLIB))
        print('* Last update: {}'.format(
            time.strftime('%Y-%m-%d %H:%M:%S', last_update)))
    except (OSError, ValueError, OverflowError):
        # Best effort only: the timestamp is informational, and the dump file
        # may simply not exist yet.
        pass

    z = download_zipfile()
    if z is None:
        print('* UNABLE TO UPDATE Blacklist IP Trie!!')
        sys.exit(1)

    blacklisted_ip_trie = trie.CharTrie()
    try:
        with z.open(z.filelist[0].filename) as myfile:
            rows = myfile.read().decode().split('\n')
        for row in rows:
            re_obj = re.search(IP_PATTERN, row)
            if re_obj is not None:
                # group(0) is the whole matched text.
                matched_ip = re_obj.group(0).split('\t')[0]
                blacklisted_ip_trie[matched_ip] = True
        dump(blacklisted_ip_trie, BLACKLISTED_IP_TRIE_JOBLIB)
        print('* Blacklist IP Trie created.')
    except Exception as e:
        print('* ERROR IN CREATING BLACKLIST IP TRIE : ', e)
        sys.exit(1)
Пример #4
0
def load_token_dict_as_trie(token_dict='token_dict.txt'):
    """Read a Python literal collection of tokens from a file into a CharTrie.

    The file content is parsed with ``ast.literal_eval``; every element
    obtained by iterating the parsed object becomes a key with value ``True``.
    """
    with open(token_dict, 'r', encoding='UTF-8') as source:
        tokens = ast.literal_eval(source.read())

    token_trie = trie.CharTrie()
    token_trie.update((token, True) for token in tokens)
    return token_trie
Пример #5
0
    def test_traverse_compressing(self):
        """Traverse with a factory that skips valueless single-child nodes."""
        t = pygtrie.CharTrie()
        t.update({'aaa': 1, 'aab': 2, 'aac': 3, 'bb': 4})

        def collapse(path_conv, path, children, value=self._SENTINEL):
            ordered = sorted(children)
            if len(ordered) == 1 and value is self._SENTINEL:
                # A valueless node with one child is replaced by that child.
                return ordered[0]
            return self._TestNode(path_conv(path), ordered, value)

        root = t.traverse(collapse)
        # Result:
        # <>
        #  aa
        #    aaa:1
        #    aab:2
        #    aac:3
        #  bb:4
        self.assertNode(root, '', 2)

        # For some reason pylint thinks a_node et al. are strings.
        # pylint: disable=no-member

        aa_node = self.assertNode(root.children[0], 'aa', 3)
        expected_leaves = (('aaa', 1), ('aab', 2), ('aac', 3))
        for index, (key, value) in enumerate(expected_leaves):
            self.assertNode(aa_node.children[index], key, 0, value)

        self.assertNode(root.children[1], 'bb', 0, 4)
Пример #6
0
def part_two():
    """Print the letters shared, position by position, by two near-equal ids.

    All lines from ``read_lines()`` are inserted into a character trie. The
    trie is then traversed looking for a prefix under which exactly two words
    live and for which ``differ_barely`` holds. The characters that are equal
    at the same position in both words are printed on one line.

    If no such pair is found, only an empty line is printed (previously this
    raised ``TypeError`` when unpacking ``None``).

    NOTE(review): a pair is only detected at a prefix whose subtree contains
    exactly those two words - confirm this is sufficient for the input.
    """
    # need just two strings that differ by a single letter
    import pygtrie as trie

    # Build a prefix tree out of the words; the stored values are irrelevant.
    t = trie.CharTrie()
    for word in read_lines():
        t[word] = True

    def trie_walk(path_conv, chars, children, whatsthis=None):
        # All words stored below the current prefix.
        words = list(t.keys(path_conv(chars)))
        if len(words) == 2 and differ_barely(*words):
            # Found them! Propagate them back up
            return words
        if len(words) > 2:
            # Go deeper: iterating ``children`` triggers the recursive calls.
            # Stop at the first subtree that reports a pair.
            return next((found for found in children if found), None)
        return None

    res = t.traverse(trie_walk)
    if res is None:
        # No qualifying pair: there are no common letters to show.
        print()
        return

    # Now just print the common letters
    for a, b in zip(*res):
        if a == b:
            print(a, end='')
    print()
Пример #7
0
        def _hack_build_pygtrie_prefix_freq(items):
            """
            Build a CharTrie over ``items`` whose node values are counters.

            Builds the trie, and then modifies the internal nodes so the
            sentinel values become the frequencies we need.

            This reaches into pygtrie private internals (``_root``,
            ``_get_node``, ``node.children``, ``node.value``), so it is tied
            to the internal layout of the installed pygtrie version.

            Returns the modified trie.
            """
            import pygtrie
            from collections import deque

            # construct tree (every item is inserted with the value 0)
            # NOTE: ``self`` is just a local variable name here, not a bound
            # instance.
            self = pygtrie.CharTrie(zip(items, [0] * len(items)))

            # Hack into the internal structure and insert frequencies at each node
            def _iternodes(self):
                """
                Generates all nodes in the trie
                """
                # Each stack entry is an iterable of nodes; a node's children
                # are pushed after the node itself has been yielded.
                stack = deque([[self._root]])
                while stack:
                    for node in stack.pop():
                        yield node
                        stack.append(node.children.values())

            # Reset every non-root node (including valueless inner nodes) to 0
            for node in _iternodes(self):
                if node is not self._root:   # don't do this to the root
                    node.value = 0

            # For each item trace its path and increment frequencies
            # NOTE(review): presumably ``trace`` lists (step, node) pairs from
            # the root down to the item's node, so ``trace[1:]`` skips the
            # root - confirm against the pygtrie version in use.
            for item in items:
                final_node, trace = self._get_node(item)
                for key, node in trace[1:]:
                    node.value += 1
            return self
Пример #8
0
def get_clue_trie(clues, max_difficulty=6, num_clues=50):
    """Build a trie that maps crossword answers to their clue records.

    Args:
        clues: iterable of mappings providing "difficulty", "question" and
            "answer".
        max_difficulty: threshold compared against each clue's difficulty; a
            clue is stored only when ``difficulty >= max_difficulty``.
            NOTE(review): despite its name this acts as a lower bound -
            confirm that this is intended.
        num_clues: currently unused; kept for interface compatibility.

    Returns:
        A ``pygtrie.CharTrie`` mapping answer -> ``{"difficulty": int,
        "hint": question}``. A later clue with the same answer replaces an
        earlier one.

    Clues are skipped when they cannot be parsed, when the question is empty
    or refers to other entries ("across", "down", "this puzzle", matched
    case-insensitively), or when the answer is shorter than two characters.
    """
    print("Creating clue set")
    clue_trie = pygtrie.CharTrie()
    for clue in clues:
        try:
            diff = int(clue["difficulty"])
            hint = clue["question"]
        except (LookupError, TypeError, ValueError):
            print("clue could not be parsed")
            continue

        # validate the question; the lowered copy is used for matching only,
        # the stored hint keeps its original casing
        question = hint.lower()
        if question == "":
            continue
        if any(marker in question
               for marker in ("across", "down", "this puzzle")):
            continue

        # validate the answer
        answer = clue["answer"]
        if len(answer) < 2:
            print("invalid one char answer: " + answer)
            continue
        if diff >= max_difficulty:
            clue_trie[answer] = {"difficulty": diff, "hint": hint}
    return clue_trie
Пример #9
0
def get_pygtrie() -> pygtrie.CharTrie:
    """Return the module-level word trie, building it when it is still falsy.

    The cached object lives in the global ``_pygtrie``. It is rebuilt from
    ``words()`` whenever the cached value is falsy (which includes an empty
    trie).
    """
    global _pygtrie
    if _pygtrie:
        return _pygtrie

    # Publish the new trie first, then fill it in place.
    _pygtrie = pygtrie.CharTrie()
    cache = _pygtrie
    for entry in words():
        cache[entry] = True
    return _pygtrie
Пример #10
0
    def test_traverse(self):
        """Traverse a four-key trie with the default test node factory."""
        t = pygtrie.CharTrie()
        t.update({'aaa': 1, 'aab': 2, 'aac': 3, 'bb': 4})

        root = t.traverse(self._make_test_node)
        # Result:
        #  <>
        #    a
        #      aa
        #        aaa:1
        #        aab:2
        #        aac:3
        #    b
        #      bb:4
        self.assertNode(root, '', 2)

        # For some reason pylint thinks a_node et al. are strings.
        # pylint: disable=no-member

        a_branch, b_branch = root.children[0], root.children[1]

        a_node = self.assertNode(a_branch, 'a', 1)
        aa_node = self.assertNode(a_node.children[0], 'aa', 3)
        # Only the first and the last leaf are checked.
        for index, key, value in ((0, 'aaa', 1), (2, 'aac', 3)):
            self.assertNode(aa_node.children[index], key, 0, value)

        b_node = self.assertNode(b_branch, 'b', 1)
        self.assertNode(b_node.children[0], 'bb', 0, 4)
Пример #11
0
def set_value(partition_key, sort_key):
    """Store the request body under ``partition_key:sort_key`` and index it.

    The decoded request body is always written to ``data``. The secondary
    indexes (``sort_index``, ``between_index``, ``partition_trees`` and
    ``sql_index``) are updated only the first time a lookup key is seen.

    Returns:
        An empty response with status 202 in both cases.
    """
    lookup_key = partition_key + ":" + sort_key
    print("{} Saving {} to {}".format(self_server, request.data, lookup_key))
    data[lookup_key] = request.data.decode('utf-8')
    if lookup_key in indexed:
        # Already indexed: only the stored value changes. The original code
        # returned ``response.status_code`` here, but ``response`` is never
        # assigned in this function, so reply like the normal path instead.
        return make_response('', 202)

    indexed[lookup_key] = True
    sort_index[lookup_key] = sort_key
    if sort_key not in sort_index:
        sort_index[sort_key] = pygtrie.CharTrie()
    sort_index[sort_key][partition_key] = lookup_key
    if partition_key not in between_index:
        between_index[partition_key] = Tree("", None, None)
    if partition_key not in partition_trees:
        partition_tree = both_between_index.insert(partition_key,
                                                   Tree("", None, None))
        partition_trees[partition_key] = partition_tree
    between_index[partition_key].insert(sort_key, partition_key, lookup_key)
    partition_trees[partition_key].partition_tree.insert(
        sort_key, partition_key, lookup_key)

    sql_index[sort_key] = lookup_key

    return make_response('', 202)
Пример #12
0
def _create_conversion_trie(strict):
    """
    Create the trie for betacode conversion.

    Args:
    strict: When true, only the exact keys of the betacode map are accepted.
      Otherwise every ordering of the characters after the first one is
      accepted, in both lower and upper case.

    Returns:
    The trie for conversion.
    """
    conversion = pygtrie.CharTrie()

    for beta, uni in _map.BETACODE_MAP.items():
        if strict:
            conversion[beta] = uni
            continue

        # The canonical order of accents is very strict. Allow for many
        # orders of accents between asterisk and letter or after letter.
        # This does not introduce ambiguity since each betacode token only
        # has one letter and either starts with an asterisk or a letter.
        head, diacritics = beta[0], beta[1:]
        for ordering in itertools.permutations(diacritics):
            token = head + ''.join(ordering)
            conversion[token.lower()] = uni
            conversion[token.upper()] = uni

    return conversion
Пример #13
0
 def initialize_trie(self):
     """Fill ``self.trie`` with the names of all documents in the index.

     The key is the ``text`` of element 0 of the list produced by
     ``WhooshConstants.normalized_analyzer`` for the name; the value is the
     name itself.
     """
     self.trie = pygtrie.CharTrie()
     with self.ix.reader() as reader:
         for doc in reader.iter_docs():
             name = doc[1]['name']
             analyzed = list(WhooshConstants.normalized_analyzer(name))
             self.trie[analyzed[0].text] = name
Пример #14
0
    def test_traverse_singleton_tree(self):
        """Traverse a trie that holds the single key 'a' with value 10."""
        t = pygtrie.CharTrie()
        t['a'] = 10

        root = t.traverse(self._make_test_node)
        self.assertNode(root, '', 1)
        only_child = root.children[0]
        self.assertNode(only_child, 'a', 0, 10)
Пример #15
0
def precomp(labe_probs, neighbs, T):
    """
    Build a trie of neighbor subsequences and store a cost array on each key.

    labe_probs - T x nlabes, normalized along dim 1
    neighbs - nesz length list of tags
              (NOTE(review): each element is sliced and used directly as a
              CharTrie key, so presumably each one is a string - confirm)
    T - maximum length of the subsequences inserted into the trie

    Returns the trie. Every key is first inserted with the value True and
    is then overwritten with its cost array in the loop below.
    """
    trie = pygtrie.CharTrie()
    # maps a label character to the column of labe_probs that is read for it
    labe2idx = ASCII2IDX
    # put all subsequences
    # [trie.__setitem__(''.join(ne[i:j]), True)
    #  for ne in neighbs for i in range(len(ne)) for j in range(i+1, min(len(ne), i+T)+1)]
    # The list comprehension is used only for its side effect: it inserts
    # every contiguous slice ne[i:j] of length 1..T of every neighbor.
    [trie.__setitem__(ne[i:j], True)
     for ne in neighbs for i in range(len(ne)) for j in range(i+1, min(len(ne), i+T)+1)]
    # now we can get cost for every start position.
    # Note we'll have trie[pfx][t] = cost of that prefix STARTING AT t
    # NOTE(review): values are reassigned while iterating trie.iterkeys(),
    # and the `else` branch reuses `prev` from the previous iteration, i.e.
    # it expects the previous key to be this key minus its last character -
    # confirm this matches pygtrie's iteration order.
    prev, start = None, None
    for key in trie.iterkeys():
        lastlabe = key[-1]
        if len(key) == 1:
            cost = (1-labe_probs[:, labe2idx[lastlabe]]) # the "cost"
            start = 0
        elif prev is None: # non 1-length restart; figure out where we are
            # fetch the cost already stored for the one-shorter prefix
            prev = trie[key[:-1]]
            start = len(key) - 1
            cost = prev[:-1] + (1-labe_probs[start:, labe2idx[lastlabe]])
        else:
            # print(key, start)
            # print(prev)
            cost = prev[:-1] + (1-labe_probs[start:, labe2idx[lastlabe]])
        trie[key] = cost
        if not trie.has_subtrie(key): # the terminal subsequence
            # nothing extends this key, so the running state is reset
            prev, start = None, None
        else:
            # keep the running state for a key that extends this one
            prev = cost
            start += 1
    return trie
Пример #16
0
def set_value(partition_key, sort_key):
    """Forward a write to the responsible server and index the key locally.

    The target server is chosen via ``hashes["hashes"].get_machine`` for the
    partition key and the request body is POSTed to its ``/set`` endpoint.
    The local indexes are updated only the first time a lookup key is seen.

    Returns:
        A response whose body and status both mirror the upstream status
        code.
    """
    lookup_key = partition_key + ":" + sort_key
    machine_index = hashes["hashes"].get_machine(partition_key)
    target_url = "http://{}/set/{}/{}".format(
        servers[machine_index], partition_key, sort_key)
    response = requests.post(target_url, data=request.data)

    if lookup_key not in indexed:
        indexed[lookup_key] = True
        sort_index[lookup_key] = sort_key
        if sort_key not in sort_index:
            sort_index[sort_key] = pygtrie.CharTrie()
        sort_index[sort_key][partition_key] = lookup_key
        if partition_key not in between_index:
            between_index[partition_key] = Tree("", None, None)
        if partition_key not in partition_trees:
            partition_trees[partition_key] = both_between_index.insert(
                partition_key, Tree("", None, None))
        between_index[partition_key].insert(sort_key, partition_key,
                                            lookup_key)
        partition_trees[partition_key].partition_tree.insert(
            sort_key, partition_key, lookup_key)

    return make_response(str(response.status_code), response.status_code)
Пример #17
0
 def _optimize_scrabble_words(self):
     '''Return a CharTrie with every entry of WORDS as a key (value True).'''
     print('Optimizing Word Dictionary...')
     word_trie = pygtrie.CharTrie()
     word_trie.update((word, True) for word in WORDS)
     print('Done Optimizing.')
     return word_trie
Пример #18
0
def load_dict(name):
    """Load ``dict/zyenpheng.dict.<name>.yaml`` into a CharTrie.

    Every line is stripped of trailing whitespace and must split on tabs
    into exactly two fields, which become the key and the value.
    """
    dict_file = path.join(here, 'dict', 'zyenpheng.dict.' + name + '.yaml')
    entries = pygtrie.CharTrie()
    with open(dict_file) as source:
        for raw_line in source:
            key, value = raw_line.rstrip().split('\t')
            entries[key] = value
    return entries
Пример #19
0
    def __init__(self, source=None):
        """Create an AutocompleteProvider backed by an empty CharTrie.

        If a truthy source is provided, it is a file-like object whose full
        content is read and passed to ``train``.
        """
        self.trie = pygtrie.CharTrie()
        if not source:
            return
        self.train(source.read())
Пример #20
0
def test_char():
    """Print the keys of a sorted CharTrie built from the words in ``data``.

    Each whitespace-separated word of ``data`` is lowercased and used as a
    key.
    """
    print('CharTrie Test')
    print('-------------')
    import pygtrie
    word_trie = pygtrie.CharTrie()
    word_trie.enable_sorting()
    word_trie.update((word.lower(), True) for word in data.split())
    joined_keys = ', '.join(word_trie.keys())
    print('K : ', joined_keys)
Пример #21
0
 def _update(self):
     """Rebuild ``self._rosters`` from the JSON loaded from ``MEMBER``.

     One CharTrie is created per group; it maps each lowercased name to its
     user. When a lowercased name occurs more than once within a group, the
     first user seen keeps it.
     """
     data = loadjson(MEMBER)
     self._rosters = {}
     for group, members in data.items():
         roster = self._rosters[group] = pygtrie.CharTrie()
         for user, names in members.items():
             for name in names:
                 alias = name.lower()
                 if alias not in roster:
                     roster[alias] = user
Пример #22
0
def main():
    """Count mention prefixes/suffixes across the wiki dump shards.

    Steps:
      1. Read the datasets and collect every normalized mention text into a
         forward trie and, reversed, into a backward trie.
      2. Process all dump shards in a worker pool with ``process_stream``.
      3. Merge the per-shard counts and dump them on every tenth result and
         once more at the end.
    """
    os.makedirs(args.wiki_preprocess, exist_ok=True)

    placeholder_vocab = DummyVocab()
    datasets = read_datasets(
        args.dataroot, placeholder_vocab, placeholder_vocab, ctx_window=999)
    summarize_datasets(datasets)

    prefix_vocab = pygtrie.CharTrie()
    suffix_vocab = pygtrie.CharTrie()
    dataset_bar = tqdm(
        datasets.items(), dynamic_ncols=True, desc="Build Tries")
    for _name, dataset in dataset_bar:
        for mentions in dataset:
            for mention in mentions:
                normalized = RE_PUNCT.sub(' ', mention.text.lower()).strip()
                # The trailing space is part of the stored key.
                prefix_vocab[normalized + " "] = True
                suffix_vocab[normalized[::-1] + " "] = True
    print(f'Forward trie size: {len(prefix_vocab)}')
    print(f'Backward trie size: {len(suffix_vocab)}')

    def shard_order(shard_path):
        # Sort by the integer embedded in the fifth '-'-separated field of
        # the file name.
        return int(os.path.basename(shard_path).split('-')[4][11:-4])

    paths = sorted(
        glob.glob(os.path.join(args.wiki_dump, '*.xml-*')), key=shard_order)
    params = [(shard, prefix_vocab, suffix_vocab) for shard in paths]

    prefix_count = {}
    suffix_count = {}
    total_pages = 0
    with mp.Pool(processes=args.cpu) as pool, \
            tqdm(total=len(paths), dynamic_ncols=True) as pbar:
        results = pool.imap_unordered(process_stream, params)
        for i, (shard_prefix, shard_suffix, page_counter) in \
                enumerate(results):
            total_pages += page_counter
            update_counts(prefix_count, shard_prefix)
            update_counts(suffix_count, shard_suffix)
            pbar.write(
                f'pages: {total_pages}, '
                f'# forward: {len(prefix_count)}/{len(prefix_vocab)}, '
                f'# backward: {len(suffix_count)}/{len(suffix_vocab)}')
            pbar.update()
            if not i % 10:
                dump(prefix_count, suffix_count)
            # Drop the per-shard counts before the next result arrives.
            del shard_prefix, shard_suffix
    dump(prefix_count, suffix_count)
Пример #23
0
def generate_trie(df, f):
    """Build a CharTrie from a DataFrame and pickle it to ``f``.

    Args:
        df: object whose ``to_dict('index')`` result provides the trie's
            key -> value mapping.
        f: path of the pickle file to write.

    Returns:
        The trie that was written.

    The directory ``dict_path`` is created when it does not exist yet. The
    output file is written inside a ``with`` block so the handle is closed
    even when pickling fails.
    """
    trie = pygtrie.CharTrie(df.to_dict('index'))

    if not os.path.exists(dict_path):
        os.makedirs(dict_path)
    with open(f, 'wb') as out:
        pickle.dump(trie, out)

    return trie
Пример #24
0
    def __init__(self, file_path):
        """Build ``self.trie`` from the lines of the file at ``file_path``.

        Every line is passed through ``CleanWord`` and the result is stored
        as a key. The number of lines read is printed at the end.
        """
        self.trie = pygtrie.CharTrie()

        print('Building trie...')
        num_words = 0
        with open(file_path) as source:
            for num_words, line in enumerate(source, start=1):
                self.trie[CleanWord(line)] = True
        print('Trie built. Words found:', num_words)
Пример #25
0
 def store(self, partition_key, sort_key, item):
     """Save ``item`` under ``partition_key:sort_key`` and update the indexes.

     ``sort_index`` receives the lookup key -> sort key entry plus a per
     sort key CharTrie that maps partition keys to lookup keys.
     ``between_index`` receives the sort key in the tree of the partition.
     """
     lookup_key = partition_key + ":" + sort_key
     self.db[lookup_key] = item

     sort_index = self.sort_index
     sort_index[lookup_key] = sort_key
     if sort_key not in sort_index:
         sort_index[sort_key] = pygtrie.CharTrie()
     sort_index[sort_key][partition_key] = lookup_key

     between_index = self.between_index
     if partition_key not in between_index:
         between_index[partition_key] = Tree("", None, None)
     partition_tree = between_index[partition_key]
     partition_tree.insert(sort_key, partition_key, lookup_key)
Пример #26
0
 def test_empty_tokens(self):
     """Dictionary extraction over an empty token list is expected to be None."""
     def pre_process(x):
         return x

     def pre_filter(x):
         return x

     def post_filter(x):
         return isinstance(x, basestring)

     empty_trie = trie.CharTrie()
     result = Core()._extract_using_dictionary(
         [],            # tokens
         pre_process,
         empty_trie,
         pre_filter,
         post_filter,
         1,             # ngrams
         ' ',           # joiner
     )
     self.assertEqual(result, None)
Пример #27
0
def solve_word_search(puzzle, words):
    """Run ``search_word`` from every promising cell and return the marker grid.

    The returned grid has the puzzle's dimensions (width taken from the first
    row) and starts with ``['0', 'N']`` in every cell. ``search_word`` gets
    this grid for every cell whose character has a subtrie in the trie built
    from ``words``.
    """
    copy = [[['0', 'N'] for _ in range(len(puzzle[0]))]
            for _ in range(len(puzzle))]

    trie = pygtrie.CharTrie()
    trie.update((w, True) for w in words)

    for y, row in enumerate(puzzle):
        for x in range(len(puzzle[0])):
            if trie.has_subtrie(row[x]):
                # The return value of search_word is not used here.
                search_word(trie, copy, puzzle, x, y, "", "n")

    return copy
Пример #28
0
    def __init__(self, suffix_file=None, invert=False):
        """
        Create the filter with an empty suffix trie and populate it.

        Args:
            suffix_file (str): the path to the file containing the suffixes to
                filter.
            invert (bool): whether to invert matches.
        """
        # The trie must exist before create_filter_set is called.
        self.suffix_set = pygtrie.CharTrie()
        # suffix_file is forwarded unchanged, including the default None.
        self.create_filter_set(suffix_file)
        self.invert = invert
Пример #29
0
def compute_prefix_embeddings(words, emb_info, dtype='float'):
    """
    Compute embeddings for words missing from the pre-trained vocabulary.

    - Words in the pre-trained embeddings: discarded in the output
    - Words not in the pre-trained embeddings: the embeddings are computed as
      mean of the embeddings of words which share prefixes with the input
      words. Two candidates are used: the shortest vocabulary word that
      starts with the input word and the longest vocabulary word that is a
      prefix of the input word.
    - Words with no matching prefix: discarded in the output

    Args:
        words: iterable of input words.
        emb_info: pair ``(emb_words, emb)``; ``emb`` is indexed with the
            positions returned by ``build_w2i(emb_words)``.
        dtype: dtype used for the computed arrays.

    Returns:
        ``(output_words, output_emb)``: the words that received an embedding
        and the array of those embeddings, in matching order.
    """
    emb_words, emb = emb_info
    emb_w2i = build_w2i(emb_words)

    emb_words_trie = pygtrie.CharTrie()
    for w in emb_words:
        emb_words_trie[w] = 1

    output_pairs = []
    for w in words:
        if w in emb_w2i:
            continue

        # Vocabulary words for which the input word is a prefix.
        # has_subtrie() checks that something is stored below ``w``. The
        # previous has_key() check asked whether ``w`` itself was a stored
        # word, which it is not at this point, so this branch never ran.
        # keys(w) is only safe to call when that node exists.
        longer_words = []
        if emb_words_trie.has_subtrie(w):
            longer_words = emb_words_trie.keys(w)

        # Vocabulary words which are prefixes of the input word.
        shorter_words = [x[0] for x in emb_words_trie.prefixes(w)]

        # Keep the shortest longer word and the longest shorter word.
        matched_words = []
        if longer_words:
            matched_words.append(min(longer_words, key=len))
        if shorter_words:
            matched_words.append(max(shorter_words, key=len))

        # The embedding of the word is the mean of the matched embeddings.
        if matched_words:
            w_emb = np.mean(np.array(
                [emb[emb_w2i[mw]] for mw in matched_words], dtype=dtype),
                            axis=0)
            output_pairs.append((w, w_emb))

    output_words = [x[0] for x in output_pairs]
    output_emb = np.array([x[1] for x in output_pairs], dtype=dtype)

    return (output_words, output_emb)
Пример #30
0
class LocationTrie:
    """Prefix-searchable collection of location names.

    NOTE: the backing trie is a class attribute, so every instance reads and
    modifies the same data.
    """

    __trie = pygtrie.CharTrie()

    def __init__(self):
        """Enable sorting on the backing trie."""
        self.__trie.enable_sorting()

    def add_location(self, location):
        """Store ``location`` as a key."""
        self.__trie[location] = True

    def delete_location(self, location):
        """Remove ``location`` using the trie's ``pop``."""
        self.__trie.pop(location)

    def get_location_list(self, prefix):
        """Return the result of the trie's ``keys(prefix)``."""
        matches = self.__trie.keys(prefix)
        return matches