def getCommonPrefix(fileSet, trieNode, separator='-_'):
    debug = False
    # debug = ('killswitch engage - temple from the within.mp3' in fileSet)
    root = Trie('$$')
    for f in fileSet:
        root.insert(list(trieNode.children[f].key))
    prefixList = []
    def dfs(trieNode, curStr=''):
        if debug:
            print(curStr, trieNode.num_successors, int(0.8 * len(fileSet)))
        # keep descending while this node still covers most of the file set
        if (trieNode.num_successors >= int(0.8 * len(fileSet)) and trieNode.num_successors > 1) or trieNode.num_successors >= 3:
            res = False
            for k in trieNode.children:
                res = (dfs(trieNode.children[k], curStr+k) or res)
            if res:
                return True
            elif trieNode.key in separator:
                prefixList.append((curStr, trieNode.num_successors))
                return True
            else:
                return False
        else:
            return False
    dfs(root)
    # if len(prefixList) > 0:
    #     print prefixList, "->\n", '\n\t'.join(fileSet)
    return prefixList
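The snippet above relies on a Trie whose nodes expose `key`, `children`, and `num_successors`, which is not shown here. A minimal sketch of what such a class might look like (attribute names are taken from the usage above; the counting behaviour is an assumption):

class Trie:
    def __init__(self, key=''):
        self.key = key              # character stored on this node
        self.children = {}          # maps a character to its child node
        self.num_successors = 0     # how many inserted strings pass through this node

    def insert(self, chars):
        # count the string on every node along its path, including the root
        node = self
        node.num_successors += 1
        for ch in chars:
            if ch not in node.children:
                node.children[ch] = Trie(ch)
            node = node.children[ch]
            node.num_successors += 1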
Example #2
    def to_dict(self):
        state = self.state.to_dict(True)
        nstate = {}
        for s in state:
            t = Trie(STATEDB_DIR, state[s][STORAGE_INDEX])
            o = [0] * ACCT_RLP_LENGTH
            o[NONCE_INDEX] = decode_int(state[s][NONCE_INDEX])
            o[BALANCE_INDEX] = decode_int(state[s][BALANCE_INDEX])
            o[CODE_INDEX] = state[s][CODE_INDEX]
            td = t.to_dict(True)
            o[STORAGE_INDEX] = {decode_int(k): decode_int(td[k]) for k in td}
            nstate[s.encode('hex')] = o

        return {
            "number": self.number,
            "prevhash": self.prevhash,
            "uncles_root": self.uncles_root,
            "coinbase": self.coinbase,
            "state": nstate,
            "transactions_root": self.transactions_root,
            "difficulty": self.difficulty,
            "timestamp": self.timestamp,
            "extradata": self.extradata,
            "nonce": self.nonce
        }
Example #3
def create_data_structure(filepath):
    """Open file and load wordlist into Trie ds"""
    ds = Trie()
    with open(filepath, 'r') as fp:
        for word in fp:
            ds.add_word(word.rstrip())
    return ds
Example #4
def find_compound_words(words):
    """ trie + BFS + pruning
    Advantages of trie:
    1. Predictable O(k) lookup time where k is the size of the key.
    2. We can easily get all prefixes of a given word.
    Drawbacks of tries:
    1. Space-consuming, it is a trade-off between time-complexity and space\
    complexity. We can use radix-tree to get optimized space, but in \
    practice, it doesn't have a reasonable improvement and it takes more\
    time than trie.
    """
    compound_words = set([])
    trie = Trie()
    queue = collections.deque()
    prefixes_dict = {}
    for word in words:
        prefixes = trie.has_prefixes(word)
        for prefix in prefixes:
            queue.append((word, word[len(prefix) :]))
        trie.insert(word)
    while queue:
        word, suffix = queue.popleft()
        # pruning
        if word in compound_words:
            continue
        # found a compound word
        if suffix in trie:
            compound_words.add(word)
        else:
            prefixes = trie.has_prefixes(suffix)
            for prefix in prefixes:
                queue.append((word, suffix[len(prefix) :]))
    return compound_words
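This example assumes a Trie that supports `insert`, membership via `in`, and a `has_prefixes` method returning the stored words that are proper prefixes of a given string; that class is not included above. A minimal sketch of such an interface (a plausible implementation, not the original project's code):

class Trie:
    _END = '$'  # marker stored on nodes that terminate a complete word

    def __init__(self):
        self.root = {}

    def insert(self, word):
        node = self.root
        for ch in word:
            node = node.setdefault(ch, {})
        node[self._END] = True

    def __contains__(self, word):
        node = self.root
        for ch in word:
            if ch not in node:
                return False
            node = node[ch]
        return self._END in node

    def has_prefixes(self, word):
        """Return every stored word that is a proper prefix of `word`."""
        prefixes, node = [], self.root
        for i, ch in enumerate(word):
            if ch not in node:
                break
            node = node[ch]
            if self._END in node and i + 1 < len(word):
                prefixes.append(word[:i + 1])
        return prefixes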
Example #5
def test_contains_given_string_one_string_():
    """Test that a given string is in the list."""
    from trie import Trie
    trie = Trie()
    token = 'pig'
    trie.insert(token)
    assert trie.contains(token)
Example #6
def car_trie():
    """Filled trie."""
    from trie import Trie
    t = Trie()
    for word in WORDS:
        t.insert(word)
    return t
Example #7
    def test_finds_nodes(self):
        t1, t2, t3, t4 = Trie(), Trie(), Trie(), Trie()
        t1.children['b'] = t2
        t2.children['a'] = t3
        t3.children['r'] = t4
        trie = t1.find('bar')
        assert trie == t4
Example #8
def test_contains_on_partial():
    """Test contains returns false on partial match."""
    from trie import Trie
    trie = Trie()
    token = 'piglet'
    trie.insert(token)
    assert trie.contains('pig') is False
Example #9
def test_insert_one_token():
    """Test when token is inserted into the trie correctly."""
    from trie import Trie
    trie = Trie()
    token = 'pig'
    trie.insert(token)
    assert trie.container == {'p': {'i': {'g': {'$': '$'}}}}
Example #10
def test_contains():
    """Test contains responds with true for a word that has been inserted."""
    from trie import Trie
    t = Trie()
    t.insert("cat")
    result = t.contains("cat")
    assert result is True
Example #11
def test_contains_on_shorter():
    """Test contains returns false on non-inserted longer word."""
    from trie import Trie
    trie = Trie()
    token = 'pig'
    trie.insert(token)
    assert trie.contains('piglet') is False
Example #12
def find_top_k_with_trie(k=10):
    """
    Too slow and consumes too much memory.

    time consuming:  147.656000137
    (164, 'mh')
    (164, 'sq')
    (165, 'bi')
    (165, 'mo')
    (167, 'im')
    (168, 'ux')
    (169, 'br')
    (169, 'gj')
    (170, 'ij')
    (171, 'qd')
    """
    result = []
    t = Trie()
    # trie
    with open(TDATA) as f:
        for line in f:
            t.insert(line.strip())
    
    # heapq: keep only the k largest items seen so far
    for n in t.ipreorder(t.root):
        if len(result) < k:
            heapq.heappush(result, n)
        else:
            # push the new item, then pop the smallest, keeping the heap at size k
            heapq.heappushpop(result, n)
            
    return result
Example #13
def test_contains_false():
    """Test contains responds with false for a word that is not inserted."""
    from trie import Trie
    t = Trie()
    t.insert("cat")
    result = t.contains("dog")
    assert result is False
Example #14
class DictionaryTest(unittest.TestCase):
    def setUp(self):
        self.unigrams = Trie()
        self.unigrams['a'] = 200
        self.unigrams['hi'] = 130
        self.unigrams['hello'] = 120
        self.unigrams['there'] = 140
        self.unigrams['how'] = 150
        self.unigrams['are'] = 80
        self.unigrams['you'] = 200
        self.unigrams['your'] = 100

        self.ngrams = Trie()
        self.ngrams[['hello','there']] = 20
        self.ngrams[['hello','you']] = 25
        self.ngrams[['how','are','you']] = 80
        self.ngrams[['you','are','there']] = 30
        self.ngrams[['are','you','there']] = 60

        self.bindict = BinaryDictionary()
        self.bindict.encode_unigrams(self.unigrams)
        self.bindict.encode_ngrams(self.ngrams)

    def test_trie_weight(self):
        self.assertEqual(self.unigrams['hello'], 120)
        self.assertEqual(self.ngrams[['hello','there']], 20)

    def test_trie_key_error(self):
        with self.assertRaises(KeyError):
            self.ngrams['hello']

    def test_trie_unigram_predict(self):
        self.assertTrue('e' in map(itemgetter(0), self.unigrams.get_predictions(['h'])))
        self.assertEqual('l', self.unigrams.get_predictions(list('he'))[0][0])
        self.assertEqual(len(self.unigrams.get_predictions(list('hello'))), 0)

    def test_trie_ngram_predict(self):
        self.assertTrue('there' in map(itemgetter(0), self.ngrams.get_predictions(['hello'])))
        self.assertTrue('you' in map(itemgetter(0), self.ngrams.get_predictions(['how','are'])))

    def test_bindict_exists(self):
        self.assertTrue(self.bindict.exists('hello'))
        self.assertTrue(not self.bindict.exists('hellos'))
        self.assertTrue(not self.bindict.exists('h'))
        self.assertTrue(self.bindict.exists('a'))

    def test_bindict_ngram_predict(self):
        self.assertTrue('there' in map(itemgetter(0), self.bindict.get_predictions(['hello'])))
        self.assertTrue('you' in map(itemgetter(0), self.bindict.get_predictions(['how','are'])))

    def test_correct(self):
        self.assertTrue('you' in self.bindict.get_corrections('yuu').keys())
        self.assertTrue('your' in self.bindict.get_corrections('yuur').keys())

    def test_completions(self):
        self.assertTrue('you' in self.bindict.get_completions('yo', 1))
        self.assertFalse('your' in self.bindict.get_completions('yo', 1))
        self.assertTrue('your' in self.bindict.get_completions('yo', 2))
        self.assertFalse('yo' in self.bindict.get_completions('y', 1))
Example #15
def test_traversals_on_empty():
    """Test traversal for aan empty."""
    from trie import Trie
    t = Trie()
    result = []
    for item in t.traversal(t.root):
        result.append(item)
    assert result == []
Example #16
def test_traversal_no_words():
    """Test traversal on trie with no words."""
    from trie import Trie
    word_list = []
    trie = Trie()
    for word in trie.traversal(start=trie.container):
        word_list.append(word)
    assert word_list == []
Example #17
def test_overlapping_words():
    from trie import Trie
    new_trie = Trie()
    new_trie.insert('words')
    new_trie.insert('trie')
    new_trie.insert('trip')
    assert new_trie.root == {'w': {'o': {'r': {'d': {'s': {'$': '$'}}}}},
                             't': {'r': {'i': {'e': {'$': '$'}, 'p': {'$': '$'}}}}}
Example #18
def readDict():
    # get words from /usr/share/dict/words
    words = Trie()
    # load words into trie
    with open('/usr/share/dict/words', 'r') as f:
        for word in f.read().split('\n'):
            words.insert(word)
    return words
Example #19
def trie_dictionary(dictionary):
    trie = Trie()
    
    for key in dictionary.keys():
        #print key
        trie.add(key, dictionary[key])
        
    return trie
Example #20
def test_2_trie():
    """Test that we can insert two words into the Trie."""
    from trie import Trie
    new_trie = Trie()
    new_trie.insert('words')
    new_trie.insert('trie')
    assert new_trie.root == {'w': {'o': {'r': {'d': {'s': {'$': '$'}}}}},
                             't': {'r': {'i': {'e': {'$': '$'}}}}}
Example #21
    def test_search_7(self):
        trie = Trie()
        key = "tea"
        trie.insert( key )

        matches = trie.search( "pea", 1 )
        self.assertEqual(len(matches), 1)
        self.assertTrue(key in matches)
Example #22
    def test_insert_fork(self):
        trie = Trie()
        ab_data = 'ripper X!'
        ax_data = 'for chopping'
        trie.insert( 'ab', ab_data )
        trie.insert( 'ax', ax_data )
        self.assertIsNone(trie.get('a'))
        self.assertEqual(trie.get('ab'), ab_data)
        self.assertEqual(trie.get('ax'), ax_data)
Example #23
class Pruner(object):
    def __init__(self, **kwargs):
        self.file_a = kwargs.get('file_a')
        self.file_b = kwargs.get('file_b')
        self.file_c = kwargs.get('file_c')
        self.file_d = kwargs.get('file_d', 'junk_out.csv')
        # self.number_of_lines = kwargs.get('number_of_lines', 800000)
        self.trie = Trie()

    def prune(self):
        input_file_a = open(self.file_a, 'rU')
        data = csv.reader((line.replace('\0', '') for line in input_file_a),
                          delimiter=",")
        i = 0
        j = 0
        for line in data:
            i += 1
            if (len(line) < 1):
                j += 1
            else:
                line = self.parse_string(line[0])
                self.trie.insert_trie(line)

        print('Total number of lines inserted from input: ' + str(i))
        print('Total number of lines skipped while inserting: ' + str(j))
        input_file_a.close()

        input_file_b = open(self.file_b, 'rU')
        data = csv.reader((line.replace('\0', '') for line in input_file_b),
                          delimiter=",")

        output_file_c = open(self.file_c, 'w')
        output_file_d = open(self.file_d, 'w')
        csv_writer_file_c = csv.writer(output_file_c, delimiter=",")
        csv_writer_file_d = csv.writer(output_file_d, delimiter=",")
        j = 0
        k = 0
        for line in data:
            if (len(line) < 1):
                continue
            line = self.parse_string(line[0])
            if self.trie.in_trie(line):
                j += 1
                csv_writer_file_d.writerow(line)
            else:
                k += 1
                csv_writer_file_c.writerow(line)

        print(str(j) + ' in junk')
        print(str(k) + ' in output')

        output_file_c.close()
        output_file_d.close()
        print('Done')

    def parse_string(self, input_str=''):
        return str(input_str)
Example #24
def test_auto_complete_nonexist():
    """Test autocomplete for a word that does not exist in trie."""
    from trie import Trie
    t = Trie()
    t.insert("catty")
    t.insert("church")
    t.insert("crutch")
    t.insert("cats")
    t.insert("dog")
    assert t.autocomplete("p") == []
Example #25
    def test_08_find_all(self):
        t = Trie()
        l = ["spam", "spammer", "spamhouse", "spammers", "spams", "bacon"]

        for i in l:
            t[i] = i

        self.assertEqual(t.find_all("bac"), ["bacon"])
        self.assertEqual(sorted(t.find_all("spam")), sorted(l[:-1]))
        self.assertEqual(sorted(t.find_all("")), sorted(l))
Example #26
    def test_search_5(self):
        trie = Trie()
        key_ace = "ace"
        key_ate = "ate"
        trie.insert( key_ace )
        trie.insert( key_ate )

        matches = trie.search( "axe", 1 )
        self.assertEqual(len(matches), 2)
        self.assertTrue(key_ace in matches)
        self.assertTrue(key_ate in matches)
Example #27
    def test_search_8(self):
        trie = Trie()
        key_tea = "tea"
        key_pet = "pet"
        trie.insert( key_tea )
        trie.insert( key_pet )

        matches = trie.search( "pea", 1 )
        self.assertEqual(len(matches), 2)
        self.assertTrue(key_tea in matches)
        self.assertTrue(key_pet in matches)
Example #28
    def test_match_prefix_0(self):
        trie = Trie()

        matches = trie.match_prefix('a')
        self.assertEqual(len(matches), 0)

        matches = trie.match_prefix('z')
        self.assertEqual(len(matches), 0)

        matches = trie.match_prefix('abracadabra')
        self.assertEqual(len(matches), 0)
Example #29
    def test_search_2(self):
        trie = Trie()
        key_at = "at"
        key_as = "as"
        trie.insert( key_at )
        trie.insert( key_as )

        matches = trie.search( "ax", 1 )
        self.assertEqual(len(matches), 2)
        self.assertTrue(key_at in matches)
        self.assertTrue(key_as in matches)
Example #30
    def test_search_6(self):
        trie = Trie()
        key = "ate"
        trie.insert( key )

        matches = trie.search( "any", 2 )
        self.assertEqual(len(matches), 1)
        self.assertTrue(key in matches)

        matches = trie.search( "any", 1 )
        self.assertEqual(len(matches), 0)
Example #31
class TestTrie(unittest.TestCase):
    def setUp(self):
        self.t = Trie()

    def add_words(self):
        self.assertTrue(self.t.add_word("AICIXE"))
        self.assertTrue(self.t.add_word("AMBKP"))

    def test_words(self):
        self.t.add_word("AICIXE")
        self.t.add_word("AMBKP")
        self.assertListEqual(self.t.list_words('')[1], ["AICIXE", "AMBKP"])
        self.assertListEqual(self.t.list_words('AI')[1], ["AICIXE"])
Example #32
    def test_SelfAdd(self):
        self.trie["Foo"] = True
        t2 = Trie()
        t2["Food"] = True
        self.assertTrue("Foo" in self.trie)
        self.assertFalse("Food" in self.trie)
        self.assertTrue("Food" in t2)
        self.assertFalse("Foo" in t2)
        self.trie += t2
        self.assertTrue("Foo" in self.trie)
        self.assertTrue("Food" in self.trie)
Example #33
    def build(self, n:int=1) -> Trie:
        trie = Trie()
        for i in range(len(self.seq) + 1 - n):
            tokens = self.seq[i:i+n]
            if '</s>' in tokens[:-1]:
                continue
            if tokens not in trie:
                trie[tokens] = 0
            trie[tokens] += 1

        return trie
Example #34
    def test_Addition(self):
        self.trie["Foo"] = True
        t2 = Trie()
        t2["Food"] = True
        t3 = t2 + self.trie
        self.assertTrue("Foo" in self.trie)
        self.assertFalse("Food" in self.trie)
        self.assertTrue("Food" in t2)
        self.assertFalse("Foo" in t2)
        self.assertTrue("Foo" in t3)
        self.assertTrue("Food" in t3)
Example #35
    def findWords(self, board: List[List[str]], words: List[str]) -> List[str]:
        def rec(r, c, node):
            ch = board[r][c]
            if ch not in node.children:
                return
            node = node.children[ch]

            if node.key:
                #res.add(word) # if using set() for output
                res.append(node.key)
                node.key = ""  # alternative to using set() for

            board[r][c] = '#'

            if r >= 1 and board[r - 1][c] != '#':
                rec(r - 1, c, node)
            if r + 1 < m and board[r + 1][c] != '#':
                rec(r + 1, c, node)
            if c >= 1 and board[r][c - 1] != '#':
                rec(r, c - 1, node)
            if c + 1 < n and board[r][c + 1] != '#':
                rec(r, c + 1, node)

            board[r][c] = ch

        trie = Trie()

        for w in words:
            trie.insert(w)

        m = len(board)
        n = len(board[0])
        #res = set()
        res = []

        for i in range(m):
            for j in range(n):
                rec(i, j, trie.root)

        #return list(res)
        return res
Example #36
    def findWords(self, board: List[List[str]], words: List[str]) -> List[str]:
        def rec(r, c, word=""):
            word += board[r][c]

            node = trie.find_node(word)
            if not node:
                return
            if node.is_key:
                #res.add(word) # if using set() for output
                res.append(word)
                node.is_key = False  # alternative to using a set for the output (prevents duplicates)

            board[r][c] = '#'

            if r >= 1 and board[r - 1][c] != '#':
                rec(r - 1, c, word)
            if r + 1 < m and board[r + 1][c] != '#':
                rec(r + 1, c, word)
            if c >= 1 and board[r][c - 1] != '#':
                rec(r, c - 1, word)
            if c + 1 < n and board[r][c + 1] != '#':
                rec(r, c + 1, word)

            board[r][c] = word[-1]

        trie = Trie()

        for w in words:
            trie.insert(w)

        m = len(board)
        n = len(board[0])
        #res = set()
        res = []

        for i in range(m):
            for j in range(n):
                rec(i, j)

        #return list(res)
        return res
Example #37
    def __init__(self, header, transactions=None, uncles=None, db=None):
        if transactions is None:
            transactions = []
        if uncles is None:
            uncles = []

        self.db = db

        if self.db is None:
            raise TypeError("Block must have a db")

        super(FrontierBlock, self).__init__(
            header=header,
            transactions=transactions,
            uncles=uncles,
        )

        self.state_db = State(self.db, root_hash=self.header.state_root)
        self.transaction_db = Trie(self.db,
                                   root_hash=self.header.transaction_root)
        self.receipt_db = Trie(self.db, root_hash=self.header.receipts_root)
Example #38
    def __init__(self):
        self.CHARS_MAPPING = {
            "a": ("a", "@", "*", "4"),
            "i": ("i", "*", "l", "1"),
            "o": ("o", "*", "0", "@"),
            "u": ("u", "*", "v"),
            "v": ("v", "*", "u"),
            "l": ("l", "1"),
            "e": ("e", "*", "3"),
            "s": ("s", "$", "5"),
            "t": ("t", "7")
        }
        self.censor_urls = set()
        self.profane_trie = Trie()
        self.default_wordlist_filename = get_complete_path(
            'data/profanity_wordlist.txt')
        self.default_urls_filename = get_complete_path(
            'data/profane_sites.txt')

        self.load_profane_words(profane_words=None, whitelist_words=None)
        self.load_profane_urls()
Example #39
def read_volunteers():
    """ Read all the volunters in and orchestrate their transformation
    """
    group = None # Will hold Volunteer Objects
    user_trie = Trie() # Will contain complete slack user list in JSON
    with open("volunteers.csv") as volunteers:
        reader = csv.reader(volunteers)
        group = [Volunteer(line) for line in reader]
        group.pop(0)
    with open("config.yaml") as config:
        reader = yaml.load(config)
        user_list = get_users_slack(reader["slack"])
        for user in user_list:
            if 'real_name' not in user:
                continue
            user_trie.add(user['real_name'].lower(), user)
    md_file = open('./volunteers.md', 'w')
    for gr in group:
        gr.parse_slack(user_trie)
        md_file.write(str(gr))
    md_file.close()
Example #40
def main():
    csv_filename = r"./Data/load_employees_dump.txt"
    emp_trie = Trie()
    emp_dict = {}
    with open(csv_filename, "r") as fd:
        dict_reader = csv.DictReader(fd, [
            'employee_id', 'birth_date', 'first_name', 'last_name', 'gender',
            'joining_date', 'manager_id'
        ])
        for record in dict_reader:
            emp_id = record['employee_id']
            name = record['first_name'] + '.' + record['last_name']
            mgr_id = record['manager_id']
            emp_dict[emp_id] = (emp_id, name, mgr_id)
            emp_trie.insert(name, emp_id)

    status, emp_id = emp_trie.search('Cristinel.Bouloucos')
    print(emp_dict[emp_id])
    emp_id, name, mgr_id = emp_dict[emp_id]
    print('Employee Name: {} and Manager Name: {}'.format(
        name, emp_dict[mgr_id][1]))
Example #41
def get(conn: socket, request: str, trie: Trie):
    get_request = request
    key = get_request[len(GET):].strip()

    value = trie.get(key=key)
    if not value:
        not_found_response(s=conn, request=request)
        return

    # value = json.dumps(value).replace(",", ";")
    log(code=SUCCESS, request=request)
    construct_response(s=conn, data={key: value}, success=True)
Example #42
    def test_06_trie_setdefault(self):
        t = Trie()

        t.setdefault("spam", []).append("eggs")
        self.assertEqual(t["spam"], ["eggs"])
        t.setdefault("spam", []).append("coffee")
        self.assertEqual(t["spam"], ["eggs", "coffee"])

        self.assertEqual(t.setdefault("spa", "bacon"), "bacon")
        self.assertEqual(t["spa"], "bacon")
Example #43
    def __init__(self):
        # TODO: evaluate the choice of smoothing on the test set
        self.minfreq = -3.14e+100
        # build the dictionary trie, used to scan the full-segmentation DAG
        self.trie = Trie()
        self.construct_trie()
        # build the bigram dictionary
        # self.construct_bigram_dic()
        # load the bigram dictionary
        with open('files/bigram_dic.json', 'r') as f:
            self.bigram_dic = json.load(f)

        # special-case handling
        self.SP = SpecialProcess()

        # create the HMM segmentation model
        self.hmm = HMM()

        # load common family names and given names
        self.get_second_names()
        self.get_first_name()
Example #44
    def func():
        # wait until tags and aliases have finished loading
        while not tags or not all_alias:
            time.sleep(0.1)
        cmd_trie, tag_trie = Trie(), Trie()
        cmd_trie.add(cfg.cmds)
        cmd_trie.add(all_alias['cmd'].keys())
        tag_trie.add(tags.keys())
        global tries
        tries = {'cmd': cmd_trie, 'tag': tag_trie}
Example #45
    def __init__(self, root=b'', env=Env(), executing_on_head=False, **kwargs):
        self.env = env
        self.trie = SecureTrie(Trie(RefcountDB(self.db), root))
        self.txindex = STATE_DEFAULTS['txindex']
        self.block_number = STATE_DEFAULTS['block_number']
        self.block_coinbase = STATE_DEFAULTS['block_coinbase']
        self.timestamp = STATE_DEFAULTS['timestamp']
        self.prev_headers = STATE_DEFAULTS['prev_headers']
        self.journal = []
        self.cache = {}
        self.changed = {}
        self.executing_on_head = executing_on_head
Example #46
def delete_all(secret_key):
    if secret_key == '8':
        # secret key is correct, delete all from db
        RepoStrings.query.delete()
        db.session.commit()
        set_trie(Trie(), reset=True)
        return '', 204

    return jsonify({
        'Unauthorized':
        'Secret delete key incorrect, unable to truncate table'
    }), 403
Example #47
    def __init__(self, dictionary, connection):
        #initalize trie
        self.trie = Trie()
        for line in open(dictionary):
            (yomi, lid, rid, cost, word) = line.strip().split("\t", 4)
            lid, rid, cost = int(lid), int(rid), int(cost)
            yomi, word = unicode(yomi, 'utf-8'), unicode(word, 'utf-8')
            self.trie.insert(yomi, (word, lid, rid, cost))

        #initialize connection
        file = open(connection)
        lsize, rsize = file.readline().strip().split(" ", 1)
        lsize, rsize = int(lsize), int(rsize)
        self.connection = [None] * rsize
        for line in file:
            (lid, rid, cost) = line.strip().split(" ", 2)
            lid, rid, cost = int(lid), int(rid), int(cost)
            if lid != 0:
                break
            self.connection[rid] = cost
        file.close()
Example #48
def _save_trie(rsc_dir, entries):
    """
    Save the trie.
    Args:
        rsc_dir:  target resource directory
        entries:  list of entries
    """
    trie = Trie()
    total_tag_nums = 0
    for entry in entries:
        val = total_tag_nums
        val += 1    # indices start at 1, not 0
        val *= 2    # even values mark exact (full-word) matches
        val += 1 if entry.is_pfx else 0    # odd values mark prefix-matching patterns
        trie.insert(entry.word, val)
        total_tag_nums += len(entry.tag_nums)
    trie.save(f'{rsc_dir}/preanal.tri')

    val_file = f'{rsc_dir}/preanal.val'
    with open(val_file, 'wb') as fout:
        fout.write(struct.pack('H', 0))    # since indices start at 1, write one dummy record at the front
        for idx, entry in enumerate(entries, start=1):
            logging.debug('%d: %s: %s: %s', idx, entry.word, entry.tag_outs, entry.tag_nums)
            fout.write(struct.pack('H' * len(entry.tag_nums), *entry.tag_nums))
    logging.info('value saved: %s', val_file)
    logging.info('total entries: %d', len(entries))
    logging.info('expected size: %d',
                 (sum([len(e.tag_nums) for e in entries])+1) * struct.Struct('H').size)
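For reference, a value packed this way can be unpacked by reversing the arithmetic above (a sketch inferred from the packing comments, not code from the original project):

def unpack_preanal_value(val):
    # packing above: val = index * 2, plus 1 for prefix-matching patterns
    is_prefix_pattern = bool(val % 2)
    index = val // 2    # 1-based record index into the .val file
    return index, is_prefix_pattern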
Example #49
class TestTrie(unittest.TestCase):
    """ Python test ment to run on saved copy of slack JSON data
    """
    def setUp(self):
        self.trie = Trie()
        #Default Slack JSON file name
        with open('users.json') as users:
            loaded = json.load(users)
            for member in loaded['members']:
                if 'real_name' not in member:
                    continue
                self.trie.add_name(0, member['real_name'].lower(), member)

    def test_addition(self):
        """ Making Sure trie entries are added correctly
        """
        added = self.trie.add('Jane Doe'.lower(), {'value': 0})
        self.assertTrue(added)
        self.assertIsNotNone(self.trie.search('Jane Doe'.lower()))

    def test_search(self):
        """ Verifying Trie entries
        """
        self.assertIsNone(self.trie.search('John Doe'.lower()))
        self.assertIsNotNone(self.trie.search('Aaron Long'.lower()))
Example #50
class Thesaurus(object):
    """When initialize the Thesaurus, it will scan the word_bank.txt and establish the dictionary trie."""
    def __init__(self, word_bank_path, tags):
        if not isinstance(tags, list):
            raise ValueError("'tags' must be a list!")
        self._trie = Trie()
        for line in open(word_bank_path, 'r'):
            item = strdecode(line).strip().split(' ')
            attr = {}
            for index in range(len(item) - 1):
                attr[tags[index]] = item[index + 1]
            self._trie.add_new_word(item[0], attr)

    def __len__(self):
        return self._trie.__len__()

    def __contains__(self, word):
        return self._trie.__contains__(word)

    def clear(self):
        self._trie = Trie()

    def has_word(self, word):
        """Return whether the thesaurus contains the word"""
        return self._trie.has_word(word)

    def get_attr(self, word):
        """Return the frequency of the word"""
        return self._trie.get_attr(word)
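A hypothetical usage of the class above (the word-bank line format, one word followed by whitespace-separated attribute values matching `tags`, is inferred from __init__; the file name and tag names here are made up):

# word_bank.txt might contain a line such as "apple n 42", which with
# tags=['pos', 'freq'] would be stored as {'pos': 'n', 'freq': '42'}
thesaurus = Thesaurus('word_bank.txt', ['pos', 'freq'])
print(thesaurus.has_word('apple'))   # True
print(thesaurus.get_attr('apple'))   # the attribute dict for 'apple'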
Example #51
class TestPreprocess(unittest.TestCase):
    def setUp(self):
        self.common_prefix = ''.join(
            random.choices(string.ascii_letters + string.digits, k=16))
        self.ending_1 = ''.join(
            random.choices(string.ascii_letters + string.digits, k=16))
        self.ending_2 = ''.join(
            random.choices(string.ascii_letters + string.digits, k=16))
        self.string_1 = self.common_prefix + self.ending_1
        self.string_2 = self.common_prefix + self.ending_2
        self.not_string = self.common_prefix + ''.join(
            random.choices(string.ascii_letters + string.digits, k=16))

        self.root = TrieNode("")
        self.trie = Trie(self.root)
        self.trie.add_sentence(self.root, self.string_1)
        self.trie.add_sentence(self.root, self.string_2)

    def test_contains(self):
        """
        Test to verify that Trie.contains() returns True on sentences that have been added to the Trie.
        Returns False for those that do not exist within the Trie.
        """
        self.assertTrue(self.trie.contains(self.root, self.string_1)[0])
        self.assertTrue(self.trie.contains(self.root, self.string_2)[0])
        self.assertFalse(self.trie.contains(self.root, self.not_string)[0])

    def test_return_completions_from_node(self):
        """
        Test to verify that Trie.return_completions_from_node() correctly enumerates the sentences that exist within the Trie.
        """
        node = self.trie.contains(self.root, self.common_prefix)[1]
        completions = self.trie.return_completions_from_node(node)
        # Although the unittest method is misleadingly named, it actually checks that two sequences contain the same elements
        self.assertCountEqual([self.ending_1, self.ending_2], completions)
Example #52
    def test_add(self):
        t = Trie()
        t.add('hello')
        self.assertEqual(t.trie,
                         {'h': {
                             'e': {
                                 'l': {
                                     'l': {
                                         'o': {
                                             '$': None
                                         }
                                     }
                                 }
                             }
                         }})
        t.add('hell')
        self.assertEqual(
            t.trie, {'h': {
                'e': {
                    'l': {
                        'l': {
                            'o': {
                                '$': None
                            },
                            '$': None
                        }
                    }
                }
            }})
Example #53
def build_offline_data_model(max_suggestions, min_words_partial,
                             input_filepath, output_filepath):
    """Perform all offline data-processing steps to build the data model.
    Includes:
    1) Normalization of the agents' responses [normalize_responses.py].
    2) Building the data model [data_model.py] based on the normalized 
       responses and the Trie class [trie.py].
    3) Storage (pickled file) of the built data model at output_filepath.

    Arguments
    ---------
    max_suggestions: int
        Maximum number of auto-complete suggestions.
    min_words_partial: int
        Minimum number of words in auto-complete suggestions of 
        partial sentences.
    input_filepath: str
        Relative path of the JSON file with conversations.
    output_filepath: str
        Relative path of the pickled file to be saved.
    """
    responses = extract_responses_from_JSON(input_filepath)
    print('Read in data. Starting processing of responses now...')
    ########################   Normalization   ########################
    start = time.time()
    # A signature is constructed for each sentence in the responses using
    # lemmatized forms of words.  signature_to_text is a dict whose values
    # are sentences with identical signatures that are grouped together
    # and represented as repeated copies of the same normalized form.
    signature_to_text = get_signature_to_text_map(responses)
    # Next, we flatten nested lists of these normalized sentences:
    responses_processed = list(chain.from_iterable(signature_to_text.values()))
    end = time.time()
    duration = round(end - start, 2)
    print("Finished normalizing agent responses in {} s".format(duration))
    ##############   Creating and Saving the Data Model   #############
    start = time.time()
    # A trie (prefix tree) is used here to construct a data model.
    # A trie allows efficiently (w.r.t. time) accessing all strings
    # matching a prefix.
    autocomplete_trie = Trie(max_suggestions, min_words_partial)
    for response in responses_processed:
        # insert responses to grow the trie
        autocomplete_trie.insert_response(response)

    end = time.time()
    duration = round(end - start, 2)
    print('Inserted normalized responses into the trie in {} s.'.format(
        duration))

    # Save the trie containing all agent responses
    start = time.time()
    autocomplete_trie.save(output_filepath)
    end = time.time()
    duration = round(end - start, 2)
    print(
        'Trie with normalized responses saved at: {}'.format(output_filepath))
    print('Serialization of the trie took {} s.'.format(duration))

    return autocomplete_trie
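A hypothetical invocation of the pipeline above (parameter values and file paths are illustrative, not taken from the original project):

if __name__ == '__main__':
    trie = build_offline_data_model(
        max_suggestions=5,
        min_words_partial=2,
        input_filepath='data/conversations.json',
        output_filepath='data/autocomplete_trie.pkl',
    )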
Example #54
    def findWords(self, board: List[List[str]], words: List[str]) -> List[str]:
        def rec(r, c, word=""):
            if not (0 <= r < m and 0 <= c < n):
                return False

            if board[r][c] == '#':
                return

            word += board[r][c]

            if not trie.starts_with(word):
                return

            if trie.search(word):
                res.add(word)
                # need to continue searching

            board[r][c] = '#'

            rec(r - 1, c, word)
            rec(r + 1, c, word)
            rec(r, c - 1, word)
            rec(r, c + 1, word)

            board[r][c] = word[-1]

        trie = Trie()

        for w in words:
            trie.insert(w)

        m = len(board)
        n = len(board[0])
        res = set()

        for i in range(m):
            for j in range(n):
                rec(i, j)

        return list(res)
Example #55
    def test_remove_one_but_not_both(self):
        trie = Trie()
        key1 = 'abc'
        key2 = 'abd'
        data1 = 123
        data2 = 987
        trie.insert(key1, data1)
        trie.insert(key2, data2)
        self.assertEqual(trie.remove(key1), data1)
        self.assertIsNone(trie.get(key1))
        self.assertEqual(trie.get(key2), data2)
Example #56
    def test_match_prefix_4(self):
        trie = Trie()
        key_at = 'at'
        key_absent = 'absent'
        trie.insert(key_at)
        trie.insert(key_absent)

        matches = trie.match_prefix('a')
        self.assertEqual(len(matches), 2)
        self.assertTrue(key_at in matches)
        self.assertTrue(key_absent in matches)
Example #57
def test_traverse_simple():
    """Test traverse method."""
    t = Trie()
    t.insert('water')
    t.insert('wash')
    gen = t.traverse('wa')
    trie_words = []
    for _ in range(2):
        trie_words.append(next(gen))
    for word in trie_words:
        assert word in ['wash', 'water']
Example #58
def test_dict_updates():
    """The dict of words stays updated."""
    from trie import Trie
    t = Trie()
    t.insert('Apple')
    t.insert('Banana')
    t.insert('Stalin')
    assert 'Stalin' in t.dict_of_words
Example #59
def main():
    # arg parsing
    parser = argparse.ArgumentParser(
        description="Process arguments for data creation")
    parser.add_argument('-a', type=str, help="ip address", default=HOST)
    parser.add_argument('-p', type=int, help="port", default=PORT)
    args = parser.parse_args()

    # init Trie
    trie = Trie()

    # listen for connections
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:

        print('Listening on ' + args.a + ':' + str(args.p))
        s.bind((args.a, args.p))
        s.listen()

        while True:
            conn, addr = s.accept()
            with conn:

                while True:
                    # receive request size
                    request_size = conn.recv(BUFFER_SIZE)
                    if not request_size:
                        break

                    # ack for request size
                    conn.sendall(OK.encode())

                    # read data from request
                    request = conn.recv(BUFFER_SIZE)
                    method = request.decode().split(" ")[0]
                    while len(request) < int(request_size):
                        data = conn.recv(BUFFER_SIZE)
                        request = request + data

                    # process request
                    request = request.decode()
                    if method == GET:
                        get(conn=conn, request=request, trie=trie)

                    elif method == PUT:
                        put(conn=conn, request=request, trie=trie)

                    elif method == QUERY:
                        query(conn=conn, request=request, trie=trie)

                    elif method == DELETE:
                        delete(conn=conn, request=request, trie=trie)
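The server loop above expects each request to be length-prefixed and acknowledged before the body is sent. A minimal client sketch matching that exchange (the host, port, buffer size, and the "GET <key>" wire format are assumptions read off the code above):

import socket

def send_get(key, host='127.0.0.1', port=9000, buffer_size=4096):
    request = 'GET ' + key
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.connect((host, port))
        s.sendall(str(len(request)).encode())   # announce the request size
        s.recv(buffer_size)                     # wait for the "OK" acknowledgement
        s.sendall(request.encode())             # send the actual request
        return s.recv(buffer_size).decode()     # read the server's response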
Example #60
class invertFile():
    def __init__(self):
        self.keyTrie = Trie()
        self.occurence_list = []
        self.list_length = 0

    def put(self, key, freq, seq, fileAddress,
            pageAddress):  ## PUT ELEMENT INTO INVERTFILE
        if self.keyTrie.isWordExist(key):  ## THIS IS AN EXISTING KEY
            occurence_list_index = self.keyTrie.searchValue(key)
            self.occurence_list[occurence_list_index][1] += freq
            self.occurence_list[occurence_list_index][2].append(
                (fileAddress, freq, seq, pageAddress))
            self.occurence_list[occurence_list_index][2].sort(
                key=lambda x: x[2])

        else:  ## THIS IS A NEW KEY
            ## CREATE OCCURENCE_LIST
            self.occurence_list.append(
                [key, freq, [(fileAddress, freq, seq, pageAddress)]])
            ## PUT IT IN TRIE
            self.keyTrie.insert(key, self.list_length)
            self.list_length += 1

    def get(self, key):  ## GET THE OCCURRENCE LIST OF A GIVEN KEY
        return self.occurence_list[self.keyTrie.searchValue(key)]

    def showDictionary(
            self):  ## PRINT THE WHOLE INVERTED FILE. THIS IS A TEST HELPER
        allkey = self.keyTrie.showAllKey()
        for key in allkey:
            print(key, self.keyTrie.searchValue(key), self.get(key))

    def saveInvertFile(
            self,
            fileName="Occurence_List.dat"):  ## SAVE THE ALL OCCURENCE LIST
        f = open(fileName, 'w')
        for [keyName, keyfreqency, Occurence_List] in self.occurence_list:
            f.write(keyName + "|||" + str(keyfreqency) + "|||")
            for (fileAddress, freqInPage, pageSeq,
                 pageAddress) in Occurence_List:
                f.write(fileAddress + "|||" + str(freqInPage) + "|||" +
                        str(pageSeq) + "|||" + pageAddress + "|||")
            f.write("\n")
        f.close()


# inf = invertFile()
# inf.put("nltk",10,"1.html",1,"www.q")
# inf.put("language",6,"1.html",1,"www.q")
# inf.put("nltk",10,"2.html",2,"www.e")
# print(inf.get("nltk"))
# inf.showDictionary()