def test_startswith(self):
    """startswith() yields matching keys in sorted order, independent of insertion order."""
    trie = PrefixSet()
    all_keys = ["".join(chars) for chars in itertools.product("abc", repeat=3)]
    # Insert in descending order to prove ordering comes from the structure.
    for k in reversed(all_keys):
        trie.add(k)
    expected = [k for k in all_keys if k.startswith("ab")]
    self.assertSequenceEqual(expected, list(trie.startswith("ab")))
def test_startswith(self):
    """Keys sharing the prefix 'ab' come back from startswith() in sorted order."""
    ps = PrefixSet()
    universe = ["".join(t) for t in itertools.product("abc", repeat=3)]
    for key in universe[::-1]:  # reverse insertion order on purpose
        ps.add(key)
    matches = [key for key in universe if key.startswith("ab")]
    self.assertSequenceEqual(matches, list(ps.startswith("ab")))
def insert_search_delete(self, keys):
    """Round-trip *keys* through add / membership / discard on a fresh set.

    After all discards the set must be empty and its root node must hold
    no children.
    """
    ps = PrefixSet()
    for key in keys:
        ps.add(key)
    # Duplicates collapse: size equals the number of distinct keys.
    self.assertEqual(len(ps), len(set(keys)))
    for key in keys:
        self.assertIn(key, ps)
    for key in keys:
        ps.discard(key)
    self.assertEqual(len(ps), 0)
    for key in keys:
        self.assertFalse(key in ps)
    # Internal check: deletion must also prune the underlying tree.
    self.assertEqual(len(ps._root), 0)
def test_iter_post_el(self):
    """Iterating must not blow up after an element has been removed."""
    ps = PrefixSet(['a', 'b', 'c'])
    ps.remove('b')
    list(ps)  # regression check: must complete without raising
def test_commonprefix_full(self):
    """A stored key that matches entirely is its own common prefix."""
    ps = PrefixSet(['abcd'])
    result = ps.commonprefix('abcd')
    self.assertEqual('abcd', result)
def test_commonprefix_half(self):
    """A partial match returns only the shared leading portion.

    NOTE(review): the expected value is bytes (b'ab') for a str query —
    presumably the library normalises keys internally; confirm against
    the PrefixSet API.
    """
    ps = PrefixSet(['abcd'])
    shared = ps.commonprefix('abef')
    self.assertEqual(b'ab', shared)
def test_commonprefix_empty(self):
    """No overlap at all yields an empty (bytes) prefix."""
    ps = PrefixSet(['abcd'])
    shared = ps.commonprefix('efgh')
    self.assertEqual(b'', shared)
def test_sort_order(self):
    """Iteration yields keys lexicographically even after reverse insertion."""
    ps = PrefixSet()
    ordered = ['', 'a', 'aa', 'ab', 'b', 'ba']
    for key in ordered[::-1]:
        ps.add(key)
    self.assertSequenceEqual(ordered, list(ps))
def test_sort_order(self):
    """Keys come back in sorted order no matter how they were inserted."""
    expected = ["", "a", "aa", "ab", "b", "ba"]
    trie = PrefixSet()
    for key in reversed(expected):
        trie.add(key)
    self.assertSequenceEqual(expected, [k for k in trie])
def test_init_iterable(self):
    """The constructor accepts an iterable of initial keys."""
    self.assertIn('a', PrefixSet(['a']))
def test_pickle(self):
    """A populated set must serialise under the highest pickle protocol."""
    ps = PrefixSet()
    ps.add("a")
    # Any failure here raises; no explicit assertion needed.
    pickle.dumps(ps, pickle.HIGHEST_PROTOCOL)
def test_reversed(self):
    """reversed() walks the keys in descending lexicographic order."""
    trie = PrefixSet()
    keys = ["".join(combo) for combo in itertools.product("abc", repeat=3)]
    for key in keys:
        trie.add(key)
    self.assertSequenceEqual(keys[::-1], list(reversed(trie)))
def test_iter_post_el(self):
    """A full iteration after remove() must succeed without errors."""
    trie = PrefixSet(["a", "b", "c"])
    trie.remove("b")
    for _ in trie:  # exhaust the iterator; equivalent to list(trie)
        pass
def test_commonprefix_full(self):
    """commonprefix() of an exact stored key is the whole key."""
    self.assertEqual("abcd", PrefixSet(["abcd"]).commonprefix("abcd"))
def test_commonprefix_half(self):
    """Only the shared leading characters are returned for a partial match.

    NOTE(review): expected value is bytes for a str query — presumably the
    implementation stores keys as bytes; verify against the library docs.
    """
    trie = PrefixSet(["abcd"])
    self.assertEqual(b"ab", trie.commonprefix("abef"))
def test_commonprefix_empty(self):
    """Disjoint strings share an empty (bytes) common prefix."""
    trie = PrefixSet(["abcd"])
    self.assertEqual(b"", trie.commonprefix("efgh"))
def test_pickle(self):
    """Pickling a non-empty set at the highest protocol must not raise."""
    trie = PrefixSet()
    trie.add('a')
    pickle.dumps(trie, protocol=pickle.HIGHEST_PROTOCOL)
def test_invalid_key(self):
    """Adding a non-string key must raise TypeError.

    The previous assertion, ``assertRaises(TypeError, operator.setitem,
    pd, 0)``, passed only because ``operator.setitem`` itself requires
    three arguments — PrefixSet was never exercised, so the test was
    vacuous.  Exercise the real mutation API (``add``, used throughout
    this suite) with an invalid key instead.
    """
    pd = PrefixSet()
    self.assertRaises(TypeError, pd.add, 0)
# -*- coding: utf-8 -*-
"""Extract place names from areas.json into places.txt and sanity-check lookups."""
import json
import os
import re
from prefixtree import PrefixSet
import ast

# Renamed from `file` to avoid shadowing the builtin.
areas_path = os.path.join(os.path.dirname(__file__), "../dictionaries/areas.json")
places = os.path.join(os.path.dirname(__file__), "../dictionaries/places.txt")

ps = PrefixSet()
with open(areas_path, 'r') as f, \
        open(places, 'w') as out:
    content = f.read()
    # Grab every double-quoted token straight out of the raw JSON text.
    spaces = re.findall('"[^"]+"', content)
    for s in spaces:
        # literal_eval decodes the quoted literal, including escape sequences.
        space = ast.literal_eval(s)
        ps.add(space)
        out.write("%s 30000 ns\n" % space)
# 北京 34488 ns
assert "大连" not in ps
# BUG FIX: startswith() returns an iterator (see the list(...) wrapping in
# the tests), which is always truthy — the old bare assert could never fail.
# Materialise it so the assertion actually verifies a match exists.
assert list(ps.startswith("大连"))
for x in ps.startswith("大连"):
    print(x)
def test_reversed(self):
    """Reverse iteration mirrors the forward (sorted) key order."""
    ps = PrefixSet()
    keys = ["".join(t) for t in itertools.product("abc", repeat=3)]
    for k in keys:
        ps.add(k)
    expected = list(reversed(keys))
    self.assertSequenceEqual(expected, list(reversed(ps)))
def test_startswith_empty(self):
    """A prefix matching no stored key yields an empty result."""
    ps = PrefixSet()
    ps.add("a")
    self.assertSequenceEqual([], list(ps.startswith("b")))
def test_startswith_empty(self):
    """startswith() on an absent prefix produces no keys at all."""
    trie = PrefixSet()
    trie.add('a')
    hits = list(trie.startswith('b'))
    self.assertSequenceEqual([], hits)
# Combine NTUSD sentiment dictionaries and load place names into the tokenizer.
# NOTE(review): DICTIONARIES_DIR and tokenizer are defined elsewhere in this file.
DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "data")
pos_result = os.path.join(DATA_DIR, "pos_combined.txt")
neg_result = os.path.join(DATA_DIR, "neg_combined.txt")
ntusd_dir = os.path.join(DICTIONARIES_DIR, "台湾大学NTUSD简体中文情感词典")
pos_file = os.path.join(ntusd_dir, "NTUSD_positive_simplified.txt")
neg_file = os.path.join(ntusd_dir, "NTUSD_negative_simplified.txt")
new_line = "%s\n"
ps = PrefixSet()
pos_sentences = set()
neg_sentences = set()
places = os.path.join(os.path.dirname(__file__), "../dictionaries/places.txt")
with open(places) as f:
    tokenizer.load_userdict(f)
    # BUG FIX: load_userdict() reads the file object to the end (jieba-style
    # API — confirm), so the loop below used to iterate an exhausted handle
    # and never populated ps.  Rewind before re-reading.
    f.seek(0)
    for line in f:
        fields = line.strip().split()
        if not fields:
            # Skip blank lines instead of raising IndexError on [0].
            continue
        ps.add(fields[0])
# print(pseg.lcut("大连"))
# x, y = pseg.lcut("大连")[0]
# assert y == "ns"