def gen_zhengzhou_tree(dirname=myconfig.ZZ_STD_ADD,
                       sav_file=myconfig.zhengzhou_std_word,
                       sav_file_2=myconfig.zhengzhou_std_tree):
    """Build two tries from the comma-separated files found under *dirname*.

    Every file reached by ``os.walk(dirname)`` is read, its lines are
    shuffled in place with ``np.random.shuffle``, and for each line that
    contains a comma the second comma-separated field is taken as the raw
    address. The raw address is inserted into the "word" trie; the result of
    ``utils.pre_trans`` on it is inserted into the "tree" trie. Each
    (transformed, raw) pair is also written as a tab-separated line to
    ``./addr_match.txt``.

    Args:
        dirname: Directory that is walked for input files.
        sav_file: Destination passed to ``utils.save_var`` for the trie of
            raw addresses.
        sav_file_2: Destination passed to ``utils.save_var`` for the trie of
            transformed addresses.

    Returns:
        None.
    """
    print('\n>gen_zhengzhou_tree start')
    my_tree = trie_tree.Trie()
    my_word = trie_tree.Trie()
    cnt = 0
    # Context managers guarantee that the record file and every input file
    # are closed even when reading or transforming a line raises.
    with open("./addr_match.txt", 'w+') as addr_kv_rec:
        for root, _, fs in os.walk(dirname):
            for f in fs:
                # Join against the directory currently being walked so that
                # files located in subdirectories resolve to a real path.
                pth = os.path.join(root, str(f))
                with open(pth, 'r') as src:
                    lines = src.readlines()
                np.random.shuffle(lines)
                for line in lines:
                    if ',' not in line:
                        continue
                    _line = line.split(',')[1]
                    line = utils.pre_trans(_line)
                    addr_kv_rec.write('%s\t%s\n' % (str(line), str(_line)))
                    cnt += 1
                    # Progress output once every 10000 accepted lines.
                    if cnt % 10000 == 1:
                        print(cnt)
                    my_tree.insert(line)
                    my_word.insert(_line)
    utils.save_var(my_word, sav_file)
    utils.save_var(my_tree, sav_file_2)
    print('\n>my address tree save ok')
def test_format_df():
    """Manually exercise ``StandAddrTreeBuilder.format_tree`` on the saved tree.

    Loads the tree stored at ``myconfig.MY_TREE``, runs ``format_tree`` on its
    root, writes the tree back to the same location and then drops into the
    debugger so the result can be inspected interactively.
    """
    logger.debug('\n> 树的合并')
    from function_ultra import utils

    builder = StandAddrTreeBuilder()  # builder instance
    tree = utils.read_var(myconfig.MY_TREE)
    builder.format_tree(tree.root)
    utils.save_var(tree, myconfig.MY_TREE)

    # Interactive breakpoint: execution pauses here until the user continues.
    pdb.set_trace()
    logger.debug('\n> 树的合并测试完成')
def gen_address_tree(filename=myconfig.STDTXTPATH, sav_file=myconfig.MY_TREE):
    """Build a trie with one entry per line of *filename* and save it.

    Args:
        filename: Path of the text file to read.
        sav_file: Destination passed to ``utils.save_var``.

    Returns:
        The populated ``trie_tree.Trie``.
    """
    print('\n>gen_address_tree start')
    my_tree = trie_tree.Trie()
    # The context manager closes the file even if an insert raises.
    with open(filename, 'r') as src:
        lines = src.readlines()
    # Lines are inserted exactly as read, i.e. including any trailing newline.
    for sent in lines:
        my_tree.insert(sent)
    utils.save_var(my_tree, sav_file)
    print('\n>my address tree save ok')
    return my_tree
def gen_std_tree_from_dataframe(data_src, sav_file=myconfig.MY_TREE):
    """Build the standard address tree from the items of *data_src* and save it.

    Each item is passed through ``remove_nan`` and the result is handed to
    ``Trie.part_insert`` starting at the trie root.

    Args:
        data_src: Iterable of items to insert (presumably dataframe rows or
            row-like sequences -- TODO confirm against callers).
        sav_file: Destination passed to ``utils.save_var``.

    Returns:
        ``myconfig.SUCCESS``.
    """
    print('\n>gen_std_tree_from_dataframe start')
    my_tree = trie_tree.Trie()
    for item in data_src:
        clritem = remove_nan(item)
        print(clritem)
        # A leftover ``pdb.set_trace()`` used to sit here; it stopped the
        # build at every item and has been removed.
        my_tree.part_insert(my_tree.root, clritem)
    utils.save_var(my_tree, sav_file)
    print('\n>gen_std_tree_from_dataframe ready and save finish')
    return myconfig.SUCCESS
def gen_word_tree(filename=myconfig.STDTXTPATH, sav_file=myconfig.MY_WORD):
    """Build a trie of the '/'-separated pieces of every line and save it.

    Args:
        filename: Path of the text file to read.
        sav_file: Destination passed to ``utils.save_var``.

    Returns:
        The populated ``trie_tree.Trie``.
    """
    # The banner previously named gen_address_tree (copy-paste slip).
    print('\n>gen_word_tree start')
    my_tree = trie_tree.Trie()
    # The context manager closes the file even if an insert raises.
    with open(filename, 'r') as src:
        lines = src.readlines()
    print(len(lines))
    for sent in lines:
        # Pieces are inserted as split; the last piece of a line keeps the
        # line's trailing newline, if any.
        for word in sent.split('/'):
            my_tree.insert(word)
    utils.save_var(my_tree, sav_file)
    print('\n>my address tree save ok')
    return my_tree
def gen_std_tree(filename=myconfig.STDTXTPATH, sav_file=myconfig.MY_TREE, delimeter='/'):
    """Build a trie whose entries are the delimiter-split lines of *filename*.

    Each line is split on *delimeter* and the resulting list is inserted into
    the trie as a single entry.

    Args:
        filename: Path of the text file to read.
        sav_file: Destination passed to ``utils.save_var``.
        delimeter: Separator used to split each line (the parameter name's
            spelling is kept for backward compatibility with keyword callers).

    Returns:
        The populated ``trie_tree.Trie``.
    """
    print('\n>gen_std_tree start')
    my_tree = trie_tree.Trie()
    # The context manager closes the file even if an insert raises.
    with open(filename, 'r') as src:
        lines = src.readlines()
    for sent in lines:
        words = sent.split(delimeter)
        my_tree.insert(words)
    utils.save_var(my_tree, sav_file)
    print('\n>my std tree save ok')
    return my_tree
def first_init_DG(self):
    """Create the directed graph from the saved tree and persist it.

    Loads the tree stored at ``myconfig.MY_TREE``, fills a fresh
    ``nx.DiGraph`` through ``trans_tree_2_graph`` starting at the tree root,
    and saves the graph to ``myconfig.DIGRAPH``.

    Returns:
        A ``(tree, graph)`` tuple, where ``graph`` is the object read back
        from ``myconfig.DIGRAPH`` after saving.
    """
    tree = utils.read_var(myconfig.MY_TREE)
    graph = nx.DiGraph()
    tree.trans_tree_2_graph(tree.root, graph)
    utils.save_var(graph, myconfig.DIGRAPH)
    # Hand back the copy re-read from storage rather than the in-memory graph.
    reloaded = utils.read_var(myconfig.DIGRAPH)
    return tree, reloaded