def split_file(
    in_file,
    out_file,
    fields="bgg_user_name",
    trie_file=None,
    limits=LIMIT,
    construct=False,
):
    """Split the input file along prefixes.

    When ``trie_file`` is given and ``construct`` is false, the trie is first
    loaded from ``trie_file``. If that yields nothing (or loading was
    skipped), a full trie is built from ``in_file`` and reduced to a prefix
    trie once per entry in ``limits``. Each prefix trie is dumped as
    tab-separated ``prefix<TAB>count`` lines, either to the path obtained
    from ``trie_file.format(limit=limit)`` or to stdout when there is no such
    path or it is ``"-"``.

    The trie that is finally handed to ``_save_to_prefixes`` is the loaded
    one, or the one built for the last limit.
    """
    skip_loading = construct or not trie_file

    trie = None
    if not skip_loading:
        LOGGER.info("loading trie from file <%s>...", trie_file)
        trie = _trie_from_file(trie_file)

    if not trie:
        LOGGER.info("making trie for <%s>...", in_file)
        full_trie = _make_trie(file=in_file, fields=fields)

        # Fall back to the module default when no limit was supplied.
        for limit in tuple(arg_to_iter(limits)) or (LIMIT,):
            trie = Trie(_prefixes(full_trie, limit=limit))
            LOGGER.info("%d prefixes using limit %d", len(trie), limit)

            out_path = None
            if trie_file:
                out_path = trie_file.format(limit=limit)

            if out_path and out_path != "-":
                with open(out_path, "w") as file_obj:
                    file_obj.writelines(
                        f"{prefix}\t{count}\n" for prefix, count in trie.items()
                    )
            else:
                for prefix, count in trie.items():
                    print(f"{prefix}\t{count}")

    LOGGER.info("constructed trie of size %d", len(trie))
    _save_to_prefixes(dst=out_file, trie=trie, file=in_file, fields=fields)
class TestTrie(unittest.TestCase):
    """Tests for prefix queries, prefix-filtered views, pickling and repr of SortedStringTrie."""

    def setUp(self):
        """Build a trie that maps each sample word to its index in the word list."""
        self.words = 'an ant all allot alloy aloe are ate be'.split()
        self.trie = SortedStringTrie(zip(self.words, range(len(self.words))))

    def test_longest_prefix(self):
        """longest_prefix yields the longest stored key that prefixes the query."""
        self.assertEqual(self.trie.longest_prefix('antonym'), 'ant')
        self.assertEqual(self.trie.longest_prefix('are'), 'are')
        self.assertEqual(self.trie.longest_prefix('alla'), 'all')
        self.assertEqual(self.trie.longest_prefix('allo'), 'all')
        # No stored key prefixes 'alumni': without a default this must raise.
        self.assertRaises(KeyError, self.trie.longest_prefix, 'alumni')
        self.assertEqual(self.trie.longest_prefix('alumni', default=None), None)
        self.assertEqual(self.trie.longest_prefix('linux', default=-1), -1)

    def test_longest_prefix_value(self):
        """longest_prefix_value yields the value stored under the longest prefix."""
        self.assertEqual(self.trie.longest_prefix_value('antonym'), 1)
        self.assertEqual(self.trie.longest_prefix_value('are'), 6)
        self.assertEqual(self.trie.longest_prefix_value('alla'), 2)
        self.assertEqual(self.trie.longest_prefix_value('allo'), 2)
        self.assertRaises(KeyError, self.trie.longest_prefix_value, 'alumni')
        self.assertEqual(
            self.trie.longest_prefix_value('alumni', default=None), None)
        self.assertEqual(self.trie.longest_prefix_value('linux', default=-1), -1)

    def test_longest_prefix_item(self):
        """longest_prefix_item yields the (key, value) pair of the longest prefix."""
        self.assertEqual(self.trie.longest_prefix_item('antonym'), ('ant', 1))
        self.assertEqual(self.trie.longest_prefix_item('are'), ('are', 6))
        self.assertEqual(self.trie.longest_prefix_item('alla'), ('all', 2))
        self.assertEqual(self.trie.longest_prefix_item('allo'), ('all', 2))
        self.assertRaises(KeyError, self.trie.longest_prefix_item, 'alumni')
        self.assertEqual(
            self.trie.longest_prefix_item('alumni', default=None), None)
        self.assertEqual(self.trie.longest_prefix_item('linux', default=-1), -1)

    def test_iter_prefixes(self):
        """iter_prefixes yields every stored key that prefixes the query, shortest first."""
        self.assertEqual(list(self.trie.iter_prefixes('antonym')), ['an', 'ant'])
        self.assertEqual(list(self.trie.iter_prefixes('are')), ['are'])
        self.assertEqual(list(self.trie.iter_prefixes('alumni')), [])

    def test_iter_prefix_values(self):
        """iter_prefix_values yields the values of all prefixing keys."""
        self.assertEqual(list(self.trie.iter_prefix_values('antonym')), [0, 1])
        self.assertEqual(list(self.trie.iter_prefix_values('are')), [6])
        self.assertEqual(list(self.trie.iter_prefix_values('alumni')), [])

    def test_iter_prefix_items(self):
        """iter_prefix_items yields the (key, value) pairs of all prefixing keys."""
        self.assertEqual(list(self.trie.iter_prefix_items('antonym')),
                         [('an', 0), ('ant', 1)])
        self.assertEqual(list(self.trie.iter_prefix_items('are')), [('are', 6)])
        self.assertEqual(list(self.trie.iter_prefix_items('alumni')), [])

    def test_keys_wprefix(self):
        """keys(prefix) lists the stored keys that start with the prefix."""
        self.assertEqual(self.trie.keys('al'), ['all', 'allot', 'alloy', 'aloe'])
        self.assertEqual(self.trie.keys('are'), ['are'])
        self.assertEqual(self.trie.keys('ann'), [])

    def test_values_wprefix(self):
        """values(prefix) lists the values of the keys that start with the prefix."""
        self.assertEqual(self.trie.values('al'), [2, 3, 4, 5])
        self.assertEqual(self.trie.values('are'), [6])
        self.assertEqual(self.trie.values('ann'), [])

    def test_items_wprefix(self):
        """items(prefix) lists the (key, value) pairs whose key starts with the prefix."""
        self.assertEqual(self.trie.items('al'),
                         [('all', 2), ('allot', 3), ('alloy', 4), ('aloe', 5)])
        self.assertEqual(self.trie.items('are'), [('are', 6)])
        self.assertEqual(self.trie.items('ann'), [])

    def test_consistency_wprefix(self):
        """items(prefix) must agree with pairing keys(prefix) and values(prefix)."""
        trie = self.trie
        for prefix in 'al', 'are', 'ann':
            # zip() is lazy on Python 3, so materialise it before comparing
            # against the list returned by items().
            self.assertEqual(
                trie.items(prefix),
                list(zip(trie.keys(prefix), trie.values(prefix))))

    def test_pickle(self):
        """A pickle round trip yields an equal but distinct trie of the same type."""
        from pickle import dumps, loads, HIGHEST_PROTOCOL
        # range() excludes its stop value; add 1 so HIGHEST_PROTOCOL is covered.
        for proto in range(HIGHEST_PROTOCOL + 1):
            unpickled = loads(dumps(self.trie, proto))
            self.assertEqual(self.trie, unpickled)
            self.assertIs(type(self.trie), type(unpickled))
            self.assertIsNot(self.trie, unpickled)

    def test_repr(self):
        """Evaluating repr(trie) rebuilds an equal trie of the same class."""
        # eval() only ever sees the repr of the fixture built in setUp.
        evaled = eval(repr(self.trie))
        self.assertEqual(evaled, self.trie)
        self.assertEqual(evaled.__class__, self.trie.__class__)
class TestTrie(unittest.TestCase):
    """Checks SortedStringTrie prefix lookups, prefix views, pickling and repr."""

    def setUp(self):
        """Create the fixture: every sample word mapped to its list position."""
        self.words = 'an ant all allot alloy aloe are ate be'.split()
        self.trie = SortedStringTrie(zip(self.words, range(len(self.words))))

    def test_longest_prefix(self):
        """The longest stored key prefixing the query is returned."""
        longest_prefix = self.trie.longest_prefix
        self.assertEqual(longest_prefix('antonym'), 'ant')
        self.assertEqual(longest_prefix('are'), 'are')
        self.assertEqual(longest_prefix('alla'), 'all')
        self.assertEqual(longest_prefix('allo'), 'all')
        # Exercise longest_prefix itself for the "no prefix, no default" case.
        self.assertRaises(KeyError, longest_prefix, 'alumni')
        self.assertEqual(longest_prefix('alumni', default=None), None)
        self.assertEqual(longest_prefix('linux', default=-1), -1)

    def test_longest_prefix_value(self):
        """The value stored under the longest prefixing key is returned."""
        longest_prefix_value = self.trie.longest_prefix_value
        self.assertEqual(longest_prefix_value('antonym'), 1)
        self.assertEqual(longest_prefix_value('are'), 6)
        self.assertEqual(longest_prefix_value('alla'), 2)
        self.assertEqual(longest_prefix_value('allo'), 2)
        self.assertRaises(KeyError, longest_prefix_value, 'alumni')
        self.assertEqual(longest_prefix_value('alumni', default=None), None)
        self.assertEqual(longest_prefix_value('linux', default=-1), -1)

    def test_longest_prefix_item(self):
        """The (key, value) pair of the longest prefixing key is returned."""
        longest_prefix_item = self.trie.longest_prefix_item
        self.assertEqual(longest_prefix_item('antonym'), ('ant', 1))
        self.assertEqual(longest_prefix_item('are'), ('are', 6))
        self.assertEqual(longest_prefix_item('alla'), ('all', 2))
        self.assertEqual(longest_prefix_item('allo'), ('all', 2))
        self.assertRaises(KeyError, longest_prefix_item, 'alumni')
        self.assertEqual(longest_prefix_item('alumni', default=None), None)
        self.assertEqual(longest_prefix_item('linux', default=-1), -1)

    def test_iter_prefixes(self):
        """All stored keys prefixing the query are produced, shortest first."""
        self.assertEqual(list(self.trie.iter_prefixes('antonym')), ['an', 'ant'])
        self.assertEqual(list(self.trie.iter_prefixes('are')), ['are'])
        self.assertEqual(list(self.trie.iter_prefixes('alumni')), [])

    def test_iter_prefix_values(self):
        """The values of all prefixing keys are produced."""
        self.assertEqual(list(self.trie.iter_prefix_values('antonym')), [0, 1])
        self.assertEqual(list(self.trie.iter_prefix_values('are')), [6])
        self.assertEqual(list(self.trie.iter_prefix_values('alumni')), [])

    def test_iter_prefix_items(self):
        """The (key, value) pairs of all prefixing keys are produced."""
        self.assertEqual(list(self.trie.iter_prefix_items('antonym')),
                         [('an', 0), ('ant', 1)])
        self.assertEqual(list(self.trie.iter_prefix_items('are')), [('are', 6)])
        self.assertEqual(list(self.trie.iter_prefix_items('alumni')), [])

    def test_keys_wprefix(self):
        """keys(prefix) returns only the keys starting with the prefix."""
        self.assertEqual(self.trie.keys('al'), ['all', 'allot', 'alloy', 'aloe'])
        self.assertEqual(self.trie.keys('are'), ['are'])
        self.assertEqual(self.trie.keys('ann'), [])

    def test_values_wprefix(self):
        """values(prefix) returns the values of keys starting with the prefix."""
        self.assertEqual(self.trie.values('al'), [2, 3, 4, 5])
        self.assertEqual(self.trie.values('are'), [6])
        self.assertEqual(self.trie.values('ann'), [])

    def test_items_wprefix(self):
        """items(prefix) returns the pairs whose key starts with the prefix."""
        self.assertEqual(self.trie.items('al'),
                         [('all', 2), ('allot', 3), ('alloy', 4), ('aloe', 5)])
        self.assertEqual(self.trie.items('are'), [('are', 6)])
        self.assertEqual(self.trie.items('ann'), [])

    def test_consistency_wprefix(self):
        """items(prefix) equals keys(prefix) paired with values(prefix)."""
        t = self.trie
        for prefix in 'al', 'are', 'ann':
            self.assertEqual(
                t.items(prefix),
                list(zip(t.keys(prefix), t.values(prefix)))
            )

    def test_pickle(self):
        """Every pickle protocol round-trips to an equal, distinct, same-typed trie."""
        from pickle import dumps, loads, HIGHEST_PROTOCOL
        # The stop value of range() is exclusive, hence the + 1 to include
        # HIGHEST_PROTOCOL itself.
        for proto in range(HIGHEST_PROTOCOL + 1):
            unpickled = loads(dumps(self.trie, proto))
            self.assertEqual(self.trie, unpickled)
            self.assertIs(type(self.trie), type(unpickled))
            self.assertIsNot(self.trie, unpickled)

    def test_repr(self):
        """repr(trie) evaluates back to an equal trie of the same class."""
        # eval() is applied only to the repr of the fixture created in setUp.
        evaled = eval(repr(self.trie))
        self.assertEqual(evaled, self.trie)
        self.assertEqual(evaled.__class__, self.trie.__class__)
class AutoComplete:
    """Sentence auto-completion index backed by two tries.

    Attributes set in ``__init__``:
        full_match: maps a cleaned sentence fragment (at most
            ``MAXIMUM_NUM_CHARS`` characters) to a list of
            ``(filename, line_num)`` pairs.
        repairs_match: maps a cleaned prefix to a list of
            ``(source_text, line_num, offset, score)`` tuples.
        full_sentences: maps ``(filename, line_num)`` to the raw line text.
    """

    # Maximum length of the fragments used as keys in ``full_match``.
    MAXIMUM_NUM_CHARS = 10
    # Cap on entries stored per key and on suggestions returned per query.
    NUM_OF_SUGGESTIONS = 5

    def __init__(self):
        """Create empty tries and an empty sentence table."""
        self.full_match = Trie()
        self.repairs_match = Trie()
        self.full_sentences = {}

    def prepare_full_match(self, folder_path):
        """Index every non-trivial line of every ``.txt`` file under ``folder_path``.

        Files are found recursively. Progress is printed per file, and the
        number of files is printed at the end.
        """
        count = 0
        for file in glob.glob(folder_path + "/**/*.txt", recursive=True):
            count += 1
            print(count, file)
            with open(file, "r", encoding="utf8") as txt_file:
                for line_num, sen in enumerate(txt_file):
                    # Lines of length <= 1 (e.g. a bare newline) are skipped.
                    if len(sen) > 1:
                        self.process_line(file, sen, line_num)
        print("count", count)

    def process_line(self, file, sen, line_num):
        """Index the word-aligned suffixes of one line.

        For each suffix starting at a word boundary, the first
        ``MAXIMUM_NUM_CHARS`` characters of the cleaned suffix are used as a
        key in ``full_match``. The raw line is recorded in
        ``full_sentences`` only if at least one key accepted it.

        Any exception is reported with the offending line and re-raised.
        """
        max_chars = AutoComplete.MAXIMUM_NUM_CHARS
        try:
            words = AutoComplete.clean_line(sen).split()
            inserted = False
            # The range stops before the last index, so the suffix made of
            # only the final word is never indexed.
            for start in range(len(words) - 1):
                fragment = " ".join(words[start:])[:max_chars]
                if self.sentence_exist_for_prefix(sen, fragment):
                    continue
                if self.insert_full_match_sentence(file, fragment, line_num):
                    inserted = True
            if inserted:
                self.full_sentences[(file, line_num)] = sen
        except Exception:
            print("ERROR", sen)
            raise

    def sentence_exist_for_prefix(self, sen, prefix):
        """Return True if a sentence stored under ``prefix`` contains, or is contained in, ``sen``.

        Returns False when ``prefix`` has no entry, or as soon as an entry
        refers to a sentence that is not (yet) present in ``full_sentences``.
        """
        try:
            for location in self.full_match[prefix]:
                stored = self.full_sentences[location]
                if stored in sen or sen in stored:
                    return True
        except KeyError:
            # Unknown prefix, or a location whose sentence was not recorded.
            return False
        return False

    def insert_full_match_sentence(self, filename, cropped, line_num):
        """Record ``(filename, line_num)`` under key ``cropped`` in ``full_match``.

        Returns True if the pair was stored, False if the key already holds
        ``NUM_OF_SUGGESTIONS`` entries.
        """
        try:
            sug_len = len(self.full_match[cropped])
        except KeyError:
            self.full_match[cropped] = []
            sug_len = 0
        if sug_len < AutoComplete.NUM_OF_SUGGESTIONS:
            self.full_match[cropped].append((filename, line_num))
            return True
        return False

    def _collect_from_trie(self, trie, clean_prefix, sugs, mode):
        """Append suggestions found in ``trie`` to ``sugs``.

        ``mode`` is forwarded to ``get_auto_complete_data`` so the stored
        tuples are unpacked with the right layout. Returns True as soon as
        ``sugs`` holds ``NUM_OF_SUGGESTIONS`` entries, otherwise False.
        """
        limit = AutoComplete.NUM_OF_SUGGESTIONS
        for _key, entries in trie.items(prefix=clean_prefix)[:limit]:
            for entry in entries:
                sugs.append(
                    self.get_auto_complete_data(entry, clean_prefix, mode=mode))
                if len(sugs) >= limit:
                    return True
        return False

    def __call__(self, prefix, mode="online"):
        """Return up to ``NUM_OF_SUGGESTIONS`` completions for ``prefix``.

        Exact matches from ``full_match`` come first, then stored repairs
        from ``repairs_match``. When ``mode`` is ``"prepare"`` and there is
        still room, new repairs are predicted, stored in ``repairs_match``
        and appended to the result.
        """
        clean_prefix = AutoComplete.clean_line(prefix)
        sugs = []

        # Exact matches: entries are (filename, line_num) pairs.
        if self._collect_from_trie(
                self.full_match, clean_prefix, sugs, "full_match"):
            return sugs
        print(len(sugs))

        # Stored repairs: entries are 4-tuples, so they must NOT be unpacked
        # with the "full_match" layout.
        # TODO: the repairs are not sorted yet.
        if self._collect_from_trie(
                self.repairs_match, clean_prefix, sugs, "repairs"):
            return sugs
        print(len(sugs))

        missing = AutoComplete.NUM_OF_SUGGESTIONS - len(sugs)
        if missing > 0 and mode == "prepare":
            repairs_sugs = self.predict_prefix_and_update_from_repairs_trie(
                clean_prefix, missing)
            for repair in repairs_sugs:
                self.update_repair_sentence(clean_prefix, repair)
            sugs += repairs_sugs
        return sugs

    def update_repair_sentence(self, prefix, repair_data):
        """Store a repair under ``prefix`` unless the key is already full.

        ``repair_data`` must expose ``source_text``, ``line_num``, ``offset``
        and ``score``; they are stored as a 4-tuple in that order.
        """
        try:
            sug_len = len(self.repairs_match[prefix])
        except KeyError:
            self.repairs_match[prefix] = []
            sug_len = 0
        if sug_len < AutoComplete.NUM_OF_SUGGESTIONS:
            self.repairs_match[prefix].append(
                (repair_data.source_text, repair_data.line_num,
                 repair_data.offset, repair_data.score))

    def predict_prefix_and_update_from_repairs_trie(self,
                                                    prefix,
                                                    amount=NUM_OF_SUGGESTIONS,
                                                    mode="offline"):
        """Return up to ``amount`` suggestions for variants of ``prefix``.

        Variants and their penalties come from ``get_mistakes_by_penalty``;
        each variant is looked up in ``full_match``. ``mode`` is accepted
        but not used.
        """
        limit = AutoComplete.NUM_OF_SUGGESTIONS
        suggestions = []
        for option, penalty in get_mistakes_by_penalty(prefix):
            for _key, entries in self.full_match.items(prefix=option)[:limit]:
                for entry in entries:
                    suggestions.append(
                        self.get_auto_complete_data(entry, option, penalty))
                    if len(suggestions) >= amount:
                        return suggestions
        return suggestions

    @staticmethod
    def clean_line(sentence):
        """Lower-case and strip ``sentence``, turning each run of non-word characters into one space.

        Stripping happens before the substitution, so punctuation at either
        end still becomes a single space in the result.
        """
        # Each maximal run of non-word characters (spaces included) becomes
        # one space, so no further space-collapsing pass is needed.
        return re.sub(r'\W+', ' ', sentence.lower().strip())

    def get_auto_complete_data(self,
                               file_line_tuple,
                               clean_prefix,
                               penalty=0,
                               mode="full_match"):
        """Build an ``AutoCompleteData`` from a stored trie entry.

        With ``mode == "full_match"`` the entry is a ``(filename, line_num)``
        pair; the score is ``2 * len(clean_prefix) - penalty`` and the offset
        is the position of ``clean_prefix`` in the cleaned sentence. With any
        other mode the entry is a ``(filename, line_num, offset, score)``
        tuple whose offset and score are used as stored.
        """
        if mode == "full_match":
            filename, line_num = file_line_tuple
            full_sentence = self.full_sentences[file_line_tuple]
            score = 2 * len(clean_prefix) - penalty  # TODO
            offset = AutoComplete.clean_line(full_sentence).find(clean_prefix)
        else:
            filename, line_num, offset, score = file_line_tuple
            full_sentence = self.full_sentences[(filename, line_num)]
        return AutoCompleteData(full_sentence, filename, offset, score,
                                line_num)

    def observer_update_function(self):
        """Start a background thread that pickles this object to disk."""
        # save_to_pkl is already bound; the extra argument is absorbed by its
        # *args and otherwise ignored.
        thread = Thread(target=self.save_to_pkl, args=(self, ))
        thread.start()

    def save_to_pkl(self, *args):
        """Pickle this object into ``pkl_files_updated/`` under a timestamped name.

        Positional arguments are accepted and ignored.
        """
        date_time = datetime.now().strftime("%d_%m_%Y__%H_%M_%S")
        with open(
                f"pkl_files_updated/auto_complete_with_repairs_{date_time}.pkl",
                "wb") as pkl_file:
            pickle.dump(self, pkl_file)
class TestTrie(unittest.TestCase):
    """Unit tests for SortedStringTrie: prefix lookups, prefix views, pickling, repr."""

    def setUp(self):
        """Populate a trie with the sample words, each mapped to its list index."""
        self.words = "an ant all allot alloy aloe are ate be".split()
        self.trie = SortedStringTrie(zip(self.words, range(len(self.words))))

    def test_longest_prefix(self):
        """longest_prefix gives the longest stored key that prefixes the query."""
        self.assertEqual(self.trie.longest_prefix("antonym"), "ant")
        self.assertEqual(self.trie.longest_prefix("are"), "are")
        self.assertEqual(self.trie.longest_prefix("alla"), "all")
        self.assertEqual(self.trie.longest_prefix("allo"), "all")
        # With no prefixing key and no default, the lookup has to raise.
        self.assertRaises(KeyError, self.trie.longest_prefix, "alumni")
        self.assertEqual(self.trie.longest_prefix("alumni", default=None), None)
        self.assertEqual(self.trie.longest_prefix("linux", default=-1), -1)

    def test_longest_prefix_value(self):
        """longest_prefix_value gives the value of the longest prefixing key."""
        self.assertEqual(self.trie.longest_prefix_value("antonym"), 1)
        self.assertEqual(self.trie.longest_prefix_value("are"), 6)
        self.assertEqual(self.trie.longest_prefix_value("alla"), 2)
        self.assertEqual(self.trie.longest_prefix_value("allo"), 2)
        self.assertRaises(KeyError, self.trie.longest_prefix_value, "alumni")
        self.assertEqual(
            self.trie.longest_prefix_value("alumni", default=None), None)
        self.assertEqual(self.trie.longest_prefix_value("linux", default=-1), -1)

    def test_longest_prefix_item(self):
        """longest_prefix_item gives the (key, value) pair of the longest prefixing key."""
        self.assertEqual(self.trie.longest_prefix_item("antonym"), ("ant", 1))
        self.assertEqual(self.trie.longest_prefix_item("are"), ("are", 6))
        self.assertEqual(self.trie.longest_prefix_item("alla"), ("all", 2))
        self.assertEqual(self.trie.longest_prefix_item("allo"), ("all", 2))
        self.assertRaises(KeyError, self.trie.longest_prefix_item, "alumni")
        self.assertEqual(
            self.trie.longest_prefix_item("alumni", default=None), None)
        self.assertEqual(self.trie.longest_prefix_item("linux", default=-1), -1)

    def test_iter_prefixes(self):
        """iter_prefixes walks the stored keys that prefix the query, shortest first."""
        self.assertEqual(list(self.trie.iter_prefixes("antonym")), ["an", "ant"])
        self.assertEqual(list(self.trie.iter_prefixes("are")), ["are"])
        self.assertEqual(list(self.trie.iter_prefixes("alumni")), [])

    def test_iter_prefix_values(self):
        """iter_prefix_values walks the values of the prefixing keys."""
        self.assertEqual(list(self.trie.iter_prefix_values("antonym")), [0, 1])
        self.assertEqual(list(self.trie.iter_prefix_values("are")), [6])
        self.assertEqual(list(self.trie.iter_prefix_values("alumni")), [])

    def test_iter_prefix_items(self):
        """iter_prefix_items walks the (key, value) pairs of the prefixing keys."""
        self.assertEqual(list(self.trie.iter_prefix_items("antonym")),
                         [("an", 0), ("ant", 1)])
        self.assertEqual(list(self.trie.iter_prefix_items("are")), [("are", 6)])
        self.assertEqual(list(self.trie.iter_prefix_items("alumni")), [])

    def test_keys_wprefix(self):
        """keys(prefix) is restricted to keys beginning with the prefix."""
        self.assertEqual(self.trie.keys("al"), ["all", "allot", "alloy", "aloe"])
        self.assertEqual(self.trie.keys("are"), ["are"])
        self.assertEqual(self.trie.keys("ann"), [])

    def test_values_wprefix(self):
        """values(prefix) is restricted to keys beginning with the prefix."""
        self.assertEqual(self.trie.values("al"), [2, 3, 4, 5])
        self.assertEqual(self.trie.values("are"), [6])
        self.assertEqual(self.trie.values("ann"), [])

    def test_items_wprefix(self):
        """items(prefix) is restricted to keys beginning with the prefix."""
        self.assertEqual(self.trie.items("al"),
                         [("all", 2), ("allot", 3), ("alloy", 4), ("aloe", 5)])
        self.assertEqual(self.trie.items("are"), [("are", 6)])
        self.assertEqual(self.trie.items("ann"), [])

    def test_consistency_wprefix(self):
        """items(prefix) matches keys(prefix) zipped with values(prefix)."""
        trie = self.trie
        for prefix in "al", "are", "ann":
            # On Python 3 zip() returns an iterator, which never compares
            # equal to a list; materialise it first.
            expected = list(zip(trie.keys(prefix), trie.values(prefix)))
            self.assertEqual(trie.items(prefix), expected)

    def test_pickle(self):
        """Pickling with each protocol gives back an equal, separate trie of the same type."""
        from pickle import dumps, loads, HIGHEST_PROTOCOL
        # + 1 because range() leaves out its stop value and HIGHEST_PROTOCOL
        # itself should be exercised.
        for proto in range(HIGHEST_PROTOCOL + 1):
            unpickled = loads(dumps(self.trie, proto))
            self.assertEqual(self.trie, unpickled)
            self.assertIs(type(self.trie), type(unpickled))
            self.assertIsNot(self.trie, unpickled)

    def test_repr(self):
        """eval(repr(trie)) reconstructs an equal trie of the same class."""
        # The evaluated text is the repr of the fixture built in setUp.
        evaled = eval(repr(self.trie))
        self.assertEqual(evaled, self.trie)
        self.assertEqual(evaled.__class__, self.trie.__class__)
class TestTrie(unittest.TestCase):
    """Tests SortedStringTrie prefix lookups, prefix views, empty-key handling, pickling and repr."""

    def setUp(self):
        """
        Prepare the environment before each test (setUp).
        :return:
        """
        self.words = 'an ant all allot alloy aloe are ate be'.split()
        self.trie = SortedStringTrie(zip(self.words, range(len(self.words))))

    def test_longest_prefix(self):
        """
        A method whose name starts with 'test' is a test case.
        :return:
        """
        self.assertEqual(self.trie.longest_prefix('antonym'), 'ant')
        self.assertEqual(self.trie.longest_prefix('are'), 'are')
        self.assertEqual(self.trie.longest_prefix('alla'), 'all')
        self.assertEqual(self.trie.longest_prefix('allo'), 'all')
        # The method under test here is longest_prefix: with no prefixing
        # key and no default it has to raise.
        self.assertRaises(KeyError, self.trie.longest_prefix, 'alumni')
        self.assertEqual(self.trie.longest_prefix('alumni', default=None), None)
        self.assertEqual(self.trie.longest_prefix('linux', default=-1), -1)

    def test_longest_prefix_value(self):
        """The value of the longest prefixing key is returned, or the default."""
        self.assertEqual(self.trie.longest_prefix_value('antonym'), 1)
        self.assertEqual(self.trie.longest_prefix_value('are'), 6)
        self.assertEqual(self.trie.longest_prefix_value('alla'), 2)
        self.assertEqual(self.trie.longest_prefix_value('allo'), 2)
        self.assertRaises(KeyError, self.trie.longest_prefix_value, 'alumni')
        self.assertEqual(
            self.trie.longest_prefix_value('alumni', default=None), None)
        self.assertEqual(self.trie.longest_prefix_value('linux', default=-1), -1)

    def test_longest_prefix_item(self):
        """The (key, value) pair of the longest prefixing key is returned, or the default."""
        self.assertEqual(self.trie.longest_prefix_item('antonym'), ('ant', 1))
        self.assertEqual(self.trie.longest_prefix_item('are'), ('are', 6))
        self.assertEqual(self.trie.longest_prefix_item('alla'), ('all', 2))
        self.assertEqual(self.trie.longest_prefix_item('allo'), ('all', 2))
        self.assertRaises(KeyError, self.trie.longest_prefix_item, 'alumni')
        self.assertEqual(
            self.trie.longest_prefix_item('alumni', default=None), None)
        self.assertEqual(self.trie.longest_prefix_item('linux', default=-1), -1)

    def test_iter_prefixes(self):
        """Stored keys that prefix the query are iterated, shortest first."""
        self.assertEqual(list(self.trie.iter_prefixes('antonym')), ['an', 'ant'])
        self.assertEqual(list(self.trie.iter_prefixes('are')), ['are'])
        self.assertEqual(list(self.trie.iter_prefixes('alumni')), [])

    def test_iter_prefix_values(self):
        """Values of the prefixing keys are iterated."""
        self.assertEqual(list(self.trie.iter_prefix_values('antonym')), [0, 1])
        self.assertEqual(list(self.trie.iter_prefix_values('are')), [6])
        self.assertEqual(list(self.trie.iter_prefix_values('alumni')), [])

    def test_iter_prefix_items(self):
        """(key, value) pairs of the prefixing keys are iterated."""
        self.assertEqual(list(self.trie.iter_prefix_items('antonym')),
                         [('an', 0), ('ant', 1)])
        self.assertEqual(list(self.trie.iter_prefix_items('are')), [('are', 6)])
        self.assertEqual(list(self.trie.iter_prefix_items('alumni')), [])

    def test_keys_wprefix(self):
        """keys(prefix) lists only keys that start with the prefix."""
        self.assertEqual(self.trie.keys('al'), ['all', 'allot', 'alloy', 'aloe'])
        self.assertEqual(self.trie.keys('are'), ['are'])
        self.assertEqual(self.trie.keys('ann'), [])

    def test_values_wprefix(self):
        """values(prefix) lists the values of keys that start with the prefix."""
        self.assertEqual(self.trie.values('al'), [2, 3, 4, 5])
        self.assertEqual(self.trie.values('are'), [6])
        self.assertEqual(self.trie.values('ann'), [])

    def test_items_wprefix(self):
        """items(prefix) lists the pairs whose key starts with the prefix."""
        self.assertEqual(self.trie.items('al'),
                         [('all', 2), ('allot', 3), ('alloy', 4), ('aloe', 5)])
        self.assertEqual(self.trie.items('are'), [('are', 6)])
        self.assertEqual(self.trie.items('ann'), [])

    def test_consistency_wprefix(self):
        """items(prefix) agrees with keys(prefix) paired with values(prefix)."""
        trie = self.trie
        for prefix in 'al', 'are', 'ann':
            self.assertEqual(
                trie.items(prefix),
                list(zip(trie.keys(prefix), trie.values(prefix)))
            )

    def test_empty_string(self):
        """An empty-string key is a prefix of every query and sorts first."""
        self.trie[''] = '!'
        self.assertEqual(self.trie.keys(''),
                         ['', 'all', 'allot', 'alloy', 'aloe', 'an', 'ant',
                          'are', 'ate', 'be'])
        self.assertEqual(self.trie.values(''),
                         ['!', 2, 3, 4, 5, 0, 1, 6, 7, 8])
        self.assertEqual(self.trie.items(''),
                         [('', '!'), ('all', 2), ('allot', 3), ('alloy', 4),
                          ('aloe', 5), ('an', 0), ('ant', 1), ('are', 6),
                          ('ate', 7), ('be', 8)])
        self.assertEqual(list(self.trie.iter_prefixes('foo')), [''])
        self.assertEqual(list(self.trie.iter_prefix_values('foo')), ['!'])
        self.assertEqual(list(self.trie.iter_prefix_items('foo')), [('', '!')])
        self.assertEqual(self.trie.longest_prefix('foo'), '')
        self.assertEqual(self.trie.longest_prefix_value('foo'), '!')
        self.assertEqual(self.trie.longest_prefix_item('foo'), ('', '!'))

    def test_pickle(self):
        """Each pickle protocol round-trips to an equal, distinct trie of the same type."""
        from pickle import dumps, loads, HIGHEST_PROTOCOL
        # range() excludes its stop value, so + 1 is needed to cover
        # HIGHEST_PROTOCOL as well.
        for proto in range(HIGHEST_PROTOCOL + 1):
            unpickled = loads(dumps(self.trie, proto))
            self.assertEqual(self.trie, unpickled)
            self.assertIs(type(self.trie), type(unpickled))
            self.assertIsNot(self.trie, unpickled)

    def test_repr(self):
        """Evaluating repr(trie) yields an equal trie of the same class."""
        # Only the repr of the fixture built in setUp is passed to eval().
        evaled = eval(repr(self.trie))
        self.assertEqual(evaled, self.trie)
        self.assertEqual(evaled.__class__, self.trie.__class__)