def test_get_fuzzy():
    """Check the matches returned by ``GetFuzzy('tüv koid', 2)``.

    The first two matches must equal the expected (key, value) pairs in
    order, and the lookup must return exactly two matches overall.
    NOTE(review): the second ``GetFuzzy`` argument is presumably the maximum
    edit distance -- confirm against the keyvi API.
    """
    entries = [
        ("türkei news", 23698),
        ("türkei side", 18838),
        ("türkei urlaub", 23424),
        ("türkisch anfänger", 20788),
        ("türkisch für", 21655),
        ("türkisch für anfänger", 20735),
        ("türkçe dublaj", 28575),
        ("türkçe dublaj izle", 16391),
        ("türkçe izle", 19946),
        ("tüv akademie", 9557),
        ("tüv hessen", 7744),
        ("tüv i", 331),
        ("tüv in", 10188),
        ("tüv ib", 10189),
        ("tüv kosten", 11387),
        ("tüv nord", 46052),
        ("tüv sood", 46057),
        ("tüs rhein", 462),
        ("tüs rheinland", 39131),
        ("tüs öffnungszeiten", 15999),
    ]
    c = keyvi.CompletionDictionaryCompiler({"memory_limit_mb": "10"})
    for key, weight in entries:
        c.Add(key, weight)

    expected = [
        (u'tüv sood', 46057),
        (u'tüv nord', 46052),
    ]

    with tmp_dictionary(c, 'get_fuzzy.kv') as d:
        # zip() stops at the shorter input, so the pairwise check alone would
        # not notice missing matches; the count assertion below covers that.
        fuzzy_matches = d.GetFuzzy('tüv koid', 2)
        for (want_key, want_value), match in zip(expected, fuzzy_matches):
            assert want_key == match.GetMatchedString()
            assert want_value == match.GetValue()

        assert len(list(d.GetFuzzy('tüv koid', 2))) == 2
def compile(args):
    """Compile a tab-separated input file into a keyvi dictionary file.

    Args:
        args: object with the attributes
            ``compiler_params`` -- iterable of (key, value) pairs passed to
            the compiler as a dict,
            ``dict_type`` -- one of 'json', 'string', 'int', 'completion',
            'key-only',
            ``input_file`` -- path of the file to read, one entry per line,
            ``output_file`` -- path the compiled dictionary is written to.

    Returns:
        None on success. For an unknown ``dict_type`` the string
        'Must never reach here' is returned and nothing is read or written
        (kept as-is for backward compatibility with existing callers).

    Lines that cannot be split, converted or added are reported on stdout
    and skipped; compilation continues with the remaining lines.
    """
    # Compiler classes are stored by attribute name and resolved lazily, so an
    # unknown dict_type returns before the keyvi module is touched.
    compiler_names = {
        'json': 'JsonDictionaryCompiler',
        'string': 'StringDictionaryCompiler',
        'int': 'IntDictionaryCompiler',
        'completion': 'CompletionDictionaryCompiler',
        'key-only': 'KeyOnlyDictionaryCompiler',
    }

    params = dict(args.compiler_params)
    dict_type = args.dict_type

    if dict_type not in compiler_names:
        return 'Must never reach here'

    dictionary = getattr(keyvi, compiler_names[dict_type])(params)

    key_only = dict_type == 'key-only'
    integer_value = dict_type in ('int', 'completion')

    with open(args.input_file) as file_in:
        for line in file_in:
            line = line.rstrip('\n')
            try:
                splits = line.split('\t')
                if key_only:
                    dictionary.Add(splits[0])
                elif integer_value:
                    dictionary.Add(splits[0], int(splits[1]))
                else:
                    dictionary.Add(splits[0], splits[1])
            except Exception:
                # Best effort per line, but unlike a bare ``except:`` this
                # lets KeyboardInterrupt / SystemExit abort the run.
                print('Can not parse line: {}'.format(line))

    dictionary.Compile()
    dictionary.WriteToFile(args.output_file)
def test_exact_match_without_completion():
    """Every completion returned for the query "mr " must match b'mr'.

    Each entry is stored as ``<phrase> + '\\x1b' + <phrase>`` with weight 80.
    """
    c = keyvi.CompletionDictionaryCompiler({"memory_limit_mb": "10"})
    for phrase in ("mr", "mozilla firefox", "maa"):
        c.Add(phrase + '\x1b' + phrase, 80)

    with tmp_dictionary(c, 'test_exact_match_without_completion.kv') as d:
        completer = keyvi.MultiWordCompletion(d)
        # Note: if no completion is returned the loop body never runs and the
        # test passes without checking anything.
        for completion in completer.GetCompletions("mr "):
            assert completion.GetMatchedString() == b'mr'
def test_get_value_int():
    """Values stored in a completion dictionary are returned by ``GetValue()``."""
    stored = (("abc", 42), ("abd", 21))

    c = keyvi.CompletionDictionaryCompiler({"memory_limit_mb": "10"})
    for key, value in stored:
        c.Add(key, value)

    with tmp_dictionary(c, 'match_object_int.kv') as d:
        for key, value in stored:
            match = d[key]
            assert match.GetValue() == value
def test_forward_backward_completion():
    """Completing "munich" with ``ForwardBackwardCompletion`` finds both keys.

    Two dictionaries are compiled from the same (key, weight) pairs: one with
    the keys as written, one with every key reversed. The completions are
    ordered by (weight attribute, matched string), descending.
    """
    weighted_keys = (
        ("bayern munich vs. real madrid", 80),
        ("munich vs. real madrid", 30),
    )

    c = keyvi.CompletionDictionaryCompiler({"memory_limit_mb": "10"})
    for key, weight in weighted_keys:
        c.Add(key, weight)

    c_bw = keyvi.CompletionDictionaryCompiler({"memory_limit_mb": "10"})
    for key, weight in weighted_keys:
        c_bw.Add(key[::-1], weight)

    with tmp_dictionary(c, 'fw_bw_completion.kv') as d, \
            tmp_dictionary(c_bw, 'fw_bw_completion_bw.kv') as d2:
        completer = keyvi.ForwardBackwardCompletion(d, d2)

        matches = [
            (match.GetAttribute('weight'), match.GetMatchedString())
            for match in completer.GetCompletions("munich")
        ]
        matches.sort(reverse=True)

        assert len(matches) == 2
        assert [text for _, text in matches] == [
            b'bayern munich vs. real madrid',
            b'munich vs. real madrid',
        ]
def test_fuzzy_completion():
    """Check how many completions ``GetFuzzyCompletions`` returns per query.

    Each case is (query, second argument, expected number of matches).
    NOTE(review): the second argument is presumably the maximum edit
    distance -- confirm against the keyvi API.
    """
    entries = [
        ("turkei news", 23698),
        ("turkei side", 18838),
        ("turkei urlaub", 23424),
        ("turkisch anfänger", 20788),
        ("turkisch für", 21655),
        ("turkisch für anfänger", 20735),
        ("turkçe dublaj", 28575),
        ("turkçe dublaj izle", 16391),
        ("turkçe izle", 19946),
        ("tuv", 97),
        ("tuv akademie", 9557),
        ("tuv hessen", 7744),
        ("tuv i", 331),
        ("tuv in", 10188),
        ("tuv ib", 10189),
        ("tuv kosten", 11387),
        ("tuv nord", 46052),
        ("tuv sood", 46057),
        ("tus rhein", 462),
        ("tus rheinland", 39131),
        ("tus öffnungszeiten", 15999),
    ]
    cases = [
        ('tuv', 0, 9),
        ('tue', 1, 1),
        ('tuv h', 1, 2),
        ('tuv h', 2, 7),
        ('tuk töffnungszeiten', 2, 1),
        ('tuk töffnung', 2, 1),
        ('tuk txyzöff', 5, 1),
        ('tuk txyzöffnung', 5, 1),
        ('tuk txyzvöffnung', 6, 1),
        ('tuk ffnung', 2, 1),
    ]

    c = keyvi.CompletionDictionaryCompiler({"memory_limit_mb": "10"})
    for key, weight in entries:
        c.Add(key, weight)

    with tmp_dictionary(c, 'fuzzy_completion.kv') as d:
        completer = keyvi.PrefixCompletion(d)
        for query, distance, expected_count in cases:
            found = [
                m.GetMatchedString()
                for m in completer.GetFuzzyCompletions(query, distance)
            ]
            assert len(found) == expected_count
def test_overlong_completion():
    """A very long key is still returned as a multiword completion.

    Three entries are stored as ``<key> + MULTIWORD_QUERY_SEPARATOR + <key>``;
    the third key is the overlong one. Completing "html dis" must return all
    three, ordered by (weight attribute, matched string), descending.
    """
    overlong_key = "html display=main&referer=3c6120640656e466f726e26616d703b726566657265723d336336313230363436313734363132643631366136313738336432373636363136633733363532373230363837323635363633643237326636363732363136643635326536613733373032373230373436393734366336353364323735333734363137323734373336353639373436353237336535333734363137323734373336353639373436353363326636313365323032363637373433623230336336313230363436313734363132643631366136313738336432373636363136633733363532373230363837323635363632303364323732663733363537323736366336353734326636363666373236353665336636663730363536653436366637323635366535343732363536353364333132363631366437303362363436393733373036633631373933643664363136393665323636313664373033623734363137323637363537343639363433643330323636313664373033623734363137323637363537343734373937303635336433303236363136643730336236333664363433643338333032373230373436393734366336353364323737613735373232363735373536643663336236333662323037613735373232303436366637323635366532363735373536643663336236323635373237333639363336383734323733653436366637323635366532363735373536643663336236323635373237333639363336383734336332663631336532303230323636373734336232303463363536383732363736313665363737333636366637323635366526616d703b616a61783d3126616d703b6d6f62696c653d3026616d703b706167653d3026616d703b6f70656e466f72656e547265653d3127203e204c65687267616e6773666f72656e3c2f613e20&openforentree=1&targetid=130&targettype=1&cmd=6&page=null&fromhistory=1"

    weighted_keys = (
        ("html disable", 30075),
        ("html disabled", 29650),
        (overlong_key, 23732),
    )

    c = keyvi.CompletionDictionaryCompiler({"memory_limit_mb":"10"})
    for key, weight in weighted_keys:
        c.Add(key + MULTIWORD_QUERY_SEPARATOR + key, weight)

    with tmp_dictionary(c, 'mw_overlong_completion.kv') as d:
        mw = keyvi.MultiWordCompletion(d)

        matches = [
            (match.GetAttribute('weight'), match.GetMatchedString())
            for match in mw.GetCompletions("html dis")
        ]
        matches.sort(reverse=True)

        assert len(matches) == 3
        assert matches[0][1] == b'html disable'
        assert matches[1][1] == b'html disabled'
        # The key is pure ASCII, so encoding it gives the same bytes as the
        # equivalent bytes literal.
        assert matches[2][1] == overlong_key.encode('ascii')
def test_mw_completion():
    """Completing "mozilla f" returns the four matching entries.

    Entries are stored as ``<key> + '\\x1b' + <completion text>``. The
    results are ordered by (weight attribute, matched string), descending.
    """
    # (key, completion text, weight)
    entries = [
        ("mozilla firefox", "mozilla firefox", 80),
        ("mozilla footprint", "mozilla footprint", 30),
        ("mozilla fans", "mozilla fans", 43),
        ("mozilla firebird", "mozilla firebird", 12),
        ("internet microsoft explorer", "microsoft internet explorer", 21),
        ("google chrome", "google chrome", 54),
        ("netscape navigator", "netscape navigator", 10),
    ]

    c = keyvi.CompletionDictionaryCompiler({"memory_limit_mb":"10"})
    for key, text, weight in entries:
        c.Add(key + '\x1b' + text, weight)

    with tmp_dictionary(c, 'mw_completion.kv') as d:
        mw = keyvi.MultiWordCompletion(d)

        matches = [
            (match.GetAttribute('weight'), match.GetMatchedString())
            for match in mw.GetCompletions("mozilla f")
        ]
        matches.sort(reverse=True)

        assert len(matches) == 4
        assert [text for _, text in matches] == [
            b'mozilla firefox',
            b'mozilla fans',
            b'mozilla footprint',
            b'mozilla firebird',
        ]
def __call__(self, query):
    """Yield the dictionary keys generated for ``query``.

    The query is split on single spaces and the tokens are sorted. If
    PERMUTATION_LOOKUP_TABLE has no entry for the token count, the query
    itself is yielded unchanged and the generator ends. Otherwise, for each
    permutation (a sequence of indexes into the sorted tokens) the generator
    yields the selected tokens joined by spaces, followed by
    MULTIWORD_QUERY_SEPARATOR and the original query.

    A permutation of fewer than three indexes is skipped when its first
    token is a single character and differs from the token at the same
    index in the original query.
    """
    query_tokens = query.split(" ")
    query_tokens_bow = sorted(query_tokens)

    length = len(query_tokens_bow)
    # ``in`` instead of dict.has_key(), which no longer exists on Python 3.
    if length not in PERMUTATION_LOOKUP_TABLE:
        yield query
        return

    for permutation in PERMUTATION_LOOKUP_TABLE[length]:
        if len(permutation) < 3:
            first_token = query_tokens_bow[permutation[0]]
            if first_token != query_tokens[permutation[0]] and len(first_token) == 1:
                continue
        yield " ".join(query_tokens_bow[i] for i in permutation) + MULTIWORD_QUERY_SEPARATOR + query


if __name__ == '__main__':
    # reduce() is not a builtin on Python 3; functools provides it on both
    # Python 2 and 3.
    from functools import reduce

    # Each stage is called with the output of the previous one, starting
    # from the key read from stdin.
    pipeline = [MultiWordPermutation()]

    c = keyvi.CompletionDictionaryCompiler()

    # Input format: one "<key>\t<weight>" entry per line on stdin.
    for line in sys.stdin:
        line = line.rstrip("\n")
        if not line:
            # Skip blank lines instead of failing on the tuple unpacking.
            continue
        key, weight = line.split("\t")

        for q in reduce(lambda x, y: y(x), pipeline, key):
            c.Add(q, int(weight))

    c.Compile()
    c.WriteToFile("mw-completion.keyvi")