def compile(args):
    """Compile a tab-separated input file into a keyvi dictionary file.

    NOTE: the name shadows the builtin ``compile``; it is kept so existing
    callers continue to work.

    Args:
        args: object with the attributes
            ``compiler_params`` -- iterable of ``(key, value)`` pairs that is
                turned into the parameter dict handed to the compiler,
            ``dict_type`` -- one of ``'json'``, ``'string'``, ``'int'``,
                ``'completion'`` or ``'key-only'``,
            ``input_file`` -- path of the input file, one entry per line with
                key and value separated by a tab,
            ``output_file`` -- path the compiled dictionary is written to.

    Returns:
        ``None`` after the dictionary was written, or the string
        ``'Must never reach here'`` when ``dict_type`` is not a known type
        (in which case nothing is read or written).
    """
    # Compiler class names are resolved lazily via getattr so that an unknown
    # dict_type is rejected before pykeyvi is touched.
    compiler_names = {
        'json': 'JsonDictionaryCompiler',
        'string': 'StringDictionaryCompiler',
        'int': 'IntDictionaryCompiler',
        'completion': 'CompletionDictionaryCompiler',
        'key-only': 'KeyOnlyDictionaryCompiler',
    }

    params = dict(args.compiler_params)
    dict_type = args.dict_type

    compiler_name = compiler_names.get(dict_type)
    if compiler_name is None:
        return 'Must never reach here'
    dictionary = getattr(pykeyvi, compiler_name)(params)

    key_only = dict_type == 'key-only'
    int_valued = dict_type in ('int', 'completion')

    with open(args.input_file) as file_in:
        for line in file_in:
            line = line.rstrip('\n')
            splits = line.split('\t')
            try:
                if key_only:
                    dictionary.Add(splits[0])
                elif int_valued:
                    dictionary.Add(splits[0], int(splits[1]))
                else:
                    dictionary.Add(splits[0], splits[1])
            except Exception:
                # Best effort: report the bad line and keep going.  Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit still abort the run.
                print('Can not parse line: {}'.format(line))

    dictionary.Compile()
    dictionary.WriteToFile(args.output_file)
def test_get_value_int():
    """Values added through the completion compiler are returned by GetValue()."""
    expected = [("abc", 42), ("abd", 21)]

    compiler = pykeyvi.CompletionDictionaryCompiler({"memory_limit_mb":"10"})
    for key, value in expected:
        compiler.Add(key, value)

    with tmp_dictionary(compiler, 'match_object_int.kv') as dictionary:
        # Look every key up again and compare against the value it was added with.
        for key, value in expected:
            match = dictionary[key]
            assert match.GetValue() == value
def test_exact_match_without_completion():
    """Every completion returned for the query "mr " must match b'mr'.

    The loop body only runs for returned completions, so an empty result
    also passes.
    """
    separator = '\x1b'
    compiler = pykeyvi.CompletionDictionaryCompiler({"memory_limit_mb": "10"})
    # Each key is the phrase, the separator, and the phrase again; all weigh 80.
    for phrase in ("mr", "mozilla firefox", "maa"):
        compiler.Add(phrase + separator + phrase, 80)

    with tmp_dictionary(compiler, 'test_exact_match_without_completion.kv') as dictionary:
        completer = pykeyvi.MultiWordCompletion(dictionary)
        for match in completer.GetCompletions("mr "):
            assert match.GetMatchedString() == b'mr'
def test_overlong_completion():
    """Completing "html dis" returns all three entries, including a very long one.

    Results are ordered by descending (weight, matched string) and the
    matched strings are compared against the phrases that were added.
    """
    # The overlong phrase is used on both sides of the separator and again in
    # the final assertion, so it is spelled out exactly once here.
    overlong_phrase = "html display=main&referer=3c6120640656e466f726e26616d703b726566657265723d336336313230363436313734363132643631366136313738336432373636363136633733363532373230363837323635363633643237326636363732363136643635326536613733373032373230373436393734366336353364323735333734363137323734373336353639373436353237336535333734363137323734373336353639373436353363326636313365323032363637373433623230336336313230363436313734363132643631366136313738336432373636363136633733363532373230363837323635363632303364323732663733363537323736366336353734326636363666373236353665336636663730363536653436366637323635366535343732363536353364333132363631366437303362363436393733373036633631373933643664363136393665323636313664373033623734363137323637363537343639363433643330323636313664373033623734363137323637363537343734373937303635336433303236363136643730336236333664363433643338333032373230373436393734366336353364323737613735373232363735373536643663336236333662323037613735373232303436366637323635366532363735373536643663336236323635373237333639363336383734323733653436366637323635366532363735373536643663336236323635373237333639363336383734336332663631336532303230323636373734336232303463363536383732363736313665363737333636366637323635366526616d703b616a61783d3126616d703b6d6f62696c653d3026616d703b706167653d3026616d703b6f70656e466f72656e547265653d3127203e204c65687267616e6773666f72656e3c2f613e20&openforentree=1&targetid=130&targettype=1&cmd=6&page=null&fromhistory=1"

    # (phrase, weight), listed in descending weight order.
    entries = [
        ("html disable", 30075),
        ("html disabled", 29650),
        (overlong_phrase, 23732),
    ]

    compiler = pykeyvi.CompletionDictionaryCompiler()
    for phrase, weight in entries:
        compiler.Add(phrase + MULTIWORD_QUERY_SEPARATOR + phrase, weight)

    with tmp_dictionary(compiler, 'mw_overlong_completion.kv') as dictionary:
        completer = pykeyvi.MultiWordCompletion(dictionary)
        ranked = [
            (match.GetAttribute('weight'), match.GetMatchedString())
            for match in completer.GetCompletions("html dis")
        ]
        ranked.sort(reverse=True)

        assert len(ranked) == 3
        assert ranked[0][1] == 'html disable'
        assert ranked[1][1] == 'html disabled'
        assert ranked[2][1] == overlong_phrase
def test_mw_completion():
    """Completing "mozilla f" yields the four "mozilla f..." entries.

    Results are sorted by descending (weight, matched string) before the
    matched strings are compared.
    """
    separator = '\x1b'
    # (text before the separator, text after the separator, weight)
    entries = [
        ("mozilla firefox", "mozilla firefox", 80),
        ("mozilla footprint", "mozilla footprint", 30),
        ("mozilla fans", "mozilla fans", 43),
        ("mozilla firebird", "mozilla firebird", 12),
        ("internet microsoft explorer", "microsoft internet explorer", 21),
        ("google chrome", "google chrome", 54),
        ("netscape navigator", "netscape navigator", 10),
    ]

    compiler = pykeyvi.CompletionDictionaryCompiler()
    for left, right, weight in entries:
        compiler.Add(left + separator + right, weight)

    with tmp_dictionary(compiler, 'mw_completion.kv') as dictionary:
        completer = pykeyvi.MultiWordCompletion(dictionary)
        ranked = [
            (match.GetAttribute('weight'), match.GetMatchedString())
            for match in completer.GetCompletions("mozilla f")
        ]
        ranked.sort(reverse=True)

        assert len(ranked) == 4
        assert [text for _, text in ranked] == [
            'mozilla firefox',
            'mozilla fans',
            'mozilla footprint',
            'mozilla firebird',
        ]
def __call__(self, query):
    """Yield dictionary keys for the token permutations of *query*.

    NOTE(review): this takes ``self`` and is invoked through a
    ``MultiWordPermutation()`` instance below, so it is presumably a method
    of that class; the class header is outside this view.

    The query is split on single spaces and the tokens are sorted.  If
    ``PERMUTATION_LOOKUP_TABLE`` has no entry for the token count, the query
    itself is yielded unchanged.  Otherwise each permutation (a sequence of
    indexes into the sorted tokens) yields
    ``"<permuted tokens>" + MULTIWORD_QUERY_SEPARATOR + query``.

    Args:
        query: the query string to permute.

    Yields:
        str: one key per accepted permutation, or just ``query``.
    """
    query_tokens = query.split(" ")
    query_tokens_bow = sorted(query_tokens)
    length = len(query_tokens_bow)

    # ``in`` instead of dict.has_key(): has_key() no longer exists on Python 3.
    if length not in PERMUTATION_LOOKUP_TABLE:
        yield query
        return

    for permutation in PERMUTATION_LOOKUP_TABLE[length]:
        if len(permutation) < 3:
            # For permutations shorter than three tokens, skip those whose
            # leading token is a single character that differs from the
            # token at the same index in the original query.
            first_token = query_tokens_bow[permutation[0]]
            if first_token != query_tokens[permutation[0]] and len(first_token) == 1:
                continue
        yield " ".join([query_tokens_bow[i] for i in permutation]) + MULTIWORD_QUERY_SEPARATOR + query


if __name__ == '__main__':
    # reduce() is not a builtin on Python 3; functools.reduce works on both
    # Python 2.6+ and Python 3.
    from functools import reduce

    pipeline = []
    pipeline.append(MultiWordPermutation())
    c = pykeyvi.CompletionDictionaryCompiler()

    # Input: one "<key>\t<weight>" pair per line on stdin.
    for line in sys.stdin:
        key, weight = line.split("\t")

        # Feed the key through every pipeline stage in order; the final
        # stage's output is iterated and each item is added with the weight.
        for q in reduce(lambda x, y: y(x), pipeline, key):
            c.Add(q, int(weight))

    c.Compile()
    c.WriteToFile("mw-completion.keyvi")