示例#1
0
文件: cli.py 项目: mccrayinc/keyvi
def compile(args):
    """Compile a tab-separated input file into a keyvi dictionary file.

    ``args`` must provide ``compiler_params`` (an iterable of key/value
    pairs), ``dict_type``, ``input_file`` and ``output_file``.

    Every input line is split on tabs.  The first field is the key; for all
    dictionary types except ``key-only`` the second field is the value,
    converted with ``int()`` for the ``int`` and ``completion`` types.  A
    line that cannot be added is reported on stdout and skipped.

    Returns the string ``'Must never reach here'`` when ``dict_type`` is not
    one of the known types; otherwise returns ``None`` after writing the
    compiled dictionary to ``args.output_file``.
    """
    params = {key: value for key, value in args.compiler_params}

    dict_type = args.dict_type
    # Map each supported type to the pykeyvi compiler class name; the class
    # is looked up only after the type has been validated.
    compiler_names = {
        'json': 'JsonDictionaryCompiler',
        'string': 'StringDictionaryCompiler',
        'int': 'IntDictionaryCompiler',
        'completion': 'CompletionDictionaryCompiler',
        'key-only': 'KeyOnlyDictionaryCompiler',
    }
    compiler_name = compiler_names.get(dict_type)
    if compiler_name is None:
        return 'Must never reach here'
    dictionary = getattr(pykeyvi, compiler_name)(params)

    key_only = dict_type == 'key-only'
    int_valued = dict_type in ('int', 'completion')

    with open(args.input_file) as file_in:
        for line in file_in:
            line = line.rstrip('\n')
            try:
                splits = line.split('\t')
                if key_only:
                    dictionary.Add(splits[0])
                elif int_valued:
                    dictionary.Add(splits[0], int(splits[1]))
                else:
                    dictionary.Add(splits[0], splits[1])
            except Exception:
                # Best effort: report and skip the bad line.  Catching
                # Exception (not a bare except) lets Ctrl-C and SystemExit
                # still abort the run.
                print('Can not parse line: {}'.format(line))

    dictionary.Compile()
    dictionary.WriteToFile(args.output_file)
示例#2
0
def test_get_value_int():
    """Integer values added to a completion dictionary are returned by GetValue()."""
    expected_values = (("abc", 42), ("abd", 21))

    compiler = pykeyvi.CompletionDictionaryCompiler({"memory_limit_mb": "10"})
    for key, value in expected_values:
        compiler.Add(key, value)

    with tmp_dictionary(compiler, 'match_object_int.kv') as dictionary:
        for key, value in expected_values:
            match = dictionary[key]
            assert match.GetValue() == value
示例#3
0
def test_exact_match_without_completion():
    """Every completion returned for the query "mr " must match exactly b'mr'."""
    separator = '\x1b'
    compiler = pykeyvi.CompletionDictionaryCompiler({"memory_limit_mb": "10"})
    # Each entry is stored as "<phrase><separator><phrase>" with the same weight.
    for phrase in ("mr", "mozilla firefox", "maa"):
        compiler.Add(phrase + separator + phrase, 80)

    with tmp_dictionary(compiler, 'test_exact_match_without_completion.kv') as dictionary:
        completer = pykeyvi.MultiWordCompletion(dictionary)
        for match in completer.GetCompletions("mr "):
            assert match.GetMatchedString() == b'mr'
示例#4
0
def test_overlong_completion():
    """A very long entry is still returned as a completion, ranked by weight."""
    # The overlong phrase is used both as key part, as value part and as the
    # expected matched string, so it is spelled out once.
    overlong_phrase = "html display=main&referer=3c6120640656e466f726e26616d703b726566657265723d336336313230363436313734363132643631366136313738336432373636363136633733363532373230363837323635363633643237326636363732363136643635326536613733373032373230373436393734366336353364323735333734363137323734373336353639373436353237336535333734363137323734373336353639373436353363326636313365323032363637373433623230336336313230363436313734363132643631366136313738336432373636363136633733363532373230363837323635363632303364323732663733363537323736366336353734326636363666373236353665336636663730363536653436366637323635366535343732363536353364333132363631366437303362363436393733373036633631373933643664363136393665323636313664373033623734363137323637363537343639363433643330323636313664373033623734363137323637363537343734373937303635336433303236363136643730336236333664363433643338333032373230373436393734366336353364323737613735373232363735373536643663336236333662323037613735373232303436366637323635366532363735373536643663336236323635373237333639363336383734323733653436366637323635366532363735373536643663336236323635373237333639363336383734336332663631336532303230323636373734336232303463363536383732363736313665363737333636366637323635366526616d703b616a61783d3126616d703b6d6f62696c653d3026616d703b706167653d3026616d703b6f70656e466f72656e547265653d3127203e204c65687267616e6773666f72656e3c2f613e20&openforentree=1&targetid=130&targettype=1&cmd=6&page=null&fromhistory=1"
    weighted_phrases = (
        ("html disable", 30075),
        ("html disabled", 29650),
        (overlong_phrase, 23732),
    )

    compiler = pykeyvi.CompletionDictionaryCompiler()
    for phrase, weight in weighted_phrases:
        compiler.Add(phrase + MULTIWORD_QUERY_SEPARATOR + phrase, weight)

    with tmp_dictionary(compiler, 'mw_overlong_completion.kv') as dictionary:
        completer = pykeyvi.MultiWordCompletion(dictionary)
        ranked = [
            (match.GetAttribute('weight'), match.GetMatchedString())
            for match in completer.GetCompletions("html dis")
        ]
        # Highest weight first.
        ranked.sort(reverse=True)

        assert len(ranked) == 3
        assert ranked[0][1] == 'html disable'
        assert ranked[1][1] == 'html disabled'
        assert ranked[2][1] == overlong_phrase
示例#5
0
def test_mw_completion():
    """Completions for "mozilla f" come back as four matches, ordered by weight."""
    separator = '\x1b'
    # (text before the separator, text after the separator, weight)
    entries = (
        ("mozilla firefox", "mozilla firefox", 80),
        ("mozilla footprint", "mozilla footprint", 30),
        ("mozilla fans", "mozilla fans", 43),
        ("mozilla firebird", "mozilla firebird", 12),
        ("internet microsoft explorer", "microsoft internet explorer", 21),
        ("google chrome", "google chrome", 54),
        ("netscape navigator", "netscape navigator", 10),
    )

    compiler = pykeyvi.CompletionDictionaryCompiler()
    for prefix_part, suffix_part, weight in entries:
        compiler.Add(prefix_part + separator + suffix_part, weight)

    with tmp_dictionary(compiler, 'mw_completion.kv') as dictionary:
        completer = pykeyvi.MultiWordCompletion(dictionary)
        ranked = [
            (match.GetAttribute('weight'), match.GetMatchedString())
            for match in completer.GetCompletions("mozilla f")
        ]
        # Highest weight first.
        ranked.sort(reverse=True)

        assert len(ranked) == 4
        assert ranked[0][1] == 'mozilla firefox'
        assert ranked[1][1] == 'mozilla fans'
        assert ranked[2][1] == 'mozilla footprint'
        assert ranked[3][1] == 'mozilla firebird'
示例#6
0
    def __call__(self, query):
        """Yield token permutations of ``query``, each tagged with the original.

        ``query`` is split on single spaces and the tokens are sorted.  If
        ``PERMUTATION_LOOKUP_TABLE`` has no entry for the token count, the
        query itself is yielded unchanged and iteration ends.  Otherwise, for
        each permutation (a sequence of indices into the sorted tokens) this
        yields the selected tokens joined by spaces, followed by
        ``MULTIWORD_QUERY_SEPARATOR`` and the original query.

        Permutations with fewer than three indices are skipped when their
        first token is a single character and differs from the token at the
        same index in the original (unsorted) query.
        """
        query_tokens = query.split(" ")
        query_tokens_bow = sorted(query_tokens)
        length = len(query_tokens_bow)
        # Membership test instead of dict.has_key(), which was removed in
        # Python 3; ``in`` behaves the same on Python 2.
        if length not in PERMUTATION_LOOKUP_TABLE:
            yield query
            return

        for permutation in PERMUTATION_LOOKUP_TABLE[length]:
            if len(permutation) < 3:
                first_token = query_tokens_bow[permutation[0]]
                if first_token != query_tokens[permutation[0]] and len(first_token) == 1:
                    continue
            permuted = " ".join([query_tokens_bow[i] for i in permutation])
            yield permuted + MULTIWORD_QUERY_SEPARATOR + query


if __name__ == '__main__':
    # reduce() is a builtin only on Python 2; functools provides it on both
    # Python 2.6+ and Python 3, so import it explicitly.
    from functools import reduce

    # Each pipeline stage is a callable that receives the previous stage's
    # output; the final result is iterated to obtain the keys to add.
    pipeline = [MultiWordPermutation()]
    c = pykeyvi.CompletionDictionaryCompiler()

    # Every stdin line is expected to be "<key>\t<weight>".
    for line in sys.stdin:
        key, weight = line.split("\t")

        for q in reduce(lambda x, y: y(x), pipeline, key):
            # int() tolerates the trailing newline left on ``weight``.
            c.Add(q, int(weight))
    c.Compile()
    c.WriteToFile("mw-completion.keyvi")