Пример #1
0
def getIdxPos(token):
    """Look up a query token in the on-disk dictionary.

    The token is decoded from UTF-8, lower-cased and hashed with
    func.myHash.  The in-memory tables ``lterms`` / ``fterms`` narrow the
    search to a range of records in "dict.data"; a binary search over that
    range then locates the record whose hash equals the token hash.

    Returns the value stored next to the matching hash, or None when the
    hash sorts before every entry of ``lterms`` or no record matches.
    """
    hTok = func.myHash(unicode(token, 'utf-8').lower())
    enc = func.Encoder(encMode)
    with open(func.PATH + "dict.data", 'rb') as dct:
        bucket = bisect_right(lterms, hTok)
        if bucket == 0:
            # The hash is smaller than the first known bucket boundary.
            return None
        # NOTE(review): lterms[bucket] assumes hTok sorts strictly below the
        # last element of lterms -- confirm against how lterms is filled.
        lo = fterms[lterms[bucket - 1]]
        hi = fterms[lterms[bucket]]
        # Invariant: the candidate record stays inside [lo, hi).
        while hi - lo > 1:
            # Python 2 integer division (both bounds are expected to be ints).
            mid = (lo + hi) / 2
            midHash = enc.unpackTerm(dct, mid)[0]
            if hTok < midHash:
                hi = mid
            else:
                lo = mid
        foundHash, val = enc.unpackTerm(dct, lo)
        return val if hTok == foundHash else None
Пример #2
0
def main():
    """Load the lookup tables into memory and answer queries from stdin.

    Steps:
      1. append every line of "urls.list" to ``urls`` (lines are stored
         exactly as read);
      2. read the last byte of "fastDict.data" to choose the encoding mode
         (1 -> 'simple9', anything else -> 'varbyte') and store it in the
         global ``encMode``;
      3. unpack every record of "fastDict.data" into ``lterms`` (hashes, in
         file order) and ``fterms`` (hash -> position);
      4. append a sentinel entry, 1 << 63, mapped to the size of
         "dict.data" divided by func.TERM_SIZE;
      5. pass each stdin line to ``compute`` until an empty line or EOF.
    """
    global encMode

    with open(func.PATH + "urls.list", 'r') as urlFile:
        for url in urlFile:
            urls.append(url)

    with open(func.PATH + "fastDict.data", 'rb') as fdct:
        # The final byte of the file is the encoding flag.
        fdct.seek(-1, 2)
        modeFlag = struct.unpack('B', fdct.read(1))[0]
        if modeFlag == 1:
            encMode = 'simple9'
        else:
            encMode = 'varbyte'
        # Python 2 integer division; assumes func.TERM_SIZE is an int, so
        # the trailing flag byte is dropped by the division.
        total = os.path.getsize(func.PATH + "fastDict.data") / func.TERM_SIZE
        enc = func.Encoder(encMode)
        recNo = 0
        while recNo < total:
            h, pos = enc.unpackTerm(fdct, recNo)
            lterms.append(h)
            fterms[h] = pos
            recNo += 1

    # Sentinel entry appended after all real hashes.
    sentinel = 1 << 63
    lterms.append(sentinel)
    fterms[sentinel] = os.path.getsize(func.PATH + "dict.data") / func.TERM_SIZE

    # EOFError raised anywhere inside the loop (including inside compute)
    # ends the session silently.
    try:
        while True:
            req = raw_input()
            if req == "":
                break
            compute(req)
    except EOFError:
        pass
Пример #3
0
def getDocs(pos):
    """Decode the list stored at ``pos`` in "idx.data" into a Set.

    The values returned by the encoder's unpackIdx are treated as deltas:
    each one is added to the running total of the previous values, and
    every running total is put into the result (presumably document ids --
    confirm against the index writer).

    Returns an empty Set when ``pos`` is None.
    """
    docs = Set()
    if pos is None:
        return docs

    enc = func.Encoder(encMode)
    with open(func.PATH + "idx.data", 'rb') as idx:
        running = 0
        for delta in enc.unpackIdx(idx, pos):
            running += delta
            docs.add(running)
    return docs
Пример #4
0
def main():
    """Group the records of "preDict.data" by hash and optimize each group.

    The last byte of "preDict.data" selects the encoding mode
    (1 -> 'simple9', anything else -> 'varbyte').  Every record of the file
    is unpacked and its position appended to ``terms[hash]``.  The three
    module-level encoders are switched to the detected mode, ``optimize``
    is called for each (hash, positions) pair in sorted order, and finally
    the mode flag byte is appended to "fastDict.data".
    """
    global enc, denc, fenc

    prePath = func.PATH + "preDict.data"
    with open(prePath, 'rb') as f:
        # The final byte of the file is the encoding flag.
        f.seek(-1, 2)
        modeFlag = struct.unpack('B', f.read(1))[0]
        if modeFlag == 1:
            encMode = 'simple9'
        else:
            encMode = 'varbyte'
        reader = func.Encoder(encMode)
        recNo = 0
        # Python 2 integer division; assumes func.TERM_SIZE is an int.
        total = os.path.getsize(prePath) / func.TERM_SIZE
        while recNo < total:
            h, pos = reader.unpackTerm(f, recNo)
            terms[h].append(pos)
            recNo += 1

    # Same order as before: enc, then fenc, then denc.
    for encoder in (enc, fenc, denc):
        encoder.changeMode(encMode)

    for key, value in sorted(terms.items()):
        optimize(key, value)

    modeByte = struct.pack('B', 1 if encMode == 'simple9' else 0)
    with open(func.PATH + "fastDict.data", 'ab') as fdct:
        fdct.write(modeByte)
Пример #5
0
#!/usr/bin/env python

import sys
import os
import func
import struct
from collections import defaultdict
from collections import OrderedDict

# Directory with the data files.
# NOTE(review): the code visible below builds its paths from func.PATH, not
# from this constant -- confirm whether it is still used anywhere.
PATH = "./files/"

# Mapping with an empty list as the default value for any missing key.
# Presumably hash -> list of positions; verify against the code that fills it.
terms = defaultdict(list)

# Three separate encoder instances created with the default arguments.
# optimize() declares all three as globals and uses `enc` to unpack lists
# from "preIdx.data"; the roles of `denc` and `fenc` are not visible here.
enc = func.Encoder()
denc = func.Encoder()
fenc = func.Encoder()
# Module-level encoding mode, empty until set elsewhere -- TODO confirm where.
encMode = ""
# NOTE(review): no use of `prev` is visible in this part of the file.
prev = 0

def optimize(h, arr):
    global enc
    global denc
    global fenc

    res = []
    last = 0
    with open(func.PATH + "preIdx.data") as f:
        for val in arr:
            tmp = enc.unpackIdx(f, val)
            tmp[0] -= last
            res.extend(tmp)