Пример #1
0
def _encode_postings(postings):
    """Serialise one ``{docid: count}`` mapping into a posting string.

    Entries are ordered by descending count.  Documents that share a count
    are grouped as ``<count>!<doc>:<doc>:...`` (the count is encoded with
    ``base.decimaltobase62``) and the groups are joined with ``$``.
    An empty mapping yields an empty string.
    """
    ranked = sorted(postings.items(), key=operator.itemgetter(1), reverse=True)
    groups = []
    current = []
    last_count = 0
    for docid, freq in ranked:
        if freq != last_count:
            # A new count starts a new group; flush the previous one first.
            last_count = freq
            if current:
                groups.append(":".join(current))
                current = []
            current.append("%s!%s" % (base.decimaltobase62(freq), docid))
        else:
            current.append(docid)
    if current:
        groups.append(":".join(current))
    return "$".join(groups)


def printTrie(filecounter, pagetrie):
    """Write ``pagetrie`` to the file ``subtemp<filecounter>``, one term per line.

    Args:
        filecounter: integer used to build the output file name.
        pagetrie: mapping ``term -> [c0, c1, c2, c3, c4, postings...]`` where
            the first five items are counters and every following item is a
            ``{docid: count}`` dict.

    Each output line has the form ``term#c0#c1#c2#c3#c4#p0?p1?...`` with terms
    in sorted order.  Counters that are not positive are written as empty
    fields, and each ``p`` is produced by ``_encode_postings``.
    """
    s = "subtemp%d" % (filecounter)
    print("Print to file %s" % (s))
    text = []
    for key, value in sorted(pagetrie.items(), key=operator.itemgetter(0)):
        postlist = "?".join([_encode_postings(p) for p in value[5:]])
        counters = "#".join([str(x) if x > 0 else '' for x in value[:5]])
        text.append("#".join([key, counters, postlist]) + "\n")
    # Open the file only once all lines are built, and let `with` close it,
    # so a failure while encoding cannot leak the handle.
    with open(s, "w") as f:
        f.writelines(text)
Пример #2
0
def gettitle(docid):
    """Return the title stored for ``docid`` in the "titles" file, or None.

    Reads the module-level ``sec`` mapping (base-62 doc id -> base-62 file
    offset; its keys must be in ascending doc-id order for the bisection to
    work) and ``titlewidth`` (how many lines beyond the first are scanned).

    Args:
        docid: decimal document id.

    Returns:
        The text after the first ``$`` on the matching line (including the
        trailing newline as stored in the file), or None when the id is not
        found within the scanned block.
    """
    raw_keys = list(sec.keys())
    keys = [base.base62todecimal(k) for k in raw_keys]
    index = bisect.bisect_left(keys, docid)
    # Step back to the block that starts at or before docid.  The length
    # check avoids an IndexError when docid is larger than every key.
    if index == len(keys) or keys[index] != docid:
        index -= 1
    if index < 0:
        # docid precedes the first indexed id (or the index is empty); do not
        # let a negative index wrap around to the last block.
        return None
    seek = base.base62todecimal(sec[raw_keys[index]])
    wanted = base.decimaltobase62(docid)
    # `with` closes the file on the early return as well.
    with open("titles", "r") as f:
        f.seek(seek)
        counter = 0
        while counter <= titlewidth:
            txt = f.readline().split("$", 1)
            if txt[0] == wanted:
                return txt[1]
            counter += 1
    return None
Пример #3
0
def shortmerge(c,d):
    """Merge two ``$``-separated posting strings into one.

    Every entry looks like ``<count>!<docs>`` where ``<count>`` is decoded
    with ``base.base62todecimal``.  Entries are taken in descending count
    order; when both sides carry the same count the two doc lists are joined
    with ``:`` under a single re-encoded count.  Two empty inputs give ''.
    """
    if len(c) == 0 and len(d) == 0:
        return ''
    left = [] if c == '' else c.split("$")
    right = [] if d == '' else d.split("$")

    def parse(entry):
        pieces = entry.split("!", 1)
        return (base.base62todecimal(pieces[0]), pieces[1])

    left_parsed = [parse(entry) for entry in left]
    right_parsed = [parse(entry) for entry in right]

    merged = []
    li = 0
    ri = 0
    # Walk both lists while each still has entries to compare.
    while li < len(left) and ri < len(right):
        left_count = left_parsed[li][0]
        right_count = right_parsed[ri][0]
        if left_count > right_count:
            merged.append(left[li])
            li += 1
        elif left_count < right_count:
            merged.append(right[ri])
            ri += 1
        else:
            merged.append("%s!%s:%s" % (base.decimaltobase62(left_count),
                                         left_parsed[li][1],
                                         right_parsed[ri][1]))
            li += 1
            ri += 1
    # At most one of these tails is non-empty.
    merged.extend(left[li:])
    merged.extend(right[ri:])
    return "$".join(merged)
Пример #4
0
 # Excerpt from a larger block: `f`, `dir_list`, `j`, `Memory`, `path_to_title`
 # and `base` are all defined outside the visible lines.
 # Reads the already-open file completely and collects one "docId$title"
 # record per <page> element.
 lines = f.readlines()
 f.close()
 lenlines = len(lines)
 text = []
 i = 0
 while i < lenlines:
     line = lines[i].lstrip(" ")
     if line.startswith("<page>"):
         # The title is read from the line directly after <page>.
         # NOTE(review): lines[i + 1] and lines[i + 3] are not bounds-checked,
         # so a <page> within the last three lines would raise IndexError.
         line = lines[i + 1].lstrip(" ")
         line = line.rstrip("\n")
         if line.startswith("<title>", 0):
             # Drop the 7-char "<title>" prefix and the last 8 characters
             # (presumably the closing "</title>" tag).
             title = line[7:-8]
         # The id is read from the third line after <page>.
         line = lines[i + 3].lstrip(" ")
         line = line.rstrip("\n")
         if line.startswith("<id>", 0):
             # NOTE(review): `title` keeps the value of the most recent <title>
             # match; nothing in this excerpt initialises it — confirm it is
             # always set before the first <id> is seen.
             docId = base.decimaltobase62(int(line[4:-5]))
             text.append("$".join([docId, title]) + "\n")
             #print docId,title
         # Jump over the three lines inspected above.
         i += 3
     i += 1
 # Flush early when the buffer is considered too large.
 # NOTE(review): sys.getsizeof is shallow — it measures the list object only,
 # not the strings it references.
 if sys.getsizeof(text) > Memory:
     f = open(path_to_title, "a")
     #f.write("\n".join(text))
     f.writelines(text)
     text = []
     f.close()
 # NOTE(review): i + 1 is derived from the line index, not from the number of
 # titles appended — the label may be misleading.
 print "File process:%s Titles processed:%d" % (dir_list[j], i + 1)
 # Write whatever is still buffered.  No close() is visible for this handle;
 # the excerpt ends here.
 if len(text) > 0:
     f = open(path_to_title, "a")
     f.writelines(text)
     #f.write("\n".join(text))
Пример #5
0
# Excerpt of a script that builds a secondary (sparse) index over a primary
# index file.  `process`, `base` and `math` come from outside the visible lines.
# The first element of communicate()'s result is parsed as an integer memory
# budget (presumably `process` is a subprocess.Popen with stdout piped — confirm).
M=process.communicate()
Memory=int(M[0].strip("\n"))
bufsize=Memory/3
inputfile=raw_input('Enter primary index file:')
outfile=raw_input('Enter secondary index file:')
# `g` is a second, independent handle on the input file; it is used below to
# read individual lines while `f` is used for sequential iteration.
g=open(inputfile,"r")
h=open(outfile,"w")
index=[]
with open(inputfile,"r") as f:
    # The first line is read separately and is not included in `count`.
    doccount=f.readline()
    count=0
    for i in f:
        count+=1
    # The block width is the integer square root of the remaining line count.
    width=int(math.sqrt(count))
    #print count,width
    h.write(base.decimaltobase62(width)+"\n")
    counter=0
    # Rewind and skip the first line again before the second pass.
    f.seek(0)
    offset=0
    f.readline()
    # `offset` starts at the length of the first line, read through `g`.
    offset=len(g.readline())
    l=g.readline()
    if l:
        # First entry: text before the first '#' of the second line plus the
        # encoded offset; the joined record ends with " \n".
        index.append(" ".join([l.split("#")[0],base.decimaltobase62(offset),"\n"]))
    check=0
    for i in f:
        #print i
        offset+=len(i)
        # The first iteration only advances `offset`.
        if check==0:
            check=1
            continue
Пример #6
0
                           stdout=subprocess.PIPE)
# Excerpt of a script that builds a secondary (sparse) index over the title
# file.  `process` is created by a statement that starts before this excerpt;
# `base`, `math` and `OrderedDict` are imported outside the visible lines.
M = process.communicate()
# The captured stdout is parsed as an integer memory budget.
Memory = int(M[0].strip("\n"))
bufsize = Memory / 3
inputfile = raw_input('Enter title index file:')
outfile = raw_input('Enter title secondary index file:')
# `g` is a second handle on the input file, used for seek/readline lookups.
g = open(inputfile, "r")
h = open(outfile, "w")
# Maps the text before the first '$' of a sampled line to its encoded offset,
# in insertion order.
index = OrderedDict()
with open(inputfile, "r") as f:
    # First pass: count the lines to derive the block width.
    count = 0
    for i in f:
        count += 1
    width = int(math.sqrt(count))
    #print count,width
    h.write(base.decimaltobase62(width) + "\n")
    counter = 0
    f.seek(0)
    offset = 0
    # The first line is always indexed, at offset 0.
    l = g.readline()
    if l:
        index[l.split("$")[0]] = base.decimaltobase62(offset)
    # Second pass: accumulate offsets by line length and sample the line that
    # starts at `offset` once `counter` reaches `width`.
    # NOTE(review): no reset of `counter` is visible; the excerpt ends inside
    # this loop, so the reset may follow.
    for i in f:
        #print i
        offset += len(i)
        counter += 1
        if counter == width:
            g.seek(offset)
            x = g.readline()
            index[x.split("$")[0]] = base.decimaltobase62(offset)
            #index.append(" ".join([x.split("$")[0],base.decimaltobase62(offset),"\n"]))
Пример #7
0
def _add_posting(pagetrie, stem, docId, fields):
    """Count one occurrence of ``stem`` in ``docId`` for every field in ``fields``.

    An entry is ``[n0, n1, n2, n3, n4, p0, p1, p2, p3, p4]``: ``p<k>`` maps a
    doc id to the number of occurrences in field ``k`` and ``n<k>`` is the
    number of documents present in ``p<k>``.  A missing entry is created even
    when ``fields`` is empty.
    """
    entry = pagetrie.get(stem)
    if entry is None:
        entry = [0, 0, 0, 0, 0, {}, {}, {}, {}, {}]
        pagetrie[stem] = entry
    for field in fields:
        postings = entry[5 + field]
        if docId in postings:
            postings[docId] += 1
        else:
            entry[field] += 1
            postings[docId] = 1


def _active_fields(infoflag, catflag, outflag, txtflag):
    """Return the field numbers (1=infobox, 2=category, 3=link, 4=body) that are on."""
    flags = (0, infoflag, catflag, outflag, txtflag)
    return [field for field in (1, 2, 3, 4) if flags[field]]


def _title_terms(title):
    """Yield the lower-cased runs of a-z letters found in ``title``."""
    term = ''
    for char in title:
        lower = char.lower()
        if lower < 'a' or lower > 'z':
            if term:
                yield term
            term = ''
        else:
            term += lower
    if term:
        yield term


def processWikiText(ipfile, file):
    """Index one chunk of a Wikipedia XML dump and write it with ``printTrie``.

    The chunk is scanned line by line.  For every ``<page>`` the title (next
    line) and id (third line after ``<page>``) are read and the title words
    are indexed in field 0.  The content of ``<text>`` is then scanned
    character by character: words inside ``[[Category...]]`` go to field 2,
    words inside other ``[[...]]`` links to field 3, words while an
    ``{{Info...`` template is open to field 1 and all other words to field 4.
    Words are lower-cased, filtered through ``stop_list`` and stemmed with
    ``porter2.stem``; outside the title only words of three or more letters
    are kept.

    Args:
        ipfile: path of the dump chunk to read.
        file: integer chunk number, passed on to ``printTrie``.

    Returns:
        The number of ``</page>`` lines seen.
    """
    with open(ipfile, "r") as f:
        lines = f.readlines()
    lenlines = len(lines)
    pagetrie = {}
    count = 0
    title = ''
    docId = ''
    infoflag = 0
    catflag = 0
    txtflag = 0
    outflag = 0
    i = 0
    while i < lenlines:
        line = lines[i].lstrip(" ").rstrip("\n")
        if line == "<page>":
            # A page header needs the three following lines; a chunk cut off
            # inside the header ends the scan instead of raising IndexError.
            if i + 3 >= lenlines:
                break
            line = lines[i + 1].lstrip(" ").rstrip("\n")
            if line[:7] == "<title>":
                title = line[7:-8]
            line = lines[i + 3].lstrip(" ").rstrip("\n")
            if line[:4] == "<id>":
                docId = base.decimaltobase62(int(line[4:-5]))
            for term in _title_terms(title):
                if term not in stop_list:
                    _add_posting(pagetrie, porter2.stem(term), docId, (0,))
            infoflag = 0
            catflag = 0
            txtflag = 1
            outflag = 0
            print("%s %s" % (docId, title))
            i += 4
            continue
        if line[:7] == "</page>":
            count += 1
            i += 1
            continue
        if line[:5] == "<text":
            tag, _, rest = line.partition(">")
            if tag.endswith("/"):
                # Self-closing <text ... />: there is no content and no
                # closing tag to look for.
                i += 1
                continue
            if rest[-7:] == "</text>":
                # Single-line text: keep the content, not the opening tag.
                # `i` stays on this line; the increment at the bottom of the
                # loop moves past it.
                parts = [rest.rsplit("<", 1)[0]]
            else:
                # `line` had its newline stripped; restore it so the last word
                # of this line is not glued to the first word of the next.
                parts = [rest + "\n"]
                i += 1
                # Bounded so a missing </text> cannot run past the chunk.
                while i < lenlines:
                    if lines[i][-8:] == "</text>\n":
                        parts.append(lines[i].rsplit("<", 1)[0])
                        break
                    parts.append(lines[i])
                    i += 1
            text = "%s\n" % ("".join(parts))
            txtlength = len(text)
            j = 0
            txtflag = 1
            # Index of the delimiter preceding the current word, so that
            # j - prevind - 1 is the word length.  -1 keeps the first
            # character of the text as part of the first word.
            prevind = -1
            seen_category = 0
            while j < txtlength:
                if text.startswith("[[", j):
                    j += 2
                    if text.startswith("Cate", j):
                        catflag = 1
                        seen_category = 1
                        j += 9  # skip "Category:"
                    else:
                        # The first ordinary link after the category block
                        # ends the scan of this page.
                        if seen_category:
                            break
                        outflag = 1
                    txtflag = 0
                    fields = _active_fields(infoflag, catflag, outflag, 0)
                    term = ''
                    # Bounded so an unterminated link cannot index past the
                    # end of the text.
                    while j < txtlength:
                        closing = text.startswith("]", j)
                        lower = text[j].lower()
                        if lower < 'a' or lower > 'z':
                            if len(term) > 2 and term not in stop_list:
                                _add_posting(pagetrie, porter2.stem(term),
                                             docId, fields)
                            term = ''
                        else:
                            term += lower
                        if closing:
                            j += 2
                            break
                        j += 1
                    outflag = 0
                    catflag = 0
                    if not infoflag:
                        txtflag = 1
                elif text.startswith("{{", j):
                    j += 2
                    if text.startswith("Info", j):
                        infoflag = 1
                        txtflag = 0
                        j += 7  # skip "Infobox"
                    else:
                        # Other templates are skipped up to the first '}' or
                        # the end of the line.
                        while j < txtlength:
                            if text[j] == "}" or text[j] == '\n':
                                j += 2
                                break
                            j += 1
                elif text.startswith("}}\n", j):
                    infoflag = 0
                    txtflag = 1
                    j += 3
                else:
                    lower = text[j].lower()
                    if lower < 'a' or lower > 'z':
                        if j - prevind > 3:
                            term = text[prevind + 1:j].lower()
                            if term not in stop_list:
                                fields = _active_fields(infoflag, catflag,
                                                        outflag, txtflag)
                                _add_posting(pagetrie, porter2.stem(term),
                                             docId, fields)
                        prevind = j
                    j += 1
                    continue
                # Markup was consumed and j now points at the next unread
                # character, so the "delimiter" is the one just before it.
                prevind = j - 1
        i += 1
    print("proc%d" % (file))
    printTrie(file, pagetrie)
    pagetrie.clear()
    return count
Пример #8
0
def mergeIndex(index_file,n,totaldoc):
    """Merge n temporary index files into ``index_file`` using a heap.

    The inputs are named ``temp1``..``temp<n>`` (``subtemp1``.. when n == 4)
    and are read in buffered chunks.  Lines have the form
    ``term#f1#f2#f3#f4#f5#postings``; lines for the same term are combined by
    adding the five counters and merging the postings with ``merge``.

    Args:
        index_file: path of the merged index to write.
        n: number of temporary files to merge.
        totaldoc: total document count; written as the first line unless
            n == 4, and used for the frequency threshold.

    NOTE(review): the visible part of this function ends at the
    "Removing temp files" message; any clean-up that follows is not shown.
    """
    print "Merging Indexes"
    # Terms whose summed counters reach 80% of the documents lose their postings.
    freqthreshold=0.8*totaldoc
    iters=[]   # per file: position of the next unread line in its buffer
    seeks=[]   # per file: file offset where the next buffer starts
    terms=[]   # heap of parsed lines, one pending line per active file
    index=open(index_file,"w")
    if n!=4:
        index.write(base.decimaltobase62(totaldoc)+"\n")
    index.close()
    bufsize=Memory/(2*(n+1))
    l=[]
    for i in range(0,n,1):
        name="temp%d"%(i+1)
        if n==4: name="sub"+name
        #print name,i+1
        f=open(name,"r")
        # readlines(bufsize) treats bufsize as a size hint, not a line count.
        l.append(f.readlines(bufsize))
        #print i,len(l[i])
        #if len(l[i])==0:
        #    print i+1
        if len(l[i]):
            # Split into at most 7 fields; empty fields become '0' so they can
            # be converted with int() later.
            x=l[i][0].strip("\n").split('#',6)
            x=map(lambda p: '0' if p=='' else p,x)
            seeks.append(f.tell())
            iters.append(1)
            # The source file number is appended as the last element (t[7]).
            x.append(i)
            heapq.heappush(terms,x)
        else:
            iters.append(0)
            seeks.append(f.tell())
        f.close()
    #print terms,len(l[0])
    # Accumulates the terms currently being combined before they are flushed.
    termtrie=pytrie.SortedStringTrie()
    print terms
    text=[]
    # Number of input files that still have unread data.
    count=n
    while len(terms)>0:
        t=heapq.heappop(terms)
        #print t
        if termtrie.has_key(t[0]):
            # Same term seen before: add the five counters element-wise.
            value=termtrie[t[0]]
            items=[int(t[i+1])+value[i] for i in xrange(5)]
            if sum(items)<freqthreshold:
                items.append(merge(t[6],value[5]))
            else:
                # Too frequent: the empty posting string excludes the term
                # from the output written below.
                items.append("")
            termtrie[t[0]]=items
        else:
			items=map(lambda x:int(x),t[1:-2])
			items.append(t[6])
			termtrie[t[0]]=items

        # Flush the accumulated terms once more than n of them are held.
        if len(termtrie)>n:
            text.append("\n".join([ "#".join([key,"#".join(map(lambda x: base.decimaltobase62(x) if x>0 else '',value[:-1])),value[5]]) for key,value in termtrie.iteritems() if value[5]!=''])+"\n")
            termtrie.clear()
    
        # NOTE(review): __sizeof__ measures the list object only, not the
        # strings it holds.
        if text.__sizeof__()>bufsize:
            print "Index transfer to file"
            index=open(index_file,"a")
            index.writelines(text)
            index.close()
            text=[]
        
        # Fetch the next line from the file the popped entry came from.
        if iters[t[7]]<len(l[t[7]]):
            line=l[t[7]][iters[t[7]]]
            iters[t[7]]+=1
        else:
            # Buffer exhausted: reopen the file at the saved offset.
            line=''
            m=t[7]
            name="temp%d"%(m+1)
            if n==4: name="sub"+name
            f=open(name,"r")
            f.seek(seeks[m])
            if count==1:
                # Only one file is still active: flush everything and copy the
                # rest of that file to the index unchanged.
                if len(termtrie)>0: text.append("\n".join([ "#".join([key,"#".join(map(lambda x: base.decimaltobase62(x) if x>0 else '',value[:-1])),value[5]]) for key,value in termtrie.iteritems() if value[5]!=''])+"\n")
                termtrie.clear()
                index=open(index_file,"a")
                index.writelines(text)
                text=[]
                while 1:
                    line=f.read(bufsize)
                    if len(line)==0: break
                    index.write(line)
                # NOTE(review): the close() below is unreachable because it
                # follows `break`; `f` is not closed on this path either.
                break
                index.close()
            l[m]=f.readlines(bufsize)
            if len(l[m]):
                line=l[m][0]
                iters[m]=1
                seeks[m]=f.tell()
            else:
                # This file is finished.
                count-=1
            f.close()
        if len(line)>0:
            x=line.strip("\n").split('#',6)
            x=map(lambda p: '0' if p=='' else p,x)
            x.append(t[7])
            heapq.heappush(terms,x)

    # Write whatever is still pending after the heap is empty.
    if len(termtrie)>0 or len(text)>0:
        index=open(index_file,"a")
        text.append("\n".join([ "#".join([key,"#".join(map(lambda x: base.decimaltobase62(x) if x>0 else '',value[:-1])),value[5]]) for key,value in termtrie.iteritems() if value[5]!=''])+"\n")
        termtrie.clear()
        index.writelines(text)
        index.close()

    terms=[]
    # Closes the most recently opened index handle (a no-op if it is already closed).
    index.close()
    termtrie.clear()
    print "Removing temp files"