示例#1
0
    def __init__(self):
        """Initialise the index state and load the supporting resources.

        Besides setting attributes, this constructor calls ``loadTermfile``
        and ``loadUrlfile`` (which presumably read from disk) and changes
        the process working directory to ``fstd.rootpath + 'file'``.
        """
        # Both index containers start out empty.
        self.mainindex = {}
        self.index = {}
        # Term dictionary exactly as returned by this object's loadTermfile().
        self.termdict = self.loadTermfile()
        # Instance of the project's fmmseg.fmmseg class
        # (presumably the word segmenter -- TODO confirm).
        self.mmseg = fmmseg.fmmseg()
        # NOTE(review): the meaning of k, pos and filename is not visible in
        # this block -- confirm against the methods that read them.
        self.k = 500
        self.pos = 0
        self.filename = []
        # Value returned by loadUrlfile() for the URL file; presumably a
        # hash-key -> doc-id mapping, judging by the attribute name.
        self.fhashkeyToid = self.loadUrlfile(fstd.rootpath + 'file/url')
        # Side effect: relative paths used after this point resolve inside
        # the data directory.
        os.chdir(fstd.rootpath + 'file')

        # Instance of the project's fstopword.fstopword class.
        self.fstop = fstopword.fstopword()
示例#2
0
 def __init__(self):
     """Initialise the index state and load the supporting resources.

     Besides setting attributes, this constructor calls ``loadTermfile``
     and ``loadUrlfile`` (which presumably read from disk) and changes
     the process working directory to ``fstd.rootpath+'file'``.
     """
     # Both index containers start out empty.
     self.mainindex = {}
     self.index = {}
     # Term dictionary exactly as returned by this object's loadTermfile().
     self.termdict = self.loadTermfile()
     # Instance of the project's fmmseg.fmmseg class
     # (presumably the word segmenter -- TODO confirm).
     self.mmseg = fmmseg.fmmseg()  
     # NOTE(review): the meaning of k, pos and filename is not visible in
     # this block -- confirm against the methods that read them.
     self.k = 500
     self.pos = 0
     self.filename = []
     # Value returned by loadUrlfile() for the URL file; presumably a
     # hash-key -> doc-id mapping, judging by the attribute name.
     self.fhashkeyToid = self.loadUrlfile(fstd.rootpath+'file/url')
     # Side effect: relative paths used after this point resolve inside
     # the data directory.
     os.chdir(fstd.rootpath+'file')
     
     # Instance of the project's fstopword.fstopword class.
     self.fstop = fstopword.fstopword()
示例#3
0
    def getDocList(self,query):
        """Return the entry of ``self.index`` stored for the term ``query``.

        The index file and URL file are (re)loaded on every call, and a
        fresh ``fmmseg`` instance supplies the term dictionary.  The term
        id is printed as a trace.

        Returns an empty list when ``query`` is not in the term dictionary;
        otherwise ``self.index[termid]`` (raises ``KeyError`` if the term id
        has no entry in the index).
        """
        self.loadIndexFile(fstd.rootpath+'file/index.main')
        self.loadUrlfile(fstd.rootpath+'file/url')
        segmenter = fmmseg.fmmseg()
        segmenter.loadTermfile()

        # Guard clause: a term missing from the dictionary has no documents.
        if query not in segmenter.termdict:
            return []

        term_id = int(segmenter.termdict[query])
        print(term_id)
        return self.index[term_id]
示例#4
0
    def getDocList(self, query):
        """Look up the documents recorded in ``self.index`` for ``query``.

        Reloads the index file and the URL file, builds a fresh ``fmmseg``
        instance to obtain the term dictionary, and prints the resolved
        term id as a trace.

        Returns ``[]`` for a term that is not in the dictionary, else
        ``self.index[termid]`` (``KeyError`` if the index has no such id).
        """
        self.loadIndexFile(fstd.rootpath + 'file/index.main')
        self.loadUrlfile(fstd.rootpath + 'file/url')
        term_source = fmmseg.fmmseg()
        term_source.loadTermfile()

        known = query in term_source.termdict
        if not known:
            # Nothing indexed under an unknown term.
            return []

        key = int(term_source.termdict[query])
        print(key)
        matches = self.index[key]
        return matches
示例#5
0
 def creatIndex(self):
     """Build ``self.index`` from the ``*.tmp`` files and save it to disk.

     Every file in ``fstd.rootpath+'file'`` whose name contains ``.tmp``
     is read; the part of the name before ``.tmp`` is the hash key used to
     look up the document id in ``self.fhashkeyToid``.  Each line holds
     terms terminated by ``###``; every term found in ``fm.termdict`` adds
     the document id to that term's posting set.

     The result is written to ``file/termid``, one line per term in the
     form ``termid###docid###docid###``.

     Side effect: the working directory becomes ``fstd.rootpath+'file'``.
     """
     self.fhashkeyToid = finverted.loadUrlfile(fstd.rootpath+'file/url')
     fm = fmmseg.fmmseg()
     fm.loadTermfile()

     filenames = os.listdir(fstd.rootpath+'file')
     os.chdir(fstd.rootpath+'file')
     for filename in filenames:
         fpos = filename.find('.tmp')
         if fpos == -1:
             continue
         print(filename)
         hashkey = filename[:fpos]
         # Check the mapping before opening the file so that skipped files
         # never leave an open handle behind.
         if hashkey not in self.fhashkeyToid:
             continue
         docid = self.fhashkeyToid[hashkey]
         with open(filename, 'r') as fp:
             for each in fp:
                 # rstrip (rather than slicing at find('\n')) keeps the
                 # last character of a final line that has no newline.
                 fields = each.rstrip('\n').split('###')
                 # Only fields terminated by '###' are terms, so the
                 # trailing remainder is dropped.
                 for term in fields[:-1]:
                     if term == '':
                         # An empty term ends the line, as before.
                         break
                     if term not in fm.termdict:
                         continue
                     termid = fm.termdict[term]
                     # Test the dict directly: .keys() builds a list in
                     # Python 2 and makes this check linear.
                     if termid not in self.index:
                         self.index[termid] = set()
                     self.index[termid].add(docid)

     with open(fstd.rootpath+'file/termid', 'w') as fout:
         for termid in self.index:
             fields = [str(termid)]
             fields.extend(str(docid) for docid in self.index[termid])
             fout.write('###'.join(fields) + '###\n')
示例#6
0
    def query(self,q):
        """Print and collect the text of every document indexed under ``q``.

        The index file and URL file are (re)loaded on every call and a fresh
        ``fmmseg`` instance supplies the term dictionary.  For a known term
        the term id and its document ids are printed, then each document's
        ``.text`` file (named via ``self.docidTohash``) is read and printed
        after a separator line.

        Returns the list of document texts (empty when ``q`` is not in the
        term dictionary).  The original code built this list but never
        returned it.
        """
        self.loadIndexFile(fstd.rootpath+'file/index.main')
        self.loadUrlfile(fstd.rootpath+'file/url')
        fm = fmmseg.fmmseg()
        fm.loadTermfile()
        # Named `contents` instead of `re` to avoid shadowing the stdlib module.
        contents = []
        if q in fm.termdict:
            termid = int(fm.termdict[q])
            print(termid)
            docs = self.index[termid]
            print(docs)
            for docid in docs:
                path = fstd.rootpath + 'file/'+self.docidTohash[docid]+'.text'
                # `with` closes the file even if read() raises.
                with open(path) as fp:
                    c = fp.read()
                print('----------------------------------------------------')
                print(c)
                contents.append(c)
        return contents
示例#7
0
 def query(self, q):
     """Print and collect the text of every document indexed under ``q``.

     The index file and URL file are (re)loaded on every call and a fresh
     ``fmmseg`` instance supplies the term dictionary.  For a known term
     the term id and its document ids are printed, then each document's
     ``.text`` file (named via ``self.docidTohash``) is read and printed
     after a separator line.

     Returns the list of document texts (empty when ``q`` is not in the
     term dictionary).  The original code built this list but never
     returned it.
     """
     self.loadIndexFile(fstd.rootpath + 'file/index.main')
     self.loadUrlfile(fstd.rootpath + 'file/url')
     fm = fmmseg.fmmseg()
     fm.loadTermfile()
     # Named `contents` instead of `re` to avoid shadowing the stdlib module.
     contents = []
     if q in fm.termdict:
         termid = int(fm.termdict[q])
         print(termid)
         docs = self.index[termid]
         print(docs)
         for docid in docs:
             path = (fstd.rootpath + 'file/' + self.docidTohash[docid] +
                     '.text')
             # `with` closes the file even if read() raises.
             with open(path) as fp:
                 c = fp.read()
             print('----------------------------------------------------')
             print(c)
             contents.append(c)
     return contents
示例#8
0
def tidyTextfile():
    """Convert every ``*.tmp`` file in the data directory to a ``*.term`` file.

    Each non-blank input line holds terms terminated by ``###``.  For every
    term found in ``fm.termdict`` the mapped id followed by ``#`` is emitted;
    each input line becomes one output line.  Blank input lines are skipped.
    Progress is traced with ``print`` exactly as before.

    Side effect: the working directory becomes ``fstd.rootpath + "file"``.
    """
    fm = fmmseg.fmmseg()
    fm.loadTermfile()
    # `rootpath` is the attribute used everywhere else in this code base;
    # the previous `rootpat` spelling looks like a typo.
    data_dir = fstd.rootpath + "file"
    filenames = os.listdir(data_dir)
    os.chdir(data_dir)
    for filename in filenames:
        ext_pos = filename.find(".tmp")
        if ext_pos == -1:
            continue
        out_lines = []
        with open(filename, "r") as fp:
            for each in fp:
                if each == "\n":
                    continue
                # rstrip keeps the last character of a final line that has
                # no trailing newline (slicing at find("\n") == -1 did not).
                s = each.rstrip("\n")
                print("%s %s" % (s, filename))
                ids = []
                # Only fields terminated by "###" are terms; drop the tail.
                for term in s.split("###")[:-1]:
                    if term == "":
                        # An empty term ends the line, as before.
                        break
                    # The second value is the offset of the delimiter in
                    # the remaining text, i.e. the term's length.
                    print("%s %s" % (term, len(term)))
                    if term not in fm.termdict:
                        continue
                    termid = fm.termdict[term]
                    ids.append(str(termid) + "#")
                    print(termid)
                out_lines.append("".join(ids) + "\n")

        # Derive the output name from the ".tmp" position so an earlier
        # "tmp" inside the name cannot truncate it.
        with open(filename[:ext_pos] + ".term", "w") as fout:
            fout.write("".join(out_lines))
示例#9
0
def tidyTextfile():
    """Convert every ``*.tmp`` file in the data directory to a ``*.term`` file.

    Each non-blank input line holds terms terminated by ``###``.  For every
    term found in ``fm.termdict`` the mapped id followed by ``#`` is emitted;
    each input line becomes one output line.  Blank input lines are skipped.
    Progress is traced with ``print`` exactly as before.

    Side effect: the working directory becomes ``fstd.rootpath+'file'``.
    """
    fm = fmmseg.fmmseg()
    fm.loadTermfile()
    # `rootpath` is the attribute used everywhere else in this code base;
    # the previous `rootpat` spelling looks like a typo.
    data_dir = fstd.rootpath+'file'
    filenames = os.listdir(data_dir)
    os.chdir(data_dir)
    for filename in filenames:
        ext_pos = filename.find('.tmp')
        if ext_pos == -1:
            continue
        out_lines = []
        with open(filename,'r') as fp:
            for each in fp:
                if each == '\n':
                    continue
                # rstrip keeps the last character of a final line that has
                # no trailing newline (slicing at find('\n') == -1 did not).
                s = each.rstrip('\n')
                print('%s %s' % (s, filename))
                ids = []
                # Only fields terminated by '###' are terms; drop the tail.
                for term in s.split('###')[:-1]:
                    if term == '':
                        # An empty term ends the line, as before.
                        break
                    # The second value is the offset of the delimiter in
                    # the remaining text, i.e. the term's length.
                    print('%s %s' % (term, len(term)))
                    if term not in fm.termdict:
                        continue
                    termid = fm.termdict[term]
                    ids.append(str(termid) + '#')
                    print(termid)
                out_lines.append(''.join(ids) + '\n')

        # Derive the output name from the '.tmp' position so an earlier
        # 'tmp' inside the name cannot truncate it.
        with open(filename[:ext_pos]+'.term','w') as fout:
            fout.write(''.join(out_lines))
示例#10
0
    def MergeIndex(self):
        """Merge the documents listed in ``file/newurl`` into the saved index.

        Steps:
          1. Reload the URL mapping and the postings saved in ``file/termid``
             (one line per term: ``termid###docid###docid###``).
          2. For every hash key in ``file/newurl``: segment ``<key>.text``
             with ``fm.segmentAFile`` and make sure the key has a doc id.
          3. Call ``fm.mergeTermJieba()``.
          4. Read every ``<key>.tmp`` file (terms terminated by ``###``) and
             add the document id to the posting set of each known term.
          5. Rewrite ``file/termid`` with the merged postings.

        Side effect: the working directory becomes ``fstd.rootpath+'file'``.
        """
        self.fhashkeyToid = finverted.loadUrlfile(fstd.rootpath+'file/url')

        # Load the existing postings.
        with open(fstd.rootpath+'file/termid','r') as fp:
            for each in fp:
                if not each.strip():
                    continue
                fields = each.rstrip('\n').split('###')
                # First field is the term id; the last field is whatever
                # follows the final '###' and is not a doc id.
                self.index[int(fields[0])] = set(int(f) for f in fields[1:-1])

        # New documents need ids that are not in use yet.  Previously each
        # new URL was given whatever `docid` was left over from parsing the
        # termid file (the same stale id for all of them, or a NameError when
        # that file was empty).  Ids round-trip through str()/int() in the
        # termid file, so int() on the mapping values is expected to be safe.
        # NOTE(review): starting at 0 for an empty index is an assumption --
        # confirm against the id scheme used when the URL file is written.
        used_ids = set(int(v) for v in self.fhashkeyToid.values())
        for postings in self.index.values():
            used_ids.update(postings)
        if used_ids:
            next_docid = max(used_ids) + 1
        else:
            next_docid = 0

        # Segment the new files.
        os.chdir(fstd.rootpath+'file')
        fm = fmmseg.fmmseg()
        fm.loadTermfile()
        with open(fstd.rootpath+'file/newurl','r') as furl:
            # rstrip keeps the last character of a final line that has no
            # trailing newline (slicing at find('\n') == -1 did not).
            new_hashkeys = [line.rstrip('\n') for line in furl]
        for hashkey in new_hashkeys:
            fm.segmentAFile(hashkey+'.text')
            if hashkey not in self.fhashkeyToid:
                self.fhashkeyToid[hashkey] = next_docid
                next_docid += 1

        fm.mergeTermJieba()

        # Index the new files.
        for hashkey in new_hashkeys:
            filename = hashkey+'.tmp'
            print(filename)
            # Every new hash key was registered above, so the lookup cannot
            # miss and no file is opened only to be skipped.
            docid = self.fhashkeyToid[hashkey]
            with open(filename,'r') as fp:
                for each in fp:
                    # Only fields terminated by '###' are terms; drop the tail.
                    for term in each.rstrip('\n').split('###')[:-1]:
                        if term == '':
                            # An empty term ends the line, as before.
                            break
                        if term not in fm.termdict:
                            continue
                        # Postings loaded from disk are keyed by int, so
                        # coerce here to avoid a second key for the same term.
                        termid = int(fm.termdict[term])
                        if termid not in self.index:
                            self.index[termid] = set()
                        self.index[termid].add(docid)

        print('索引建好了')
        with open(fstd.rootpath+'file/termid','w') as fout:
            for termid in self.index:
                fields = [str(termid)]
                fields.extend(str(docid) for docid in self.index[termid])
                fout.write('###'.join(fields) + '###\n')