def __init__(self):
    """Initialise empty index state and load the term and URL tables.

    Side effect: the process working directory is changed to
    ``fstd.rootpath + 'file'``.
    """
    # Index containers start out empty.
    self.mainindex = dict()
    self.index = dict()

    # Term dictionary and segmenter.
    self.termdict = self.loadTermfile()
    self.mmseg = fmmseg.fmmseg()

    self.k, self.pos = 500, 0
    self.filename = list()

    # The URL table is loaded before changing directory, as in the
    # original statement order.
    url_table_path = fstd.rootpath + 'file/url'
    self.fhashkeyToid = self.loadUrlfile(url_table_path)

    working_dir = fstd.rootpath + 'file'
    os.chdir(working_dir)

    self.fstop = fstopword.fstopword()
def __init__(self):
    """Initialise empty index state and load the term and URL tables.

    Side effect: the process working directory is changed to
    ``fstd.rootpath + 'file'``.
    """
    # Index containers start out empty.
    self.mainindex = dict()
    self.index = dict()

    # Term dictionary and segmenter.
    self.termdict = self.loadTermfile()
    self.mmseg = fmmseg.fmmseg()

    self.k, self.pos = 500, 0
    self.filename = list()

    # The URL table is loaded before changing directory, as in the
    # original statement order.
    url_table_path = fstd.rootpath + 'file/url'
    self.fhashkeyToid = self.loadUrlfile(url_table_path)

    working_dir = fstd.rootpath + 'file'
    os.chdir(working_dir)

    self.fstop = fstopword.fstopword()
def getDocList(self, query):
    """Return the index entry stored for the term ``query``.

    The main index file and the URL file are reloaded on every call, and a
    fresh ``fmmseg`` term dictionary is loaded to translate ``query`` into
    a term id. The term id is printed as diagnostic output.

    Args:
        query: Term to look up; it must be an exact key of the term
            dictionary.

    Returns:
        ``self.index[termid]`` for the term, or an empty list when the term
        is not in the dictionary or has no entry in the index.
    """
    self.loadIndexFile(fstd.rootpath + 'file/index.main')
    self.loadUrlfile(fstd.rootpath + 'file/url')
    fm = fmmseg.fmmseg()
    fm.loadTermfile()

    if query not in fm.termdict:
        return []

    termid = int(fm.termdict[query])
    print(termid)
    # A term can be in the dictionary without an index entry; treat that
    # like an unknown term instead of raising KeyError.
    return self.index.get(termid, [])
def getDocList(self, query):
    """Return the index entry stored for the term ``query``.

    The main index file and the URL file are reloaded on every call, and a
    fresh ``fmmseg`` term dictionary is loaded to translate ``query`` into
    a term id. The term id is printed as diagnostic output.

    Args:
        query: Term to look up; it must be an exact key of the term
            dictionary.

    Returns:
        ``self.index[termid]`` for the term, or an empty list when the term
        is not in the dictionary or has no entry in the index.
    """
    self.loadIndexFile(fstd.rootpath + 'file/index.main')
    self.loadUrlfile(fstd.rootpath + 'file/url')
    fm = fmmseg.fmmseg()
    fm.loadTermfile()

    if query not in fm.termdict:
        return []

    termid = int(fm.termdict[query])
    print(termid)
    # A term can be in the dictionary without an index entry; treat that
    # like an unknown term instead of raising KeyError.
    return self.index.get(termid, [])
def creatIndex(self):
    """Build the inverted index from the ``.tmp`` files and save it.

    Every entry of ``fstd.rootpath + 'file'`` whose name contains ``.tmp``
    is read as lines of ``###``-terminated terms. The part of the file name
    before ``.tmp`` is looked up in ``self.fhashkeyToid`` to get the
    document id; files whose key is unknown are skipped. Each term found in
    the ``fmmseg`` term dictionary adds the document id to
    ``self.index[term_id]``. Parsing of a line stops at the first empty
    term, and text after the last ``###`` is ignored.

    The index is then written to ``file/termid``, one line per term, as
    ``termid###docid###docid###``.

    Side effects: replaces ``self.fhashkeyToid``, updates ``self.index``,
    prints each processed file name and changes the working directory to
    ``fstd.rootpath + 'file'``.
    """
    self.fhashkeyToid = finverted.loadUrlfile(fstd.rootpath + 'file/url')
    fm = fmmseg.fmmseg()
    fm.loadTermfile()

    # The directory listing must be taken explicitly; the loop below
    # depends on it.
    filenames = os.listdir(fstd.rootpath + 'file')
    os.chdir(fstd.rootpath + 'file')

    for filename in filenames:
        fpos = filename.find('.tmp')
        if fpos == -1:
            continue
        print(filename)

        # Resolve the document id before opening the file so that skipped
        # files never leave an open handle behind.
        hashkey = filename[:fpos]
        if hashkey not in self.fhashkeyToid:
            continue
        docid = self.fhashkeyToid[hashkey]

        with open(filename, 'r') as fp:
            for line in fp:
                # rstrip keeps the last character of a final line that has
                # no trailing newline (slicing at find('\n') == -1 dropped
                # it).
                terms = line.rstrip('\n').split('###')
                # The piece after the last '###' is not a terminated term.
                for term in terms[:-1]:
                    if term == '':
                        break
                    if term not in fm.termdict:
                        continue
                    term_id = fm.termdict[term]
                    if term_id not in self.index:
                        self.index[term_id] = set()
                    self.index[term_id].add(docid)

    with open(fstd.rootpath + 'file/termid', 'w') as fout:
        for term_id, docids in self.index.items():
            fields = [str(term_id)]
            fields.extend(str(docid) for docid in docids)
            fout.write('###'.join(fields) + '###' + '\n')
def query(self, q):
    """Print and return the text of every document indexed under ``q``.

    The main index file and the URL file are reloaded on every call, and a
    fresh ``fmmseg`` term dictionary translates ``q`` into a term id. For
    each document id in the index entry, the file
    ``fstd.rootpath + 'file/' + self.docidTohash[docid] + '.text'`` is read
    and its content printed after a separator line.

    Args:
        q: Term to look up; it must be an exact key of the term dictionary.

    Returns:
        A list with the content of each matching document, in index
        iteration order. It is empty when the term is unknown or has no
        index entry. (The list used to be built and then discarded.)
    """
    self.loadIndexFile(fstd.rootpath + 'file/index.main')
    self.loadUrlfile(fstd.rootpath + 'file/url')
    fm = fmmseg.fmmseg()
    fm.loadTermfile()

    contents = []
    if q not in fm.termdict:
        return contents

    termid = int(fm.termdict[q])
    print(termid)
    # A dictionary term without an index entry yields no documents instead
    # of a KeyError.
    docs = self.index.get(termid, [])
    print(docs)

    for docid in docs:
        # presumably self.docidTohash is filled by loadUrlfile -- confirm
        path = fstd.rootpath + 'file/' + self.docidTohash[docid] + '.text'
        # 'with' closes the file even if read() raises.
        with open(path) as fp:
            content = fp.read()
        print('----------------------------------------------------')
        print(content)
        contents.append(content)
    return contents
def query(self, q):
    """Print and return the text of every document indexed under ``q``.

    The main index file and the URL file are reloaded on every call, and a
    fresh ``fmmseg`` term dictionary translates ``q`` into a term id. For
    each document id in the index entry, the file
    ``fstd.rootpath + 'file/' + self.docidTohash[docid] + '.text'`` is read
    and its content printed after a separator line.

    Args:
        q: Term to look up; it must be an exact key of the term dictionary.

    Returns:
        A list with the content of each matching document, in index
        iteration order. It is empty when the term is unknown or has no
        index entry. (The list used to be built and then discarded.)
    """
    self.loadIndexFile(fstd.rootpath + 'file/index.main')
    self.loadUrlfile(fstd.rootpath + 'file/url')
    fm = fmmseg.fmmseg()
    fm.loadTermfile()

    contents = []
    if q not in fm.termdict:
        return contents

    termid = int(fm.termdict[q])
    print(termid)
    # A dictionary term without an index entry yields no documents instead
    # of a KeyError.
    docs = self.index.get(termid, [])
    print(docs)

    for docid in docs:
        # presumably self.docidTohash is filled by loadUrlfile -- confirm
        path = fstd.rootpath + 'file/' + self.docidTohash[docid] + '.text'
        # 'with' closes the file even if read() raises.
        with open(path) as fp:
            content = fp.read()
        print('----------------------------------------------------')
        print(content)
        contents.append(content)
    return contents
def tidyTextfile():
    """Translate every ``.tmp`` term file into a ``.term`` file of term ids.

    Each entry of ``fstd.rootpath + 'file'`` whose name contains ``.tmp`` is
    read as lines of ``###``-terminated terms. For every non-blank input
    line, one output line is written holding the id of each term found in
    the ``fmmseg`` term dictionary, each id followed by ``#``. Parsing of a
    line stops at the first empty term, and text after the last ``###`` is
    ignored. The output file is named after the part of the input name
    before ``.tmp`` with ``.term`` appended.

    Side effects: changes the working directory to
    ``fstd.rootpath + 'file'`` and prints diagnostic output for each line,
    term and id.
    """
    fm = fmmseg.fmmseg()
    fm.loadTermfile()

    # 'rootpath' is the attribute used by the rest of this file; this
    # function previously read the misspelled 'rootpat'.
    filenames = os.listdir(fstd.rootpath + 'file')
    os.chdir(fstd.rootpath + 'file')

    for filename in filenames:
        ext_pos = filename.find('.tmp')
        if ext_pos == -1:
            continue

        # Cut the name at the same '.tmp' position used for filtering, so a
        # name that merely contains 'tmp' earlier is not truncated there.
        out_name = filename[:ext_pos] + '.term'

        with open(filename, 'r') as fp:
            with open(out_name, 'w') as fout:
                # NOTE(review): assumes one output line per non-blank input
                # line -- confirm against the original layout.
                for line in fp:
                    if line == '\n':
                        continue
                    # rstrip keeps the last character of a final line that
                    # has no trailing newline.
                    s = line.rstrip('\n')
                    print('%s %s' % (s, filename))

                    ids = []
                    # The piece after the last '###' is not a terminated
                    # term.
                    for term in s.split('###')[:-1]:
                        if term == '':
                            break
                        print('%s %s' % (term, len(term)))
                        if term not in fm.termdict:
                            continue
                        term_id = fm.termdict[term]
                        ids.append(str(term_id) + '#')
                        print(term_id)
                    fout.write(''.join(ids) + '\n')
def tidyTextfile():
    """Translate every ``.tmp`` term file into a ``.term`` file of term ids.

    Each entry of ``fstd.rootpath + 'file'`` whose name contains ``.tmp`` is
    read as lines of ``###``-terminated terms. For every non-blank input
    line, one output line is written holding the id of each term found in
    the ``fmmseg`` term dictionary, each id followed by ``#``. Parsing of a
    line stops at the first empty term, and text after the last ``###`` is
    ignored. The output file is named after the part of the input name
    before ``.tmp`` with ``.term`` appended.

    Side effects: changes the working directory to
    ``fstd.rootpath + 'file'`` and prints diagnostic output for each line,
    term and id.
    """
    fm = fmmseg.fmmseg()
    fm.loadTermfile()

    # 'rootpath' is the attribute used by the rest of this file; this
    # function previously read the misspelled 'rootpat'.
    filenames = os.listdir(fstd.rootpath + 'file')
    os.chdir(fstd.rootpath + 'file')

    for filename in filenames:
        ext_pos = filename.find('.tmp')
        if ext_pos == -1:
            continue

        # Cut the name at the same '.tmp' position used for filtering, so a
        # name that merely contains 'tmp' earlier is not truncated there.
        out_name = filename[:ext_pos] + '.term'

        with open(filename, 'r') as fp:
            with open(out_name, 'w') as fout:
                # NOTE(review): assumes one output line per non-blank input
                # line -- confirm against the original layout.
                for line in fp:
                    if line == '\n':
                        continue
                    # rstrip keeps the last character of a final line that
                    # has no trailing newline.
                    s = line.rstrip('\n')
                    print('%s %s' % (s, filename))

                    ids = []
                    # The piece after the last '###' is not a terminated
                    # term.
                    for term in s.split('###')[:-1]:
                        if term == '':
                            break
                        print('%s %s' % (term, len(term)))
                        if term not in fm.termdict:
                            continue
                        term_id = fm.termdict[term]
                        ids.append(str(term_id) + '#')
                        print(term_id)
                    fout.write(''.join(ids) + '\n')
def MergeIndex(self):
    """Merge the documents listed in ``file/newurl`` into the saved index.

    Steps:
      1. Reload ``self.fhashkeyToid`` and read the existing postings from
         ``file/termid`` (lines of ``termid###docid###docid###``) into
         ``self.index``.
      2. Segment ``<key>.text`` for every key listed in ``file/newurl`` and
         register the key in ``self.fhashkeyToid``.
      3. Read each ``<key>.tmp`` file as lines of ``###``-terminated terms
         and add the document id to ``self.index`` for every term found in
         the ``fmmseg`` term dictionary.
      4. Write the merged index back to ``file/termid``.

    Side effects: changes the working directory to
    ``fstd.rootpath + 'file'`` and prints progress messages.
    """
    self.fhashkeyToid = finverted.loadUrlfile(fstd.rootpath + 'file/url')

    # Load the existing postings.
    with open(fstd.rootpath + 'file/termid', 'r') as fp:
        for line in fp:
            if line.strip() == '':
                # A blank line would otherwise crash int('').
                continue
            fields = line.rstrip('\n').split('###')
            termid = int(fields[0])
            postings = self.index[termid] = set()
            # The piece after the last '###' is not a terminated id.
            for field in fields[1:-1]:
                docid = int(field)
                postings.add(docid)

    # Segment the new files.
    os.chdir(fstd.rootpath + 'file')
    fm = fmmseg.fmmseg()
    fm.loadTermfile()
    with open(fstd.rootpath + 'file/newurl', 'r') as furl:
        # rstrip keeps the last character of a final line that has no
        # trailing newline (slicing at find('\n') == -1 dropped it).
        new_keys = [line.rstrip('\n') for line in furl]

    for url in new_keys:
        fm.segmentAFile(url + '.text')
        # NOTE(review): 'docid' here is whatever id was parsed last from
        # file/termid above, so every new key gets that same, already used
        # id (and this raises NameError if no id was parsed). Behaviour is
        # kept because the intended id scheme is not visible here --
        # confirm and allocate fresh ids.
        self.fhashkeyToid[url] = docid
    # NOTE(review): assumed to run once after all new files have been
    # segmented -- confirm against the original layout.
    fm.mergeTermJieba()

    # Index the new files.
    for url in new_keys:
        filename = url + '.tmp'
        print(filename)
        hashkey = filename[:filename.find('.tmp')]
        # Check the key before opening the file so a skipped file never
        # leaves an open handle behind.
        if hashkey not in self.fhashkeyToid:
            print("-----> " + hashkey + "not in")
            continue
        docid = self.fhashkeyToid[hashkey]

        with open(filename, 'r') as fp:
            for line in fp:
                # Parsing stops at the first empty term; the piece after
                # the last '###' is ignored.
                for term in line.rstrip('\n').split('###')[:-1]:
                    if term == '':
                        break
                    if term not in fm.termdict:
                        continue
                    term_id = fm.termdict[term]
                    if term_id not in self.index:
                        self.index[term_id] = set()
                    self.index[term_id].add(docid)

    print('索引建好了')

    with open(fstd.rootpath + 'file/termid', 'w') as fout:
        for term_id, docids in self.index.items():
            fields = [str(term_id)]
            fields.extend(str(d) for d in docids)
            fout.write('###'.join(fields) + '###' + '\n')