예제 #1
0
    def __init__(self, si, closeDir=False):
        """Open the readers and streams for the segment described by si.

        si       -- segment info object; its ``dir`` and ``name``
                    attributes are read.
        closeDir -- stored as ``self.closeDirectory``; it is not otherwise
                    used in this method.
        """
        directory = si.dir
        segment = si.name

        self.directory = directory
        self.closeDirectory = closeDir
        self.segment = segment
        self.nrms = {}
        self.deletedDocsDirty = False

        # Field metadata comes first: both readers below are built from it.
        infos = field.FieldInfos(directory, segment + '.fnm')
        self.fieldInfos = infos
        self.fieldsReader = field.FieldsReader(directory, segment, infos)
        self.tis = TermInfosReader(directory, segment, infos)

        # The deletion bits are loaded only when SegmentReader.hasDeletions
        # reports true for this segment; otherwise the attribute is None.
        deleted = None
        if SegmentReader.hasDeletions(si):
            deleted = BitVector(directory, segment + '.del')
        self.deletedDocs = deleted

        # Keep the frequency and position files open so that they remain
        # usable even if a later index update removes them.
        self.freqStream = directory.openFile(segment + '.frq')
        self.proxStream = directory.openFile(segment + '.prx')

        self.openNorms()
예제 #2
0
    def __init__(self, si, closeDir=False):
        """Set up per-segment readers and input streams from ``si``.

        si       -- segment info object; ``si.dir`` and ``si.name`` are read.
        closeDir -- remembered as ``self.closeDirectory``; this method does
                    not act on it.
        """
        seg_dir = si.dir
        seg_name = si.name

        self.directory = seg_dir
        self.closeDirectory = closeDir
        self.segment = seg_name
        self.nrms = {}
        self.deletedDocsDirty = False

        # Build the field metadata once and share it with both readers.
        field_infos = field.FieldInfos(seg_dir, seg_name + ".fnm")
        self.fieldInfos = field_infos
        self.fieldsReader = field.FieldsReader(seg_dir, seg_name, field_infos)
        self.tis = TermInfosReader(seg_dir, seg_name, field_infos)

        # Deletion bits exist only if SegmentReader.hasDeletions says so.
        deleted_bits = None
        if SegmentReader.hasDeletions(si):
            deleted_bits = BitVector(seg_dir, seg_name + ".del")
        self.deletedDocs = deleted_bits

        # Open the remaining index files now so they stay available even
        # if an index update removes them afterwards.
        self.freqStream = seg_dir.openFile(seg_name + ".frq")
        self.proxStream = seg_dir.openFile(seg_name + ".prx")

        self.openNorms()
예제 #3
0
 def doDelete(self, docNum):
     """Flag document ``docNum`` as deleted in this reader.

     A BitVector sized by ``self.maxDoc()`` is created on first use, and
     ``self.deletedDocsDirty`` is set to True.
     """
     bits = self.deletedDocs
     if bits is None:
         # First deletion for this reader: allocate the bit vector lazily.
         bits = BitVector(self.maxDoc())
         self.deletedDocs = bits
     self.deletedDocsDirty = True
     bits.set(docNum)
예제 #4
0
class SegmentReader(IndexReader):
    """Read access to the files of a single index segment.

    The segment's files are looked up in ``si.dir`` under the name
    ``si.name`` plus an extension ('.fnm', '.del', '.frq', '.prx',
    '.f<N>', ...). The reader exposes stored documents, term information,
    deletion bits and per-field norms.
    """

    # Class methods
    def hasDeletions(cls, si):
        """Return whether a '<segment>.del' file exists in ``si.dir``."""
        return si.dir.fileExists(si.name + '.del')

    hasDeletions = classmethod(hasDeletions)

    # instance methods
    def __init__(self, si, closeDir=False):
        """Open the readers and streams for the segment described by si.

        si       -- segment info object; ``si.dir`` and ``si.name`` are read.
        closeDir -- when true, doClose() also closes the directory.
        """
        self.directory = si.dir
        self.closeDirectory = closeDir
        self.segment = si.name
        self.nrms = {}
        self.deletedDocsDirty = False

        self.fieldInfos = field.FieldInfos(self.directory,
                                           self.segment + '.fnm')
        self.fieldsReader = field.FieldsReader(self.directory,
                                               self.segment,
                                               self.fieldInfos)

        self.tis = TermInfosReader(self.directory,
                                   self.segment,
                                   self.fieldInfos)

        if SegmentReader.hasDeletions(si):
            self.deletedDocs = BitVector(self.directory,
                                         self.segment + '.del')
        else:
            self.deletedDocs = None

        # makes sure that all index files have been read or are kept open
        # so that if an index update removes them we'll still have them
        self.freqStream = self.directory.openFile(self.segment + '.frq')
        self.proxStream = self.directory.openFile(self.segment + '.prx')

        self.openNorms()

    def closeNorms(self):
        """Close the input stream of every norm opened by openNorms()."""
        for norm in self.nrms.values():
            norm.inStream.close()

    def docFreq(self, t):
        """Return ``docFreq`` of the term info found for t, or 0 if none."""
        ti = self.tis.getTerm(t)
        if ti is None:
            return 0
        return ti.docFreq

    def doClose(self):
        """Persist pending deletions, then close every resource held.

        Deletions are written to '<segment>.tmp' and renamed onto
        '<segment>.del'. The directory itself is closed only when the
        reader was created with closeDir true.
        """
        if self.deletedDocsDirty:
            self.deletedDocs.write(self.directory, self.segment + ".tmp")
            self.directory.renameFile(self.segment + ".tmp",
                                      self.segment + ".del")
            self.deletedDocsDirty = False

        self.fieldsReader.close()
        self.tis.close()

        if self.freqStream is not None:
            self.freqStream.close()
        if self.proxStream is not None:
            self.proxStream.close()

        self.closeNorms()

        if self.closeDirectory:
            self.directory.close()

    def document(self, n):
        """Return the stored document n.

        Raises Exception if document n is marked as deleted.
        """
        if self.isDeleted(n):
            # Call form of raise: valid on both Python 2 and Python 3.
            raise Exception('attempt to access deleted document')
        return self.fieldsReader.doc(n)

    def doDelete(self, docNum):
        """Mark document docNum as deleted and flag the bits as dirty."""
        if self.deletedDocs is None:
            # First deletion: allocate the bit vector lazily.
            self.deletedDocs = BitVector(self.maxDoc())
        self.deletedDocsDirty = True
        self.deletedDocs.set(docNum)

    def files(self):
        """Return a list of the file names that make up this segment.

        The '.del' file is listed only if it exists in the directory, and
        one '.f<i>' file is listed for every indexed field.
        """
        suffixes = ['.fnm', '.fdx', '.fdt', '.tii', '.tis', '.frq', '.prx']
        # Build a real list: the result is appended to below, which an
        # iterator returned by map() would not support.
        files = [self.segment + suffix for suffix in suffixes]

        if self.directory.fileExists(self.segment + '.del'):
            files.append(self.segment + '.del')

        for i in range(len(self.fieldInfos)):
            fi = self.fieldInfos.fieldInfoInt(i)
            if fi.isIndexed:
                files.append(self.segment + '.f' + str(i))

        return files

    def isDeleted(self, n):
        """Return a true value if document n is marked as deleted."""
        return (self.deletedDocs is not None and self.deletedDocs.get(n))

    def maxDoc(self):
        """Return the size reported by the fields reader."""
        return self.fieldsReader.size()

    def normsField(self, field):
        """Return the norm bytes for field, reading them on first access.

        Returns None when no norm is registered for the field. The bytes
        are cached on the norm object after the first read.
        """
        norm = self.nrms.get(field, None)
        if norm is None:
            return None
        if norm.bytes is None:
            buf = array('B', [0x00] * self.maxDoc())
            self.norms(field, buf, 0)
            norm.bytes = buf

        return norm.bytes

    def norms(self, field, bytes, offset):
        """Read the norms of field into the caller-supplied buffer.

        ``bytes``, ``offset`` and ``maxDoc()`` are handed to the stream's
        readBytes(); nothing happens when the field has no norm stream.
        The stream is always closed afterwards.
        """
        normStream = self.normStream(field)
        if normStream is None:
            return
        try:
            normStream.readBytes(bytes, offset, self.maxDoc())
        finally:
            normStream.close()

    def normStream(self, field):
        """Return a clone of the field's norm stream rewound to 0, or None."""
        norm = self.nrms.get(field, None)
        if norm is None:
            return None
        # NOTE(review): the original author flagged this clone as
        # uncertain ("Cloning????"); behaviour is kept unchanged.
        result = norm.inStream.clone()
        result.seek(0)
        return result

    def numDocs(self):
        """Return maxDoc() minus the number of deleted documents."""
        n = self.maxDoc()
        if self.deletedDocs is not None:
            n -= self.deletedDocs.count()
        return n

    def openNorms(self):
        """Open a '.f<number>' norm file for every indexed field."""
        for i in range(len(self.fieldInfos)):
            fi = self.fieldInfos.fieldInfoInt(i)
            if fi.isIndexed:
                normFile = self.segment + '.f' + str(fi.number)
                self.nrms[fi.name] = Norm(self.directory.openFile(normFile))

    def termDocs(self):
        """Return a new SegmentTermDocs bound to this reader."""
        return SegmentTermDocs(self)

    def termPositions(self):
        """Return a new SegmentTermPositions bound to this reader."""
        return SegmentTermPositions(self)

    def terms(self, t=None):
        """Return the term enumeration from the term infos reader.

        With no argument the reader's terms() is called without a term;
        otherwise t is passed through. The original test was inverted and
        dropped t whenever one was supplied.
        """
        if t is None:
            return self.tis.terms()
        return self.tis.terms(t)

    def fieldNames(self):
        """Return a list of field names, leaving out the unnamed field.

        Experimental, for auto-queries. The order is whatever
        ``fieldInfos.fieldNames()`` yields.
        NOTE(review): the original comment promised a sorted list, but
        nothing here sorts -- confirm against FieldInfos.
        """
        fNames = self.fieldInfos.fieldNames()
        if not fNames:
            return []
        # Filter into a new list instead of calling remove(''): that
        # raised when no unnamed field was present, and it mutated the
        # list handed out by fieldInfos.
        return [name for name in fNames if name != '']
예제 #5
0
 def doDelete(self, docNum):
     """Record the deletion of document ``docNum``.

     Creates the BitVector (sized by ``self.maxDoc()``) if none exists yet
     and sets ``self.deletedDocsDirty`` to True.
     """
     deleted = self.deletedDocs
     if deleted is None:
         # Nothing has been deleted so far; create the bit vector now.
         deleted = BitVector(self.maxDoc())
         self.deletedDocs = deleted
     self.deletedDocsDirty = True
     deleted.set(docNum)
예제 #6
0
class SegmentReader(IndexReader):
    """Read access to the files of a single index segment.

    Files are looked up in ``si.dir`` under ``si.name`` plus an extension
    (".fnm", ".del", ".frq", ".prx", ".f<N>", ...). The reader exposes
    stored documents, term information, deletion bits and per-field norms.
    """

    # Class methods
    def hasDeletions(cls, si):
        """Return whether a "<segment>.del" file exists in ``si.dir``."""
        return si.dir.fileExists(si.name + ".del")

    hasDeletions = classmethod(hasDeletions)

    # instance methods
    def __init__(self, si, closeDir=False):
        """Open the readers and streams for the segment described by si.

        si       -- segment info object; ``si.dir`` and ``si.name`` are read.
        closeDir -- when true, doClose() also closes the directory.
        """
        self.directory = si.dir
        self.closeDirectory = closeDir
        self.segment = si.name
        self.nrms = {}
        self.deletedDocsDirty = False

        self.fieldInfos = field.FieldInfos(self.directory, self.segment + ".fnm")
        self.fieldsReader = field.FieldsReader(self.directory, self.segment, self.fieldInfos)

        self.tis = TermInfosReader(self.directory, self.segment, self.fieldInfos)

        if SegmentReader.hasDeletions(si):
            self.deletedDocs = BitVector(self.directory, self.segment + ".del")
        else:
            self.deletedDocs = None

        # makes sure that all index files have been read or are kept open
        # so that if an index update removes them we'll still have them
        self.freqStream = self.directory.openFile(self.segment + ".frq")
        self.proxStream = self.directory.openFile(self.segment + ".prx")

        self.openNorms()

    def closeNorms(self):
        """Close the input stream of every norm opened by openNorms()."""
        for norm in self.nrms.values():
            norm.inStream.close()

    def docFreq(self, t):
        """Return ``docFreq`` of the term info found for t, or 0 if none."""
        ti = self.tis.getTerm(t)
        if ti is None:
            return 0
        return ti.docFreq

    def doClose(self):
        """Persist pending deletions, then close every resource held.

        Deletions are written to "<segment>.tmp" and renamed onto
        "<segment>.del". The directory itself is closed only when the
        reader was created with closeDir true.
        """
        if self.deletedDocsDirty:
            self.deletedDocs.write(self.directory, self.segment + ".tmp")
            self.directory.renameFile(self.segment + ".tmp", self.segment + ".del")
            self.deletedDocsDirty = False

        self.fieldsReader.close()
        self.tis.close()

        if self.freqStream is not None:
            self.freqStream.close()
        if self.proxStream is not None:
            self.proxStream.close()

        self.closeNorms()

        if self.closeDirectory:
            self.directory.close()

    def document(self, n):
        """Return the stored document n.

        Raises Exception if document n is marked as deleted.
        """
        if self.isDeleted(n):
            # Call form of raise: valid on both Python 2 and Python 3.
            raise Exception("attempt to access deleted document")
        return self.fieldsReader.doc(n)

    def doDelete(self, docNum):
        """Mark document docNum as deleted and flag the bits as dirty."""
        if self.deletedDocs is None:
            # First deletion: allocate the bit vector lazily.
            self.deletedDocs = BitVector(self.maxDoc())
        self.deletedDocsDirty = True
        self.deletedDocs.set(docNum)

    def files(self):
        """Return a list of the file names that make up this segment.

        The ".del" file is listed only if it exists in the directory, and
        one ".f<i>" file is listed for every indexed field.
        """
        suffixes = [".fnm", ".fdx", ".fdt", ".tii", ".tis", ".frq", ".prx"]
        # Build a real list: the result is appended to below, which an
        # iterator returned by map() would not support.
        files = [self.segment + suffix for suffix in suffixes]

        if self.directory.fileExists(self.segment + ".del"):
            files.append(self.segment + ".del")

        for i in range(len(self.fieldInfos)):
            fi = self.fieldInfos.fieldInfoInt(i)
            if fi.isIndexed:
                files.append(self.segment + ".f" + str(i))

        return files

    def isDeleted(self, n):
        """Return a true value if document n is marked as deleted."""
        return self.deletedDocs is not None and self.deletedDocs.get(n)

    def maxDoc(self):
        """Return the size reported by the fields reader."""
        return self.fieldsReader.size()

    def normsField(self, field):
        """Return the norm bytes for field, reading them on first access.

        Returns None when no norm is registered for the field. The bytes
        are cached on the norm object after the first read.
        """
        norm = self.nrms.get(field, None)
        if norm is None:
            return None
        if norm.bytes is None:
            buf = array("B", [0x00] * self.maxDoc())
            self.norms(field, buf, 0)
            norm.bytes = buf

        return norm.bytes

    def norms(self, field, bytes, offset):
        """Read the norms of field into the caller-supplied buffer.

        ``bytes``, ``offset`` and ``maxDoc()`` are handed to the stream's
        readBytes(); nothing happens when the field has no norm stream.
        The stream is always closed afterwards.
        """
        normStream = self.normStream(field)
        if normStream is None:
            return
        try:
            normStream.readBytes(bytes, offset, self.maxDoc())
        finally:
            normStream.close()

    def normStream(self, field):
        """Return a clone of the field's norm stream rewound to 0, or None."""
        norm = self.nrms.get(field, None)
        if norm is None:
            return None
        # NOTE(review): the original author flagged this clone as
        # uncertain ("Cloning????"); behaviour is kept unchanged.
        result = norm.inStream.clone()
        result.seek(0)
        return result

    def numDocs(self):
        """Return maxDoc() minus the number of deleted documents."""
        n = self.maxDoc()
        if self.deletedDocs is not None:
            n -= self.deletedDocs.count()
        return n

    def openNorms(self):
        """Open a ".f<number>" norm file for every indexed field."""
        for i in range(len(self.fieldInfos)):
            fi = self.fieldInfos.fieldInfoInt(i)
            if fi.isIndexed:
                normFile = self.segment + ".f" + str(fi.number)
                self.nrms[fi.name] = Norm(self.directory.openFile(normFile))

    def termDocs(self):
        """Return a new SegmentTermDocs bound to this reader."""
        return SegmentTermDocs(self)

    def termPositions(self):
        """Return a new SegmentTermPositions bound to this reader."""
        return SegmentTermPositions(self)

    def terms(self, t=None):
        """Return the term enumeration from the term infos reader.

        With no argument the reader's terms() is called without a term;
        otherwise t is passed through. The original test was inverted and
        dropped t whenever one was supplied.
        """
        if t is None:
            return self.tis.terms()
        return self.tis.terms(t)

    def fieldNames(self):
        """Return a list of field names, leaving out the unnamed field.

        Experimental, for auto-queries. The order is whatever
        ``fieldInfos.fieldNames()`` yields.
        NOTE(review): the original comment promised a sorted list, but
        nothing here sorts -- confirm against FieldInfos.
        """
        fNames = self.fieldInfos.fieldNames()
        if not fNames:
            return []
        # Filter into a new list instead of calling remove(""): that
        # raised when no unnamed field was present, and it mutated the
        # list handed out by fieldInfos.
        return [name for name in fNames if name != ""]