Example #1
    def __init__(self, ix, postlimit, blocklimit, name=None):
        """
        :param ix: the Index object in which to write the new segment.
        :param postlimit: the maximum size for a run in the posting pool.
        :param blocklimit: the maximum number of postings in a posting block.
        :param name: the name of the segment.
        """

        self.index = ix
        self.schema = ix.schema
        self.storage = storage = ix.storage
        self.name = name or ix._next_segment_name()

        self.max_doc = 0

        self.pool = postpool.PostingPool(postlimit)

        # Create mappings of field numbers to the position of that field in the
        # lists of scorable and stored fields. For example, consider a schema
        # with fields (A, B, C, D, E, F). If B, D, and E are scorable, then the
        # list of scorable fields is (B, D, E). The _scorable_to_pos dictionary
        # would then map B -> 0, D -> 1, and E -> 2.
        self._scorable_to_pos = dict((fnum, i)
                                     for i, fnum
                                     in enumerate(self.schema.scorable_fields()))
        self._stored_to_pos = dict((fnum, i)
                                   for i, fnum
                                   in enumerate(self.schema.stored_fields()))

        # Create a temporary segment object just so we can access its
        # *_filename attributes (so if we want to change the naming convention,
        # we only have to do it in one place).
        tempseg = Segment(self.name, 0, 0, None)
        self.termtable = create_terms(storage, tempseg)
        self.docslist = create_storedfields(storage, tempseg)
        self.doclengths = None
        if self.schema.scorable_fields():
            self.doclengths = create_doclengths(storage, tempseg, len(self._scorable_to_pos))

        postfile = storage.create_file(tempseg.posts_filename)
        self.postwriter = FilePostingWriter(postfile, blocklimit=blocklimit)

        self.vectortable = None
        if self.schema.has_vectored_fields():
            # Table associating document fields with (postoffset, postcount)
            self.vectortable = create_vectors(storage, tempseg)
            vpostfile = storage.create_file(tempseg.vectorposts_filename)
            self.vpostwriter = FilePostingWriter(vpostfile, stringids=True)

        # Keep track of the total number of tokens (across all docs)
        # in each field
        self.field_length_totals = defaultdict(int)
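
To make the comment about _scorable_to_pos concrete, here is a small standalone sketch (not part of Whoosh; the field numbers are invented) that builds the same kind of mapping for the (A, B, C, D, E, F) schema described in the comment, where only B, D and E are scorable:

# Standalone illustration (not Whoosh code). Suppose the schema fields
# A, B, C, D, E, F were assigned field numbers 0..5 and only B, D and E
# are scorable, so scorable_fields() would return (1, 3, 4).
scorable_field_numbers = (1, 3, 4)

# Same construction as in __init__ above: key each field number by its
# position within the list of scorable fields.
scorable_to_pos = dict((fnum, i)
                       for i, fnum in enumerate(scorable_field_numbers))

print(scorable_to_pos)  # {1: 0, 3: 1, 4: 2}, i.e. B -> 0, D -> 1, E -> 2
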
Example #2
class SegmentWriter(object):
    """Do not instantiate this object directly; it is created by the
    IndexWriter object.
    
    Handles the actual writing of new documents to the index: writes stored
    fields, handles the posting pool, and writes out the term index.
    """
    def __init__(self, ix, postlimit, blocklimit, name=None):
        """
        :param ix: the Index object in which to write the new segment.
        :param postlimit: the maximum size for a run in the posting pool.
        :param blocklimit: the maximum number of postings in a posting block.
        :param name: the name of the segment.
        """

        self.index = ix
        self.schema = ix.schema
        self.storage = storage = ix.storage
        self.name = name or ix._next_segment_name()

        self.max_doc = 0

        self.pool = postpool.PostingPool(postlimit)

        # Create mappings of field numbers to the position of that field in the
        # lists of scorable and stored fields. For example, consider a schema
        # with fields (A, B, C, D, E, F). If B, D, and E are scorable, then the
        # list of scorable fields is (B, D, E). The _scorable_to_pos dictionary
        # would then map B -> 0, D -> 1, and E -> 2.
        self._scorable_to_pos = dict(
            (fnum, i) for i, fnum in enumerate(self.schema.scorable_fields()))
        self._stored_to_pos = dict(
            (fnum, i) for i, fnum in enumerate(self.schema.stored_fields()))

        # Create a temporary segment object just so we can access its
        # *_filename attributes (so if we want to change the naming convention,
        # we only have to do it in one place).
        tempseg = Segment(self.name, 0, 0, None)
        self.termtable = create_terms(storage, tempseg)
        self.docslist = create_storedfields(storage, tempseg)
        self.doclengths = None
        if self.schema.scorable_fields():
            self.doclengths = create_doclengths(storage, tempseg,
                                                len(self._scorable_to_pos))

        postfile = storage.create_file(tempseg.posts_filename)
        self.postwriter = FilePostingWriter(postfile, blocklimit=blocklimit)

        self.vectortable = None
        if self.schema.has_vectored_fields():
            # Table associating document fields with (postoffset, postcount)
            self.vectortable = create_vectors(storage, tempseg)
            vpostfile = storage.create_file(tempseg.vectorposts_filename)
            self.vpostwriter = FilePostingWriter(vpostfile, stringids=True)

        # Keep track of the total number of tokens (across all docs)
        # in each field
        self.field_length_totals = defaultdict(int)

    def segment(self):
        """Returns an index.Segment object for the segment being written."""
        return Segment(self.name, self.max_doc, dict(self.field_length_totals))

    def _close_all(self):
        self.termtable.close()
        self.postwriter.close()
        self.docslist.close()

        if self.doclengths:
            self.doclengths.close()

        if self.vectortable:
            self.vectortable.close()
            self.vpostwriter.close()

    def close(self):
        """Finishes writing the segment (flushes the posting pool out to disk)
        and closes all open files.
        """

        self._flush_pool()
        self._close_all()

    def add_reader(self, reader):
        """Adds the contents of another segment to this one. This is used to
        merge existing segments into the new one before deleting them.
        
        :param reader: a reader object open on the segment whose contents
            should be merged into this one.
        """

        start_doc = self.max_doc
        has_deletions = reader.has_deletions()

        if has_deletions:
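            # The new document numbers are not simply start_doc + old docnum
            # when the source segment has deletions (deleted documents are
            # skipped), so keep an explicit old -> new docnum mapping.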
            doc_map = {}

        schema = self.schema
        name2num = schema.name_to_number
        stored_to_pos = self._stored_to_pos

        def storedkeyhelper(item):
            return stored_to_pos[name2num(item[0])]

        # Merge document info
        docnum = 0
        vectored_fieldnums = schema.vectored_fields()
        for docnum in xrange(reader.doc_count_all()):
            if not reader.is_deleted(docnum):
                # Copy the stored fields and field lengths from the reader
                # into this segment
                storeditems = reader.stored_fields(docnum).items()
                storedvalues = [
                    v for k, v in sorted(storeditems, key=storedkeyhelper)
                ]
                self._add_doc_data(storedvalues,
                                   reader.doc_field_lengths(docnum))

                if has_deletions:
                    doc_map[docnum] = self.max_doc

                # Copy term vectors
                for fieldnum in vectored_fieldnums:
                    if reader.has_vector(docnum, fieldnum):
                        self._add_vector(
                            fieldnum,
                            reader.vector(docnum, fieldnum).items())

                self.max_doc += 1

        # Add field length totals
        for fieldnum in schema.scorable_fields():
            self.field_length_totals[fieldnum] += reader.field_length(fieldnum)

        # Merge terms
        current_fieldnum = None
        decoder = None
        for fieldnum, text, _, _ in reader:
            if fieldnum != current_fieldnum:
                current_fieldnum = fieldnum
                decoder = schema[fieldnum].format.decode_frequency

            postreader = reader.postings(fieldnum, text)
            for docnum, valuestring in postreader.all_items():
                if has_deletions:
                    newdoc = doc_map[docnum]
                else:
                    newdoc = start_doc + docnum

                # TODO: Is there a faster way to do this?
                freq = decoder(valuestring)
                self.pool.add_posting(fieldnum, text, newdoc, freq,
                                      valuestring)

    def add_document(self, fields):
        scorable_to_pos = self._scorable_to_pos
        stored_to_pos = self._stored_to_pos
        schema = self.schema

        # Sort the keys by their order in the schema
        fieldnames = [
            name for name in fields.keys() if not name.startswith("_")
        ]
        fieldnames.sort(key=schema.name_to_number)

        # Check if the caller gave us a bogus field
        for name in fieldnames:
            if name not in schema:
                raise UnknownFieldError("There is no field named %r" % name)

        # Create an array of counters to record the length of each field
        fieldlengths = array(DOCLENGTH_TYPE, [0] * len(scorable_to_pos))

        # Create a list (initially a list of Nones) in which we will put stored
        # field values as we get them. Why isn't this an empty list that we
        # append to? Because if the caller doesn't supply a value for a stored
        # field, we don't want to have a list in the wrong order/of the wrong
        # length.
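        # For example, with three stored fields and a value supplied only for
        # the second one, the list below ends up as [None, value, None].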
        storedvalues = [None] * len(stored_to_pos)

        for name in fieldnames:
            value = fields.get(name)
            if value:
                fieldnum = schema.name_to_number(name)
                field = schema.field_by_number(fieldnum)

                # If the field is indexed, add the words in the value to the
                # index
                if field.indexed:
                    # Count of all terms in the value
                    count = 0
                    # Count of UNIQUE terms in the value
                    unique = 0

                    # TODO: Method for adding progressive field values, ie
                    # setting start_pos/start_char?
                    for w, freq, valuestring in field.index(value):
                        #assert w != ""
                        self.pool.add_posting(fieldnum, w, self.max_doc, freq,
                                              valuestring)
                        count += freq
                        unique += 1

                    if field.scorable:
                        # Add the term count to the total for this field
                        self.field_length_totals[fieldnum] += count
                        # Set the term count to the per-document field length
                        pos = scorable_to_pos[fieldnum]
                        fieldlengths[pos] = min(count, DOCLENGTH_LIMIT)

                # If the field is vectored, add the words in the value to the
                # vector table
                vector = field.vector
                if vector:
                    # TODO: Method for adding progressive field values, ie
                    # setting start_pos/start_char?
                    vlist = sorted((w, valuestring) for w, freq, valuestring in
                                   vector.word_values(value, mode="index"))
                    self._add_vector(fieldnum, vlist)

                # If the field is stored, put the value in storedvalues
                if field.stored:
                    # Caller can override the stored value by including a key
                    # _stored_<fieldname>
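                    # (For example, if the schema had a "title" field, passing
                    # title=u"full text" and _stored_title=u"short text" would
                    # index the full text but store the short text.)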
                    storedname = "_stored_" + name
                    if storedname in fields:
                        stored_value = fields[storedname]
                    else:
                        stored_value = value

                    storedvalues[stored_to_pos[fieldnum]] = stored_value

        self._add_doc_data(storedvalues, fieldlengths)
        self.max_doc += 1

    def _add_terms(self):
        pass

    def _add_doc_data(self, storedvalues, fieldlengths):
        self.docslist.append(storedvalues)
        if self.doclengths:
            self.doclengths.append(fieldlengths)

    def _add_vector(self, fieldnum, vlist):
        vpostwriter = self.vpostwriter
        vformat = self.schema[fieldnum].vector

        offset = vpostwriter.start(vformat)
        for text, valuestring in vlist:
            assert isinstance(text, unicode), "%r is not unicode" % text
            vpostwriter.write(text, valuestring)
        vpostwriter.finish()

        self.vectortable.add((self.max_doc, fieldnum), offset)

    def _flush_pool(self):
        # This method pulls postings out of the posting pool (built up as
        # documents are added) and writes them to the posting file. Each time
        # it encounters a posting for a new term, it writes the previous term
        # to the term index (by waiting to write the term entry, we can easily
        # count the document frequency and sum the terms by looking at the
        # postings).

        termtable = self.termtable
        postwriter = self.postwriter
        schema = self.schema

        current_fieldnum = None  # Field number of the current term
        current_text = None  # Text of the current term
        first = True
        current_freq = 0
        offset = None

        # Loop through the postings in the pool. Postings always come out of
        # the pool in (field number, lexical) order.
        for fieldnum, text, docnum, freq, valuestring in self.pool:
            # Is this the first time through, or is this a new term?
            if first or fieldnum > current_fieldnum or text > current_text:
                if first:
                    first = False
                else:
                    # This is a new term, so finish the postings and add the
                    # term to the term table
                    postcount = postwriter.finish()
                    termtable.add((current_fieldnum, current_text),
                                  (current_freq, offset, postcount))

                # Reset the post writer and the term variables
                current_fieldnum = fieldnum
                current_text = text
                current_freq = 0
                offset = postwriter.start(schema[fieldnum].format)

            elif (fieldnum < current_fieldnum
                  or (fieldnum == current_fieldnum and text < current_text)):
                # This should never happen!
                raise Exception(
                    "Postings are out of order: %s:%s .. %s:%s" %
                    (current_fieldnum, current_text, fieldnum, text))

            # Write a posting for this occurrence of the current term
            current_freq += freq
            postwriter.write(docnum, valuestring)

        # If there are still "uncommitted" postings at the end, finish them off
        if not first:
            postcount = postwriter.finish()
            termtable.add((current_fieldnum, current_text),
                          (current_freq, offset, postcount))
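
The grouping logic in _flush_pool is easier to see on a toy input. The following standalone sketch (not Whoosh code; the postings and frequencies are invented) performs the same accumulation, emitting one summary per term once all of its postings, which arrive sorted by (field number, text), have been seen:

from itertools import groupby

# Invented postings in (fieldnum, text, docnum, freq) form, already sorted
# by (fieldnum, text) as the posting pool guarantees.
postings = [
    (0, u"apple", 1, 2),
    (0, u"apple", 5, 1),
    (0, u"pear",  2, 3),
    (1, u"apple", 1, 1),
]

for (fieldnum, text), group in groupby(postings, key=lambda p: p[:2]):
    group = list(group)
    doc_count = len(group)                             # like postwriter.finish()
    total_freq = sum(freq for _, _, _, freq in group)  # like current_freq
    print((fieldnum, text, total_freq, doc_count))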