Example #1
    def commit(self, **kwargs):
        writer = self.writer
        pool = writer.pool

        # Index the remaining documents in the doc buffer
        self._enqueue()
        # Tell the tasks to finish
        for task in self.tasks:
            self.jobqueue.put(None)
        # Wait for the tasks to finish
        for task in self.tasks:
            task.join()
        # Get the results
        results = []
        for task in self.tasks:
            # runname, doccount, lenname
            results.append(self.resultqueue.get(timeout=5))

        if results:
            for runname, doccount, lenname in results:
                f = writer.storage.open_file(lenname)
                lengths = Lengths.from_file(f, doccount)
                writer.lengths.add_other(lengths)
                writer.storage.delete_file(lenname)

            base = results[0][1]
            runreaders = [pool._read_run(results[0][0])]
            for runname, doccount, lenname in results[1:]:
                rr = self._read_and_renumber_run(runname, base)
                runreaders.append(rr)
                base += doccount
            writer.termswriter.add_iter(imerge(runreaders), writer.lengths)
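
For context: the last line above merges the sorted posting runs produced by the worker tasks into one sorted stream for the terms writer. imerge itself is not shown in this excerpt; the following is only a minimal sketch of the same k-way merge idea using the standard library's heapq.merge, with made-up posting tuples (the real posting format may differ).

import heapq

def merge_runs(runreaders):
    # Each run reader yields postings in sorted order; heapq.merge lazily
    # interleaves them into a single sorted stream, which is what the
    # terms writer consumes.
    return heapq.merge(*runreaders)

# Made-up postings from two runs: (fieldname, text, docnum, weight, value)
run_a = [("title", b"apple", 0, 1.0, b""), ("title", b"pear", 2, 1.0, b"")]
run_b = [("title", b"apple", 5, 2.0, b""), ("title", b"zebra", 7, 1.0, b"")]
for posting in merge_runs([iter(run_a), iter(run_b)]):
    print(posting)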
Example #2
    def _merge_subsegments(self, results, sources, mergetype, optimize, merge):
        schema = self.schema
        storage = self.storage
        codec = self.codec
        fieldnames = list(schema.names())

        # Merge per-document information
        pdw = self.perdocwriter
        # Names of fields that store term vectors
        vnames = set(schema.vector_names())
        basedoc = self.docnum
        # A list to remember field length readers for each sub-segment (we'll
        # re-use them below)
        lenreaders = [pdw.lengths_reader()]

        for _, segment in results:
            # Create a field length reader for the sub-segment
            lenreader = codec.lengths_reader(storage, segment)
            # Remember it in the list for later
            lenreaders.append(lenreader)
            # Vector reader for the sub-segment
            vreader = codec.vector_reader(storage, segment)
            # Stored field reader for the sub-segment
            sfreader = codec.stored_fields_reader(storage, segment)
            # Iterating on the stored field reader yields a dictionary of
            # stored fields for *every* document in the segment (even if the
            # document has no stored fields it should yield {})
            for i, fs in enumerate(sfreader):
                # Add the base doc count to the sub-segment doc num
                pdw.start_doc(basedoc + i)
                # Call add_field to store the field values and lengths
                for fieldname in fieldnames:
                    value = fs.get(fieldname)
                    length = lenreader.doc_field_length(i, fieldname)
                    pdw.add_field(fieldname, schema[fieldname], value, length)
                # Copy over the vectors. TODO: would be much faster to bulk-
                # copy the postings
                for fieldname in vnames:
                    if (i, fieldname) in vreader:
                        field = schema[fieldname]
                        vmatcher = vreader.matcher(i, fieldname, field.vector)
                        pdw.add_vector_matcher(fieldname, field, vmatcher)
                pdw.finish_doc()
            basedoc += segment.doccount

        # Create a list of iterators from the run filenames
        basedoc = self.docnum
        for runname, segment in results:
            items = self._read_and_renumber_run(runname, basedoc)
            sources.append(items)
            basedoc += segment.doccount

        # Create a MultiLengths object combining the length files from the
        # subtask segments
        mlens = base.MultiLengths(lenreaders)
        # Merge the iterators into the field writer
        self.fieldwriter.add_postings(schema, mlens, imerge(sources))
        self.docnum = basedoc
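
Both commit() and _merge_subsegments() lean on a _read_and_renumber_run helper to shift each run's document numbers by a running base offset so they stay unique in the merged segment. Its implementation is not shown here; the sketch below illustrates only the renumbering idea, under the assumption that postings are (fieldname, text, docnum, weight, value) tuples, which is a guess at the format.

def renumber_postings(postings, basedoc):
    # Shift the document number in each posting by the sub-segment's base
    # offset so document numbers remain unique across the merged segment.
    for fieldname, text, docnum, weight, value in postings:
        yield fieldname, text, docnum + basedoc, weight, value

# A run whose local docs 0 and 1 become docs 100 and 101 after renumbering
run = [("body", b"cat", 0, 1.0, b""), ("body", b"dog", 1, 1.0, b"")]
print(list(renumber_postings(iter(run), 100)))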
Example #3
    def _merge_subsegments(self, results, mergetype, optimize, merge):
        schema = self.schema
        storage = self.storage
        codec = self.codec
        fieldnames = list(schema.names())

        # Merge per-document information
        pdw = self.perdocwriter
        # Names of fields that store term vectors
        vnames = set(schema.vector_names())
        basedoc = self.docnum
        # A list to remember field length readers for each sub-segment (we'll
        # re-use them below)
        lenreaders = [pdw.lengths_reader()]

        for _, segment in results:
            # Create a field length reader for the sub-segment
            lenreader = codec.lengths_reader(storage, segment)
            # Remember it in the list for later
            lenreaders.append(lenreader)
            # Vector reader for the sub-segment
            vreader = None
            if schema.has_vectored_fields():
                vreader = codec.vector_reader(storage, segment)
            # Stored field reader for the sub-segment
            sfreader = codec.stored_fields_reader(storage, segment)
            # Iterating on the stored field reader yields a dictionary of
            # stored fields for *every* document in the segment (even if the
            # document has no stored fields it should yield {})
            for i, fs in enumerate(sfreader):
                # Add the base doc count to the sub-segment doc num
                pdw.start_doc(basedoc + i)
                # Call add_field to store the field values and lengths
                for fieldname in fieldnames:
                    value = fs.get(fieldname)
                    length = lenreader.doc_field_length(i, fieldname)
                    pdw.add_field(fieldname, schema[fieldname], value, length)
                # Copy over the vectors. TODO: would be much faster to bulk-
                # copy the postings
                if vreader:
                    for fieldname in vnames:
                        if (i, fieldname) in vreader:
                            field = schema[fieldname]
                            vformat = field.vector
                            vmatcher = vreader.matcher(i, fieldname, vformat)
                            pdw.add_vector_matcher(fieldname, field, vmatcher)
                pdw.finish_doc()
            basedoc += segment.doccount

        # If information was added to this writer in the conventional way
        # (e.g. through add_reader or merging segments), add it as an extra
        # source
        if self._added:
            sources = [self.pool.iter_postings()]
        else:
            sources = []
        # Add iterators from the run filenames
        basedoc = self.docnum
        for runname, segment in results:
            items = self._read_and_renumber_run(runname, basedoc)
            sources.append(items)
            basedoc += segment.doccount

        # Create a MultiLengths object combining the length files from the
        # subtask segments
        mlens = base.MultiLengths(lenreaders)
        # Merge the iterators into the field writer
        self.fieldwriter.add_postings(schema, mlens, imerge(sources))
        self.docnum = basedoc
        self._added = True
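
The MultiLengths object built from lenreaders presumably lets the field writer look up per-document field lengths across all sub-segments through one interface. Its actual API is not shown in this excerpt; the class below is a rough, hypothetical sketch of that kind of combiner, assuming each sub-reader exposes doc_field_length(local_docnum, fieldname, default).

class CombinedLengths:
    # Illustrative only: routes a merged-segment document number to the
    # sub-segment reader that holds its field lengths.
    def __init__(self, readers, doccounts):
        self.readers = readers
        self.offsets = []  # first merged docnum of each sub-segment
        base = 0
        for count in doccounts:
            self.offsets.append(base)
            base += count

    def doc_field_length(self, docnum, fieldname, default=0):
        # Walk the sub-segments from last to first to find the one that
        # contains docnum, then translate back to its local numbering.
        for offset, reader in zip(reversed(self.offsets), reversed(self.readers)):
            if docnum >= offset:
                return reader.doc_field_length(docnum - offset, fieldname, default)
        return default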