示例#1
0
 def compile(self):
     """Finish the run and write the collected articles out as volumes.

     Reports final progress, closes the failed/empty/skipped article
     files and records the article count in the metadata.  The articles
     are then sorted by collation key, split into volumes by
     ``make_volumes`` and each volume is passed to ``make_aar``; the
     returned file names are collected in ``self.file_names``.  Finally
     the article store is closed and ``write_volume_count``,
     ``write_sha1sum`` and ``rename_files`` run, in that order.
     """
     def sort_key(title):
         # Articles are ordered by the byte form of their collation key.
         return collation_key(title).getByteArray()

     def announce(message):
         # Same text goes to the log and to writeln(); flush() is called
         # on whatever writeln() returns.
         log.info(message)
         writeln(message).flush()

     print_progress(self.stats)
     writeln()

     self.failed_articles.close()
     self.empty_articles.close()
     self.skipped_articles.close()

     writeln('Compiling .aar files')
     self.add_metadata("article_count", self.stats.articles)
     sorted_articles = self.article_store.sorted(key=sort_key)
     log.info('Compiling %s', self.output_file_name)

     # The compressed metadata is only needed here for its length: header
     # size plus metadata size is handed to create_volume for each volume.
     packed_metadata = compress(tojson(self.metadata).encode('utf8'))
     header_and_meta_size = spec_len(HEADER_SPEC) + len(packed_metadata)
     volume_factory = functools.partial(self.create_volume,
                                        header_and_meta_size)

     for volume in self.make_volumes(volume_factory, sorted_articles):
         announce("Creating volume %d" % volume.number)
         self.file_names.append(self.make_aar(volume))
         announce("Wrote volume %d" % volume.number)

     self.article_store.close()
     self.write_volume_count()
     self.write_sha1sum()
     rename_files(self.file_names)
示例#2
0
 def compile(self):
     """Finish the run and write the collected articles out as volumes.

     Reports final progress, closes the failed/empty/skipped article
     files and records the article count in the metadata.  The articles
     are then sorted by collation key, split into volumes by
     ``make_volumes`` and each volume is passed to ``make_aar``; the
     returned file names are collected in ``self.file_names``.  Finally
     the article store is closed and ``write_volume_count``,
     ``write_sha1sum`` and ``rename_files`` run, in that order.
     """
     def sort_key(title):
         # Articles are ordered by the byte form of their collation key.
         return collation_key(title).getByteArray()

     def announce(message):
         # Same text goes to the log and to writeln(); flush() is called
         # on whatever writeln() returns.
         log.info(message)
         writeln(message).flush()

     print_progress(self.stats)
     writeln()

     self.failed_articles.close()
     self.empty_articles.close()
     self.skipped_articles.close()

     writeln('Compiling .aar files')
     self.add_metadata("article_count", self.stats.articles)
     sorted_articles = self.article_store.sorted(key=sort_key)
     log.info('Compiling %s', self.output_file_name)

     # The compressed metadata is only needed here for its length: header
     # size plus metadata size is handed to create_volume for each volume.
     packed_metadata = compress(tojson(self.metadata).encode('utf8'))
     header_and_meta_size = spec_len(HEADER_SPEC) + len(packed_metadata)
     volume_factory = functools.partial(self.create_volume,
                                        header_and_meta_size)

     for volume in self.make_volumes(volume_factory, sorted_articles):
         announce("Creating volume %d" % volume.number)
         self.file_names.append(self.make_aar(volume))
         announce("Wrote volume %d" % volume.number)

     self.article_store.close()
     self.write_volume_count()
     self.write_sha1sum()
     rename_files(self.file_names)
示例#3
0
    def _sort(self):
        """Write index1 to a new temp file with its items in collation order.

        Each fixed-size index1 item (``INDEX1_ITEM_FORMAT``) starts with an
        offset into index2.  At that offset index2 holds a key length
        packed as ``KEY_LENGTH_FORMAT``, followed by the key bytes.  Items
        are ordered by the collation key of that key and written, still
        packed, to an ``index1_sorted*`` temp file in ``self.work_dir``.
        The (closed) file object is kept as ``self.index1_sorted``.  The
        unsorted index1 file is removed once sorting has succeeded.
        """
        # Function-scope import so the method does not depend on the
        # module's import block; mmap objects are wrapped in closing()
        # because they are not context managers on Python 2.
        from contextlib import closing

        index1_sorted = tempfile.NamedTemporaryFile(prefix='index1_sorted',
                                                    dir=self.work_dir,
                                                    delete=False)
        self.index1_sorted = index1_sorted
        index1_unit_len = struct.calcsize(INDEX1_ITEM_FORMAT)
        klen_structsize = struct.calcsize(KEY_LENGTH_FORMAT)

        # closing(index1_sorted) guarantees the output file is closed even
        # if sorting fails.  The index files hold packed binary data, so
        # they are opened in binary mode.
        with closing(index1_sorted), \
                open(self.index1.name, 'rb') as fi1, \
                open(self.index2.name, 'rb') as fi2:

            # Release both mappings when done instead of leaving them to
            # the garbage collector.
            with closing(mmap.mmap(fi1.fileno(), 0,
                                   prot=mmap.PROT_READ)) as index1, \
                    closing(mmap.mmap(fi2.fileno(), 0,
                                      prot=mmap.PROT_READ)) as index2:

                # Floor division keeps the count an integer regardless of
                # whether true division is in effect.
                index_item_count = len(index1) // index1_unit_len

                def read_packed_index1_item(i):
                    # Raw bytes of the i-th fixed-size index1 item.
                    pos_start = i * index1_unit_len
                    return index1[pos_start:pos_start + index1_unit_len]

                def read_key(pos):
                    # Length-prefixed key stored in index2 at offset pos.
                    start = pos + klen_structsize
                    strlen = struct.unpack(KEY_LENGTH_FORMAT,
                                           index2[pos:start])[0]
                    return index2[start:start + strlen]

                def realkey(i):
                    # Sort key for item i: collation key bytes of the
                    # title that the item's first field points to.
                    index2_ptr = struct.unpack(
                        INDEX1_ITEM_FORMAT, read_packed_index1_item(i))[0]
                    return collation_key(read_key(index2_ptr)).getByteArray()

                # Sort item numbers rather than the items themselves, then
                # copy the packed items out in that order.
                for i in sorted(xrange(index_item_count), key=realkey):
                    index1_sorted.write(read_packed_index1_item(i))

        log.info("Index sorted, removing temp file %s", self.index1.name)
        os.remove(self.index1.name)
示例#4
0
    def _sort(self):
        """Write index1 to a new temp file with its items in collation order.

        Each fixed-size index1 item (``INDEX1_ITEM_FORMAT``) starts with an
        offset into index2.  At that offset index2 holds a key length
        packed as ``KEY_LENGTH_FORMAT``, followed by the key bytes.  Items
        are ordered by the collation key of that key and written, still
        packed, to an ``index1_sorted*`` temp file in ``self.work_dir``.
        The (closed) file object is kept as ``self.index1_sorted``.  The
        unsorted index1 file is removed once sorting has succeeded.
        """
        # Function-scope import so the method does not depend on the
        # module's import block; mmap objects are wrapped in closing()
        # because they are not context managers on Python 2.
        from contextlib import closing

        index1_sorted = tempfile.NamedTemporaryFile(prefix='index1_sorted',
                                                    dir=self.work_dir,
                                                    delete=False)
        self.index1_sorted = index1_sorted
        index1_unit_len = struct.calcsize(INDEX1_ITEM_FORMAT)
        klen_structsize = struct.calcsize(KEY_LENGTH_FORMAT)

        # closing(index1_sorted) guarantees the output file is closed even
        # if sorting fails.  The index files hold packed binary data, so
        # they are opened in binary mode.
        with closing(index1_sorted), \
                open(self.index1.name, 'rb') as fi1, \
                open(self.index2.name, 'rb') as fi2:

            # Release both mappings when done instead of leaving them to
            # the garbage collector.
            with closing(mmap.mmap(fi1.fileno(), 0,
                                   prot=mmap.PROT_READ)) as index1, \
                    closing(mmap.mmap(fi2.fileno(), 0,
                                      prot=mmap.PROT_READ)) as index2:

                # Floor division keeps the count an integer regardless of
                # whether true division is in effect.
                index_item_count = len(index1) // index1_unit_len

                def read_packed_index1_item(i):
                    # Raw bytes of the i-th fixed-size index1 item.
                    pos_start = i * index1_unit_len
                    return index1[pos_start:pos_start + index1_unit_len]

                def read_key(pos):
                    # Length-prefixed key stored in index2 at offset pos.
                    start = pos + klen_structsize
                    strlen = struct.unpack(KEY_LENGTH_FORMAT,
                                           index2[pos:start])[0]
                    return index2[start:start + strlen]

                def realkey(i):
                    # Sort key for item i: collation key bytes of the
                    # title that the item's first field points to.
                    index2_ptr = struct.unpack(
                        INDEX1_ITEM_FORMAT, read_packed_index1_item(i))[0]
                    return collation_key(read_key(index2_ptr)).getByteArray()

                # Sort item numbers rather than the items themselves, then
                # copy the packed items out in that order.
                for i in sorted(xrange(index_item_count), key=realkey):
                    index1_sorted.write(read_packed_index1_item(i))

        log.info("Index sorted, removing temp file %s", self.index1.name)
        os.remove(self.index1.name)