def writeQuickCheck(self, header, records, name):
    header.writeLine("const size_t UnicodeQuickCheck" + name + "RecordCount = " + str(len(records)) + ";")
    header.writeLine("const QuickCheckRecord UnicodeQuickCheck" + name + "Record[" + str(len(records)) + "] = {")
    header.indent()

    count = 0
    for r in records:
        if (count % 4) == 0:
            header.writeIndentation()

        # Pack the quick check value into the top 8 bits and the run length
        # into the low 24 bits of a single 32-bit field.
        value = (r.value << 24) | r.count
        header.write("{ " + hex(r.start) + ", " + hex(r.end) + ", " + hex(value) + " },")

        count += 1
        if count != len(records):
            if (count % 4) == 0:
                header.newLine()
            else:
                header.write(" ")

    header.newLine()
    header.outdent()
    header.writeLine("};")
    header.writeLine("const QuickCheckRecord* UnicodeQuickCheck" + name + "RecordPtr = UnicodeQuickCheck" + name + "Record;")
    header.newLine()
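# The packed field above is reversible. A minimal sketch of the inverse
# (the helper name is hypothetical, not part of this module; only the
# shift and mask mirror the "(r.value << 24) | r.count" packing above):
def unpack_quick_check(packed):
    value = packed >> 24           # quick check value, top 8 bits
    count = packed & 0xFFFFFF      # run length, low 24 bits
    return value, count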
def writeDecompositionRecords(self, header, records, name, field):
    header.writeLine("const size_t Unicode" + name + "RecordCount = " + str(len(records)) + ";")
    header.writeLine("const DecompositionRecord Unicode" + name + "Record[" + str(len(records)) + "] = {")
    header.indent()

    count = 0
    for r in records:
        if (count % 4) == 0:
            header.writeIndentation()

        # "field" selects which offset member (e.g. offsetNFD) to emit.
        header.write("{ " + hex(r.codepoint) + ", " + hex(getattr(r, field)) + " },")

        count += 1
        if count != len(records):
            if (count % 4) == 0:
                header.newLine()
            else:
                header.write(" ")

    header.newLine()
    header.outdent()
    header.writeLine("};")
    header.writeLine("const DecompositionRecord* Unicode" + name + "RecordPtr = Unicode" + name + "Record;")
    header.newLine()
def Render(self, header):
    print('Writing tests for "' + self.name + '"...')
    if self.differs:
        print('\tUTF-16 differs from UTF-32, writing both.')

    header.newLine()
    header.newLine()
    header.writeLine('TEST_F(NaughtyStrings, ' + self.name + ')')
    header.writeLine('{')

    if self.differs:
        # Expected output depends on the width of wchar_t, so emit both
        # variants behind the UTF8_WCHAR_UTF16/UTF8_WCHAR_UTF32 checks.
        header.writeLine('#if UTF8_WCHAR_UTF16')
        header.indent()
        for t in self.tests:
            t.Render(header, 'utf16')
        header.outdent()
        header.writeLine('#elif UTF8_WCHAR_UTF32')
        header.indent()
        for t in self.tests:
            t.Render(header, 'utf32')
        header.outdent()
        header.writeLine('#endif')
    else:
        header.indent()
        for t in self.tests:
            t.Render(header)
        header.outdent()

    header.write("}")
def Render(self, filepath):
    print('Rendering tests to "' + os.path.realpath(filepath) + '"...')

    command_line = sys.argv[0]
    arguments = sys.argv[1:]
    for a in arguments:
        command_line += ' ' + a

    header = libs.header.Header(filepath)
    header.generatedNotice()
    header.newLine()
    header.writeLine('#include "tests-base.hpp"')
    header.newLine()
    header.writeLine('#include "../helpers/helpers-strings.hpp"')
    header.newLine()
    header.writeLine('#define NAUGHTY_STRINGS_LENGTH 10370')
    header.newLine()
    header.writeLine('class NaughtyStrings')
    header.writeLine('\t: public ::testing::Test')
    header.writeLine('{')
    header.newLine()
    header.writeLine('protected:')
    header.newLine()
    header.writeLine('\tvoid SetUp()')
    header.writeLine('\t{')
    header.writeLine('\t\tfile.open("testdata/big-list-of-naughty-strings-master/blns.txt", std::ios_base::in);')
    header.writeLine('\t\tASSERT_TRUE(file.is_open());')
    header.writeLine('\t}')
    header.newLine()
    header.writeLine('\tvoid TearDown()')
    header.writeLine('\t{')
    header.writeLine('\t\tfile.close();')
    header.writeLine('\t}')
    header.newLine()
    header.writeLine('\tstd::string ReadSection(size_t position, size_t length)')
    header.writeLine('\t{')
    header.writeLine('\t\tstd::string result;')
    header.newLine()
    header.writeLine('\t\tfile.seekg(position, std::ios::beg);')
    header.writeLine('\t\tif (file.eof())')
    header.writeLine('\t\t{')
    header.writeLine('\t\t\treturn result;')
    header.writeLine('\t\t}')
    header.newLine()
    header.writeLine('\t\tresult.resize(length + 1);')
    header.writeLine('\t\tfile.read(&result[0], length);')
    header.newLine()
    header.writeLine('\t\treturn result;')
    header.writeLine('\t}')
    header.newLine()
    header.writeLine('\tstd::fstream file;')
    header.newLine()
    header.write('};')

    for s in self.sections:
        s.Render(header)
def render(self, header, name, dataType='uint8_t'):
    print('Rendering compressed data for "' + name + '"...')

    header.newLine()

    header.writeLine("const size_t " + name + "Index[" + str(len(self.table_index)) + "] = {")
    header.indent()

    count = 0
    for c in self.table_index:
        if (count % self.chunk_size) == 0:
            header.writeIndentation()

        header.write('%d,' % c)

        count += 1
        if count != len(self.table_index):
            if (count % self.chunk_size) == 0:
                header.newLine()
            else:
                header.write(' ')

    header.newLine()
    header.outdent()
    header.writeLine("};")
    header.writeLine("const size_t* " + name + "IndexPtr = " + name + "Index;")
    header.newLine()

    header.writeLine("const " + dataType + " " + name + "Data[" + str(len(self.table_data)) + "] = {")
    header.indent()

    count = 0
    for c in self.table_data:
        if (count % self.chunk_size) == 0:
            header.writeIndentation()

        header.write('0x%02X,' % c)

        count += 1
        if count != len(self.table_data):
            if (count % self.chunk_size) == 0:
                header.newLine()
            else:
                header.write(' ')

    header.newLine()
    header.outdent()
    header.writeLine("};")
    header.write("const " + dataType + "* " + name + "DataPtr = " + name + "Data;")
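# A minimal sketch of how the Index/Data pair rendered above could be
# consumed, assuming the compressor stores one Index entry per chunk of
# self.chunk_size values. lookup() and its parameters are hypothetical
# and not part of this module:
def lookup(table_index, table_data, chunk_size, code_point):
    # Index maps a chunk number to the start of that chunk's run in Data;
    # runs may be shared between chunks after deduplication.
    offset = table_index[code_point // chunk_size]
    return table_data[offset + (code_point % chunk_size)]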
def writeCompositionRecords(self, header):
    composed = []
    composed_keys = set()

    for r in self.recordsOrdered:
        if r.compositionPairs:
            for p in r.compositionPairs.items():
                # Pack the two input code points into one 64-bit key: this
                # record's code point in the high 32 bits, its pair partner
                # in the low 32 bits.
                key = (r.codepoint << 32) + p[0]
                if key in composed_keys:
                    print('collision ' + hex(key))
                else:
                    composed.append({ "key": key, "value": p[1] })
                    composed_keys.add(key)

    composed_ordered = sorted(composed, key=lambda item: item["key"])

    header.writeLine("const size_t UnicodeCompositionRecordCount = " + str(len(composed_ordered)) + ";")
    header.writeLine("const CompositionRecord UnicodeCompositionRecord[" + str(len(composed_ordered)) + "] = {")
    header.indent()

    count = 0
    for c in composed_ordered:
        if (count % 4) == 0:
            header.writeIndentation()

        header.write("{ " + hex(c["key"]) + ", " + hex(c["value"]) + " },")

        count += 1
        if count != len(composed_ordered):
            if (count % 4) == 0:
                header.newLine()
            else:
                header.write(" ")

    header.newLine()
    header.outdent()
    header.writeLine("};")
    header.writeLine("const CompositionRecord* UnicodeCompositionRecordPtr = UnicodeCompositionRecord;")
    header.newLine()
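# The 64-bit key built above is reversible. A hypothetical helper (not part
# of this module) for splitting it back into its two input code points:
def unpack_composition_key(key):
    first = key >> 32             # code point stored in the high 32 bits
    second = key & 0xFFFFFFFF     # pair partner in the low 32 bits
    return first, second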
def writeSource(self, filepath):
    print('Compressing code point properties...')

    compress_gc = Compression(db)
    compress_gc.process('generalCategoryCombined', 32)
    compress_ccc = Compression(db)
    compress_ccc.process('canonicalCombiningClass', 32)
    compress_qc_cm = Compression(db)
    compress_qc_cm.process('quickCaseMapped', 32)
    compress_qc_nfc = Compression(db)
    compress_qc_nfc.process('quickNFC', 32)
    compress_qc_nfd = Compression(db)
    compress_qc_nfd.process('quickNFD', 32)
    compress_qc_nfkc = Compression(db)
    compress_qc_nfkc.process('quickNFKC', 32)
    compress_qc_nfkd = Compression(db)
    compress_qc_nfkd.process('quickNFKD', 32)

    self.compressed = ""
    self.compressed_length = 0

    compress_nfd = CompressionString(db)
    compress_nfd.process('decomposedNFD', 32, 128)
    compress_nfkd = CompressionString(db)
    compress_nfkd.process('decomposedNFKD', 32, 128)
    compress_uppercase = CompressionString(db)
    compress_uppercase.process('uppercase', 32, 128)
    compress_lowercase = CompressionString(db)
    compress_lowercase.process('lowercase', 32, 128)
    compress_titlecase = CompressionString(db)
    compress_titlecase.process('titlecase', 32, 128)
    compress_casefolding = CompressionString(db)
    compress_casefolding.process('caseFolding', 32, 128)

    print('Writing database to "' + os.path.realpath(filepath) + '"...')

    # comment header
    header = libs.header.Header(os.path.realpath(filepath))
    header.writeLine("/*")
    header.indent()
    header.copyrightNotice()
    header.outdent()
    header.writeLine("*/")
    header.newLine()
    header.generatedNotice()
    header.newLine()

    # includes
    header.writeLine("#include \"unicodedatabase.h\"")

    # quick check
    compress_gc.render(header, 'GeneralCategory', 'uint32_t')
    header.newLine()
    compress_ccc.render(header, 'CanonicalCombiningClass')
    header.newLine()
    compress_qc_cm.render(header, 'QuickCheckCaseMapped')
    header.newLine()
    compress_qc_nfc.render(header, 'QuickCheckNFC')
    header.newLine()
    compress_qc_nfd.render(header, 'QuickCheckNFD')
    header.newLine()
    compress_qc_nfkc.render(header, 'QuickCheckNFKC')
    header.newLine()
    compress_qc_nfkd.render(header, 'QuickCheckNFKD')
    header.newLine()

    # decomposition
    compress_nfd.render(header, 'NFD')
    header.newLine()
    compress_nfkd.render(header, 'NFKD')
    header.newLine()

    # case mapping
    compress_uppercase.render(header, 'Uppercase')
    header.newLine()
    compress_lowercase.render(header, 'Lowercase')
    header.newLine()
    compress_titlecase.render(header, 'Titlecase')
    header.newLine()
    compress_casefolding.render(header, 'CaseFolding')
    header.newLine()

    # composition
    header.newLine()
    self.writeCompositionRecords(header)

    # decomposition data
    sliced_compressed = libs.blobsplitter.BlobSplitter()
    sliced_compressed.split(self.compressed, self.compressed_length)

    header.writeLine("const char* CompressedStringData = ")
    header.indent()

    for p in sliced_compressed.pages:
        p.start()
        p.firstLine = False
        while not p.atEnd:
            p.nextLine()
            header.writeIndentation()
            header.write(p.line)
            header.newLine()

    header.outdent()
    header.writeLine(";")
    header.write("const size_t CompressedStringDataLength = " + str(self.compressed_length) + ";")
def writeSource(self, filepath): print "Writing database to " + filepath + "..." command_line = sys.argv[0] arguments = sys.argv[1:] for a in arguments: command_line += " " + a d = datetime.datetime.now() nfd_records = [] nfkd_records = [] uppercase_records = [] lowercase_records = [] titlecase_records = [] for r in self.recordsOrdered: if r.offsetNFD <> 0: nfd_records.append(r) if r.offsetNFKD <> 0: nfkd_records.append(r) if r.offsetUppercase <> 0: uppercase_records.append(r) if r.offsetLowercase <> 0: lowercase_records.append(r) if r.offsetTitlecase <> 0: titlecase_records.append(r) sliced = libs.blobsplitter.BlobSplitter() sliced.split(self.blob, self.offset) # comment header header = libs.header.Header(filepath) header.writeLine("/*") header.indent() header.copyrightNotice() header.outdent() header.writeLine("*/") header.newLine() header.writeLine("/*") header.indent() header.writeLine("DO NOT MODIFY, AUTO-GENERATED") header.newLine() header.writeLine("Generated on:") header.indent() header.writeLine(d.strftime("%Y-%m-%dT%H:%M:%S")) header.outdent() header.newLine() header.writeLine("Command line:") header.indent() header.writeLine(command_line) header.outdent() header.outdent() header.writeLine("*/") header.newLine() # includes header.writeLine("#include \"unicodedatabase.h\"") header.newLine() # quick check records self.writeQuickCheck(header, self.qcGeneralCategory, "GeneralCategory") self.writeQuickCheck(header, self.qcCanonicalCombiningClass, "CanonicalCombiningClass") self.writeQuickCheck(header, self.qcNFCRecords, "NFC") self.writeQuickCheck(header, self.qcNFDRecords, "NFD") self.writeQuickCheck(header, self.qcNFKCRecords, "NFKC") self.writeQuickCheck(header, self.qcNFKDRecords, "NFKD") # decomposition records self.writeDecompositionRecords(header, nfd_records, "NFD", "offsetNFD") self.writeDecompositionRecords(header, nfkd_records, "NFKD", "offsetNFKD") # composition records self.writeCompositionRecords(header) # case mapping records self.writeDecompositionRecords(header, uppercase_records, "Uppercase", "offsetUppercase") self.writeDecompositionRecords(header, lowercase_records, "Lowercase", "offsetLowercase") self.writeDecompositionRecords(header, titlecase_records, "Titlecase", "offsetTitlecase") # decomposition data header.writeLine("const char* DecompositionData = ") header.indent() for p in sliced.pages: p.start() while not p.atEnd: p.nextLine() header.writeIndentation() header.write(p.line) header.newLine() header.outdent() header.writeLine(";") header.write("const size_t DecompositionDataLength = " + str(self.offset) + ";")
def writeSource(self, filepath):
    command_line = sys.argv[0]
    arguments = sys.argv[1:]
    for a in arguments:
        command_line += " " + a

    d = datetime.datetime.now()

    # Split the decomposition blob into pages of at most self.pageSize
    # entries, breaking only after NUL terminators so no string is cut
    # across a page boundary.
    page_starts = []
    page_ends = []
    page_starts.append(0)

    blob_size = self.offset
    blob_page = self.blob
    total_offset = 0

    while True:
        if blob_size < self.pageSize:
            page_ends.append(total_offset + blob_size)
            break

        page_read = 0
        blob_search = blob_page

        while True:
            end_index = blob_search.find("\\x00")
            if end_index == -1:
                break

            # Every entry is stored as a four-character escape ("\xNN"),
            # so divide the character index by 4 to count entries.
            offset = (end_index // 4) + 1
            if (page_read + offset) >= self.pageSize:
                break

            page_read += offset
            blob_search = blob_search[(end_index + 4):]

        total_offset += page_read
        page_ends.append(total_offset)
        page_starts.append(total_offset)

        blob_page = blob_page[(page_read * 4):]
        blob_size -= page_read

    pages = len(page_starts)

    print("pages", pages, "blobSize", blob_size)
    print("pageStarts", page_starts)
    print("pageEnds  ", page_ends)

    # comment header
    header = libs.header.Header(filepath)
    header.writeLine("/*")
    header.indent()
    header.writeLine("DO NOT MODIFY, AUTO-GENERATED")
    header.newLine()
    header.writeLine("Generated on:")
    header.indent()
    header.writeLine(d.strftime("%Y-%m-%dT%H:%M:%S"))
    header.outdent()
    header.newLine()
    header.writeLine("Command line:")
    header.indent()
    header.writeLine(command_line)
    header.outdent()
    header.outdent()
    header.writeLine("*/")
    header.newLine()

    # includes
    header.writeLine("#include \"normalization.h\"")
    header.newLine()

    # composition data
    header.writeLine("const size_t CompositionDataCount = " + str(len(self.entries)) + ";")
    header.writeLine("const CompositionEntry CompositionData[" + str(len(self.entries)) + "] = {")
    header.indent()

    for e in self.entries:
        header.writeLine(e.toHeaderString(page_starts))

    header.outdent()
    header.writeLine("};")
    header.writeLine("const CompositionEntry* CompositionDataPtr = CompositionData;")
    header.newLine()

    # decomposition data
    blob_page = self.blob

    header.writeLine("const size_t DecompositionDataPageCount = " + str(pages) + ";")
    header.writeLine("const char* DecompositionData[" + str(pages) + "] = {")
    header.indent()

    for p in range(0, pages):
        blob_page = self.blob[page_starts[p] * 4:page_ends[p] * 4]

        read = page_ends[p] - page_starts[p]
        written = 0
        first_line = True

        blob_sliced = blob_page
        while True:
            # The first line also emits a leading "\x00", so it holds one
            # fewer entry than the following lines (24 instead of 25).
            if first_line:
                character_count = min(read, 24)
            else:
                character_count = min(read, 25)

            character_line = blob_sliced[:(character_count * 4)]

            header.writeIndentation()
            header.write("\"")
            if first_line:
                header.write("\\x00")
                first_line = False
            header.write(character_line)
            header.write("\"")

            written += character_count
            read -= character_count
            if read <= 0:
                header.write(",")
            header.newLine()

            if read <= 0:
                break

            blob_sliced = blob_sliced[(character_count * 4):]

    header.outdent()
    header.writeLine("};")
    header.writeLine("const char** DecompositionDataPtr = DecompositionData;")

    header.write("const size_t DecompositionDataLength[" + str(pages) + "] = { ")
    for p in range(0, pages):
        size = page_ends[p] - page_starts[p]
        header.write(str(size))
        if p != (pages - 1):
            header.write(',')
        header.write(' ')
    header.writeLine("};")
    header.write("const size_t* DecompositionDataLengthPtr = DecompositionDataLength;")

    header.close()

    print("entries " + str(self.total) + " hashed " + str(len(self.hashed)))
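# Worked example of the "\xNN" arithmetic used above, with hypothetical
# data: every entry in the blob is a four-character escape sequence, so a
# character index converts to an entry count by dividing by 4.
blob = "\\x41\\x42\\x00\\x43"                # four escaped entries, NUL third
end_index = blob.find("\\x00")               # -> 8, start of the third escape
entries_up_to_nul = (end_index // 4) + 1     # -> 3, counting the NUL itself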
def render(self, header, name):
    print('Rendering compressed data for "' + name + '"...')

    header.newLine()

    # Note: table_index2 is emitted as "Index1" and table_index1_compressed
    # as "Index2".
    header.writeLine("const uint32_t " + name + "Index1[" + str(len(self.table_index2)) + "] = {")
    header.indent()

    count = 0
    for c in self.table_index2:
        if (count % 16) == 0:
            header.writeIndentation()

        header.write('%d,' % c)

        count += 1
        if count != len(self.table_index2):
            if (count % 16) == 0:
                header.newLine()
            else:
                header.write(' ')

    header.newLine()
    header.outdent()
    header.writeLine("};")
    header.writeLine("const uint32_t* " + name + "Index1Ptr = " + name + "Index1;")
    header.newLine()

    header.writeLine("const uint32_t " + name + "Index2[" + str(len(self.table_index1_compressed)) + "] = {")
    header.indent()

    count = 0
    for c in self.table_index1_compressed:
        if (count % 16) == 0:
            header.writeIndentation()

        header.write('0x%X,' % c)

        count += 1
        if count != len(self.table_index1_compressed):
            if (count % 16) == 0:
                header.newLine()
            else:
                header.write(' ')

    header.newLine()
    header.outdent()
    header.writeLine("};")
    header.writeLine("const uint32_t* " + name + "Index2Ptr = " + name + "Index2;")
    header.newLine()

    header.writeLine("const uint32_t " + name + "Data[" + str(len(self.table_data_compressed)) + "] = {")
    header.indent()

    count = 0
    for c in self.table_data_compressed:
        if (count % 16) == 0:
            header.writeIndentation()

        # Coerce to int in case the compressed data holds numeric strings.
        header.write('0x%X,' % int(c))

        count += 1
        if count != len(self.table_data_compressed):
            if (count % 16) == 0:
                header.newLine()
            else:
                header.write(' ')

    header.newLine()
    header.outdent()
    header.writeLine("};")
    header.write("const uint32_t* " + name + "DataPtr = " + name + "Data;")
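# Hypothetical sketch of a two-stage lookup against the Index1/Index2/Data
# tables rendered above. The 16 used in the loops is only line wrapping,
# not the real chunking, so CHUNK1 and CHUNK2 below are assumptions; this
# shows the shape of the access pattern rather than the exact constants.
CHUNK1 = 32  # assumed: Index2 entries covered by one Index1 slot
CHUNK2 = 32  # assumed: Data values covered by one Index2 slot

def lookup_two_stage(index1, index2, data, code_point):
    slot = index1[code_point // (CHUNK1 * CHUNK2)]            # first stage
    offset = index2[slot + (code_point // CHUNK2) % CHUNK1]   # second stage
    return data[offset + (code_point % CHUNK2)]               # payload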