def partition(self, text):
    """private method - simulate zero width spaces for Japanese

    Splits text into a list of chunks: every CJK character starts a new
    chunk, punctuation immediately following a CJK character stays
    attached to it, and non-CJK runs are kept together.
    """
    l = []       # accumulated chunks
    r = ''       # chunk currently being built
    last_n = ''  # first word of the previous character's unicode name
    for c in text:
        try:
            # classify by the first word of the unicode name, e.g. 'CJK'
            n = unicodedata.name(c).split()[0]
        except ValueError:
            # some characters (e.g. control codes) have no unicode name
            n = 'NoName'
            PrintLog.message(u'No unicode name for: "{0:s}"'.format(c))
        if n in self.CJK:
            # CJK character always begins a new chunk
            if '' != r:
                l.append(r)
            r = c
            last_n = n
        elif last_n in self.CJK:
            if n in self.PUNCTUATION:
                # keep trailing punctuation with the preceding CJK char
                l.append(r + c)
                r = ''
                last_n = ''
            else:
                l.append(r)
                r = c
                last_n = n
        else:
            # extend the current non-CJK run
            r += c
            last_n = n
    if '' != r:
        l.append(r)
    return l
def handle_entityref(self, name):
    """handle &amp; &gt; ... - named HTML entities"""
    try:
        codepoint = htmlentitydefs.name2codepoint[name]
    except KeyError:
        PrintLog.message(u'ENTITYREF ERROR: {0:s} article: {1:s}'.format(
            name, g_this_article_title))
    else:
        # emit the entity as its single unicode character
        self.handle_data(unichr(codepoint))
def resolve_redirects(self):
    """add redirect to article_index

    Each redirect reuses the target's index entry (first three fields)
    with a True redirect flag appended.  Unresolved and cyclic
    redirects are logged and skipped.  Returns the resolved count.
    """
    global verbose
    count = 0
    if verbose:
        PrintLog.message(u'Resolving redirects')
    for item in self.redirects:
        try:
            self.set_index(item, self.find(item)[:3] + (True,))
            count += 1
            # periodic progress report
            if verbose and count % 1000 == 0:
                PrintLog.message(u'Redirects resolved: {0:d}'.format(count))
        except KeyError:
            PrintLog.message(u'Unresolved redirect: {0:s} -> {1:s}'.format(item, self.redirects[item]))
        except CycleError:
            PrintLog.message(u'Cyclic redirect: {0:s} -> {1:s}'.format(item, self.redirects[item]))
    if verbose:
        PrintLog.message(u'Total redirects resolved: {0:d}'.format(count))
    return count
def title(self, category, key, title, seek):
    """Accept only article entries; log and reject everything else."""
    if key == self.KEY_ARTICLE:
        return True
    if verbose:
        PrintLog.message('Non-article: {0:s}:{1:s}'.format(category, title))
    return False
def resolve_redirects(self):
    """add redirect to article_index

    Points each redirect at its target's index entry (first three
    fields plus a True redirect flag); unresolved and cyclic redirects
    are logged and skipped.  Returns the number resolved.
    """
    global verbose
    count = 0
    if verbose:
        PrintLog.message(u'Resolving redirects')
    for item in self.redirects:
        try:
            self.set_index(item, self.find(item)[:3] + (True, ))
            count += 1
            # periodic progress report
            if verbose and count % 1000 == 0:
                PrintLog.message(
                    u'Redirects resolved: {0:d}'.format(count))
        except KeyError:
            PrintLog.message(u'Unresolved redirect: {0:s} -> {1:s}'.format(
                item, self.redirects[item]))
        except CycleError:
            PrintLog.message(u'Cyclic redirect: {0:s} -> {1:s}'.format(
                item, self.redirects[item]))
    if verbose:
        PrintLog.message(u'Total redirects resolved: {0:d}'.format(count))
    return count
def title(self, category, key, title, seek):
    """Return True for article entries; log and reject anything else."""
    if key == self.KEY_ARTICLE:
        return True
    if verbose:
        PrintLog.message('Non-article: {0:s}:{1:s}'.format(
            category, title))
    return False
def main():
    """Parse options and dump each requested index entry."""
    global verbose
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hvd:e:',
                                   ['help', 'verbose', 'dir=', 'extract=', ])
    except getopt.GetoptError as err:
        usage(err)
    verbose = False
    dir = 'image/enpedia'
    extract = None
    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-d', '--dir'):
            dir = arg
        elif opt in ('-e', '--extract'):
            extract = arg
        else:
            usage('unhandled option: ' + opt)
    if not os.path.isdir(dir):
        usage('{0:s} is not a directory'.format(dir))
    idx_file = open(os.path.join(dir, "wiki.idx"), "rb")
    fnd_file = SegmentedFileReader(os.path.join(dir, "wiki{0:s}.fnd"))
    dat_format = os.path.join(dir, "wiki{0:d}.dat")
    index_min = 1
    # first little-endian uint32 of the .idx file is the entry count
    index_max = struct.unpack('<I', idx_file.read(4))[0]
    PrintLog.message('Total index entries = {0:d}'.format(index_max))
    PrintLog.message('')
    for item in args:
        try:
            # strip ',' and '_' digit separators; base 0 allows 0x.. forms
            index_number = int(item.translate(None, ',_'), 0)
        except ValueError:
            usage('"{0:s}" is not numeric'.format(item))
        if index_number < index_min or index_number > index_max:
            # fix: typo "outdide" -> "outside"
            usage('index: {0:d} is outside [{1:d} .. {2:d}]'.format(index_number, index_min, index_max))
        process(index_number, idx_file, fnd_file, dat_format, extract)
    idx_file.close()
    fnd_file.close()
def main():
    """Parse options and dump each requested index entry."""
    global verbose
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hvd:e:', [
            'help',
            'verbose',
            'dir=',
            'extract=',
        ])
    except getopt.GetoptError as err:
        usage(err)
    verbose = False
    dir = 'image/enpedia'
    extract = None
    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-d', '--dir'):
            dir = arg
        elif opt in ('-e', '--extract'):
            extract = arg
        else:
            usage('unhandled option: ' + opt)
    if not os.path.isdir(dir):
        usage('{0:s} is not a directory'.format(dir))
    idx_file = open(os.path.join(dir, "wiki.idx"), "rb")
    fnd_file = SegmentedFileReader(os.path.join(dir, "wiki{0:s}.fnd"))
    dat_format = os.path.join(dir, "wiki{0:d}.dat")
    index_min = 1
    # first little-endian uint32 of the .idx file is the entry count
    index_max = struct.unpack('<I', idx_file.read(4))[0]
    PrintLog.message('Total index entries = {0:d}'.format(index_max))
    PrintLog.message('')
    for item in args:
        try:
            # strip ',' and '_' digit separators; base 0 allows 0x.. forms
            index_number = int(item.translate(None, ',_'), 0)
        except ValueError:
            usage('"{0:s}" is not numeric'.format(item))
        if index_number < index_min or index_number > index_max:
            # fix: typo "outdide" -> "outside"
            usage('index: {0:d} is outside [{1:d} .. {2:d}]'.format(
                index_number, index_min, index_max))
        process(index_number, idx_file, fnd_file, dat_format, extract)
    idx_file.close()
    fnd_file.close()
def resolve_redirects(self):
    """Resolve stored redirects into the article index.

    Each redirect reuses its target's index entry (first three fields)
    with a True redirect flag appended.  Failures are logged and
    skipped.  Returns the number successfully resolved.
    """
    resolved = 0
    for source in self.redirects:
        try:
            entry = self.find(source)[:3] + (True,)
            self.set_index(source, entry)
        except KeyError:
            PrintLog.message(u'Unresolved redirect: {0:s} -> {1:s}'.format(source, self.redirects[source]))
        except CycleError:
            PrintLog.message(u'Cyclic redirect: {0:s} -> {1:s}'.format(source, self.redirects[source]))
        else:
            resolved += 1
    return resolved
def make_link(url, x0, x1, text):
    """Record a clickable link rectangle for the current output line."""
    global g_starty, g_curr_face, g_link_cnt, g_links, g_this_article_title
    # only emit links whose target title exists in the article index
    if article_index(url):
        try:
            esc_code10(x1 - x0)
            # rectangle spans one line height, ending at the current y
            g_links[g_link_cnt] = (x0, g_starty - get_lineheight(g_curr_face), x1, g_starty, url)
            g_link_cnt = g_link_cnt + 1
        except Exception as err:
            PrintLog.message(
                u'Exception making link {0:s} in article {1:s}: {2:s}'.format(
                    url, g_this_article_title, err.message))
def title(self, category, key, title, seek):
    """Accept articles always; accept templates only when enabled."""
    global verbose
    global enable_templates
    if key == self.KEY_ARTICLE:
        return True
    if not (enable_templates and key == self.KEY_TEMPLATE):
        return False
    if verbose:
        PrintLog.message(u'Template Title: {0:s}'.format(unicode(title, 'utf-8')))
    return True
def process_article_text(id, count, title, text, newf):
    """Tidy one article's text and append it to the output stream newf."""
    global verbose
    if verbose:
        PrintLog.message(u'[PA {0:d}] {1:s}'.format(count, title))
    text = TidyUp.article(text)
    if newf:
        # record format: "<id>:<title>\n__NOTOC__\n<text>\n***EOF***\n"
        newf.write('{0:d}:'.format(id))
        # NOTE(review): first character of title is dropped here —
        # presumably a marker/type byte; confirm against the writer side
        newf.write(title[1:].encode('utf-8'))
        # We pad the title to force the database to import strings
        newf.write('\n__NOTOC__\n')
        newf.write(text.encode('utf-8') + '\n')
        newf.write('***EOF***\n')
def main():
    """Combine per-process .idx-tmp files into a single index file.

    Reads <prefix>0.idx-tmp, <prefix>1.idx-tmp, ... until a file is
    missing, then writes the total entry count followed by all data.
    """
    global verbose
    global INDEX_ITEM_SIZE
    global UINT32_SIZE
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hvo:f:p:',
                                   ['help', 'verbose', 'output=', 'prefix='])
    except getopt.GetoptError as err:
        usage(err)
    verbose = False
    in_format = 'pedia{0:d}.idx-tmp'
    out_name = 'pedia.idx'
    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-p', '--prefix'):
            in_format = arg + '{0:d}.idx-tmp'
        elif opt in ('-o', '--output'):
            out_name = arg
        else:
            usage('unhandled option: ' + opt)
    article_count = 0
    i = 0
    data = {}
    while True:
        in_name = in_format.format(i)
        if not os.path.isfile(in_name):
            break
        if verbose:
            PrintLog.message('combining: {0:s}'.format(in_name))
        # fix: close each input file explicitly (was left to the GC)
        in_file = open(in_name, 'rb')
        try:
            data[i] = in_file.read()
        finally:
            in_file.close()
        # fix: integer division keeps the count an int
        article_count += len(data[i]) // INDEX_ITEM_SIZE
        i += 1
    out = open(out_name, 'wb')
    try:
        # leading little-endian uint32: total number of entries
        out.write(struct.pack('<I', article_count))
        for j in range(i):
            out.write(data[j])
    finally:
        out.close()
    PrintLog.message('Combined {0:d} files'.format(i))
def body(self, category, key, title, text, seek):
    """Index one article or template body."""
    global verbose
    global error_flag
    # strip left-to-right / right-to-left marks from the title
    title = self.convert(title).strip(u'\u200e\u200f')
    if self.KEY_TEMPLATE == key:
        if title not in self.ignored_templates:
            title = unicode(category, 'utf-8').capitalize() + ':' + upper_case_first_char(title)
            t_body = TidyUp.template(text)
            # '~<file-id>~' prefix namespaces templates per input file
            self.template_cursor.execute(u'insert or replace into templates (title, body) values(?, ?)',
                                         [u'~{0:d}~{1:s}'.format(self.file_id(), title), u'~' + t_body])
            self.template_count += 1
        return
    restricted = FilterWords.is_restricted(title) or FilterWords.is_restricted(text)
    self.article_count += 1
    # do closer inspection to see if really restricted
    if restricted:
        (restricted, bad_words) = FilterWords.find_restricted(text)
    if restricted:
        self.restricted_count += 1
    # periodic timing / progress report
    if self.article_count % 10000 == 0:
        start_time = time.time()
        PrintLog.message(u'Index: {0:7.2f}s {1:10d}'.format(start_time - self.time, self.article_count))
        self.time = start_time
    for t in self.language_processor.translate(title):
        generate_bigram(t)
    if verbose:
        if restricted:
            PrintLog.message(u'Restricted Title: {0:s}'.format(title))
            PrintLog.message(u' --> {0:s}'.format(bad_words))
        else:
            PrintLog.message(u'Title: {0:s}'.format(title))
        pass
    character_count = len(text)
    self.total_character_count += character_count
    # remember where this article's body lives for later output
    self.offsets[self.article_count] = (self.file_id(), title, seek, character_count, self.total_character_count)
    if self.set_index(title, (self.article_count, -1, restricted, False)):  # -1 == place holder
        PrintLog.message(u'ERROR: Duplicate Title: {0:s}'.format(title))
        error_flag = True
def body(self, category, key, title, text, seek):
    """Index one article or template body."""
    global verbose
    global error_flag
    # strip left-to-right / right-to-left marks from the title
    title = self.convert(title).strip(u'\u200e\u200f')
    if self.KEY_TEMPLATE == key:
        if title not in self.ignored_templates:
            title = unicode(category, 'utf-8').capitalize() + ':' + upper_case_first_char(title)
            t_body = TidyUp.template(text)
            # '~<file-id>~' prefix namespaces templates per input file
            self.template_cursor.execute(u'insert or replace into templates (title, body) values(?, ?)',
                                         [u'~{0:d}~{1:s}'.format(self.file_id(), title), u'~' + t_body])
            self.template_count += 1
        return
    restricted = FilterWords.is_restricted(title) or FilterWords.is_restricted(text)
    self.article_count += 1
    # do closer inspection to see if really restricted
    if restricted:
        (restricted, bad_words) = FilterWords.find_restricted(text)
    if restricted:
        self.restricted_count += 1
    # periodic timing report (suppressed in verbose mode)
    if not verbose and self.article_count % 10000 == 0:
        start_time = time.time()
        PrintLog.message(u'Index: {0:7.2f}s {1:10d}'.format(start_time - self.time, self.article_count))
        self.time = start_time
    for t in self.language_processor.translate(title):
        generate_bigram(t)
    if verbose:
        if restricted:
            PrintLog.message(u'Restricted Title: {0:s}'.format(title))
            PrintLog.message(u' --> {0:s}'.format(bad_words))
        else:
            PrintLog.message(u'Title: {0:s}'.format(title))
    character_count = len(text)
    self.total_character_count += character_count
    # remember where this article's body lives for later output
    self.offsets[self.article_count] = (self.file_id(), title, seek, character_count, self.total_character_count)
    if self.set_index(title, (self.article_count, -1, restricted, False)):  # -1 == place holder
        PrintLog.message(u'ERROR: Duplicate Title: {0:s}'.format(title))
        error_flag = True
def process(index_number, idx_file, fnd_file, dat_format):
    """dump the index and fnd file entries"""
    global verbose
    global sizes
    global distribution
    global dist_list
    global total
    global byte_count
    if verbose:
        PrintLog.message(
            'Index number = {0:10n} 0x{0:08x}'.format(index_number))
    uint32_size = 4
    index_entry_size = 2 * uint32_size + 1  # two uint32s + one file-id byte
    # skip the leading entry-count word; index numbers are 1-based
    index_offset = uint32_size + index_entry_size * (index_number - 1)
    idx_file.seek(index_offset)
    offset_dat, offset_fnd, file_id = struct.unpack(
        '<2IB', idx_file.read(index_entry_size))
    data_file_name = dat_format.format(file_id)
    dat_file = open(data_file_name, 'rb')
    dat_file.seek(offset_dat)
    # one byte page count, then 12-byte (id, offset, length) records
    number_of_pages = struct.unpack('B', dat_file.read(1))[0]
    for i in range(0, number_of_pages):
        page_id, page_offset, page_length = struct.unpack(
            '<3I', dat_file.read(12))
        # top bit of the offset marks a restricted page
        restricted = 'Restricted' if (0 != page_offset & 0x80000000) else ''
        page_offset = page_offset & 0x7fffffff
        if page_id in sizes:
            PrintLog.message('DUP: {0:10n}'.format(page_id))
        sizes[page_id] = page_length
        # accumulate a histogram of page sizes into the first fitting bucket
        for d in dist_list:
            if page_length <= d:
                distribution[d] += 1
                byte_count[d] += page_length
                total += 1
                break
    dat_file.close()
def esc_code14(width, height, data):
    """output bitmap"""
    global g_starty, g_curr_face
    global output
    if 0 == width or 0 == height:
        return
    # 1 bit per pixel, each row padded to a whole byte
    if len(data) != (width + 7) // 8 * height:
        PrintLog.message(u'Error: Corrupt Image')
    # NOTE(review): the opcode written is 15 although the function is
    # named esc_code14 — confirm against the renderer's escape-code table
    output.write(struct.pack('<BBH', 15, width, height) + data)
    lineh = get_lineheight(g_curr_face)
    # advance the baseline when the image is taller than one text line
    if height >= lineh:
        g_starty += height - lineh + 3  # since Eric draws images 3px lower for alignment
def get_imgdata(imgfile, indent):
    """Convert an image file to 1-bit-per-pixel packed bitmap data.

    Returns (width, height, data).  An image too wide for the LCD is
    rotated 90 degrees if it then fits; otherwise (0, 0, '') is
    returned.
    """
    try:
        img = gd.image(imgfile)
    except IOError as e:
        PrintLog.message(
            u'unable to open image file: {0:s} because: {1:s}'.format(
                imgfile, e))
        return (0, 0, r'')
    (width, height) = img.size()
    if width <= (LCD_WIDTH - LCD_IMG_MARGIN - indent):
        # image fits upright: scan rows left-to-right
        is_black = lambda x, y: (0, 0, 0) == img.colorComponents(
            img.getPixel((x, y)))
        h_range = range(0, width)
        v_range = range(0, height)
    elif height <= (LCD_WIDTH - LCD_IMG_MARGIN - indent):
        # rotate 90 degrees: swap axes and reverse the horizontal scan
        is_black = lambda x, y: (0, 0, 0) == img.colorComponents(
            img.getPixel((y, x)))
        v_range = range(0, width)
        h_range = range(height - 1, -1, -1)
        (width, height) = (height, width)
    else:
        PrintLog.message(u'image file: {0:s} is too big'.format(imgfile))
        return (0, 0, r'')
    data = ''
    for v in v_range:
        byte = 0
        bit_count = 8
        for h in h_range:
            if is_black(h, v):
                pixel = 1
            else:
                pixel = 0
            # pack pixels MSB-first
            bit_count -= 1
            byte |= pixel << bit_count
            if 0 == bit_count:
                data += struct.pack('<B', byte)
                byte = 0
                bit_count = 8
        # flush a partial byte at the end of each row
        if 8 != bit_count:
            data += struct.pack('<B', byte)
    return (width, height, data)
def process(index_number, idx_file, fnd_file, dat_format):
    """dump the index and fnd file entries"""
    global verbose
    global sizes
    global distribution
    global dist_list
    global total
    global byte_count
    if verbose:
        PrintLog.message('Index number = {0:10n} 0x{0:08x}'.format(index_number))
    uint32_size = 4
    index_entry_size = 2 * uint32_size + 1  # two uint32s + one file-id byte
    # skip the leading entry-count word; index numbers are 1-based
    index_offset = uint32_size + index_entry_size * (index_number - 1)
    idx_file.seek(index_offset)
    offset_dat, offset_fnd, file_id = struct.unpack('<2IB', idx_file.read(index_entry_size))
    data_file_name = dat_format.format(file_id)
    dat_file = open(data_file_name, 'rb')
    dat_file.seek(offset_dat)
    # one byte page count, then 12-byte (id, offset, length) records
    number_of_pages = struct.unpack('B', dat_file.read(1))[0]
    for i in range(0, number_of_pages):
        page_id, page_offset, page_length = struct.unpack('<3I', dat_file.read(12))
        # top bit of the offset marks a restricted page
        restricted = 'Restricted' if (0 != page_offset & 0x80000000) else ''
        page_offset = page_offset & 0x7fffffff
        if page_id in sizes:
            PrintLog.message('DUP: {0:10n}'.format(page_id))
        sizes[page_id] = page_length
        # accumulate a histogram of page sizes into the first fitting bucket
        for d in dist_list:
            if page_length <= d:
                distribution[d] += 1
                byte_count[d] += page_length
                total += 1
                break
    dat_file.close()
def output_pfx(filename):
    """output the pfx matrix

    Writes one little-endian uint32 offset for every possible
    3-character key prefix (alphabet is NUL plus all search
    characters); prefixes absent from index_matrix get offset 0.
    """
    global index_matrix
    PrintLog.message(u'Writing: {0:s}'.format(filename))
    start_time = time.time()
    out_f = open(filename, 'wb')
    # fix: do not shadow the builtin 'list'
    characters = '\0' + SearchKey.all_characters()
    for k1 in characters:
        for k2 in characters:
            for k3 in characters:
                key = k1 + k2 + k3
                # dict.get replaces the explicit membership test
                offset = index_matrix.get(key, 0)
                out_f.write(struct.pack('<I', offset))
    out_f.close()
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
def get_imgdata(imgfile, indent):
    """Convert an image file to 1-bit-per-pixel packed bitmap data.

    Returns (width, height, data).  An image too wide for the LCD is
    rotated 90 degrees if it then fits; otherwise (0, 0, '') is
    returned.
    """
    try:
        img = gd.image(imgfile)
    except IOError as e:
        PrintLog.message(u'unable to open image file: {0:s} because: {1:s}'.format(imgfile, e))
        return (0, 0, r'')
    (width, height) = img.size()
    if width <= (LCD_WIDTH - LCD_IMG_MARGIN - indent):
        # image fits upright: scan rows left-to-right
        is_black = lambda x, y: (0, 0, 0) == img.colorComponents(img.getPixel((x, y)))
        h_range = range(0, width)
        v_range = range(0, height)
    elif height <= (LCD_WIDTH - LCD_IMG_MARGIN - indent):
        # rotate 90 degrees: swap axes and reverse the horizontal scan
        is_black = lambda x, y: (0, 0, 0) == img.colorComponents(img.getPixel((y, x)))
        v_range = range(0, width)
        h_range = range(height - 1, -1, -1)
        (width, height) = (height, width)
    else:
        PrintLog.message(u'image file: {0:s} is too big'.format(imgfile))
        return (0, 0, r'')
    data = ''
    for v in v_range:
        byte = 0
        bit_count = 8
        for h in h_range:
            if is_black(h, v):
                pixel = 1
            else:
                pixel = 0
            # pack pixels MSB-first
            bit_count -= 1
            byte |= pixel << bit_count
            if 0 == bit_count:
                data += struct.pack('<B', byte)
                byte = 0
                bit_count = 8
        # flush a partial byte at the end of each row
        if 8 != bit_count:
            data += struct.pack('<B', byte)
    return (width, height, data)
def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
    """Record one redirect: templates to the database, articles in memory."""
    global whitespaces
    global verbose
    # strip directional marks; normalise whitespace in the target title
    title = self.translate(title).strip(u'\u200e\u200f')
    rtitle = self.translate(rtitle).strip().strip(u'\u200e\u200f')
    rtitle = whitespaces.sub(' ', rtitle).strip().lstrip(':')
    if self.KEY_TEMPLATE == key:
        if title != rtitle:  # ignore self-redirects
            # '~<file-id>~' prefix namespaces entries per input file
            title = unicode(category, 'utf-8') + ':' + title.lower()
            rtitle = unicode(rcategory, 'utf-8') + ':' + rtitle.lower()
            self.template_cursor.execute(u'insert or replace into redirects (title, redirect) values(?, ?)',
                                         [u'~{0:d}~{1:s}'.format(self.file_id(), title),
                                          u'~{0:d}~{1:s}'.format(self.file_id(), rtitle)])
            self.template_redirect_count += 1
        return
    if self.KEY_ARTICLE != key or self.KEY_ARTICLE != rkey:
        if verbose:
            PrintLog.message(u'Non-article Redirect: {0:s}[{1:d}]:{2:s} -> {3:s}[{4:d}]:{5:s}'
                             .format(category, key, title, rcategory, rkey, rtitle))
        return
    if '' == rtitle:
        PrintLog.message(u'Empty Redirect for: {0:s}[{1:d}]:{2:s}'.format(category, key, title))
    else:
        self.redirects[title] = rtitle
        self.redirect_count += 1
        if verbose:
            PrintLog.message(u'Redirect: {0:s}[{1:d}]:{2:s} -> {3:s}[{4:d}]:{5:s}'
                             .format(category, key, title, rcategory, rkey, rtitle))
def handle_charref(self, name):
    """handle &#DDDD; &#xXXXX;

    Decodes a numeric character reference and forwards the resulting
    character to handle_data; malformed references are logged and
    dropped.
    """
    if 0 == len(name):
        return
    if 'x' == name[0] or 'X' == name[0]:
        try:
            value = int(name[1:], 16)
        except ValueError:
            PrintLog.message(u'charref: "{0:s}" is not hexadecimal'.format(name))
            return
    elif name.isdigit():
        try:
            value = int(name)
        except ValueError:
            PrintLog.message(u'charref: "{0:s}" is not decimal'.format(name))
            return
    else:
        # fix: previously fell through with 'value' unbound -> NameError
        PrintLog.message(u'charref: "{0:s}" is not numeric'.format(name))
        return
    try:
        c = unichr(value)
    except ValueError:
        PrintLog.message(u'charref: "{0:d}" is not convertible to unicode'.format(value))
        c = '?'
    self.handle_data(c)
def handle_charref(self, name):
    """handle &#DDDD; &#xXXXX;

    Decodes a numeric character reference and forwards the resulting
    character to handle_data; malformed references are logged and
    dropped.
    """
    if 0 == len(name):
        return
    if 'x' == name[0] or 'X' == name[0]:
        try:
            value = int(name[1:], 16)
        except ValueError:
            PrintLog.message(
                u'charref: "{0:s}" is not hexadecimal'.format(name))
            return
    elif name.isdigit():
        try:
            value = int(name)
        except ValueError:
            PrintLog.message(
                u'charref: "{0:s}" is not decimal'.format(name))
            return
    else:
        # fix: previously fell through with 'value' unbound -> NameError
        PrintLog.message(
            u'charref: "{0:s}" is not numeric'.format(name))
        return
    try:
        c = unichr(value)
    except ValueError:
        PrintLog.message(
            u'charref: "{0:d}" is not convertible to unicode'.format(
                value))
        c = '?'
    self.handle_data(c)
def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
    """Record one redirect: templates to the database, articles in memory."""
    global verbose
    # strip directional marks from both titles
    title = self.convert(title).strip(u'\u200e\u200f')
    rtitle = self.convert(rtitle).strip().strip(u'\u200e\u200f')
    # redirected title may contain '%xx' items - treat as unicode sequence
    # if it fails just keep the %xx sequences intact since it must represent
    # either real %xx or some unknowable coding scheme
    try:
        rtitle = unicode(urllib.unquote(rtitle.encode('utf-8')),
                         'utf-8').strip().strip(u'\u200e\u200f')
    except UnicodeDecodeError:
        pass
    rtitle = SearchKey.compact_spaces(rtitle).lstrip(':').strip()
    if self.KEY_TEMPLATE == key:
        if title != rtitle:  # ignore self-redirects
            title = unicode(
                category, 'utf-8').capitalize() + ':' + upper_case_first_char(title)
            rtitle = unicode(
                rcategory, 'utf-8').capitalize() + ':' + upper_case_first_char(rtitle)
            # '~<file-id>~' prefix namespaces entries per input file
            self.template_cursor.execute(
                u'insert or replace into redirects (title, redirect) values(?, ?)', [
                    u'~{0:d}~{1:s}'.format(self.file_id(), title),
                    u'~{0:d}~{1:s}'.format(self.file_id(), rtitle)
                ])
            self.template_redirect_count += 1
        return
    if self.KEY_ARTICLE != key or self.KEY_ARTICLE != rkey:
        if verbose:
            PrintLog.message(
                u'Non-article Redirect: {0:s}[{1:d}]:{2:s} -> {3:s}[{4:d}]:{5:s}'
                .format(unicode(category, 'utf-8'), key, title,
                        unicode(rcategory, 'utf-8'), rkey, rtitle))
        return
    if '' == rtitle:
        PrintLog.message(u'Empty Redirect for: {0:s}[{1:d}]:{2:s}'.format(
            category, key, title))
    else:
        self.redirects[title] = rtitle
        self.redirect_count += 1
        # index the source title so searches can find the redirect
        for t in self.language_processor.translate(title):
            generate_bigram(t)
        if verbose:
            PrintLog.message(
                u'Redirect: {0:s}[{1:d}]:{2:s} -> {3:s}[{4:d}]:{5:s}'.
                format(category, key, title, rcategory, rkey, rtitle))
def write_article_index(file_offset, length):
    """Append one index record for the current article.

    Packs (data offset with restricted bit, fnd offset, length with
    encoding/file-number byte) into the index stream.
    """
    global verbose
    global output, f_out, i_out
    global g_this_article_title
    global file_number
    try:
        (article_number, fnd_offset, restricted) = article_index(g_this_article_title)
        data_offset = (file_offset & 0x7fffffff)
        if bool(int(restricted)):  # '0' is True so turn it into False
            data_offset |= 0x80000000
        data_length = (0x80 << 24) | (file_number << 24) | length  # 0x80 => lzma encoding
        i_out.write(struct.pack('III', data_offset, fnd_offset, data_length))
        i_out.flush()
    except KeyError:
        PrintLog.message(u'Error in: write_article, Title not found')
        PrintLog.message(u'Title: {0:s}'.format(g_this_article_title))
        # fix: file_offset and article_count are ints; '{0:s}' raised
        # ValueError on this error path, masking the real problem
        PrintLog.message(u'Offset: {0:d}'.format(file_offset))
        PrintLog.message(u'Count: {0:d}'.format(article_count))
def body(self, category, key, title, text, seek):
    """Count an article and report restricted-word matches."""
    global verbose, show_restricted
    restricted_title = FilterWords.is_restricted(title)
    restricted_text = FilterWords.is_restricted(text)
    restricted = restricted_title or restricted_text
    self.article_count += 1
    if restricted:
        self.restricted_count += 1
    # periodic timing report (suppressed in verbose mode)
    if not verbose and self.article_count % 10000 == 0:
        start_time = time.time()
        PrintLog.message('{0:7.2f}s {1:10d}'.format(
            start_time - self.time, self.article_count))
        self.time = start_time
    if verbose:
        PrintLog.message('Title: {0:s}'.format(title))
    if restricted:
        if restricted_title:
            t_state = ' Title'
        else:
            t_state = ''
        if restricted_text:
            b_state = ' Text'
            # closer scan: the quick check can be a false positive
            (flag, contains) = FilterWords.find_restricted(text)
            if not flag:
                self.unrestricted_count += 1
        else:
            b_state = ''
            contains = None
        if show_restricted:
            PrintLog.message('{0:10d} Restricted{1:s}{2:s}: {3:s}'.format(
                self.restricted_count, t_state, b_state, title))
            if None != contains:
                PrintLog.message(' -> {0!s:s} {1:s}'.format(
                    flag, contains))
def body(self, category, key, title, text, seek):
    """Count an article and report restricted-word matches."""
    global verbose, show_restricted
    restricted_title = FilterWords.is_restricted(title)
    restricted_text = FilterWords.is_restricted(text)
    restricted = restricted_title or restricted_text
    self.article_count += 1
    if restricted:
        self.restricted_count += 1
    # periodic timing report (suppressed in verbose mode)
    if not verbose and self.article_count % 10000 == 0:
        start_time = time.time()
        PrintLog.message('{0:7.2f}s {1:10d}'.format(start_time - self.time, self.article_count))
        self.time = start_time
    if verbose:
        PrintLog.message('Title: {0:s}'.format(title))
    if restricted:
        if restricted_title:
            t_state = ' Title'
        else:
            t_state = ''
        if restricted_text:
            b_state = ' Text'
            # closer scan: the quick check can be a false positive
            (flag, contains) = FilterWords.find_restricted(text)
            if not flag:
                self.unrestricted_count += 1
        else:
            b_state = ''
            contains = None
        if show_restricted:
            PrintLog.message('{0:10d} Restricted{1:s}{2:s}: {3:s}'
                             .format(self.restricted_count, t_state, b_state, title))
            if None != contains:
                PrintLog.message(' -> {0!s:s} {1:s}'.format(flag, contains))
def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
    """Record one redirect: templates to the database, articles in memory."""
    global verbose
    # strip directional marks from both titles
    title = self.convert(title).strip(u'\u200e\u200f')
    rtitle = self.convert(rtitle).strip().strip(u'\u200e\u200f')
    # redirected title may contain '%xx' items - treat as unicode sequence
    # if it fails just keep the %xx sequences intact since it must represent
    # either real %xx or some unknowable coding scheme
    try:
        rtitle = unicode(urllib.unquote(rtitle.encode('utf-8')),
                         'utf-8').strip().strip(u'\u200e\u200f')
    except UnicodeDecodeError:
        pass
    rtitle = SearchKey.compact_spaces(rtitle).lstrip(':').strip()
    if self.KEY_TEMPLATE == key:
        if title != rtitle:  # ignore self-redirects
            title = unicode(category, 'utf-8').capitalize() + ':' + upper_case_first_char(title)
            rtitle = unicode(rcategory, 'utf-8').capitalize() + ':' + upper_case_first_char(rtitle)
            # '~<file-id>~' prefix namespaces entries per input file
            self.template_cursor.execute(u'insert or replace into redirects (title, redirect) values(?, ?)',
                                         [u'~{0:d}~{1:s}'.format(self.file_id(), title),
                                          u'~{0:d}~{1:s}'.format(self.file_id(), rtitle)])
            self.template_redirect_count += 1
        return
    if self.KEY_ARTICLE != key or self.KEY_ARTICLE != rkey:
        if verbose:
            PrintLog.message(u'Non-article Redirect: {0:s}[{1:d}]:{2:s} -> {3:s}[{4:d}]:{5:s}'
                             .format(unicode(category, 'utf-8'), key, title,
                                     unicode(rcategory, 'utf-8'), rkey, rtitle))
        return
    if '' == rtitle:
        PrintLog.message(u'Empty Redirect for: {0:s}[{1:d}]:{2:s}'.format(category, key, title))
    else:
        self.redirects[title] = rtitle
        self.redirect_count += 1
        # index the source title so searches can find the redirect
        for t in self.language_processor.translate(title):
            generate_bigram(t)
        if verbose:
            PrintLog.message(u'Redirect: {0:s}[{1:d}]:{2:s} -> {3:s}[{4:d}]:{5:s}'
                             .format(category, key, title, rcategory, rkey, rtitle))
break (file_id, title, seek, length) = row if file_id != current_file_id: current_file_id = file_id if input_file: input_file.close() offset_cursor.execute('select filename from files where file_id = ? limit 1', (file_id,)) filename = offset_cursor.fetchone()[0] input_file = open(filename, 'rb') if not input_file: PrintlogLog.message('Failed to open: {0:s}'.format(filename)) current_file_id = None continue if verbose: PrintLog.message(u'Opened: {0:s}'.format(filename)) try: input_file.seek(seek) except Exception, e: PrintLog.message(u'seek failed: e={0:!s:s} seek={1:d} f={2:s}'.format(e, seek, filename)) sys.exit(1) # restart the background process if it fails to try to record all failing articles if None != background_process and None == process_id: process_id = subprocess.Popen(background_process, shell=True, stdin=subprocess.PIPE) try: process_article_text(current_file_id, total_articles + 1, title, input_file.read(length), process_id.stdin) except Exception, e:
def main():
    """ main processing

    Reads the pinyin data file, builds the CJK -> pinyin table,
    self-tests the table against a known conversion, then generates
    the output module.  Returns 0 on success, 1 on read error, 2 on
    self-test failure.
    """
    global verbose
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hvi:o:',
                                   ['help', 'verbose', 'input=', 'output=', ])
    except getopt.GetoptError as err:
        usage(err)
    verbose = False
    input_file_name = 'pinyin_table.txt'
    output_file_name = 'PinyinTable.py'
    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-i', '--input'):
            input_file_name = arg
        elif opt in ('-o', '--output'):
            output_file_name = arg
        else:
            usage('unhandled option: ' + opt)
    if [] != args:
        usage('Extraneous argument(s)')
    PrintLog.message(u'Reading Data File: {0:s}'.format(input_file_name))
    errors = False
    pinyin = {}
    with open(input_file_name, 'rb') as f:
        PrintLog.message(u'File Header: {0:s}'.format(f.readline().strip()))
        PrintLog.message(u'File Version: {0:s}'.format(f.readline().strip()))
        expected_lines = int(f.readline())
        line_count = 0
        char_count = 0
        for line in f:
            line_count += 1
            # line format: <phonetic> <count> <char> <char> ...
            n = line.strip().split()
            phonetic = make_pinyin(n.pop(0))
            item_count = int(n.pop(0))
            if len(n) != item_count:
                PrintLog.message(u'Error: incorrect item count, expected: {0:d} got: {1:d}'.format(item_count, len(n)))
                errors = True
                break
            for s in n:
                cjk = unicode(s, 'utf-8')[0]
                if cjk in pinyin:
                    pinyin[cjk] += [phonetic]
                else:
                    pinyin[cjk] = [phonetic]
                char_count += 1
    if line_count == expected_lines:
        PrintLog.message(u'Counted CJK glyphs: {0:d}'.format(char_count))
        PrintLog.message(u'Expected Lines: {0:d}'.format(expected_lines))
        PrintLog.message(u'Counted Lines: {0:d}'.format(line_count))
    else:
        # fix: typo "miosmatch" -> "mismatch"
        PrintLog.message(u'Error: linecount mismatch: {0:d} != {1:d}'.format(expected_lines, line_count))
        errors = True
    if errors:
        PrintLog.message(u'Error: failed to read data file')
        return 1
    else:
        # fix: typo "Sucessfully" -> "Successfully"
        PrintLog.message(u'Data Read Completed Successfully')
    # self-test: convert a known sample and compare
    text = u'欧洲,软件+互联网[用统一码] 歐洲,軟體及網際網路[讓統一碼] ABC 西安 先'
    expected = u'ōuzhōu,ruănjiàn+hùliánwăng[yòngtŏngyīmă] ōuzhōu,ruăntĭjíwăngjìwănglù[ràngtŏngyīmă] ABC xīān xiān'
    result = u''
    for c in text:
        if c in pinyin:
            result += pinyin[c][0]
        else:
            result += c
    if result == expected:
        PrintLog.message(u'Creating: {0:s}'.format(output_file_name))
        generate_output(output_file_name, 6, pinyin)
        PrintLog.message(u'Finished: {0:s}'.format(output_file_name))
    else:
        PrintLog.message(u'Error in test:')
        PrintLog.message(u'input: {0:s}'.format(text))
        PrintLog.message(u'output: {0:s}'.format(result))
        PrintLog.message(u'expected: {0:s}'.format(expected))
        return 2
    return 0
def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
    """Count a redirect record, logging it in verbose mode."""
    self.redirect_count = self.redirect_count + 1
    if not verbose:
        return
    PrintLog.message('Redirect: {0:s}:{1:s} -> {2:s}:{3:s}'.format(
        category, title, rcategory, rtitle))
def main():
    """Build the article index, search (.fnd) and prefix (.pfx) files."""
    global verbose
    global error_flag
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hvi:o:c:t:I:l:p:L:T',
                                   ['help', 'verbose',
                                    'article-index=',
                                    'article-offsets=',
                                    'article-counts=',
                                    'templates=',
                                    'ignore-templates=',
                                    'limit=',
                                    'prefix=',
                                    'language=',
                                    'truncate-title',
                                    ])
    except getopt.GetoptError as err:
        usage(err)
    verbose = False
    art_name = "articles.db"
    off_name = "offsets.db"
    cnt_name = "counts.text"
    fnd_name = 'pedia{0:s}.fnd'
    pfx_name = 'pedia.pfx'
    template_name = 'templates.db'
    ignore_templates_name = None
    limit = 'all'
    language = 'en'          # some languages may require special processing
    truncate_title = False   # set true when not using language links
    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-i', '--article-index'):
            art_name = arg
        elif opt in ('-o', '--article-offsets'):
            off_name = arg
        elif opt in ('-c', '--article-counts'):
            cnt_name = arg
        elif opt in ('-t', '--templates'):
            template_name = arg
        elif opt in ('-I', '--ignore-templates'):
            ignore_templates_name = arg
            if not os.path.exists(ignore_templates_name):
                usage(u'ignore-templates file: {0:s} does not exist'.format(ignore_templates_name))
        elif opt in ('-T', '--truncate-title'):
            truncate_title = True
        elif opt in ('-l', '--limit'):
            # allow a 'k' suffix meaning thousands
            if arg[-1] == 'k':
                arg = arg[:-1] + '000'
            if arg != 'all':
                try:
                    limit = int(arg)
                except ValueError:
                    usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
                if limit <= 0:
                    usage('"{0:s}={1:s}" must be > zero'.format(opt, arg))
        elif opt in ('-p', '--prefix'):
            fnd_name = arg + '{0:s}.fnd'
            pfx_name = arg + '.pfx'
        elif opt in ('-L', '--language'):
            language = arg
        else:
            usage('unhandled option: ' + opt)
    if [] == args:
        usage('Missing argument(s)')
    # read the list of template titles to ignore ('#' lines are comments)
    ignored_templates = {}
    if None != ignore_templates_name:
        with open(ignore_templates_name) as f:
            for l in f.readlines():
                line = unicode(l, 'utf-8').strip()
                if line.startswith('#'):
                    continue
                if '' != line:
                    ignored_templates[line] = True
    # Japanese needs its own title-translation rules
    language_convert = LanguageTranslation.LanguageNormal()
    if 'ja' == language:
        language_convert = LanguageTranslation.LanguageJapanese()
    processor = FileProcessing(articles=art_name, offsets=off_name,
                               templates=template_name,
                               ignored_templates=ignored_templates,
                               language=language_convert)
    for f in args:
        limit = processor.process(f, limit)
        if limit != 'all' and limit <= 0:
            break
    # record initial counts
    a = processor.article_count
    r = processor.redirect_count
    # fix up redirects
    m = a + processor.resolve_redirects()
    # record combined count and display statistics
    s = a + r
    cf = open(cnt_name, 'w')
    for f in (sys.stdout, cf):
        f.write('Articles: {0:10d}\n'.format(a))
        f.write('Redirects: {0:10d}\n'.format(r))
        f.write('Sum: {0:10d}\n'.format(s))
        f.write('Merged: {0:10d}\n'.format(m))
        f.write('Difference: {0:10d}\n'.format(m - s))
        f.write('Restricted: {0:10d}\n'.format(processor.restricted_count))
        f.write('Templates: {0:10d}\n'.format(processor.template_count))
        f.write('rTemplates: {0:10d}\n'.format(processor.template_redirect_count))
        f.write('Characters: {0:10d}\n'.format(processor.total_character_count))
    cf.close()
    output_fnd(fnd_name, processor, language_convert, truncate_title)
    output_pfx(pfx_name)
    del processor
    # return non-zero status if there have been any errors
    if error_flag:
        PrintLog.message('*** ERROR in Index build')
        PrintLog.message('*** Currently "Duplicate Title" is the only condition that causes this error')
        PrintLog.message('*** Likely "license.xml" or "terms.xml" file duplicates a title in main wiki file')
        PrintLog.message('*** Manually edit "license.xml" or "terms.xml" file to change the title')
        sys.exit(1)
def handle_endtag(self, tag):
    """Close an HTML tag: unwind the tag stack to the matching start tag,
    then emit the rendering escape codes / state changes for this tag.

    Recurses to force-close any tags left open above the match.
    </html> always flushes the finished article via write_article().
    """
    global g_this_article_title
    global article_count
    global warnings

    # ignore end tag without start tag
    if (tag, True) not in self.tag_stack and (tag, False) not in self.tag_stack:
        if warnings:
            (line, column) = self.getpos()
            PrintLog.message(u'Warning: superfluous </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                             .format(tag, line, column, article_count + 1, g_this_article_title))
        return

    # backtrack up the stack closing each open tag until there is a match
    (start_tag, self.printing) = self.tag_stack.pop()
    while start_tag != tag:
        # push back so the recursive call can pop it again
        self.tag_stack.append((start_tag, self.printing))
        if warnings:
            (line, column) = self.getpos()
            PrintLog.message(u'Warning: force </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                             .format(start_tag, line, column, article_count + 1, g_this_article_title))
        self.handle_endtag(start_tag)
        (start_tag, self.printing) = self.tag_stack.pop()

    # must always do </html> tag
    if tag == 'html':
        self.printing = True
        self.tag_stack = []
        self.in_html = False
        esc_code1()
        write_article(self.language_links)
        return

    if not self.printing:
        return
    elif tag == 'script':
        pass
    elif tag == 'title':
        self.in_title = False
        g_this_article_title = g_this_article_title.strip()
    elif tag == 'body':
        self.in_body = False
        self.flush_buffer()
    elif tag == 'table':
        if self.in_table > 0:
            self.in_table -= 1

    # if in a table suppress everything after this point
    if self.in_table > 0:
        return
    elif tag == 'h1':
        self.flush_buffer()
        self.in_h1 = False
        esc_code0(H1_MARGIN_BOTTOM)
    elif tag == 'h2':
        self.flush_buffer()
        self.in_h2 = False
    elif tag == 'h3':
        self.flush_buffer()
        self.in_h3 = False
    elif tag == 'h4':
        self.flush_buffer()
        self.in_h4 = False
    elif tag == 'h5':
        self.flush_buffer()
        self.in_h5 = False
    elif tag == 'h6':
        self.flush_buffer()
        self.in_h6 = False
    elif tag == 'div':
        self.flush_buffer()
    elif tag == 'p':
        self.flush_buffer()
        self.in_p = False
    elif tag == 'blockquote':
        self.flush_buffer()
        if self.quote > 0:
            # margins were only applied below MAX_QUOTE_LEVEL on the way in,
            # so only undo them below that level on the way out
            if self.quote < MAX_QUOTE_LEVEL:
                self.indent -= BLOCKQUOTE_MARGIN_LEFT
                self.lwidth += BLOCKQUOTE_MARGIN_LEFT + BLOCKQUOTE_MARGIN_RIGHT
                esc_code9(-BLOCKQUOTE_MARGIN_LEFT)
            self.quote -= 1
    elif tag == 'b':
        self.in_b = False
    elif tag == 'big':
        self.in_b = False
    elif tag == 'strong':
        self.in_b = False
    elif tag == 'i':
        self.in_i = False
    elif tag == 'del':
        self.in_del = False
    elif tag == 'ins':
        self.in_ins = False
    elif tag == 'a':
        self.in_a = False
        self.url = ""
    elif tag in ['ul', 'ol', 'dl']:
        self.leave_list()
    elif tag == 'li':
        if 0 == self.level:
            # </li> outside any list: report (if enabled) and ignore
            if warnings:
                (line, column) = self.getpos()
                PrintLog.message(u'Warning: stray </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                 .format(tag, line, column, article_count + 1, g_this_article_title))
        else:
            self.flush_buffer(False)
            self.list_decrease_indent()
            self.li_inside[self.level] = False
    elif tag == 'dd':
        self.flush_buffer()
        self.list_decrease_indent()
    elif tag == 'dt':
        self.flush_buffer()
    elif tag == 'br':
        self.flush_buffer()
        self.in_br = False
    elif tag == 'img':
        self.in_img = False
def main():
    """Command-line entry point for the article parser.

    Reads article offsets from the offsets database, seeks into the raw
    wiki dump files and pipes each article's text through an external
    parser process (PHP parser, or 'cat' with --just-cat), producing one
    concatenated XHTML output plus a '<out>.count' total for the renderer.
    """
    global verbose
    global PARSER_COMMAND
    global total_articles

    try:
        opts, args = getopt.getopt(sys.argv[1:],
                                   'hvx:s:c:o:t:l:V:jnw:T:',
                                   ['help', 'verbose',
                                    'xhtml=',
                                    'start=',
                                    'count=',
                                    'article-offsets=',
                                    'templates=',
                                    'language=',
                                    'language-variant=',
                                    'just-cat',
                                    'no-output',
                                    'parser-workdir=',
                                    'parser-tempdir=',
                                    ])
    except getopt.GetoptError as err:
        usage(err)

    # default option values
    verbose = False
    out_name = 'all_articles.html'
    off_name = 'offsets.db'
    parser_workdir = '/tmp'
    parser_tempdir = os.path.join(parser_workdir, 'tmp')
    start_article = 1
    article_count = 'all'
    failed_articles = 0
    do_output = True
    template_name = 'templates.db'
    language = 'en'
    language_variant = ''

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-x', '--xhtml'):
            out_name = arg
        elif opt in ('-o', '--article-offsets'):
            off_name = arg
        elif opt in ('-t', '--templates'):
            template_name = arg
        elif opt in ('-l', '--language'):
            language = arg
        elif opt in ('-V', '--language-variant'):
            language_variant = arg
        elif opt in ('-w', '--parser-workdir'):
            parser_workdir = arg
        elif opt in ('-T', '--parser-tempdir'):
            parser_tempdir = arg
        elif opt in ('-j', '--just-cat'):
            PARSER_COMMAND = 'cat'
        elif opt in ('-n', '--no-output'):
            do_output = False
        elif opt in ('-s', '--start'):
            # allow a 'k' suffix meaning thousands
            if arg[-1] == 'k':
                arg = arg[:-1] + '000'
            try:
                start_article = int(arg)
            except ValueError:
                usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
            if start_article < 1:
                usage('"{0:s}={1:s}" must be >= 1'.format(opt, arg))
        elif opt in ('-c', '--count'):
            if arg[-1] == 'k':
                arg = arg[:-1] + '000'
            if arg != 'all':
                try:
                    article_count = int(arg)
                except ValueError:
                    usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
                if article_count <= 0:
                    usage('"{0:s}={1:s}" must be > zero'.format(opt, arg))
        else:
            usage('unhandled option: ' + opt)

    if not os.path.isdir(parser_workdir):
        usage('workdir: {0:s} does not exist'.format(parser_workdir))
    if not os.path.isdir(parser_tempdir):
        usage('tempdir: {0:s} does not exist'.format(parser_tempdir))

    # pass parameters to the PHP parser
    os.environ['WORKDIR'] = parser_workdir
    os.environ['TEMPDIR'] = parser_tempdir
    os.environ['LANGUAGE'] = language.lower()
    os.environ['LANGUAGE_VARIANT'] = language_variant.lower().replace('_', '-')  # e.g. zh_TW -> zh-tw
    os.environ['TEMPLATE_DB'] = template_name

    # read-only lookups; pragmas trade durability for speed
    offset_db = sqlite3.connect(off_name)
    offset_db.execute('pragma synchronous = 0')
    offset_db.execute('pragma temp_store = 2')
    offset_db.execute('pragma read_uncommitted = true')
    offset_db.execute('pragma cache_size = 20000000')
    offset_db.execute('pragma default_cache_size = 20000000')
    offset_db.execute('pragma journal_mode = off')
    offset_cursor = offset_db.cursor()

    if do_output:
        background_process = PARSER_COMMAND + ' > ' + out_name
    else:
        background_process = None

    # process all required articles
    out_base_name = os.path.basename(out_name)  # for logging messages
    current_file_id = None
    input_file = None
    process_id = None
    total_articles = 0
    start_time = time.time()
    while article_count == 'all' or article_count != 0:
        offset_cursor.execute('select file_id, title, seek, length from offsets where article_number = ? limit 1',
                              (start_article,))
        row = offset_cursor.fetchone()
        if None == row:
            break
        (file_id, title, seek, length) = row
        # switch input files lazily as the article stream crosses file boundaries
        if file_id != current_file_id:
            current_file_id = file_id
            if input_file:
                input_file.close()
            offset_cursor.execute('select filename from files where file_id = ? limit 1',
                                  (file_id,))
            filename = offset_cursor.fetchone()[0]
            input_file = open(filename, 'rb')
            if not input_file:
                PrintLog.message('Failed to open: {0:s}'.format(filename))
                current_file_id = None
                continue
            if verbose:
                PrintLog.message(u'Opened: {0:s}'.format(filename))
        try:
            input_file.seek(seek)
        except Exception as e:
            # NOTE(review): the '{0:!s:s}' format spec looks malformed (conversion
            # belongs before the colon, as in '{0!s}') and would itself raise when
            # this path is hit — confirm and fix upstream
            PrintLog.message(u'seek failed: e={0:!s:s} seek={1:d} f={2:s}'.format(e, seek, filename))
            sys.exit(1)

        # restart the background process if it fails to try to record all failing articles
        if None != background_process and None == process_id:
            process_id = subprocess.Popen(background_process, shell=True, stdin=subprocess.PIPE)

        try:
            process_article_text(current_file_id, total_articles + 1, title,
                                 input_file.read(length), process_id.stdin)
        except Exception as e:
            failed_articles += 1
            # extract from log by: grep '^!' log-file
            PrintLog.message(u'!Process failed, file: {0:s} article({1:d}): {2:s} because: {3!s:s}'
                             .format(filename, total_articles, title, e))
            trace = sys.exc_info()
            if None != trace:
                traceback.print_tb(trace[2])
            # drop the (possibly wedged) parser; it restarts next iteration
            process_id.stdin.close()
            process_id.wait()
            process_id = None

        if article_count != 'all':
            article_count -= 1
        total_articles += 1
        start_article += 1
        # periodic progress report when not verbose
        if not verbose and total_articles % 1000 == 0:
            if 0 != failed_articles:
                failed_message = 'Failed: {0:d}'.format(failed_articles)
            else:
                failed_message = ''
            now_time = time.time()
            PrintLog.message(u'Parse[{0:s}]: {1:7.2f}s {2:10d} {3:s}'
                             .format(out_base_name, now_time - start_time, total_articles, failed_message))
            start_time = now_time

    # close files
    if input_file:
        input_file.close()

    # wait for background process to finish
    if process_id:
        process_id.stdin.close()
        process_id.wait()

    # output some statistics and create count file
    PrintLog.message(u'Parse[{0:s}]: Total: {1:d}'.format(out_base_name, total_articles))

    # write the total count for Rendering program
    fd = open(out_name + '.count', 'wb')
    if fd is not None:
        fd.write('TOTAL_ARTICLES = {count:d}\n'.format(count=total_articles))
        fd.close()

    # indicate failures
    if 0 != failed_articles:
        PrintLog.message(u'Parse[{0:s}]: Failed: {1:d}'.format(out_base_name, failed_articles))
        sys.exit(1)
def output_fnd(filename, article_index, language_processor, truncate_title):
    """Write the .fnd search-index file.

    Layout: 128 two-byte bigrams, then one record per title of
    <u32 article_number> NUL <bigram-encoded title> NUL <utf-8 title> NUL.
    Titles are delta-compressed against the previous record except every
    16th record, which is a full (restart) record.  Also populates the
    global index_matrix with the file offset of each 1/2/3-character
    title prefix, and stores each record's offset back into article_index.
    """
    global bigram
    global index_matrix
    global MAXIMUM_TITLE_LENGTH
    global MAXIMUM_TITLE_ACTUAL

    PrintLog.message(u'Writing bigrams: {0:s}'.format(filename))
    start_time = time.time()
    out_f = open(filename, 'wb')

    # order bigrams by descending frequency
    sortedgram = [(value, key) for key, value in bigram.iteritems()]
    sortedgram.sort()
    sortedgram.reverse()

    # keep the 128 most frequent; map each to a single byte 0x80..0xff
    bigram = {}
    i = 0
    for k, v in sortedgram:
        out_f.write(v)
        bigram[v] = chr(i + 128)
        i += 1
        if i >= 128:
            break
    # pad the table to exactly 128 entries
    while i < 128:
        out_f.write('zz')
        bigram['zz'] = chr(i + 128)
        i += 1
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

    # create pfx matrix and write encoded titles
    PrintLog.message(u'Sorting titles')
    start_time = time.time()
    article_list = [(SearchKey.make_key(language_processor.translate(title)), title)
                    for title in article_index.all_indices()]
    article_list.sort()
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

    PrintLog.message(u'Writing matrix: {0:s}'.format(filename))
    start_time = time.time()
    index_matrix = {}
    index_matrix['\0\0\0'] = out_f.tell()
    previous_bigram_title = ''
    previous_utf8_title = ''
    mod_counter = 0
    for stripped_title, title in article_list:
        bigram_title = bigram_encode(stripped_title)[:MAXIMUM_TITLE_LENGTH]
        (article_number, dummy, restricted, is_redirect) = article_index.get_index(title)
        # redirects whose title encodes to nothing are unsearchable; skip them
        if '' == bigram_title and is_redirect:
            continue
        utf8_title = title.encode('utf-8')
        if truncate_title:
            utf8_title = utf8_title[:MAXIMUM_TITLE_LENGTH]
        else:
            utf8_title = utf8_title[:MAXIMUM_TITLE_ACTUAL]
        offset = out_f.tell()
        # remember where this record landed (used later by the device firmware)
        article_index.set_index(title, (article_number, offset, restricted, is_redirect))
        # record the offset of the first title for each 1/2/3-char prefix
        key3 = (stripped_title[0:3] + '\0\0\0')[0:3].lower()
        key2 = key3[0:2] + '\0'
        key1 = key3[0:1] + '\0\0'
        if key1 not in index_matrix:
            index_matrix[key1] = offset
        if key2 not in index_matrix:
            index_matrix[key2] = offset
        if key3 not in index_matrix:
            index_matrix[key3] = offset
        # every 16th record is a restart point (no delta compression)
        if 0 == mod_counter & 0x0f:
            bigram_common_length = 0
            utf8_common_length = 0
        else:
            bigram_common_length = common_prefix_length(previous_bigram_title, bigram_title)
            utf8_common_length = common_prefix_length(previous_utf8_title, utf8_title)
        mod_counter += 1
        previous_bigram_title = bigram_title
        previous_utf8_title = utf8_title
        # replace the shared prefix by one byte: (shared length - 1), < 0x20
        if bigram_common_length > 1:
            bigram_title = chr(bigram_common_length - 1) + bigram_title[bigram_common_length:]
        if utf8_common_length > 1:
            utf8_title = chr(utf8_common_length - 1) + utf8_title[utf8_common_length:]
        out_f.write(struct.pack('<I', article_number) + '\0' + bigram_title + '\0' + utf8_title + '\0')
    out_f.close()
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
def main():
    """Command-line entry point for the .fnd dump/debug tool.

    Reads a (possibly segmented) .fnd file: first the 128-entry bigram
    table, then every title record, undoing the prefix delta compression
    and the bigram encoding, and prints each decoded entry.
    """
    global verbose

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hv', ['help', 'verbose', ])
    except getopt.GetoptError as err:
        usage(err)

    verbose = False
    uint32_size = 4  # bytes in the little-endian article number field

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        else:
            usage('unhandled option: ' + opt)

    if len(args) < 1:
        usage('missing arguments')

    fnd_file = SegmentedFileReader(args)

    total_entries = 0
    # bytes 0x80..0xff each expand to a two-character bigram
    bigram_table = {}
    for i in range(128, 256):
        bigram_table[i] = fnd_file.read(2)

    previous_title1 = ''
    previous_title2 = ''
    while True:
        fnd_offset = fnd_file.tell()
        # record header: <u32 article number><NUL>
        header = fnd_file.read(uint32_size + 1)
        if 0 == len(header):
            break
        article_number, nul_byte = struct.unpack('<IB', header)
        title1 = get_title(fnd_file)  # bigram-encoded title
        title2 = get_title(fnd_file)  # utf-8 title
        total_entries += 1
        length1 = len(title1)
        length2 = len(title2)
        # a leading control byte (< 0x20) means: reuse (byte+1) chars
        # of the previous title as prefix
        if 0 != length1 and title1[0] < ' ':
            prefix_length = ord(title1[0]) + 1
            title1 = previous_title1[:prefix_length] + title1[1:]
        if 0 != length2 and title2[0] < ' ':
            prefix_length = ord(title2[0]) + 1
            title2 = previous_title2[:prefix_length] + title2[1:]
        full_length1 = len(title1)
        full_length2 = len(title2)
        # expand bigram bytes back to character pairs
        decoded_title1 = ''
        for c in title1:
            i = ord(c)
            if i in bigram_table:
                decoded_title1 += bigram_table[i]
            else:
                decoded_title1 += c
        # NOTE(review): 'l2' is passed below but never used by the format
        # string; the last line prints {fl1}/{fl2} where {l2}/{fl2} was
        # probably intended — confirm before changing the output
        PrintLog.message(u'Index: {an:13n} @ Offset: {of:13n} [0x{of:08x}]\n'
                         u'{pad1:s}[{l1:3d}/{fl1:3d}]:{t1!r:s}\n'
                         u'{pad1:s}{pad2}{dt1!r:s}\n'
                         u'{pad1:s}[{fl1:3d}/{fl2:3d}]:"{t2:s}"\n'.format(
                             of=fnd_offset,
                             an=article_number,
                             l1=length1,
                             fl1=full_length1,
                             t1=title1,
                             dt1=decoded_title1,
                             pad1=' ' * 2,
                             pad2=' ' * (2 * 3 + 4),
                             l2=length2,
                             fl2=full_length2,
                             t2=truncated_utf8(title2)))
        previous_title1 = title1
        previous_title2 = title2

    fnd_file.close()
    PrintLog.message(u'Total entries = {0:13n}'.format(total_entries))
f.write('Restricted: {0:10d}\n'.format(processor.restricted_count)) f.write('Templates: {0:10d}\n'.format(processor.template_count)) f.write('rTemplates: {0:10d}\n'.format(processor.template_redirect_count)) f.write('Characters: {0:10d}\n'.format(processor.total_character_count)) cf.close() output_fnd(fnd_name, processor, language_convert, truncate_title) output_pfx(pfx_name) del processor # return non-zero status if there have been any errors if error_flag: PrintLog.message('*** ERROR in Index build') PrintLog.message('*** Currently "Duplicate Title" is the only condition that causes this error') PrintLog.message('*** Likely "license.xml" or "terms.xml" file duplicates a title in main wiki file') PrintLog.message('*** Manually edit "license.xml" or "terms.xml" file to change the title') sys.exit(1) def generate_bigram(text): """create bigram from pairs of characters""" global bigram if len(text) > 2: try: if SearchKey.is_valid_character(text[0]) and SearchKey.is_valid_character(text[1]): bigram[text[0:2]] += 1 except KeyError:
def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
    """Record one redirect (source -> target); log it in verbose mode."""
    # count every redirect, whether or not it is logged
    self.redirect_count += 1
    if not verbose:
        return
    PrintLog.message('Redirect: {0:s}:{1:s} -> {2:s}:{3:s}'.format(
        category, title, rcategory, rtitle))
def write_article(language_links):
    """Assemble and emit one rendered article.

    Builds the binary article: an 8-byte header, the packed link
    rectangles from g_links, the encoded language links, then the
    rendered body from the global EscapeBuffer.  In compress mode the
    result goes to the ArticleWriter (indexed by article_index lookup);
    otherwise it is written raw to f_out (test mode).

    Also emits periodic progress messages and updates the global
    article_count.
    """
    global compress
    global verbose
    global output, f_out, i_out
    global article_count
    global g_this_article_title
    global file_number
    global start_time
    global article_writer

    article_count += 1
    if verbose:
        PrintLog.message(u'[MWR {0:d}] {1:s}'.format(article_count, g_this_article_title))
    elif article_count % 1000 == 0:
        # progress report every 1000 articles when not verbose
        now_time = time.time()
        PrintLog.message(u'Render[{0:d}]: {1:7.2f}s {2:10d}'.format(
            file_number, now_time - start_time, article_count))
        start_time = now_time

    # create links: one <u32 x0/y0, u32 x1/y1, u32 target> triple per link
    links_stream = io.BytesIO('')

    for i in g_links:
        (x0, y0, x1, y1, url) = g_links[i]
        links_stream.write(struct.pack('<3I',
                                       (y0 << 8) | x0,
                                       (y1 << 8) | x1,
                                       link_number(url)))

    links_stream.flush()
    links = links_stream.getvalue()
    links_stream.close()

    # create language links
    links_stream = io.BytesIO('')

    japanese_convert = LanguageTranslation.LanguageJapanese().translate
    normal_convert = LanguageTranslation.LanguageNormal().translate

    for l in language_links:
        language, link = l.split(':', 1)
        language = language.strip()
        link = link.strip()
        # only need the first pronunciation for the link
        # as this must always be present
        if link is not None and '' != link:
            if 'ja' == language:
                stripped = japanese_convert(link)[0]
            else:
                stripped = normal_convert(link)[0]
            stripped = SearchKey.strip_accents(stripped)
            if link == stripped:
                # link is already in search-key form: store it once
                links_stream.write(l.encode('utf-8') + '\0')
            else:
                # store search key and display form separated by \1
                links_stream.write((language + '#' + stripped).encode('utf-8')
                                   + '\1' + link.encode('utf-8') + '\0')

    links_stream.flush()
    langs = links_stream.getvalue()
    links_stream.close()

    # create the header (header size = 8)
    header = struct.pack('<I2H', 8 + len(links) + len(langs), g_link_cnt, 0)
    body = output.fetch()

    # combine the data
    whole_article = header + links + langs + body

    if compress:
        try:
            (article_number, fnd_offset, restricted) = article_index(g_this_article_title)
            restricted = bool(int(restricted))  # '0' is True so turn it into False
            article_writer.add_article(article_number, whole_article, fnd_offset, restricted)
        except KeyError:
            PrintLog.message(u'Error in: write_article, Title not found')
            PrintLog.message(u'Title: {0:s}'.format(g_this_article_title))
            # BUG FIX: article_count is an int; the former '{0:s}' spec raised
            # ValueError here and masked the "Title not found" report
            PrintLog.message(u'Count: {0:d}'.format(article_count))
    else:
        f_out.write(whole_article)
        f_out.flush()
def main():
    """Command-line entry point for the renderer.

    Opens the bitmap fonts and the article index database, then renders
    each input XHTML file through WrProcess into either the compressed
    .dat/.idx-tmp pair (normal mode) or a single raw file (--test mode).
    """
    global verbose, warnings, compress
    global f_out, output, i_out
    global font_id_values
    global file_number
    global article_count
    global article_db
    global start_time
    global article_writer
    global MAXIMUM_ARTICLES_PER_BLOCK

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hvwn:d:p:i:t:f:L:l:a:b:m:',
                                   ['help', 'verbose', 'warnings',
                                    'number=', 'data-prefix=', 'index-prefix=',
                                    'article-index=', 'test=', 'font-path=',
                                    'language=', 'language-links=', 'images=',
                                    'articles=', 'block-size=',
                                    'max-article-length=', ])
    except getopt.GetoptError as err:
        usage(err)

    # default option values
    verbose = False
    warnings = False
    data_file = 'pedia{0:d}.dat'
    index_file = 'pedia{0:d}.idx-tmp'
    art_file = 'articles.db'
    file_number = 0
    test_file = ''
    font_path = "../fonts"
    article_db = None
    language = 'en'
    inter_links = True
    enable_images = True
    articles_per_block = 32
    block_size = 262144
    max_article_length = 'unlimited'

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-w', '--warnings'):
            warnings = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-t', '--test'):
            test_file = arg
        elif opt in ('-i', '--article-index'):
            art_file = arg
        elif opt in ('-n', '--number'):
            try:
                file_number = int(arg)
            except ValueError:
                usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
        elif opt in ('-d', '--data-prefix'):
            data_file = arg + '{0:d}.dat'
        elif opt in ('-p', '--index-prefix'):
            index_file = arg + '{0:d}.idx-tmp'
        elif opt in ('-f', '--font-path'):
            font_path = arg
        elif opt in ('-L', '--language'):
            language = arg.lower()
        elif opt in ('-l', '--language-links'):
            arg = arg.lower()
            inter_links = ('yes' == arg)
        # NOTE(review): '-l' is repeated below, so this branch can only be
        # reached via '--images'; the short option is dead — confirm which
        # short letter was intended for images
        elif opt in ('-l', '--images'):
            arg = arg.lower()
            enable_images = ('yes' == arg)
        elif opt in ('-a', '--articles'):
            try:
                articles_per_block = int(arg)
            except ValueError:
                usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
            if articles_per_block < 1 or articles_per_block > MAXIMUM_ARTICLES_PER_BLOCK:
                usage('"{o:s}={a:s}" is out of range [1..{m:d}]'.format(
                    o=opt, a=arg, m=MAXIMUM_ARTICLES_PER_BLOCK))
        elif opt in ('-b', '--block-size'):
            try:
                block_size = int(arg)
            except ValueError:
                usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
            if block_size < 65536 or block_size > 524288:
                usage('"{0:s}={1:s}" is out of range [65536..524288]'.format(opt, arg))
        elif opt in ('-m', '--max-article-length'):
            if 'unlimited' == arg.lower():
                max_article_length = 'unlimited'
            else:
                try:
                    max_article_length = int(arg)
                except ValueError:
                    usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
                if max_article_length < 0:
                    usage('"{0:s}={1:s}" is out of range [0..unlimited]'.format(opt, arg))
        else:
            usage('unhandled option: ' + opt)

    start_time = time.time()

    # open all the bitmap font files used by the word-wrapper
    f_fontr = open(os.path.join(font_path, "text.bmf"), "rb")
    f_fonti = open(os.path.join(font_path, "texti.bmf"), "rb")
    f_fontt = open(os.path.join(font_path, "title.bmf"), "rb")
    f_fontst = open(os.path.join(font_path, "subtitle.bmf"), "rb")
    f_font_all = open(os.path.join(font_path, "textall.bmf"), "rb")
    f_fontt_all = open(os.path.join(font_path, "titleall.bmf"), "rb")
    f_fontst_all = open(os.path.join(font_path, "subtlall.bmf"), "rb")

    font_id_values = {
        ITALIC_FONT_IDX: f_fonti,
        DEFAULT_FONT_IDX: f_fontr,
        TITLE_FONT_IDX: f_fontt,
        TITLE_ALL_FONT_IDX: f_fontt_all,
        SUBTITLE_FONT_IDX: f_fontst,
        SUBTITLE_ALL_FONT_IDX: f_fontst_all,
        DEFAULT_ALL_FONT_IDX: f_font_all
    }

    # article index lookups; pragmas trade durability for speed
    article_db = sqlite3.connect(art_file)
    article_db.execute('pragma auto_vacuum = none')
    article_db.execute('pragma synchronous = off')
    article_db.execute('pragma temp_store = memory')
    article_db.execute('pragma locking_mode = normal')
    article_db.execute('pragma read_uncommitted = true')
    article_db.execute('pragma cache_size = 20000000')
    article_db.execute('pragma default_cache_size = 20000000')
    article_db.execute('pragma journal_mode = off')

    # callback: the escape buffer reports vertical-position adjustments
    def y_adjust(inc):
        global g_starty
        g_starty += inc

    output = EscapeBuffer.EscapeBuffer(callback=y_adjust, max_length=max_article_length)

    if test_file == '':
        # normal mode: compressed data blocks plus index
        compress = True
        i_out = open(index_file.format(file_number), 'wb')
        f_out = open(data_file.format(file_number), 'wb')
        article_writer = ArticleWriter(file_number, f_out, i_out,
                                       max_buckets=50, bucket_size=block_size,
                                       max_items_per_bucket=articles_per_block)
    else:
        # test mode: raw uncompressed output
        compress = False
        f_out = open(test_file, 'wb')

    for name in args:
        f = codecs.open(name, 'r', 'utf-8', 'replace')
        # the parser stage wrote '<name>.count' with the expected total
        t = get_parameter_value(name + '.count', 'TOTAL_ARTICLES')
        if t is not None:
            PrintLog.message("Render[{0:d}]: Total: {1:s}".format(file_number, t))
        WrProcess(f, language, inter_links, enable_images)
        f.close()

    for item in font_id_values:
        font_id_values[item].close()

    if output != None:
        del output
    if article_writer != None:
        del article_writer
    if f_out != None:
        f_out.close()
    if i_out != None:
        i_out.close()
    if article_db != None:
        article_db.close()

    # NOTE(review): the font files were already closed in the loop above;
    # this second pass closes them again (harmless on file objects, but
    # redundant — probably left over from a refactor)
    for i in font_id_values:
        font_id_values[i].close()

    # final message
    PrintLog.message("Render[{0:d}]: Total: {1:d}".format(file_number, article_count))
def __del__(self):
    """Flush all collected data on destruction.

    Commits and closes the template database, dumps the in-memory
    file/article/offset tables to tab-separated import files, then bulk
    loads them into the articles and offsets sqlite databases via the
    sqlite3 command-line tool (much faster than row-by-row inserts).
    """
    PrintLog.message(u'Flushing databases')
    self.template_db.commit()
    self.template_cursor.close()
    self.template_db.close()

    PrintLog.message(u'Writing: files')
    start_time = time.time()
    i = 0
    with open(self.file_import, 'w') as f:
        for filename in self.file_list:
            f.write('{0:d}\t{1:s}\n'.format(i, filename))
            i += 1
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

    PrintLog.message(u'Writing: articles')
    start_time = time.time()
    with open(self.article_import, 'w') as f:
        for title in self.articles:
            (article_number, fnd_offset, restricted, is_redirect) = self.articles[title]
            f.write('~' + title.encode('utf-8'))    # force string
            f.write('\t{0:d}\t{1:d}\t{2:d}\t{3:d}\n'.format(article_number, fnd_offset,
                                                            restricted, is_redirect))
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

    PrintLog.message(u'Writing: offsets')
    start_time = time.time()
    with open(self.offset_import, 'w') as f:
        for article_number in self.offsets:
            (file_id, title, seek, length, accumulated) = self.offsets[article_number]
            f.write('{0:d}\t{1:d}\t'.format(article_number, file_id))
            f.write('~' + title.encode('utf-8'))    # force string
            f.write('\t{0:d}\t{1:d}\t{2:d}\n'.format(seek, length, accumulated))
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

    # bulk load via the sqlite3 CLI; '.import' reads the tab-separated file
    PrintLog.message(u'Loading: articles')
    start_time = time.time()
    p = subprocess.Popen('sqlite3 > /dev/null 2>&1 ' + self.article_db_name,
                         shell=True, stdin=subprocess.PIPE)
    p.stdin.write("""
create table articles (
    title varchar primary key,
    article_number integer,
    fnd_offset integer,
    restricted integer,
    is_redirect integer
);
pragma synchronous = 0;
pragma temp_store = 2;
pragma locking_mode = exclusive;
pragma cache_size = 20000000;
pragma default_cache_size = 20000000;
pragma journal_mode = memory;
.mode tabs
.import {0:s} articles
.exit
""".format(self.article_import))
    p.stdin.close()
    p.wait()
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

    PrintLog.message(u'Loading: offsets and files')
    start_time = time.time()
    p = subprocess.Popen('sqlite3 > /dev/null 2>&1 ' + self.offset_db_name,
                         shell=True, stdin=subprocess.PIPE)
    p.stdin.write("""
create table offsets (
    article_number integer primary key,
    file_id integer,
    title varchar,
    seek integer,
    length integer,
    accumulated integer
);
create table files (
    file_id integer primary key,
    filename varchar
);
pragma synchronous = 0;
pragma temp_store = 2;
pragma locking_mode = exclusive;
pragma cache_size = 20000000;
pragma default_cache_size = 20000000;
pragma journal_mode = memory;
.mode tabs
.import {0:s} offsets
.import {1:s} files
.exit
""".format(self.offset_import, self.file_import))
    p.stdin.close()
    p.wait()
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
def handle_starttag(self, tag, attrs):
    """Open an HTML tag: push it on the tag stack, collect language
    links, and emit the rendering escape codes / state changes for the
    tags that affect output (headings, lists, quotes, links, images...).

    Content marked class="noprint", and anything while inside a table,
    is suppressed.
    """
    global g_starty, g_curr_face, g_halign
    global g_this_article_title, g_links, g_link_cnt
    global warnings

    attrs = dict(attrs)

    # must always do the <html> tag
    if tag == 'html':
        self.local_init()
        self.in_html = True
        self.tag_stack = [(tag, True)]
        return

    self.tag_stack.append((tag, self.printing))

    # we want to skip content that isn't for printing
    if 'class' in attrs:
        if 'noprint' in attrs['class']:
            self.printing = False

        # create a list of language links
        if self.inter_links and tag == 'a' and 'lang-link' in attrs['class']:
            link = attrs['href']
            (lang, data) = link.split(':', 1)
            if lang != self.language:
                self.language_links.append(link)

    # handle the tags
    if not self.printing:
        return
    elif tag == 'script':
        self.printing = False
    elif tag == 'title':
        self.in_title = True
        g_this_article_title = ''
    elif tag == 'body':
        self.in_body = True
    elif tag == 'table':
        self.in_table += 1

    # if in a table suppress everything after this point
    if self.in_table > 0:
        return
    elif tag == 'h1':
        self.flush_buffer()
        self.in_h1 = True
        esc_code0(H1_MARGIN_TOP)
    elif tag == 'h2':
        self.flush_buffer()
        self.in_h2 = True
        esc_code0(H2_MARGIN_TOP)
    elif tag == 'h3':
        self.flush_buffer()
        self.in_h3 = True
        esc_code0(H3_MARGIN_TOP)
    elif tag == 'h4':
        self.flush_buffer()
        self.in_h4 = True
        esc_code0(H4_MARGIN_TOP)
    elif tag == 'h5':
        self.flush_buffer()
        self.in_h5 = True
        esc_code0(H5_MARGIN_TOP)
    elif tag == 'h6':
        self.flush_buffer()
        self.in_h6 = True
        esc_code0(H6_MARGIN_TOP)
    elif tag == 'div':
        self.flush_buffer()
        # suppress thumb info boxes
        if 'class' in attrs:
            c = attrs['class'].lower()
            for ignore in ['thumb',
                           'left',
                           'right',
                           'dablink',
                           'magnify',
                           'navframe',
                           'navtoggle',
                           'navcontent',
                           ]:
                if ignore in c:
                    self.printing = False
                    return
        esc_code0(DIV_MARGIN_TOP)
    elif tag == 'p':
        self.flush_buffer()
        self.in_p = True
        #esc_code0(P_MARGIN_TOP)
    elif tag == 'blockquote' or tag == 'pre':
        self.flush_buffer()
        self.quote += 1
        # only narrow the line up to MAX_QUOTE_LEVEL; deeper nesting is
        # counted but not indented further (undone in handle_endtag)
        if self.quote < MAX_QUOTE_LEVEL:
            esc_code0(BLOCKQUOTE_MARGIN_TOP)
            self.indent += BLOCKQUOTE_MARGIN_LEFT
            self.lwidth -= BLOCKQUOTE_MARGIN_LEFT + BLOCKQUOTE_MARGIN_RIGHT
            esc_code9(BLOCKQUOTE_MARGIN_LEFT)
    elif tag == 'b':
        self.in_b = True
    elif tag == 'i':
        self.in_i = True
    elif tag == 'big':           # Not sure what to do with this one
        self.in_b = True
    elif tag == 'strong':
        self.in_b = True
    elif tag == 'del':
        self.in_del = True
    elif tag == 'ins':
        self.in_ins = True
    elif tag == 'a' and 'href' in attrs:
        self.in_a = True
        self.url = attrs['href']
    elif tag in ['ul', 'ol', 'dl']:
        if 'start' in attrs:
            # extract the first run of digits from the start attribute
            list_start = re.sub(r'^\D*(\d+)\D?.*$', r'\1', attrs['start'])
            try:
                list_start = int(list_start)
            except ValueError:
                list_start = 1
            self.enter_list(tag, list_start)
        else:
            self.enter_list(tag)
    elif tag == 'li':
        if 0 == self.level:
            # <li> outside any list
            if warnings:
                (line, column) = self.getpos()
                PrintLog.message(
                    u'Warning: stray <{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                    .format(tag, line, column, article_count + 1, g_this_article_title))
            (t, p) = self.tag_stack.pop()
            return  # just ignore it
        # handle missing </li> at the same level
        # simulate </li> and continue
        if self.li_inside[self.level]:
            if warnings:
                (line, column) = self.getpos()
                PrintLog.message(
                    u'Warning: missing </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                    .format(tag, line, column, article_count + 1, g_this_article_title))
            (t, p) = self.tag_stack.pop()
            self.flush_buffer(False)
            self.list_decrease_indent()
        self.li_inside[self.level] = True
        if 'value' in attrs:
            # explicit item number, e.g. <li value="5">
            list_index = re.sub(r'^\D*(\d+)\D?.*$', r'\1', attrs['value'])
            try:
                self.li_cnt[self.level] = int(list_index)
            except ValueError:
                pass
        else:
            self.li_cnt[self.level] += 1
        if self.li_type[self.level] == 'ol':
            # ordered list: emit "<n>."
            self.wordwrap.append(("{0:d}".format(self.li_cnt[self.level])) + u".",
                                 DEFAULT_FONT_IDX, None)
        else:
            # unordered list: emit a bullet for this nesting depth
            if self.level > LIMAX_BULLETS:
                bullet_num = LIMAX_BULLETS
            else:
                bullet_num = self.level
            self.wordwrap.append(bullet_c[bullet_num], DEFAULT_FONT_IDX, None)
        self.flush_buffer()
        self.list_increase_indent()
    elif tag == 'dd':
        if 0 == self.level:
            # <dd> outside any definition list
            if warnings:
                (line, column) = self.getpos()
                PrintLog.message(
                    u'Warning: stray <{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                    .format(tag, line, column, article_count + 1, g_this_article_title))
            (t, p) = self.tag_stack.pop()
            return  # just ignore it
        esc_code0(LIST_MARGIN_TOP)
        if not self.li_inside[self.level]:
            self.li_cnt[self.level] += 1
            self.li_inside[self.level] = True
            self.list_increase_indent()
        elif warnings:
            (line, column) = self.getpos()
            PrintLog.message(
                u'Warning: nested <{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                .format(tag, line, column, article_count + 1, g_this_article_title))
    elif tag == 'dt':
        if 0 == self.level:
            # <dt> outside any definition list
            if warnings:
                (line, column) = self.getpos()
                PrintLog.message(
                    u'Warning: stray <{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                    .format(tag, line, column, article_count + 1, g_this_article_title))
            (t, p) = self.tag_stack.pop()
            return  # just ignore it
        # close unterminated 'dd'
        # i.e. have this <dt>tag</dt><dd>xxxxx<dt>tag2</dt>.......
        if self.li_inside[self.level]:
            if warnings:
                (line, column) = self.getpos()
                PrintLog.message(
                    u'Warning: unterminated <{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                    .format('dd', line, column, article_count + 1, g_this_article_title))
            # temporarily remove the current <dt> so the simulated </dd>
            # unwinds correctly, then restore it
            (t, p) = self.tag_stack.pop()
            self.handle_endtag('dd')
            self.tag_stack.append((t, p))
        esc_code0(LIST_MARGIN_TOP)
    elif tag == 'br':
        self.flush_buffer()
        esc_code0(BR_MARGIN_TOP)
        self.in_br = True
    elif tag == 'img' and 'src' in attrs:
        # include either image or the 'alt' text
        if self.enable_images:
            (width, height, data) = get_imgdata(attrs['src'], self.indent)
            self.wordwrap.AppendImage(width, height, data, None)
        elif 'alt' in attrs:
            self.handle_data(attrs['alt'])
        self.in_img = True
def output_fnd(filename_format, article_index, language_processor, truncate_title):
    """write the .fnd search file: bigram table followed by all encoded titles

    filename_format     -- template for segment file names (SegmentedFileWriter)
    article_index       -- index object; offsets are written back via set_index
    language_processor  -- supplies translate(); one title may yield several
                           translated/search forms
    truncate_title      -- if True clip stored utf-8 titles to
                           MAXIMUM_TITLE_LENGTH, else MAXIMUM_TITLE_ACTUAL

    Side effects: rebuilds the global 'bigram' encode table and the global
    'index_matrix' (title prefix -> fnd file offset).
    """
    global bigram
    global index_matrix
    global MAXIMUM_TITLE_LENGTH
    global MAXIMUM_TITLE_ACTUAL
    global FND_FILE_SEGMENT_SIZE

    start_time = time.time()
    out_f = SegmentedFileWriter(filename_format, FND_FILE_SEGMENT_SIZE)
    PrintLog.message(u'Writing bigrams: {0:s}'.format(out_f.current_filename))

    # sort bigrams by descending frequency; the 128 most frequent become
    # the encode table, each mapped to a single byte 0x80..0xff
    sortedgram = [ (value, key) for key, value in bigram.iteritems() ]
    sortedgram.sort()
    sortedgram.reverse()

    bigram = {}
    i = 0
    for k, v in sortedgram:
        out_f.write(v)
        bigram[v] = chr(i + 128)
        i += 1
        if i >= 128:
            break
    # pad the on-disk table to exactly 128 two-character entries
    while i < 128:
        out_f.write('zz')
        bigram['zz'] = chr(i + 128)
        i += 1
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

    # create pfx matrix and write encoded titles
    PrintLog.message(u'Sorting titles')
    start_time = time.time()

    # build (search-key, title) pairs; dict keys de-duplicate repeats
    # produced by multiple translations of the same title
    unique_articles = {}
    for article in [ (SearchKey.make_key(translated_title[:MAXIMUM_TITLE_LENGTH]), title)
                     for title in article_index.all_indices()
                     for translated_title in language_processor.translate(title) ]:
        unique_articles[article] = 1
    article_list = sorted(unique_articles.keys())
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

    PrintLog.message(u'Writing matrix: {0:s}'.format(out_f.current_filename))
    start_time = time.time()

    index_matrix = {}
    index_matrix['\0\0\0'] = out_f.tell()
    previous_bigram_title = ''
    previous_utf8_title = ''
    mod_counter = 0
    for stripped_title, title in article_list:
        bigram_title = bigram_encode(stripped_title)[:MAXIMUM_TITLE_LENGTH]
        (article_number, dummy, restricted, is_redirect) = article_index.get_index(title)
        # skip redirects whose encoded search title is empty
        if '' == bigram_title and is_redirect:
            continue
        utf8_title = title.encode('utf-8')
        if truncate_title:
            utf8_title = utf8_title[:MAXIMUM_TITLE_LENGTH]
        else:
            utf8_title = utf8_title[:MAXIMUM_TITLE_ACTUAL]
        offset = out_f.tell()
        # record where this title's fnd entry starts
        article_index.set_index(title, (article_number, offset, restricted, is_redirect))
        # remember the offset of the first entry for each 1/2/3 character
        # prefix (nul padded to three bytes)
        key3 = (stripped_title[0:3] + '\0\0\0')[0:3].lower()
        key2 = key3[0:2] + '\0'
        key1 = key3[0:1] + '\0\0'
        if key1 not in index_matrix:
            index_matrix[key1] = offset
        if key2 not in index_matrix:
            index_matrix[key2] = offset
        if key3 not in index_matrix:
            index_matrix[key3] = offset
        # front coding: every 16th entry is stored in full (restart point);
        # the rest may share a prefix with their predecessor
        if 0 == mod_counter & 0x0f:
            bigram_common_length = 0
            utf8_common_length = 0
        else:
            bigram_common_length = common_prefix_length(previous_bigram_title, bigram_title)
            utf8_common_length = common_prefix_length(previous_utf8_title, utf8_title)
        mod_counter += 1
        previous_bigram_title = bigram_title
        previous_utf8_title = utf8_title
        # a leading control byte (< 0x20) encodes shared-prefix-length - 1
        if bigram_common_length > 1:
            bigram_title = chr(bigram_common_length - 1) + bigram_title[bigram_common_length:]
        if utf8_common_length > 1:
            utf8_title = chr(utf8_common_length - 1) + utf8_title[utf8_common_length:]
        # entry layout: u32 article number, nul, encoded title, nul,
        # utf-8 title, nul
        out_f.write(struct.pack('<I', article_number) + '\0' + bigram_title + '\0' + utf8_title + '\0')

    PrintLog.message(u'Final segment: {0:s}'.format(out_f.current_filename))
    out_f.close()
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
def main(): global verbose global sizes global distribution global dist_list global total global byte_count try: opts, args = getopt.getopt(sys.argv[1:], 'hvd:', ['help', 'verbose', 'dir=']) except getopt.GetoptError as err: usage(err) verbose = False dir = 'image/enpedia' for opt, arg in opts: if opt in ('-v', '--verbose'): verbose = True elif opt in ('-h', '--help'): usage(None) elif opt in ('-d', '--dir'): dir = arg else: usage('unhandled option: ' + opt) if not os.path.isdir(dir): usage('{0:s} is not a directory'.format(dir)) idx_file = open(os.path.join(dir, "wiki.idx"), "rb") fnd_file = open(os.path.join(dir, "wiki.fnd"), "rb") dat_format = os.path.join(dir, "wiki{0:d}.dat") index_min = 1 index_max = struct.unpack('<I', idx_file.read(4))[0] PrintLog.message('Total index entries = {0:d}'.format(index_max)) total = 0 sizes = {} distribution = {} byte_count = {} dist_list = [ 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 5000, 7500, 10000, 20000, 50000, 100000, 200000, 500000, 99999999 ] for d in dist_list: distribution[d] = 0 byte_count[d] = 0 for item in range(index_max): index_number = 1 + item if index_number not in sizes: process(index_number, idx_file, fnd_file, dat_format) PrintLog.message('{0:>10s} {1:>20s} {2:>20s} {3:>14s}'.format( 'Size(<=)', 'Articles', 'Accumulated', 'Bytes')) sum = 0 for i in dist_list: sum += distribution[i] PrintLog.message( '{0:10n} = {1:10n} {2:7.1f} % {3:10n} {4:7.1f} % {5:14n}'.format( i, distribution[i], 100.0 * distribution[i] / index_max, sum, 100.0 * sum / index_max, byte_count[i])) PrintLog.message('summed = {0:10n}'.format(sum)) PrintLog.message('sizes = {0:10n}'.format(len(sizes))) PrintLog.message('total = {0:10n}'.format(total)) idx_file.close() fnd_file.close()
def get_imgdata(imgfile, indent): try: img = gd.image(imgfile) except IOError, e: PrintLog.message(u'unable to open image file: {0:s}'.format(imgfile)) return (0, 0, r'')
def handle_starttag(self, tag, attrs):
    """dispatch an opening HTML tag to the rendering state machine

    Pushes (tag, printing-state) on self.tag_stack, suppresses
    non-printable content, and emits margin/list escape codes for the
    tags that affect layout.
    """
    global g_starty, g_curr_face, g_halign
    global g_this_article_title, g_links, g_link_cnt
    global warnings

    attrs = dict(attrs)

    # must always do the <html> tag
    if tag == 'html':
        self.local_init()
        self.in_html = True
        self.tag_stack = [(tag, True)]
        return

    self.tag_stack.append((tag, self.printing))

    # we want to skip content that isn't for printing
    if 'class' in attrs:
        if 'noprint' in attrs['class']:
            self.printing = False
        # create a list of language links
        if self.inter_links and tag == 'a' and 'lang-link' in attrs['class']:
            self.language_links.append(attrs['href'])

    # handle the tags
    if not self.printing:
        return;
    elif tag == 'script':
        # suppress everything inside <script>
        self.printing = False
    elif tag == 'title':
        self.in_title = True
        g_this_article_title = ''
    elif tag == 'body':
        self.in_body = True
    elif tag == 'table':
        self.in_table += 1
        # if in a table suppress everything after this point
        if self.in_table > 0:
            return
    elif tag == 'h1':
        self.flush_buffer()
        self.in_h1 = True
        esc_code0(H1_MARGIN_TOP)
    elif tag == 'h2':
        self.flush_buffer()
        self.in_h2 = True
        esc_code0(H2_MARGIN_TOP)
    elif tag == 'h3':
        self.flush_buffer()
        self.in_h3 = True
        esc_code0(H3_MARGIN_TOP)
    elif tag == 'h4':
        self.flush_buffer()
        self.in_h4 = True
        esc_code0(H4_MARGIN_TOP)
    elif tag == 'h5':
        self.flush_buffer()
        self.in_h5 = True
        esc_code0(H5_MARGIN_TOP)
    elif tag == 'h6':
        self.flush_buffer()
        self.in_h6 = True
        esc_code0(H6_MARGIN_TOP)
    elif tag == 'div':
        self.flush_buffer()
        # suppress thumb info boxes
        if 'class' in attrs:
            c = attrs['class']
            if 'thumb' in c or 'left' in c or 'right' in c \
                    or 'dablink' in c or 'magnify' in c:
                self.printing = False
                return
        esc_code0(DIV_MARGIN_TOP)
    elif tag == 'p':
        self.flush_buffer()
        self.in_p = True
        esc_code0(P_MARGIN_TOP)
    elif tag == 'blockquote':
        self.flush_buffer()
        self.quote += 1
        # margins only adjusted up to the nesting limit; handle_endtag
        # reverses this symmetrically
        if self.quote < MAX_QUOTE_LEVEL:
            esc_code0(BLOCKQUOTE_MARGIN_TOP)
            self.indent += BLOCKQUOTE_MARGIN_LEFT
            self.lwidth -= BLOCKQUOTE_MARGIN_LEFT + BLOCKQUOTE_MARGIN_RIGHT
            esc_code9(BLOCKQUOTE_MARGIN_LEFT)
    elif tag == 'b':
        self.in_b = True
    elif tag == 'i':
        self.in_i = True
    elif tag == 'big':
        # Not sure what to do with this one
        self.in_b = True
    elif tag == 'strong':
        self.in_b = True
    elif tag == 'del':
        self.in_del = True
    elif tag == 'ins':
        self.in_ins = True
    elif tag == 'a' and 'href' in attrs:
        self.in_a = True
        self.url = attrs['href']
    elif tag in ['ul', 'ol', 'dl']:
        if 'start' in attrs:
            # extract the first run of digits from the start attribute
            list_start = re.sub(r'^\D*(\d+)\D?.*$', r'\1', attrs['start'])
            try:
                list_start = int(list_start)
            except ValueError:
                list_start = 1
            self.enter_list(tag, list_start)
        else:
            self.enter_list(tag)
    elif tag == 'li':
        if 0 == self.level:
            if warnings:
                (line, column) = self.getpos()
                PrintLog.message(u'Warning: stray <{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                 .format(tag, line, column,
                                         article_count + 1,
                                         g_this_article_title))
            (t, p) = self.tag_stack.pop()
            return # just ignore it
            # force ul since this is a li without a parent
            #(t, p) = self.tag_stack.pop()
            #self.tag_stack.append(('ul', p))
            #self.tag_stack.append((t,p))
            #self.enter_list('ul')
        # handle missing </li> at the same level
        # simulate </li> and continue
        if self.li_inside[self.level]:
            if warnings:
                (line, column) = self.getpos()
                PrintLog.message(u'Warning: missing </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                 .format(tag, line, column,
                                         article_count + 1,
                                         g_this_article_title))
            (t, p) = self.tag_stack.pop()
            self.flush_buffer(False)
            self.list_decrease_indent()
        self.li_inside[self.level] = True
        if 'value' in attrs:
            # explicit item number: first run of digits from the value
            list_index = re.sub(r'^\D*(\d+)\D?.*$', r'\1', attrs['value'])
            try:
                self.li_cnt[self.level] = int(list_index)
            except ValueError:
                pass
        else:
            self.li_cnt[self.level] += 1
        if self.li_type[self.level] == 'ol':
            # ordered list: emit the item number
            self.wordwrap.append(("{0:d}".format(self.li_cnt[self.level])) + u".",
                                 DEFAULT_FONT_IDX, None)
        else:
            # unordered list: pick a bullet for this nesting level
            if self.level > LIMAX_BULLETS:
                bullet_num = LIMAX_BULLETS
            else:
                bullet_num = self.level
            self.wordwrap.append(bullet_c[bullet_num], DEFAULT_FONT_IDX, None)
        self.flush_buffer()
        self.list_increase_indent()
    elif tag == 'dd':
        if 0 == self.level:
            if warnings:
                (line, column) = self.getpos()
                PrintLog.message(u'Warning: stray <{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                 .format(tag, line, column,
                                         article_count + 1,
                                         g_this_article_title))
            (t, p) = self.tag_stack.pop()
            return # just ignore it
        self.li_cnt[self.level] += 1
        self.list_increase_indent()
    elif tag == 'br':
        self.in_br = True
    elif tag == 'img' and 'src' in attrs:
        # include either image or the 'alt' text
        if self.enable_images:
            (width, height, data) = get_imgdata(attrs['src'], self.indent)
            self.wordwrap.AppendImage(width, height, data, None)
        elif 'alt' in attrs:
            self.handle_data(attrs['alt'])
        self.in_img = True
def handle_endtag(self, tag):
    """dispatch a closing HTML tag, unwinding the tag stack as needed

    Ignores end tags that were never opened, force-closes any tags left
    open above the matching start tag, and reverses the layout state
    changes made in handle_starttag.
    """
    global g_this_article_title
    global article_count
    global warnings

    # ignore end tag without start tag
    if (tag, True) not in self.tag_stack and (tag, False) not in self.tag_stack:
        if warnings:
            (line, column) = self.getpos()
            PrintLog.message(u'Warning: superfluous </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                             .format(tag, line, column,
                                     article_count + 1,
                                     g_this_article_title))
        return

    # backtrack up the stack closing each open tag until there is a match
    (start_tag, self.printing) = self.tag_stack.pop()
    while start_tag != tag:
        self.tag_stack.append((start_tag, self.printing))
        if warnings:
            (line, column) = self.getpos()
            PrintLog.message(u'Warning: force </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                             .format(start_tag, line, column,
                                     article_count + 1,
                                     g_this_article_title))
        self.handle_endtag(start_tag)
        (start_tag, self.printing) = self.tag_stack.pop()

    # must always do </html> tag
    if tag == 'html':
        self.printing = True
        self.tag_stack = []
        self.in_html = False
        esc_code1()
        write_article(self.language_links)
        return

    if not self.printing:
        return
    elif tag == 'script':
        pass
    elif tag == 'title':
        self.in_title = False
        g_this_article_title = g_this_article_title.strip()
    elif tag == 'body':
        self.in_body = False
        self.flush_buffer()
    elif tag == 'table':
        if self.in_table > 0:
            self.in_table -= 1
        # if in a table suppress everything after this point
        if self.in_table > 0:
            return
    elif tag == 'h1':
        self.flush_buffer()
        self.in_h1 = False
        esc_code0(H1_MARGIN_BOTTOM)
        esc_code_separate() # force the above escape code be output
    elif tag == 'h2':
        self.flush_buffer()
        self.in_h2 = False
    elif tag == 'h3':
        self.flush_buffer()
        self.in_h3 = False
    elif tag == 'h4':
        self.flush_buffer()
        self.in_h4 = False
    elif tag == 'h5':
        self.flush_buffer()
        self.in_h5 = False
    elif tag == 'h6':
        self.flush_buffer()
        self.in_h6 = False
    elif tag == 'div':
        self.flush_buffer()
    elif tag == 'p':
        self.flush_buffer()
        self.in_p = False
    elif tag == 'blockquote' or tag == 'pre':
        self.flush_buffer()
        if self.quote > 0:
            # margins were only applied below MAX_QUOTE_LEVEL; undo
            # symmetrically with handle_starttag
            if self.quote < MAX_QUOTE_LEVEL:
                self.indent -= BLOCKQUOTE_MARGIN_LEFT
                self.lwidth += BLOCKQUOTE_MARGIN_LEFT + BLOCKQUOTE_MARGIN_RIGHT
                esc_code9(-BLOCKQUOTE_MARGIN_LEFT)
            self.quote -= 1
    elif tag == 'b':
        self.in_b = False
    elif tag == 'big':
        self.in_b = False
    elif tag == 'strong':
        self.in_b = False
    elif tag == 'i':
        self.in_i = False
    elif tag == 'del':
        self.in_del = False
    elif tag == 'ins':
        self.in_ins = False
    elif tag == 'a':
        self.in_a = False
        self.url = ""
    elif tag in ['ul', 'ol', 'dl']:
        self.leave_list()
    elif tag == 'li':
        if 0 == self.level:
            if warnings:
                (line, column) = self.getpos()
                PrintLog.message(u'Warning: stray </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                 .format(tag, line, column,
                                         article_count + 1,
                                         g_this_article_title))
        else:
            self.flush_buffer(False)
            self.list_decrease_indent()
            self.li_inside[self.level] = False
    elif tag == 'dd':
        if 0 == self.level:
            # fix: was "if warnings or True:" - a debug leftover that made
            # this warning unconditional, unlike every other warning site
            if warnings:
                (line, column) = self.getpos()
                PrintLog.message(u'Warning: stray </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                 .format(tag, line, column,
                                         article_count + 1,
                                         g_this_article_title))
        else:
            self.flush_buffer()
            esc_code0(LIST_MARGIN_TOP)
            if self.li_inside[self.level]:
                self.li_inside[self.level] = False
            self.list_decrease_indent()
    elif tag == 'dt':
        self.flush_buffer()
        esc_code0(LIST_MARGIN_TOP)
    elif tag == 'br':
        self.in_br = False
    elif tag == 'img':
        self.in_img = False
def main(): global verbose try: opts, args = getopt.getopt(sys.argv[1:], 'hv', ['help', 'verbose', ]) except getopt.GetoptError as err: usage(err) verbose = False uint32_size = 4 for opt, arg in opts: if opt in ('-v', '--verbose'): verbose = True elif opt in ('-h', '--help'): usage(None) else: usage('unhandled option: ' + opt) if len(args) < 1: usage('missing arguments') fnd_file = SegmentedFileReader(args) total_entries = 0 bigram_table = {} for i in range(128,256): bigram_table[i] = fnd_file.read(2) previous_title1 = '' previous_title2 = '' while True: fnd_offset = fnd_file.tell() header = fnd_file.read(uint32_size + 1) if 0 == len(header): break article_number, nul_byte = struct.unpack('<IB', header) title1 = get_title(fnd_file) title2 = get_title(fnd_file) total_entries += 1 length1 = len(title1) length2 = len(title2) if 0 != length1 and title1[0] < ' ': prefix_length = ord(title1[0]) + 1 title1 = previous_title1[:prefix_length] + title1[1:] if 0 != length2 and title2[0] < ' ': prefix_length = ord(title2[0]) + 1 title2 = previous_title2[:prefix_length] + title2[1:] full_length1 = len(title1) full_length2 = len(title2) decoded_title1 = '' for c in title1: i = ord(c) if i in bigram_table: decoded_title1 += bigram_table[i] else: decoded_title1 += c PrintLog.message(u'Index: {an:13n} @ Offset: {of:13n} [0x{of:08x}]\n' u'{pad1:s}[{l1:3d}/{fl1:3d}]:{t1!r:s}\n' u'{pad1:s}{pad2}{dt1!r:s}\n' u'{pad1:s}[{fl1:3d}/{fl2:3d}]:"{t2:s}"\n' .format(of = fnd_offset, an = article_number, l1 = length1, fl1 = full_length1, t1 = title1, dt1 = decoded_title1, pad1 = ' ' * 2, pad2 = ' ' * (2 * 3 + 4), l2 = length2, fl2 = full_length2, t2 = truncated_utf8(title2))) previous_title1 = title1 previous_title2 = title2 fnd_file.close() PrintLog.message(u'Total entries = {0:13n}'.format(total_entries))
def handle_entityref(self, name): """handle & > ...""" try: self.handle_data(unichr(htmlentitydefs.name2codepoint[name])) except KeyError: PrintLog.message(u'ENTITYREF ERROR: {0:s} article: {1:s}'.format(name, g_this_article_title))
def process(index_number, idx_file, fnd_file, dat_format, extract):
    """dump the index and fnd file entries

    index_number -- 1-based article number to dump
    idx_file     -- open wiki.idx file object
    fnd_file     -- open wiki.fnd file object
    dat_format   -- format string mapping a file id to a data file name
    extract      -- output name prefix for decompressed articles,
                    or None to skip extraction
    """
    PrintLog.message('Index number = {0:13n} [0x{0:08x}]'.format(index_number))
    PrintLog.message('')

    # index record layout: data offset (u32), fnd offset (u32), file id (u8);
    # records start after the leading u32 entry count
    uint32_size = 4
    index_entry_size = 2 * uint32_size + 1
    index_offset = uint32_size + index_entry_size * (index_number - 1)
    idx_file.seek(index_offset)
    offset_dat, offset_fnd, file_id = struct.unpack(
        '<2IB', idx_file.read(index_entry_size))
    data_file_name = dat_format.format(file_id)
    PrintLog.message('Index offset = {0:13n} [0x{0:08x}]'.format(index_offset))
    PrintLog.message('Data offset = {0:13n} [0x{0:08x}]'.format(offset_dat))
    PrintLog.message('FND offset = {0:13n} [0x{0:08x}]'.format(offset_fnd))
    PrintLog.message('File ID = {0:13n} [0x{0:08x}] => "{1:s}"'.format(
        file_id, data_file_name))

    # the fnd record should point back at the same article number
    fnd_file.seek(offset_fnd)
    article_index_check = struct.unpack('<I', fnd_file.read(uint32_size))[0]
    index_match = '(Matches)' if article_index_check == index_number else '(**MISMATCHED INDEX**)'
    PrintLog.message('FND index = {0:13n} [0x{0:08x}] {1:s}'.format(
        article_index_check, index_match))
    ignored = fnd_file.read(1) # skip nul byte
    titles = fnd_file.read(1024).split('\0') # >= 2 * MAX_TITLE_SEARCH
    PrintLog.message('FND title = "{0!r:s}"'.format(
        titles[1])) # actual title

    dat_file = open(data_file_name, 'rb')
    dat_file.seek(offset_dat)
    # first byte at the data offset: number of page records that follow
    number_of_pages = struct.unpack('<B', dat_file.read(1))[0]
    PrintLog.message(
        'Data Pages = {0:13n} [0x{0:08x}]'.format(number_of_pages))
    PrintLog.message('')

    total_article_bytes = 0
    PrintLog.message('{0:>29s}{1:>25s}{2:>25s}'.format('Article Number',
                                                       'Article Offset',
                                                       'Uncompressed Length'))
    for i in range(0, number_of_pages):
        # page record: article number, offset, uncompressed length (3 * u32)
        page_id, page_offset, page_length = struct.unpack(
            '<3I', dat_file.read(3 * uint32_size))
        # top bit of the offset flags a restricted article
        restricted = 'Restricted' if (0 != page_offset & 0x80000000) else ''
        page_offset = page_offset & 0x7fffffff
        total_article_bytes += page_length
        PrintLog.message(
            '{0:3d}: {1:10n} [0x{1:08x}] {2:10n} [0x{2:08x}] {3:10n} [0x{3:08x}] {4:s}'
            .format(i, page_id, page_offset, page_length, restricted))
    PrintLog.message('{0:<{1}s}{2:10n} [0x{2:08x}]'.format(
        'Total bytes: ', 3 + 3 + 10 + 4 + 8 + 3 + 10 + 4 + 8 + 3,
        total_article_bytes))
    PrintLog.message('')

    # the page table is followed by a u32 length and that many bytes of
    # article data (decompressed below by DecompressData)
    data_length = struct.unpack('<I', dat_file.read(4))[0]
    PrintLog.message('DataLength = {0:13n} [0x{0:08x}]'.format(data_length))
    article_data = dat_file.read(data_length)
    dat_file.close()

    if extract is not None:
        output_file_name = extract + '-I' + str(index_number) + '-b' + str(
            data_length) + '.articles'
        PrintLog.message('Extracting uncompressed articles to: {0:s}'.format(
            output_file_name))
        out = open(output_file_name, 'wb')
        out.write(DecompressData(article_data))
        out.close()
        PrintLog.message('')
def write_article(language_links):
    """emit one rendered article: header + links + language links + body

    Consumes the global rendering buffer 'output' (and resets it), then
    either hands the assembled article to article_writer (compressed
    mode) or appends it raw to f_out.
    """
    global compress
    global verbose
    global output, f_out, i_out
    global article_count
    global g_this_article_title
    global file_number
    global start_time
    global article_writer

    article_count += 1
    if verbose:
        PrintLog.message(u'[MWR {0:d}] {1:s}'.format(article_count, g_this_article_title))
    elif article_count % 1000 == 0:
        # periodic progress line: file number, elapsed time, article count
        now_time = time.time()
        PrintLog.message(u'Render[{0:d}]: {1:7.2f}s {2:10d}'.format(file_number,
                                                                    now_time - start_time,
                                                                    article_count))
        start_time = now_time

    output.flush()

    # create links: one packed record per link, two packed 24.8 corner
    # words (y << 8 | x) plus the target article number
    links_stream = io.BytesIO('')
    for i in g_links:
        (x0, y0, x1, y1, url) = g_links[i]
        links_stream.write(struct.pack('<3I', (y0 << 8) | x0,
                                       (y1 << 8) | x1,
                                       link_number(url)))
    links_stream.flush()
    links = links_stream.getvalue()
    links_stream.close()

    # create language links: nul-terminated strings; when the search form
    # differs from the display form both are stored, separated by \1
    links_stream = io.BytesIO('')
    japanese_convert = LanguageTranslation.LanguageJapanese().translate
    normal_convert = LanguageTranslation.LanguageNormal().translate
    for l in language_links:
        language, link = l.split(':', 1)
        if 'ja' == language:
            stripped = japanese_convert(link)
        else:
            stripped = normal_convert(link)
        if link == stripped:
            links_stream.write(l.encode('utf-8') + '\0')
        else:
            links_stream.write((language + '#' + stripped).encode('utf-8')
                               + '\1' + link.encode('utf-8') + '\0')
    links_stream.flush()
    langs = links_stream.getvalue()
    links_stream.close()

    # create the header (header size = 8)
    header = struct.pack('<I2H', 8 + len(links) + len(langs), g_link_cnt, 0)
    body = output.getvalue()

    # combine the data
    whole_article = header + links + langs + body

    if compress:
        try:
            (article_number, fnd_offset, restricted) = article_index(g_this_article_title)
            restricted = bool(int(restricted)) # '0' is True so turn it into False
            article_writer.add_article(article_number, whole_article, fnd_offset, restricted)
        except KeyError:
            # fix: these messages used '{0:s}' on non-string values, which
            # raises ValueError inside the handler and masked the real error
            PrintLog.message(u'Error in: write_article, Title not found')
            PrintLog.message(u'Title: {0:s}'.format(g_this_article_title))
            # NOTE(review): 'file_offset' is not defined in this function;
            # presumably a module-level global - confirm
            PrintLog.message(u'Offset: {0!s}'.format(file_offset))
            PrintLog.message(u'Count: {0:d}'.format(article_count))
    else:
        f_out.write(whole_article)
        f_out.flush()

    # Note: some versions of Python do not move file position on truncate
    # so an explicit seek is needed to avoid nul padding bytes.
    output.seek(0)
    output.truncate(0)