def bigram_encode(title):
    """encode a title in bigram form"""
    global bigram

    result = ''
    title = SearchKey.strip_accents(title)

    while len(title) >= 2:
        if SearchKey.is_valid_character(title[0]):
            b = title[0:2]
            if b in bigram:
                result += bigram[b]
                title = title[2:]
            else:
                result += chr(ord(title[0:1]))
                title = title[1:]
        else:
            #result += '?'
            title = title[1:]

    if len(title) == 1:
        if SearchKey.is_valid_character(title[0]):
            result += chr(ord(title[0]))
        #else:
        #    result += '?'

    return SearchKey.compact_spaces(result)
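
# A sketch of the assumed context (not taken from the original file): `bigram` is a
# module-level dict built elsewhere that maps frequent two-character sequences to
# one-byte codes, e.g. {'th': '\x80', 'he': '\x81'}.  bigram_encode() walks the
# accent-stripped title two characters at a time, emitting the table code for a
# known pair and the raw character otherwise, so a typical call is simply:
#
#     key = bigram_encode('An example title')
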
def write_article(language_links):
    global compress
    global verbose
    global output, f_out, i_out
    global article_count
    global g_this_article_title
    global file_number
    global start_time
    global article_writer

    article_count += 1
    if verbose:
        PrintLog.message(u'[MWR {0:d}] {1:s}'.format(article_count, g_this_article_title))
    elif article_count % 1000 == 0:
        now_time = time.time()
        PrintLog.message(u'Render[{0:d}]: {1:7.2f}s {2:10d}'.format(file_number, now_time - start_time, article_count))
        start_time = now_time

    # create links
    links_stream = io.BytesIO('')

    for i in g_links:
        (x0, y0, x1, y1, url) = g_links[i]
        links_stream.write(struct.pack('<3I', (y0 << 8) | x0, (y1 << 8) | x1, link_number(url)))

    links_stream.flush()
    links = links_stream.getvalue()
    links_stream.close()

    # create language links
    links_stream = io.BytesIO('')

    japanese_convert = LanguageTranslation.LanguageJapanese().translate
    normal_convert = LanguageTranslation.LanguageNormal().translate

    for l in language_links:
        language, link = l.split(':', 1)
        language = language.strip()
        link = link.strip()
        # only need the first pronunciation for the link
        # as this must always be present
        if link is not None and '' != link:
            if 'ja' == language:
                stripped = japanese_convert(link)[0]
            else:
                stripped = normal_convert(link)[0]
            stripped = SearchKey.strip_accents(stripped)
            if link == stripped:
                links_stream.write(l.encode('utf-8') + '\0')
            else:
                links_stream.write((language + '#' + stripped).encode('utf-8') + '\1' + link.encode('utf-8') + '\0')

    links_stream.flush()
    langs = links_stream.getvalue()
    links_stream.close()

    # create the header (header size = 8)
    header = struct.pack('<I2H', 8 + len(links) + len(langs), g_link_cnt, 0)
    body = output.fetch()

    # combine the data
    whole_article = header + links + langs + body

    if compress:
        try:
            (article_number, fnd_offset, restricted) = article_index(g_this_article_title)
            restricted = bool(int(restricted))  # bool('0') is True, so convert via int first
            article_writer.add_article(article_number, whole_article, fnd_offset, restricted)
        except KeyError:
            PrintLog.message(u'Error in: write_article, Title not found')
            PrintLog.message(u'Title: {0:s}'.format(g_this_article_title))
            PrintLog.message(u'Count: {0:d}'.format(article_count))
    else:
        f_out.write(whole_article)
        f_out.flush()
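
# Layout of the record written above (read off the struct.pack calls; the field
# interpretation is an assumption, not confirmed against other project files):
#
#     header  '<I2H': uint32 = 8 + len(links) + len(langs) (presumably the byte
#             offset from the start of the record to the body), uint16 link
#             count (g_link_cnt), uint16 reserved/zero
#     links   one '<3I' record per link: (y0 << 8) | x0, (y1 << 8) | x1,
#             link_number(url)
#     langs   language links, each '\0'-terminated; entries whose target needed
#             accent stripping are stored as 'lang#stripped' + '\1' + original
#     body    the rendered article bytes from output.fetch()
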