Example #1
def bigram_encode(title):
    """encode a title in bigram form"""
    global bigram

    result = ''
    title = SearchKey.strip_accents(title)

    while len(title) >= 2:
        if SearchKey.is_valid_character(title[0]):

            b = title[0:2]
            if b in bigram:
                result += bigram[b]
                title = title[2:]
            else:
                result += chr(ord(title[0:1]))
                title = title[1:]
        else:
            #result += '?'
            title = title[1:]
    if len(title) == 1:
        if SearchKey.is_valid_character(title[0]):
            result += chr(ord(title[0]))
        #else:
        #    result += '?'

    return SearchKey.compact_spaces(result)
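
For context: bigram here is a module-level dict that maps the most frequent two-character pairs to single bytes from 0x80 upward (it is built by generate_bigram() and output_fnd() further down this page). A minimal, self-contained sketch of the same idea, using a made-up four-entry table, shows the compression effect:

# Minimal sketch (Python 3, hypothetical table).  The real table maps the
# 128 most frequent pairs to the bytes 0x80..0xFF.
bigram = {'th': chr(0x80), 'he': chr(0x81), 'in': chr(0x82), 'er': chr(0x83)}

def encode(title):
    """greedy left-to-right replacement of known pairs by one-byte codes"""
    result = ''
    while len(title) >= 2:
        pair = title[:2]
        if pair in bigram:
            result += bigram[pair]
            title = title[2:]
        else:
            result += title[0]
            title = title[1:]
    return result + title

print(len('weather'), len(encode('weather')))   # 7 -> 5: 'th' and 'er' collapse
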
Example #2
    def translate(self, text):
        """take Japanese string and convert to Roman letters"""

        result = []

        for text in super(type(self), self).translate(text):
            split_text = ''.join([
                c if not c in SearchKey.unsupported_punctuation() else ' '
                for c in list(text)
            ]).split()
            for tt in split_text:
                if type(tt) == unicode:
                    tt = tt.encode('utf-8')
                phonetics = self.get_phonetics(tt)
                #result = super(type(self), self).append_translations(result, phonetics, ' ')
                # *** nasty hack to make sure the number of translations does not exceed 10000
                # *** as some Japanese phrases can have hundreds of millions of possible pronunciations
                # *** e.g. 平親清女・平親清女妹・平親清四女・平親清五女
                # ***        120  *   360   *   120   *  120   -> 622,080,000
                # *** just cut the arrays to the first 100 elements
                result = super(type(self), self).append_translations(
                    result[:100], phonetics[:100], ' ')

        if result is None or [] == result or '' == result:
            return ['']

        return result
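
The slices to 100 elements matter because append_translations() (inherited from the parent class and not shown in this listing) effectively joins every partial translation so far with every pronunciation of the next word, so the result grows multiplicatively. A rough stand-in, purely to illustrate the growth and the cap; the combiner below is an assumption about the inherited method's behaviour:

# Stand-in for the inherited append_translations()
def append_translations(result, phonetics, sep=' '):
    if not result:
        return list(phonetics)
    return [r + sep + p for r in result for p in phonetics]

readings = [['a1', 'a2', 'a3']] * 6          # 6 words with 3 pronunciations each

result = []
for phonetics in readings:
    result = append_translations(result, phonetics)
print(len(result))                           # 3 ** 6 = 729 and growing exponentially

result = []
for phonetics in readings:
    result = append_translations(result[:100], phonetics[:100])
print(len(result))                           # 300: bounded by 100 * len(phonetics)
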
Example #3
def generate_bigram(text):
    """create bigram from pairs of characters"""
    global bigram

    if len(text) > 2:
        try:
            if SearchKey.is_valid_character(text[0]) and SearchKey.is_valid_character(text[1]):
                bigram[text[0:2]] += 1
        except KeyError:
            bigram[text[0:2]] = 1

    if len(text) > 4:
        try:
            if SearchKey.is_valid_character(text[2]) and SearchKey.is_valid_character(text[3]):
                bigram[text[2:4]] += 1
        except KeyError:
            bigram[text[2:4]] = 1
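
This only samples the first two character pairs of each string, and the KeyError handler doubles as create-or-increment for the global dict. A compact sketch of how such a frequency table is built and later consumed (made-up titles, validity checks omitted for brevity):

import collections

bigram = collections.defaultdict(int)        # stands in for the global dict + KeyError dance

def count_pairs(text):
    if len(text) > 2:
        bigram[text[0:2]] += 1
    if len(text) > 4:
        bigram[text[2:4]] += 1

for title in ['london', 'long division', 'longitude']:
    count_pairs(title)

# the most frequent pairs receive the one-byte codes in output_fnd()
print(sorted(bigram.items(), key=lambda kv: -kv[1]))
# [('lo', 3), ('ng', 2), ('nd', 1)]
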
Example #4
    def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
        global verbose

        title = self.convert(title).strip(u'\u200e\u200f')

        rtitle = self.convert(rtitle).strip().strip(u'\u200e\u200f')

        # redirected title may contain '%xx' items - treat as unicode sequence
        # if it fails just keep the %xx sequences intact since it must represent
        # either real %xx or some unknowable coding scheme
        try:
            rtitle = unicode(urllib.unquote(rtitle.encode('utf-8')),
                             'utf-8').strip().strip(u'\u200e\u200f')
        except UnicodeDecodeError:
            pass

        rtitle = SearchKey.compact_spaces(rtitle).lstrip(':').strip()

        if self.KEY_TEMPLATE == key:
            if title != rtitle:
                title = unicode(
                    category,
                    'utf-8').capitalize() + ':' + upper_case_first_char(title)
                rtitle = unicode(
                    rcategory,
                    'utf-8').capitalize() + ':' + upper_case_first_char(rtitle)
                self.template_cursor.execute(
                    u'insert or replace into redirects (title, redirect) values(?, ?)',
                    [
                        u'~{0:d}~{1:s}'.format(self.file_id(), title),
                        u'~{0:d}~{1:s}'.format(self.file_id(), rtitle)
                    ])

            self.template_redirect_count += 1
            return

        if self.KEY_ARTICLE != key or self.KEY_ARTICLE != rkey:
            if verbose:
                PrintLog.message(
                    u'Non-article Redirect: {0:s}[{1:d}]:{2:s} ->  {3:s}[{4:d}]:{5:s}'
                    .format(unicode(category, 'utf-8'), key, title,
                            unicode(rcategory, 'utf-8'), rkey, rtitle))
            return

        if '' == rtitle:
            PrintLog.message(u'Empty Redirect for: {0:s}[{1:d}]:{2:s}'.format(
                category, key, title))
        else:
            self.redirects[title] = rtitle
            self.redirect_count += 1

            for t in self.language_processor.translate(title):
                generate_bigram(t)

            if verbose:
                PrintLog.message(
                    u'Redirect: {0:s}[{1:d}]:{2:s} ->  {3:s}[{4:d}]:{5:s}'.
                    format(category, key, title, rcategory, rkey, rtitle))
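
Template redirects are stored in SQLite under keys of the form ~file_id~Title, presumably so that titles coming from different input files stay distinct. The insert-or-replace statement above needs a redirects table roughly like the one below; the schema, the file id 7 and the titles are assumptions made for the sketch:

import sqlite3

db = sqlite3.connect(':memory:')
db.execute('create table redirects (title text primary key, redirect text)')
db.execute('insert or replace into redirects (title, redirect) values(?, ?)',
           (u'~7~Template:Foo', u'~7~Template:Bar'))
print(db.execute('select redirect from redirects where title = ?',
                 (u'~7~Template:Foo',)).fetchone()[0])     # ~7~Template:Bar
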
Example #5
def output_pfx(filename):
    """output the pfx matrix"""
    global index_matrix

    PrintLog.message(u'Writing: {0:s}'.format(filename))
    start_time = time.time()
    out_f = open(filename, 'wb')
    characters = '\0' + SearchKey.all_characters()
    for k1 in characters:
        for k2 in characters:
            for k3 in characters:
                key = k1+k2+k3
                if key in index_matrix:
                    offset = index_matrix[key]
                else:
                    offset = 0
                out_f.write(struct.pack('<I', offset))

    out_f.close()
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
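
The output is a dense three-dimensional table: one 4-byte little-endian offset for every ordered triple drawn from '\0' + SearchKey.all_characters(), written in nested-loop order. A reader can therefore locate any three-character prefix by index arithmetic alone; a sketch with a made-up alphabet:

import struct

characters = '\0' + 'abc'                    # stand-in alphabet
RECORD_SIZE = struct.calcsize('<I')          # 4 bytes per offset

def pfx_file_position(key3):
    """byte position of a 3-character key inside the pfx file"""
    n = len(characters)
    i1, i2, i3 = (characters.index(c) for c in key3)
    return ((i1 * n + i2) * n + i3) * RECORD_SIZE

print(pfx_file_position('\0\0\0'))           # 0, the very first entry
print(pfx_file_position('ab\0'))             # 96 with this 4-character alphabet
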
Example #6
def output_fnd(filename, article_index, language_processor, truncate_title):
    """create bigram table"""
    global bigram
    global index_matrix
    global MAXIMUM_TITLE_LENGTH
    global MAXIMUM_TITLE_ACTUAL

    PrintLog.message(u'Writing bigrams: {0:s}'.format(filename))
    start_time = time.time()
    out_f = open(filename, 'wb')

    sortedgram = [ (value, key) for key, value in bigram.iteritems() ]
    sortedgram.sort()
    sortedgram.reverse()

    bigram = {}
    i = 0
    for k, v in sortedgram:
        out_f.write(v)
        bigram[v] = chr(i + 128)
        i += 1
        if i >= 128:
            break
    while i < 128:
        out_f.write('zz')
        bigram['zz'] = chr(i + 128)
        i += 1

    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

    # create pfx matrix and write encoded titles

    #article_list = [strip_accents(k) for k in article_index.keys()]
    #article_list.sort(key = lambda x: strip_accents(x).lower())

    PrintLog.message(u'Sorting titles')
    start_time = time.time()

    article_list = [ (SearchKey.make_key(language_processor.translate(title)), title)
                      for title in article_index.all_indices() ]
    article_list.sort()

    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

    PrintLog.message(u'Writing matrix: {0:s}'.format(filename))
    start_time = time.time()

    index_matrix = {}
    index_matrix['\0\0\0'] = out_f.tell()

    previous_bigram_title = ''
    previous_utf8_title = ''
    mod_counter = 0

    for stripped_title, title in article_list:

        bigram_title = bigram_encode(stripped_title)[:MAXIMUM_TITLE_LENGTH]
        (article_number, dummy, restricted, is_redirect) = article_index.get_index(title)

        if '' == bigram_title and is_redirect:
            continue

        utf8_title = title.encode('utf-8')
        if truncate_title:
            utf8_title = utf8_title[:MAXIMUM_TITLE_LENGTH]
        else:
            utf8_title = utf8_title[:MAXIMUM_TITLE_ACTUAL]

        offset = out_f.tell()
        article_index.set_index(title, (article_number, offset, restricted, is_redirect))

        key3 = (stripped_title[0:3] + '\0\0\0')[0:3].lower()
        key2 = key3[0:2] + '\0'
        key1 = key3[0:1] + '\0\0'
        if key1 not in index_matrix:
            index_matrix[key1] = offset
        if key2 not in index_matrix:
            index_matrix[key2] = offset
        if key3 not in index_matrix:
            index_matrix[key3] = offset

        if 0 == mod_counter & 0x0f:
            bigram_common_length = 0
            utf8_common_length = 0
        else:
            bigram_common_length = common_prefix_length(previous_bigram_title, bigram_title)
            utf8_common_length = common_prefix_length(previous_utf8_title, utf8_title)
        mod_counter += 1

        previous_bigram_title = bigram_title
        previous_utf8_title = utf8_title

        if bigram_common_length > 1:
            bigram_title = chr(bigram_common_length - 1) + bigram_title[bigram_common_length:]
        if utf8_common_length > 1:
            utf8_title = chr(utf8_common_length - 1) + utf8_title[utf8_common_length:]

        out_f.write(struct.pack('<I', article_number) + '\0' + bigram_title + '\0' + utf8_title + '\0')

    out_f.close()
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
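
The title records written at the end are front-coded: every 16th record (mod_counter & 0x0f == 0) stores its strings in full, and the others replace a shared prefix of more than one character with a single byte holding the shared length minus one. A sketch of the corresponding decode step, inferred from the writer above; the "< 32" marker test and the titles are assumptions:

def front_decode(previous_title, stored):
    """reverse of: chr(common - 1) + title[common:] for common > 1"""
    first = ord(stored[0]) if stored else 0
    if 0 < first < 32:                       # control byte marks a shared prefix
        common = first + 1
        return previous_title[:common] + stored[1:]
    return stored

previous = 'moon'
stored = chr(4 - 1) + 'moonlight'[4:]        # what the writer emits for 'moonlight'
print(front_decode(previous, stored))        # moonlight
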
Example #7
def write_article(language_links):
    global compress
    global verbose
    global output, f_out, i_out
    global article_count
    global g_this_article_title
    global file_number
    global start_time
    global article_writer

    article_count += 1
    if verbose:
        PrintLog.message(u'[MWR {0:d}] {1:s}'.format(article_count, g_this_article_title))

    elif article_count % 1000 == 0:
        now_time = time.time()
        PrintLog.message(u'Render[{0:d}]: {1:7.2f}s {2:10d}'.format(file_number, now_time - start_time, article_count))
        start_time = now_time

    # create links
    links_stream = io.BytesIO('')

    for i in g_links:
        (x0, y0, x1, y1, url) = g_links[i]
        links_stream.write(struct.pack('<3I', (y0 << 8) | x0, (y1 << 8) | x1, link_number(url)))

    links_stream.flush()
    links = links_stream.getvalue()
    links_stream.close()

    # create language links
    links_stream = io.BytesIO('')
    japanese_convert = LanguageTranslation.LanguageJapanese().translate
    normal_convert = LanguageTranslation.LanguageNormal().translate

    for l in language_links:
        language, link = l.split(':', 1)

        language = language.strip()
        link = link.strip()

        # only need the first pronunciation for the link
        # as this must always be present
        if link is not None and '' != link:
            if 'ja' == language:
                stripped = japanese_convert(link)[0]
            else:
                stripped = normal_convert(link)[0]

            stripped = SearchKey.strip_accents(stripped)

            if link == stripped:
                links_stream.write(l.encode('utf-8') + '\0')
            else:
                links_stream.write((language + '#' + stripped).encode('utf-8') + '\1' + link.encode('utf-8') + '\0')

    links_stream.flush()
    langs = links_stream.getvalue()
    links_stream.close()

    # create the header (header size = 8)
    header = struct.pack('<I2H', 8 + len(links) + len(langs), g_link_cnt, 0)
    body = output.fetch()

    # combine the data
    whole_article = header + links + langs + body

    if compress:
        try:
            (article_number, fnd_offset, restricted) = article_index(g_this_article_title)
            restricted =  bool(int(restricted))  # '0' is True so turn it into False
            article_writer.add_article(article_number, whole_article, fnd_offset, restricted)
        except KeyError:
            PrintLog.message(u'Error in: write_article, Title not found')
            PrintLog.message(u'Title:  {0:s}'.format(g_this_article_title))
            PrintLog.message(u'Count:  {0:d}'.format(article_count))
    else:
        f_out.write(whole_article)
        f_out.flush()
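
The blob assembled here starts with an 8-byte '<I2H' header: the offset at which the body starts (8 plus the link and language-link sections), the number of links, and a zero. A small sketch that packs and re-reads such a record with made-up values ('<I2H' and '<3I' are the formats write_article itself uses):

import struct

links = struct.pack('<3I', (10 << 8) | 3, (12 << 8) | 90, 42)   # one link rectangle
langs = b'en:Example\x00'
body = b'<rendered article>'

header = struct.pack('<I2H', 8 + len(links) + len(langs), 1, 0)
blob = header + links + langs + body

# the first header field is the offset at which the body starts
body_offset, link_count, _ = struct.unpack_from('<I2H', blob, 0)
print(body_offset, link_count, blob[body_offset:])
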
Example #8
def output_fnd(filename_format, article_index, language_processor, truncate_title):
    """create bigram table"""
    global bigram
    global index_matrix
    global MAXIMUM_TITLE_LENGTH
    global MAXIMUM_TITLE_ACTUAL
    global FND_FILE_SEGMENT_SIZE

    start_time = time.time()
    out_f = SegmentedFileWriter(filename_format, FND_FILE_SEGMENT_SIZE)
    PrintLog.message(u'Writing bigrams: {0:s}'.format(out_f.current_filename))

    sortedgram = [ (value, key) for key, value in bigram.iteritems() ]
    sortedgram.sort()
    sortedgram.reverse()

    bigram = {}
    i = 0
    for k, v in sortedgram:
        out_f.write(v)
        bigram[v] = chr(i + 128)
        i += 1
        if i >= 128:
            break
    while i < 128:
        out_f.write('zz')
        bigram['zz'] = chr(i + 128)
        i += 1

    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

    # create pfx matrix and write encoded titles

    PrintLog.message(u'Sorting titles')
    start_time = time.time()

    ####@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@####
    unique_articles = {}
    for article in [ (SearchKey.make_key(translated_title[:MAXIMUM_TITLE_LENGTH]), title)
                     for title in article_index.all_indices()
                     for translated_title in language_processor.translate(title) ]:
        unique_articles[article] = 1

    article_list = sorted(unique_articles.keys())

    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

    PrintLog.message(u'Writing matrix: {0:s}'.format(out_f.current_filename))
    start_time = time.time()

    index_matrix = {}
    index_matrix['\0\0\0'] = out_f.tell()

    previous_bigram_title = ''
    previous_utf8_title = ''
    mod_counter = 0

    for stripped_title, title in article_list:

        bigram_title = bigram_encode(stripped_title)[:MAXIMUM_TITLE_LENGTH]
        (article_number, dummy, restricted, is_redirect) = article_index.get_index(title)

        if '' == bigram_title and is_redirect:
            continue

        utf8_title = title.encode('utf-8')
        if truncate_title:
            utf8_title = utf8_title[:MAXIMUM_TITLE_LENGTH]
        else:
            utf8_title = utf8_title[:MAXIMUM_TITLE_ACTUAL]

        offset = out_f.tell()
        article_index.set_index(title, (article_number, offset, restricted, is_redirect))

        key3 = (stripped_title[0:3] + '\0\0\0')[0:3].lower()
        key2 = key3[0:2] + '\0'
        key1 = key3[0:1] + '\0\0'
        if key1 not in index_matrix:
            index_matrix[key1] = offset
        if key2 not in index_matrix:
            index_matrix[key2] = offset
        if key3 not in index_matrix:
            index_matrix[key3] = offset

        if 0 == mod_counter & 0x0f:
            bigram_common_length = 0
            utf8_common_length = 0
        else:
            bigram_common_length = common_prefix_length(previous_bigram_title, bigram_title)
            utf8_common_length = common_prefix_length(previous_utf8_title, utf8_title)
        mod_counter += 1

        previous_bigram_title = bigram_title
        previous_utf8_title = utf8_title

        if bigram_common_length > 1:
            bigram_title = chr(bigram_common_length - 1) + bigram_title[bigram_common_length:]
        if utf8_common_length > 1:
            utf8_title = chr(utf8_common_length - 1) + utf8_title[utf8_common_length:]

        out_f.write(struct.pack('<I', article_number) + '\0' + bigram_title + '\0' + utf8_title + '\0')

    PrintLog.message(u'Final segment: {0:s}'.format(out_f.current_filename))
    out_f.close()
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
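
Compared with the earlier output_fnd, this version deduplicates (translated key, title) pairs before sorting and writes through a SegmentedFileWriter, which is not shown in this listing. A stand-in sketch of just the deduplication step; the translate() results and the length limit of 48 are made up, and the real keys come from SearchKey.make_key():

def translate(title):
    return {'Color': ['color', 'colour'],
            'Colour': ['colour']}.get(title, [title.lower()])

unique_articles = {}
for title in ['Color', 'Colour']:
    for translated in translate(title):
        unique_articles[(translated[:48], title)] = 1

print(sorted(unique_articles.keys()))
# [('color', 'Color'), ('colour', 'Color'), ('colour', 'Colour')]
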