def correct_alef_prev_char_normal_case_version_2(letter):
    overall = ""
    comp = ""
    is_corrected = False
    for c in letter:
        if not unicodedata2.combining(c):
            # base (non-combining) letter
            overall = c
            comp = unicodedata2.normalize('NFC', c)
        elif c == u'َ' or c == u'ّ' or c == u'ً':
            # keep an existing fatha, shadda or fathatan
            overall += c
            comp = unicodedata2.normalize('NFC', overall)
            is_corrected = True
        else:
            # replace any other combining mark with a fatha
            c = u'َ'
            overall += c
            comp = unicodedata2.normalize('NFC', overall)
            is_corrected = True
    if not is_corrected:
        # no diacritic present: append a fatha
        c = u'َ'
        overall += c
        comp = unicodedata2.normalize('NFC', overall)
    return comp
def teh_marbota_char_correction(char):
    overall = ""
    comp = ""
    is_corrected = False
    for c in char:
        if not unicodedata2.combining(c):
            # base (non-combining) letter
            overall = c
            comp = unicodedata2.normalize('NFC', c)
        elif c == u'َ' or c == u'ّ' or c == u'ً':
            # keep an existing fatha, shadda or fathatan
            overall += c
            comp = unicodedata2.normalize('NFC', overall)
            is_corrected = True
        else:
            # replace any other combining mark with a fatha
            c = u'َ'
            overall += c
            comp = unicodedata2.normalize('NFC', overall)
            is_corrected = True
    if not is_corrected:
        # no diacritic present: append a fatha
        c = u'َ'
        overall += c
        comp = unicodedata2.normalize('NFC', overall)
    return comp
def correct_alef_maksora_prev_char_normal_case_version_2(letter):
    overall = ""
    comp = ""
    is_corrected = False
    try:
        for c in letter:
            if not unicodedata2.combining(c):
                # base (non-combining) letter
                overall = c
                comp = unicodedata2.normalize('NFC', c)
            elif c == u'َ' or c == u'ّ' or c == u'ً':
                # keep an existing fatha, shadda or fathatan
                overall += c
                comp = unicodedata2.normalize('NFC', overall)
                is_corrected = True
            else:
                # replace any other combining mark with a fatha
                c = u'َ'
                overall += c
                comp = unicodedata2.normalize('NFC', overall)
                is_corrected = True
    except Exception:
        raise Exception("bug found in correct_alef_maksora_prev_char_normal_case")
    if not is_corrected:
        # no diacritic present: append a fatha
        c = u'َ'
        overall += c
        comp = unicodedata2.normalize('NFC', overall)
    return comp
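# A small behavioral sketch of the three correction helpers above (they share
# the same loop): the base letter is kept, an existing fatha/shadda/fathatan
# is preserved, and any other or missing mark is replaced with a fatha. The
# example inputs are illustrative only and assume the functions above are in
# scope.
print(correct_alef_prev_char_normal_case_version_2(u'ب'))   # bare beh -> beh + fatha
print(correct_alef_prev_char_normal_case_version_2(u'بِ'))   # beh + kasra -> beh + fatha
print(teh_marbota_char_correction(u'بً'))                    # beh + fathatan is kept as-is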
def handle(self):
    newData = []
    for v in self.data:
        t = v[self.content].lower()
        if self.html:
            t = BeautifulSoup(t, 'html.parser').get_text()
        # Normalize reduplicated syllables (collapse repeated characters)
        t = re.sub(r'(\D)\1+', r'\1', t)
        # Word segmentation
        t = ViTokenizer.tokenize(t)
        if self.accented_char:
            t = unicodedata2.normalize('NFD', t).encode('ascii', 'ignore').decode("utf-8")
        if self.special_char:
            t = [x.strip(SPECIAL_CHARACTER) for x in t.split()]
        if self.stopwords:
            t = [word for word in t if word not in self.list_stopword]
        v[self.content] = t
        if v not in newData:
            newData.append(v)
    print(np.array(newData))
def no_accent_vietnamese2(s):
    # s = s.decode('utf-8')
    text = re.sub(u'Đ', 'XX', s)
    text = re.sub(u'đ', 'XX', text)
    # return s.encode('utf-8')
    return unicodedata2.normalize('NFKD', text).encode('ASCII', 'ignore')
def findNumberInTag(self, tag):
    textToTest = []
    if hasattr(tag, 'text'):
        textToTest.append(tag.text)
    elif isinstance(tag, str):
        textToTest.append(tag)
    # <a href="tel:+12312312"> +123 123 321 </a>
    if tag.name == 'a':
        textToTest.append(tag.get('href', ''))
    # remove unicode control chars such as \x0a
    normalized = [
        unicodedata2.normalize("NFKD", text) for text in textToTest
    ]
    # [['+123', '123'], ['33', '222']] => ['+123', '123', '33', '222']
    # flatten the nested list: sum() concatenates the per-pattern match lists
    # ['+123', '123'] + ['33', '222'] + [] => ['+123', '123', '33', '222']
    numbers = sum([
        pattern.findall('.'.join(normalized))
        for pattern in self.regexPhoneFormatPatterns
    ], [])
    return numbers
def character_list_from_string(string, normalize=True):
    """
    Return a list of characters without space separators from an input string
    """
    # Since Unicode allows writing the same string either precomposed or as
    # combining characters, we want to transform all those strings that are
    # written as combining characters to precomposed, if possible. In our
    # data a combining char (be it encoded as precomposed or with combining
    # marks) means we want to explicitly check
    # a) the combining marks, and
    # b) with the flag, that the precomposed unicode is present - and for
    #    this we need to make sure our data input with combining marks is
    #    actually interpreted (and re-saved) as precomposed!
    # Before splitting a string into a list of each character (and removing
    # spaces) make sure any composable characters written with combining
    # marks are in fact transformed to precomposed characters; otherwise the
    # "listifying" will split base and mark(s) into several list items (chars)
    if normalize:
        # Make sure we are in fact dealing with a string, not a list
        if isinstance(string, (list, set)):
            string = "".join(string)
        # N_ormal F_orm C_omposed
        # See https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize  # noqa
        string = unicodedata2.normalize("NFC", string)
    li = list(string)
    li = list_unique([c for c in li if c.strip() != ""])
    return li
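# A minimal sketch (not part of the function above) of why the NFC step
# matters: the same visible character can arrive precomposed or as a base
# letter plus a combining mark, and only after NFC normalization does list()
# yield one item per visible character.
import unicodedata2

decomposed = "a\u0308"                                    # "a" + COMBINING DIAERESIS
precomposed = unicodedata2.normalize("NFC", decomposed)   # single character "ä"
assert len(list(decomposed)) == 2
assert len(list(precomposed)) == 1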
def no_accent_vietnamese(s):
    if not s:
        return ''
    if check_han_language(s):
        return s
    text = re.sub(u'Đ', 'D', s)
    text = re.sub(u'đ', 'd', text)
    return unicodedata2.normalize('NFKD', text).encode('ASCII', 'ignore')
def process_latest_comments(self, stopping_id):
    new_stopping_id = stopping_id
    COMMENTS_LIMIT = 100  # intended for a cron job every 15 minutes
    comments = self.r_praw.subreddit('SVExchange').comments(limit=COMMENTS_LIMIT)
    user_tsv_set = set()  # stores user/tsv combos to avoid duplicates
    user_set = set()      # stores users to avoid duplicates
    for i, c in enumerate(comments):
        if i == 0:
            new_stopping_id = c.id
        link_title_ascii = unicodedata2.normalize('NFKD', c.link_title).encode('ascii', 'ignore').decode('ascii')
        self.stdout.write("%s %s %s" % (c.id, c.link_author.ljust(24), link_title_ascii))
        if c.id <= stopping_id:
            from datetime import datetime
            self.stdout.write("new_comments [Stop] " + str(datetime.utcnow()))
            break
        op = c.link_author
        commenter = c.author.name
        if c.is_submitter and cmd_helper.is_from_tsv_thread(c.link_title):
            user_tsv_tuple = (op, c.link_title)
            tsv = int(c.link_title)
            ts = c.created_utc
            if user_tsv_tuple in user_tsv_set:
                self.stdout.write("\tRepeat")
            elif TSV.objects.check_if_exists(op, tsv):
                self.stdout.write("\tUpdating")
                new_sub_id = cmd_helper.get_id_from_full_url(c.link_url)
                # comment lacks gen info that's found in the submission flair
                gen = cmd_helper.get_gen_from_comment(op, tsv, new_sub_id, self.r_praw)
                user_tsv = TSV.objects.get_user_tsv(op, tsv, gen)
                # check if submission id should be updated, in case the db
                # doesn't have the user's latest thread
                old_sub_id = user_tsv.sub_id
                if new_sub_id > old_sub_id:
                    user_tsv.sub_id = new_sub_id
                    user_tsv.save()
                cmd_helper.scrape_user_tsv(user_tsv, self.r_praw, ts)
            else:
                self.stdout.write("\tAdding?")
                sub_id = cmd_helper.get_id_from_full_url(c.link_url)
                subm = self.r_praw.submission(id=sub_id)
                if not subm.over_18:
                    self.stdout.write("\tAdd")
                    gen = cmd_helper.get_gen_from_flair_class(subm.link_flair_css_class)
                    TSV.objects.update_or_create_user_tsv(op, subm.author_flair_text,
                                                          subm.author_flair_css_class, tsv, gen,
                                                          sub_id, False, False, subm.created_utc,
                                                          ts, None)
            user_tsv_set.add(user_tsv_tuple)
        else:
            if commenter not in user_set:
                user_set.add(commenter)
                tr = Trainer.objects.get_user(commenter)
                if tr:
                    tr.set_activity(c.created_utc)
    return new_stopping_id
def correct_alef_prev_char_mem(prev_char_object):
    overall = ""
    comp = ""
    is_corrected = False
    for c in prev_char_object.letter:
        if not unicodedata2.combining(c):
            # base (non-combining) letter
            overall = c
            comp = unicodedata2.normalize('NFC', c)
        else:
            # replace any combining mark with a kasra
            c = u'ِ'
            overall += c
            comp = unicodedata2.normalize('NFC', overall)
            is_corrected = True
    if not is_corrected:
        # no diacritic present: append a kasra
        c = u'ِ'
        overall += c
        comp = unicodedata2.normalize('NFC', overall)
    return comp
def correct_alef_prev_char_ba2_maksora_version_2(letter):
    overall = ""
    comp = ""
    is_corrected = False
    for c in letter:
        if not unicodedata2.combining(c):
            # base (non-combining) letter
            overall = c
            comp = unicodedata2.normalize('NFC', c)
        else:
            # replace any combining mark with a kasra
            c = u'ِ'
            overall += c
            comp = unicodedata2.normalize('NFC', overall)
            is_corrected = True
    if not is_corrected:
        # no diacritic present: append a kasra
        c = u'ِ'
        overall += c
        comp = unicodedata2.normalize('NFC', overall)
    return comp
def remove_accents(d):
    # str is already unicode on Python 3; the decode is only needed on Python 2
    try:
        d = unicode(d, 'utf-8')
    except (TypeError, NameError):
        pass
    # decompose first (NFKD) so the ASCII encode drops only the combining
    # marks instead of dropping the whole precomposed letter
    d = unicodedata2.normalize('NFKD', d)
    d = d.encode('ascii', 'ignore')
    d = d.decode("utf-8")
    return str(d)
def slugify(value):
    """
    Convert to ASCII. Convert spaces to hyphens. Remove characters that
    aren't alphanumerics, underscores, or hyphens. Convert to lowercase.
    Also strip leading and trailing whitespace.
    """
    value = str(value)
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    value = re.sub(r'[-\s]+', '-', value)
    return value
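# A small usage sketch for the slugify() above (the input string is made up,
# shown only for illustration): accented characters are decomposed and their
# marks dropped, punctuation is removed, and whitespace runs become hyphens.
print(slugify("  Crème Brûlée -- Recipe #7!  "))  # -> "creme-brulee-recipe-7"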
def save_tweets():
    print('Saving Tweets')
    # print(tweets)
    file_tweets = open('../data_set/new_tweets.txt', 'a+')
    lastline = file_tweets.readlines()[-1].split(',')
    line_number = int(lastline[0])
    print('Last line', lastline)
    print('Last line number', line_number)
    for tweet in tweets:
        line_number += 1
        data = unicodedata.normalize('NFKD', tweet.text).encode('utf-8', 'ignore').decode('utf-8')
        file_tweets.write(str(line_number) + ', ,')
        file_tweets.write(json.dumps(data))
        file_tweets.write('\n')
    file_tweets.close()
def on_data(self, data):
    try:
        all_data = json.loads(data)
        # print(data)
        # filename = '/tmp/myfile%s.txt'%datetime.utcnow().strftime('%Y%m%d%H%M%S%f')[:-3]
        # f = open(filename,'w')
        # f.write(data)
        # f.close()
        # hdfs.put(filename,"hdfs://r01mstr.bddata.local:9000/user/vijay/tweets/")
        # call(["hadoop fs" "-put /user/saurzcode/dir3/"])
        # cmd = 'hadoop fs -put %s /user/vijay/tweets/'%filename
        # os.system(cmd)
        # print(cmd)
        # print(data)
        # tweet = all_data["text"].encode('utf-8')
        tweet = unicodedata2.normalize('NFKD', u'' + all_data["text"]).encode('ascii', 'ignore')
        created_at = all_data["created_at"].encode('utf-8')
        username = all_data["user"]["screen_name"].encode('utf-8')
        sentiment = self.get_tweet_sentiment(tweet)
        image_url = all_data["user"]["profile_image_url"].encode('utf-8')
        rt_user = "******"
        rt_user = all_data["retweeted_status"]["user"]["screen_name"].encode('utf-8')
        rt_user_image_url = all_data["retweeted_status"]["user"]["profile_image_url"].encode('utf-8')
        # print(tweet)
        print(rt_user)
        print(rt_user_image_url)
        print(sentiment)
        print("\n")
        # print(image_url)
        # print("\n")
        topics = str(sys.argv[1])
        c.execute(
            "INSERT INTO tweets (created_at, screen_name, text, sentiment, profile_image_url, "
            "topic, rt_user, rt_user_image_url) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)",
            (created_at, username, tweet, sentiment, image_url, topics, rt_user, rt_user_image_url))
        conn.commit()
        return True
    except:
        pass
def clean_line(line):
    table = str.maketrans('', '', string.punctuation)
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    line = normalize('NFD', line).encode('ascii', 'ignore')
    line = line.decode('UTF-8')
    # tokenize on white space
    line = line.split()
    # convert to lowercase
    line = [word.lower() for word in line]
    # remove punctuation from each token
    line = [word.translate(table) for word in line]
    # remove non-printable chars from each token
    line = [re_print.sub('', w) for w in line]
    # remove tokens with numbers in them
    line = [word for word in line if word.isalpha()]
    # return as a single string
    return ' '.join(line)
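# A quick usage sketch for clean_line() above (the sentence is made up, and
# the module-level imports of string, re and normalize are assumed): accents
# are stripped via NFD + ASCII encoding, punctuation and numeric tokens are
# dropped, and the result is a lowercase, space-joined string.
print(clean_line("Héllo, Wörld! 42 times"))  # -> "hello world times"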
def main():
    try:
        malformed_string = sys.argv[1]
    except IndexError:
        malformed_string = 'Please put something...'
    try:
        form = sys.argv[2]
    except IndexError:
        form = 'NFC'
    formed_string = unicodedata2.normalize(form, malformed_string)
    dmp = dmp_module.diff_match_patch()
    diffs = dmp.diff_main(malformed_string, formed_string)
    with open('templates/index.html') as inf:
        html = inf.read()
    soup = BeautifulSoup(html, 'html.parser')
    div_output = soup.find('div', {'id': 'output'})
    div_output.clear()
    new_pre = ("<pre class='diff_data'>\n----------------------------------------\nForm: " + form +
               "\nMalformed string: " + malformed_string +
               "\n========================================\n " + form +
               " string: " + formed_string +
               "\n========================================\n" +
               "\nDiff: " + ''.join(map(str, diffs)) + "</pre>")
    extra_soup = BeautifulSoup(new_pre, 'html.parser')
    div_output.append(extra_soup)
    pretty_diff = dmp.diff_prettyHtml(diffs)
    extra_soup = BeautifulSoup(
        "<div class='diff_wrapper'>" + pretty_diff + "</div>", 'html.parser')
    div_output.append(extra_soup)
    with open('templates/index.html', 'w') as outf:
        outf.write(str(soup))
    threading.Timer(1.25, lambda: webbrowser.open("http://127.0.0.1:5000/")).start()
    subprocess.call("FLASK_APP=routes.py flask run", shell=True)
def handle(self):
    t = self.text.lower()
    t = BeautifulSoup(t, 'html.parser').get_text()
    # Normalize reduplicated syllables (collapse repeated characters)
    t = re.sub(r'(\D)\1+', r'\1', t)
    # Word segmentation
    t = ViTokenizer.tokenize(t)
    # Remove accents
    t = unicodedata2.normalize('NFD', t).encode('ascii', 'ignore').decode("utf-8")
    t = [x.strip(SPECIAL_CHARACTER) for x in t.split()]
    t = [word for word in t if word not in self.stopwords]
    self.text = t
def slugify(value):
    """
    Normalizes string, converts to lowercase, removes non-alpha characters,
    and converts spaces to hyphens.
    """
    import unicodedata2
    import re
    symbols = (u"абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
               u"abvgdeejzijklmnoprstufhzcss_y_euaABVGDEEJZIJKLMNOPRSTUFHZCSS_Y_EUA")
    tr = {ord(a): ord(b) for a, b in zip(*symbols)}
    # for Python 2.*:
    # tr = dict([(ord(a), ord(b)) for (a, b) in zip(*symbols)])
    value = value.translate(tr)  # transliterate Cyrillic to Latin
    value = unicodedata2.normalize('NFKD', value).encode('ascii', 'ignore').decode('utf-8').strip()
    # value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    # value = re.sub(r'[-\s]+', '-', value)
    # ...
    return value
def handle(self):
    label = self.labelCol
    content = self.contentCol
    newData = []
    # print(self.data)
    for v in self.data:
        t = v[content].lower()
        if self.html_stripping:
            t = BeautifulSoup(t, 'html.parser').get_text()
        # Normalize reduplicated syllables (collapse repeated characters)
        t = re.sub(r'(\D)\1+', r'\1', t)
        # Word segmentation
        t = ViTokenizer.tokenize(t)
        if self.remove_accented_chars:
            t = unicodedata2.normalize('NFD', t).encode('ascii', 'ignore').decode("utf-8")
        if self.remove_special_characters:
            t = [x.strip(SPECIAL_CHARACTER) for x in t.split()]
            t = [i for i in t if i]
        if self.remove_stopwords:
            stopwords = self.stopwords
            t = [word for word in t if word not in self.stopwords]
        v[content] = t
        if v not in newData:
            newData.append(v)
    print(np.array(newData))
def fatha_correction(list_of_objects_of_chars_and_its_location):
    counter = 0
    current_index = 0
    actual_letters_after_fatha_correction = []
    prev_char_object = WordLetterProcessingHelperMethod.LetterPosition()
    prev_prev_char_object = WordLetterProcessingHelperMethod.LetterPosition()
    next_char_object = WordLetterProcessingHelperMethod.LetterPosition()
    for each_letter_object in list_of_objects_of_chars_and_its_location:
        actual_letters_after_fatha_correction.append(each_letter_object)
        character = remove_diacritics(each_letter_object.letter)
        if (character in letters_of_fatha_correction) and (each_letter_object.location != 'first'):
            letter_caused_fatha_correction = character
            if (counter - 1) >= 0:
                prev_char_object = list_of_objects_of_chars_and_its_location[counter - 1]
                prev_char_object.letter = unicodedata2.normalize('NFC', str(prev_char_object.letter))
            if (counter - 2) >= 0:
                prev_prev_char_object = list_of_objects_of_chars_and_its_location[counter - 2]
                prev_prev_char_object.letter = unicodedata2.normalize('NFC', prev_prev_char_object.letter)
            if ((counter + 1) <= (len(list_of_objects_of_chars_and_its_location) - 1)) and \
                    (each_letter_object.location != 'last'):
                next_char_object = list_of_objects_of_chars_and_its_location[counter + 1]
            corrected_char = prev_char_object.letter
            if letter_caused_fatha_correction == u'ة':
                corrected_char = correct_teh_marbota_prev_char(prev_char_object)
            elif letter_caused_fatha_correction == u'ا':
                if each_letter_object.location == 'middle':
                    if remove_diacritics(prev_char_object.letter) == u'ب':
                        # e.g. بِاتِّخَاذِكُمُ, وَبِالْآخِرَةِ, بِالْعُدْوَةِ
                        if u'ّ' in next_char_object.letter or \
                                next_char_object.letter == remove_diacritics(next_char_object.letter):
                            corrected_char = correct_alef_prev_char_ba2_maksora(prev_char_object)
                        # e.g. بَالِغَةٌ, بَاسِرَةٌ
                        else:
                            corrected_char = correct_alef_prev_char_normal_case(prev_char_object)
                    elif remove_diacritics(prev_char_object.letter) == u'ل':
                        if prev_char_object.location == 'first':
                            # do not handle this case: special case with no rule
                            # (these are contradictory), e.g. لَا, لِامْرَأَتِهِ
                            corrected_char = prev_char_object.letter
                        elif prev_prev_char_object.letter == u'ا':
                            # do not handle this case: special case with no rule
                            # (these are contradictory), e.g. الِاسْمُ
                            corrected_char = prev_char_object.letter
                        else:
                            corrected_char = correct_alef_prev_char_normal_case(prev_char_object)
                    # e.g. مِائَةَ, مِائَتَيْنِ
                    elif remove_diacritics(prev_char_object.letter) == u'م' \
                            and prev_char_object.location == 'first' \
                            and next_char_object.letter == u'ئَ':
                        corrected_char = correct_alef_prev_char_mem(prev_char_object)
                    else:
                        corrected_char = correct_alef_prev_char_normal_case(prev_char_object)
                elif each_letter_object.location == 'last' or each_letter_object.location == 'first':
                    corrected_char = prev_char_object.letter
                else:
                    corrected_char = correct_alef_prev_char_normal_case(prev_char_object)
            elif letter_caused_fatha_correction == u'ى':
                # e.g. طُوًى, ضُحًى
                if prev_prev_char_object.location == 'first' and u'ُ' in prev_prev_char_object.letter and \
                        each_letter_object.location == 'last':
                    corrected_char = correct_alef_maksora_prev_char_tanween_case(prev_char_object)
                # e.g. أَبَى
                else:
                    corrected_char = correct_alef_maksora_prev_char_normal_case(prev_char_object)
            actual_letters_after_fatha_correction[counter - 1].letter = corrected_char
            counter += 1
        else:
            counter += 1
        current_index += 1
    return actual_letters_after_fatha_correction
def len_unicode(ustr):
    # length in characters after NFC composition of a UTF-8 encoded byte string
    return len(unicodedata2.normalize('NFC', ustr.decode('utf-8')))
def strip_accents(self, text):
    text = unicodedata2.normalize('NFD', text)
    text = text.encode('ascii', 'ignore')
    text = text.decode("utf-8")
    return str(text)
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )
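# A minimal illustration of the unicodeToAscii() approach above: NFD splits
# each accented letter into a base letter plus combining marks (category
# "Mn"), so dropping "Mn" characters leaves the unaccented base letters.
# The example input is illustrative only.
import unicodedata

assert unicodedata.category('\u0301') == 'Mn'   # COMBINING ACUTE ACCENT
print(unicodeToAscii('café résumé'))            # -> 'cafe resume'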
def remove_diacritics(character):
    nkfd_form = unicodedata2.normalize('NFKD', str(character))
    # strip combining marks, but keep maddah above, hamza above and hamza below
    char = u"".join([c for c in nkfd_form
                     if not unicodedata2.combining(c)
                     or c == u'ٓ' or c == u'ٔ' or c == u'ٕ'])
    return char
def get_stats_from_chars(text_chars, db=None):
    report = {}
    uppercase = []
    numerals = []
    punctuation = []
    controlchars = []
    spaces = []
    other = []
    # Include decomposed forms
    for c in text_chars:
        decomposed = ud.normalize("NFKD", c)
        if len(decomposed) > 1:
            text_chars = text_chars + [d for d in decomposed]
    text_chars = set(text_chars)
    for c in text_chars:
        # print(c, ud.category(c))
        cat = ud.category(c)
        if cat == "Lu":
            uppercase.append(c)
        elif cat.startswith("N"):
            numerals.append(c)
        elif cat.startswith("P"):
            punctuation.append(c)
        elif cat.startswith("C") and len(c) > 1:
            controlchars.append(c)
        elif cat.startswith("Z"):
            spaces.append(c)
        else:
            other.append(c)
    # Remove all but "other" from chars, we don't care about them for diffing
    for remove in [uppercase, numerals, punctuation, controlchars, spaces, ["\n", "\t"]]:
        text_chars = text_chars.difference(set(remove))
    report["iso_in_db"] = db is not None
    report["found_in_text"] = {
        "uppercase": sorted(uppercase),
        "numerals": sorted(numerals),
        "punctuation": sorted(punctuation),
        "chars": sorted(text_chars)
    }
    # Compare to orthographies
    if db is not None:
        db_chars = []
        if "orthographies" in db:
            for o in db["orthographies"]:
                if "base" in o:
                    db_chars = db_chars + o["base"]
                if "auxiliary" in o:
                    db_chars = db_chars + o["auxiliary"]
        db_chars = set(sorted(db_chars))
        not_in_db = text_chars.difference(db_chars)
        missing_from_text = db_chars.difference(text_chars)
        decomposed = set(parse_chars("".join(text_chars), decompose=True))
        # print("Listed in DB but not in text", missing_from_text)
        # print("Appears in text but not listed in DB", not_in_db)
        # print("Text can be written with DB characters",
        #       decomposed.issubset(db_chars))
        missing_from_db = ""
        for c in not_in_db:
            missing = ud.normalize("NFKD", c)
            missing_parts = ""
            for part in missing:
                if part not in db_chars:
                    missing_parts = missing_parts + part
            if missing_parts:
                missing_from_db = missing_from_db + missing_parts
        # print("missing from db", sorted(list(missing_from_db)))
        missing_from_db = sorted(list(set(missing_from_db)))
        report["not_in_text"] = sorted(missing_from_text)
        report["not_in_db"] = sorted(not_in_db)
        if missing_from_db:
            report["missing_from_db"] = missing_from_db
        report["db_chars_valid"] = decomposed.issubset(db_chars)
    return report
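# A tiny sketch of the Unicode category grouping used above (it assumes
# unicodedata2 is imported as "ud", as the function does): "Lu" is an
# uppercase letter, "N*" are numerals, "P*" punctuation, "Z*" separators.
import unicodedata2 as ud

for ch in ["A", "7", "!", " ", "é"]:
    print(repr(ch), ud.category(ch))
# 'A' Lu, '7' Nd, '!' Po, ' ' Zs, 'é' Ll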
def test_issue_10254_unicodedata2(self):
    """Test Python issue #10254 is avoided with unicodedata2 package."""
    text = 'Li̍t-sṳ́'
    self.assertEqual(text, unicodedata2.normalize('NFC', text))