Пример #1
0
def find_runs(text):
    ''' Return the character blocks that dominate the letters of text.

    Only characters for which isalpha() is true are counted; each one is
    tallied under the value unicodeBlock() returns for it.  A block is
    reported when its percentage of the counted letters reaches its
    threshold: 40 by default, 15 for "Basic Latin" and 10 for
    "Latin Extended Additional" (the latter for Vietnamese).

    Returns a list of block keys; empty when text contains no letters.
    '''
    letters_per_block = defaultdict(int)
    for ch in text:
        if not ch.isalpha():
            continue
        letters_per_block[unicodeBlock(ch)] += 1

    # Every counted letter landed in exactly one block, so the per-block
    # tallies add up to the number of letters seen.
    letter_total = sum(letters_per_block.values())

    dominant = []
    for block_name, count in letters_per_block.items():
        # Pick the share this particular block has to reach.
        if block_name == "Basic Latin":
            required = 15
        elif block_name == "Latin Extended Additional":
            required = 10
        else:
            required = 40

        share = (count * 100) / letter_total
        if share >= required:
            dominant.append(block_name)

    return dominant
Пример #2
0
def find_runs(text):
    ''' List the character blocks that make up a large share of text.

    Alphabetic characters (per isalpha()) are grouped by the value that
    unicodeBlock() returns for them.  A block is kept when it accounts
    for at least 40% of those characters; "Basic Latin" is kept from 15%
    and "Latin Extended Additional" from 10% (for Vietnamese).

    Returns a list of the kept block keys, which is empty when text has
    no alphabetic characters.
    '''
    def is_relevant(name, pct):
        # General rule first, then the two blocks with a lower bar.
        return (
            pct >= 40
            or (name == "Basic Latin" and pct >= 15)
            or (name == "Latin Extended Additional" and pct >= 10)
        )

    block_counts = defaultdict(int)
    n_letters = 0
    for char in text:
        if char.isalpha():
            block_counts[unicodeBlock(char)] += 1
            n_letters += 1

    return [
        name
        for name, count in block_counts.items()
        if is_relevant(name, (count * 100) / n_letters)
    ]
Пример #3
0
 def assertBlock(self, name, c):
     ''' Assert that unicodeBlock() reports name for the code point c.

     c is an integer code point; it is converted to a one-character
     string with unichr() before the lookup.  On mismatch the failure
     message shows the expected name, the block actually returned and
     the repr of the character.
     '''
     c = unichr(c)
     # Look the block up once and reuse it for both the comparison and
     # the failure message.
     block = unicodeBlock(c)
     # assertEqual is the supported spelling; the assertEquals alias is
     # deprecated and no longer exists in Python 3.12+.
     self.assertEqual(name, block, '%s != %s for %r' % (name, block, c))
Пример #4
0
 def assertBlock(self, name, c):
     ''' Check that the character with code point c maps to block name.

     The integer c is turned into a character via unichr() and passed to
     unicodeBlock(); the test fails with a message naming the expected
     block, the returned block and the character when they differ.
     '''
     c = unichr(c)
     # Single lookup: the same result feeds the assertion and its message.
     block = unicodeBlock(c)
     # Use assertEqual rather than the deprecated assertEquals alias,
     # which was removed from unittest in Python 3.12.
     self.assertEqual(name, block, '%s != %s for %r' % (name, block, c))