示例#1
0
def make_alphabetic(hits, processname, sortnames=False, lang="sv"):
    """
    Group processed hits by first letter and sort everything alphabetically.

    Every hit is passed to ``processname(hit, results)``, which should append
    zero or more ``(first_letter, result)`` pairs to ``results``.
    ``first_letter`` is the letter the entry is filed under and ``result`` is
    what the html-template wants, e.g. a pair of ``(name, no_hits)``.

    Args:
        hits: Iterable of objects to process.
        processname: Callable ``(hit, results)`` appending to ``results``.
        sortnames: If true, entries are sorted on ``result[0]`` with a leading
            prefix from ``VONAV_LIST`` removed (and spaces replaced by "z"),
            followed by ``result[1]``; otherwise on ``result[0]`` only.
        lang: ``"en"`` selects the English collator and files Å/Ä under A and
            Ö under O; any other value uses the Swedish collator.

    Returns:
        A list of ``(first_letter, results)`` pairs sorted by letter, where
        each ``results`` list is sorted with the ICU collator.
    """
    # Letters that are always filed under another letter.
    folded_letters = {"Ø": "Ö", "Æ": "Ä", "Ü": "Y"}
    # Additional folding applied for English listings only. An exact lookup is
    # used on purpose: the former ``in "ÄÅ"`` substring test also matched the
    # empty string and filed it under "A".
    english_letters = {"Ö": "O", "Ä": "A", "Å": "A"}

    results = []
    for hit in hits:
        processname(hit, results)

    # Split the result into start letters
    letter_results = {}
    for first_letter, result in results:
        first_letter = folded_letters.get(first_letter, first_letter)
        if lang == "en":
            first_letter = english_letters.get(first_letter, first_letter)
        letter_results.setdefault(first_letter, []).append(result)

    locale_name = "en_EN.UTF-8" if lang == "en" else "sv_SE.UTF-8"
    collator = icu.Collator.createInstance(icu.Locale(locale_name))

    if sortnames:
        # Compiled once here rather than once per sorted item.
        vonaf_pattern = re.compile(r"^(%s) " % "|".join(VONAV_LIST))

        def sort_key(item):
            lastname = vonaf_pattern.sub("", item[0]).replace(" ", "z")
            return collator.getSortKey(lastname + " " + item[1])
    else:
        def sort_key(item):
            return collator.getSortKey(item[0])

    for items in letter_results.values():
        items.sort(key=sort_key)

    # Sort result dictionary alphabetically into list
    return sorted(letter_results.items(),
                  key=lambda pair: collator.getSortKey(pair[0]))
示例#2
0
def make_alphabetic(hits, processname, sortnames=False, lang="sv"):
    """ Group processed hits by first letter and sort them alphabetically.

        Loops through hits and applies the function 'processname' on each
        object. The function processname should append zero or more processed
        forms of the object to the result list.
        Each processed form should be a pair (first_letter, result)
        where first_letter is the first letter of the object (to sort on), and
        the result is what the html-template wants, e.g. a pair of
        (name, no_hits).

        With sortnames set, entries are ordered on result[0] with a leading
        "von "/"af " removed (and spaces replaced by "z"), followed by
        result[1]; otherwise on result[0] only. lang == "en" uses the English
        collator and files Å/Ä under A and Ö under O; anything else uses the
        Swedish collator.

        Returns a list of (first_letter, results) pairs sorted by letter.
    """
    def fix_lastname(name):
        name = re.sub(r"(^von )|(^af )", r"", name)
        return name.replace(" ", "z")

    results = []
    for hit in hits:
        processname(hit, results)

    letter_results = {}
    # Split the result into start letters
    for first_letter, result in results:
        if first_letter == u'Ø':
            first_letter = u'Ö'
        elif first_letter == u'Æ':
            first_letter = u'Ä'
        elif first_letter == u'Ü':
            first_letter = u'Y'
        if lang == "en":
            if first_letter == u"Ö":
                first_letter = u"O"
            # Tuple membership compares whole letters; the former substring
            # test (in u"ÄÅ") also matched the empty string.
            elif first_letter in (u"Ä", u"Å"):
                first_letter = u"A"
        letter_results.setdefault(first_letter, []).append(result)

    # Sort result dictionary alphabetically into list
    if lang == "en":
        collator = icu.Collator.createInstance(icu.Locale('en_EN.UTF-8'))
    else:
        collator = icu.Collator.createInstance(icu.Locale('sv_SE.UTF-8'))

    if sortnames:
        def item_key(item):
            return collator.getSortKey(fix_lastname(item[0]) + " " + item[1])
    else:
        def item_key(item):
            return collator.getSortKey(item[0])

    for items in letter_results.values():
        items.sort(key=item_key)

    return sorted(letter_results.items(),
                  key=lambda pair: collator.getSortKey(pair[0]))
示例#3
0
    def __binaryFind(self, what):
        """Return the index of the first hint that collates after ``what``.

        The hints are searched with a binary search using a Polish ICU
        collator, so they are expected to already be in that collation order.
        ``len(self.__hints)`` is returned when no hint collates after
        ``what``; an empty hint list therefore yields 0 (the recursive version
        raised IndexError in that case).
        """
        log.log("HintRegisterBrowser.__binaryFind", [what], 0)
        collator = icu.Collator.createInstance(icu.Locale('pl_PL.UTF-8'))

        # Invariant: every hint before ``low`` collates <= what, every hint
        # from ``high`` onwards collates > what.
        low, high = 0, len(self.__hints)
        while low < high:
            middle = (low + high) // 2
            if collator.compare(anyHint(self.__hints[middle]), what) > 0:
                high = middle
            else:
                low = middle + 1

        log.log("HintRegisterBrowser.__binaryFind return", [low], 1)
        return low
示例#4
0
 def _testMissorderedTags(self):
     """Record every tag whose values are not in French collation order.

     Each offending tag name is appended to ``self.missorderedTag`` and
     counted in ``self.missorderedTagsCounter``; if anything was counted, one
     MISS_ORDERED_TAGS error is registered.
     """
     # For accented char sorting
     collator = icu.Collator.createInstance(icu.Locale('fr_FR.UTF-8'))
     track = self.track

     def misordered(values, reference):
         # True when the collated order of ``values`` differs from ``reference``.
         expected = sorted(removeSpecialCharFromArray(values),
                           key=collator.getSortKey)
         return expected != removeSpecialCharFromArray(reference)

     def report(tag):
         self.missorderedTag.append(tag)
         self.missorderedTagsCounter += 1

     if misordered(track.artists, track.artists):
         report('Artists')
     # Without a remix, the artists are also compared with the fifth file name field.
     if track.remix == '' and misordered(
             track.artists, track.fileNameList[4].split(', ')):
         report('Artists')
     if misordered(track.performers, track.performers):
         report('Performers')
     if misordered(track.feat, track.feat):
         report('Featuring')
     if misordered(track.remix, track.remix):
         report('Remixer')

     if self.missorderedTagsCounter > 0:
         self.errorCounter += 1
         self.errors.append(ErrorEnum.MISS_ORDERED_TAGS)
示例#5
0
	def __init__(self, *args, **kwargs):
		"""Initialise the RegisterBrowser base and this object's private state."""
		RegisterBrowser.__init__(self, *args, **kwargs)
		# Collator for the Polish locale.
		polish = icu.Locale('pl_PL.UTF-8')
		self.__collator = icu.Collator.createInstance(polish)
		self.__binaryType = None
		self.__localVeto = False
		self.__limit = 100
		self.__dBController = None
    def character_tokenize(self, word):
        """ Returns the tokenization in character level.

        Arguments:
            word {string} -- word to be tokenized in character level.

        Returns:
            [list] -- list of characters.

        Raises:
            ImportError -- if PyICU is not installed.
        """
        try:
            import icu
        except ImportError:
            # Fail here with the install hint; previously the error was
            # swallowed and the code crashed on the unbound ``icu`` name.
            raise ImportError("please install PyICU")

        boundaries = icu.BreakIterator.createCharacterInstance(icu.Locale())
        boundaries.setText(word)
        # NOTE(review): the boundaries are used directly as indices into the
        # Python string; confirm this holds for characters outside the BMP.
        chars = []
        start = 0
        for end in boundaries:
            chars.append(word[start:end])
            start = end

        return chars
示例#7
0
def jwOnSortedFunction(s1,
                       s2,
                       collator=icu.Collator.createInstance(
                           icu.Locale('de_DE.UTF-8'))):
    """Return the Jaro-Winkler distance of the two strings after sorting their characters.

    The characters of each string are put into the collator's order (German
    by default) before ``jw_distance.get_jaro_distance`` is called with
    ``winkler=True``.
    """
    def in_collation_order(text):
        return ''.join(sorted(text, key=collator.getSortKey))

    first = in_collation_order(s1)
    second = in_collation_order(s2)
    return jw_distance.get_jaro_distance(first, second, winkler=True)
示例#8
0
def gen_words(text):
    """Yield the slices of ``text`` between consecutive ICU word boundaries (locale "th")."""
    breaker = icu.BreakIterator.createWordInstance(icu.Locale("th"))
    breaker.setText(text)
    previous = breaker.first()
    for boundary in breaker:
        word, previous = text[previous:boundary], boundary
        yield word
示例#9
0
 def findWord(self, key):
     """Look up ``key``, falling back to the first entry that collates after it.

     At level 0 the key is looked up directly in ``self._dict``; on a miss the
     value of the first key in ``self.__keys`` that compares greater than
     ``key`` under the Polish collator is returned. At other levels the lookup
     recurses into the sub-dictionary selected by ``self._at(key, 0)`` with
     the rest of the key, then falls back to the later sub-dictionaries.

     ``self.__exact`` is reset to False on entry; it is set to True on a
     direct level-0 hit, or copied from the sub-dictionary on a recursive hit.

     Returns the found value, or None when nothing matches.
     """
     self.__exact = False
     polish = icu.Locale('pl_PL.UTF-8')

     if self._level == 0:
         res = self._dict.get(key)
         if res is not None:
             self.__exact = True
             return res
         # The collator is only needed once the exact lookup has failed.
         collator = icu.Collator.createInstance(polish)
         for k in self.__keys:
             if collator.compare(k, key) > 0:
                 return self._dict.get(k)
         return None

     h = self._at(key, 0)
     t = key[1:] if h != u'' else u''
     subdict = self._dict.get(h)
     if subdict is not None:
         res = subdict.findWord(t)
         if res is not None:
             self.__exact = subdict.exact()
             return res

     collator = icu.Collator.createInstance(polish)
     for k in self.__keys:
         if collator.compare(k, h) > 0:
             res = self._dict.get(k).findWord(u'')
             if res is not None:
                 return res
     return None
示例#10
0
 def updateSheetOrder(self):
     """Write the group's user names, in German collation order, into column B.

     The names of ``self.fetch_group.users`` are sorted with a German ICU
     collator and written to the range ``B2:B<len(users) + 100>`` of the first
     sheet of ``self.sheet_order``; cells beyond the last name are cleared.
     Any exception is reported through ``self.sendMsg`` rather than raised.
     """
     try:
         users = [str(user.name) for user in self.fetch_group.users]
         sheet = self.client.open(self.sheet_order).sheet1
         collator = icu.Collator.createInstance(icu.Locale('de_DE.UTF-8'))
         users.sort(key=collator.getSortKey)
         print(users)
         cell_list = sheet.range('B2:B' + str(len(users) + 100))
         usr = deque(users)
         try:
             for cell in cell_list:
                 # Remaining cells are blanked once all names are written.
                 cell.value = str(usr.popleft()) if usr else ''
         except IndexError:
             print('IndexError')
         # Update in batch
         sheet.update_cells(cell_list)
     except Exception as e:
         # str(e) is required: concatenating the exception object itself
         # raises TypeError and the original error would never be reported.
         self.sendMsg(self.fetch_error,
                      msg='update order error: ' + str(e),
                      rich=False,
                      typing=False)
示例#11
0
def replace_gerund():
    """Rewrite the two "_n2" lexicon files into cleaned, sorted "_n4" files.

    For each of ``positive_n2`` and ``negative_n2`` in the lexicon folder the
    entries are read, the " et(tir)(me)#<tag>" and " yap(tır)(ma)#<tag>"
    endings are removed, entries without a "#" get "#n" appended, duplicates
    are dropped, and the result is sorted with a Turkish ICU collator and
    written to ``<name>_n4.txt``.
    """
    import icu

    bounfolder = "/home/dicle/git/serdoo-servis2/django_docker/learning/_lexicons/tr_sentiment_boun"
    names = ["positive_n2", "negative_n2"]

    # Loop invariants: the collator and both patterns are built once.
    collator = icu.Collator.createInstance(icu.Locale('tr_TR.UTF-8'))
    et_ending = re.compile(r"\set(tir)?(me)?\#[nvpb]")
    yap_ending = re.compile(r"\syap(tır)?(ma)?\#[nvpb]")

    for n in names:
        with open(os.path.join(bounfolder, n + ".txt"), "r") as infile:
            boun = [w.strip() for w in infile]

        cleaned = (
            yap_ending.sub("", et_ending.sub("", w).strip()).strip()
            for w in boun
        )
        # Entries without a tag get "#n" appended; the set removes duplicates.
        unique = {w if "#" in w else w + "#n" for w in cleaned}
        newlist = sorted(unique, key=collator.getSortKey)

        with open(os.path.join(bounfolder, n + "_n4.txt"), "w") as outfile:
            outfile.write("\n".join(newlist))
示例#12
0
def gen_words(text):
    """Generate the pieces of ``text`` delimited by ICU word boundaries for locale "th"."""
    thai = icu.Locale("th")
    boundaries = icu.BreakIterator.createWordInstance(thai)
    boundaries.setText(text)
    left = boundaries.first()
    for right in boundaries:
        piece = text[left:right]
        yield piece
        left = right
示例#13
0
def main(song_dir: Path) -> None:
    """
    Write a .tex file that \\input's every song file of a directory.

    Each ``*.tex`` file in ``song_dir`` must contain a ``\\SongTitle`` macro;
    its title and artist form the sort key ("title - artist"). The inputs are
    written, in Slovak collation order of that key, to
    ``song_dir.with_suffix(".autogenerated.tex")``.

    Raises:
        ValueError: if a file contains no ``\\SongTitle{title}{artist}``.
    """
    song_title = re.compile(
        r".*\\SongTitle(\[[^\[]+\])?\{(?P<title>[^\}]+)\}\{(?P<artist>[^\}]+)\}",
        re.DOTALL | re.UNICODE,
    )

    # Collect (sort key, file) pairs.
    entries = []
    for path in song_dir.glob("*.tex"):
        found = song_title.match(path.read_text())
        if found is None:
            raise ValueError(f"{path} does not seem to be a song file")
        entries.append((found["title"] + " - " + found["artist"], path))

    collator = icu.Collator.createInstance(icu.Locale('sk_SK.UTF-8'))
    entries.sort(key=lambda entry: collator.getSortKey(entry[0]))

    # Emit the header followed by one \input line per song.
    target = song_dir.with_suffix(".autogenerated.tex")
    with target.open("w") as out:
        out.write("% THIS FILE IS AUTOGENERATED.\n")
        out.write("% DO NOT EDIT!\n")
        out.writelines(f"\\input{{{path}}}\n" for _, path in entries)
示例#14
0
def shorten(manuf):
    '''Convert a long manufacturer name to abbreviated and short names.

    The short name is the title-cased input with punctuation, " & ", common
    company words ("Inc", "Ltd", "GmbH", ...) and all whitespace removed,
    truncated to 8 characters (grapheme clusters when PyICU is available,
    code points otherwise).

    Returns the short name alone when it equals the whitespace-normalized
    input ignoring case; otherwise "<short>\\t<mixed-case long name>".
    '''
    # Normalize whitespace.
    manuf = ' '.join(manuf.split())
    orig_manuf = manuf
    # Add exactly one space on each end.
    # XXX This appears to be for the re.sub below.
    manuf = u' {} '.format(manuf)
    # Convert to consistent case
    manuf = manuf.title()
    # Remove any punctuation
    # XXX Use string.punctuation? Note that it includes '-' and '*'.
    manuf = re.sub(u"[',.()]", ' ', manuf)
    # & isn't needed when Standalone
    manuf = manuf.replace(" & ", " ")
    # Remove any "the", "inc", "plc" ...
    manuf = re.sub(
        r'\W(the|incorporated|inc|plc|systems|corporation|corp|s/a|a/s|ab|ag|kg|gmbh|company|co|limited|ltd|holding|spa)(?= )',
        '',
        manuf,
        flags=re.IGNORECASE)
    # Remove all spaces
    manuf = re.sub(r'\s+', '', manuf)

    # Truncate names to a reasonable length, say, 8 characters. If
    # the string contains UTF-8, this may be substantially more than
    # 8 bytes. It might also be less than 8 visible characters. Plain
    # Python slices Unicode strings by code point, which is better
    # than raw bytes but not as good as grapheme clusters. PyICU
    # supports grapheme clusters. https://bugs.python.org/issue30717
    #
    # In our case plain Python truncates 'Savroni̇k Elektroni̇k'
    # to 'Savroni̇', which is 7 visible characters, 8 code points,
    # and 9 bytes.

    # Truncate by code points
    trunc_len = 8

    if have_icu:
        # Truncate by grapheme clusters
        bi_ci = icu.BreakIterator.createCharacterInstance(icu.Locale('en_US'))
        bi_ci.setText(manuf)
        bounds = list(bi_ci)[:trunc_len]
        # A name reduced to nothing (e.g. just "Inc") has no boundaries;
        # indexing bounds[-1] would raise IndexError.
        trunc_len = bounds[-1] if bounds else 0

    manuf = manuf[:trunc_len]

    if manuf.lower() == orig_manuf.lower():
        # Original manufacturer name was short and simple.
        return manuf

    mixed_manuf = orig_manuf
    # At least one entry has whitespace in front of a period.
    mixed_manuf = re.sub(r'\s+\.', '.', mixed_manuf)
    # If company is all caps, convert to mixed case (so it doesn't look like
    # we're screaming the company name)
    if mixed_manuf.upper() == mixed_manuf:
        mixed_manuf = mixed_manuf.title()

    return u'{}\t{}'.format(manuf, mixed_manuf)
示例#15
0
def cleanup_title(value):
    """Return ``value`` as a title-cased string with company acronyms restored.

    One leading and one trailing quote character are removed, remaining
    double quotes are dropped, HTML entities are unescaped, and the text is
    title-cased with ICU for en_US. Each ``settings.COMPANY_ACRONYMS`` entry
    is then applied as a pair ``(replacement, title_cased_form)``: entries
    whose replacement contains ".com" are replaced anywhere, the others only
    at the end of the title or directly before a word-ending character.
    """
    # Need to use ICU rather than .title() because .title()
    # does not handle things like "Wouldn't" properly. It
    # converts it to "Wouldn'T" rather than keeping the T
    # lowercase
    quotes = ('"', "'")
    # startswith/endswith are safe on empty input, where indexing raised IndexError.
    if value.startswith(quotes):
        value = value[1:]
    if value.endswith(quotes):
        value = value[:-1]
    value = value.replace('"', "").strip()
    value = HTMLParser.HTMLParser().unescape(value.lower())
    en_us_locale = icu.Locale('en_US')
    break_iter = icu.BreakIterator.createTitleInstance(en_us_locale)
    temp_title = icu.UnicodeString(value)
    title = unicode(temp_title.toTitle(break_iter, en_us_locale))
    word_enders = [" ", ",", ".", ";", ":", '"', "'", "-"]
    for acronym in settings.COMPANY_ACRONYMS:
        replacement, titled = acronym[0], acronym[1]
        if '.com' in replacement:
            # .com often comes at the end of a title so we don't want to add
            # the trailing space check
            title = title.replace(titled, replacement)
        else:
            # endswith avoids the false positive of comparing rfind() == -1
            # with len(title) - len(titled) == -1, which rewrote titles one
            # character shorter than an absent acronym.
            if title.endswith(titled):
                title = "%s%s" % (title[:len(title) - len(titled)],
                                  replacement)
            for ender in word_enders:
                title = title.replace("%s%s" % (titled, ender),
                                      "%s%s" % (replacement, ender))
    return title
示例#16
0
def segment(txt):
    """Take a ''str'' and return a ''list'' obtained by word segmentation with ICU.

    Newlines are removed, the text is cut at the ICU word boundaries for the
    "th" locale, and every resulting piece is cut again wherever consecutive
    characters differ according to ``isThai``. Pieces equal to a single space
    are dropped from the returned list.
    """
    bd = icu.BreakIterator.createWordInstance(icu.Locale(
        "th"))  # create the word break iterator, selecting the Thai locale, via icu
    txt = txt.replace('\n', '')
    bd.setText(txt)  # hand the text to the break iterator
    breaks = list(bd)
    # Slice the text between consecutive boundaries (the first slice starts at 0).
    result = [txt[x[0]:x[1]] for x in zip([0] + breaks, breaks)]
    result1 = []
    for data in result:
        data1 = list(data)
        # data2 collects the characters of this piece; a ',' is prefixed to a
        # character to mark a cut before it, and the piece is split on ','
        # afterwards.
        data2 = []
        for txt1 in data1:
            # NOTE(review): data1.index(txt1) is the position of the FIRST
            # occurrence of this character, so for repeated characters the
            # element inspected below is not necessarily the preceding one.
            # The inspected data2 element may also carry a ',' prefix.
            if isThai(txt1) == True:
                if len(data2) == 0:
                    data2.append(txt1)
                else:
                    if isThai(data2[data1.index(txt1) - 1]) == True:
                        data2.append(txt1)
                    else:
                        data2.append(',' + txt1)
            else:
                if len(data2) == 0:
                    data2.append(txt1)
                else:
                    if isThai(data2[data1.index(txt1) - 1]) == True:
                        data2.append(',' + txt1)
                    else:
                        data2.append(txt1)
        data1 = ''.join(data2)
        # NOTE(review): a literal ',' in the input is split here as well.
        result1 += data1.split(',')
    return [x for x in result1 if x != ' ']
示例#17
0
def print_lexicon(lex):
    """Print a two-line header, a blank line, then the entries of ``lex``.

    The entries are ordered with the ICU collator for locale 'si' and each
    one is printed UTF-8 encoded.
    """
    header = (
        '# SPDX-License-Identifier: Unicode-DFS-2016',
        '# Columns: Form; Pronunciation',
    )
    for text in header:
        print(text)
    print()
    sort_key = icu.Collator.createInstance(icu.Locale('si')).getSortKey
    entries = list(lex)
    entries.sort(key=sort_key)
    for entry in entries:
        print(entry.encode('utf-8'))
示例#18
0
def write_additions(deltas, out):
    """Write one markdown section with an XML snippet per language to ``out``.

    ``deltas`` maps a language tag to ``(chars, refs, cldr_sources)``. For
    each language, in sorted order, a heading and a fenced XML block built
    from ``CLDR_EXEMPLAR_XML_START`` are written; the references are numbered
    R1, R2, ... and listed in a ``<references>`` element when there are any.
    """
    rtl_scripts = ('Arab', 'Thaa', 'Nkoo', 'Syrc')
    for lang, (chars, refs, cldr_sources) in sorted(deltas.items()):
        locale = icu.Locale(lang)
        out.write('\n\n### %s: %s\n\n' % (lang, locale.getDisplayName()))

        reflist = ['R%d' % number for number, _ in enumerate(refs, 1)]
        if reflist:
            references = ' references="%s"' % ' '.join(reflist)
        else:
            references = ''

        script = locale.getScript()
        characterOrder = ('right-to-left'
                          if script in rtl_scripts else 'left-to-right')

        out.write('```xml\n')
        start_fields = {
            'language': locale.getLanguage(),
            'script': script,
            'characterOrder': characterOrder,
            'lineOrder': 'top-to-bottom',
            'characters': xmlescape(format_unicodeset(chars)),
            'references': xmlescape(references),
        }
        out.write(CLDR_EXEMPLAR_XML_START % start_fields)

        if refs:
            out.write('\t<references>\n')
            for ref_type, ref in zip(reflist, refs):
                reference_fields = {
                    'type': ref_type,
                    'uri': xmlescape(ref),
                    'text': xmlescape(get_reference_description(ref)),
                }
                out.write(CLDR_EXEMPLAR_XML_REFERENCE % reference_fields)
            out.write('\t</references>\n')
        out.write('</ldml>\n```\n\n')
示例#19
0
def compare(a, b, hint):
    """Compare ``a`` and ``b`` by how much of ``hint`` they share, then by collation.

    Returns 1 when ``a`` has the longer common prefix with ``hint`` and -1
    when ``b`` has. On a tie the result is ``collator.compare(b, a)`` for the
    Polish collator, i.e. the collation order with the operands swapped.
    """
    # Each prefix is computed once instead of once per comparison branch.
    shared_a = len(commonprefix(a, hint))
    shared_b = len(commonprefix(b, hint))
    if shared_a > shared_b:
        return 1
    if shared_b > shared_a:
        return -1
    # The collator is only needed to break ties.
    collator = icu.Collator.createInstance(icu.Locale('pl_PL.UTF-8'))
    return collator.compare(b, a)
def exact_matching():
    """Return the lemma names shared by the 'spa' and 'por' word lists.

    Words for which ``discartable`` is true are left out; the result is a
    list sorted with the Spanish ICU collator.
    """
    def kept_lemmas(lang):
        return {
            lemma for lemma in wn.all_lemma_names(lang=lang)
            if not discartable(lemma)
        }

    spanish = kept_lemmas('spa')
    portuguese = kept_lemmas('por')
    shared = list(spanish & portuguese)

    collator = icu.Collator.createInstance(icu.Locale('es_ES.UTF-8'))
    shared.sort(key=collator.getSortKey)
    return shared
示例#21
0
 def find_students_by_class(self):
     """Return the ``nom`` values of table ``eleve`` for the selected class.

     The class is the current text of ``self.classe_list``; the names are
     returned as a list sorted with the ICU collator for locale 'ar_utf8'.
     """
     selected = self.classe_list.currentText()
     cursor.execute(''' SELECT nom FROM eleve WHERE classe = ? ''', (selected,))
     names = [row[0] for row in cursor.fetchall()]
     collator = icu.Collator.createInstance(icu.Locale('ar_utf8'))
     names.sort(key=collator.getSortKey)
     return names
示例#22
0
def get_inspire_theme_link_children_tags(inspire_theme_link_parent_id=1):
    """Return the distinct tag titles of the links under the given parent.

    Args:
        inspire_theme_link_parent_id: value matched against ``Links.parent_id``.

    Returns:
        A tuple of unique titles sorted with the German ICU collator.
    """
    rows = Tags.query.join(Links.tags).with_entities(Tags.title).filter(
        Links.parent_id == inspire_theme_link_parent_id).all()

    # De-duplicate while keeping first-seen order; the set makes each
    # membership test O(1) and no builtin name is shadowed.
    titles = []
    seen = set()
    for row in rows:
        if row.title not in seen:
            seen.add(row.title)
            titles.append(row.title)

    collator = icu.Collator.createInstance(icu.Locale('de_DE.UTF-8'))
    titles.sort(key=collator.getSortKey)
    return tuple(titles)
def similar_matching():
    """Return the words shared by the 'spa' and 'por' lists after letter normalisation.

    Every non-discartable lemma name is passed through
    ``change_similar_letters`` before the two sets are intersected (the
    original note says this matches ignoring accent). The result is a list
    sorted with the Spanish ICU collator.
    """
    def normalised_lemmas(lang):
        return {
            change_similar_letters(lemma)
            for lemma in wn.all_lemma_names(lang=lang)
            if not discartable(lemma)
        }

    spanish = normalised_lemmas('spa')
    portuguese = normalised_lemmas('por')
    shared = list(spanish & portuguese)

    collator = icu.Collator.createInstance(icu.Locale('es_ES.UTF-8'))
    shared.sort(key=collator.getSortKey)
    return shared
示例#24
0
def get_lang_choices(request, with_default=False):
    """Return the request's languages as ``(key, label)`` pairs sorted by label.

    Labels are compared with an ICU collator for the locale negotiated for
    ``request``. With ``with_default`` set, a ``('*', _("DEFAULT"))`` entry
    is placed in front of the sorted pairs.
    """
    choices = list(request.locale.languages.items())
    locale_name = pyramid.i18n.negotiate_locale_name(request)
    collator = icu.Collator.createInstance(icu.Locale(locale_name))
    label_key = functools.cmp_to_key(collator.compare)
    choices.sort(key=lambda choice: label_key(choice[1]))
    if not with_default:
        return choices
    return [('*', _("DEFAULT"))] + choices
示例#25
0
def now():
    '''
    Return the current time formatted as a str for the th_TH locale.
    Example: "7 มกราคม 2560 20:22:59"
    '''
    thai = icu.Locale('th_TH')
    date_style = icu.DateFormat.LONG
    time_style = icu.DateFormat.kDefault
    formatter = icu.DateFormat.createDateTimeInstance(date_style, time_style,
                                                      thai)
    current = datetime.datetime.now()
    return formatter.format(current)
示例#26
0
def tokenize_icu(text, language):
    """Return ``text`` with a space appended after every ICU word segment.

    Args:
        text: the string to segment.
        language: locale identifier passed to ``icu.Locale``.

    Returns:
        The segments between consecutive word boundaries, each followed by a
        single space (so a non-empty result ends with a space).
    """
    bd = icu.BreakIterator.createWordInstance(icu.Locale(language))
    bd.setText(text)
    # Collect the pieces and join once; repeated ``+=`` on a string is
    # quadratic in the worst case.
    pieces = []
    start_pos = 0
    for end_pos in bd:
        pieces.append(text[start_pos:end_pos] + ' ')
        start_pos = end_pos
    return ''.join(pieces)
示例#27
0
 def _testPerformerComposition(self):
     """Register INCONSISTENT_PERFORMER when performers and composed performers differ.

     The two lists are compared by length and then as collated, normalised
     lists, so only the values matter here, not their order.
     """
     # For accented char sorting
     collator = icu.Collator.createInstance(icu.Locale('fr_FR.UTF-8'))
     performers = self.track.performers
     composed = self.track.composedPerformer

     def normalised(names):
         return sorted(removeSpecialCharFromArray(names),
                       key=collator.getSortKey)

     # Sorted comparison to only test value equality; the length check comes
     # first so the lists are only normalised when the sizes agree.
     if len(performers) == len(composed) and \
             normalised(performers) == normalised(composed):
         return
     self.errorCounter += 1
     self.errors.append(ErrorEnum.INCONSISTENT_PERFORMER)
示例#28
0
def tokenize(text: str, lang: str):
    """Split a string into tokens at the ICU word boundaries for ``lang``.

    The word break iterator for each language is created on first use and
    kept in ``_breakers``.
    """
    if lang not in _breakers:
        locale = icu.Locale(lang)
        _breakers[lang] = icu.BreakIterator.createWordInstance(locale)
    breaker = _breakers[lang]

    breaker.setText(text)
    tokens = []
    start = 0
    for end in list(breaker):
        tokens.append(text[start:end])
        start = end
    return tokens
示例#29
0
文件: _token.py 项目: zxlzr/bistring
    def __init__(self, locale: str, constructor: Callable):
        """Store the ICU locale and the iterator constructor, then build one iterator."""
        # BreakIterator is not a thread-safe API, so iterators are cached in
        # thread-local storage
        self._local = threading.local()
        self._constructor = constructor
        self._locale = icu.Locale(locale)

        # Constructing one eagerly on this thread is an optimization and also
        # surfaces errors early
        self._break_iterator()
示例#30
0
 def sort_by_name(self):
     """Number the students of every record in Spanish collation order.

     For each record in ``self`` the ``student_ids`` are sorted on
     ``full_name`` with the ICU collator for locale 'es', and each student's
     ``seq`` is written as its 1-based position in that order.
     """
     # One collator serves every record.
     collator = icu.Collator.createInstance(icu.Locale('es'))
     for record in self:
         # A sort key replaces the ``cmp=`` argument, which sorted() only
         # accepts on Python 2; the resulting order is the same.
         ordered = sorted(
             record.student_ids,
             key=lambda student: collator.getSortKey(student.full_name))
         for seq, student in enumerate(ordered, 1):
             student.write({'seq': seq})