Пример #1
0
class IndexLetter(SearchBase):
    """Search of the ``ix_letter`` index for the entries of one index letter.

    Results are collapsed to one hit per ``verb_form`` and ordered with a
    Unicode collation key.
    """

    def __init__(self, word):
        self._word = word
        self.searcher = None
        self.query = None
        self.collator = Collator()
        self.num_results = 0

    def get_num_results(self):
        """Return the hit count recorded by the last ``get_results`` call."""
        return self.num_results

    def sort_key(self, string):
        """Return the collation key of a UTF-8 encoded byte string."""
        decoded = string.decode("utf-8")
        return self.collator.sort_key(decoded)

    def get_results(self):
        """Run the query (building it first if needed) and return the hits."""
        if self.searcher is None:
            self.search()

        # Sort on ``verb_form`` through the collation key instead of the
        # raw field value.
        collated_facet = TranslateFacet(self.sort_key, FieldFacet("verb_form"))

        results = self.searcher.search(
            self.query,
            limit=None,
            sortedby=collated_facet,
            collapse_limit=1,
            collapse='verb_form',
        )

        self.num_results = len(results)
        return results

    def search(self):
        """Open a searcher on ``ix_letter`` and parse the letter query."""
        self.searcher = ix_letter.searcher()
        # NOTE(review): this reads ``self.word`` while ``__init__`` stores
        # ``self._word``; presumably SearchBase exposes ``word`` -- confirm.
        qs = u'index_letter:({0})'.format(self.word)
        parser = MultifieldParser(["index_letter"], ix_letter.schema)
        self.query = parser.parse(qs)

    def get_json(self):
        """Return ``(json_text, status)`` for the current results.

        Each entry carries ``verb_form`` and, only when it differs from the
        verb form, ``infinitive``. The status is always 200.
        """
        status = 200
        all_results = []
        for result in self.get_results():
            verb = {'verb_form': result['verb_form']}
            if result['verb_form'] != result['infinitive']:
                verb['infinitive'] = result['infinitive']
            all_results.append(verb)

        payload = json.dumps(all_results, indent=4, separators=(',', ': '))
        return payload, status
Пример #2
0
class FromFullTest(TestCase):
    """Compare ``Collator.sort_key`` output with fixed expected tuples."""

    def setUp(self):
        # pyuca is imported per test so that importing this module does not
        # itself require the package.
        from pyuca import Collator
        self.c = Collator()

    def _assert_sort_key(self, text, expected):
        """Assert that the sort key computed for *text* equals *expected*."""
        self.assertEqual(self.c.sort_key(text), expected)

    def test_1(self):
        self._assert_sort_key(
            "\u0332\u0334",
            (0x0000, 0x004A, 0x0021, 0x0000, 0x0002, 0x0002, 0x0000),
        )

    def test_2(self):
        self._assert_sort_key(
            "\u0430\u0306\u0334",
            (0x1991, 0x0000, 0x0020, 0x004A, 0x0000, 0x0002, 0x0002, 0x0000),
        )

    def test_3(self):
        self._assert_sort_key(
            "\u0FB2\u0F71\u0001\u0F80\u0061",
            (0x2571, 0x2587, 0x258A, 0x15EB, 0x0000, 0x0020, 0x0020, 0x0020,
             0x0020, 0x0000, 0x0002, 0x0002, 0x0002, 0x0002, 0x0000),
        )

    def test_4(self):
        self._assert_sort_key(
            "\u4E00\u0021",
            (0xFB40, 0xCE00, 0x025D, 0x0000, 0x0020,
             0x0020, 0x0000, 0x0002, 0x0002, 0x0000),
        )

    def test_5(self):
        self._assert_sort_key(
            "\u3400\u0021",
            (0xFB80, 0xB400, 0x025D, 0x0000, 0x0020,
             0x0020, 0x0000, 0x0002, 0x0002, 0x0000),
        )
Пример #3
0
def show_event(id):
    """Render the public detail page of the event with the given id.

    Builds three collections for the template: ``compositores`` (people who
    take part as "Compositor/a" plus the composers of every performed
    piece), ``participantes`` (all remaining participant entries) and
    ``personas`` (every person seen in either role). The first two are
    sorted with a Unicode collation key on ``get_name()``.

    Any failure while loading the event is answered with a 404.
    """
    try:
        event = Event.query.filter_by(id=id).first_or_404()
    except Exception:
        # Lookup problems are deliberately reported as "not found".
        # ``Exception`` (rather than a bare ``except``) keeps SystemExit and
        # KeyboardInterrupt from being converted into a 404.
        abort(404)

    # These collections are pre-computed here to keep the template simple.
    participantes, compositores, personas = set(), set(), set()
    for i in event.participants:
        if i.person and i.activity.name == "Compositor/a":
            # Flag the person so the template can tell composers apart.
            i.person.is_composer = True
            compositores.add(i.person)
            personas.add(i.person)
        else:
            participantes.add(i)
            if i.person:
                personas.add(i.person)

    # The composers of the performed pieces count as composers too.
    for p in event.performances:
        for c in p.musical_piece.composers:
            c.is_composer = True
            compositores.add(c)
            personas.add(c)

    collator = Collator()

    def name_key(e):
        return collator.sort_key(e.get_name())

    compositores = sorted(compositores, key=name_key)
    participantes = sorted(participantes, key=name_key)

    return render_template('public/detalle.html',
                           e=event,
                           participantes=participantes,
                           compositores=compositores,
                           personas=personas,
                           request=request)
def GenerateCollationEquivalenceTable(unicodecharlist):
    """Map characters that collate alike onto one representative character.

    Every code point in *unicodecharlist* is bucketed by the first two
    entries of its rationalized collation keys. Within each bucket holding
    more than one character, the character that orders first by the third
    and fourth entries becomes the representative, and every other
    character of the bucket is mapped to it (for instance "A" to "a").

    Returns a dict ``{codepoint: representative_codepoint}``; characters
    that are alone in their bucket, or that are filtered out, do not appear.
    """
    charbuckets = {}
    C = Collator()

    def full_order_key(codepoint):
        # Ordering inside a bucket looks only at entries 2 and 3 of the
        # rationalized keys. A key function replaces the former cmp-style
        # comparator (which Python 3's sorted() does not accept) and
        # yields the same order.
        keys = rationalizeCollationKeys(C.sort_key(codepoint))
        return (keys[2], keys[3])

    for codepoint in unicodecharlist:
        # Up to 4 collation keys are returned; we group on the first two
        # non-zero keys.
        collationkeys = rationalizeCollationKeys(C.sort_key(codepoint))
        if collationkeys[0] == 0:
            continue

        # Not sure why case-ish transitions map to this value in the Unicode
        # standard, but it seems to be used consistently this way across
        # all scripts.
        if collationkeys[1][0] != 32:
            continue

        k0 = collationkeys[0]
        k1 = collationkeys[1]
        charbuckets.setdefault(k0, {}).setdefault(k1, []).append(codepoint)

    codepointMap = {}
    for secondary_buckets in charbuckets.values():
        for bucket in secondary_buckets.values():
            # Only buckets containing several characters are of interest:
            # map all of them to the "smallest" character of the bucket.
            if len(bucket) > 1:
                s = sorted(bucket, key=full_order_key)
                for codepoint in s[1:]:
                    codepointMap[codepoint] = s[0]

    return codepointMap
Пример #5
0
class UnicodeCollationNormalizer(SimpleNormalizer):
    """Turn a string into a pyuca collation sort key.

    The output is only meaningful for sorting, where it is very useful.
    """

    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        key_file = self.get_path(session, 'keyFile', 'allkeys.txt')
        # Importing here rather than at module level means a missing pyuca
        # only matters when this normalizer is actually instantiated.
        from pyuca import Collator
        self.collator = Collator(key_file)

    def process_string(self, session, data):
        """Return the collation weights of *data* as one decimal string."""
        # Expand eszett to "ss" before collating so that it sorts as such.
        expanded = data.replace(u'\u00DF', 'ss')
        weights = self.collator.sort_key(expanded)
        # NOTE(review): "%04d" produces five characters for weights of
        # 10000 and above, so the chunks are not fixed-width for such
        # weights -- confirm this is acceptable for the sort order.
        return ''.join("%04d" % weight for weight in weights)
Пример #6
0
class UnicodeCollationNormalizer(SimpleNormalizer):
    """Turn a string into a pyuca collation sort key.

    The output is only meaningful for sorting, where it is very useful.
    """

    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        key_file = self.get_path(session, 'keyFile', 'allkeys.txt')
        # Importing here rather than at module level means a missing pyuca
        # only matters when this normalizer is actually instantiated.
        from pyuca import Collator
        self.collator = Collator(key_file)

    def process_string(self, session, data):
        """Return the collation weights of *data* as one decimal string."""
        # Expand eszett to "ss" before collating so that it sorts as such.
        expanded = data.replace(u'\u00DF', 'ss')
        weights = self.collator.sort_key(expanded)
        # NOTE(review): "%04d" produces five characters for weights of
        # 10000 and above, so the chunks are not fixed-width for such
        # weights -- confirm this is acceptable for the sort order.
        return ''.join("%04d" % weight for weight in weights)
Пример #7
0
def person(initial="A"):
    """Render the list of people whose name starts with *initial*.

    A person matches when the last name starts with the letter, or when the
    last name is empty and the first name starts with it (matching is
    case-insensitive). The result is sorted with a Unicode collation key on
    the upper-cased ``get_name()``.

    Anything other than a single letter A-Z redirects to the page for "A".
    A missing template is answered with a 404.
    """
    # ``in`` on a string is a substring test, so the length has to be
    # checked as well: otherwise "" and runs such as "ABC" would be accepted.
    if len(initial) != 1 or initial not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        return redirect(url_for('.person', initial='A'))

    try:
        # Filtering on the last name alone was not enough, because some
        # people have no last name; those are matched on their first name.
        personas = Person.query.filter(
            or_(
                and_(Person.last_name == '',
                     Person.first_name.ilike(initial + "%")),
                Person.last_name.ilike(initial + "%"))).all()
        collator = Collator()
        personas = sorted(
            personas, key=lambda e: collator.sort_key(e.get_name().upper()))
        return render_template('public/person_initial.html',
                               initial=initial,
                               personas=personas)
    except TemplateNotFound:
        abort(404)
Пример #8
0
    def create_language_name_map(self) -> None:
        """Write ``locale/language_name_map.json`` from ``language_options.json``.

        Each language entry has its ``name_local`` value moved to ``name``;
        the entries are then sorted by that name with a Unicode collation
        key and stored under the ``name_map`` key of the output file.
        """
        locale_dir = os.path.join(settings.DEPLOY_ROOT, "locale")
        source_path = os.path.join(locale_dir, "language_options.json")
        output_path = os.path.join(locale_dir, "language_name_map.json")

        with open(source_path, "rb") as reader:
            languages = orjson.loads(reader.read())

        lang_list = []
        for lang_info in languages["languages"]:
            # The local name becomes the display name of the entry.
            lang_info["name"] = lang_info.pop("name_local")
            lang_list.append(lang_info)

        collator = Collator()

        def by_name(lang):
            return collator.sort_key(lang["name"])

        lang_list.sort(key=by_name)

        dump_options = (
            orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS
        )
        with open(output_path, "wb") as output_file:
            output_file.write(
                orjson.dumps({"name_map": lang_list}, option=dump_options)
            )
Пример #9
0
def sort_by_name(iterable):
    """Return the items sorted by ``str(item.name)`` in pyuca collation order."""
    collator = Collator()

    def collation_key(obj):
        return collator.sort_key(str(obj.name))

    return sorted(iterable, key=collation_key)
Пример #10
0
class FromFullTest(unittest.TestCase):
    """Compare ``Collator.sort_key`` output with fixed expected tuples.

    The plain tests hold the values expected under Python 3 and the
    ``*_old`` tests those expected under Python 2; the ``skipIf``
    decorators select the applicable set, and some tests are additionally
    skipped for UCA versions 8.0.0 and 10.0.0.
    """

    def __init__(self, *args, **kwargs):
        # pyuca is imported here so that importing this module does not
        # itself require the package.
        from pyuca import Collator
        super(FromFullTest, self).__init__(*args, **kwargs)
        self.c = Collator()

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_1(self):
        self.assertEqual(
            self.c.sort_key("\u0332\u0334"),
            (0x0000, 0x004A, 0x0021, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    @unittest.skipIf(V8_0_0, "not for UCA version 8.0.0")
    @unittest.skipIf(V10_0_0, "not for UCA version 10.0.0")
    def test_2(self):
        self.assertEqual(
            self.c.sort_key("\u0430\u0306\u0334"),
            (0x1991, 0x0000, 0x0020, 0x004A, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    @unittest.skipIf(V8_0_0, "not for UCA version 8.0.0")
    @unittest.skipIf(V10_0_0, "not for UCA version 10.0.0")
    def test_3(self):
        self.assertEqual(
            self.c.sort_key("\u0FB2\u0F71\u0001\u0F80\u0061"),
            (0x2571, 0x2587, 0x258A, 0x15EB, 0x0000, 0x0020, 0x0020, 0x0020,
                0x0020, 0x0000, 0x0002, 0x0002, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    @unittest.skipIf(V8_0_0, "not for UCA version 8.0.0")
    @unittest.skipIf(V10_0_0, "not for UCA version 10.0.0")
    def test_4(self):
        self.assertEqual(
            self.c.sort_key("\u4E00\u0021"),
            (0xFB40, 0xCE00, 0x025D, 0x0000, 0x0020,
                0x0020, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    @unittest.skipIf(V8_0_0, "not for UCA version 8.0.0")
    @unittest.skipIf(V10_0_0, "not for UCA version 10.0.0")
    def test_5(self):
        self.assertEqual(
            self.c.sort_key("\u3400\u0021"),
            (0xFB80, 0xB400, 0x025D, 0x0000, 0x0020,
                0x0020, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_1_old(self):
        self.assertEqual(
            self.c.sort_key("\u0332\u0334"),
            (0x0000, 0x007C, 0x0021, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_2_old(self):
        self.assertEqual(
            self.c.sort_key("\u0430\u0306\u0334"),
            (0x15B0, 0x0000, 0x0020, 0x007C, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_3_old(self):
        self.assertEqual(
            self.c.sort_key("\u0FB2\u0F71\u0001\u0F80\u0061"),
            (0x205B, 0x206D, 0x2070, 0x120F, 0x0000, 0x0020, 0x0020, 0x0020,
                0x0020, 0x0000, 0x0002, 0x0002, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_4_old(self):
        self.assertEqual(
            self.c.sort_key("\u4E00\u0021"),
            (0xFB40, 0xCE00, 0x026E, 0x0000, 0x0020,
                0x0020, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_5_old(self):
        self.assertEqual(
            self.c.sort_key("\u3400\u0021"),
            (0xFB80, 0xB400, 0x026E, 0x0000, 0x0020,
                0x0020, 0x0000, 0x0002, 0x0002, 0x0000)
        )
Пример #11
0
#!/usr/bin/env python3

from collections import defaultdict
import sys

from pyuca import Collator

c = Collator()

filename = sys.argv[1]

# Maps each heading line to the indented lines that follow it.
entries = defaultdict(list)

key = None

with open(filename) as f:
    for line in f:
        if not line.strip():
            # Blank lines carry no information.
            continue
        if line.startswith("    "):
            # An indented line belongs to the most recent heading.
            assert key
            entries[key].append(line.rstrip())
            continue
        key = line.strip()

# Print the groups again, ordered by the collation key of their heading.
for key in sorted(entries, key=c.sort_key):
    print()
    print(key)
    for line in entries[key]:
        print(line)
from morphgnt.utils import load_yaml

def n(x):
    """Return *x* converted to Unicode normalization form NFKC."""
    normalized = unicodedata.normalize("NFKC", x)
    return normalized

lexemes = load_yaml("lexemes.yaml")

headwords = set()
with open("../data-cleanup/bdag-headwords/bdag_headwords.txt") as f:
    for line in f:
        headwords.add(n(line.strip().decode("utf-8")))

existing_not_in_headwords = []
missing_not_in_headwords = []
added = []
for lexeme, metadata in sorted(lexemes.items(), key=lambda x: collator.sort_key(x[0])):
    if "bdag-headword" in metadata:
        print "{}:\n    pos: {}\n    bdag-headword: {}".format(lexeme.encode("utf-8"), metadata["pos"], metadata["bdag-headword"].encode("utf-8"))
        if metadata["bdag-headword"] not in headwords:
            existing_not_in_headwords.append(metadata["bdag-headword"].encode("utf-8"))
    else:
        if lexeme in headwords:
            print "{}:\n    pos: {}\n    bdag-headword: {}".format(lexeme.encode("utf-8"), metadata["pos"], lexeme.encode("utf-8"))
            added.append(lexeme.encode("utf-8"))
        else:
            print "{}:\n    pos: {}".format(lexeme.encode("utf-8"), metadata["pos"])
            missing_not_in_headwords.append(lexeme.encode("utf-8"))

print >>sys.stderr, "existing"
for word in existing_not_in_headwords:
    print >>sys.stderr, "\t", word
Пример #13
0
# Conformance check: the strings listed in the collation test file must
# produce sort keys that never decrease from one line to the next.
c = Collator()

# Sort key of the previous test line; None until the first one is seen.
prev_sort_key = None

success = 0
failure = 0

with open("CollationTest/CollationTest_NON_IGNORABLE.txt") as f:
    # Iterate the file lazily instead of loading every line up front.
    for line in f:
        # Keep only the code points before any ";" or "#" on the line.
        points = line.split("#")[0].split(";")[0].strip().split()
        if not points:
            continue

        test_string = "".join(
            chr(int(point, 16)) for point in points
        )
        test_string_sort_key = c.sort_key(test_string)
        if prev_sort_key is not None:
            if prev_sort_key > test_string_sort_key:
                failure += 1
                print(line)
                # The key is formatted only when it has to be reported.
                print(format_sort_key(test_string_sort_key))
            else:
                success += 1
        prev_sort_key = test_string_sort_key

print()
print("{} success; {} failure".format(success, failure))

if failure > 0:
    sys.exit(1)
Пример #14
0
success = 0
failure = 0

# The test data lives in a directory named after the collator's UCA version.
path = "CollationTest/{0}/CollationTest_NON_IGNORABLE.txt".format(
    c.UCA_VERSION)

with open(path) as f:
    for i, line in enumerate(f):
        # Keep only the code points before any ";" or "#" on the line.
        points = line.split("#", 1)[0].split(";", 1)[0].strip().split()
        if not points:
            continue

        test_string = "".join(chr(int(point, 16)) for point in points)
        test_string_sort_key = c.sort_key(test_string)

        # Keys must not decrease from one test line to the next.
        if prev_sort_key and prev_sort_key > test_string_sort_key:
            failure += 1
            print('-------')
            print("failed on line {0}:".format(i+1))
            print(line.rstrip('\n'))
            print("PREV: {0}".format(format_sort_key(prev_sort_key)))
            print("THIS: {0}".format(
                format_sort_key(test_string_sort_key)))
            print('-------')
        elif prev_sort_key:
            success += 1
        prev_sort_key = test_string_sort_key

print("")
Пример #15
0
def n(x):
    """Return *x* converted to Unicode normalization form NFKC."""
    normalized = unicodedata.normalize("NFKC", x)
    return normalized


lexemes = load_yaml("lexemes.yaml")

# Normalised headwords, one per line of the headword file.
with open("../data-cleanup/bdag-headwords/bdag_headwords.txt") as f:
    headwords = set(n(line.strip().decode("utf-8")) for line in f)

# Result lists, filled while the lexemes are walked.
existing_not_in_headwords = []
missing_not_in_headwords = []
added = []
for lexeme, metadata in sorted(lexemes.items(),
                               key=lambda x: collator.sort_key(x[0])):
    if "bdag-headword" in metadata:
        print "{}:\n    pos: {}\n    bdag-headword: {}".format(
            lexeme.encode("utf-8"), metadata["pos"],
            metadata["bdag-headword"].encode("utf-8"))
        if metadata["bdag-headword"] not in headwords:
            existing_not_in_headwords.append(
                metadata["bdag-headword"].encode("utf-8"))
    else:
        if lexeme in headwords:
            print "{}:\n    pos: {}\n    bdag-headword: {}".format(
                lexeme.encode("utf-8"), metadata["pos"],
                lexeme.encode("utf-8"))
            added.append(lexeme.encode("utf-8"))
        else:
            print "{}:\n    pos: {}".format(lexeme.encode("utf-8"),
Пример #16
0
def sort_by_name(iterable):
    """Return the items sorted by ``str(item.name)`` in pyuca collation order."""
    collator = Collator()

    def collation_key(obj):
        return collator.sort_key(str(obj.name))

    return sorted(iterable, key=collation_key)
Пример #17
0
class FromFullTest(unittest.TestCase):
    """Compare ``Collator.sort_key`` output with fixed expected tuples.

    The plain tests hold the values expected under Python 3 and the
    ``*_old`` tests those expected under Python 2; the ``skipIf``
    decorators select the applicable set.
    """

    def __init__(self, *args, **kwargs):
        # pyuca is imported here so that importing this module does not
        # itself require the package.
        from pyuca import Collator
        super(FromFullTest, self).__init__(*args, **kwargs)
        self.c = Collator()

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_1(self):
        self.assertEqual(
            self.c.sort_key("\u0332\u0334"),
            (0x0000, 0x004A, 0x0021, 0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_2(self):
        self.assertEqual(
            self.c.sort_key("\u0430\u0306\u0334"),
            (0x1991, 0x0000, 0x0020, 0x004A, 0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_3(self):
        self.assertEqual(
            self.c.sort_key("\u0FB2\u0F71\u0001\u0F80\u0061"),
            (0x2571, 0x2587, 0x258A, 0x15EB, 0x0000, 0x0020, 0x0020, 0x0020,
             0x0020, 0x0000, 0x0002, 0x0002, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_4(self):
        self.assertEqual(self.c.sort_key("\u4E00\u0021"),
                         (0xFB40, 0xCE00, 0x025D, 0x0000, 0x0020, 0x0020,
                          0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_5(self):
        self.assertEqual(self.c.sort_key("\u3400\u0021"),
                         (0xFB80, 0xB400, 0x025D, 0x0000, 0x0020, 0x0020,
                          0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_1_old(self):
        self.assertEqual(
            self.c.sort_key("\u0332\u0334"),
            (0x0000, 0x007C, 0x0021, 0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_2_old(self):
        self.assertEqual(
            self.c.sort_key("\u0430\u0306\u0334"),
            (0x15B0, 0x0000, 0x0020, 0x007C, 0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_3_old(self):
        self.assertEqual(
            self.c.sort_key("\u0FB2\u0F71\u0001\u0F80\u0061"),
            (0x205B, 0x206D, 0x2070, 0x120F, 0x0000, 0x0020, 0x0020, 0x0020,
             0x0020, 0x0000, 0x0002, 0x0002, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_4_old(self):
        self.assertEqual(self.c.sort_key("\u4E00\u0021"),
                         (0xFB40, 0xCE00, 0x026E, 0x0000, 0x0020, 0x0020,
                          0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_5_old(self):
        self.assertEqual(self.c.sort_key("\u3400\u0021"),
                         (0xFB80, 0xB400, 0x026E, 0x0000, 0x0020, 0x0020,
                          0x0000, 0x0002, 0x0002, 0x0000))
Пример #18
0
#!/usr/bin/env python3

import sys

import yaml

from pyuca import Collator
c = Collator()

# Path of the YAML file to sort in place (first command-line argument).
FILENAME = sys.argv[1]

# The file is read and written as UTF-8 explicitly. With the platform
# default encoding, dumping with allow_unicode=True could fail on a
# non-UTF-8 locale after the file had already been truncated for writing.
with open(FILENAME, encoding="utf-8") as f:
    # load yaml
    data = yaml.safe_load(f)

# sort based on the keys using pyuca
data = dict(sorted(data.items(), key=lambda x: c.sort_key(x[0])))

with open(FILENAME, "w", encoding="utf-8") as g:
    # sort_keys=False keeps the collation order established above.
    yaml.dump(data, g, sort_keys=False, allow_unicode=True)