Example #1
    def __init__(self):
        self.template = os.path.join(os.path.dirname(__file__), "sort.html")
        self.silpacollator = Collator(
            os.path.join(os.path.dirname(__file__), "allkeys-silpa-6.0.0.txt"))
        self.ucacollator = Collator(
            os.path.join(os.path.dirname(__file__), "allkeys-6.0.0.txt"))
        self.response = SilpaResponse(self.template)
Example #2
    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        keyPath = self.get_path(session, "keyFile", "allkeys.txt")
        # This is handy -- means if no pyuca, no problem
        from pyuca import Collator

        self.collator = Collator(keyPath)
Example #3
def GenerateCollationEquivalenceTable(unicodecharlist):
    charbuckets = {}
    C = Collator()
    
    def internal_sortfunc(codepointA, codepointB):
        A = rationalizeCollationKeys(C.sort_key(codepointA))
        B = rationalizeCollationKeys(C.sort_key(codepointB))
        cmp = 0
        if (A[2], A[3]) < (B[2], B[3]):
            cmp = -1
        elif (A[2], A[3]) > (B[2], B[3]):
            cmp = 1
        return cmp

    for codepoint in unicodecharlist:
        # Up to 4 collation keys are returned; we group on the first two non-zero keys
        collationkeys = rationalizeCollationKeys(C.sort_key(codepoint))
        # print codepoint + " : " + repr(collationkeys)
        if collationkeys[0] == 0:
            continue
        
        # Not sure why case-ish transitions map to this value in the Unicode standard,
        # but this value seems to be consistently used in this way across all scripts.
        if collationkeys[1][0] != 32:
            continue
        k0 = collationkeys[0]
        k1 = collationkeys[1]
        if k0 not in charbuckets:
            charbuckets[k0] = {}
        if k1 not in charbuckets[k0]:
            charbuckets[k0][k1] = []
        charbuckets[k0][k1].append(codepoint)
    
    codepointMap = {}
    for k1 in charbuckets:
        for k2 in charbuckets[k1]:
            # This is what we are looking for: buckets containing multiple characters.
            # Find the character with the lowest sort order in the bucket according
            # to its full collation key sequence and map all of the other characters
            # in the bucket to this "smallest" character. For instance, this maps
            # "A" to "a".
            if len(charbuckets[k1][k2]) > 1:
                s = sorted(charbuckets[k1][k2], internal_sortfunc)
                for codepoint in s[1:]:
                    codepointMap[codepoint] = s[0]
    
    return codepointMap
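The function above leans on rationalizeCollationKeys, which is not shown in this excerpt. A minimal, self-contained sketch of the same idea using pyuca's sort keys directly, grouping characters by their primary weights and mapping each group to its UCA-smallest member (all names below are illustrative, not from the original):

from collections import defaultdict
from pyuca import Collator


def case_accent_equivalents(chars):
    """Map each character to the UCA-smallest character sharing its primary
    weights, e.g. 'A' and 'À' both map to 'a'."""
    collator = Collator()

    def primary(ch):
        # pyuca sort keys are the primary weights, a 0 separator, the secondary
        # weights, another 0, and so on; keep only the primary level.
        key = collator.sort_key(ch)
        return key[:key.index(0)] if 0 in key else key

    buckets = defaultdict(list)
    for ch in chars:
        p = primary(ch)
        if p:  # skip primary-ignorable characters
            buckets[p].append(ch)

    mapping = {}
    for group in buckets.values():
        if len(group) > 1:
            smallest = min(group, key=collator.sort_key)
            for ch in group:
                if ch != smallest:
                    mapping[ch] = smallest
    return mapping


# case_accent_equivalents("aAàÀb") -> {'A': 'a', 'à': 'a', 'À': 'a'}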
Example #4
    def get_ranked_documents(self, query_string):
        ranked_documents = list()

        # When the system starts with the processed collection and a query arrives,
        # the query is processed using the analyzer's function
        query_terms = Analyzer.retrieve_html_str_terms(query_string)

        # The query terms are looked up in the collection vocabulary.
        # filtered_query_terms must be sorted alphabetically so that this
        # ordering can later be used for the vector entries
        filtered_query_terms = [
            term for term in query_terms
            if term in self.__collection_vocabulary.keys()
        ]

        # Case where none of the query terms exists in the collection vocabulary
        if len(filtered_query_terms) == 0:
            return ranked_documents

        query_terms_np_array = np.array(filtered_query_terms)
        terms, counts = np.unique(query_terms_np_array, return_counts=True)
        query_vocabulary = dict(zip(terms, counts))
        max_l_freq_lq = max(counts)
        collator = Collator()
        final_query_terms = sorted(query_vocabulary.keys(),
                                   key=collator.sort_key)

        # Build the query weights vector
        query_weights_vector = self.get_query_weights_vector(
            final_query_terms, max_l_freq_lq, query_vocabulary)

        # Retrieve the postings lists for each term involved
        postings_lists = self.__collection_handler.get_postings_lists(
            final_query_terms)

        # Build the weight vectors for the documents in the postings lists
        documents_weights_vectors = SearchEngine.get_documents_weights_short_vectors(
            postings_lists, final_query_terms)

        # Compute the similarity between each posting-list document's weights and the query weights
        sorted_documents_aliases = sorted(documents_weights_vectors.keys(),
                                          key=collator.sort_key)
        query_documents_dot_products = SearchEngine.get_query_documents_dot_products(
            query_weights_vector, documents_weights_vectors,
            sorted_documents_aliases)
        query_documents_norms_products = self.get_query_documents_norms_products(
            query_weights_vector, sorted_documents_aliases)
        similarities = query_documents_dot_products / query_documents_norms_products

        # Rank the documents accordingly.
        # Documents are identified by ID in the document_entries list,
        # and the document entry of each one is what gets returned
        ascending_ranked_similarities = np.argsort(similarities)
        descending_ranked_similarities = ascending_ranked_similarities[::-1]
        for index in descending_ranked_similarities:
            document_complete_alias = sorted_documents_aliases[index]
            ranked_documents.append(document_complete_alias)

        return ranked_documents
Example #5
    def test_cafe(self):
        from pyuca import Collator
        c = Collator()

        self.assertEqual(sorted(["cafe", "caff", "café"]),
                         ["cafe", "caff", "café"])
        self.assertEqual(sorted(["cafe", "caff", "café"], key=c.sort_key),
                         ["cafe", "café", "caff"])
Example #6
def person(initial="A"):
    if initial not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        return redirect(url_for('.person', initial='A'))

    try:
        # First attempt failed: some people have no last name
        # personas = Person.query.filter(Person.last_name.ilike(initial + "%")).all()
        personas = Person.query.filter(
            or_(
                and_(Person.last_name == '',
                     Person.first_name.ilike(initial + "%")),
                Person.last_name.ilike(initial + "%"))).all()
        collator = Collator()
        personas = sorted(
            personas, key=lambda e: collator.sort_key(e.get_name().upper()))
        return render_template('public/person_initial.html',
                               initial=initial,
                               personas=personas)
    except TemplateNotFound:
        abort(404)
Example #7
class IndexLetter(SearchBase):

    def __init__(self, word):
        self._word = word
        self.searcher = None
        self.query = None
        self.collator = Collator()
        self.num_results = 0

    def get_num_results(self):
        return self.num_results

    def sort_key(self, string):
        s = string.decode("utf-8")
        return self.collator.sort_key(s)

    def get_results(self):
        if self.searcher is None:
            self.search()

        facet = FieldFacet("verb_form")
        facet = TranslateFacet(self.sort_key, facet)

        results = self.searcher.search(self.query,
                                       limit=None,
                                       sortedby=facet,
                                       collapse_limit=1,
                                       collapse='verb_form')

        self.num_results = len(results)
        return results

    def search(self):
        self.searcher = ix_letter.searcher()
        fields = []
        qs = u'index_letter:({0})'.format(self.word)
        fields.append("index_letter")
        self.query = MultifieldParser(fields, ix_letter.schema).parse(qs)

    def get_json(self):
        OK = 200
        status = OK
        results = self.get_results()
        all_results = []
        for result in results:
            verb = {}
            verb['verb_form'] = result['verb_form']
            if result['verb_form'] != result['infinitive']:
                verb['infinitive'] = result['infinitive']
            all_results.append(verb)

        return json.dumps(all_results, indent=4, separators=(',', ': ')), status
Example #8
class FromFullTest(TestCase):

    def setUp(self):
        from pyuca import Collator
        self.c = Collator()

    def test_1(self):
        self.assertEqual(
            self.c.sort_key("\u0332\u0334"),
            (0x0000, 0x004A, 0x0021, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    def test_2(self):
        self.assertEqual(
            self.c.sort_key("\u0430\u0306\u0334"),
            (0x1991, 0x0000, 0x0020, 0x004A, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    def test_3(self):
        self.assertEqual(
            self.c.sort_key("\u0FB2\u0F71\u0001\u0F80\u0061"),
            (0x2571, 0x2587, 0x258A, 0x15EB, 0x0000, 0x0020, 0x0020, 0x0020,
                0x0020, 0x0000, 0x0002, 0x0002, 0x0002, 0x0002, 0x0000)
        )

    def test_4(self):
        self.assertEqual(
            self.c.sort_key("\u4E00\u0021"),
            (0xFB40, 0xCE00, 0x025D, 0x0000, 0x0020,
                0x0020, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    def test_5(self):
        self.assertEqual(
            self.c.sort_key("\u3400\u0021"),
            (0xFB80, 0xB400, 0x025D, 0x0000, 0x0020,
                0x0020, 0x0000, 0x0002, 0x0002, 0x0000)
        )
Example #9
def sort_language_constants():
    """
    Generate the correct ordering of the constants.LANGUAGES list,
    sorted by Unicode collation.
    """
    c = Collator()
    lang_names = [
        Locale(lang).get_language_name(lang).capitalize()
        for lang in constants.LANGUAGES
    ]
    available_languages = dict(zip(lang_names, constants.LANGUAGES))
    sorted_lang_names = sorted(lang_names, key=c.sort_key)

    return [available_languages[lang_name] for lang_name in sorted_lang_names]
Example #10
def show_event(id):
    try:
        event = Event.query.filter_by(id=id).first_or_404()
    except:
        abort(404)

    # I need to prefill these variables here to simplify the template
    participantes, compositores, personas = set(), set(), set()
    for i in event.participants:
        if i.person and i.activity.name == "Compositor/a":
            i.person.is_composer = True
            compositores.add(i.person)
            personas.add(i.person)
        else:
            participantes.add(i)
            if i.person:
                personas.add(i.person)

    # Now, iterate over the performances to extract the other composers
    for p in event.performances:
        for c in p.musical_piece.composers:
            c.is_composer = True
            compositores.add(c)
            personas.add(c)
    collator = Collator()
    compositores = sorted(compositores,
                          key=lambda e: collator.sort_key(e.get_name()))
    participantes = sorted(participantes,
                           key=lambda e: collator.sort_key(e.get_name()))

    return render_template('public/detalle.html',
                           e=event,
                           participantes=participantes,
                           compositores=compositores,
                           personas=personas,
                           request=request)
Example #11
def tcompare(text1, text2): #short for "text comparer"

    c = Collator("allkeys.txt") #from James Tauber's "pyuca"

    text1split = text1.splitlines()
    text2split = text2.splitlines()

    sortedwords1 = sorted(text1split, key=c.sort_key)
    sortedwords2 = sorted(text2split, key=c.sort_key)

    for line1 in sortedwords1:
        for line2 in sortedwords2:
            if line1 == line2:
                if line1 != '':
                    print(line1)
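The nested loop above is O(n*m) and never actually exploits the sorted order. Since the goal is the non-empty lines common to both texts, printed in collation order, a set intersection (a suggested simplification, not the original's approach) prints each common line once:

from pyuca import Collator

def tcompare(text1, text2):
    """Print the non-empty lines that appear in both texts, in UCA order."""
    c = Collator("allkeys.txt")
    common = set(text1.splitlines()) & set(text2.splitlines())
    common.discard('')
    for line in sorted(common, key=c.sort_key):
        print(line)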
Example #12
    def create_language_name_map(self) -> None:
        join = os.path.join
        deploy_root = settings.DEPLOY_ROOT
        path = join(deploy_root, "locale", "language_options.json")
        output_path = join(deploy_root, "locale", "language_name_map.json")

        with open(path, "rb") as reader:
            languages = orjson.loads(reader.read())
            lang_list = []
            for lang_info in languages["languages"]:
                lang_info["name"] = lang_info["name_local"]
                del lang_info["name_local"]
                lang_list.append(lang_info)

            collator = Collator()
            lang_list.sort(key=lambda lang: collator.sort_key(lang["name"]))

        with open(output_path, "wb") as output_file:
            output_file.write(
                orjson.dumps(
                    {"name_map": lang_list},
                    option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2
                    | orjson.OPT_SORT_KEYS,
                ))
Example #13
class UnicodeCollationNormalizer(SimpleNormalizer):
    """ Use pyuca to create sort key for string
        Only, but Very, useful for sorting
    """
    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        keyPath = self.get_path(session, 'keyFile', 'allkeys.txt')
        # This is handy -- means if no pyuca, no problem
        from pyuca import Collator
        self.collator = Collator(keyPath)

    def process_string(self, session, data):
        # fix eszett sorting
        data = data.replace(u'\u00DF', 'ss')
        ints = self.collator.sort_key(data)
        exp = ["%04d" % i for i in ints]
        return ''.join(exp)
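One caveat with the "%04d" encoding (an observation about this snippet, not a change made to the original): UCA weights can exceed 9999, e.g. the 0xFB40 implicit CJK primaries seen in the FromFullTest examples, so the formatted fields are no longer fixed-width and the string comparison can disagree with the numeric key order. Formatting in four hex digits, which always covers a 16-bit weight, avoids that:

    def process_string(self, session, data):
        # fix eszett sorting
        data = data.replace(u'\u00DF', 'ss')
        ints = self.collator.sort_key(data)
        # four hex digits always cover a 16-bit collation weight
        exp = ["%04x" % i for i in ints]
        return ''.join(exp)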
Example #14
def aTergo(date):

    site = pywikibot.getSite()
    c = Collator('allkeys.txt')
    pageList = getListFromXML(
        date
    )  # if not called from afterDump.py, it can be changed to getListFromXML('foo', True) - will fetch the latest dump

    sortDict = collections.defaultdict()
    sortedDict = collections.defaultdict()

    #sweep through the dump and add reversed words to appropriate dictionaries
    for elem in pageList:
        try:
            page = Haslo(
                elem.title,
                elem.text)  #use the parser in klasa.py to parse xml entry
        except sectionsNotFound:
            continue
        except WrongHeader:
            continue
        else:
            if page.type == 3:
                for section in page.listLangs:
                    if section.lang in sortDict:
                        sortDict[section.lang].append(page.title[::-1])
                    else:
                        sortDict[section.lang] = [page.title[::-1]]

    sortedDict['afrykanerski'] = sorted(sortDict['afrykanerski'],
                                        key=c.sort_key)
    letter = sortedDict['afrykanerski'][0][0]
    text = '{| class=hiddentable style="text-align:right"\n|-'
    counter = 0
    for elem in sortedDict['afrykanerski']:
        if elem[0] == letter:
            text = text + '|[[%s|%s]]|\n' % (elem[::-1], elem)
        else:
            pywikibot.Page(
                site, 'Wikipedysta:Alkamid/atergo/afrykanerski/%s' %
                letter).put(text)
            text = ''
            letter = elem[0]
            text = text + '* %s\n' % elem
Example #15
class UnicodeCollationNormalizer(SimpleNormalizer):
    """ Use pyuca to create sort key for string
        Only, but Very, useful for sorting
    """

    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        keyPath = self.get_path(session, 'keyFile', 'allkeys.txt')
        # This is handy -- means if no pyuca, no problem
        from pyuca import Collator
        self.collator = Collator(keyPath)

    def process_string(self, session, data):
        # fix eszett sorting
        data = data.replace(u'\u00DF', 'ss')
        ints = self.collator.sort_key(data)
        exp = ["%04d" % i for i in ints]
        return ''.join(exp)
Example #16
def sort_by_name(iterable):
    """Sort by a translatable name, using pyuca for a better result."""
    c = Collator()
    key = lambda obj: c.sort_key(str(obj.name))
    return sorted(iterable, key=key)
Example #17
import sys

from pyuca import Collator
from pyuca.utils import format_sort_key


c = Collator()

prev_sort_key = None

success = 0
failure = 0

with open("CollationTest/CollationTest_NON_IGNORABLE.txt") as f:
    for line in f.readlines():
        points = line.split("#")[0].split(";")[0].strip().split()
        if points:
            test_string = "".join(
                chr(int(point, 16)) for point in points
            )
            test_string_sort_key = c.sort_key(test_string)
            x = format_sort_key(test_string_sort_key)
            if prev_sort_key:
                if prev_sort_key > test_string_sort_key:
                    failure += 1
                    print(line)
                    print(x)
                else:
                    success += 1
            prev_sort_key = test_string_sort_key
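As excerpted, the script counts conformance successes and failures but never reports them; a closing summary line (an addition for illustration, not in the original) could be:

print("{} passed, {} failed".format(success, failure))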
Example #18
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import codecs
import nltk
from nltk.corpus import stopwords
from pyuca import Collator
c = Collator("corpustxt/allkeys.txt")




arq="corpustxt/catatau.txt"


fileObj = codecs.open( arq, "r", "utf-8" )
catatau = fileObj.read() # Returns a Unicode string from the UTF-8 bytes in the file

# separa em linhas
stok = nltk.data.load('tokenizers/punkt/portuguese.pickle')
catalinhas=stok.tokenize(catatau) 



# filtra repetições
a = set(catalinhas)
frases=list(a)

# usando o padrao de ordenamento do collate pyuca para considerar acentos
frases=sorted(frases, key=c.sort_key)
Example #19
from pyuca import Collator
col = Collator()

file_obj = open("dictionary.txt", "r", encoding="utf8")

count = 0
dict = {}
english_word = ""

for line in file_obj:
    if count % 2 == 0:
        ## do something with english text
        ##print("English: " + line)
        english_word = line.rstrip("\n")

    else:
        ## do something with greek text
        ##print("Greek: " + line)

        ## check if word in the dict and if so make multiple entries
        if english_word in dict:
            dict[english_word + " (1)"] = dict.pop(english_word)
            dict[english_word + " (2)"] = line.rstrip("\n")

        else:
            dict[english_word] = line.rstrip("\n")

    count += 1

file_obj.close()
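The excerpt stops before the Collator created at the top (col) is put to use; presumably the English/Greek pairs are later sorted by the Greek side, along these lines (a sketch reusing the excerpt's col and dict names, not code from the original):

for english, greek in sorted(dict.items(), key=lambda pair: col.sort_key(pair[1])):
    print(greek, "-", english)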
Example #20
def unicode_sorted_key(key):
    return Collator().sort_key(key)
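Note that this constructs a fresh Collator, re-parsing the bundled allkeys table, on every call. If the function is used as a key= callable inside sorted(), caching a single instance is much cheaper; a minimal variant (an assumption about the call pattern, not part of the original):

from functools import lru_cache

from pyuca import Collator


@lru_cache(maxsize=1)
def _collator():
    # parse the collation table once and reuse the instance afterwards
    return Collator()


def unicode_sorted_key(key):
    return _collator().sort_key(key)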
Example #21
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import codecs
import nltk
from nltk.corpus import stopwords
from pyuca import Collator

c = Collator("corpustxt/allkeys.txt")
arq = "corpustxt/exclama.txt"

fileObj = codecs.open(arq, "r", "utf-8")
catatau = fileObj.read()  # Returns a Unicode string from the UTF-8 bytes in the file

# split into lines
stok = nltk.data.load('tokenizers/punkt/portuguese.pickle')
catalinhas = stok.tokenize(catatau)

# filter out duplicate sentences
a = set(catalinhas)
frases = list(a)

# use pyuca's collation ordering so accents sort correctly
frases = sorted(frases, key=c.sort_key)

#frases.reverse()

# ends with a question mark.
txt = ""
conta = 0
Example #22
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import codecs
import nltk
from nltk.corpus import stopwords
from pyuca import Collator
c = Collator("allkeys.txt")




arq="catatau.txt"


fileObj = codecs.open( arq, "r", "utf-8" )
catatau = fileObj.read() # Returns a Unicode string from the UTF-8 bytes in the file

# separa em linhas
stok = nltk.data.load('tokenizers/punkt/portuguese.pickle')
catalinhas=stok.tokenize(catatau) 


#limpa palavras conectivas
def cleanupDoc(s):
     stopset = set(stopwords.words('portuguese'))
     tokens = nltk.word_tokenize(s)
     cleanup = " ".join(filter(lambda word: word not in stopset, s.split()))
     return cleanup

Example #23
    def __init__(self):
        self.silpacollator = Collator(
            os.path.join(os.path.dirname(__file__), "allkeys-silpa-6.0.0.txt"))
        self.ucacollator = Collator(
            os.path.join(os.path.dirname(__file__), "allkeys-6.0.0.txt"))
Example #24
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import codecs
import nltk
from nltk.corpus import stopwords
from pyuca import Collator

c = Collator("corpustxt/allkeys.txt")
arq = "corpustxt/catatau_semlinebreaks.txt"


fileObj = codecs.open(arq, "r", "utf-8")
catatau = fileObj.read()  # Returns a Unicode string from the UTF-8 bytes in the file

# split into lines
stok = nltk.data.load("tokenizers/punkt/portuguese.pickle")
catalinhas = stok.tokenize(catatau)


# filter out duplicate sentences
a = set(catalinhas)
frases = list(a)

# use pyuca's collation ordering so accents sort correctly
frases = sorted(frases, key=c.sort_key)

# frases.reverse()


# ends with a question mark.
Example #25
#!/usr/bin/env python3

from collections import defaultdict
import sys

from pyuca import Collator

c = Collator()

filename = sys.argv[1]

entries = defaultdict(list)

key = None

with open(filename) as f:
    for line in f:
        if line.strip() == "":
            continue
        elif line.startswith("    "):
            assert key
            entries[key].append(line.rstrip())
        else:
            key = line.strip()

for key, lines in sorted(entries.items(), key=lambda i: c.sort_key(i[0])):
    print()
    print(key)
    for line in lines:
        print(line)
Example #26
    def _get_sorted_copy(self):
        '''
        Returns the SegmentPool as a list of tuples sorted appropriately for
        human consumption in *the current language*. This means that the
        _(NAME) value should determine the sort order of the outer dict and
        the _('segment_config') key should determine the order of the inner
        dicts. In both cases, the keys need to be compared in the provided
        language.

        Further note that the current language is given by get_language() and
        that this will reflect the CMS operator's user settings, NOT the current
        PAGE language.

        NOTE: the structure of the sorted pool is different. Two of the
        nested dicts are now lists of tuples so that the sort can be retained.

        _sorted_segments = [
            (/class/, {
                NAME: _(/name/),
                CFGS: [
                    (/configuration_string/, {
                        LABEL: _(/configuration_string/),
                        OVERRIDES: {
                            /user.id/: /SegmentOverride enum value/,
                            ...
                        },
                        INSTANCES: [ ... ]
                    })
                ]
            })
        ]

        NOTE: On Python 3.0+ systems, we depend on pyuca for collation, which
        produces excellent results. On earlier systems, this is not available,
        so, we use a cruder mapping of accented characters into their
        unaccented ASCII equivalents.
        '''

        sort_key = None
        if sys.version_info >= (3, 0):
            uca = None
            #
            # Unfortunately, the pyuca class–which can provide collation of
            # strings in a thread-safe manner–is for Python 3.0+ only
            #
            try:
                from pyuca import Collator
                uca = Collator()
                sort_key = uca.sort_key
            except:
                pass

        if not sort_key:
            #
            # Our fallback position is to use a more simple approach of
            # mapping 'accented' chars to latin equivalents before sorting,
            # this is crude, but better than nothing.
            #
            from .unaccent import unaccented_map

            def sort_key(s):
                return s.translate(unaccented_map())

        pool = self.segments
        clone = []
        for cls_key in sorted(pool.keys()):
            cls_dict = {
                self.NAME: pool[cls_key][self.NAME],
                self.CFGS: list(),
            }
            clone.append((cls_key, cls_dict))
            # We'll build the CFG as a list in arbitrary order for now...
            for cfg_key in pool[cls_key][self.CFGS]:
                cfg_dict = {
                    self.LABEL: pool[cls_key][self.CFGS][cfg_key][self.LABEL],
                    self.OVERRIDES: dict(),
                    self.INSTANCES: list(),
                }
                for username, override in pool[cls_key][self.CFGS][cfg_key][
                        self.OVERRIDES].items():
                    cfg_dict[self.OVERRIDES][username] = override
                for instance in pool[cls_key][self.CFGS][cfg_key][
                        self.INSTANCES]:
                    cfg_dict[self.INSTANCES].append(instance)
                cls_dict[self.CFGS].append((cfg_key, cfg_dict))
            #
            # Now, sort the CFGS by their LABEL, using whichever means we
            # have available to us at this moment.
            #
            cls_dict[self.CFGS] = sorted(
                cls_dict[self.CFGS],
                key=lambda x: sort_key(force_text(x[1][self.LABEL])))

        return clone
Example #27
def sort_by_name(iterable):
    """Sort by a translatable name, using pyuca for a better result."""
    c = Collator()
    key = lambda obj: c.sort_key(str(obj.name))
    return sorted(iterable, key=key)
Example #28
from distutils.dir_util import copy_tree
from os.path import isfile, join
from subprocess import call
from tempfile import mkdtemp

from django.conf import settings
from django.core.management.base import CommandError
from django.template import Template
from django.utils import translation

from django_countries import countries
from pyuca import Collator

from hosting.models import Place

c = Collator()


class LatexCommand(object):
    template_name = 'PasportaServo.tex'
    address_only = False
    make_pdf = False
    tex_files = [
        template_name,
        'pages/title.tex',
        'pages/address.tex',
    ]

    def activate_translation(self):
        translation.activate(settings.LANGUAGE_CODE)
Example #29
    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        keyPath = self.get_path(session, 'keyFile', 'allkeys.txt')
        # This is handy -- means if no pyuca, no problem
        from pyuca import Collator
        self.collator = Collator(keyPath)
Example #30
    def __init__(self, *args, **kwargs):
        from pyuca import Collator
        super(FromFullTest, self).__init__(*args, **kwargs)
        self.c = Collator()
        (0, 74, 33, 0, 2, 2, 0)
Example #31
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import codecs
import nltk
from nltk.corpus import stopwords
from pyuca import Collator
c = Collator("allkeys.txt")

arq = "catatau.txt"

fileObj = codecs.open(arq, "r", "utf-8")
catatau = fileObj.read()  # Returns a Unicode string from the UTF-8 bytes in the file

# split into lines
stok = nltk.data.load('tokenizers/punkt/portuguese.pickle')
catalinhas = stok.tokenize(catatau)

# filter out duplicate sentences
a = set(catalinhas)
frases = list(a)

# use pyuca's collation ordering so accents sort correctly
frases = sorted(frases, key=c.sort_key)

#frases.reverse()

# ends with a question mark.
txt = ""
conta = 0
Example #32
#!/usr/bin/env python

import sys
import unicodedata

from pyuca import Collator
collator = Collator()

from morphgnt.utils import load_yaml

def n(x):
    return unicodedata.normalize("NFKC", x)

lexemes = load_yaml("lexemes.yaml")

headwords = set()
with open("../data-cleanup/bdag-headwords/bdag_headwords.txt") as f:
    for line in f:
        headwords.add(n(line.strip().decode("utf-8")))

existing_not_in_headwords = []
missing_not_in_headwords = []
added = []
for lexeme, metadata in sorted(lexemes.items(), key=lambda x: collator.sort_key(x[0])):
    if "bdag-headword" in metadata:
        print "{}:\n    pos: {}\n    bdag-headword: {}".format(lexeme.encode("utf-8"), metadata["pos"], metadata["bdag-headword"].encode("utf-8"))
        if metadata["bdag-headword"] not in headwords:
            existing_not_in_headwords.append(metadata["bdag-headword"].encode("utf-8"))
    else:
        if lexeme in headwords:
            print "{}:\n    pos: {}\n    bdag-headword: {}".format(lexeme.encode("utf-8"), metadata["pos"], lexeme.encode("utf-8"))
Example #33
def test_authors_ordering():
    with open('AUTHORS', 'rt', encoding='utf8') as fh:
        authors = fh.readlines()
    ordered_authors = sorted(authors, key=Collator().sort_key)
    assert authors == ordered_authors
Example #34
"""PytSite Object Document Mapper UI Plugin Widgets
"""
__author__ = 'Oleksandr Shepetko'
__email__ = '*****@*****.**'
__license__ = 'MIT'

import htmler
from typing import List, Callable, Union, Iterable, Tuple
from pyuca import Collator
from json import dumps as json_dumps
from pytsite import lang
from plugins import widget, odm, http_api, odm_http_api

_pyuca_col = Collator()


def _sanitize_kwargs_exclude(kwargs: dict):
    if not ('exclude' in kwargs and kwargs['exclude']):
        return

    if isinstance(kwargs['exclude'], odm.Entity):
        kwargs['exclude'] = [kwargs['exclude'].ref] if not kwargs['exclude'].is_new else []
    elif isinstance(kwargs['exclude'], str):
        kwargs['exclude'] = [kwargs['exclude']]
    elif isinstance(kwargs['exclude'], (list, tuple)):
        ex = []
        for item in kwargs['exclude']:
            if isinstance(item, odm.Entity):
                if not item.is_new:
                    ex.append(item.ref)
Example #35
class FromFullTest(unittest.TestCase):

    def __init__(self, *args, **kwargs):
        from pyuca import Collator
        super(FromFullTest, self).__init__(*args, **kwargs)
        self.c = Collator()
        (0, 74, 33, 0, 2, 2, 0)

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_1(self):
        self.assertEqual(
            self.c.sort_key("\u0332\u0334"),
            (0x0000, 0x004A, 0x0021, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    @unittest.skipIf(V8_0_0, "not for UCA version 8.0.0")
    @unittest.skipIf(V10_0_0, "not for UCA version 10.0.0")
    def test_2(self):
        self.assertEqual(
            self.c.sort_key("\u0430\u0306\u0334"),
            (0x1991, 0x0000, 0x0020, 0x004A, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    @unittest.skipIf(V8_0_0, "not for UCA version 8.0.0")
    @unittest.skipIf(V10_0_0, "not for UCA version 10.0.0")
    def test_3(self):
        self.assertEqual(
            self.c.sort_key("\u0FB2\u0F71\u0001\u0F80\u0061"),
            (0x2571, 0x2587, 0x258A, 0x15EB, 0x0000, 0x0020, 0x0020, 0x0020,
                0x0020, 0x0000, 0x0002, 0x0002, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    @unittest.skipIf(V8_0_0, "not for UCA version 8.0.0")
    @unittest.skipIf(V10_0_0, "not for UCA version 10.0.0")
    def test_4(self):
        self.assertEqual(
            self.c.sort_key("\u4E00\u0021"),
            (0xFB40, 0xCE00, 0x025D, 0x0000, 0x0020,
                0x0020, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    @unittest.skipIf(V8_0_0, "not for UCA version 8.0.0")
    @unittest.skipIf(V10_0_0, "not for UCA version 10.0.0")
    def test_5(self):
        self.assertEqual(
            self.c.sort_key("\u3400\u0021"),
            (0xFB80, 0xB400, 0x025D, 0x0000, 0x0020,
                0x0020, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_1_old(self):
        self.assertEqual(
            self.c.sort_key("\u0332\u0334"),
            (0x0000, 0x007C, 0x0021, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_2_old(self):
        self.assertEqual(
            self.c.sort_key("\u0430\u0306\u0334"),
            (0x15B0, 0x0000, 0x0020, 0x007C, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_3_old(self):
        self.assertEqual(
            self.c.sort_key("\u0FB2\u0F71\u0001\u0F80\u0061"),
            (0x205B, 0x206D, 0x2070, 0x120F, 0x0000, 0x0020, 0x0020, 0x0020,
                0x0020, 0x0000, 0x0002, 0x0002, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_4_old(self):
        self.assertEqual(
            self.c.sort_key("\u4E00\u0021"),
            (0xFB40, 0xCE00, 0x026E, 0x0000, 0x0020,
                0x0020, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_5_old(self):
        self.assertEqual(
            self.c.sort_key("\u3400\u0021"),
            (0xFB80, 0xB400, 0x026E, 0x0000, 0x0020,
                0x0020, 0x0000, 0x0002, 0x0002, 0x0000)
        )
Example #36
    def __init__(self, *args, **kwargs):
        from pyuca import Collator
        super(FromFullTest, self).__init__(*args, **kwargs)
        self.c = Collator()
        (0, 74, 33, 0, 2, 2, 0)
Example #37
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import codecs
import nltk
from nltk.corpus import stopwords
from pyuca import Collator
from string import ascii_lowercase


c = Collator("corpustxt/allkeys.txt")


# file being analyzed (in the same directory)
arq = "corpustxt/catatau_semlinebreaks.txt"
fileObj = codecs.open(arq, "r", "utf-8")
mikrofesto = fileObj.read()

# split into lines
stok = nltk.data.load('tokenizers/punkt/portuguese.pickle')
catalinhas = stok.tokenize(mikrofesto)


# separate punctuation from word endings and other tokens
tokens = nltk.word_tokenize(mikrofesto)

# strip connective (stop) words
#cleanupDoc(tokens)

# format into an NLTK data structure for later standardization
Example #38
#!/usr/bin/env python
"""
C.11.5 Index and Glossary (p211)

"""

import string, os
from plasTeX.Tokenizer import Token, EscapeSequence
from plasTeX import Command, Environment
from plasTeX.Logging import getLogger
from Sectioning import SectionUtils

try:
    from pyuca import Collator
    collator = Collator(os.path.join(os.path.dirname(__file__),
                                     'allkeys.txt')).sort_key
except ImportError:
    collator = lambda x: x.lower()


class IndexUtils(object):
    """ Helper functions for generating indexes """

    linkType = 'index'
    level = Command.CHAPTER_LEVEL

    class Index(Command):
        """
        Utility class used to surface the index entries to the renderer
    
        """
Example #39
class FromFullTest(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        from pyuca import Collator
        super(FromFullTest, self).__init__(*args, **kwargs)
        self.c = Collator()
        (0, 74, 33, 0, 2, 2, 0)

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_1(self):
        self.assertEqual(
            self.c.sort_key("\u0332\u0334"),
            (0x0000, 0x004A, 0x0021, 0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_2(self):
        self.assertEqual(
            self.c.sort_key("\u0430\u0306\u0334"),
            (0x1991, 0x0000, 0x0020, 0x004A, 0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_3(self):
        self.assertEqual(
            self.c.sort_key("\u0FB2\u0F71\u0001\u0F80\u0061"),
            (0x2571, 0x2587, 0x258A, 0x15EB, 0x0000, 0x0020, 0x0020, 0x0020,
             0x0020, 0x0000, 0x0002, 0x0002, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_4(self):
        self.assertEqual(self.c.sort_key("\u4E00\u0021"),
                         (0xFB40, 0xCE00, 0x025D, 0x0000, 0x0020, 0x0020,
                          0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_5(self):
        self.assertEqual(self.c.sort_key("\u3400\u0021"),
                         (0xFB80, 0xB400, 0x025D, 0x0000, 0x0020, 0x0020,
                          0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_1_old(self):
        self.assertEqual(
            self.c.sort_key("\u0332\u0334"),
            (0x0000, 0x007C, 0x0021, 0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_2_old(self):
        self.assertEqual(
            self.c.sort_key("\u0430\u0306\u0334"),
            (0x15B0, 0x0000, 0x0020, 0x007C, 0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_3_old(self):
        self.assertEqual(
            self.c.sort_key("\u0FB2\u0F71\u0001\u0F80\u0061"),
            (0x205B, 0x206D, 0x2070, 0x120F, 0x0000, 0x0020, 0x0020, 0x0020,
             0x0020, 0x0000, 0x0002, 0x0002, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_4_old(self):
        self.assertEqual(self.c.sort_key("\u4E00\u0021"),
                         (0xFB40, 0xCE00, 0x026E, 0x0000, 0x0020, 0x0020,
                          0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_5_old(self):
        self.assertEqual(self.c.sort_key("\u3400\u0021"),
                         (0xFB80, 0xB400, 0x026E, 0x0000, 0x0020, 0x0020,
                          0x0000, 0x0002, 0x0002, 0x0000))
Example #40
from smithers import data_types
from smithers import redis_keys as rkeys
from smithers.utils import get_epoch_minute

if settings.ENABLE_REDIS:
    redis = get_redis_connection('smithers')
else:
    redis = False

TWITTER_URL = 'https://twitter.com/share'
FB_URL = 'https://www.facebook.com/sharer/sharer.php'
COUNT_FOOTNOTE = ('<a href="#number-modal" class="number-help" '
                  'data-toggle="modal" title="{}">'
                  '<span class="share_total"></span>'
                  '<i class="fa fa-question-circle"></i></a>')
uca_collator = Collator()


def uca_sort_key(country):
    """Sort key function using pyuca on the 2nd element of the argument."""
    return uca_collator.sort_key(country[1])


def get_tw_share_url(**kwargs):
    kwargs.setdefault('dnt', 'true')
    text = kwargs.get('text')
    if text:
        kwargs['text'] = text.encode('utf8')
    return '?'.join([TWITTER_URL, urlencode(kwargs)])

Example #41
#!/usr/bin/env python

import sys

from pyuca import Collator
collator = Collator()

from morphgnt.utils import load_yaml
from morphgnt.utils import nfkc_normalize as n

danker = load_yaml("../data-cleanup/danker-concise-lexicon/components.yaml")

greenlee = {}
with open("../data-cleanup/greenlee-morphology/morphemes-utf8.txt") as f:
    for line in f:
        key, value = line.strip().split("\t")
        greenlee[n(key.decode("utf-8")).split(",")[0]] = {
            "full-entry": n(key.decode("utf-8")),
            "components": n(value.decode("utf-8")),
        }

words = [n(word) for word in set(danker.keys()).union(set(greenlee.keys()))]

count = 0
for word in sorted(words, key=collator.sort_key):
    count += 1
    print "{}:".format(word.encode("utf-8"))
    if word in danker:
        print "    danker-full-entry: \"{}\"".format(danker[word]["full-entry"].encode("utf-8"))
        print "    danker-components: \"{}\"".format(danker[word]["components"].encode("utf-8"))
    if word in greenlee:
Example #42
    def setUp(self):
        from pyuca import Collator
        self.c = Collator()