Example #1
 def __init__(self):
     self.template = os.path.join(os.path.dirname(__file__), "sort.html")
     self.silpacollator = Collator(
         os.path.join(os.path.dirname(__file__), "allkeys-silpa-6.0.0.txt"))
     self.ucacollator = Collator(
         os.path.join(os.path.dirname(__file__), "allkeys-6.0.0.txt"))
     self.response = SilpaResponse(self.template)
Example #2
    def get_ranked_documents(self, query_string):
        ranked_documents = list()

        # When the system starts with the processed collection and a query arrives,
        # the query is processed using the analyzer function
        query_terms = Analyzer.retrieve_html_str_terms(query_string)

        # Look up the query terms in the vocabulary
        # filtered_query_terms must be sorted alphabetically so that this
        # ordering can later be used for the entries of the vectors
        filtered_query_terms = [
            term for term in query_terms
            if term in self.__collection_vocabulary.keys()
        ]

        # Case where none of the query terms exists in the collection vocabulary
        if len(filtered_query_terms) == 0:
            return ranked_documents

        query_terms_np_array = np.array(filtered_query_terms)
        terms, counts = np.unique(query_terms_np_array, return_counts=True)
        query_vocabulary = dict(zip(terms, counts))
        max_l_freq_lq = max(counts)
        collator = Collator()
        final_query_terms = sorted(query_vocabulary.keys(),
                                   key=collator.sort_key)

        # Obtain the query's weight vector
        query_weights_vector = self.get_query_weights_vector(
            final_query_terms, max_l_freq_lq, query_vocabulary)

        # Retrieve the postings lists for each word involved
        postings_lists = self.__collection_handler.get_postings_lists(
            final_query_terms)

        # Obtain the weight vectors for the documents in the postings lists
        documents_weights_vectors = SearchEngine.get_documents_weights_short_vectors(
            postings_lists, final_query_terms)

        # Compute the similarity between each posting-list document's weights and the query's weights
        sorted_documents_aliases = sorted(documents_weights_vectors.keys(),
                                          key=collator.sort_key)
        query_documents_dot_products = SearchEngine.get_query_documents_dot_products(
            query_weights_vector, documents_weights_vectors,
            sorted_documents_aliases)
        query_documents_norms_products = self.get_query_documents_norms_products(
            query_weights_vector, sorted_documents_aliases)
        similarities = query_documents_dot_products / query_documents_norms_products

        # Produce the ranking used to order the documents
        # Documents are identified by ID in the document_entries list,
        # and what is returned is each one's document entry
        ascending_ranked_similarities = np.argsort(similarities)
        descending_ranked_similarities = ascending_ranked_similarities[::-1]
        for index in descending_ranked_similarities:
            document_complete_alias = sorted_documents_aliases[index]
            ranked_documents.append(document_complete_alias)

        return ranked_documents
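
The ranking above is plain cosine similarity: the dot product of the query and document weight vectors divided by the product of their norms, ordered from most to least similar. A minimal, self-contained sketch of the same idea with toy numpy vectors (every name and value below is illustrative and not part of the class above):

import numpy as np

query = np.array([0.5, 0.0, 1.0])                  # toy query weight vector
docs = {
    "doc_a": np.array([0.4, 0.1, 0.9]),
    "doc_b": np.array([0.0, 1.0, 0.0]),
}

aliases = sorted(docs)                             # fixed document ordering
dots = np.array([query @ docs[a] for a in aliases])
norms = np.array([np.linalg.norm(query) * np.linalg.norm(docs[a])
                  for a in aliases])
similarities = dots / norms

# Highest similarity first, mirroring np.argsort(...)[::-1] above
ranking = [aliases[i] for i in np.argsort(similarities)[::-1]]
print(ranking)                                     # ['doc_a', 'doc_b']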
Example #3
    def test_cafe(self):
        from pyuca import Collator
        c = Collator()

        self.assertEqual(sorted(["cafe", "caff", "café"]),
                         ["cafe", "caff", "café"])
        self.assertEqual(sorted(["cafe", "caff", "café"], key=c.sort_key),
                         ["cafe", "café", "caff"])
Example #4
def sort_language_constants():
    """
    function to generate correct ordering of constants.LANGUAGES list
    sorted by Unicode characters
    """
    c = Collator()
    lang_names = [
        Locale(lang).get_language_name(lang).capitalize()
        for lang in constants.LANGUAGES
    ]
    available_languages = dict(zip(lang_names, constants.LANGUAGES))
    sorted_lang_names = sorted(lang_names, key=c.sort_key)

    return [available_languages[lang_name] for lang_name in sorted_lang_names]
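
A usage sketch for the helper above, under the assumption (not shown in the snippet) that constants.LANGUAGES holds locale codes such as ['el', 'de', 'fr'] and that Locale is babel-style, so the native names are 'Ελληνικά', 'Deutsch' and 'Français':

# Purely illustrative: with the assumptions above, the native names collate as
# Deutsch < Français < Ελληνικά (Latin script sorts before Greek in the default
# collation table), so the language codes come back reordered accordingly.
print(sort_language_constants())   # ['de', 'fr', 'el']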
Example #5
def tcompare(text1, text2): #short for "text comparer"

    c = Collator("allkeys.txt") #from James Tauber's "pyuca"

    text1split = text1.splitlines()
    text2split = text2.splitlines()

    sortedwords1 = sorted(text1split, key=c.sort_key)
    sortedwords2 = sorted(text2split, key=c.sort_key)

    for line1 in sortedwords1:
        for line2 in sortedwords2:
            if line1 == line2:
                if line1 != '':
                    print(line1)
Example #6
def aTergo(date):

    site = pywikibot.getSite()
    c = Collator('allkeys.txt')
    pageList = getListFromXML(
        date
    )  # if not called from afterDump.py, it can be changed to getListFromXML('foo', True) - will fetch the latest dump

    sortDict = collections.defaultdict()
    sortedDict = collections.defaultdict()

    #sweep through the dump and add reversed words to appropriate dictionaries
    for elem in pageList:
        try:
            page = Haslo(
                elem.title,
                elem.text)  #use the parser in klasa.py to parse xml entry
        except sectionsNotFound:
            continue
        except WrongHeader:
            continue
        else:
            if page.type == 3:
                for section in page.listLangs:
                    if section.lang in sortDict:
                        sortDict[section.lang].append(page.title[::-1])
                    else:
                        sortDict[section.lang] = [page.title[::-1]]

    sortedDict['afrykanerski'] = sorted(sortDict['afrykanerski'],
                                        key=c.sort_key)
    letter = sortedDict['afrykanerski'][0][0]
    text = '{| class=hiddentable style="text-align:right"\n|-'
    counter = 0
    for elem in sortedDict['afrykanerski']:

        if elem[0] == letter:
            text = text + '|[[%s|%s]]|\n' % (elem[::-1], elem)
        else:
            pywikibot.Page(
                site, 'Wikipedysta:Alkamid/atergo/afrykanerski/%s' %
                letter).put(text)
            text = ''
            letter = elem[0]
            text = text + '* %s\n' % elem
Example #7
def person(initial="A"):
    if initial not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        return redirect(url_for('.person', initial='A'))

    try:
        # First attempt failed: some people have no last name
        # personas = Person.query.filter(Person.last_name.ilike(initial + "%")).all()
        personas = Person.query.filter(
            or_(
                and_(Person.last_name == '',
                     Person.first_name.ilike(initial + "%")),
                Person.last_name.ilike(initial + "%"))).all()
        collator = Collator()
        personas = sorted(
            personas, key=lambda e: collator.sort_key(e.get_name().upper()))
        return render_template('public/person_initial.html',
                               initial=initial,
                               personas=personas)
    except TemplateNotFound:
        abort(404)
Example #8
def show_event(id):
    try:
        event = Event.query.filter_by(id=id).first_or_404()
    except:
        abort(404)

    # I need to prefill these variables here to simplify the template
    participantes, compositores, personas = set(), set(), set()
    for i in event.participants:
        if i.person and i.activity.name == "Compositor/a":
            i.person.is_composer = True
            compositores.add(i.person)
            personas.add(i.person)
        else:
            participantes.add(i)
            if i.person:
                personas.add(i.person)

    # Now, iterate over the performances to extract the other composers
    for p in event.performances:
        for c in p.musical_piece.composers:
            c.is_composer = True
            compositores.add(c)
            personas.add(c)
    collator = Collator()
    compositores = sorted(compositores,
                          key=lambda e: collator.sort_key(e.get_name()))
    participantes = sorted(participantes,
                           key=lambda e: collator.sort_key(e.get_name()))

    return render_template('public/detalle.html',
                           e=event,
                           participantes=participantes,
                           compositores=compositores,
                           personas=personas,
                           request=request)
Example #9
    def create_language_name_map(self) -> None:
        join = os.path.join
        deploy_root = settings.DEPLOY_ROOT
        path = join(deploy_root, "locale", "language_options.json")
        output_path = join(deploy_root, "locale", "language_name_map.json")

        with open(path, "rb") as reader:
            languages = orjson.loads(reader.read())
            lang_list = []
            for lang_info in languages["languages"]:
                lang_info["name"] = lang_info["name_local"]
                del lang_info["name_local"]
                lang_list.append(lang_info)

            collator = Collator()
            lang_list.sort(key=lambda lang: collator.sort_key(lang["name"]))

        with open(output_path, "wb") as output_file:
            output_file.write(
                orjson.dumps(
                    {"name_map": lang_list},
                    option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2
                    | orjson.OPT_SORT_KEYS,
                ))
Example #10
def test_authors_ordering():
    with open('AUTHORS', 'rt', encoding='utf8') as fh:
        authors = fh.readlines()
    ordered_authors = sorted(authors, key=Collator().sort_key)
    assert authors == ordered_authors
Example #11
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import codecs
import nltk
from nltk.corpus import stopwords
from pyuca import Collator
c = Collator("allkeys.txt")

arq = "catatau.txt"

fileObj = codecs.open(arq, "r", "utf-8")
catatau = fileObj.read()  # returns a Unicode string from the UTF-8 bytes in the file

# split into lines
stok = nltk.data.load('tokenizers/punkt/portuguese.pickle')
catalinhas = stok.tokenize(catatau)

# filter out repetitions
a = set(catalinhas)
frases = list(a)

# use pyuca's collation ordering so that accents are handled correctly
frases = sorted(frases, key=c.sort_key)

#frases.reverse()

# ends with a question mark.
txt = ""
conta = 0
Example #12
 def __init__(self, session, config, parent):
     SimpleNormalizer.__init__(self, session, config, parent)
     keyPath = self.get_path(session, 'keyFile', 'allkeys.txt')
     # This is handy -- means if no pyuca, no problem
     from pyuca import Collator
     self.collator = Collator(keyPath)
Example #13
def sort_by_name(iterable):
    """Sort by a translatable name, using pyuca for a better result."""
    c = Collator()
    key = lambda obj: c.sort_key(str(obj.name))
    return sorted(iterable, key=key)
Example #14
    def _get_sorted_copy(self):
        '''
        Returns the SegmentPool as a list of tuples sorted appropriately for
        human consumption in *the current language*. This means that the
        _(NAME) value should determine the sort order of the outer dict and
        the _('segment_config') key should determine the order of the inner
        dicts. In both cases, the keys need to be compared in the provided
        language.

        Further note that the current language is given by get_language() and
        that this will reflect the CMS operator's user settings, NOT the current
        PAGE language.

        NOTE: the structure of the sorted pool is different. Two of the
        nested dicts are now lists of tuples so that the sort order can be retained.

        _sorted_segments = [
            (/class/, {
                NAME: _(/name/),
                CFGS: [
                    (/configuration_string/, {
                        LABEL: _(/configuration_string/),
                        OVERRIDES: {
                            /user.id/: /SegmentOverride enum value/,
                            ...
                        },
                        INSTANCES: [ ... ]
                    })
                ]
            })
        ]

        NOTE: On Python 3.0+ systems, we depend on pyuca for collation, which
        produces excellent results. On earlier systems, this is not available,
        so we use a cruder mapping of accented characters onto their
        unaccented ASCII equivalents.
        '''

        sort_key = None
        if sys.version_info >= (3, 0):
            uca = None
            #
            # Unfortunately, the pyuca class, which can provide collation of
            # strings in a thread-safe manner, is for Python 3.0+ only
            #
            try:
                from pyuca import Collator
                uca = Collator()
                sort_key = uca.sort_key
            except:
                pass

        if not sort_key:
            #
            # Our fallback position is to use a simpler approach of mapping
            # 'accented' chars to latin equivalents before sorting; this is
            # crude, but better than nothing.
            #
            from .unaccent import unaccented_map

            def sort_key(s):
                return s.translate(unaccented_map())

        pool = self.segments
        clone = []
        for cls_key in sorted(pool.keys()):
            cls_dict = {
                self.NAME: pool[cls_key][self.NAME],
                self.CFGS: list(),
            }
            clone.append((cls_key, cls_dict))
            # We'll build the CFG as a list in arbitrary order for now...
            for cfg_key in pool[cls_key][self.CFGS]:
                cfg_dict = {
                    self.LABEL: pool[cls_key][self.CFGS][cfg_key][self.LABEL],
                    self.OVERRIDES: dict(),
                    self.INSTANCES: list(),
                }
                for username, override in pool[cls_key][self.CFGS][cfg_key][
                        self.OVERRIDES].items():
                    cfg_dict[self.OVERRIDES][username] = override
                for instance in pool[cls_key][self.CFGS][cfg_key][
                        self.INSTANCES]:
                    cfg_dict[self.INSTANCES].append(instance)
                cls_dict[self.CFGS].append((cfg_key, cfg_dict))
            #
            # Now, sort the CFGS by their LABEL, using whichever means we
            # have available to us at this moment.
            #
            cls_dict[self.CFGS] = sorted(
                cls_dict[self.CFGS],
                key=lambda x: sort_key(force_text(x[1][self.LABEL])))

        return clone
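
The docstring above describes the shape of the sorted pool; here is a small, hand-written illustration of what the returned list could look like for a hypothetical 'CountrySegment' class (every name and value is invented, and the plain strings stand in for the NAME/CFGS/LABEL/OVERRIDES/INSTANCES class constants):

# Hypothetical output of _get_sorted_copy(), for illustration only.
sorted_pool = [
    ('CountrySegment', {
        'NAME': 'Country',                      # _(/name/), already translated
        'CFGS': [                               # list of tuples, sorted by LABEL
            ('France', {
                'LABEL': 'France',              # _(/configuration_string/)
                'OVERRIDES': {'4': 2},          # /user.id/: SegmentOverride value
                'INSTANCES': ['<plugin 12>'],
            }),
            ('Österreich', {
                'LABEL': 'Österreich',          # collates after 'France' under pyuca
                'OVERRIDES': {},
                'INSTANCES': ['<plugin 7>', '<plugin 9>'],
            }),
        ],
    }),
]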
Example #15
def unicode_sorted_key(key):
    return Collator().sort_key(key)
Example #16
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import codecs
import nltk
from nltk.corpus import stopwords
from pyuca import Collator
from string import ascii_lowercase


c = Collator("corpustxt/allkeys.txt")


# file to analyze (in the same directory)
arq = "corpustxt/catatau_semlinebreaks.txt"
fileObj = codecs.open(arq, "r", "utf-8")
mikrofesto = fileObj.read()

# split into lines
stok = nltk.data.load('tokenizers/punkt/portuguese.pickle')
catalinhas = stok.tokenize(mikrofesto)



# separate punctuation from the ends of words and other tokens
tokens = nltk.word_tokenize(mikrofesto)

# clean out connectives
#cleanupDoc(tokens)

# format into an nltk data structure for later standardization
Example #17
 def __init__(self, word):
     self._word = word
     self.searcher = None
     self.query = None
     self.collator = Collator()
     self.num_results = 0
Example #18
 def __init__(self, *args, **kwargs):
     from pyuca import Collator
     super(FromFullTest, self).__init__(*args, **kwargs)
     self.c = Collator()
     (0, 74, 33, 0, 2, 2, 0)
Example #19
#!/usr/bin/env python3

import csv

from pyuca import Collator
c = Collator("bin/allkeys.txt")

codes = {}

lists = {
    'country': {
        'title': 'Country register',
        'url': 'http://country.openregister.org',
        'publisher': 'foreign-commonwealth-office',
        'format': 'register',
        'path': '../data/country/countries.tsv'
    },
    'territory': {
        'title': 'Territory register',
        'url': 'http://territory.openregister.org',
        'publisher': 'foreign-commonwealth-office',
        'format': 'register',
        'path': '../data/territory/territories.tsv'
    },
    'uk': {
        'title': 'UK register',
        'url': 'http://uk.openregister.org',
        'publisher': 'cabinet-office',
        'format': 'register',
        'path': '../data/uk/uk.tsv'
    },
Example #20
from smithers import data_types
from smithers import redis_keys as rkeys
from smithers.utils import get_epoch_minute

if settings.ENABLE_REDIS:
    redis = get_redis_connection('smithers')
else:
    redis = False

TWITTER_URL = 'https://twitter.com/share'
FB_URL = 'https://www.facebook.com/sharer/sharer.php'
COUNT_FOOTNOTE = ('<a href="#number-modal" class="number-help" '
                  'data-toggle="modal" title="{}">'
                  '<span class="share_total"></span>'
                  '<i class="fa fa-question-circle"></i></a>')
uca_collator = Collator()


def uca_sort_key(country):
    """Sort key function using pyuca on the 2nd element of the argument."""
    return uca_collator.sort_key(country[1])


def get_tw_share_url(**kwargs):
    kwargs.setdefault('dnt', 'true')
    text = kwargs.get('text')
    if text:
        kwargs['text'] = text.encode('utf8')
    return '?'.join([TWITTER_URL, urlencode(kwargs)])
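
The uca_sort_key helper above is meant to be passed as the key function when sorting (code, name) pairs. A short usage sketch with made-up country tuples (the data is invented for illustration):

countries = [('ch', 'Schweiz'), ('at', 'Österreich'), ('de', 'Deutschland')]
# UCA collation keeps 'Österreich' with the other names instead of pushing it
# past 'Z' the way a plain codepoint sort of 'Ö' (U+00D6) would.
print(sorted(countries, key=uca_sort_key))
# [('de', 'Deutschland'), ('at', 'Österreich'), ('ch', 'Schweiz')]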

Example #21
from pyuca import Collator
col = Collator()

file_obj = open("dictionary.txt", "r", encoding="utf8")

count = 0
dict = {}
english_word = ""

for line in file_obj:
    if count % 2 == 0:
        ## do something with english text
        ##print("English: " + line)
        english_word = line.rstrip("\n")

    else:
        ## do something with greek text
        ##print("Greek: " + line)

        ## check if word in the dict and if so make multiple entries
        if english_word in dict:
            dict[english_word + " (1)"] = dict.pop(english_word)
            dict[english_word + " (2)"] = line.rstrip("\n")

        else:
            dict[english_word] = line.rstrip("\n")

    count += 1

file_obj.close()
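
The portion shown above builds the English-to-Greek mapping but never uses the collator created at the top; the script's continuation is not shown. Purely as an illustration of what col.sort_key could be used for at this point, the entries could be ordered by their Greek values like this:

# Illustrative only, not part of the original script: order the entries by the
# Greek translation using the UCA sort key so accented forms sort correctly.
for english, greek in sorted(dict.items(), key=lambda kv: col.sort_key(kv[1])):
    print(greek, "-", english)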
Example #22
"""PytSite Object Document Mapper UI Plugin Widgets
"""
__author__ = 'Oleksandr Shepetko'
__email__ = '*****@*****.**'
__license__ = 'MIT'

import htmler
from typing import List, Callable, Union, Iterable, Tuple
from pyuca import Collator
from json import dumps as json_dumps
from pytsite import lang
from plugins import widget, odm, http_api, odm_http_api

_pyuca_col = Collator()


def _sanitize_kwargs_exclude(kwargs: dict):
    if not ('exclude' in kwargs and kwargs['exclude']):
        return

    if isinstance(kwargs['exclude'], odm.Entity):
        kwargs['exclude'] = [kwargs['exclude'].ref
                             ] if not kwargs['exclude'].is_new else []
    elif isinstance(kwargs['exclude'], str):
        kwargs['exclude'] = [kwargs['exclude']]
    elif isinstance(kwargs['exclude'], (list, tuple)):
        ex = []
        for item in kwargs['exclude']:
            if isinstance(item, odm.Entity):
                if not item.is_new:
                    ex.append(item.ref)
Example #23
#!/usr/bin/env python
"""
C.11.5 Index and Glossary (p211)

"""

import string, os
from plasTeX.Tokenizer import Token, EscapeSequence
from plasTeX import Command, Environment
from plasTeX.Logging import getLogger
from Sectioning import SectionUtils

try:
    from pyuca import Collator
    collator = Collator(os.path.join(os.path.dirname(__file__),
                                     'allkeys.txt')).sort_key
except ImportError:
    collator = lambda x: x.lower()


class IndexUtils(object):
    """ Helper functions for generating indexes """

    linkType = 'index'
    level = Command.CHAPTER_LEVEL

    class Index(Command):
        """
        Utility class used to surface the index entries to the renderer
    
        """
Example #24
 def __init__(self):
     self.silpacollator = Collator(
         os.path.join(os.path.dirname(__file__), "allkeys-silpa-6.0.0.txt"))
     self.ucacollator = Collator(
         os.path.join(os.path.dirname(__file__), "allkeys-6.0.0.txt"))
Example #25
    def process_collection(self, document_entries, debug):
        self.__collection_handler.create_tok_dir()
        self.__collection_handler.create_wtd_dir()
        vocabulary = dict()
        documents = dict()
        collator = Collator()

        if debug:
            long_file_lines = list()
            special_file_lines = list()
            dash_file_lines = list()
            terms_per_document_sum = 0

        for document_entry in document_entries:
            document_alias = document_entry.get_alias()
            documents[document_alias] = dict()
            document_html_str = document_entry.get_html_str()
            tok_file_lines = list()
            if debug:
                print('Processing {}...'.format(document_entry.get_alias()))
                long = list()
                special = list()
                dash = list()
                document_terms = Analyzer.retrieve_html_str_terms(
                    document_html_str, long, special, dash)
            else:
                document_terms = Analyzer.retrieve_html_str_terms(
                    document_html_str)

            if len(document_terms) > 0:
                if debug:
                    # for the average calculation
                    terms_per_document_sum += len(document_terms)

                document_terms_np_array = np.array(document_terms)

                terms, counts = np.unique(document_terms_np_array,
                                          return_counts=True)
                doc_vocabulary = dict(zip(terms, counts))
                max_l_freq_lj = max(counts)

                # The tok file must be sorted alphabetically
                for term in sorted(doc_vocabulary.keys(),
                                   key=collator.sort_key):
                    freq_ij = doc_vocabulary[term]  # freq_ij = frequency of term k_i in document d_j
                    f_ij = freq_ij / max_l_freq_lj  # f_ij = normalized frequency of term k_i in document d_j
                    # Computed as freq_ij divided by the frequency of the most frequent term in document d_j
                    tok_line = '{:30} {:12} {:20}'.format(
                        term, str(freq_ij), str(round(f_ij, 3)))
                    tok_file_lines.append(tok_line)
                    self.update_vocabulary_dict(vocabulary, term, freq_ij)
                    documents[document_alias][term] = round(f_ij, 3)
            else:
                tok_file_lines.append('\n')

            if debug:
                for long_elem in long:
                    line = '{:35} {}'.format(document_entry.get_alias(),
                                             long_elem)
                    long_file_lines.append(line)

                for special_elem in special:
                    line = '{:35} {}'.format(document_entry.get_alias(),
                                             special_elem)
                    special_file_lines.append(line)

                for dash_elem in dash:
                    line = '{:35} {}'.format(document_entry.get_alias(),
                                             dash_elem)
                    dash_file_lines.append(line)

            self.__collection_handler.create_file_for_document(
                document_entry.get_alias(), tok_file_lines,
                DocumentOutputFiles.TOK)

        vocabulary_file_lines = list()

        # The vocabulary file must be sorted alphabetically
        for term in sorted(vocabulary.keys(), key=collator.sort_key):
            values_tuple = vocabulary[term]
            n_i = values_tuple[0]
            idf = Utilities.get_inverse_term_frequency(len(document_entries),
                                                       n_i)
            vocabulary[term] = (vocabulary[term][0], vocabulary[term][1], idf,
                                vocabulary[term][3])
            line = '{:30} {:12} {:20}'.format(term, str(n_i), str(idf))
            vocabulary_file_lines.append(line)

        self.__collection_handler.create_file_for_collection(
            vocabulary_file_lines, CollectionOutputFiles.VOCABULARY)

        # Index and postings files
        postings_file_lines = list()
        index_file_lines = list()
        postings_file_vocabulary = dict()
        for document_alias, document_terms in documents.items():
            wtd_file_lines = list()
            for term, f_ij in document_terms.items():
                weight = Utilities.get_term_weight(f_ij, vocabulary[term][2])
                line = '{:30} {:20}'.format(term, str(weight))
                wtd_file_lines.append(line)
                Indexer.update_postings_dict(postings_file_vocabulary, term,
                                             document_alias, weight)
            self.__collection_handler.create_file_for_document(
                document_alias, wtd_file_lines, DocumentOutputFiles.WTD)

        current_postings_line = 0
        for term in sorted(postings_file_vocabulary.keys(),
                           key=collator.sort_key):
            documents_list = postings_file_vocabulary[term]
            for values_tuple in documents_list:
                document_alias = values_tuple[0]
                term_weight = values_tuple[1]
                if vocabulary[term][3] is None:
                    vocabulary[term] = (vocabulary[term][0],
                                        vocabulary[term][1],
                                        vocabulary[term][2],
                                        current_postings_line)
                line = '{:30} {:40} {:20}'.format(term,
                                                  document_alias + '.html',
                                                  str(term_weight))
                postings_file_lines.append(line)
                current_postings_line += 1

        for term in sorted(vocabulary.keys(), key=collator.sort_key):
            values_tuple = vocabulary[term]
            postings_entries_count = values_tuple[0]
            postings_initial_position = values_tuple[3]
            line = '{:30} {:12} {:12}'.format(term,
                                              str(postings_initial_position),
                                              str(postings_entries_count))
            index_file_lines.append(line)

        self.__collection_handler.create_file_for_collection(
            index_file_lines, CollectionOutputFiles.INDEX)
        self.__collection_handler.create_file_for_collection(
            postings_file_lines, CollectionOutputFiles.POSTINGS)

        if debug:
            Utilities.print_debug_header('Indexing results', True)
            print("Number of words in the vocabulary: ",
                  len(vocabulary_file_lines))
            print("Average number of words per document: ",
                  terms_per_document_sum / len(document_entries), " words.")
            print("Number of words in long: ", len(long_file_lines))
            print("Number of words in special: ",
                  len(special_file_lines))
            print("Number of words in dash: ", len(dash_file_lines))

            long_file_str = '\n'.join(line for line in long_file_lines)
            Utilities.create_and_save_file('long.txt', long_file_str)

            special_file_str = '\n'.join(line for line in special_file_lines)
            Utilities.create_and_save_file('special.txt', special_file_str)

            dash_file_str = '\n'.join(line for line in dash_file_lines)
            Utilities.create_and_save_file('dash.txt', dash_file_str)
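
The weighting above is the usual tf-idf scheme: freq_ij is the raw count of term k_i in document d_j, f_ij is that count normalized by the document's most frequent term, idf comes from Utilities.get_inverse_term_frequency, and the stored weight combines them via Utilities.get_term_weight. A compact, self-contained sketch of the same computation, under the assumption (the helpers' formulas are not shown here) that idf = log10(N / n_i) and weight = f_ij * idf:

import math

import numpy as np

# Toy document: term counts for a single document d_j (illustrative data).
doc_terms = ['casa', 'perro', 'casa', 'gato', 'casa']
terms, counts = np.unique(np.array(doc_terms), return_counts=True)
max_l_freq_lj = counts.max()                  # frequency of the most frequent term

N = 10                                        # documents in the collection (assumed)
n_i = {'casa': 4, 'perro': 7, 'gato': 2}      # documents containing each term (assumed)

for term, freq_ij in zip(terms, counts):
    f_ij = freq_ij / max_l_freq_lj            # normalized term frequency
    idf = math.log10(N / n_i[term])           # assumed form of the idf helper
    weight = f_ij * idf                       # assumed form of the weight helper
    print('{:30} {:12} {:20} {:20}'.format(term, str(freq_ij),
                                           str(round(f_ij, 3)),
                                           str(round(weight, 3))))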
Example #26
#!/usr/bin/env python

import sys

from pyuca import Collator
collator = Collator()

from morphgnt.utils import load_yaml
from morphgnt.utils import nfkc_normalize as n

danker = load_yaml("../data-cleanup/danker-concise-lexicon/components.yaml")

greenlee = {}
with open("../data-cleanup/greenlee-morphology/morphemes-utf8.txt") as f:
    for line in f:
        key, value = line.strip().split("\t")
        greenlee[n(key.decode("utf-8")).split(",")[0]] = {
            "full-entry": n(key.decode("utf-8")),
            "components": n(value.decode("utf-8")),
        }

words = [n(word) for word in set(danker.keys()).union(set(greenlee.keys()))]

count = 0
for word in sorted(words, key=collator.sort_key):
    count += 1
    print "{}:".format(word.encode("utf-8"))
    if word in danker:
        print "    danker-full-entry: \"{}\"".format(danker[word]["full-entry"].encode("utf-8"))
        print "    danker-components: \"{}\"".format(danker[word]["components"].encode("utf-8"))
    if word in greenlee:
Example #27
from distutils.dir_util import copy_tree
from os.path import isfile, join
from subprocess import call
from tempfile import mkdtemp

from django.conf import settings
from django.core.management.base import CommandError
from django.template import Template
from django.utils import translation

from django_countries import countries
from pyuca import Collator

from hosting.models import Place

c = Collator()


class LatexCommand(object):
    template_name = 'PasportaServo.tex'
    address_only = False
    make_pdf = False
    tex_files = [
        template_name,
        'pages/title.tex',
        'pages/address.tex',
    ]

    def activate_translation(self):
        translation.activate(settings.LANGUAGE_CODE)