Exemplo n.º 1
0
def normalize(word):
    """Lower-case *word*, strip its accents and return it ascii-encoded.

    The argument must be a unicode string.

    Raises StopWord when the word should not be indexed, that is when it
    is a single character or when it parses as a number.
    """
    assert isinstance(word, unicode), '%r should be unicode' % word
    # single letters are stop words
    if len(word) == 1:
        raise StopWord()
    # anything that parses as a number is a stop word too
    try:
        float(word)
    except ValueError:
        pass
    else:
        raise StopWord()
    lowered = unormalize(word.lower(), ignorenonascii=True)
    return lowered.encode('ascii', 'ignore')
Exemplo n.º 2
0
 def test_unormalize(self):
     """Check that unormalize() drops accents and expands ligatures."""
     data = [(u'\u0153nologie', u'oenologie'),
             (u'\u0152nologie', u'OEnologie'),
             (u'l\xf8to', u'loto'),
             (u'été', u'ete'),
             ]
     # use assertEqual: assertEquals is a deprecated alias; also avoid
     # shadowing the builtin input()
     for given, expected in data:
         yield self.assertEqual, tu.unormalize(given), expected
Exemplo n.º 3
0
 def test_unormalize(self):
     """Check that unormalize() drops accents and expands ligatures."""
     data = [
         (u'\u0153nologie', u'oenologie'),
         (u'\u0152nologie', u'OEnologie'),
         (u'l\xf8to', u'loto'),
         (u'été', u'ete'),
     ]
     # use assertEqual: assertEquals is a deprecated alias; also avoid
     # shadowing the builtin input()
     for given, expected in data:
         yield self.assertEqual, tu.unormalize(given), expected
 def test_unormalize_no_substitute(self):
     """Each accented sample must fold to its plain-ascii expansion."""
     samples = (
         ('\u0153nologie', 'oenologie'),
         ('\u0152nologie', 'OEnologie'),
         ('l\xf8to', 'loto'),
         ('été', 'ete'),
         ('àèùéïîôêç', 'aeueiioec'),
         ('ÀÈÙÉÏÎÔÊÇ', 'AEUEIIOEC'),
         ('\xa0', ' '),  # NO-BREAK SPACE managed by NFKD decomposition
         ('\u0154', 'R'),
     )
     for given, expected in samples:
         yield self.assertEqual, tu.unormalize(given), expected
Exemplo n.º 5
0
 def test_unormalize_no_substitute(self):
     """Each accented sample must fold to its plain-ascii expansion."""
     samples = (
         (u'\u0153nologie', u'oenologie'),
         (u'\u0152nologie', u'OEnologie'),
         (u'l\xf8to', u'loto'),
         (u'été', u'ete'),
         (u'àèùéïîôêç', u'aeueiioec'),
         (u'ÀÈÙÉÏÎÔÊÇ', u'AEUEIIOEC'),
         (u'\xa0', u' '),  # NO-BREAK SPACE managed by NFKD decomposition
         (u'\u0154', u'R'),
     )
     for given, expected in samples:
         yield self.assertEqual, tu.unormalize(given), expected
 def test_unormalize(self):
     """unormalize() strips accents; with ignorenonascii=False an
     untranslatable character must raise ValueError."""
     cases = (
         (u'\u0153nologie', u'oenologie'),
         (u'\u0152nologie', u'OEnologie'),
         (u'l\xf8to', u'loto'),
         (u'été', u'ete'),
         (u'àèùéïîôêç', u'aeueiioec'),
         (u'ÀÈÙÉÏÎÔÊÇ', u'AEUEIIOEC'),
         (u'\xa0', u' '),  # NO-BREAK SPACE managed by NFKD decomposition
     )
     for given, expected in cases:
         yield self.assertEqual, tu.unormalize(given), expected
     self.assertRaises(ValueError, tu.unormalize, u"non ascii char is \u0154",
                       ignorenonascii=False)
Exemplo n.º 7
0
 def test_unormalize_no_substitute(self):
     """Accents, ligatures and typographic punctuation are ascii-folded."""
     samples = (
         (u'\u0153nologie', u'oenologie'),
         (u'\u0152nologie', u'OEnologie'),
         (u'l\xf8to', u'loto'),
         (u'été', u'ete'),
         (u'àèùéïîôêç', u'aeueiioec'),
         (u'ÀÈÙÉÏÎÔÊÇ', u'AEUEIIOEC'),
         (u'\xa0', u' '),  # NO-BREAK SPACE managed by NFKD decomposition
         (u'\u0154', u'R'),
         (u'Pointe d\u2019Yves', u"Pointe d'Yves"),
         (u'Bordeaux\u2013Mérignac', u'Bordeaux-Merignac'),
     )
     for given, expected in samples:
         yield self.assertEqual, tu.unormalize(given), expected
Exemplo n.º 8
0
 def entity_types_table(self, eschemas):
     """Write a two-column HTML table of the entity types, sorted by
     their ascii-folded label."""
     rows = sorted(self.entity_types(eschemas),
                   key=lambda t: unormalize(t[0]))
     half, odd = divmod(len(rows), 2)
     # pad with a blank entry so both columns have the same length
     if odd:
         rows.append((None, ' ', ' '))
     split = half + odd
     for left, right in zip(rows[:split], rows[split:]):
         _, etypelink, addlink = left
         _, etypelink2, addlink2 = right
         self.w(u'<tr>\n')
         self.w(u'<td class="addcol">%s</td><td>%s</td>\n' %
                (addlink, etypelink))
         self.w(u'<td class="addcol">%s</td><td>%s</td>\n' %
                (addlink2, etypelink2))
         self.w(u'</tr>\n')
Exemplo n.º 9
0
 def __init__(self, repo, source_config, eid=None):
     """Initialize a data source from its configuration dictionary.

     :param repo: the repository this source belongs to
     :param source_config: source configuration dict; mutated in place
         (its 'uri' and 'type' entries are popped)
     :param eid: optional entity id of the source
     """
     self.repo = repo
     self.set_schema(repo.schema)
     self.eid = eid
     # keep a sanitized copy of the configuration for public exposure
     self.public_config = source_config.copy()
     self.public_config['use-cwuri-as-url'] = self.use_cwuri_as_url
     self.remove_sensitive_information(self.public_config)
     self.uri = source_config.pop('uri')
     # unormalize to avoid non-ascii characters in logger's name, this will cause decoding error
     # on logging
     set_log_methods(self, getLogger('cubicweb.sources.' + unormalize(self.uri)))
     source_config.pop('type')
     self.config = self._check_config_dict(
         eid, source_config, raise_on_error=False)
Exemplo n.º 10
0
def entity_types_table(self, eschemas):
    """Return HTML fragments rendering the entity types in a two-column
    bootstrap row, sorted by their ascii-folded label.

    :param eschemas: entity schemas to display
    :return: list of unicode HTML chunks
    """
    # tuple parameter unpacking in lambdas was removed in Python 3
    # (PEP 3113): index the tuple instead of destructuring it
    infos = sorted(self.entity_types(eschemas),
                   key=lambda t: unormalize(t[0]))
    q, r = divmod(len(infos), 2)
    html = []
    w = html.append
    w(u'<div class="row">')
    for links in (infos[:q+r], infos[q+r:]):
        if links:
            w(u'<div class="col-sm-6">')
            w(u'<ul class="list-unstyled">')
            for (_, etypelink, addlink) in links:
                w('<li>%s %s</li>' % (addlink,  etypelink))
            w(u'</ul>')
            w(u'</div>')
    w(u'</div>')
    return html
Exemplo n.º 11
0
def normalize(word):
    """Lower-case *word*, strip its accents and return an ascii-only
    unicode string.

    The argument must be a unicode string.

    Raises StopWord when the word is a single character and therefore
    should not be indexed.
    """
    assert isinstance(word, text_type), '%r should be unicode' % word
    # single letters are stop words and are never indexed
    if len(word) == 1:
        raise StopWord()
    folded = unormalize(word.lower(), substitute='')
    # callers expect an ascii-only unicode string, not bytes
    return folded.encode('ascii', 'ignore').decode('ascii')
Exemplo n.º 12
0
 def test_unormalize_backward_compat(self):
     """Without a substitute an untranslatable character raises ValueError;
     an empty substitute drops it entirely."""
     with self.assertRaises(ValueError):
         tu.unormalize(u"\u8000")
     self.assertEqual(tu.unormalize(u"\u8000", substitute=''), u'')
Exemplo n.º 13
0
 def test_unormalize_substitute(self):
     """Untranslatable characters are replaced by the substitute string."""
     folded = tu.unormalize(u'ab \u8000 cd', substitute='_')
     self.assertEqual(folded, 'ab _ cd')
 def test_unormalize_backward_compat(self):
     """An untranslatable character is fatal by default but is dropped
     when the substitute is the empty string."""
     with self.assertRaises(ValueError):
         tu.unormalize(u"\u8000")
     self.assertEqual(tu.unormalize(u"\u8000", substitute=''), u'')
Exemplo n.º 15
0
 def __init__(self, repo, user):
     """Create a session for *user* on repository *repo*."""
     self.repo = repo
     self.user = user  # XXX deprecate and store only a login.
     # ascii-fold the login so the session id is safe to embed anywhere
     self.sessionid = make_uid(unormalize(user.login))
     self.data = {}
 def test_unormalize_backward_compat(self):
     """ignorenonascii=False makes untranslatable characters fatal,
     ignorenonascii=True silently drops them."""
     with self.assertRaises(ValueError):
         tu.unormalize("\u8000", ignorenonascii=False)
     self.assertEqual(tu.unormalize("\u8000", ignorenonascii=True), '')
 def test_unormalize_substitute(self):
     """Untranslatable characters are replaced by the substitute string."""
     folded = tu.unormalize('ab \u8000 cd', substitute='_')
     self.assertEqual(folded, 'ab _ cd')
Exemplo n.º 18
0
 def test_unormalize_backward_compat(self):
     """ignorenonascii=False makes untranslatable characters fatal,
     ignorenonascii=True silently drops them."""
     with self.assertRaises(ValueError):
         tu.unormalize(u"\u8000", ignorenonascii=False)
     self.assertEqual(tu.unormalize(u"\u8000", ignorenonascii=True), u'')