def normalize(word):
    """Return the indexable normalized form of *word*.

    *word* must be a unicode object.  The normalized form is lower-cased
    and stripped of accents (non-ascii codepoints are dropped).

    Raises StopWord when the word should not be indexed, i.e. when it is
    a single letter or parses as a number.
    """
    assert isinstance(word, unicode), '%r should be unicode' % word
    if len(word) == 1:
        # single letters are stop words
        raise StopWord()
    try:
        float(word)
    except ValueError:
        pass
    else:
        # numbers are stop words
        raise StopWord()
    folded = unormalize(word.lower(), ignorenonascii=True)
    return folded.encode('ascii', 'ignore')
def test_unormalize(self):
    """Check accent folding on a few representative unicode words."""
    cases = (
        (u'\u0153nologie', u'oenologie'),
        (u'\u0152nologie', u'OEnologie'),
        (u'l\xf8to', u'loto'),
        (u'été', u'ete'),
    )
    for given, expected in cases:
        yield self.assertEquals, tu.unormalize(given), expected
def test_unormalize(self):
    """unormalize() should strip accents and expand ligatures."""
    samples = [(u'\u0153nologie', u'oenologie'),
               (u'\u0152nologie', u'OEnologie'),
               (u'l\xf8to', u'loto'),
               (u'été', u'ete')]
    for raw, folded in samples:
        yield self.assertEquals, tu.unormalize(raw), folded
def test_unormalize_no_substitute(self):
    """Default unormalize() call: every sample must fold to plain ascii."""
    expectations = [
        ('\u0153nologie', 'oenologie'),
        ('\u0152nologie', 'OEnologie'),
        ('l\xf8to', 'loto'),
        ('été', 'ete'),
        ('àèùéïîôêç', 'aeueiioec'),
        ('ÀÈÙÉÏÎÔÊÇ', 'AEUEIIOEC'),
        ('\xa0', ' '),  # NO-BREAK SPACE managed by NFKD decomposition
        ('\u0154', 'R'),
    ]
    for raw, folded in expectations:
        yield self.assertEqual, tu.unormalize(raw), folded
def test_unormalize_no_substitute(self):
    """Accent/ligature folding without a substitute character."""
    samples = (
        (u'\u0153nologie', u'oenologie'),
        (u'\u0152nologie', u'OEnologie'),
        (u'l\xf8to', u'loto'),
        (u'été', u'ete'),
        (u'àèùéïîôêç', u'aeueiioec'),
        (u'ÀÈÙÉÏÎÔÊÇ', u'AEUEIIOEC'),
        (u'\xa0', u' '),  # NO-BREAK SPACE managed by NFKD decomposition
        (u'\u0154', u'R'),
    )
    for given, expected in samples:
        yield self.assertEqual, tu.unormalize(given), expected
def test_unormalize(self):
    """Folding succeeds on decomposable chars; others raise ValueError."""
    pairs = (
        (u'\u0153nologie', u'oenologie'),
        (u'\u0152nologie', u'OEnologie'),
        (u'l\xf8to', u'loto'),
        (u'été', u'ete'),
        (u'àèùéïîôêç', u'aeueiioec'),
        (u'ÀÈÙÉÏÎÔÊÇ', u'AEUEIIOEC'),
        (u'\xa0', u' '),  # NO-BREAK SPACE managed by NFKD decomposition
    )
    for given, expected in pairs:
        yield self.assertEqual, tu.unormalize(given), expected
    # a char with no ascii decomposition must raise when not ignored
    self.assertRaises(ValueError, tu.unormalize,
                      u"non ascii char is \u0154", ignorenonascii=False)
def test_unormalize_no_substitute(self):
    """Folding also handles typographic apostrophes and dashes."""
    expectations = (
        (u'\u0153nologie', u'oenologie'),
        (u'\u0152nologie', u'OEnologie'),
        (u'l\xf8to', u'loto'),
        (u'été', u'ete'),
        (u'àèùéïîôêç', u'aeueiioec'),
        (u'ÀÈÙÉÏÎÔÊÇ', u'AEUEIIOEC'),
        (u'\xa0', u' '),  # NO-BREAK SPACE managed by NFKD decomposition
        (u'\u0154', u'R'),
        (u'Pointe d\u2019Yves', u"Pointe d'Yves"),
        (u'Bordeaux\u2013Mérignac', u'Bordeaux-Merignac'),
    )
    for given, expected in expectations:
        yield self.assertEqual, tu.unormalize(given), expected
def entity_types_table(self, eschemas):
    """Write a two-column HTML table of entity type links.

    Entries are sorted on their accent-folded label; an empty filler
    entry is appended when the count is odd so both columns line up.
    """
    infos = sorted(self.entity_types(eschemas),
                   key=lambda t: unormalize(t[0]))
    half, odd = divmod(len(infos), 2)
    if odd:
        # pad to an even count so zip() pairs every entry
        infos.append((None, ' ', ' '))
    split = half + odd
    for left, right in zip(infos[:split], infos[split:]):
        _, etypelink, addlink = left
        _, etypelink2, addlink2 = right
        self.w(u'<tr>\n')
        self.w(u'<td class="addcol">%s</td><td>%s</td>\n' % (addlink, etypelink))
        self.w(u'<td class="addcol">%s</td><td>%s</td>\n' % (addlink2, etypelink2))
        self.w(u'</tr>\n')
def __init__(self, repo, source_config, eid=None):
    """Initialize a data source attached to *repo*.

    :param repo: the repository this source belongs to
    :param source_config: dict of configuration options; mutated in
        place ('uri' and 'type' entries are popped below)
    :param eid: optional entity id of the source
    """
    self.repo = repo
    self.set_schema(repo.schema)
    self.eid = eid
    # keep a sanitized copy of the configuration for public exposure
    self.public_config = source_config.copy()
    self.public_config['use-cwuri-as-url'] = self.use_cwuri_as_url
    self.remove_sensitive_information(self.public_config)
    self.uri = source_config.pop('uri')
    # unormalize to avoid non-ascii characters in logger's name, this will cause decoding error
    # on logging
    set_log_methods(self, getLogger('cubicweb.sources.' + unormalize(self.uri)))
    # 'type' is consumed here so the remaining dict holds only the
    # source-specific options checked below
    source_config.pop('type')
    self.config = self._check_config_dict(
        eid, source_config, raise_on_error=False)
def entity_types_table(self, eschemas):
    """Return a list of HTML fragments laying out entity type links
    as a two-column bootstrap grid.

    Entries are sorted on their accent-folded label; the first column
    gets the extra entry when the count is odd.
    """
    # NOTE: the previous ``lambda (l, a, e): unormalize(l)`` used
    # tuple-parameter unpacking, which is Python-2-only syntax
    # (removed by PEP 3113) and a SyntaxError on Python 3.  Index the
    # tuple instead — same sort key, portable syntax.
    infos = sorted(self.entity_types(eschemas),
                   key=lambda t: unormalize(t[0]))
    q, r = divmod(len(infos), 2)
    html = []
    w = html.append
    w(u'<div class="row">')
    for links in (infos[:q + r], infos[q + r:]):
        if links:
            w(u'<div class="col-sm-6">')
            w(u'<ul class="list-unstyled">')
            for (_, etypelink, addlink) in links:
                w('<li>%s %s</li>' % (addlink, etypelink))
            w(u'</ul>')
            w(u'</div>')
    w(u'</div>')
    return html
def normalize(word):
    """Return the indexable normalized form of *word*.

    *word* must be a unicode string.  The result is lower-cased,
    accent-free and ascii-only (still unicode, not bytes).

    Raises StopWord when the word is a single letter, since such words
    should not be indexed.
    """
    assert isinstance(word, text_type), '%r should be unicode' % word
    if len(word) == 1:
        # single letters are stop words
        raise StopWord()
    folded = unormalize(word.lower(), substitute='')
    # round-trip through ascii to drop any leftover non-ascii codepoint
    # while keeping a unicode string (not bytes)
    return folded.encode('ascii', 'ignore').decode('ascii')
def test_unormalize_backward_compat(self):
    """Non-decomposable char: raises by default, dropped with substitute=''."""
    self.assertRaises(ValueError, tu.unormalize, u"\u8000")
    dropped = tu.unormalize(u"\u8000", substitute='')
    self.assertEqual(dropped, u'')
def test_unormalize_substitute(self):
    """Non-decomposable chars are replaced by the substitute string."""
    substituted = tu.unormalize(u'ab \u8000 cd', substitute='_')
    self.assertEqual(substituted, 'ab _ cd')
def __init__(self, repo, user):
    """Bind a session to *repo* for *user*, with a fresh session id."""
    self.repo = repo
    # XXX deprecate and store only a login.
    self.user = user
    # session id is derived from the accent-folded login
    self.sessionid = make_uid(unormalize(user.login))
    self.data = {}
def test_unormalize_backward_compat(self):
    """ignorenonascii flag: False raises, True silently drops the char."""
    self.assertRaises(ValueError, tu.unormalize, "\u8000",
                      ignorenonascii=False)
    dropped = tu.unormalize("\u8000", ignorenonascii=True)
    self.assertEqual(dropped, '')
def test_unormalize_substitute(self):
    """Non-decomposable chars are replaced by the substitute string."""
    result = tu.unormalize('ab \u8000 cd', substitute='_')
    self.assertEqual(result, 'ab _ cd')
def test_unormalize_backward_compat(self):
    """ignorenonascii flag: False raises, True yields an empty string."""
    self.assertRaises(ValueError, tu.unormalize, u"\u8000",
                      ignorenonascii=False)
    ignored = tu.unormalize(u"\u8000", ignorenonascii=True)
    self.assertEqual(ignored, u'')