def test_empty(self):
    """Every helper maps None (and blank-only input) to None."""
    for helper in (slugify, ascii_text, latinize_text, normalize):
        self.assertEqual(None, helper(None))
    # normalize() also collapses empty / whitespace-only strings to None.
    self.assertEqual(None, normalize(''))
    self.assertEqual(None, normalize(' '))
def index_form(texts):
    """Turn a set of strings into the appropriate form for indexing."""
    entries = []
    stored = 0
    for raw in texts:
        # We don't want to store more than INDEX_MAX_LEN of text per doc.
        if stored > INDEX_MAX_LEN:
            # TODO: there might be nicer techniques for dealing with overly
            # long text buffers?  For now, deduplicate and re-measure; only
            # give up if we are still over budget afterwards.
            entries = list(set(entries))
            stored = sum((len(entry) for entry in entries))
            if stored > INDEX_MAX_LEN:
                break
        cleaned = stringify(raw)
        if cleaned is None:
            continue
        cleaned = collapse_spaces(cleaned)
        stored += len(cleaned)
        entries.append(cleaned)
        # Also store a latinized version of the text, when it differs.
        latin = stringify(latinize_text(cleaned))
        if latin is not None and latin != cleaned:
            stored += len(latin)
            entries.append(latin)
    return entries
def finalize_index(data, schema, texts):
    """Apply final denormalisations to the index.

    :param data: the partially-built index document (mutated and returned).
    :param schema: the entity schema; its name, parents and property
        definitions are denormalised into ``data``.
    :param texts: accumulator of free-text snippets; property values are
        appended to it and it is folded into ``data['text']``.
    """
    data['schema'] = schema.name
    # Get implied schemata (i.e. parents of the actual schema)
    data['schemata'] = schema.names
    properties = data.get('properties', {})
    for name, prop in schema.properties.items():
        if name not in properties:
            continue
        # Structured types are indexed elsewhere; skip them for full-text.
        if prop.type_name in ['entity', 'date', 'url', 'uri', 'country']:
            continue
        for value in ensure_list(properties[name]):
            if name == 'name':
                data['name'] = value
            texts.append(value)
    data = schema.invert(data)
    data['text'] = index_form(texts)
    names = data.get('names', [])
    fps = [fingerprints.generate(name) for name in names]
    fps = [fp for fp in fps if fp is not None]
    data['fingerprints'] = list(set(fps))
    # Add latinised names.  latinize_text() can return None (e.g. for empty
    # input); skip those rather than letting None leak into the names list.
    for name in list(names):
        latin = latinize_text(name)
        if latin is not None:
            names.append(latin)
    data['names'] = list(set(names))
    if 'created_at' not in data:
        data['created_at'] = data.get('updated_at')
    return data
def normalize_strong(text):
    """Perform heavy normalisation of a given text.

    The goal of this function is not to retain a readable version of the
    given string, but rather to yield a normalised version suitable for
    comparisons and machine analysis.
    """
    latin = latinize_text(string_value(text))
    if latin is None:
        return None
    return collapse_spaces(category_replace(latin.lower()))
def search_term(term):
    """Prepare a query term for search: latinize, strip quotes and any
    leading stopwords; returns None for unusable (missing or too-short)
    terms."""
    if term is None:
        return
    # latinize_text() returns None for None/empty input.
    term = latinize_text(term)
    if term is None:
        return
    # Drop double quotes so the term cannot break phrase-query syntax.
    term = term.replace('"', ' ').strip().lower()
    for stopword in STOPWORDS:
        if term.startswith(stopword):
            term = term[len(stopword):]
    # NOTE(review): placement of this minimum-length check relative to the
    # stopword loop was ambiguous in the original formatting — confirm it is
    # intended as a uniform post-stripping length gate.
    if len(term) < 4:
        return
    return term
def pick_name(names: Tuple[str, ...], all_names: Tuple[str, ...]) -> Optional[str]:
    """Pick the most central name from *names*.

    Candidates are all of *all_names* plus title-cased latinisations; each
    candidate is scored by its summed Levenshtein distance to every other
    candidate, and the lowest-scoring candidate that is also in *names* is
    returned.  Returns None if no candidate from *names* scores.

    Note: the annotations were ``Tuple[str]`` (a 1-tuple) — corrected to the
    variadic ``Tuple[str, ...]`` to match actual usage.
    """
    candidates: List[str] = []
    for name in all_names:
        candidates.append(name)
        latin = latinize_text(name)
        if latin is not None:
            candidates.append(latin.title())
    scores: Dict[str, int] = defaultdict(int)
    for pair in combinations(candidates, 2):
        left, right = sorted(pair)
        # Cap comparison length to keep Levenshtein cost bounded.
        dist = Levenshtein.distance(left[:128], right[:128])
        scores[left] += dist
        scores[right] += dist
    # Lowest total distance first; prefer a candidate from the input names.
    for cand, _ in sorted(scores.items(), key=lambda x: x[1]):
        if cand in names:
            return cand
    return None
def test_petro(self):
    """Fixture checks for a Ukrainian (Cyrillic) name across all helpers."""
    original = u'Порошенко Петро Олексійович'
    self.assertEqual('porosenko-petro-oleksijovic', slugify(original))
    self.assertEqual('Porosenko Petro Oleksijovic', ascii_text(original))
    self.assertEqual(u'Porošenko Petro Oleksíjovič', latinize_text(original))
    self.assertEqual(u'порошенко петро олексіиович', normalize(original))
def test_petro(self):
    """Transliteration and normalisation of a Cyrillic sample name."""
    sample = u"Порошенко Петро Олексійович"
    self.assertEqual("porosenko-petro-oleksijovic", slugify(sample))
    self.assertEqual("Porosenko Petro Oleksijovic", ascii_text(sample))
    self.assertEqual(u"Porošenko Petro Oleksíjovič", latinize_text(sample))
    self.assertEqual(u"порошенко петро олексіиович", normalize(sample))
def latin_alt(value):
    """Make a latin version of a string and return it if it differs from
    the input; returns None otherwise.

    Fix: latinize_text() can return None (e.g. for None/empty input), which
    previously crashed on ``None.lower()``.
    """
    trans_value = latinize_text(value)
    if trans_value is None:
        return None
    if trans_value.lower() != value.lower():
        return trans_value
    return None
# coding: utf-8 from normality import normalize, latinize_text, ascii_text, slugify SAMPLES = [ u'Порошенко Петро Олексійович', u'FUAD ALIYEV ƏHMƏD OĞLU', u'Häschen Spaß', u'ავლაბრის ფონდი', ] for sample in SAMPLES: print 'SAMPLE :', sample print ' NORM :', normalize(sample) print ' SLUG :', slugify(sample) print ' LATIN:', latinize_text(sample) print ' ASCII:', ascii_text(sample)