def __init__(self, manager, name, email):
    self.email = ascii_text(stringify(email))
    self.name = stringify(name)
    if not registry.email.validate(self.email):
        self.email = None
    if registry.email.validate(self.name):
        self.email = self.email or ascii_text(self.name)
        self.name = None
    # This should be using formataddr, but I cannot figure out how
    # to use that without encoding the name.
    self.label = None
    if self.name is not None and self.email is not None:
        self.label = "%s <%s>" % (self.name, self.email)
    elif self.name is None and self.email is not None:
        self.label = self.email
    elif self.email is None and self.name is not None:
        self.label = self.name
    self.entity = None
    key = registry.email.node_id_safe(self.email)
    if self.name is not None and len(self.name) > 10:
        key = key or registry.name.node_id_safe(self.name)
    if key is not None:
        fragment = safe_fragment(self.label)
        self.entity = manager.make_entity("Person")
        self.entity.context = {"mutable": False}
        self.entity.make_id(key)
        self.entity.add("name", self.name)
        self.entity.add("email", self.email)
        manager.emit_entity(self.entity, fragment=fragment)
def __init__(self, manager, name, email):
    self.email = ascii_text(stringify(email))
    self.name = stringify(name)
    if not registry.email.validate(self.email):
        self.email = None
    if registry.email.validate(self.name):
        self.email = self.email or ascii_text(self.name)
        self.name = None
    # This should be using formataddr, but I cannot figure out how
    # to use that without encoding the name.
    self.label = None
    if self.name is not None and self.email is not None:
        self.label = '%s <%s>' % (self.name, self.email)
    elif self.name is None and self.email is not None:
        self.label = self.email
    elif self.email is None and self.name is not None:
        self.label = self.name
    self.entity = None
    if self.email is not None:
        key = self.email.lower().strip()
        fragment = safe_fragment(self.label)
        self.entity = manager.make_entity('Person')
        self.entity.make_id(key)
        self.entity.add('name', self.name)
        self.entity.add('email', self.email)
        manager.emit_entity(self.entity, fragment=fragment)
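Both versions of this constructor carry the same TODO about formataddr. The reason it was avoided is that the standard library MIME-encodes non-ASCII display names, which is unhelpful for a human-readable label. A minimal, standard-library-only illustration (addresses are hypothetical):

from email.utils import formataddr

# ASCII names pass through unchanged:
print(formataddr(("Jane Doe", "jane@example.com")))
# Jane Doe <jane@example.com>

# Non-ASCII names get RFC 2047 encoded, which is what the snippets
# above are working around with manual "%s <%s>" formatting:
print(formataddr((u"Müller", "mueller@example.com")))
# =?utf-8?q?M=C3=BCller?= <mueller@example.com>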
def suggest_entities(prefix, authz, min_count=0, schemas=None, size=5):
    """Auto-complete API."""
    options = []
    if prefix is not None and len(prefix.strip()):
        q = {'match_phrase_prefix': {'name': prefix.strip()}}
        if min_count > 0:
            q = add_filter(q, {'range': {'doc_count': {'gte': min_count}}})
        if schemas is not None and len(schemas):
            q = add_filter(q, {'terms': {'$schema': schemas}})

        # TODO: is this correct? should we allow filter by dataset entities?
        q = add_filter(q, {'terms': {'collection_id': authz.collections_read}})

        q = {
            'size': size,
            'sort': [{'doc_count': 'desc'}, '_score'],
            'query': q,
            '_source': ['name', 'schema', 'fingerprints', 'doc_count']
        }
        ref = ascii_text(prefix)
        result = es.search(index=es_index, doc_type=TYPE_ENTITY, body=q)
        for res in result.get('hits', {}).get('hits', []):
            ent = res.get('_source')
            terms = [ascii_text(t) for t in ent.pop('fingerprints', [])]
            ent['match'] = ref in terms
            ent['score'] = res.get('_score')
            ent['id'] = res.get('_id')
            options.append(ent)
    return {'prefix': prefix, 'results': options}
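suggest_entities leans on an add_filter helper that is not shown here. A minimal sketch of what such a helper could look like, assuming it simply attaches a non-scoring filter to the query in bool context; the helper in the original project may be implemented differently (older Elasticsearch versions used filtered queries instead):

def add_filter(query, filter_):
    """Attach ``filter_`` to ``query`` as a non-scoring filter clause."""
    # If the query is already a bool query, append to its filter list.
    if 'bool' in query:
        query['bool'].setdefault('filter', []).append(filter_)
        return query
    # Otherwise wrap the existing query in a bool query with the filter.
    return {'bool': {'must': [query], 'filter': [filter_]}}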
def to_index_dict(self):
    data = self.meta.to_index_dict()
    data['text'] = index_form(self.text_parts())
    data['schema'] = self.SCHEMA
    data['schemata'] = [self.SCHEMA]
    data['name_sort'] = ascii_text(data.get('title'))
    data['title_latin'] = ascii_text(data.get('title'))
    data['summary_latin'] = ascii_text(data.get('summary'))
    return self._add_to_dict(data)
def index_document(document, index_records=True):
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['text'] = get_text(document)
    data['entities'] = generate_entities(document)
    data['title_latin'] = ascii_text(data.get('title'))
    data['summary_latin'] = ascii_text(data.get('summary'))
    es.index(index=es_index, doc_type=TYPE_DOCUMENT, body=data,
             id=document.id)
    if index_records:
        clear_records(document.id)
        bulk_op(generate_records(document))
def generate_records(document):
    """Generate index records, based on document rows or pages."""
    if document.type == Document.TYPE_TEXT:
        for page in document.pages:
            tid = sha1(str(document.id))
            tid.update(str(page.id))
            tid = tid.hexdigest()
            text = stringify(page.text)
            latin = ascii_text(text)
            yield {
                '_id': tid,
                '_type': TYPE_RECORD,
                '_index': six.text_type(es_index),
                '_parent': document.id,
                '_source': {
                    'type': 'page',
                    'content_hash': document.content_hash,
                    'document_id': document.id,
                    'collection_id': document.collection_id,
                    'page': page.number,
                    'text': text,
                    'text_latin': latin
                }
            }
    elif document.type == Document.TYPE_TABULAR:
        for record in document.records:
            data = {k: stringify(v) for (k, v) in record.data.items()}
            text = [v for v in data.values()]
            latin = [ascii_text(t) for t in text]
            latin = [t for t in latin if t not in text and t is not None]
            yield {
                '_id': record.tid,
                '_type': TYPE_RECORD,
                '_index': six.text_type(es_index),
                '_parent': document.id,
                '_source': {
                    'type': 'row',
                    'content_hash': document.content_hash,
                    'document_id': document.id,
                    'collection_id': document.collection_id,
                    'row_id': record.row_id,
                    'sheet': record.sheet,
                    'text': text,
                    'text_latin': latin,
                    'raw': data
                }
            }
def test_empty(self):
    self.assertEqual(None, slugify(None))
    self.assertEqual(None, ascii_text(None))
    self.assertEqual(None, latinize_text(None))
    self.assertEqual(None, normalize(None))
    self.assertEqual(None, normalize(''))
    self.assertEqual(None, normalize(' '))
def get_text(document):
    """Generate an array with the full text of the given document.

    This will limit document length to TEXT_MAX_LEN in order to avoid
    uploading extremely long documents.
    """
    texts = []
    for text in document.text_parts():
        text = stringify(text)
        texts.append(text)
        latin = ascii_text(text)
        if latin != text:
            texts.append(latin)

        text_len = sum((len(t) for t in texts))
        # First, try getting rid of duplicate entries, which are more
        # likely in tabular documents. If that does not help, partial
        # text will be returned.
        if text_len >= TEXT_MAX_LEN:
            texts = list(set(texts))
            text_len = sum((len(t) for t in texts))

            if text_len >= TEXT_MAX_LEN:
                return texts

    return texts
def normalize(self, name):
    name = ascii_text(name)
    name = category_replace(name, UNICODE_CATEGORIES)
    if name.upper() == name:
        name = name.replace(WS, '_')
        name = name.lower()
    else:
        name = stringcase.snakecase(name)
    return re.sub('_+', '_', name)
def clean_strict(text, boundary=WS):
    """Super-hardcore string scrubbing."""
    # transliterate to ascii
    text = ascii_text(text)
    # replace punctuation and symbols
    text = CHARACTERS_REMOVE_RE.sub('', text)
    text = category_replace(text)
    # pad out for company type replacements
    text = ''.join((boundary, collapse_spaces(text), boundary))
    return text
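A quick illustration of what clean_strict produces. The exact output depends on CHARACTERS_REMOVE_RE (not shown here); this assumes it strips punctuation and that WS is a single space:

cleaned = clean_strict("Siemens Aktiengesellschaft.")
# -> " Siemens Aktiengesellschaft "
# The string is padded with the boundary character on both sides so
# that company-type replacements can match on whole, space-delimited
# tokens.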
def finalize_index(data, schema):
    """Apply final denormalisations to the index."""
    properties = data.get('properties', {})
    texts = []
    for vs in properties.values():
        for v in ensure_list(vs):
            texts.append(v)

    data['text'] = index_form(texts)
    data['fingerprints'] = data.get('fingerprints', [])

    # Generate inverted representations of the data stored in properties.
    for prop in schema.properties:
        values = properties.get(prop.name, [])
        if not len(values):
            continue

        # Find and set the name property
        if prop.is_label:
            data['name'] = values[0]

        # Generate key material
        # TODO: this should probably be record-based.
        data['fingerprints'].extend(prop.type.fingerprint(values))

        # Add inverted properties. This takes all the properties of a
        # specific type (names, dates, emails etc.)
        invert = prop.type.index_invert
        if invert:
            if invert not in data:
                data[invert] = []
            for norm in prop.type.normalize(values):
                if norm not in data[invert]:
                    data[invert].append(norm)

    data['fingerprints'] = list(set(data['fingerprints']))

    # Add latinised names. ascii_text() may return None for
    # untransliterable input, so skip those.
    names = data.get('names', [])
    for name in list(names):
        latin = ascii_text(name)
        if latin is not None:
            names.append(latin)
    data['names'] = list(set(names))

    # Get implied schemata (i.e. parents of the actual schema)
    data['schema'] = schema.name
    data['schemata'] = []
    for parent in schema.schemata:
        if not parent.hidden:
            data['schemata'].append(parent.name)

    # Second name field for non-tokenised sorting.
    if 'name' in data:
        data['name_sort'] = data.get('name')

    return data
def index_names(data):
    """Handle entity names on documents and entities."""
    names = data.get('names', [])
    fps = [fingerprints.generate(name) for name in names]
    fps = [fp for fp in fps if fp is not None]
    data['fingerprints'] = list(set(fps))

    # Add latinised names. As with fingerprints, ascii_text() may
    # return None, so filter those out before extending the list.
    for name in list(names):
        latin = ascii_text(name)
        if latin is not None:
            names.append(latin)
    data['names'] = list(set(names))
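Taken together with the transliteration tests further down, the effect is that both the original and a latinised form of each name end up in the index. An illustrative call (the dictionary contents are hypothetical, the transliteration matches the tests below):

data = {'names': [u'Порошенко Петро Олексійович']}
index_names(data)
# data['names'] now contains both the original Cyrillic name and its
# latinised form, 'Porosenko Petro Oleksijovic'.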
def pending(id):
    collection = obj_or_404(Collection.by_id(id))
    request.authz.require(request.authz.collection_read(collection))
    q = collection.pending_entities()
    q = q.limit(30)
    entities = []
    for entity in q.all():
        data = entity.to_dict()
        data['name_latin'] = ascii_text(entity.name)
        entities.append(data)
    return jsonify({'results': entities, 'total': len(entities)})
def text_query_string(text, literal=False):
    if text is None or not len(text.strip()):
        return match_all()
    if literal:
        text = '"%s"' % ascii_text(text)
    return {
        'query_string': {
            'query': text,
            'fields': ['text'],
            'default_operator': 'AND',
            'use_dis_max': True
        }
    }
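For example, a literal query transliterates the text and wraps it in quotes so Elasticsearch treats it as an exact phrase (the transliteration matches the test cases below):

text_query_string(u'Häschen Spaß', literal=True)
# -> {'query_string': {'query': '"Haschen Spass"',
#                      'fields': ['text'],
#                      'default_operator': 'AND',
#                      'use_dis_max': True}}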
def clean_strict(text: Optional[str], boundary: str = WS) -> Optional[str]:
    """Super-hardcore string scrubbing."""
    # transliterate to ascii
    text = ascii_text(text)
    if not isinstance(text, str):
        return None
    # replace punctuation and symbols
    text = CHARACTERS_REMOVE_RE.sub("", text)
    text = category_replace(text)
    text = collapse_spaces(text)
    if text is None:
        return None
    # pad out for company type replacements
    return "".join((boundary, text, boundary))
def test_georgian(self):
    text = u'ავლაბრის ფონდი'
    self.assertEqual('avlabris pondi', ascii_text(text))
def test_ahmad(self):
    text = u'FUAD ALIYEV ƏHMƏD OĞLU'
    self.assertEqual('FUAD ALIYEV AHMAD OGLU', ascii_text(text))
def test_petro(self):
    text = u'Порошенко Петро Олексійович'
    self.assertEqual('porosenko-petro-oleksijovic', slugify(text))
    self.assertEqual('Porosenko Petro Oleksijovic', ascii_text(text))
    self.assertEqual(u'Porošenko Petro Oleksíjovič', latinize_text(text))
    self.assertEqual(u'порошенко петро олексіиович', normalize(text))
def normalize(text):
    text = ascii_text(text)
    # ascii_text() may return None; guard before stripping apostrophes.
    if text is None:
        return None
    return text.replace("'", '')
def test_georgian(self):
    text = u"ავლაბრის ფონდი"
    self.assertEqual("avlabris pondi", ascii_text(text))
def test_azeri(self):
    text = u"FUAD ALIYEV ƏHMƏD OĞLU"
    self.assertEqual("FUAD ALIYEV AHMAD OGLU", ascii_text(text))
def test_ahmad(self):
    text = u"əhməd"
    self.assertEqual("ahmad", ascii_text(text))
def test_petro(self):
    text = u"Порошенко Петро Олексійович"
    self.assertEqual("porosenko-petro-oleksijovic", slugify(text))
    self.assertEqual("Porosenko Petro Oleksijovic", ascii_text(text))
    self.assertEqual(u"Porošenko Petro Oleksíjovič", latinize_text(text))
    self.assertEqual(u"порошенко петро олексіиович", normalize(text))
def test_ahmad(self):
    text = u'əhməd'
    self.assertEqual('ahmad', ascii_text(text))
def normalize(text):
    text = category_replace(text, replacements=UNICODE_CATEGORIES)
    text = ascii_text(text)
    if text is not None:
        return text.lower()
def test_german(self):
    text = u'Häschen Spaß'
    self.assertEqual('Haschen Spass', ascii_text(text))
def strconv(text):
    if text is None or not len(text.strip()):
        return
    return ascii_text(text)
def test_german(self):
    text = u"Häschen Spaß"
    self.assertEqual("Haschen Spass", ascii_text(text))
    self.assertEqual("haschen-spass", slugify(text, sep="-"))
def latin_alt(value):
    """Make a latin version of a string and return it if it differs
    from the input."""
    trans_value = ascii_text(value)
    # ascii_text() may return None for untransliterable input, which
    # would otherwise crash the .lower() comparison.
    if trans_value is not None and trans_value.lower() != value.lower():
        return trans_value
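Illustrative behaviour, based on the transliterations shown in the tests above:

latin_alt(u'Häschen')  # -> 'Haschen' (differs from the input)
latin_alt(u'Haschen')  # -> None (already latin, nothing to add)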
def normalize_value(self, value):
    value = collapse_spaces(value)
    return value, ascii_text(value)