def finalize_index(data, schema):
    """Apply final denormalisations to the index."""
    props = data.get('properties', {})
    # Property types that carry no useful full-text search content.
    excluded_types = ('date', 'url', 'uri', 'country')
    # Concatenate the values of all remaining schema properties into
    # one searchable full-text field.
    fragments = []
    for prop in schema.properties:
        if prop.name not in props:
            continue
        if prop.type_name in excluded_types:
            continue
        fragments.extend(ensure_list(props[prop.name]))
    data['text'] = index_form(fragments)
    data = schema.invert(data)
    index_names(data)
    data['schema'] = schema.name
    # Get implied schemata (i.e. parents of the actual schema).
    data['schemata'] = schema.names
    # Second name field for non-tokenised sorting.
    if 'name' in data:
        data['name_sort'] = data.get('name')
    return data
def finalize_index(data, schema):
    """Apply final denormalisations to the index.

    Builds the full-text field, per-type inverted fields, the display
    name and the list of implied schemata on ``data`` before it is
    written to the search index. Returns the mutated ``data`` dict.
    """
    properties = data.get('properties', {})
    # Every property value, regardless of type, feeds the full-text field.
    texts = []
    for values in properties.values():
        texts.extend(ensure_list(values))
    data['text'] = index_form(texts)
    # Generate inverted representations of the data stored in properties.
    for prop in schema.properties:
        values = properties.get(prop.name, [])
        if not values:
            continue
        # Find and set the name property.
        if prop.is_label:
            data['name'] = values[0]
        # Add inverted properties. This takes all the properties
        # of a specific type (names, dates, emails etc.) and collects
        # their normalised forms under a shared field, de-duplicated
        # while preserving insertion order.
        invert = prop.type.index_invert
        if invert:
            inverted = data.setdefault(invert, [])
            for norm in prop.type.normalize(values):
                if norm not in inverted:
                    inverted.append(norm)
    index_names(data)
    data['schema'] = schema.name
    # Get implied schemata (i.e. parents of the actual schema).
    data['schemata'] = [p.name for p in schema.schemata if not p.hidden]
    # Second name field for non-tokenised sorting.
    if 'name' in data:
        data['name_sort'] = data.get('name')
    return data
def index_document(document):
    # Serialise a document into the search index. Documents still being
    # ingested are skipped; so are documents of TYPE_OTHER (flagged FIXME
    # below, presumably a temporary exclusion — confirm with maintainers).
    if document.status == Document.STATUS_PENDING:
        return
    # FIXME:
    if document.type == Document.TYPE_OTHER:
        return
    log.info("Index document [%s]: %s", document.id, document.title)
    # Flat metadata snapshot of the document; this dict becomes the
    # Elasticsearch document body.
    data = {
        'schema': document.SCHEMA,
        'schemata': [document.SCHEMA],
        'collection_id': document.collection_id,
        'roles': document.collection.roles,
        'type': document.type,
        'status': document.status,
        'content_hash': document.content_hash,
        'foreign_id': document.foreign_id,
        'error_message': document.error_message,
        'uploader_id': document.uploader_id,
        'created_at': document.created_at,
        'updated_at': document.updated_at,
        'title': document.title,
        # Duplicate of title used for non-tokenised sorting.
        'name_sort': document.title,
        'summary': document.summary,
        'author': document.author,
        'file_size': document.file_size,
        'file_name': document.file_title,
        'source_url': document.source_url,
        'languages': document.languages,
        'countries': document.countries,
        'keywords': document.keywords,
        'dates': document.dates,
        'extension': document.extension,
        'encoding': document.encoding,
        'mime_type': document.mime_type,
        'pdf_version': document.pdf_version,
        'columns': document.columns,
        '$children': document.children.count(),
        'text': index_form(document.text_parts())
    }
    # Embed minimal parent info so children can be filtered/displayed
    # without a second lookup.
    if document.parent_id is not None:
        data['parent'] = {
            'id': document.parent_id,
            'type': document.parent.type,
            'title': document.parent.title,
        }
    # Fold extracted tags (names, emails, phones, ...) into per-type
    # fields; stream in batches to bound memory on heavily-tagged docs.
    q = db.session.query(DocumentTag)
    q = q.filter(DocumentTag.document_id == document.id)
    for tag in q.yield_per(5000):
        field = TAG_FIELDS.get(tag.type)
        if field is None:
            # Unknown tag type: skip rather than fail the whole index run.
            log.warning("Cannot index document tag: %r", tag)
            continue
        if field not in data:
            data[field] = []
        data[field].append(tag.text)
    index_names(data)
    # NOTE: the body is indexed BEFORE 'id'/'$type' are added — those two
    # keys are only present on the returned dict, not in the index.
    es.index(index=es_index,
             doc_type=TYPE_DOCUMENT,
             body=data,
             id=document.id)
    data['id'] = document.id
    data['$type'] = TYPE_DOCUMENT
    return data