def load_item(self, doc):
    """Upsert ``doc`` and its traversed models, registering enriched ones.

    Every model yielded by ``doc.traverse()`` is serialized with
    ``JsonLDSerializer``, encoded with ``json_encoder`` and sent to
    ``self.index_name`` as an update with ``doc_as_upsert`` enabled.
    Models containing an ``enricher_task`` entry also get a document in
    ``settings.RESOLVER_URL_INDEX`` whose id is
    ``get_sha1_hash(model.original_url)``.
    """
    # Walk the document together with associated models (e.g. attachments)
    for model in doc.traverse():
        serialized = JsonLDSerializer().serialize(model)
        model_body = json_encoder.encode(serialized)

        ori_identifier = model.get_ori_identifier()
        log.debug('ElasticsearchUpsertLoader indexing document id: %s' %
                  ori_identifier)

        # The encoded JSON is parsed back so the partial document handed to
        # the update call is a plain decoded structure.
        upsert_body = {
            'doc': json.loads(model_body),
            'doc_as_upsert': True,
        }
        elasticsearch.update(
            id=model.get_short_identifier(),
            index=self.index_name,
            body=upsert_body,
        )

        # Only models that appear to have been enriched go to the resolver
        if 'enricher_task' not in model:
            continue

        resolver_doc = {
            'ori_identifier': model.get_short_identifier(),
            'original_url': model.original_url,
            'file_name': model.name,
        }
        if 'content_type' in model:
            resolver_doc['content_type'] = model.content_type

        # Update if already exists
        elasticsearch.index(
            index=settings.RESOLVER_URL_INDEX,
            id=get_sha1_hash(model.original_url),
            body=resolver_doc,
        )
Пример #2
0
    def load_item(self, object_id, combined_index_doc, doc):
        """Index an item in both indexes and create its resolver documents.

        ``combined_index_doc`` is indexed into ``settings.COMBINED_INDEX`` and
        ``doc`` into ``self.index_name``, both under ``object_id`` with
        doc_type ``'item'``. For each entry in ``doc['media_urls']`` a
        resolver document is created in ``settings.RESOLVER_URL_INDEX``; a
        ``ConflictError`` from that create call is only logged at debug level.
        """
        log.info('Indexing documents...')
        elasticsearch.index(
            index=settings.COMBINED_INDEX,
            doc_type='item',
            id=object_id,
            body=combined_index_doc,
        )

        # Index documents into new index
        elasticsearch.index(
            index=self.index_name,
            doc_type='item',
            id=object_id,
            body=doc,
        )

        # Map original URL -> content type, taken from the enrichments
        enrichments = doc['enrichments']
        content_types = {}
        if 'media_urls' in enrichments:
            content_types = {
                enriched['original_url']: enriched['content_type']
                for enriched in enrichments['media_urls']
                if 'content_type' in enriched
            }

        if 'media_urls' not in doc:
            return

        # For each media_urls.url, add a resolver document to the
        # RESOLVER_URL_INDEX
        for media_url in doc['media_urls']:
            # The resolver id is the last path segment of the media URL
            url_hash = media_url['url'].rsplit('/', 1)[-1]
            original_url = media_url['original_url']

            resolver_doc = {'original_url': original_url}
            if original_url in content_types:
                resolver_doc['content_type'] = content_types[original_url]

            try:
                elasticsearch.create(
                    index=settings.RESOLVER_URL_INDEX,
                    doc_type='url',
                    id=url_hash,
                    body=resolver_doc,
                )
            except ConflictError:
                log.debug('Resolver document %s already exists' % url_hash)
Пример #3
0
    def load_item(self, doc):
        """Index ``doc``, then recurse into each of its related models.

        The document is serialized with ``JsonLDSerializer``, encoded with
        ``json_encoder`` and indexed into ``self.index_name`` under its short
        identifier. Every value returned by
        ``doc.properties(rels=True, props=False)`` is loaded through this same
        method; values containing an ``enricher_task`` entry are additionally
        indexed into ``settings.RESOLVER_URL_INDEX``.
        """
        serialized = JsonLDSerializer().serialize(doc)
        body = json_encoder.encode(serialized)

        log.info('Indexing document id: %s' % doc.get_ori_identifier())

        # Index documents into new index
        elasticsearch.index(
            index=self.index_name,
            doc_type=doc_type(doc.verbose_name()),
            body=body,
            id=doc.get_short_identifier(),
        )

        # Recursively index associated models like attachments
        for _, related in doc.properties(rels=True, props=False):
            self.load_item(related)

            # Only values that appear to have been enriched go to the resolver
            if 'enricher_task' not in related:
                continue

            resolver_doc = {
                'ori_identifier': related.get_short_identifier(),
                'original_url': related.original_url,
                'file_name': related.name,
            }
            if 'content_type' in related:
                resolver_doc['content_type'] = related.content_type

            # Update if already exists
            elasticsearch.index(
                index=settings.RESOLVER_URL_INDEX,
                doc_type='url',
                id=get_sha1_hash(related.original_url),
                body=resolver_doc,
            )
Пример #4
0
    def load_item(self, doc):
        """Index ``doc`` under its ori identifier and recurse into relations.

        The document is serialized with ``JsonLDSerializer``, encoded with
        ``json_encoder`` and indexed into ``self.index_name``. Every value
        returned by ``doc.properties(rels=True, props=False)`` is loaded
        through this same method; values containing an ``enricher_task``
        entry are additionally indexed into ``settings.RESOLVER_URL_INDEX``.
        """
        body = json_encoder.encode(JsonLDSerializer().serialize(doc))

        log.info('Indexing document id: %s' % doc.get_ori_identifier())

        # Index documents into new index
        elasticsearch.index(
            index=self.index_name,
            doc_type=doc_type(doc.verbose_name()),
            body=body,
            id=doc.get_ori_identifier(),
        )

        # Recursively index associated models like attachments
        for _, child in doc.properties(rels=True, props=False):
            self.load_item(child)

            # Only values that appear to have been enriched go to the resolver
            if 'enricher_task' not in child:
                continue

            resolver_entry = {
                'ori_identifier': child.get_ori_identifier(),
                'original_url': child.original_url,
                'file_name': child.name,
            }
            if 'content_type' in child:
                resolver_entry['content_type'] = child.content_type

            # Update if already exists
            elasticsearch.index(
                index=settings.RESOLVER_URL_INDEX,
                doc_type='url',
                id=get_sha1_hash(child.original_url),
                body=resolver_entry,
            )
Пример #5
0
    def load_item(self, object_id, combined_index_doc, doc):
        """Index an item in the combined and per-source index, plus resolver.

        ``combined_index_doc`` goes to ``settings.COMBINED_INDEX``; ``doc``
        goes to the index named ``<DEFAULT_INDEX_PREFIX>_<source id>``, where
        the source id is read from ``self.source_definition['id']``. Both use
        ``object_id`` and doc_type ``'item'``. For each entry in
        ``doc['media_urls']`` a resolver document is created in
        ``settings.RESOLVER_URL_INDEX``; a ``ConflictError`` from that create
        call is only logged at debug level.
        """
        log.info('Indexing documents...')
        elasticsearch.index(
            index=settings.COMBINED_INDEX,
            doc_type='item',
            id=object_id,
            body=combined_index_doc,
        )

        source_index = '%s_%s' % (
            settings.DEFAULT_INDEX_PREFIX, self.source_definition['id'])
        elasticsearch.index(
            index=source_index,
            doc_type='item',
            id=object_id,
            body=doc,
        )

        if 'media_urls' not in doc:
            return

        # For each media_urls.url, add a resolver document to the
        # RESOLVER_URL_INDEX
        for media_url in doc['media_urls']:
            # The resolver id is the last path segment of the media URL
            url_hash = media_url['url'].rsplit('/', 1)[-1]
            resolver_doc = {'original_url': media_url['original_url']}

            try:
                elasticsearch.create(
                    index=settings.RESOLVER_URL_INDEX,
                    doc_type='url',
                    id=url_hash,
                    body=resolver_doc,
                )
            except ConflictError:
                log.debug('Resolver document %s already exists' % url_hash)
Пример #6
0
    def load_item(self, combined_object_id, object_id, combined_index_doc,
                  doc):
        """Index both representations of an item, then handle media URLs.

        ``combined_index_doc`` is indexed into ``settings.COMBINED_INDEX``
        under ``combined_object_id`` and ``doc`` into ``self.index_name``
        under ``object_id``, both with ``self.doc_type``. Afterwards ``doc``
        is passed to ``self._create_resolvable_media_urls``.
        """
        log.info('Indexing documents...')

        item_type = self.doc_type
        elasticsearch.index(
            index=settings.COMBINED_INDEX,
            doc_type=item_type,
            id=combined_object_id,
            body=combined_index_doc,
        )

        # Index documents into new index
        elasticsearch.index(
            index=self.index_name,
            doc_type=self.doc_type,
            id=object_id,
            body=doc,
        )

        self._create_resolvable_media_urls(doc)
Пример #7
0
    def load_item(self, combined_object_id, object_id, combined_index_doc,
                  doc):
        """Index the combined and source documents, then handle media URLs.

        ``combined_index_doc`` is indexed into ``settings.COMBINED_INDEX``
        under ``combined_object_id`` and ``doc`` into ``self.index_name``
        under ``object_id``, both with ``self.doc_type``. Afterwards ``doc``
        is passed to ``self._create_resolvable_media_urls``.
        """
        log.info('Indexing documents...')

        combined_request = {
            'index': settings.COMBINED_INDEX,
            'doc_type': self.doc_type,
            'id': combined_object_id,
            'body': combined_index_doc,
        }
        elasticsearch.index(**combined_request)

        # Index documents into new index
        source_request = {
            'index': self.index_name,
            'doc_type': self.doc_type,
            'id': object_id,
            'body': doc,
        }
        elasticsearch.index(**source_request)

        self._create_resolvable_media_urls(doc)
Пример #8
0
    def load_item(self, combined_object_id, object_id, combined_index_doc,
                  doc):
        """Index both documents with a per-document type, plus resolver docs.

        The doc_type of each document is obtained from
        ``self._get_doc_type(<document>, self.doc_type)``.
        ``combined_index_doc`` is indexed into ``self.combined_index_name``
        under ``combined_object_id`` and ``doc`` into ``self.index_name``
        under ``object_id``. For each entry in ``doc['media_urls']`` a
        resolver document is created in ``settings.RESOLVER_URL_INDEX``; a
        ``ConflictError`` from that create call is only logged at debug level.
        """
        log.info('Indexing documents...')

        combined_type = self._get_doc_type(combined_index_doc, self.doc_type)
        elasticsearch.index(
            index=self.combined_index_name,
            doc_type=combined_type,
            id=combined_object_id,
            body=combined_index_doc,
        )

        # Index documents into new index
        source_type = self._get_doc_type(doc, self.doc_type)
        elasticsearch.index(
            index=self.index_name,
            doc_type=source_type,
            id=object_id,
            body=doc,
        )

        # Map original URL -> content type, taken from the enrichments
        enrichments = doc['enrichments']
        content_types = {}
        if 'media_urls' in enrichments:
            content_types = {
                enriched['original_url']: enriched['content_type']
                for enriched in enrichments['media_urls']
                if 'content_type' in enriched
            }

        if 'media_urls' not in doc:
            return

        # For each media_urls.url, add a resolver document to the
        # RESOLVER_URL_INDEX
        for media_url in doc['media_urls']:
            # The resolver id is the last path segment of the media URL
            url_hash = media_url['url'].rsplit('/', 1)[-1]
            original_url = media_url['original_url']

            resolver_doc = {'original_url': original_url}
            if original_url in content_types:
                resolver_doc['content_type'] = content_types[original_url]

            try:
                elasticsearch.create(
                    index=settings.RESOLVER_URL_INDEX,
                    doc_type='url',
                    id=url_hash,
                    body=resolver_doc,
                )
            except ConflictError:
                log.debug('Resolver document %s already exists' % url_hash)
Пример #9
0
    def load_item(self, combined_object_id, object_id, combined_index_doc, doc,
                  doc_type):
        """Index both documents with ``doc_type`` and (re)index resolver docs.

        ``combined_index_doc`` is indexed into ``settings.COMBINED_INDEX``
        under ``combined_object_id`` and ``doc`` into ``self.index_name``
        under ``object_id``; both use the ``doc_type`` argument. For each
        entry in ``doc['media_urls']`` a resolver document is indexed into
        ``settings.RESOLVER_URL_INDEX``.
        """
        log.info('Indexing document id: %s' % object_id)
        elasticsearch.index(
            index=settings.COMBINED_INDEX,
            doc_type=doc_type,
            id=combined_object_id,
            body=combined_index_doc,
        )

        # Index documents into new index
        elasticsearch.index(
            index=self.index_name,
            doc_type=doc_type,
            id=object_id,
            body=doc,
        )

        # Map original URL -> content type, taken from the enrichments
        enrichments = doc['enrichments']
        content_types = {}
        if 'media_urls' in enrichments:
            content_types = {
                enriched['original_url']: enriched['content_type']
                for enriched in enrichments['media_urls']
                if 'content_type' in enriched
            }

        if 'media_urls' not in doc:
            return

        # For each media_urls.url, add a resolver document to the
        # RESOLVER_URL_INDEX
        for media_url in doc['media_urls']:
            # The resolver id is the last path segment of the media URL
            url_hash = media_url['url'].rsplit('/', 1)[-1]
            original_url = media_url['original_url']

            resolver_doc = {'original_url': original_url}
            if original_url in content_types:
                resolver_doc['content_type'] = content_types[original_url]

            # Update if already exists
            elasticsearch.index(
                index=settings.RESOLVER_URL_INDEX,
                doc_type='url',
                id=url_hash,
                body=resolver_doc,
            )
Пример #10
0
    def load_item(self, object_id, combined_index_doc, doc):
        """Index an item in the combined and per-source index, plus resolver.

        ``combined_index_doc`` goes to ``settings.COMBINED_INDEX``; ``doc``
        goes to the index named ``<DEFAULT_INDEX_PREFIX>_<source id>``, where
        the source id is read from ``self.source_definition['id']``. Both use
        ``object_id`` and doc_type ``'item'``. For each entry in
        ``doc['media_urls']`` a resolver document is created in
        ``settings.RESOLVER_URL_INDEX``; a ``ConflictError`` from that create
        call is only logged at debug level.
        """
        log.info('Indexing documents...')

        shared_args = {'doc_type': 'item', 'id': object_id}
        elasticsearch.index(index=settings.COMBINED_INDEX,
                            body=combined_index_doc, **shared_args)

        per_source_index = '%s_%s' % (settings.DEFAULT_INDEX_PREFIX,
                                      self.source_definition['id'])
        elasticsearch.index(index=per_source_index, body=doc, **shared_args)

        if 'media_urls' not in doc:
            return

        # For each media_urls.url, add a resolver document to the
        # RESOLVER_URL_INDEX
        for media_url in doc['media_urls']:
            # The resolver id is the last path segment of the media URL
            url_hash = media_url['url'].rsplit('/', 1)[-1]
            resolver_doc = {'original_url': media_url['original_url']}

            try:
                elasticsearch.create(
                    index=settings.RESOLVER_URL_INDEX,
                    doc_type='url',
                    id=url_hash,
                    body=resolver_doc,
                )
            except ConflictError:
                log.debug('Resolver document %s already exists' % url_hash)
Пример #11
0
 def process(self, model, model_body):
     """Index ``model_body`` into ``self.index_name``.

     The document id is ``model.get_short_identifier()``.
     """
     # Index document into new index
     request = {
         'index': self.index_name,
         'body': model_body,
         'id': model.get_short_identifier(),
     }
     elasticsearch.index(**request)