def _create_resolvable_media_urls(self, doc):
    """Create a resolver document for every media URL listed in ``doc``.

    Content types found under ``doc['enrichments']['media_urls']`` are first
    collected per ``original_url``. Then, for each entry of
    ``doc['media_urls']`` (when that key is present), a document holding the
    entry's ``original_url`` -- plus its ``content_type`` if one was
    collected -- is created in ``settings.RESOLVER_URL_INDEX``. The last
    path segment of the entry's ``url`` is used as the document id.

    A ``ConflictError`` raised by the create call is logged at debug level
    and otherwise ignored.
    """
    enrichments = doc['enrichments']
    content_type_by_url = {}
    if 'media_urls' in enrichments:
        # Later entries win when the same original_url occurs more than once.
        content_type_by_url = {
            enriched['original_url']: enriched['content_type']
            for enriched in enrichments['media_urls']
            if 'content_type' in enriched
        }

    if 'media_urls' not in doc:
        return

    for media_entry in doc['media_urls']:
        # The resolver id is whatever follows the final '/' of the URL.
        resolver_id = media_entry['url'].rsplit('/', 1)[-1]
        original_url = media_entry['original_url']

        resolver_doc = {'original_url': original_url}
        if original_url in content_type_by_url:
            resolver_doc['content_type'] = content_type_by_url[original_url]

        try:
            elasticsearch.create(
                index=settings.RESOLVER_URL_INDEX,
                doc_type='url',
                id=resolver_id,
                body=resolver_doc,
            )
        except ConflictError:
            log.debug('Resolver document %s already exists' % resolver_id)
def load_item(self, object_id, combined_index_doc, doc):
    """Index an item and register resolver documents for its media URLs.

    ``combined_index_doc`` is indexed into ``settings.COMBINED_INDEX`` and
    ``doc`` into ``self.index_name``, both with doc type ``'item'`` and id
    ``object_id``.

    Afterwards, for each entry of ``doc['media_urls']`` (when present) a
    document with the entry's ``original_url`` is created in
    ``settings.RESOLVER_URL_INDEX``, keyed by the last path segment of the
    entry's ``url``. If ``doc['enrichments']['media_urls']`` holds a
    ``content_type`` for that ``original_url``, it is stored as well. A
    ``ConflictError`` from the create call is only logged at debug level.
    """
    log.info('Indexing documents...')

    elasticsearch.index(
        index=settings.COMBINED_INDEX,
        doc_type='item',
        id=object_id,
        body=combined_index_doc,
    )
    # Same item, now into this loader's own index.
    elasticsearch.index(
        index=self.index_name,
        doc_type='item',
        id=object_id,
        body=doc,
    )

    enrichments = doc['enrichments']
    content_type_by_url = {}
    if 'media_urls' in enrichments:
        content_type_by_url = {
            enriched['original_url']: enriched['content_type']
            for enriched in enrichments['media_urls']
            if 'content_type' in enriched
        }

    if 'media_urls' not in doc:
        return

    for media_entry in doc['media_urls']:
        resolver_id = media_entry['url'].rsplit('/', 1)[-1]
        original_url = media_entry['original_url']

        resolver_doc = {'original_url': original_url}
        if original_url in content_type_by_url:
            resolver_doc['content_type'] = content_type_by_url[original_url]

        try:
            elasticsearch.create(
                index=settings.RESOLVER_URL_INDEX,
                doc_type='url',
                id=resolver_id,
                body=resolver_doc,
            )
        except ConflictError:
            log.debug('Resolver document %s already exists' % resolver_id)
def load_item(self, object_id, combined_index_doc, doc):
    """Index an item and register resolver documents for its media URLs.

    ``combined_index_doc`` goes into ``settings.COMBINED_INDEX`` and ``doc``
    into the index named ``<settings.DEFAULT_INDEX_PREFIX>_<source id>``,
    where the source id is ``self.source_definition['id']``. Both are stored
    with doc type ``'item'`` and id ``object_id``.

    For each entry of ``doc['media_urls']`` (when present) a document with
    the entry's ``original_url`` is then created in
    ``settings.RESOLVER_URL_INDEX``, keyed by the last path segment of the
    entry's ``url``. A ``ConflictError`` from the create call is only logged
    at debug level.
    """
    log.info('Indexing documents...')

    elasticsearch.index(
        index=settings.COMBINED_INDEX,
        doc_type='item',
        id=object_id,
        body=combined_index_doc,
    )

    source_index = '%s_%s' % (
        settings.DEFAULT_INDEX_PREFIX,
        self.source_definition['id'],
    )
    elasticsearch.index(
        index=source_index,
        doc_type='item',
        id=object_id,
        body=doc,
    )

    if 'media_urls' not in doc:
        return

    for media_entry in doc['media_urls']:
        # The resolver id is whatever follows the final '/' of the URL.
        resolver_id = media_entry['url'].rsplit('/', 1)[-1]
        resolver_doc = {'original_url': media_entry['original_url']}
        try:
            elasticsearch.create(
                index=settings.RESOLVER_URL_INDEX,
                doc_type='url',
                id=resolver_id,
                body=resolver_doc,
            )
        except ConflictError:
            log.debug('Resolver document %s already exists' % resolver_id)
def load_item(self, combined_object_id, object_id, combined_index_doc, doc):
    """Index an item and register resolver documents for its media URLs.

    ``combined_index_doc`` is indexed into ``self.combined_index_name`` under
    ``combined_object_id``; ``doc`` is indexed into ``self.index_name`` under
    ``object_id``. The doc type of each is obtained from
    ``self._get_doc_type(<document>, self.doc_type)``.

    Afterwards, for each entry of ``doc['media_urls']`` (when present) a
    document with the entry's ``original_url`` is created in
    ``settings.RESOLVER_URL_INDEX``, keyed by the last path segment of the
    entry's ``url``. If ``doc['enrichments']['media_urls']`` holds a
    ``content_type`` for that ``original_url``, it is stored as well. A
    ``ConflictError`` from the create call is only logged at debug level.
    """
    log.info('Indexing documents...')

    combined_doc_type = self._get_doc_type(combined_index_doc, self.doc_type)
    elasticsearch.index(
        index=self.combined_index_name,
        doc_type=combined_doc_type,
        id=combined_object_id,
        body=combined_index_doc,
    )

    # Same item, now into this loader's own index.
    item_doc_type = self._get_doc_type(doc, self.doc_type)
    elasticsearch.index(
        index=self.index_name,
        doc_type=item_doc_type,
        id=object_id,
        body=doc,
    )

    enrichments = doc['enrichments']
    content_type_by_url = {}
    if 'media_urls' in enrichments:
        content_type_by_url = {
            enriched['original_url']: enriched['content_type']
            for enriched in enrichments['media_urls']
            if 'content_type' in enriched
        }

    if 'media_urls' not in doc:
        return

    for media_entry in doc['media_urls']:
        resolver_id = media_entry['url'].rsplit('/', 1)[-1]
        original_url = media_entry['original_url']

        resolver_doc = {'original_url': original_url}
        if original_url in content_type_by_url:
            resolver_doc['content_type'] = content_type_by_url[original_url]

        try:
            elasticsearch.create(
                index=settings.RESOLVER_URL_INDEX,
                doc_type='url',
                id=resolver_id,
                body=resolver_doc,
            )
        except ConflictError:
            log.debug('Resolver document %s already exists' % resolver_id)
def load_item(self, object_id, combined_index_doc, doc):
    """Store an item in both indexes and make its media URLs resolvable.

    The combined document is written to ``settings.COMBINED_INDEX``; ``doc``
    is written to the per-source index, whose name is
    ``settings.DEFAULT_INDEX_PREFIX`` and ``self.source_definition['id']``
    joined by an underscore. Both use doc type ``'item'`` and ``object_id``
    as id.

    Each entry of ``doc['media_urls']`` (if the key exists) then gets a
    resolver document in ``settings.RESOLVER_URL_INDEX`` containing its
    ``original_url``; the id is the last path segment of the entry's
    ``url``. A ``ConflictError`` on creation is logged at debug level and
    ignored.
    """
    log.info('Indexing documents...')

    elasticsearch.index(
        index=settings.COMBINED_INDEX,
        doc_type='item',
        id=object_id,
        body=combined_index_doc,
    )

    per_source_index = '%s_%s' % (
        settings.DEFAULT_INDEX_PREFIX,
        self.source_definition['id'],
    )
    elasticsearch.index(
        index=per_source_index,
        doc_type='item',
        id=object_id,
        body=doc,
    )

    # An absent 'media_urls' key simply means there is nothing to resolve.
    media_entries = doc['media_urls'] if 'media_urls' in doc else ()
    for entry in media_entries:
        hashed_name = entry['url'].rsplit('/', 1)[-1]
        payload = {'original_url': entry['original_url']}
        try:
            elasticsearch.create(
                index=settings.RESOLVER_URL_INDEX,
                doc_type='url',
                id=hashed_name,
                body=payload,
            )
        except ConflictError:
            log.debug('Resolver document %s already exists' % hashed_name)
def create_queries(mapping_dir, doc_type, index_name):
    """
    Create queries for which a json file is available.

    Every ``*.json`` file found directly in ``mapping_dir`` is loaded and
    stored as a document in ``index_name``; the file name up to its first
    dot is used as the document id. Progress and failures are reported with
    ``click.echo``.

    :param mapping_dir: directory that is searched for ``*.json`` files.
    :param doc_type: document type passed to ``es.create``.
    :param index_name: index that is created (best-effort) and that receives
        the query documents.
    """
    click.echo(
        'Creating queries for ES queries in index %s (%s) (doc type: %s)' % (
            index_name, mapping_dir, doc_type,))

    try:
        es.indices.create(index=index_name)  # use template
    except Exception:
        # Deliberately best-effort: a failure here (presumably because the
        # index already exists -- TODO confirm) must not abort the run.
        pass

    for mapping_file_path in glob('%s/*.json' % mapping_dir):
        # Extract the query id from the filename (everything before the
        # first dot).
        query_id = os.path.split(mapping_file_path)[-1].split('.')[0]

        click.echo('Creating ES query %s' % query_id)

        # The context manager closes the file even when json.load raises.
        with open(mapping_file_path, 'rb') as mapping_file:
            mapping = json.load(mapping_file)

        try:
            r = es.create(
                index=index_name, doc_type=doc_type, body=mapping,
                id=query_id)
        except ConflictError:
            click.echo('Query already existed')
        except RequestError as e:
            error_msg = click.style('Failed to create query %s due to ES '
                                    'error: %s' % (query_id, e.error),
                                    fg='red')
            click.echo(error_msg)
            click.echo(e)
        else:
            click.echo('Query %s was %s' % (query_id, r['result'],))