def load_item(self, doc):
    body = json_encoder.encode(JsonLDSerializer().serialize(doc))

    log.info('Indexing document id: %s' % doc.get_ori_identifier())

    # Index documents into new index
    elasticsearch.index(index=self.index_name,
                        doc_type=doc_type(doc.verbose_name()),
                        body=body,
                        id=doc.get_short_identifier())

    # Recursively index associated models like attachments
    for _, value in doc.properties(rels=True, props=False):
        self.load_item(value)

        if 'enricher_task' in value:
            # The value seems to be enriched so add to resolver
            url_doc = {
                'ori_identifier': value.get_short_identifier(),
                'original_url': value.original_url,
                'file_name': value.name,
            }

            if 'content_type' in value:
                url_doc['content_type'] = value.content_type

            # Update if already exists
            elasticsearch.index(index=settings.RESOLVER_URL_INDEX,
                                doc_type='url',
                                id=get_sha1_hash(value.original_url),
                                body=url_doc)
def load_item(self, doc):
    # Recursively index associated models like attachments
    for model in doc.traverse():
        model_body = json_encoder.encode(
            JsonLDSerializer().serialize(model))

        log.debug('ElasticsearchUpsertLoader indexing document id: %s' %
                  model.get_ori_identifier())

        # Update document
        elasticsearch.update(
            id=model.get_short_identifier(),
            index=self.index_name,
            body={
                'doc': json.loads(model_body),
                'doc_as_upsert': True,
            },
        )

        if 'enricher_task' in model:
            # The model seems to be enriched so add it to the resolver
            url_doc = {
                'ori_identifier': model.get_short_identifier(),
                'original_url': model.original_url,
                'file_name': model.name,
            }

            if 'content_type' in model:
                url_doc['content_type'] = model.content_type

            # Update if already exists
            elasticsearch.index(index=settings.RESOLVER_URL_INDEX,
                                id=get_sha1_hash(model.original_url),
                                body=url_doc)
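# For reference: a minimal standalone sketch of the upsert call above,
# against the elasticsearch-py client (assuming a 7.x-style client; the
# index name, id, and document below are illustrative, not taken from
# the codebase). 'doc_as_upsert' tells Elasticsearch to index the
# partial document as a new document when the given id does not exist
# yet, instead of raising a "document missing" error.
from elasticsearch import Elasticsearch

es = Elasticsearch()
es.update(
    index='ori_example',    # illustrative index name
    id='example-id-1',      # illustrative short identifier
    body={
        'doc': {'name': 'Example attachment'},
        'doc_as_upsert': True,
    },
)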
def load_item(self, doc):
    body = json_encoder.encode(JsonLDSerializer().serialize(doc))

    log.info('Indexing document id: %s' % doc.get_ori_identifier())

    # Index documents into new index
    elasticsearch.index(index=self.index_name,
                        doc_type=doc_type(doc.verbose_name()),
                        body=body,
                        id=doc.get_ori_identifier())

    # Recursively index associated models like attachments
    for _, value in doc.properties(rels=True, props=False):
        self.load_item(value)

        if 'enricher_task' in value:
            # The value seems to be enriched so add to resolver
            url_doc = {
                'ori_identifier': value.get_ori_identifier(),
                'original_url': value.original_url,
                'file_name': value.name,
            }

            if 'content_type' in value:
                url_doc['content_type'] = value.content_type

            # Update if already exists
            elasticsearch.index(index=settings.RESOLVER_URL_INDEX,
                                doc_type='url',
                                id=get_sha1_hash(value.original_url),
                                body=url_doc)
def fetch(self, url, modified_date):
    modified_date = localize_datetime(str_to_datetime(modified_date))
    url_hash = get_sha1_hash(url)
    base_path = self.base_path(url_hash)

    try:
        file_path, latest_version = self._latest_version(base_path)
        latest_version_path = '%s-%s' % (file_path, latest_version)
        self._check_path(latest_version_path)
    except OSError:
        # File does not exist, download and cache the url
        data = self._download_file(url)
        self._write_to_cache(base_path, data, modified_date)
        return data

    if modified_date and modified_date > str_to_datetime(latest_version):
        # If file has been modified download it
        data = self._download_file(url)
        self._write_to_cache(base_path, data, modified_date)
        return data
    else:
        # todo force_old_files
        with open(latest_version_path, 'rb') as f:
            return f.read()
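# Hypothetical sketch (not from the codebase) of the companion
# _write_to_cache helper assumed by fetch() above. It assumes each
# download is stored as '<base_path>-<modified-date>', which is the
# naming convention that _latest_version() and latest_version_path
# reconstruct.
import errno
import os

def _write_to_cache(self, base_path, data, modified_date):
    try:
        # Make sure the cache directory for this url hash exists
        os.makedirs(os.path.dirname(base_path))
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # Append the modification timestamp so multiple versions of the
    # same url can live side by side
    version_path = '%s-%s' % (base_path, modified_date.isoformat())
    with open(version_path, 'wb') as f:
        f.write(data)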
def enrich_item(self, item):
    """Enriches the media objects referenced in a single item.

    First, a media item will be retrieved from the source, then the
    registered and configured tasks will run. In case fetching the
    item fails, enrichment of the media item will be skipped. In case
    a specific media enrichment task fails, only that task is skipped,
    which means that we move on to the next task.
    """
    self.setup_http_session()

    if self.enricher_settings.get('authentication', False):
        self.setup_http_auth()

    # Check the settings to see if the media should be fetched partially
    partial_fetch = self.enricher_settings.get('partial_media_fetch', False)

    content_type, content_length, media_file = self.fetch_media(
        item.get_ori_identifier(),
        item.original_url,
        partial_fetch
    )

    item.url = '%s/%s' % (RESOLVER_BASE_URL, get_sha1_hash(item.original_url))
    item.content_type = content_type
    item.size_in_bytes = content_length

    enrich_tasks = item.enricher_task
    if isinstance(enrich_tasks, basestring):
        enrich_tasks = [item.enricher_task]

    for task in enrich_tasks:
        # Seek to the beginning of the file before starting a task
        media_file.seek(0)

        try:
            self.available_tasks[task](item, content_type, media_file)
        except UnsupportedContentType:
            log.info('Skipping media enrichment task %s, '
                     'content-type %s (object_id: %s, url %s) is not '
                     'supported.' % (task, content_type,
                                     item.get_ori_identifier(),
                                     item.original_url))
            continue

    media_file.close()
    item.save()
def enrich_item(self, item):
    """Enriches the media objects referenced in a single item.

    First, a media item will be retrieved from the source, then the
    registered and configured tasks will run. In case fetching the
    item fails, enrichment of the media item will be skipped. In case
    a specific media enrichment task fails, only that task is skipped,
    which means that we move on to the next task.
    """
    try:
        identifier = item.identifier_url
    except AttributeError:
        identifier = None  # todo

    content_type, content_length, media_file = self.fetch(
        item.original_url,
        identifier,
        item.date_modified,
    )

    item.url = '%s/%s' % (RESOLVER_BASE_URL, get_sha1_hash(item.original_url))
    item.content_type = content_type
    item.size_in_bytes = content_length

    enrich_tasks = item.enricher_task
    if isinstance(enrich_tasks, basestring):
        enrich_tasks = [item.enricher_task]

    for task in enrich_tasks:
        # Seek to the beginning of the file before starting a task
        media_file.seek(0)

        try:
            self.available_tasks[task](item, content_type, media_file)
        except UnsupportedContentType:
            log.info('Skipping media enrichment task %s, '
                     'content-type %s (object_id: %s, url %s) is not '
                     'supported.' % (task, content_type,
                                     item.get_ori_identifier(),
                                     item.original_url))
            continue

    media_file.close()
    item.save()
def fetch_media(self, object_id, url, partial_fetch=False):
    http_resp = self.http_session.get(url, stream=True, timeout=(60, 120))
    http_resp.raise_for_status()

    static_dir = os.path.join(DATA_DIR_PATH, 'static')
    if not os.path.exists(static_dir):
        log.info('Creating static directory %s' % static_dir)
        os.makedirs(static_dir)

    file_id = get_sha1_hash(url)

    # Create a file to store the media item in the static dir
    media_file = open(os.path.join(static_dir, file_id), "w+b")

    # When a partial fetch is requested, request up to two MB
    partial_target_size = 1024 * 1024 * 2
    content_length = http_resp.headers.get('content-length')
    if content_length and int(content_length) < partial_target_size:
        partial_target_size = int(content_length)

    retrieved_bytes = 0
    for chunk in http_resp.iter_content(chunk_size=512 * 1024):
        if chunk:  # filter out keep-alive chunks
            media_file.write(chunk)
            retrieved_bytes += len(chunk)

        if partial_fetch and retrieved_bytes >= partial_target_size:
            break

    log.debug('Fetched media item %s [%s/%s]' % (url, retrieved_bytes,
                                                 content_length))

    # If the server doesn't provide a content-length and this isn't
    # a partial fetch, determine the size by looking at the retrieved
    # content
    if not content_length and not partial_fetch:
        media_file.seek(0, 2)
        content_length = media_file.tell()

    return (http_resp.headers.get('content-type'), content_length,
            media_file)
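# Minimal usage sketch of fetch_media() above; the enricher instance,
# object id, and URL are illustrative, not taken from the codebase.
# partial_fetch=True retrieves at most the first two MB, which is
# enough for tasks such as content-type sniffing.
content_type, size, media_file = enricher.fetch_media(
    'example-ori-identifier',            # illustrative object_id
    'https://example.org/document.pdf',  # illustrative URL
    partial_fetch=True,
)
try:
    media_file.seek(0)           # rewind before reading
    header = media_file.read(4)  # e.g. '%PDF' for a PDF file
finally:
    media_file.close()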
def test_modified_data_source(self, mocked_download_file):
    with open(os.path.join(self.PWD, "..",
                           "test_dumps/notubiz_meeting_amsterdam.json"),
              'rb') as f:
        data1 = f.read()
    with open(os.path.join(self.PWD, "..",
                           "test_dumps/notubiz_meeting_amsterdam_update1.json"),
              'rb') as f:
        data2 = f.read()

    # The second and third calls to _download_file will return the
    # second data source
    mocked_download_file.side_effect = [data1, data2, data2]

    # The download will be mocked so this is just for show
    url = "https://api.notubiz.nl/events/meetings/458902?format=json&version=1.10.8"

    self.mixin.fetch(url, datetime.datetime(2018, 11, 30, 12, 0))
    sleep(1)
    self.mixin.fetch(url, datetime.datetime(2018, 11, 30, 12, 1))
    sleep(1)
    self.mixin.fetch(url, datetime.datetime(2018, 11, 30, 12, 1))

    url_hash = get_sha1_hash(url)
    base_path = self.mixin.base_path(url_hash)
    file_count = len(glob.glob(base_path + "*"))
    self.assertEqual(file_count, 2)