Example #1
    def load_item(self, doc):
        body = json_encoder.encode(JsonLDSerializer().serialize(doc))

        log.info('Indexing document id: %s' % doc.get_ori_identifier())

        # Index documents into new index
        elasticsearch.index(index=self.index_name, doc_type=doc_type(doc.verbose_name()),
                            body=body, id=doc.get_short_identifier())

        # Recursively index associated models like attachments
        for _, value in doc.properties(rels=True, props=False):
            self.load_item(value)

            if 'enricher_task' in value:
                # The value seems to be enriched so add to resolver
                url_doc = {
                    'ori_identifier': value.get_short_identifier(),
                    'original_url': value.original_url,
                    'file_name': value.name,
                }

                if 'content_type' in value:
                    url_doc['content_type'] = value.content_type

                # Update if already exists
                elasticsearch.index(index=settings.RESOLVER_URL_INDEX, doc_type='url',
                                    id=get_sha1_hash(value.original_url), body=url_doc)
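Note that the doc_type argument here (and in Example #3) marks these snippets as targeting Elasticsearch 6.x or earlier: mapping types were deprecated in 7.x and removed in 8.x, which is presumably why Example #2 below omits the argument.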
Example #2
    def load_item(self, doc):
        # Recursively index associated models like attachments
        for model in doc.traverse():
            model_body = json_encoder.encode(
                JsonLDSerializer().serialize(model))

            log.debug('ElasticsearchUpsertLoader indexing document id: %s' %
                      model.get_ori_identifier())

            # Update document
            elasticsearch.update(
                id=model.get_short_identifier(),
                index=self.index_name,
                body={
                    'doc': json.loads(model_body),
                    'doc_as_upsert': True,
                },
            )

            if 'enricher_task' in model:
                # The value seems to be enriched so add to resolver
                url_doc = {
                    'ori_identifier': model.get_short_identifier(),
                    'original_url': model.original_url,
                    'file_name': model.name,
                }

                if 'content_type' in model:
                    url_doc['content_type'] = model.content_type

                # Update if already exists
                elasticsearch.index(index=settings.RESOLVER_URL_INDEX,
                                    id=get_sha1_hash(model.original_url),
                                    body=url_doc)
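Example #2 uses the update API with doc_as_upsert instead of index: if the id does not exist yet, the partial 'doc' is created as a new document, otherwise it is merged into the existing one. A minimal sketch of the same call shape, assuming an older elasticsearch-py client and a hypothetical local cluster:

    from elasticsearch import Elasticsearch

    es = Elasticsearch()  # assumed local cluster, for illustration only

    # Create-or-merge in a single call, as in Example #2 above
    es.update(
        index='example-index',
        id='example-id',
        body={
            'doc': {'status': 'enriched'},  # partial document to merge
            'doc_as_upsert': True,          # create it if the id is missing
        },
    )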
Example #3
    def load_item(self, doc):
        body = json_encoder.encode(JsonLDSerializer().serialize(doc))

        log.info('Indexing document id: %s' % doc.get_ori_identifier())

        # Index documents into new index
        elasticsearch.index(index=self.index_name, doc_type=doc_type(doc.verbose_name()),
                            body=body, id=doc.get_ori_identifier())

        # Recursively index associated models like attachments
        for _, value in doc.properties(rels=True, props=False):
            self.load_item(value)

            if 'enricher_task' in value:
                # The value seems to be enriched so add to resolver
                url_doc = {
                    'ori_identifier': value.get_ori_identifier(),
                    'original_url': value.original_url,
                    'file_name': value.name,
                }

                if 'content_type' in value:
                    url_doc['content_type'] = value.content_type

                # Update if already exists
                elasticsearch.index(index=settings.RESOLVER_URL_INDEX, doc_type='url',
                                    id=get_sha1_hash(value.original_url), body=url_doc)
Example #4
    def fetch(self, url, modified_date):
        modified_date = localize_datetime(str_to_datetime(modified_date))

        url_hash = get_sha1_hash(url)
        base_path = self.base_path(url_hash)

        try:
            file_path, latest_version = self._latest_version(base_path)
            latest_version_path = '%s-%s' % (file_path, latest_version)
            self._check_path(latest_version_path)
        except OSError:
            # File does not exist, download and cache the url
            data = self._download_file(url)
            self._write_to_cache(base_path, data, modified_date)
            return data

        if modified_date and modified_date > str_to_datetime(latest_version):
            # If file has been modified download it
            data = self._download_file(url)
            self._write_to_cache(base_path, data, modified_date)
            return data
        else:
            # todo force_old_files
            with open(latest_version_path, 'rb') as f:
                return f.read()
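Every example on this page keys caches and resolver entries by get_sha1_hash(url). A minimal sketch of what such a helper presumably does, assuming it returns the hex SHA-1 digest of the URL (the real implementation may differ):

    import hashlib

    def get_sha1_hash(url):
        # Stable, filesystem-safe key derived from the URL (assumed behaviour)
        if isinstance(url, str):
            url = url.encode('utf-8')
        return hashlib.sha1(url).hexdigest()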
Example #5
    def enrich_item(self, item):
        """Enriches the media objects referenced in a single item.

        First, a media item is retrieved from the source, then the
        registered and configured tasks are run. If fetching the
        item fails, enrichment of the media item is skipped. If a
        specific media enrichment task fails, only that task is
        skipped, and we move on to the next task.
        """
        self.setup_http_session()

        if self.enricher_settings.get('authentication', False):
            self.setup_http_auth()

        # Check the settings to see if media should be fetched partially
        partial_fetch = self.enricher_settings.get('partial_media_fetch', False)

        content_type, content_length, media_file = self.fetch_media(
            item.get_ori_identifier(),
            item.original_url,
            partial_fetch
        )

        item.url = '%s/%s' % (RESOLVER_BASE_URL, get_sha1_hash(item.original_url))
        item.content_type = content_type
        item.size_in_bytes = content_length

        enrich_tasks = item.enricher_task
        if isinstance(enrich_tasks, basestring):
            enrich_tasks = [item.enricher_task]

        for task in enrich_tasks:
            # Seek to the beginning of the file before starting a task
            media_file.seek(0)
            try:
                self.available_tasks[task](item, content_type, media_file)
            except UnsupportedContentType:
                log.info('Skipping media enrichment task %s, '
                         'content-type %s (object_id: %s, url %s) is not '
                         'supported.' % (task, content_type, item.get_ori_identifier(),
                                         item.original_url))
                continue

        # Close the file only after all tasks have run; closing it inside
        # the loop would break seek(0) on the next iteration
        media_file.close()

        item.save()
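The isinstance(enrich_tasks, basestring) normalization here (and in Example #6) marks this as Python 2 code: enricher_task may hold a single task name or a list of them. A Python 3 equivalent would check against str instead:

    # Python 3 sketch of the same normalization (basestring no longer exists)
    enrich_tasks = item.enricher_task
    if isinstance(enrich_tasks, str):
        enrich_tasks = [enrich_tasks]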
Example #6
    def enrich_item(self, item):
        """Enriches the media objects referenced in a single item.

        First, a media item is retrieved from the source, then the
        registered and configured tasks are run. If fetching the
        item fails, enrichment of the media item is skipped. If a
        specific media enrichment task fails, only that task is
        skipped, and we move on to the next task.
        """

        try:
            identifier = item.identifier_url
        except AttributeError:
            identifier = None  # todo

        content_type, content_length, media_file = self.fetch(
            item.original_url,
            identifier,
            item.date_modified,
        )

        item.url = '%s/%s' % (RESOLVER_BASE_URL, get_sha1_hash(item.original_url))
        item.content_type = content_type
        item.size_in_bytes = content_length

        enrich_tasks = item.enricher_task
        if isinstance(enrich_tasks, basestring):
            enrich_tasks = [item.enricher_task]

        for task in enrich_tasks:
            # Seek to the beginning of the file before starting a task
            media_file.seek(0)
            try:
                self.available_tasks[task](item, content_type, media_file)
            except UnsupportedContentType:
                log.info('Skipping media enrichment task %s, '
                         'content-type %s (object_id: %s, url %s) is not '
                         'supported.' % (task, content_type, item.get_ori_identifier(),
                                         item.original_url))
                continue

        # Close the file only after all tasks have run; closing it inside
        # the loop would break seek(0) on the next iteration
        media_file.close()

        item.save()
Example #7
    def fetch_media(self, object_id, url, partial_fetch=False):
        http_resp = self.http_session.get(url, stream=True, timeout=(60, 120))
        http_resp.raise_for_status()

        static_dir = os.path.join(DATA_DIR_PATH, 'static')

        if not os.path.exists(static_dir):
            log.info('Creating static directory %s' % static_dir)
            os.makedirs(static_dir)

        file_id = get_sha1_hash(url)

        # Create a file to store the media item in the static dir
        media_file = open(os.path.join(static_dir, file_id), "w+b")

        # When a partial fetch is requested, request up to two MB
        partial_target_size = 1024 * 1024 * 2
        content_length = http_resp.headers.get('content-length')
        if content_length and int(content_length) < partial_target_size:
            partial_target_size = int(content_length)

        retrieved_bytes = 0
        for chunk in http_resp.iter_content(chunk_size=512 * 1024):
            if chunk:  # filter out keep-alive chunks
                media_file.write(chunk)
                retrieved_bytes += len(chunk)

            if partial_fetch and retrieved_bytes >= partial_target_size:
                break

        log.debug('Fetched media item %s [%s/%s]' %
                  (url, retrieved_bytes, content_length))

        # If the server doesn't provide a content-length and this isn't
        # a partial fetch, determine the size by looking at the retrieved
        # content
        if not content_length and not partial_fetch:
            media_file.seek(0, 2)
            content_length = media_file.tell()

        return (http_resp.headers.get('content-type'), content_length,
                media_file)
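fetch_media returns the open file with its cursor at the end of what was written, which is why the enrichment tasks above call seek(0) first. A hypothetical usage, with enricher standing in for an instance of the class:

    # Partially fetch a large file, then read from the returned temp file
    content_type, size, media_file = enricher.fetch_media(
        object_id='ori:123',                # illustrative id
        url='https://example.org/big.pdf',  # illustrative URL
        partial_fetch=True,                 # stop after roughly 2 MB
    )
    try:
        media_file.seek(0)  # rewind before reading, as the tasks do
        header = media_file.read(1024)
    finally:
        media_file.close()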
Example #8
    def test_modified_data_source(self, mocked_download_file):
        with open(os.path.join(self.PWD, "..", "test_dumps/notubiz_meeting_amsterdam.json"), 'rb') as f:
            data1 = f.read()

        with open(os.path.join(self.PWD, "..", "test_dumps/notubiz_meeting_amsterdam_update1.json"), 'rb') as f:
            data2 = f.read()

        # The second (and, if made, third) call to _download_file returns the updated data source
        mocked_download_file.side_effect = [data1, data2, data2]

        # The download will be mocked so this is just for show
        url = "https://api.notubiz.nl/events/meetings/458902?format=json&version=1.10.8"
        self.mixin.fetch(url, datetime.datetime(2018, 11, 30, 12, 0))
        sleep(1)
        self.mixin.fetch(url, datetime.datetime(2018, 11, 30, 12, 1))
        sleep(1)
        self.mixin.fetch(url, datetime.datetime(2018, 11, 30, 12, 1))

        url_hash = get_sha1_hash(url)
        base_path = self.mixin.base_path(url_hash)

        file_count = len(glob.glob(base_path + "*"))
        self.assertEqual(file_count, 2)
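The assertion follows from the version logic in Example #4: the first fetch caches one version; the second fetch carries a later modified date (12:01 > 12:00), so a second version is written; the third fetch repeats the same modified date, which is not newer than the cached version, so it is served from cache and no third file appears.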