Example #1
    def create(self, docs, **kwargs):
        new_guids = []
        provider = self.get_provider()
        for doc in docs:
            if not doc.get('desk'):
                # if no desk is selected then it is a bad request
                raise SuperdeskApiError.badRequestError("Destination desk cannot be empty.")
            try:
                archived_doc = self.fetch(doc['guid'])
            except FileNotFoundError as ex:
                raise ProviderError.externalProviderError(ex, provider)

            dest_doc = dict(archived_doc)
            new_id = generate_guid(type=GUID_TAG)
            new_guids.append(new_id)
            dest_doc['_id'] = new_id
            generate_unique_id_and_name(dest_doc)

            if provider:
                dest_doc['ingest_provider'] = str(provider[superdesk.config.ID_FIELD])

            dest_doc[config.VERSION] = 1
            send_to(doc=dest_doc, update=None, desk_id=doc.get('desk'), stage_id=doc.get('stage'))
            dest_doc[ITEM_STATE] = doc.get(ITEM_STATE, CONTENT_STATE.FETCHED)
            dest_doc[INGEST_ID] = archived_doc['_id']
            dest_doc[FAMILY_ID] = archived_doc['_id']
            remove_unwanted(dest_doc)
            set_original_creator(dest_doc)

            superdesk.get_resource_service(ARCHIVE).post([dest_doc])
            insert_into_versions(dest_doc.get('_id'))

        return new_guids
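
A note on the shared shape of these variants: every create/fetch below copies the source document, re-keys it with a fresh GUID, resets the version, routes it to the requested desk/stage, and posts it to the archive. A minimal sketch of that skeleton, using only names that appear in the examples (the helper name itself is hypothetical):

    def copy_to_archive(src_doc, desk_id, stage_id, provider=None):
        # Hypothetical distillation of the pattern shared by the examples in this section.
        dest_doc = dict(src_doc)
        new_id = generate_guid(type=GUID_TAG)
        dest_doc['_id'] = new_id
        generate_unique_id_and_name(dest_doc)
        if provider:
            dest_doc['ingest_provider'] = str(provider[superdesk.config.ID_FIELD])
        dest_doc[config.VERSION] = 1
        send_to(doc=dest_doc, update=None, desk_id=desk_id, stage_id=stage_id)
        dest_doc[ITEM_STATE] = CONTENT_STATE.FETCHED
        dest_doc[INGEST_ID] = dest_doc[FAMILY_ID] = src_doc['_id']
        remove_unwanted(dest_doc)
        set_original_creator(dest_doc)
        superdesk.get_resource_service(ARCHIVE).post([dest_doc])
        insert_into_versions(dest_doc['_id'])
        return new_id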
Example #2
    def create(self, docs, **kwargs):
        new_guids = []
        provider = get_resource_service('ingest_providers').find_one(source='aapmm', req=None)
        for doc in docs:
            if not doc.get('desk'):
                # if no desk is selected then it is a bad request
                raise SuperdeskApiError.badRequestError("Destination desk cannot be empty.")

            archived_doc = self.backend.find_one_raw(doc['guid'], doc['guid'])

            dest_doc = dict(archived_doc)
            new_id = generate_guid(type=GUID_TAG)
            new_guids.append(new_id)
            dest_doc['_id'] = new_id
            generate_unique_id_and_name(dest_doc)

            if provider:
                dest_doc['ingest_provider'] = str(provider[superdesk.config.ID_FIELD])

            dest_doc[config.VERSION] = 1
            send_to(doc=dest_doc, update=None, desk_id=doc.get('desk'), stage_id=doc.get('stage'))
            dest_doc[config.CONTENT_STATE] = doc.get('state', STATE_FETCHED)
            dest_doc[INGEST_ID] = archived_doc['_id']
            dest_doc[FAMILY_ID] = archived_doc['_id']
            remove_unwanted(dest_doc)
            set_original_creator(dest_doc)

            superdesk.get_resource_service(ARCHIVE).post([dest_doc])
            insert_into_versions(dest_doc.get('_id'))

        return new_guids
Example #3
    def create(self, docs, **kwargs):
        new_guids = []
        provider = get_resource_service("ingest_providers").find_one(source="aapmm", req=None)
        if provider and "config" in provider and "username" in provider["config"]:
            self.backend.set_credentials(provider["config"]["username"], provider["config"]["password"])
        for doc in docs:
            if not doc.get("desk"):
                # if no desk is selected then it is a bad request
                raise SuperdeskApiError.badRequestError("Destination desk cannot be empty.")
            try:
                archived_doc = self.backend.find_one_raw(doc["guid"], doc["guid"])
            except FileNotFoundError as ex:
                raise ProviderError.externalProviderError(ex, provider)

            dest_doc = dict(archived_doc)
            new_id = generate_guid(type=GUID_TAG)
            new_guids.append(new_id)
            dest_doc["_id"] = new_id
            generate_unique_id_and_name(dest_doc)

            if provider:
                dest_doc["ingest_provider"] = str(provider[superdesk.config.ID_FIELD])

            dest_doc[config.VERSION] = 1
            send_to(doc=dest_doc, update=None, desk_id=doc.get("desk"), stage_id=doc.get("stage"))
            dest_doc[ITEM_STATE] = doc.get(ITEM_STATE, CONTENT_STATE.FETCHED)
            dest_doc[INGEST_ID] = archived_doc["_id"]
            dest_doc[FAMILY_ID] = archived_doc["_id"]
            remove_unwanted(dest_doc)
            set_original_creator(dest_doc)

            superdesk.get_resource_service(ARCHIVE).post([dest_doc])
            insert_into_versions(dest_doc.get("_id"))

        return new_guids
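
One caveat in this variant (and Example #11 below): the guard only checks for "username" before reading provider["config"]["password"], so a provider config carrying a username but no password raises KeyError. A slightly more defensive sketch of just the credentials step inside create(), under the same names:

    creds = (provider or {}).get("config") or {}
    if creds.get("username") and creds.get("password"):
        self.backend.set_credentials(creds["username"], creds["password"])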
Example #4
    def create(self, docs, **kwargs):
        new_guids = []
        provider = get_resource_service('ingest_providers').find_one(
            source='aapmm', req=None)
        for doc in docs:
            if not doc.get('desk'):
                # if no desk is selected then it is a bad request
                raise SuperdeskApiError.badRequestError(
                    "Destination desk cannot be empty.")

            archived_doc = self.backend.find_one_raw(doc['guid'], doc['guid'])

            dest_doc = dict(archived_doc)
            new_id = generate_guid(type=GUID_TAG)
            new_guids.append(new_id)
            dest_doc['_id'] = new_id
            generate_unique_id_and_name(dest_doc)

            if provider:
                dest_doc['ingest_provider'] = str(
                    provider[superdesk.config.ID_FIELD])

            dest_doc[config.VERSION] = 1
            send_to(dest_doc, doc.get('desk'), doc.get('stage'))
            dest_doc[config.CONTENT_STATE] = doc.get('state', STATE_FETCHED)
            dest_doc[INGEST_ID] = archived_doc['_id']
            dest_doc[FAMILY_ID] = archived_doc['_id']
            remove_unwanted(dest_doc)
            set_original_creator(dest_doc)

            superdesk.get_resource_service(ARCHIVE).post([dest_doc])
            insert_into_versions(dest_doc.get('_id'))

        return new_guids
Example #5
    def fetch(self, docs, id=None, **kwargs):
        id_of_fetched_items = []

        for doc in docs:
            id_of_item_to_be_fetched = doc.get('_id') if id is None else id

            desk_id = doc.get('desk')
            stage_id = doc.get('stage')

            ingest_service = get_resource_service('ingest')
            ingest_doc = ingest_service.find_one(req=None,
                                                 _id=id_of_item_to_be_fetched)

            if not ingest_doc:
                raise SuperdeskApiError.notFoundError(
                    'Failed to find ingest item with _id: %s' %
                    id_of_item_to_be_fetched)

            if not is_workflow_state_transition_valid(
                    'fetch_from_ingest', ingest_doc[config.CONTENT_STATE]):
                raise InvalidStateTransitionError()

            if doc.get('macro'):  # there is a macro so transform it
                ingest_doc = get_resource_service('macros').execute_macro(
                    ingest_doc, doc.get('macro'))

            archived = utcnow()
            ingest_service.patch(id_of_item_to_be_fetched,
                                 {'archived': archived})

            dest_doc = dict(ingest_doc)
            new_id = generate_guid(type=GUID_TAG)
            id_of_fetched_items.append(new_id)
            dest_doc['_id'] = new_id
            dest_doc['guid'] = new_id
            dest_doc['destination_groups'] = doc.get('destination_groups')
            generate_unique_id_and_name(dest_doc)

            dest_doc[config.VERSION] = 1
            send_to(dest_doc, desk_id, stage_id)
            dest_doc[config.CONTENT_STATE] = doc.get('state', STATE_FETCHED)
            dest_doc[INGEST_ID] = dest_doc[FAMILY_ID] = ingest_doc['_id']

            remove_unwanted(dest_doc)
            set_original_creator(dest_doc)
            self.__fetch_items_in_package(dest_doc, desk_id, stage_id,
                                          doc.get('state', STATE_FETCHED),
                                          doc.get('destination_groups'))

            get_resource_service(ARCHIVE).post([dest_doc])
            insert_into_versions(doc=dest_doc)
            build_custom_hateoas(custom_hateoas, dest_doc)
            doc.update(dest_doc)

        if kwargs.get('notify', True):
            push_notification('item:fetch', fetched=1)

        return id_of_fetched_items
Example #6
    def create(self, docs, **kwargs):
        search_provider = get_resource_service('search_providers').find_one(
            search_provider=PROVIDER_NAME, req=None)

        if not search_provider or search_provider.get('is_closed', False):
            raise SuperdeskApiError.badRequestError(
                'No search provider found or the search provider is closed.')

        if 'config' in search_provider:
            self.backend.set_credentials(search_provider['config'])

        new_guids = []
        for doc in docs:
            if not doc.get(
                    'desk'):  # if no desk is selected then it is a bad request
                raise SuperdeskApiError.badRequestError(
                    "Destination desk cannot be empty.")

            try:
                archived_doc = self.backend.find_one_raw(
                    doc['guid'], doc['guid'])
            except FileNotFoundError as ex:
                raise ProviderError.externalProviderError(ex, search_provider)

            dest_doc = dict(archived_doc)
            new_id = generate_guid(type=GUID_TAG)
            new_guids.append(new_id)
            dest_doc[config.ID_FIELD] = new_id
            generate_unique_id_and_name(dest_doc)

            if search_provider:
                dest_doc['ingest_provider'] = str(
                    search_provider[config.ID_FIELD])

            dest_doc[config.VERSION] = 1
            send_to(doc=dest_doc,
                    update=None,
                    desk_id=doc.get('desk'),
                    stage_id=doc.get('stage'))
            dest_doc[ITEM_STATE] = doc.get(ITEM_STATE, CONTENT_STATE.FETCHED)
            dest_doc[INGEST_ID] = archived_doc[config.ID_FIELD]
            dest_doc[FAMILY_ID] = archived_doc[config.ID_FIELD]
            dest_doc[ITEM_OPERATION] = ITEM_FETCH
            remove_unwanted(dest_doc)
            set_original_creator(dest_doc)

            superdesk.get_resource_service(ARCHIVE).post([dest_doc])
            insert_into_versions(dest_doc[config.ID_FIELD])

            get_resource_service('search_providers').system_update(
                search_provider[config.ID_FIELD],
                {'last_item_update': utcnow()}, search_provider)

        return new_guids
Example #7
    def fetch(self, docs, id=None, **kwargs):
        id_of_fetched_items = []

        for doc in docs:
            id_of_item_to_be_fetched = doc.get('_id') if id is None else id

            desk_id = doc.get('desk')
            stage_id = doc.get('stage')

            ingest_service = get_resource_service('ingest')
            ingest_doc = ingest_service.find_one(req=None, _id=id_of_item_to_be_fetched)

            if not ingest_doc:
                raise SuperdeskApiError.notFoundError('Failed to find ingest item with _id: %s' %
                                                      id_of_item_to_be_fetched)

            if not is_workflow_state_transition_valid('fetch_from_ingest', ingest_doc[config.CONTENT_STATE]):
                raise InvalidStateTransitionError()

            if doc.get('macro'):  # there is a macro so transform it
                ingest_doc = get_resource_service('macros').execute_macro(ingest_doc, doc.get('macro'))

            archived = utcnow()
            ingest_service.patch(id_of_item_to_be_fetched, {'archived': archived})

            dest_doc = dict(ingest_doc)
            new_id = generate_guid(type=GUID_TAG)
            id_of_fetched_items.append(new_id)
            dest_doc['_id'] = new_id
            dest_doc['guid'] = new_id
            dest_doc['destination_groups'] = doc.get('destination_groups')
            generate_unique_id_and_name(dest_doc)

            dest_doc[config.VERSION] = 1
            send_to(dest_doc, desk_id, stage_id)
            dest_doc[config.CONTENT_STATE] = doc.get('state', STATE_FETCHED)
            dest_doc[INGEST_ID] = dest_doc[FAMILY_ID] = ingest_doc['_id']

            remove_unwanted(dest_doc)
            set_original_creator(dest_doc)
            self.__fetch_items_in_package(dest_doc, desk_id, stage_id,
                                          doc.get('state', STATE_FETCHED),
                                          doc.get('destination_groups'))

            get_resource_service(ARCHIVE).post([dest_doc])
            insert_into_versions(doc=dest_doc)
            build_custom_hateoas(custom_hateoas, dest_doc)
            doc.update(dest_doc)

        if kwargs.get('notify', True):
            push_notification('item:fetch', fetched=1)

        return id_of_fetched_items
Example #8
    def fetch(self, docs, id=None, **kwargs):
        id_of_fetched_items = []

        for doc in docs:
            id_of_item_to_be_fetched = doc.get("_id") if id is None else id

            desk_id = doc.get("desk")
            stage_id = doc.get("stage")

            ingest_service = get_resource_service("ingest")
            ingest_doc = ingest_service.find_one(req=None, _id=id_of_item_to_be_fetched)

            if not ingest_doc:
                raise SuperdeskApiError.notFoundError(
                    "Failed to find ingest item with _id: %s" % id_of_item_to_be_fetched
                )

            if not is_workflow_state_transition_valid("fetch_from_ingest", ingest_doc[ITEM_STATE]):
                raise InvalidStateTransitionError()

            if doc.get("macro"):  # there is a macro so transform it
                ingest_doc = get_resource_service("macros").execute_macro(ingest_doc, doc.get("macro"))

            archived = utcnow()
            ingest_service.patch(id_of_item_to_be_fetched, {"archived": archived})

            dest_doc = dict(ingest_doc)
            new_id = generate_guid(type=GUID_TAG)
            id_of_fetched_items.append(new_id)
            dest_doc["_id"] = new_id
            dest_doc["guid"] = new_id
            generate_unique_id_and_name(dest_doc)

            dest_doc[config.VERSION] = 1
            send_to(doc=dest_doc, desk_id=desk_id, stage_id=stage_id)
            dest_doc[ITEM_STATE] = doc.get(ITEM_STATE, CONTENT_STATE.FETCHED)
            dest_doc[INGEST_ID] = dest_doc[FAMILY_ID] = ingest_doc["_id"]
            dest_doc[ITEM_OPERATION] = ITEM_FETCH

            remove_unwanted(dest_doc)
            set_original_creator(dest_doc)
            self.__fetch_items_in_package(dest_doc, desk_id, stage_id, doc.get(ITEM_STATE, CONTENT_STATE.FETCHED))

            get_resource_service(ARCHIVE).post([dest_doc])
            insert_into_versions(doc=dest_doc)
            build_custom_hateoas(custom_hateoas, dest_doc)
            doc.update(dest_doc)

        if kwargs.get("notify", True):
            push_notification("item:fetch", fetched=1)

        return id_of_fetched_items
Example #9
    def ingest_items_for(self, desk, no_of_stories, skip_index):
        desk_id = desk['_id']
        stage_id = desk['incoming_stage']

        bucket_size = min(100, no_of_stories)

        no_of_buckets = len(range(0, no_of_stories, bucket_size))

        for x in range(0, no_of_buckets):
            skip = x * bucket_size * skip_index
            logger.info('Page: {}, skip: {}'.format(x + 1, skip))
            cursor = get_resource_service('published').get_from_mongo(None, {})
            cursor.skip(skip)
            cursor.limit(bucket_size)
            items = list(cursor)
            logger.info('Inserting {} items'.format(len(items)))
            archive_items = []

            for item in items:
                dest_doc = dict(item)
                new_id = generate_guid(type=GUID_TAG)
                dest_doc[app.config['ID_FIELD']] = new_id
                dest_doc['guid'] = new_id
                generate_unique_id_and_name(dest_doc)

                dest_doc[app.config['VERSION']] = 1
                dest_doc[ITEM_STATE] = CONTENT_STATE.FETCHED
                user_id = desk.get('members', [{'user': None}])[0].get('user')
                dest_doc['original_creator'] = user_id
                dest_doc['version_creator'] = user_id

                from apps.tasks import send_to
                send_to(dest_doc,
                        desk_id=desk_id,
                        stage_id=stage_id,
                        user_id=user_id)
                dest_doc[app.config[
                    'VERSION']] = 1  # the step above increments the version, so reset it
                dest_doc[FAMILY_ID] = item['_id']

                remove_unwanted(dest_doc)
                archive_items.append(dest_doc)

            get_resource_service(ARCHIVE).post(archive_items)
            for item in archive_items:
                insert_into_versions(id_=item[app.config['ID_FIELD']])
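
Two details of the paging arithmetic above are easy to misread: len(range(0, no_of_stories, bucket_size)) is an integer ceiling of no_of_stories / bucket_size, and skip = x * bucket_size * skip_index stretches the page stride, so with bucket_size=100 and skip_index=2 the pages start at offsets 0, 200, 400. A quick check of the bucket count, as a sketch:

    # ceil(no_of_stories / bucket_size) without floats:
    # 250 stories in buckets of 100 need 3 buckets.
    no_of_stories, bucket_size = 250, 100
    assert len(range(0, no_of_stories, bucket_size)) == -(-no_of_stories // bucket_size) == 3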
Example #10
    def create(self, docs, **kwargs):
        search_provider = get_resource_service('search_providers').find_one(search_provider=PROVIDER_NAME, req=None)

        if not search_provider or search_provider.get('is_closed', False):
            raise SuperdeskApiError.badRequestError('No search provider found or the search provider is closed.')

        if 'config' in search_provider:
            self.backend.set_credentials(search_provider['config'])

        new_guids = []
        for doc in docs:
            if not doc.get('desk'):  # if no desk is selected then it is a bad request
                raise SuperdeskApiError.badRequestError("Destination desk cannot be empty.")

            try:
                archived_doc = self.backend.find_one_raw(doc['guid'], doc['guid'])
            except FileNotFoundError as ex:
                raise ProviderError.externalProviderError(ex, search_provider)

            dest_doc = dict(archived_doc)
            new_id = generate_guid(type=GUID_TAG)
            new_guids.append(new_id)
            dest_doc[config.ID_FIELD] = new_id
            generate_unique_id_and_name(dest_doc)

            if search_provider:
                dest_doc['ingest_provider'] = str(search_provider[config.ID_FIELD])

            dest_doc[config.VERSION] = 1
            send_to(doc=dest_doc, update=None, desk_id=doc.get('desk'), stage_id=doc.get('stage'))
            dest_doc[ITEM_STATE] = doc.get(ITEM_STATE, CONTENT_STATE.FETCHED)
            dest_doc[INGEST_ID] = archived_doc[config.ID_FIELD]
            dest_doc[FAMILY_ID] = archived_doc[config.ID_FIELD]
            dest_doc[ITEM_OPERATION] = ITEM_FETCH
            remove_unwanted(dest_doc)
            set_original_creator(dest_doc)

            superdesk.get_resource_service(ARCHIVE).post([dest_doc])
            insert_into_versions(dest_doc[config.ID_FIELD])

            get_resource_service('search_providers').system_update(search_provider[config.ID_FIELD],
                                                                   {'last_item_update': utcnow()}, search_provider)

        return new_guids
Example #11
    def create(self, docs, **kwargs):
        new_guids = []
        provider = get_resource_service('ingest_providers').find_one(
            source='aapmm', req=None)
        if provider and 'config' in provider and 'username' in provider[
                'config']:
            self.backend.set_credentials(provider['config']['username'],
                                         provider['config']['password'])
        for doc in docs:
            if not doc.get('desk'):
                # if no desk is selected then it is a bad request
                raise SuperdeskApiError.badRequestError(
                    "Destination desk cannot be empty.")
            try:
                archived_doc = self.backend.find_one_raw(
                    doc['guid'], doc['guid'])
            except FileNotFoundError as ex:
                raise ProviderError.externalProviderError(ex, provider)

            dest_doc = dict(archived_doc)
            new_id = generate_guid(type=GUID_TAG)
            new_guids.append(new_id)
            dest_doc['_id'] = new_id
            generate_unique_id_and_name(dest_doc)

            if provider:
                dest_doc['ingest_provider'] = str(
                    provider[superdesk.config.ID_FIELD])

            dest_doc[config.VERSION] = 1
            send_to(doc=dest_doc,
                    update=None,
                    desk_id=doc.get('desk'),
                    stage_id=doc.get('stage'))
            dest_doc[ITEM_STATE] = doc.get(ITEM_STATE, CONTENT_STATE.FETCHED)
            dest_doc[INGEST_ID] = archived_doc['_id']
            dest_doc[FAMILY_ID] = archived_doc['_id']
            remove_unwanted(dest_doc)
            set_original_creator(dest_doc)

            superdesk.get_resource_service(ARCHIVE).post([dest_doc])
            insert_into_versions(dest_doc.get('_id'))

        return new_guids
Example #12
    def create(self, docs, **kwargs):
        new_guids = []
        provider = self.get_provider()
        for doc in docs:
            if not doc.get('desk'):
                # if no desk is selected then it is a bad request
                raise SuperdeskApiError.badRequestError(
                    _("Destination desk cannot be empty."))
            try:
                archived_doc = self.fetch(doc['guid'])
            except FileNotFoundError as ex:
                raise ProviderError.externalProviderError(ex, provider)

            dest_doc = dict(archived_doc)
            new_id = generate_guid(type=GUID_TAG)
            new_guids.append(new_id)
            dest_doc['_id'] = new_id
            generate_unique_id_and_name(dest_doc)

            if provider:
                dest_doc['ingest_provider'] = str(
                    provider[superdesk.config.ID_FIELD])

            dest_doc[config.VERSION] = 1
            send_to(doc=dest_doc,
                    update=None,
                    desk_id=doc.get('desk'),
                    stage_id=doc.get('stage'))
            dest_doc[ITEM_STATE] = doc.get(ITEM_STATE, CONTENT_STATE.FETCHED)
            dest_doc[INGEST_ID] = archived_doc['_id']
            dest_doc[FAMILY_ID] = archived_doc['_id']
            dest_doc[ITEM_OPERATION] = ITEM_FETCH
            remove_unwanted(dest_doc)
            set_original_creator(dest_doc)

            superdesk.get_resource_service(ARCHIVE).post([dest_doc])
            insert_into_versions(dest_doc.get('_id'))

        if new_guids:
            get_resource_service('search_providers').system_update(
                provider.get(config.ID_FIELD), {'last_item_update': utcnow()},
                provider)

        return new_guids
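
Unlike Examples #6 and #10, which write the search provider's last_item_update once per fetched item inside the loop, this variant updates the provider a single time after the loop, and only when something was actually fetched (if new_guids:). For multi-item requests that saves one system_update round-trip per item.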
Example #13
    def ingest_items_for(self, desk, no_of_stories, skip_index):
        desk_id = desk['_id']
        stage_id = desk['incoming_stage']

        bucket_size = min(100, no_of_stories)

        no_of_buckets = len(range(0, no_of_stories, bucket_size))

        for x in range(0, no_of_buckets):
            skip = x * bucket_size * skip_index
            logger.info('Page: {}, skip: {}'.format(x + 1, skip))
            cursor = get_resource_service('published').get_from_mongo(None, {})
            cursor.skip(skip)
            cursor.limit(bucket_size)
            items = list(cursor)
            logger.info('Inserting {} items'.format(len(items)))
            archive_items = []

            for item in items:
                dest_doc = dict(item)
                new_id = generate_guid(type=GUID_TAG)
                dest_doc[app.config['ID_FIELD']] = new_id
                dest_doc['guid'] = new_id
                generate_unique_id_and_name(dest_doc)

                dest_doc[app.config['VERSION']] = 1
                dest_doc[ITEM_STATE] = CONTENT_STATE.FETCHED
                user_id = desk.get('members', [{'user': None}])[0].get('user')
                dest_doc['original_creator'] = user_id
                dest_doc['version_creator'] = user_id

                from apps.tasks import send_to
                send_to(dest_doc, desk_id=desk_id, stage_id=stage_id, user_id=user_id)
                dest_doc[app.config['VERSION']] = 1  # the step above increments the version, so reset it
                dest_doc[FAMILY_ID] = item['_id']

                remove_unwanted(dest_doc)
                archive_items.append(dest_doc)

            get_resource_service(ARCHIVE).post(archive_items)
            for item in archive_items:
                insert_into_versions(id_=item[app.config['ID_FIELD']])
Example #14
    def ingest_items_for(self, desk, no_of_stories, skip_index):
        desk_id = desk['_id']
        stage_id = desk['incoming_stage']

        bucket_size = min(100, no_of_stories)

        no_of_buckets = len(range(0, no_of_stories, bucket_size))

        for x in range(0, no_of_buckets):
            skip = x * bucket_size * skip_index
            self.logger.info('Page: {}, skip: {}'.format(x + 1, skip))
            cursor = get_resource_service('text_archive').get_from_mongo(None, {})
            cursor.skip(skip)
            cursor.limit(bucket_size)
            items = list(cursor)
            self.logger.info('Inserting {} items'.format(len(items)))
            archive_items = []

            for item in items:
                dest_doc = dict(item)
                new_id = generate_guid(type=GUID_TAG)
                dest_doc['_id'] = new_id
                dest_doc['guid'] = new_id
                generate_unique_id_and_name(dest_doc)

                dest_doc[app.config['VERSION']] = 1
                dest_doc['state'] = 'fetched'
                user_id = desk.get('members', [{'user': None}])[0].get('user')
                dest_doc['original_creator'] = user_id
                dest_doc['version_creator'] = user_id
                send_to(dest_doc, desk_id=desk_id, stage_id=stage_id, user_id=user_id)
                dest_doc[FAMILY_ID] = item['_id']

                remove_unwanted(dest_doc)
                archive_items.append(dest_doc)

            get_resource_service(ARCHIVE).post(archive_items)
            for item in archive_items:
                insert_into_versions(id_=item['_id'])
Example #15
    def _process_bunch(self, x):
        # x.findall('dc_rest_docs/dc_rest_doc')[0].get('href')
        items = []
        for doc in x.findall('dc_rest_docs/dc_rest_doc'):
            try:
                # print(doc.get('href'))
                id = doc.find('dcdossier').get('id')
                if self._direction:
                    if int(id) > self._id:
                        self._id = int(id)
                else:
                    if int(id) < self._id:
                        self._id = int(id)
                item = {}
                item['guid'] = doc.find('dcdossier').get('guid')

                # if the item has been modified in the archive then it is due to a kill
                # there is an argument that this item should not be imported at all
                if doc.find('dcdossier').get('created') != doc.find('dcdossier').get('modified'):
                    # item[ITEM_STATE] = CONTENT_STATE.KILLED
                    continue
                else:
                    item[ITEM_STATE] = CONTENT_STATE.PUBLISHED

                value = datetime.strptime(self._get_head_value(doc, 'PublicationDate'), '%Y%m%d%H%M%S')
                local_tz = pytz.timezone('Australia/Sydney')
                try:
                    aus_dt = local_tz.localize(value, is_dst=None)
                except NonExistentTimeError:
                    aus_dt = local_tz.localize(value, is_dst=True)
                except AmbiguousTimeError:
                    aus_dt = local_tz.localize(value, is_dst=False)

                item['firstcreated'] = aus_dt.astimezone(pytz.utc)
                item['versioncreated'] = item['firstcreated']

                generate_unique_id_and_name(item)
                item['ingest_id'] = id

                item['source'] = self._get_head_value(doc, 'Agency')

    #            self._addkeywords('AsiaPulseCodes', doc, item)

                byline = self._get_head_value(doc, 'Byline')
                if byline:
                    item['byline'] = byline

                # item['service'] = self._get_head_value(doc,'Service')

                category = self._get_head_value(doc, 'Category')
                if not category:
                    publication_name = self._get_head_value(doc, 'PublicationName')
                    if publication_name in pubnames:
                        category = pubnames[publication_name]
                if category:
                    anpacategory = {}
                    anpacategory['qcode'] = category
                    for anpa_category in self._anpa_categories['items']:
                        if anpacategory['qcode'].lower() == anpa_category['qcode'].lower():
                            anpacategory = {'qcode': anpacategory['qcode'], 'name': anpa_category['name']}
                            break
                    item['anpa_category'] = [anpacategory]

    #           self._addkeywords('CompanyCodes', doc, item)

                type = self._get_head_value(doc, 'Format')
                if type == 'x':
                    item[ITEM_TYPE] = CONTENT_TYPE.TEXT
                elif type == 't':
                    item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED
                else:
                    item[ITEM_TYPE] = CONTENT_TYPE.TEXT

                item['keyword'] = self._get_head_value(doc, 'Keyword')
                item['ingest_provider_sequence'] = self._get_head_value(doc, 'Sequence')

                original_source = self._get_head_value(doc, 'Author')
                if original_source:
                    item['original_source'] = original_source

                item['headline'] = self._get_head_value(doc, 'Headline')

                code = self._get_head_value(doc, 'SubjectRefNum')
                if code and len(code) == 7:
                    code = '0' + code
                if code and code in subject_codes:
                    item['subject'] = []
                    item['subject'].append({'qcode': code, 'name': subject_codes[code]})
                    try:
                        process_iptc_codes(item, None)
                    except Exception:
                        pass

                slug = self._get_head_value(doc, 'SLUG')
                if slug:
                    item['slugline'] = slug
                else:
                    item['slugline'] = self._get_head_value(doc, 'Keyword')

                take_key = self._get_head_value(doc, 'Takekey')
                if take_key:
                    item['anpa_take_key'] = take_key

    #            self._addkeywords('Topic', doc, item)

    #            self._addkeywords('Selectors', doc, item)

                el = doc.find('dcdossier/document/body/BodyText')
                if el is not None:
                    story = el.text
                    if item[ITEM_TYPE] == CONTENT_TYPE.TEXT:
                        story = story.replace('\n   ', '<br><br>')
                        story = story.replace('\n', '<br>')
                        item['body_html'] = story
                    else:
                        item['body_html'] = story
                    try:
                        item['word_count'] = get_text_word_count(item['body_html'])
                    except Exception:
                        pass

                item['pubstatus'] = 'usable'
                # this is required for the archived service additional lookup
                item['item_id'] = item['guid']
                item[config.VERSION] = 1
                item['flags'] = {'marked_archived_only': True}

                # item['_id'] = ObjectId(id.rjust(24,'0'))
                item['_id'] = ObjectId()
                items.append(item)

                if self._limit:
                    self._limit -= 1
                # print(item)
            except Exception:
                print('Exception parsing DC document {}'.format(id))

        try:
            res = superdesk.get_resource_service('archived')
            s = time.time()
            res.post(items)
            print('Posting batch to Superdesk took {:.2f} seconds'.format(time.time() - s))
        except Exception as ex:
            if getattr(ex, 'code', None) == 409:
                print('Key clash exception detected')
                # create a list of the guids we tried to post
                guids = [g['guid'] for g in items]
                # create a query for all those id's
                query = {
                    'size': self.BATCH_SIZE,
                    'query': {
                        'filtered': {
                            'filter': {
                                "terms": {
                                    "guid": [guids]
                                }
                            }
                        }
                    }
                }

                req = ParsedRequest()
                repos = 'archived'
                req.args = {'source': json.dumps(query), 'repo': repos}

                search_res = superdesk.get_resource_service('search')
                existing = search_res.get(req=req, lookup=None)
                existing_guids = [e['guid'] for e in existing]
                not_existing = [g for g in guids if g not in existing_guids]
                for missing_guid in not_existing:
                    i = [m for m in items if m['guid'] == missing_guid]
                    original = res.find_one(req=None, guid=i[0]['guid'])
                    if not original:
                        try:
                            s = time.time()
                            res.post(i)
                            print('Post single item to Superdesk in {:.2f} seconds'.format(time.time() - s))
                        except Exception:
                            print('Exception posting single item')
            else:
                print('Exception posting batch')
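
The Sydney-timestamp handling above is a reusable pattern: localize with is_dst=None so that pytz raises on DST gaps and overlaps, then settle each case explicitly before converting to UTC. As a standalone sketch (assuming pytz, which the surrounding code already uses):

    import pytz
    from pytz.exceptions import AmbiguousTimeError, NonExistentTimeError

    def sydney_to_utc(naive_value):
        # is_dst=None makes localize() raise instead of silently guessing.
        local_tz = pytz.timezone('Australia/Sydney')
        try:
            aus_dt = local_tz.localize(naive_value, is_dst=None)
        except NonExistentTimeError:
            # The clock jumped forward over this local time; treat it as DST.
            aus_dt = local_tz.localize(naive_value, is_dst=True)
        except AmbiguousTimeError:
            # The clock fell back over this local time; prefer the non-DST reading.
            aus_dt = local_tz.localize(naive_value, is_dst=False)
        return aus_dt.astimezone(pytz.utc)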
Example #16
    def _process_bunch(self, x):
        # x.findall('dc_rest_docs/dc_rest_doc')[0].get('href')
        for doc in x.findall('dc_rest_docs/dc_rest_doc'):
            print(doc.get('href'))
            id = doc.find('dcdossier').get('id')
            if int(id) < self._id:
                self._id = int(id)
            item = {}
            item['guid'] = doc.find('dcdossier').get('guid')

            # if the item has been modified in the archive then it is due to a kill
            # there is an argument that this item should not be imported at all
            if doc.find('dcdossier').get('created') != doc.find(
                    'dcdossier').get('modified'):
                item[ITEM_STATE] = CONTENT_STATE.KILLED
            else:
                item[ITEM_STATE] = CONTENT_STATE.PUBLISHED

            value = datetime.strptime(
                self._get_head_value(doc, 'PublicationDate'), '%Y%m%d%H%M%S')
            item['firstcreated'] = utc.normalize(
                value) if value.tzinfo else value
            item['versioncreated'] = item['firstcreated']

            generate_unique_id_and_name(item)
            item['ingest_id'] = id

            item['source'] = self._get_head_value(doc, 'Agency')

            self._addkeywords('AsiaPulseCodes', doc, item)

            byline = self._get_head_value(doc, 'Byline')
            if byline:
                item['byline'] = byline

            # item['service'] = self._get_head_value(doc,'Service')

            category = self._get_head_value(doc, 'Category')
            if not category:
                publication_name = self._get_head_value(doc, 'PublicationName')
                if publication_name in pubnames:
                    category = pubnames[publication_name]
            if category:
                anpacategory = {}
                anpacategory['qcode'] = category
                for anpa_category in self._anpa_categories['items']:
                    if anpacategory['qcode'].lower() == anpa_category['qcode'].lower():
                        anpacategory = {
                            'qcode': anpacategory['qcode'],
                            'name': anpa_category['name']
                        }
                        break
                item['anpa_category'] = [anpacategory]

            self._addkeywords('CompanyCodes', doc, item)

            type = self._get_head_value(doc, 'Format')
            if type == 'x':
                item[ITEM_TYPE] = CONTENT_TYPE.TEXT
            elif type == 't':
                item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED
            else:
                item[ITEM_TYPE] = CONTENT_TYPE.TEXT

            item['keyword'] = self._get_head_value(doc, 'Keyword')
            item['ingest_provider_sequence'] = self._get_head_value(
                doc, 'Sequence')

            original_source = self._get_head_value(doc, 'Author')
            if original_source:
                item['original_source'] = original_source

            item['headline'] = self._get_head_value(doc, 'Headline')

            code = self._get_head_value(doc, 'SubjectRefNum')
            if code and len(code) == 7:
                code = '0' + code
            if code and code in subject_codes:
                item['subject'] = []
                item['subject'].append({
                    'qcode': code,
                    'name': subject_codes[code]
                })
                try:
                    process_iptc_codes(item, None)
                except Exception:
                    pass

            slug = self._get_head_value(doc, 'SLUG')
            if slug:
                item['slugline'] = slug
            else:
                item['slugline'] = self._get_head_value(doc, 'Keyword')

            # self._addkeywords('Takekey', doc, item)
            take_key = self._get_head_value(doc, 'Takekey')
            if take_key:
                item['anpa_take_key'] = take_key

            self._addkeywords('Topic', doc, item)

            self._addkeywords('Selectors', doc, item)

            el = doc.find('dcdossier/document/body/BodyText')
            if el is not None:
                story = el.text
                if item[ITEM_TYPE] == CONTENT_TYPE.TEXT:
                    story = story.replace('\n   ', '<br><br>')
                    story = story.replace('\n', '<br>')
                    item['body_html'] = story
                else:
                    item['body_html'] = story
                try:
                    item['word_count'] = get_text_word_count(item['body_html'])
                except Exception:
                    pass

            item['pubstatus'] = 'usable'
            item['allow_post_publish_actions'] = False

            res = superdesk.get_resource_service('published')
            original = res.find_one(req=None, guid=item['guid'])
            if not original:
                item['_id'] = item['guid']
                res.post([item])
            else:
                res.patch(original['_id'], item)

            if self._limit:
                self._limit -= 1
Example #17
    def _process_bunch(self, x):
        # x.findall('dc_rest_docs/dc_rest_doc')[0].get('href')
        for doc in x.findall('dc_rest_docs/dc_rest_doc'):
            print(doc.get('href'))
            id = doc.find('dcdossier').get('id')
            if int(id) < self._id:
                self._id = int(id)
            item = {}
            item['guid'] = doc.find('dcdossier').get('guid')

            # if the item has been modified in the archive then it is due to a kill
            # there is an argument that this item should not be imported at all
            if doc.find('dcdossier').get('created') != doc.find('dcdossier').get('modified'):
                item[ITEM_STATE] = CONTENT_STATE.KILLED
            else:
                item[ITEM_STATE] = CONTENT_STATE.PUBLISHED

            value = datetime.strptime(self._get_head_value(doc, 'PublicationDate'), '%Y%m%d%H%M%S')
            item['firstcreated'] = utc.normalize(value) if value.tzinfo else value
            item['versioncreated'] = item['firstcreated']

            generate_unique_id_and_name(item)
            item['ingest_id'] = id

            item['source'] = self._get_head_value(doc, 'Agency')

            self._addkeywords('AsiaPulseCodes', doc, item)

            byline = self._get_head_value(doc, 'Byline')
            if byline:
                item['byline'] = byline

            # item['service'] = self._get_head_value(doc,'Service')

            category = self._get_head_value(doc, 'Category')
            if not category:
                publication_name = self._get_head_value(doc, 'PublicationName')
                if publication_name in pubnames:
                    category = pubnames[publication_name]
            if category:
                anpacategory = {}
                anpacategory['qcode'] = category
                for anpa_category in self._anpa_categories['items']:
                    if anpacategory['qcode'].lower() == anpa_category['qcode'].lower():
                        anpacategory = {'qcode': anpacategory['qcode'], 'name': anpa_category['name']}
                        break
                item['anpa_category'] = [anpacategory]

            self._addkeywords('CompanyCodes', doc, item)

            type = self._get_head_value(doc, 'Format')
            if type == 'x':
                item[ITEM_TYPE] = CONTENT_TYPE.TEXT
            elif type == 't':
                item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED
            else:
                item[ITEM_TYPE] = CONTENT_TYPE.TEXT

            item['keyword'] = self._get_head_value(doc, 'Keyword')
            item['ingest_provider_sequence'] = self._get_head_value(doc, 'Sequence')

            original_source = self._get_head_value(doc, 'Author')
            if original_source:
                item['original_source'] = original_source

            item['headline'] = self._get_head_value(doc, 'Headline')

            code = self._get_head_value(doc, 'SubjectRefNum')
            if code and len(code) == 7:
                code = '0' + code
            if code and code in subject_codes:
                item['subject'] = []
                item['subject'].append({'qcode': code, 'name': subject_codes[code]})
                try:
                    process_iptc_codes(item, None)
                except Exception:
                    pass

            slug = self._get_head_value(doc, 'SLUG')
            if slug:
                item['slugline'] = slug
            else:
                item['slugline'] = self._get_head_value(doc, 'Keyword')

            # self._addkeywords('Takekey', doc, item)
            take_key = self._get_head_value(doc, 'Takekey')
            if take_key:
                item['anpa_take_key'] = take_key

            self._addkeywords('Topic', doc, item)

            self._addkeywords('Selectors', doc, item)

            el = doc.find('dcdossier/document/body/BodyText')
            if el is not None:
                story = el.text
                if item[ITEM_TYPE] == CONTENT_TYPE.TEXT:
                    story = story.replace('\n   ', '<br><br>')
                    story = story.replace('\n', '<br>')
                    item['body_html'] = story
                else:
                    item['body_html'] = story
                try:
                    item['word_count'] = get_text_word_count(item['body_html'])
                except Exception:
                    pass

            item['pubstatus'] = 'usable'
            # this is required for the archived service additional lookup
            item['item_id'] = item['guid']
            item[config.VERSION] = 1

            res = superdesk.get_resource_service('archived')
            original = res.find_one(req=None, guid=item['guid'])
            if not original:
                item['_id'] = item['guid']
                res.post([item])
            else:
                res.patch(original['_id'], item)

            if self._limit:
                self._limit -= 1
Example #18
    def fetch(self, docs, id=None, **kwargs):
        id_of_fetched_items = []

        for doc in docs:
            id_of_item_to_be_fetched = doc.get(
                config.ID_FIELD) if id is None else id

            desk_id = doc.get('desk')
            stage_id = doc.get('stage')

            ingest_service = get_resource_service('ingest')
            ingest_doc = ingest_service.find_one(req=None,
                                                 _id=id_of_item_to_be_fetched)

            if not ingest_doc:
                raise SuperdeskApiError.notFoundError(
                    _('Failed to find ingest item with _id: {id}').format(
                        id=id_of_item_to_be_fetched))

            if not is_workflow_state_transition_valid('fetch_from_ingest',
                                                      ingest_doc[ITEM_STATE]):
                raise InvalidStateTransitionError()

            if doc.get('macro'):  # there is a macro so transform it
                ingest_doc = get_resource_service('macros').execute_macro(
                    ingest_doc, doc.get('macro'))

            archived = utcnow()
            ingest_service.patch(id_of_item_to_be_fetched,
                                 {'archived': archived})

            dest_doc = dict(ingest_doc)

            if doc.get('target'):
                dest_doc.update(doc.get('target'))

            new_id = generate_guid(type=GUID_TAG)
            id_of_fetched_items.append(new_id)
            dest_doc[config.ID_FIELD] = new_id
            dest_doc[GUID_FIELD] = new_id
            generate_unique_id_and_name(dest_doc)

            dest_doc[config.VERSION] = 1
            dest_doc['versioncreated'] = archived
            send_to(doc=dest_doc, desk_id=desk_id, stage_id=stage_id)
            dest_doc[ITEM_STATE] = doc.get(ITEM_STATE, CONTENT_STATE.FETCHED)
            dest_doc[INGEST_ID] = dest_doc[FAMILY_ID] = ingest_doc[
                config.ID_FIELD]
            dest_doc[ITEM_OPERATION] = ITEM_FETCH

            remove_unwanted(dest_doc)
            set_original_creator(dest_doc)
            self.__fetch_items_in_package(
                dest_doc, desk_id, stage_id,
                doc.get(ITEM_STATE, CONTENT_STATE.FETCHED))

            self.__fetch_associated_items(
                dest_doc, desk_id, stage_id,
                doc.get(ITEM_STATE, CONTENT_STATE.FETCHED))

            desk = get_resource_service('desks').find_one(req=None,
                                                          _id=desk_id)
            if desk and desk.get('default_content_profile'):
                dest_doc['profile'] = desk['default_content_profile']

            if dest_doc.get('type', 'text') in MEDIA_TYPES:
                dest_doc['profile'] = None

            get_resource_service(ARCHIVE).post([dest_doc])
            insert_into_versions(doc=dest_doc)
            build_custom_hateoas(custom_hateoas, dest_doc)
            superdesk.item_fetched.send(self,
                                        item=dest_doc,
                                        ingest_item=ingest_doc)
            doc.update(dest_doc)

        if kwargs.get('notify', True):
            ingest_doc.update({'task': dest_doc.get('task')})
            push_item_move_notification(ingest_doc, doc, 'item:fetch')

        return id_of_fetched_items
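
One caveat in this variant: the notification block after the loop reads ingest_doc and dest_doc, which are loop variables, so it reflects only the last document processed and would raise NameError on an empty docs list. Harmless for the common single-item fetch, but worth knowing for batch callers.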
Example #19
    def _process_bunch(self, x):
        # x.findall('dc_rest_docs/dc_rest_doc')[0].get('href')
        items = []
        for doc in x.findall('dc_rest_docs/dc_rest_doc'):
            try:
                # print(doc.get('href'))
                id = doc.find('dcdossier').get('id')
                if self._direction:
                    if int(id) > self._id:
                        self._id = int(id)
                else:
                    if int(id) < self._id:
                        self._id = int(id)
                item = {}
                item['guid'] = doc.find('dcdossier').get('guid')
                item[ITEM_TYPE] = CONTENT_TYPE.TEXT
                format = self._get_head_value(doc, 'Format')
                if format == 't':
                    item[FORMAT] = FORMATS.PRESERVED
                else:
                    item[FORMAT] = FORMATS.HTML
                # item[FORMAT] = FORMATS.HTML

                # if the item has been modified in the archive then it is due to a kill
                # there is an argument that this item should not be imported at all
                if doc.find('dcdossier').get('created') != doc.find('dcdossier').get('modified'):
                    # item[ITEM_STATE] = CONTENT_STATE.KILLED
                    continue
                else:
                    item[ITEM_STATE] = CONTENT_STATE.PUBLISHED

                value = datetime.strptime(self._get_head_value(doc, 'PublicationDate'), '%Y%m%d%H%M%S')
                local_tz = pytz.timezone('Australia/Sydney')
                try:
                    aus_dt = local_tz.localize(value, is_dst=None)
                except NonExistentTimeError:
                    aus_dt = local_tz.localize(value, is_dst=True)
                except AmbiguousTimeError:
                    aus_dt = local_tz.localize(value, is_dst=False)

                item['firstcreated'] = aus_dt.astimezone(pytz.utc)
                item['versioncreated'] = item['firstcreated']

                generate_unique_id_and_name(item)
                item['ingest_id'] = id

                last_line = None
                el = doc.find('dcdossier/document/body/BodyText')
                if el is not None:
                    story = el.text
                    lines = story.split('\n')
                    if len(lines) > 0:
                        last_line = lines[-1]
                    if item.get(FORMAT) == FORMATS.HTML:
                        story = story.replace('\n   ', '<p></p>')
                        story = story.replace('\n', '<br>')
                        item['body_html'] = '<p>' + story + '</p>'
                    else:
                        item['body_html'] = '<pre>' + story + '</pre>'
                    try:
                        item['word_count'] = get_text_word_count(item['body_html'])
                    except Exception:
                        pass
                else:
                    # Items with no body are ignored
                    continue

                item['source'] = self._get_head_value(doc, 'Agency')
                # if the source document contains no agency then by definition it is unknown
                if item['source'] is None:
                    item['source'] = 'UNKNOWN'
                else:
                    # check if the source of the document was Newscentre
                    dc_unique = doc.find('dcdossier').get('unique')
                    if dc_unique.startswith('NC.') and last_line is not None:
                        # The AFR summary articles all have agency values 25 chars long
                        if len(item['source']) == 25:
                            item['source'] = 'AAP'
                        # is it a numeric Agency
                        elif self._get_head_value(doc, 'Agency').isdigit():
                            sign_off = last_line.split(' ')
                            if len(sign_off) > 0:
                                item['source'] = sign_off[0].upper()
                            else:
                                item['source'] = last_line.upper()
                            # clean up what we have extracted
                            if item['source'].startswith('AAP'):
                                item['source'] = 'AAP'
                            else:
                                # make sure it is one of the known values
                                if item['source'] not in {'AAP', 'AP', 'REUT', 'Asia Pulse', 'DPA', 'AFP', 'RAW', 'NZA',
                                                          'NZPA', 'KRT', 'PA', 'PAA', 'SNI', 'REUTERS'}:
                                    print('Source: {}'.format(item['source']))
                                    item['source'] = 'UNKNOWN'

    #            self._addkeywords('AsiaPulseCodes', doc, item)

                byline = self._get_head_value(doc, 'Byline')
                if byline:
                    item['byline'] = byline

                # item['service'] = self._get_head_value(doc,'Service')

                category = self._get_head_value(doc, 'Category')
                if not category:
                    publication_name = self._get_head_value(doc, 'PublicationName')
                    if publication_name in pubnames:
                        category = pubnames[publication_name]
                if category:
                    anpacategory = {}
                    anpacategory['qcode'] = category
                    for anpa_category in self._anpa_categories['items']:
                        if anpacategory['qcode'].lower() == anpa_category['qcode'].lower():
                            anpacategory = {'qcode': anpacategory['qcode'], 'name': anpa_category['name']}
                            break
                    item['anpa_category'] = [anpacategory]

                self._addkeywords('CompanyCodes', doc, item)

                item['keyword'] = self._get_head_value(doc, 'Keyword')
                item['ingest_provider_sequence'] = self._get_head_value(doc, 'Sequence')

                original_source = self._get_head_value(doc, 'Author')
                if original_source:
                    item['original_source'] = original_source

                item['headline'] = self._get_head_value(doc, 'Headline')

                code = self._get_head_value(doc, 'SubjectRefNum')
                if code and len(code) == 7:
                    code = '0' + code
                if code and code in subject_codes:
                    item['subject'] = []
                    item['subject'].append({'qcode': code, 'name': subject_codes[code]})
                    try:
                        process_iptc_codes(item, None)
                    except Exception:
                        pass

                slug = self._get_head_value(doc, 'SLUG')
                if slug:
                    item['slugline'] = slug
                else:
                    item['slugline'] = self._get_head_value(doc, 'Keyword')

                take_key = self._get_head_value(doc, 'Takekey')
                if take_key:
                    item['anpa_take_key'] = take_key

                self._addkeywords('Topic', doc, item)

    #            self._addkeywords('Selectors', doc, item)

                item['pubstatus'] = 'usable'
                # this is required for the archived service additional lookup
                item['item_id'] = item['guid']
                item[config.VERSION] = 1
                item['flags'] = {'marked_archived_only': True}

                # item['_id'] = ObjectId(id.rjust(24,'0'))
                item['_id'] = ObjectId()
                items.append(item)

                if self._limit:
                    self._limit -= 1
                # print(item)
            except Exception:
                print('Exception parsing DC document {}'.format(id))

        try:
            res = superdesk.get_resource_service('archived')
            s = time.time()
            res.post(items)
            print('Posting batch to Superdesk took {:.2f} seconds'.format(time.time() - s))
        except Exception as ex:
            if getattr(ex, 'code', None) == 409:
                print('Key clash exception detected')
                # create a list of the guids we tried to post
                guids = [g['guid'] for g in items]
                # create a query for all those id's
                query = {
                    'size': self.BATCH_SIZE,
                    'query': {
                        'filtered': {
                            'filter': {
                                "terms": {
                                    "guid": [guids]
                                }
                            }
                        }
                    }
                }

                req = ParsedRequest()
                repos = 'archived'
                req.args = {'source': json.dumps(query), 'repo': repos}

                search_res = superdesk.get_resource_service('search')
                existing = search_res.get(req=req, lookup=None)
                existing_guids = [e['guid'] for e in existing]
                not_existing = [g for g in guids if g not in existing_guids]
                for missing_guid in not_existing:
                    i = [m for m in items if m['guid'] == missing_guid]
                    original = res.find_one(req=None, guid=i[0]['guid'])
                    if not original:
                        try:
                            s = time.time()
                            res.post(i)
                            print('Post single item to Superdesk in {:.2f} seconds'.format(time.time() - s))
                        except Exception:
                            print('Exception posting single item')
            else:
                print('Exception posting batch')
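The duplicate-recovery path above builds an Elasticsearch 1.x filtered query by hand, and the `terms` filter is easy to get wrong. Below is a minimal standalone sketch of that query builder, runnable outside Superdesk; the helper name build_existing_guids_query and the BATCH_SIZE value are assumptions, not part of the original code.

import json

BATCH_SIZE = 500  # assumed here; the originals use a class constant


def build_existing_guids_query(guids):
    """Return an ES 1.x filtered query matching any of the given guids."""
    return {
        'size': BATCH_SIZE,
        'query': {
            'filtered': {
                'filter': {
                    # `terms` expects a flat list of values; nesting it in
                    # another list ([guids]) matches nothing
                    'terms': {'guid': guids}
                }
            }
        }
    }


print(json.dumps(build_existing_guids_query(['urn:a1', 'urn:b2']), indent=2))

The flat-list detail is the point: the handlers above serialize this dict with json.dumps into the request's `source` arg, so a nested list silently finds no "existing" items and every clash is re-posted.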
Example #20
    def _process_bunch(self, x):
        # x.findall('dc_rest_docs/dc_rest_doc')[0].get('href')
        items = []
        for doc in x.findall('dc_rest_docs/dc_rest_doc'):
            try:
                # print(doc.get('href'))
                id = doc.find('dcdossier').get('id')
                if self._direction:
                    if int(id) > self._id:
                        self._id = int(id)
                else:
                    if int(id) < self._id:
                        self._id = int(id)
                item = {}
                item['guid'] = doc.find('dcdossier').get('guid')
                item[ITEM_TYPE] = CONTENT_TYPE.TEXT
                fmt = self._get_head_value(doc, 'Format')
                if fmt == 't':
                    item[FORMAT] = FORMATS.PRESERVED
                else:
                    item[FORMAT] = FORMATS.HTML

                # if the item has been modified in the archive then it is due to a kill
                # there is an argument that this item should not be imported at all
                if doc.find('dcdossier').get('created') != doc.find(
                        'dcdossier').get('modified'):
                    # item[ITEM_STATE] = CONTENT_STATE.KILLED
                    continue
                else:
                    item[ITEM_STATE] = CONTENT_STATE.PUBLISHED

                value = datetime.strptime(
                    self._get_head_value(doc, 'PublicationDate'),
                    '%Y%m%d%H%M%S')
                local_tz = pytz.timezone('Australia/Sydney')
                try:
                    aus_dt = local_tz.localize(value, is_dst=None)
                except NonExistentTimeError:
                    # time falls in the spring-forward gap; assume DST
                    aus_dt = local_tz.localize(value, is_dst=True)
                except AmbiguousTimeError:
                    # time falls in the fall-back overlap; assume standard time
                    aus_dt = local_tz.localize(value, is_dst=False)

                item['firstcreated'] = aus_dt.astimezone(pytz.utc)
                item['versioncreated'] = item['firstcreated']

                generate_unique_id_and_name(item)
                item['ingest_id'] = id

                last_line = None
                el = doc.find('dcdossier/document/body/BodyText')
                if el is not None:
                    story = el.text
                    lines = story.split('\n')
                    if len(lines) > 0:
                        last_line = lines[-1]
                    if item.get(FORMAT) == FORMATS.HTML:
                        story = story.replace('\n   ', '<p></p>')
                        story = story.replace('\n', '<br>')
                        item['body_html'] = '<p>' + story + '</p>'
                    else:
                        item['body_html'] = '<pre>' + story + '</pre>'
                    try:
                        item['word_count'] = get_text_word_count(
                            item['body_html'])
                    except Exception:
                        # word count is best effort only
                        pass
                else:
                    # Items with no body are ignored
                    continue

                item['source'] = self._get_head_value(doc, 'Agency')
                # if the source document contains no agency then by definition it is unknown
                if item['source'] is None:
                    item['source'] = 'UNKNOWN'
                else:
                    # check if the source of the document was Newscentre
                    dc_unique = doc.find('dcdossier').get('unique')
                    if dc_unique.startswith('NC.') and last_line is not None:
                        # The AFR summary articles all have agency values 25 chars long
                        if len(item['source']) == 25:
                            item['source'] = 'AAP'
                        # is it a numeric Agency
                        elif self._get_head_value(doc, 'Agency').isdigit():
                            # numeric agency: take the sign off from the first
                            # token of the last line (split always returns at
                            # least one element, so no fallback is needed)
                            sign_off = last_line.split(' ')
                            item['source'] = sign_off[0].upper()
                            # clean up what we have extracted
                            if item['source'].startswith('AAP'):
                                item['source'] = 'AAP'
                            else:
                                # make sure it is one of the known values
                                if item['source'] not in {
                                        'AAP', 'AP', 'REUT', 'Asia Pulse',
                                        'DPA', 'AFP', 'RAW', 'NZA', 'NZPA',
                                        'KRT', 'PA', 'PAA', 'SNI', 'REUTERS'
                                }:
                                    print('Source : {}'.format(item['source']))
                                    item['source'] = 'UNKNOWN'

                # self._addkeywords('AsiaPulseCodes', doc, item)

                byline = self._get_head_value(doc, 'Byline')
                if byline:
                    item['byline'] = byline

                # item['service'] = self._get_head_value(doc,'Service')

                category = self._get_head_value(doc, 'Category')
                if not category:
                    publication_name = self._get_head_value(
                        doc, 'PublicationName')
                    if publication_name in pubnames:
                        category = pubnames[publication_name]
                if category:
                    anpacategory = {}
                    anpacategory['qcode'] = category
                    for anpa_category in self._anpa_categories['items']:
                        if anpacategory['qcode'].lower() == anpa_category['qcode'].lower():
                            anpacategory = {
                                'qcode': anpacategory['qcode'],
                                'name': anpa_category['name']
                            }
                            break
                    item['anpa_category'] = [anpacategory]

                self._addkeywords('CompanyCodes', doc, item)

                item['keyword'] = self._get_head_value(doc, 'Keyword')
                item['ingest_provider_sequence'] = self._get_head_value(
                    doc, 'Sequence')

                orginal_source = self._get_head_value(doc, 'Author')
                if orginal_source:
                    item['original_source'] = orginal_source

                item['headline'] = self._get_head_value(doc, 'Headline')

                code = self._get_head_value(doc, 'SubjectRefNum')
                if code and len(code) == 7:
                    code = '0' + code
                if code and code in subject_codes:
                    item['subject'] = []
                    item['subject'].append({
                        'qcode': code,
                        'name': subject_codes[code]
                    })
                    try:
                        process_iptc_codes(item, None)
                    except Exception:
                        # ignore items whose IPTC codes fail to process
                        pass

                slug = self._get_head_value(doc, 'SLUG')
                if slug:
                    item['slugline'] = slug
                else:
                    item['slugline'] = self._get_head_value(doc, 'Keyword')

                take_key = self._get_head_value(doc, 'Takekey')
                if take_key:
                    item['anpa_take_key'] = take_key

                self._addkeywords('Topic', doc, item)

                # self._addkeywords('Selectors', doc, item)

                item['pubstatus'] = 'usable'
                # this is required for the archived service additional lookup
                item['item_id'] = item['guid']
                item[config.VERSION] = 1
                item['flags'] = {'marked_archived_only': True}

                # item['_id'] = ObjectId(id.rjust(24,'0'))
                item['_id'] = ObjectId()
                items.append(item)

                if self._limit:
                    self._limit -= 1
                # print(item)
            except Exception:
                print('Exception parsing DC document {}'.format(id))

        try:
            res = superdesk.get_resource_service('archived')
            s = time.time()
            res.post(items)
            print('Batch post to Superdesk took {:.2f} seconds'.format(time.time() - s))
        except Exception as ex:
            if getattr(ex, 'code', None) == 409:
                print('Key clash exception detected')
                # create a list of the guids we tried to post
                guids = [g['guid'] for g in items]
                # create a query for all those ids
                query = {
                    'size': self.BATCH_SIZE,
                    'query': {
                        'filtered': {
                            'filter': {
                                "terms": {
                                    "guid": [guids]
                                }
                            }
                        }
                    }
                }

                req = ParsedRequest()
                repos = 'archived'
                req.args = {'source': json.dumps(query), 'repo': repos}

                search_res = superdesk.get_resource_service('search')
                existing = search_res.get(req=req, lookup=None)
                existing_guids = [e['guid'] for e in existing]
                not_existing = [g for g in guids if g not in existing_guids]
                for missing_guid in not_existing:
                    i = [m for m in items if m['guid'] == missing_guid]
                    original = res.find_one(req=None, guid=i[0]['guid'])
                    if not original:
                        try:
                            s = time.time()
                            res.post(i)
                            print(
                                'Post single item to Superdesk in {:.2f} seconds'
                                .format(time.time() - s))
                        except Exception:
                            print('Exception posting single item')
            else:
                print('Exception posting batch')
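The PublicationDate handling in these examples leans on pytz raising instead of guessing when a naive wall-clock time falls in a DST gap or overlap. Here is a minimal standalone sketch of that pattern; the helper name localize_sydney is an assumption, and the only dependency is pytz.

from datetime import datetime

import pytz
from pytz.exceptions import AmbiguousTimeError, NonExistentTimeError


def localize_sydney(naive):
    """Attach the Australia/Sydney zone to a naive datetime, resolving DST edge cases."""
    local_tz = pytz.timezone('Australia/Sydney')
    try:
        # is_dst=None makes pytz raise rather than silently pick a side
        return local_tz.localize(naive, is_dst=None)
    except NonExistentTimeError:
        # spring-forward gap: the wall-clock time never existed; assume DST
        return local_tz.localize(naive, is_dst=True)
    except AmbiguousTimeError:
        # fall-back overlap: the wall-clock time occurred twice; assume standard time
        return local_tz.localize(naive, is_dst=False)


# as in the importers: parse, localize, then convert to UTC;
# 2016-04-03 02:30 in Sydney occurred twice, so this hits the ambiguous branch
value = datetime.strptime('20160403023000', '%Y%m%d%H%M%S')
print(localize_sydney(value).astimezone(pytz.utc))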
Example #21
    def _process_bunch(self, x):
        # x.findall('dc_rest_docs/dc_rest_doc')[0].get('href')
        items = []
        for doc in x.findall('dc_rest_docs/dc_rest_doc'):
            try:
                # print(doc.get('href'))
                id = doc.find('dcdossier').get('id')
                if self._direction:
                    if int(id) > self._id:
                        self._id = int(id)
                else:
                    if int(id) < self._id:
                        self._id = int(id)
                item = {}
                item['guid'] = doc.find('dcdossier').get('guid')

                # if the item has been modified in the archive then it is due to a kill
                # there is an argument that this item should not be imported at all
                if doc.find('dcdossier').get('created') != doc.find(
                        'dcdossier').get('modified'):
                    # item[ITEM_STATE] = CONTENT_STATE.KILLED
                    continue
                else:
                    item[ITEM_STATE] = CONTENT_STATE.PUBLISHED

                value = datetime.strptime(
                    self._get_head_value(doc, 'PublicationDate'),
                    '%Y%m%d%H%M%S')
                local_tz = pytz.timezone('Australia/Sydney')
                try:
                    aus_dt = local_tz.localize(value, is_dst=None)
                except NonExistentTimeError:
                    # time falls in the spring-forward gap; assume DST
                    aus_dt = local_tz.localize(value, is_dst=True)
                except AmbiguousTimeError:
                    # time falls in the fall-back overlap; assume standard time
                    aus_dt = local_tz.localize(value, is_dst=False)

                item['firstcreated'] = aus_dt.astimezone(pytz.utc)
                item['versioncreated'] = item['firstcreated']

                generate_unique_id_and_name(item)
                item['ingest_id'] = id

                item['source'] = self._get_head_value(doc, 'Agency')

                # self._addkeywords('AsiaPulseCodes', doc, item)

                byline = self._get_head_value(doc, 'Byline')
                if byline:
                    item['byline'] = byline

                # item['service'] = self._get_head_value(doc,'Service')

                category = self._get_head_value(doc, 'Category')
                if not category:
                    publication_name = self._get_head_value(
                        doc, 'PublicationName')
                    if publication_name in pubnames:
                        category = pubnames[publication_name]
                if category:
                    anpacategory = {}
                    anpacategory['qcode'] = category
                    for anpa_category in self._anpa_categories['items']:
                        if anpacategory['qcode'].lower() == anpa_category['qcode'].lower():
                            anpacategory = {
                                'qcode': anpacategory['qcode'],
                                'name': anpa_category['name']
                            }
                            break
                    item['anpa_category'] = [anpacategory]

                # self._addkeywords('CompanyCodes', doc, item)

                fmt = self._get_head_value(doc, 'Format')
                if fmt == 't':
                    item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED
                else:
                    # 'x' and anything else map to plain text
                    item[ITEM_TYPE] = CONTENT_TYPE.TEXT

                item['keyword'] = self._get_head_value(doc, 'Keyword')
                item['ingest_provider_sequence'] = self._get_head_value(
                    doc, 'Sequence')

                orginal_source = self._get_head_value(doc, 'Author')
                if orginal_source:
                    item['original_source'] = orginal_source

                item['headline'] = self._get_head_value(doc, 'Headline')

                code = self._get_head_value(doc, 'SubjectRefNum')
                if code and len(code) == 7:
                    code = '0' + code
                if code and code in subject_codes:
                    item['subject'] = []
                    item['subject'].append({
                        'qcode': code,
                        'name': subject_codes[code]
                    })
                    try:
                        process_iptc_codes(item, None)
                    except Exception:
                        # ignore items whose IPTC codes fail to process
                        pass

                slug = self._get_head_value(doc, 'SLUG')
                if slug:
                    item['slugline'] = slug
                else:
                    item['slugline'] = self._get_head_value(doc, 'Keyword')

                take_key = self._get_head_value(doc, 'Takekey')
                if take_key:
                    item['anpa_take_key'] = take_key

                # self._addkeywords('Topic', doc, item)

                # self._addkeywords('Selectors', doc, item)

                el = doc.find('dcdossier/document/body/BodyText')
                if el is not None:
                    story = el.text
                    if item[ITEM_TYPE] == CONTENT_TYPE.TEXT:
                        story = story.replace('\n   ', '<br><br>')
                        story = story.replace('\n', '<br>')
                        item['body_html'] = story
                    else:
                        item['body_html'] = story
                    try:
                        item['word_count'] = get_text_word_count(
                            item['body_html'])
                    except Exception:
                        # word count is best effort only
                        pass

                item['pubstatus'] = 'usable'
                # this is required for the archived service additional lookup
                item['item_id'] = item['guid']
                item[config.VERSION] = 1
                item['flags'] = {'marked_archived_only': True}

                # item['_id'] = ObjectId(id.rjust(24,'0'))
                item['_id'] = ObjectId()
                items.append(item)

                if self._limit:
                    self._limit -= 1
                # print(item)
            except Exception:
                print('Exception parsing DC document {}'.format(id))

        try:
            res = superdesk.get_resource_service('archived')
            s = time.time()
            res.post(items)
            print('Batch post to Superdesk took {:.2f} seconds'.format(time.time() - s))
        except Exception as ex:
            if getattr(ex, 'code', None) == 409:
                print('Key clash exception detected')
                # create a list of the guids we tried to post
                guids = [g['guid'] for g in items]
                # create a query for all those ids
                query = {
                    'size': self.BATCH_SIZE,
                    'query': {
                        'filtered': {
                            'filter': {
                                "terms": {
                                    "guid": [guids]
                                }
                            }
                        }
                    }
                }

                req = ParsedRequest()
                repos = 'archived'
                req.args = {'source': json.dumps(query), 'repo': repos}

                search_res = superdesk.get_resource_service('search')
                existing = search_res.get(req=req, lookup=None)
                existing_guids = [e['guid'] for e in existing]
                not_existing = [g for g in guids if g not in existing_guids]
                for missing_guid in not_existing:
                    i = [m for m in items if m['guid'] == missing_guid]
                    original = res.find_one(req=None, guid=i[0]['guid'])
                    if not original:
                        try:
                            s = time.time()
                            res.post(i)
                            print(
                                'Post single item to Superdesk in {:.2f} seconds'
                                .format(time.time() - s))
                        except Exception:
                            print('Exception posting single item')
            else:
                print('Exception posting batch')
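Example #20 converts the raw BodyText to HTML with two string replacements: a newline followed by three spaces marks a paragraph break, any remaining newline becomes a soft break, and preserved (fixed-width) content is wrapped in <pre> instead. A minimal standalone sketch of that conversion follows; the function name to_body_html is an assumption.

def to_body_html(story, preserved=False):
    """Convert raw feed body text to the HTML stored in body_html."""
    if preserved:
        # preserved content keeps its fixed-width layout verbatim
        return '<pre>' + story + '</pre>'
    story = story.replace('\n   ', '<p></p>')  # indented line => paragraph break
    story = story.replace('\n', '<br>')        # remaining newlines => soft breaks
    return '<p>' + story + '</p>'


print(to_body_html('First par.\n   Second par.\ncontinues on a new line'))

Example #21 uses the same idea but substitutes '<br><br>' for the paragraph marker and skips the outer wrapper, so the two importers render the same feed text slightly differently.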