Пример #1
0
 def crop_and_store_file(self, doc, content, filename, content_type):
     """Crop the uploaded file if needed, store it and attach avatar renditions.

     On success ``doc`` gets its ``media``, ``mimetype``, ``filemeta`` and
     ``renditions`` keys set.

     :param doc: document being populated; also handed to ``crop_image``
     :param content: stream with the uploaded file
     :param filename: name of the uploaded file, handed to ``crop_image``
     :param content_type: content type reported for the upload
     :raises: the error built by ``SuperdeskApiError.internalError`` when
         storing the file or generating renditions fails
     """
     # retrieve file name and metadata from file
     file_name, content_type, metadata = process_file_from_stream(content, content_type=content_type)
     # crop the file if needed, can change the image size
     was_cropped, out = crop_image(content, filename, doc)
     # the length in metadata could be updated if it was cropped
     if was_cropped:
         file_name, content_type, metadata_after_cropped = process_file_from_stream(out, content_type=content_type)
         # metadata is reset by cropping, so only the new length is carried over
         metadata['length'] = metadata_after_cropped['length']

     # Bound before the ``try`` so the error handler can always iterate it,
     # even when the very first ``put`` fails.
     inserted = []
     try:
         logger.debug('Going to save media file with %s ', file_name)
         out.seek(0)
         file_id = app.media.put(out, filename=file_name, content_type=content_type,
                                 resource=self.datasource, metadata=metadata)
         inserted.append(file_id)
         doc['media'] = file_id
         doc['mimetype'] = content_type
         doc['filemeta'] = decode_metadata(metadata)
         file_type = content_type.split('/')[0]
         rendition_spec = config.RENDITIONS['avatar']
         renditions = generate_renditions(out, file_id, inserted, file_type,
                                          content_type, rendition_spec, url_for_media)
         doc['renditions'] = renditions
     except Exception as io:
         logger.exception(io)
         # remove every file stored so far so a failed upload leaves nothing behind
         for inserted_id in inserted:
             delete_file_on_error(doc, inserted_id)
         raise SuperdeskApiError.internalError('Generating renditions failed')
Пример #2
0
 def crop_and_store_file(self, doc, content, filename, content_type):
     """Crop the uploaded file if needed, store it and attach avatar renditions.

     On success ``doc`` gets its ``media``, ``mimetype`` and ``renditions``
     keys set and its file metadata applied through ``set_filemeta``.

     :param doc: document being populated; also handed to ``crop_image``
     :param content: stream with the uploaded file
     :param filename: name of the uploaded file, handed to ``crop_image``
     :param content_type: content type reported for the upload
     :raises: the error built by ``SuperdeskApiError.internalError`` when
         storing the file or generating renditions fails
     """
     # retrieve file name and metadata from file
     file_name, content_type, metadata = process_file_from_stream(
         content, content_type=content_type)
     # crop the file if needed, can change the image size
     was_cropped, out = crop_image(content, filename, doc)
     # the length in metadata could be updated if it was cropped
     if was_cropped:
         file_name, content_type, metadata_after_cropped = process_file_from_stream(
             out, content_type=content_type)
         # metadata is reset by cropping, so only the new length is carried over
         metadata['length'] = metadata_after_cropped['length']

     # Bound before the ``try`` so the error handler can always iterate it,
     # even when the very first ``put`` fails.
     inserted = []
     try:
         logger.debug('Going to save media file with %s ', file_name)
         out.seek(0)
         file_id = app.media.put(out,
                                 filename=file_name,
                                 content_type=content_type,
                                 resource=self.datasource,
                                 metadata=metadata)
         inserted.append(file_id)
         doc['media'] = file_id
         doc['mimetype'] = content_type
         set_filemeta(doc, decode_metadata(metadata))
         file_type = content_type.split('/')[0]
         rendition_spec = config.RENDITIONS['avatar']
         renditions = generate_renditions(out, file_id, inserted, file_type,
                                          content_type, rendition_spec,
                                          url_for_media)
         doc['renditions'] = renditions
     except Exception as io:
         # remove every file stored so far so a failed upload leaves nothing behind
         for inserted_id in inserted:
             delete_file_on_error(doc, inserted_id)
         raise SuperdeskApiError.internalError(
             'Generating renditions failed', exception=io)
Пример #3
0
    def parse_item(self, image_path):
        """Create a picture item for the image file at ``image_path``.

        The file is put into media storage, its file metadata is applied to
        the item, the IPTC metadata read from the file is handed to
        ``self.parse_meta`` and renditions are generated.

        :param image_path: path of the image file to read
        :return: the populated item dict
        """
        filename = os.path.basename(image_path)
        guessed_type = mimetypes.guess_type(image_path)[0]
        guid = utils.generate_guid(type=GUID_TAG)

        item = {'guid': guid, 'uri': guid}
        item[config.VERSION] = 1
        item[ITEM_TYPE] = CONTENT_TYPE.PICTURE
        # the item keeps the guessed type; the detected one is used for storage
        item['mimetype'] = guessed_type
        item['versioncreated'] = utcnow()

        with open(image_path, 'rb') as image_file:
            _, content_type, file_metadata = process_file_from_stream(
                image_file, content_type=guessed_type)

            # every consumer below reads the stream from the start
            image_file.seek(0)
            file_id = app.media.put(
                image_file,
                filename=filename,
                content_type=content_type,
                metadata=file_metadata,
            )
            filemeta.set_filemeta(item, file_metadata)

            image_file.seek(0)
            iptc_metadata = get_meta_iptc(image_file)
            image_file.seek(0)
            self.parse_meta(item, iptc_metadata)

            rendition_spec = get_renditions_spec(no_custom_crops=True)
            item['renditions'] = generate_renditions(
                image_file, file_id, [file_id], 'image',
                content_type, rendition_spec, url_for_media)

        return item
Пример #4
0
    def store_file(self, doc, content, filename, content_type):
        """Crop the uploaded file, store it and attach avatar renditions to ``doc``.

        On success ``doc`` gets its ``media``, ``mime_type``, ``filemeta`` and
        ``renditions`` keys set.

        :param doc: document being populated; cropping data is read from it
        :param content: stream with the uploaded file
        :param filename: name of the uploaded file
        :param content_type: content type reported for the upload
        :raises SuperdeskError: when storing the file or generating renditions fails
        """
        res = process_file_from_stream(content, filename=filename, content_type=content_type)
        file_name, content_type, metadata = res

        cropping_data = self.get_cropping_data(doc)
        _, out = crop_image(content, filename, cropping_data)
        # the stored length has to describe the cropped output, not the upload
        metadata['length'] = json.dumps(len(out.getvalue()))

        # Bound before the ``try`` so the error handler can always iterate it,
        # even when the very first ``put`` fails.
        inserted = []
        try:
            logger.debug('Going to save media file with %s ', file_name)
            out.seek(0)
            file_id = app.media.put(out, filename=file_name, content_type=content_type, metadata=metadata)
            inserted.append(file_id)
            doc['media'] = file_id
            doc['mime_type'] = content_type
            doc['filemeta'] = decode_metadata(metadata)
            file_type = content_type.split('/')[0]

            rendition_spec = config.RENDITIONS['avatar']
            renditions = generate_renditions(out, doc['media'], inserted, file_type,
                                             content_type, rendition_spec, url_for_media)
            doc['renditions'] = renditions
        except Exception as io:
            logger.exception(io)
            # remove every file stored so far so a failed upload leaves nothing behind
            for inserted_id in inserted:
                delete_file_on_error(doc, inserted_id)
            raise SuperdeskError(message='Generating renditions failed')
Пример #5
0
    def parse_attachment(self, newscomponent_el):
        """
        Parse attachment component, save it to storage and return attachment id

        <NewsComponent Duid="0" xml:lang="nl">
            <Role FormalName="Image"/>
            <DescriptiveMetadata>
                <Property FormalName="ComponentClass" Value="Image"/>
            </DescriptiveMetadata>
            <ContentItem Href="IMG_0182.jpg">
                <Format FormalName="Jpeg"/>
                <Characteristics>
                    <SizeInBytes>2267043</SizeInBytes>
                    <Property FormalName="Width" Value="4032"/>
                    <Property FormalName="Height" Value="3024"/>
                </Characteristics>
            </ContentItem>
        </NewsComponent>

        :param newscomponent_el: ``NewsComponent`` element to read
        :return: ``{'attachment': <attachment id>}`` on success, ``None`` when
            the component has no ``ContentItem``/``Href``, the file cannot be
            fetched or the attachment cannot be created
        """
        content_item = newscomponent_el.find('ContentItem')
        if content_item is None:
            return None

        # avoid re-adding media after item is ingested
        guid = hashlib.md5(ElementTree.tostring(content_item)).hexdigest()
        attachment_service = get_resource_service('attachments')
        old_attachment = attachment_service.find_one(req=None, guid=guid)
        if old_attachment:
            return {'attachment': old_attachment['_id']}

        filename = content_item.attrib.get('Href')
        if filename is None:
            return None

        format_name = ''
        format_el = content_item.find('Format')
        if format_el is not None:
            # a Format element without FormalName must not turn into None,
            # otherwise building the content type below raises TypeError
            format_name = format_el.attrib.get('FormalName') or ''

        content = self._get_file(filename)
        if not content:
            return None

        _, content_type, metadata = process_file_from_stream(content, 'application/' + format_name)
        content.seek(0)
        media_id = app.media.put(content,
                                 filename=filename,
                                 content_type=content_type,
                                 metadata=metadata,
                                 resource='attachments')
        try:
            ids = attachment_service.post([{
                'media': media_id,
                'filename': filename,
                'title': filename,
                'description': 'belga remote attachment',
                'guid': guid,
            }])
            return {'attachment': next(iter(ids), None)}
        except Exception:
            # best effort: drop the stored binary so it is not orphaned and
            # report "no attachment" to the caller
            app.media.delete(media_id)
            return None
Пример #6
0
 def _save_cropped_image(self, file_stream, original, doc):
     """
     Saves the cropped image and returns the crop dictionary
     :param file_stream: cropped image stream
     :param original: original rendition
     :param doc: crop data
     :return dict: Crop values
     :raises SuperdeskApiError.internalError
     """
     crop = {}
     # Stays None until the upload succeeded, so the error handler knows
     # whether there is anything to clean up.
     file_id = None
     try:
         file_name, content_type, metadata = process_file_from_stream(
             file_stream, content_type=original.get('mimetype'))
         file_stream.seek(0)
         file_id = superdesk.app.media.put(file_stream,
                                           filename=file_name,
                                           content_type=content_type,
                                           resource='upload',
                                           metadata=metadata)
         crop['media'] = file_id
         crop['mimetype'] = content_type
         crop['href'] = url_for_media(file_id, content_type)
         for key in ('CropTop', 'CropLeft', 'CropRight', 'CropBottom'):
             crop[key] = doc.get(key, None)
         return crop
     except Exception as ex:
         if file_id is not None:
             try:
                 superdesk.app.media.delete(file_id)
             except Exception:
                 # cleanup is best effort; the original failure is reported below
                 pass
         raise SuperdeskApiError.internalError(
             'Generating crop failed: {}'.format(str(ex)))
Пример #7
0
 def _save_cropped_image(self, file_stream, original, doc):
     """
     Saves the cropped image and returns the crop dictionary
     :param file_stream: cropped image stream
     :param original: original rendition
     :param doc: crop data
     :return dict: Crop values
     :raises SuperdeskApiError.internalError
     """
     crop = {}
     # Stays None until the upload succeeded, so the error handler knows
     # whether there is anything to clean up.
     file_id = None
     try:
         file_name, content_type, metadata = process_file_from_stream(file_stream,
                                                                      content_type=original.get('mimetype'))
         file_stream.seek(0)
         file_id = superdesk.app.media.put(file_stream, filename=file_name,
                                           content_type=content_type,
                                           resource='upload',
                                           metadata=metadata)
         crop['media'] = file_id
         crop['mimetype'] = content_type
         crop['href'] = url_for_media(file_id, content_type)
         for key in ('CropTop', 'CropLeft', 'CropRight', 'CropBottom'):
             crop[key] = doc.get(key, None)
         return crop
     except Exception as ex:
         if file_id is not None:
             try:
                 superdesk.app.media.delete(file_id)
             except Exception:
                 # cleanup is best effort; the original failure is reported below
                 pass
         raise SuperdeskApiError.internalError('Generating crop failed: {}'.format(str(ex)))
Пример #8
0
    def setUp(self):
        """Store the fixture image with its renditions and post it to the archive.

        The image is looked up as ``fixtures/<self.filename>`` next to this
        file; the resulting item is kept on ``self.item``.
        """
        super().setUp()

        here = os.path.dirname(os.path.realpath(__file__))
        image_path = os.path.normpath(os.path.join(here, "fixtures", self.filename))
        guessed_type = mimetypes.guess_type(image_path)[0]
        guid = utils.generate_guid(type=GUID_TAG)

        item = {"guid": guid, "version": 1, "_id": guid}
        item[ITEM_TYPE] = CONTENT_TYPE.PICTURE
        item["mimetype"] = guessed_type
        item["versioncreated"] = datetime.now()
        self.item = item

        with open(image_path, "rb") as image_file:
            _, content_type, file_metadata = process_file_from_stream(
                image_file, content_type=guessed_type
            )
            # each consumer below reads the stream from the start
            image_file.seek(0)
            file_id = app.media.put(
                image_file,
                filename=self.filename,
                content_type=content_type,
                metadata=file_metadata,
            )
            filemeta.set_filemeta(self.item, file_metadata)
            image_file.seek(0)
            rendition_spec = get_renditions_spec()
            self.item["renditions"] = generate_renditions(
                image_file, file_id, [file_id], "image",
                content_type, rendition_spec, url_for_media,
            )

        get_resource_service("archive").post([self.item])
    def setUp(self):
        """Store the fixture image with its renditions and post it to the archive.

        The image is looked up as ``fixtures/<self.filename>`` next to this
        file; the resulting item is kept on ``self.item``.
        """
        super().setUp()

        here = os.path.dirname(os.path.realpath(__file__))
        image_path = os.path.normpath(os.path.join(here, 'fixtures', self.filename))
        guessed_type = mimetypes.guess_type(image_path)[0]
        guid = utils.generate_guid(type=GUID_TAG)

        item = {'guid': guid, 'version': 1, '_id': guid}
        item[ITEM_TYPE] = CONTENT_TYPE.PICTURE
        item['mimetype'] = guessed_type
        item['versioncreated'] = datetime.now()
        self.item = item

        with open(image_path, 'rb') as image_file:
            _, content_type, file_metadata = process_file_from_stream(
                image_file, content_type=guessed_type
            )
            # each consumer below reads the stream from the start
            image_file.seek(0)
            file_id = app.media.put(
                image_file,
                filename=self.filename,
                content_type=content_type,
                metadata=file_metadata,
            )
            filemeta.set_filemeta(self.item, file_metadata)
            image_file.seek(0)
            rendition_spec = get_renditions_spec()
            self.item['renditions'] = generate_renditions(
                image_file, file_id, [file_id], 'image',
                content_type, rendition_spec, url_for_media,
            )

        get_resource_service('archive').post([self.item])
    def setUp(self):
        """Store the fixture image with its renditions and post it to the archive.

        The image is looked up as ``fixtures/<self.filename>`` next to this
        file; the resulting item is kept on ``self.item``.
        """
        super().setUp()

        here = os.path.dirname(os.path.realpath(__file__))
        image_path = os.path.normpath(os.path.join(here, 'fixtures', self.filename))
        guessed_type = mimetypes.guess_type(image_path)[0]
        guid = utils.generate_guid(type=GUID_TAG)

        item = {'guid': guid, 'version': 1, '_id': guid}
        item[ITEM_TYPE] = CONTENT_TYPE.PICTURE
        item['mimetype'] = guessed_type
        item['versioncreated'] = datetime.now()
        self.item = item

        with open(image_path, 'rb') as image_file:
            processed = process_file_from_stream(image_file, content_type=guessed_type)
            _, content_type, file_metadata = processed
            # each consumer below reads the stream from the start
            image_file.seek(0)
            file_id = app.media.put(
                image_file,
                filename=self.filename,
                content_type=content_type,
                metadata=file_metadata,
            )
            filemeta.set_filemeta(self.item, file_metadata)
            image_file.seek(0)
            rendition_spec = get_renditions_spec()
            self.item['renditions'] = generate_renditions(
                image_file, file_id, [file_id], 'image',
                content_type, rendition_spec, url_for_media,
            )

        get_resource_service('archive').post([self.item])
Пример #11
0
    def store_file(self, doc, content, filename, content_type):
        """Crop the uploaded file, store it and attach avatar renditions to ``doc``.

        On success ``doc`` gets its ``media``, ``mime_type``, ``filemeta`` and
        ``renditions`` keys set.

        :param doc: document being populated; cropping data is read from it
        :param content: stream with the uploaded file
        :param filename: name of the uploaded file
        :param content_type: content type reported for the upload
        :raises SuperdeskError: when storing the file or generating renditions fails
        """
        res = process_file_from_stream(content,
                                       filename=filename,
                                       content_type=content_type)
        file_name, content_type, metadata = res

        cropping_data = self.get_cropping_data(doc)
        _, out = crop_image(content, filename, cropping_data)
        # the stored length has to describe the cropped output, not the upload
        metadata['length'] = json.dumps(len(out.getvalue()))

        # Bound before the ``try`` so the error handler can always iterate it,
        # even when the very first ``put`` fails.
        inserted = []
        try:
            logger.debug('Going to save media file with %s ', file_name)
            out.seek(0)
            file_id = app.media.put(out,
                                    filename=file_name,
                                    content_type=content_type,
                                    metadata=metadata)
            inserted.append(file_id)
            doc['media'] = file_id
            doc['mime_type'] = content_type
            doc['filemeta'] = decode_metadata(metadata)
            file_type = content_type.split('/')[0]

            rendition_spec = config.RENDITIONS['avatar']
            renditions = generate_renditions(out, doc['media'], inserted,
                                             file_type, content_type,
                                             rendition_spec, url_for_media)
            doc['renditions'] = renditions
        except Exception as io:
            logger.exception(io)
            # remove every file stored so far so a failed upload leaves nothing behind
            for inserted_id in inserted:
                delete_file_on_error(doc, inserted_id)
            raise SuperdeskError(message='Generating renditions failed')
Пример #12
0
    def _save_cropped_image(self, file_stream, original, doc):
        """Saves the cropped image and returns the crop dictionary

        :param file_stream: cropped image stream
        :param original: original rendition
        :param doc: crop data
        :return dict: Crop values
        :raises SuperdeskApiError.internalError
        """
        crop = {}
        # Stays None until the upload succeeded, so the error handler knows
        # whether there is anything to clean up instead of relying on a
        # swallowed UnboundLocalError.
        file_id = None
        try:
            file_name, content_type, metadata = process_file_from_stream(
                file_stream, content_type=original.get("mimetype")
            )
            file_stream.seek(0)
            file_id = app.media.put(
                file_stream, filename=file_name, content_type=content_type, resource="upload", metadata=metadata
            )
            crop["media"] = file_id
            crop["mimetype"] = content_type
            crop["href"] = url_for_media(file_id, content_type)
            for key in ("CropTop", "CropLeft", "CropRight", "CropBottom"):
                crop[key] = doc.get(key, None)
            return crop
        except Exception as ex:
            if file_id is not None:
                try:
                    app.media.delete(file_id)
                except Exception:
                    # cleanup is best effort; the original failure is reported below
                    pass
            raise SuperdeskApiError.internalError("Generating crop failed: {}".format(str(ex)), exception=ex)
Пример #13
0
    def save_attachment(self, data, items):
        """Store e-mail attachments and link them to the text items.

        Every tuple in ``data`` has its second element parsed as a raw e-mail
        message. Each message part with an ``attachment`` content disposition
        and a file name is put into media storage and posted to the
        ``attachments`` service. Collected attachment ids are then set on all
        items whose ``type`` is ``'text'``, together with an ``ednote``.

        :param data: iterable of response parts; non-tuple entries are ignored
        :param items: list of item dicts, updated in place
        """
        attachments = []
        for response_part in data:
            if not isinstance(response_part, tuple):
                continue

            msg = email.message_from_bytes(response_part[1])
            for part in msg.walk():
                if part.get_content_maintype() == 'multipart':
                    continue
                disposition = part.get('Content-Disposition')
                if disposition is None or disposition.split(';')[0] != 'attachment':
                    continue
                fileName = part.get_filename()
                if not fileName:
                    continue

                content = io.BytesIO(part.get_payload(decode=True))
                res = process_file_from_stream(content, part.get_content_type())
                file_name, content_type, metadata = res
                content.seek(0)
                media_id = app.media.put(content,
                                         filename=fileName,
                                         content_type=content_type,
                                         metadata=metadata,
                                         resource='attachments')
                try:
                    attachment_service = get_resource_service('attachments')
                    ids = attachment_service.post([{
                        "media": media_id,
                        "filename": fileName,
                        "title": 'attachment',
                        "description": "email's attachment",
                    }])
                    if ids:
                        attachments.append({'attachment': next(iter(ids), None)})
                except Exception as ex:
                    # ``ex.args`` may be empty; indexing it blindly would raise
                    # inside the handler and skip the cleanup below, leaving
                    # the stored binary orphaned
                    reason = ex.args[0] if ex.args else ex
                    logger.error("cannot add attachment for %s, %s" % (fileName, reason))
                    app.media.delete(media_id)

            if attachments:
                ednote = 'The story has %s attachment(s)' % str(len(attachments))
                for item in items:
                    if item['type'] == 'text':
                        item['attachments'] = attachments
                        item['ednote'] = ednote
Пример #14
0
    def find_one_raw(self, resource, _id):
        """Fetch one Scanpix document, store its base image and add renditions.

        :param resource: not used by this method
        :param _id: reference pointer of the document to fetch
        :return: the parsed document with ``mimetype``, ``filemeta``,
            ``versioncreated`` and ``renditions`` set
        :raises: the error built by ``SuperdeskApiError.internalError`` when
            the image cannot be stored or rendition generation fails
        """
        # XXX: preview is used here instead of paid download
        #      see SDNTB-15
        data = {}
        search_url = self._app.config['SCANPIX_SEARCH_URL'] + '/search'
        data['refPtrs'] = [_id]
        response = self._request(search_url, data)
        doc = response.json()['data'][0]
        self._parse_doc(doc)

        image_url = doc['renditions']['baseImage']['href']
        # if MIME type can't be guessed, we default to jpeg
        mime_type = mimetypes.guess_type(image_url)[0] or 'image/jpeg'

        response = self._request(image_url, data)
        out = BytesIO(response.content)
        file_name, content_type, metadata = process_file_from_stream(out, mime_type)

        logger.debug('Going to save media file with %s ' % file_name)
        out.seek(0)
        try:
            file_id = self._app.media.put(
                out, filename=file_name, content_type=content_type, metadata=None)
        except Exception as e:
            logger.exception(e)
            raise SuperdeskApiError.internalError('Media saving failed')

        # only reached when the upload above succeeded
        try:
            inserted = [file_id]
            doc['mimetype'] = content_type
            doc['filemeta'] = decode_metadata(metadata)
            # set the version created to now to bring it to the top of the desk, images can be quite old
            doc['versioncreated'] = utcnow()
            file_type = content_type.split('/')[0]
            rendition_spec = get_renditions_spec()
            doc['renditions'] = generate_renditions(
                out, file_id, inserted, file_type, content_type,
                rendition_spec, url_for_media, insert_metadata=False)
        except (IndexError, KeyError, json.JSONDecodeError) as e:
            logger.exception("Internal error: {}".format(e))
            delete_file_on_error(doc, file_id)

            raise SuperdeskApiError.internalError('Generating renditions failed')

        return doc
Пример #15
0
    def parse_item(self, image_path):
        """Create a picture item for the image file at ``image_path``.

        The file is put into media storage, renditions are generated and the
        IPTC metadata read from the file is mapped onto the item.

        :param image_path: path of the image file to read
        :return: the populated item dict
        """
        filename = os.path.basename(image_path)
        guessed_type = mimetypes.guess_type(image_path)[0]
        guid = utils.generate_guid(type=GUID_TAG)

        item = {'guid': guid}
        item[config.VERSION] = 1
        item[config.ID_FIELD] = guid
        item[ITEM_TYPE] = CONTENT_TYPE.PICTURE
        # the item keeps the guessed type; the detected one is used for storage
        item['mimetype'] = guessed_type
        item['versioncreated'] = datetime.now()

        with open(image_path, 'rb') as image_file:
            _, content_type, file_metadata = process_file_from_stream(
                image_file, content_type=guessed_type)
            # every consumer below reads the stream from the start
            image_file.seek(0)
            file_id = app.media.put(
                image_file,
                filename=filename,
                content_type=content_type,
                metadata=file_metadata,
            )
            filemeta.set_filemeta(item, file_metadata)
            image_file.seek(0)
            metadata = get_meta_iptc(image_file)
            image_file.seek(0)
            rendition_spec = get_renditions_spec(no_custom_crops=True)
            item['renditions'] = generate_renditions(
                image_file, file_id, [file_id], 'image',
                content_type, rendition_spec, url_for_media)

        try:
            date_created = metadata[TAG.DATE_CREATED]
            time_created = metadata[TAG.TIME_CREATED]
        except KeyError:
            # without both tags there is no creation date to set
            pass
        else:
            # we format proper ISO 8601 date so we can parse it with dateutil
            pieces = (
                date_created[0:4], date_created[4:6], date_created[6:8],
                time_created[0:2], time_created[2:4], time_created[4:6],
                time_created[6], time_created[7:9], time_created[9:],
            )
            datetime_created = '{}-{}-{}T{}:{}:{}{}{}:{}'.format(*pieces)
            item['firstcreated'] = dateutil.parser.parse(datetime_created)

        # now we map IPTC metadata to superdesk metadata
        for source_key, dest_key in IPTC_MAPPING.items():
            try:
                value = metadata[source_key]
            except KeyError:
                continue
            item[dest_key] = value
        return item
Пример #16
0
 def get_file_from_document(self, doc):
     """Return the media file of ``doc`` with its content type and metadata.

     A truthy ``doc['media_fetched']`` is removed from ``doc`` and returned
     as is. Otherwise the stream in ``doc['media']`` is put into media
     storage and ``doc['media']`` is replaced by the stored id.

     :param doc: document carrying ``media_fetched`` or ``media``
     :return: tuple of (file, content type, metadata)
     """
     fetched = doc.get('media_fetched')
     if fetched:
         del doc['media_fetched']
         return fetched, fetched.content_type, fetched.metadata

     upload = doc['media']
     file_name, content_type, metadata = process_file_from_stream(
         upload, filename=upload.filename, content_type=upload.mimetype)
     logger.debug('Going to save media file with %s ' % file_name)
     upload.seek(0)
     doc['media'] = app.media.put(
         upload, filename=file_name, content_type=content_type, metadata=metadata)
     return upload, content_type, decode_metadata(metadata)
 def get_file_from_document(self, doc):
     """Return the media file of ``doc`` with its content type and metadata.

     A truthy ``doc['media_fetched']`` is removed from ``doc`` and returned
     as is. Otherwise the stream in ``doc['media']`` is put into media
     storage and ``doc['media']`` is replaced by the stored id.

     :param doc: document carrying ``media_fetched`` or ``media``
     :return: tuple of (file, content type, metadata)
     """
     fetched = doc.get('media_fetched')
     if fetched:
         del doc['media_fetched']
         return fetched, fetched.content_type, fetched.metadata

     upload = doc['media']
     file_name, content_type, metadata = process_file_from_stream(
         upload, filename=upload.filename, content_type=upload.mimetype)
     logger.debug('Going to save media file with %s ' % file_name)
     upload.seek(0)
     doc['media'] = app.media.put(
         upload, filename=file_name, content_type=content_type, metadata=metadata)
     return upload, content_type, decode_metadata(metadata)
Пример #18
0
    def get_file_from_document(self, doc):
        """Return the media file of ``doc`` with its content type and metadata.

        A truthy ``doc["media_fetched"]`` is removed from ``doc`` and returned
        as is. Otherwise the stream in ``doc["media"]`` is put into media
        storage (timed as ``media:put.original``) and ``doc["media"]`` is
        replaced by the stored id.

        :param doc: document carrying ``media_fetched`` or ``media``
        :return: tuple of (file, content type, metadata)
        """
        fetched = doc.get("media_fetched")
        if fetched:
            del doc["media_fetched"]
            return fetched, fetched.content_type, fetched.metadata

        upload = doc["media"]
        file_name, content_type, metadata = process_file_from_stream(upload, content_type=upload.mimetype)
        logger.debug("Going to save media file with %s " % file_name)
        upload.seek(0)
        with timer("media:put.original"):
            stored_id = app.media.put(upload, filename=file_name, content_type=content_type, metadata=metadata)
            doc["media"] = stored_id
        return upload, content_type, decode_metadata(metadata)
Пример #19
0
 def store_file(self, doc, content, filename, content_type):
     """Store the uploaded file under ``doc['media_id']`` unless it already exists.

     On success ``doc`` gets its ``media``, ``mime_type`` and ``filemeta``
     keys set.

     :param doc: document carrying the ``media_id`` to store the file under
     :param content: stream with the uploaded file
     :param filename: not used by this method
     :param content_type: content type reported for the upload
     :raises: the error built by ``SuperdeskApiError.internalError`` when
         anything in the storing step fails
     """
     # retrieve file name and metadata from file
     file_name, content_type, metadata = process_file_from_stream(content, content_type=content_type)
     try:
         content.seek(0)
         requested_id = doc['media_id']
         stored_id = requested_id
         already_stored = app.media.get(requested_id, self.datasource)
         if not already_stored:
             # nothing stored under this id yet, so upload using that very id
             stored_id = app.media.put(
                 content,
                 filename=file_name,
                 content_type=content_type,
                 resource=self.datasource,
                 metadata=metadata,
                 _id=ObjectId(requested_id),
             )
         doc['media'] = stored_id
         doc['mime_type'] = content_type
         doc['filemeta'] = decode_metadata(metadata)
     except Exception as io:
         raise SuperdeskApiError.internalError('Saving file failed', exception=io)
Пример #20
0
    def parse_item(self, image_path):
        """Create a picture item for the image file at ``image_path``.

        The file is put into media storage, renditions are generated and the
        IPTC metadata read from the file is mapped onto the item.

        :param image_path: path of the image file to read
        :return: the populated item dict
        """
        filename = os.path.basename(image_path)
        guessed_type = mimetypes.guess_type(image_path)[0]
        guid = utils.generate_guid(type=GUID_TAG)

        item = {'guid': guid}
        item[config.VERSION] = 1
        item[config.ID_FIELD] = guid
        item[ITEM_TYPE] = CONTENT_TYPE.PICTURE
        # the item keeps the guessed type; the detected one is used for storage
        item['mimetype'] = guessed_type
        item['versioncreated'] = datetime.now()

        with open(image_path, 'rb') as image_file:
            processed = process_file_from_stream(image_file, content_type=guessed_type)
            _, content_type, file_metadata = processed
            # every consumer below reads the stream from the start
            image_file.seek(0)
            file_id = app.media.put(
                image_file,
                filename=filename,
                content_type=content_type,
                metadata=file_metadata,
            )
            filemeta.set_filemeta(item, file_metadata)
            image_file.seek(0)
            metadata = get_meta_iptc(image_file)
            image_file.seek(0)
            rendition_spec = get_renditions_spec(no_custom_crops=True)
            item['renditions'] = generate_renditions(
                image_file, file_id, [file_id], 'image',
                content_type, rendition_spec, url_for_media)

        try:
            date_created = metadata[TAG.DATE_CREATED]
            time_created = metadata[TAG.TIME_CREATED]
        except KeyError:
            # without both tags there is no creation date to set
            pass
        else:
            # we format proper ISO 8601 date so we can parse it with dateutil
            pieces = (
                date_created[0:4], date_created[4:6], date_created[6:8],
                time_created[0:2], time_created[2:4], time_created[4:6],
                time_created[6], time_created[7:9], time_created[9:],
            )
            datetime_created = '{}-{}-{}T{}:{}:{}{}{}:{}'.format(*pieces)
            item['firstcreated'] = dateutil.parser.parse(datetime_created)

        # now we map IPTC metadata to superdesk metadata
        for source_key, dest_key in IPTC_MAPPING.items():
            try:
                value = metadata[source_key]
            except KeyError:
                continue
            item[dest_key] = value
        return item
Пример #21
0
    def find_one_raw(self, resource, _id):
        """Fetch one AAP MM asset, store its binary and add renditions.

        Storing and rendition generation are best effort: on failure the error
        is logged, files stored so far are removed and the document is still
        returned.

        :param resource: not used by this method
        :param _id: id of the asset to fetch
        :return: the parsed document
        """
        app_config = self._app.config
        url = app_config['AAP_MM_SEARCH_URL'] + '/Assets/{}'.format(_id)
        r = self._http.request('GET', url, headers=self._headers)
        doc = json.loads(r.data.decode('UTF-8'))
        self._parse_doc(doc)
        if 'fetch_endpoint' in doc:
            del doc['fetch_endpoint']

        # Only if we have credentials can we download the original if the account has that privilege
        has_credentials = (
            'AAP_MM_USER' in app_config
            and 'AAP_MM_PASSWORD' in app_config
            and app_config['AAP_MM_USER'] is not None
        )
        if has_credentials:
            url = app_config['AAP_MM_SEARCH_URL'] + '/Assets/{}/Original/download'.format(_id)
        else:
            url = doc['renditions']['original']['href']
        r = self._http.request('GET', url, headers=self._headers)

        out = BytesIO(r.data)
        file_name, content_type, metadata = process_file_from_stream(
            out, 'image/jpeg')

        # Bound before the ``try`` so the error handler can always iterate it,
        # even when the very first ``put`` fails.
        inserted = []
        try:
            logger.debug('Going to save media file with %s ', file_name)
            out.seek(0)
            file_id = self._app.media.put(out,
                                          filename=file_name,
                                          content_type=content_type,
                                          metadata=metadata)
            inserted.append(file_id)
            doc['mimetype'] = content_type
            doc['filemeta'] = decode_metadata(metadata)
            # set the version created to now to bring it to the top of the desk, images can be quite old
            doc['versioncreated'] = utcnow()
            file_type = content_type.split('/')[0]
            rendition_spec = app_config['RENDITIONS']['picture']

            renditions = generate_renditions(out, file_id, inserted, file_type,
                                             content_type, rendition_spec,
                                             self.url_for_media)
            doc['renditions'] = renditions
        except Exception as io:
            logger.exception(io)
            # remove every file stored so far so a failed fetch leaves nothing behind
            for inserted_id in inserted:
                delete_file_on_error(doc, inserted_id)

        return doc
Пример #22
0
    def find_one_raw(self, resource, _id):
        """Fetch one asset document and store its binary in the media storage.

        Sets the auth cookie first when no request headers are available yet.
        The asset is requested from the service configured in
        ``AAP_MM_SEARCH_URL`` and handed to ``self._parse_doc``. Its binary is
        then downloaded, saved through ``self._app.media`` and used to generate
        the renditions configured under ``RENDITIONS['picture']``.

        :param resource: not used by this method
        :param _id: asset id, interpolated into the request URLs
        :return: the parsed document. If storing the file or generating the
            renditions fails, the error is logged, the files registered so far
            are passed to ``delete_file_on_error`` and the document is still
            returned with whatever fields were set before the failure.
        """
        if self._headers is None:
            self.__set_auth_cookie(self._app)

        url = self._app.config['AAP_MM_SEARCH_URL'] + '/Assets/{}'.format(_id)
        r = self._http.request('GET', url, headers=self._headers)
        doc = json.loads(r.data.decode('UTF-8'))
        self._parse_doc(doc)
        if 'fetch_endpoint' in doc:
            del doc['fetch_endpoint']

        # Only if we have credentials can we download the original if the account has that privilege
        if 'AAP_MM_USER' in self._app.config and 'AAP_MM_PASSWORD' in self._app.config \
                and self._app.config['AAP_MM_USER'] is not None:
            url = self._app.config['AAP_MM_SEARCH_URL'] + '/Assets/{}/Original/download'.format(_id)
        else:
            url = doc['renditions']['original']['href']
        r = self._http.request('GET', url, headers=self._headers)

        out = BytesIO(r.data)
        file_name, content_type, metadata = process_file_from_stream(out, 'image/jpeg')

        # Bound before the try block: the except handler iterates over it, so it
        # has to exist even when media.put() itself is the call that fails.
        inserted = []
        try:
            logger.debug('Going to save media file with %s ', file_name)
            out.seek(0)
            file_id = self._app.media.put(out, filename=file_name, content_type=content_type, metadata=metadata)
            # register the stored file immediately so it is cleaned up if any later step raises
            inserted.append(file_id)
            doc['mimetype'] = content_type
            doc['filemeta'] = decode_metadata(metadata)
            # set the version created to now to bring it to the top of the desk, images can be quite old
            doc['versioncreated'] = utcnow()
            file_type = content_type.split('/')[0]
            rendition_spec = self._app.config['RENDITIONS']['picture']

            renditions = generate_renditions(out, file_id, inserted, file_type,
                                             content_type, rendition_spec, self.url_for_media)
            doc['renditions'] = renditions
        except Exception as io:
            # best effort: log, remove what was stored and still return the document
            logger.exception(io)
            for file_id in inserted:
                delete_file_on_error(doc, file_id)

        return doc
Пример #23
0
 def store_file(self, doc, content, filename, content_type):
     """Store ``content`` under the media id already present on ``doc``.

     The stream is saved only when ``app.media`` has no entry for
     ``doc['media_id']`` in this datasource. ``doc`` is then updated with the
     ``media``, ``mime_type`` and ``filemeta`` fields.

     :param doc: document being saved, must carry ``media_id``
     :param content: seekable binary stream with the file data
     :param filename: not used, the name detected from the stream is stored
     :param content_type: content type hint passed to the stream inspection
     :raises SuperdeskApiError: internal error wrapping any failure while
         looking up, saving or describing the file
     """
     # inspect the stream first: gives the detected name, content type and metadata
     stream_info = process_file_from_stream(content, content_type=content_type)
     file_name, content_type, metadata = stream_info
     try:
         content.seek(0)
         media_id = doc['media_id']
         if not app.media.get(media_id, self.datasource):
             # nothing stored under this id yet, save it with that exact id
             media_id = app.media.put(
                 content,
                 filename=file_name,
                 content_type=content_type,
                 resource=self.datasource,
                 metadata=metadata,
                 _id=ObjectId(media_id),
             )
         doc['media'] = media_id
         doc['mime_type'] = content_type
         doc['filemeta'] = decode_metadata(metadata)
     except Exception as error:
         raise SuperdeskApiError.internalError('Saving file failed', exception=error)
Пример #24
0
    def find_one_raw(self, resource, _id):
        """Fetch one picture or video asset and store its binary in the media storage.

        Sets the auth cookie first when no request headers are available yet.
        With a username and password set, the available resolutions are looked
        up and the ``Original`` (picture) or ``Ipod`` (video) download is used.
        Without credentials the ``original`` rendition href from the asset
        document is downloaded instead. The binary is stored without metadata
        and the renditions configured under ``RENDITIONS['picture']`` are
        generated from it.

        :param resource: not used by this method
        :param _id: asset id, interpolated into the request URLs
        :return: the parsed document with ``mimetype``, ``filemeta``,
            ``versioncreated`` and ``renditions`` set
        :raises FileNotFoundError: credentials are set but the required
            resolution is not listed for the asset
        :raises NotImplementedError: credentials are set and the item is
            neither a picture nor a video
        :raises SuperdeskApiError: internal error when storing the file or
            generating the renditions fails
        """
        if self._headers is None:
            self.__set_auth_cookie(self._app)

        url = self._app.config['AAP_MM_SEARCH_URL'] + '/Assets/{}'.format(_id)
        r = self._http.request('GET', url, headers=self._headers)
        doc = json.loads(r.data.decode('UTF-8'))
        self._parse_doc(doc)
        if 'fetch_endpoint' in doc:
            del doc['fetch_endpoint']

        # Only if we have credentials can we download the original if the account has that privilege
        if self._username is not None and self._password is not None:
            resolutions = self._get_resolutions(_id)
            if doc[ITEM_TYPE] == CONTENT_TYPE.PICTURE:
                if any(i['Name'] == 'Original' for i in resolutions['Image']):
                    url = self._app.config['AAP_MM_SEARCH_URL'] + '/Assets/{}/Original/download'.format(_id)
                    mime_type = 'image/jpeg'
                else:
                    raise FileNotFoundError
            elif doc[ITEM_TYPE] == CONTENT_TYPE.VIDEO:
                if any(v['Name'] == 'Ipod' for v in resolutions['Video']):
                    url = self._app.config['AAP_MM_SEARCH_URL'] + '/Assets/{}/Ipod/download'.format(_id)
                    mime_type = doc.get('renditions').get('original').get('mimetype')
                else:
                    raise FileNotFoundError
            else:
                raise NotImplementedError
        else:
            if doc[ITEM_TYPE] == CONTENT_TYPE.VIDEO:
                mime_type = doc.get('renditions').get('original').get('mimetype')
            else:
                mime_type = 'image/jpeg'
            url = doc['renditions']['original']['href']

        r = self._http.request('GET', url, headers=self._headers)
        out = BytesIO(r.data)
        file_name, content_type, metadata = process_file_from_stream(out, mime_type)

        # ids of every file stored so far; the except handler deletes them on failure
        inserted = []

        try:
            logger.debug('Going to save media file with %s ', file_name)
            out.seek(0)
            # the binary is stored without metadata, renditions likewise (insert_metadata=False)
            file_id = self._app.media.put(out, filename=file_name, content_type=content_type, metadata=None)
            # register the stored file immediately so it is cleaned up if any later step raises
            inserted.append(file_id)
            doc['mimetype'] = content_type
            doc['filemeta'] = decode_metadata(metadata)
            # set the version created to now to bring it to the top of the desk, images can be quite old
            doc['versioncreated'] = utcnow()
            file_type = content_type.split('/')[0]
            rendition_spec = self._app.config['RENDITIONS']['picture']

            renditions = generate_renditions(out, file_id, inserted, file_type,
                                             content_type, rendition_spec,
                                             self.url_for_media, insert_metadata=False)
            doc['renditions'] = renditions
        except Exception as io:
            logger.exception(io)
            for file_id in inserted:
                delete_file_on_error(doc, file_id)

            raise SuperdeskApiError.internalError('Generating renditions failed')

        return doc
Пример #25
0
    def find_one_raw(self, resource, _id):
        """Fetch one picture or video asset and store its binary in the media storage.

        Sets the auth cookie first when no request headers are available yet.
        With a username and password set, the available resolutions are looked
        up and the ``Original`` (picture) or ``Ipod`` (video) download is used;
        a video additionally needs a ``Video`` resolution, which is recorded as
        its original source. Without credentials the ``original`` rendition
        href from the asset document is downloaded instead. The renditions
        configured under ``RENDITIONS["picture"]`` are generated from the
        binary and an ``original_source`` rendition is added to them.

        :param resource: not used by this method
        :param _id: asset id, interpolated into the request URLs
        :return: the parsed document. If storing the file or generating the
            renditions fails, the error is logged, the files registered so far
            are passed to ``delete_file_on_error`` and the document is still
            returned with whatever fields were set before the failure.
        :raises FileNotFoundError: credentials are set but a required
            resolution is not listed for the asset
        :raises NotImplementedError: credentials are set and the item is
            neither a picture nor a video
        """
        if self._headers is None:
            self.__set_auth_cookie(self._app)

        url = self._app.config["AAP_MM_SEARCH_URL"] + "/Assets/{}".format(_id)
        r = self._http.request("GET", url, headers=self._headers)
        doc = json.loads(r.data.decode("UTF-8"))
        self._parse_doc(doc)
        if "fetch_endpoint" in doc:
            del doc["fetch_endpoint"]

        # Only if we have credentials can we download the original if the account has that privilege
        if self._username is not None and self._password is not None:
            resolutions = self._get_resolutions(_id)
            if doc["type"] == "picture":
                if any(i["Name"] == "Original" for i in resolutions["Image"]):
                    url = self._app.config["AAP_MM_SEARCH_URL"] + "/Assets/{}/Original/download".format(_id)
                    mime_type = "image/jpeg"
                    source_ref = {"href": url, "mimetype": mime_type}
                else:
                    raise FileNotFoundError
            elif doc["type"] == "video":
                # the Ipod resolution is what gets downloaded and stored ...
                if any(v["Name"] == "Ipod" for v in resolutions["Video"]):
                    url = self._app.config["AAP_MM_SEARCH_URL"] + "/Assets/{}/Ipod/download".format(_id)
                    mime_type = doc.get("renditions").get("original").get("mimetype")
                else:
                    raise FileNotFoundError
                # ... while the Video resolution is only referenced as the original source
                if any(v["Name"] == "Video" for v in resolutions["Video"]):
                    source_ref = {
                        "href": self._app.config["AAP_MM_SEARCH_URL"] + "/Assets/{}/Video/download".format(_id),
                        "mimetype": "video/quicktime",
                    }
                else:
                    raise FileNotFoundError
            else:
                raise NotImplementedError
        else:
            if doc["type"] == "video":
                mime_type = doc.get("renditions").get("original").get("mimetype")
            else:
                mime_type = "image/jpeg"
            url = doc["renditions"]["original"]["href"]
            source_ref = {"href": url, "mimetype": mime_type}

        r = self._http.request("GET", url, headers=self._headers)
        out = BytesIO(r.data)
        file_name, content_type, metadata = process_file_from_stream(out, mime_type)

        # Bound before the try block: the except handler iterates over it, so it
        # has to exist even when media.put() itself is the call that fails.
        inserted = []
        try:
            logger.debug("Going to save media file with %s ", file_name)
            out.seek(0)
            file_id = self._app.media.put(out, filename=file_name, content_type=content_type, metadata=metadata)
            # register the stored file immediately so it is cleaned up if any later step raises
            inserted.append(file_id)
            doc["mimetype"] = content_type
            doc["filemeta"] = decode_metadata(metadata)
            # set the version created to now to bring it to the top of the desk, images can be quite old
            doc["versioncreated"] = utcnow()
            file_type = content_type.split("/")[0]
            rendition_spec = self._app.config["RENDITIONS"]["picture"]

            renditions = generate_renditions(
                out, file_id, inserted, file_type, content_type, rendition_spec, self.url_for_media
            )
            doc["renditions"] = renditions
            doc["renditions"]["original_source"] = source_ref
        except Exception as io:
            # best effort: log, remove what was stored and still return the document
            logger.exception(io)
            for file_id in inserted:
                delete_file_on_error(doc, file_id)

        return doc
Пример #26
0
    def parse(self, data, provider=None):
        """Turn fetched email data into a list of ingest items.

        When the provider config has ``formatted`` set, parsing is delegated to
        ``self._parse_formatted_email``. Otherwise every tuple in ``data`` is
        read as a raw message (bytes at index 1) and produces:

        * one text item holding the mail body (sanitized HTML if present,
          otherwise the plain text wrapped in ``<pre>`` and marked as
          preserved format),
        * one picture item per named image attachment that has a
          ``Content-Disposition`` header (GIF and PNG are skipped), stored
          through ``self.parser_app.media``,
        * one composite item referencing the text item and all pictures,
          created only when at least one picture was stored.

        :param data: iterable of response parts; non-tuple parts are ignored
        :param provider: ingest provider dict, its ``config`` is read
        :return: list with the picture items, then the composite item (if any),
            then the text item
        :raises IngestEmailError: ``emailParseError`` wrapping any exception
            raised while parsing
        """
        config = provider.get("config", {})
        # If the channel is configured to process structured email generated from a google form
        if config.get("formatted", False):
            return self._parse_formatted_email(data, provider)
        try:
            new_items = []
            # create an item for the body text of the email
            # either text or html
            item = dict()
            item[ITEM_TYPE] = CONTENT_TYPE.TEXT
            item["versioncreated"] = utcnow()

            comp_item = None

            # a list to keep the references to the attachments
            refs = []

            html_body = None
            text_body = None

            for response_part in data:
                if not isinstance(response_part, tuple):
                    continue

                msg = email.message_from_bytes(response_part[1])
                item["headline"] = self.parse_header(msg["subject"])
                field_from = self.parse_header(msg["from"])
                item["original_source"] = field_from
                try:
                    # the first address found in the From header identifies the creator
                    addresses = email_regex.findall(field_from)
                    if addresses:
                        user = get_resource_service("users").get_user_by_email(addresses[0])
                        item["original_creator"] = user[eve.utils.config.ID_FIELD]
                except UserNotRegisteredException:
                    # unknown senders are fine, the item just has no original_creator
                    pass
                item["guid"] = msg["Message-ID"]
                date_tuple = email.utils.parsedate_tz(msg["Date"])
                if date_tuple:
                    dt = datetime.datetime.utcfromtimestamp(email.utils.mktime_tz(date_tuple))
                    dt = dt.replace(tzinfo=timezone("utc"))
                    item["firstcreated"] = dt

                # this will loop through all the available multiparts in mail
                for part in msg.walk():
                    if part.get_content_type() == "text/plain":
                        body = part.get_payload(decode=True)
                        try:
                            # if we don't know the charset just have a go!
                            if part.get_content_charset() is None:
                                text_body = body.decode()
                            else:
                                charset = part.get_content_charset()
                                text_body = body.decode(charset)
                            continue
                        except Exception as ex:
                            logger.exception(
                                "Exception parsing text body for {0} from {1}: {2}".format(
                                    item["headline"], field_from, ex
                                )
                            )
                            continue
                    if part.get_content_type() == "text/html":
                        body = part.get_payload(decode=True)
                        try:
                            if part.get_content_charset() is None:
                                html_body = body.decode()
                            else:
                                charset = part.get_content_charset()
                                html_body = body.decode(charset)
                            html_body = sanitize_html(html_body)
                            continue
                        except Exception as ex:
                            logger.exception(
                                "Exception parsing html body for {0} from {1}: {2}".format(
                                    item["headline"], field_from, ex
                                )
                            )
                            continue
                    if part.get_content_maintype() == "multipart":
                        continue
                    if part.get("Content-Disposition") is None:
                        continue
                    # we are only going to pull off image attachments at this stage
                    if part.get_content_maintype() != "image":
                        continue

                    fileName = part.get_filename()
                    if not fileName:
                        continue

                    image = part.get_payload(decode=True)
                    content = io.BytesIO(image)
                    res = process_file_from_stream(content, part.get_content_type())
                    file_name, content_type, metadata = res
                    if content_type == "image/gif" or content_type == "image/png":
                        continue
                    content.seek(0)
                    image_id = self.parser_app.media.put(
                        content, filename=fileName, content_type=content_type, metadata=metadata
                    )
                    renditions = {"baseImage": {"href": image_id}}

                    # if we have not got a composite item then create one
                    if not comp_item:
                        comp_item = dict()
                        comp_item[ITEM_TYPE] = CONTENT_TYPE.COMPOSITE
                        comp_item["guid"] = generate_guid(type=GUID_TAG)
                        comp_item["versioncreated"] = utcnow()
                        comp_item["groups"] = []
                        comp_item["headline"] = item["headline"]
                        comp_item["original_source"] = item["original_source"]
                        if "original_creator" in item:
                            comp_item["original_creator"] = item["original_creator"]

                        # create a reference to the item that stores the body of the email
                        item_ref = {
                            "guid": item["guid"],
                            "residRef": item["guid"],
                            "headline": item["headline"],
                            "location": "ingest",
                            "itemClass": "icls:text",
                            "original_source": item["original_source"],
                        }
                        if "original_creator" in item:
                            item_ref["original_creator"] = item["original_creator"]
                        refs.append(item_ref)

                    media_item = dict()
                    media_item["guid"] = generate_guid(type=GUID_TAG)
                    media_item["versioncreated"] = utcnow()
                    media_item[ITEM_TYPE] = CONTENT_TYPE.PICTURE
                    media_item["renditions"] = renditions
                    media_item["mimetype"] = content_type
                    set_filemeta(media_item, metadata)
                    media_item["slugline"] = fileName
                    # only set when a text/plain part was decoded before this attachment
                    if text_body is not None:
                        media_item["body_html"] = text_body
                    media_item["headline"] = item["headline"]
                    media_item["original_source"] = item["original_source"]
                    if "original_creator" in item:
                        media_item["original_creator"] = item["original_creator"]
                    new_items.append(media_item)

                    # add a reference to this item in the composite item
                    media_ref = {
                        "guid": media_item["guid"],
                        "residRef": media_item["guid"],
                        "headline": fileName,
                        "location": "ingest",
                        "itemClass": "icls:picture",
                        "original_source": item["original_source"],
                    }
                    if "original_creator" in item:
                        media_ref["original_creator"] = item["original_creator"]
                    refs.append(media_ref)

            if html_body:
                item["body_html"] = html_body
            else:
                # text_body is still None when the mail had no decodable text/plain
                # part (e.g. attachments only); fall back to an empty body instead
                # of failing the whole mail on a str + None concatenation
                item["body_html"] = "<pre>" + (text_body or "") + "</pre>"
                item[FORMAT] = FORMATS.PRESERVED

            # if there is composite item then add the main group and references
            if comp_item:
                grefs = {"refs": [{"idRef": "main"}], "id": "root", "role": "grpRole:NEP"}
                comp_item["groups"].append(grefs)

                grefs = {"refs": refs, "id": "main", "role": "grpRole:Main"}
                comp_item["groups"].append(grefs)

                new_items.append(comp_item)

            new_items.append(item)
            return new_items
        except Exception as ex:
            raise IngestEmailError.emailParseError(ex, provider)
Пример #27
0
    def parse(self, data, provider=None):
        """Turn fetched email data into a list of ingest items.

        When the provider config has ``formatted`` set, parsing is delegated to
        ``self._parse_formatted_email``. Otherwise every tuple in ``data`` is
        read as a raw message (bytes at index 1) and produces:

        * one text item holding the mail body (the HTML part passed through
          ``self.safe_html`` if present, otherwise the plain text wrapped in
          ``<pre>`` and marked as preserved format),
        * one picture item per named image attachment that has a
          ``Content-Disposition`` header (GIF and PNG are skipped), stored
          through ``self.parser_app.media``,
        * one composite item referencing the text item and all pictures,
          created only when at least one picture was stored.

        :param data: iterable of response parts; non-tuple parts are ignored
        :param provider: ingest provider dict, its ``config`` is read
        :return: list with the picture items, then the composite item (if any),
            then the text item
        :raises IngestEmailError: ``emailParseError`` wrapping any exception
            raised while parsing
        """
        config = provider.get('config', {})
        # If the channel is configured to process structured email generated from a google form
        if config.get('formatted', False):
            return self._parse_formatted_email(data, provider)
        try:
            new_items = []
            # create an item for the body text of the email
            # either text or html
            item = dict()
            item[ITEM_TYPE] = CONTENT_TYPE.TEXT
            item['versioncreated'] = utcnow()

            comp_item = None

            # a list to keep the references to the attachments
            refs = []

            html_body = None
            text_body = None

            for response_part in data:
                if not isinstance(response_part, tuple):
                    continue

                msg = email.message_from_bytes(response_part[1])
                item['headline'] = self.parse_header(msg['subject'])
                field_from = self.parse_header(msg['from'])
                item['original_source'] = field_from
                try:
                    # the first address found in the From header identifies the creator
                    addresses = email_regex.findall(field_from)
                    if addresses:
                        user = get_resource_service('users').get_user_by_email(addresses[0])
                        item['original_creator'] = user[eve.utils.config.ID_FIELD]
                except UserNotRegisteredException:
                    # unknown senders are fine, the item just has no original_creator
                    pass
                item['guid'] = msg['Message-ID']
                date_tuple = email.utils.parsedate_tz(msg['Date'])
                if date_tuple:
                    dt = datetime.datetime.utcfromtimestamp(email.utils.mktime_tz(date_tuple))
                    dt = dt.replace(tzinfo=timezone('utc'))
                    item['firstcreated'] = dt

                # this will loop through all the available multiparts in mail
                for part in msg.walk():
                    if part.get_content_type() == "text/plain":
                        body = part.get_payload(decode=True)
                        try:
                            # if we don't know the charset just have a go!
                            if part.get_content_charset() is None:
                                text_body = body.decode()
                            else:
                                charset = part.get_content_charset()
                                text_body = body.decode(charset)
                            continue
                        except Exception as ex:
                            logger.exception(
                                "Exception parsing text body for {0} from {1}: {2}".format(
                                    item['headline'], field_from, ex))
                            continue
                    if part.get_content_type() == "text/html":
                        body = part.get_payload(decode=True)
                        try:
                            if part.get_content_charset() is None:
                                html_body = body.decode()
                            else:
                                charset = part.get_content_charset()
                                html_body = body.decode(charset)
                            html_body = self.safe_html(html_body)
                            continue
                        except Exception as ex:
                            logger.exception(
                                "Exception parsing html body for {0} from {1}: {2}".format(
                                    item['headline'], field_from, ex))
                            continue
                    if part.get_content_maintype() == 'multipart':
                        continue
                    if part.get('Content-Disposition') is None:
                        continue
                    # we are only going to pull off image attachments at this stage
                    if part.get_content_maintype() != 'image':
                        continue

                    fileName = part.get_filename()
                    if not fileName:
                        continue

                    image = part.get_payload(decode=True)
                    content = io.BytesIO(image)
                    res = process_file_from_stream(content, part.get_content_type())
                    file_name, content_type, metadata = res
                    if content_type == 'image/gif' or content_type == 'image/png':
                        continue
                    content.seek(0)
                    image_id = self.parser_app.media.put(content,
                                                         filename=fileName,
                                                         content_type=content_type,
                                                         metadata=metadata)
                    renditions = {'baseImage': {'href': image_id}}

                    # if we have not got a composite item then create one
                    if not comp_item:
                        comp_item = dict()
                        comp_item[ITEM_TYPE] = CONTENT_TYPE.COMPOSITE
                        comp_item['guid'] = generate_guid(type=GUID_TAG)
                        comp_item['versioncreated'] = utcnow()
                        comp_item['groups'] = []
                        comp_item['headline'] = item['headline']
                        comp_item['original_source'] = item['original_source']
                        if 'original_creator' in item:
                            comp_item['original_creator'] = item['original_creator']

                        # create a reference to the item that stores the body of the email
                        item_ref = {
                            'guid': item['guid'],
                            'residRef': item['guid'],
                            'headline': item['headline'],
                            'location': 'ingest',
                            'itemClass': 'icls:text',
                            'original_source': item['original_source']
                        }
                        if 'original_creator' in item:
                            item_ref['original_creator'] = item['original_creator']
                        refs.append(item_ref)

                    media_item = dict()
                    media_item['guid'] = generate_guid(type=GUID_TAG)
                    media_item['versioncreated'] = utcnow()
                    media_item[ITEM_TYPE] = CONTENT_TYPE.PICTURE
                    media_item['renditions'] = renditions
                    media_item['mimetype'] = content_type
                    set_filemeta(media_item, metadata)
                    media_item['slugline'] = fileName
                    # only set when a text/plain part was decoded before this attachment
                    if text_body is not None:
                        media_item['body_html'] = text_body
                    media_item['headline'] = item['headline']
                    media_item['original_source'] = item['original_source']
                    if 'original_creator' in item:
                        media_item['original_creator'] = item['original_creator']
                    new_items.append(media_item)

                    # add a reference to this item in the composite item
                    media_ref = {
                        'guid': media_item['guid'],
                        'residRef': media_item['guid'],
                        'headline': fileName,
                        'location': 'ingest',
                        'itemClass': 'icls:picture',
                        'original_source': item['original_source']
                    }
                    if 'original_creator' in item:
                        media_ref['original_creator'] = item['original_creator']
                    refs.append(media_ref)

            if html_body is not None:
                item['body_html'] = html_body
            else:
                # text_body is still None when the mail had no decodable text/plain
                # part (e.g. attachments only); fall back to an empty body instead
                # of failing the whole mail on a str + None concatenation
                item['body_html'] = '<pre>' + (text_body or '') + '</pre>'
                item[FORMAT] = FORMATS.PRESERVED

            # if there is composite item then add the main group and references
            if comp_item:
                grefs = {
                    'refs': [{
                        'idRef': 'main'
                    }],
                    'id': 'root',
                    'role': 'grpRole:NEP'
                }
                comp_item['groups'].append(grefs)

                grefs = {'refs': refs, 'id': 'main', 'role': 'grpRole:Main'}
                comp_item['groups'].append(grefs)

                new_items.append(comp_item)

            new_items.append(item)
            return new_items
        except Exception as ex:
            raise IngestEmailError.emailParseError(ex, provider)
Пример #28
0
    def find_one_raw(self, resource, _id):
        """Fetch one asset, store its binary in media storage and return the doc.

        The asset document is loaded from the search API, its binary is
        downloaded, saved through ``self._app.media`` and renditions are
        generated for it.

        :param resource: not used by this method
        :param _id: id of the asset, interpolated into the ``/Assets/<id>`` URL
        :return: the parsed asset document with ``mimetype``, ``filemeta``,
            ``versioncreated`` and ``renditions`` filled in
        :raises FileNotFoundError: credentials are set but the expected
            resolution ('Original' for pictures, 'Ipod' for videos) is missing
        :raises NotImplementedError: credentials are set and the item is
            neither a picture nor a video
        :raises SuperdeskApiError: storing the file or generating renditions failed
        """
        # no headers yet means the auth cookie still has to be set
        if self._headers is None:
            self.__set_auth_cookie(self._app)

        asset_url = self._app.config['AAP_MM_SEARCH_URL'] + '/Assets/{}'.format(_id)
        response = self._http.request('GET', asset_url, headers=self._headers)
        doc = json.loads(response.data.decode('UTF-8'))
        self._parse_doc(doc)
        if 'fetch_endpoint' in doc:
            del doc['fetch_endpoint']

        if self._username is None or self._password is None:
            # without credentials fall back to the rendition advertised in the document
            if doc[ITEM_TYPE] == CONTENT_TYPE.VIDEO:
                mime_type = doc.get('renditions').get('original').get('mimetype')
            else:
                mime_type = 'image/jpeg'
            download_url = doc['renditions']['original']['href']
        else:
            # Only if we have credentials can we download the original,
            # and only if the account has that privilege
            resolutions = self._get_resolutions(_id)
            item_type = doc[ITEM_TYPE]
            if item_type == CONTENT_TYPE.PICTURE:
                if not any(entry['Name'] == 'Original' for entry in resolutions['Image']):
                    raise FileNotFoundError
                download_url = asset_url + '/Original/download'
                mime_type = 'image/jpeg'
            elif item_type == CONTENT_TYPE.VIDEO:
                if not any(entry['Name'] == 'Ipod' for entry in resolutions['Video']):
                    raise FileNotFoundError
                download_url = asset_url + '/Ipod/download'
                mime_type = doc.get('renditions').get('original').get('mimetype')
            else:
                raise NotImplementedError

        download = self._http.request('GET', download_url, headers=self._headers)
        out = BytesIO(download.data)
        file_name, content_type, metadata = process_file_from_stream(out, mime_type)

        # ids of files written so far, so they can be removed again on failure
        inserted = []

        try:
            logger.debug('Going to save media file with %s ' % file_name)
            out.seek(0)
            file_id = self._app.media.put(
                out, filename=file_name, content_type=content_type, metadata=None)
            doc['mimetype'] = content_type
            doc['filemeta'] = decode_metadata(metadata)
            # set the version created to now to bring it to the top of the desk, images can be quite old
            doc['versioncreated'] = utcnow()
            inserted = [file_id]
            doc['renditions'] = generate_renditions(
                out,
                file_id,
                inserted,
                content_type.split('/')[0],
                content_type,
                self._app.config['RENDITIONS']['picture'],
                self.url_for_media,
                insert_metadata=False)
        except Exception as error:
            logger.exception(error)
            for stored_id in inserted:
                delete_file_on_error(doc, stored_id)

            raise SuperdeskApiError.internalError('Generating renditions failed')

        return doc
Пример #29
0
    def parse_email(self, data, provider):
        """Turn a fetched email into a list of ingest items.

        Every tuple found in ``data`` is parsed as a raw message (element 1 of
        the tuple holds the message bytes). The message body becomes a text
        item; every named image attachment other than GIF/PNG is stored via
        ``self.parser_app.media`` and becomes a picture item. When at least one
        picture was stored, a composite item referencing the body item and the
        pictures is added as well.

        :param data: iterable of response parts; non-tuple parts are ignored
        :param provider: handed to ``IngestEmailError.emailParseError`` on failure
        :return: list of item dicts: picture items first, then the composite
            item (if any), then the body item
        :raises IngestEmailError: any exception raised while parsing is wrapped
        """
        try:
            new_items = []
            # create an item for the body text of the email
            # either text or html
            item = dict()
            item['type'] = 'text'
            item['versioncreated'] = utcnow()

            comp_item = None

            # a list to keep the references to the attachments
            refs = []

            html_body = None
            text_body = None

            for response_part in data:
                if isinstance(response_part, tuple):
                    msg = email.message_from_bytes(response_part[1])
                    item['headline'] = self.parse_header(msg['subject'])
                    item['original_creator'] = self.parse_header(msg['from'])
                    item['guid'] = msg['Message-ID']
                    date_tuple = email.utils.parsedate_tz(msg['Date'])
                    if date_tuple:
                        dt = datetime.datetime.utcfromtimestamp(email.utils.mktime_tz(date_tuple))
                        dt = dt.replace(tzinfo=timezone('utc'))
                        item['firstcreated'] = dt

                    # this will loop through all the available multiparts in mail
                    for part in msg.walk():
                        if part.get_content_type() == "text/plain":
                            body = part.get_payload(decode=True)
                            try:
                                # if we don't know the charset just have a go!
                                if part.get_content_charset() is None:
                                    text_body = body.decode()
                                else:
                                    charset = part.get_content_charset()
                                    text_body = body.decode(charset)
                                continue
                            except Exception:
                                # logger.exception already records the active exception;
                                # the values are passed as lazy %-style arguments so the
                                # record can actually be formatted
                                logger.exception(
                                    "Exception parsing text body for %s from %s",
                                    item['headline'], item['original_creator'])
                                continue
                        if part.get_content_type() == "text/html":
                            body = part.get_payload(decode=True)
                            try:
                                if part.get_content_charset() is None:
                                    html_body = body.decode()
                                else:
                                    charset = part.get_content_charset()
                                    html_body = body.decode(charset)
                                html_body = self.safe_html(html_body)
                                continue
                            except Exception:
                                logger.exception(
                                    "Exception parsing text html for %s from %s",
                                    item['headline'], item['original_creator'])
                                continue
                        if part.get_content_maintype() == 'multipart':
                            continue
                        if part.get('Content-Disposition') is None:
                            continue
                        # we are only going to pull off image attachments at this stage
                        if part.get_content_maintype() != 'image':
                            continue

                        fileName = part.get_filename()
                        if bool(fileName):
                            image = part.get_payload(decode=True)
                            content = io.BytesIO(image)
                            res = process_file_from_stream(content, part.get_content_type())
                            file_name, content_type, metadata = res
                            if content_type in ('image/gif', 'image/png'):
                                continue
                            # rewind before handing the stream to the media storage
                            content.seek(0)
                            image_id = self.parser_app.media.put(content, filename=fileName,
                                                                 content_type=content_type, metadata=metadata)
                            renditions = {'baseImage': {'href': image_id}}

                            # if we have not got a composite item then create one
                            if not comp_item:
                                comp_item = dict()
                                comp_item['type'] = 'composite'
                                comp_item['guid'] = generate_guid(type=GUID_TAG)
                                comp_item['versioncreated'] = utcnow()
                                comp_item['groups'] = []
                                comp_item['headline'] = item['headline']

                                # create a reference to the item that stores the body of the email
                                item_ref = {}
                                item_ref['guid'] = item['guid']
                                item_ref['residRef'] = item['guid']
                                item_ref['headline'] = item['headline']
                                item_ref['location'] = 'ingest'
                                item_ref['itemClass'] = 'icls:text'
                                refs.append(item_ref)

                            media_item = dict()
                            media_item['guid'] = generate_guid(type=GUID_TAG)
                            media_item['versioncreated'] = utcnow()
                            media_item['type'] = 'picture'
                            media_item['renditions'] = renditions
                            media_item['mimetype'] = content_type
                            media_item['filemeta'] = metadata
                            media_item['slugline'] = fileName
                            if text_body is not None:
                                media_item['body_html'] = text_body
                            media_item['headline'] = item['headline']
                            new_items.append(media_item)

                            # add a reference to this item in the composite item
                            media_ref = {}
                            media_ref['guid'] = media_item['guid']
                            media_ref['residRef'] = media_item['guid']
                            media_ref['headline'] = fileName
                            media_ref['location'] = 'ingest'
                            media_ref['itemClass'] = 'icls:picture'
                            refs.append(media_ref)

            if html_body is not None:
                item['body_html'] = html_body
            else:
                item['body_html'] = text_body
                item['type'] = 'preformatted'

            # if there is composite item then add the main group and references
            if comp_item:
                grefs = {}
                grefs['refs'] = [{'idRef': 'main'}]
                grefs['id'] = 'root'
                grefs['role'] = 'grpRole:NEP'
                comp_item['groups'].append(grefs)

                grefs = {}
                grefs['refs'] = refs
                grefs['id'] = 'main'
                grefs['role'] = 'grpRole:Main'
                comp_item['groups'].append(grefs)

                new_items.append(comp_item)

            new_items.append(item)
            return new_items
        except Exception as ex:
            raise IngestEmailError.emailParseError(ex, provider)
Пример #30
0
    def parse(self, data, provider=None):
        """Turn a fetched email into a list of ingest items.

        When the provider config has ``formatted`` set, parsing is delegated to
        ``self._parse_formatted_email``. Otherwise every tuple in ``data`` is
        parsed as a raw message: the body becomes a text item, every named
        image attachment other than GIF/PNG is stored via
        ``self.parser_app.media`` and becomes a picture item, and a composite
        item referencing all of them is added when at least one picture exists.

        :param data: iterable of response parts; non-tuple parts are ignored
        :param provider: ingest provider dict; may be ``None``, in which case an
            empty config is assumed
        :return: list of item dicts: picture items first, then the composite
            item (if any), then the body item
        :raises IngestEmailError: any exception raised while parsing is wrapped
        """
        # provider defaults to None, so guard the config lookup
        config = (provider or {}).get('config', {})
        # If the channel is configured to process structured email generated from a google form
        if config.get('formatted', False):
            return self._parse_formatted_email(data, provider)
        try:
            new_items = []
            # create an item for the body text of the email
            # either text or html
            item = dict()
            item[ITEM_TYPE] = CONTENT_TYPE.TEXT
            item['versioncreated'] = utcnow()

            comp_item = None

            # a list to keep the references to the attachments
            refs = []

            html_body = None
            text_body = None

            for response_part in data:
                if isinstance(response_part, tuple):
                    msg = email.message_from_bytes(response_part[1])
                    item['headline'] = self.parse_header(msg['subject'])
                    field_from = self.parse_header(msg['from'])
                    item['original_source'] = field_from
                    try:
                        # run the regex once and reuse the matches
                        addresses = email_regex.findall(field_from)
                        if addresses:
                            email_address = addresses[0]
                            user = get_resource_service('users').get_user_by_email(email_address)
                            item['original_creator'] = user[eve.utils.config.ID_FIELD]
                    except UserNotRegisteredException:
                        # unknown sender: the item simply gets no original_creator
                        pass
                    item['guid'] = msg['Message-ID']
                    date_tuple = email.utils.parsedate_tz(msg['Date'])
                    if date_tuple:
                        dt = datetime.datetime.utcfromtimestamp(
                            email.utils.mktime_tz(date_tuple))
                        dt = dt.replace(tzinfo=timezone('utc'))
                        item['firstcreated'] = dt

                    # this will loop through all the available multiparts in mail
                    for part in msg.walk():
                        if part.get_content_type() == "text/plain":
                            body = part.get_payload(decode=True)
                            try:
                                # if we don't know the charset just have a go!
                                if part.get_content_charset() is None:
                                    text_body = body.decode()
                                else:
                                    charset = part.get_content_charset()
                                    text_body = body.decode(charset)
                                continue
                            except Exception as ex:
                                logger.exception(
                                    "Exception parsing text body for {0} from {1}: {2}".format(item['headline'],
                                                                                               field_from, ex))
                                continue
                        if part.get_content_type() == "text/html":
                            body = part.get_payload(decode=True)
                            try:
                                if part.get_content_charset() is None:
                                    html_body = body.decode()
                                else:
                                    charset = part.get_content_charset()
                                    html_body = body.decode(charset)
                                html_body = self.safe_html(html_body)
                                continue
                            except Exception as ex:
                                logger.exception(
                                    "Exception parsing html body for {0} from {1}: {2}".format(item['headline'],
                                                                                               field_from, ex))
                                continue
                        if part.get_content_maintype() == 'multipart':
                            continue
                        if part.get('Content-Disposition') is None:
                            continue
                        # we are only going to pull off image attachments at this stage
                        if part.get_content_maintype() != 'image':
                            continue

                        fileName = part.get_filename()
                        if bool(fileName):
                            image = part.get_payload(decode=True)
                            content = io.BytesIO(image)
                            res = process_file_from_stream(content, part.get_content_type())
                            file_name, content_type, metadata = res
                            if content_type in ('image/gif', 'image/png'):
                                continue
                            # rewind before handing the stream to the media storage
                            content.seek(0)
                            image_id = self.parser_app.media.put(content, filename=fileName,
                                                                 content_type=content_type, metadata=metadata)
                            renditions = {'baseImage': {'href': image_id}}

                            # if we have not got a composite item then create one
                            if not comp_item:
                                comp_item = dict()
                                comp_item[ITEM_TYPE] = CONTENT_TYPE.COMPOSITE
                                comp_item['guid'] = generate_guid(type=GUID_TAG)
                                comp_item['versioncreated'] = utcnow()
                                comp_item['groups'] = []
                                comp_item['headline'] = item['headline']
                                comp_item['original_source'] = item['original_source']
                                if 'original_creator' in item:
                                    comp_item['original_creator'] = item['original_creator']

                                # create a reference to the item that stores the body of the email
                                item_ref = {'guid': item['guid'], 'residRef': item['guid'],
                                            'headline': item['headline'], 'location': 'ingest',
                                            'itemClass': 'icls:text', 'original_source': item['original_source']}
                                if 'original_creator' in item:
                                    item_ref['original_creator'] = item['original_creator']
                                refs.append(item_ref)

                            media_item = dict()
                            media_item['guid'] = generate_guid(type=GUID_TAG)
                            media_item['versioncreated'] = utcnow()
                            media_item[ITEM_TYPE] = CONTENT_TYPE.PICTURE
                            media_item['renditions'] = renditions
                            media_item['mimetype'] = content_type
                            set_filemeta(media_item, metadata)
                            media_item['slugline'] = fileName
                            if text_body is not None:
                                media_item['body_html'] = text_body
                            media_item['headline'] = item['headline']
                            media_item['original_source'] = item['original_source']
                            if 'original_creator' in item:
                                media_item['original_creator'] = item['original_creator']
                            new_items.append(media_item)

                            # add a reference to this item in the composite item
                            media_ref = {'guid': media_item['guid'], 'residRef': media_item['guid'],
                                         'headline': fileName, 'location': 'ingest', 'itemClass': 'icls:picture',
                                         'original_source': item['original_source']}
                            if 'original_creator' in item:
                                media_ref['original_creator'] = item['original_creator']
                            refs.append(media_ref)

            if html_body is not None:
                item['body_html'] = html_body
            else:
                # text_body stays None when the mail had no text part or decoding it
                # failed (which is logged and skipped above); do not crash on it here
                item['body_html'] = '<pre>' + (text_body or '') + '</pre>'
                item[FORMAT] = FORMATS.PRESERVED

            # if there is composite item then add the main group and references
            if comp_item:
                grefs = {'refs': [{'idRef': 'main'}], 'id': 'root', 'role': 'grpRole:NEP'}
                comp_item['groups'].append(grefs)

                grefs = {'refs': refs, 'id': 'main', 'role': 'grpRole:Main'}
                comp_item['groups'].append(grefs)

                new_items.append(comp_item)

            new_items.append(item)
            return new_items
        except Exception as ex:
            raise IngestEmailError.emailParseError(ex, provider)
Пример #31
0
    def parse_newscomponent_media(self, item, newscomponent_el):
        """
        Parse a media NewsComponent of a NewsItem element into ``item``.

        Fills language, news lines, administrative and descriptive metadata,
        the description/headline texts and the item type, then stores the files
        of nested NewsComponents whose role is listed in
        `SUPPORTED_MEDIA_ASSET_TYPES` for the item's role and registers them as
        renditions.

        Example:

        <NewsComponent>
          <NewsLines>
              <DateLine xml:lang="fr">Paris, 9 déc 2018 (AFP) -</DateLine>
            <HeadLine xml:lang="fr">Un an après, les fans de Johnny lui rendent hommage à Paris</HeadLine>
            <NewsLine>
              <NewsLineType FormalName="ProductLine"/>
              <NewsLineText xml:lang="fr">(Photo+Live Video+Video)</NewsLineText>
            </NewsLine>
          </NewsLines>
          <AdministrativeMetadata>
            <Provider>
              <Party FormalName="AFP"/>
            </Provider>
          </AdministrativeMetadata>
          <DescriptiveMetadata>
            ....
          </DescriptiveMetadata>
          <ContentItem>
            ....
          </ContentItem>
        </NewsComponent>

        :param item: item dict, updated in place
        :param newscomponent_el: the NewsComponent element to parse
        :return: None
        :raises SkipItemException: a Body/Title component has no usable
            DataContent/Format, or the Role element has no FormalName
        """

        # language
        item['language'] = newscomponent_el.attrib.get(XML_LANG)

        # NewsLines
        newslines_el = newscomponent_el.find('NewsLines')
        self.parse_newslines(item, newslines_el)

        # AdministrativeMetadata
        admin_el = newscomponent_el.find('AdministrativeMetadata')
        self.parse_administrativemetadata(item, admin_el)

        # DescriptiveMetadata
        descript_el = newscomponent_el.find('DescriptiveMetadata')
        self.parse_descriptivemetadata(item, descript_el)

        # description_text, headline
        for role_formalname, item_key in (('Body', 'description_text'), ('Title', 'headline')):
            role = newscomponent_el.find('NewsComponent/Role[@FormalName="{}"]'.format(role_formalname))
            if role is None:
                continue

            newscomponent = role.getparent()
            datacontent = newscomponent.find('ContentItem/DataContent')
            format_el = newscomponent.find('ContentItem/Format')

            if datacontent is None or format_el is None:
                logger.warning('Mimetype or DataContent was not found. Skiping an "{}" item.'.format(
                    item['guid']
                ))
                raise SkipItemException

            # a missing FormalName (None) is rejected by the membership test as well
            content_format = format_el.attrib.get('FormalName')
            if content_format not in ('Text', 'ascii'):
                logger.warning(
                    'ContentItem/FormalName was not found or not supported: "{}". '
                    'Skiping an "{}" item.'.format(content_format, item['guid'])
                )
                raise SkipItemException

            if datacontent.text:
                item[item_key] = datacontent.text.strip()

                if item_key == 'description_text':
                    item[item_key] = self._plain_to_html(item[item_key])

        # type
        # role_name stays None when the component has no direct Role child
        role_name = None
        role = newscomponent_el.find('Role')
        if role is not None:
            role_name = role.attrib.get('FormalName')
            if not role_name:
                logger.warning('NewsComponent/Role was not found. Skiping an "{}" item.'.format(
                    item['guid']
                ))
                raise SkipItemException
            role_name = role_name.upper()
            item[ITEM_TYPE] = getattr(CONTENT_TYPE, role_name)

        # read files and save them into the storage
        for newscomponent in newscomponent_el.findall('NewsComponent'):
            component_role = self._get_role(newscomponent)
            # without a known item role there is no asset mapping to consult
            if not component_role or role_name is None:
                continue

            asset_types = self.SUPPORTED_MEDIA_ASSET_TYPES[role_name]
            asset_role = component_role.upper()
            if asset_role not in asset_types:
                continue

            content_item = newscomponent.find('ContentItem')
            if content_item is None:
                continue

            filename = content_item.attrib.get('Href')
            if filename is None:
                continue

            format_name = ''
            format_el = content_item.find('Format')
            if format_el is not None:
                # fall back to '' so the mimetype hint below can always be built
                format_name = format_el.attrib.get('FormalName') or ''

            content = self._get_file(filename)
            if not content:
                continue

            _, content_type, metadata = process_file_from_stream(content, 'application/' + format_name)
            # rewind before handing the stream to the media storage
            content.seek(0)
            media_id = app.media.put(
                content,
                filename=filename,
                content_type=content_type,
                metadata=metadata
            )

            rendition_key = asset_types[asset_role]
            item.setdefault('renditions', {})[rendition_key] = {
                'media': media_id,
                'mimetype': content_type,
                'href': app.media.url_for_media(media_id, content_type),
            }

        # these attributes are redundant for a media item
        attrs_to_be_removed = ('date_id', 'item_id', 'provider_id', 'public_identifier')
        for attr in attrs_to_be_removed:
            if attr in item:
                del item[attr]

        # clean subject
        subject_to_be_removed = (
            'genre',
        )
        item['subject'] = [i for i in item.get('subject', []) if i['scheme'] not in subject_to_be_removed]
Пример #32
0
    def _update(self, provider, update):
        """Fetch matching emails over IMAP and parse their attachments.

        Connects with IMAP4 over SSL, selects the configured mailbox and
        searches it with the configured filter (``(UNSEEN)`` by default). For
        every fetched message each attachment that has a file name is handed to
        the feed parser; afterwards the message is flagged as seen.

        :param provider: ingest provider dict; its ``config`` supplies
            ``server``, ``port``, ``user``, ``password``, ``mailbox`` and ``filter``
        :param update: not used by this method
        :return: list with one entry per parsed attachment (whatever the parser
            returned); empty when the mailbox could not be selected or searched
        :raises IngestEmailError: on login failure or any other error
        """
        config = provider.get('config', {})
        server = config.get('server', '')
        port = int(config.get('port', 993))
        # bound up-front so the final return works even if select/search fail
        new_items = []

        try:
            imap = imaplib.IMAP4_SSL(host=server, port=port)
            try:
                imap.login(config.get('user', None),
                           config.get('password', None))
            except imaplib.IMAP4.error as ex:
                # report the actual error instance, not the exception class
                raise IngestEmailError.emailLoginError(ex, provider)

            rv, _ = imap.select(config.get('mailbox', None), readonly=False)
            if rv == 'OK':
                rv, found = imap.search(None, config.get('filter', '(UNSEEN)'))
                if rv == 'OK':
                    for num in found[0].split():
                        rv, data = imap.fetch(num, '(RFC822)')
                        if rv != 'OK':
                            continue
                        try:
                            logger.info('Ingesting events from email')
                            parser = self.get_feed_parser(provider, data)
                            for response_part in data:
                                if not isinstance(response_part, tuple):
                                    continue
                                if isinstance(response_part[1], bytes):
                                    msg = email.message_from_bytes(response_part[1])
                                else:
                                    msg = email.message_from_string(response_part[1])
                                # this will loop through all the available multiparts in email
                                for part in msg.walk():
                                    # parse attached files only
                                    if part.get('Content-Disposition') is None:
                                        continue
                                    fileName = part.get_filename()
                                    if not fileName:
                                        continue
                                    content = io.BytesIO(part.get_payload(decode=True))
                                    _, content_type, _ = process_file_from_stream(
                                        content, part.get_content_type())
                                    logger.info('Ingesting events with {} parser'.format(
                                        parser.__class__.__name__))
                                    # default of None lets parsers without parse_email
                                    # fall through to the generic parse() branch
                                    if getattr(parser, 'parse_email', None):
                                        try:
                                            new_items.append(
                                                parser.parse_email(content, content_type, provider))
                                        # NOTE(review): confirm parseMessageError is an exception
                                        # class and not a factory method of ParserError
                                        except ParserError.parseMessageError:
                                            continue
                                    else:
                                        new_items.append(parser.parse(data, provider))
                            imap.store(num, '+FLAGS', '\\Seen')
                        except IngestEmailError:
                            # skip this message; it is not flagged as seen
                            continue
                imap.close()
            imap.logout()
        except IngestEmailError:
            raise
        except Exception as ex:
            raise IngestEmailError.emailError(ex, provider)
        return new_items
Пример #33
0
    def parse_email(self, data, provider):
        try:
            new_items = []
            # create an item for the body text of the email
            # either text or html
            item = dict()
            item['type'] = 'text'
            item['versioncreated'] = utcnow()

            comp_item = None

            # a list to keep the references to the attachments
            refs = []

            html_body = None
            text_body = None

            for response_part in data:
                if isinstance(response_part, tuple):
                    msg = email.message_from_bytes(response_part[1])
                    item['headline'] = self.parse_header(msg['subject'])
                    item['original_creator'] = self.parse_header(msg['from'])
                    item['guid'] = msg['Message-ID']
                    date_tuple = email.utils.parsedate_tz(msg['Date'])
                    if date_tuple:
                        dt = datetime.datetime.utcfromtimestamp(
                            email.utils.mktime_tz(date_tuple))
                        dt = dt.replace(tzinfo=timezone('utc'))
                        item['firstcreated'] = dt

                    # this will loop through all the available multiparts in mail
                    for part in msg.walk():
                        if part.get_content_type() == "text/plain":
                            body = part.get_payload(decode=True)
                            try:
                                # if we don't know the charset just have a go!
                                if part.get_content_charset() is None:
                                    text_body = body.decode()
                                else:
                                    charset = part.get_content_charset()
                                    text_body = body.decode(charset)
                                continue
                            except Exception as ex:
                                logger.exception(
                                    "Exception parsing text body for {0} from {1}"
                                    .format(item['headline'],
                                            item['original_creator']), ex)
                                continue
                        if part.get_content_type() == "text/html":
                            body = part.get_payload(decode=True)
                            try:
                                if part.get_content_charset() is None:
                                    html_body = body.decode()
                                else:
                                    charset = part.get_content_charset()
                                    html_body = body.decode(charset)
                                html_body = self.safe_html(html_body)
                                continue
                            except Exception as ex:
                                logger.exception(
                                    "Exception parsing text html for {0} from {1}"
                                    .format(item['headline'],
                                            item['original_creator']), ex)
                                continue
                        if part.get_content_maintype() == 'multipart':
                            continue
                        if part.get('Content-Disposition') is None:
                            continue
                        # we are only going to pull off image attachments at this stage
                        if part.get_content_maintype() != 'image':
                            continue

                        fileName = part.get_filename()
                        if bool(fileName):
                            image = part.get_payload(decode=True)
                            content = io.BytesIO(image)
                            res = process_file_from_stream(
                                content, part.get_content_type())
                            file_name, content_type, metadata = res
                            if content_type == 'image/gif' or content_type == 'image/png':
                                continue
                            content.seek(0)
                            image_id = self.parser_app.media.put(
                                content,
                                filename=fileName,
                                content_type=content_type,
                                metadata=metadata)
                            renditions = {'baseImage': {'href': image_id}}

                            # if we have not got a composite item then create one
                            if not comp_item:
                                comp_item = dict()
                                comp_item['type'] = 'composite'
                                comp_item['guid'] = generate_guid(
                                    type=GUID_TAG)
                                comp_item['versioncreated'] = utcnow()
                                comp_item['groups'] = []
                                comp_item['headline'] = item['headline']
                                comp_item['groups'] = []

                                # create a reference to the item that stores the body of the email
                                item_ref = {}
                                item_ref['guid'] = item['guid']
                                item_ref['residRef'] = item['guid']
                                item_ref['headline'] = item['headline']
                                item_ref['location'] = 'ingest'
                                item_ref['itemClass'] = 'icls:text'
                                refs.append(item_ref)

                            media_item = dict()
                            media_item['guid'] = generate_guid(type=GUID_TAG)
                            media_item['versioncreated'] = utcnow()
                            media_item['type'] = 'picture'
                            media_item['renditions'] = renditions
                            media_item['mimetype'] = content_type
                            media_item['filemeta'] = metadata
                            media_item['slugline'] = fileName
                            if text_body is not None:
                                media_item['body_html'] = text_body
                            media_item['headline'] = item['headline']
                            new_items.append(media_item)

                            # add a reference to this item in the composite item
                            media_ref = {}
                            media_ref['guid'] = media_item['guid']
                            media_ref['residRef'] = media_item['guid']
                            media_ref['headline'] = fileName
                            media_ref['location'] = 'ingest'
                            media_ref['itemClass'] = 'icls:picture'
                            refs.append(media_ref)

            if html_body is not None:
                item['body_html'] = html_body
            else:
                item['body_html'] = text_body
                item['type'] = 'preformatted'

            # if there is composite item then add the main group and references
            if comp_item:
                grefs = {}
                grefs['refs'] = [{'idRef': 'main'}]
                grefs['id'] = 'root'
                grefs['role'] = 'grpRole:NEP'
                comp_item['groups'].append(grefs)

                grefs = {}
                grefs['refs'] = refs
                grefs['id'] = 'main'
                grefs['role'] = 'grpRole:Main'
                comp_item['groups'].append(grefs)

                new_items.append(comp_item)

            new_items.append(item)
            return new_items
        except Exception as ex:
            raise IngestEmailError.emailParseError(ex, provider)