def crop_and_store_file(self, doc, content, filename, content_type):
    """Crop ``content`` if needed, store it and generate avatar renditions.

    Updates ``doc`` in place with the stored media id, mimetype, decoded
    file metadata and the generated renditions.

    :param doc: item doc, also carries the crop instructions
    :param content: uploaded file stream
    :param filename: original file name
    :param content_type: mime type hint for the stream
    :raises SuperdeskApiError: when storing or rendition generation fails
    """
    # retrieve file name and metadata from file
    file_name, content_type, metadata = process_file_from_stream(content, content_type=content_type)
    # crop the file if needed, can change the image size
    was_cropped, out = crop_image(content, filename, doc)
    # the length in metadata could be updated if it was cropped
    if was_cropped:
        file_name, content_type, metadata_after_cropped = process_file_from_stream(out, content_type=content_type)
        # when cropped, metadata are reset; only the new length is carried over
        metadata['length'] = metadata_after_cropped['length']
    # FIX: bind before the try block so the cleanup loop below cannot raise
    # NameError when app.media.put itself fails
    inserted = []
    try:
        logger.debug('Going to save media file with %s ' % file_name)
        out.seek(0)
        file_id = app.media.put(out, filename=file_name, content_type=content_type,
                                resource=self.datasource, metadata=metadata)
        doc['media'] = file_id
        doc['mimetype'] = content_type
        doc['filemeta'] = decode_metadata(metadata)
        inserted = [doc['media']]
        file_type = content_type.split('/')[0]
        rendition_spec = config.RENDITIONS['avatar']
        renditions = generate_renditions(out, file_id, inserted, file_type,
                                         content_type, rendition_spec, url_for_media)
        doc['renditions'] = renditions
    except Exception as io:
        logger.exception(io)
        # remove anything already stored before propagating the failure
        for file_id in inserted:
            delete_file_on_error(doc, file_id)
        raise SuperdeskApiError.internalError('Generating renditions failed')
def crop_and_store_file(self, doc, content, filename, content_type):
    """Crop ``content`` if needed, store it and generate avatar renditions.

    Updates ``doc`` in place with the stored media id, mimetype, file
    metadata (via ``set_filemeta``) and the generated renditions.

    :param doc: item doc, also carries the crop instructions
    :param content: uploaded file stream
    :param filename: original file name
    :param content_type: mime type hint for the stream
    :raises SuperdeskApiError: when storing or rendition generation fails
    """
    # retrieve file name and metadata from file
    file_name, content_type, metadata = process_file_from_stream(
        content, content_type=content_type)
    # crop the file if needed, can change the image size
    was_cropped, out = crop_image(content, filename, doc)
    # the length in metadata could be updated if it was cropped
    if was_cropped:
        file_name, content_type, metadata_after_cropped = process_file_from_stream(
            out, content_type=content_type)
        # when cropped, metadata are reset; only the new length is carried over
        metadata['length'] = metadata_after_cropped['length']
    # FIX: bind before the try block so the cleanup loop below cannot raise
    # NameError when app.media.put itself fails
    inserted = []
    try:
        logger.debug('Going to save media file with %s ' % file_name)
        out.seek(0)
        file_id = app.media.put(out, filename=file_name, content_type=content_type,
                                resource=self.datasource, metadata=metadata)
        doc['media'] = file_id
        doc['mimetype'] = content_type
        set_filemeta(doc, decode_metadata(metadata))
        inserted = [doc['media']]
        file_type = content_type.split('/')[0]
        rendition_spec = config.RENDITIONS['avatar']
        renditions = generate_renditions(out, file_id, inserted, file_type,
                                         content_type, rendition_spec, url_for_media)
        doc['renditions'] = renditions
    except Exception as io:
        # remove anything already stored before propagating the failure
        for file_id in inserted:
            delete_file_on_error(doc, file_id)
        raise SuperdeskApiError.internalError(
            'Generating renditions failed', exception=io)
def parse_item(self, image_path):
    """Create a picture item from the image at ``image_path``.

    Stores the binary in media storage, generates renditions and parses the
    embedded IPTC metadata into the item.

    :param image_path: path of the image on disk
    :return: picture item dict
    """
    base_name = os.path.basename(image_path)
    guessed_type = mimetypes.guess_type(image_path)[0]
    guid = utils.generate_guid(type=GUID_TAG)
    item = dict()
    item['guid'] = guid
    item['uri'] = guid
    item[config.VERSION] = 1
    item[ITEM_TYPE] = CONTENT_TYPE.PICTURE
    item['mimetype'] = guessed_type
    item['versioncreated'] = utcnow()
    with open(image_path, 'rb') as stream:
        _, detected_type, file_metadata = process_file_from_stream(
            stream, content_type=guessed_type)
        stream.seek(0)
        file_id = app.media.put(stream, filename=base_name,
                                content_type=detected_type,
                                metadata=file_metadata)
        filemeta.set_filemeta(item, file_metadata)
        stream.seek(0)
        iptc = get_meta_iptc(stream)
        stream.seek(0)
        self.parse_meta(item, iptc)
        item['renditions'] = generate_renditions(
            stream, file_id, [file_id], 'image', detected_type,
            get_renditions_spec(no_custom_crops=True), url_for_media)
    return item
def store_file(self, doc, content, filename, content_type):
    """Crop an avatar image, store it and generate renditions on ``doc``.

    :param doc: doc carrying the cropping data
    :param content: uploaded file stream
    :param filename: original file name
    :param content_type: mime type hint for the stream
    :raises SuperdeskError: when storing or rendition generation fails
    """
    res = process_file_from_stream(content, filename=filename, content_type=content_type)
    file_name, content_type, metadata = res
    cropping_data = self.get_cropping_data(doc)
    _, out = crop_image(content, filename, cropping_data)
    # cropping changes the file size, so refresh the recorded length
    metadata['length'] = json.dumps(len(out.getvalue()))
    # FIX: bind before the try block so the cleanup loop below cannot raise
    # NameError when app.media.put itself fails
    inserted = []
    try:
        logger.debug('Going to save media file with %s ' % file_name)
        out.seek(0)
        # renamed from `id` to avoid shadowing the builtin
        media_id = app.media.put(out, filename=file_name, content_type=content_type, metadata=metadata)
        doc['media'] = media_id
        doc['mime_type'] = content_type
        doc['filemeta'] = decode_metadata(metadata)
        inserted = [doc['media']]
        file_type = content_type.split('/')[0]
        rendition_spec = config.RENDITIONS['avatar']
        renditions = generate_renditions(out, doc['media'], inserted, file_type,
                                         content_type, rendition_spec, url_for_media)
        doc['renditions'] = renditions
    except Exception as io:
        logger.exception(io)
        for file_id in inserted:
            delete_file_on_error(doc, file_id)
        raise SuperdeskError(message='Generating renditions failed')
def parse_attachment(self, newscomponent_el):
    """Parse attachment component, save it to storage and return attachment id.

    Expected markup::

        <NewsComponent Duid="0" xml:lang="nl">
            <Role FormalName="Image"/>
            <DescriptiveMetadata>
                <Property FormalName="ComponentClass" Value="Image"/>
            </DescriptiveMetadata>
            <ContentItem Href="IMG_0182.jpg">
                <Format FormalName="Jpeg"/>
                <Characteristics>
                    <SizeInBytes>2267043</SizeInBytes>
                    <Property FormalName="Width" Value="4032"/>
                    <Property FormalName="Height" Value="3024"/>
                </Characteristics>
            </ContentItem>
        </NewsComponent>

    :param newscomponent_el: ``NewsComponent`` etree element
    :return: ``{'attachment': <id>}`` or ``None`` when nothing could be saved
    """
    content_item = newscomponent_el.find('ContentItem')
    if content_item is None:
        return
    # avoid re-adding media after item is ingested: the guid is a hash of the
    # ContentItem markup, so an identical attachment maps to the same guid
    guid = hashlib.md5(ElementTree.tostring(content_item)).hexdigest()
    attachment_service = get_resource_service('attachments')
    old_attachment = attachment_service.find_one(req=None, guid=guid)
    if old_attachment:
        return {'attachment': old_attachment['_id']}
    filename = content_item.attrib.get('Href')
    if filename is None:
        return
    format_name = ''
    format_el = content_item.find('Format')
    if format_el is not None:
        format_name = format_el.attrib.get('FormalName')
    content = self._get_file(filename)
    if not content:
        return
    # the type hint is built from the Format element (e.g. "application/Jpeg");
    # the processing step detects the real content type from the stream
    _, content_type, metadata = process_file_from_stream(content, 'application/' + format_name)
    content.seek(0)
    media_id = app.media.put(content, filename=filename,
                             content_type=content_type,
                             metadata=metadata, resource='attachments')
    try:
        ids = attachment_service.post([{
            'media': media_id,
            'filename': filename,
            'title': filename,
            'description': 'belga remote attachment',
            'guid': guid,
        }])
        return {'attachment': next(iter(ids), None)}
    except Exception as ex:
        # NOTE(review): failure is swallowed after removing the stored media,
        # so the caller gets None — confirm this best-effort is intended
        app.media.delete(media_id)
def _save_cropped_image(self, file_stream, original, doc):
    """Saves the cropped image and returns the crop dictionary

    :param file_stream: cropped image stream
    :param original: original rendition
    :param doc: crop data
    :return dict: Crop values
    :raises SuperdeskApiError.internalError
    """
    crop = {}
    # FIX: bind file_id up front so the cleanup path cannot hit a NameError
    # when processing or storing fails before the put succeeds
    file_id = None
    try:
        file_name, content_type, metadata = process_file_from_stream(
            file_stream, content_type=original.get('mimetype'))
        file_stream.seek(0)
        file_id = superdesk.app.media.put(file_stream, filename=file_name,
                                          content_type=content_type,
                                          resource='upload', metadata=metadata)
        crop['media'] = file_id
        crop['mimetype'] = content_type
        crop['href'] = url_for_media(file_id, content_type)
        crop['CropTop'] = doc.get('CropTop', None)
        crop['CropLeft'] = doc.get('CropLeft', None)
        crop['CropRight'] = doc.get('CropRight', None)
        crop['CropBottom'] = doc.get('CropBottom', None)
        return crop
    except Exception as ex:
        if file_id is not None:
            try:
                superdesk.app.media.delete(file_id)
            # FIX: narrowed the bare except, which also caught SystemExit
            # and KeyboardInterrupt; cleanup stays best-effort
            except Exception:
                pass
        raise SuperdeskApiError.internalError(
            'Generating crop failed: {}'.format(str(ex)))
def _save_cropped_image(self, file_stream, original, doc):
    """Saves the cropped image and returns the crop dictionary

    :param file_stream: cropped image stream
    :param original: original rendition
    :param doc: crop data
    :return dict: Crop values
    :raises SuperdeskApiError.internalError
    """
    crop = {}
    # FIX: bind file_id up front so the cleanup path cannot hit a NameError
    # when processing or storing fails before the put succeeds
    file_id = None
    try:
        file_name, content_type, metadata = process_file_from_stream(
            file_stream, content_type=original.get('mimetype'))
        file_stream.seek(0)
        file_id = superdesk.app.media.put(file_stream, filename=file_name, content_type=content_type,
                                          resource='upload', metadata=metadata)
        crop['media'] = file_id
        crop['mimetype'] = content_type
        crop['href'] = url_for_media(file_id, content_type)
        crop['CropTop'] = doc.get('CropTop', None)
        crop['CropLeft'] = doc.get('CropLeft', None)
        crop['CropRight'] = doc.get('CropRight', None)
        crop['CropBottom'] = doc.get('CropBottom', None)
        return crop
    except Exception as ex:
        if file_id is not None:
            try:
                superdesk.app.media.delete(file_id)
            # FIX: narrowed the bare except, which also caught SystemExit
            # and KeyboardInterrupt; cleanup stays best-effort
            except Exception:
                pass
        raise SuperdeskApiError.internalError('Generating crop failed: {}'.format(str(ex)))
def setUp(self):
    """Store a fixture image with renditions and post it to the archive."""
    super().setUp()
    fixtures_dir = os.path.dirname(os.path.realpath(__file__))
    image_path = os.path.normpath(os.path.join(fixtures_dir, "fixtures", self.filename))
    content_type = mimetypes.guess_type(image_path)[0]
    guid = utils.generate_guid(type=GUID_TAG)
    self.item = dict()
    self.item["guid"] = guid
    self.item["version"] = 1
    self.item["_id"] = guid
    self.item[ITEM_TYPE] = CONTENT_TYPE.PICTURE
    self.item["mimetype"] = content_type
    self.item["versioncreated"] = datetime.now()
    with open(image_path, "rb") as image_file:
        _, content_type, file_metadata = process_file_from_stream(
            image_file, content_type=content_type)
        image_file.seek(0)
        file_id = app.media.put(
            image_file,
            filename=self.filename,
            content_type=content_type,
            metadata=file_metadata,
        )
        filemeta.set_filemeta(self.item, file_metadata)
        image_file.seek(0)
        self.item["renditions"] = generate_renditions(
            image_file, file_id, [file_id], "image", content_type,
            get_renditions_spec(), url_for_media,
        )
    get_resource_service("archive").post([self.item])
def setUp(self):
    """Put a fixture image into media storage and create an archive item for it."""
    super().setUp()
    here = os.path.dirname(os.path.realpath(__file__))
    image_path = os.path.normpath(os.path.join(here, 'fixtures', self.filename))
    content_type = mimetypes.guess_type(image_path)[0]
    guid = utils.generate_guid(type=GUID_TAG)
    self.item = {
        'guid': guid,
        'version': 1,
        '_id': guid,
        ITEM_TYPE: CONTENT_TYPE.PICTURE,
        'mimetype': content_type,
        'versioncreated': datetime.now(),
    }
    with open(image_path, 'rb') as fixture:
        _, content_type, file_metadata = process_file_from_stream(
            fixture, content_type=content_type)
        fixture.seek(0)
        file_id = app.media.put(fixture, filename=self.filename,
                                content_type=content_type,
                                metadata=file_metadata)
        filemeta.set_filemeta(self.item, file_metadata)
        fixture.seek(0)
        spec = get_renditions_spec()
        self.item['renditions'] = generate_renditions(
            fixture, file_id, [file_id], 'image', content_type, spec, url_for_media)
    get_resource_service('archive').post([self.item])
def setUp(self):
    """Prepare an archive picture item backed by a stored fixture image."""
    super().setUp()
    base_dir = os.path.dirname(os.path.realpath(__file__))
    fixture_path = os.path.normpath(os.path.join(base_dir, 'fixtures', self.filename))
    mime = mimetypes.guess_type(fixture_path)[0]
    item_guid = utils.generate_guid(type=GUID_TAG)
    self.item = {
        'guid': item_guid,
        'version': 1,
        '_id': item_guid,
        ITEM_TYPE: CONTENT_TYPE.PICTURE,
        'mimetype': mime,
        'versioncreated': datetime.now(),
    }
    with open(fixture_path, 'rb') as img:
        _, mime, file_metadata = process_file_from_stream(img, content_type=mime)
        img.seek(0)
        stored_id = app.media.put(
            img, filename=self.filename, content_type=mime, metadata=file_metadata)
        filemeta.set_filemeta(self.item, file_metadata)
        img.seek(0)
        self.item['renditions'] = generate_renditions(
            img, stored_id, [stored_id], 'image', mime,
            get_renditions_spec(), url_for_media)
    archive = get_resource_service('archive')
    archive.post([self.item])
def _save_cropped_image(self, file_stream, original, doc):
    """Saves the cropped image and returns the crop dictionary

    :param file_stream: cropped image stream
    :param original: original rendition
    :param doc: crop data
    :return dict: Crop values
    :raises SuperdeskApiError.internalError
    """
    crop = {}
    # FIX: bind file_id up front so the cleanup path cannot hit a NameError
    # when processing or storing fails before the put succeeds
    file_id = None
    try:
        file_name, content_type, metadata = process_file_from_stream(
            file_stream, content_type=original.get("mimetype")
        )
        file_stream.seek(0)
        file_id = app.media.put(
            file_stream, filename=file_name, content_type=content_type, resource="upload", metadata=metadata
        )
        crop["media"] = file_id
        crop["mimetype"] = content_type
        crop["href"] = url_for_media(file_id, content_type)
        crop["CropTop"] = doc.get("CropTop", None)
        crop["CropLeft"] = doc.get("CropLeft", None)
        crop["CropRight"] = doc.get("CropRight", None)
        crop["CropBottom"] = doc.get("CropBottom", None)
        return crop
    except Exception as ex:
        if file_id is not None:
            try:
                app.media.delete(file_id)
            except Exception:
                # best-effort cleanup; the original failure is raised below
                pass
        raise SuperdeskApiError.internalError("Generating crop failed: {}".format(str(ex)), exception=ex)
def save_attachment(self, data, items):
    """Extract attachments from an email, store them and link them to text items.

    :param data: fetched email data; tuples of (response, message bytes)
    :param items: parsed items; each text item gets an ``attachments`` list
        and an ednote stating how many attachments the story has
    """
    attachments = []
    for response_part in data:
        if not isinstance(response_part, tuple):
            continue
        msg = email.message_from_bytes(response_part[1])
        for part in msg.walk():
            if part.get_content_maintype() == 'multipart':
                continue
            disposition = part.get('Content-Disposition')
            # only parts explicitly marked as attachments are stored
            if disposition is None or disposition.split(';')[0] != 'attachment':
                continue
            fileName = part.get_filename()
            if not fileName:
                continue
            content = io.BytesIO(part.get_payload(decode=True))
            file_name, content_type, metadata = process_file_from_stream(
                content, part.get_content_type())
            content.seek(0)
            media_id = app.media.put(content, filename=fileName,
                                     content_type=content_type,
                                     metadata=metadata, resource='attachments')
            try:
                attachment_service = get_resource_service('attachments')
                ids = attachment_service.post([{
                    "media": media_id,
                    "filename": fileName,
                    "title": 'attachment',
                    "description": "email's attachment",
                }])
                if ids:
                    attachments.append({'attachment': next(iter(ids), None)})
            except Exception as ex:
                # FIX: log the exception itself instead of ex.args[0], which
                # raises IndexError for exceptions created without arguments
                logger.error("cannot add attachment for %s, %s" % (fileName, ex))
                app.media.delete(media_id)
    if attachments:
        for item in items:
            if item['type'] == 'text':
                item['attachments'] = attachments
                item['ednote'] = 'The story has %s attachment(s)' % str(len(attachments))
def find_one_raw(self, resource, _id):
    """Fetch a single Scanpix item, store its preview image and build renditions.

    :param resource: resource name (unused here, kept for the provider interface)
    :param _id: Scanpix reference of the item
    :return: the parsed doc with media and renditions set
    :raises SuperdeskApiError: when the media cannot be saved or renditions fail
    """
    # XXX: preview is used here instead of paid download
    # see SDNTB-15
    data = {}
    url = self._app.config['SCANPIX_SEARCH_URL'] + '/search'
    data['refPtrs'] = [_id]
    r = self._request(url, data)
    doc = r.json()['data'][0]
    self._parse_doc(doc)
    url = doc['renditions']['baseImage']['href']
    # if MIME type can't be guessed, we default to jpeg
    mime_type = mimetypes.guess_type(url)[0] or 'image/jpeg'
    r = self._request(url, data)
    out = BytesIO(r.content)
    file_name, content_type, metadata = process_file_from_stream(out, mime_type)
    logger.debug('Going to save media file with %s ' % file_name)
    out.seek(0)
    try:
        # metadata is intentionally not stored with the binary; it is decoded
        # onto the doc below instead
        file_id = self._app.media.put(out, filename=file_name, content_type=content_type, metadata=None)
    except Exception as e:
        logger.exception(e)
        raise SuperdeskApiError.internalError('Media saving failed')
    else:
        try:
            inserted = [file_id]
            doc['mimetype'] = content_type
            doc['filemeta'] = decode_metadata(metadata)
            # set the version created to now to bring it to the top of the desk, images can be quite old
            doc['versioncreated'] = utcnow()
            file_type = content_type.split('/')[0]
            rendition_spec = get_renditions_spec()
            renditions = generate_renditions(out, file_id, inserted, file_type, content_type,
                                             rendition_spec, url_for_media, insert_metadata=False)
            doc['renditions'] = renditions
        except (IndexError, KeyError, json.JSONDecodeError) as e:
            logger.exception("Internal error: {}".format(e))
            # remove the stored file so a failed fetch leaves no orphan media
            delete_file_on_error(doc, file_id)
            raise SuperdeskApiError.internalError('Generating renditions failed')
    return doc
def parse_item(self, image_path):
    """Build a picture item from an image file on disk.

    Stores the binary in media storage, generates renditions and maps the
    embedded IPTC metadata to superdesk fields via ``IPTC_MAPPING``.

    :param image_path: path to the image file
    :return: picture item dict
    """
    filename = os.path.basename(image_path)
    content_type = mimetypes.guess_type(image_path)[0]
    guid = utils.generate_guid(type=GUID_TAG)
    item = {
        'guid': guid,
        config.VERSION: 1,
        config.ID_FIELD: guid,
        ITEM_TYPE: CONTENT_TYPE.PICTURE,
        'mimetype': content_type,
        'versioncreated': datetime.now()
    }
    with open(image_path, 'rb') as f:
        _, content_type, file_metadata = process_file_from_stream(
            f, content_type=content_type)
        f.seek(0)
        file_id = app.media.put(f, filename=filename,
                                content_type=content_type,
                                metadata=file_metadata)
        filemeta.set_filemeta(item, file_metadata)
        f.seek(0)
        metadata = get_meta_iptc(f)
        f.seek(0)
        rendition_spec = get_renditions_spec(no_custom_crops=True)
        renditions = generate_renditions(f, file_id, [file_id], 'image',
                                         content_type, rendition_spec,
                                         url_for_media)
        item['renditions'] = renditions
    try:
        date_created, time_created = metadata[TAG.DATE_CREATED], metadata[TAG.TIME_CREATED]
    except KeyError:
        # creation date/time is optional; leave firstcreated unset
        pass
    else:
        # we format proper ISO 8601 date so we can parse it with dateutil
        datetime_created = '{}-{}-{}T{}:{}:{}{}{}:{}'.format(
            date_created[0:4], date_created[4:6], date_created[6:8],
            time_created[0:2], time_created[2:4], time_created[4:6],
            time_created[6], time_created[7:9], time_created[9:])
        item['firstcreated'] = dateutil.parser.parse(datetime_created)
    # now we map IPTC metadata to superdesk metadata
    for source_key, dest_key in IPTC_MAPPING.items():
        try:
            item[dest_key] = metadata[source_key]
        except KeyError:
            continue
    return item
def get_file_from_document(self, doc):
    """Return ``(stream, content_type, metadata)`` for the document's media.

    Uses the pre-fetched file when present; otherwise stores the uploaded
    stream in media storage and records the stored id on ``doc['media']``.
    """
    fetched = doc.get('media_fetched')
    if not fetched:
        upload = doc['media']
        file_name, content_type, metadata = process_file_from_stream(
            upload, filename=upload.filename, content_type=upload.mimetype)
        logger.debug('Going to save media file with %s ' % file_name)
        upload.seek(0)
        doc['media'] = app.media.put(
            upload, filename=file_name, content_type=content_type, metadata=metadata)
        return upload, content_type, decode_metadata(metadata)
    del doc['media_fetched']
    return fetched, fetched.content_type, fetched.metadata
def get_file_from_document(self, doc):
    """Return ``(stream, content_type, metadata)`` for the document's media.

    When the file was already fetched it is returned directly; otherwise the
    uploaded stream is stored in media storage (timed) and ``doc["media"]``
    is replaced with the stored file id.
    """
    fetched = doc.get("media_fetched")
    if fetched:
        del doc["media_fetched"]
        return fetched, fetched.content_type, fetched.metadata
    upload = doc["media"]
    file_name, content_type, metadata = process_file_from_stream(
        upload, content_type=upload.mimetype)
    logger.debug("Going to save media file with %s " % file_name)
    upload.seek(0)
    with timer("media:put.original"):
        doc["media"] = app.media.put(upload, filename=file_name,
                                     content_type=content_type,
                                     metadata=metadata)
    return upload, content_type, decode_metadata(metadata)
def store_file(self, doc, content, filename, content_type):
    """Store ``content`` under the document's preassigned media id.

    A file already stored under ``doc['media_id']`` is reused; otherwise the
    stream is stored under that id. The document is updated with media id,
    mime type and decoded file metadata.

    :raises SuperdeskApiError: when reading or storing the file fails
    """
    # retrieve file name and metadata from file
    file_name, content_type, metadata = process_file_from_stream(content, content_type=content_type)
    try:
        content.seek(0)
        media_id = doc['media_id']
        if not app.media.get(doc['media_id'], self.datasource):
            media_id = app.media.put(content, filename=file_name,
                                     content_type=content_type,
                                     resource=self.datasource,
                                     metadata=metadata,
                                     _id=ObjectId(doc['media_id']))
        doc['media'] = media_id
        doc['mime_type'] = content_type
        doc['filemeta'] = decode_metadata(metadata)
    except Exception as err:
        raise SuperdeskApiError.internalError('Saving file failed', exception=err)
def parse_item(self, image_path):
    """Build a picture item from an image file on disk.

    Stores the binary in media storage, generates renditions and maps the
    embedded IPTC metadata to superdesk fields via ``IPTC_MAPPING``.

    :param image_path: path to the image file
    :return: picture item dict
    """
    filename = os.path.basename(image_path)
    content_type = mimetypes.guess_type(image_path)[0]
    guid = utils.generate_guid(type=GUID_TAG)
    item = {'guid': guid,
            config.VERSION: 1,
            config.ID_FIELD: guid,
            ITEM_TYPE: CONTENT_TYPE.PICTURE,
            'mimetype': content_type,
            'versioncreated': datetime.now()
            }
    with open(image_path, 'rb') as f:
        _, content_type, file_metadata = process_file_from_stream(f, content_type=content_type)
        f.seek(0)
        file_id = app.media.put(f, filename=filename, content_type=content_type, metadata=file_metadata)
        filemeta.set_filemeta(item, file_metadata)
        f.seek(0)
        metadata = get_meta_iptc(f)
        f.seek(0)
        rendition_spec = get_renditions_spec(no_custom_crops=True)
        renditions = generate_renditions(f, file_id, [file_id], 'image', content_type,
                                         rendition_spec, url_for_media)
        item['renditions'] = renditions
    try:
        date_created, time_created = metadata[TAG.DATE_CREATED], metadata[TAG.TIME_CREATED]
    except KeyError:
        # creation date/time is optional; leave firstcreated unset
        pass
    else:
        # we format proper ISO 8601 date so we can parse it with dateutil
        datetime_created = '{}-{}-{}T{}:{}:{}{}{}:{}'.format(date_created[0:4], date_created[4:6],
                                                             date_created[6:8], time_created[0:2],
                                                             time_created[2:4], time_created[4:6],
                                                             time_created[6], time_created[7:9],
                                                             time_created[9:])
        item['firstcreated'] = dateutil.parser.parse(datetime_created)
    # now we map IPTC metadata to superdesk metadata
    for source_key, dest_key in IPTC_MAPPING.items():
        try:
            item[dest_key] = metadata[source_key]
        except KeyError:
            continue
    return item
def find_one_raw(self, resource, _id):
    """Fetch an asset from AAP Multimedia, store the binary and add renditions.

    :param resource: resource name (unused, kept for the provider interface)
    :param _id: asset id in the AAP Multimedia system
    :return: the parsed doc, with media/renditions set on success
    """
    url = self._app.config['AAP_MM_SEARCH_URL'] + '/Assets/{}'.format(_id)
    r = self._http.request('GET', url, headers=self._headers)
    doc = json.loads(r.data.decode('UTF-8'))
    self._parse_doc(doc)
    if 'fetch_endpoint' in doc:
        del doc['fetch_endpoint']
    # Only if we have credentials can we download the original if the account has that privilege
    if 'AAP_MM_USER' in self._app.config and 'AAP_MM_PASSWORD' in self._app.config \
            and self._app.config['AAP_MM_USER'] is not None:
        url = self._app.config['AAP_MM_SEARCH_URL'] + '/Assets/{}/Original/download'.format(_id)
    else:
        url = doc['renditions']['original']['href']
    r = self._http.request('GET', url, headers=self._headers)
    out = BytesIO(r.data)
    file_name, content_type, metadata = process_file_from_stream(out, 'image/jpeg')
    # FIX: bind before the try block so the cleanup loop below cannot raise
    # NameError when app.media.put itself fails
    inserted = []
    try:
        logger.debug('Going to save media file with %s ' % file_name)
        out.seek(0)
        file_id = self._app.media.put(out, filename=file_name, content_type=content_type, metadata=metadata)
        doc['mimetype'] = content_type
        doc['filemeta'] = decode_metadata(metadata)
        # set the version created to now to bring it to the top of the desk, images can be quite old
        doc['versioncreated'] = utcnow()
        inserted = [file_id]
        file_type = content_type.split('/')[0]
        rendition_spec = self._app.config['RENDITIONS']['picture']
        renditions = generate_renditions(out, file_id, inserted, file_type, content_type,
                                         rendition_spec, self.url_for_media)
        doc['renditions'] = renditions
    except Exception as io:
        # best effort: log and clean up stored files, but still return the doc
        logger.exception(io)
        for file_id in inserted:
            delete_file_on_error(doc, file_id)
    return doc
def find_one_raw(self, resource, _id):
    """Fetch an asset from AAP Multimedia, store the binary and add renditions.

    :param resource: resource name (unused, kept for the provider interface)
    :param _id: asset id in the AAP Multimedia system
    :return: the parsed doc, with media/renditions set on success
    """
    if self._headers is None:
        self.__set_auth_cookie(self._app)
    url = self._app.config['AAP_MM_SEARCH_URL'] + '/Assets/{}'.format(_id)
    r = self._http.request('GET', url, headers=self._headers)
    doc = json.loads(r.data.decode('UTF-8'))
    self._parse_doc(doc)
    if 'fetch_endpoint' in doc:
        del doc['fetch_endpoint']
    # Only if we have credentials can we download the original if the account has that privilege
    if 'AAP_MM_USER' in self._app.config and 'AAP_MM_PASSWORD' in self._app.config \
            and self._app.config['AAP_MM_USER'] is not None:
        url = self._app.config['AAP_MM_SEARCH_URL'] + '/Assets/{}/Original/download'.format(_id)
    else:
        url = doc['renditions']['original']['href']
    r = self._http.request('GET', url, headers=self._headers)
    out = BytesIO(r.data)
    file_name, content_type, metadata = process_file_from_stream(out, 'image/jpeg')
    # FIX: bind before the try block so the cleanup loop below cannot raise
    # NameError when app.media.put itself fails
    inserted = []
    try:
        logger.debug('Going to save media file with %s ' % file_name)
        out.seek(0)
        file_id = self._app.media.put(out, filename=file_name, content_type=content_type, metadata=metadata)
        doc['mimetype'] = content_type
        doc['filemeta'] = decode_metadata(metadata)
        # set the version created to now to bring it to the top of the desk, images can be quite old
        doc['versioncreated'] = utcnow()
        inserted = [file_id]
        file_type = content_type.split('/')[0]
        rendition_spec = self._app.config['RENDITIONS']['picture']
        renditions = generate_renditions(out, file_id, inserted, file_type, content_type,
                                         rendition_spec, self.url_for_media)
        doc['renditions'] = renditions
    except Exception as io:
        # best effort: log and clean up stored files, but still return the doc
        logger.exception(io)
        for file_id in inserted:
            delete_file_on_error(doc, file_id)
    return doc
def store_file(self, doc, content, filename, content_type):
    """Persist an uploaded file under the document's pre-allocated media id.

    Re-uses any file already stored under ``doc['media_id']`` and records
    media id, mime type and decoded metadata on the document.

    :raises SuperdeskApiError: if processing or storing the stream fails
    """
    # probe the stream for its real name, type and metadata first
    file_name, content_type, metadata = process_file_from_stream(
        content, content_type=content_type)
    try:
        content.seek(0)
        stored = app.media.get(doc['media_id'], self.datasource)
        if stored:
            file_id = doc['media_id']
        else:
            file_id = app.media.put(
                content,
                filename=file_name,
                content_type=content_type,
                resource=self.datasource,
                metadata=metadata,
                _id=ObjectId(doc['media_id']),
            )
        doc['media'] = file_id
        doc['mime_type'] = content_type
        doc['filemeta'] = decode_metadata(metadata)
    except Exception as exc:
        raise SuperdeskApiError.internalError('Saving file failed', exception=exc)
def find_one_raw(self, resource, _id):
    """Fetch a single AAP Multimedia asset and prepare it for fetching.

    Downloads the best available binary (original image, or the Ipod video
    resolution, when credentials allow it), stores it and generates
    renditions on the doc.

    :param resource: resource name (unused, kept for the provider interface)
    :param _id: asset id in the AAP Multimedia system
    :return: the parsed doc with media/renditions set
    :raises FileNotFoundError: when the required resolution is not available
    :raises NotImplementedError: for unsupported content types
    :raises SuperdeskApiError: when rendition generation fails
    """
    if self._headers is None:
        self.__set_auth_cookie(self._app)
    url = self._app.config['AAP_MM_SEARCH_URL'] + '/Assets/{}'.format(_id)
    r = self._http.request('GET', url, headers=self._headers)
    doc = json.loads(r.data.decode('UTF-8'))
    self._parse_doc(doc)
    if 'fetch_endpoint' in doc:
        del doc['fetch_endpoint']
    # Only if we have credentials can we download the original if the account has that privilege
    if self._username is not None and self._password is not None:
        resolutions = self._get_resolutions(_id)
        if doc[ITEM_TYPE] == CONTENT_TYPE.PICTURE:
            if any(i['Name'] == 'Original' for i in resolutions['Image']):
                url = self._app.config['AAP_MM_SEARCH_URL'] + '/Assets/{}/Original/download'.format(_id)
                mime_type = 'image/jpeg'
            else:
                raise FileNotFoundError
        elif doc[ITEM_TYPE] == CONTENT_TYPE.VIDEO:
            if any(v['Name'] == 'Ipod' for v in resolutions['Video']):
                url = self._app.config['AAP_MM_SEARCH_URL'] + '/Assets/{}/Ipod/download'.format(_id)
                mime_type = doc.get('renditions').get('original').get('mimetype')
            else:
                raise FileNotFoundError
        else:
            raise NotImplementedError
    else:
        # without credentials fall back to the public original rendition
        if doc[ITEM_TYPE] == CONTENT_TYPE.VIDEO:
            mime_type = doc.get('renditions').get('original').get('mimetype')
        else:
            mime_type = 'image/jpeg'
        url = doc['renditions']['original']['href']
    r = self._http.request('GET', url, headers=self._headers)
    out = BytesIO(r.data)
    file_name, content_type, metadata = process_file_from_stream(out, mime_type)
    # track stored file ids so they can be cleaned up on failure
    inserted = []
    try:
        logger.debug('Going to save media file with %s ' % file_name)
        out.seek(0)
        # metadata is stored on the doc below rather than with the binary
        file_id = self._app.media.put(out, filename=file_name, content_type=content_type, metadata=None)
        doc['mimetype'] = content_type
        doc['filemeta'] = decode_metadata(metadata)
        # set the version created to now to bring it to the top of the desk, images can be quite old
        doc['versioncreated'] = utcnow()
        inserted = [file_id]
        file_type = content_type.split('/')[0]
        rendition_spec = self._app.config['RENDITIONS']['picture']
        renditions = generate_renditions(out, file_id, inserted, file_type, content_type,
                                         rendition_spec, self.url_for_media, insert_metadata=False)
        doc['renditions'] = renditions
    except Exception as io:
        logger.exception(io)
        for file_id in inserted:
            delete_file_on_error(doc, file_id)
        raise SuperdeskApiError.internalError('Generating renditions failed')
    return doc
def find_one_raw(self, resource, _id):
    """Fetch a single AAP Multimedia asset, store it and generate renditions.

    Downloads the best available binary (original image, or the Ipod video
    resolution, when credentials allow it) and records an ``original_source``
    reference alongside the generated renditions.

    :param resource: resource name (unused, kept for the provider interface)
    :param _id: asset id in the AAP Multimedia system
    :return: the parsed doc, with media/renditions set on success
    :raises FileNotFoundError: when the required resolution is not available
    :raises NotImplementedError: for unsupported content types
    """
    if self._headers is None:
        self.__set_auth_cookie(self._app)
    url = self._app.config["AAP_MM_SEARCH_URL"] + "/Assets/{}".format(_id)
    r = self._http.request("GET", url, headers=self._headers)
    doc = json.loads(r.data.decode("UTF-8"))
    self._parse_doc(doc)
    if "fetch_endpoint" in doc:
        del doc["fetch_endpoint"]
    # Only if we have credentials can we download the original if the account has that privilege
    if self._username is not None and self._password is not None:
        resolutions = self._get_resolutions(_id)
        if doc["type"] == "picture":
            if any(i["Name"] == "Original" for i in resolutions["Image"]):
                url = self._app.config["AAP_MM_SEARCH_URL"] + "/Assets/{}/Original/download".format(_id)
                mime_type = "image/jpeg"
                source_ref = {"href": url, "mimetype": mime_type}
            else:
                raise FileNotFoundError
        elif doc["type"] == "video":
            if any(v["Name"] == "Ipod" for v in resolutions["Video"]):
                url = self._app.config["AAP_MM_SEARCH_URL"] + "/Assets/{}/Ipod/download".format(_id)
                mime_type = doc.get("renditions").get("original").get("mimetype")
            else:
                raise FileNotFoundError
            # the full-quality Video resolution is referenced as the source
            if any(v["Name"] == "Video" for v in resolutions["Video"]):
                source_ref = {
                    "href": self._app.config["AAP_MM_SEARCH_URL"] + "/Assets/{}/Video/download".format(_id),
                    "mimetype": "video/quicktime",
                }
            else:
                raise FileNotFoundError
        else:
            raise NotImplementedError
    else:
        # without credentials fall back to the public original rendition
        if doc["type"] == "video":
            mime_type = doc.get("renditions").get("original").get("mimetype")
        else:
            mime_type = "image/jpeg"
        url = doc["renditions"]["original"]["href"]
        source_ref = {"href": url, "mimetype": mime_type}
    r = self._http.request("GET", url, headers=self._headers)
    out = BytesIO(r.data)
    file_name, content_type, metadata = process_file_from_stream(out, mime_type)
    # FIX: bind before the try block so the cleanup loop below cannot raise
    # NameError when media.put itself fails
    inserted = []
    try:
        logger.debug("Going to save media file with %s " % file_name)
        out.seek(0)
        file_id = self._app.media.put(out, filename=file_name, content_type=content_type, metadata=metadata)
        doc["mimetype"] = content_type
        doc["filemeta"] = decode_metadata(metadata)
        # set the version created to now to bring it to the top of the desk, images can be quite old
        doc["versioncreated"] = utcnow()
        inserted = [file_id]
        file_type = content_type.split("/")[0]
        rendition_spec = self._app.config["RENDITIONS"]["picture"]
        renditions = generate_renditions(
            out, file_id, inserted, file_type, content_type, rendition_spec, self.url_for_media
        )
        doc["renditions"] = renditions
        doc["renditions"]["original_source"] = source_ref
    except Exception as io:
        # best effort: log and clean up stored files, but still return the doc
        logger.exception(io)
        for file_id in inserted:
            delete_file_on_error(doc, file_id)
    return doc
def parse(self, data, provider=None):
    """Parse a fetched email into one or more superdesk items.

    Produces a text item for the email body and, when image attachments are
    present, a picture item per attachment plus a composite item that groups
    them all together.

    :param data: fetched email data; tuples of (response, message bytes)
    :param provider: ingest provider dict; its config may route the email to
        the structured (google form) parser instead
    :return: list of parsed items
    :raises IngestEmailError: when parsing fails
    """
    config = provider.get("config", {})
    # If the channel is configured to process structured email generated from a google form
    if config.get("formatted", False):
        return self._parse_formatted_email(data, provider)
    try:
        new_items = []
        # create an item for the body text of the email
        # either text or html
        item = dict()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item["versioncreated"] = utcnow()
        comp_item = None
        # a list to keep the references to the attachments
        refs = []
        html_body = None
        text_body = None
        for response_part in data:
            if isinstance(response_part, tuple):
                msg = email.message_from_bytes(response_part[1])
                item["headline"] = self.parse_header(msg["subject"])
                field_from = self.parse_header(msg["from"])
                item["original_source"] = field_from
                try:
                    # map the sender to a superdesk user when the address is known
                    if email_regex.findall(field_from):
                        email_address = email_regex.findall(field_from)[0]
                        user = get_resource_service("users").get_user_by_email(email_address)
                        item["original_creator"] = user[eve.utils.config.ID_FIELD]
                except UserNotRegisteredException:
                    pass
                item["guid"] = msg["Message-ID"]
                date_tuple = email.utils.parsedate_tz(msg["Date"])
                if date_tuple:
                    dt = datetime.datetime.utcfromtimestamp(email.utils.mktime_tz(date_tuple))
                    dt = dt.replace(tzinfo=timezone("utc"))
                    item["firstcreated"] = dt
                # this will loop through all the available multiparts in mail
                for part in msg.walk():
                    if part.get_content_type() == "text/plain":
                        body = part.get_payload(decode=True)
                        try:
                            # if we don't know the charset just have a go!
                            if part.get_content_charset() is None:
                                text_body = body.decode()
                            else:
                                charset = part.get_content_charset()
                                text_body = body.decode(charset)
                            continue
                        except Exception as ex:
                            logger.exception(
                                "Exception parsing text body for {0} from {1}: {2}".format(
                                    item["headline"], field_from, ex))
                            continue
                    if part.get_content_type() == "text/html":
                        body = part.get_payload(decode=True)
                        try:
                            if part.get_content_charset() is None:
                                html_body = body.decode()
                            else:
                                charset = part.get_content_charset()
                                html_body = body.decode(charset)
                            html_body = sanitize_html(html_body)
                            continue
                        except Exception as ex:
                            logger.exception(
                                "Exception parsing html body for {0} from {1}: {2}".format(
                                    item["headline"], field_from, ex))
                            continue
                    if part.get_content_maintype() == "multipart":
                        continue
                    if part.get("Content-Disposition") is None:
                        continue
                    # we are only going to pull off image attachments at this stage
                    if part.get_content_maintype() != "image":
                        continue
                    fileName = part.get_filename()
                    if bool(fileName):
                        image = part.get_payload(decode=True)
                        content = io.BytesIO(image)
                        res = process_file_from_stream(content, part.get_content_type())
                        file_name, content_type, metadata = res
                        # gif/png attachments (e.g. signatures/logos) are skipped
                        if content_type == "image/gif" or content_type == "image/png":
                            continue
                        content.seek(0)
                        image_id = self.parser_app.media.put(
                            content, filename=fileName, content_type=content_type, metadata=metadata)
                        renditions = {"baseImage": {"href": image_id}}
                        # if we have not got a composite item then create one
                        if not comp_item:
                            comp_item = dict()
                            comp_item[ITEM_TYPE] = CONTENT_TYPE.COMPOSITE
                            comp_item["guid"] = generate_guid(type=GUID_TAG)
                            comp_item["versioncreated"] = utcnow()
                            comp_item["groups"] = []
                            comp_item["headline"] = item["headline"]
                            comp_item["groups"] = []
                            comp_item["original_source"] = item["original_source"]
                            if "original_creator" in item:
                                comp_item["original_creator"] = item["original_creator"]
                            # create a reference to the item that stores the body of the email
                            item_ref = {
                                "guid": item["guid"],
                                "residRef": item["guid"],
                                "headline": item["headline"],
                                "location": "ingest",
                                "itemClass": "icls:text",
                                "original_source": item["original_source"],
                            }
                            if "original_creator" in item:
                                item_ref["original_creator"] = item["original_creator"]
                            refs.append(item_ref)
                        media_item = dict()
                        media_item["guid"] = generate_guid(type=GUID_TAG)
                        media_item["versioncreated"] = utcnow()
                        media_item[ITEM_TYPE] = CONTENT_TYPE.PICTURE
                        media_item["renditions"] = renditions
                        media_item["mimetype"] = content_type
                        set_filemeta(media_item, metadata)
                        media_item["slugline"] = fileName
                        if text_body is not None:
                            media_item["body_html"] = text_body
                        media_item["headline"] = item["headline"]
                        media_item["original_source"] = item["original_source"]
                        if "original_creator" in item:
                            media_item["original_creator"] = item["original_creator"]
                        new_items.append(media_item)
                        # add a reference to this item in the composite item
                        media_ref = {
                            "guid": media_item["guid"],
                            "residRef": media_item["guid"],
                            "headline": fileName,
                            "location": "ingest",
                            "itemClass": "icls:picture",
                            "original_source": item["original_source"],
                        }
                        if "original_creator" in item:
                            media_ref["original_creator"] = item["original_creator"]
                        refs.append(media_ref)
                # prefer the html body; fall back to preformatted plain text
                # NOTE(review): text_body may still be None here when the mail
                # had neither part — confirm upstream always provides one
                if html_body:
                    item["body_html"] = html_body
                else:
                    item["body_html"] = "<pre>" + text_body + "</pre>"
                    item[FORMAT] = FORMATS.PRESERVED
        # if there is composite item then add the main group and references
        if comp_item:
            grefs = {"refs": [{"idRef": "main"}], "id": "root", "role": "grpRole:NEP"}
            comp_item["groups"].append(grefs)
            grefs = {"refs": refs, "id": "main", "role": "grpRole:Main"}
            comp_item["groups"].append(grefs)
            new_items.append(comp_item)
        new_items.append(item)
        return new_items
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)
def parse(self, data, provider=None):
    """Parse a raw IMAP fetch response into a list of ingest items.

    Builds one text item from the email body (HTML preferred over plain
    text), one picture item per image attachment (GIF/PNG skipped), and —
    when attachments exist — a composite item grouping them all.

    :param data: IMAP fetch response; tuples of (envelope, raw RFC822 bytes)
                 are parsed, everything else is ignored.
    :param provider: ingest provider dict; ``config.formatted`` redirects to
                     the structured (google-form) parser.
    :return: list of item dicts (media items, optional composite, text item).
    :raises IngestEmailError: wraps any unexpected parsing failure.
    """
    config = provider.get('config', {})
    # If the channel is configured to process structured email generated from a google form
    if config.get('formatted', False):
        return self._parse_formatted_email(data, provider)
    try:
        new_items = []
        # create an item for the body text of the email
        # either text or html
        item = dict()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item['versioncreated'] = utcnow()
        comp_item = None
        # a list to keep the references to the attachments
        refs = []
        html_body = None
        text_body = None
        for response_part in data:
            if isinstance(response_part, tuple):
                msg = email.message_from_bytes(response_part[1])
                item['headline'] = self.parse_header(msg['subject'])
                field_from = self.parse_header(msg['from'])
                item['original_source'] = field_from
                # try to map the sender's address to a registered user;
                # unregistered senders are silently left without a creator
                try:
                    if email_regex.findall(field_from):
                        email_address = email_regex.findall(field_from)[0]
                        user = get_resource_service('users').get_user_by_email(email_address)
                        item['original_creator'] = user[eve.utils.config.ID_FIELD]
                except UserNotRegisteredException:
                    pass
                item['guid'] = msg['Message-ID']
                date_tuple = email.utils.parsedate_tz(msg['Date'])
                if date_tuple:
                    dt = datetime.datetime.utcfromtimestamp(
                        email.utils.mktime_tz(date_tuple))
                    dt = dt.replace(tzinfo=timezone('utc'))
                    item['firstcreated'] = dt
                # this will loop through all the available multiparts in mail
                for part in msg.walk():
                    if part.get_content_type() == "text/plain":
                        body = part.get_payload(decode=True)
                        try:
                            # if we don't know the charset just have a go!
                            if part.get_content_charset() is None:
                                text_body = body.decode()
                            else:
                                charset = part.get_content_charset()
                                text_body = body.decode(charset)
                            continue
                        except Exception as ex:
                            # decoding failure: log and keep processing other parts
                            logger.exception(
                                "Exception parsing text body for {0} from {1}: {2}"
                                .format(item['headline'], field_from, ex))
                            continue
                    if part.get_content_type() == "text/html":
                        body = part.get_payload(decode=True)
                        try:
                            if part.get_content_charset() is None:
                                html_body = body.decode()
                            else:
                                charset = part.get_content_charset()
                                html_body = body.decode(charset)
                            html_body = self.safe_html(html_body)
                            continue
                        except Exception as ex:
                            logger.exception(
                                "Exception parsing html body for {0} from {1}: {2}"
                                .format(item['headline'], field_from, ex))
                            continue
                    if part.get_content_maintype() == 'multipart':
                        continue
                    if part.get('Content-Disposition') is None:
                        continue
                    # we are only going to pull off image attachments at this stage
                    if part.get_content_maintype() != 'image':
                        continue
                    fileName = part.get_filename()
                    if bool(fileName):
                        image = part.get_payload(decode=True)
                        content = io.BytesIO(image)
                        res = process_file_from_stream(
                            content, part.get_content_type())
                        file_name, content_type, metadata = res
                        # GIF and PNG attachments (logos, signatures) are ignored
                        if content_type == 'image/gif' or content_type == 'image/png':
                            continue
                        content.seek(0)
                        image_id = self.parser_app.media.put(
                            content,
                            filename=fileName,
                            content_type=content_type,
                            metadata=metadata)
                        renditions = {'baseImage': {'href': image_id}}
                        # if we have not got a composite item then create one
                        if not comp_item:
                            comp_item = dict()
                            comp_item[ITEM_TYPE] = CONTENT_TYPE.COMPOSITE
                            comp_item['guid'] = generate_guid(type=GUID_TAG)
                            comp_item['versioncreated'] = utcnow()
                            comp_item['groups'] = []
                            comp_item['headline'] = item['headline']
                            # NOTE(review): 'groups' is assigned twice in the
                            # original; the second assignment is redundant
                            comp_item['groups'] = []
                            comp_item['original_source'] = item['original_source']
                            if 'original_creator' in item:
                                comp_item['original_creator'] = item['original_creator']
                            # create a reference to the item that stores the body of the email
                            item_ref = {
                                'guid': item['guid'],
                                'residRef': item['guid'],
                                'headline': item['headline'],
                                'location': 'ingest',
                                'itemClass': 'icls:text',
                                'original_source': item['original_source']
                            }
                            if 'original_creator' in item:
                                item_ref['original_creator'] = item['original_creator']
                            refs.append(item_ref)
                        media_item = dict()
                        media_item['guid'] = generate_guid(type=GUID_TAG)
                        media_item['versioncreated'] = utcnow()
                        media_item[ITEM_TYPE] = CONTENT_TYPE.PICTURE
                        media_item['renditions'] = renditions
                        media_item['mimetype'] = content_type
                        set_filemeta(media_item, metadata)
                        media_item['slugline'] = fileName
                        if text_body is not None:
                            media_item['body_html'] = text_body
                        media_item['headline'] = item['headline']
                        media_item['original_source'] = item['original_source']
                        if 'original_creator' in item:
                            media_item['original_creator'] = item['original_creator']
                        new_items.append(media_item)
                        # add a reference to this item in the composite item
                        media_ref = {
                            'guid': media_item['guid'],
                            'residRef': media_item['guid'],
                            'headline': fileName,
                            'location': 'ingest',
                            'itemClass': 'icls:picture',
                            'original_source': item['original_source']
                        }
                        if 'original_creator' in item:
                            media_ref['original_creator'] = item['original_creator']
                        refs.append(media_ref)
        # prefer the HTML body; fall back to preformatted plain text
        # NOTE(review): if the mail has neither part, text_body is None here
        # and the concatenation below would raise — presumably mails always
        # carry at least one text part; confirm against the feeding service
        if html_body is not None:
            item['body_html'] = html_body
        else:
            item['body_html'] = '<pre>' + text_body + '</pre>'
            item[FORMAT] = FORMATS.PRESERVED
        # if there is composite item then add the main group and references
        if comp_item:
            grefs = {
                'refs': [{
                    'idRef': 'main'
                }],
                'id': 'root',
                'role': 'grpRole:NEP'
            }
            comp_item['groups'].append(grefs)
            grefs = {'refs': refs, 'id': 'main', 'role': 'grpRole:Main'}
            comp_item['groups'].append(grefs)
            new_items.append(comp_item)
        new_items.append(item)
        return new_items
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)
def find_one_raw(self, resource, _id):
    """Fetch a single asset from the AAP Multimedia API and store its media.

    Downloads the asset's metadata, picks a download URL (original/Ipod
    rendition when credentials allow, otherwise the public original href),
    stores the binary in the media store and generates renditions.

    :param resource: resource name (unused in the body — kept for the
                     datalayer interface)
    :param _id: asset identifier on the AAP MM service
    :return: the asset document, enriched with mimetype/filemeta/renditions
    :raises FileNotFoundError: when the wanted resolution is not available
    :raises NotImplementedError: for item types other than picture/video
    :raises SuperdeskApiError: when storing/rendition generation fails
    """
    # lazily authenticate on first use
    if self._headers is None:
        self.__set_auth_cookie(self._app)
    url = self._app.config['AAP_MM_SEARCH_URL'] + '/Assets/{}'.format(_id)
    r = self._http.request('GET', url, headers=self._headers)
    doc = json.loads(r.data.decode('UTF-8'))
    self._parse_doc(doc)
    if 'fetch_endpoint' in doc:
        del doc['fetch_endpoint']
    # Only if we have credentials can we download the original if the account has that privilege
    if self._username is not None and self._password is not None:
        resolutions = self._get_resolutions(_id)
        if doc[ITEM_TYPE] == CONTENT_TYPE.PICTURE:
            if any(i['Name'] == 'Original' for i in resolutions['Image']):
                url = self._app.config[
                    'AAP_MM_SEARCH_URL'] + '/Assets/{}/Original/download'.format(_id)
                mime_type = 'image/jpeg'
            else:
                raise FileNotFoundError
        elif doc[ITEM_TYPE] == CONTENT_TYPE.VIDEO:
            if any(v['Name'] == 'Ipod' for v in resolutions['Video']):
                url = self._app.config[
                    'AAP_MM_SEARCH_URL'] + '/Assets/{}/Ipod/download'.format(_id)
                mime_type = doc.get('renditions').get('original').get('mimetype')
            else:
                raise FileNotFoundError
        else:
            raise NotImplementedError
    else:
        # anonymous access: fall back to the public original rendition
        if doc[ITEM_TYPE] == CONTENT_TYPE.VIDEO:
            mime_type = doc.get('renditions').get('original').get('mimetype')
        else:
            mime_type = 'image/jpeg'
        url = doc['renditions']['original']['href']
    r = self._http.request('GET', url, headers=self._headers)
    out = BytesIO(r.data)
    file_name, content_type, metadata = process_file_from_stream(out, mime_type)
    inserted = []
    try:
        logger.debug('Going to save media file with %s ' % file_name)
        out.seek(0)
        # NOTE(review): metadata=None here while decode_metadata(metadata)
        # is stored on the doc below — presumably intentional (metadata kept
        # on the item, not on the stored blob); confirm
        file_id = self._app.media.put(out,
                                      filename=file_name,
                                      content_type=content_type,
                                      metadata=None)
        doc['mimetype'] = content_type
        doc['filemeta'] = decode_metadata(metadata)
        # set the version created to now to bring it to the top of the desk, images can be quite old
        doc['versioncreated'] = utcnow()
        inserted = [file_id]
        file_type = content_type.split('/')[0]
        rendition_spec = self._app.config['RENDITIONS']['picture']
        renditions = generate_renditions(out, file_id, inserted, file_type,
                                         content_type, rendition_spec,
                                         self.url_for_media,
                                         insert_metadata=False)
        doc['renditions'] = renditions
    except Exception as io:
        # NOTE(review): the exception name `io` shadows the io module within
        # this handler
        logger.exception(io)
        # roll back any stored files before surfacing the error
        for file_id in inserted:
            delete_file_on_error(doc, file_id)
        raise SuperdeskApiError.internalError('Generating renditions failed')
    return doc
def parse_email(self, data, provider):
    """Parse a raw IMAP fetch response into a list of ingest items.

    Builds one text item from the email body (HTML preferred over plain
    text), one picture item per image attachment (GIF/PNG skipped), and —
    when attachments exist — a composite item grouping them all.

    :param data: IMAP fetch response; tuples of (envelope, raw RFC822 bytes)
                 are parsed, everything else is ignored.
    :param provider: ingest provider dict, used only for error reporting.
    :return: list of item dicts (media items, optional composite, text item).
    :raises IngestEmailError: wraps any unexpected parsing failure.
    """
    try:
        new_items = []
        # create an item for the body text of the email
        # either text or html
        item = dict()
        item['type'] = 'text'
        item['versioncreated'] = utcnow()
        comp_item = None
        # a list to keep the references to the attachments
        refs = []
        html_body = None
        text_body = None
        for response_part in data:
            if isinstance(response_part, tuple):
                msg = email.message_from_bytes(response_part[1])
                item['headline'] = self.parse_header(msg['subject'])
                item['original_creator'] = self.parse_header(msg['from'])
                item['guid'] = msg['Message-ID']
                date_tuple = email.utils.parsedate_tz(msg['Date'])
                if date_tuple:
                    dt = datetime.datetime.utcfromtimestamp(
                        email.utils.mktime_tz(date_tuple))
                    dt = dt.replace(tzinfo=timezone('utc'))
                    item['firstcreated'] = dt
                # this will loop through all the available multiparts in mail
                for part in msg.walk():
                    if part.get_content_type() == "text/plain":
                        body = part.get_payload(decode=True)
                        try:
                            # if we don't know the charset just have a go!
                            if part.get_content_charset() is None:
                                text_body = body.decode()
                            else:
                                charset = part.get_content_charset()
                                text_body = body.decode(charset)
                            continue
                        except Exception as ex:
                            # FIX: ex was previously passed as a lazy %-style
                            # logging argument while the message contains no
                            # placeholders, which broke the logging call
                            # itself; embed it in the message instead.
                            logger.exception(
                                "Exception parsing text body for {0} from {1}: {2}"
                                .format(item['headline'], item['original_creator'], ex))
                            continue
                    if part.get_content_type() == "text/html":
                        body = part.get_payload(decode=True)
                        try:
                            if part.get_content_charset() is None:
                                html_body = body.decode()
                            else:
                                charset = part.get_content_charset()
                                html_body = body.decode(charset)
                            html_body = self.safe_html(html_body)
                            continue
                        except Exception as ex:
                            # FIX: same broken logging call as above.
                            logger.exception(
                                "Exception parsing html body for {0} from {1}: {2}"
                                .format(item['headline'], item['original_creator'], ex))
                            continue
                    if part.get_content_maintype() == 'multipart':
                        continue
                    if part.get('Content-Disposition') is None:
                        continue
                    # we are only going to pull off image attachments at this stage
                    if part.get_content_maintype() != 'image':
                        continue
                    fileName = part.get_filename()
                    if bool(fileName):
                        image = part.get_payload(decode=True)
                        content = io.BytesIO(image)
                        res = process_file_from_stream(content, part.get_content_type())
                        file_name, content_type, metadata = res
                        # GIF and PNG attachments (logos, signatures) are ignored
                        if content_type == 'image/gif' or content_type == 'image/png':
                            continue
                        content.seek(0)
                        image_id = self.parser_app.media.put(
                            content,
                            filename=fileName,
                            content_type=content_type,
                            metadata=metadata)
                        renditions = {'baseImage': {'href': image_id}}
                        # if we have not got a composite item then create one
                        if not comp_item:
                            comp_item = dict()
                            comp_item['type'] = 'composite'
                            comp_item['guid'] = generate_guid(type=GUID_TAG)
                            comp_item['versioncreated'] = utcnow()
                            comp_item['groups'] = []
                            comp_item['headline'] = item['headline']
                            comp_item['groups'] = []
                            # create a reference to the item that stores the body of the email
                            item_ref = {}
                            item_ref['guid'] = item['guid']
                            item_ref['residRef'] = item['guid']
                            item_ref['headline'] = item['headline']
                            item_ref['location'] = 'ingest'
                            item_ref['itemClass'] = 'icls:text'
                            refs.append(item_ref)
                        media_item = dict()
                        media_item['guid'] = generate_guid(type=GUID_TAG)
                        media_item['versioncreated'] = utcnow()
                        media_item['type'] = 'picture'
                        media_item['renditions'] = renditions
                        media_item['mimetype'] = content_type
                        media_item['filemeta'] = metadata
                        media_item['slugline'] = fileName
                        if text_body is not None:
                            media_item['body_html'] = text_body
                        media_item['headline'] = item['headline']
                        new_items.append(media_item)
                        # add a reference to this item in the composite item
                        media_ref = {}
                        media_ref['guid'] = media_item['guid']
                        media_ref['residRef'] = media_item['guid']
                        media_ref['headline'] = fileName
                        media_ref['location'] = 'ingest'
                        media_ref['itemClass'] = 'icls:picture'
                        refs.append(media_ref)
        # prefer the HTML body; fall back to preformatted plain text
        if html_body is not None:
            item['body_html'] = html_body
        else:
            item['body_html'] = text_body
            item['type'] = 'preformatted'
        # if there is composite item then add the main group and references
        if comp_item:
            grefs = {}
            grefs['refs'] = [{'idRef': 'main'}]
            grefs['id'] = 'root'
            grefs['role'] = 'grpRole:NEP'
            comp_item['groups'].append(grefs)
            grefs = {}
            grefs['refs'] = refs
            grefs['id'] = 'main'
            grefs['role'] = 'grpRole:Main'
            comp_item['groups'].append(grefs)
            new_items.append(comp_item)
        new_items.append(item)
        return new_items
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)
def parse(self, data, provider=None):
    """Parse a raw IMAP fetch response into a list of ingest items.

    Builds one text item from the email body (HTML preferred over plain
    text), one picture item per image attachment (GIF/PNG skipped), and —
    when attachments exist — a composite item grouping them all.

    :param data: IMAP fetch response; tuples of (envelope, raw RFC822 bytes)
                 are parsed, everything else is ignored.
    :param provider: ingest provider dict; ``config.formatted`` redirects to
                     the structured (google-form) parser.
    :return: list of item dicts (media items, optional composite, text item).
    :raises IngestEmailError: wraps any unexpected parsing failure.
    """
    config = provider.get('config', {})
    # If the channel is configured to process structured email generated from a google form
    if config.get('formatted', False):
        return self._parse_formatted_email(data, provider)
    try:
        new_items = []
        # create an item for the body text of the email
        # either text or html
        item = dict()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item['versioncreated'] = utcnow()
        comp_item = None
        # a list to keep the references to the attachments
        refs = []
        html_body = None
        text_body = None
        for response_part in data:
            if isinstance(response_part, tuple):
                msg = email.message_from_bytes(response_part[1])
                item['headline'] = self.parse_header(msg['subject'])
                field_from = self.parse_header(msg['from'])
                item['original_source'] = field_from
                # try to map the sender's address to a registered user;
                # unregistered senders are silently left without a creator
                try:
                    if email_regex.findall(field_from):
                        email_address = email_regex.findall(field_from)[0]
                        user = get_resource_service('users').get_user_by_email(email_address)
                        item['original_creator'] = user[eve.utils.config.ID_FIELD]
                except UserNotRegisteredException:
                    pass
                item['guid'] = msg['Message-ID']
                date_tuple = email.utils.parsedate_tz(msg['Date'])
                if date_tuple:
                    dt = datetime.datetime.utcfromtimestamp(
                        email.utils.mktime_tz(date_tuple))
                    dt = dt.replace(tzinfo=timezone('utc'))
                    item['firstcreated'] = dt
                # this will loop through all the available multiparts in mail
                for part in msg.walk():
                    if part.get_content_type() == "text/plain":
                        body = part.get_payload(decode=True)
                        try:
                            # if we don't know the charset just have a go!
                            if part.get_content_charset() is None:
                                text_body = body.decode()
                            else:
                                charset = part.get_content_charset()
                                text_body = body.decode(charset)
                            continue
                        except Exception as ex:
                            # decoding failure: log and keep processing other parts
                            logger.exception(
                                "Exception parsing text body for {0} from {1}: {2}".format(
                                    item['headline'], field_from, ex))
                            continue
                    if part.get_content_type() == "text/html":
                        body = part.get_payload(decode=True)
                        try:
                            if part.get_content_charset() is None:
                                html_body = body.decode()
                            else:
                                charset = part.get_content_charset()
                                html_body = body.decode(charset)
                            html_body = self.safe_html(html_body)
                            continue
                        except Exception as ex:
                            logger.exception(
                                "Exception parsing html body for {0} from {1}: {2}".format(
                                    item['headline'], field_from, ex))
                            continue
                    if part.get_content_maintype() == 'multipart':
                        continue
                    if part.get('Content-Disposition') is None:
                        continue
                    # we are only going to pull off image attachments at this stage
                    if part.get_content_maintype() != 'image':
                        continue
                    fileName = part.get_filename()
                    if bool(fileName):
                        image = part.get_payload(decode=True)
                        content = io.BytesIO(image)
                        res = process_file_from_stream(content, part.get_content_type())
                        file_name, content_type, metadata = res
                        # GIF and PNG attachments (logos, signatures) are ignored
                        if content_type == 'image/gif' or content_type == 'image/png':
                            continue
                        content.seek(0)
                        image_id = self.parser_app.media.put(content,
                                                             filename=fileName,
                                                             content_type=content_type,
                                                             metadata=metadata)
                        renditions = {'baseImage': {'href': image_id}}
                        # if we have not got a composite item then create one
                        if not comp_item:
                            comp_item = dict()
                            comp_item[ITEM_TYPE] = CONTENT_TYPE.COMPOSITE
                            comp_item['guid'] = generate_guid(type=GUID_TAG)
                            comp_item['versioncreated'] = utcnow()
                            comp_item['groups'] = []
                            comp_item['headline'] = item['headline']
                            # NOTE(review): 'groups' is assigned twice in the
                            # original; the second assignment is redundant
                            comp_item['groups'] = []
                            comp_item['original_source'] = item['original_source']
                            if 'original_creator' in item:
                                comp_item['original_creator'] = item['original_creator']
                            # create a reference to the item that stores the body of the email
                            item_ref = {'guid': item['guid'],
                                        'residRef': item['guid'],
                                        'headline': item['headline'],
                                        'location': 'ingest',
                                        'itemClass': 'icls:text',
                                        'original_source': item['original_source']}
                            if 'original_creator' in item:
                                item_ref['original_creator'] = item['original_creator']
                            refs.append(item_ref)
                        media_item = dict()
                        media_item['guid'] = generate_guid(type=GUID_TAG)
                        media_item['versioncreated'] = utcnow()
                        media_item[ITEM_TYPE] = CONTENT_TYPE.PICTURE
                        media_item['renditions'] = renditions
                        media_item['mimetype'] = content_type
                        set_filemeta(media_item, metadata)
                        media_item['slugline'] = fileName
                        if text_body is not None:
                            media_item['body_html'] = text_body
                        media_item['headline'] = item['headline']
                        media_item['original_source'] = item['original_source']
                        if 'original_creator' in item:
                            media_item['original_creator'] = item['original_creator']
                        new_items.append(media_item)
                        # add a reference to this item in the composite item
                        media_ref = {'guid': media_item['guid'],
                                     'residRef': media_item['guid'],
                                     'headline': fileName,
                                     'location': 'ingest',
                                     'itemClass': 'icls:picture',
                                     'original_source': item['original_source']}
                        if 'original_creator' in item:
                            media_ref['original_creator'] = item['original_creator']
                        refs.append(media_ref)
        # prefer the HTML body; fall back to preformatted plain text
        # NOTE(review): if the mail has neither part, text_body is None here
        # and the concatenation below would raise — presumably mails always
        # carry at least one text part; confirm against the feeding service
        if html_body is not None:
            item['body_html'] = html_body
        else:
            item['body_html'] = '<pre>' + text_body + '</pre>'
            item[FORMAT] = FORMATS.PRESERVED
        # if there is composite item then add the main group and references
        if comp_item:
            grefs = {'refs': [{'idRef': 'main'}], 'id': 'root', 'role': 'grpRole:NEP'}
            comp_item['groups'].append(grefs)
            grefs = {'refs': refs, 'id': 'main', 'role': 'grpRole:Main'}
            comp_item['groups'].append(grefs)
            new_items.append(comp_item)
        new_items.append(item)
        return new_items
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)
def parse_newscomponent_media(self, item, newscomponent_el):
    """
    Parse NewsComponent in NewsItem element.

    Fills ``item`` with language, newslines, administrative and descriptive
    metadata, body/headline text, item type derived from the component Role,
    and renditions for media assets whose role appears in
    `SUPPORTED_MEDIA_ASSET_TYPES`.

    Example:

    <NewsComponent>
        <NewsLines>
            <DateLine xml:lang="fr">Paris, 9 déc 2018 (AFP) -</DateLine>
            <HeadLine xml:lang="fr">Un an après, les fans de Johnny lui rendent hommage à Paris</HeadLine>
            <NewsLine>
                <NewsLineType FormalName="ProductLine"/>
                <NewsLineText xml:lang="fr">(Photo+Live Video+Video)</NewsLineText>
            </NewsLine>
        </NewsLines>
        <AdministrativeMetadata>
            <Provider>
                <Party FormalName="AFP"/>
            </Provider>
        </AdministrativeMetadata>
        <DescriptiveMetadata>
            ....
        </DescriptiveMetadata>
        <ContentItem>
            ....
        </ContentItem>
    </NewsComponent>

    :param item: item dict being populated (mutated in place)
    :param newscomponent_el: lxml element of the NewsComponent
    :raises SkipItemException: when required role/format/content is missing
    """
    # language
    item['language'] = newscomponent_el.attrib.get(XML_LANG)

    # NewsLines
    newslines_el = newscomponent_el.find('NewsLines')
    self.parse_newslines(item, newslines_el)

    # AdministrativeMetadata
    admin_el = newscomponent_el.find('AdministrativeMetadata')
    self.parse_administrativemetadata(item, admin_el)

    # DescriptiveMetadata
    descript_el = newscomponent_el.find('DescriptiveMetadata')
    self.parse_descriptivemetadata(item, descript_el)

    # description_text, headline — pulled from the Body/Title sub-components
    for formalname, item_key in (('Body', 'description_text'), ('Title', 'headline')):
        role = newscomponent_el.find('NewsComponent/Role[@FormalName="{}"]'.format(formalname))
        if role is not None:
            newscomponent = role.getparent()
            datacontent = newscomponent.find('ContentItem/DataContent')
            # NOTE(review): `format` shadows the builtin; renaming would be a
            # code change so it is only flagged here
            format = newscomponent.find('ContentItem/Format')
            if datacontent is not None and format is not None:
                # only plain-text content formats are supported
                formalname = format.attrib.get('FormalName')
                if not formalname or formalname not in ('Text', 'ascii'):
                    logger.warning(
                        'ContentItem/FormalName was not found or not supported: "{}". '
                        'Skiping an "{}" item.'.format(formalname, item['guid'])
                    )
                    raise SkipItemException
                if datacontent.text:
                    item[item_key] = datacontent.text.strip()
                    if item_key == 'description_text':
                        item[item_key] = self._plain_to_html(item[item_key])
            else:
                logger.warning('Mimetype or DataContent was not found. Skiping an "{}" item.'.format(
                    item['guid']
                ))
                raise SkipItemException

    # type — derived from the component's Role FormalName
    role = newscomponent_el.find('Role')
    if role is not None:
        role_name = role.attrib.get('FormalName')
        if not role_name:
            logger.warning('NewsComponent/Role was not found. Skiping an "{}" item.'.format(
                item['guid']
            ))
            raise SkipItemException
        role_name = role_name.upper()
        item[ITEM_TYPE] = getattr(CONTENT_TYPE, role_name)

        # read files and save them into the storage
        for newscomponent in newscomponent_el.findall('NewsComponent'):
            component_role = self._get_role(newscomponent)
            if component_role and component_role.upper() in self.SUPPORTED_MEDIA_ASSET_TYPES[role_name].keys():
                content_item = newscomponent.find('ContentItem')
                if content_item is None:
                    continue
                filename = content_item.attrib.get('Href')
                if filename is None:
                    continue
                format_name = ''
                format_el = content_item.find('Format')
                if format_el is not None:
                    format_name = format_el.attrib.get('FormalName')
                content = self._get_file(filename)
                if not content:
                    continue
                _, content_type, metadata = process_file_from_stream(content, 'application/' + format_name)
                content.seek(0)
                media_id = app.media.put(
                    content,
                    filename=filename,
                    content_type=content_type,
                    metadata=metadata
                )
                rendition_key = self.SUPPORTED_MEDIA_ASSET_TYPES[role_name][component_role.upper()]
                item.setdefault('renditions', {})[rendition_key] = {
                    'media': media_id,
                    'mimetype': content_type,
                    'href': app.media.url_for_media(media_id, content_type),
                }

    # these attributes are redundant for a media item
    attrs_to_be_removed = ('date_id', 'item_id', 'provider_id', 'public_identifier')
    for attr in attrs_to_be_removed:
        if attr in item:
            del item[attr]

    # clean subject
    subject_to_be_removed = (
        'genre',
    )
    item['subject'] = [i for i in item.get('subject', []) if i['scheme'] not in subject_to_be_removed]
def _update(self, provider, update):
    """Fetch unread messages from the configured IMAP mailbox and ingest them.

    Connects over SSL, selects the configured mailbox, searches with the
    configured filter (default unseen), parses each matching message's file
    attachments with the provider's feed parser, and marks messages seen.

    :param provider: ingest provider dict with an IMAP ``config`` block
                     (server, port, user, password, mailbox, filter).
    :param update: unused here — part of the ingest service interface.
    :return: list of parsed items (empty when nothing matched or
             select/search failed).
    :raises IngestEmailError: on login failure or any unexpected error.
    """
    config = provider.get('config', {})
    server = config.get('server', '')
    port = int(config.get('port', 993))
    # FIX: initialise up-front — previously new_items was only bound inside
    # the inner rv == 'OK' branches, so `return new_items` raised
    # UnboundLocalError whenever select()/search() failed.
    new_items = []
    try:
        imap = imaplib.IMAP4_SSL(host=server, port=port)
        try:
            imap.login(config.get('user', None), config.get('password', None))
        except imaplib.IMAP4.error as e:
            # FIX: pass the caught exception instance, not the exception
            # class, so the error report carries the actual failure message.
            raise IngestEmailError.emailLoginError(e, provider)
        rv, data = imap.select(config.get('mailbox', None), readonly=False)
        if rv == 'OK':
            rv, data = imap.search(None, config.get('filter', '(UNSEEN)'))
            if rv == 'OK':
                for num in data[0].split():
                    rv, data = imap.fetch(num, '(RFC822)')
                    if rv == 'OK':
                        try:
                            logger.info('Ingesting events from email')
                            parser = self.get_feed_parser(provider, data)
                            for response_part in data:
                                if isinstance(response_part, tuple):
                                    if isinstance(response_part[1], bytes):
                                        msg = email.message_from_bytes(response_part[1])
                                    else:
                                        msg = email.message_from_string(response_part[1])
                                    # this will loop through all the available multiparts in email
                                    for part in msg.walk():
                                        # parse attached files only
                                        if part.get('Content-Disposition') is None:
                                            continue
                                        fileName = part.get_filename()
                                        if bool(fileName):
                                            attachment = part.get_payload(decode=True)
                                            content = io.BytesIO(attachment)
                                            res = process_file_from_stream(
                                                content, part.get_content_type())
                                            file_name, content_type, metadata = res
                                            logger.info(
                                                'Ingesting events with {} parser'.format(
                                                    parser.__class__.__name__))
                                            # FIX: use a default so parsers without
                                            # parse_email fall through to parse()
                                            # instead of raising AttributeError.
                                            if getattr(parser, 'parse_email', None):
                                                try:
                                                    new_items.append(
                                                        parser.parse_email(
                                                            content, content_type, provider))
                                                except ParserError.parseMessageError:
                                                    continue
                                            else:
                                                new_items.append(parser.parse(data, provider))
                            # mark the message as read only after parsing succeeded
                            rv, data = imap.store(num, '+FLAGS', '\\Seen')
                        except IngestEmailError:
                            # skip this message, keep processing the rest
                            continue
        imap.close()
        imap.logout()
    except IngestEmailError:
        raise
    except Exception as ex:
        raise IngestEmailError.emailError(ex, provider)
    return new_items
def parse_email(self, data, provider):
    """Parse a raw IMAP fetch response into a list of ingest items.

    Builds one text item from the email body (HTML preferred over plain
    text), one picture item per image attachment (GIF/PNG skipped), and —
    when attachments exist — a composite item grouping them all.

    :param data: IMAP fetch response; tuples of (envelope, raw RFC822 bytes)
                 are parsed, everything else is ignored.
    :param provider: ingest provider dict, used only for error reporting.
    :return: list of item dicts (media items, optional composite, text item).
    :raises IngestEmailError: wraps any unexpected parsing failure.
    """
    try:
        new_items = []
        # create an item for the body text of the email
        # either text or html
        item = dict()
        item['type'] = 'text'
        item['versioncreated'] = utcnow()
        comp_item = None
        # a list to keep the references to the attachments
        refs = []
        html_body = None
        text_body = None
        for response_part in data:
            if isinstance(response_part, tuple):
                msg = email.message_from_bytes(response_part[1])
                item['headline'] = self.parse_header(msg['subject'])
                item['original_creator'] = self.parse_header(msg['from'])
                item['guid'] = msg['Message-ID']
                date_tuple = email.utils.parsedate_tz(msg['Date'])
                if date_tuple:
                    dt = datetime.datetime.utcfromtimestamp(
                        email.utils.mktime_tz(date_tuple))
                    dt = dt.replace(tzinfo=timezone('utc'))
                    item['firstcreated'] = dt
                # this will loop through all the available multiparts in mail
                for part in msg.walk():
                    if part.get_content_type() == "text/plain":
                        body = part.get_payload(decode=True)
                        try:
                            # if we don't know the charset just have a go!
                            if part.get_content_charset() is None:
                                text_body = body.decode()
                            else:
                                charset = part.get_content_charset()
                                text_body = body.decode(charset)
                            continue
                        except Exception as ex:
                            # FIX: ex was previously passed as a lazy %-style
                            # logging argument while the message contains no
                            # placeholders, which broke the logging call
                            # itself; embed it in the message instead.
                            logger.exception(
                                "Exception parsing text body for {0} from {1}: {2}"
                                .format(item['headline'], item['original_creator'], ex))
                            continue
                    if part.get_content_type() == "text/html":
                        body = part.get_payload(decode=True)
                        try:
                            if part.get_content_charset() is None:
                                html_body = body.decode()
                            else:
                                charset = part.get_content_charset()
                                html_body = body.decode(charset)
                            html_body = self.safe_html(html_body)
                            continue
                        except Exception as ex:
                            # FIX: same broken logging call as above.
                            logger.exception(
                                "Exception parsing html body for {0} from {1}: {2}"
                                .format(item['headline'], item['original_creator'], ex))
                            continue
                    if part.get_content_maintype() == 'multipart':
                        continue
                    if part.get('Content-Disposition') is None:
                        continue
                    # we are only going to pull off image attachments at this stage
                    if part.get_content_maintype() != 'image':
                        continue
                    fileName = part.get_filename()
                    if bool(fileName):
                        image = part.get_payload(decode=True)
                        content = io.BytesIO(image)
                        res = process_file_from_stream(
                            content, part.get_content_type())
                        file_name, content_type, metadata = res
                        # GIF and PNG attachments (logos, signatures) are ignored
                        if content_type == 'image/gif' or content_type == 'image/png':
                            continue
                        content.seek(0)
                        image_id = self.parser_app.media.put(
                            content,
                            filename=fileName,
                            content_type=content_type,
                            metadata=metadata)
                        renditions = {'baseImage': {'href': image_id}}
                        # if we have not got a composite item then create one
                        if not comp_item:
                            comp_item = dict()
                            comp_item['type'] = 'composite'
                            comp_item['guid'] = generate_guid(type=GUID_TAG)
                            comp_item['versioncreated'] = utcnow()
                            comp_item['groups'] = []
                            comp_item['headline'] = item['headline']
                            comp_item['groups'] = []
                            # create a reference to the item that stores the body of the email
                            item_ref = {}
                            item_ref['guid'] = item['guid']
                            item_ref['residRef'] = item['guid']
                            item_ref['headline'] = item['headline']
                            item_ref['location'] = 'ingest'
                            item_ref['itemClass'] = 'icls:text'
                            refs.append(item_ref)
                        media_item = dict()
                        media_item['guid'] = generate_guid(type=GUID_TAG)
                        media_item['versioncreated'] = utcnow()
                        media_item['type'] = 'picture'
                        media_item['renditions'] = renditions
                        media_item['mimetype'] = content_type
                        media_item['filemeta'] = metadata
                        media_item['slugline'] = fileName
                        if text_body is not None:
                            media_item['body_html'] = text_body
                        media_item['headline'] = item['headline']
                        new_items.append(media_item)
                        # add a reference to this item in the composite item
                        media_ref = {}
                        media_ref['guid'] = media_item['guid']
                        media_ref['residRef'] = media_item['guid']
                        media_ref['headline'] = fileName
                        media_ref['location'] = 'ingest'
                        media_ref['itemClass'] = 'icls:picture'
                        refs.append(media_ref)
        # prefer the HTML body; fall back to preformatted plain text
        if html_body is not None:
            item['body_html'] = html_body
        else:
            item['body_html'] = text_body
            item['type'] = 'preformatted'
        # if there is composite item then add the main group and references
        if comp_item:
            grefs = {}
            grefs['refs'] = [{'idRef': 'main'}]
            grefs['id'] = 'root'
            grefs['role'] = 'grpRole:NEP'
            comp_item['groups'].append(grefs)
            grefs = {}
            grefs['refs'] = refs
            grefs['id'] = 'main'
            grefs['role'] = 'grpRole:Main'
            comp_item['groups'].append(grefs)
            new_items.append(comp_item)
        new_items.append(item)
        return new_items
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)