def parse_item(self, image_path):
    """Build a picture archive item from the image file at *image_path*.

    Stores the original file in media storage, attaches file metadata,
    parses embedded IPTC metadata into the item and generates renditions.

    :param image_path: path to the image file on disk
    :return: the populated item dict
    """
    filename = os.path.basename(image_path)
    # guess_type returns (type, encoding); encoding is not needed here
    content_type = mimetypes.guess_type(image_path)[0]
    guid = utils.generate_guid(type=GUID_TAG)
    item = {
        'guid': guid,
        'uri': guid,
        config.VERSION: 1,
        ITEM_TYPE: CONTENT_TYPE.PICTURE,
        'mimetype': content_type,
        'versioncreated': utcnow(),
    }
    with open(image_path, 'rb') as f:
        # may refine content_type from the actual bytes
        _, content_type, file_metadata = process_file_from_stream(f, content_type=content_type)
        f.seek(0)  # rewind: the stream was consumed by metadata extraction
        file_id = app.media.put(f, filename=filename, content_type=content_type, metadata=file_metadata)
        filemeta.set_filemeta(item, file_metadata)
        f.seek(0)  # rewind again before IPTC parsing
        metadata = get_meta_iptc(f)
        f.seek(0)  # and once more before rendition generation
        self.parse_meta(item, metadata)
        rendition_spec = get_renditions_spec(no_custom_crops=True)
        renditions = generate_renditions(f, file_id, [file_id], 'image', content_type, rendition_spec, url_for_media)
        item['renditions'] = renditions
    return item
def on_create(self, docs):
    """Create corresponding item on file upload.

    For each doc: validates that media is attached, generates renditions
    and file metadata, and records an upload activity.  On any failure the
    stored media files are deleted and the request is aborted with 500.

    :param docs: list of docs being created, each carrying a ``media`` file
    """
    for doc in docs:
        if 'media' not in doc or doc['media'] is None:
            abort(400, description="No media found")
        file, content_type, metadata = self.get_file_from_document(doc)
        inserted = [doc['media']]
        file_type = content_type.split('/')[0]
        self._set_metadata(doc)
        try:
            doc[ITEM_TYPE] = self.type_av.get(file_type)
            rendition_spec = get_renditions_spec()
            renditions = generate_renditions(file, doc['media'], inserted, file_type, content_type,
                                             rendition_spec, url_for_media)
            doc['renditions'] = renditions
            doc['mimetype'] = content_type
            set_filemeta(doc, metadata)
            add_activity('upload', 'uploaded media {{ name }}', 'archive',
                         item=doc, name=doc.get('headline', doc.get('mimetype')),
                         renditions=doc.get('renditions'))
        except Exception as ex:  # renamed from ``io`` to avoid shadowing the stdlib module name
            logger.exception(ex)
            # best-effort cleanup of everything stored for this doc
            for file_id in inserted:
                delete_file_on_error(doc, file_id)
            abort(500)
def setUp(self):
    """Store a fixture image in media storage and post a matching archive item."""
    super().setUp()
    dirname = os.path.dirname(os.path.realpath(__file__))
    # fixture images live next to this test module
    image_path = os.path.normpath(
        os.path.join(dirname, "fixtures", self.filename))
    content_type = mimetypes.guess_type(image_path)[0]
    guid = utils.generate_guid(type=GUID_TAG)
    self.item = {
        "guid": guid,
        "version": 1,
        "_id": guid,
        ITEM_TYPE: CONTENT_TYPE.PICTURE,
        "mimetype": content_type,
        "versioncreated": datetime.now(),
    }
    with open(image_path, "rb") as f:
        # may refine content_type from the actual bytes
        _, content_type, file_metadata = process_file_from_stream(
            f, content_type=content_type)
        f.seek(0)  # rewind: the stream was consumed by metadata extraction
        file_id = app.media.put(f, filename=self.filename, content_type=content_type, metadata=file_metadata)
        filemeta.set_filemeta(self.item, file_metadata)
        f.seek(0)  # rewind again before rendition generation
        rendition_spec = get_renditions_spec()
        renditions = generate_renditions(f, file_id, [file_id], "image", content_type, rendition_spec, url_for_media)
        self.item["renditions"] = renditions
    archive = get_resource_service("archive")
    archive.post([self.item])
def crop_and_store_file(self, doc, content, filename, content_type):
    """Crop the uploaded image if needed and store it in media storage.

    Updates ``doc`` in place with the media id, mimetype, file metadata
    and generated renditions.

    :param doc: document being updated
    :param content: file-like object with the uploaded image
    :param filename: original file name
    :param content_type: declared mime type of the upload
    :raises SuperdeskApiError: internalError when storing or rendition
        generation fails; any files stored here are deleted first
    """
    # retrieve file name and metadata from file
    file_name, content_type, metadata = process_file_from_stream(
        content, content_type=content_type)
    # crop the file if needed, can change the image size
    was_cropped, out = crop_image(content, filename, doc)
    # the length in metadata could be updated if it was cropped
    if was_cropped:
        file_name, content_type, metadata_after_cropped = process_file_from_stream(
            out, content_type=content_type)
        # when cropped, metadata are reset; update only the changed length
        metadata['length'] = metadata_after_cropped['length']
    # initialise before the try block: the cleanup loop in the except
    # handler reads it, and it would otherwise be unbound (NameError)
    # when app.media.put() itself raises
    inserted = []
    try:
        logger.debug('Going to save media file with %s ' % file_name)
        out.seek(0)
        file_id = app.media.put(out, filename=file_name, content_type=content_type,
                                resource=self.datasource, metadata=metadata)
        doc['media'] = file_id
        doc['mimetype'] = content_type
        set_filemeta(doc, decode_metadata(metadata))
        inserted = [doc['media']]
        file_type = content_type.split('/')[0]
        rendition_spec = config.RENDITIONS['avatar']
        renditions = generate_renditions(out, file_id, inserted, file_type,
                                         content_type, rendition_spec, url_for_media)
        doc['renditions'] = renditions
    except Exception as ex:  # renamed from ``io`` to avoid shadowing the stdlib module name
        for file_id in inserted:
            delete_file_on_error(doc, file_id)
        raise SuperdeskApiError.internalError(
            'Generating renditions failed', exception=ex)
def crop_and_store_file(self, doc, content, filename, content_type):
    """Crop the uploaded image if needed and store it in media storage.

    Updates ``doc`` in place with the media id, mimetype, file metadata
    and generated renditions.

    :param doc: document being updated
    :param content: file-like object with the uploaded image
    :param filename: original file name
    :param content_type: declared mime type of the upload
    :raises SuperdeskApiError: internalError when storing or rendition
        generation fails; any files stored here are deleted first
    """
    # retrieve file name and metadata from file
    file_name, content_type, metadata = process_file_from_stream(content, content_type=content_type)
    # crop the file if needed, can change the image size
    was_cropped, out = crop_image(content, filename, doc)
    # the length in metadata could be updated if it was cropped
    if was_cropped:
        file_name, content_type, metadata_after_cropped = process_file_from_stream(out, content_type=content_type)
        # when cropped, metadata are reset; update only the changed length
        metadata['length'] = metadata_after_cropped['length']
    # initialise before the try block: the cleanup loop in the except
    # handler reads it, and it would otherwise be unbound (NameError)
    # when app.media.put() itself raises
    inserted = []
    try:
        logger.debug('Going to save media file with %s ' % file_name)
        out.seek(0)
        file_id = app.media.put(out, filename=file_name, content_type=content_type,
                                resource=self.datasource, metadata=metadata)
        doc['media'] = file_id
        doc['mimetype'] = content_type
        set_filemeta(doc, decode_metadata(metadata))
        inserted = [doc['media']]
        file_type = content_type.split('/')[0]
        rendition_spec = config.RENDITIONS['avatar']
        renditions = generate_renditions(out, file_id, inserted, file_type,
                                         content_type, rendition_spec, url_for_media)
        doc['renditions'] = renditions
    except Exception as ex:  # renamed from ``io`` to avoid shadowing the stdlib module name
        logger.exception(ex)
        for file_id in inserted:
            delete_file_on_error(doc, file_id)
        raise SuperdeskApiError.internalError('Generating renditions failed')
def setUp(self):
    """Store a fixture image in media storage and post a matching archive item."""
    super().setUp()
    dirname = os.path.dirname(os.path.realpath(__file__))
    # fixture images live next to this test module
    image_path = os.path.normpath(
        os.path.join(dirname, 'fixtures', self.filename))
    content_type = mimetypes.guess_type(image_path)[0]
    guid = utils.generate_guid(type=GUID_TAG)
    self.item = {
        'guid': guid,
        'version': 1,
        '_id': guid,
        ITEM_TYPE: CONTENT_TYPE.PICTURE,
        'mimetype': content_type,
        'versioncreated': datetime.now()
    }
    with open(image_path, 'rb') as f:
        # may refine content_type from the actual bytes
        _, content_type, file_metadata = process_file_from_stream(
            f, content_type=content_type)
        f.seek(0)  # rewind: the stream was consumed by metadata extraction
        file_id = app.media.put(f, filename=self.filename, content_type=content_type, metadata=file_metadata)
        filemeta.set_filemeta(self.item, file_metadata)
        f.seek(0)  # rewind again before rendition generation
        rendition_spec = get_renditions_spec()
        renditions = generate_renditions(f, file_id, [file_id], 'image', content_type, rendition_spec, url_for_media)
        self.item['renditions'] = renditions
    archive = get_resource_service('archive')
    archive.post([self.item])
def setUp(self):
    """Store a fixture image in media storage and post a matching archive item."""
    super().setUp()
    dirname = os.path.dirname(os.path.realpath(__file__))
    # fixture images live next to this test module
    image_path = os.path.normpath(os.path.join(dirname, 'fixtures', self.filename))
    content_type = mimetypes.guess_type(image_path)[0]
    guid = utils.generate_guid(type=GUID_TAG)
    self.item = {
        'guid': guid,
        'version': 1,
        '_id': guid,
        ITEM_TYPE: CONTENT_TYPE.PICTURE,
        'mimetype': content_type,
        'versioncreated': datetime.now()
    }
    with open(image_path, 'rb') as f:
        # may refine content_type from the actual bytes
        _, content_type, file_metadata = process_file_from_stream(f, content_type=content_type)
        f.seek(0)  # rewind: the stream was consumed by metadata extraction
        file_id = app.media.put(f, filename=self.filename, content_type=content_type, metadata=file_metadata)
        filemeta.set_filemeta(self.item, file_metadata)
        f.seek(0)  # rewind again before rendition generation
        rendition_spec = get_renditions_spec()
        renditions = generate_renditions(f, file_id, [file_id], 'image', content_type, rendition_spec, url_for_media)
        self.item['renditions'] = renditions
    archive = get_resource_service('archive')
    archive.post([self.item])
def on_create(self, docs):
    """Create corresponding item on file upload.

    Video files are pushed to the external video server (when enabled);
    everything else gets renditions generated locally.  On failure all
    stored files (and the video-server copy) are removed and the request
    is aborted with 500.

    :param docs: list of docs being created, each carrying a ``media`` file
    """
    for doc in docs:
        if 'media' not in doc or doc['media'] is None:
            abort(400, description="No media found")
        # initialise both names up-front: the except handler below reads
        # them, but each branch assigns only one of them, so a failure in
        # the other branch would otherwise raise NameError during cleanup
        res = None
        inserted = []
        # check content type of video by python-magic
        content_type = magic.from_buffer(doc['media'].read(1024), mime=True)
        doc['media'].seek(0)
        file_type = content_type.split('/')[0]
        if file_type == 'video' and app.config.get("VIDEO_SERVER_ENABLE"):
            if not self.videoEditor.check_video_server():
                raise SuperdeskApiError(
                    message="Cannot connect to videoserver", status_code=500)
            # upload media to video server
            res, renditions, metadata = self.upload_file_to_video_server(doc)
            # get thumbnails for timeline bar
            self.videoEditor.get_timeline_thumbnails(doc.get('media'), 40)
        else:
            file, content_type, metadata = self.get_file_from_document(doc)
            inserted = [doc['media']]
            # if no_custom_crops is set to False the custom crops are generated automatically on media upload
            # see (SDESK-4742)
            rendition_spec = get_renditions_spec(
                no_custom_crops=app.config.get("NO_CUSTOM_CROPS"))
            with timer('archive:renditions'):
                renditions = generate_renditions(file, doc['media'], inserted, file_type,
                                                 content_type, rendition_spec, url_for_media)
        try:
            self._set_metadata(doc)
            doc[ITEM_TYPE] = self.type_av.get(file_type)
            doc[ITEM_STATE] = CONTENT_STATE.PROGRESS
            doc['renditions'] = renditions
            doc['mimetype'] = content_type
            set_filemeta(doc, metadata)
            add_activity('upload', 'uploaded media {{ name }}', 'archive',
                         item=doc, name=doc.get('headline', doc.get('mimetype')),
                         renditions=doc.get('renditions'))
        except Exception as ex:  # renamed from ``io`` to avoid shadowing the stdlib module name
            logger.exception(ex)
            for file_id in inserted:
                delete_file_on_error(doc, file_id)
            if res:
                self.videoEditor.delete(res.get('_id'))
            abort(500)
def on_create(self, docs):
    """Create corresponding item on file upload.

    Video files are pushed to the external video server (when enabled);
    everything else gets renditions generated locally.  On failure all
    stored files (and the video-server copy) are removed and the request
    is aborted with 500.

    :param docs: list of docs being created, each carrying a ``media`` file
    """
    for doc in docs:
        if "media" not in doc or doc["media"] is None:
            abort(400, description="No media found")
        # initialise both names up-front: the except handler below reads
        # them, but each branch assigns only one of them, so a failure in
        # the other branch would otherwise raise NameError during cleanup
        res = None
        inserted = []
        # check content type of video by python-magic
        content_type = app.media._get_mimetype(doc["media"])
        doc["media"].seek(0)
        file_type = content_type.split("/")[0]
        if file_type == "video" and app.config.get("VIDEO_SERVER_ENABLED"):
            # upload media to video server
            res, renditions, metadata = self.upload_file_to_video_server(doc)
            # get thumbnails for timeline bar
            self.video_editor.create_timeline_thumbnails(doc.get("media"), 60)
        else:
            file, content_type, metadata = self.get_file_from_document(doc)
            inserted = [doc["media"]]
            # if no_custom_crops is set to False the custom crops are generated automatically on media upload
            # see (SDESK-4742)
            rendition_spec = get_renditions_spec(
                no_custom_crops=app.config.get("NO_CUSTOM_CROPS"))
            with timer("archive:renditions"):
                renditions = generate_renditions(file, doc["media"], inserted, file_type,
                                                 content_type, rendition_spec, url_for_media)
        try:
            self._set_metadata(doc)
            doc[ITEM_TYPE] = self.type_av.get(file_type)
            doc[ITEM_STATE] = CONTENT_STATE.PROGRESS
            doc["renditions"] = renditions
            doc["mimetype"] = content_type
            set_filemeta(doc, metadata)
            add_activity(
                "upload",
                "uploaded media {{ name }}",
                "archive",
                item=doc,
                name=doc.get("headline", doc.get("mimetype")),
                renditions=doc.get("renditions"),
            )
        except Exception as ex:  # renamed from ``io`` to avoid shadowing the stdlib module name
            logger.exception(ex)
            for file_id in inserted:
                delete_file_on_error(doc, file_id)
            if res:
                self.video_editor.delete(res.get("_id"))
            abort(500)
def parse_item(self, image_path):
    """Build a picture archive item from the image file at *image_path*.

    Stores the original file in media storage, generates renditions, and
    maps embedded IPTC metadata (including creation date/time) onto the
    item via IPTC_MAPPING.

    :param image_path: path to the image file on disk
    :return: the populated item dict
    """
    filename = os.path.basename(image_path)
    # guess_type returns (type, encoding); encoding is not needed here
    content_type = mimetypes.guess_type(image_path)[0]
    guid = utils.generate_guid(type=GUID_TAG)
    item = {
        'guid': guid,
        config.VERSION: 1,
        config.ID_FIELD: guid,
        ITEM_TYPE: CONTENT_TYPE.PICTURE,
        'mimetype': content_type,
        'versioncreated': datetime.now()
    }
    with open(image_path, 'rb') as f:
        # may refine content_type from the actual bytes
        _, content_type, file_metadata = process_file_from_stream(
            f, content_type=content_type)
        f.seek(0)  # rewind: the stream was consumed by metadata extraction
        file_id = app.media.put(f, filename=filename, content_type=content_type, metadata=file_metadata)
        filemeta.set_filemeta(item, file_metadata)
        f.seek(0)  # rewind again before IPTC parsing
        metadata = get_meta_iptc(f)
        f.seek(0)  # and once more before rendition generation
        rendition_spec = get_renditions_spec(no_custom_crops=True)
        renditions = generate_renditions(f, file_id, [file_id], 'image', content_type, rendition_spec, url_for_media)
        item['renditions'] = renditions
    try:
        # IPTC stores date as CCYYMMDD and time as HHMMSS±HHMM
        date_created, time_created = metadata[TAG.DATE_CREATED], metadata[
            TAG.TIME_CREATED]
    except KeyError:
        # either tag missing: leave firstcreated unset
        pass
    else:
        # we format proper ISO 8601 date so we can parse it with dateutil
        datetime_created = '{}-{}-{}T{}:{}:{}{}{}:{}'.format(
            date_created[0:4], date_created[4:6], date_created[6:8],
            time_created[0:2], time_created[2:4], time_created[4:6],
            time_created[6], time_created[7:9], time_created[9:])
        item['firstcreated'] = dateutil.parser.parse(datetime_created)
    # now we map IPTC metadata to superdesk metadata
    for source_key, dest_key in IPTC_MAPPING.items():
        try:
            item[dest_key] = metadata[source_key]
        except KeyError:
            # tag not present in this image: skip it
            continue
    return item
def on_create(self, docs):
    """Stamp dates and creator on each new doc.

    For embed items the embed metadata is additionally copied into the
    filemeta, with version/width/height normalised to strings.
    """
    super().on_create(docs)
    for doc in docs:
        update_dates_for(doc)
        doc['original_creator'] = str(get_user().get('_id'))
        if doc.get('item_type') and doc['item_type'] == 'embed':
            meta = doc['meta']
            set_filemeta(doc, meta)
            # normalise the numeric embed fields to strings
            for field in ('version', 'width', 'height'):
                if get_filemeta(doc, field):
                    meta[field] = str(meta.get(field))
def parse_item(self, image_path):
    """Build a picture archive item from the image file at *image_path*.

    Stores the original file in media storage, generates renditions, and
    maps embedded IPTC metadata (including creation date/time) onto the
    item via IPTC_MAPPING.

    :param image_path: path to the image file on disk
    :return: the populated item dict
    """
    filename = os.path.basename(image_path)
    # guess_type returns (type, encoding); encoding is not needed here
    content_type = mimetypes.guess_type(image_path)[0]
    guid = utils.generate_guid(type=GUID_TAG)
    item = {
        'guid': guid,
        config.VERSION: 1,
        config.ID_FIELD: guid,
        ITEM_TYPE: CONTENT_TYPE.PICTURE,
        'mimetype': content_type,
        'versioncreated': datetime.now()
    }
    with open(image_path, 'rb') as f:
        # may refine content_type from the actual bytes
        _, content_type, file_metadata = process_file_from_stream(f, content_type=content_type)
        f.seek(0)  # rewind: the stream was consumed by metadata extraction
        file_id = app.media.put(f, filename=filename, content_type=content_type, metadata=file_metadata)
        filemeta.set_filemeta(item, file_metadata)
        f.seek(0)  # rewind again before IPTC parsing
        metadata = get_meta_iptc(f)
        f.seek(0)  # and once more before rendition generation
        rendition_spec = get_renditions_spec(no_custom_crops=True)
        renditions = generate_renditions(f, file_id, [file_id], 'image', content_type, rendition_spec, url_for_media)
        item['renditions'] = renditions
    try:
        # IPTC stores date as CCYYMMDD and time as HHMMSS±HHMM
        date_created, time_created = metadata[TAG.DATE_CREATED], metadata[TAG.TIME_CREATED]
    except KeyError:
        # either tag missing: leave firstcreated unset
        pass
    else:
        # we format proper ISO 8601 date so we can parse it with dateutil
        datetime_created = '{}-{}-{}T{}:{}:{}{}{}:{}'.format(date_created[0:4], date_created[4:6],
                                                             date_created[6:8], time_created[0:2],
                                                             time_created[2:4], time_created[4:6],
                                                             time_created[6], time_created[7:9],
                                                             time_created[9:])
        item['firstcreated'] = dateutil.parser.parse(datetime_created)
    # now we map IPTC metadata to superdesk metadata
    for source_key, dest_key in IPTC_MAPPING.items():
        try:
            item[dest_key] = metadata[source_key]
        except KeyError:
            # tag not present in this image: skip it
            continue
    return item
def update_renditions(item, href, old_item, request_kwargs=None):
    """Update renditions for an item.

    When *old_item* already holds renditions stored in media, they are
    copied onto *item* so the same original is not downloaded and stored
    twice (which would also orphan the earlier media entries).  Otherwise
    the original is downloaded from *href* and renditions are generated.
    On failure, every file stored by this call is deleted again and the
    exception is re-raised.

    :param item: parsed item from source
    :param href: reference to original
    :param old_item: the item that we have already ingested, if it exists
    :param request_kwargs: extra kwargs forwarded to the download helper
    :return: item with renditions
    """
    inserted = []
    try:
        # reuse the previously generated renditions when they exist
        if old_item:
            old_media = old_item.get("renditions", {}).get("original", {}).get("media", {})
            if old_media:
                item["renditions"] = old_item["renditions"]
                for key in ("mimetype", "filemeta", "filemeta_json"):
                    item[key] = old_item.get(key)
                return
        content, filename, content_type = download_file_from_url(href, request_kwargs)
        file_type, ext = content_type.split("/")
        metadata = process_file(content, file_type)
        file_guid = app.media.put(content, filename=filename, content_type=content_type, metadata=metadata)
        inserted.append(file_guid)
        item["renditions"] = generate_renditions(content, file_guid, inserted, file_type,
                                                 content_type, get_renditions_spec(),
                                                 app.media.url_for_media)
        item["mimetype"] = content_type
        set_filemeta(item, metadata)
    except Exception as e:
        logger.exception(e)
        # remove whatever was stored before propagating the failure
        for file_id in inserted:
            app.media.delete(file_id)
        raise
def on_create(self, docs):
    """Create corresponding item on file upload.

    For each doc: validates that media is attached, generates renditions
    and file metadata, and records an upload activity.  On any failure the
    stored media files are deleted and the request is aborted with 500.

    :param docs: list of docs being created, each carrying a ``media`` file
    """
    for doc in docs:
        if 'media' not in doc or doc['media'] is None:
            abort(400, description="No media found")
        file, content_type, metadata = self.get_file_from_document(doc)
        inserted = [doc['media']]
        file_type = content_type.split('/')[0]
        self._set_metadata(doc)
        try:
            doc[ITEM_TYPE] = self.type_av.get(file_type)
            doc[ITEM_STATE] = CONTENT_STATE.PROGRESS
            # if no_custom_crops is set to False the custom crops are generated automatically on media upload
            # see (SDESK-4742)
            rendition_spec = get_renditions_spec(
                no_custom_crops=app.config.get("NO_CUSTOM_CROPS"))
            with timer('archive:renditions'):
                renditions = generate_renditions(file, doc['media'], inserted, file_type,
                                                 content_type, rendition_spec, url_for_media)
            doc['renditions'] = renditions
            doc['mimetype'] = content_type
            set_filemeta(doc, metadata)
            add_activity('upload', 'uploaded media {{ name }}', 'archive',
                         item=doc, name=doc.get('headline', doc.get('mimetype')),
                         renditions=doc.get('renditions'))
        except Exception as ex:  # renamed from ``io`` to avoid shadowing the stdlib module name
            logger.exception(ex)
            for file_id in inserted:
                delete_file_on_error(doc, file_id)
            abort(500)
def update_renditions(item, href, old_item):
    """Attach renditions to *item*.

    When *old_item* already holds renditions stored in media, they are
    copied onto *item* so the same original is not downloaded and stored
    twice (which would also orphan the earlier media entries).  Otherwise
    the original is downloaded from *href* and renditions are generated.
    On failure, every file stored by this call is deleted again and the
    exception is re-raised.

    :param item: parsed item from source
    :param href: reference to original
    :param old_item: the item that we have already ingested, if it exists
    :return: item with renditions
    """
    inserted = []
    try:
        # reuse the previously generated renditions when they exist
        if old_item:
            old_media = old_item.get('renditions', {}).get('original', {}).get('media', {})
            if old_media:
                item['renditions'] = old_item['renditions']
                for key in ('mimetype', 'filemeta', 'filemeta_json'):
                    item[key] = old_item.get(key)
                return
        content, filename, content_type = download_file_from_url(href)
        file_type, ext = content_type.split('/')
        metadata = process_file(content, file_type)
        file_guid = app.media.put(content, filename, content_type, metadata)
        inserted.append(file_guid)
        item['renditions'] = generate_renditions(content, file_guid, inserted, file_type,
                                                 content_type, get_renditions_spec(),
                                                 url_for_media)
        item['mimetype'] = content_type
        set_filemeta(item, metadata)
    except Exception:
        # remove whatever was stored before propagating the failure
        for file_id in inserted:
            app.media.delete(file_id)
        raise
def update_renditions(item, href, old_item):
    """Update renditions for an item.

    When *old_item* already holds renditions stored in media, they are
    copied onto *item* so the same original is not downloaded and stored
    twice (which would also orphan the earlier media entries).  Otherwise
    the original is downloaded from *href* and renditions are generated.
    On failure, every file stored by this call is deleted again and the
    exception is re-raised.

    :param item: parsed item from source
    :param href: reference to original
    :param old_item: the item that we have already ingested, if it exists
    :return: item with renditions
    """
    inserted = []
    try:
        # reuse the previously generated renditions when they exist
        if old_item:
            old_media = old_item.get('renditions', {}).get('original', {}).get('media', {})
            if old_media:
                item['renditions'] = old_item['renditions']
                for key in ('mimetype', 'filemeta', 'filemeta_json'):
                    item[key] = old_item.get(key)
                return
        content, filename, content_type = download_file_from_url(href)
        file_type, ext = content_type.split('/')
        metadata = process_file(content, file_type)
        file_guid = app.media.put(content, filename, content_type, metadata)
        inserted.append(file_guid)
        item['renditions'] = generate_renditions(content, file_guid, inserted, file_type,
                                                 content_type, get_renditions_spec(),
                                                 app.media.url_for_media)
        item['mimetype'] = content_type
        set_filemeta(item, metadata)
    except Exception as e:
        logger.exception(e)
        # remove whatever was stored before propagating the failure
        for file_id in inserted:
            app.media.delete(file_id)
        raise
def test_get_set_filemeta(self):
    """Round-trip: set_filemeta stores a dict that get_filemeta returns whole or per key."""
    item = {}
    meta = {'foo': 'bar'}
    set_filemeta(item, meta)
    self.assertEqual(get_filemeta(item, 'foo'), 'bar')
    self.assertEqual(get_filemeta(item), {'foo': 'bar'})
def parse(self, data, provider=None):
    """Parse a fetched email into one or more ingest items.

    Produces a text item for the email body; for each (non-gif/png) image
    attachment a picture item is produced, and all items are bundled under
    a composite item with references.

    :param data: iterable of fetch response parts; message bytes are in tuples
    :param provider: ingest provider config dict
    :return: list of new items
    :raises IngestEmailError: wrapping any parsing failure
    """
    config = provider.get("config", {})
    # If the channel is configured to process structured email generated from a google form
    if config.get("formatted", False):
        return self._parse_formatted_email(data, provider)
    try:
        new_items = []
        # create an item for the body text of the email
        # either text or html
        item = dict()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item["versioncreated"] = utcnow()
        comp_item = None
        # a list to keep the references to the attachments
        refs = []
        html_body = None
        text_body = None
        for response_part in data:
            if isinstance(response_part, tuple):
                msg = email.message_from_bytes(response_part[1])
                item["headline"] = self.parse_header(msg["subject"])
                field_from = self.parse_header(msg["from"])
                item["original_source"] = field_from
                try:
                    # map the sender address to a local user, when registered
                    if email_regex.findall(field_from):
                        email_address = email_regex.findall(field_from)[0]
                        user = get_resource_service(
                            "users").get_user_by_email(email_address)
                        item["original_creator"] = user[
                            eve.utils.config.ID_FIELD]
                except UserNotRegisteredException:
                    pass
                item["guid"] = msg["Message-ID"]
                date_tuple = email.utils.parsedate_tz(msg["Date"])
                if date_tuple:
                    dt = datetime.datetime.utcfromtimestamp(
                        email.utils.mktime_tz(date_tuple))
                    dt = dt.replace(tzinfo=timezone("utc"))
                    item["firstcreated"] = dt
                # this will loop through all the available multiparts in mail
                for part in msg.walk():
                    if part.get_content_type() == "text/plain":
                        body = part.get_payload(decode=True)
                        try:
                            # if we don't know the charset just have a go!
                            if part.get_content_charset() is None:
                                text_body = body.decode()
                            else:
                                charset = part.get_content_charset()
                                text_body = body.decode(charset)
                            continue
                        except Exception as ex:
                            logger.exception(
                                "Exception parsing text body for {0} from {1}: {2}"
                                .format(item["headline"], field_from, ex))
                            continue
                    if part.get_content_type() == "text/html":
                        body = part.get_payload(decode=True)
                        try:
                            if part.get_content_charset() is None:
                                html_body = body.decode()
                            else:
                                charset = part.get_content_charset()
                                html_body = body.decode(charset)
                            html_body = sanitize_html(html_body)
                            continue
                        except Exception as ex:
                            logger.exception(
                                "Exception parsing html body for {0} from {1}: {2}"
                                .format(item["headline"], field_from, ex))
                            continue
                    if part.get_content_maintype() == "multipart":
                        continue
                    if part.get("Content-Disposition") is None:
                        continue
                    # we are only going to pull off image attachments at this stage
                    if part.get_content_maintype() != "image":
                        continue
                    fileName = part.get_filename()
                    if bool(fileName):
                        image = part.get_payload(decode=True)
                        content = io.BytesIO(image)
                        res = process_file_from_stream(
                            content, part.get_content_type())
                        file_name, content_type, metadata = res
                        # gif/png are skipped (presumably logos/signature art) -- TODO confirm
                        if content_type == "image/gif" or content_type == "image/png":
                            continue
                        content.seek(0)  # rewind after metadata extraction
                        image_id = self.parser_app.media.put(
                            content,
                            filename=fileName,
                            content_type=content_type,
                            metadata=metadata)
                        renditions = {"baseImage": {"href": image_id}}
                        # if we have not got a composite item then create one
                        if not comp_item:
                            comp_item = dict()
                            comp_item[ITEM_TYPE] = CONTENT_TYPE.COMPOSITE
                            comp_item["guid"] = generate_guid(
                                type=GUID_TAG)
                            comp_item["versioncreated"] = utcnow()
                            comp_item["groups"] = []
                            comp_item["headline"] = item["headline"]
                            comp_item["groups"] = []
                            comp_item["original_source"] = item[
                                "original_source"]
                            if "original_creator" in item:
                                comp_item["original_creator"] = item[
                                    "original_creator"]
                            # create a reference to the item that stores the body of the email
                            item_ref = {
                                "guid": item["guid"],
                                "residRef": item["guid"],
                                "headline": item["headline"],
                                "location": "ingest",
                                "itemClass": "icls:text",
                                "original_source": item["original_source"],
                            }
                            if "original_creator" in item:
                                item_ref["original_creator"] = item[
                                    "original_creator"]
                            refs.append(item_ref)
                        media_item = dict()
                        media_item["guid"] = generate_guid(type=GUID_TAG)
                        media_item["versioncreated"] = utcnow()
                        media_item[ITEM_TYPE] = CONTENT_TYPE.PICTURE
                        media_item["renditions"] = renditions
                        media_item["mimetype"] = content_type
                        set_filemeta(media_item, metadata)
                        media_item["slugline"] = fileName
                        if text_body is not None:
                            media_item["body_html"] = text_body
                        media_item["headline"] = item["headline"]
                        media_item["original_source"] = item[
                            "original_source"]
                        if "original_creator" in item:
                            media_item["original_creator"] = item[
                                "original_creator"]
                        new_items.append(media_item)
                        # add a reference to this item in the composite item
                        media_ref = {
                            "guid": media_item["guid"],
                            "residRef": media_item["guid"],
                            "headline": fileName,
                            "location": "ingest",
                            "itemClass": "icls:picture",
                            "original_source": item["original_source"],
                        }
                        if "original_creator" in item:
                            media_ref["original_creator"] = item[
                                "original_creator"]
                        refs.append(media_ref)
        if html_body:
            item["body_html"] = html_body
        else:
            # NOTE(review): if the mail has neither a text nor an html part,
            # text_body is still None and this concatenation raises TypeError
            # -- confirm upstream guarantees at least one body part
            item["body_html"] = "<pre>" + text_body + "</pre>"
            item[FORMAT] = FORMATS.PRESERVED
        # if there is composite item then add the main group and references
        if comp_item:
            grefs = {
                "refs": [{
                    "idRef": "main"
                }],
                "id": "root",
                "role": "grpRole:NEP"
            }
            comp_item["groups"].append(grefs)
            grefs = {"refs": refs, "id": "main", "role": "grpRole:Main"}
            comp_item["groups"].append(grefs)
            new_items.append(comp_item)
        new_items.append(item)
        return new_items
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)
def parse(self, data, provider=None):
    """Parse a fetched email into one or more ingest items.

    Produces a text item for the email body; for each (non-gif/png) image
    attachment a picture item is produced, and all items are bundled under
    a composite item with references.

    :param data: iterable of fetch response parts; message bytes are in tuples
    :param provider: ingest provider config dict
    :return: list of new items
    :raises IngestEmailError: wrapping any parsing failure
    """
    config = provider.get('config', {})
    # If the channel is configured to process structured email generated from a google form
    if config.get('formatted', False):
        return self._parse_formatted_email(data, provider)
    try:
        new_items = []
        # create an item for the body text of the email
        # either text or html
        item = dict()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item['versioncreated'] = utcnow()
        comp_item = None
        # a list to keep the references to the attachments
        refs = []
        html_body = None
        text_body = None
        for response_part in data:
            if isinstance(response_part, tuple):
                msg = email.message_from_bytes(response_part[1])
                item['headline'] = self.parse_header(msg['subject'])
                field_from = self.parse_header(msg['from'])
                item['original_source'] = field_from
                try:
                    # map the sender address to a local user, when registered
                    if email_regex.findall(field_from):
                        email_address = email_regex.findall(field_from)[0]
                        user = get_resource_service(
                            'users').get_user_by_email(email_address)
                        item['original_creator'] = user[
                            eve.utils.config.ID_FIELD]
                except UserNotRegisteredException:
                    pass
                item['guid'] = msg['Message-ID']
                date_tuple = email.utils.parsedate_tz(msg['Date'])
                if date_tuple:
                    dt = datetime.datetime.utcfromtimestamp(
                        email.utils.mktime_tz(date_tuple))
                    dt = dt.replace(tzinfo=timezone('utc'))
                    item['firstcreated'] = dt
                # this will loop through all the available multiparts in mail
                for part in msg.walk():
                    if part.get_content_type() == "text/plain":
                        body = part.get_payload(decode=True)
                        try:
                            # if we don't know the charset just have a go!
                            if part.get_content_charset() is None:
                                text_body = body.decode()
                            else:
                                charset = part.get_content_charset()
                                text_body = body.decode(charset)
                            continue
                        except Exception as ex:
                            logger.exception(
                                "Exception parsing text body for {0} from {1}: {2}"
                                .format(item['headline'], field_from, ex))
                            continue
                    if part.get_content_type() == "text/html":
                        body = part.get_payload(decode=True)
                        try:
                            if part.get_content_charset() is None:
                                html_body = body.decode()
                            else:
                                charset = part.get_content_charset()
                                html_body = body.decode(charset)
                            html_body = self.safe_html(html_body)
                            continue
                        except Exception as ex:
                            logger.exception(
                                "Exception parsing html body for {0} from {1}: {2}"
                                .format(item['headline'], field_from, ex))
                            continue
                    if part.get_content_maintype() == 'multipart':
                        continue
                    if part.get('Content-Disposition') is None:
                        continue
                    # we are only going to pull off image attachments at this stage
                    if part.get_content_maintype() != 'image':
                        continue
                    fileName = part.get_filename()
                    if bool(fileName):
                        image = part.get_payload(decode=True)
                        content = io.BytesIO(image)
                        res = process_file_from_stream(
                            content, part.get_content_type())
                        file_name, content_type, metadata = res
                        # gif/png are skipped (presumably logos/signature art) -- TODO confirm
                        if content_type == 'image/gif' or content_type == 'image/png':
                            continue
                        content.seek(0)  # rewind after metadata extraction
                        image_id = self.parser_app.media.put(
                            content,
                            filename=fileName,
                            content_type=content_type,
                            metadata=metadata)
                        renditions = {'baseImage': {'href': image_id}}
                        # if we have not got a composite item then create one
                        if not comp_item:
                            comp_item = dict()
                            comp_item[ITEM_TYPE] = CONTENT_TYPE.COMPOSITE
                            comp_item['guid'] = generate_guid(
                                type=GUID_TAG)
                            comp_item['versioncreated'] = utcnow()
                            comp_item['groups'] = []
                            comp_item['headline'] = item['headline']
                            comp_item['groups'] = []
                            comp_item['original_source'] = item[
                                'original_source']
                            if 'original_creator' in item:
                                comp_item['original_creator'] = item[
                                    'original_creator']
                            # create a reference to the item that stores the body of the email
                            item_ref = {
                                'guid': item['guid'],
                                'residRef': item['guid'],
                                'headline': item['headline'],
                                'location': 'ingest',
                                'itemClass': 'icls:text',
                                'original_source': item['original_source']
                            }
                            if 'original_creator' in item:
                                item_ref['original_creator'] = item[
                                    'original_creator']
                            refs.append(item_ref)
                        media_item = dict()
                        media_item['guid'] = generate_guid(type=GUID_TAG)
                        media_item['versioncreated'] = utcnow()
                        media_item[ITEM_TYPE] = CONTENT_TYPE.PICTURE
                        media_item['renditions'] = renditions
                        media_item['mimetype'] = content_type
                        set_filemeta(media_item, metadata)
                        media_item['slugline'] = fileName
                        if text_body is not None:
                            media_item['body_html'] = text_body
                        media_item['headline'] = item['headline']
                        media_item['original_source'] = item[
                            'original_source']
                        if 'original_creator' in item:
                            media_item['original_creator'] = item[
                                'original_creator']
                        new_items.append(media_item)
                        # add a reference to this item in the composite item
                        media_ref = {
                            'guid': media_item['guid'],
                            'residRef': media_item['guid'],
                            'headline': fileName,
                            'location': 'ingest',
                            'itemClass': 'icls:picture',
                            'original_source': item['original_source']
                        }
                        if 'original_creator' in item:
                            media_ref['original_creator'] = item[
                                'original_creator']
                        refs.append(media_ref)
        if html_body is not None:
            item['body_html'] = html_body
        else:
            # NOTE(review): if the mail has neither a text nor an html part,
            # text_body is still None and this concatenation raises TypeError
            # -- confirm upstream guarantees at least one body part
            item['body_html'] = '<pre>' + text_body + '</pre>'
            item[FORMAT] = FORMATS.PRESERVED
        # if there is composite item then add the main group and references
        if comp_item:
            grefs = {
                'refs': [{
                    'idRef': 'main'
                }],
                'id': 'root',
                'role': 'grpRole:NEP'
            }
            comp_item['groups'].append(grefs)
            grefs = {'refs': refs, 'id': 'main', 'role': 'grpRole:Main'}
            comp_item['groups'].append(grefs)
            new_items.append(comp_item)
        new_items.append(item)
        return new_items
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)
def parse(self, data, provider=None):
    """Parse a fetched email into one or more ingest items.

    Produces a text item for the email body; for each (non-gif/png) image
    attachment a picture item is produced, and all items are bundled under
    a composite item with references.

    :param data: iterable of fetch response parts; message bytes are in tuples
    :param provider: ingest provider config dict
    :return: list of new items
    :raises IngestEmailError: wrapping any parsing failure
    """
    config = provider.get('config', {})
    # If the channel is configured to process structured email generated from a google form
    if config.get('formatted', False):
        return self._parse_formatted_email(data, provider)
    try:
        new_items = []
        # create an item for the body text of the email
        # either text or html
        item = dict()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item['versioncreated'] = utcnow()
        comp_item = None
        # a list to keep the references to the attachments
        refs = []
        html_body = None
        text_body = None
        for response_part in data:
            if isinstance(response_part, tuple):
                msg = email.message_from_bytes(response_part[1])
                item['headline'] = self.parse_header(msg['subject'])
                field_from = self.parse_header(msg['from'])
                item['original_source'] = field_from
                try:
                    # map the sender address to a local user, when registered
                    if email_regex.findall(field_from):
                        email_address = email_regex.findall(field_from)[0]
                        user = get_resource_service('users').get_user_by_email(email_address)
                        item['original_creator'] = user[eve.utils.config.ID_FIELD]
                except UserNotRegisteredException:
                    pass
                item['guid'] = msg['Message-ID']
                date_tuple = email.utils.parsedate_tz(msg['Date'])
                if date_tuple:
                    dt = datetime.datetime.utcfromtimestamp(
                        email.utils.mktime_tz(date_tuple))
                    dt = dt.replace(tzinfo=timezone('utc'))
                    item['firstcreated'] = dt
                # this will loop through all the available multiparts in mail
                for part in msg.walk():
                    if part.get_content_type() == "text/plain":
                        body = part.get_payload(decode=True)
                        try:
                            # if we don't know the charset just have a go!
                            if part.get_content_charset() is None:
                                text_body = body.decode()
                            else:
                                charset = part.get_content_charset()
                                text_body = body.decode(charset)
                            continue
                        except Exception as ex:
                            logger.exception(
                                "Exception parsing text body for {0} from {1}: {2}".format(item['headline'], field_from, ex))
                            continue
                    if part.get_content_type() == "text/html":
                        body = part.get_payload(decode=True)
                        try:
                            if part.get_content_charset() is None:
                                html_body = body.decode()
                            else:
                                charset = part.get_content_charset()
                                html_body = body.decode(charset)
                            html_body = self.safe_html(html_body)
                            continue
                        except Exception as ex:
                            logger.exception(
                                "Exception parsing html body for {0} from {1}: {2}".format(item['headline'], field_from, ex))
                            continue
                    if part.get_content_maintype() == 'multipart':
                        continue
                    if part.get('Content-Disposition') is None:
                        continue
                    # we are only going to pull off image attachments at this stage
                    if part.get_content_maintype() != 'image':
                        continue
                    fileName = part.get_filename()
                    if bool(fileName):
                        image = part.get_payload(decode=True)
                        content = io.BytesIO(image)
                        res = process_file_from_stream(content, part.get_content_type())
                        file_name, content_type, metadata = res
                        # gif/png are skipped (presumably logos/signature art) -- TODO confirm
                        if content_type == 'image/gif' or content_type == 'image/png':
                            continue
                        content.seek(0)  # rewind after metadata extraction
                        image_id = self.parser_app.media.put(content, filename=fileName,
                                                             content_type=content_type,
                                                             metadata=metadata)
                        renditions = {'baseImage': {'href': image_id}}
                        # if we have not got a composite item then create one
                        if not comp_item:
                            comp_item = dict()
                            comp_item[ITEM_TYPE] = CONTENT_TYPE.COMPOSITE
                            comp_item['guid'] = generate_guid(type=GUID_TAG)
                            comp_item['versioncreated'] = utcnow()
                            comp_item['groups'] = []
                            comp_item['headline'] = item['headline']
                            comp_item['groups'] = []
                            comp_item['original_source'] = item['original_source']
                            if 'original_creator' in item:
                                comp_item['original_creator'] = item['original_creator']
                            # create a reference to the item that stores the body of the email
                            item_ref = {'guid': item['guid'],
                                        'residRef': item['guid'],
                                        'headline': item['headline'],
                                        'location': 'ingest',
                                        'itemClass': 'icls:text',
                                        'original_source': item['original_source']}
                            if 'original_creator' in item:
                                item_ref['original_creator'] = item['original_creator']
                            refs.append(item_ref)
                        media_item = dict()
                        media_item['guid'] = generate_guid(type=GUID_TAG)
                        media_item['versioncreated'] = utcnow()
                        media_item[ITEM_TYPE] = CONTENT_TYPE.PICTURE
                        media_item['renditions'] = renditions
                        media_item['mimetype'] = content_type
                        set_filemeta(media_item, metadata)
                        media_item['slugline'] = fileName
                        if text_body is not None:
                            media_item['body_html'] = text_body
                        media_item['headline'] = item['headline']
                        media_item['original_source'] = item['original_source']
                        if 'original_creator' in item:
                            media_item['original_creator'] = item['original_creator']
                        new_items.append(media_item)
                        # add a reference to this item in the composite item
                        media_ref = {'guid': media_item['guid'],
                                     'residRef': media_item['guid'],
                                     'headline': fileName,
                                     'location': 'ingest',
                                     'itemClass': 'icls:picture',
                                     'original_source': item['original_source']}
                        if 'original_creator' in item:
                            media_ref['original_creator'] = item['original_creator']
                        refs.append(media_ref)
        if html_body is not None:
            item['body_html'] = html_body
        else:
            # NOTE(review): if the mail has neither a text nor an html part,
            # text_body is still None and this concatenation raises TypeError
            # -- confirm upstream guarantees at least one body part
            item['body_html'] = '<pre>' + text_body + '</pre>'
            item[FORMAT] = FORMATS.PRESERVED
        # if there is composite item then add the main group and references
        if comp_item:
            grefs = {'refs': [{'idRef': 'main'}], 'id': 'root', 'role': 'grpRole:NEP'}
            comp_item['groups'].append(grefs)
            grefs = {'refs': refs, 'id': 'main', 'role': 'grpRole:Main'}
            comp_item['groups'].append(grefs)
            new_items.append(comp_item)
        new_items.append(item)
        return new_items
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)