def extract_headers_metadata(self, headers):
    """Copy selected email headers into the result's metadata.

    ``headers`` is iterated as ``(field, value)`` pairs. All of them are
    stored on ``self.result.headers``; the subject, message-id,
    in-reply-to, references, date, from, to, cc and bcc fields are
    additionally mapped onto metadata updates and emitted addresses.
    Pairs whose field or value is ``None`` after ``safe_string()`` are
    skipped.
    """
    self.result.headers = safe_dict(dict(headers))
    headers = [(safe_string(k), safe_string(v)) for k, v in headers]
    for field, value in headers:
        # Check for None *before* lower-casing: safe_string() may return
        # None, and None has no .lower().
        if field is None or value is None:
            continue
        field = field.lower()

        if field == 'subject':
            self.update('title', value)
        elif field == 'message-id':
            self.update('message_id', value)
        elif field == 'in-reply-to':
            self.result.emit_in_reply_to(value)
        elif field == 'references':
            # Whitespace-separated list; each entry is emitted on its own.
            for reference in value.split():
                self.result.emit_in_reply_to(reference)
        elif field == 'date':
            try:
                parsed = email.utils.parsedate(value)
                created = datetime.fromtimestamp(mktime(parsed))
                self.update('created_at', created)
            except Exception as ex:
                # Best effort: log the raw header value (not a partially
                # converted one) and carry on with the other headers.
                log.warning("Failed to parse [%s]: %s", value, ex)
        elif field == 'from':
            for (name, _) in self.parse_emails(value):
                self.update('author', name)
        elif field in ('to', 'cc', 'bcc'):
            # Called for its side effects on the result only.
            self.parse_emails(value)
def ingest(self, file_path):
    """Ingest a message file: metadata, plain-text body and attachments.

    The file is opened with ``Message``; fields are read by their
    identifier strings via ``getField()``.
    """
    msg = Message(file_path)
    self._parse_headers(msg)
    self.extract_plain_text_content(msg.getField('1000'))
    self.update('message_id', msg.getField('1035'))

    # All associated person names (sender, recipient etc.) first, then
    # the address fields; each one goes through parse_emails().
    person_fields = ('0C1A', '0E04', '0040', '004D')
    address_fields = (
        '0C1F', '0076', '0078', '1046', '3003', '0065', '3FFC', '403E',
    )
    for field_id in person_fields + address_fields:
        self.parse_emails(msg.getField(field_id))

    for field_id in ('0037', '0070'):
        self.update('title', msg.getField(field_id))
    self.update('author', msg.getField('0C1A'))

    self.extract_olefileio_metadata(msg)
    for flag in (self.result.FLAG_EMAIL, self.result.FLAG_PLAINTEXT):
        self.result.flag(flag)

    for item in msg.attachments:
        # Prefer the long file name, fall back to the short one.
        item_name = (safe_string(item.longFilename)
                     or safe_string(item.shortFilename))
        self.ingest_attachment(item_name, item.mimeType, item.data)
def ingest_attachment(self, name, mime_type, body):
    """Write an attachment into the work path and pass it on as a child.

    Nothing happens when the attachment has neither a usable name nor a
    non-empty body. ``str`` bodies are written UTF-8 encoded and a
    ``bytes`` mime type is decoded as UTF-8 before the hand-over.
    """
    body_is_empty = body is None or not len(body)
    clean_name = safe_string(name)
    if clean_name is None and body_is_empty:
        # Hello, Outlook.
        return

    file_name = safe_filename(name, default='attachment')
    display_name = clean_name or file_name
    child_id = join_path(self.result.id, display_name)
    target = join_path(self.work_path, file_name)

    with open(target, 'wb') as fh:
        payload = body.encode('utf-8') if isinstance(body, str) else body
        if payload is not None:
            fh.write(payload)

    if isinstance(mime_type, bytes):
        mime_type = mime_type.decode('utf-8')
    self.manager.handle_child(self.result,
                              target,
                              id=child_id,
                              file_name=display_name,
                              mime_type=mime_type)
def generate_rows(self, table):
    """Yield one ``OrderedDict`` per table row, keyed by the field names.

    A row that raises while being converted is logged and skipped.
    """
    columns = [safe_string(name) for name in table.field_names]
    for record in table:
        try:
            yield OrderedDict([
                (column, safe_string(cell))
                for column, cell in zip(columns, record)
            ])
        except Exception as ex:
            log.warning("Cannot decode DBF row: %s", ex)
def generate_rows(self, reader, has_header=False):
    """Yield each row of ``reader`` as an ``OrderedDict`` keyed by column.

    When ``has_header`` is true the first row supplies the column names.
    Columns without a header are named ``'Column N'`` (1-based). An empty
    ``reader`` yields nothing.
    """
    # Use next() with a default: a bare next() on an exhausted reader
    # raises StopIteration, which inside a generator is converted into a
    # RuntimeError (PEP 479) instead of simply ending the iteration.
    headers = next(reader, []) if has_header else []
    headers = [safe_string(h) for h in headers]
    for row in reader:
        # Grow the header list on demand so every cell has a key.
        while len(headers) < len(row):
            headers.append('Column %s' % (len(headers) + 1))
        data = OrderedDict()
        for header, value in zip(headers, row):
            data[header] = safe_string(value)
        yield data
def get_email_addresses(self, doc, tag):
    """Yield ``(name, email)`` pairs for ``emailAddress`` nodes below ``tag``.

    The address is dropped unless ``check_email()`` accepts it; the name
    is dropped when it is itself accepted as an address. Every address
    (or ``None``) is passed to ``self.result.emit_email()``. Pairs with
    neither part left are not yielded.
    """
    for node in doc.findall('./%s/emailAddress' % tag):
        email = safe_string(node.get('OPFContactEmailAddressAddress'))
        email = email if self.check_email(email) else None
        self.result.emit_email(email)

        name = safe_string(node.get('OPFContactEmailAddressName'))
        name = None if self.check_email(name) else name

        if not (name or email):
            continue
        yield (name, email)
def update(self):
    """Apply the outcome of the result to the document.

    Copies status, schema, identifiers, metadata, bodies, keywords and
    languages onto ``self.document``, flushes the session and then saves
    entity and email tags through a ``DocumentTagCollector``.
    """
    doc = self.document
    if self.status != self.STATUS_SUCCESS:
        doc.status = Document.STATUS_FAIL
        doc.error_message = stringify(self.error_message)
    else:
        doc.status = Document.STATUS_SUCCESS
        doc.error_message = None

    # The last SCHEMATA entry whose flag is set decides the schema.
    selected = model['Document']
    for flag, schema_name in self.SCHEMATA:
        if flag in self.flags:
            selected = model[schema_name]
    doc.schema = selected.name

    doc.foreign_id = safe_string(self.id)
    doc.content_hash = self.checksum or doc.content_hash
    doc.pdf_version = self.pdf_checksum

    # (document attribute, result attribute): the result's value wins,
    # otherwise the document's meta entry of the same name is used.
    with_meta_fallback = (
        ('title', 'title'),
        ('file_name', 'file_name'),
        ('file_size', 'size'),
        ('summary', 'summary'),
        ('author', 'author'),
        ('generator', 'generator'),
        ('mime_type', 'mime_type'),
        ('encoding', 'encoding'),
        ('date', 'date'),
        ('authored_at', 'created_at'),
        ('modified_at', 'modified_at'),
        ('published_at', 'published_at'),
        ('message_id', 'message_id'),
    )
    for doc_attr, own_attr in with_meta_fallback:
        value = getattr(self, own_attr) or doc.meta.get(doc_attr)
        setattr(doc, doc_attr, value)

    doc.in_reply_to = ensure_list(self.in_reply_to)
    doc.columns = list(self.columns.keys())
    doc.body_raw = self.body_html
    doc.body_text = self.body_text
    doc.headers = self.headers

    for keyword in self.keywords:
        doc.add_keyword(safe_string(keyword))
    for language in self.languages:
        doc.add_language(safe_string(language))

    db.session.flush()

    tags = DocumentTagCollector(doc, 'ingestors')
    for entity in self.entities:
        tags.emit(entity, DocumentTag.TYPE_PERSON)
    for address in self.emails:
        tags.emit(address, DocumentTag.TYPE_EMAIL)
    tags.save()
def ingest(self, file_path):
    """Ingest an OPF XML file that holds exactly one ``email`` element.

    Builds the header dictionary, updates title, author, summary and
    date metadata, and extracts either the HTML or the plain-text body.

    Raises:
        ProcessingException: if the XML cannot be parsed or the file
            does not contain exactly one ``email`` element.
    """
    self.result.flag(self.result.FLAG_EMAIL)
    try:
        doc = self.parse_xml(file_path)
    except TypeError:
        raise ProcessingException("Cannot parse OPF XML file.")

    # Zero matches fail here too, so the message must not claim "more".
    if len(doc.findall('//email')) != 1:
        raise ProcessingException("Expected exactly one email in file.")

    email = doc.find('//email')
    # Iterate the element directly: Element.getchildren() is deprecated
    # and no longer exists in the standard library's ElementTree.
    props = {c.tag: safe_string(c.text) for c in email if c.text}

    # NOTE(review): the message id only ends up in the headers, it is
    # never passed to update('message_id', ...) -- confirm intended.
    headers = {
        'Subject': props.get('OPFMessageCopySubject'),
        'Message-ID': props.pop('OPFMessageCopyMessageID', None),
        'From': self.get_contacts(email, 'OPFMessageCopyFromAddresses'),
        'Sender': self.get_contacts(email, 'OPFMessageCopySenderAddress'),
        'To': self.get_contacts(email, 'OPFMessageCopyToAddresses'),
        'CC': self.get_contacts(email, 'OPFMessageCopyCCAddresses'),
        'BCC': self.get_contacts(email, 'OPFMessageCopyBCCAddresses'),
    }

    date = props.get('OPFMessageCopySentTime')
    if date is not None:
        # Re-format the sent time as an email-style Date header.
        date = datetime.strptime(date, '%Y-%m-%dT%H:%M:%S')
        date = time.mktime(date.timetuple())
        headers['Date'] = utils.formatdate(date)

    self.result.headers = safe_dict(headers)

    self.update('title', props.pop('OPFMessageCopySubject', None))
    self.update('title', props.pop('OPFMessageCopyThreadTopic', None))
    for tag in ('OPFMessageCopyFromAddresses',
                'OPFMessageCopySenderAddress'):
        self.update('author', self.get_contact_name(email, tag))
    self.update('summary', props.pop('OPFMessageCopyPreview', None))
    self.update('created_at', props.pop('OPFMessageCopySentTime', None))
    self.update('modified_at', props.pop('OPFMessageCopyModDate', None))

    body = props.pop('OPFMessageCopyBody', None)
    html = props.pop('OPFMessageCopyHTMLBody', None)
    has_html = '1E0' == props.pop('OPFMessageGetHasHTML', None)
    if has_html and safe_string(html):
        self.extract_html_content(html)
        self.result.flag(self.result.FLAG_HTML)
    else:
        self.extract_plain_text_content(body)
        self.result.flag(self.result.FLAG_PLAINTEXT)
def emit_page(self, index, text):
    """Emit a plain text page.

    Creates a ``DocumentRecord`` for the current document with the given
    index and the ``safe_string()``-cleaned text, and adds it to the
    database session.
    """
    page = DocumentRecord()
    page.index = index
    page.text = safe_string(text)
    page.document_id = self.document.id
    db.session.add(page)
def ingest(self, file_path, result=None, work_path=None):
    """Main execution step of an ingestor.

    Creates a result object when none is given, picks an ingestor class
    through ``auction()`` and delegates to it. A ``ProcessingException``
    marks the result as failed; any other way of leaving the pending
    state marks it as stopped. ``after()`` always runs. Returns the
    result.
    """
    if result is None:
        if file_path:
            base_name = os.path.basename(file_path)
        else:
            base_name = None
        result = self.RESULT_CLASS(file_path=file_path,
                                   file_name=base_name)

    self.checksum_file(result, file_path)
    self.before(result)
    result.status = Result.STATUS_PENDING
    try:
        chosen = self.auction(file_path, result)
        log.debug("Ingestor [%s]: %s", result, chosen.__name__)
        self.delegate(chosen, result, file_path, work_path=work_path)
        result.status = Result.STATUS_SUCCESS
    except ProcessingException as exc:
        result.error_message = safe_string(exc)
        result.status = Result.STATUS_FAILURE
        log.warning("Failed [%s]: %s", result, result.error_message)
    finally:
        # Still pending means neither success nor a handled failure.
        still_pending = result.status == Result.STATUS_PENDING
        if still_pending:
            result.status = Result.STATUS_STOPPED
        self.after(result)
    return result
def parse_emails(self, text):
    """Parse an email list and return it as ``(name, email)`` tuples.

    As a side effect every parsed name and address is emitted to the
    relevant result lists. Input that yields no parsed entries is
    returned unchanged as a single ``(text, None)`` tuple.
    """
    parsed = address.parse_list(safe_string(text))
    if len(parsed) == 0:
        # If the snippet didn't parse, assume it is just a name.
        return [(text, None)]

    pairs = []
    for entry in parsed:
        name = stringify(entry.display_name)
        email = stringify(entry.address)
        email = email if self.check_email(email) else None
        if self.check_email(name):
            # The display name is itself an address: use it as the email
            # when none was kept, and drop it as a name.
            email, name = email or name, None
        self.result.emit_email(email)
        self.result.emit_name(name)
        pairs.append((name, email))
    return pairs
def update(self, name, value):
    """Set a metadata value if it is not already set with a value.

    ``date``/``datetime`` values are stored as they are; anything else
    goes through ``safe_string()`` and is ignored when that yields
    ``None``.
    """
    if getattr(self.result, name):
        return
    if isinstance(value, (date, datetime)):
        cleaned = value
    else:
        cleaned = safe_string(value)
    if cleaned is not None:
        setattr(self.result, name, cleaned)
def generate_csv(self, table):
    """Yield every ``TableRow`` of ``table`` as a list of cell strings.

    A cell carrying a ``numbercolumnsrepeated`` attribute contributes
    its value that many times.
    """
    for row in table.getElementsByType(TableRow):
        cells = []
        for cell in row.getElementsByType(TableCell):
            repeat = cell.getAttribute("numbercolumnsrepeated") or 1
            text = safe_string(self.convert_cell(cell))
            cells.extend([text] * int(repeat))
        yield cells
def convert_cell(self, cell, sheet):
    """Return the cell value as a string, with dates in ISO format.

    Cells of type 3 are converted with ``xlrd.xldate_as_tuple``; a zero
    value gives ``None`` and a date part of ``(0, 0, 0)`` gives a bare
    time. If the conversion raises, the raw value is returned through
    ``safe_string()`` instead.
    """
    raw = cell.value
    try:
        # NOTE(review): 3 presumably is xlrd's date cell type -- confirm.
        if cell.ctype == 3:
            if raw == 0:
                return None
            yr, mon, dom, hrs, mins, secs = \
                xlrd.xldate_as_tuple(raw, sheet.book.datemode)
            if (yr, mon, dom) != (0, 0, 0):
                return datetime(yr, mon, dom, hrs, mins, secs).isoformat()
            return time(hrs, mins, secs).isoformat()
    except Exception:
        pass
    return safe_string(raw)
def _emit_iterator_rows(self, iterator): for data in iterator: for column in data.keys(): column = safe_string(column) self.columns[column] = None yield data
def emit_name(self, text):
    """Append ``text`` to the entities unless safe_string() gives None."""
    name = safe_string(text)
    if name is not None:
        self.entities.append(name)
def emit_in_reply_to(self, text):
    """Record a reference in ``in_reply_to`` once.

    Values for which ``safe_string()`` gives ``None`` and values that
    are already listed are ignored.
    """
    reference = safe_string(text)
    if reference is None or reference in self.in_reply_to:
        return
    self.in_reply_to.append(reference)
def emit_pdf_alternative(self, file_path):
    """Store the safe_string()-cleaned ``file_path`` as ``pdf_path``."""
    cleaned_path = safe_string(file_path)
    self.pdf_path = cleaned_path
def emit_page(self, index, text):
    """Append a page entry with the cleaned text and its index."""
    page = {'text': safe_string(text), 'index': index}
    self.pages.append(page)
def label(self):
    """Return the file name, else the checksum, else ``'<result>'``."""
    file_name = safe_string(self.file_name)
    if file_name:
        return file_name
    checksum = self.checksum
    if checksum:
        return checksum
    return '<result>'
def emit_language(self, text):
    """Record a language on the result once.

    Values for which ``safe_string()`` gives ``None`` are ignored, as
    are languages that are already listed.
    """
    text = safe_string(text)
    if text is None:
        return
    # De-duplicate against the languages themselves; checking the
    # keyword list here let duplicates through and wrongly dropped any
    # language that happened to match a keyword.
    if text not in self.languages:
        self.languages.append(text)
def emit_email(self, text):
    """Append ``text`` to the emails unless safe_string() gives None."""
    email_address = safe_string(text)
    if email_address is not None:
        self.emails.append(email_address)
def emit_html_body(self, html, text):
    """Store the cleaned HTML body, then emit ``text`` as the text body."""
    cleaned_html = safe_string(html)
    self.body_html = cleaned_html
    self.emit_text_body(text)
def emit_text_body(self, text):
    """Store the safe_string()-cleaned ``text`` as ``body_text``."""
    cleaned_text = safe_string(text)
    self.body_text = cleaned_text
def generate_csv(self, sheet):
    """Yield each row of ``sheet.rows`` as a list of cell strings.

    A row that raises ``ValueError``, ``OverflowError`` or ``ParseError``
    while being converted is logged and skipped.
    """
    for cells in sheet.rows:
        try:
            values = []
            for cell in cells:
                values.append(safe_string(cell.value))
            yield values
        except (ValueError, OverflowError, ParseError) as ve:
            log.warning("Failed to read Excel row: %s", ve)