示例#1
0
    def match(cls, file_path, result=None):
        """Score how well this class fits the file described by ``result``.

        The MIME type on ``result`` is compared against ``cls.MIME_TYPES``
        first; if that does not match, the extension derived from
        ``result.file_name`` is compared against ``cls.EXTENSIONS``.
        ``file_path`` is not consulted here.

        Returns ``cls.SCORE`` on a match, otherwise ``-1``.
        """
        # Normalise the declared MIME types, dropping any that do not parse.
        known_types = []
        for declared in cls.MIME_TYPES:
            normalized = normalize_mimetype(declared, default=None)
            if normalized is not None:
                known_types.append(normalized)
        if normalize_mimetype(result.mime_type, default=None) in known_types:
            return cls.SCORE

        # Fall back to the file extension.
        known_extensions = []
        for declared in cls.EXTENSIONS:
            normalized = normalize_extension(declared)
            if normalized is not None:
                known_extensions.append(normalized)
        if normalize_extension(result.file_name) in known_extensions:
            return cls.SCORE

        return -1
示例#2
0
    def ingest_message(self, data):
        """Parse a raw e-mail and ingest its headers, bodies and attachments.

        Header metadata, subject and message ID are recorded first. Every
        MIME part is then visited: attachments are passed to
        ``ingest_attachment`` and body parts are collected by MIME type.
        Collected HTML and plain-text bodies are finally joined with blank
        lines and handed to the matching extractor.

        Raises:
            ProcessingException: if the message cannot be parsed at all.
        """
        try:
            msg = mime.from_string(data)
            if msg.headers is not None:
                self.extract_headers_metadata(msg.headers.items())
        except DecodingError as exc:
            raise ProcessingException('Cannot parse email: %s' % exc)

        # A subject or message ID that fails to decode is logged, not fatal.
        try:
            if msg.subject:
                self.update('title', str(msg.subject))
        except DecodingError as exc:
            log.warning("Decoding subject: %s", exc)

        try:
            if msg.message_id:
                self.update('message_id', str(msg.message_id))
        except DecodingError as exc:
            log.warning("Decoding message ID: %s", exc)

        self.extract_plain_text_content(None)
        self.result.flag(self.result.FLAG_EMAIL)
        collected = defaultdict(list)

        for part in msg.walk(with_self=True):
            # Skip parts without a body, or whose body cannot be decoded.
            try:
                if part.body is None:
                    continue
            except (DecodingError, ValueError) as exc:
                log.warning("Cannot decode part [%s]: %s", self.result, exc)
                continue

            file_name = part.detected_file_name

            # HACK: a disposition header may declare the file name more than
            # once, and flanker concatenates the declarations. If the name is
            # exactly the same string twice, keep a single copy.
            if file_name is not None and len(file_name) > 4:
                midpoint = len(file_name) // 2
                front, back = file_name[:midpoint], file_name[midpoint:]
                if front == back:
                    file_name = front

            mime_type = normalize_mimetype(str(part.detected_content_type))

            if part.is_attachment():
                self.ingest_attachment(file_name, mime_type, part.body)

            if part.is_body():
                collected[mime_type].append(part.body)

        html_parts = collected.get('text/html')
        if html_parts is not None:
            self.extract_html_content('\n\n'.join(html_parts))
            self.result.flag(self.result.FLAG_HTML)

        text_parts = collected.get('text/plain')
        if text_parts is not None:
            self.extract_plain_text_content('\n\n'.join(text_parts))
            self.result.flag(self.result.FLAG_PLAINTEXT)
示例#3
0
async def convert(request):
    """Convert an uploaded document to PDF and stream the result back.

    Conversions are serialised through the shared ``lock``: it is taken
    without blocking, and a 503 is returned straight away when it is busy.
    The lock is held while the upload is stored and converted, and released
    before the PDF is streamed to the client.

    The optional ``timeout`` query parameter (seconds, default 300) is
    passed to the converter as ``max(10, timeout - 5)``.

    Returns:
        A streamed ``application/pdf`` response on success, a 400 text
        response for ``ConversionFailure``, or a 503 text response for any
        other error raised during conversion or streaming.
    """
    if not lock.acquire(blocking=False):
        return web.Response(status=503)
    # Tracks whether this request still owns the lock, so it is released
    # exactly once on every exit path (including errors while reading the
    # request and task cancellation) and never after it was handed off.
    holding_lock = True
    upload_file = None
    out_file = None
    try:
        data = await request.post()
        upload = data['file']
        fd, upload_file = mkstemp()
        try:
            os.close(fd)
            with open(upload_file, 'wb') as fh:
                shutil.copyfileobj(upload.file, fh, BUFFER_SIZE)

            extension = normalize_extension(upload.filename)
            mime_type = normalize_mimetype(upload.content_type, default=None)
            filters = list(FORMATS.get_filters(extension, mime_type))
            timeout = int(request.query.get('timeout', 300))
            timeout = max(10, timeout - 5)

            await converter.prepare()
            await asyncio.sleep(0)
            out_file = converter.convert_file(upload_file,
                                              filters,
                                              timeout=timeout)
            out_size = 0
            if os.path.exists(out_file):
                out_size = os.path.getsize(out_file)
            # Conversion is done; let the next request in while we stream.
            lock.release()
            holding_lock = False
            await asyncio.sleep(0)

            response = web.StreamResponse()
            response.content_length = out_size
            response.content_type = 'application/pdf'
            await response.prepare(request)
            with open(out_file, 'rb') as f:
                while True:
                    chunk = f.read(BUFFER_SIZE)
                    if not chunk:
                        break
                    await response.write(chunk)
            return response
        except Exception as exc:
            log.exception('Conversion failed.')
            # Only reset the converter while we still own it; once the lock
            # has been released it may be serving another request.
            if holding_lock:
                converter.terminate()
            status = 400 if isinstance(exc, ConversionFailure) else 503
            return web.Response(text=str(exc), status=status)
    finally:
        if holding_lock:
            lock.release()
        for path in (upload_file, out_file):
            if path is not None and os.path.exists(path):
                os.remove(path)
示例#4
0
    def mime_type(self):
        """Return the normalised MIME type, or ``None`` if it is the default.

        Sources are tried in order: the ``mime_type`` entry in ``self.meta``,
        a guess based on ``self.file_name``, then the ``content_type`` entry
        in ``self.headers``.
        """
        candidate = self.meta.get('mime_type')

        if candidate is None and self.file_name:
            candidate = mimetypes.guess_type(self.file_name)[0]

        # Last resort: whatever the headers declare.
        if candidate is None:
            candidate = self.headers.get('content_type')

        normalized = normalize_mimetype(candidate)
        if normalized == DEFAULT:
            return None
        return normalized
async def convert(request):
    """Convert an uploaded document to PDF and stream the result back.

    The upload is written to a temporary file, converted into a second
    temporary ``.pdf`` file, and that file is streamed to the client. Both
    temporary files are removed before the handler returns.

    The optional ``timeout`` query parameter (seconds, default 300) is
    passed to the converter.

    Returns:
        A streamed ``application/pdf`` response on success, a 400 text
        response for ``ConversionFailure`` (including an empty output
        file), or a 500 text response for any other error.
    """
    data = await request.post()
    upload = data['file']
    extension = normalize_extension(upload.filename)
    mime_type = normalize_mimetype(upload.content_type, default=None)
    log.info('PDF convert: %s [%s]', upload.filename, mime_type)
    fd, upload_file = mkstemp()
    os.close(fd)
    fd, out_file = mkstemp(suffix='.pdf')
    os.close(fd)

    try:
        with open(upload_file, 'wb') as fh:
            shutil.copyfileobj(upload.file, fh, BUFFER_SIZE)

        filters = list(FORMATS.get_filters(extension, mime_type))
        timeout = int(request.query.get('timeout', 300))

        await asyncio.sleep(0)
        converter.convert_file(upload_file, out_file, filters, timeout=timeout)
        out_size = os.path.getsize(out_file)
        if out_size == 0:
            raise ConversionFailure("Could not convert.")
        await asyncio.sleep(0)

        response = web.StreamResponse()
        response.content_length = out_size
        response.content_type = 'application/pdf'
        await response.prepare(request)
        with open(out_file, 'rb') as f:
            while True:
                chunk = f.read(BUFFER_SIZE)
                if not chunk:
                    break
                await response.write(chunk)
        return response
    except ConversionFailure as fail:
        log.info("Failed to convert: %s", fail)
        return web.Response(text=str(fail), status=400)
    except Exception as exc:
        log.exception('System error: %s.', exc)
        converter.terminate()
        # A handler must return a response; falling through with None makes
        # aiohttp raise its own error. Answer with an explicit 500 instead.
        return web.Response(text=str(exc), status=500)
    finally:
        # The converter may have removed or never produced a file; a missing
        # file must not turn cleanup into a new error that masks the result.
        for path in (upload_file, out_file):
            if os.path.exists(path):
                os.remove(path)
示例#6
0
    def __init__(self):
        """Index filter names by media type and by file extension.

        Each file in ``self.FILES`` is parsed as XML and its
        ``org.openoffice.TypeDetection`` type nodes are read. For every type
        node, the filter name is its ``PreferredFilter`` property, or the
        node's own name when that property is absent.
        """
        self.media_types = defaultdict(list)
        self.extensions = defaultdict(list)
        type_nodes = (
            './*[@oor:package="org.openoffice.TypeDetection"]/node/node'
        )
        for xcd_file in self.FILES:
            tree = etree.parse(xcd_file)
            for type_node in tree.xpath(type_nodes, namespaces=NS):
                # Flatten the node's properties; when a property carries
                # several values, the last one wins.
                props = {}
                for prop in type_node.findall('./prop'):
                    key = prop.get(NAME)
                    for value in prop.findall('./value'):
                        props[key] = value.text

                filter_name = props.get('PreferredFilter', type_node.get(NAME))
                media_type = normalize_mimetype(props.get('MediaType'),
                                                default=None)
                if media_type is not None:
                    self.media_types[media_type].append(filter_name)

                for extension in parse_extensions(props.get('Extensions')):
                    self.extensions[extension].append(filter_name)
示例#7
0
    def auction(self, file_path, result):
        """Pick the ingestor class that scores highest for ``file_path``.

        Paths for which ``is_file`` is false are given to
        ``DirectoryIngestor``. Otherwise, if ``result.mime_type`` is not
        useful, it is replaced by a type detected from the file, and every
        class in ``self.ingestors`` is asked for a score. The first class
        with the highest score above zero wins.

        Raises:
            ProcessingException: if no ingestor scores above zero.
        """
        if not is_file(file_path):
            result.mime_type = DirectoryIngestor.MIME_TYPE
            return DirectoryIngestor

        if not useful_mimetype(result.mime_type):
            detected = self.MAGIC.from_file(file_path)
            result.mime_type = normalize_mimetype(detected)

        winner = None
        top_score = 0
        for candidate in self.ingestors:
            result.manager = self
            score = candidate.match(file_path, result=result)
            if score > top_score:
                top_score, winner = score, candidate

        if winner is not None:
            return winner
        raise ProcessingException("Format not supported: %s" %
                                  result.mime_type)
示例#8
0
 def content_type(self):
     """Return the normalised value of the ``content-type`` header."""
     return normalize_mimetype(self.headers.get('content-type'))
示例#9
0
文件: rule.py 项目: paterry/memorious
 def configure(self):
     """Store the normalised form of ``self.value`` as ``self.clean``."""
     raw_value = self.value
     self.clean = normalize_mimetype(raw_value)